From ec323897e306596b61288d362cbe8f8a8f419062 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 5 Feb 2025 22:11:21 +1100 Subject: [PATCH 1/8] dcache: Use expanded per-way TLB and cache tag hit information Rather than combining the results of the per-way comparators into an encoded 'hit_way' variable, use the individual results directly using AND-OR type networks where possible, in order to reduce utilization and improve timing. Signed-off-by: Paul Mackerras --- dcache.vhdl | 128 ++++++++++++++++++++++++++++------------------------ 1 file changed, 69 insertions(+), 59 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index ff7383c..566389c 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -101,6 +101,7 @@ architecture rtl of dcache is subtype row_t is unsigned(ROW_BITS-1 downto 0); subtype index_t is unsigned(INDEX_BITS-1 downto 0); subtype way_t is unsigned(WAY_BITS-1 downto 0); + subtype way_expand_t is std_ulogic_vector(NUM_WAYS-1 downto 0); subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0); -- The cache data BRAM organized as described above for each way @@ -149,7 +150,7 @@ architecture rtl of dcache is subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0); subtype tlb_way_ptes_t is std_ulogic_vector(TLB_PTE_WAY_BITS-1 downto 0); type tlb_ptes_t is array(tlb_index_t) of tlb_way_ptes_t; - type hit_way_set_t is array(tlb_way_t) of way_t; + type tlb_expand_t is array(tlb_way_t) of std_ulogic; signal dtlb_valids : tlb_valids_t; signal dtlb_tags : tlb_tags_t; @@ -179,6 +180,13 @@ architecture rtl of dcache is return pa; end; + function andor(mask : std_ulogic; in1 : std_ulogic_vector(7 downto 0); + in2 : std_ulogic_vector(7 downto 0)) return std_ulogic_vector is + variable t : std_ulogic_vector(7 downto 0) := (others => mask); + begin + return in2 or (in1 and t); + end; + constant real_mode_perm_attr : perm_attr_t := (nocache => '0', others => '1'); -- Cache state machine @@ -401,6 +409,7 @@ architecture rtl of dcache is -- Async signals on incoming request signal req_index : index_t; signal req_hit_way : way_t; + signal req_hit_ways : way_expand_t; signal req_is_hit : std_ulogic; signal req_tag : cache_tag_t; signal req_op_load_hit : std_ulogic; @@ -448,6 +457,7 @@ architecture rtl of dcache is signal tlb_read_valid : std_ulogic; signal tlb_hit : std_ulogic; signal tlb_hit_way : tlb_way_sig_t; + signal tlb_hit_expand : tlb_expand_t; signal pte : tlb_pte_t; signal ra : real_addr_t; signal valid_ra : std_ulogic; @@ -741,33 +751,34 @@ begin variable hitway : tlb_way_sig_t; variable hit : std_ulogic; variable eatag : tlb_tag_t; + variable hitpte : tlb_pte_t; begin tlb_req_index <= unsigned(r0.req.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ)); hitway := to_unsigned(0, TLB_WAY_BITS); hit := '0'; + hitpte := (others => '0'); + tlb_hit_expand <= (others => '0'); eatag := r0.req.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); for i in tlb_way_t loop - if tlb_read_valid = '1' and tlb_valid_way(i) = '1' and + if tlb_valid_way(i) = '1' and read_tlb_tag(i, tlb_tag_way) = eatag then hitway := to_unsigned(i, TLB_WAY_BITS); - hit := '1'; + hit := tlb_read_valid; + hitpte := hitpte or read_tlb_pte(i, tlb_pte_way); + tlb_hit_expand(i) <= '1'; end if; end loop; tlb_hit <= hit and r0_valid; tlb_hit_way <= hitway; - if tlb_hit = '1' then - pte <= read_tlb_pte(to_integer(hitway), tlb_pte_way); - else - pte <= (others => '0'); - end if; + pte <= hitpte; valid_ra <= tlb_hit or not r0.req.virt_mode; tlb_miss <= r0_valid and r0.req.virt_mode and not tlb_hit; if r0.req.virt_mode = 
'1' then - ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & + ra <= hitpte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & r0.req.addr(TLB_LG_PGSZ - 1 downto ROW_OFF_BITS) & (ROW_OFF_BITS-1 downto 0 => '0'); - perm_attr <= extract_perm_attr(pte); + perm_attr <= extract_perm_attr(hitpte); else ra <= r0.req.addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) & (ROW_OFF_BITS-1 downto 0 => '0'); @@ -793,11 +804,13 @@ begin dtlb_valids(i) <= (others => '0'); end loop; elsif tlbie = '1' then - if tlb_hit = '1' then - assert not is_X(tlb_req_index); - assert not is_X(tlb_hit_way); - dtlb_valids(to_integer(tlb_req_index))(to_integer(tlb_hit_way)) <= '0'; - end if; + for i in tlb_way_t loop + if tlb_hit_expand(i) = '1' then + assert not is_X(tlb_req_index); + assert not is_X(tlb_hit_way); + dtlb_valids(to_integer(tlb_req_index))(i) <= '0'; + end if; + end loop; elsif tlbwe = '1' then assert not is_X(tlb_req_index); repl_way := to_unsigned(0, TLB_WAY_BITS); @@ -941,19 +954,15 @@ begin variable rindex : index_t; variable is_hit : std_ulogic; variable hit_way : way_t; + variable hit_ways : way_expand_t; variable go : std_ulogic; variable nc : std_ulogic; variable s_hit : std_ulogic; variable s_tag : cache_tag_t; variable s_pte : tlb_pte_t; variable s_ra : real_addr_t; - variable hit_set : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); - variable hit_way_set : hit_way_set_t; - variable rel_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); variable rel_match : std_ulogic; - variable fwd_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); variable fwd_match : std_ulogic; - variable snp_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); variable snoop_match : std_ulogic; variable hit_reload : std_ulogic; variable dawr_match : std_ulogic; @@ -982,51 +991,44 @@ begin -- we compare each way with each of the real addresses from each way of -- the TLB, and then decide later which match to use. 
hit_way := to_unsigned(0, WAY_BITS); + hit_ways := (others => '0'); is_hit := '0'; rel_match := '0'; fwd_match := '0'; snoop_match := '0'; if r0.req.virt_mode = '1' then - rel_matches := (others => '0'); - fwd_matches := (others => '0'); - snp_matches := (others => '0'); for j in tlb_way_t loop - hit_way_set(j) := to_unsigned(0, WAY_BITS); - s_hit := '0'; - s_pte := read_tlb_pte(j, tlb_pte_way); - s_ra := s_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & - r0.req.addr(TLB_LG_PGSZ - 1 downto 0); - s_tag := get_tag(s_ra); - if go = '1' then + if tlb_valid_way(j) = '1' then + s_hit := '0'; + s_pte := read_tlb_pte(j, tlb_pte_way); + s_ra := s_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & + r0.req.addr(TLB_LG_PGSZ - 1 downto 0); + s_tag := get_tag(s_ra); assert not is_X(s_tag); - end if; - for i in 0 to NUM_WAYS-1 loop - if go = '1' and cache_valids(to_integer(rindex))(i) = '1' and - read_tag(i, cache_tag_set) = s_tag and - tlb_valid_way(j) = '1' then - hit_way_set(j) := to_unsigned(i, WAY_BITS); - s_hit := '1'; - if snoop_hits(i) = '1' then - snp_matches(j) := '1'; + for i in 0 to NUM_WAYS-1 loop + if cache_valids(to_integer(rindex))(i) = '1' and + read_tag(i, cache_tag_set) = s_tag and + tlb_hit_expand(j) = '1' then + hit_ways(i) := '1'; + hit_way := to_unsigned(i, WAY_BITS); + if go = '1' then + is_hit := '1'; + if snoop_hits(i) = '1' then + snoop_match := '1'; + end if; + end if; + end if; + end loop; + if go = '1' and tlb_hit_expand(j) = '1' then + if not is_X(r1.reload_tag) and s_tag = r1.reload_tag then + rel_match := '1'; + end if; + if s_tag = r1.forward_tag then + fwd_match := '1'; end if; end if; - end loop; - hit_set(j) := s_hit; - if go = '1' and not is_X(r1.reload_tag) and s_tag = r1.reload_tag then - rel_matches(j) := '1'; - end if; - if go = '1' and s_tag = r1.forward_tag then - fwd_matches(j) := '1'; end if; end loop; - if tlb_hit = '1' and go = '1' then - assert not is_X(tlb_hit_way); - is_hit := hit_set(to_integer(tlb_hit_way)); - hit_way := hit_way_set(to_integer(tlb_hit_way)); - rel_match := rel_matches(to_integer(tlb_hit_way)); - fwd_match := fwd_matches(to_integer(tlb_hit_way)); - snoop_match := snp_matches(to_integer(tlb_hit_way)); - end if; else s_tag := get_tag(r0.req.addr); if go = '1' then @@ -1035,6 +1037,7 @@ begin for i in 0 to NUM_WAYS-1 loop if go = '1' and cache_valids(to_integer(rindex))(i) = '1' and read_tag(i, cache_tag_set) = s_tag then + hit_ways(i) := '1'; hit_way := to_unsigned(i, WAY_BITS); is_hit := '1'; if snoop_hits(i) = '1' then @@ -1121,6 +1124,8 @@ begin r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) or use_forward_rl; hit_way := replace_way; + hit_ways := (others => '0'); + hit_ways(to_integer(replace_way)) := '1'; hit_reload := is_hit; elsif r0.req.load = '1' and r0.req.atomic_qw = '1' and r0.req.atomic_first = '0' and r0.req.nc = '0' and perm_attr.nocache = '0' and r1.prev_hit = '1' then @@ -1132,10 +1137,13 @@ begin -- NB lq to noncacheable isn't required to be atomic per the ISA. 
is_hit := '1'; hit_way := r1.prev_way; + hit_ways := (others => '0'); + hit_ways(to_integer(r1.prev_way)) := '1'; end if; -- The way that matched on a hit req_hit_way <= hit_way; + req_hit_ways <= hit_ways; req_is_hit <= is_hit; req_hit_reload <= hit_reload; @@ -1357,6 +1365,7 @@ begin variable j : integer; variable sel : std_ulogic_vector(1 downto 0); variable data_out : std_ulogic_vector(63 downto 0); + variable byte_out : std_ulogic_vector(7 downto 0); begin if rising_edge(clk) then if r0_valid = '1' then @@ -1386,11 +1395,12 @@ begin when "10" => data_out(j + 7 downto j) := r1.forward_data(j + 7 downto j); when others => - if is_X(req_hit_way) then - data_out(j + 7 downto j) := (others => 'X'); - else - data_out(j + 7 downto j) := cache_out(to_integer(req_hit_way))(j + 7 downto j); - end if; + byte_out := (others => '0'); + for w in 0 to NUM_WAYS-1 loop + byte_out := andor(req_hit_ways(w), cache_out(w)(j + 7 downto j), + byte_out); + end loop; + data_out(j + 7 downto j) := byte_out; end case; end loop; r1.data_out <= data_out; From 2529bb66ad01c8758b39910c72b52b60d472907c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 10 Mar 2025 15:00:55 +1100 Subject: [PATCH 2/8] dcache: Implement dcbz to non-cacheable memory properly A dcbz operation to memory that is mapped as non-cacheable in the page tables doesn't cause an alignment interrupt, but neither was it implemented properly in the dcache. It does do 8 writes to memory but it also creates a zero-filled line in the cache. This fixes it so that dcbz to memory mapped non-cacheable doesn't write the cache tag or set any line valid. We now have r1.reloading which is 1 only in RELOAD_WAIT_ACK state, but only if the memory is cacheable and therefore the cache should be updated (i.e. it is zero in RELOAD_WAIT_ACK state if we are doing a non-cacheable dcbz). We can now also remove the code in loadstore1 that checks for non-cacheable dcbz, which only triggered when doing dcbz in real mode to an address in the Cxxxxxxx range. Also remove some unused variables and signals. Signed-off-by: Paul Mackerras --- dcache.vhdl | 25 ++++++++++++++----------- loadstore1.vhdl | 1 - 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 566389c..1e7fff1 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -367,6 +367,7 @@ architecture rtl of dcache is write_tag : std_ulogic; slow_valid : std_ulogic; wb : wishbone_master_out; + reloading : std_ulogic; reload_tag : cache_tag_t; store_way : way_t; store_row : row_t; @@ -431,7 +432,6 @@ architecture rtl of dcache is signal r0_valid : std_ulogic; signal r0_stall : std_ulogic; - signal fwd_same_tag : std_ulogic; signal use_forward_st : std_ulogic; signal use_forward_rl : std_ulogic; signal use_forward2 : std_ulogic; @@ -957,7 +957,6 @@ begin variable hit_ways : way_expand_t; variable go : std_ulogic; variable nc : std_ulogic; - variable s_hit : std_ulogic; variable s_tag : cache_tag_t; variable s_pte : tlb_pte_t; variable s_ra : real_addr_t; @@ -999,7 +998,6 @@ begin if r0.req.virt_mode = '1' then for j in tlb_way_t loop if tlb_valid_way(j) = '1' then - s_hit := '0'; s_pte := read_tlb_pte(j, tlb_pte_way); s_ra := s_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & r0.req.addr(TLB_LG_PGSZ - 1 downto 0); @@ -1053,7 +1051,6 @@ begin end if; end if; req_same_tag <= rel_match; - fwd_same_tag <= fwd_match; -- This is 1 if the snooped write from the previous cycle hits the same -- cache line that is being accessed in this cycle. 
@@ -1072,7 +1069,7 @@ begin if rel_match = '1' and r1.store_row = req_row then -- Use the forwarding path if this cycle is a write to this row use_forward_st <= r1.write_bram; - if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then + if r1.reloading = '1' and wishbone_in.ack = '1' then use_forward_rl <= '1'; end if; end if; @@ -1105,12 +1102,12 @@ begin end if; -- See if the request matches the line currently being reloaded - if r1.state = RELOAD_WAIT_ACK and rel_match = '1' then + if r1.reloading = '1' and rel_match = '1' then assert not is_X(rindex); assert not is_X(r1.store_index); end if; hit_reload := '0'; - if r1.state = RELOAD_WAIT_ACK and rel_match = '1' and + if r1.reloading = '1' and rel_match = '1' and rindex = r1.store_index then -- Ignore is_hit from above, because a load miss writes the new tag -- but doesn't clear the valid bit on the line before refilling it. @@ -1347,7 +1344,7 @@ begin wr_sel_m <= (others => '0'); if r1.write_bram = '1' or - (r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1') then + (r1.reloading = '1' and wishbone_in.ack = '1') then assert not is_X(replace_way); if to_unsigned(i, WAY_BITS) = replace_way then wr_sel_m <= ram_wr_select; @@ -1410,7 +1407,7 @@ begin r1.forward_row <= r1.store_row; r1.forward_sel <= ram_wr_select; r1.forward_valid <= r1.write_bram; - if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then + if r1.reloading = '1' and wishbone_in.ack = '1' then r1.forward_valid <= '1'; end if; @@ -1476,6 +1473,7 @@ begin r1.wb.stb <= '0'; r1.ls_valid <= '0'; r1.mmu_done <= '0'; + r1.reloading <= '0'; r1.acks_pending <= to_unsigned(0, 3); r1.stalled <= '0'; r1.dec_acks <= '0'; @@ -1654,6 +1652,7 @@ begin if req.nc = '0' then -- Track that we had one request sent r1.state <= RELOAD_WAIT_ACK; + r1.reloading <= '1'; r1.write_tag <= '1'; ev.load_miss <= '1'; @@ -1690,7 +1689,8 @@ begin -- dcbz is handled much like a load miss except -- that we are writing to memory instead of reading r1.state <= RELOAD_WAIT_ACK; - r1.write_tag <= not req.is_hit; + r1.reloading <= not req.nc; + r1.write_tag <= not req.nc and not req.is_hit; r1.wb.we <= '1'; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; @@ -1763,7 +1763,10 @@ begin -- Cache line is now valid assert not is_X(r1.store_index); assert not is_X(r1.store_way); - cache_valids(to_integer(r1.store_index))(to_integer(r1.store_way)) <= '1'; + if r1.reloading = '1' then + cache_valids(to_integer(r1.store_index))(to_integer(r1.store_way)) <= '1'; + end if; + r1.reloading <= '0'; ev.dcache_refill <= not r1.dcbz; -- Second half of a lq/lqarx can assume a hit on this line now diff --git a/loadstore1.vhdl b/loadstore1.vhdl index e3bd558..a274d0f 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -695,7 +695,6 @@ begin v.flush := '1'; when OP_DCBZ => v.dcbz := '1'; - v.align_intr := v.nc; when OP_TLBIE => v.tlbie := '1'; v.is_slbia := l_in.insn(7); From 9645ab6e1fe63b7897b3c3a99c0fc7ef2b909d8d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 11 Mar 2025 20:25:10 +1100 Subject: [PATCH 3/8] dcache: Rework forwarding and same-page logic This gets rid of some largish comparators in the dcache_request process by matching index and way that hit in the cache tags instead of comparing tag values. That is, some tag comparisons can be replaced by seeing if both tags hit in the same cache way. When reloading a cache line, we now set it valid at the beginning of the reload, so that we get hits to compare. 
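Roughly, condensed from the old and new code in the diff below, a
tag-value comparison such as

    if s_tag = r1.reload_tag then
        rel_match := '1';
    end if;

becomes a match on the index plus the one-hot mask of the way being
reloaded:

    -- r1.store_ways is the one-hot expansion of the way under reload
    if r1.reloading = '1' and rindex = r1.store_index then
        idx_reload := r1.store_ways;
    end if;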
While the reload is still occurring, accesses to doublewords that haven't yet been read are indicated with req_is_hit = 0 and req_hit_reload = 1 (i.e. are considered to be a miss, at least for now). For the comparison of whether a subsequent access is to the same page as stores already being performed, in virtual mode (TLB being used) we now compare the way and index of the hit in the TLB, and in real mode we compare the effective address. If any new entry has been loaded into the TLB since the access we're comparing against, then it is considered to be a different page. Signed-off-by: Paul Mackerras --- dcache.vhdl | 293 ++++++++++++++++++++++++++++------------------------ 1 file changed, 157 insertions(+), 136 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 1e7fff1..af9bb0f 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -304,27 +304,32 @@ architecture rtl of dcache is signal r0_full : std_ulogic; type mem_access_request_t is record - op_lmiss : std_ulogic; - op_store : std_ulogic; - op_flush : std_ulogic; - op_sync : std_ulogic; - nc : std_ulogic; - valid : std_ulogic; - dcbz : std_ulogic; - flush : std_ulogic; - touch : std_ulogic; - sync : std_ulogic; - reserve : std_ulogic; - first_dw : std_ulogic; - last_dw : std_ulogic; - real_addr : real_addr_t; - data : std_ulogic_vector(63 downto 0); - byte_sel : std_ulogic_vector(7 downto 0); - is_hit : std_ulogic; - hit_way : way_t; - same_tag : std_ulogic; - mmu_req : std_ulogic; - dawr_m : std_ulogic; + op_lmiss : std_ulogic; + op_store : std_ulogic; + op_flush : std_ulogic; + op_sync : std_ulogic; + nc : std_ulogic; + valid : std_ulogic; + dcbz : std_ulogic; + flush : std_ulogic; + touch : std_ulogic; + sync : std_ulogic; + reserve : std_ulogic; + first_dw : std_ulogic; + last_dw : std_ulogic; + real_addr : real_addr_t; + data : std_ulogic_vector(63 downto 0); + byte_sel : std_ulogic_vector(7 downto 0); + is_hit : std_ulogic; + hit_way : way_t; + hit_ways : way_expand_t; + hit_reload : std_ulogic; + same_page : std_ulogic; + mmu_req : std_ulogic; + dawr_m : std_ulogic; + tlb_hit : std_ulogic; + tlb_index : tlb_index_sig_t; + tlb_way : tlb_way_sig_t; end record; -- First stage register, contains state for stage 1 of load hits @@ -344,6 +349,7 @@ architecture rtl of dcache is cache_hit : std_ulogic; prev_hit : std_ulogic; prev_way : way_t; + prev_hit_ways : way_expand_t; prev_hit_reload : std_ulogic; -- TLB hit state @@ -351,13 +357,16 @@ architecture rtl of dcache is tlb_hit_way : tlb_way_sig_t; tlb_hit_index : tlb_index_sig_t; tlb_victim : tlb_way_sig_t; + ls_tlb_hit : std_ulogic; + tlb_acc_index : tlb_index_sig_t; + tlb_acc_way : tlb_way_sig_t; -- data buffer for data forwarded from writes to reads forward_data : std_ulogic_vector(63 downto 0); - forward_tag : cache_tag_t; forward_sel : std_ulogic_vector(7 downto 0); forward_valid : std_ulogic; forward_row : row_t; + forward_way : way_t; data_out : std_ulogic_vector(63 downto 0); -- Cache miss state (reload state machine) @@ -370,6 +379,7 @@ architecture rtl of dcache is reloading : std_ulogic; reload_tag : cache_tag_t; store_way : way_t; + store_ways : way_expand_t; store_row : row_t; store_index : index_t; end_row_ix : row_in_line_t; @@ -421,7 +431,7 @@ architecture rtl of dcache is signal req_op_bad : std_ulogic; signal req_op_nop : std_ulogic; signal req_data : std_ulogic_vector(63 downto 0); - signal req_same_tag : std_ulogic; + signal req_same_page : std_ulogic; signal req_go : std_ulogic; signal req_nc : std_ulogic; signal req_hit_reload : std_ulogic; @@ -774,15 +784,27 @@ 
begin pte <= hitpte; valid_ra <= tlb_hit or not r0.req.virt_mode; tlb_miss <= r0_valid and r0.req.virt_mode and not tlb_hit; + + -- extract real address, permissions, attributes + -- also detect whether this access is to the same page as the previous one + req_same_page <= '0'; if r0.req.virt_mode = '1' then ra <= hitpte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & r0.req.addr(TLB_LG_PGSZ - 1 downto ROW_OFF_BITS) & (ROW_OFF_BITS-1 downto 0 => '0'); perm_attr <= extract_perm_attr(hitpte); + if tlb_read_valid = '1' and r1.state = STORE_WAIT_ACK and r1.ls_tlb_hit = '1' and + tlb_req_index = r1.tlb_acc_index and hitway = r1.tlb_acc_way then + req_same_page <= '1'; + end if; else ra <= r0.req.addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) & (ROW_OFF_BITS-1 downto 0 => '0'); perm_attr <= real_mode_perm_attr; + if r0.req.addr(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) = + wb_to_addr(r1.wb.adr)(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) then + req_same_page <= '1'; + end if; end if; end process; @@ -807,7 +829,6 @@ begin for i in tlb_way_t loop if tlb_hit_expand(i) = '1' then assert not is_X(tlb_req_index); - assert not is_X(tlb_hit_way); dtlb_valids(to_integer(tlb_req_index))(i) <= '0'; end if; end loop; @@ -965,6 +986,11 @@ begin variable snoop_match : std_ulogic; variable hit_reload : std_ulogic; variable dawr_match : std_ulogic; + variable idx_reload : way_expand_t; + variable maybe_fwd_rl : way_expand_t; + variable maybe_fwd_st : way_expand_t; + variable maybe_fwd2 : way_expand_t; + variable wr_row_match : std_ulogic; begin -- Extract line, row and tag from request rindex := get_index(r0.req.addr); @@ -978,23 +1004,65 @@ begin end if; go := r0_valid and not (r0.tlbie or r0.tlbld) and not r1.ls_error; - if is_X(r0.req.addr) then + if is_X(ra) then go := '0'; end if; - if go = '1' then - assert not is_X(r1.forward_tag); + + -- See if the request matches the line currently being reloaded + if go = '1' and r1.reloading = '1' then + assert not is_X(r1.store_index); + assert not is_X(r1.store_row); + assert not is_X(r1.store_way); + end if; + wr_row_match := '0'; + if go = '1' and req_row = r1.store_row then + wr_row_match := '1'; + end if; + idx_reload := (others => '0'); + maybe_fwd_rl := (others => '0'); + if go = '1' and r1.reloading = '1' and rindex = r1.store_index then + -- Way r1.store_way at this index is currently being reloaded. + -- If we detect that this way is the one that hits below, + -- and this is a load, then this is a hit only if r1.rows_valid() + -- is true, or if the data currently arriving on the wishbone is + -- the row we want. 
+ if wr_row_match = '1' and wishbone_in.ack = '1' then + maybe_fwd_rl := r1.store_ways; + elsif r0.req.load = '1' and r0.req.touch = '0' and + r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) = '0' then + idx_reload := r1.store_ways; + end if; + end if; + + -- See if request matches the location being stored in this cycle + maybe_fwd_st := (others => '0'); + if wr_row_match = '1' and r1.write_bram = '1' then + maybe_fwd_st := r1.store_ways; + end if; + + -- See if request matches the location stored to in the previous cycle + maybe_fwd2 := (others => '0'); + if go = '1' and r1.forward_valid = '1' and req_row = r1.forward_row then + assert not is_X(r1.forward_way); + maybe_fwd2(to_integer(r1.forward_way)) := '1'; + end if; + + hit_ways := (others => '0'); + if r0.req.load = '1' and r0.req.atomic_qw = '1' and r0.req.atomic_first = '0' then + -- For the second half of an atomic quadword load, just use the + -- same way as the first half, without considering whether the line + -- is valid; it is as if we had read the second dword at the same + -- time as the first dword, and the line was valid back then. + -- If the line is currently being reloaded and the doubleword we want + -- hasn't come yet, then idx_reload() will be 1 and we treat this + -- as a miss in order to wait for it. + hit_ways := r1.prev_hit_ways; end if; -- Test if pending request is a hit on any way -- In order to make timing in virtual mode, when we are using the TLB, -- we compare each way with each of the real addresses from each way of -- the TLB, and then decide later which match to use. - hit_way := to_unsigned(0, WAY_BITS); - hit_ways := (others => '0'); - is_hit := '0'; - rel_match := '0'; - fwd_match := '0'; - snoop_match := '0'; if r0.req.virt_mode = '1' then for j in tlb_way_t loop if tlb_valid_way(j) = '1' then @@ -1008,23 +1076,8 @@ begin read_tag(i, cache_tag_set) = s_tag and tlb_hit_expand(j) = '1' then hit_ways(i) := '1'; - hit_way := to_unsigned(i, WAY_BITS); - if go = '1' then - is_hit := '1'; - if snoop_hits(i) = '1' then - snoop_match := '1'; - end if; - end if; end if; end loop; - if go = '1' and tlb_hit_expand(j) = '1' then - if not is_X(r1.reload_tag) and s_tag = r1.reload_tag then - rel_match := '1'; - end if; - if s_tag = r1.forward_tag then - fwd_match := '1'; - end if; - end if; end if; end loop; else @@ -1036,54 +1089,33 @@ begin if go = '1' and cache_valids(to_integer(rindex))(i) = '1' and read_tag(i, cache_tag_set) = s_tag then hit_ways(i) := '1'; - hit_way := to_unsigned(i, WAY_BITS); - is_hit := '1'; - if snoop_hits(i) = '1' then - snoop_match := '1'; - end if; end if; end loop; - if go = '1' and not is_X(r1.reload_tag) and s_tag = r1.reload_tag then - rel_match := '1'; - end if; - if go = '1' and s_tag = r1.forward_tag then - fwd_match := '1'; - end if; end if; - req_same_tag <= rel_match; + + hit_way := to_unsigned(0, WAY_BITS); + is_hit := '0'; + hit_reload := '0'; + for i in 0 to NUM_WAYS-1 loop + if hit_ways(i) = '1' then + hit_way := to_unsigned(i, WAY_BITS); + is_hit := go and not idx_reload(i); + hit_reload := go and idx_reload(i); + end if; + end loop; -- This is 1 if the snooped write from the previous cycle hits the same -- cache line that is being accessed in this cycle. 
req_snoop_hit <= '0'; - if go = '1' and snoop_match = '1' and get_index(snoop_paddr) = rindex then - req_snoop_hit <= '1'; + if go = '1' and get_index(snoop_paddr) = rindex then + -- (ignore idx_reload here since snooped writes can't happen while we're reloading) + req_snoop_hit <= or (snoop_hits and hit_ways); end if; -- Whether to use forwarded data for a load or not - use_forward_st <= '0'; - use_forward_rl <= '0'; - if rel_match = '1' then - assert not is_X(r1.store_row); - assert not is_X(req_row); - end if; - if rel_match = '1' and r1.store_row = req_row then - -- Use the forwarding path if this cycle is a write to this row - use_forward_st <= r1.write_bram; - if r1.reloading = '1' and wishbone_in.ack = '1' then - use_forward_rl <= '1'; - end if; - end if; - use_forward2 <= '0'; - if fwd_match = '1' then - assert not is_X(r1.forward_row); - if is_X(req_row) then - report "req_row=" & to_hstring(req_row) & " addr=" & to_hstring(r0.req.addr) & " go=" & std_ulogic'image(go); - end if; - assert not is_X(req_row); - end if; - if fwd_match = '1' and r1.forward_row = req_row then - use_forward2 <= r1.forward_valid; - end if; + use_forward_rl <= or (hit_ways and maybe_fwd_rl); + use_forward_st <= or (hit_ways and maybe_fwd_st); + use_forward2 <= or (hit_ways and maybe_fwd2); -- The way to replace on a miss replace_way <= to_unsigned(0, WAY_BITS); @@ -1101,42 +1133,8 @@ begin end if; end if; - -- See if the request matches the line currently being reloaded - if r1.reloading = '1' and rel_match = '1' then - assert not is_X(rindex); - assert not is_X(r1.store_index); - end if; - hit_reload := '0'; - if r1.reloading = '1' and rel_match = '1' and - rindex = r1.store_index then - -- Ignore is_hit from above, because a load miss writes the new tag - -- but doesn't clear the valid bit on the line before refilling it. - -- For a store, consider this a hit even if the row isn't valid - -- since it will be by the time we perform the store. - -- For a load, check the appropriate row valid bit; but also, - -- if use_forward_rl is 1 then we can consider this a hit. - -- For a touch, since the line we want is being reloaded already, - -- consider this a hit. - is_hit := not r0.req.load or r0.req.touch or - r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) or - use_forward_rl; - hit_way := replace_way; - hit_ways := (others => '0'); - hit_ways(to_integer(replace_way)) := '1'; - hit_reload := is_hit; - elsif r0.req.load = '1' and r0.req.atomic_qw = '1' and r0.req.atomic_first = '0' and - r0.req.nc = '0' and perm_attr.nocache = '0' and r1.prev_hit = '1' then - -- For the second half of an atomic quadword load, just use the - -- same way as the first half, without considering whether the line - -- is valid; it is as if we had read the second dword at the same - -- time as the first dword, and the line was valid back then. - -- (Cases where the line is currently being reloaded are handled above.) - -- NB lq to noncacheable isn't required to be atomic per the ISA. - is_hit := '1'; - hit_way := r1.prev_way; - hit_ways := (others => '0'); - hit_ways(to_integer(r1.prev_way)) := '1'; - end if; + req_go <= go; + req_nc <= nc; -- The way that matched on a hit req_hit_way <= hit_way; @@ -1191,8 +1189,6 @@ begin req_op_load_miss <= not is_hit; -- includes non-cacheable loads end if; end if; - req_go <= go; - req_nc <= nc; -- Version of the row number that is valid one cycle earlier -- in the cases where we need to read the cache data BRAM. 
@@ -1403,9 +1399,9 @@ begin r1.data_out <= data_out; r1.forward_data <= ram_wr_data; - r1.forward_tag <= r1.reload_tag; r1.forward_row <= r1.store_row; r1.forward_sel <= ram_wr_select; + r1.forward_way <= replace_way; r1.forward_valid <= r1.write_bram; if r1.reloading = '1' and wishbone_in.ack = '1' then r1.forward_valid <= '1'; @@ -1479,6 +1475,7 @@ begin r1.dec_acks <= '0'; r1.prev_hit <= '0'; r1.prev_hit_reload <= '0'; + r1.prev_hit_ways <= (others => '0'); reservation.valid <= '0'; reservation.addr <= (others => '0'); @@ -1529,7 +1526,16 @@ begin end if; end loop; r1.store_way <= replace_way; + r1.store_ways <= (others => '0'); + r1.store_ways(to_integer(replace_way)) <= '1'; r1.write_tag <= '0'; + -- Set the line valid now. While the line is being + -- reloaded, the hit detection logic will use r1.rows_valid + -- to determine hits on this line. + cache_valids(to_integer(r1.store_index))(to_integer(replace_way)) <= '1'; + -- record which way was used, for possible 2nd half of lqarx + r1.prev_hit_ways <= (others => '0'); + r1.prev_hit_ways(to_integer(replace_way)) <= '1'; end if; -- Take request from r1.req if there is one there, @@ -1552,6 +1558,9 @@ begin req.first_dw := not r0.req.atomic_qw or r0.req.atomic_first; req.last_dw := not r0.req.atomic_qw or r0.req.atomic_last; req.real_addr := ra; + req.tlb_hit := tlb_hit; + req.tlb_index := tlb_req_index; + req.tlb_way := tlb_hit_way; -- Force data to 0 for dcbz if r0.req.dcbz = '1' then req.data := (others => '0'); @@ -1567,8 +1576,10 @@ begin req.byte_sel := r0.req.byte_sel; end if; req.hit_way := req_hit_way; + req.hit_ways := req_hit_ways; req.is_hit := req_is_hit; - req.same_tag := req_same_tag; + req.hit_reload := req_hit_reload; + req.same_page := req_same_page; -- Store the incoming request from r0, if it is a slow request -- Note that r1.full = 1 implies none of the req_op_* are 1. @@ -1579,6 +1590,9 @@ begin r1.full <= req_op_load_miss or req_op_store or req_op_flush or req_op_sync; end if; end if; + if r0_valid = '1' and r0.tlbld = '1' then + r1.ls_tlb_hit <= '0'; + end if; -- Signals for PLRU update and victim selection r1.hit_way <= req_hit_way; @@ -1594,6 +1608,7 @@ begin if req_go = '1' then r1.prev_hit <= req_is_hit; r1.prev_way <= req_hit_way; + r1.prev_hit_ways <= req_hit_ways; r1.prev_hit_reload <= req_hit_reload; end if; @@ -1624,10 +1639,14 @@ begin r1.store_row <= get_row(req.real_addr); r1.end_row_ix <= get_row_of_line(get_row(req.real_addr)) - 1; r1.reload_tag <= get_tag(req.real_addr); - r1.req.same_tag <= '1'; + r1.req.hit_reload <= '1'; + r1.ls_tlb_hit <= req.tlb_hit and not req.mmu_req; + r1.tlb_acc_index <= req.tlb_index; + r1.tlb_acc_way <= req.tlb_way; if req.is_hit = '1' then r1.store_way <= req.hit_way; + r1.store_ways <= req.hit_ways; end if; -- Reset per-row valid bits, ready for handling the next load miss @@ -1737,9 +1756,12 @@ begin assert not is_X(r1.store_row); assert not is_X(r1.req.real_addr); end if; - if r1.full = '1' and r1.req.same_tag = '1' and - ((r1.dcbz = '1' and r1.req.dcbz = '1') or r1.req.op_lmiss = '1') and - r1.store_row = get_row(r1.req.real_addr) then + -- r1.req.hit_reload is always 1 for the request that + -- started this reload, and otherwise always 0 for dcbz + -- (since it is considered a store). 
+ if r1.full = '1' and r1.req.hit_reload = '1' and + get_row_of_line(r1.store_row) = + get_row_of_line(get_row(r1.req.real_addr)) then r1.full <= '0'; r1.slow_valid <= '1'; if r1.mmu_req = '0' then @@ -1763,9 +1785,6 @@ begin -- Cache line is now valid assert not is_X(r1.store_index); assert not is_X(r1.store_way); - if r1.reloading = '1' then - cache_valids(to_integer(r1.store_index))(to_integer(r1.store_way)) <= '1'; - end if; r1.reloading <= '0'; ev.dcache_refill <= not r1.dcbz; @@ -1773,6 +1792,7 @@ begin -- if the first half hit this line. r1.prev_hit <= r1.prev_hit_reload; r1.prev_way <= r1.store_way; + r1.prev_hit_ways <= r1.store_ways; r1.state <= IDLE; end if; @@ -1791,19 +1811,20 @@ begin -- DO_STCX state, unless they are the second half of a -- successful stqcx, which is handled here. if req.valid = '1' then - r1.wb.adr(SET_SIZE_BITS - ROW_OFF_BITS - 1 downto 0) <= - req.real_addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS); + r1.wb.adr(TLB_LG_PGSZ - ROW_OFF_BITS - 1 downto 0) <= + req.real_addr(TLB_LG_PGSZ - 1 downto ROW_OFF_BITS); r1.wb.dat <= req.data; r1.wb.sel <= req.byte_sel; end if; assert not is_X(acks); r1.wb.stb <= '0'; - if req.op_store = '1' and req.same_tag = '1' and req.dcbz = '0' and + if req.op_store = '1' and req.same_page = '1' and req.dcbz = '0' and (req.reserve = '0' or r1.atomic_more = '1') then if acks < 7 then r1.wb.stb <= '1'; stbs_done := false; r1.store_way <= req.hit_way; + r1.store_ways <= req.hit_ways; r1.store_row <= get_row(req.real_addr); r1.write_bram <= req.is_hit; r1.atomic_more <= not req.last_dw; From 26507450b798dc6bc554d4f9247655015230aa06 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 12 Mar 2025 10:58:43 +1100 Subject: [PATCH 4/8] dcache: Remove reset on read port of cache tag RAM The reset was added originally to reduce metavalue warnings in simulation, is not necessary for correct operation, and showed up as a critical path in synthesis for the Xilinx Artix-7. Remove it when doing synthesis; for simulation we set the value read to X rather than 0 in order to catch any use of the previously reset value. Signed-off-by: Paul Mackerras --- core.vhdl | 1 + dcache.vhdl | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/core.vhdl b/core.vhdl index c94db6f..d4efcf3 100644 --- a/core.vhdl +++ b/core.vhdl @@ -468,6 +468,7 @@ begin dcache_0: entity work.dcache generic map( + SIM => SIM, LINE_SIZE => 64, NUM_LINES => DCACHE_NUM_LINES, NUM_WAYS => DCACHE_NUM_WAYS, diff --git a/dcache.vhdl b/dcache.vhdl index af9bb0f..a98dde2 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -14,6 +14,7 @@ use work.wishbone_types.all; entity dcache is generic ( + SIM : boolean := false; -- Line size in bytes LINE_SIZE : positive := 64; -- Number of lines in a set @@ -922,10 +923,10 @@ begin index := get_index(d_in.addr); valid := d_in.valid; end if; - if valid = '1' then + if valid = '1' or not SIM then cache_tag_set <= cache_tags(to_integer(index)); else - cache_tag_set <= (others => '0'); + cache_tag_set <= (others => 'X'); end if; end if; end process; From 4278387b21af84ed2bbdc890ea0bf7c7144e0c7d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 12 Mar 2025 15:16:34 +1100 Subject: [PATCH 5/8] dcache: Simplify reservation logic With some slight arrangement of the state machine in the dcache_slow process, we can remove one of the two comparators that detect writes by other entities to the reservation granule. The state machine now sets the wishbone cyc signal on the transition from IDLE to DO_STCX state. 
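In outline, the IDLE-state handling of a stcx becomes (condensed from
the diff below):

    if reservation.valid = '0' or kill_rsrv = '1' then
        -- someone else has stored to the reservation granule
        r1.stcx_fail <= '1';
    else
        r1.wb.we <= '1';
        r1.wb.cyc <= '1';
        -- stb is asserted a cycle (or more) later, in DO_STCX state
        r1.state <= DO_STCX;
    end if;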
Once we see the wishbone stall signal at 0, we consider we have the wishbone and we can assert stb to do the write provided that the stcx is to the reservation address and we haven't seen another write to the reservation granule. We keep the comparator that compares the snoop address delayed by one cycle, in order to make timing easier, and the one (or more) cycle delay between cyc and stb covers that one cycle delay in the kill_rsrv signal. Signed-off-by: Paul Mackerras --- dcache.vhdl | 49 ++++++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index a98dde2..cf72927 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -416,7 +416,6 @@ architecture rtl of dcache is signal reservation : reservation_t; signal kill_rsrv : std_ulogic; - signal kill_rsrv2 : std_ulogic; -- Async signals on incoming request signal req_index : index_t; @@ -936,9 +935,6 @@ begin snoop_addr <= addr_to_real(wb_to_addr(snoop_in.adr)); snoop_active <= snoop_in.cyc and snoop_in.stb and snoop_in.we and not (r1.wb.cyc and not wishbone_in.stall); - kill_rsrv <= '1' when (snoop_active = '1' and reservation.valid = '1' and - snoop_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) = reservation.addr) - else '0'; -- Cache tag RAM second read port, for snooping cache_tag_read_2 : process(clk) @@ -954,10 +950,9 @@ begin end if; end process; - -- Compare the previous cycle's snooped store address to the reservation, - -- to catch the case where a write happens on cycle 1 of a cached larx - kill_rsrv2 <= '1' when (snoop_valid = '1' and reservation.valid = '1' and - snoop_paddr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) = reservation.addr) + -- Compare the previous cycle's snooped store address to the reservation + kill_rsrv <= '1' when (snoop_valid = '1' and reservation.valid = '1' and + snoop_paddr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) = reservation.addr) else '0'; snoop_tag_match : process(all) @@ -1493,10 +1488,8 @@ begin r1.mmu_done <= (r0_valid and (r0.tlbie or r0.tlbld)) or (req_op_load_hit and r0.mmu_req); - -- The kill_rsrv2 term covers the case where the reservation - -- address was set at the beginning of this cycle, and a store - -- to that address happened in the previous cycle. - if kill_rsrv = '1' or kill_rsrv2 = '1' then + -- Clear the reservation if another entity writes to that line + if kill_rsrv = '1' then reservation.valid <= '0'; end if; if req_go = '1' and access_ok = '1' and r0.req.load = '1' and @@ -1689,9 +1682,18 @@ begin if req.op_store = '1' then if req.reserve = '1' then - -- stcx needs to wait until next cycle - -- for the reservation address check - r1.state <= DO_STCX; + if reservation.valid = '0' or kill_rsrv = '1' then + -- someone else has stored to the reservation granule + r1.stcx_fail <= '1'; + r1.full <= '0'; + r1.ls_valid <= '1'; + else + r1.wb.we <= '1'; + r1.wb.cyc <= '1'; + -- stcx needs to wait to assert stb until next cycle + -- for the reservation address check + r1.state <= DO_STCX; + end if; elsif req.dcbz = '0' then r1.state <= STORE_WAIT_ACK; r1.full <= '0'; @@ -1876,28 +1878,21 @@ begin if reservation.valid = '0' or kill_rsrv = '1' or r1.req.real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) /= reservation.addr then -- Wrong address, didn't have reservation, or lost reservation - -- Abandon the wishbone cycle if started and fail the stcx. + -- Abandon the wishbone cycle and fail the stcx. 
                     r1.stcx_fail <= '1';
                     r1.full <= '0';
                     r1.ls_valid <= '1';
                     r1.state <= IDLE;
                     r1.wb.cyc <= '0';
-                    r1.wb.stb <= '0';
                     reservation.valid <= '0';
                     -- If this is the first half of a stqcx., the second half
                     -- will fail also because the reservation is not valid.
                     r1.state <= IDLE;
-                elsif r1.wb.cyc = '0' then
-                    -- Right address and have reservation, so start the
-                    -- wishbone cycle
-                    r1.wb.we <= '1';
-                    r1.wb.cyc <= '1';
-                    r1.wb.stb <= '1';
-                elsif r1.wb.stb = '1' and wishbone_in.stall = '0' then
-                    -- Store has been accepted, so now we can write the
-                    -- cache data RAM and complete the request
+                elsif wishbone_in.stall = '0' then
+                    -- We have the wishbone, so now we can assert stb,
+                    -- write the cache data RAM and complete the request
                     r1.write_bram <= r1.req.is_hit;
-                    r1.wb.stb <= '0';
+                    r1.wb.stb <= '1';
                     r1.full <= '0';
                     r1.slow_valid <= '1';
                     r1.ls_valid <= '1';

From 5168242cd5424c1f6799ce4676086ef4bac784bb Mon Sep 17 00:00:00 2001
From: Paul Mackerras
Date: Thu, 13 Mar 2025 14:55:07 +1100
Subject: [PATCH 6/8] dcache: Rework forwarding data paths

This rearranges the multiplexing of cache read data with forwarded
store data with the aim of shortening the path from the req_hit_ways
signal to the r1.data_out register. The forwarding decisions are now
made for each way independently and the results are then combined
according to which way detected a cache hit.

Signed-off-by: Paul Mackerras
---
 dcache.vhdl | 82 ++++++++++++++++++++++++++---------------------------
 1 file changed, 40 insertions(+), 42 deletions(-)

diff --git a/dcache.vhdl b/dcache.vhdl
index cf72927..6409ab0 100644
--- a/dcache.vhdl
+++ b/dcache.vhdl
@@ -181,9 +181,9 @@ architecture rtl of dcache is
         return pa;
     end;
 
-    function andor(mask : std_ulogic; in1 : std_ulogic_vector(7 downto 0);
-                   in2 : std_ulogic_vector(7 downto 0)) return std_ulogic_vector is
-        variable t : std_ulogic_vector(7 downto 0) := (others => mask);
+    function andor(mask : std_ulogic; in1 : std_ulogic_vector(wishbone_data_bits-1 downto 0);
+                   in2 : std_ulogic_vector(wishbone_data_bits-1 downto 0)) return std_ulogic_vector is
+        variable t : std_ulogic_vector(wishbone_data_bits-1 downto 0) := (others => mask);
     begin
         return in2 or (in1 and t);
     end;
@@ -442,9 +442,9 @@ architecture rtl of dcache is
     signal r0_valid : std_ulogic;
     signal r0_stall : std_ulogic;
 
-    signal use_forward_st : std_ulogic;
-    signal use_forward_rl : std_ulogic;
-    signal use_forward2 : std_ulogic;
+    signal use_forward_st : way_expand_t;
+    signal use_forward_rl : way_expand_t;
+    signal use_forward2 : way_expand_t;
 
     -- Cache RAM interface
     type cache_ram_out_t is array(0 to NUM_WAYS-1) of cache_row_t;
@@ -1109,9 +1109,14 @@ begin
         end if;
 
         -- Whether to use forwarded data for a load or not
-        use_forward_rl <= or (hit_ways and maybe_fwd_rl);
-        use_forward_st <= or (hit_ways and maybe_fwd_st);
-        use_forward2 <= or (hit_ways and maybe_fwd2);
+        if r1.dcbz = '0' then
+            use_forward_rl <= maybe_fwd_rl;
+            use_forward_st <= maybe_fwd_st;
+        else
+            use_forward_rl <= (others => '0');
+            use_forward_st <= maybe_fwd_rl;
+        end if;
+        use_forward2 <= maybe_fwd2;
 
         -- The way to replace on a miss
         replace_way <= to_unsigned(0, WAY_BITS);
@@ -1319,11 +1324,28 @@ begin
                 wr_data => ram_wr_data
                 );
         process(all)
+            variable dword : cache_row_t;
+            variable j : integer;
        begin
            -- Cache hit reads
            do_read <= early_rd_valid;
            rd_addr <= std_ulogic_vector(early_req_row);
-           cache_out(i) <= dout;
+
+           -- Forward write data from this cycle or the previous
+           dword := (others => '0');
+           for b in 0 to ROW_SIZE - 1 loop
+               j := b * 8;
+               if use_forward_rl(i) =
'1' then + dword(j + 7 downto j) := wishbone_in.dat(j + 7 downto j); + elsif use_forward_st(i) = '1' and r1.req.byte_sel(b) = '1' then + dword(j + 7 downto j) := r1.req.data(j + 7 downto j); + elsif use_forward2(i) = '1' and r1.forward_sel(b) = '1' then + dword(j + 7 downto j) := r1.forward_data(j + 7 downto j); + else + dword(j + 7 downto j) := dout(j + 7 downto j); + end if; + end loop; + cache_out(i) <= dword; -- Write mux: -- @@ -1354,44 +1376,20 @@ begin variable j : integer; variable sel : std_ulogic_vector(1 downto 0); variable data_out : std_ulogic_vector(63 downto 0); - variable byte_out : std_ulogic_vector(7 downto 0); begin if rising_edge(clk) then if r0_valid = '1' then r1.mmu_req <= r0.mmu_req; end if; - -- Bypass/forwarding multiplexer for load data. - -- Use the bypass if are reading the row of BRAM that was written 0 or 1 - -- cycles ago, including for the slow_valid = 1 cases (i.e. completing a - -- load miss or a non-cacheable load), which are handled via the r1.full case. - for i in 0 to 7 loop - if r1.full = '1' or use_forward_rl = '1' then - sel := '0' & r1.dcbz; - elsif use_forward_st = '1' and r1.req.byte_sel(i) = '1' then - sel := "01"; - elsif use_forward2 = '1' and r1.forward_sel(i) = '1' then - sel := "10"; - else - sel := "11"; - end if; - j := i * 8; - case sel is - when "00" => - data_out(j + 7 downto j) := wishbone_in.dat(j + 7 downto j); - when "01" => - data_out(j + 7 downto j) := r1.req.data(j + 7 downto j); - when "10" => - data_out(j + 7 downto j) := r1.forward_data(j + 7 downto j); - when others => - byte_out := (others => '0'); - for w in 0 to NUM_WAYS-1 loop - byte_out := andor(req_hit_ways(w), cache_out(w)(j + 7 downto j), - byte_out); - end loop; - data_out(j + 7 downto j) := byte_out; - end case; - end loop; + data_out := (others => '0'); + if r1.full = '1' then + data_out := wishbone_in.dat; + else + for w in 0 to NUM_WAYS-1 loop + data_out := andor(req_hit_ways(w), cache_out(w), data_out); + end loop; + end if; r1.data_out <= data_out; r1.forward_data <= ram_wr_data; From c938246cc8f3c62e0b4cef358b5cb17a4d83a84e Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 5 Apr 2025 09:39:48 +1100 Subject: [PATCH 7/8] dcache: Simplify addressing of the dcache TLB Instead of having TLB invalidation and TLB load requests come through the dcache main path, these operations are now done in one cycle entirely based on signals from the MMU, and don't involve the TLB read path or the dcache state machine at all. So that we know which way of the TLB to affect for invalidations, loadstore1 now sends down a "TLB probe" operation for tlbie instructions which goes through the dcache pipeline and sets the r1.tlb_hit_* fields which are used in the subsequent invalidation operation from the MMU (if it is a single-page invalidation). TLB load operations write to the way identified by r1.victim_way, which was set on the TLB miss that triggered the TLB reload. Since we are writing just one way of the TLB tags now, rather than writing all ways with one way's value changed, we now pad each way to a multiple of 8 bits so that byte write-enables can be used to select which way gets written. 
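As a worked example (assuming the default geometry, TLB_LG_PGSZ = 12
and 64 TLB sets, so TLB_EA_TAG_BITS = 64 - 18 = 46), the padding in
the diff below rounds each way's tag up to a whole number of bytes:

    -- with the default geometry assumed above, 46 rounds up to 48 (6 bytes)
    constant TLB_EA_TAG_WIDTH : natural :=
        TLB_EA_TAG_BITS + 7 - ((TLB_EA_TAG_BITS + 7) mod 8);

so each way occupies its own byte lanes of the tag row and can be
written on its own, without a read-modify-write of the other ways.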
Signed-off-by: Paul Mackerras --- common.vhdl | 1 + dcache.vhdl | 129 +++++++++++++++++++++--------------------------- loadstore1.vhdl | 10 ++-- mmu.vhdl | 25 ++-------- 4 files changed, 67 insertions(+), 98 deletions(-) diff --git a/common.vhdl b/common.vhdl index 0207fe1..8c42caf 100644 --- a/common.vhdl +++ b/common.vhdl @@ -675,6 +675,7 @@ package common is atomic_last : std_ulogic; virt_mode : std_ulogic; priv_mode : std_ulogic; + tlb_probe : std_ulogic; addr : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); -- valid the cycle after .valid = 1 byte_sel : std_ulogic_vector(7 downto 0); diff --git a/dcache.vhdl b/dcache.vhdl index 6409ab0..fe9950b 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -135,7 +135,8 @@ architecture rtl of dcache is constant TLB_SET_BITS : natural := log2(TLB_SET_SIZE); constant TLB_WAY_BITS : natural := maximum(log2(TLB_NUM_WAYS), 1); constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_SET_BITS); - constant TLB_TAG_WAY_BITS : natural := TLB_NUM_WAYS * TLB_EA_TAG_BITS; + constant TLB_EA_TAG_WIDTH : natural := TLB_EA_TAG_BITS + 7 - ((TLB_EA_TAG_BITS + 7) mod 8); + constant TLB_TAG_WAY_BITS : natural := TLB_NUM_WAYS * TLB_EA_TAG_WIDTH; constant TLB_PTE_BITS : natural := 64; constant TLB_PTE_WAY_BITS : natural := TLB_NUM_WAYS * TLB_PTE_BITS; @@ -294,9 +295,6 @@ architecture rtl of dcache is -- Stage 0 register, basically contains just the latched request type reg_stage_0_t is record req : Loadstore1ToDcacheType; - tlbie : std_ulogic; -- indicates a tlbie request (from MMU) - doall : std_ulogic; -- with tlbie, indicates flush whole TLB - tlbld : std_ulogic; -- indicates a TLB load request (from MMU) mmu_req : std_ulogic; -- indicates source of request d_valid : std_ulogic; -- indicates req.data is valid now end record; @@ -356,6 +354,7 @@ architecture rtl of dcache is -- TLB hit state tlb_hit : std_ulogic; tlb_hit_way : tlb_way_sig_t; + tlb_hit_ways : tlb_expand_t; tlb_hit_index : tlb_index_sig_t; tlb_victim : tlb_way_sig_t; ls_tlb_hit : std_ulogic; @@ -566,19 +565,10 @@ architecture rtl of dcache is function read_tlb_tag(way: tlb_way_t; tags: tlb_way_tags_t) return tlb_tag_t is variable j : integer; begin - j := way * TLB_EA_TAG_BITS; + j := way * TLB_EA_TAG_WIDTH; return tags(j + TLB_EA_TAG_BITS - 1 downto j); end; - -- Write a TLB tag to a TLB tag memory row - procedure write_tlb_tag(way: tlb_way_t; tags: inout tlb_way_tags_t; - tag: tlb_tag_t) is - variable j : integer; - begin - j := way * TLB_EA_TAG_BITS; - tags(j + TLB_EA_TAG_BITS - 1 downto j) := tag; - end; - -- Read a PTE from a TLB PTE memory row function read_tlb_pte(way: tlb_way_t; ptes: tlb_way_ptes_t) return tlb_pte_t is variable j : integer; @@ -587,13 +577,6 @@ architecture rtl of dcache is return ptes(j + TLB_PTE_BITS - 1 downto j); end; - procedure write_tlb_pte(way: tlb_way_t; ptes: inout tlb_way_ptes_t; newpte: tlb_pte_t) is - variable j : integer; - begin - j := way * TLB_PTE_BITS; - ptes(j + TLB_PTE_BITS - 1 downto j) := newpte; - end; - begin assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; @@ -623,26 +606,19 @@ begin if m_in.valid = '1' then r.req := Loadstore1ToDcacheInit; r.req.valid := '1'; - r.req.load := not (m_in.tlbie or m_in.tlbld); + r.req.load := '1'; r.req.priv_mode := '1'; r.req.addr := m_in.addr; - r.req.data := m_in.pte; r.req.byte_sel := (others => '1'); - r.tlbie := m_in.tlbie; - r.doall := m_in.doall; - r.tlbld := m_in.tlbld; r.mmu_req := '1'; r.d_valid := '1'; else r.req := d_in; r.req.data := 
(others => '0'); - r.tlbie := '0'; - r.doall := '0'; - r.tlbld := '0'; r.mmu_req := '0'; r.d_valid := '0'; end if; - if r.req.valid = '1' and r.doall = '0' then + if r.req.valid = '1' then assert not is_X(r.req.addr) severity failure; end if; if rst = '1' then @@ -809,48 +785,39 @@ begin end process; tlb_update : process(clk) - variable tlbie : std_ulogic; - variable tlbwe : std_ulogic; - variable repl_way : tlb_way_sig_t; - variable eatag : tlb_tag_t; - variable tagset : tlb_way_tags_t; - variable pteset : tlb_way_ptes_t; + variable tlb_wr_index : tlb_index_sig_t; + variable j, k : integer; begin if rising_edge(clk) then - tlbie := r0_valid and r0.tlbie; - tlbwe := r0_valid and r0.tlbld; - ev.dtlb_miss_resolved <= tlbwe; - if rst = '1' or (tlbie = '1' and r0.doall = '1') then + tlb_wr_index := unsigned(m_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 + downto TLB_LG_PGSZ)); + ev.dtlb_miss_resolved <= m_in.tlbld; + if rst = '1' or (m_in.tlbie = '1' and m_in.doall = '1') then -- clear all valid bits at once for i in tlb_index_t loop dtlb_valids(i) <= (others => '0'); end loop; - elsif tlbie = '1' then + elsif m_in.tlbie = '1' then for i in tlb_way_t loop - if tlb_hit_expand(i) = '1' then - assert not is_X(tlb_req_index); - dtlb_valids(to_integer(tlb_req_index))(i) <= '0'; + if r1.tlb_hit_ways(i) = '1' then + assert not is_X(tlb_wr_index); + dtlb_valids(to_integer(tlb_wr_index))(i) <= '0'; end if; end loop; - elsif tlbwe = '1' then - assert not is_X(tlb_req_index); - repl_way := to_unsigned(0, TLB_WAY_BITS); - if TLB_NUM_WAYS > 1 then - if tlb_hit = '1' then - repl_way := tlb_hit_way; - else - repl_way := unsigned(r1.tlb_victim); + elsif m_in.tlbld = '1' then + assert not is_X(tlb_wr_index); + assert not is_X(r1.tlb_victim); + for way in 0 to TLB_NUM_WAYS - 1 loop + if TLB_NUM_WAYS = 1 or way = to_integer(unsigned(r1.tlb_victim)) then + j := way * TLB_EA_TAG_WIDTH; + dtlb_tags(to_integer(tlb_wr_index))(j + TLB_EA_TAG_WIDTH - 1 downto j) <= + (TLB_EA_TAG_WIDTH - 1 downto TLB_EA_TAG_BITS => '0') & + m_in.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); + k := way * TLB_PTE_BITS; + dtlb_ptes(to_integer(tlb_wr_index))(k + TLB_PTE_BITS - 1 downto k) <= m_in.pte; + dtlb_valids(to_integer(tlb_wr_index))(way) <= '1'; end if; - assert not is_X(repl_way); - end if; - eatag := r0.req.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); - tagset := tlb_tag_way; - write_tlb_tag(to_integer(repl_way), tagset, eatag); - dtlb_tags(to_integer(tlb_req_index)) <= tagset; - pteset := tlb_pte_way; - write_tlb_pte(to_integer(repl_way), pteset, r0.req.data); - dtlb_ptes(to_integer(tlb_req_index)) <= pteset; - dtlb_valids(to_integer(tlb_req_index))(to_integer(repl_way)) <= '1'; + end loop; end if; end if; end process; @@ -914,10 +881,10 @@ begin if rising_edge(clk) then if r0_stall = '1' then index := req_index; - valid := r0.req.valid and not (r0.tlbie or r0.tlbld); + valid := r0.req.valid; elsif m_in.valid = '1' then index := get_index(m_in.addr); - valid := not (m_in.tlbie or m_in.tlbld); + valid := '1'; else index := get_index(d_in.addr); valid := d_in.valid; @@ -999,7 +966,7 @@ begin dawr_match := r0.req.dawr_match; end if; - go := r0_valid and not (r0.tlbie or r0.tlbld) and not r1.ls_error; + go := r0_valid and not r1.ls_error; if is_X(ra) then go := '0'; end if; @@ -1173,6 +1140,12 @@ begin else req_op_nop <= '1'; end if; + elsif r0.req.tlb_probe = '1' then + -- TLB probe is sent down by loadstore1 before sending a TLB + -- invalidation to mmu, to get r1.tlb_hit_* set correctly + -- (for a single-page invalidation) for the 
address. + -- It doesn't require r1.ls_valid to be set on completion, + -- so there is nothing else to do here. elsif access_ok = '0' then req_op_bad <= '1'; elsif r0.req.flush = '1' then @@ -1198,7 +1171,7 @@ begin if r0_stall = '0' then if m_in.valid = '1' then early_req_row <= get_row(m_in.addr); - early_rd_valid <= not (m_in.tlbie or m_in.tlbld); + early_rd_valid <= '1'; else early_req_row <= get_row(d_in.addr); early_rd_valid <= d_in.valid and d_in.load; @@ -1417,13 +1390,23 @@ begin end if; -- Record TLB hit information for updating TLB PLRU - r1.tlb_hit <= tlb_hit; - r1.tlb_hit_way <= tlb_hit_way; - r1.tlb_hit_index <= tlb_req_index; + -- and for invalidating or updating TLB contents + if r0_valid = '1' then + r1.tlb_hit <= tlb_hit; + r1.tlb_hit_way <= tlb_hit_way; + r1.tlb_hit_ways <= tlb_hit_expand; + r1.tlb_hit_index <= tlb_req_index; + else + r1.tlb_hit <= '0'; + end if; -- determine victim way in the TLB in the cycle after -- we detect the TLB miss if r1.ls_error = '1' then - r1.tlb_victim <= unsigned(tlb_plru_victim); + if r1.tlb_hit = '0' then + r1.tlb_victim <= unsigned(tlb_plru_victim); + else + r1.tlb_victim <= r1.tlb_hit_way; + end if; end if; end if; @@ -1482,9 +1465,7 @@ begin r1.stcx_fail <= '0'; r1.ls_valid <= (req_op_load_hit or req_op_nop) and not r0.mmu_req; - -- complete tlbies and TLB loads in the third cycle - r1.mmu_done <= (r0_valid and (r0.tlbie or r0.tlbld)) or - (req_op_load_hit and r0.mmu_req); + r1.mmu_done <= req_op_load_hit and r0.mmu_req; -- Clear the reservation if another entity writes to that line if kill_rsrv = '1' then @@ -1582,7 +1563,7 @@ begin r1.full <= req_op_load_miss or req_op_store or req_op_flush or req_op_sync; end if; end if; - if r0_valid = '1' and r0.tlbld = '1' then + if m_in.tlbld = '1' or m_in.tlbie = '1' then r1.ls_tlb_hit <= '0'; end if; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index a274d0f..5d05bbb 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -712,8 +712,8 @@ begin v.mmu_op := '1'; when others => end case; - v.dc_req := l_in.valid and (v.load or v.store or v.sync or v.dcbz) and not v.align_intr and - not hash_nop; + v.dc_req := l_in.valid and (v.load or v.store or v.sync or v.dcbz or v.tlbie) and + not v.align_intr and not hash_nop; v.incomplete := v.dc_req and v.two_dwords; -- Work out controls for load and store formatting @@ -873,7 +873,7 @@ begin dawrx_match_enable(r3.dawrx(i), r1.req.virt_mode, r1.req.priv_mode, r1.req.store) then dawr_match := r1.req.valid and r1.req.dc_req and not r3.dawr_upd and - not (r1.req.touch or r1.req.sync or r1.req.flush); + not (r1.req.touch or r1.req.sync or r1.req.flush or r1.req.tlbie); end if; end loop; stage1_dawr_match <= dawr_match; @@ -918,7 +918,7 @@ begin v.req.store_data := store_data; v.req.dawr_intr := dawr_match; v.wait_dc := r1.req.valid and r1.req.dc_req and not r1.req.load_sp and - not r1.req.incomplete and not r1.req.hashcmp; + not r1.req.incomplete and not r1.req.hashcmp and not r1.req.tlbie; v.wait_mmu := r1.req.valid and r1.req.mmu_op; if r1.req.valid = '1' and (r1.req.align_intr or r1.req.hashcmp) = '1' then v.busy := '1'; @@ -1263,6 +1263,7 @@ begin d_out.sync <= stage1_req.sync; d_out.nc <= stage1_req.nc; d_out.reserve <= stage1_req.reserve; + d_out.tlb_probe <= stage1_req.tlbie; d_out.atomic_qw <= stage1_req.atomic_qw; d_out.atomic_first <= stage1_req.atomic_first; d_out.atomic_last <= stage1_req.atomic_last; @@ -1279,6 +1280,7 @@ begin d_out.sync <= r2.req.sync; d_out.nc <= r2.req.nc; d_out.reserve <= r2.req.reserve; + d_out.tlb_probe <= r2.req.tlbie; 
d_out.atomic_qw <= r2.req.atomic_qw; d_out.atomic_first <= r2.req.atomic_first; d_out.atomic_last <= r2.req.atomic_last; diff --git a/mmu.vhdl b/mmu.vhdl index fb63cfd..91429f9 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -28,7 +28,6 @@ architecture behave of mmu is type state_t is (IDLE, DO_TLBIE, - TLB_WAIT, PART_TBL_READ, PART_TBL_WAIT, PART_TBL_DONE, @@ -195,7 +194,6 @@ begin variable v : reg_stage_t; variable dcreq : std_ulogic; variable tlb_load : std_ulogic; - variable itlb_load : std_ulogic; variable tlbie_req : std_ulogic; variable ptbl_rd : std_ulogic; variable prtbl_rd : std_ulogic; @@ -225,7 +223,6 @@ begin v.perm_err := '0'; v.rc_error := '0'; tlb_load := '0'; - itlb_load := '0'; tlbie_req := '0'; v.inval_all := '0'; ptbl_rd := '0'; @@ -309,14 +306,8 @@ begin end if; when DO_TLBIE => - dcreq := '1'; tlbie_req := '1'; - v.state := TLB_WAIT; - - when TLB_WAIT => - if d_in.done = '1' then - v.state := RADIX_FINISH; - end if; + v.state := RADIX_FINISH; when PART_TBL_READ => dcreq := '1'; @@ -438,20 +429,14 @@ begin when RADIX_LOAD_TLB => tlb_load := '1'; - if r.iside = '0' then - dcreq := '1'; - v.state := TLB_WAIT; - else - itlb_load := '1'; - v.state := IDLE; - end if; + v.state := RADIX_FINISH; when RADIX_FINISH => v.state := IDLE; end case; - if v.state = RADIX_FINISH or (v.state = RADIX_LOAD_TLB and r.iside = '1') then + if v.state = RADIX_FINISH then v.err := v.invalid or v.badtree or v.segerror or v.perm_err or v.rc_error; v.done := not v.err; end if; @@ -505,11 +490,11 @@ begin d_out.valid <= dcreq; d_out.tlbie <= tlbie_req; d_out.doall <= r.inval_all; - d_out.tlbld <= tlb_load; + d_out.tlbld <= tlb_load and not r.iside; d_out.addr <= addr; d_out.pte <= tlb_data; - i_out.tlbld <= itlb_load; + i_out.tlbld <= tlb_load and r.iside; i_out.tlbie <= tlbie_req; i_out.doall <= r.inval_all; i_out.addr <= addr; From 1da8476cf9e9d4994832b8432b09add1b30e718c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 8 Apr 2025 20:15:06 +1000 Subject: [PATCH 8/8] dcache: Simplify forwarding of load data while reloading a cache line This removes a dependency of req_is_hit and similar signals on the wishbone ack input, by removing use_forward_rl, and making idx_reload not dependent on wr_row_match and wishbone_in.ack. Previously if a load in r0 hit the doubleword being supplied from memory, that was treated as a hit and the data was forwarded via a multiplexer associated with the cache RAM. Now it is called a miss and completed by the logic in the RELOAD_WAIT_ACK state of the state machine. The only downside is that now the selection of data source in the dcache_fast_hit process depends on req_is_hit rather than r1.full. Overall this change seems to reduce the number of LUTs, and make timing easier on the ECP-5. 
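The resulting load-data selection, condensed from the diff below,
reduces to:

    data_out := (others => '0');
    if req_is_hit = '0' then
        -- misses, including loads that hit the arriving doubleword,
        -- complete with data from the wishbone
        data_out := wishbone_in.dat;
    else
        -- AND-OR mux across the one-hot hit ways
        for w in 0 to NUM_WAYS-1 loop
            data_out := andor(req_hit_ways(w), cache_out(w), data_out);
        end loop;
    end if;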
Signed-off-by: Paul Mackerras --- dcache.vhdl | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index fe9950b..86cbc2c 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -442,7 +442,6 @@ architecture rtl of dcache is signal r0_stall : std_ulogic; signal use_forward_st : way_expand_t; - signal use_forward_rl : way_expand_t; signal use_forward2 : way_expand_t; -- Cache RAM interface @@ -950,7 +949,6 @@ begin variable hit_reload : std_ulogic; variable dawr_match : std_ulogic; variable idx_reload : way_expand_t; - variable maybe_fwd_rl : way_expand_t; variable maybe_fwd_st : way_expand_t; variable maybe_fwd2 : way_expand_t; variable wr_row_match : std_ulogic; @@ -982,16 +980,13 @@ begin wr_row_match := '1'; end if; idx_reload := (others => '0'); - maybe_fwd_rl := (others => '0'); if go = '1' and r1.reloading = '1' and rindex = r1.store_index then -- Way r1.store_way at this index is currently being reloaded. -- If we detect that this way is the one that hits below, -- and this is a load, then this is a hit only if r1.rows_valid() -- is true, or if the data currently arriving on the wishbone is -- the row we want. - if wr_row_match = '1' and wishbone_in.ack = '1' then - maybe_fwd_rl := r1.store_ways; - elsif r0.req.load = '1' and r0.req.touch = '0' and + if r0.req.load = '1' and r0.req.touch = '0' and r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) = '0' then idx_reload := r1.store_ways; end if; @@ -1076,14 +1071,8 @@ begin end if; -- Whether to use forwarded data for a load or not - if r1.dcbz = '0' then - use_forward_rl <= maybe_fwd_rl; - use_forward_st <= maybe_fwd_st; - else - use_forward_rl <= (others => '0'); - use_forward_st <= maybe_fwd_rl; - end if; - use_forward2 <= maybe_fwd2; + use_forward_st <= maybe_fwd_st; + use_forward2 <= maybe_fwd2; -- The way to replace on a miss replace_way <= to_unsigned(0, WAY_BITS); @@ -1256,9 +1245,8 @@ begin end process; -- RAM write data and select multiplexers - ram_wr_data <= r1.req.data when r1.write_bram = '1' else - wishbone_in.dat when r1.dcbz = '0' else - (others => '0'); + ram_wr_data <= r1.req.data when r1.write_bram = '1' or r1.dcbz = '1' else + wishbone_in.dat; ram_wr_select <= r1.req.byte_sel when r1.write_bram = '1' else (others => '1'); @@ -1308,9 +1296,7 @@ begin dword := (others => '0'); for b in 0 to ROW_SIZE - 1 loop j := b * 8; - if use_forward_rl(i) = '1' then - dword(j + 7 downto j) := wishbone_in.dat(j + 7 downto j); - elsif use_forward_st(i) = '1' and r1.req.byte_sel(b) = '1' then + if use_forward_st(i) = '1' and r1.req.byte_sel(b) = '1' then dword(j + 7 downto j) := r1.req.data(j + 7 downto j); elsif use_forward2(i) = '1' and r1.forward_sel(b) = '1' then dword(j + 7 downto j) := r1.forward_data(j + 7 downto j); @@ -1356,7 +1342,7 @@ begin end if; data_out := (others => '0'); - if r1.full = '1' then + if req_is_hit = '0' then data_out := wishbone_in.dat; else for w in 0 to NUM_WAYS-1 loop @@ -1551,7 +1537,7 @@ begin req.hit_way := req_hit_way; req.hit_ways := req_hit_ways; req.is_hit := req_is_hit; - req.hit_reload := req_hit_reload; + req.hit_reload := req_hit_reload and req_op_load_miss; req.same_page := req_same_page; -- Store the incoming request from r0, if it is a slow request @@ -1604,7 +1590,7 @@ begin r1.wb.adr <= addr_to_wb(req.real_addr); r1.wb.sel <= req.byte_sel; r1.wb.dat <= req.data; - r1.dcbz <= req.dcbz; + r1.dcbz <= req.dcbz and req.valid; r1.atomic_more <= not req.last_dw; -- Keep track of our index and way for subsequent 
stores. @@ -1741,9 +1727,8 @@ begin -- r1.req.hit_reload is always 1 for the request that -- started this reload, and otherwise always 0 for dcbz -- (since it is considered a store). - if r1.full = '1' and r1.req.hit_reload = '1' and - get_row_of_line(r1.store_row) = - get_row_of_line(get_row(r1.req.real_addr)) then + if req.hit_reload = '1' and + get_row_of_line(r1.store_row) = get_row_of_line(get_row(req.real_addr)) then r1.full <= '0'; r1.slow_valid <= '1'; if r1.mmu_req = '0' then