From 9645ab6e1fe63b7897b3c3a99c0fc7ef2b909d8d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 11 Mar 2025 20:25:10 +1100 Subject: [PATCH] dcache: Rework forwarding and same-page logic This gets rid of some largish comparators in the dcache_request process by matching the index and way that hit in the cache tags instead of comparing tag values. That is, some tag comparisons can be replaced by seeing if both tags hit in the same cache way. When reloading a cache line, we now set it valid at the beginning of the reload, so that we get hits to compare. While the reload is still occurring, accesses to doublewords that haven't yet been read are indicated with req_is_hit = 0 and req_hit_reload = 1 (i.e. are considered to be a miss, at least for now). For the comparison of whether a subsequent access is to the same page as stores already being performed, in virtual mode (TLB being used) we now compare the way and index of the hit in the TLB, and in real mode we compare the effective address. If any new entry has been loaded into the TLB since the access we're comparing against, then it is considered to be a different page. 
Signed-off-by: Paul Mackerras --- dcache.vhdl | 293 ++++++++++++++++++++++++++++------------------------ 1 file changed, 157 insertions(+), 136 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 1e7fff1..af9bb0f 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -304,27 +304,32 @@ architecture rtl of dcache is signal r0_full : std_ulogic; type mem_access_request_t is record - op_lmiss : std_ulogic; - op_store : std_ulogic; - op_flush : std_ulogic; - op_sync : std_ulogic; - nc : std_ulogic; - valid : std_ulogic; - dcbz : std_ulogic; - flush : std_ulogic; - touch : std_ulogic; - sync : std_ulogic; - reserve : std_ulogic; - first_dw : std_ulogic; - last_dw : std_ulogic; - real_addr : real_addr_t; - data : std_ulogic_vector(63 downto 0); - byte_sel : std_ulogic_vector(7 downto 0); - is_hit : std_ulogic; - hit_way : way_t; - same_tag : std_ulogic; - mmu_req : std_ulogic; - dawr_m : std_ulogic; + op_lmiss : std_ulogic; + op_store : std_ulogic; + op_flush : std_ulogic; + op_sync : std_ulogic; + nc : std_ulogic; + valid : std_ulogic; + dcbz : std_ulogic; + flush : std_ulogic; + touch : std_ulogic; + sync : std_ulogic; + reserve : std_ulogic; + first_dw : std_ulogic; + last_dw : std_ulogic; + real_addr : real_addr_t; + data : std_ulogic_vector(63 downto 0); + byte_sel : std_ulogic_vector(7 downto 0); + is_hit : std_ulogic; + hit_way : way_t; + hit_ways : way_expand_t; + hit_reload : std_ulogic; + same_page : std_ulogic; + mmu_req : std_ulogic; + dawr_m : std_ulogic; + tlb_hit : std_ulogic; + tlb_index : tlb_index_sig_t; + tlb_way : tlb_way_sig_t; end record; -- First stage register, contains state for stage 1 of load hits @@ -344,6 +349,7 @@ architecture rtl of dcache is cache_hit : std_ulogic; prev_hit : std_ulogic; prev_way : way_t; + prev_hit_ways : way_expand_t; prev_hit_reload : std_ulogic; -- TLB hit state @@ -351,13 +357,16 @@ architecture rtl of dcache is tlb_hit_way : tlb_way_sig_t; tlb_hit_index : tlb_index_sig_t; tlb_victim : tlb_way_sig_t; + ls_tlb_hit : 
std_ulogic; + tlb_acc_index : tlb_index_sig_t; + tlb_acc_way : tlb_way_sig_t; -- data buffer for data forwarded from writes to reads forward_data : std_ulogic_vector(63 downto 0); - forward_tag : cache_tag_t; forward_sel : std_ulogic_vector(7 downto 0); forward_valid : std_ulogic; forward_row : row_t; + forward_way : way_t; data_out : std_ulogic_vector(63 downto 0); -- Cache miss state (reload state machine) @@ -370,6 +379,7 @@ architecture rtl of dcache is reloading : std_ulogic; reload_tag : cache_tag_t; store_way : way_t; + store_ways : way_expand_t; store_row : row_t; store_index : index_t; end_row_ix : row_in_line_t; @@ -421,7 +431,7 @@ architecture rtl of dcache is signal req_op_bad : std_ulogic; signal req_op_nop : std_ulogic; signal req_data : std_ulogic_vector(63 downto 0); - signal req_same_tag : std_ulogic; + signal req_same_page : std_ulogic; signal req_go : std_ulogic; signal req_nc : std_ulogic; signal req_hit_reload : std_ulogic; @@ -774,15 +784,27 @@ begin pte <= hitpte; valid_ra <= tlb_hit or not r0.req.virt_mode; tlb_miss <= r0_valid and r0.req.virt_mode and not tlb_hit; + + -- extract real address, permissions, attributes + -- also detect whether this access is to the same page as the previous one + req_same_page <= '0'; if r0.req.virt_mode = '1' then ra <= hitpte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & r0.req.addr(TLB_LG_PGSZ - 1 downto ROW_OFF_BITS) & (ROW_OFF_BITS-1 downto 0 => '0'); perm_attr <= extract_perm_attr(hitpte); + if tlb_read_valid = '1' and r1.state = STORE_WAIT_ACK and r1.ls_tlb_hit = '1' and + tlb_req_index = r1.tlb_acc_index and hitway = r1.tlb_acc_way then + req_same_page <= '1'; + end if; else ra <= r0.req.addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) & (ROW_OFF_BITS-1 downto 0 => '0'); perm_attr <= real_mode_perm_attr; + if r0.req.addr(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) = + wb_to_addr(r1.wb.adr)(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) then + req_same_page <= '1'; + end if; end if; end process; @@ -807,7 +829,6 @@ begin for 
i in tlb_way_t loop if tlb_hit_expand(i) = '1' then assert not is_X(tlb_req_index); - assert not is_X(tlb_hit_way); dtlb_valids(to_integer(tlb_req_index))(i) <= '0'; end if; end loop; @@ -965,6 +986,11 @@ begin variable snoop_match : std_ulogic; variable hit_reload : std_ulogic; variable dawr_match : std_ulogic; + variable idx_reload : way_expand_t; + variable maybe_fwd_rl : way_expand_t; + variable maybe_fwd_st : way_expand_t; + variable maybe_fwd2 : way_expand_t; + variable wr_row_match : std_ulogic; begin -- Extract line, row and tag from request rindex := get_index(r0.req.addr); @@ -978,23 +1004,65 @@ begin end if; go := r0_valid and not (r0.tlbie or r0.tlbld) and not r1.ls_error; - if is_X(r0.req.addr) then + if is_X(ra) then go := '0'; end if; - if go = '1' then - assert not is_X(r1.forward_tag); + + -- See if the request matches the line currently being reloaded + if go = '1' and r1.reloading = '1' then + assert not is_X(r1.store_index); + assert not is_X(r1.store_row); + assert not is_X(r1.store_way); + end if; + wr_row_match := '0'; + if go = '1' and req_row = r1.store_row then + wr_row_match := '1'; + end if; + idx_reload := (others => '0'); + maybe_fwd_rl := (others => '0'); + if go = '1' and r1.reloading = '1' and rindex = r1.store_index then + -- Way r1.store_way at this index is currently being reloaded. + -- If we detect that this way is the one that hits below, + -- and this is a load, then this is a hit only if r1.rows_valid() + -- is true, or if the data currently arriving on the wishbone is + -- the row we want. 
+ if wr_row_match = '1' and wishbone_in.ack = '1' then + maybe_fwd_rl := r1.store_ways; + elsif r0.req.load = '1' and r0.req.touch = '0' and + r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) = '0' then + idx_reload := r1.store_ways; + end if; + end if; + + -- See if request matches the location being stored in this cycle + maybe_fwd_st := (others => '0'); + if wr_row_match = '1' and r1.write_bram = '1' then + maybe_fwd_st := r1.store_ways; + end if; + + -- See if request matches the location stored to in the previous cycle + maybe_fwd2 := (others => '0'); + if go = '1' and r1.forward_valid = '1' and req_row = r1.forward_row then + assert not is_X(r1.forward_way); + maybe_fwd2(to_integer(r1.forward_way)) := '1'; + end if; + + hit_ways := (others => '0'); + if r0.req.load = '1' and r0.req.atomic_qw = '1' and r0.req.atomic_first = '0' then + -- For the second half of an atomic quadword load, just use the + -- same way as the first half, without considering whether the line + -- is valid; it is as if we had read the second dword at the same + -- time as the first dword, and the line was valid back then. + -- If the line is currently being reloaded and the doubleword we want + -- hasn't come yet, then idx_reload() will be 1 and we treat this + -- as a miss in order to wait for it. + hit_ways := r1.prev_hit_ways; end if; -- Test if pending request is a hit on any way -- In order to make timing in virtual mode, when we are using the TLB, -- we compare each way with each of the real addresses from each way of -- the TLB, and then decide later which match to use. 
- hit_way := to_unsigned(0, WAY_BITS); - hit_ways := (others => '0'); - is_hit := '0'; - rel_match := '0'; - fwd_match := '0'; - snoop_match := '0'; if r0.req.virt_mode = '1' then for j in tlb_way_t loop if tlb_valid_way(j) = '1' then @@ -1008,23 +1076,8 @@ begin read_tag(i, cache_tag_set) = s_tag and tlb_hit_expand(j) = '1' then hit_ways(i) := '1'; - hit_way := to_unsigned(i, WAY_BITS); - if go = '1' then - is_hit := '1'; - if snoop_hits(i) = '1' then - snoop_match := '1'; - end if; - end if; end if; end loop; - if go = '1' and tlb_hit_expand(j) = '1' then - if not is_X(r1.reload_tag) and s_tag = r1.reload_tag then - rel_match := '1'; - end if; - if s_tag = r1.forward_tag then - fwd_match := '1'; - end if; - end if; end if; end loop; else @@ -1036,54 +1089,33 @@ begin if go = '1' and cache_valids(to_integer(rindex))(i) = '1' and read_tag(i, cache_tag_set) = s_tag then hit_ways(i) := '1'; - hit_way := to_unsigned(i, WAY_BITS); - is_hit := '1'; - if snoop_hits(i) = '1' then - snoop_match := '1'; - end if; end if; end loop; - if go = '1' and not is_X(r1.reload_tag) and s_tag = r1.reload_tag then - rel_match := '1'; - end if; - if go = '1' and s_tag = r1.forward_tag then - fwd_match := '1'; - end if; end if; - req_same_tag <= rel_match; + + hit_way := to_unsigned(0, WAY_BITS); + is_hit := '0'; + hit_reload := '0'; + for i in 0 to NUM_WAYS-1 loop + if hit_ways(i) = '1' then + hit_way := to_unsigned(i, WAY_BITS); + is_hit := go and not idx_reload(i); + hit_reload := go and idx_reload(i); + end if; + end loop; -- This is 1 if the snooped write from the previous cycle hits the same -- cache line that is being accessed in this cycle. 
req_snoop_hit <= '0'; - if go = '1' and snoop_match = '1' and get_index(snoop_paddr) = rindex then - req_snoop_hit <= '1'; + if go = '1' and get_index(snoop_paddr) = rindex then + -- (ignore idx_reload here since snooped writes can't happen while we're reloading) + req_snoop_hit <= or (snoop_hits and hit_ways); end if; -- Whether to use forwarded data for a load or not - use_forward_st <= '0'; - use_forward_rl <= '0'; - if rel_match = '1' then - assert not is_X(r1.store_row); - assert not is_X(req_row); - end if; - if rel_match = '1' and r1.store_row = req_row then - -- Use the forwarding path if this cycle is a write to this row - use_forward_st <= r1.write_bram; - if r1.reloading = '1' and wishbone_in.ack = '1' then - use_forward_rl <= '1'; - end if; - end if; - use_forward2 <= '0'; - if fwd_match = '1' then - assert not is_X(r1.forward_row); - if is_X(req_row) then - report "req_row=" & to_hstring(req_row) & " addr=" & to_hstring(r0.req.addr) & " go=" & std_ulogic'image(go); - end if; - assert not is_X(req_row); - end if; - if fwd_match = '1' and r1.forward_row = req_row then - use_forward2 <= r1.forward_valid; - end if; + use_forward_rl <= or (hit_ways and maybe_fwd_rl); + use_forward_st <= or (hit_ways and maybe_fwd_st); + use_forward2 <= or (hit_ways and maybe_fwd2); -- The way to replace on a miss replace_way <= to_unsigned(0, WAY_BITS); @@ -1101,42 +1133,8 @@ begin end if; end if; - -- See if the request matches the line currently being reloaded - if r1.reloading = '1' and rel_match = '1' then - assert not is_X(rindex); - assert not is_X(r1.store_index); - end if; - hit_reload := '0'; - if r1.reloading = '1' and rel_match = '1' and - rindex = r1.store_index then - -- Ignore is_hit from above, because a load miss writes the new tag - -- but doesn't clear the valid bit on the line before refilling it. - -- For a store, consider this a hit even if the row isn't valid - -- since it will be by the time we perform the store. 
- -- For a load, check the appropriate row valid bit; but also, - -- if use_forward_rl is 1 then we can consider this a hit. - -- For a touch, since the line we want is being reloaded already, - -- consider this a hit. - is_hit := not r0.req.load or r0.req.touch or - r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) or - use_forward_rl; - hit_way := replace_way; - hit_ways := (others => '0'); - hit_ways(to_integer(replace_way)) := '1'; - hit_reload := is_hit; - elsif r0.req.load = '1' and r0.req.atomic_qw = '1' and r0.req.atomic_first = '0' and - r0.req.nc = '0' and perm_attr.nocache = '0' and r1.prev_hit = '1' then - -- For the second half of an atomic quadword load, just use the - -- same way as the first half, without considering whether the line - -- is valid; it is as if we had read the second dword at the same - -- time as the first dword, and the line was valid back then. - -- (Cases where the line is currently being reloaded are handled above.) - -- NB lq to noncacheable isn't required to be atomic per the ISA. - is_hit := '1'; - hit_way := r1.prev_way; - hit_ways := (others => '0'); - hit_ways(to_integer(r1.prev_way)) := '1'; - end if; + req_go <= go; + req_nc <= nc; -- The way that matched on a hit req_hit_way <= hit_way; @@ -1191,8 +1189,6 @@ begin req_op_load_miss <= not is_hit; -- includes non-cacheable loads end if; end if; - req_go <= go; - req_nc <= nc; -- Version of the row number that is valid one cycle earlier -- in the cases where we need to read the cache data BRAM. 
@@ -1403,9 +1399,9 @@ begin r1.data_out <= data_out; r1.forward_data <= ram_wr_data; - r1.forward_tag <= r1.reload_tag; r1.forward_row <= r1.store_row; r1.forward_sel <= ram_wr_select; + r1.forward_way <= replace_way; r1.forward_valid <= r1.write_bram; if r1.reloading = '1' and wishbone_in.ack = '1' then r1.forward_valid <= '1'; @@ -1479,6 +1475,7 @@ begin r1.dec_acks <= '0'; r1.prev_hit <= '0'; r1.prev_hit_reload <= '0'; + r1.prev_hit_ways <= (others => '0'); reservation.valid <= '0'; reservation.addr <= (others => '0'); @@ -1529,7 +1526,16 @@ begin end if; end loop; r1.store_way <= replace_way; + r1.store_ways <= (others => '0'); + r1.store_ways(to_integer(replace_way)) <= '1'; r1.write_tag <= '0'; + -- Set the line valid now. While the line is being + -- reloaded, the hit detection logic will use r1.rows_valid + -- to determine hits on this line. + cache_valids(to_integer(r1.store_index))(to_integer(replace_way)) <= '1'; + -- record which way was used, for possible 2nd half of lqarx + r1.prev_hit_ways <= (others => '0'); + r1.prev_hit_ways(to_integer(replace_way)) <= '1'; end if; -- Take request from r1.req if there is one there, @@ -1552,6 +1558,9 @@ begin req.first_dw := not r0.req.atomic_qw or r0.req.atomic_first; req.last_dw := not r0.req.atomic_qw or r0.req.atomic_last; req.real_addr := ra; + req.tlb_hit := tlb_hit; + req.tlb_index := tlb_req_index; + req.tlb_way := tlb_hit_way; -- Force data to 0 for dcbz if r0.req.dcbz = '1' then req.data := (others => '0'); @@ -1567,8 +1576,10 @@ begin req.byte_sel := r0.req.byte_sel; end if; req.hit_way := req_hit_way; + req.hit_ways := req_hit_ways; req.is_hit := req_is_hit; - req.same_tag := req_same_tag; + req.hit_reload := req_hit_reload; + req.same_page := req_same_page; -- Store the incoming request from r0, if it is a slow request -- Note that r1.full = 1 implies none of the req_op_* are 1. 
@@ -1579,6 +1590,9 @@ begin r1.full <= req_op_load_miss or req_op_store or req_op_flush or req_op_sync; end if; end if; + if r0_valid = '1' and r0.tlbld = '1' then + r1.ls_tlb_hit <= '0'; + end if; -- Signals for PLRU update and victim selection r1.hit_way <= req_hit_way; @@ -1594,6 +1608,7 @@ begin if req_go = '1' then r1.prev_hit <= req_is_hit; r1.prev_way <= req_hit_way; + r1.prev_hit_ways <= req_hit_ways; r1.prev_hit_reload <= req_hit_reload; end if; @@ -1624,10 +1639,14 @@ begin r1.store_row <= get_row(req.real_addr); r1.end_row_ix <= get_row_of_line(get_row(req.real_addr)) - 1; r1.reload_tag <= get_tag(req.real_addr); - r1.req.same_tag <= '1'; + r1.req.hit_reload <= '1'; + r1.ls_tlb_hit <= req.tlb_hit and not req.mmu_req; + r1.tlb_acc_index <= req.tlb_index; + r1.tlb_acc_way <= req.tlb_way; if req.is_hit = '1' then r1.store_way <= req.hit_way; + r1.store_ways <= req.hit_ways; end if; -- Reset per-row valid bits, ready for handling the next load miss @@ -1737,9 +1756,12 @@ begin assert not is_X(r1.store_row); assert not is_X(r1.req.real_addr); end if; - if r1.full = '1' and r1.req.same_tag = '1' and - ((r1.dcbz = '1' and r1.req.dcbz = '1') or r1.req.op_lmiss = '1') and - r1.store_row = get_row(r1.req.real_addr) then + -- r1.req.hit_reload is always 1 for the request that + -- started this reload, and otherwise always 0 for dcbz + -- (since it is considered a store). + if r1.full = '1' and r1.req.hit_reload = '1' and + get_row_of_line(r1.store_row) = + get_row_of_line(get_row(r1.req.real_addr)) then r1.full <= '0'; r1.slow_valid <= '1'; if r1.mmu_req = '0' then @@ -1763,9 +1785,6 @@ begin -- Cache line is now valid assert not is_X(r1.store_index); assert not is_X(r1.store_way); - if r1.reloading = '1' then - cache_valids(to_integer(r1.store_index))(to_integer(r1.store_way)) <= '1'; - end if; r1.reloading <= '0'; ev.dcache_refill <= not r1.dcbz; @@ -1773,6 +1792,7 @@ begin -- if the first half hit this line. 
r1.prev_hit <= r1.prev_hit_reload; r1.prev_way <= r1.store_way; + r1.prev_hit_ways <= r1.store_ways; r1.state <= IDLE; end if; @@ -1791,19 +1811,20 @@ begin -- DO_STCX state, unless they are the second half of a -- successful stqcx, which is handled here. if req.valid = '1' then - r1.wb.adr(SET_SIZE_BITS - ROW_OFF_BITS - 1 downto 0) <= - req.real_addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS); + r1.wb.adr(TLB_LG_PGSZ - ROW_OFF_BITS - 1 downto 0) <= + req.real_addr(TLB_LG_PGSZ - 1 downto ROW_OFF_BITS); r1.wb.dat <= req.data; r1.wb.sel <= req.byte_sel; end if; assert not is_X(acks); r1.wb.stb <= '0'; - if req.op_store = '1' and req.same_tag = '1' and req.dcbz = '0' and + if req.op_store = '1' and req.same_page = '1' and req.dcbz = '0' and (req.reserve = '0' or r1.atomic_more = '1') then if acks < 7 then r1.wb.stb <= '1'; stbs_done := false; r1.store_way <= req.hit_way; + r1.store_ways <= req.hit_ways; r1.store_row <= get_row(req.real_addr); r1.write_bram <= req.is_hit; r1.atomic_more <= not req.last_dw;