From 00efcc2c3b446bbdd41daca1c70ec826e9d00a5c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 2 Jan 2025 13:40:21 +1100 Subject: [PATCH] dcache: Make aligned quadword loads and stores actually be atomic This implements logic in the dcache to make aligned quadword loads and stores atomic with respect to other mechanisms that access memory. Such loads and stores are already marked with the atomic_qw bit in Loadstore1ToDcacheType. For quadword loads where the first dword access hits in the cache, we record the fact of the hit and the cache way used (r1.prev_hit and r1.prev_way). The second dword access then assumes a hit on the same way even if the cache line has been invalidated in the meantime by a snooped store. This gives the same effect as loading both dwords at the time of the first dword load. For a lqarx, the reservation is set at the time of the first dword load, so if there is such a snooped store, the reservation will be invalid by the time the lqarx completes. If the first dword load hits on the cache line being refilled, so should the second, unless the refill finishes. In that case we set r1.prev_hit and r1.prev_way so the second load can use the line just refilled (but only if the first dword hit the line being refilled). For stores, the r1.atomic_more flag is set on the first dword store, and that causes the STORE_WAIT_ACK state to wait for the next request without dropping cyc, so it is not possible for another wishbone master to insert an access between the writes of the two dwords to memory. For store-conditionals, DO_STCX state now transitions to STORE_WAIT_ACK state once the store has been accepted (stall is false). This means that the second store for a stqcx can be handled in the same way as the second store for a stq. Once the first store for a stqcx has succeeded, the second store is done unconditionally. 
Signed-off-by: Paul Mackerras --- dcache.vhdl | 141 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 96 insertions(+), 45 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 5eb659d..f4403e4 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -264,6 +264,23 @@ architecture rtl of dcache is -- subsequent load requests to the same line can be completed as -- soon as the necessary data comes in from memory, without -- waiting for the whole line to be read. + -- + -- Aligned loads and stores of a doubleword or less are atomic + -- because they are done in a single wishbone operation. + -- For quadword atomic loads and stores we rely on the wishbone + -- arbiter not interrupting access to a target once it has first + -- given access; i.e. once we have the main wishbone, no other + -- master gets access until we drop cyc. + -- + -- Note on loads potentially hitting the victim line that is + -- currently being replaced: the new tag is available starting + -- with the 3rd cycle of RELOAD_WAIT_ACK state. As long as the + -- first read on the wishbone takes at least one cycle (i.e. the + -- ack doesn't arrive in the same cycle as stb was asserted), + -- r1.full will be true at least until that 3rd cycle and so a load + -- following a load miss can't hit on the old tag of the victim + -- line. As long as ack is not generated combinationally from + -- stb, this will be fine. 
-- Stage 0 register, basically contains just the latched request type reg_stage_0_t is record @@ -307,12 +324,16 @@ architecture rtl of dcache is full : std_ulogic; -- have uncompleted request mmu_req : std_ulogic; -- request is from MMU req : mem_access_request_t; + atomic_more : std_ulogic; -- atomic request isn't finished -- Cache hit state hit_way : way_t; hit_load_valid : std_ulogic; hit_index : index_t; cache_hit : std_ulogic; + prev_hit : std_ulogic; + prev_way : way_t; + prev_hit_reload : std_ulogic; -- TLB hit state tlb_hit : std_ulogic; @@ -389,6 +410,7 @@ architecture rtl of dcache is signal req_same_tag : std_ulogic; signal req_go : std_ulogic; signal req_nc : std_ulogic; + signal req_hit_reload : std_ulogic; signal early_req_row : row_t; signal early_rd_valid : std_ulogic; @@ -927,6 +949,7 @@ begin variable fwd_match : std_ulogic; variable snp_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); variable snoop_match : std_ulogic; + variable hit_reload : std_ulogic; begin -- Extract line, row and tag from request rindex := get_index(r0.req.addr); @@ -1071,6 +1094,7 @@ begin assert not is_X(rindex); assert not is_X(r1.store_index); end if; + hit_reload := '0'; if r1.state = RELOAD_WAIT_ACK and rel_match = '1' and rindex = r1.store_index then -- Ignore is_hit from above, because a load miss writes the new tag @@ -1085,11 +1109,23 @@ begin r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) or use_forward_rl; hit_way := replace_way; + hit_reload := is_hit; + elsif r0.req.load = '1' and r0.req.atomic_qw = '1' and r0.req.atomic_first = '0' and + r0.req.nc = '0' and perm_attr.nocache = '0' and r1.prev_hit = '1' then + -- For the second half of an atomic quadword load, just use the + -- same way as the first half, without considering whether the line + -- is valid; it is as if we had read the second dword at the same + -- time as the first dword, and the line was valid back then. 
+ -- (Cases where the line is currently being reloaded are handled above.) + -- NB lq to noncacheable isn't required to be atomic per the ISA. + is_hit := '1'; + hit_way := r1.prev_way; end if; -- The way that matched on a hit req_hit_way <= hit_way; req_is_hit <= is_hit; + req_hit_reload <= hit_reload; -- work out whether we have permission for this access -- NB we don't yet implement AMR, thus no KUAP @@ -1418,6 +1454,8 @@ begin r1.acks_pending <= to_unsigned(0, 3); r1.stalled <= '0'; r1.dec_acks <= '0'; + r1.prev_hit <= '0'; + r1.prev_hit_reload <= '0'; reservation.valid <= '0'; reservation.addr <= (others => '0'); @@ -1443,9 +1481,7 @@ begin if req_go = '1' and access_ok = '1' and r0.req.load = '1' and r0.req.reserve = '1' and r0.req.atomic_first = '1' then reservation.addr <= ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); - if req_is_hit = '1' then - reservation.valid <= not req_snoop_hit; - end if; + reservation.valid <= req_is_hit and not req_snoop_hit; end if; -- Do invalidations from snooped stores to memory @@ -1488,8 +1524,8 @@ begin req.flush := r0.req.flush; req.touch := r0.req.touch; req.reserve := r0.req.reserve; - req.first_dw := r0.req.atomic_first; - req.last_dw := r0.req.atomic_last; + req.first_dw := not r0.req.atomic_qw or r0.req.atomic_first; + req.last_dw := not r0.req.atomic_qw or r0.req.atomic_last; req.real_addr := ra; -- Force data to 0 for dcbz if r0.req.dcbz = '1' then @@ -1528,6 +1564,11 @@ begin if req_op_load_miss = '1' or (r0.req.dcbz = '1' and req_is_hit = '0') then r1.choose_victim <= '1'; end if; + if req_go = '1' then + r1.prev_hit <= req_is_hit; + r1.prev_way <= req_hit_way; + r1.prev_hit_reload <= req_hit_reload; + end if; -- Update count of pending acks acks := r1.acks_pending; @@ -1549,6 +1590,7 @@ begin r1.wb.sel <= req.byte_sel; r1.wb.dat <= req.data; r1.dcbz <= req.dcbz; + r1.atomic_more <= not req.last_dw; -- Keep track of our index and way for subsequent stores. 
r1.store_index <= get_index(req.real_addr); @@ -1659,7 +1701,7 @@ begin assert not is_X(r1.req.real_addr); end if; if r1.full = '1' and r1.req.same_tag = '1' and - ((r1.dcbz = '1' and req.dcbz = '1') or r1.req.op_lmiss = '1') and + ((r1.dcbz = '1' and r1.req.dcbz = '1') or r1.req.op_lmiss = '1') and r1.store_row = get_row(r1.req.real_addr) then r1.full <= '0'; r1.slow_valid <= '1'; @@ -1668,12 +1710,9 @@ begin else r1.mmu_done <= '1'; end if; - -- NB: for lqarx, set the reservation on the first - -- dword so that a snooped store between the two - -- dwords will kill the reservation. - if req.reserve = '1' and req.first_dw = '1' then + -- NB: for lqarx, set the reservation on the first dword + if r1.req.reserve = '1' and r1.req.first_dw = '1' then reservation.valid <= '1'; - reservation.addr <= req.real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); end if; end if; @@ -1690,6 +1729,10 @@ begin cache_valids(to_integer(r1.store_index))(to_integer(r1.store_way)) <= '1'; ev.dcache_refill <= not r1.dcbz; + -- Second half of a lq/lqarx can assume a hit on this line now + -- if the first half hit this line. + r1.prev_hit <= r1.prev_hit_reload; + r1.prev_way <= r1.store_way; r1.state <= IDLE; end if; @@ -1703,6 +1746,10 @@ begin if wishbone_in.stall = '0' then -- See if there is another store waiting to be done -- which is in the same real page. + -- This could be either in r1.req or in r0. + -- Ignore store-conditionals, they have to go through + -- DO_STCX state, unless they are the second half of a + -- successful stqcx, which is handled here. 
if req.valid = '1' then r1.wb.adr(SET_SIZE_BITS - ROW_OFF_BITS - 1 downto 0) <= req.real_addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS); @@ -1710,28 +1757,33 @@ begin r1.wb.sel <= req.byte_sel; end if; assert not is_X(acks); - if acks < 7 and req.same_tag = '1' and req.dcbz = '0' and - req.op_store = '1' then - r1.wb.stb <= '1'; - stbs_done := false; - r1.store_way <= req.hit_way; - r1.store_row <= get_row(req.real_addr); - r1.write_bram <= req.is_hit; - r1.full <= '0'; - r1.slow_valid <= '1'; - -- Store requests never come from the MMU - r1.ls_valid <= '1'; - stbs_done := false; + r1.wb.stb <= '0'; + if req.op_store = '1' and req.same_tag = '1' and req.dcbz = '0' and + (req.reserve = '0' or r1.atomic_more = '1') then + if acks < 7 then + r1.wb.stb <= '1'; + stbs_done := false; + r1.store_way <= req.hit_way; + r1.store_row <= get_row(req.real_addr); + r1.write_bram <= req.is_hit; + r1.atomic_more <= not req.last_dw; + r1.full <= '0'; + r1.slow_valid <= '1'; + -- Store requests never come from the MMU + r1.ls_valid <= '1'; + end if; else - r1.wb.stb <= '0'; stbs_done := true; + if req.valid = '1' then + r1.atomic_more <= '0'; + end if; end if; end if; -- Got ack ? See if complete. - if wishbone_in.ack = '1' then + if stbs_done and r1.atomic_more = '0' then assert not is_X(acks); - if stbs_done and acks = 1 then + if acks = 0 or (wishbone_in.ack = '1' and acks = 1) then r1.state <= IDLE; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; @@ -1770,31 +1822,30 @@ begin r1.wb.cyc <= '0'; r1.wb.stb <= '0'; reservation.valid <= '0'; + -- If this is the first half of a stqcx., the second half + -- will fail also because the reservation is not valid. 
+ r1.state <= IDLE; elsif r1.wb.cyc = '0' then -- Right address and have reservation, so start the -- wishbone cycle r1.wb.we <= '1'; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; - else - if wishbone_in.stall = '0' then - -- Store has been accepted, so now we can write the - -- cache data RAM - r1.write_bram <= req.is_hit; - r1.wb.stb <= '0'; - end if; - if wishbone_in.ack = '1' then - r1.state <= IDLE; - r1.wb.cyc <= '0'; - r1.wb.stb <= '0'; - r1.full <= '0'; - r1.slow_valid <= '1'; - r1.ls_valid <= '1'; - -- For stqcx., kill the reservation on the last dword - if r1.req.last_dw = '1' then - reservation.valid <= '0'; - end if; - end if; + elsif r1.wb.stb = '1' and wishbone_in.stall = '0' then + -- Store has been accepted, so now we can write the + -- cache data RAM and complete the request + r1.write_bram <= r1.req.is_hit; + r1.wb.stb <= '0'; + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.ls_valid <= '1'; + reservation.valid <= '0'; + -- For a stqcx, STORE_WAIT_ACK will issue the second half + -- without checking the reservation, which is what we want + -- given that the first half has gone out. + -- With r1.atomic_more set, STORE_WAIT_ACK won't exit to + -- IDLE state until it sees the second half. + r1.state <= STORE_WAIT_ACK; end if; when FLUSH_CYCLE =>