From 81d777be02a1052433bad993063c0f50c13d8131 Mon Sep 17 00:00:00 2001
From: Paul Mackerras
Date: Wed, 4 Mar 2020 16:20:05 +1100
Subject: [PATCH] dcache: Trim one cycle from the load hit path

Currently we don't get the result from a load that hits in the dcache
until the fourth cycle after the instruction was presented to
loadstore1.  This trims it back to 3 cycles by taking the low-order
bits of the address generated in loadstore1 into dcache directly (not
via the output register of loadstore1) and using them to address the
read port of the dcache data RAM.  We use the lower 12 address bits
here in the expectation that any reasonable data cache design will
have a set size of 4kB or less, in order to avoid the aliasing
problems that can arise with a virtually-indexed physically-tagged
cache if the set size is greater than the smallest page size provided
by the MMU.

With this we can get rid of r2 and drive the signals going to
writeback from r1, since the load hit data is now available one cycle
earlier.  We need a multiplexer on the read address of the data cache
RAM in order to handle the second doubleword of an unaligned access.

One small complication is that we now need an extra cycle in the case
of an unaligned load which misses in the data cache and which reads
the 2nd-last and last doublewords of a cache line.  This is the
reason for the PRE_NEXT_DWORD state: if we went straight to
NEXT_DWORD, the write of the last doubleword of the cache line and
the read of that same doubleword would occur in the same cycle, and
we would read stale data rather than the just-fetched data.

Signed-off-by: Paul Mackerras
---
 common.vhdl     |  2 +
 dcache.vhdl     | 98 +++++++++++++++++++++++--------------------
 loadstore1.vhdl |  4 ++
 3 files changed, 51 insertions(+), 53 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 84bbc47..4b879a1 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -229,6 +229,8 @@ package common is
         xerc : xer_common_t;
         reserve : std_ulogic;
         rc : std_ulogic;
+        early_low_addr : std_ulogic_vector(11 downto 0);
+        early_valid : std_ulogic;
     end record;
 
     type DcacheToWritebackType is record
diff --git a/dcache.vhdl b/dcache.vhdl
index 75b10c7..265022b 100644
--- a/dcache.vhdl
+++ b/dcache.vhdl
@@ -124,6 +124,7 @@ architecture rtl of dcache is
 
     -- Cache state machine
     type state_t is (IDLE,             -- Normal load hit processing
+                     PRE_NEXT_DWORD,   -- Extra state before NEXT_DWORD
                      NEXT_DWORD,       -- Starting the 2nd xfer of misaligned
                      LOAD_UPDATE,      -- Load with update extra cycle
                      LOAD_UPDATE2,     -- Load with update extra cycle
@@ -184,24 +185,6 @@ architecture rtl of dcache is
 
     signal r1 : reg_stage_1_t;
 
-    -- Second stage register, only used for load hits
-    --
-    type reg_stage_2_t is record
-        hit_way        : way_t;
-        hit_load_valid : std_ulogic;
-        load_is_update : std_ulogic;
-        load_reg       : std_ulogic_vector(4 downto 0);
-        data_shift     : std_ulogic_vector(2 downto 0);
-        length         : std_ulogic_vector(3 downto 0);
-        sign_extend    : std_ulogic;
-        byte_reverse   : std_ulogic;
-        xerc           : xer_common_t;
-        last_dword     : std_ulogic;
-        second_dword   : std_ulogic;
-    end record;
-
-    signal r2 : reg_stage_2_t;
-
     -- Reservation information
     --
     type reservation_t is record
@@ -221,6 +204,10 @@ architecture rtl of dcache is
     signal req_addr   : std_ulogic_vector(63 downto 0);
     signal req_laddr  : std_ulogic_vector(63 downto 0);
     signal req_sel    : std_ulogic_vector(7 downto 0);
+    signal next_addr  : std_ulogic_vector(63 downto 0);
+
+    signal early_req_addr : std_ulogic_vector(11 downto 0);
+    signal early_req_row  : row_t;
 
     signal cancel_store : std_ulogic;
     signal set_rsrv     : std_ulogic;
@@ -404,6 +391,12 @@ begin
         end generate;
     end generate;
 
+    -- Wishbone read and write and BRAM write sel bits generation
+    bus_sel <= wishbone_data_sel(d_in.length, d_in.addr);
+
+    -- See if the operation crosses two doublewords
+    two_dwords <= or (bus_sel(15 downto 8));
+
     -- Cache request parsing and hit detection
     dcache_request : process(all)
         variable is_hit  : std_ulogic;
@@ -444,6 +437,9 @@ begin
         req_laddr <= req_addr(63 downto LINE_OFF_BITS) &
                      (LINE_OFF_BITS-1 downto 0 => '0');
 
+        -- Address of next doubleword, used for unaligned accesses
+        next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000";
+
         -- Test if pending request is a hit on any way
         hit_way := 0;
         is_hit := '0';
@@ -480,17 +476,21 @@ begin
 
         req_op <= op;
 
+        -- Versions of the address and row number that are valid one cycle earlier
+        -- in the cases where we need to read the cache data BRAM.
+        if r1.state = IDLE and op = OP_LOAD_HIT and two_dwords = '1' then
+            early_req_addr <= next_addr(11 downto 0);
+        elsif r1.state /= IDLE and r1.two_dwords = '1' and r1.second_dword = '0' then
+            early_req_addr <= r1.next_addr(11 downto 0);
+        else
+            early_req_addr <= d_in.early_low_addr;
+        end if;
+        early_req_row <= get_row(x"0000000000000" & early_req_addr);
     end process;
 
     -- Wire up wishbone request latch out of stage 1
     wishbone_out <= r1.wb;
 
-    -- Wishbone read and write and BRAM write sel bits generation
-    bus_sel <= wishbone_data_sel(d_in.length, d_in.addr);
-
-    -- See if the operation crosses two doublewords
-    two_dwords <= or (bus_sel(15 downto 8));
-
     -- TODO: Generate errors
     -- err_nc_collision <= '1' when req_op = OP_BAD else '0';
 
@@ -540,14 +540,14 @@ begin
         -- The mux on d_out.write reg defaults to the normal load hit case.
         d_out.write_enable <= '0';
         d_out.valid <= '0';
-        d_out.write_reg <= r2.load_reg;
-        d_out.write_data <= cache_out(r2.hit_way);
-        d_out.write_len <= r2.length;
-        d_out.write_shift <= r2.data_shift;
-        d_out.sign_extend <= r2.sign_extend;
-        d_out.byte_reverse <= r2.byte_reverse;
-        d_out.second_word <= r2.second_dword;
-        d_out.xerc <= r2.xerc;
+        d_out.write_reg <= r1.req.write_reg;
+        d_out.write_data <= cache_out(r1.hit_way);
+        d_out.write_len <= r1.req.length;
+        d_out.write_shift <= r1.req.addr(2 downto 0);
+        d_out.sign_extend <= r1.req.sign_extend;
+        d_out.byte_reverse <= r1.req.byte_reverse;
+        d_out.second_word <= r1.second_dword;
+        d_out.xerc <= r1.req.xerc;
         d_out.rc <= '0'; -- loads never have rc=1
         d_out.store_done <= '0';
 
@@ -562,26 +562,27 @@ begin
         --
 
         -- Sanity: Only one of these must be set in any given cycle
-        assert (r1.update_valid and r2.hit_load_valid) /= '1' report
+        assert (r1.update_valid and r1.hit_load_valid) /= '1' report
             "unexpected hit_load_delayed collision with update_valid"
             severity FAILURE;
         assert (r1.slow_valid and r1.stcx_fail) /= '1' report
             "unexpected slow_valid collision with stcx_fail"
             severity FAILURE;
-        assert ((r1.slow_valid or r1.stcx_fail) and r2.hit_load_valid) /= '1' report
+        assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid) /= '1' report
             "unexpected hit_load_delayed collision with slow_valid"
             severity FAILURE;
         assert ((r1.slow_valid or r1.stcx_fail) and r1.update_valid) /= '1' report
             "unexpected update_valid collision with slow_valid or stcx_fail"
             severity FAILURE;
 
-        -- Delayed load hit case is the standard path
-        if r2.hit_load_valid = '1' then
+        -- Load hit case is the standard path
+        if r1.hit_load_valid = '1' then
             d_out.write_enable <= '1';
 
             -- If there isn't another dword to go and
             -- it's not a load with update, complete it now
-            if r2.last_dword = '1' and r2.load_is_update = '0' then
+            if (r1.second_dword or not r1.two_dwords) = '1' and
+                r1.req.update = '0' then
                 report "completing load hit";
                 d_out.valid <= '1';
             end if;
@@ -693,7 +694,7 @@ begin
     begin
         -- Cache hit reads
         do_read <= '1';
-        rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
+        rd_addr <= std_ulogic_vector(to_unsigned(early_req_row, ROW_BITS));
         cache_out(i) <= dout;
 
         -- Write mux:
@@ -732,23 +733,11 @@ begin
 
     --
     -- Cache hit synchronous machine for the easy case. This handles
-    -- non-update form load hits and stage 1 to stage 2 transfers
+    -- non-update form load hits
    --
     dcache_fast_hit : process(clk)
     begin
         if rising_edge(clk) then
-            -- stage 1 -> stage 2
-            r2.hit_load_valid <= r1.hit_load_valid;
-            r2.hit_way <= r1.hit_way;
-            r2.load_is_update <= r1.req.update;
-            r2.load_reg <= r1.req.write_reg;
-            r2.data_shift <= r1.req.addr(2 downto 0);
-            r2.length <= r1.req.length;
-            r2.sign_extend <= r1.req.sign_extend;
-            r2.byte_reverse <= r1.req.byte_reverse;
-            r2.second_dword <= r1.second_dword;
-            r2.last_dword <= r1.second_dword or not r1.two_dwords;
-
             -- If we have a request incoming, we have to latch it as d_in.valid
             -- is only set for a single cycle. It's up to the control logic to
             -- ensure we don't override an uncompleted request (for now we are
@@ -759,7 +748,7 @@ begin
                 r1.req <= d_in;
                 r1.second_dword <= '0';
                 r1.two_dwords <= two_dwords;
-                r1.next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000";
+                r1.next_addr <= next_addr;
                 r1.next_sel <= bus_sel(15 downto 8);
 
                 report "op:" & op_t'image(req_op) &
@@ -912,6 +901,9 @@ begin
                 when OP_BAD =>
                 end case;
 
+            when PRE_NEXT_DWORD =>
+                r1.state <= NEXT_DWORD;
+
             when RELOAD_WAIT_ACK =>
                 -- Requests are all sent if stb is 0
                 stbs_done := r1.wb.stb = '0';
@@ -958,7 +950,7 @@ begin
                         -- we also need to do the deferred update cycle.
                         r1.slow_valid <= '1';
                         if r1.two_dwords and not r1.second_dword then
-                            r1.state <= NEXT_DWORD;
+                            r1.state <= PRE_NEXT_DWORD;
                         elsif r1.req.update = '1' then
                             r1.state <= LOAD_UPDATE2;
                             report "completing miss with load-update !";
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index a0c0beb..a25e617 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -89,5 +89,9 @@ begin
 
         -- Update outputs
         l_out <= r;
+
+        -- Asynchronous output of the low-order address bits (latched in dcache)
+        l_out.early_low_addr <= lsu_sum(11 downto 0);
+        l_out.early_valid <= l_in.valid;
     end process;
 end;
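
Note on the set-size argument in the commit message above: forwarding the
12 low-order address bits to the data RAM before translation is only safe
because those bits lie entirely within the page offset, so they are the same
in the virtual and physical address.  The following is a minimal,
self-contained VHDL sketch of that check, not part of the patch; the entity
name and the cache geometry constants (LINE_SIZE, NUM_LINES) are illustrative
assumptions rather than values taken from this design.

    -- Sketch only: asserts the VIPT constraint that the set size (bytes per
    -- way) fits within the smallest page size, so every index bit used for
    -- the early BRAM read is an untranslated page-offset bit.
    entity vipt_index_check is
    end entity;

    architecture sim of vipt_index_check is
        constant LINE_SIZE     : natural := 64;    -- assumed bytes per cache line
        constant NUM_LINES     : natural := 32;    -- assumed lines per way
        constant SET_SIZE      : natural := LINE_SIZE * NUM_LINES;
        constant MIN_PAGE_SIZE : natural := 4096;  -- smallest MMU page size
    begin
        -- If the set size exceeded the smallest page size, some index bits
        -- would come from translated virtual address bits, and two virtual
        -- aliases of one physical line could land in different sets.
        assert SET_SIZE <= MIN_PAGE_SIZE
            report "set size exceeds smallest page size; early 12-bit index unsafe"
            severity failure;
    end architecture;

With the assumed geometry the set size is 2kB, comfortably inside a 4kB page,
which matches the "set size of 4kB or less" expectation stated above.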