diff --git a/common.vhdl b/common.vhdl index 84bbc47..4b879a1 100644 --- a/common.vhdl +++ b/common.vhdl @@ -229,6 +229,8 @@ package common is xerc : xer_common_t; reserve : std_ulogic; rc : std_ulogic; + early_low_addr : std_ulogic_vector(11 downto 0); + early_valid : std_ulogic; end record; type DcacheToWritebackType is record diff --git a/dcache.vhdl b/dcache.vhdl index 75b10c7..265022b 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -124,6 +124,7 @@ architecture rtl of dcache is -- Cache state machine type state_t is (IDLE, -- Normal load hit processing + PRE_NEXT_DWORD, -- One-cycle state that only steps to NEXT_DWORD (presumably lets the early BRAM row address switch to the 2nd dword; TODO confirm) NEXT_DWORD, -- Starting the 2nd xfer of misaligned LOAD_UPDATE, -- Load with update extra cycle LOAD_UPDATE2, -- Load with update extra cycle @@ -184,24 +185,6 @@ architecture rtl of dcache is signal r1 : reg_stage_1_t; - -- Second stage register, only used for load hits - -- - type reg_stage_2_t is record - hit_way : way_t; - hit_load_valid : std_ulogic; - load_is_update : std_ulogic; - load_reg : std_ulogic_vector(4 downto 0); - data_shift : std_ulogic_vector(2 downto 0); - length : std_ulogic_vector(3 downto 0); - sign_extend : std_ulogic; - byte_reverse : std_ulogic; - xerc : xer_common_t; - last_dword : std_ulogic; - second_dword : std_ulogic; - end record; - - signal r2 : reg_stage_2_t; - -- Reservation information -- type reservation_t is record @@ -221,6 +204,10 @@ architecture rtl of dcache is signal req_addr : std_ulogic_vector(63 downto 0); signal req_laddr : std_ulogic_vector(63 downto 0); signal req_sel : std_ulogic_vector(7 downto 0); + signal next_addr : std_ulogic_vector(63 downto 0); + + signal early_req_addr : std_ulogic_vector(11 downto 0); + signal early_req_row : row_t; signal cancel_store : std_ulogic; signal set_rsrv : std_ulogic; @@ -404,6 +391,12 @@ begin end generate; end generate; + -- Generate the sel bits used for wishbone reads/writes and BRAM writes + bus_sel <= wishbone_data_sel(d_in.length, d_in.addr); + + -- See if the operation crosses two
doublewords + two_dwords <= or (bus_sel(15 downto 8)); + -- Cache request parsing and hit detection dcache_request : process(all) variable is_hit : std_ulogic; @@ -444,6 +437,9 @@ begin req_laddr <= req_addr(63 downto LINE_OFF_BITS) & (LINE_OFF_BITS-1 downto 0 => '0'); + -- Address of the next doubleword (d_in.addr rounded down to 8 bytes, plus 8), used for accesses that cross a doubleword boundary + next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000"; + -- Test if pending request is a hit on any way hit_way := 0; is_hit := '0'; @@ -480,17 +476,21 @@ begin req_op <= op; + -- Versions of the address and row number that are valid one cycle earlier + -- in the cases where we need to read the cache data BRAM: the next-dword address while the 2nd half of a two-dword access is still pending, else the early address from d_in. + if r1.state = IDLE and op = OP_LOAD_HIT and two_dwords = '1' then + early_req_addr <= next_addr(11 downto 0); + elsif r1.state /= IDLE and r1.two_dwords = '1' and r1.second_dword = '0' then + early_req_addr <= r1.next_addr(11 downto 0); + else + early_req_addr <= d_in.early_low_addr; + end if; + early_req_row <= get_row(x"0000000000000" & early_req_addr); end process; -- Wire up wishbone request latch out of stage 1 wishbone_out <= r1.wb; - -- Wishbone read and write and BRAM write sel bits generation - bus_sel <= wishbone_data_sel(d_in.length, d_in.addr); - - -- See if the operation crosses two doublewords - two_dwords <= or (bus_sel(15 downto 8)); - -- TODO: Generate errors -- err_nc_collision <= '1' when req_op = OP_BAD else '0'; @@ -540,14 +540,14 @@ begin -- The mux on d_out.write reg defaults to the normal load hit case.
d_out.write_enable <= '0'; d_out.valid <= '0'; - d_out.write_reg <= r2.load_reg; - d_out.write_data <= cache_out(r2.hit_way); - d_out.write_len <= r2.length; - d_out.write_shift <= r2.data_shift; - d_out.sign_extend <= r2.sign_extend; - d_out.byte_reverse <= r2.byte_reverse; - d_out.second_word <= r2.second_dword; - d_out.xerc <= r2.xerc; + d_out.write_reg <= r1.req.write_reg; + d_out.write_data <= cache_out(r1.hit_way); + d_out.write_len <= r1.req.length; + d_out.write_shift <= r1.req.addr(2 downto 0); + d_out.sign_extend <= r1.req.sign_extend; + d_out.byte_reverse <= r1.req.byte_reverse; + d_out.second_word <= r1.second_dword; + d_out.xerc <= r1.req.xerc; d_out.rc <= '0'; -- loads never have rc=1 d_out.store_done <= '0'; @@ -562,26 +562,27 @@ begin -- -- Sanity: Only one of these must be set in any given cycle - assert (r1.update_valid and r2.hit_load_valid) /= '1' report + assert (r1.update_valid and r1.hit_load_valid) /= '1' report "unexpected hit_load_delayed collision with update_valid" severity FAILURE; assert (r1.slow_valid and r1.stcx_fail) /= '1' report "unexpected slow_valid collision with stcx_fail" severity FAILURE; - assert ((r1.slow_valid or r1.stcx_fail) and r2.hit_load_valid) /= '1' report + assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid) /= '1' report "unexpected hit_load_delayed collision with slow_valid" severity FAILURE; assert ((r1.slow_valid or r1.stcx_fail) and r1.update_valid) /= '1' report "unexpected update_valid collision with slow_valid or stcx_fail" severity FAILURE; - -- Delayed load hit case is the standard path - if r2.hit_load_valid = '1' then + -- Load hit case is the standard path; its outputs are taken from r1 and cache_out(r1.hit_way) + if r1.hit_load_valid = '1' then d_out.write_enable <= '1'; -- If there isn't another dword to go and -- it's not a load with update, complete it now - if r2.last_dword = '1' and r2.load_is_update = '0' then + if (r1.second_dword or not r1.two_dwords) = '1' and + r1.req.update = '0' then report "completing load hit"; d_out.valid <= '1';
end if; @@ -693,7 +694,7 @@ begin begin -- Cache hit reads do_read <= '1'; - rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); + rd_addr <= std_ulogic_vector(to_unsigned(early_req_row, ROW_BITS)); cache_out(i) <= dout; -- Write mux: @@ -732,23 +733,11 @@ begin -- -- Cache hit synchronous machine for the easy case. This handles - -- non-update form load hits and stage 1 to stage 2 transfers + -- non-update form load hits -- dcache_fast_hit : process(clk) begin if rising_edge(clk) then - -- stage 1 -> stage 2 - r2.hit_load_valid <= r1.hit_load_valid; - r2.hit_way <= r1.hit_way; - r2.load_is_update <= r1.req.update; - r2.load_reg <= r1.req.write_reg; - r2.data_shift <= r1.req.addr(2 downto 0); - r2.length <= r1.req.length; - r2.sign_extend <= r1.req.sign_extend; - r2.byte_reverse <= r1.req.byte_reverse; - r2.second_dword <= r1.second_dword; - r2.last_dword <= r1.second_dword or not r1.two_dwords; - -- If we have a request incoming, we have to latch it as d_in.valid -- is only set for a single cycle. It's up to the control logic to -- ensure we don't override an uncompleted request (for now we are @@ -759,7 +748,7 @@ begin r1.req <= d_in; r1.second_dword <= '0'; r1.two_dwords <= two_dwords; - r1.next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000"; + r1.next_addr <= next_addr; r1.next_sel <= bus_sel(15 downto 8); report "op:" & op_t'image(req_op) & @@ -912,6 +901,9 @@ begin when OP_BAD => end case; + when PRE_NEXT_DWORD => + r1.state <= NEXT_DWORD; -- no other work is done in this state + when RELOAD_WAIT_ACK => -- Requests are all sent if stb is 0 stbs_done := r1.wb.stb = '0'; @@ -958,7 +950,7 @@ begin -- we also need to do the deferred update cycle.
r1.slow_valid <= '1'; if r1.two_dwords and not r1.second_dword then - r1.state <= NEXT_DWORD; + r1.state <= PRE_NEXT_DWORD; elsif r1.req.update = '1' then r1.state <= LOAD_UPDATE2; report "completing miss with load-update !"; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index a0c0beb..a25e617 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -89,5 +89,9 @@ begin -- Update outputs l_out <= r; + + -- Early outputs driven directly from lsu_sum and l_in.valid rather than from r (the low-order address bits are latched in dcache) + l_out.early_low_addr <= lsu_sum(11 downto 0); + l_out.early_valid <= l_in.valid; end process; end;