dcache: Cleanup (mostly cosmetic)

Clearly separate the 2 stages of load hits, improve naming and
comments, clarify the writeback controls etc...

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
pull/114/head
Benjamin Herrenschmidt 5 years ago
parent 265fbf894b
commit 587a5e3c45

@ -36,6 +36,8 @@ entity dcache is
d_in : in Loadstore1ToDcacheType; d_in : in Loadstore1ToDcacheType;
d_out : out DcacheToWritebackType; d_out : out DcacheToWritebackType;


stall_out : out std_ulogic;

wishbone_out : out wishbone_master_out; wishbone_out : out wishbone_master_out;
wishbone_in : in wishbone_slave_out wishbone_in : in wishbone_slave_out
); );
@ -147,31 +149,39 @@ architecture rtl of dcache is
STORE_WAIT_ACK, -- Store wait ack STORE_WAIT_ACK, -- Store wait ack
NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack


type reg_internal_t is record
req_latch : Loadstore1ToDcacheType; --
-- Dcache operations:
-- Cache hit state (Latches for 1 cycle BRAM access) --
-- In order to make timing, we use the BRAMs with an output buffer,
-- which means that the BRAM output is delayed by an extra cycle.
--
-- Thus, the dcache has a 2-stage internal pipeline for cache hits
-- with no stalls.
--
-- All other operations are handled via stalling in the first stage.
--
-- The second stage can thus complete a hit at the same time as the
-- first stage emits a stall for a complex op.
--

-- First stage register, contains state for stage 1 of load hits
-- and for the state machine used by all other operations
--
type reg_stage_1_t is record
-- Latch the complete request from ls1
req : Loadstore1ToDcacheType;

-- Cache hit state
hit_way : way_t; hit_way : way_t;
hit_load_valid : std_ulogic; hit_load_valid : std_ulogic;


-- 1-cycle delayed signals to account for the BRAM extra
-- buffer that seems necessary to make timing on load hits
--
hit_way_delayed : way_t;
hit_load_delayed : std_ulogic;
hit_load_upd_delayed : std_ulogic;
hit_load_reg_delayed : std_ulogic_vector(4 downto 0);
hit_data_shift_delayed : std_ulogic_vector(2 downto 0);
hit_dlength_delayed : std_ulogic_vector(3 downto 0);
hit_sign_ext_delayed : std_ulogic;
hit_byte_rev_delayed : std_ulogic;

-- Register update (load/store with update) -- Register update (load/store with update)
update_valid : std_ulogic; update_valid : std_ulogic;


-- Data buffer for "slow" read ops (load miss and NC loads). -- Data buffer for "slow" read ops (load miss and NC loads).
slow_data : std_ulogic_vector(63 downto 0); slow_data : std_ulogic_vector(63 downto 0);
slow_valid : std_ulogic; slow_valid : std_ulogic;


-- Cache miss state (reload state machine) -- Cache miss state (reload state machine)
state : state_t; state : state_t;
@ -180,7 +190,22 @@ architecture rtl of dcache is
store_index : index_t; store_index : index_t;
end record; end record;


signal r : reg_internal_t; signal r1 : reg_stage_1_t;

-- Second stage register, only used for load hits
--
type reg_stage_2_t is record
-- Every field is loaded on each rising clock edge from the stage 1
-- register (r1) and supplies the default (load hit) writeback outputs.

-- Copy of r1.hit_way; indexes cache_out to pick d_out.write_data
hit_way : way_t;
-- Copy of r1.hit_load_valid; when '1' the writeback logic raises
-- d_out.write_enable for the load hit
hit_load_valid : std_ulogic;
-- Copy of r1.req.update; when '0' the load hit also raises d_out.valid,
-- otherwise completion is left to the later register update
load_is_update : std_ulogic;
-- Copy of r1.req.write_reg; default source of d_out.write_reg
load_reg : std_ulogic_vector(4 downto 0);
-- Copy of r1.req.addr(2 downto 0); default source of d_out.write_shift
data_shift : std_ulogic_vector(2 downto 0);
-- Copy of r1.req.length; default source of d_out.write_len
length : std_ulogic_vector(3 downto 0);
-- Copy of r1.req.sign_extend; default source of d_out.sign_extend
sign_extend : std_ulogic;
-- Copy of r1.req.byte_reverse; default source of d_out.byte_reverse
byte_reverse : std_ulogic;
end record;

signal r2 : reg_stage_2_t;


-- Async signals on incoming request -- Async signals on incoming request
signal req_index : index_t; signal req_index : index_t;
@ -201,6 +226,10 @@ architecture rtl of dcache is
signal bus_sel : wishbone_sel_type; signal bus_sel : wishbone_sel_type;
signal store_data : wishbone_data_type; signal store_data : wishbone_data_type;
--
-- Helper functions to decode incoming requests
--

-- Return the cache line index (tag index) for an address -- Return the cache line index (tag index) for an address
function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is
begin begin
@ -384,16 +413,29 @@ begin


req_op <= op; req_op <= op;


-- XXX GENERATE ERRORS end process;
-- err_nc_collision <= '1' when op = OP_BAD else '0';


-- XXX Generate stalls --
-- stall_out <= r.state /= IDLE ? -- Misc signal assignments
--


end process; -- Wire up wishbone request latch out of stage 1
wishbone_out <= r1.wb;

-- Wishbone & BRAM write data formatting for stores (most of it already
-- happens in loadstore1, this is the remaining data shifting)
--
store_data <= std_logic_vector(shift_left(unsigned(d_in.data),
wishbone_data_shift(d_in.addr)));

-- Wishbone read and write and BRAM write sel bits generation
bus_sel <= wishbone_data_sel(d_in.length, d_in.addr);

-- TODO: Generate errors
-- err_nc_collision <= '1' when req_op = OP_BAD else '0';


-- Wire up wishbone request latch -- Generate stalls from stage 1 state machine
wishbone_out <= r.wb; stall_out <= '1' when r1.state /= IDLE else '0';


-- Writeback (loads and reg updates) & completion control logic -- Writeback (loads and reg updates) & completion control logic
-- --
@ -403,12 +445,12 @@ begin
-- The mux on d_out.write reg defaults to the normal load hit case. -- The mux on d_out.write reg defaults to the normal load hit case.
d_out.write_enable <= '0'; d_out.write_enable <= '0';
d_out.valid <= '0'; d_out.valid <= '0';
d_out.write_reg <= r.hit_load_reg_delayed; d_out.write_reg <= r2.load_reg;
d_out.write_data <= cache_out(r.hit_way_delayed); d_out.write_data <= cache_out(r2.hit_way);
d_out.write_len <= r.hit_dlength_delayed; d_out.write_len <= r2.length;
d_out.write_shift <= r.hit_data_shift_delayed; d_out.write_shift <= r2.data_shift;
d_out.sign_extend <= r.hit_sign_ext_delayed; d_out.sign_extend <= r2.sign_extend;
d_out.byte_reverse <= r.hit_byte_rev_delayed; d_out.byte_reverse <= r2.byte_reverse;
d_out.second_word <= '0'; d_out.second_word <= '0';


-- We have a valid load or store hit or we just completed a slow -- We have a valid load or store hit or we just completed a slow
@ -422,60 +464,60 @@ begin
-- --


-- Sanity: Only one of these must be set in any given cycle -- Sanity: Only one of these must be set in any given cycle
assert (r.update_valid and r.hit_load_delayed) /= '1' report assert (r1.update_valid and r2.hit_load_valid) /= '1' report
"unexpected hit_load_delayed collision with update_valid" "unexpected hit_load_delayed collision with update_valid"
severity FAILURE; severity FAILURE;
assert (r.slow_valid and r.hit_load_delayed) /= '1' report assert (r1.slow_valid and r2.hit_load_valid) /= '1' report
"unexpected hit_load_delayed collision with slow_valid" "unexpected hit_load_delayed collision with slow_valid"
severity FAILURE; severity FAILURE;
assert (r.slow_valid and r.update_valid) /= '1' report assert (r1.slow_valid and r1.update_valid) /= '1' report
"unexpected update_valid collision with slow_valid" "unexpected update_valid collision with slow_valid"
severity FAILURE; severity FAILURE;


-- Delayed load hit case is the standard path -- Delayed load hit case is the standard path
if r.hit_load_delayed = '1' then if r2.hit_load_valid = '1' then
d_out.write_enable <= '1'; d_out.write_enable <= '1';


-- If it's not a load with update, complete it now -- If it's not a load with update, complete it now
if r.hit_load_upd_delayed = '0' then if r2.load_is_update = '0' then
d_out.valid <= '1'; d_out.valid <= '1';
end if; end if;
end if; end if;


-- Slow ops (load miss, NC, stores) -- Slow ops (load miss, NC, stores)
if r.slow_valid = '1' then if r1.slow_valid = '1' then
-- If it's a load, enable register writeback and switch -- If it's a load, enable register writeback and switch
-- mux accordingly -- mux accordingly
-- --
if r.req_latch.load then if r1.req.load then
d_out.write_reg <= r.req_latch.write_reg; d_out.write_reg <= r1.req.write_reg;
d_out.write_enable <= '1'; d_out.write_enable <= '1';


-- Read data comes from the slow data latch, formatter -- Read data comes from the slow data latch, formatter
-- from the latched request. -- from the latched request.
-- --
d_out.write_data <= r.slow_data; d_out.write_data <= r1.slow_data;
d_out.write_shift <= r.req_latch.addr(2 downto 0); d_out.write_shift <= r1.req.addr(2 downto 0);
d_out.sign_extend <= r.req_latch.sign_extend; d_out.sign_extend <= r1.req.sign_extend;
d_out.byte_reverse <= r.req_latch.byte_reverse; d_out.byte_reverse <= r1.req.byte_reverse;
d_out.write_len <= r.req_latch.length; d_out.write_len <= r1.req.length;
end if; end if;


-- If it's a store or a non-update load form, complete now -- If it's a store or a non-update load form, complete now
if r.req_latch.load = '0' or r.req_latch.update = '0' then if r1.req.load = '0' or r1.req.update = '0' then
d_out.valid <= '1'; d_out.valid <= '1';
end if; end if;
end if; end if;


-- We have a register update to do. -- We have a register update to do.
if r.update_valid = '1' then if r1.update_valid = '1' then
d_out.write_enable <= '1'; d_out.write_enable <= '1';
d_out.write_reg <= r.req_latch.update_reg; d_out.write_reg <= r1.req.update_reg;


-- Change the read data mux to the address that's going into -- Change the read data mux to the address that's going into
-- the register and the formatter does nothing. -- the register and the formatter does nothing.
-- --
d_out.write_data <= r.req_latch.addr; d_out.write_data <= r1.req.addr;
d_out.write_shift <= "000"; d_out.write_shift <= "000";
d_out.write_len <= "1000"; d_out.write_len <= "1000";
d_out.sign_extend <= '0'; d_out.sign_extend <= '0';
@ -484,26 +526,14 @@ begin
-- If it was a load, this completes the operation (load with -- If it was a load, this completes the operation (load with
-- update case). -- update case).
-- --
if r.req_latch.load = '1' then if r1.req.load = '1' then
d_out.valid <= '1'; d_out.valid <= '1';
end if; end if;
end if; end if;


end process; end process;


-- Misc data & sel signals --
misc: process(d_in)
begin
    -- Select (byte enable) bits, shared by wishbone reads and writes and
    -- by BRAM writes; computed from the length and address of the
    -- incoming request.
    bus_sel <= wishbone_data_sel(d_in.length, d_in.addr);

    -- Store data for wishbone and BRAM writes. loadstore1 has already
    -- done most of the formatting; what remains is the left shift by the
    -- amount wishbone_data_shift returns for the request address.
    store_data <= std_logic_vector(shift_left(unsigned(d_in.data),
                                              wishbone_data_shift(d_in.addr)));
end process;

-- Generate a cache RAM for each way. This handles the normal -- Generate a cache RAM for each way. This handles the normal
-- reads, writes from reloads and the special store-hit update -- reads, writes from reloads and the special store-hit update
-- path as well. -- path as well.
@ -552,7 +582,7 @@ begin
-- For timing, the mux on wr_data/sel/addr is not dependent on anything -- For timing, the mux on wr_data/sel/addr is not dependent on anything
-- other than the current state. Only the do_write signal is. -- other than the current state. Only the do_write signal is.
-- --
if r.state = IDLE then if r1.state = IDLE then
-- When IDLE, the only write path is the store-hit update case -- When IDLE, the only write path is the store-hit update case
wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
wr_data <= store_data; wr_data <= store_data;
@ -561,41 +591,39 @@ begin
-- Otherwise, we might be doing a reload -- Otherwise, we might be doing a reload
wr_data <= wishbone_in.dat; wr_data <= wishbone_in.dat;
wr_sel <= (others => '1'); wr_sel <= (others => '1');
wr_addr <= std_ulogic_vector(to_unsigned(get_row(r.wb.adr), ROW_BITS)); wr_addr <= std_ulogic_vector(to_unsigned(get_row(r1.wb.adr), ROW_BITS));
end if; end if;


-- The two actual write cases here -- The two actual write cases here
do_write <= '0'; do_write <= '0';
if r.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r.store_way = i then if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r1.store_way = i then
do_write <= '1'; do_write <= '1';
end if; end if;
if req_op = OP_STORE_HIT and req_hit_way = i then if req_op = OP_STORE_HIT and req_hit_way = i then
assert r.state /= RELOAD_WAIT_ACK report "Store hit while in state:" & assert r1.state /= RELOAD_WAIT_ACK report "Store hit while in state:" &
state_t'image(r.state) state_t'image(r1.state)
severity FAILURE; severity FAILURE;
do_write <= '1'; do_write <= '1';
end if; end if;
end process; end process;
end generate; end generate;


--
-- Cache hit synchronous machine for the easy case. This handles -- Cache hit synchronous machine for the easy case. This handles
-- non-update form load hits. -- non-update form load hits and stage 1 to stage 2 transfers
-- --
dcache_fast_hit : process(clk) dcache_fast_hit : process(clk)
begin begin
if rising_edge(clk) then if rising_edge(clk) then
-- 1-cycle delayed signals for load hit response -- stage 1 -> stage 2
r.hit_load_delayed <= r.hit_load_valid; r2.hit_load_valid <= r1.hit_load_valid;
r.hit_way_delayed <= r.hit_way; r2.hit_way <= r1.hit_way;
r.hit_load_upd_delayed <= r.req_latch.update; r2.load_is_update <= r1.req.update;
r.hit_load_reg_delayed <= r.req_latch.write_reg; r2.load_reg <= r1.req.write_reg;
r.hit_data_shift_delayed <= r.req_latch.addr(2 downto 0); r2.data_shift <= r1.req.addr(2 downto 0);
r.hit_sign_ext_delayed <= r.req_latch.sign_extend; r2.length <= r1.req.length;
r.hit_byte_rev_delayed <= r.req_latch.byte_reverse; r2.sign_extend <= r1.req.sign_extend;
r.hit_dlength_delayed <= r.req_latch.length; r2.byte_reverse <= r1.req.byte_reverse;

-- One-cycle pulse values get reset on every cycle
r.hit_load_valid <= '0';


-- If we have a request incoming, we have to latch it as d_in.valid -- If we have a request incoming, we have to latch it as d_in.valid
-- is only set for a single cycle. It's up to the control logic to -- is only set for a single cycle. It's up to the control logic to
@ -604,7 +632,7 @@ begin
-- a stall output if necessary). -- a stall output if necessary).


if d_in.valid = '1' then if d_in.valid = '1' then
r.req_latch <= d_in; r1.req <= d_in;


report "op:" & op_t'image(req_op) & report "op:" & op_t'image(req_op) &
" addr:" & to_hstring(d_in.addr) & " addr:" & to_hstring(d_in.addr) &
@ -618,12 +646,15 @@ begin


-- Fast path for load/store hits. Set signals for the writeback controls. -- Fast path for load/store hits. Set signals for the writeback controls.
if req_op = OP_LOAD_HIT then if req_op = OP_LOAD_HIT then
r.hit_way <= req_hit_way; r1.hit_way <= req_hit_way;
r.hit_load_valid <= '1'; r1.hit_load_valid <= '1';
else
r1.hit_load_valid <= '0';
end if; end if;
end if; end if;
end process; end process;


--
-- Every other case is handled by this state machine: -- Every other case is handled by this state machine:
-- --
-- * Cache load miss/reload (in conjunction with "rams") -- * Cache load miss/reload (in conjunction with "rams")
@ -631,7 +662,8 @@ begin
-- * Load hits for non-cachable forms -- * Load hits for non-cachable forms
-- * Stores (the collision case is handled in "rams") -- * Stores (the collision case is handled in "rams")
-- --
-- All wishbone requests generation is done here -- All wishbone requests generation is done here. This machine
-- operates at stage 1.
-- --
dcache_slow : process(clk) dcache_slow : process(clk)
variable way : integer range 0 to NUM_WAYS-1; variable way : integer range 0 to NUM_WAYS-1;
@ -643,32 +675,32 @@ begin
for i in index_t loop for i in index_t loop
cache_valids(i) <= (others => '0'); cache_valids(i) <= (others => '0');
end loop; end loop;
r.state <= IDLE; r1.state <= IDLE;
r.slow_valid <= '0'; r1.slow_valid <= '0';
r.update_valid <= '0'; r1.update_valid <= '0';
r.wb.cyc <= '0'; r1.wb.cyc <= '0';
r.wb.stb <= '0'; r1.wb.stb <= '0';


-- Not useful normally but helps avoiding tons of sim warnings -- Not useful normally but helps avoiding tons of sim warnings
r.wb.adr <= (others => '0'); r1.wb.adr <= (others => '0');
else else
-- One cycle pulses reset -- One cycle pulses reset
r.slow_valid <= '0'; r1.slow_valid <= '0';
r.update_valid <= '0'; r1.update_valid <= '0';


-- We cannot currently process a new request when not idle -- We cannot currently process a new request when not idle
assert req_op = OP_NONE or r.state = IDLE report "request " & assert req_op = OP_NONE or r1.state = IDLE report "request " &
op_t'image(req_op) & " while in state " & state_t'image(r.state) op_t'image(req_op) & " while in state " & state_t'image(r1.state)
severity FAILURE; severity FAILURE;


-- Main state machine -- Main state machine
case r.state is case r1.state is
when IDLE => when IDLE =>
case req_op is case req_op is
when OP_LOAD_HIT => when OP_LOAD_HIT =>
-- We have a load with update hit, we need the delayed update cycle -- We have a load with update hit, we need the delayed update cycle
if d_in.update = '1' then if d_in.update = '1' then
r.state <= LOAD_UPDATE; r1.state <= LOAD_UPDATE;
end if; end if;


when OP_LOAD_MISS => when OP_LOAD_MISS =>
@ -696,40 +728,40 @@ begin
end loop; end loop;


-- Keep track of our index and way for subsequent stores. -- Keep track of our index and way for subsequent stores.
r.store_index <= req_index; r1.store_index <= req_index;
r.store_way <= way; r1.store_way <= way;


-- Prep for first wishbone read. We calculate the address of -- Prep for first wishbone read. We calculate the address of
-- the start of the cache line -- the start of the cache line
-- --
r.wb.adr <= d_in.addr(63 downto LINE_OFF_BITS) & r1.wb.adr <= d_in.addr(63 downto LINE_OFF_BITS) &
(LINE_OFF_BITS-1 downto 0 => '0'); (LINE_OFF_BITS-1 downto 0 => '0');
r.wb.sel <= (others => '1'); r1.wb.sel <= (others => '1');
r.wb.we <= '0'; r1.wb.we <= '0';
r.wb.cyc <= '1'; r1.wb.cyc <= '1';
r.wb.stb <= '1'; r1.wb.stb <= '1';
r.state <= RELOAD_WAIT_ACK; r1.state <= RELOAD_WAIT_ACK;


when OP_LOAD_NC => when OP_LOAD_NC =>
r.wb.sel <= bus_sel; r1.wb.sel <= bus_sel;
r.wb.adr <= d_in.addr(63 downto 3) & "000"; r1.wb.adr <= d_in.addr(63 downto 3) & "000";
r.wb.cyc <= '1'; r1.wb.cyc <= '1';
r.wb.stb <= '1'; r1.wb.stb <= '1';
r.wb.we <= '0'; r1.wb.we <= '0';
r.state <= NC_LOAD_WAIT_ACK; r1.state <= NC_LOAD_WAIT_ACK;


when OP_STORE_HIT | OP_STORE_MISS => when OP_STORE_HIT | OP_STORE_MISS =>
-- For store-with-update do the register update -- For store-with-update do the register update
if d_in.update = '1' then if d_in.update = '1' then
r.update_valid <= '1'; r1.update_valid <= '1';
end if; end if;
r.wb.sel <= bus_sel; r1.wb.sel <= bus_sel;
r.wb.adr <= d_in.addr(63 downto 3) & "000"; r1.wb.adr <= d_in.addr(63 downto 3) & "000";
r.wb.dat <= store_data; r1.wb.dat <= store_data;
r.wb.cyc <= '1'; r1.wb.cyc <= '1';
r.wb.stb <= '1'; r1.wb.stb <= '1';
r.wb.we <= '1'; r1.wb.we <= '1';
r.state <= STORE_WAIT_ACK; r1.state <= STORE_WAIT_ACK;


-- OP_NONE and OP_BAD do nothing -- OP_NONE and OP_BAD do nothing
when OP_NONE => when OP_NONE =>
@ -746,51 +778,51 @@ begin
-- not idle, which we don't currently know how to deal -- not idle, which we don't currently know how to deal
-- with. -- with.
-- --
if r.wb.adr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = if r1.wb.adr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) =
r.req_latch.addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) then r1.req.addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) then
r.slow_data <= wishbone_in.dat; r1.slow_data <= wishbone_in.dat;
end if; end if;


-- That was the last word ? We are done -- That was the last word ? We are done
if is_last_row(r.wb.adr) then if is_last_row(r1.wb.adr) then
cache_valids(r.store_index)(way) <= '1'; cache_valids(r1.store_index)(way) <= '1';
r.wb.cyc <= '0'; r1.wb.cyc <= '0';
r.wb.stb <= '0'; r1.wb.stb <= '0';


-- Complete the load that missed. For load with update -- Complete the load that missed. For load with update
-- we also need to do the deferred update cycle. -- we also need to do the deferred update cycle.
-- --
r.slow_valid <= '1'; r1.slow_valid <= '1';
if r.req_latch.load = '1' and r.req_latch.update = '1' then if r1.req.load = '1' and r1.req.update = '1' then
r.state <= LOAD_UPDATE; r1.state <= LOAD_UPDATE;
report "completing miss with load-update !"; report "completing miss with load-update !";
else else
r.state <= IDLE; r1.state <= IDLE;
report "completing miss !"; report "completing miss !";
end if; end if;
else else
-- Otherwise, calculate the next row address -- Otherwise, calculate the next row address
r.wb.adr <= next_row_addr(r.wb.adr); r1.wb.adr <= next_row_addr(r1.wb.adr);
end if; end if;
end if; end if;


when LOAD_UPDATE => when LOAD_UPDATE =>
-- We need the extra cycle to complete a load with update -- We need the extra cycle to complete a load with update
r.state <= LOAD_UPDATE2; r1.state <= LOAD_UPDATE2;
when LOAD_UPDATE2 => when LOAD_UPDATE2 =>
-- We need the extra cycle to complete a load with update -- We need the extra cycle to complete a load with update
r.update_valid <= '1'; r1.update_valid <= '1';
r.state <= IDLE; r1.state <= IDLE;


when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK => when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK =>
if wishbone_in.ack = '1' then if wishbone_in.ack = '1' then
if r.state = NC_LOAD_WAIT_ACK then if r1.state = NC_LOAD_WAIT_ACK then
r.slow_data <= wishbone_in.dat; r1.slow_data <= wishbone_in.dat;
end if; end if;
r.slow_valid <= '1'; r1.slow_valid <= '1';
r.wb.cyc <= '0'; r1.wb.cyc <= '0';
r.wb.stb <= '0'; r1.wb.stb <= '0';
r.state <= IDLE; r1.state <= IDLE;
end if; end if;
end case; end case;
end if; end if;

Loading…
Cancel
Save