dcache: Split PLRU into storage and logic

Rather than having update and decode logic for each individual PLRU
as well as a register to store the current PLRU state, we now put the
PLRU state in a little RAM, which will typically use LUT RAM on FPGAs,
and have just a single copy of the logic to calculate the pseudo-LRU
way and to update the PLRU state.

The PLRU RAM that applies to the data storage (as opposed to the TLB)
is read asynchronously in the cycle after the cache tag matching is
done.  At the end of that cycle the PLRU RAM entry is updated if the
access was a cache hit, or a victim way is calculated and stored if
the access was a cache miss.  It is possible that a cache miss doesn't
start being handled until later, in which case the stored victim way
is used later when the miss gets handled.

Similarly for the TLB PLRU, the RAM is read asynchronously in the
cycle after a TLB lookup is done, and either updated at the end of
that cycle (for a hit), or a victim is chosen and stored for when the
TLB miss is satisfied.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/408/head
Paul Mackerras 2 years ago
parent 86212dc879
commit a1f5867919

@ -317,6 +317,7 @@ architecture rtl of dcache is
tlb_hit : std_ulogic; tlb_hit : std_ulogic;
tlb_hit_way : tlb_way_sig_t; tlb_hit_way : tlb_way_sig_t;
tlb_hit_index : tlb_index_sig_t; tlb_hit_index : tlb_index_sig_t;
tlb_victim : tlb_way_sig_t;


-- data buffer for data forwarded from writes to reads -- data buffer for data forwarded from writes to reads
forward_data : std_ulogic_vector(63 downto 0); forward_data : std_ulogic_vector(63 downto 0);
@ -342,6 +343,8 @@ architecture rtl of dcache is
acks_pending : unsigned(2 downto 0); acks_pending : unsigned(2 downto 0);
inc_acks : std_ulogic; inc_acks : std_ulogic;
dec_acks : std_ulogic; dec_acks : std_ulogic;
choose_victim : std_ulogic;
victim_way : way_t;


-- Signals to complete (possibly with error) -- Signals to complete (possibly with error)
ls_valid : std_ulogic; ls_valid : std_ulogic;
@ -398,8 +401,7 @@ architecture rtl of dcache is
signal ram_wr_select : std_ulogic_vector(ROW_SIZE - 1 downto 0); signal ram_wr_select : std_ulogic_vector(ROW_SIZE - 1 downto 0);


-- PLRU output interface -- PLRU output interface
type plru_out_t is array(0 to NUM_LINES-1) of std_ulogic_vector(WAY_BITS-1 downto 0); signal plru_victim : way_t;
signal plru_victim : plru_out_t;
signal replace_way : way_t; signal replace_way : way_t;


-- Wishbone read/write/cache write formatting signals -- Wishbone read/write/cache write formatting signals
@ -423,8 +425,7 @@ architecture rtl of dcache is
signal tlb_miss : std_ulogic; signal tlb_miss : std_ulogic;


-- TLB PLRU output interface -- TLB PLRU output interface
type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0); signal tlb_plru_victim : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
signal tlb_plru_victim : tlb_plru_out_t;


signal snoop_tag_set : cache_tags_set_t; signal snoop_tag_set : cache_tags_set_t;
signal snoop_valid : std_ulogic; signal snoop_valid : std_ulogic;
@ -650,39 +651,49 @@ begin
end process; end process;


-- Generate TLB PLRUs -- Generate TLB PLRUs
maybe_tlb_plrus: if TLB_NUM_WAYS > 1 generate maybe_tlb_plrus : if TLB_NUM_WAYS > 1 generate
type tlb_plru_array is array(tlb_index_t) of std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0);
signal tlb_plru_ram : tlb_plru_array;
signal tlb_plru_cur : std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0);
signal tlb_plru_upd : std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0);
signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
begin begin
tlb_plrus: for i in 0 to TLB_SET_SIZE - 1 generate tlb_plru : entity work.plrufn
-- TLB PLRU interface generic map (
signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); BITS => TLB_WAY_BITS
signal tlb_plru_acc_en : std_ulogic; )
signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); port map (
begin acc => tlb_plru_acc,
tlb_plru : entity work.plru tree_in => tlb_plru_cur,
generic map ( tree_out => tlb_plru_upd,
BITS => TLB_WAY_BITS lru => tlb_plru_out
) );
port map (
clk => clk, process(all)
rst => rst, begin
acc => tlb_plru_acc, -- Read PLRU bits from array
acc_en => tlb_plru_acc_en, if is_X(r1.tlb_hit_index) then
lru => tlb_plru_out tlb_plru_cur <= (others => 'X');
); else

tlb_plru_cur <= tlb_plru_ram(to_integer(r1.tlb_hit_index));
process(all) end if;
begin
-- PLRU interface -- PLRU interface
if not is_X(r1.tlb_hit_index) and r1.tlb_hit_index = i then tlb_plru_acc <= std_ulogic_vector(r1.tlb_hit_way);
tlb_plru_acc_en <= r1.tlb_hit; tlb_plru_victim <= tlb_plru_out;
assert not is_X(r1.tlb_hit_way); end process;
else
tlb_plru_acc_en <= '0'; -- synchronous writes to TLB PLRU array
end if; process(clk)
tlb_plru_acc <= std_ulogic_vector(r1.tlb_hit_way); begin
tlb_plru_victim(i) <= tlb_plru_out; if rising_edge(clk) then
end process; if r1.tlb_hit = '1' then
end generate; assert not is_X(r1.tlb_hit_index) severity failure;
tlb_plru_ram(to_integer(r1.tlb_hit_index)) <= tlb_plru_upd;
end if;
end if;
end process;
end generate; end generate;


tlb_search : process(all) tlb_search : process(all)
@ -753,7 +764,7 @@ begin
if tlb_hit = '1' then if tlb_hit = '1' then
repl_way := tlb_hit_way; repl_way := tlb_hit_way;
else else
repl_way := unsigned(tlb_plru_victim(to_integer(tlb_req_index))); repl_way := unsigned(r1.tlb_victim);
end if; end if;
assert not is_X(repl_way); assert not is_X(repl_way);
end if; end if;
@ -770,39 +781,49 @@ begin
end process; end process;


-- Generate PLRUs -- Generate PLRUs
maybe_plrus: if NUM_WAYS > 1 generate maybe_plrus : if NUM_WAYS > 1 generate
type plru_array is array(0 to NUM_LINES-1) of std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_ram : plru_array;
signal plru_cur : std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_upd : std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
begin begin
plrus: for i in 0 to NUM_LINES-1 generate plru : entity work.plrufn
-- PLRU interface generic map (
signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); BITS => WAY_BITS
signal plru_acc_en : std_ulogic; )
signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); port map (
acc => plru_acc,
begin tree_in => plru_cur,
plru : entity work.plru tree_out => plru_upd,
generic map ( lru => plru_out
BITS => WAY_BITS );
)
port map ( process(all)
clk => clk, begin
rst => rst, -- Read PLRU bits from array
acc => plru_acc, if is_X(r1.hit_index) then
acc_en => plru_acc_en, plru_cur <= (others => 'X');
lru => plru_out else
); plru_cur <= plru_ram(to_integer(r1.hit_index));

end if;
process(all)
begin -- PLRU interface
-- PLRU interface plru_acc <= std_ulogic_vector(r1.hit_way);
if not is_X(r1.hit_index) and r1.hit_index = to_unsigned(i, INDEX_BITS) then plru_victim <= unsigned(plru_out);
plru_acc_en <= r1.cache_hit; end process;
else
plru_acc_en <= '0'; -- synchronous writes to PLRU array
end if; process(clk)
plru_acc <= std_ulogic_vector(r1.hit_way); begin
plru_victim(i) <= plru_out; if rising_edge(clk) then
end process; if r1.cache_hit = '1' then
end generate; assert not is_X(r1.hit_index) severity failure;
plru_ram(to_integer(r1.hit_index)) <= plru_upd;
end if;
end if;
end process;
end generate; end generate;


-- Cache tag RAM read port -- Cache tag RAM read port
@ -980,8 +1001,13 @@ begin
replace_way <= to_unsigned(0, WAY_BITS); replace_way <= to_unsigned(0, WAY_BITS);
if NUM_WAYS > 1 then if NUM_WAYS > 1 then
if r1.write_tag = '1' then if r1.write_tag = '1' then
assert not is_X(r1.store_index); if r1.choose_victim = '1' then
replace_way <= unsigned(plru_victim(to_integer(r1.store_index))); replace_way <= plru_victim;
else
-- Cache victim way was chosen earlier,
-- in the cycle after the miss was detected.
replace_way <= r1.victim_way;
end if;
else else
replace_way <= r1.store_way; replace_way <= r1.store_way;
end if; end if;
@ -1305,8 +1331,6 @@ begin
end if; end if;


-- Fast path for load/store hits. Set signals for the writeback controls. -- Fast path for load/store hits. Set signals for the writeback controls.
r1.hit_way <= req_hit_way;
r1.hit_index <= req_index;
if req_op = OP_LOAD_HIT then if req_op = OP_LOAD_HIT then
r1.hit_load_valid <= '1'; r1.hit_load_valid <= '1';
else else
@ -1340,6 +1364,11 @@ begin
r1.tlb_hit <= tlb_hit; r1.tlb_hit <= tlb_hit;
r1.tlb_hit_way <= tlb_hit_way; r1.tlb_hit_way <= tlb_hit_way;
r1.tlb_hit_index <= tlb_req_index; r1.tlb_hit_index <= tlb_req_index;
-- determine victim way in the TLB in the cycle after
-- we detect the TLB miss
if r1.ls_error = '1' then
r1.tlb_victim <= unsigned(tlb_plru_victim);
end if;


end if; end if;
end process; end process;
@ -1364,6 +1393,7 @@ begin
ev.load_miss <= '0'; ev.load_miss <= '0';
ev.store_miss <= '0'; ev.store_miss <= '0';
ev.dtlb_miss <= tlb_miss; ev.dtlb_miss <= tlb_miss;
r1.choose_victim <= '0';


-- On reset, clear all valid bits to force misses -- On reset, clear all valid bits to force misses
if rst = '1' then if rst = '1' then
@ -1460,6 +1490,17 @@ begin
end if; end if;
end if; end if;


-- Signals for PLRU update and victim selection
r1.hit_way <= req_hit_way;
r1.hit_index <= req_index;
-- Record victim way in the cycle after we see a load or dcbz miss
if r1.choose_victim = '1' then
r1.victim_way <= plru_victim;
end if;
if req_op = OP_LOAD_MISS or (req_op = OP_STORE_MISS and r0.req.dcbz = '1') then
r1.choose_victim <= '1';
end if;

-- Main state machine -- Main state machine
case r1.state is case r1.state is
when IDLE => when IDLE =>

Loading…
Cancel
Save