dcache: Split PLRU into storage and logic

Rather than having update and decode logic for each individual PLRU
as well as a register to store the current PLRU state, we now put the
PLRU state in a little RAM, which will typically use LUT RAM on FPGAs,
and have just a single copy of the logic to calculate the pseudo-LRU
way and to update the PLRU state.

The PLRU RAM that apples to the data storage (as opposed to the TLB)
is read asynchronously in the cycle after the cache tag matching is
done.  At the end of that cycle the PLRU RAM entry is updated if the
access was a cache hit, or a victim way is calculated and stored if
the access was a cache miss.  It is possible that a cache miss doesn't
start being handled until later, in which case the stored victim way
is used later when the miss gets handled.

Similarly for the TLB PLRU, the RAM is read asynchronously in the
cycle after a TLB lookup is done, and either updated at the end of
that cycle (for a hit), or a victim is chosen and stored for when the
TLB miss is satisfied.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/408/head
Paul Mackerras 2 years ago
parent 86212dc879
commit a1f5867919

@ -317,6 +317,7 @@ architecture rtl of dcache is
tlb_hit : std_ulogic;
tlb_hit_way : tlb_way_sig_t;
tlb_hit_index : tlb_index_sig_t;
tlb_victim : tlb_way_sig_t;

-- data buffer for data forwarded from writes to reads
forward_data : std_ulogic_vector(63 downto 0);
@ -342,6 +343,8 @@ architecture rtl of dcache is
acks_pending : unsigned(2 downto 0);
inc_acks : std_ulogic;
dec_acks : std_ulogic;
choose_victim : std_ulogic;
victim_way : way_t;

-- Signals to complete (possibly with error)
ls_valid : std_ulogic;
@ -398,8 +401,7 @@ architecture rtl of dcache is
signal ram_wr_select : std_ulogic_vector(ROW_SIZE - 1 downto 0);

-- PLRU output interface
type plru_out_t is array(0 to NUM_LINES-1) of std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_victim : plru_out_t;
signal plru_victim : way_t;
signal replace_way : way_t;

-- Wishbone read/write/cache write formatting signals
@ -423,8 +425,7 @@ architecture rtl of dcache is
signal tlb_miss : std_ulogic;

-- TLB PLRU output interface
type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
signal tlb_plru_victim : tlb_plru_out_t;
signal tlb_plru_victim : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);

signal snoop_tag_set : cache_tags_set_t;
signal snoop_valid : std_ulogic;
@ -650,39 +651,49 @@ begin
end process;

-- Generate TLB PLRUs
maybe_tlb_plrus: if TLB_NUM_WAYS > 1 generate
maybe_tlb_plrus : if TLB_NUM_WAYS > 1 generate
type tlb_plru_array is array(tlb_index_t) of std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0);
signal tlb_plru_ram : tlb_plru_array;
signal tlb_plru_cur : std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0);
signal tlb_plru_upd : std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0);
signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
begin
tlb_plrus: for i in 0 to TLB_SET_SIZE - 1 generate
-- TLB PLRU interface
signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
signal tlb_plru_acc_en : std_ulogic;
signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
begin
tlb_plru : entity work.plru
generic map (
BITS => TLB_WAY_BITS
)
port map (
clk => clk,
rst => rst,
acc => tlb_plru_acc,
acc_en => tlb_plru_acc_en,
lru => tlb_plru_out
);

process(all)
begin
-- PLRU interface
if not is_X(r1.tlb_hit_index) and r1.tlb_hit_index = i then
tlb_plru_acc_en <= r1.tlb_hit;
assert not is_X(r1.tlb_hit_way);
else
tlb_plru_acc_en <= '0';
end if;
tlb_plru_acc <= std_ulogic_vector(r1.tlb_hit_way);
tlb_plru_victim(i) <= tlb_plru_out;
end process;
end generate;
tlb_plru : entity work.plrufn
generic map (
BITS => TLB_WAY_BITS
)
port map (
acc => tlb_plru_acc,
tree_in => tlb_plru_cur,
tree_out => tlb_plru_upd,
lru => tlb_plru_out
);

process(all)
begin
-- Read PLRU bits from array
if is_X(r1.tlb_hit_index) then
tlb_plru_cur <= (others => 'X');
else
tlb_plru_cur <= tlb_plru_ram(to_integer(r1.tlb_hit_index));
end if;

-- PLRU interface
tlb_plru_acc <= std_ulogic_vector(r1.tlb_hit_way);
tlb_plru_victim <= tlb_plru_out;
end process;

-- synchronous writes to TLB PLRU array
process(clk)
begin
if rising_edge(clk) then
if r1.tlb_hit = '1' then
assert not is_X(r1.tlb_hit_index) severity failure;
tlb_plru_ram(to_integer(r1.tlb_hit_index)) <= tlb_plru_upd;
end if;
end if;
end process;
end generate;

tlb_search : process(all)
@ -753,7 +764,7 @@ begin
if tlb_hit = '1' then
repl_way := tlb_hit_way;
else
repl_way := unsigned(tlb_plru_victim(to_integer(tlb_req_index)));
repl_way := unsigned(r1.tlb_victim);
end if;
assert not is_X(repl_way);
end if;
@ -770,39 +781,49 @@ begin
end process;

-- Generate PLRUs
maybe_plrus: if NUM_WAYS > 1 generate
maybe_plrus : if NUM_WAYS > 1 generate
type plru_array is array(0 to NUM_LINES-1) of std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_ram : plru_array;
signal plru_cur : std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_upd : std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
begin
plrus: for i in 0 to NUM_LINES-1 generate
-- PLRU interface
signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_acc_en : std_ulogic;
signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
plru : entity work.plrufn
generic map (
BITS => WAY_BITS
)
port map (
acc => plru_acc,
tree_in => plru_cur,
tree_out => plru_upd,
lru => plru_out
);

process(all)
begin
-- Read PLRU bits from array
if is_X(r1.hit_index) then
plru_cur <= (others => 'X');
else
plru_cur <= plru_ram(to_integer(r1.hit_index));
end if;

begin
plru : entity work.plru
generic map (
BITS => WAY_BITS
)
port map (
clk => clk,
rst => rst,
acc => plru_acc,
acc_en => plru_acc_en,
lru => plru_out
);

process(all)
begin
-- PLRU interface
if not is_X(r1.hit_index) and r1.hit_index = to_unsigned(i, INDEX_BITS) then
plru_acc_en <= r1.cache_hit;
else
plru_acc_en <= '0';
end if;
plru_acc <= std_ulogic_vector(r1.hit_way);
plru_victim(i) <= plru_out;
end process;
end generate;
-- PLRU interface
plru_acc <= std_ulogic_vector(r1.hit_way);
plru_victim <= unsigned(plru_out);
end process;

-- synchronous writes to PLRU array
process(clk)
begin
if rising_edge(clk) then
if r1.cache_hit = '1' then
assert not is_X(r1.hit_index) severity failure;
plru_ram(to_integer(r1.hit_index)) <= plru_upd;
end if;
end if;
end process;
end generate;

-- Cache tag RAM read port
@ -980,8 +1001,13 @@ begin
replace_way <= to_unsigned(0, WAY_BITS);
if NUM_WAYS > 1 then
if r1.write_tag = '1' then
assert not is_X(r1.store_index);
replace_way <= unsigned(plru_victim(to_integer(r1.store_index)));
if r1.choose_victim = '1' then
replace_way <= plru_victim;
else
-- Cache victim way was chosen earlier,
-- in the cycle after the miss was detected.
replace_way <= r1.victim_way;
end if;
else
replace_way <= r1.store_way;
end if;
@ -1305,8 +1331,6 @@ begin
end if;

-- Fast path for load/store hits. Set signals for the writeback controls.
r1.hit_way <= req_hit_way;
r1.hit_index <= req_index;
if req_op = OP_LOAD_HIT then
r1.hit_load_valid <= '1';
else
@ -1340,6 +1364,11 @@ begin
r1.tlb_hit <= tlb_hit;
r1.tlb_hit_way <= tlb_hit_way;
r1.tlb_hit_index <= tlb_req_index;
-- determine victim way in the TLB in the cycle after
-- we detect the TLB miss
if r1.ls_error = '1' then
r1.tlb_victim <= unsigned(tlb_plru_victim);
end if;

end if;
end process;
@ -1364,6 +1393,7 @@ begin
ev.load_miss <= '0';
ev.store_miss <= '0';
ev.dtlb_miss <= tlb_miss;
r1.choose_victim <= '0';

-- On reset, clear all valid bits to force misses
if rst = '1' then
@ -1460,6 +1490,17 @@ begin
end if;
end if;

-- Signals for PLRU update and victim selection
r1.hit_way <= req_hit_way;
r1.hit_index <= req_index;
-- Record victim way in the cycle after we see a load or dcbz miss
if r1.choose_victim = '1' then
r1.victim_way <= plru_victim;
end if;
if req_op = OP_LOAD_MISS or (req_op = OP_STORE_MISS and r0.req.dcbz = '1') then
r1.choose_victim <= '1';
end if;

-- Main state machine
case r1.state is
when IDLE =>

Loading…
Cancel
Save