Merge pull request #408 from paulusmack/plru-improvement

PLRU improvements
pull/410/head
Michael Neuling 2 years ago committed by GitHub
commit 84a0fba25d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -68,8 +68,8 @@ all: $(all)
$(shell scripts/make_version.sh git.vhdl)

core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl predecode.vhdl \
decode1.vhdl helpers.vhdl insn_helpers.vhdl \
utils.vhdl plru.vhdl plrufn.vhdl cache_ram.vhdl icache.vhdl \
predecode.vhdl decode1.vhdl helpers.vhdl insn_helpers.vhdl \
control.vhdl decode2.vhdl register_file.vhdl \
cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
logical.vhdl countbits.vhdl multiply.vhdl multiply-32s.vhdl divider.vhdl \

@ -84,7 +84,8 @@ architecture rtl of dcache is
-- TAG_WIDTH is the width in bits of each way of the tag RAM
constant TAG_WIDTH : natural := TAG_BITS + 7 - ((TAG_BITS + 7) mod 8);
-- WAY_BITS is the number of bits to select a way
constant WAY_BITS : natural := log2(NUM_WAYS);
-- Make sure this is at least 1, to avoid 0-element vectors
constant WAY_BITS : natural := maximum(log2(NUM_WAYS), 1);

-- Example of layout for 32 lines of 64 bytes:
--
@ -130,7 +131,7 @@ architecture rtl of dcache is

-- L1 TLB.
constant TLB_SET_BITS : natural := log2(TLB_SET_SIZE);
constant TLB_WAY_BITS : natural := log2(TLB_NUM_WAYS);
constant TLB_WAY_BITS : natural := maximum(log2(TLB_NUM_WAYS), 1);
constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_SET_BITS);
constant TLB_TAG_WAY_BITS : natural := TLB_NUM_WAYS * TLB_EA_TAG_BITS;
constant TLB_PTE_BITS : natural := 64;
@ -316,6 +317,7 @@ architecture rtl of dcache is
tlb_hit : std_ulogic;
tlb_hit_way : tlb_way_sig_t;
tlb_hit_index : tlb_index_sig_t;
tlb_victim : tlb_way_sig_t;

-- data buffer for data forwarded from writes to reads
forward_data : std_ulogic_vector(63 downto 0);
@ -341,6 +343,8 @@ architecture rtl of dcache is
acks_pending : unsigned(2 downto 0);
inc_acks : std_ulogic;
dec_acks : std_ulogic;
choose_victim : std_ulogic;
victim_way : way_t;

-- Signals to complete (possibly with error)
ls_valid : std_ulogic;
@ -397,8 +401,7 @@ architecture rtl of dcache is
signal ram_wr_select : std_ulogic_vector(ROW_SIZE - 1 downto 0);

-- PLRU output interface
type plru_out_t is array(0 to NUM_LINES-1) of std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_victim : plru_out_t;
signal plru_victim : way_t;
signal replace_way : way_t;

-- Wishbone read/write/cache write formatting signals
@ -422,8 +425,7 @@ architecture rtl of dcache is
signal tlb_miss : std_ulogic;

-- TLB PLRU output interface
type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
signal tlb_plru_victim : tlb_plru_out_t;
signal tlb_plru_victim : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);

signal snoop_tag_set : cache_tags_set_t;
signal snoop_valid : std_ulogic;
@ -649,39 +651,49 @@ begin
end process;

-- Generate TLB PLRUs
maybe_tlb_plrus: if TLB_NUM_WAYS > 1 generate
maybe_tlb_plrus : if TLB_NUM_WAYS > 1 generate
type tlb_plru_array is array(tlb_index_t) of std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0);
signal tlb_plru_ram : tlb_plru_array;
signal tlb_plru_cur : std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0);
signal tlb_plru_upd : std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0);
signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
begin
tlb_plrus: for i in 0 to TLB_SET_SIZE - 1 generate
-- TLB PLRU interface
signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
signal tlb_plru_acc_en : std_ulogic;
signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
begin
tlb_plru : entity work.plru
generic map (
BITS => TLB_WAY_BITS
)
port map (
clk => clk,
rst => rst,
acc => tlb_plru_acc,
acc_en => tlb_plru_acc_en,
lru => tlb_plru_out
);

process(all)
begin
-- PLRU interface
if not is_X(r1.tlb_hit_index) and r1.tlb_hit_index = i then
tlb_plru_acc_en <= r1.tlb_hit;
assert not is_X(r1.tlb_hit_way);
else
tlb_plru_acc_en <= '0';
end if;
tlb_plru_acc <= std_ulogic_vector(r1.tlb_hit_way);
tlb_plru_victim(i) <= tlb_plru_out;
end process;
end generate;
tlb_plru : entity work.plrufn
generic map (
BITS => TLB_WAY_BITS
)
port map (
acc => tlb_plru_acc,
tree_in => tlb_plru_cur,
tree_out => tlb_plru_upd,
lru => tlb_plru_out
);

-- Combinational glue around the TLB plrufn instance: selects the stored
-- PLRU tree bits for the set addressed by r1.tlb_hit_index, presents the
-- hit way as the access input, and exports the function's lru output.
process(all)
begin
-- Read PLRU bits from array.  An index containing unknown bits cannot be
-- converted with to_integer, so the tree input is driven to all 'X' instead.
if is_X(r1.tlb_hit_index) then
tlb_plru_cur <= (others => 'X');
else
tlb_plru_cur <= tlb_plru_ram(to_integer(r1.tlb_hit_index));
end if;

-- PLRU interface: tlb_plru_acc is the way being accessed (from
-- r1.tlb_hit_way); tlb_plru_out is the lru port of the plrufn instance
-- and is forwarded unchanged as the victim way.
tlb_plru_acc <= std_ulogic_vector(r1.tlb_hit_way);
tlb_plru_victim <= tlb_plru_out;
end process;

-- synchronous writes to TLB PLRU array
-- On each rising edge of clk where r1.tlb_hit = '1', the entry addressed by
-- r1.tlb_hit_index is replaced with tlb_plru_upd (the tree_out port of the
-- plrufn instance).  Entries are left unchanged in all other cycles.
process(clk)
begin
if rising_edge(clk) then
if r1.tlb_hit = '1' then
-- The write address must be fully defined; an unknown index is a
-- fatal simulation error rather than a silently dropped update.
assert not is_X(r1.tlb_hit_index) severity failure;
tlb_plru_ram(to_integer(r1.tlb_hit_index)) <= tlb_plru_upd;
end if;
end if;
end process;
end generate;

tlb_search : process(all)
@ -747,13 +759,15 @@ begin
end if;
elsif tlbwe = '1' then
assert not is_X(tlb_req_index);
if tlb_hit = '1' then
repl_way := tlb_hit_way;
else
assert not is_X(tlb_plru_victim(to_integer(tlb_req_index)));
repl_way := unsigned(tlb_plru_victim(to_integer(tlb_req_index)));
repl_way := to_unsigned(0, TLB_WAY_BITS);
if TLB_NUM_WAYS > 1 then
if tlb_hit = '1' then
repl_way := tlb_hit_way;
else
repl_way := unsigned(r1.tlb_victim);
end if;
assert not is_X(repl_way);
end if;
assert not is_X(repl_way);
eatag := r0.req.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS);
tagset := tlb_tag_way;
write_tlb_tag(to_integer(repl_way), tagset, eatag);
@ -767,39 +781,49 @@ begin
end process;

-- Generate PLRUs
maybe_plrus: if NUM_WAYS > 1 generate
maybe_plrus : if NUM_WAYS > 1 generate
type plru_array is array(0 to NUM_LINES-1) of std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_ram : plru_array;
signal plru_cur : std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_upd : std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
begin
plrus: for i in 0 to NUM_LINES-1 generate
-- PLRU interface
signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_acc_en : std_ulogic;
signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
begin
plru : entity work.plru
generic map (
BITS => WAY_BITS
)
port map (
clk => clk,
rst => rst,
acc => plru_acc,
acc_en => plru_acc_en,
lru => plru_out
);

process(all)
begin
-- PLRU interface
if not is_X(r1.hit_index) and r1.hit_index = to_unsigned(i, INDEX_BITS) then
plru_acc_en <= r1.cache_hit;
else
plru_acc_en <= '0';
end if;
plru_acc <= std_ulogic_vector(r1.hit_way);
plru_victim(i) <= plru_out;
end process;
end generate;
plru : entity work.plrufn
generic map (
BITS => WAY_BITS
)
port map (
acc => plru_acc,
tree_in => plru_cur,
tree_out => plru_upd,
lru => plru_out
);

-- Combinational glue around the cache plrufn instance: selects the stored
-- PLRU tree bits for the line addressed by r1.hit_index, presents the hit
-- way as the access input, and exports the function's lru output.
process(all)
begin
-- Read PLRU bits from array.  An index containing unknown bits cannot be
-- converted with to_integer, so the tree input is driven to all 'X' instead.
if is_X(r1.hit_index) then
plru_cur <= (others => 'X');
else
plru_cur <= plru_ram(to_integer(r1.hit_index));
end if;

-- PLRU interface: plru_acc is the way being accessed (from r1.hit_way);
-- plru_out is the lru port of the plrufn instance, converted to unsigned
-- for plru_victim.
plru_acc <= std_ulogic_vector(r1.hit_way);
plru_victim <= unsigned(plru_out);
end process;

-- synchronous writes to PLRU array
-- On each rising edge of clk where r1.cache_hit = '1', the entry addressed
-- by r1.hit_index is replaced with plru_upd (the tree_out port of the
-- plrufn instance).  Entries are left unchanged in all other cycles.
process(clk)
begin
if rising_edge(clk) then
if r1.cache_hit = '1' then
-- The write address must be fully defined; an unknown index is a
-- fatal simulation error rather than a silently dropped update.
assert not is_X(r1.hit_index) severity failure;
plru_ram(to_integer(r1.hit_index)) <= plru_upd;
end if;
end if;
end process;
end generate;

-- Cache tag RAM read port
@ -974,11 +998,19 @@ begin
end if;

-- The way to replace on a miss
if r1.write_tag = '1' then
assert not is_X(r1.store_index);
replace_way <= unsigned(plru_victim(to_integer(r1.store_index)));
else
replace_way <= r1.store_way;
replace_way <= to_unsigned(0, WAY_BITS);
if NUM_WAYS > 1 then
if r1.write_tag = '1' then
if r1.choose_victim = '1' then
replace_way <= plru_victim;
else
-- Cache victim way was chosen earlier,
-- in the cycle after the miss was detected.
replace_way <= r1.victim_way;
end if;
else
replace_way <= r1.store_way;
end if;
end if;

-- See if the request matches the line currently being reloaded
@ -1299,8 +1331,6 @@ begin
end if;

-- Fast path for load/store hits. Set signals for the writeback controls.
r1.hit_way <= req_hit_way;
r1.hit_index <= req_index;
if req_op = OP_LOAD_HIT then
r1.hit_load_valid <= '1';
else
@ -1334,6 +1364,11 @@ begin
r1.tlb_hit <= tlb_hit;
r1.tlb_hit_way <= tlb_hit_way;
r1.tlb_hit_index <= tlb_req_index;
-- determine victim way in the TLB in the cycle after
-- we detect the TLB miss
if r1.ls_error = '1' then
r1.tlb_victim <= unsigned(tlb_plru_victim);
end if;

end if;
end process;
@ -1358,6 +1393,7 @@ begin
ev.load_miss <= '0';
ev.store_miss <= '0';
ev.dtlb_miss <= tlb_miss;
r1.choose_victim <= '0';

-- On reset, clear all valid bits to force misses
if rst = '1' then
@ -1454,6 +1490,17 @@ begin
end if;
end if;

-- Signals for PLRU update and victim selection
r1.hit_way <= req_hit_way;
r1.hit_index <= req_index;
-- Record victim way in the cycle after we see a load or dcbz miss
if r1.choose_victim = '1' then
r1.victim_way <= plru_victim;
end if;
if req_op = OP_LOAD_MISS or (req_op = OP_STORE_MISS and r0.req.dcbz = '1') then
r1.choose_victim <= '1';
end if;

-- Main state machine
case r1.state is
when IDLE =>

@ -12,7 +12,6 @@
-- efficient use of distributed RAM and less logic/muxes. Currently we
-- write TAG_BITS width which may not match full ram blocks and might
-- cause muxes to be inferred for "partial writes".
-- * Check if making the read size of PLRU a ROM helps utilization
--
library ieee;
use ieee.std_logic_1164.all;
@ -102,7 +101,8 @@ architecture rtl of icache is
-- the +1 is to allow the endianness to be stored in the tag
constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS + 1;
-- WAY_BITS is the number of bits to select a way
constant WAY_BITS : natural := log2(NUM_WAYS);
-- Make sure this is at least 1, to avoid 0-element vectors
constant WAY_BITS : natural := maximum(log2(NUM_WAYS), 1);

-- Example of layout for 32 lines of 64 bytes:
--
@ -235,8 +235,7 @@ architecture rtl of icache is
signal wb_rd_data : std_ulogic_vector(ROW_SIZE_BITS - 1 downto 0);

-- PLRU output interface
type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_victim : plru_out_t;
signal plru_victim : way_sig_t;

-- Memory write snoop signals
signal snoop_valid : std_ulogic;
@ -446,40 +445,48 @@ begin
-- Generate PLRUs
maybe_plrus: if NUM_WAYS > 1 generate
type plru_array is array(index_t) of std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_ram : plru_array;
signal plru_cur : std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_upd : std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
begin
plrus: for i in 0 to NUM_LINES-1 generate
-- PLRU interface
signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_acc_en : std_ulogic;
signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
begin
plru : entity work.plru
generic map (
BITS => WAY_BITS
)
port map (
clk => clk,
rst => rst,
acc => plru_acc,
acc_en => plru_acc_en,
lru => plru_out
);

process(all)
begin
-- PLRU interface
if is_X(r.hit_nia) then
plru_acc_en <= 'X';
elsif get_index(r.hit_nia) = i then
plru_acc_en <= r.hit_valid;
else
plru_acc_en <= '0';
end if;
plru_acc <= std_ulogic_vector(r.hit_way);
plru_victim(i) <= plru_out;
end process;
end generate;
plru : entity work.plrufn
generic map (
BITS => WAY_BITS
)
port map (
acc => plru_acc,
tree_in => plru_cur,
tree_out => plru_upd,
lru => plru_out
);

-- Combinational glue around the icache plrufn instance: selects the stored
-- PLRU tree bits for the entry that get_index derives from r.hit_nia,
-- presents the hit way as the access input, and exports the lru output.
-- NOTE(review): get_index is defined outside this view; presumably it
-- extracts the cache line index from an address -- confirm.
process(all)
begin
-- Read PLRU bits from array.  If r.hit_nia contains unknown bits the
-- array is not indexed and the tree input is driven to all 'X' instead.
if is_X(r.hit_nia) then
plru_cur <= (others => 'X');
else
plru_cur <= plru_ram(to_integer(get_index(r.hit_nia)));
end if;

-- PLRU interface: plru_acc is the way being accessed (from r.hit_way);
-- plru_out is the lru port of the plrufn instance, converted to unsigned
-- for plru_victim.
plru_acc <= std_ulogic_vector(r.hit_way);
plru_victim <= unsigned(plru_out);
end process;

-- synchronous writes to PLRU array
-- On each rising edge of clk where r.hit_valid = '1', the entry addressed
-- by get_index(r.hit_nia) is replaced with plru_upd (the tree_out port of
-- the plrufn instance).  Entries are left unchanged in all other cycles.
process(clk)
begin
if rising_edge(clk) then
if r.hit_valid = '1' then
-- The address used to form the write index must be fully defined;
-- an unknown value is a fatal simulation error.
assert not is_X(r.hit_nia) severity failure;
plru_ram(to_integer(get_index(r.hit_nia))) <= plru_upd;
end if;
end if;
end process;
end generate;

-- TLB hit detection and real address generation
@ -787,8 +794,11 @@ begin
assert not is_X(r.store_row) severity failure;
assert not is_X(r.recv_row) severity failure;
if r.state = CLR_TAG then
-- Get victim way from plru
replace_way := unsigned(plru_victim(to_integer(r.store_index)));
replace_way := to_unsigned(0, WAY_BITS);
if NUM_WAYS > 1 then
-- Get victim way from plru
replace_way := plru_victim;
end if;
r.store_way <= replace_way;

-- Force misses on that way while reloading that line

@ -305,8 +305,7 @@ architecture behaviour of litedram_wrapper is
signal cache_out : cache_ram_out_t;

-- PLRU output interface
type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_victim : plru_out_t;
signal plru_victim : way_t;

--
-- Helper functions to decode incoming requests
@ -565,39 +564,44 @@ begin
end generate;

-- Generate PLRUs
maybe_plrus: if NUM_WAYS > 1 generate
maybe_plrus : if NUM_WAYS > 1 generate
type plru_array is array(index_t) of std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_ram : plru_array;
signal plru_cur : std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_upd : std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
begin
plrus: for i in 0 to NUM_LINES-1 generate
plru : entity work.plrufn
generic map (
BITS => WAY_BITS
)
port map (
acc => plru_acc,
tree_in => plru_cur,
tree_out => plru_upd,
lru => plru_out
);

process(all)
begin
-- Read PLRU bits from array
plru_cur <= plru_ram(req_index);

-- PLRU interface
signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_acc_en : std_ulogic;
signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS));
plru_victim <= to_integer(unsigned(plru_out));
end process;

-- synchronous writes to PLRU array
process(system_clk)
begin
plru : entity work.plru
generic map (
BITS => WAY_BITS
)
port map (
clk => system_clk,
rst => system_reset,
acc => plru_acc,
acc_en => plru_acc_en,
lru => plru_out
);

process(req_index, req_op, req_hit_way, plru_out)
begin
-- PLRU interface
if (req_op = OP_LOAD_HIT or
req_op = OP_STORE_HIT) and req_index = i then
plru_acc_en <= '1';
else
plru_acc_en <= '0';
if rising_edge(system_clk) then
if (req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT) then
plru_ram(req_index) <= plru_upd;
end if;
plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS));
plru_victim(i) <= plru_out;
end process;
end generate;
end if;
end process;
end generate;

--
@ -1023,7 +1027,7 @@ begin
-- We need to read a cache line
if req_op = OP_LOAD_MISS and not wait_qdrain then
-- Grab way to replace
refill_way <= to_integer(unsigned(plru_victim(req_index)));
refill_way <= plru_victim;

-- Keep track of our index and way for subsequent stores
refill_index <= req_index;

@ -34,6 +34,7 @@ filesets:
- core.vhdl
- icache.vhdl
- plru.vhdl
- plrufn.vhdl
- cache_ram.vhdl
- core_debug.vhdl
- utils.vhdl

@ -0,0 +1,72 @@
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use ieee.math_real.all;

-- plrufn: purely combinational tree pseudo-LRU function.
--
-- The entity holds no state.  The caller supplies the current tree bits on
-- tree_in and receives:
--   * lru      - the way the tree currently points at (the victim), and
--   * tree_out - the tree bits after recording an access to way `acc`.
--
-- Generics:
--   BITS     - number of bits in a way number; the tree covers 2**BITS ways.
-- Ports:
--   acc      - way that is being accessed.
--   tree_in  - current tree state, 2**BITS - 1 node bits.
--   tree_out - tree_in with the nodes on the path to `acc` rewritten.
--   lru      - way selected by following tree_in from the root.
entity plrufn is
    generic (
        BITS : positive := 2
    );
    port (
        acc      : in  std_ulogic_vector(BITS-1 downto 0);
        tree_in  : in  std_ulogic_vector(2 ** BITS - 2 downto 0);
        tree_out : out std_ulogic_vector(2 ** BITS - 2 downto 0);
        lru      : out std_ulogic_vector(BITS-1 downto 0)
    );
end entity plrufn;

architecture rtl of plrufn is
    -- The tree is stored as a flat vector with the root at index 0; the
    -- children of node n are at 2n+1 (taken when the deciding bit is '0')
    -- and 2n+2 (taken when it is '1').  A tree with BITS levels therefore
    -- has 1 + 2 + ... + 2**(BITS-1) = 2**BITS - 1 nodes, e.g. 3 nodes for
    -- BITS = 2 and 15 nodes for BITS = 4.
    constant NODE_COUNT : positive := 2 ** BITS - 1;
    subtype tree_index_t is integer range 0 to NODE_COUNT - 1;
begin

    -- Walk from the root following the stored node bits; each visited bit
    -- becomes one bit of lru, most significant bit first.
    get_lru: process(tree_in)
        variable idx : tree_index_t;
        variable dir : std_ulogic;
    begin
        idx := 0;
        for pos in BITS-1 downto 0 loop
            dir := tree_in(idx);
            -- An undefined node bit is treated as '0' so that the walk
            -- (and the result) stays well defined.
            if is_X(dir) then
                dir := '0';
            end if;
            lru(pos) <= dir;
            -- Descend only while there is another level below; this also
            -- keeps idx within tree_index_t on the last iteration.
            if pos /= 0 then
                if dir = '1' then
                    idx := 2 * idx + 2;
                else
                    idx := 2 * idx + 1;
                end if;
            end if;
        end loop;
    end process;

    -- Walk from the root following the bits of acc; every node on that path
    -- is set to the inverse of the acc bit used at it, so it points away
    -- from the accessed way.  All other nodes are copied from tree_in.
    update_lru: process(all)
        variable idx : tree_index_t;
        variable dir : std_ulogic;
    begin
        -- Default: pass the tree through; path nodes are overridden below.
        tree_out <= tree_in;
        idx := 0;
        for pos in BITS-1 downto 0 loop
            dir := acc(pos);
            -- An undefined access bit is treated as '0'.
            if is_X(dir) then
                dir := '0';
            end if;
            tree_out(idx) <= not dir;
            if pos /= 0 then
                if dir = '1' then
                    idx := 2 * idx + 2;
                else
                    idx := 2 * idx + 1;
                end if;
            end if;
        end loop;
    end process;
end;
Loading…
Cancel
Save