From 82c8b2eae0fbfb96274c07078a2745c9553c8f7a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 7 Sep 2022 15:32:46 +1000 Subject: [PATCH 1/5] icache: Fix compilation with NUM_WAYS = 1 Signed-off-by: Paul Mackerras --- icache.vhdl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/icache.vhdl b/icache.vhdl index 9113ae6..592e901 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -102,7 +102,8 @@ architecture rtl of icache is -- the +1 is to allow the endianness to be stored in the tag constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS + 1; -- WAY_BITS is the number of bits to select a way - constant WAY_BITS : natural := log2(NUM_WAYS); + -- Make sure this is at least 1, to avoid 0-element vectors + constant WAY_BITS : natural := maximum(log2(NUM_WAYS), 1); -- Example of layout for 32 lines of 64 bytes: -- @@ -787,8 +788,11 @@ begin assert not is_X(r.store_row) severity failure; assert not is_X(r.recv_row) severity failure; if r.state = CLR_TAG then - -- Get victim way from plru - replace_way := unsigned(plru_victim(to_integer(r.store_index))); + replace_way := to_unsigned(0, WAY_BITS); + if NUM_WAYS > 1 then + -- Get victim way from plru + replace_way := unsigned(plru_victim(to_integer(r.store_index))); + end if; r.store_way <= replace_way; -- Force misses on that way while reloading that line From cd2e1741132d3a796da088b3c9bfa2b9611f4eb0 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 7 Sep 2022 16:02:06 +1000 Subject: [PATCH 2/5] dcache: Fix compilation with NUM_WAYS and/or TLB_NUM_WAYS = 1 Signed-off-by: Paul Mackerras --- dcache.vhdl | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 6f59fab..a29cf6f 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -84,7 +84,8 @@ architecture rtl of dcache is -- TAG_WIDTH is the width in bits of each way of the tag RAM constant TAG_WIDTH : natural := TAG_BITS + 7 - ((TAG_BITS + 7) mod 8); -- WAY_BITS 
is the number of bits to select a way - constant WAY_BITS : natural := log2(NUM_WAYS); + -- Make sure this is at least 1, to avoid 0-element vectors + constant WAY_BITS : natural := maximum(log2(NUM_WAYS), 1); -- Example of layout for 32 lines of 64 bytes: -- @@ -130,7 +131,7 @@ architecture rtl of dcache is -- L1 TLB. constant TLB_SET_BITS : natural := log2(TLB_SET_SIZE); - constant TLB_WAY_BITS : natural := log2(TLB_NUM_WAYS); + constant TLB_WAY_BITS : natural := maximum(log2(TLB_NUM_WAYS), 1); constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_SET_BITS); constant TLB_TAG_WAY_BITS : natural := TLB_NUM_WAYS * TLB_EA_TAG_BITS; constant TLB_PTE_BITS : natural := 64; @@ -747,13 +748,15 @@ begin end if; elsif tlbwe = '1' then assert not is_X(tlb_req_index); - if tlb_hit = '1' then - repl_way := tlb_hit_way; - else - assert not is_X(tlb_plru_victim(to_integer(tlb_req_index))); - repl_way := unsigned(tlb_plru_victim(to_integer(tlb_req_index))); + repl_way := to_unsigned(0, TLB_WAY_BITS); + if TLB_NUM_WAYS > 1 then + if tlb_hit = '1' then + repl_way := tlb_hit_way; + else + repl_way := unsigned(tlb_plru_victim(to_integer(tlb_req_index))); + end if; + assert not is_X(repl_way); end if; - assert not is_X(repl_way); eatag := r0.req.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); tagset := tlb_tag_way; write_tlb_tag(to_integer(repl_way), tagset, eatag); @@ -974,11 +977,14 @@ begin end if; -- The way to replace on a miss - if r1.write_tag = '1' then - assert not is_X(r1.store_index); - replace_way <= unsigned(plru_victim(to_integer(r1.store_index))); - else - replace_way <= r1.store_way; + replace_way <= to_unsigned(0, WAY_BITS); + if NUM_WAYS > 1 then + if r1.write_tag = '1' then + assert not is_X(r1.store_index); + replace_way <= unsigned(plru_victim(to_integer(r1.store_index))); + else + replace_way <= r1.store_way; + end if; end if; -- See if the request matches the line currently being reloaded From 86212dc8797a454c8dcd52c2b275731ec1dffc6e Mon Sep 17 00:00:00 2001 
From: Paul Mackerras Date: Wed, 7 Sep 2022 20:18:18 +1000 Subject: [PATCH 3/5] icache: Split PLRU into storage and logic Rather than having update and decode logic for each individual PLRU as well as a register to store the current PLRU state, we now put the PLRU state in a little RAM, which will typically use LUT RAM on FPGAs, and have just a single copy of the logic to calculate the pseudo-LRU way and to update the PLRU state. This logic is in the plrufn module and is just combinatorial logic. A new module was created for this as other parts of the system are still using plru.vhdl. The PLRU RAM in the icache is read asynchronously in the cycle after the cache tag matching is done. At the end of that cycle the PLRU RAM entry is updated if the access was a cache hit, or a victim way is calculated and stored if the access was a cache miss and miss handling is starting in this cycle. Signed-off-by: Paul Mackerras --- Makefile | 4 +-- icache.vhdl | 80 +++++++++++++++++++++++++++----------------------- microwatt.core | 1 + plrufn.vhdl | 72 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 118 insertions(+), 39 deletions(-) create mode 100644 plrufn.vhdl diff --git a/Makefile b/Makefile index 3861603..12a9317 100644 --- a/Makefile +++ b/Makefile @@ -68,8 +68,8 @@ all: $(all) $(shell scripts/make_version.sh git.vhdl) core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \ - utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl predecode.vhdl \ - decode1.vhdl helpers.vhdl insn_helpers.vhdl \ + utils.vhdl plru.vhdl plrufn.vhdl cache_ram.vhdl icache.vhdl \ + predecode.vhdl decode1.vhdl helpers.vhdl insn_helpers.vhdl \ control.vhdl decode2.vhdl register_file.vhdl \ cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \ logical.vhdl countbits.vhdl multiply.vhdl multiply-32s.vhdl divider.vhdl \ diff --git a/icache.vhdl b/icache.vhdl index 592e901..0467630 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -12,7 +12,6 @@ -- efficient use of 
distributed RAM and less logic/muxes. Currently we -- write TAG_BITS width which may not match full ram blocks and might -- cause muxes to be inferred for "partial writes". --- * Check if making the read size of PLRU a ROM helps utilization -- library ieee; use ieee.std_logic_1164.all; @@ -236,8 +235,7 @@ architecture rtl of icache is signal wb_rd_data : std_ulogic_vector(ROW_SIZE_BITS - 1 downto 0); -- PLRU output interface - type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0); - signal plru_victim : plru_out_t; + signal plru_victim : way_sig_t; -- Memory write snoop signals signal snoop_valid : std_ulogic; @@ -447,40 +445,48 @@ begin -- Generate PLRUs maybe_plrus: if NUM_WAYS > 1 generate + type plru_array is array(index_t) of std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_ram : plru_array; + signal plru_cur : std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_upd : std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); + signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); begin - plrus: for i in 0 to NUM_LINES-1 generate - -- PLRU interface - signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); - signal plru_acc_en : std_ulogic; - signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); - - begin - plru : entity work.plru - generic map ( - BITS => WAY_BITS - ) - port map ( - clk => clk, - rst => rst, - acc => plru_acc, - acc_en => plru_acc_en, - lru => plru_out - ); - - process(all) - begin - -- PLRU interface - if is_X(r.hit_nia) then - plru_acc_en <= 'X'; - elsif get_index(r.hit_nia) = i then - plru_acc_en <= r.hit_valid; - else - plru_acc_en <= '0'; - end if; - plru_acc <= std_ulogic_vector(r.hit_way); - plru_victim(i) <= plru_out; - end process; - end generate; + plru : entity work.plrufn + generic map ( + BITS => WAY_BITS + ) + port map ( + acc => plru_acc, + tree_in => plru_cur, + tree_out => plru_upd, + lru => plru_out + ); + + process(all) + begin + -- Read 
PLRU bits from array + if is_X(r.hit_nia) then + plru_cur <= (others => 'X'); + else + plru_cur <= plru_ram(to_integer(get_index(r.hit_nia))); + end if; + + -- PLRU interface + plru_acc <= std_ulogic_vector(r.hit_way); + plru_victim <= unsigned(plru_out); + end process; + + -- synchronous writes to PLRU array + process(clk) + begin + if rising_edge(clk) then + if r.hit_valid = '1' then + assert not is_X(r.hit_nia) severity failure; + plru_ram(to_integer(get_index(r.hit_nia))) <= plru_upd; + end if; + end if; + end process; end generate; -- TLB hit detection and real address generation @@ -791,7 +797,7 @@ begin replace_way := to_unsigned(0, WAY_BITS); if NUM_WAYS > 1 then -- Get victim way from plru - replace_way := unsigned(plru_victim(to_integer(r.store_index))); + replace_way := plru_victim; end if; r.store_way <= replace_way; diff --git a/microwatt.core b/microwatt.core index b14a61e..bb6770d 100644 --- a/microwatt.core +++ b/microwatt.core @@ -34,6 +34,7 @@ filesets: - core.vhdl - icache.vhdl - plru.vhdl + - plrufn.vhdl - cache_ram.vhdl - core_debug.vhdl - utils.vhdl diff --git a/plrufn.vhdl b/plrufn.vhdl new file mode 100644 index 0000000..13ffd6a --- /dev/null +++ b/plrufn.vhdl @@ -0,0 +1,72 @@ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; +use ieee.math_real.all; + +entity plrufn is + generic ( + BITS : positive := 2 + ) + ; + port ( + acc : in std_ulogic_vector(BITS-1 downto 0); + tree_in : in std_ulogic_vector(2 ** BITS - 2 downto 0); + tree_out : out std_ulogic_vector(2 ** BITS - 2 downto 0); + lru : out std_ulogic_vector(BITS-1 downto 0) + ); +end entity plrufn; + +architecture rtl of plrufn is + -- Each level of the tree (from leaf to root) has half the number of nodes + -- of the previous level. So for a BITS-bit LRU (2^BITS ways), we have a level + -- of 2^(BITS-1) nodes, one of 2^(BITS-2) nodes etc., down to 1. This gives us + -- 2^BITS-1 nodes. I.e., a 2-bit LRU has 3 nodes (2 + 1), a 4-bit LRU has 15 nodes (8 + 4 + 2 + 1) etc.
+ constant count : positive := 2 ** BITS - 1; + subtype node_t is integer range 0 to count - 1; +begin + + get_lru: process(tree_in) + variable node : node_t; + variable abit : std_ulogic; + begin + node := 0; + for i in 0 to BITS-1 loop + abit := tree_in(node); + if is_X(abit) then + abit := '0'; + end if; + lru(BITS-1-i) <= abit; + if i /= BITS-1 then + node := node * 2; + if abit = '1' then + node := node + 2; + else + node := node + 1; + end if; + end if; + end loop; + end process; + + update_lru: process(all) + variable node : node_t; + variable abit : std_ulogic; + begin + tree_out <= tree_in; + node := 0; + for i in 0 to BITS-1 loop + abit := acc(BITS-1-i); + if is_X(abit) then + abit := '0'; + end if; + tree_out(node) <= not abit; + if i /= BITS-1 then + node := node * 2; + if abit = '1' then + node := node + 2; + else + node := node + 1; + end if; + end if; + end loop; + end process; +end; From a1f58679197fee9a8036539be585414ba0ee57df Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 7 Sep 2022 20:21:42 +1000 Subject: [PATCH 4/5] dcache: Split PLRU into storage and logic Rather than having update and decode logic for each individual PLRU as well as a register to store the current PLRU state, we now put the PLRU state in a little RAM, which will typically use LUT RAM on FPGAs, and have just a single copy of the logic to calculate the pseudo-LRU way and to update the PLRU state. The PLRU RAM that applies to the data storage (as opposed to the TLB) is read asynchronously in the cycle after the cache tag matching is done. At the end of that cycle the PLRU RAM entry is updated if the access was a cache hit, or a victim way is calculated and stored if the access was a cache miss. It is possible that a cache miss doesn't start being handled until later, in which case the stored victim way is used later when the miss gets handled.
Similarly for the TLB PLRU, the RAM is read asynchronously in the cycle after a TLB lookup is done, and either updated at the end of that cycle (for a hit), or a victim is chosen and stored for when the TLB miss is satisfied. Signed-off-by: Paul Mackerras --- dcache.vhdl | 187 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 114 insertions(+), 73 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index a29cf6f..75c2ce0 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -317,6 +317,7 @@ architecture rtl of dcache is tlb_hit : std_ulogic; tlb_hit_way : tlb_way_sig_t; tlb_hit_index : tlb_index_sig_t; + tlb_victim : tlb_way_sig_t; -- data buffer for data forwarded from writes to reads forward_data : std_ulogic_vector(63 downto 0); @@ -342,6 +343,8 @@ architecture rtl of dcache is acks_pending : unsigned(2 downto 0); inc_acks : std_ulogic; dec_acks : std_ulogic; + choose_victim : std_ulogic; + victim_way : way_t; -- Signals to complete (possibly with error) ls_valid : std_ulogic; @@ -398,8 +401,7 @@ architecture rtl of dcache is signal ram_wr_select : std_ulogic_vector(ROW_SIZE - 1 downto 0); -- PLRU output interface - type plru_out_t is array(0 to NUM_LINES-1) of std_ulogic_vector(WAY_BITS-1 downto 0); - signal plru_victim : plru_out_t; + signal plru_victim : way_t; signal replace_way : way_t; -- Wishbone read/write/cache write formatting signals @@ -423,8 +425,7 @@ architecture rtl of dcache is signal tlb_miss : std_ulogic; -- TLB PLRU output interface - type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0); - signal tlb_plru_victim : tlb_plru_out_t; + signal tlb_plru_victim : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); signal snoop_tag_set : cache_tags_set_t; signal snoop_valid : std_ulogic; @@ -650,39 +651,49 @@ begin end process; -- Generate TLB PLRUs - maybe_tlb_plrus: if TLB_NUM_WAYS > 1 generate + maybe_tlb_plrus : if TLB_NUM_WAYS > 1 generate + type tlb_plru_array is array(tlb_index_t) of 
std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0); + signal tlb_plru_ram : tlb_plru_array; + signal tlb_plru_cur : std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0); + signal tlb_plru_upd : std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0); + signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); begin - tlb_plrus: for i in 0 to TLB_SET_SIZE - 1 generate - -- TLB PLRU interface - signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); - signal tlb_plru_acc_en : std_ulogic; - signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); - begin - tlb_plru : entity work.plru - generic map ( - BITS => TLB_WAY_BITS - ) - port map ( - clk => clk, - rst => rst, - acc => tlb_plru_acc, - acc_en => tlb_plru_acc_en, - lru => tlb_plru_out - ); - - process(all) - begin - -- PLRU interface - if not is_X(r1.tlb_hit_index) and r1.tlb_hit_index = i then - tlb_plru_acc_en <= r1.tlb_hit; - assert not is_X(r1.tlb_hit_way); - else - tlb_plru_acc_en <= '0'; - end if; - tlb_plru_acc <= std_ulogic_vector(r1.tlb_hit_way); - tlb_plru_victim(i) <= tlb_plru_out; - end process; - end generate; + tlb_plru : entity work.plrufn + generic map ( + BITS => TLB_WAY_BITS + ) + port map ( + acc => tlb_plru_acc, + tree_in => tlb_plru_cur, + tree_out => tlb_plru_upd, + lru => tlb_plru_out + ); + + process(all) + begin + -- Read PLRU bits from array + if is_X(r1.tlb_hit_index) then + tlb_plru_cur <= (others => 'X'); + else + tlb_plru_cur <= tlb_plru_ram(to_integer(r1.tlb_hit_index)); + end if; + + -- PLRU interface + tlb_plru_acc <= std_ulogic_vector(r1.tlb_hit_way); + tlb_plru_victim <= tlb_plru_out; + end process; + + -- synchronous writes to TLB PLRU array + process(clk) + begin + if rising_edge(clk) then + if r1.tlb_hit = '1' then + assert not is_X(r1.tlb_hit_index) severity failure; + tlb_plru_ram(to_integer(r1.tlb_hit_index)) <= tlb_plru_upd; + end if; + end if; + end process; end generate; tlb_search : process(all) @@ 
-753,7 +764,7 @@ begin if tlb_hit = '1' then repl_way := tlb_hit_way; else - repl_way := unsigned(tlb_plru_victim(to_integer(tlb_req_index))); + repl_way := unsigned(r1.tlb_victim); end if; assert not is_X(repl_way); end if; @@ -770,39 +781,49 @@ begin end process; -- Generate PLRUs - maybe_plrus: if NUM_WAYS > 1 generate + maybe_plrus : if NUM_WAYS > 1 generate + type plru_array is array(0 to NUM_LINES-1) of std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_ram : plru_array; + signal plru_cur : std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_upd : std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); + signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); begin - plrus: for i in 0 to NUM_LINES-1 generate - -- PLRU interface - signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); - signal plru_acc_en : std_ulogic; - signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); - - begin - plru : entity work.plru - generic map ( - BITS => WAY_BITS - ) - port map ( - clk => clk, - rst => rst, - acc => plru_acc, - acc_en => plru_acc_en, - lru => plru_out - ); - - process(all) - begin - -- PLRU interface - if not is_X(r1.hit_index) and r1.hit_index = to_unsigned(i, INDEX_BITS) then - plru_acc_en <= r1.cache_hit; - else - plru_acc_en <= '0'; - end if; - plru_acc <= std_ulogic_vector(r1.hit_way); - plru_victim(i) <= plru_out; - end process; - end generate; + plru : entity work.plrufn + generic map ( + BITS => WAY_BITS + ) + port map ( + acc => plru_acc, + tree_in => plru_cur, + tree_out => plru_upd, + lru => plru_out + ); + + process(all) + begin + -- Read PLRU bits from array + if is_X(r1.hit_index) then + plru_cur <= (others => 'X'); + else + plru_cur <= plru_ram(to_integer(r1.hit_index)); + end if; + + -- PLRU interface + plru_acc <= std_ulogic_vector(r1.hit_way); + plru_victim <= unsigned(plru_out); + end process; + + -- synchronous writes to PLRU array + process(clk) + begin + if rising_edge(clk) then 
+ if r1.cache_hit = '1' then + assert not is_X(r1.hit_index) severity failure; + plru_ram(to_integer(r1.hit_index)) <= plru_upd; + end if; + end if; + end process; end generate; -- Cache tag RAM read port @@ -980,8 +1001,13 @@ begin replace_way <= to_unsigned(0, WAY_BITS); if NUM_WAYS > 1 then if r1.write_tag = '1' then - assert not is_X(r1.store_index); - replace_way <= unsigned(plru_victim(to_integer(r1.store_index))); + if r1.choose_victim = '1' then + replace_way <= plru_victim; + else + -- Cache victim way was chosen earlier, + -- in the cycle after the miss was detected. + replace_way <= r1.victim_way; + end if; else replace_way <= r1.store_way; end if; @@ -1305,8 +1331,6 @@ begin end if; -- Fast path for load/store hits. Set signals for the writeback controls. - r1.hit_way <= req_hit_way; - r1.hit_index <= req_index; if req_op = OP_LOAD_HIT then r1.hit_load_valid <= '1'; else @@ -1340,6 +1364,11 @@ begin r1.tlb_hit <= tlb_hit; r1.tlb_hit_way <= tlb_hit_way; r1.tlb_hit_index <= tlb_req_index; + -- determine victim way in the TLB in the cycle after + -- we detect the TLB miss + if r1.ls_error = '1' then + r1.tlb_victim <= unsigned(tlb_plru_victim); + end if; end if; end process; @@ -1364,6 +1393,7 @@ begin ev.load_miss <= '0'; ev.store_miss <= '0'; ev.dtlb_miss <= tlb_miss; + r1.choose_victim <= '0'; -- On reset, clear all valid bits to force misses if rst = '1' then @@ -1460,6 +1490,17 @@ begin end if; end if; + -- Signals for PLRU update and victim selection + r1.hit_way <= req_hit_way; + r1.hit_index <= req_index; + -- Record victim way in the cycle after we see a load or dcbz miss + if r1.choose_victim = '1' then + r1.victim_way <= plru_victim; + end if; + if req_op = OP_LOAD_MISS or (req_op = OP_STORE_MISS and r0.req.dcbz = '1') then + r1.choose_victim <= '1'; + end if; + -- Main state machine case r1.state is when IDLE => From b8f9c833f85a13514f61b64895fa9ae6f7c2d671 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 19 Sep 2022 18:05:30 +1000 
Subject: [PATCH 5/5] litedram: Split L2 PLRU into storage and logic As has been done for the L1 dcache and icache, this puts the L2 cache PLRU state into a little RAM and has a single copy of the logic to calculate the pseudo-LRU way and update the PLRU state. Signed-off-by: Paul Mackerras --- litedram/extras/litedram-wrapper-l2.vhdl | 68 +++++++++++++----------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/litedram/extras/litedram-wrapper-l2.vhdl b/litedram/extras/litedram-wrapper-l2.vhdl index 6c0967b..4bea293 100644 --- a/litedram/extras/litedram-wrapper-l2.vhdl +++ b/litedram/extras/litedram-wrapper-l2.vhdl @@ -305,8 +305,7 @@ architecture behaviour of litedram_wrapper is signal cache_out : cache_ram_out_t; -- PLRU output interface - type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0); - signal plru_victim : plru_out_t; + signal plru_victim : way_t; -- -- Helper functions to decode incoming requests @@ -565,39 +564,44 @@ begin end generate; -- Generate PLRUs - maybe_plrus: if NUM_WAYS > 1 generate + maybe_plrus : if NUM_WAYS > 1 generate + type plru_array is array(index_t) of std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_ram : plru_array; + signal plru_cur : std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_upd : std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); + signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); begin - plrus: for i in 0 to NUM_LINES-1 generate + plru : entity work.plrufn + generic map ( + BITS => WAY_BITS + ) + port map ( + acc => plru_acc, + tree_in => plru_cur, + tree_out => plru_upd, + lru => plru_out + ); + + process(all) + begin + -- Read PLRU bits from array + plru_cur <= plru_ram(req_index); + -- PLRU interface - signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); - signal plru_acc_en : std_ulogic; - signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); + plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, 
WAY_BITS)); + plru_victim <= to_integer(unsigned(plru_out)); + end process; + + -- synchronous writes to PLRU array + process(system_clk) begin - plru : entity work.plru - generic map ( - BITS => WAY_BITS - ) - port map ( - clk => system_clk, - rst => system_reset, - acc => plru_acc, - acc_en => plru_acc_en, - lru => plru_out - ); - - process(req_index, req_op, req_hit_way, plru_out) - begin - -- PLRU interface - if (req_op = OP_LOAD_HIT or - req_op = OP_STORE_HIT) and req_index = i then - plru_acc_en <= '1'; - else - plru_acc_en <= '0'; + if rising_edge(system_clk) then + if (req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT) then + plru_ram(req_index) <= plru_upd; end if; - plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS)); - plru_victim(i) <= plru_out; - end process; - end generate; + end if; + end process; end generate; -- @@ -1019,7 +1023,7 @@ begin -- We need to read a cache line if req_op = OP_LOAD_MISS and not wait_qdrain then -- Grab way to replace - refill_way <= to_integer(unsigned(plru_victim(req_index))); + refill_way <= plru_victim; -- Keep track of our index and way for subsequent stores refill_index <= req_index;