diff --git a/Makefile b/Makefile index 3861603..12a9317 100644 --- a/Makefile +++ b/Makefile @@ -68,8 +68,8 @@ all: $(all) $(shell scripts/make_version.sh git.vhdl) core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \ - utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl predecode.vhdl \ - decode1.vhdl helpers.vhdl insn_helpers.vhdl \ + utils.vhdl plru.vhdl plrufn.vhdl cache_ram.vhdl icache.vhdl \ + predecode.vhdl decode1.vhdl helpers.vhdl insn_helpers.vhdl \ control.vhdl decode2.vhdl register_file.vhdl \ cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \ logical.vhdl countbits.vhdl multiply.vhdl multiply-32s.vhdl divider.vhdl \ diff --git a/dcache.vhdl b/dcache.vhdl index 6f59fab..75c2ce0 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -84,7 +84,8 @@ architecture rtl of dcache is -- TAG_WIDTH is the width in bits of each way of the tag RAM constant TAG_WIDTH : natural := TAG_BITS + 7 - ((TAG_BITS + 7) mod 8); -- WAY_BITS is the number of bits to select a way - constant WAY_BITS : natural := log2(NUM_WAYS); + -- Make sure this is at least 1, to avoid 0-element vectors + constant WAY_BITS : natural := maximum(log2(NUM_WAYS), 1); -- Example of layout for 32 lines of 64 bytes: -- @@ -130,7 +131,7 @@ architecture rtl of dcache is -- L1 TLB. 
constant TLB_SET_BITS : natural := log2(TLB_SET_SIZE); - constant TLB_WAY_BITS : natural := log2(TLB_NUM_WAYS); + constant TLB_WAY_BITS : natural := maximum(log2(TLB_NUM_WAYS), 1); constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_SET_BITS); constant TLB_TAG_WAY_BITS : natural := TLB_NUM_WAYS * TLB_EA_TAG_BITS; constant TLB_PTE_BITS : natural := 64; @@ -316,6 +317,7 @@ architecture rtl of dcache is tlb_hit : std_ulogic; tlb_hit_way : tlb_way_sig_t; tlb_hit_index : tlb_index_sig_t; + tlb_victim : tlb_way_sig_t; -- data buffer for data forwarded from writes to reads forward_data : std_ulogic_vector(63 downto 0); @@ -341,6 +343,8 @@ architecture rtl of dcache is acks_pending : unsigned(2 downto 0); inc_acks : std_ulogic; dec_acks : std_ulogic; + choose_victim : std_ulogic; + victim_way : way_t; -- Signals to complete (possibly with error) ls_valid : std_ulogic; @@ -397,8 +401,7 @@ architecture rtl of dcache is signal ram_wr_select : std_ulogic_vector(ROW_SIZE - 1 downto 0); -- PLRU output interface - type plru_out_t is array(0 to NUM_LINES-1) of std_ulogic_vector(WAY_BITS-1 downto 0); - signal plru_victim : plru_out_t; + signal plru_victim : way_t; signal replace_way : way_t; -- Wishbone read/write/cache write formatting signals @@ -422,8 +425,7 @@ architecture rtl of dcache is signal tlb_miss : std_ulogic; -- TLB PLRU output interface - type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0); - signal tlb_plru_victim : tlb_plru_out_t; + signal tlb_plru_victim : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); signal snoop_tag_set : cache_tags_set_t; signal snoop_valid : std_ulogic; @@ -649,39 +651,49 @@ begin end process; -- Generate TLB PLRUs - maybe_tlb_plrus: if TLB_NUM_WAYS > 1 generate + maybe_tlb_plrus : if TLB_NUM_WAYS > 1 generate + type tlb_plru_array is array(tlb_index_t) of std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0); + signal tlb_plru_ram : tlb_plru_array; + signal tlb_plru_cur : std_ulogic_vector(TLB_NUM_WAYS - 2 
downto 0); + signal tlb_plru_upd : std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0); + signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); begin - tlb_plrus: for i in 0 to TLB_SET_SIZE - 1 generate - -- TLB PLRU interface - signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); - signal tlb_plru_acc_en : std_ulogic; - signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); - begin - tlb_plru : entity work.plru - generic map ( - BITS => TLB_WAY_BITS - ) - port map ( - clk => clk, - rst => rst, - acc => tlb_plru_acc, - acc_en => tlb_plru_acc_en, - lru => tlb_plru_out - ); - - process(all) - begin - -- PLRU interface - if not is_X(r1.tlb_hit_index) and r1.tlb_hit_index = i then - tlb_plru_acc_en <= r1.tlb_hit; - assert not is_X(r1.tlb_hit_way); - else - tlb_plru_acc_en <= '0'; - end if; - tlb_plru_acc <= std_ulogic_vector(r1.tlb_hit_way); - tlb_plru_victim(i) <= tlb_plru_out; - end process; - end generate; + tlb_plru : entity work.plrufn + generic map ( + BITS => TLB_WAY_BITS + ) + port map ( + acc => tlb_plru_acc, + tree_in => tlb_plru_cur, + tree_out => tlb_plru_upd, + lru => tlb_plru_out + ); + + process(all) + begin + -- Read PLRU bits from array + if is_X(r1.tlb_hit_index) then + tlb_plru_cur <= (others => 'X'); + else + tlb_plru_cur <= tlb_plru_ram(to_integer(r1.tlb_hit_index)); + end if; + + -- PLRU interface + tlb_plru_acc <= std_ulogic_vector(r1.tlb_hit_way); + tlb_plru_victim <= tlb_plru_out; + end process; + + -- synchronous writes to TLB PLRU array + process(clk) + begin + if rising_edge(clk) then + if r1.tlb_hit = '1' then + assert not is_X(r1.tlb_hit_index) severity failure; + tlb_plru_ram(to_integer(r1.tlb_hit_index)) <= tlb_plru_upd; + end if; + end if; + end process; end generate; tlb_search : process(all) @@ -747,13 +759,15 @@ begin end if; elsif tlbwe = '1' then assert not is_X(tlb_req_index); - if tlb_hit = '1' then - repl_way := tlb_hit_way; - else - 
assert not is_X(tlb_plru_victim(to_integer(tlb_req_index))); - repl_way := unsigned(tlb_plru_victim(to_integer(tlb_req_index))); + repl_way := to_unsigned(0, TLB_WAY_BITS); + if TLB_NUM_WAYS > 1 then + if tlb_hit = '1' then + repl_way := tlb_hit_way; + else + repl_way := unsigned(r1.tlb_victim); + end if; + assert not is_X(repl_way); end if; - assert not is_X(repl_way); eatag := r0.req.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); tagset := tlb_tag_way; write_tlb_tag(to_integer(repl_way), tagset, eatag); @@ -767,39 +781,49 @@ begin end process; -- Generate PLRUs - maybe_plrus: if NUM_WAYS > 1 generate + maybe_plrus : if NUM_WAYS > 1 generate + type plru_array is array(0 to NUM_LINES-1) of std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_ram : plru_array; + signal plru_cur : std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_upd : std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); + signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); begin - plrus: for i in 0 to NUM_LINES-1 generate - -- PLRU interface - signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); - signal plru_acc_en : std_ulogic; - signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); - - begin - plru : entity work.plru - generic map ( - BITS => WAY_BITS - ) - port map ( - clk => clk, - rst => rst, - acc => plru_acc, - acc_en => plru_acc_en, - lru => plru_out - ); - - process(all) - begin - -- PLRU interface - if not is_X(r1.hit_index) and r1.hit_index = to_unsigned(i, INDEX_BITS) then - plru_acc_en <= r1.cache_hit; - else - plru_acc_en <= '0'; - end if; - plru_acc <= std_ulogic_vector(r1.hit_way); - plru_victim(i) <= plru_out; - end process; - end generate; + plru : entity work.plrufn + generic map ( + BITS => WAY_BITS + ) + port map ( + acc => plru_acc, + tree_in => plru_cur, + tree_out => plru_upd, + lru => plru_out + ); + + process(all) + begin + -- Read PLRU bits from array + if is_X(r1.hit_index) then + plru_cur <= (others 
=> 'X'); + else + plru_cur <= plru_ram(to_integer(r1.hit_index)); + end if; + + -- PLRU interface + plru_acc <= std_ulogic_vector(r1.hit_way); + plru_victim <= unsigned(plru_out); + end process; + + -- synchronous writes to PLRU array + process(clk) + begin + if rising_edge(clk) then + if r1.cache_hit = '1' then + assert not is_X(r1.hit_index) severity failure; + plru_ram(to_integer(r1.hit_index)) <= plru_upd; + end if; + end if; + end process; end generate; -- Cache tag RAM read port @@ -974,11 +998,19 @@ begin end if; -- The way to replace on a miss - if r1.write_tag = '1' then - assert not is_X(r1.store_index); - replace_way <= unsigned(plru_victim(to_integer(r1.store_index))); - else - replace_way <= r1.store_way; + replace_way <= to_unsigned(0, WAY_BITS); + if NUM_WAYS > 1 then + if r1.write_tag = '1' then + if r1.choose_victim = '1' then + replace_way <= plru_victim; + else + -- Cache victim way was chosen earlier, + -- in the cycle after the miss was detected. + replace_way <= r1.victim_way; + end if; + else + replace_way <= r1.store_way; + end if; end if; -- See if the request matches the line currently being reloaded @@ -1299,8 +1331,6 @@ begin end if; -- Fast path for load/store hits. Set signals for the writeback controls. 
- r1.hit_way <= req_hit_way; - r1.hit_index <= req_index; if req_op = OP_LOAD_HIT then r1.hit_load_valid <= '1'; else @@ -1334,6 +1364,11 @@ begin r1.tlb_hit <= tlb_hit; r1.tlb_hit_way <= tlb_hit_way; r1.tlb_hit_index <= tlb_req_index; + -- determine victim way in the TLB in the cycle after + -- we detect the TLB miss + if r1.ls_error = '1' then + r1.tlb_victim <= unsigned(tlb_plru_victim); + end if; end if; end process; @@ -1358,6 +1393,7 @@ begin ev.load_miss <= '0'; ev.store_miss <= '0'; ev.dtlb_miss <= tlb_miss; + r1.choose_victim <= '0'; -- On reset, clear all valid bits to force misses if rst = '1' then @@ -1454,6 +1490,17 @@ begin end if; end if; + -- Signals for PLRU update and victim selection + r1.hit_way <= req_hit_way; + r1.hit_index <= req_index; + -- Record victim way in the cycle after we see a load or dcbz miss + if r1.choose_victim = '1' then + r1.victim_way <= plru_victim; + end if; + if req_op = OP_LOAD_MISS or (req_op = OP_STORE_MISS and r0.req.dcbz = '1') then + r1.choose_victim <= '1'; + end if; + -- Main state machine case r1.state is when IDLE => diff --git a/icache.vhdl b/icache.vhdl index 9113ae6..0467630 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -12,7 +12,6 @@ -- efficient use of distributed RAM and less logic/muxes. Currently we -- write TAG_BITS width which may not match full ram blocks and might -- cause muxes to be inferred for "partial writes". 
--- * Check if making the read size of PLRU a ROM helps utilization -- library ieee; use ieee.std_logic_1164.all; @@ -102,7 +101,8 @@ architecture rtl of icache is -- the +1 is to allow the endianness to be stored in the tag constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS + 1; -- WAY_BITS is the number of bits to select a way - constant WAY_BITS : natural := log2(NUM_WAYS); + -- Make sure this is at least 1, to avoid 0-element vectors + constant WAY_BITS : natural := maximum(log2(NUM_WAYS), 1); -- Example of layout for 32 lines of 64 bytes: -- @@ -235,8 +235,7 @@ architecture rtl of icache is signal wb_rd_data : std_ulogic_vector(ROW_SIZE_BITS - 1 downto 0); -- PLRU output interface - type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0); - signal plru_victim : plru_out_t; + signal plru_victim : way_sig_t; -- Memory write snoop signals signal snoop_valid : std_ulogic; @@ -446,40 +445,48 @@ begin -- Generate PLRUs maybe_plrus: if NUM_WAYS > 1 generate + type plru_array is array(index_t) of std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_ram : plru_array; + signal plru_cur : std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_upd : std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); + signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); begin - plrus: for i in 0 to NUM_LINES-1 generate - -- PLRU interface - signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); - signal plru_acc_en : std_ulogic; - signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); - - begin - plru : entity work.plru - generic map ( - BITS => WAY_BITS - ) - port map ( - clk => clk, - rst => rst, - acc => plru_acc, - acc_en => plru_acc_en, - lru => plru_out - ); - - process(all) - begin - -- PLRU interface - if is_X(r.hit_nia) then - plru_acc_en <= 'X'; - elsif get_index(r.hit_nia) = i then - plru_acc_en <= r.hit_valid; - else - plru_acc_en <= '0'; - end if; - plru_acc <= 
std_ulogic_vector(r.hit_way); - plru_victim(i) <= plru_out; - end process; - end generate; + plru : entity work.plrufn + generic map ( + BITS => WAY_BITS + ) + port map ( + acc => plru_acc, + tree_in => plru_cur, + tree_out => plru_upd, + lru => plru_out + ); + + process(all) + begin + -- Read PLRU bits from array + if is_X(r.hit_nia) then + plru_cur <= (others => 'X'); + else + plru_cur <= plru_ram(to_integer(get_index(r.hit_nia))); + end if; + + -- PLRU interface + plru_acc <= std_ulogic_vector(r.hit_way); + plru_victim <= unsigned(plru_out); + end process; + + -- synchronous writes to PLRU array + process(clk) + begin + if rising_edge(clk) then + if r.hit_valid = '1' then + assert not is_X(r.hit_nia) severity failure; + plru_ram(to_integer(get_index(r.hit_nia))) <= plru_upd; + end if; + end if; + end process; end generate; -- TLB hit detection and real address generation @@ -787,8 +794,11 @@ begin assert not is_X(r.store_row) severity failure; assert not is_X(r.recv_row) severity failure; if r.state = CLR_TAG then - -- Get victim way from plru - replace_way := unsigned(plru_victim(to_integer(r.store_index))); + replace_way := to_unsigned(0, WAY_BITS); + if NUM_WAYS > 1 then + -- Get victim way from plru + replace_way := plru_victim; + end if; r.store_way <= replace_way; -- Force misses on that way while reloading that line diff --git a/litedram/extras/litedram-wrapper-l2.vhdl b/litedram/extras/litedram-wrapper-l2.vhdl index 2ee77f8..652e727 100644 --- a/litedram/extras/litedram-wrapper-l2.vhdl +++ b/litedram/extras/litedram-wrapper-l2.vhdl @@ -305,8 +305,7 @@ architecture behaviour of litedram_wrapper is signal cache_out : cache_ram_out_t; -- PLRU output interface - type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0); - signal plru_victim : plru_out_t; + signal plru_victim : way_t; -- -- Helper functions to decode incoming requests @@ -565,39 +564,44 @@ begin end generate; -- Generate PLRUs - maybe_plrus: if NUM_WAYS > 1 generate + 
maybe_plrus : if NUM_WAYS > 1 generate + type plru_array is array(index_t) of std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_ram : plru_array; + signal plru_cur : std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_upd : std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); + signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); begin - plrus: for i in 0 to NUM_LINES-1 generate + plru : entity work.plrufn + generic map ( + BITS => WAY_BITS + ) + port map ( + acc => plru_acc, + tree_in => plru_cur, + tree_out => plru_upd, + lru => plru_out + ); + + process(all) + begin + -- Read PLRU bits from array + plru_cur <= plru_ram(req_index); + -- PLRU interface - signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); - signal plru_acc_en : std_ulogic; - signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); + plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS)); + plru_victim <= to_integer(unsigned(plru_out)); + end process; + + -- synchronous writes to PLRU array + process(system_clk) begin - plru : entity work.plru - generic map ( - BITS => WAY_BITS - ) - port map ( - clk => system_clk, - rst => system_reset, - acc => plru_acc, - acc_en => plru_acc_en, - lru => plru_out - ); - - process(req_index, req_op, req_hit_way, plru_out) - begin - -- PLRU interface - if (req_op = OP_LOAD_HIT or - req_op = OP_STORE_HIT) and req_index = i then - plru_acc_en <= '1'; - else - plru_acc_en <= '0'; + if rising_edge(system_clk) then + if (req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT) then + plru_ram(req_index) <= plru_upd; end if; - plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS)); - plru_victim(i) <= plru_out; - end process; - end generate; + end if; + end process; end generate; -- @@ -1023,7 +1027,7 @@ begin -- We need to read a cache line if req_op = OP_LOAD_MISS and not wait_qdrain then -- Grab way to replace - refill_way <= to_integer(unsigned(plru_victim(req_index))); + refill_way <= 
plru_victim; -- Keep track of our index and way for subsequent stores refill_index <= req_index; diff --git a/microwatt.core b/microwatt.core index b14a61e..bb6770d 100644 --- a/microwatt.core +++ b/microwatt.core @@ -34,6 +34,7 @@ filesets: - core.vhdl - icache.vhdl - plru.vhdl + - plrufn.vhdl - cache_ram.vhdl - core_debug.vhdl - utils.vhdl diff --git a/plrufn.vhdl b/plrufn.vhdl new file mode 100644 index 0000000..13ffd6a --- /dev/null +++ b/plrufn.vhdl @@ -0,0 +1,72 @@ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; +use ieee.math_real.all; + +entity plrufn is + generic ( + BITS : positive := 2 -- width of a way number (callers pass their WAY_BITS) + ) + ; + port ( + acc : in std_ulogic_vector(BITS-1 downto 0); -- way being accessed + tree_in : in std_ulogic_vector(2 ** BITS - 2 downto 0); -- current tree bits, one per node + tree_out : out std_ulogic_vector(2 ** BITS - 2 downto 0); -- tree_in with the nodes on acc's path rewritten + lru : out std_ulogic_vector(BITS-1 downto 0) -- way reached by following tree_in from node 0 + ); +end entity plrufn; + +architecture rtl of plrufn is + -- The PLRU state is a binary tree with one bit per node, stored in heap order + -- (the children of node n are 2n+1 and 2n+2). Each level, from leaf to root, + -- has half the nodes of the level below it, so choosing among 2^BITS ways takes + -- 2^BITS - 1 nodes: BITS = 2 needs 3 (2 + 1), BITS = 4 needs 15 (8 + 4 + 2 + 1).
+ constant count : positive := 2 ** BITS - 1; + subtype node_t is integer range 0 to count - 1; +begin + + get_lru: process(tree_in) + variable node : node_t; + variable abit : std_ulogic; + begin + node := 0; -- start the walk at the root + for i in 0 to BITS-1 loop + abit := tree_in(node); + if is_X(abit) then + abit := '0'; -- treat an undefined tree bit as '0' + end if; + lru(BITS-1-i) <= abit; -- level i supplies result bit BITS-1-i (MSB first) + if i /= BITS-1 then -- no descent after the final level + node := node * 2; + if abit = '1' then + node := node + 2; + else + node := node + 1; + end if; + end if; + end loop; + end process; + + update_lru: process(all) + variable node : node_t; + variable abit : std_ulogic; + begin + tree_out <= tree_in; -- nodes off the access path keep their value + node := 0; + for i in 0 to BITS-1 loop + abit := acc(BITS-1-i); + if is_X(abit) then + abit := '0'; -- treat an undefined access bit as '0' + end if; + tree_out(node) <= not abit; -- store the opposite of the branch acc takes here + if i /= BITS-1 then + node := node * 2; + if abit = '1' then + node := node + 2; + else + node := node + 1; + end if; + end if; + end loop; + end process; +end;