diff --git a/Makefile b/Makefile
index 3861603..12a9317 100644
--- a/Makefile
+++ b/Makefile
@@ -68,8 +68,8 @@ all: $(all)
 $(shell scripts/make_version.sh git.vhdl)
 
 core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
-	utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl predecode.vhdl \
-	decode1.vhdl helpers.vhdl insn_helpers.vhdl \
+	utils.vhdl plru.vhdl plrufn.vhdl cache_ram.vhdl icache.vhdl \
+	predecode.vhdl decode1.vhdl helpers.vhdl insn_helpers.vhdl \
 	control.vhdl decode2.vhdl register_file.vhdl \
 	cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
 	logical.vhdl countbits.vhdl multiply.vhdl multiply-32s.vhdl divider.vhdl \
diff --git a/icache.vhdl b/icache.vhdl
index 592e901..0467630 100644
--- a/icache.vhdl
+++ b/icache.vhdl
@@ -12,7 +12,6 @@
 --   efficient use of distributed RAM and less logic/muxes. Currently we
 --   write TAG_BITS width which may not match full ram blocks and might
 --   cause muxes to be inferred for "partial writes".
--- * Check if making the read size of PLRU a ROM helps utilization
 --
 library ieee;
 use ieee.std_logic_1164.all;
@@ -236,8 +235,7 @@ architecture rtl of icache is
     signal wb_rd_data : std_ulogic_vector(ROW_SIZE_BITS - 1 downto 0);
 
     -- PLRU output interface
-    type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0);
-    signal plru_victim : plru_out_t;
+    signal plru_victim : way_sig_t;
 
     -- Memory write snoop signals
     signal snoop_valid : std_ulogic;
@@ -447,40 +445,48 @@ begin
 
     -- Generate PLRUs
     maybe_plrus: if NUM_WAYS > 1 generate
+        type plru_array is array(index_t) of std_ulogic_vector(NUM_WAYS - 2 downto 0);
+        signal plru_ram : plru_array;
+        signal plru_cur : std_ulogic_vector(NUM_WAYS - 2 downto 0);
+        signal plru_upd : std_ulogic_vector(NUM_WAYS - 2 downto 0);
+        signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
+        signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
     begin
-        plrus: for i in 0 to NUM_LINES-1 generate
-            -- PLRU interface
-            signal plru_acc    : std_ulogic_vector(WAY_BITS-1 downto 0);
-            signal plru_acc_en : std_ulogic;
-            signal plru_out    : std_ulogic_vector(WAY_BITS-1 downto 0);
-
-        begin
-            plru : entity work.plru
-                generic map (
-                    BITS => WAY_BITS
-                    )
-                port map (
-                    clk => clk,
-                    rst => rst,
-                    acc => plru_acc,
-                    acc_en => plru_acc_en,
-                    lru => plru_out
-                    );
-
-            process(all)
-            begin
-                -- PLRU interface
-                if is_X(r.hit_nia) then
-                    plru_acc_en <= 'X';
-                elsif get_index(r.hit_nia) = i then
-                    plru_acc_en <= r.hit_valid;
-                else
-                    plru_acc_en <= '0';
-                end if;
-                plru_acc <= std_ulogic_vector(r.hit_way);
-                plru_victim(i) <= plru_out;
-            end process;
-        end generate;
+        plru : entity work.plrufn
+            generic map (
+                BITS => WAY_BITS
+                )
+            port map (
+                acc => plru_acc,
+                tree_in => plru_cur,
+                tree_out => plru_upd,
+                lru => plru_out
+                );
+
+        process(all)
+        begin
+            -- Read the PLRU tree bits of the line selected by get_index(r.hit_nia); drive all 'X' when is_X(r.hit_nia)
+            if is_X(r.hit_nia) then
+                plru_cur <= (others => 'X');
+            else
+                plru_cur <= plru_ram(to_integer(get_index(r.hit_nia)));
+            end if;
+
+            -- PLRU interface: r.hit_way is the accessed way; plru_victim is the victim for the line read above. NOTE(review): the code this replaces selected the victim by r.store_index - confirm r.hit_nia indexes the line being replaced when plru_victim is sampled
+            plru_acc <= std_ulogic_vector(r.hit_way);
+            plru_victim <= unsigned(plru_out);
+        end process;
+
+        -- Synchronous write of the updated tree bits (plru_upd) back into plru_ram, only when r.hit_valid = '1'
+        process(clk)
+        begin
+            if rising_edge(clk) then
+                if r.hit_valid = '1' then
+                    assert not is_X(r.hit_nia) severity failure;
+                    plru_ram(to_integer(get_index(r.hit_nia))) <= plru_upd;
+                end if;
+            end if;
+        end process;
     end generate;
 
     -- TLB hit detection and real address generation
@@ -791,7 +797,7 @@ begin
                     replace_way := to_unsigned(0, WAY_BITS);
                     if NUM_WAYS > 1 then
                         -- Get victim way from plru
-                        replace_way := unsigned(plru_victim(to_integer(r.store_index)));
+                        replace_way := plru_victim;
                     end if;
                     r.store_way <= replace_way;
 
diff --git a/microwatt.core b/microwatt.core
index b14a61e..bb6770d 100644
--- a/microwatt.core
+++ b/microwatt.core
@@ -34,6 +34,7 @@ filesets:
       - core.vhdl
       - icache.vhdl
       - plru.vhdl
+      - plrufn.vhdl
       - cache_ram.vhdl
       - core_debug.vhdl
       - utils.vhdl
diff --git a/plrufn.vhdl b/plrufn.vhdl
new file mode 100644
index 0000000..13ffd6a
--- /dev/null
+++ b/plrufn.vhdl
@@ -0,0 +1,85 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+use ieee.math_real.all;
+
+-- Stateless tree pseudo-LRU function block.
+--
+-- This entity has no clock and holds no state: the caller supplies the
+-- current tree bits on 'tree_in'.  From those bits it reports the victim
+-- way on 'lru', and from the way being accessed on 'acc' it produces the
+-- updated tree bits on 'tree_out'.
+entity plrufn is
+    generic (
+        -- Number of way-select bits; the tree covers 2**BITS ways.
+        BITS : positive := 2
+        );
+    port (
+        acc      : in  std_ulogic_vector(BITS-1 downto 0);
+        tree_in  : in  std_ulogic_vector(2 ** BITS - 2 downto 0);
+        tree_out : out std_ulogic_vector(2 ** BITS - 2 downto 0);
+        lru      : out std_ulogic_vector(BITS-1 downto 0)
+        );
+end entity plrufn;
+
+architecture rtl of plrufn is
+    -- The tree is stored heap-style: node 0 is the root and the children
+    -- of node n are nodes 2n+1 and 2n+2.  Selecting among 2**BITS ways
+    -- takes BITS levels holding 1, 2, 4, ... 2**(BITS-1) nodes, for a total
+    -- of 2**BITS - 1 nodes.  E.g. BITS = 2 needs 3 nodes (1 + 2) and
+    -- BITS = 4 needs 15 nodes (1 + 2 + 4 + 8).
+    constant NODES : positive := 2 ** BITS - 1;
+    subtype node_idx_t is integer range 0 to NODES - 1;
+
+    -- Map metavalues (as reported by is_X) to '0'; pass anything else through.
+    function known(b : std_ulogic) return std_ulogic is
+    begin
+        if is_X(b) then
+            return '0';
+        end if;
+        return b;
+    end function;
+
+    -- Index of the child reached from 'parent' when following direction 'dir'.
+    function child(parent : node_idx_t; dir : std_ulogic) return node_idx_t is
+    begin
+        if dir = '1' then
+            return parent * 2 + 2;
+        end if;
+        return parent * 2 + 1;
+    end function;
+begin
+
+    -- Walk from the root following the stored bits; the bits seen along the
+    -- way, most significant first, form the victim way number.
+    find_victim: process(all)
+        variable cur : node_idx_t;
+        variable dir : std_ulogic;
+    begin
+        cur := 0;
+        for lvl in BITS-1 downto 0 loop
+            dir := known(tree_in(cur));
+            lru(lvl) <= dir;
+            if lvl > 0 then
+                cur := child(cur, dir);
+            end if;
+        end loop;
+    end process;
+
+    -- Start from the current tree, then walk the path selected by 'acc'
+    -- and set each node on it to the inverse of the accessed direction.
+    touch_way: process(all)
+        variable cur : node_idx_t;
+        variable dir : std_ulogic;
+    begin
+        tree_out <= tree_in;
+        cur := 0;
+        for lvl in BITS-1 downto 0 loop
+            dir := known(acc(lvl));
+            tree_out(cur) <= not dir;
+            if lvl > 0 then
+                cur := child(cur, dir);
+            end if;
+        end loop;
+    end process;
+end;