diff --git a/Makefile b/Makefile index 7b28b31..f69dd15 100644 --- a/Makefile +++ b/Makefile @@ -30,7 +30,10 @@ fetch2.o: common.o wishbone_types.o glibc_random_helpers.o: glibc_random.o: glibc_random_helpers.o helpers.o: -icache.o: common.o wishbone_types.o +cache_ram.o: +plru.o: +plru_tb.o: plru.o +icache.o: common.o wishbone_types.o plru.o cache_ram.o icache_tb.o: common.o wishbone_types.o icache.o simple_ram_behavioural.o insn_helpers.o: loadstore1.o: common.o helpers.o @@ -75,6 +78,9 @@ fetch_tb: fetch_tb.o icache_tb: icache_tb.o $(GHDL) -e $(GHDLFLAGS) -Wl,simple_ram_behavioural_helpers_c.o $@ +plru_tb: plru_tb.o + $(GHDL) -e $(GHDLFLAGS) $@ + loadstore_tb: loadstore_tb.o $(GHDL) -e $(GHDLFLAGS) $@ diff --git a/cache_ram.vhdl b/cache_ram.vhdl new file mode 100644 index 0000000..e0ffd17 --- /dev/null +++ b/cache_ram.vhdl @@ -0,0 +1,46 @@ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; +use ieee.math_real.all; + +entity cache_ram is + generic( + ROW_BITS : integer := 16; + WIDTH : integer := 64 + ); + + port( + clk : in std_logic; + rd_en : in std_logic; + rd_addr : in std_logic_vector(ROW_BITS - 1 downto 0); + rd_data : out std_logic_vector(WIDTH - 1 downto 0); + wr_en : in std_logic; + wr_addr : in std_logic_vector(ROW_BITS - 1 downto 0); + wr_data : in std_logic_vector(WIDTH - 1 downto 0) + ); + +end cache_ram; + +architecture rtl of cache_ram is + constant SIZE : integer := 2**ROW_BITS; + + type ram_type is array (0 to SIZE - 1) of std_logic_vector(WIDTH - 1 downto 0); + signal ram : ram_type; + attribute ram_style : string; + attribute ram_style of ram : signal is "block"; + attribute ram_decomp : string; + attribute ram_decomp of ram : signal is "power"; + +begin + process(clk) + begin + if rising_edge(clk) then + if wr_en = '1' then + ram(to_integer(unsigned(wr_addr))) <= wr_data; + end if; + if rd_en = '1' then + rd_data <= ram(to_integer(unsigned(rd_addr))); + end if; + end if; + end process; +end; diff --git a/common.vhdl 
b/common.vhdl index fc6d888..3d02997 100644 --- a/common.vhdl +++ b/common.vhdl @@ -12,17 +12,16 @@ package common is carry: std_ulogic; end record; - type Fetch1ToFetch2Type is record - nia: std_ulogic_vector(63 downto 0); - end record; - - type Fetch2ToIcacheType is record + type Fetch1ToIcacheType is record req: std_ulogic; - addr: std_ulogic_vector(63 downto 0); + stop_mark: std_ulogic; + nia: std_ulogic_vector(63 downto 0); end record; type IcacheToFetch2Type is record - ack: std_ulogic; + valid: std_ulogic; + stop_mark: std_ulogic; + nia: std_ulogic_vector(63 downto 0); insn: std_ulogic_vector(31 downto 0); end record; diff --git a/core.vhdl b/core.vhdl index df40d43..43b338d 100644 --- a/core.vhdl +++ b/core.vhdl @@ -33,11 +33,10 @@ end core; architecture behave of core is -- fetch signals - signal fetch1_to_fetch2: Fetch1ToFetch2Type; signal fetch2_to_decode1: Fetch2ToDecode1Type; -- icache signals - signal fetch2_to_icache : Fetch2ToIcacheType; + signal fetch1_to_icache : Fetch1ToIcacheType; signal icache_to_fetch2 : IcacheToFetch2Type; -- decode signals @@ -74,8 +73,8 @@ architecture behave of core is -- local signals signal fetch1_stall_in : std_ulogic; + signal icache_stall_out : std_ulogic; signal fetch2_stall_in : std_ulogic; - signal fetch2_stall_out : std_ulogic; signal decode1_stall_in : std_ulogic; signal decode2_stall_out : std_ulogic; @@ -107,43 +106,43 @@ begin rst => core_rst, stall_in => fetch1_stall_in, flush_in => flush, - e_in => execute1_to_fetch1, - f_out => fetch1_to_fetch2 - ); - - fetch1_stall_in <= fetch2_stall_out or decode2_stall_out; - - fetch2_0: entity work.fetch2 - port map ( - clk => clk, - rst => core_rst, - stall_in => fetch2_stall_in, - stall_out => fetch2_stall_out, - flush_in => flush, - i_in => icache_to_fetch2, - i_out => fetch2_to_icache, stop_in => dbg_core_stop, - f_in => fetch1_to_fetch2, - f_out => fetch2_to_decode1 + e_in => execute1_to_fetch1, + i_out => fetch1_to_icache ); - fetch2_stall_in <= decode2_stall_out; 
+ fetch1_stall_in <= icache_stall_out or decode2_stall_out; icache_0: entity work.icache generic map( - LINE_SIZE_DW => 8, - NUM_LINES => 16 + LINE_SIZE => 64, + NUM_LINES => 16, + NUM_WAYS => 2 ) port map( clk => clk, rst => icache_rst, - i_in => fetch2_to_icache, + i_in => fetch1_to_icache, i_out => icache_to_fetch2, + flush_in => flush, + stall_out => icache_stall_out, wishbone_out => wishbone_insn_out, wishbone_in => wishbone_insn_in ); - icache_rst <= rst or dbg_icache_rst; + icache_rst <= rst or dbg_icache_rst; + + fetch2_0: entity work.fetch2 + port map ( + clk => clk, + rst => core_rst, + stall_in => fetch2_stall_in, + flush_in => flush, + i_in => icache_to_fetch2, + f_out => fetch2_to_decode1 + ); + + fetch2_stall_in <= decode2_stall_out; decode1_0: entity work.decode1 port map ( @@ -274,7 +273,7 @@ begin icache_rst => dbg_icache_rst, terminate => terminate, core_stopped => dbg_core_is_stopped, - nia => fetch1_to_fetch2.nia, + nia => fetch1_to_icache.nia, terminated_out => terminated_out ); diff --git a/core_debug.vhdl b/core_debug.vhdl index c93c70d..ae4414e 100644 --- a/core_debug.vhdl +++ b/core_debug.vhdl @@ -91,15 +91,15 @@ begin reg_write: process(clk) begin if rising_edge(clk) then + -- Reset the 1-cycle "do" signals + do_step <= '0'; + do_reset <= '0'; + do_icreset <= '0'; + if (rst) then stopping <= '0'; terminated <= '0'; else - -- Reset the 1-cycle "do" signals - do_step <= '0'; - do_reset <= '0'; - do_icreset <= '0'; - -- Edge detect on dmi_req for 1-shot pulses dmi_req_1 <= dmi_req; if dmi_req = '1' and dmi_req_1 = '0' then diff --git a/fetch1.vhdl b/fetch1.vhdl index 643e8c8..9cd5445 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -16,59 +16,111 @@ entity fetch1 is -- Control inputs: stall_in : in std_ulogic; flush_in : in std_ulogic; + stop_in : in std_ulogic; -- redirect from execution unit e_in : in Execute1ToFetch1Type; - -- fetch data out - f_out : out Fetch1ToFetch2Type + -- Request to icache + i_out : out Fetch1ToIcacheType ); end entity 
fetch1; architecture behaviour of fetch1 is - type reg_internal_type is record - nia_next : std_ulogic_vector(63 downto 0); + type stop_state_t is (RUNNING, STOPPED, RESTARTING); + type reg_internal_t is record + stop_state: stop_state_t; end record; - signal r_int, rin_int : reg_internal_type; - signal r, rin : Fetch1ToFetch2Type; + signal r, r_next : Fetch1ToIcacheType; + signal r_int, r_next_int : reg_internal_t; begin + regs : process(clk) begin if rising_edge(clk) then - r <= rin; - r_int <= rin_int; + if r /= r_next then + report "fetch1 rst:" & std_ulogic'image(rst) & + " R:" & std_ulogic'image(e_in.redirect) & + " S:" & std_ulogic'image(stall_in) & + " T:" & std_ulogic'image(stop_in) & + " nia:" & to_hstring(r_next.nia) & + " SM:" & std_ulogic'image(r_next.stop_mark); + end if; + r <= r_next; + r_int <= r_next_int; end if; end process; comb : process(all) - variable v : Fetch1ToFetch2Type; - variable v_int : reg_internal_type; + variable v : Fetch1ToIcacheType; + variable v_int : reg_internal_t; + variable increment : boolean; begin v := r; v_int := r_int; - if stall_in = '0' then - v.nia := r_int.nia_next; - end if; - - if e_in.redirect = '1' then - v.nia := e_in.redirect_nia; - end if; - if rst = '1' then - v.nia := RESET_ADDRESS; + v.nia := RESET_ADDRESS; + v_int.stop_state := RUNNING; + elsif e_in.redirect = '1' then + v.nia := e_in.redirect_nia; + elsif stall_in = '0' then + + -- For debug stop/step to work properly we need a little bit of + -- trickery here. If we just stop incrementing and send stop marks + -- when stop_in is set, then we'll increment on the cycle it clears + -- and end up never executing the instruction we were stopped on. 
+ -- + -- Avoiding this, along with the opposite issue when stepping (stop is + -- cleared for only one cycle), is handled by the state machine below + -- + -- By default, increment addresses + increment := true; + case v_int.stop_state is + when RUNNING => + -- If we are running and stop_in is set, then stop incrementing, + -- we are now stopped. + if stop_in = '1' then + increment := false; + v_int.stop_state := STOPPED; + end if; + when STOPPED => + -- When stopped, never increment. If stop is cleared, go to state + -- "restarting" but still don't increment that cycle. stop_in is + -- now 0 so we'll send the NIA down without a stop mark. + increment := false; + if stop_in = '0' then + v_int.stop_state := RESTARTING; + end if; + when RESTARTING => + -- We have just sent the NIA down, we can start incrementing again. + -- If stop_in is still not set, go back to running normally. + -- If stop_in is set again (that was a one-cycle "step"), go + -- back to "stopped" state which means we'll stop incrementing + -- on the next cycle. This ensures we increment the PC once after + -- sending one instruction without a stop mark. Since stop_in is + -- now set, the new PC will be sent with a stop mark and thus not + -- executed.
+ if stop_in = '0' then + v_int.stop_state := RUNNING; + else + v_int.stop_state := STOPPED; + end if; + end case; + + if increment then + v.nia := std_logic_vector(unsigned(v.nia) + 4); + end if; end if; - v_int.nia_next := std_logic_vector(unsigned(v.nia) + 4); - - -- Update registers - rin <= v; - rin_int <= v_int; + v.req := not rst; + v.stop_mark := stop_in; - -- Update outputs - f_out <= r; + r_next <= v; + r_next_int <= v_int; - report "fetch1 R:" & std_ulogic'image(e_in.redirect) & " v.nia:" & to_hstring(v.nia) & " f_out.nia:" & to_hstring(f_out.nia); + -- Update outputs to the icache + i_out <= r; end process; diff --git a/fetch2.vhdl b/fetch2.vhdl index 2b34836..99f92ee 100644 --- a/fetch2.vhdl +++ b/fetch2.vhdl @@ -12,55 +12,107 @@ entity fetch2 is rst : in std_ulogic; stall_in : in std_ulogic; - stall_out : out std_ulogic; - flush_in : in std_ulogic; - stop_in : in std_ulogic; + -- Results from icache i_in : in IcacheToFetch2Type; - i_out : out Fetch2ToIcacheType; - - f_in : in Fetch1ToFetch2Type; + -- Output to decode f_out : out Fetch2ToDecode1Type ); end entity fetch2; architecture behaviour of fetch2 is + + -- The icache cannot stall, so we need to stash a cycle + -- of output from it when we stall. 
+ type reg_internal_type is record + stash : IcacheToFetch2Type; + stash_valid : std_ulogic; + stopped : std_ulogic; + end record; + + signal r_int, rin_int : reg_internal_type; signal r, rin : Fetch2ToDecode1Type; + begin regs : process(clk) begin if rising_edge(clk) then + + if (r /= rin) then + report "fetch2 rst:" & std_ulogic'image(rst) & + " S:" & std_ulogic'image(stall_in) & + " F:" & std_ulogic'image(flush_in) & + " T:" & std_ulogic'image(rin.stop_mark) & + " V:" & std_ulogic'image(rin.valid) & + " nia:" & to_hstring(rin.nia); + end if; + -- Output state remains unchanged on stall, unless we are flushing if rst = '1' or flush_in = '1' or stall_in = '0' then r <= rin; end if; + + -- Internal state is updated on every clock + r_int <= rin_int; end if; end process; comb : process(all) - variable v : Fetch2ToDecode1Type; + variable v : Fetch2ToDecode1Type; + variable v_int : reg_internal_type; + variable v_i_in : IcacheToFetch2Type; begin v := r; + v_int := r_int; - -- asynchronous icache lookup - i_out.req <= '1'; - i_out.addr <= f_in.nia; - v.valid := i_in.ack; - v.nia := f_in.nia; - v.insn := i_in.insn; - stall_out <= stop_in or not i_in.ack; + -- If stalling, stash away the current input from the icache + if stall_in = '1' and v_int.stash_valid = '0' then + v_int.stash := i_in; + v_int.stash_valid := '1'; + end if; + + -- If unstalling, source input from the stash and invalidate it, + -- otherwise source normally from the icache. + -- + v_i_in := i_in; + if v_int.stash_valid = '1' and stall_in = '0' then + v_i_in := v_int.stash; + v_int.stash_valid := '0'; + end if; - if flush_in = '1' or stop_in = '1' then + v.valid := v_i_in.valid; + v.stop_mark := v_i_in.stop_mark; + v.nia := v_i_in.nia; + v.insn := v_i_in.insn; + + -- Clear stash internal valid bit on flush. We still mark + -- the stash itself as valid since we still want to override + -- whatever comes from icache when unstalling, but we'll + -- override it with something invalid.
+ -- + if flush_in = '1' then + v_int.stash.valid := '0'; + end if; + + -- If we are flushing or the instruction comes with a stop mark + -- we tag it as invalid so it doesn't get decoded and executed + if flush_in = '1' or v.stop_mark = '1' then v.valid := '0'; end if; - v.stop_mark := stop_in; + + -- Clear stash on reset + if rst = '1' then + v_int.stash_valid := '0'; + end if; -- Update registers rin <= v; + rin_int <= v_int; -- Update outputs f_out <= r; end process; + end architecture behaviour; diff --git a/icache.vhdl b/icache.vhdl index 2565219..89e491e 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -1,3 +1,21 @@ +-- +-- Set associative icache +-- +-- TODO (in no specific order): +-- +-- * Add debug interface to inspect cache content +-- * Add snoop/invalidate path +-- * Add multi-hit error detection +-- * Pipelined bus interface (wb or axi) +-- * Maybe add parity ? There's a few bits free in each BRAM row on Xilinx +-- * Add optimization: service hits on partially loaded lines +-- * Add optimization: (maybe) interrupt reload on flush/redirect +-- * Check if playing with the geometry of the cache tags allow for more +-- efficient use of distributed RAM and less logic/muxes. Currently we +-- write TAG_BITS width which may not match full ram blocks and might +-- cause muxes to be inferred for "partial writes".
+-- * Check if making the read size of PLRU a ROM helps utilization +-- library ieee; use ieee.std_logic_1164.all; use ieee.numeric_std.all; @@ -10,18 +28,23 @@ use work.wishbone_types.all; entity icache is generic ( - -- Line size in 64bit doublewords - LINE_SIZE_DW : natural := 8; - -- Number of lines - NUM_LINES : natural := 32 + -- Line size in bytes + LINE_SIZE : positive := 64; + -- Number of lines in a set + NUM_LINES : positive := 32; + -- Number of ways + NUM_WAYS : positive := 4 ); port ( clk : in std_ulogic; rst : in std_ulogic; - i_in : in Fetch2ToIcacheType; + i_in : in Fetch1ToIcacheType; i_out : out IcacheToFetch2Type; + stall_out : out std_ulogic; + flush_in : in std_ulogic; + wishbone_out : out wishbone_master_out; wishbone_in : in wishbone_slave_out ); @@ -48,126 +71,423 @@ architecture rtl of icache is end if; end function; - constant LINE_SIZE : natural := LINE_SIZE_DW*8; - constant OFFSET_BITS : natural := log2(LINE_SIZE); - constant INDEX_BITS : natural := log2(NUM_LINES); - constant TAG_BITS : natural := 64 - OFFSET_BITS - INDEX_BITS; + -- BRAM organisation: We never access more than wishbone_data_bits at + -- a time so to save resources we make the array only that wide, and + -- use consecutive indices for to make a cache "line" + -- + -- ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits) + constant ROW_SIZE : natural := wishbone_data_bits / 8; + -- ROW_PER_LINE is the number of row (wishbone transactions) in a line + constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE; + -- BRAM_ROWS is the number of rows in BRAM needed to represent the full + -- icache + constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE; + -- INSN_PER_ROW is the number of 32bit instructions per BRAM row + constant INSN_PER_ROW : natural := wishbone_data_bits / 32; + -- Bit fields counts in the address + + -- INSN_BITS is the number of bits to select an instruction in a row + constant INSN_BITS : natural := log2(INSN_PER_ROW); + -- ROW_BITS is 
the number of bits to select a row + constant ROW_BITS : natural := log2(BRAM_ROWS); + -- ROW_LINEBITS is the number of bits to select a row within a line + constant ROW_LINEBITS : natural := log2(ROW_PER_LINE); + -- LINE_OFF_BITS is the number of bits for the offset in a cache line + constant LINE_OFF_BITS : natural := log2(LINE_SIZE); + -- ROW_OFF_BITS is the number of bits for the offset in a row + constant ROW_OFF_BITS : natural := log2(ROW_SIZE); + -- INDEX_BITS is the number of bits to select a cache line + constant INDEX_BITS : natural := log2(NUM_LINES); + -- TAG_BITS is the number of bits of the tag part of the address + constant TAG_BITS : natural := 64 - LINE_OFF_BITS - INDEX_BITS; + -- WAY_BITS is the number of bits to select a way + constant WAY_BITS : natural := log2(NUM_WAYS); + + -- Example of layout for 32 lines of 64 bytes: + -- + -- .. tag |index| line | + -- .. | row | | + -- .. | | | |00| zero (2) + -- .. | | |-| | INSN_BITS (1) + -- .. | |---| | ROW_LINEBITS (3) + -- .. | |--- - --| LINE_OFF_BITS (6) + -- .. | |- --| ROW_OFF_BITS (3) + -- .. |----- ---| | ROW_BITS (8) + -- .. |-----| | INDEX_BITS (5) + -- .. --------| | TAG_BITS (53) + + subtype row_t is integer range 0 to BRAM_ROWS-1; + subtype index_t is integer range 0 to NUM_LINES-1; + subtype way_t is integer range 0 to NUM_WAYS-1; - subtype cacheline_type is std_logic_vector((LINE_SIZE*8)-1 downto 0); - type cacheline_array is array(0 to NUM_LINES-1) of cacheline_type; + -- The cache data BRAM organized as described above for each way + subtype cache_row_t is std_ulogic_vector(wishbone_data_bits-1 downto 0); - subtype cacheline_tag_type is std_logic_vector(TAG_BITS-1 downto 0); - type cacheline_tag_array is array(0 to NUM_LINES-1) of cacheline_tag_type; + -- The cache tags LUTRAM has a row per set. Vivado is a pain and will + -- not handle a clean (commented) definition of the cache tags as a 3d + -- memory.
For now, work around it by putting all the tags + subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0); +-- type cache_tags_set_t is array(way_t) of cache_tag_t; +-- type cache_tags_array_t is array(index_t) of cache_tags_set_t; + constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS; + subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0); + type cache_tags_array_t is array(index_t) of cache_tags_set_t; - signal cachelines : cacheline_array := (others => (others => '0')); - signal tags : cacheline_tag_array := (others => (others => '0')); - signal tags_valid : std_ulogic_vector(NUM_LINES-1 downto 0) := (others => '0'); + -- The cache valid bits + subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0); + type cache_valids_t is array(index_t) of cache_way_valids_t; + + -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs + signal cache_tags : cache_tags_array_t; + signal cache_valids : cache_valids_t; attribute ram_style : string; - attribute ram_style of cachelines : signal is "block"; + attribute ram_style of cache_tags : signal is "distributed"; - attribute ram_decomp : string; - attribute ram_decomp of cachelines : signal is "power"; + -- Cache reload state machine + type state_t is (IDLE, WAIT_ACK); - type state_type is (IDLE, WAIT_ACK); + type reg_internal_t is record + -- Cache hit state (Latches for 1 cycle BRAM access) + hit_way : way_t; + hit_nia : std_ulogic_vector(63 downto 0); + hit_smark : std_ulogic; + hit_valid : std_ulogic; - type reg_internal_type is record - state : state_type; - w : wishbone_master_out; - store_index : integer range 0 to (NUM_LINES-1); - store_word : integer range 0 to (LINE_SIZE-1); + -- Cache miss state (reload state machine) + state : state_t; + wb : wishbone_master_out; + store_way : way_t; + store_index : index_t; end record; - signal r : reg_internal_type; + signal r : reg_internal_t; + + -- Async signals on incoming request + signal req_index : index_t; + signal req_row : 
row_t; + signal req_hit_way : way_t; + signal req_tag : cache_tag_t; + signal req_is_hit : std_ulogic; + signal req_is_miss : std_ulogic; + + -- Cache RAM interface + type cache_ram_out_t is array(way_t) of cache_row_t; + signal cache_out : cache_ram_out_t; - signal read_index : integer range 0 to NUM_LINES-1; - signal read_tag : std_ulogic_vector(63-OFFSET_BITS-INDEX_BITS downto 0); - signal read_miss : boolean; + -- PLRU output interface + type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0); + signal plru_victim : plru_out_t; + + -- Return the cache line index (tag index) for an address + function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is + begin + return to_integer(unsigned(addr(63-TAG_BITS downto LINE_OFF_BITS))); + end; + + -- Return the cache row index (data memory) for an address + function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is + begin + return to_integer(unsigned(addr(63-TAG_BITS downto ROW_OFF_BITS))); + end; - function get_index(addr: std_ulogic_vector(63 downto 0)) return integer is + -- Returns whether this is the last row of a line + function is_last_row(addr: std_ulogic_vector(63 downto 0)) return boolean is + constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); begin - return to_integer(unsigned(addr((OFFSET_BITS+INDEX_BITS-1) downto OFFSET_BITS))); + return addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = ones; end; - function get_word(addr: std_ulogic_vector(63 downto 0); data: cacheline_type) return std_ulogic_vector is - variable word : integer; + -- Return the address of the next row in the current cache line + function next_row_addr(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is + variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0); + variable result : std_ulogic_vector(63 downto 0); begin - word := to_integer(unsigned(addr(OFFSET_BITS-1 downto 2))); - return data((word+1)*32-1 downto word*32); + -- Is there no simpler way 
in VHDL to generate that 3 bits adder ? + row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS); + row_idx := std_ulogic_vector(unsigned(row_idx) + 1); + result := addr; + result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx; + return result; end; - function get_tag(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is + -- Read the instruction word for the given address in the current cache row + function read_insn_word(addr: std_ulogic_vector(63 downto 0); + data: cache_row_t) return std_ulogic_vector is + variable word: integer range 0 to INSN_PER_ROW-1; begin - return addr(63 downto OFFSET_BITS+INDEX_BITS); + word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2))); + return data(31+word*32 downto word*32); end; + + -- Get the tag value from the address + function get_tag(addr: std_ulogic_vector(63 downto 0)) return cache_tag_t is + begin + return addr(63 downto 64-TAG_BITS); + end; + + -- Read a tag from a tag memory row + function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is + begin + return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS); + end; + + -- Write a tag to tag memory row + procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t; + tag: cache_tag_t) is + begin + tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; + end; + begin - assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE; - assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE; - icache_read : process(all) + assert LINE_SIZE mod ROW_SIZE = 0; + assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE; + assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE; + assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2" severity FAILURE; + assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2" severity FAILURE; + assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS) + report "geometry bits don't add up" severity FAILURE; + assert 
(LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS) + report "geometry bits don't add up" severity FAILURE; + assert (64 = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) + report "geometry bits don't add up" severity FAILURE; + assert (64 = TAG_BITS + ROW_BITS + ROW_OFF_BITS) + report "geometry bits don't add up" severity FAILURE; + + debug: process + begin + report "ROW_SIZE = " & natural'image(ROW_SIZE); + report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE); + report "BRAM_ROWS = " & natural'image(BRAM_ROWS); + report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW); + report "INSN_BITS = " & natural'image(INSN_BITS); + report "ROW_BITS = " & natural'image(ROW_BITS); + report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS); + report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS); + report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS); + report "INDEX_BITS = " & natural'image(INDEX_BITS); + report "TAG_BITS = " & natural'image(TAG_BITS); + report "WAY_BITS = " & natural'image(WAY_BITS); + wait; + end process; + + -- Generate a cache RAM for each way + rams: for i in 0 to NUM_WAYS-1 generate + signal do_write : std_ulogic; + signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0); + signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0); + signal dout : cache_row_t; begin - read_index <= get_index(i_in.addr); - read_tag <= get_tag(i_in.addr); - read_miss <= false; + way: entity work.cache_ram + generic map ( + ROW_BITS => ROW_BITS, + WIDTH => wishbone_data_bits + ) + port map ( + clk => clk, + rd_en => '1', -- fixme + rd_addr => rd_addr, + rd_data => dout, + wr_en => do_write, + wr_addr => wr_addr, + wr_data => wishbone_in.dat + ); + process(all) + begin + do_write <= '0'; + if wishbone_in.ack = '1' and r.store_way = i then + do_write <= '1'; + end if; + cache_out(i) <= dout; + rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); + wr_addr <= std_ulogic_vector(to_unsigned(get_row(r.wb.adr), ROW_BITS)); + end process; + end generate; + + -- Generate PLRUs + 
maybe_plrus: if NUM_WAYS > 1 generate + begin + plrus: for i in 0 to NUM_LINES-1 generate + -- PLRU interface + signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); + signal plru_acc_en : std_ulogic; + signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); + + begin + plru : entity work.plru + generic map ( + BITS => WAY_BITS + ) + port map ( + clk => clk, + rst => rst, + acc => plru_acc, + acc_en => plru_acc_en, + lru => plru_out + ); - i_out.ack <= '0'; - i_out.insn <= get_word(i_in.addr, cachelines(read_index)); + process(req_index, req_is_hit, req_hit_way, req_is_hit, plru_out) + begin + -- PLRU interface + if req_is_hit = '1' and req_index = i then + plru_acc_en <= req_is_hit; + else + plru_acc_en <= '0'; + end if; + plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS)); + plru_victim(i) <= plru_out; + end process; + end generate; + end generate; - if i_in.req = '1' then - if (tags_valid(read_index) = '1') and (tags(read_index) = read_tag) then - -- report hit asynchronously - i_out.ack <= '1'; - else - read_miss <= true; - end if; - end if; + -- Cache hit detection, output to fetch2 and other misc logic + icache_comb : process(all) + variable is_hit : std_ulogic; + variable hit_way : way_t; + begin + -- Extract line, row and tag from request + req_index <= get_index(i_in.nia); + req_row <= get_row(i_in.nia); + req_tag <= get_tag(i_in.nia); + + -- Test if pending request is a hit on any way + hit_way := 0; + is_hit := '0'; + for i in way_t loop + if read_tag(i, cache_tags(req_index)) = req_tag and + cache_valids(req_index)(i) = '1' then + hit_way := i; + is_hit := '1'; + end if; + end loop; + + -- Generate the "hit" and "miss" signals for the synchronous blocks + req_is_hit <= i_in.req and is_hit and not flush_in; + req_is_miss <= i_in.req and not is_hit and not flush_in; + req_hit_way <= hit_way; + + -- Output instruction from current cache row + -- + -- Note: This is a mild violation of our design principle of having pipeline + -- stages 
output from a clean latch. In this case we output the result + -- of a mux. The alternative would be output an entire row which + -- I prefer not to do just yet as it would force fetch2 to know about + -- some of the cache geometry information. + -- + i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way)); + i_out.valid <= r.hit_valid; + i_out.nia <= r.hit_nia; + i_out.stop_mark <= r.hit_smark; + + -- Stall fetch1 if we have a miss + stall_out <= not is_hit; + + -- Wishbone requests output (from the cache miss reload machine) + wishbone_out <= r.wb; end process; - wishbone_out <= r.w; + -- Cache hit synchronous machine + icache_hit : process(clk) + begin + if rising_edge(clk) then + -- On a hit, latch the request for the next cycle, when the BRAM data + -- will be available on the cache_out output of the corresponding way + -- + if req_is_hit = '1' then + r.hit_way <= req_hit_way; + r.hit_nia <= i_in.nia; + r.hit_smark <= i_in.stop_mark; + r.hit_valid <= '1'; - icache_write : process(clk) + report "cache hit nia:" & to_hstring(i_in.nia) & + " SM:" & std_ulogic'image(i_in.stop_mark) & + " idx:" & integer'image(req_index) & + " tag:" & to_hstring(req_tag) & + " way: " & integer'image(req_hit_way); + else + r.hit_valid <= '0'; + + -- Send stop marks down regardless of validity + r.hit_smark <= i_in.stop_mark; + end if; + end if; + end process; + + -- Cache miss/reload synchronous machine + icache_miss : process(clk) + variable way : integer range 0 to NUM_WAYS-1; + variable tagset : cache_tags_set_t; begin if rising_edge(clk) then + -- On reset, clear all valid bits to force misses if rst = '1' then - tags_valid <= (others => '0'); + for i in index_t loop + cache_valids(i) <= (others => '0'); + end loop; r.state <= IDLE; - r.w.cyc <= '0'; - r.w.stb <= '0'; - end if; - - r.w.dat <= (others => '0'); - r.w.sel <= "11111111"; - r.w.we <= '0'; - - case r.state is - when IDLE => - if read_miss = true then - r.state <= WAIT_ACK; - r.store_word <= 0; - r.store_index <= 
read_index; - - tags(read_index) <= read_tag; - tags_valid(read_index) <= '0'; - - r.w.adr <= i_in.addr(63 downto OFFSET_BITS) & (OFFSET_BITS-1 downto 0 => '0'); - r.w.cyc <= '1'; - r.w.stb <= '1'; - end if; - when WAIT_ACK => - if wishbone_in.ack = '1' then - cachelines(r.store_index)((r.store_word+1)*64-1 downto ((r.store_word)*64)) <= wishbone_in.dat; - r.store_word <= r.store_word + 1; - - if r.store_word = (LINE_SIZE_DW-1) then - r.state <= IDLE; - tags_valid(r.store_index) <= '1'; - r.w.cyc <= '0'; - r.w.stb <= '0'; - else - r.w.adr(OFFSET_BITS-1 downto 3) <= std_ulogic_vector(to_unsigned(r.store_word+1, OFFSET_BITS-3)); - end if; - end if; - end case; - end if; + r.wb.cyc <= '0'; + r.wb.stb <= '0'; + + -- We only ever do reads on wishbone + r.wb.dat <= (others => '0'); + r.wb.sel <= "11111111"; + r.wb.we <= '0'; + else + -- Main state machine + case r.state is + when IDLE => + -- We need to read a cache line + if req_is_miss = '1' then + way := to_integer(unsigned(plru_victim(req_index))); + + report "cache miss nia:" & to_hstring(i_in.nia) & + " SM:" & std_ulogic'image(i_in.stop_mark) & + " idx:" & integer'image(req_index) & + " way:" & integer'image(way) & + " tag:" & to_hstring(req_tag); + + -- Force misses on that way while reloading that line + cache_valids(req_index)(way) <= '0'; + + -- Store new tag in selected way + for i in 0 to NUM_WAYS-1 loop + if i = way then + tagset := cache_tags(req_index); + write_tag(i, tagset, req_tag); + cache_tags(req_index) <= tagset; + end if; + end loop; + + -- Keep track of our index and way for subsequent stores + r.store_index <= req_index; + r.store_way <= way; + + -- Prep for first wishbone read. We calculate the address of + -- the start of the cache line + -- + r.wb.adr <= i_in.nia(63 downto LINE_OFF_BITS) & + (LINE_OFF_BITS-1 downto 0 => '0'); + r.wb.cyc <= '1'; + r.wb.stb <= '1'; + + r.state <= WAIT_ACK; + end if; + when WAIT_ACK => + if wishbone_in.ack = '1' then + -- That was the last word ? 
We are done + if is_last_row(r.wb.adr) then + cache_valids(r.store_index)(r.store_way) <= '1'; -- way latched in r when the miss was taken + r.wb.cyc <= '0'; + r.wb.stb <= '0'; + r.state <= IDLE; + else + -- Otherwise, calculate the next row address + r.wb.adr <= next_row_addr(r.wb.adr); + end if; + end if; + end case; + end if; + end if; end process; end; diff --git a/icache_tb.vhdl b/icache_tb.vhdl index 4955177..7aeb69c 100644 --- a/icache_tb.vhdl +++ b/icache_tb.vhdl @@ -12,7 +12,7 @@ architecture behave of icache_tb is signal clk : std_ulogic; signal rst : std_ulogic; - signal i_out : Fetch2ToIcacheType; + signal i_out : Fetch1ToIcacheType; signal i_in : IcacheToFetch2Type; signal wb_bram_in : wishbone_master_out; @@ -22,7 +22,7 @@ architecture behave of icache_tb is begin icache0: entity work.icache generic map( - LINE_SIZE_DW => 8, + LINE_SIZE => 64, NUM_LINES => 4 ) port map( @@ -30,6 +30,7 @@ begin rst => rst, i_in => i_out, i_out => i_in, + flush_in => '0', wishbone_out => wb_bram_in, wishbone_in => wb_bram_out ); @@ -66,16 +67,16 @@ begin stim: process begin i_out.req <= '0'; - i_out.addr <= (others => '0'); + i_out.nia <= (others => '0'); wait for 4*clk_period; i_out.req <= '1'; - i_out.addr <= x"0000000000000004"; + i_out.nia <= x"0000000000000004"; wait for 30*clk_period; - assert i_in.ack = '1'; + assert i_in.valid = '1'; assert i_in.insn = x"00000001"; i_out.req <= '0'; @@ -84,31 +85,31 @@ begin -- hit i_out.req <= '1'; - i_out.addr <= x"0000000000000008"; - wait for clk_period/2; - assert i_in.ack = '1'; + i_out.nia <= x"0000000000000008"; + wait for clk_period; + assert i_in.valid = '1'; assert i_in.insn = x"00000002"; - wait for clk_period/2; + wait for clk_period; -- another miss i_out.req <= '1'; - i_out.addr <= x"0000000000000040"; + i_out.nia <= x"0000000000000040"; wait for 30*clk_period; - assert i_in.ack = '1'; + assert i_in.valid = '1'; assert i_in.insn = x"00000010"; -- test something that aliases i_out.req <= '1'; - i_out.addr <= x"0000000000000100"; - wait for clk_period/2; - assert
i_in.ack = '0';
-        wait for clk_period/2;
+        i_out.nia <= x"0000000000000100";
+        wait for clk_period;
+        assert i_in.valid = '0';
+        wait for clk_period;
 
         wait for 30*clk_period;
 
-        assert i_in.ack = '1';
+        assert i_in.valid = '1';
         assert i_in.insn = x"00000040";
 
         i_out.req <= '0';
diff --git a/microwatt.core b/microwatt.core
index 6143f50..50e9957 100644
--- a/microwatt.core
+++ b/microwatt.core
@@ -29,6 +29,8 @@ filesets:
       - insn_helpers.vhdl
       - core.vhdl
       - icache.vhdl
+      - plru.vhdl
+      - cache_ram.vhdl
       - core_debug.vhdl
     file_type : vhdlSource-2008
 
diff --git a/plru.vhdl b/plru.vhdl
new file mode 100644
index 0000000..6907c2b
--- /dev/null
+++ b/plru.vhdl
@@ -0,0 +1,77 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+use ieee.math_real.all;
+
+entity plru is
+    generic (
+        BITS : positive := 2  -- width of acc and lru; also the number of tree levels walked
+        )
+        ;
+    port (
+        clk    : in std_ulogic;
+        rst    : in std_ulogic;  -- synchronous: clears every tree bit to '0'
+
+        acc    : in std_ulogic_vector(BITS-1 downto 0);  -- selects the tree path that is updated
+        acc_en : in std_ulogic;  -- when '1' on a rising clk edge (and rst = '0'), the tree is updated from acc
+        lru    : out std_ulogic_vector(BITS-1 downto 0)  -- driven combinationally from the tree bits
+        );
+end entity plru;
+
+architecture rtl of plru is
+    constant count : positive := 2 ** BITS - 1;
+    subtype node_t is integer range 0 to count;  -- NOTE(review): the tree walks appear to reach only nodes 0 .. count-1, leaving the last element unused - confirm
+    type tree_t is array(node_t) of std_ulogic;  -- one bit per tree node
+
+    signal tree: tree_t;
+begin
+
+    -- XXX Check if we can turn that into a little ROM instead that
+    -- takes the tree bit vector and returns the LRU. See if it's better
+    -- in terms of FPGA resource usage...
+    get_lru: process(all)
+        variable node : node_t;  -- index into tree; the children of node n are 2n+1 and 2n+2
+    begin
+        node := 0;
+        for i in 0 to BITS-1 loop
+            -- Walk down from the root; the bit of each visited node becomes one lru bit, MSB first.
+            lru(BITS-1-i) <= tree(node);
+            if i /= BITS-1 then
+                -- Choose the child from the bit of the node just visited, read before node is advanced.
+                if tree(node) = '1' then
+                    node := node * 2 + 2;
+                else
+                    node := node * 2 + 1;
+                end if;
+            end if;
+        end loop;
+    end process;
+
+    update_lru: process(clk)
+        variable node : node_t;     -- same node indexing as in get_lru
+        variable abit : std_ulogic; -- acc bit consumed at the current tree level
+    begin
+        if rising_edge(clk) then
+            if rst = '1' then
+                tree <= (others => '0');
+            elsif acc_en = '1' then
+                node := 0;
+                for i in 0 to BITS-1 loop
+                    -- Every node on the path selected by acc is set to the inverse of the acc bit at its level.
+                    abit := acc(BITS-1-i);
+                    tree(node) <= not abit;
+                    if i /= BITS-1 then
+                        node := node * 2;
+                        if abit = '1' then
+                            node := node + 2;
+                        else
+                            node := node + 1;
+                        end if;
+                    end if;
+                end loop;
+            end if;
+        end if;
+    end process;
+end;
+
+
diff --git a/plru_tb.vhdl b/plru_tb.vhdl
new file mode 100644
index 0000000..18512e4
--- /dev/null
+++ b/plru_tb.vhdl
@@ -0,0 +1,110 @@
+library ieee;
+use ieee.std_logic_1164.all;
+
+library work;
+
+-- Testbench for the plru entity (BITS => 3). It drives a fixed sequence of
+-- accesses and reports the lru output after each one; the reported values
+-- are not checked against expected results.
+entity plru_tb is
+end plru_tb;
+
+architecture behave of plru_tb is
+    signal clk : std_ulogic;
+    signal rst : std_ulogic;
+
+    constant clk_period : time := 10 ns;
+
+    signal acc_en : std_ulogic;
+    signal acc : std_ulogic_vector(2 downto 0);
+    signal lru : std_ulogic_vector(2 downto 0);
+
+begin
+    plru0: entity work.plru
+        generic map(
+            BITS => 3
+            )
+        port map(
+            clk => clk,
+            rst => rst,
+
+            acc => acc,
+            acc_en => acc_en,
+            lru => lru
+            );
+
+    clk_process: process  -- free-running clock with period clk_period
+    begin
+        clk <= '0';
+        wait for clk_period/2;
+        clk <= '1';
+        wait for clk_period/2;
+    end process;
+
+    rst_process: process  -- holds rst high for two clock periods, then releases it for good
+    begin
+        rst <= '1';
+        wait for 2*clk_period;
+        rst <= '0';
+        wait;
+    end process;
+
+    stim: process
+    begin
+        wait for 4*clk_period;  -- the first access is driven two clock periods after rst is released
+
+        report "accessing 1:";
+        acc <= "001";
+        acc_en <= '1';  -- never deasserted afterwards, so each following clock period is one access
+        wait for clk_period;
+        report "lru:" & to_hstring(lru);  -- printed only; no expected value is asserted
+
+        report "accessing 2:";
+        acc <= "010";
+        wait for clk_period;
+        report "lru:" & to_hstring(lru);
+
+        report "accessing 7:";
+        acc <= "111";
+        wait for clk_period;
+        report "lru:" & to_hstring(lru);
+
+        report "accessing 4:";
+        acc <= "100";
+        wait for clk_period;
+        report "lru:" & to_hstring(lru);
+
+        report "accessing 3:";
+        acc <= "011";
+        wait for clk_period;
+        report "lru:" & to_hstring(lru);
+
+        report "accessing 5:";
+        acc <= "101";
+        wait for clk_period;
+        report "lru:" & to_hstring(lru);
+
+        report "accessing 3:";
+        acc <= "011";
+        wait for clk_period;
+        report "lru:" & to_hstring(lru);
+
+        report "accessing 5:";
+        acc <= "101";
+        wait for clk_period;
+        report "lru:" & to_hstring(lru);
+
+        report "accessing 6:";
+        acc <= "110";
+        wait for clk_period;
+        report "lru:" & to_hstring(lru);
+
+        report "accessing 0:";
+        acc <= "000";
+        wait for clk_period;
+        report "lru:" & to_hstring(lru);
+
+        assert false report "end of test" severity failure;  -- unconditional failure-severity assert; presumably used to stop the simulator - confirm
+        wait;
+    end process;
+end;