From 963c22595571249131012acbe5a488f6b414552a Mon Sep 17 00:00:00 2001
From: Paul Mackerras
Date: Tue, 1 Aug 2023 17:47:22 +1000
Subject: [PATCH] icache: Read icache tag RAM synchronously

This uses the next_nia provided to us by fetch1 to enable the icache
tag RAM to be read synchronously (using a clock edge), which should
enable block RAMs to be used on FPGAs rather than LUT RAM or
flip-flops. We define a separate RAM per way to avoid any problems
with the tools trying to infer byte write enables for writing to a
single way.

Since next_nia can move on, we only get one shot at reading the cache
tag RAM entry for the current access. If it is a miss, then the state
machine will read the cache line from RAM, and we can consider the
access to be a hit once the state machine has brought in the
doubleword we need. The cache hit/miss check has been modified to
check r.store_tag rather than the tag read from the tag RAM for this
case.

However, it is also possible that stall_in will be asserted for the
whole time until the cache line refill is completed. To handle this
case, we remember (in r.stalled_hit) that we detected a hit while
stalled, and use that hit once stall_in is deasserted. This avoids
doing an unnecessary second reload of the same cache line. The
r.stalled_hit flag gets cleared in CLR_TAG state since that is when
cache tags can be overwritten, meaning that a previously detected hit
might no longer be valid.

There is also the case where the tag read from the tag RAM is the one
we are looking for, and is at the same index as the line that is
starting to be reloaded by the state machine. If the icache gets
stalled for long enough that the line reload finishes, it would then
be possible for the access to be detected as a hit even though the
cache line has been overwritten. To counter this, we detect the case
where the cache tag RAM entry being read is the same as the entry
being written, and set a 'tag_overwrite' flag bit to indicate that one
of the tags in cache_tags_set is no longer valid.

For snooping writes to memory, we have a second read port on the cache
tag RAM. These tags are also read synchronously, so the logic for
clearing cache line valid bits on a snoop has been adjusted (the tag
comparisons and valid bit clearing now happen in the same cycle).

This also simplifies the expression for 'insn' by removing a
dependency on r.hit_valid, fixes the instruction value sent to the
log, and deasserts stall_out when flush_in is true.

Signed-off-by: Paul Mackerras
---
 icache.vhdl | 189 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 114 insertions(+), 75 deletions(-)
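For illustration, the per-way synchronous tag RAM pattern relied on
here looks roughly like the stand-alone sketch below. This is not code
from the patch: the entity, generic and port names (tag_ram_way,
TAG_BITS, INDEX_BITS, rd_index, wr_en and so on) are invented for the
example, and the snoop read port, tag_overwrite detection and reset
handling that the real icache needs are omitted.

    library ieee;
    use ieee.std_logic_1164.all;
    use ieee.numeric_std.all;

    -- Illustrative sketch only; not part of this patch.
    entity tag_ram_way is
        generic (
            TAG_BITS   : natural := 20;
            INDEX_BITS : natural := 6
            );
        port (
            clk      : in  std_ulogic;
            -- read port, addressed with the *next* access (cf. next_nia)
            rd_index : in  unsigned(INDEX_BITS - 1 downto 0);
            rd_tag   : out std_ulogic_vector(TAG_BITS - 1 downto 0);
            -- write port, used when a line reload allocates this way
            wr_en    : in  std_ulogic;
            wr_index : in  unsigned(INDEX_BITS - 1 downto 0);
            wr_tag   : in  std_ulogic_vector(TAG_BITS - 1 downto 0)
            );
    end entity tag_ram_way;

    architecture rtl of tag_ram_way is
        type tag_ram_t is array(0 to 2 ** INDEX_BITS - 1) of
            std_ulogic_vector(TAG_BITS - 1 downto 0);
        signal tags : tag_ram_t;
    begin
        process(clk)
        begin
            if rising_edge(clk) then
                -- Synchronous read: the tag is available one cycle after
                -- the index is presented, which is why the index has to
                -- come from the next fetch address rather than the
                -- current one.
                rd_tag <= tags(to_integer(rd_index));
                if wr_en = '1' then
                    tags(to_integer(wr_index)) <= wr_tag;
                end if;
            end if;
        end process;
    end architecture rtl;

Because the read is registered, synthesis tools can map each such
per-way RAM onto a block RAM, and keeping one RAM per way avoids any
need for the tools to infer byte write enables within a single wide
tag RAM.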
diff --git a/icache.vhdl b/icache.vhdl
index 15ed7bd..4bd4491 100644
--- a/icache.vhdl
+++ b/icache.vhdl
@@ -139,28 +139,24 @@ architecture rtl of icache is
     -- The cache data BRAM organized as described above for each way
     subtype cache_row_t is std_ulogic_vector(ROW_WIDTH-1 downto 0);
 
-    -- The cache tags LUTRAM has a row per set. Vivado is a pain and will
-    -- not handle a clean (commented) definition of the cache tags as a 3d
-    -- memory. For now, work around it by putting all the tags
+    -- We define a cache tag RAM per way, accessed synchronously
     subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
---    type cache_tags_set_t is array(way_t) of cache_tag_t;
---    type cache_tags_array_t is array(index_t) of cache_tags_set_t;
-    constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
-    subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
-    type cache_tags_array_t is array(index_t) of cache_tags_set_t;
+    type cache_tags_set_t is array(way_t) of cache_tag_t;
+    type cache_tags_array_t is array(index_t) of cache_tag_t;
+
+    -- Set of cache tags read on the last clock edge
+    signal cache_tags_set : cache_tags_set_t;
+    -- Set of cache tags for snooping writes to memory
+    signal snoop_tags_set : cache_tags_set_t;
+    -- Flags indicating write-hit-read on the cache tags
+    signal tag_overwrite : std_ulogic_vector(NUM_WAYS - 1 downto 0);
 
     -- The cache valid bits
     subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
     type cache_valids_t is array(index_t) of cache_way_valids_t;
     type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
-
-    -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
-    signal cache_tags   : cache_tags_array_t;
     signal cache_valids : cache_valids_t;
 
-    attribute ram_style : string;
-    attribute ram_style of cache_tags : signal is "distributed";
-
     -- L1 ITLB.
     constant TLB_BITS : natural := log2(TLB_SIZE);
     constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
@@ -216,6 +212,9 @@ architecture rtl of icache is
         end_row_ix : row_in_line_t;
         rows_valid : row_per_line_valid_t;
 
+        stalled_hit : std_ulogic;       -- remembers hit while stalled
+        stalled_way : way_sig_t;
+
         -- TLB miss state
         fetch_failed : std_ulogic;
     end record;
@@ -248,9 +247,11 @@ architecture rtl of icache is
     signal plru_victim : way_sig_t;
 
     -- Memory write snoop signals
-    signal snoop_valid : std_ulogic;
-    signal snoop_index : index_sig_t;
-    signal snoop_hits  : cache_way_valids_t;
+    signal snoop_valid  : std_ulogic;
+    signal snoop_index  : index_sig_t;
+    signal snoop_tag    : cache_tag_t;
+    signal snoop_index2 : index_sig_t;
+    signal snoop_hits   : cache_way_valids_t;
 
     signal log_insn : std_ulogic_vector(35 downto 0);
 
@@ -329,19 +330,6 @@ architecture rtl of icache is
         return endian & addr(addr'left downto SET_SIZE_BITS);
     end;
 
-    -- Read a tag from a tag memory row
-    function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is
-    begin
-        return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
-    end;
-
-    -- Write a tag to tag memory row
-    procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t;
-                        tag: cache_tag_t) is
-    begin
-        tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
-    end;
-
     -- Simple hash for direct-mapped TLB index
     function hash_ea(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
         variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
@@ -423,7 +411,9 @@ begin
         signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
         signal dout    : cache_row_t;
         signal wr_sel  : std_ulogic_vector(0 downto 0);
+        signal ic_tags : cache_tags_array_t;
     begin
+        -- Cache data RAMs, one per way
        way: entity work.cache_ram
            generic map (
                ROW_BITS => ROW_BITS,
@@ -451,6 +441,47 @@ begin
             wr_addr <= std_ulogic_vector(r.store_row);
             wr_sel(0) <= do_write;
         end process;
+
+        -- Cache tag RAMs, one per way, are read and written synchronously.
+        -- They are instantiated like this instead of trying to describe them as
+        -- a single array in order to avoid problems with writing a single way.
+        process(clk)
+            variable replace_way : way_sig_t;
+            variable snoop_addr : real_addr_t;
+        begin
+            replace_way := to_unsigned(0, WAY_BITS);
+            if NUM_WAYS > 1 then
+                -- Get victim way from plru
+                replace_way := plru_victim;
+            end if;
+            if rising_edge(clk) then
+                -- Read tags using NIA for next cycle
+                if flush_in = '1' or i_in.req = '0' or (stall_in = '0' and stall_out = '0') then
+                    cache_tags_set(i) <= ic_tags(to_integer(get_index(i_in.next_nia)));
+                    -- Check for simultaneous write to the same location
+                    tag_overwrite(i) <= '0';
+                    if r.state = CLR_TAG and r.store_index = get_index(i_in.next_nia) and
+                        to_unsigned(i, WAY_BITS) = replace_way then
+                        tag_overwrite(i) <= '1';
+                    end if;
+                end if;
+
+                -- Second read port for snooping writes to memory
+                if (wb_snoop_in.cyc and wb_snoop_in.stb and wb_snoop_in.we) = '1' then
+                    snoop_addr := addr_to_real(wb_to_addr(wb_snoop_in.adr));
+                    snoop_tags_set(i) <= ic_tags(to_integer(get_index(snoop_addr)));
+                end if;
+
+                -- Write one tag when in CLR_TAG state
+                if r.state = CLR_TAG and to_unsigned(i, WAY_BITS) = replace_way then
+                    ic_tags(to_integer(r.store_index)) <= r.store_tag;
+                end if;
+
+                if rst = '1' then
+                    tag_overwrite(i) <= '0';
+                end if;
+            end if;
+        end process;
     end generate;
 
     -- Generate PLRUs
@@ -616,17 +647,24 @@ begin
         end if;
         for i in way_t loop
             if i_in.req = '1' and
-                (cache_valids(to_integer(req_index))(i) = '1' or
-                 (r.state = WAIT_ACK and
-                  req_index = r.store_index and
-                  to_unsigned(i, WAY_BITS) = r.store_way and
-                  r.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) = '1')) then
-                if read_tag(i, cache_tags(to_integer(req_index))) = req_tag then
-                    hit_way := to_unsigned(i, WAY_BITS);
-                    is_hit := '1';
-                end if;
+                cache_valids(to_integer(req_index))(i) = '1' and
+                tag_overwrite(i) = '0' and
+                cache_tags_set(i) = req_tag then
+                hit_way := to_unsigned(i, WAY_BITS);
+                is_hit := '1';
             end if;
         end loop;
+        if r.state = WAIT_ACK and r.store_valid = '1' and
+            req_index = r.store_index and
+            req_tag = r.store_tag and
+            r.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) = '1' then
+            is_hit := '1';
+            hit_way := r.store_way;
+        end if;
+        if r.stalled_hit = '1' then
+            is_hit := '1';
+            hit_way := r.stalled_way;
+        end if;
 
         -- Generate the "hit" and "miss" signals for the synchronous blocks
         if i_in.req = '1' and access_ok = '1' and flush_in = '0' and rst = '0' then
@@ -646,20 +684,22 @@ begin
         -- I prefer not to do just yet as it would force fetch2 to know about
         -- some of the cache geometry information.
         --
-        insn := (others => '0');
         icode := INSN_illegal;
-        if r.hit_valid = '1' then
-            assert not is_X(r.hit_way) severity failure;
+        if is_X(r.hit_way) then
+            insn := (others => 'X');
+        else
             insn := read_insn_word(r.hit_nia, cache_out(to_integer(r.hit_way)));
-            -- Currently we use only the top bit for indicating illegal
-            -- instructions because we know that insn_codes fit into 9 bits.
-            if is_X(insn) then
-                insn := (others => '0');
-            elsif insn(ICWORDLEN - 1) = '0' then
-                icode := insn_code'val(to_integer(unsigned(insn(ICWORDLEN-1 downto INSN_IMAGE_BITS))));
-                insn(31 downto 26) := recode_primary_opcode(icode);
-            end if;
-        end if;
+        end if;
+        assert not (r.hit_valid = '1' and is_X(r.hit_way)) severity failure;
+        -- Currently we use only the top bit for indicating illegal
+        -- instructions because we know that insn_codes fit into 9 bits.
+        if is_X(insn) then
+            insn := (others => '0');
+        elsif insn(ICWORDLEN - 1) = '0' then
+            icode := insn_code'val(to_integer(unsigned(insn(ICWORDLEN-1 downto INSN_IMAGE_BITS))));
+            insn(31 downto 26) := recode_primary_opcode(icode);
+        end if;
+
         i_out.insn <= insn(31 downto 0);
         i_out.icode <= icode;
         log_insn <= insn;
@@ -672,7 +712,7 @@ begin
         i_out.next_pred_ntaken <= r.pred_ntaken;
 
         -- Stall fetch1 if we have a miss on cache or TLB or a protection fault
-        stall_out <= i_in.req and not (is_hit and access_ok);
+        stall_out <= i_in.req and not (is_hit and access_ok) and not flush_in;
 
         -- Wishbone requests output (from the cache miss reload machine)
         wishbone_out <= r.wb;
@@ -684,9 +724,17 @@ begin
         if rising_edge(clk) then
            -- keep outputs to fetch2 unchanged on a stall
            -- except that flush or reset sets valid to 0
-            if stall_in = '1' then
-                if rst = '1' or flush_in = '1' then
-                    r.hit_valid <= '0';
+            if rst = '1' or flush_in = '1' then
+                r.hit_valid <= '0';
+                r.stalled_hit <= '0';
+                r.stalled_way <= to_unsigned(0, WAY_BITS);
+            elsif stall_in = '1' then
+                if r.state = CLR_TAG then
+                    r.stalled_hit <= '0';
+                elsif req_is_hit = '1' then
+                    -- if we have a hit while stalled, remember it
+                    r.stalled_hit <= '1';
+                    r.stalled_way <= req_hit_way;
                 end if;
             else
                 -- On a hit, latch the request for the next cycle, when the BRAM data
@@ -706,6 +754,7 @@ begin
                            " way:" & to_hstring(req_hit_way) &
                            " RA:" & to_hstring(real_addr);
                 end if;
+                r.stalled_hit <= '0';
             end if;
             if stall_in = '0' then
                 -- Send stop marks and NIA down regardless of validity
@@ -726,7 +775,6 @@ begin
         variable tagset : cache_tags_set_t;
         variable tag : cache_tag_t;
         variable snoop_addr : real_addr_t;
-        variable snoop_tag : cache_tag_t;
         variable snoop_cache_tags : cache_tags_set_t;
         variable replace_way : way_sig_t;
     begin
@@ -759,15 +807,14 @@ begin
             snoop_valid <= wb_snoop_in.cyc and wb_snoop_in.stb and wb_snoop_in.we;
             snoop_addr := addr_to_real(wb_to_addr(wb_snoop_in.adr));
             snoop_index <= get_index(snoop_addr);
-            snoop_tag := get_tag(snoop_addr, '0');
+            snoop_tag <= get_tag(snoop_addr, '0');
             snoop_hits <= (others => '0');
+
+            -- On the next cycle, match up tags with the snooped address
+            -- to see if any ways need to be invalidated
             if snoop_valid = '1' then
-                if is_X(snoop_addr) then
-                    report "metavalue in snoop_addr" severity FAILURE;
-                end if;
-                snoop_cache_tags := cache_tags(to_integer(get_index(snoop_addr)));
                 for i in way_t loop
-                    tag := read_tag(i, snoop_cache_tags);
+                    tag := snoop_tags_set(i);
                     -- Ignore endian bit in comparison
                     tag(TAG_BITS - 1) := '0';
                     if tag = snoop_tag then
@@ -775,6 +822,7 @@ begin
                     end if;
                 end loop;
             end if;
+            snoop_index2 <= snoop_index;
 
             -- Process cache invalidations
             if inval_in = '1' then
@@ -783,12 +831,12 @@ begin
                 end loop;
                 r.store_valid <= '0';
             else
-                -- Do invalidations from snooped stores to memory, one
-                -- cycle after the address appears on wb_snoop_in.
+                -- Do invalidations from snooped stores to memory,
+                -- two cycles after the address appears on wb_snoop_in.
                for i in way_t loop
                    if snoop_hits(i) = '1' then
-                        assert not is_X(snoop_index) severity failure;
-                        cache_valids(to_integer(snoop_index))(i) <= '0';
+                        assert not is_X(snoop_index2) severity failure;
+                        cache_valids(to_integer(snoop_index2))(i) <= '0';
                    end if;
                end loop;
            end if;
@@ -846,15 +894,6 @@ begin
                    assert not is_X(replace_way) severity failure;
                    cache_valids(to_integer(r.store_index))(to_integer(replace_way)) <= '0';
 
-                    -- Store new tag in selected way
-                    for i in 0 to NUM_WAYS-1 loop
-                        if to_unsigned(i, WAY_BITS) = replace_way then
-                            tagset := cache_tags(to_integer(r.store_index));
-                            write_tag(i, tagset, r.store_tag);
-                            cache_tags(to_integer(r.store_index)) <= tagset;
-                        end if;
-                    end loop;
-
                    r.state <= WAIT_ACK;
                end if;