From 73b6004ac6ff8787ff05497a6bc5965f0ccea2d3 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 15 Aug 2023 11:30:53 +1000 Subject: [PATCH] icache: Use next real address to index icache Now that we are translating the fetch effective address to real one cycle earlier, we can use the real address to index the icache array. This has the benefit that the set size can be larger than a page, enabling us to configure the icache to be larger without having to increase its associativity. Previously the set size was limited to the page size to avoid aliasing problems. Thus for example a 32kB icache would need to be 8-way associative, resulting in large numbers of LUTs being used for tag comparisons in FPGA implementations, and poor timing. With this change, a 32kB icache can be 1 or 2-way associative, which means deeper and narrower tag and data RAMs and fewer tag comparators. Signed-off-by: Paul Mackerras --- common.vhdl | 1 + fetch1.vhdl | 1 + icache.vhdl | 35 ++++++++++++++++++----------------- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/common.vhdl b/common.vhdl index efcf7b3..eefa2fd 100644 --- a/common.vhdl +++ b/common.vhdl @@ -245,6 +245,7 @@ package common is nia: std_ulogic_vector(63 downto 0); next_nia: std_ulogic_vector(63 downto 0); rpn: std_ulogic_vector(REAL_ADDR_BITS - MIN_LG_PGSZ - 1 downto 0); + next_rpn: std_ulogic_vector(REAL_ADDR_BITS - MIN_LG_PGSZ - 1 downto 0); end record; type IcacheToDecode1Type is record diff --git a/fetch1.vhdl b/fetch1.vhdl index 98116f9..677fa27 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -438,6 +438,7 @@ begin -- Update outputs to the icache i_out <= r; i_out.next_nia <= next_nia; + i_out.next_rpn <= v.rpn; end process; diff --git a/icache.vhdl b/icache.vhdl index cc1b2b3..8dfbd86 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -158,6 +158,7 @@ architecture rtl of icache is -- Cache hit state (Latches for 1 cycle BRAM access) hit_way : way_sig_t; hit_nia : std_ulogic_vector(63 downto 0); + hit_ra : real_addr_t; hit_smark : std_ulogic; hit_valid : std_ulogic; big_endian: std_ulogic; @@ -218,7 +219,7 @@ architecture rtl of icache is signal log_insn : std_ulogic_vector(35 downto 0); -- Return the cache line index (tag index) for an address - function get_index(addr: std_ulogic_vector) return index_sig_t is + function get_index(addr: real_addr_t) return index_sig_t is begin return unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)); end; @@ -400,6 +401,7 @@ begin process(clk) variable replace_way : way_sig_t; variable snoop_addr : real_addr_t; + variable next_raddr : real_addr_t; begin replace_way := to_unsigned(0, WAY_BITS); if NUM_WAYS > 1 then @@ -409,10 +411,11 @@ begin if rising_edge(clk) then -- Read tags using NIA for next cycle if flush_in = '1' or i_in.req = '0' or (stall_in = '0' and stall_out = '0') then - cache_tags_set(i) <= ic_tags(to_integer(get_index(i_in.next_nia))); + next_raddr := i_in.next_rpn & i_in.next_nia(MIN_LG_PGSZ - 1 downto 0); + cache_tags_set(i) <= ic_tags(to_integer(get_index(next_raddr))); -- Check for simultaneous write to the same location tag_overwrite(i) <= '0'; - if r.state = CLR_TAG and r.store_index = get_index(i_in.next_nia) and + if r.state = CLR_TAG and r.store_index = get_index(next_raddr) and to_unsigned(i, WAY_BITS) = replace_way then tag_overwrite(i) <= '1'; end if; @@ -459,10 +462,10 @@ begin process(all) begin -- Read PLRU bits from array - if is_X(r.hit_nia) then + if is_X(r.hit_ra) then plru_cur <= (others => 'X'); else - plru_cur <= plru_ram(to_integer(get_index(r.hit_nia))); + plru_cur <= plru_ram(to_integer(get_index(r.hit_ra))); end if; -- PLRU interface @@ -475,35 +478,32 @@ begin begin if rising_edge(clk) then if r.hit_valid = '1' then - assert not is_X(r.hit_nia) severity failure; - plru_ram(to_integer(get_index(r.hit_nia))) <= plru_upd; + assert not is_X(r.hit_ra) severity failure; + plru_ram(to_integer(get_index(r.hit_ra))) <= plru_upd; end if; end if; end process; end generate; - -- TLB hit detection and real address generation - itlb_lookup : process(all) - begin - real_addr <= i_in.rpn & i_in.nia(MIN_LG_PGSZ - 1 downto 0); - end process; - -- Cache hit detection, output to fetch2 and other misc logic icache_comb : process(all) variable is_hit : std_ulogic; variable hit_way : way_sig_t; variable insn : std_ulogic_vector(ICWORDLEN - 1 downto 0); variable icode : insn_code; + variable ra : real_addr_t; begin -- Extract line, row and tag from request - req_index <= get_index(i_in.nia); - req_row <= get_row(i_in.nia); - req_tag <= get_tag(real_addr, i_in.big_endian); + ra := i_in.rpn & i_in.nia(MIN_LG_PGSZ - 1 downto 0); + real_addr <= ra; + req_index <= get_index(ra); + req_row <= get_row(ra); + req_tag <= get_tag(ra, i_in.big_endian); -- Calculate address of beginning of cache row, will be -- used for cache miss processing if needed -- - req_raddr <= real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) & + req_raddr <= ra(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) & (ROW_OFF_BITS-1 downto 0 => '0'); -- Test if pending request is a hit on any way @@ -627,6 +627,7 @@ begin -- Send stop marks and NIA down regardless of validity r.hit_smark <= i_in.stop_mark; r.hit_nia <= i_in.nia; + r.hit_ra <= real_addr; r.big_endian <= i_in.big_endian; r.predicted <= i_in.predicted; r.pred_ntaken <= i_in.pred_ntaken;