fetch1: Reorganize fetch1 to provide an asynchronous early next NIA to icache

This adds a next_nia field to the Fetch1ToIcacheType record, which
indicates what will be in the nia field on the next non-stalled
cycle.  It is intended to be as fast as possible, being a selection
among three sources: the two redirect addresses (from writeback and
decode1) and an internal register (r_int.next_nia).  Reset addresses
and predicted branch targets come through this internal register.

The rearrangement here has the side effect that we can now use the BTC
on the first instruction after a taken branch, whereas previously the
BTC was only active starting with the second instruction after a taken
branch.  This provides a slight improvement in performance.

This also fixes a buglet in icache where it would assert its stall
output even when i_in.req was false; stall_out is now gated by
i_in.req.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/422/head
Paul Mackerras 10 months ago
parent 2dceb28830
commit e92d49375f

@ -238,6 +238,7 @@ package common is
predicted : std_ulogic; predicted : std_ulogic;
pred_ntaken : std_ulogic; pred_ntaken : std_ulogic;
nia: std_ulogic_vector(63 downto 0); nia: std_ulogic_vector(63 downto 0);
next_nia: std_ulogic_vector(63 downto 0);
end record; end record;


type IcacheToDecode1Type is record type IcacheToDecode1Type is record

@ -40,8 +40,7 @@ architecture behaviour of fetch1 is
type reg_internal_t is record type reg_internal_t is record
mode_32bit: std_ulogic; mode_32bit: std_ulogic;
rd_is_niap4: std_ulogic; rd_is_niap4: std_ulogic;
predicted_taken: std_ulogic; next_nia: std_ulogic_vector(63 downto 0);
predicted_nia: std_ulogic_vector(63 downto 0);
end record; end record;
signal r, r_next : Fetch1ToIcacheType; signal r, r_next : Fetch1ToIcacheType;
signal r_int, r_next_int : reg_internal_t; signal r_int, r_next_int : reg_internal_t;
@ -55,6 +54,7 @@ architecture behaviour of fetch1 is
constant BTC_WIDTH : integer := BTC_TAG_BITS + BTC_TARGET_BITS + 2; constant BTC_WIDTH : integer := BTC_TAG_BITS + BTC_TARGET_BITS + 2;
type btc_mem_type is array (0 to BTC_SIZE - 1) of std_ulogic_vector(BTC_WIDTH - 1 downto 0); type btc_mem_type is array (0 to BTC_SIZE - 1) of std_ulogic_vector(BTC_WIDTH - 1 downto 0);


signal btc_rd_addr : unsigned(BTC_ADDR_BITS - 1 downto 0);
signal btc_rd_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0) := (others => '0'); signal btc_rd_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0) := (others => '0');
signal btc_rd_valid : std_ulogic := '0'; signal btc_rd_valid : std_ulogic := '0';


@ -64,7 +64,7 @@ begin
begin begin
if rising_edge(clk) then if rising_edge(clk) then
log_nia <= r.nia(63) & r.nia(43 downto 2); log_nia <= r.nia(63) & r.nia(43 downto 2);
if r /= r_next then if r /= r_next and advance_nia = '1' then
report "fetch1 rst:" & std_ulogic'image(rst) & report "fetch1 rst:" & std_ulogic'image(rst) &
" IR:" & std_ulogic'image(r_next.virt_mode) & " IR:" & std_ulogic'image(r_next.virt_mode) &
" P:" & std_ulogic'image(r_next.priv_mode) & " P:" & std_ulogic'image(r_next.priv_mode) &
@ -73,25 +73,16 @@ begin
" R:" & std_ulogic'image(w_in.redirect) & std_ulogic'image(d_in.redirect) & " R:" & std_ulogic'image(w_in.redirect) & std_ulogic'image(d_in.redirect) &
" S:" & std_ulogic'image(stall_in) & " S:" & std_ulogic'image(stall_in) &
" T:" & std_ulogic'image(stop_in) & " T:" & std_ulogic'image(stop_in) &
" nia:" & to_hstring(r_next.nia); " nia:" & to_hstring(r_next.nia) &
" req:" & std_ulogic'image(r_next.req);
end if; end if;
if rst = '1' or w_in.redirect = '1' or d_in.redirect = '1' or stall_in = '0' then
r.virt_mode <= r_next.virt_mode;
r.priv_mode <= r_next.priv_mode;
r.big_endian <= r_next.big_endian;
r_int.mode_32bit <= r_next_int.mode_32bit;
end if;
if advance_nia = '1' then if advance_nia = '1' then
r.predicted <= r_next.predicted; r <= r_next;
r.pred_ntaken <= r_next.pred_ntaken; r_int <= r_next_int;
r.nia <= r_next.nia;
r_int.predicted_taken <= r_next_int.predicted_taken;
r_int.predicted_nia <= r_next_int.predicted_nia;
r_int.rd_is_niap4 <= r_next_int.rd_is_niap4;
end if; end if;
-- always send the up-to-date stop mark and req -- always send the up-to-date stop mark and req
r.stop_mark <= stop_in; r.stop_mark <= stop_in;
r.req <= not rst and not stop_in; r.req <= r_next.req;
end if; end if;
end process; end process;
log_out <= log_nia; log_out <= log_nia;
@ -119,15 +110,13 @@ begin
variable raddr : unsigned(BTC_ADDR_BITS - 1 downto 0); variable raddr : unsigned(BTC_ADDR_BITS - 1 downto 0);
begin begin
if rising_edge(clk) then if rising_edge(clk) then
raddr := unsigned(r.nia(BTC_ADDR_BITS + 1 downto 2)) +
to_unsigned(2, BTC_ADDR_BITS);
if advance_nia = '1' then if advance_nia = '1' then
if is_X(raddr) then if is_X(btc_rd_addr) then
btc_rd_data <= (others => 'X'); btc_rd_data <= (others => 'X');
btc_rd_valid <= 'X'; btc_rd_valid <= 'X';
else else
btc_rd_data <= btc_memory(to_integer(raddr)); btc_rd_data <= btc_memory(to_integer(btc_rd_addr));
btc_rd_valid <= btc_valids(to_integer(raddr)); btc_rd_valid <= btc_valids(to_integer(btc_rd_addr));
end if; end if;
end if; end if;
if btc_wr = '1' then if btc_wr = '1' then
@ -147,67 +136,93 @@ begin
comb : process(all) comb : process(all)
variable v : Fetch1ToIcacheType; variable v : Fetch1ToIcacheType;
variable v_int : reg_internal_t; variable v_int : reg_internal_t;
variable next_nia : std_ulogic_vector(63 downto 0);
variable m32 : std_ulogic;
begin begin
v := r; v := r;
v_int := r_int; v_int := r_int;
v.predicted := '0'; v.predicted := '0';
v.pred_ntaken := '0'; v.pred_ntaken := '0';
v_int.predicted_taken := '0'; v.req := not (rst or stop_in);
v_int.rd_is_niap4 := '0'; -- reduce metavalue warnings in sim
if is_X(rst) then
v.req := '0';
end if;

-- Combinatorial computation of the CIA for the next cycle.
-- Needs to be simple so the result can be used for RAM
-- and TLB access in the icache.
-- If we are stalled, this still advances, and the assumption
-- is that it will not be used.
m32 := r_int.mode_32bit;
if w_in.redirect = '1' then
next_nia := w_in.redirect_nia(63 downto 2) & "00";
m32 := w_in.mode_32bit;
v.virt_mode := w_in.virt_mode;
v.priv_mode := w_in.priv_mode;
v.big_endian := w_in.big_endian;
v_int.mode_32bit := w_in.mode_32bit;
elsif d_in.redirect = '1' then
next_nia := d_in.redirect_nia(63 downto 2) & "00";
else
next_nia := r_int.next_nia;
end if;
if m32 = '1' then
next_nia(63 downto 32) := (others => '0');
end if;
v.nia := next_nia;

v_int.next_nia := std_ulogic_vector(unsigned(next_nia) + 4);

-- Use v_int.next_nia as the BTC read address before it gets possibly
-- overridden with the reset address or the predicted branch target
-- address, in order to improve timing. If it gets overridden then
-- rd_is_niap4 gets cleared to indicate that the BTC data doesn't apply.
btc_rd_addr <= unsigned(v_int.next_nia(BTC_ADDR_BITS + 1 downto 2));
v_int.rd_is_niap4 := '1';


if rst = '1' then if rst /= '0' then
if alt_reset_in = '1' then if alt_reset_in = '1' then
v.nia := ALT_RESET_ADDRESS; v_int.next_nia := ALT_RESET_ADDRESS;
else else
v.nia := RESET_ADDRESS; v_int.next_nia := RESET_ADDRESS;
end if; end if;
v.virt_mode := '0'; v.virt_mode := '0';
v.priv_mode := '1'; v.priv_mode := '1';
v.big_endian := '0'; v.big_endian := '0';
v_int.mode_32bit := '0'; v_int.mode_32bit := '0';
v_int.predicted_nia := (others => '0'); v_int.rd_is_niap4 := '0';
elsif w_in.redirect = '1' then end if;
v.nia := w_in.redirect_nia(63 downto 2) & "00";
if w_in.mode_32bit = '1' then -- If there is a valid entry in the BTC which corresponds to the next instruction,
v.nia(63 downto 32) := (others => '0'); -- use that to predict the address of the instruction after that.
end if; if rst = '0' and w_in.redirect = '0' and d_in.redirect = '0' and
v.virt_mode := w_in.virt_mode; btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and
v.priv_mode := w_in.priv_mode;
v.big_endian := w_in.big_endian;
v_int.mode_32bit := w_in.mode_32bit;
elsif d_in.redirect = '1' then
v.nia := d_in.redirect_nia(63 downto 2) & "00";
if r_int.mode_32bit = '1' then
v.nia(63 downto 32) := (others => '0');
end if;
elsif r_int.predicted_taken = '1' then
v.nia := r_int.predicted_nia;
elsif r.req = '1' then
v_int.rd_is_niap4 := '1';
v.nia := std_ulogic_vector(unsigned(r.nia) + 4);
if r_int.mode_32bit = '1' then
v.nia(63 downto 32) := x"00000000";
end if;
if btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and
btc_rd_data(BTC_WIDTH - 2) = r.virt_mode and btc_rd_data(BTC_WIDTH - 2) = r.virt_mode and
btc_rd_data(BTC_WIDTH - 3 downto BTC_TARGET_BITS) btc_rd_data(BTC_WIDTH - 3 downto BTC_TARGET_BITS)
= v.nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then = r_int.next_nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then
v_int.predicted_taken := btc_rd_data(BTC_WIDTH - 1); v.predicted := btc_rd_data(BTC_WIDTH - 1);
v.predicted := btc_rd_data(BTC_WIDTH - 1); v.pred_ntaken := not btc_rd_data(BTC_WIDTH - 1);
v.pred_ntaken := not btc_rd_data(BTC_WIDTH - 1); if btc_rd_data(BTC_WIDTH - 1) = '1' then
v_int.next_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00";
v_int.rd_is_niap4 := '0';
end if; end if;
end if; end if;
v_int.predicted_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00";


-- If the last NIA value went down with a stop mark, it didn't get -- If the last NIA value went down with a stop mark, it didn't get
-- executed, and hence we shouldn't increment NIA. -- executed, and hence we shouldn't increment NIA.
advance_nia <= rst or w_in.redirect or d_in.redirect or (not r.stop_mark and not stall_in); advance_nia <= rst or w_in.redirect or d_in.redirect or (not r.stop_mark and not stall_in);
-- reduce metavalue warnings in sim
if is_X(rst) then
advance_nia <= '1';
end if;


r_next <= v; r_next <= v;
r_next_int <= v_int; r_next_int <= v_int;


-- Update outputs to the icache -- Update outputs to the icache
i_out <= r; i_out <= r;
i_out.next_nia <= next_nia;


end process; end process;



@ -636,7 +636,7 @@ begin
i_out.next_pred_ntaken <= r.pred_ntaken; i_out.next_pred_ntaken <= r.pred_ntaken;


-- Stall fetch1 if we have a miss on cache or TLB or a protection fault -- Stall fetch1 if we have a miss on cache or TLB or a protection fault
stall_out <= not (is_hit and access_ok); stall_out <= i_in.req and not (is_hit and access_ok);


-- Wishbone requests output (from the cache miss reload machine) -- Wishbone requests output (from the cache miss reload machine)
wishbone_out <= r.wb; wishbone_out <= r.wb;

Loading…
Cancel
Save