core: Remove fetch2 pipeline stage

The fetch2 stage existed primarily to provide a stash buffer for the
output of icache when a stall occurred.  However, we can get the same
effect -- of having the input to decode1 stay unchanged on a stall
cycle -- by using the read enable of the BRAMs in icache, and by
adding logic to keep the outputs unchanged on a clock cycle when
stall_in = 1.  This reduces branch and interrupt latency by one
cycle.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/208/head
Paul Mackerras 5 years ago
parent 49a4d9f67a
commit b5a7dbb78d

@ -42,7 +42,7 @@ all = core_tb icache_tb dcache_tb multiply_tb dmi_dtm_tb divider_tb \
all: $(all)

core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
fetch2.vhdl utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \
utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \
decode1.vhdl helpers.vhdl insn_helpers.vhdl gpr_hazard.vhdl \
cr_hazard.vhdl control.vhdl decode2.vhdl register_file.vhdl \
cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \

@ -96,7 +96,7 @@ package common is
nia: std_ulogic_vector(63 downto 0);
end record;

type IcacheToFetch2Type is record
type IcacheToDecode1Type is record
valid: std_ulogic;
stop_mark: std_ulogic;
fetch_failed: std_ulogic;
@ -104,16 +104,6 @@ package common is
insn: std_ulogic_vector(31 downto 0);
end record;

type Fetch2ToDecode1Type is record
valid: std_ulogic;
stop_mark : std_ulogic;
fetch_failed: std_ulogic;
nia: std_ulogic_vector(63 downto 0);
insn: std_ulogic_vector(31 downto 0);
end record;
constant Fetch2ToDecode1Init : Fetch2ToDecode1Type := (valid => '0', stop_mark => '0', fetch_failed => '0',
nia => (others => '0'), insn => (others => '0'));

type Decode1ToDecode2Type is record
valid: std_ulogic;
stop_mark : std_ulogic;

@ -41,12 +41,9 @@ entity core is
end core;

architecture behave of core is
-- fetch signals
signal fetch2_to_decode1: Fetch2ToDecode1Type;

-- icache signals
signal fetch1_to_icache : Fetch1ToIcacheType;
signal icache_to_fetch2 : IcacheToFetch2Type;
signal icache_to_decode1 : IcacheToDecode1Type;
signal mmu_to_icache : MmuToIcacheType;

-- decode signals
@ -83,7 +80,7 @@ architecture behave of core is
-- local signals
signal fetch1_stall_in : std_ulogic;
signal icache_stall_out : std_ulogic;
signal fetch2_stall_in : std_ulogic;
signal icache_stall_in : std_ulogic;
signal decode1_stall_in : std_ulogic;
signal decode2_stall_in : std_ulogic;
signal decode2_stall_out : std_ulogic;
@ -145,7 +142,6 @@ architecture behave of core is
attribute keep_hierarchy : string;
attribute keep_hierarchy of fetch1_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of icache_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of fetch2_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of decode1_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of decode2_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN);
@ -206,27 +202,18 @@ begin
clk => clk,
rst => rst_icache,
i_in => fetch1_to_icache,
i_out => icache_to_fetch2,
i_out => icache_to_decode1,
m_in => mmu_to_icache,
flush_in => flush,
inval_in => dbg_icache_rst or ex1_icache_inval,
stall_in => icache_stall_in,
stall_out => icache_stall_out,
wishbone_out => wishbone_insn_out,
wishbone_in => wishbone_insn_in,
log_out => log_data(96 downto 43)
);

fetch2_0: entity work.fetch2
port map (
clk => clk,
rst => rst_fetch2,
stall_in => fetch2_stall_in,
flush_in => flush,
i_in => icache_to_fetch2,
f_out => fetch2_to_decode1
);

fetch2_stall_in <= decode2_stall_out;
icache_stall_in <= decode2_stall_out;

decode1_0: entity work.decode1
port map (
@ -234,7 +221,7 @@ begin
rst => rst_dec1,
stall_in => decode1_stall_in,
flush_in => flush,
f_in => fetch2_to_decode1,
f_in => icache_to_decode1,
d_out => decode1_to_decode2,
log_out => log_data(109 downto 97)
);

@ -14,9 +14,8 @@ entity decode1 is
stall_in : in std_ulogic;
flush_in : in std_ulogic;

f_in : in Fetch2ToDecode1Type;
f_in : in IcacheToDecode1Type;
d_out : out Decode1ToDecode2Type;

log_out : out std_ulogic_vector(12 downto 0)
);
end entity decode1;

@ -1,123 +0,0 @@
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library work;
use work.common.all;
use work.wishbone_types.all;

entity fetch2 is
port(
clk : in std_ulogic;
rst : in std_ulogic;

stall_in : in std_ulogic;
flush_in : in std_ulogic;

-- Results from icache
i_in : in IcacheToFetch2Type;

-- Output to decode
f_out : out Fetch2ToDecode1Type
);
end entity fetch2;

architecture behaviour of fetch2 is

-- The icache cannot stall, so we need to stash a cycle
-- of output from it when we stall.
type reg_internal_type is record
stash : IcacheToFetch2Type;
stash_valid : std_ulogic;
stopped : std_ulogic;
end record;

signal r_int, rin_int : reg_internal_type;
signal r, rin : Fetch2ToDecode1Type;

begin
regs : process(clk)
begin
if rising_edge(clk) then

if (r /= rin) then
report "fetch2 rst:" & std_ulogic'image(rst) &
" S:" & std_ulogic'image(stall_in) &
" F:" & std_ulogic'image(flush_in) &
" T:" & std_ulogic'image(rin.stop_mark) &
" V:" & std_ulogic'image(rin.valid) &
" FF:" & std_ulogic'image(rin.fetch_failed) &
" nia:" & to_hstring(rin.nia);
end if;

-- Output state remains unchanged on stall, unless we are flushing
if rst = '1' or flush_in = '1' or stall_in = '0' then
r <= rin;
end if;

-- Internal state is updated on every clock
r_int <= rin_int;
end if;
end process;

comb : process(all)
variable v : Fetch2ToDecode1Type;
variable v_int : reg_internal_type;
variable v_i_in : IcacheToFetch2Type;
begin
v := r;
v_int := r_int;

-- If stalling, stash away the current input from the icache
if stall_in = '1' and v_int.stash_valid = '0' then
v_int.stash := i_in;
v_int.stash_valid := '1';
end if;

-- If unstalling, source input from the stash and invalidate it,
-- otherwise source normally from the icache.
--
v_i_in := i_in;
if v_int.stash_valid = '1' and stall_in = '0' then
v_i_in := v_int.stash;
v_int.stash_valid := '0';
end if;

v.valid := v_i_in.valid;
v.stop_mark := v_i_in.stop_mark;
v.fetch_failed := v_i_in.fetch_failed;
v.nia := v_i_in.nia;
v.insn := v_i_in.insn;

-- Clear stash internal valid bit on flush. We still mark
-- the stash itself as valid since we still want to override
-- whatever comes form icache when unstalling, but we'll
-- override it with something invalid.
--
if flush_in = '1' then
v_int.stash.valid := '0';
v_int.stash.fetch_failed := '0';
end if;

-- If we are flushing or the instruction comes with a stop mark
-- we tag it as invalid so it doesn't get decoded and executed
if flush_in = '1' or v.stop_mark = '1' then
v.valid := '0';
v.fetch_failed := '0';
end if;

-- Clear stash on reset
if rst = '1' then
v_int.stash_valid := '0';
v.valid := '0';
end if;

-- Update registers
rin <= v;
rin_int <= v_int;

-- Update outputs
f_out <= r;
end process;

end architecture behaviour;

@ -48,10 +48,11 @@ entity icache is
rst : in std_ulogic;

i_in : in Fetch1ToIcacheType;
i_out : out IcacheToFetch2Type;
i_out : out IcacheToDecode1Type;

m_in : in MmuToIcacheType;

stall_in : in std_ulogic;
stall_out : out std_ulogic;
flush_in : in std_ulogic;
inval_in : in std_ulogic;
@ -366,7 +367,7 @@ begin
);
process(all)
begin
do_read <= '1';
do_read <= not stall_in;
do_write <= '0';
if wishbone_in.ack = '1' and r.store_way = i then
do_write <= '1';
@ -533,25 +534,32 @@ begin
icache_hit : process(clk)
begin
if rising_edge(clk) then
-- On a hit, latch the request for the next cycle, when the BRAM data
-- will be available on the cache_out output of the corresponding way
--
r.hit_valid <= req_is_hit;
-- Send stop marks and NIA down regardless of validity
r.hit_smark <= i_in.stop_mark;
r.hit_nia <= i_in.nia;
if req_is_hit = '1' then
r.hit_way <= req_hit_way;
r.hit_smark <= i_in.stop_mark;

report "cache hit nia:" & to_hstring(i_in.nia) &
" IR:" & std_ulogic'image(i_in.virt_mode) &
" SM:" & std_ulogic'image(i_in.stop_mark) &
" idx:" & integer'image(req_index) &
" tag:" & to_hstring(req_tag) &
" way:" & integer'image(req_hit_way) &
" RA:" & to_hstring(real_addr);
end if;
-- keep outputs to fetch2 unchanged on a stall
-- except that flush or reset sets valid to 0
if stall_in = '1' then
if rst = '1' or flush_in = '1' then
r.hit_valid <= '0';
end if;
else
-- On a hit, latch the request for the next cycle, when the BRAM data
-- will be available on the cache_out output of the corresponding way
--
r.hit_valid <= req_is_hit;
-- Send stop marks and NIA down regardless of validity
r.hit_smark <= i_in.stop_mark;
r.hit_nia <= i_in.nia;
if req_is_hit = '1' then
r.hit_way <= req_hit_way;

report "cache hit nia:" & to_hstring(i_in.nia) &
" IR:" & std_ulogic'image(i_in.virt_mode) &
" SM:" & std_ulogic'image(i_in.stop_mark) &
" idx:" & integer'image(req_index) &
" tag:" & to_hstring(req_tag) &
" way:" & integer'image(req_hit_way) &
" RA:" & to_hstring(real_addr);
end if;
end if;
end if;
end process;

@ -674,7 +682,7 @@ begin
-- TLB miss and protection fault processing
if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
r.fetch_failed <= '0';
elsif i_in.req = '1' and access_ok = '0' then
elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then
r.fetch_failed <= '1';
end if;
end if;

@ -13,7 +13,7 @@ architecture behave of icache_tb is
signal rst : std_ulogic;

signal i_out : Fetch1ToIcacheType;
signal i_in : IcacheToFetch2Type;
signal i_in : IcacheToDecode1Type;

signal m_out : MmuToIcacheType;

@ -33,6 +33,7 @@ begin
i_in => i_out,
i_out => i_in,
m_in => m_out,
stall_in => '0',
flush_in => '0',
inval_in => '0',
wishbone_out => wb_bram_in,

@ -9,7 +9,6 @@ filesets:
- wishbone_types.vhdl
- common.vhdl
- fetch1.vhdl
- fetch2.vhdl
- decode1.vhdl
- helpers.vhdl
- decode2.vhdl

Loading…
Cancel
Save