core: Remove fetch2 pipeline stage

The fetch2 stage existed primarily to provide a stash buffer for the
output of icache when a stall occurred.  However, we can get the same
effect -- of having the input to decode1 stay unchanged on a stall
cycle -- by using the read enable of the BRAMs in icache, and by
adding logic to keep the outputs unchanged on a clock cycle when
stall_in = 1.  This reduces branch and interrupt latency by one
cycle.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/208/head
Paul Mackerras 5 years ago
parent 49a4d9f67a
commit b5a7dbb78d

@ -42,7 +42,7 @@ all = core_tb icache_tb dcache_tb multiply_tb dmi_dtm_tb divider_tb \
all: $(all) all: $(all)


core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \ core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
fetch2.vhdl utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \ utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \
decode1.vhdl helpers.vhdl insn_helpers.vhdl gpr_hazard.vhdl \ decode1.vhdl helpers.vhdl insn_helpers.vhdl gpr_hazard.vhdl \
cr_hazard.vhdl control.vhdl decode2.vhdl register_file.vhdl \ cr_hazard.vhdl control.vhdl decode2.vhdl register_file.vhdl \
cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \ cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \

@ -96,7 +96,7 @@ package common is
nia: std_ulogic_vector(63 downto 0); nia: std_ulogic_vector(63 downto 0);
end record; end record;


type IcacheToFetch2Type is record type IcacheToDecode1Type is record
valid: std_ulogic; valid: std_ulogic;
stop_mark: std_ulogic; stop_mark: std_ulogic;
fetch_failed: std_ulogic; fetch_failed: std_ulogic;
@ -104,16 +104,6 @@ package common is
insn: std_ulogic_vector(31 downto 0); insn: std_ulogic_vector(31 downto 0);
end record; end record;


type Fetch2ToDecode1Type is record
valid: std_ulogic;
stop_mark : std_ulogic;
fetch_failed: std_ulogic;
nia: std_ulogic_vector(63 downto 0);
insn: std_ulogic_vector(31 downto 0);
end record;
constant Fetch2ToDecode1Init : Fetch2ToDecode1Type := (valid => '0', stop_mark => '0', fetch_failed => '0',
nia => (others => '0'), insn => (others => '0'));

type Decode1ToDecode2Type is record type Decode1ToDecode2Type is record
valid: std_ulogic; valid: std_ulogic;
stop_mark : std_ulogic; stop_mark : std_ulogic;

@ -41,12 +41,9 @@ entity core is
end core; end core;


architecture behave of core is architecture behave of core is
-- fetch signals
signal fetch2_to_decode1: Fetch2ToDecode1Type;

-- icache signals -- icache signals
signal fetch1_to_icache : Fetch1ToIcacheType; signal fetch1_to_icache : Fetch1ToIcacheType;
signal icache_to_fetch2 : IcacheToFetch2Type; signal icache_to_decode1 : IcacheToDecode1Type;
signal mmu_to_icache : MmuToIcacheType; signal mmu_to_icache : MmuToIcacheType;


-- decode signals -- decode signals
@ -83,7 +80,7 @@ architecture behave of core is
-- local signals -- local signals
signal fetch1_stall_in : std_ulogic; signal fetch1_stall_in : std_ulogic;
signal icache_stall_out : std_ulogic; signal icache_stall_out : std_ulogic;
signal fetch2_stall_in : std_ulogic; signal icache_stall_in : std_ulogic;
signal decode1_stall_in : std_ulogic; signal decode1_stall_in : std_ulogic;
signal decode2_stall_in : std_ulogic; signal decode2_stall_in : std_ulogic;
signal decode2_stall_out : std_ulogic; signal decode2_stall_out : std_ulogic;
@ -145,7 +142,6 @@ architecture behave of core is
attribute keep_hierarchy : string; attribute keep_hierarchy : string;
attribute keep_hierarchy of fetch1_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of fetch1_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of icache_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of icache_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of fetch2_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of decode1_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of decode1_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of decode2_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of decode2_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN);
@ -206,27 +202,18 @@ begin
clk => clk, clk => clk,
rst => rst_icache, rst => rst_icache,
i_in => fetch1_to_icache, i_in => fetch1_to_icache,
i_out => icache_to_fetch2, i_out => icache_to_decode1,
m_in => mmu_to_icache, m_in => mmu_to_icache,
flush_in => flush, flush_in => flush,
inval_in => dbg_icache_rst or ex1_icache_inval, inval_in => dbg_icache_rst or ex1_icache_inval,
stall_in => icache_stall_in,
stall_out => icache_stall_out, stall_out => icache_stall_out,
wishbone_out => wishbone_insn_out, wishbone_out => wishbone_insn_out,
wishbone_in => wishbone_insn_in, wishbone_in => wishbone_insn_in,
log_out => log_data(96 downto 43) log_out => log_data(96 downto 43)
); );


fetch2_0: entity work.fetch2 icache_stall_in <= decode2_stall_out;
port map (
clk => clk,
rst => rst_fetch2,
stall_in => fetch2_stall_in,
flush_in => flush,
i_in => icache_to_fetch2,
f_out => fetch2_to_decode1
);

fetch2_stall_in <= decode2_stall_out;


decode1_0: entity work.decode1 decode1_0: entity work.decode1
port map ( port map (
@ -234,7 +221,7 @@ begin
rst => rst_dec1, rst => rst_dec1,
stall_in => decode1_stall_in, stall_in => decode1_stall_in,
flush_in => flush, flush_in => flush,
f_in => fetch2_to_decode1, f_in => icache_to_decode1,
d_out => decode1_to_decode2, d_out => decode1_to_decode2,
log_out => log_data(109 downto 97) log_out => log_data(109 downto 97)
); );

@ -14,9 +14,8 @@ entity decode1 is
stall_in : in std_ulogic; stall_in : in std_ulogic;
flush_in : in std_ulogic; flush_in : in std_ulogic;


f_in : in Fetch2ToDecode1Type; f_in : in IcacheToDecode1Type;
d_out : out Decode1ToDecode2Type; d_out : out Decode1ToDecode2Type;

log_out : out std_ulogic_vector(12 downto 0) log_out : out std_ulogic_vector(12 downto 0)
); );
end entity decode1; end entity decode1;

@ -1,123 +0,0 @@
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library work;
use work.common.all;
use work.wishbone_types.all;

-- fetch2: pipeline stage that sits between the icache and decode1.
-- It registers the icache result and presents it on f_out; the
-- architecture keeps f_out unchanged across stalled cycles.
entity fetch2 is
port(
clk : in std_ulogic;
rst : in std_ulogic;

-- stall_in = '1' holds f_out, unless rst or flush_in is also asserted
stall_in : in std_ulogic;
-- flush_in = '1' marks the registered output (and any stashed entry) invalid
flush_in : in std_ulogic;

-- Results from icache
i_in : in IcacheToFetch2Type;

-- Output to decode
f_out : out Fetch2ToDecode1Type
);
end entity fetch2;

architecture behaviour of fetch2 is

    -- The icache cannot stall, so we need to stash a cycle
    -- of output from it when we stall.
    --
    -- stash       : copy of the icache output captured on the first
    --               stalled cycle
    -- stash_valid : '1' while the stash holds an entry that must be
    --               consumed (instead of i_in) when the stall ends
    type reg_internal_type is record
        stash       : IcacheToFetch2Type;
        stash_valid : std_ulogic;
    end record;

    -- r_int/rin_int: internal (stash) state, updated every clock.
    -- r/rin: registered output towards decode1, held while stalled.
    signal r_int, rin_int : reg_internal_type;
    signal r, rin         : Fetch2ToDecode1Type;

begin
    -- Sequential part: latch the next-state values computed by "comb".
    regs : process(clk)
    begin
        if rising_edge(clk) then

            -- Simulation trace whenever the output register is about to change
            if (r /= rin) then
                report "fetch2 rst:" & std_ulogic'image(rst) &
                    " S:" & std_ulogic'image(stall_in) &
                    " F:" & std_ulogic'image(flush_in) &
                    " T:" & std_ulogic'image(rin.stop_mark) &
                    " V:" & std_ulogic'image(rin.valid) &
                    " FF:" & std_ulogic'image(rin.fetch_failed) &
                    " nia:" & to_hstring(rin.nia);
            end if;

            -- Output state remains unchanged on stall, unless we are flushing
            if rst = '1' or flush_in = '1' or stall_in = '0' then
                r <= rin;
            end if;

            -- Internal state is updated on every clock
            r_int <= rin_int;
        end if;
    end process;

    -- Combinational part: compute the next output and next stash state.
    comb : process(all)
        variable v      : Fetch2ToDecode1Type;
        variable v_int  : reg_internal_type;
        variable v_i_in : IcacheToFetch2Type;
    begin
        v := r;
        v_int := r_int;

        -- If stalling, stash away the current input from the icache
        if stall_in = '1' and v_int.stash_valid = '0' then
            v_int.stash := i_in;
            v_int.stash_valid := '1';
        end if;

        -- If unstalling, source input from the stash and invalidate it,
        -- otherwise source normally from the icache.
        --
        v_i_in := i_in;
        if v_int.stash_valid = '1' and stall_in = '0' then
            v_i_in := v_int.stash;
            v_int.stash_valid := '0';
        end if;

        v.valid := v_i_in.valid;
        v.stop_mark := v_i_in.stop_mark;
        v.fetch_failed := v_i_in.fetch_failed;
        v.nia := v_i_in.nia;
        v.insn := v_i_in.insn;

        -- Clear stash internal valid bit on flush. We still mark
        -- the stash itself as valid since we still want to override
        -- whatever comes from icache when unstalling, but we'll
        -- override it with something invalid.
        --
        if flush_in = '1' then
            v_int.stash.valid := '0';
            v_int.stash.fetch_failed := '0';
        end if;

        -- If we are flushing or the instruction comes with a stop mark
        -- we tag it as invalid so it doesn't get decoded and executed
        if flush_in = '1' or v.stop_mark = '1' then
            v.valid := '0';
            v.fetch_failed := '0';
        end if;

        -- Clear stash on reset. fetch_failed is cleared together with
        -- valid (as in the flush case above) so that no fetch failure
        -- indication is registered to the output while in reset.
        if rst = '1' then
            v_int.stash_valid := '0';
            v.valid := '0';
            v.fetch_failed := '0';
        end if;

        -- Update registers
        rin <= v;
        rin_int <= v_int;

        -- Update outputs
        f_out <= r;
    end process;

end architecture behaviour;

@ -48,10 +48,11 @@ entity icache is
rst : in std_ulogic; rst : in std_ulogic;


i_in : in Fetch1ToIcacheType; i_in : in Fetch1ToIcacheType;
i_out : out IcacheToFetch2Type; i_out : out IcacheToDecode1Type;


m_in : in MmuToIcacheType; m_in : in MmuToIcacheType;


stall_in : in std_ulogic;
stall_out : out std_ulogic; stall_out : out std_ulogic;
flush_in : in std_ulogic; flush_in : in std_ulogic;
inval_in : in std_ulogic; inval_in : in std_ulogic;
@ -366,7 +367,7 @@ begin
); );
process(all) process(all)
begin begin
do_read <= '1'; do_read <= not stall_in;
do_write <= '0'; do_write <= '0';
if wishbone_in.ack = '1' and r.store_way = i then if wishbone_in.ack = '1' and r.store_way = i then
do_write <= '1'; do_write <= '1';
@ -533,25 +534,32 @@ begin
icache_hit : process(clk) icache_hit : process(clk)
begin begin
if rising_edge(clk) then if rising_edge(clk) then
-- On a hit, latch the request for the next cycle, when the BRAM data -- keep outputs to fetch2 unchanged on a stall
-- will be available on the cache_out output of the corresponding way -- except that flush or reset sets valid to 0
-- if stall_in = '1' then
r.hit_valid <= req_is_hit; if rst = '1' or flush_in = '1' then
-- Send stop marks and NIA down regardless of validity r.hit_valid <= '0';
r.hit_smark <= i_in.stop_mark; end if;
r.hit_nia <= i_in.nia; else
if req_is_hit = '1' then -- On a hit, latch the request for the next cycle, when the BRAM data
r.hit_way <= req_hit_way; -- will be available on the cache_out output of the corresponding way
r.hit_smark <= i_in.stop_mark; --

r.hit_valid <= req_is_hit;
report "cache hit nia:" & to_hstring(i_in.nia) & -- Send stop marks and NIA down regardless of validity
" IR:" & std_ulogic'image(i_in.virt_mode) & r.hit_smark <= i_in.stop_mark;
" SM:" & std_ulogic'image(i_in.stop_mark) & r.hit_nia <= i_in.nia;
" idx:" & integer'image(req_index) & if req_is_hit = '1' then
" tag:" & to_hstring(req_tag) & r.hit_way <= req_hit_way;
" way:" & integer'image(req_hit_way) &
" RA:" & to_hstring(real_addr); report "cache hit nia:" & to_hstring(i_in.nia) &
end if; " IR:" & std_ulogic'image(i_in.virt_mode) &
" SM:" & std_ulogic'image(i_in.stop_mark) &
" idx:" & integer'image(req_index) &
" tag:" & to_hstring(req_tag) &
" way:" & integer'image(req_hit_way) &
" RA:" & to_hstring(real_addr);
end if;
end if;
end if; end if;
end process; end process;


@ -674,7 +682,7 @@ begin
-- TLB miss and protection fault processing -- TLB miss and protection fault processing
if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
r.fetch_failed <= '0'; r.fetch_failed <= '0';
elsif i_in.req = '1' and access_ok = '0' then elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then
r.fetch_failed <= '1'; r.fetch_failed <= '1';
end if; end if;
end if; end if;

@ -13,7 +13,7 @@ architecture behave of icache_tb is
signal rst : std_ulogic; signal rst : std_ulogic;


signal i_out : Fetch1ToIcacheType; signal i_out : Fetch1ToIcacheType;
signal i_in : IcacheToFetch2Type; signal i_in : IcacheToDecode1Type;


signal m_out : MmuToIcacheType; signal m_out : MmuToIcacheType;


@ -33,6 +33,7 @@ begin
i_in => i_out, i_in => i_out,
i_out => i_in, i_out => i_in,
m_in => m_out, m_in => m_out,
stall_in => '0',
flush_in => '0', flush_in => '0',
inval_in => '0', inval_in => '0',
wishbone_out => wb_bram_in, wishbone_out => wb_bram_in,

@ -9,7 +9,6 @@ filesets:
- wishbone_types.vhdl - wishbone_types.vhdl
- common.vhdl - common.vhdl
- fetch1.vhdl - fetch1.vhdl
- fetch2.vhdl
- decode1.vhdl - decode1.vhdl
- helpers.vhdl - helpers.vhdl
- decode2.vhdl - decode2.vhdl

Loading…
Cancel
Save