FPU: Add stage-2 stall ability to FPU

This makes the FPU able to stall other units at execute stage 2 and be
stalled by other units (specifically the LSU).

This means that the completion and writeback for an instruction can
now be deferred until the second cycle of the following instruction,
i.e. the cycle in which the state machine has gone through IDLE state
into one of the DO_* states. We therefore need to latch the
destination FPR number, CR mask, etc. from the previous instruction so
that we present the correct information to writeback.

The advantage of this is that we can get rid of the in_progress signal
from the LSU.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/379/head
Paul Mackerras 2 years ago
parent ef122868d5
commit 9a8a8e50f8

@ -480,7 +480,6 @@ package common is
type Loadstore1ToExecute1Type is record
busy : std_ulogic;
l2stall : std_ulogic;
in_progress : std_ulogic;
end record;

type Loadstore1ToDcacheType is record
@ -640,16 +639,18 @@ package common is
frt : gspr_index_t;
rc : std_ulogic;
out_cr : std_ulogic;
stall : std_ulogic;
end record;
constant Execute1ToFPUInit : Execute1ToFPUType := (valid => '0', op => OP_ILLEGAL, nia => (others => '0'),
itag => instr_tag_init,
insn => (others => '0'), fe_mode => "00", rc => '0',
fra => (others => '0'), frb => (others => '0'),
frc => (others => '0'), frt => (others => '0'),
single => '0', out_cr => '0');
single => '0', out_cr => '0', stall => '0');

type FPUToExecute1Type is record
busy : std_ulogic;
f2stall : std_ulogic;
exception : std_ulogic;
end record;
constant FPUToExecute1Init : FPUToExecute1Type := (others => '0');

@ -384,6 +384,7 @@ begin
port map (
clk => clk,
rst => rst_fpu,
flush_in => flush,
e_in => execute1_to_fpu,
e_out => fpu_to_execute1,
w_out => fpu_to_writeback

@ -442,9 +442,9 @@ begin
-- writeback, unless a pipeline flush has happened in the meantime.
xerc_in <= ex1.xerc when ex1.xerc_valid = '1' else e_in.xerc;

with e_in.unit select busy_out <=
l_in.busy or l_in.in_progress or ex1.e.valid or ex1.busy or fp_in.busy when FPU,
l_in.busy or ex1.busy or fp_in.busy when others;
-- N.B. the busy signal from each source includes the
-- stage2 stall from that source in it.
busy_out <= l_in.busy or ex1.busy or fp_in.busy;

valid_in <= e_in.valid and not (busy_out or flush_in or ex1.e.redirect or ex1.e.interrupt);

@ -1299,8 +1299,7 @@ begin
end if;
end if;

v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or
ex1.busy or fp_in.busy);
v.no_instr_avail := not (e_in.valid or l_in.busy or ex1.busy or fp_in.busy);

go := valid_in and not exception;
v.instr_dispatch := go;
@ -1436,7 +1435,7 @@ begin
lv.is_32bit := e_in.is_32bit;
lv.repeat := e_in.repeat;
lv.second := e_in.second;
lv.e2stall := '0';
lv.e2stall := fp_in.f2stall;

-- Outputs to FPU
fv.op := e_in.insn_type;
@ -1451,6 +1450,7 @@ begin
fv.frt := e_in.write_reg;
fv.rc := e_in.rc;
fv.out_cr := e_in.output_cr;
fv.stall := l_in.l2stall;

-- Update registers
ex1in <= v;
@ -1472,7 +1472,7 @@ begin
ctrl.cfar when SPRSEL_CFAR,
assemble_xer(ex1.e.xerc, ctrl.xer_low) when others;

stage2_stall <= l_in.l2stall or fp_in.busy;
stage2_stall <= l_in.l2stall or fp_in.f2stall;

-- Second execute stage control
execute2_1: process(all)

@ -15,6 +15,7 @@ entity fpu is
port (
clk : in std_ulogic;
rst : in std_ulogic;
flush_in : in std_ulogic;

e_in : in Execute1ToFPUType;
e_out : out FPUToExecute1Type;
@ -35,7 +36,7 @@ architecture behaviour of fpu is
mantissa : std_ulogic_vector(63 downto 0); -- 10.54 format
end record;

type state_t is (IDLE,
type state_t is (IDLE, DO_ILLEGAL,
DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF,
DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT,
DO_FCFID, DO_FCTI,
@ -71,7 +72,9 @@ architecture behaviour of fpu is
type reg_type is record
state : state_t;
busy : std_ulogic;
f2stall : std_ulogic;
instr_done : std_ulogic;
complete : std_ulogic;
do_intr : std_ulogic;
illegal : std_ulogic;
op : insn_type_t;
@ -83,7 +86,9 @@ architecture behaviour of fpu is
rc : std_ulogic;
is_cmp : std_ulogic;
single_prec : std_ulogic;
sp_result : std_ulogic;
fpscr : std_ulogic_vector(31 downto 0);
comm_fpscr : std_ulogic_vector(31 downto 0); -- committed FPSCR value
a : fpu_reg_type;
b : fpu_reg_type;
c : fpu_reg_type;
@ -96,13 +101,17 @@ architecture behaviour of fpu is
result_class : fp_number_class;
result_exp : signed(EXP_BITS-1 downto 0);
shift : signed(EXP_BITS-1 downto 0);
writing_back : std_ulogic;
writing_fpr : std_ulogic;
write_reg : gspr_index_t;
complete_tag : instr_tag_t;
writing_cr : std_ulogic;
int_result : std_ulogic;
cr_result : std_ulogic_vector(3 downto 0);
cr_mask : std_ulogic_vector(7 downto 0);
old_exc : std_ulogic_vector(4 downto 0);
update_fprf : std_ulogic;
quieten_nan : std_ulogic;
nsnan_result : std_ulogic;
tiny : std_ulogic;
denorm : std_ulogic;
round_mode : std_ulogic_vector(2 downto 0);
@ -542,17 +551,30 @@ begin
fpu_0: process(clk)
begin
if rising_edge(clk) then
if rst = '1' then
if rst = '1' or flush_in = '1' then
r.state <= IDLE;
r.busy <= '0';
r.f2stall <= '0';
r.instr_done <= '0';
r.complete <= '0';
r.illegal <= '0';
r.do_intr <= '0';
r.writing_fpr <= '0';
r.writing_cr <= '0';
r.fpscr <= (others => '0');
r.writing_back <= '0';
r.dest_fpr <= (others =>'0');
r.write_reg <= (others =>'0');
r.complete_tag.valid <= '0';
r.cr_mask <= (others =>'0');
r.cr_result <= (others =>'0');
r.instr_tag.valid <= '0';
if rst = '1' then
r.fpscr <= (others => '0');
r.comm_fpscr <= (others => '0');
elsif r.do_intr = '0' then
-- flush_in = 1 and not due to us generating an interrupt,
-- roll back to committed fpscr
r.fpscr <= r.comm_fpscr;
end if;
else
assert not (r.state /= IDLE and e_in.valid = '1') severity failure;
r <= rin;
@ -577,14 +599,19 @@ begin
end process;

e_out.busy <= r.busy;
e_out.f2stall <= r.f2stall;
e_out.exception <= r.fpscr(FPSCR_FEX);

w_out.valid <= r.instr_done and not r.do_intr;
w_out.instr_tag <= r.instr_tag;
w_out.write_enable <= r.writing_back;
w_out.write_reg <= r.dest_fpr;
-- Note that the cycle where r.complete = 1 for an instruction can be as
-- late as the second cycle of the following instruction (i.e. in the state
-- following IDLE state). Hence it is important that none of the fields of
-- r that are used below are modified in IDLE state.
w_out.valid <= r.complete;
w_out.instr_tag <= r.complete_tag;
w_out.write_enable <= r.writing_fpr and r.complete;
w_out.write_reg <= r.write_reg;
w_out.write_data <= fp_result;
w_out.write_cr_enable <= r.instr_done and (r.rc or r.is_cmp);
w_out.write_cr_enable <= r.writing_cr and r.complete;
w_out.write_cr_mask <= r.cr_mask;
w_out.write_cr_data <= r.cr_result & r.cr_result & r.cr_result & r.cr_result &
r.cr_result & r.cr_result & r.cr_result & r.cr_result;
@ -599,7 +626,6 @@ begin
variable bdec : fpu_reg_type;
variable cdec : fpu_reg_type;
variable fpscr_mask : std_ulogic_vector(31 downto 0);
variable illegal : std_ulogic;
variable j, k : integer;
variable flm : std_ulogic_vector(7 downto 0);
variable int_input : std_ulogic;
@ -644,12 +670,22 @@ begin
variable maddend : std_ulogic_vector(127 downto 0);
variable sum : std_ulogic_vector(63 downto 0);
variable round_inc : std_ulogic_vector(63 downto 0);
variable int_result : std_ulogic;
variable illegal : std_ulogic;
begin
v := r;
illegal := '0';
v.busy := '0';
v.complete := '0';
v.do_intr := '0';
int_input := '0';

if r.complete = '1' or r.do_intr = '1' then
v.instr_done := '0';
v.writing_fpr := '0';
v.writing_cr := '0';
v.comm_fpscr := r.fpscr;
v.illegal := '0';
end if;

-- capture incoming instruction
if e_in.valid = '1' then
v.insn := e_in.insn;
@ -660,14 +696,8 @@ begin
v.dest_fpr := e_in.frt;
v.single_prec := e_in.single;
v.longmask := e_in.single;
v.int_result := '0';
v.rc := e_in.rc;
v.is_cmp := e_in.out_cr;
if e_in.out_cr = '0' then
v.cr_mask := num_to_fxm(1);
else
v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(e_in.insn))));
end if;
int_input := '0';
if e_in.op = OP_FPOP_I then
int_input := '1';
@ -741,8 +771,6 @@ begin
pcmpb_lt := '1';
end if;

v.writing_back := '0';
v.instr_done := '0';
v.update_fprf := '0';
v.shift := to_signed(0, EXP_BITS);
v.first := '0';
@ -777,6 +805,8 @@ begin
pshift := '0';
renorm_sqrt := '0';
shiftin := '0';
int_result := '0';
illegal := '0';
case r.state is
when IDLE =>
v.use_a := '0';
@ -785,6 +815,7 @@ begin
v.invalid := '0';
v.negate := '0';
if e_in.valid = '1' then
v.busy := '1';
case e_in.insn(5 downto 1) is
when "00000" =>
if e_in.insn(8) = '1' then
@ -876,13 +907,17 @@ begin
end if;
v.state := DO_FMADD;
when others =>
illegal := '1';
v.state := DO_ILLEGAL;
end case;
end if;
v.x := '0';
v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX);
set_s := '1';

when DO_ILLEGAL =>
illegal := '1';
v.instr_done := '1';

when DO_MCRFS =>
j := to_integer(unsigned(insn_bfa(r.insn)));
for i in 0 to 7 loop
@ -894,11 +929,9 @@ begin
end loop;
v.fpscr := r.fpscr and (fpscr_mask or x"6007F8FF");
v.instr_done := '1';
v.state := IDLE;

when DO_FTDIV =>
v.instr_done := '1';
v.state := IDLE;
v.cr_result := "0000";
if r.a.class = INFINITY or r.b.class = ZERO or r.b.class = INFINITY or
(r.b.class = FINITE and r.b.mantissa(53) = '0') then
@ -917,7 +950,6 @@ begin

when DO_FTSQRT =>
v.instr_done := '1';
v.state := IDLE;
v.cr_result := "0000";
if r.b.class = ZERO or r.b.class = INFINITY or
(r.b.class = FINITE and r.b.mantissa(53) = '0') then
@ -932,7 +964,6 @@ begin
-- fcmp[uo]
-- r.opsel_a = AIN_B
v.instr_done := '1';
v.state := IDLE;
update_fx := '1';
v.result_exp := r.b.exponent;
if (r.a.class = NAN and r.a.mantissa(53) = '0') or
@ -993,7 +1024,6 @@ begin
end if;
end loop;
v.instr_done := '1';
v.state := IDLE;

when DO_MTFSFI =>
-- mtfsfi
@ -1007,20 +1037,17 @@ begin
end loop;
end if;
v.instr_done := '1';
v.state := IDLE;

when DO_FMRG =>
-- fmrgew, fmrgow
opsel_r <= RES_MISC;
misc_sel <= "01" & r.insn(8) & '0';
v.int_result := '1';
v.writing_back := '1';
int_result := '1';
v.writing_fpr := '1';
v.instr_done := '1';
v.state := IDLE;

when DO_MFFS =>
v.int_result := '1';
v.writing_back := '1';
v.writing_fpr := '1';
opsel_r <= RES_MISC;
case r.insn(20 downto 16) is
when "00000" =>
@ -1044,10 +1071,11 @@ begin
-- mffsl
fpscr_mask := x"0007F0FF";
when others =>
illegal := '1';
v.illegal := '1';
v.writing_fpr := '0';
end case;
int_result := '1';
v.instr_done := '1';
v.state := IDLE;

when DO_MTFSF =>
if r.insn(25) = '1' then
@ -1064,7 +1092,6 @@ begin
end if;
end loop;
v.instr_done := '1';
v.state := IDLE;

when DO_FMR =>
-- r.opsel_a = AIN_B
@ -1082,9 +1109,8 @@ begin
else
v.result_sign := r.a.negative; -- fcpsgn
end if;
v.writing_back := '1';
v.writing_fpr := '1';
v.instr_done := '1';
v.state := IDLE;

when DO_FRI => -- fri[nzpm]
-- r.opsel_a = AIN_B
@ -1153,7 +1179,7 @@ begin
invalid := '1';
end if;

v.int_result := '1';
int_result := '1';
case r.b.class is
when ZERO =>
arith_done := '1';
@ -1671,7 +1697,6 @@ begin
end if;
v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result;
v.instr_done := '1';
v.state := IDLE;

when MULT_1 =>
f_to_multiply.valid <= r.first;
@ -1849,7 +1874,6 @@ begin
v.cr_result(1) := exp_tiny or exp_huge;
if exp_tiny = '1' or exp_huge = '1' or r.a.class = ZERO or r.first = '0' then
v.instr_done := '1';
v.state := IDLE;
else
v.shift := r.a.exponent;
v.doing_ftdiv := "10";
@ -2054,6 +2078,7 @@ begin
when others => -- fctidu[z]
need_check := r.r(63);
end case;
int_result := '1';
if need_check = '1' then
v.state := INT_CHECK;
else
@ -2080,6 +2105,7 @@ begin
v.fpscr(FPSCR_XX) := '1';
end if;
end if;
int_result := '1';
arith_done := '1';

when INT_OFLOW =>
@ -2090,6 +2116,7 @@ begin
end if;
v.fpscr(FPSCR_VXCVI) := '1';
invalid := '1';
int_result := '1';
arith_done := '1';

when FRI_1 =>
@ -2306,11 +2333,10 @@ begin
-- Neither does enabled zero-divide exception
if (v.invalid and r.fpscr(FPSCR_VE)) = '0' and
(zero_divide and r.fpscr(FPSCR_ZE)) = '0' then
v.writing_back := '1';
v.writing_fpr := '1';
v.update_fprf := '1';
end if;
v.instr_done := '1';
v.state := IDLE;
update_fx := '1';
end if;

@ -2530,12 +2556,6 @@ begin
v.shift := resize(signed('0' & clz) - 9, EXP_BITS);
end if;

if r.int_result = '1' then
fp_result <= r.r;
else
fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r,
r.single_prec, r.quieten_nan);
end if;
if r.update_fprf = '1' then
v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.result_sign, r.result_class,
r.r(54) and not r.denorm);
@ -2549,24 +2569,49 @@ begin
(v.fpscr(FPSCR_VX downto FPSCR_XX) and not r.old_exc) /= "00000" then
v.fpscr(FPSCR_FX) := '1';
end if;
if r.rc = '1' then
v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX);
end if;

v.illegal := illegal;
if illegal = '1' then
v.instr_done := '0';
v.do_intr := '1';
v.writing_back := '0';
v.busy := '0';
v.state := IDLE;
if v.instr_done = '1' then
if r.state /= IDLE then
v.state := IDLE;
v.busy := '0';
v.f2stall := '0';
if r.rc = '1' then
v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX);
end if;
v.sp_result := r.single_prec;
v.int_result := int_result;
v.illegal := illegal;
v.nsnan_result := v.quieten_nan;
if r.is_cmp = '0' then
v.cr_mask := num_to_fxm(1);
else
v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(r.insn))));
end if;
v.writing_cr := r.is_cmp or r.rc;
v.write_reg := r.dest_fpr;
v.complete_tag := r.instr_tag;
end if;
if e_in.stall = '0' then
v.complete := not v.illegal;
v.do_intr := (v.fpscr(FPSCR_FEX) and r.fe_mode) or v.illegal;
end if;
-- N.B. We rely on execute1 to prevent any new instruction
-- coming in while e_in.stall = 1, without us needing to
-- have busy asserted.
else
v.do_intr := v.instr_done and v.fpscr(FPSCR_FEX) and r.fe_mode;
if v.state /= IDLE or v.do_intr = '1' then
v.busy := '1';
if r.state /= IDLE and e_in.stall = '0' then
v.f2stall := '1';
end if;
end if;

-- This mustn't depend on any fields of r that are modified in IDLE state.
if r.int_result = '1' then
fp_result <= r.r;
else
fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r,
r.sp_result, r.nsnan_result);
end if;

rin <= v;
end process;


@ -159,7 +159,6 @@ architecture behave of loadstore1 is
signal flush : std_ulogic;
signal busy : std_ulogic;
signal complete : std_ulogic;
signal in_progress : std_ulogic;
signal flushing : std_ulogic;

signal store_sp_data : std_ulogic_vector(31 downto 0);
@ -523,7 +522,6 @@ begin

busy <= dc_stall or d_in.error or r1.busy or r2.busy;
complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or r3.complete;
in_progress <= r1.req.valid or (r2.req.valid and not complete);

-- Processing done in the first cycle of a load/store instruction
loadstore1_1: process(all)
@ -981,7 +979,6 @@ begin
-- update busy signal back to execute1
e_out.busy <= busy;
e_out.l2stall <= dc_stall or d_in.error or r2.busy;
e_out.in_progress <= in_progress;

events <= r3.events;


Loading…
Cancel
Save