From ae2afeca5c7ba04eebb79e671534c3431006fe13 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 12 Nov 2020 22:07:33 +1100 Subject: [PATCH] core: Track CR hazards and bypasses using tags Signed-off-by: Paul Mackerras --- Makefile | 2 +- common.vhdl | 9 ++++-- control.vhdl | 58 ++++++++++++++++++---------------- core.vhdl | 3 ++ cr_hazard.vhdl | 86 -------------------------------------------------- decode1.vhdl | 2 +- decode2.vhdl | 25 ++++++++------- execute1.vhdl | 27 +++++++--------- microwatt.core | 1 - 9 files changed, 68 insertions(+), 145 deletions(-) delete mode 100644 cr_hazard.vhdl diff --git a/Makefile b/Makefile index bb39007..678bbfa 100644 --- a/Makefile +++ b/Makefile @@ -44,7 +44,7 @@ all: $(all) core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \ utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \ decode1.vhdl helpers.vhdl insn_helpers.vhdl \ - cr_hazard.vhdl control.vhdl decode2.vhdl register_file.vhdl \ + control.vhdl decode2.vhdl register_file.vhdl \ cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \ logical.vhdl countzero.vhdl multiply.vhdl divider.vhdl execute1.vhdl \ loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \ diff --git a/common.vhdl b/common.vhdl index 8d1ca29..0151595 100644 --- a/common.vhdl +++ b/common.vhdl @@ -210,6 +210,12 @@ package common is end record; constant bypass_data_init : bypass_data_t := (tag => instr_tag_init, data => (others => '0')); + type cr_bypass_data_t is record + tag : instr_tag_t; + data : std_ulogic_vector(31 downto 0); + end record; + constant cr_bypass_data_init : cr_bypass_data_t := (tag => instr_tag_init, data => (others => '0')); + type Decode2ToExecute1Type is record valid: std_ulogic; unit : unit_t; @@ -225,7 +231,6 @@ package common is read_data2: std_ulogic_vector(63 downto 0); read_data3: std_ulogic_vector(63 downto 0); cr: std_ulogic_vector(31 downto 0); - bypass_cr : std_ulogic; xerc: xer_common_t; lr: std_ulogic; br_abs: std_ulogic; @@ -255,7 +260,7 @@ package common is constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init, write_reg_enable => '0', - bypass_cr => '0', lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', + lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), diff --git a/control.vhdl b/control.vhdl index c4b8d4e..5c83f78 100644 --- a/control.vhdl +++ b/control.vhdl @@ -35,10 +35,10 @@ entity control is gpr_c_read_in : in gspr_index_t; execute_next_tag : in instr_tag_t; + execute_next_cr_tag : in instr_tag_t; cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; - cr_bypassable : in std_ulogic; valid_out : out std_ulogic; stall_out : out std_ulogic; @@ -64,11 +64,6 @@ architecture rtl of control is signal r_int, rin_int : reg_internal_type := reg_internal_init; - signal stall_a_out : std_ulogic; - signal stall_b_out : std_ulogic; - signal stall_c_out : std_ulogic; - signal cr_stall_out : std_ulogic; - signal gpr_write_valid : std_ulogic := '0'; signal cr_write_valid : std_ulogic := '0'; @@ -76,6 +71,7 @@ architecture rtl of control is wr_gpr : std_ulogic; reg : gspr_index_t; recent : std_ulogic; + wr_cr : std_ulogic; end record; type tag_regs_array is array(tag_number_t) of tag_register; @@ -84,31 +80,14 @@ architecture rtl of control is signal instr_tag : instr_tag_t; signal gpr_tag_stall : std_ulogic; + signal cr_tag_stall : std_ulogic; signal curr_tag : tag_number_t; signal next_tag : tag_number_t; -begin - cr_hazard0: entity work.cr_hazard - generic map ( - PIPELINE_DEPTH => PIPELINE_DEPTH - ) - port map ( - clk => clk, - busy_in => busy_in, - deferred => deferred, - complete_in => complete_in.valid, - flush_in => flush_in, - issuing => valid_out, - - cr_read_in => cr_read_in, - cr_write_in => cr_write_valid, - bypassable => cr_bypassable, - - stall_out => cr_stall_out, - use_bypass => cr_bypass - ); + signal curr_cr_tag : tag_number_t; +begin control0: process(clk) begin if rising_edge(clk) then @@ -118,9 +97,11 @@ begin for i in tag_number_t loop if rst = '1' or flush_in = '1' then tag_regs(i).wr_gpr <= '0'; + tag_regs(i).wr_cr <= '0'; else if complete_in.valid = '1' and i = complete_in.tag then tag_regs(i).wr_gpr <= '0'; + tag_regs(i).wr_cr <= '0'; report "tag " & integer'image(i) & " not valid"; end if; if gpr_write_valid = '1' and tag_regs(i).reg = gpr_write_in then @@ -133,6 +114,7 @@ begin tag_regs(i).wr_gpr <= gpr_write_valid; tag_regs(i).reg <= gpr_write_in; tag_regs(i).recent <= gpr_write_valid; + tag_regs(i).wr_cr <= cr_write_valid; if gpr_write_valid = '1' then report "tag " & integer'image(i) & " valid for gpr " & to_hstring(gpr_write_in); end if; @@ -141,8 +123,12 @@ begin end loop; if rst = '1' then curr_tag <= 0; + curr_cr_tag <= 0; else curr_tag <= next_tag; + if cr_write_valid = '1' then + curr_cr_tag <= instr_tag.tag; + end if; end if; end if; end process; @@ -158,6 +144,8 @@ begin variable byp_a : std_ulogic; variable byp_b : std_ulogic; variable byp_c : std_ulogic; + variable tag_cr : instr_tag_t; + variable byp_cr : std_ulogic; begin tag_a := instr_tag_init; for i in tag_number_t loop @@ -219,6 +207,20 @@ begin end if; next_tag <= incr_tag; instr_tag_out <= instr_tag; + + -- CR hazards + tag_cr.tag := curr_cr_tag; + tag_cr.valid := cr_read_in and tag_regs(curr_cr_tag).wr_cr; + if tag_match(tag_cr, complete_in) then + tag_cr.valid := '0'; + end if; + byp_cr := '0'; + if EX1_BYPASS and tag_match(execute_next_cr_tag, tag_cr) then + byp_cr := '1'; + end if; + + cr_bypass <= byp_cr; + cr_tag_stall <= tag_cr.valid and not byp_cr; end process; control1 : process(all) @@ -265,7 +267,7 @@ begin end if; else -- let it go out if there are no GPR or CR hazards - stall_tmp := gpr_tag_stall or cr_stall_out; + stall_tmp := gpr_tag_stall or cr_tag_stall; end if; end if; @@ -292,7 +294,7 @@ begin end if; else -- let it go out if there are no GPR or CR hazards - stall_tmp := gpr_tag_stall or cr_stall_out; + stall_tmp := gpr_tag_stall or cr_tag_stall; end if; end if; else diff --git a/core.vhdl b/core.vhdl index 71bf2c8..7dafd1c 100644 --- a/core.vhdl +++ b/core.vhdl @@ -68,6 +68,7 @@ architecture behave of core is signal execute1_to_writeback: Execute1ToWritebackType; signal execute1_to_fetch1: Execute1ToFetch1Type; signal execute1_bypass: bypass_data_t; + signal execute1_cr_bypass: cr_bypass_data_t; -- load store signals signal execute1_to_loadstore1: Execute1ToLoadstore1Type; @@ -275,6 +276,7 @@ begin c_in => cr_file_to_decode2, c_out => decode2_to_cr_file, execute_bypass => execute1_bypass, + execute_cr_bypass => execute1_cr_bypass, log_out => log_data(119 downto 110) ); decode2_busy_in <= ex1_busy_out; @@ -333,6 +335,7 @@ begin fp_out => execute1_to_fpu, e_out => execute1_to_writeback, bypass_data => execute1_bypass, + bypass_cr_data => execute1_cr_bypass, icache_inval => ex1_icache_inval, dbg_msr_out => msr, terminate_out => terminate, diff --git a/cr_hazard.vhdl b/cr_hazard.vhdl deleted file mode 100644 index a6203a8..0000000 --- a/cr_hazard.vhdl +++ /dev/null @@ -1,86 +0,0 @@ -library ieee; -use ieee.std_logic_1164.all; -use ieee.numeric_std.all; - -entity cr_hazard is - generic ( - PIPELINE_DEPTH : natural := 1 - ); - port( - clk : in std_ulogic; - busy_in : in std_ulogic; - deferred : in std_ulogic; - complete_in : in std_ulogic; - flush_in : in std_ulogic; - issuing : in std_ulogic; - - cr_read_in : in std_ulogic; - cr_write_in : in std_ulogic; - bypassable : in std_ulogic; - - stall_out : out std_ulogic; - use_bypass : out std_ulogic - ); -end entity cr_hazard; -architecture behaviour of cr_hazard is - type pipeline_entry_type is record - valid : std_ulogic; - bypass : std_ulogic; - end record; - constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0'); - - type pipeline_t is array(0 to PIPELINE_DEPTH) of pipeline_entry_type; - constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init); - - signal r, rin : pipeline_t := pipeline_t_init; -begin - cr_hazard0: process(clk) - begin - if rising_edge(clk) then - r <= rin; - end if; - end process; - - cr_hazard1: process(all) - variable v : pipeline_t; - begin - v := r; - - -- XXX assumes PIPELINE_DEPTH = 1 - if complete_in = '1' then - v(1).valid := '0'; - end if; - - use_bypass <= '0'; - stall_out <= '0'; - if cr_read_in = '1' then - loop_0: for i in 0 to PIPELINE_DEPTH loop - if v(i).valid = '1' then - if r(i).bypass = '1' then - use_bypass <= '1'; - else - stall_out <= '1'; - end if; - end if; - end loop; - end if; - - -- XXX assumes PIPELINE_DEPTH = 1 - if busy_in = '0' then - v(1) := r(0); - v(0).valid := '0'; - end if; - if deferred = '0' and issuing = '1' then - v(0).valid := cr_write_in; - v(0).bypass := bypassable; - end if; - if flush_in = '1' then - v(0).valid := '0'; - v(1).valid := '0'; - end if; - - -- update registers - rin <= v; - - end process; -end; diff --git a/decode1.vhdl b/decode1.vhdl index f62594b..2869c39 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -214,7 +214,7 @@ architecture behaviour of decode1 is 2#0100111010# => (ALU, NONE, OP_BCD, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cbcdtd 2#0100011010# => (ALU, NONE, OP_BCD, NONE, NONE, RS, RA, '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cdtbcd 2#0000000000# => (ALU, NONE, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- cmp - 2#0111111100# => (ALU, NONE, OP_CMPB, NONE, RB, RS, RA, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpb + 2#0111111100# => (ALU, NONE, OP_CMPB, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpb 2#0011100000# => (ALU, NONE, OP_CMPEQB, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpeqb 2#0000100000# => (ALU, NONE, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpl 2#0011000000# => (ALU, NONE, OP_CMPRB, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmprb diff --git a/decode2.vhdl b/decode2.vhdl index 51c8ef1..748edb9 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -37,7 +37,8 @@ entity decode2 is c_in : in CrFileToDecode2Type; c_out : out Decode2ToCrFileType; - execute_bypass : in bypass_data_t; + execute_bypass : in bypass_data_t; + execute_cr_bypass : in cr_bypass_data_t; log_out : out std_ulogic_vector(9 downto 0) ); @@ -300,9 +301,9 @@ architecture behaviour of decode2 is signal gpr_c_read : gspr_index_t; signal gpr_c_bypass : std_ulogic; + signal cr_read_valid : std_ulogic; signal cr_write_valid : std_ulogic; signal cr_bypass : std_ulogic; - signal cr_bypass_avail : std_ulogic; signal instr_tag : instr_tag_t; @@ -338,11 +339,11 @@ begin gpr_c_read_in => gpr_c_read, execute_next_tag => execute_bypass.tag, + execute_next_cr_tag => execute_cr_bypass.tag, - cr_read_in => d_in.decode.input_cr, + cr_read_in => cr_read_valid, cr_write_in => cr_write_valid, cr_bypass => cr_bypass, - cr_bypassable => cr_bypass_avail, valid_out => control_valid_out, stall_out => control_stall_out, @@ -391,7 +392,7 @@ begin --v.e.input_cr := d_in.decode.input_cr; v.e.output_cr := d_in.decode.output_cr; - + decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1, d_in.nia); decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data, d_in.ispr2); @@ -467,8 +468,6 @@ begin if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then v.e.oe := decode_oe(d_in.decode.rc, d_in.insn); end if; - v.e.cr := c_in.read_cr_data; - v.e.bypass_cr := cr_bypass; v.e.xerc := c_in.read_xerc_data; v.e.invert_a := d_in.decode.invert_a; v.e.addm1 := '0'; @@ -516,6 +515,11 @@ begin v.e.read_data3 := decoded_reg_c.data; end case; + v.e.cr := c_in.read_cr_data; + if cr_bypass = '1' then + v.e.cr := execute_cr_bypass.data; + end if; + -- issue control control_valid_in <= d_in.valid; control_sgl_pipe <= d_in.decode.sgl_pipe; @@ -533,10 +537,9 @@ begin gpr_c_read <= decoded_reg_c.reg; cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); - cr_bypass_avail <= '0'; - if EX1_BYPASS and d_in.decode.unit = ALU then - cr_bypass_avail <= d_in.decode.output_cr; - end if; + -- Since ops that write CR only write some of the fields, + -- any op that writes CR effectively also reads it. + cr_read_valid <= cr_write_valid or d_in.decode.input_cr; v.e.valid := control_valid_out; if control_valid_out = '1' then diff --git a/execute1.vhdl b/execute1.vhdl index c0cc32f..c859689 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -38,6 +38,7 @@ entity execute1 is e_out : out Execute1ToWritebackType; bypass_data : out bypass_data_t; + bypass_cr_data : out cr_bypass_data_t; dbg_msr_out : out std_ulogic_vector(63 downto 0); @@ -412,15 +413,7 @@ begin v.e.xerc := e_in.xerc; end if; - -- CR forwarding cr_in <= e_in.cr; - if EX1_BYPASS and e_in.bypass_cr = '1' and r.e.write_cr_enable = '1' then - for i in 0 to 7 loop - if r.e.write_cr_mask(i) = '1' then - cr_in(i * 4 + 3 downto i * 4) <= r.e.write_cr_data(i * 4 + 3 downto i * 4); - end if; - end loop; - end if; v.mul_in_progress := '0'; v.div_in_progress := '0'; @@ -809,7 +802,6 @@ begin end if; bf := insn_bf(e_in.insn); crnum := to_integer(unsigned(bf)); - v.e.write_cr_enable := '1'; v.e.write_cr_mask := num_to_fxm(crnum); for i in 0 to 7 loop lo := i*4; @@ -831,7 +823,6 @@ begin newcrf := ppc_cmprb(a_in, b_in, insn_l(e_in.insn)); bf := insn_bf(e_in.insn); crnum := to_integer(unsigned(bf)); - v.e.write_cr_enable := '1'; v.e.write_cr_mask := num_to_fxm(crnum); v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf; @@ -839,7 +830,6 @@ begin newcrf := ppc_cmpeqb(a_in, b_in); bf := insn_bf(e_in.insn); crnum := to_integer(unsigned(bf)); - v.e.write_cr_enable := '1'; v.e.write_cr_mask := num_to_fxm(crnum); v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf; @@ -913,7 +903,6 @@ begin if cr_op(0) = '0' then -- MCRF bf := insn_bf(e_in.insn); bfa := insn_bfa(e_in.insn); - v.e.write_cr_enable := '1'; crnum := to_integer(unsigned(bf)); scrnum := to_integer(unsigned(bfa)); v.e.write_cr_mask := num_to_fxm(crnum); @@ -930,7 +919,6 @@ begin v.e.write_cr_data(hi downto lo) := newcrf; end loop; else - v.e.write_cr_enable := '1'; bt := insn_bt(e_in.insn); ba := insn_ba(e_in.insn); bb := insn_bb(e_in.insn); @@ -954,7 +942,6 @@ begin newcrf := v.e.xerc.ov & v.e.xerc.ca & v.e.xerc.ov32 & v.e.xerc.ca32; bf := insn_bf(e_in.insn); crnum := to_integer(unsigned(bf)); - v.e.write_cr_enable := '1'; v.e.write_cr_mask := num_to_fxm(crnum); v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf; @@ -1007,7 +994,6 @@ begin when OP_MFCR => when OP_MTCRF => - v.e.write_cr_enable := '1'; if e_in.insn(20) = '0' then -- mtcrf v.e.write_cr_mask := insn_fxm(e_in.insn); @@ -1269,12 +1255,23 @@ begin end if; v.e.write_reg := current.write_reg; v.e.write_enable := current.write_reg_enable and v.e.valid and not exception; + v.e.write_cr_enable := current.output_cr and v.e.valid and not exception; v.e.rc := current.rc and v.e.valid and not exception; bypass_data.tag.valid <= current.instr_tag.valid and current.write_reg_enable and v.e.valid; bypass_data.tag.tag <= current.instr_tag.tag; bypass_data.data <= v.e.write_data; + bypass_cr_data.tag.valid <= current.instr_tag.valid and current.output_cr and v.e.valid; + bypass_cr_data.tag.tag <= current.instr_tag.tag; + for i in 0 to 7 loop + if v.e.write_cr_mask(i) = '1' then + bypass_cr_data.data(i*4 + 3 downto i*4) <= v.e.write_cr_data(i*4 + 3 downto i*4); + else + bypass_cr_data.data(i*4 + 3 downto i*4) <= cr_in(i*4 + 3 downto i*4); + end if; + end loop; + -- Defer completion for one cycle when redirecting. -- This also ensures r.busy = 1 when ctrl.irq_state = WRITE_SRR1 if v.redirect = '1' then diff --git a/microwatt.core b/microwatt.core index 0f77fba..79af3c1 100644 --- a/microwatt.core +++ b/microwatt.core @@ -19,7 +19,6 @@ filesets: - sim_console.vhdl - logical.vhdl - countzero.vhdl - - cr_hazard.vhdl - control.vhdl - execute1.vhdl - fpu.vhdl