Improve timing of redirect_nia going from writeback to fetch1

This gets rid of the adder in writeback that computes redirect_nia. Instead, the main adder in the ALU is used to compute the branch target for relative branches. We now decode b and bc differently depending on the AA field, generating INSN_brel, INSN_babs, INSN_bcrel or INSN_bcabs as appropriate. Each one has a separate entry in the decode table in decode1; the *rel versions use CIA as the A input. The bclr/bcctr/bctar and rfid instructions now select ramspr_result for the main result mux to get the redirect address into ex1.e.write_data. For branches which are predicted taken but not actually taken, we need to redirect to the following instruction. We also need to do that for isync. We do this in the execute2 stage since whether or not to do it depends on the branch result. The next_nia computation is moved to the execute2 stage and comes in via a new leg on the secondary result multiplexer, making next_nia available ultimately in ex2.e.write_data. This also means that the next_nia leg of the primary result multiplexer is gone. Incrementing last_nia by 4 for sc (so that SRR0 points to the following instruction) is also moved to execute2. Writing CIA+4 to LR was previously done through the main result multiplexer. Now it comes in explicitly in the ramspr write logic. Overall this removes the br_offset field, all uses of the abs_br field (the field itself is still declared), and the logic to add br_offset and next_nia, and one leg of the primary result multiplexer, at the cost of a few extra control signals between execute1 and execute2 and some multiplexing for the ramspr write side and an extra input on the secondary result multiplexer. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
3 years ago · 1c4b5def36
parent 06ff486567
commit 1c4b5def36
7 changed files with 98 additions and 62 deletions
--- a/common.vhdl
+++ b/common.vhdl
@@ -658,7 +658,6 @@ package common is
 	redirect: std_ulogic;
        redir_mode: std_ulogic_vector(3 downto 0);
        last_nia: std_ulogic_vector(63 downto 0);
-        br_offset: std_ulogic_vector(63 downto 0);
        br_last: std_ulogic;
        br_taken: std_ulogic;
        abs_br: std_ulogic;
@@ -672,7 +671,7 @@ package common is
         write_data => (others => '0'), write_cr_mask => (others => '0'),
         write_cr_data => (others => '0'), write_reg => (others => '0'),
         interrupt => '0', intr_vec => 0, redirect => '0', redir_mode => "0000",
-         last_nia => (others => '0'), br_offset => (others => '0'),
+         last_nia => (others => '0'),
         br_last => '0', br_taken => '0', abs_br => '0',
         srr1 => (others => '0'), msr => (others => '0'));

--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -94,8 +94,10 @@ architecture behaviour of decode1 is
        INSN_andi_dot    =>  (ALU,  NONE, OP_LOGIC,     NONE,       CONST_UI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0', NONE),
        INSN_andis_dot   =>  (ALU,  NONE, OP_LOGIC,     NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0', NONE),
        INSN_attn        =>  (ALU,  NONE, OP_ATTN,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE),
-        INSN_b           =>  (ALU,  NONE, OP_B,         NONE,       CONST_LI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
-        INSN_bc          =>  (ALU,  NONE, OP_BC,        NONE,       CONST_BD,    NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
+        INSN_brel        =>  (ALU,  NONE, OP_B,         CIA,        CONST_LI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
+        INSN_babs        =>  (ALU,  NONE, OP_B,         NONE,       CONST_LI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
+        INSN_bcrel       =>  (ALU,  NONE, OP_BC,        CIA,        CONST_BD,    NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
+        INSN_bcabs       =>  (ALU,  NONE, OP_BC,        NONE,       CONST_BD,    NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
        INSN_bcctr       =>  (ALU,  NONE, OP_BCREG,     NONE,       NONE,        NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
        INSN_bclr        =>  (ALU,  NONE, OP_BCREG,     NONE,       NONE,        NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
        INSN_bctar       =>  (ALU,  NONE, OP_BCREG,     NONE,       NONE,        NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
@@ -597,11 +599,11 @@ begin
        -- count cache or link stack.
        br_offset := (others => '0');
        case icode is
-            when INSN_b =>
+            when INSN_brel | INSN_babs =>
                -- Unconditional branches are always taken
                v.br_pred := '1';
                br_offset := signed(f_in.insn(25 downto 2));
-            when INSN_bc =>
+            when INSN_bcrel | INSN_bcabs =>
                -- Predict backward branches as taken, forward as untaken
                v.br_pred := f_in.insn(15);
                br_offset := resize(signed(f_in.insn(15 downto 2)), 24);
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -221,9 +221,8 @@ architecture behaviour of decode2 is
        OP_SHR      => "010",
        OP_EXTSWSLI => "010",
        OP_MUL_L64  => "011",           -- muldiv_result
-        OP_B        => "110",           -- next_nia
-        OP_BC       => "110",
-        OP_BCREG    => "110",
+        OP_BCREG    => "101",           -- ramspr_result
+        OP_RFID     => "101",
        OP_ADDG6S   => "111",           -- misc_result
        OP_ISEL     => "111",
        OP_DARN     => "111",
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -47,14 +47,16 @@ package decode_types is
        INSN_andi_dot, -- 10
        INSN_andis_dot,
        INSN_attn,
-        INSN_b,
-        INSN_bc,
+        INSN_brel,
+        INSN_babs,
+        INSN_bcrel,
+        INSN_bcabs,
        INSN_bcctr,
        INSN_bclr,
        INSN_bctar,
-        INSN_brh,
+        INSN_brh, -- 20
        INSN_brw,
-        INSN_brd, -- 20
+        INSN_brd,
        INSN_cbcdtd,
        INSN_cdtbcd,
        INSN_cmpi,
@@ -62,9 +64,9 @@ package decode_types is
        INSN_cntlzw,
        INSN_cntlzd,
        INSN_cnttzw,
-        INSN_cnttzd,
+        INSN_cnttzd, -- 30
        INSN_crand,
-        INSN_crandc, -- 30
+        INSN_crandc,
        INSN_creqv,
        INSN_crnand,
        INSN_crnor,
@@ -72,9 +74,9 @@ package decode_types is
        INSN_crorc,
        INSN_crxor,
        INSN_darn,
-        INSN_eieio,
+        INSN_eieio, -- 40
        INSN_extsb,
-        INSN_extsh, -- 40
+        INSN_extsh,
        INSN_extsw,
        INSN_extswsli,
        INSN_isync,
@@ -82,9 +84,9 @@ package decode_types is
        INSN_ld,
        INSN_ldu,
        INSN_lhau,
-        INSN_lwa,
+        INSN_lwa, -- 50
        INSN_lwzu,
-        INSN_mcrf, -- 50
+        INSN_mcrf,
        INSN_mcrxrx,
        INSN_mfcr,
        INSN_mfmsr,
@@ -92,9 +94,9 @@ package decode_types is
        INSN_mtcrf,
        INSN_mtmsr,
        INSN_mtmsrd,
-        INSN_mtspr,
+        INSN_mtspr, -- 60
        INSN_mulli,
-        INSN_neg, -- 60
+        INSN_neg,
        INSN_nop,
        INSN_ori,
        INSN_oris,
@@ -102,9 +104,9 @@ package decode_types is
        INSN_popcntw,
        INSN_popcntd,
        INSN_prtyw,
-        INSN_prtyd,
+        INSN_prtyd, -- 70
        INSN_rfid,
-        INSN_rldic, -- 70
+        INSN_rldic,
        INSN_rldicl,
        INSN_rldicr,
        INSN_rldimi,
@@ -112,9 +114,9 @@ package decode_types is
        INSN_rlwinm,
        INSN_rnop,
        INSN_sc,
-        INSN_setb,
+        INSN_setb, -- 80
        INSN_slbia,
-        INSN_sradi, -- 80
+        INSN_sradi,
        INSN_srawi,
        INSN_stbu,
        INSN_std,
@@ -122,9 +124,9 @@ package decode_types is
        INSN_sthu,
        INSN_stwu,
        INSN_subfic,
-        INSN_subfme,
+        INSN_subfme, -- 90
        INSN_subfze,
-        INSN_sync, -- 90
+        INSN_sync,
        INSN_tdi,
        INSN_tlbsync,
        INSN_twi,
@@ -132,7 +134,7 @@ package decode_types is
        INSN_xori,
        INSN_xoris,
        -- pad to 104
-        INSN_061, INSN_062, INSN_063, INSN_064, INSN_065, INSN_066, INSN_067,
+        INSN_063, INSN_064, INSN_065, INSN_066, INSN_067,

        -- Non-prefixed instructions that have a MLS:D prefixed form and
        -- their corresponding prefixed instructions.
@@ -497,8 +499,10 @@ package body decode_types is
            when INSN_andi_dot  => return "011100";
            when INSN_andis_dot => return "011101";
            when INSN_attn      => return "000000";
-            when INSN_b         => return "010010";
-            when INSN_bc        => return "010000";
+            when INSN_brel      => return "010010";
+            when INSN_babs      => return "010010";
+            when INSN_bcrel     => return "010000";
+            when INSN_bcabs     => return "010000";
            when INSN_brh       => return "011111";
            when INSN_brw       => return "011111";
            when INSN_brd       => return "011111";
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -95,6 +95,7 @@ architecture behaviour of execute1 is
        exception : std_ulogic;
        trap : std_ulogic;
        advance_nia : std_ulogic;
+        redir_to_next : std_ulogic;
        new_msr : std_ulogic_vector(63 downto 0);
        take_branch : std_ulogic;
        direct_branch : std_ulogic;
@@ -124,6 +125,9 @@ architecture behaviour of execute1 is
        res2_sel : std_ulogic_vector(1 downto 0);
        spr_select : spr_id;
        pmu_spr_num : std_ulogic_vector(4 downto 0);
+        redir_to_next : std_ulogic;
+        advance_nia : std_ulogic;
+        lr_from_next : std_ulogic;
 	mul_in_progress : std_ulogic;
        mul_finish : std_ulogic;
        div_in_progress : std_ulogic;
@@ -145,6 +149,7 @@ architecture behaviour of execute1 is
         prev_prefixed => '0',
         oe => '0', mul_select => "00", res2_sel => "00",
         spr_select => spr_id_init, pmu_spr_num => 5x"0",
+         redir_to_next => '0', advance_nia => '0', lr_from_next => '0',
         mul_in_progress => '0', mul_finish => '0', div_in_progress => '0',
         no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0',
         taken_branch_event => '0', br_mispredict => '0',
@@ -510,6 +515,7 @@ begin
        variable wr_addr : ramspr_index;
        variable even_wr_enab, odd_wr_enab : std_ulogic;
        variable even_wr_data, odd_wr_data : std_ulogic_vector(63 downto 0);
+        variable ramspr_even_data : std_ulogic_vector(63 downto 0);
        variable doit : std_ulogic;
    begin
        -- Read address mux and async RAM reading
@@ -533,11 +539,16 @@ begin
        else
            wr_addr := ex1.ramspr_wraddr;
        end if;
+        if ex1.lr_from_next = '1' then
+            ramspr_even_data := next_nia;
+        else
+            ramspr_even_data := ex1.e.write_data;
+        end if;
        if interrupt_in.intr = '1' then
            even_wr_data := ex2.e.last_nia;
            odd_wr_data := intr_srr1(ctrl.msr, interrupt_in.srr1);
        else
-            even_wr_data := ex1.e.write_data;
+            even_wr_data := ramspr_even_data;
            odd_wr_data := ex1.ramspr_odd_data;
        end if;
        ramspr_wr_addr <= wr_addr;
@@ -550,7 +561,7 @@ begin
        -- We assume no instruction executes in the cycle immediately following
        -- an interrupt, so we don't need to bypass interrupt data
        if ex1.se.ramspr_write_even = '1' and e_in.ramspr_even_rdaddr = ex1.ramspr_wraddr then
-            ramspr_even <= ex1.e.write_data;
+            ramspr_even <= ramspr_even_data;
        else
            ramspr_even <= even_rd_data;
        end if;
@@ -593,7 +604,6 @@ begin
        shortmul_result    when "011",
        muldiv_result      when "100",
        ramspr_result      when "101",
-        next_nia           when "110",
        misc_result        when others;

    execute1_0: process(clk)
@@ -1016,7 +1026,6 @@ begin
        v.e.mode_32bit := not ex1.msr(MSR_SF);
        v.e.instr_tag := e_in.instr_tag;
        v.e.last_nia := e_in.nia;
-        v.e.br_offset := 64x"4";

        v.se.ramspr_write_even := e_in.ramspr_write_even;
        v.se.ramspr_write_odd := e_in.ramspr_write_odd;
@@ -1114,8 +1123,6 @@ begin
                v.direct_branch := '1';
                v.e.br_last := '1';
                v.e.br_taken := '1';
-                v.e.br_offset := b_in;
-                v.e.abs_br := insn_aa(e_in.insn);
                if e_in.br_pred = '0' then
                    -- should never happen
                    v.e.redirect := '1';
@@ -1129,14 +1136,13 @@ begin
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
                v.take_branch := ppc_bc_taken(bo, bi, cr_in, ramspr_odd);
-                if v.take_branch = '1' then
-                    v.e.br_offset := b_in;
-                    v.e.abs_br := insn_aa(e_in.insn);
-                end if;
                -- Mispredicted branches cause a redirect
                if v.take_branch /= e_in.br_pred then
                    v.e.redirect := '1';
                end if;
+                if v.take_branch = '0' then
+                    v.redir_to_next := '1';
+                end if;
                v.direct_branch := '1';
                v.e.br_last := '1';
                v.e.br_taken := v.take_branch;
@@ -1150,10 +1156,6 @@ begin
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
                v.take_branch := ppc_bc_taken(bo, bi, cr_in, ramspr_odd);
-                if v.take_branch = '1' then
-                    v.e.br_offset := ramspr_result;
-                    v.e.abs_br := '1';
-                end if;
                -- Indirect branches are never predicted taken
                v.e.redirect := v.take_branch;
                v.e.br_taken := v.take_branch;
@@ -1177,8 +1179,6 @@ begin
                    v.new_msr(MSR_DR) := '1';
                end if;
                v.se.write_msr := '1';
-                v.e.br_offset := ramspr_result;
-                v.e.abs_br := '1';
                v.e.redirect := '1';
                v.se.write_cfar := '1';
                if HAS_FPU then
@@ -1292,6 +1292,7 @@ begin

 	    when OP_ISYNC =>
 		v.e.redirect := '1';
+                v.redir_to_next := '1';

 	    when OP_ICBI =>
 		v.se.icache_inval := '1';
@@ -1406,6 +1407,7 @@ begin
            v.mul_select := e_in.sub_select(1 downto 0);
            v.se := side_effect_init;
            v.ramspr_wraddr := e_in.ramspr_wraddr;
+            v.lr_from_next := e_in.lr;
            v.ramspr_odd_data := actions.ramspr_odd_data;
        end if;

@@ -1423,9 +1425,6 @@ begin

        irq_valid := ex1.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in);

-	-- Next insn adder used in a couple of places
-	next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4);
-
 	-- rotator control signals
 	right_shift <= '1' when e_in.insn_type = OP_SHR else '0';
 	rot_clear_left <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0';
@@ -1507,10 +1506,9 @@ begin
            x_to_divider.valid <= actions.start_div;
            v.div_in_progress := actions.start_div;
            v.br_mispredict := v.e.redirect and actions.direct_branch;
+            v.advance_nia := actions.advance_nia;
+            v.redir_to_next := actions.redir_to_next;
            exception := actions.trap;
-            if actions.advance_nia = '1' then
-                v.e.last_nia := next_nia;
-            end if;

            -- Go busy while division is happening because the
            -- divider is not pipelined.  Also go busy while a
@@ -1681,6 +1679,9 @@ begin
        variable sign, zero : std_ulogic;
        variable rcnz_hi, rcnz_lo : std_ulogic;
    begin
+	-- Next insn adder used in a couple of places
+	next_nia <= std_ulogic_vector(unsigned(ex1.e.last_nia) + 4);
+
 	v := ex2;
        if stage2_stall = '0' then
            v.e := ex1.e;
@@ -1688,6 +1689,9 @@ begin
            v.ext_interrupt := ex1.ext_interrupt;
            v.taken_branch_event := ex1.taken_branch_event;
            v.br_mispredict := ex1.br_mispredict;
+            if ex1.advance_nia = '1' then
+                v.e.last_nia := next_nia;
+            end if;
        end if;

        if ex1.se.mult_32s = '1' and ex1.oe = '1' then
@@ -1748,10 +1752,12 @@ begin
        else
            sprres := pmu_to_x.spr_val;
        end if;
-        if ex1.res2_sel(1) = '0' then
-            ex_result := rcresult;
-        else
+        if ex1.res2_sel(1) = '1' then
            ex_result := sprres;
+        elsif ex1.redir_to_next = '1' then
+            ex_result := next_nia;
+        else
+            ex_result := rcresult;
        end if;

        cr_res := ex1.e.write_cr_data;
--- a/predecode.vhdl
+++ b/predecode.vhdl
@@ -38,8 +38,38 @@ architecture behaviour of predecoder is
        2#011100_00000# to 2#011100_11111# =>  INSN_andi_dot,
        2#011101_00000# to 2#011101_11111# =>  INSN_andis_dot,
        2#000000_00000#                    =>  INSN_attn,
-        2#010010_00000# to 2#010010_11111# =>  INSN_b,
-        2#010000_00000# to 2#010000_11111# =>  INSN_bc,
+        2#010010_00000# to 2#010010_00001# =>  INSN_brel,
+        2#010010_00010# to 2#010010_00011# =>  INSN_babs,
+        2#010010_00100# to 2#010010_00101# =>  INSN_brel,
+        2#010010_00110# to 2#010010_00111# =>  INSN_babs,
+        2#010010_01000# to 2#010010_01001# =>  INSN_brel,
+        2#010010_01010# to 2#010010_01011# =>  INSN_babs,
+        2#010010_01100# to 2#010010_01101# =>  INSN_brel,
+        2#010010_01110# to 2#010010_01111# =>  INSN_babs,
+        2#010010_10000# to 2#010010_10001# =>  INSN_brel,
+        2#010010_10010# to 2#010010_10011# =>  INSN_babs,
+        2#010010_10100# to 2#010010_10101# =>  INSN_brel,
+        2#010010_10110# to 2#010010_10111# =>  INSN_babs,
+        2#010010_11000# to 2#010010_11001# =>  INSN_brel,
+        2#010010_11010# to 2#010010_11011# =>  INSN_babs,
+        2#010010_11100# to 2#010010_11101# =>  INSN_brel,
+        2#010010_11110# to 2#010010_11111# =>  INSN_babs,
+        2#010000_00000# to 2#010000_00001# =>  INSN_bcrel,
+        2#010000_00010# to 2#010000_00011# =>  INSN_bcabs,
+        2#010000_00100# to 2#010000_00101# =>  INSN_bcrel,
+        2#010000_00110# to 2#010000_00111# =>  INSN_bcabs,
+        2#010000_01000# to 2#010000_01001# =>  INSN_bcrel,
+        2#010000_01010# to 2#010000_01011# =>  INSN_bcabs,
+        2#010000_01100# to 2#010000_01101# =>  INSN_bcrel,
+        2#010000_01110# to 2#010000_01111# =>  INSN_bcabs,
+        2#010000_10000# to 2#010000_10001# =>  INSN_bcrel,
+        2#010000_10010# to 2#010000_10011# =>  INSN_bcabs,
+        2#010000_10100# to 2#010000_10101# =>  INSN_bcrel,
+        2#010000_10110# to 2#010000_10111# =>  INSN_bcabs,
+        2#010000_11000# to 2#010000_11001# =>  INSN_bcrel,
+        2#010000_11010# to 2#010000_11011# =>  INSN_bcabs,
+        2#010000_11100# to 2#010000_11101# =>  INSN_bcrel,
+        2#010000_11110# to 2#010000_11111# =>  INSN_bcabs,
        2#001011_00000# to 2#001011_11111# =>  INSN_cmpi,
        2#001010_00000# to 2#001010_11111# =>  INSN_cmpli,
        2#100010_00000# to 2#100010_11111# =>  INSN_lbz,
--- a/writeback.vhdl
+++ b/writeback.vhdl
@@ -174,11 +174,7 @@ begin
            f.big_endian := '0';
            f.mode_32bit := '0';
        else
-            if e_in.abs_br = '1' then
-                f.redirect_nia := e_in.br_offset;
-            else
-                f.redirect_nia := std_ulogic_vector(unsigned(e_in.last_nia) + unsigned(e_in.br_offset));
-            end if;
+            f.redirect_nia := e_in.write_data;
            -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1
            f.virt_mode := e_in.redir_mode(3);
            f.priv_mode := e_in.redir_mode(2);