From 535341961d1d4d5b6df98f4bf9c01ae0daf5d9bf Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 25 Jul 2020 18:23:11 +1000 Subject: [PATCH 01/14] multiplier: Generalize interface to the multiplier This makes the interface to the multiplier more general so an instance of it can be used in the FPU. It now has a 128-bit addend that is added on to the product. Instead of an input to negate the output, it now has a "not_result" input to complement the output. Execute1 uses not_result=1 and addend=-1 to get the effect of negating the output. The interface is defined this way because this is what can be done easily with the Xilinx DSP slices in xilinx-mult.vhdl. This also adds clock enable signals to the DSP slices, mostly for the sake of reducing power consumption. Signed-off-by: Paul Mackerras --- common.vhdl | 27 ++++++++++--------- execute1.vhdl | 9 ++++--- multiply.vhdl | 22 ++++++++-------- multiply_tb.vhdl | 31 +++++++++++++++------- xilinx-mult.vhdl | 68 ++++++++++++++++++++++++------------------------ 5 files changed, 86 insertions(+), 71 deletions(-) diff --git a/common.vhdl b/common.vhdl index 28b3434..e05720b 100644 --- a/common.vhdl +++ b/common.vhdl @@ -182,16 +182,25 @@ package common is is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), others => (others => '0')); - type Execute1ToMultiplyType is record + type MultiplyInputType is record valid: std_ulogic; data1: std_ulogic_vector(63 downto 0); data2: std_ulogic_vector(63 downto 0); + addend: std_ulogic_vector(127 downto 0); is_32bit: std_ulogic; - neg_result: std_ulogic; + not_result: std_ulogic; + end record; + constant MultiplyInputInit : MultiplyInputType := (valid => '0', + is_32bit => '0', not_result => '0', + 
others => (others => '0')); + + type MultiplyOutputType is record + valid: std_ulogic; + result: std_ulogic_vector(127 downto 0); + overflow : std_ulogic; end record; - constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0', - is_32bit => '0', neg_result => '0', - others => (others => '0')); + constant MultiplyOutputInit : MultiplyOutputType := (valid => '0', overflow => '0', + others => (others => '0')); type Execute1ToDividerType is record valid: std_ulogic; @@ -382,14 +391,6 @@ package common is write_cr_data => (others => '0'), write_reg => (others => '0'), exc_write_reg => (others => '0'), exc_write_data => (others => '0')); - type MultiplyToExecute1Type is record - valid: std_ulogic; - result: std_ulogic_vector(127 downto 0); - overflow : std_ulogic; - end record; - constant MultiplyToExecute1Init : MultiplyToExecute1Type := (valid => '0', overflow => '0', - others => (others => '0')); - type DividerToExecute1Type is record valid: std_ulogic; write_reg_data: std_ulogic_vector(63 downto 0); diff --git a/execute1.vhdl b/execute1.vhdl index 2722570..d48fee8 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -89,8 +89,8 @@ architecture behaviour of execute1 is signal countzero_result: std_ulogic_vector(63 downto 0); -- multiply signals - signal x_to_multiply: Execute1ToMultiplyType; - signal multiply_to_x: MultiplyToExecute1Type; + signal x_to_multiply: MultiplyInputType; + signal multiply_to_x: MultiplyOutputType; -- divider signals signal x_to_divider: Execute1ToDividerType; @@ -396,7 +396,7 @@ begin abs2 := - signed(b_in); end if; - x_to_multiply <= Execute1ToMultiplyInit; + x_to_multiply <= MultiplyInputInit; x_to_multiply.is_32bit <= e_in.is_32bit; x_to_divider <= Execute1ToDividerInit; @@ -406,7 +406,8 @@ begin x_to_divider.is_modulus <= '1'; end if; - x_to_multiply.neg_result <= sign1 xor sign2; + x_to_multiply.not_result <= sign1 xor sign2; + x_to_multiply.addend <= (others => sign1 xor sign2); x_to_divider.neg_result <= sign1 xor (sign2 
and not x_to_divider.is_modulus); if e_in.is_32bit = '0' then -- 64-bit forms diff --git a/multiply.vhdl b/multiply.vhdl index 7a4c81b..b737a46 100644 --- a/multiply.vhdl +++ b/multiply.vhdl @@ -12,22 +12,22 @@ entity multiply is port ( clk : in std_logic; - m_in : in Execute1ToMultiplyType; - m_out : out MultiplyToExecute1Type + m_in : in MultiplyInputType; + m_out : out MultiplyOutputType ); end entity multiply; architecture behaviour of multiply is - signal m: Execute1ToMultiplyType := Execute1ToMultiplyInit; + signal m: MultiplyInputType := MultiplyInputInit; type multiply_pipeline_stage is record valid : std_ulogic; data : unsigned(127 downto 0); is_32bit : std_ulogic; - neg_res : std_ulogic; + not_res : std_ulogic; end record; constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0', - is_32bit => '0', neg_res => '0', + is_32bit => '0', not_res => '0', data => (others => '0')); type multiply_pipeline_type is array(0 to PIPELINE_DEPTH-1) of multiply_pipeline_stage; @@ -53,19 +53,19 @@ begin variable d2 : std_ulogic_vector(63 downto 0); variable ov : std_ulogic; begin + v := r; v.multiply_pipeline(0).valid := m.valid; - v.multiply_pipeline(0).data := unsigned(m.data1) * unsigned(m.data2); + v.multiply_pipeline(0).data := (unsigned(m.data1) * unsigned(m.data2)) + unsigned(m.addend); v.multiply_pipeline(0).is_32bit := m.is_32bit; - v.multiply_pipeline(0).neg_res := m.neg_result; + v.multiply_pipeline(0).not_res := m.not_result; loop_0: for i in 1 to PIPELINE_DEPTH-1 loop v.multiply_pipeline(i) := r.multiply_pipeline(i-1); end loop; - if v.multiply_pipeline(PIPELINE_DEPTH-1).neg_res = '0' then - d := std_ulogic_vector(v.multiply_pipeline(PIPELINE_DEPTH-1).data); - else - d := std_ulogic_vector(- signed(v.multiply_pipeline(PIPELINE_DEPTH-1).data)); + d := std_ulogic_vector(v.multiply_pipeline(PIPELINE_DEPTH-1).data); + if v.multiply_pipeline(PIPELINE_DEPTH-1).not_res = '1' then + d := not d; end if; ov := '0'; diff --git a/multiply_tb.vhdl 
b/multiply_tb.vhdl index 87f029d..884b828 100644 --- a/multiply_tb.vhdl +++ b/multiply_tb.vhdl @@ -17,8 +17,8 @@ architecture behave of multiply_tb is constant pipeline_depth : integer := 4; - signal m1 : Execute1ToMultiplyType := Execute1ToMultiplyInit; - signal m2 : MultiplyToExecute1Type; + signal m1 : MultiplyInputType := MultiplyInputInit; + signal m2 : MultiplyOutputType; function absval(x: std_ulogic_vector) return std_ulogic_vector is begin @@ -45,6 +45,7 @@ begin stim_process: process variable ra, rb, rt, behave_rt: std_ulogic_vector(63 downto 0); variable si: std_ulogic_vector(15 downto 0); + variable sign: std_ulogic; begin wait for clk_period; @@ -90,7 +91,9 @@ begin m1.data1 <= absval(ra); m1.data2 <= absval(rb); - m1.neg_result <= ra(63) xor rb(63); + sign := ra(63) xor rb(63); + m1.not_result <= sign; + m1.addend <= (others => sign); m1.valid <= '1'; wait for clk_period; @@ -114,7 +117,8 @@ begin m1.data1 <= ra; m1.data2 <= rb; - m1.neg_result <= '0'; + m1.not_result <= '0'; + m1.addend <= (others => '0'); m1.valid <= '1'; wait for clk_period; @@ -138,7 +142,9 @@ begin m1.data1 <= absval(ra); m1.data2 <= absval(rb); - m1.neg_result <= ra(63) xor rb(63); + sign := ra(63) xor rb(63); + m1.not_result <= sign; + m1.addend <= (others => sign); m1.valid <= '1'; wait for clk_period; @@ -164,7 +170,9 @@ begin m1.data1(31 downto 0) <= absval(ra(31 downto 0)); m1.data2 <= (others => '0'); m1.data2(31 downto 0) <= absval(rb(31 downto 0)); - m1.neg_result <= ra(31) xor rb(31); + sign := ra(31) xor rb(31); + m1.not_result <= sign; + m1.addend <= (others => sign); m1.valid <= '1'; wait for clk_period; @@ -190,7 +198,9 @@ begin m1.data1(31 downto 0) <= absval(ra(31 downto 0)); m1.data2 <= (others => '0'); m1.data2(31 downto 0) <= absval(rb(31 downto 0)); - m1.neg_result <= ra(31) xor rb(31); + sign := ra(31) xor rb(31); + m1.not_result <= sign; + m1.addend <= (others => sign); m1.valid <= '1'; wait for clk_period; @@ -217,7 +227,8 @@ begin m1.data1(31 downto 0) <= 
ra(31 downto 0); m1.data2 <= (others => '0'); m1.data2(31 downto 0) <= rb(31 downto 0); - m1.neg_result <= '0'; + m1.not_result <= '0'; + m1.addend <= (others => '0'); m1.valid <= '1'; wait for clk_period; @@ -243,7 +254,9 @@ begin m1.data1 <= absval(ra); m1.data2 <= (others => '0'); m1.data2(15 downto 0) <= absval(si); - m1.neg_result <= ra(63) xor si(15); + sign := ra(63) xor si(15); + m1.not_result <= sign; + m1.addend <= (others => sign); m1.valid <= '1'; wait for clk_period; diff --git a/xilinx-mult.vhdl b/xilinx-mult.vhdl index 46366d6..4c60775 100644 --- a/xilinx-mult.vhdl +++ b/xilinx-mult.vhdl @@ -12,8 +12,8 @@ entity multiply is port ( clk : in std_logic; - m_in : in Execute1ToMultiplyType; - m_out : out MultiplyToExecute1Type + m_in : in MultiplyInputType; + m_out : out MultiplyOutputType ); end entity multiply; @@ -33,11 +33,11 @@ architecture behaviour of multiply is signal p1_pat, p1_patb : std_ulogic; signal req_32bit, r32_1 : std_ulogic; - signal req_neg, rneg_1 : std_ulogic; + signal req_not, rnot_1 : std_ulogic; signal valid_1 : std_ulogic; begin - addend <= (others => m_in.neg_result); + addend <= m_in.addend; m00: DSP48E1 generic map ( @@ -73,7 +73,7 @@ begin CECTRL => '0', CED => '0', CEINMODE => '0', - CEM => '1', + CEM => m_in.valid, CEP => '0', CLK => clk, D => (others => '0'), @@ -129,7 +129,7 @@ begin CECTRL => '0', CED => '0', CEINMODE => '0', - CEM => '1', + CEM => m_in.valid, CEP => '0', CLK => clk, D => (others => '0'), @@ -184,7 +184,7 @@ begin CECTRL => '0', CED => '0', CEINMODE => '0', - CEM => '1', + CEM => m_in.valid, CEP => '0', CLK => clk, D => (others => '0'), @@ -239,7 +239,7 @@ begin CECTRL => '0', CED => '0', CEINMODE => '0', - CEM => '1', + CEM => m_in.valid, CEP => '0', CLK => clk, D => (others => '0'), @@ -295,7 +295,7 @@ begin CECTRL => '0', CED => '0', CEINMODE => '0', - CEM => '1', + CEM => m_in.valid, CEP => '0', CLK => clk, D => (others => '0'), @@ -351,7 +351,7 @@ begin CECTRL => '0', CED => '0', CEINMODE => '0', - 
CEM => '1', + CEM => m_in.valid, CEP => '0', CLK => clk, D => (others => '0'), @@ -408,7 +408,7 @@ begin CECTRL => '0', CED => '0', CEINMODE => '0', - CEM => '1', + CEM => m_in.valid, CEP => '0', CLK => clk, D => (others => '0'), @@ -464,7 +464,7 @@ begin CECTRL => '0', CED => '0', CEINMODE => '0', - CEM => '1', + CEM => m_in.valid, CEP => '0', CLK => clk, D => (others => '0'), @@ -520,7 +520,7 @@ begin CECTRL => '0', CED => '0', CEINMODE => '0', - CEM => '1', + CEM => m_in.valid, CEP => '0', CLK => clk, D => (others => '0'), @@ -575,7 +575,7 @@ begin CECTRL => '0', CED => '0', CEINMODE => '0', - CEM => '1', + CEM => m_in.valid, CEP => '0', CLK => clk, D => (others => '0'), @@ -630,7 +630,7 @@ begin CECTRL => '0', CED => '0', CEINMODE => '0', - CEM => '1', + CEM => m_in.valid, CEP => '0', CLK => clk, D => (others => '0'), @@ -685,7 +685,7 @@ begin CECTRL => '0', CED => '0', CEINMODE => '0', - CEM => '1', + CEM => m_in.valid, CEP => '0', CLK => clk, D => (others => '0'), @@ -734,12 +734,12 @@ begin CARRYINSEL => "000", CARRYOUT => s0_carry, CEA1 => '0', - CEA2 => '1', + CEA2 => valid_1, CEAD => '0', CEALUMODE => '0', CEB1 => '0', - CEB2 => '1', - CEC => '1', + CEB2 => valid_1, + CEC => valid_1, CECARRYIN => '0', CECTRL => '0', CED => '0', @@ -792,12 +792,12 @@ begin CARRYIN => s0_carry(3), CARRYINSEL => "000", CEA1 => '0', - CEA2 => '1', + CEA2 => valid_1, CEAD => '0', CEALUMODE => '0', CEB1 => '0', - CEB2 => '1', - CEC => '1', + CEB2 => valid_1, + CEC => valid_1, CECARRYIN => '0', CECTRL => '0', CED => '0', @@ -848,7 +848,7 @@ begin port map ( A => m21_p(22 downto 0) & m03_p(5 downto 0) & '0', ACIN => (others => '0'), - ALUMODE => "00" & rneg_1 & '0', + ALUMODE => "00" & rnot_1 & '0', B => (others => '0'), BCIN => (others => '0'), C => p0_mask, @@ -857,12 +857,12 @@ begin CARRYINSEL => "000", CARRYOUT => p0_carry, CEA1 => '0', - CEA2 => '1', + CEA2 => valid_1, CEAD => '0', - CEALUMODE => '1', + CEALUMODE => valid_1, CEB1 => '0', - CEB2 => '1', - CEC => '1', + CEB2 
=> valid_1, + CEC => valid_1, CECARRYIN => '0', CECTRL => '0', CED => '0', @@ -911,7 +911,7 @@ begin port map ( A => x"0000000" & '0' & m21_p(41), ACIN => (others => '0'), - ALUMODE => "00" & rneg_1 & '0', + ALUMODE => "00" & rnot_1 & '0', B => m21_p(40 downto 23), BCIN => (others => '0'), C => (others => '0'), @@ -919,11 +919,11 @@ begin CARRYIN => p0_carry(3), CARRYINSEL => "000", CEA1 => '0', - CEA2 => '1', + CEA2 => valid_1, CEAD => '0', - CEALUMODE => '1', + CEALUMODE => valid_1, CEB1 => '0', - CEB2 => '1', + CEB2 => valid_1, CEC => '0', CECARRYIN => '0', CECTRL => '0', @@ -952,7 +952,7 @@ begin RSTP => '0' ); - product(31 downto 0) <= product_lo xor (31 downto 0 => req_neg); + product(31 downto 0) <= product_lo xor (31 downto 0 => req_not); mult_out: process(all) variable ov : std_ulogic; @@ -977,8 +977,8 @@ begin valid_1 <= m_in.valid; req_32bit <= r32_1; r32_1 <= m_in.is_32bit; - req_neg <= rneg_1; - rneg_1 <= m_in.neg_result; + req_not <= rnot_1; + rnot_1 <= m_in.not_result; end if; end process; From f1238299bd05f4ea3da4e5b8340fa6dde47304bb Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 27 Jul 2020 18:54:27 +1000 Subject: [PATCH 02/14] execute1: Take an extra cycle for OE=1 multiply instructions We now expect the overflow signal from the multiplier to come along one cycle later than the product. This breaks up a long combinatorial path and improves timing. This also changes some uses of v. to r. in the slow op logic, which should help timing as well. 
Signed-off-by: Paul Mackerras --- execute1.vhdl | 50 ++++++++++++++++++++++++++++++++---------------- multiply.vhdl | 6 +++++- xilinx-mult.vhdl | 5 ++++- 3 files changed, 43 insertions(+), 18 deletions(-) diff --git a/execute1.vhdl b/execute1.vhdl index d48fee8..1068306 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -56,6 +56,7 @@ architecture behaviour of execute1 is lr_update : std_ulogic; next_lr : std_ulogic_vector(63 downto 0); mul_in_progress : std_ulogic; + mul_finish : std_ulogic; div_in_progress : std_ulogic; cntz_in_progress : std_ulogic; slow_op_insn : insn_type_t; @@ -69,7 +70,7 @@ architecture behaviour of execute1 is constant reg_type_init : reg_type := (e => Execute1ToWritebackInit, f => Execute1ToFetch1Init, busy => '0', lr_update => '0', terminate => '0', - mul_in_progress => '0', div_in_progress => '0', cntz_in_progress => '0', + mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0', slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init, next_lr => (others => '0'), last_nia => (others => '0'), others => (others => '0')); @@ -371,6 +372,7 @@ begin v.mul_in_progress := '0'; v.div_in_progress := '0'; v.cntz_in_progress := '0'; + v.mul_finish := '0'; -- signals to multiply and divide units sign1 := '0'; @@ -965,31 +967,47 @@ begin when others => -- i.e. OP_MUL_L64 result := multiply_to_x.result(63 downto 0); - overflow := multiply_to_x.overflow; end case; else result := divider_to_x.write_reg_data; overflow := divider_to_x.overflow; end if; - result_en := '1'; - v.e.write_reg := gpr_to_gspr(v.slow_op_dest); - v.e.rc := v.slow_op_rc; - v.e.xerc := v.slow_op_xerc; - v.e.write_xerc_enable := v.slow_op_oe; - -- We must test oe because the RC update code in writeback - -- will use the xerc value to set CR0:SO so we must not clobber - -- xerc if OE wasn't set. 
- if v.slow_op_oe = '1' then - v.e.xerc.ov := overflow; - v.e.xerc.ov32 := overflow; - v.e.xerc.so := v.slow_op_xerc.so or overflow; - end if; - v.e.valid := '1'; + if r.mul_in_progress = '1' and r.slow_op_oe = '1' then + -- have to wait until next cycle for overflow indication + v.mul_finish := '1'; + v.busy := '1'; + else + result_en := '1'; + v.e.write_reg := gpr_to_gspr(r.slow_op_dest); + v.e.rc := r.slow_op_rc; + v.e.xerc := r.slow_op_xerc; + v.e.write_xerc_enable := r.slow_op_oe; + -- We must test oe because the RC update code in writeback + -- will use the xerc value to set CR0:SO so we must not clobber + -- xerc if OE wasn't set. + if r.slow_op_oe = '1' then + v.e.xerc.ov := overflow; + v.e.xerc.ov32 := overflow; + v.e.xerc.so := r.slow_op_xerc.so or overflow; + end if; + v.e.valid := '1'; + end if; else v.busy := '1'; v.mul_in_progress := r.mul_in_progress; v.div_in_progress := r.div_in_progress; end if; + elsif r.mul_finish = '1' then + result := r.e.write_data; + result_en := '1'; + v.e.write_reg := gpr_to_gspr(r.slow_op_dest); + v.e.rc := r.slow_op_rc; + v.e.xerc := r.slow_op_xerc; + v.e.write_xerc_enable := r.slow_op_oe; + v.e.xerc.ov := multiply_to_x.overflow; + v.e.xerc.ov32 := multiply_to_x.overflow; + v.e.xerc.so := r.slow_op_xerc.so or multiply_to_x.overflow; + v.e.valid := '1'; end if; if illegal = '1' then diff --git a/multiply.vhdl b/multiply.vhdl index b737a46..a7ca7ac 100644 --- a/multiply.vhdl +++ b/multiply.vhdl @@ -38,12 +38,15 @@ architecture behaviour of multiply is end record; signal r, rin : reg_type := (multiply_pipeline => MultiplyPipelineInit); + signal overflow : std_ulogic; + signal ovf_in : std_ulogic; begin multiply_0: process(clk) begin if rising_edge(clk) then m <= m_in; r <= rin; + overflow <= ovf_in; end if; end process; @@ -74,9 +77,10 @@ begin else ov := (or d(127 downto 63)) and not (and d(127 downto 63)); end if; + ovf_in <= ov; m_out.result <= d; - m_out.overflow <= ov; + m_out.overflow <= overflow; m_out.valid <= 
v.multiply_pipeline(PIPELINE_DEPTH-1).valid; rin <= v; diff --git a/xilinx-mult.vhdl b/xilinx-mult.vhdl index 4c60775..22d73c7 100644 --- a/xilinx-mult.vhdl +++ b/xilinx-mult.vhdl @@ -35,6 +35,7 @@ architecture behaviour of multiply is signal req_32bit, r32_1 : std_ulogic; signal req_not, rnot_1 : std_ulogic; signal valid_1 : std_ulogic; + signal overflow, ovf_in : std_ulogic; begin addend <= m_in.addend; @@ -964,9 +965,10 @@ begin ov := not ((p1_pat and p0_pat and not product(31)) or (p1_patb and p0_patb and product(31))); end if; + ovf_in <= ov; m_out.result <= product; - m_out.overflow <= ov; + m_out.overflow <= overflow; end process; process(clk) @@ -979,6 +981,7 @@ begin r32_1 <= m_in.is_32bit; req_not <= rnot_1; rnot_1 <= m_in.not_result; + overflow <= ovf_in; end if; end process; From 081684273e11fee3d3ec14bd97e8d858e1d29402 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 28 Jul 2020 12:09:02 +1000 Subject: [PATCH 03/14] execute1: Use r. not v. in countzero code This simplifies logic and improves timing. 
Signed-off-by: Paul Mackerras --- execute1.vhdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/execute1.vhdl b/execute1.vhdl index 1068306..edddc4a 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -949,9 +949,9 @@ begin -- cnt[lt]z always takes two cycles result := countzero_result; result_en := '1'; - v.e.write_reg := gpr_to_gspr(v.slow_op_dest); - v.e.rc := v.slow_op_rc; - v.e.xerc := v.slow_op_xerc; + v.e.write_reg := gpr_to_gspr(r.slow_op_dest); + v.e.rc := r.slow_op_rc; + v.e.xerc := r.slow_op_xerc; v.e.valid := '1'; elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or From 0fb89672902bbe7c56c419b57a5e96fc1d5cb202 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 3 Aug 2020 10:08:33 +1000 Subject: [PATCH 04/14] core: Implement the TAR register and the bctar instruction Signed-off-by: Paul Mackerras --- common.vhdl | 3 +++ decode1.vhdl | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/common.vhdl b/common.vhdl index e05720b..bd9210b 100644 --- a/common.vhdl +++ b/common.vhdl @@ -26,6 +26,7 @@ package common is constant SPR_XER : spr_num_t := 1; constant SPR_LR : spr_num_t := 8; constant SPR_CTR : spr_num_t := 9; + constant SPR_TAR : spr_num_t := 815; constant SPR_DSISR : spr_num_t := 18; constant SPR_DAR : spr_num_t := 19; constant SPR_TB : spr_num_t := 268; @@ -459,6 +460,8 @@ package body common is n := 11; when SPR_XER => n := 12; + when SPR_TAR => + n := 13; when others => n := 0; return "000000"; diff --git a/decode1.vhdl b/decode1.vhdl index f553e2d..5677917 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -94,7 +94,7 @@ architecture behaviour of decode1 is 2#1100000010# => '1', 2#1100100010# => '1', 2#1101000010# => '1', 2#1101100010# => '1', 2#1110000010# => '1', 2#1110100010# => '1', 2#1111000010# => '1', 2#1111100010# => '1', 2#1000010000# => '1', -- bcctr 2#0000010000# => '1', -- bclr - 2#1000110000# => '0', -- bctar + 
2#1000110000# => '1', -- bctar 2#0100000001# => '1', -- crand 2#0010000001# => '1', -- crandc 2#0100100001# => '1', -- creqv @@ -467,11 +467,12 @@ begin if f_in.insn(23) = '0' then v.ispr1 := fast_spr_num(SPR_CTR); end if; - -- TODO: Add TAR if f_in.insn(10) = '0' then v.ispr2 := fast_spr_num(SPR_LR); - else + elsif f_in.insn(6) = '0' then v.ispr2 := fast_spr_num(SPR_CTR); + else + v.ispr2 := fast_spr_num(SPR_TAR); end if; else -- Could be OP_RFID From fa77a6f683c8749da89c8e8c12b8b49cb44c2aa6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 3 Aug 2020 10:29:46 +1000 Subject: [PATCH 05/14] core: Implement the mcrxrx instruction This also removes OP_MCRXR, as the mcrxr instruction was removed in version 3.0B of the Power ISA, having been phased-out for the server architecture since v2.02. Signed-off-by: Paul Mackerras --- decode1.vhdl | 3 +-- decode_types.vhdl | 2 +- execute1.vhdl | 8 ++++++++ scripts/fmt_log/fmt_log.c | 8 ++++---- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 5677917..1c0ee3d 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -254,8 +254,7 @@ architecture behaviour of decode1 is 2#1100010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwzcix 2#0000110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lwzux 2#0000010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwzx - -- 2#1000000000# mcrxr - -- 2#1001000000# mcrxrx + 2#1001000000# => (ALU, OP_MCRXRX, NONE, NONE, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mcrxrx 2#0000010011# => (ALU, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf 2#0001010011# => 
(ALU, OP_MFMSR, NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- mfmsr 2#0101010011# => (ALU, OP_MFSPR, SPR, NONE, RS, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfspr diff --git a/decode_types.vhdl b/decode_types.vhdl index 9cd6d69..fac593e 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -10,7 +10,7 @@ package decode_types is OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS, OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC, OP_LOAD, OP_STORE, OP_MADDHD, OP_MADDHDU, OP_MADDLD, - OP_MCRXR, OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MOD, + OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MOD, OP_MTCRF, OP_MTMSRD, OP_MTSPR, OP_MUL_L64, OP_MUL_H64, OP_MUL_H32, OP_OR, OP_POPCNT, OP_PRTY, OP_RFID, diff --git a/execute1.vhdl b/execute1.vhdl index edddc4a..0bce696 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -739,6 +739,14 @@ begin end if; end loop; end if; + when OP_MCRXRX => + newcrf := v.e.xerc.ov & v.e.xerc.ov32 & v.e.xerc.ca & v.e.xerc.ca32; + bf := insn_bf(e_in.insn); + crnum := to_integer(unsigned(bf)); + v.e.write_cr_enable := '1'; + v.e.write_cr_mask := num_to_fxm(crnum); + v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & + newcrf & newcrf & newcrf & newcrf; when OP_MFMSR => result := ctrl.msr; result_en := '1'; diff --git a/scripts/fmt_log/fmt_log.c b/scripts/fmt_log/fmt_log.c index c8fb501..3a003f7 100644 --- a/scripts/fmt_log/fmt_log.c +++ b/scripts/fmt_log/fmt_log.c @@ -91,10 +91,10 @@ const char *ops[64] = "bperm ", "cmp ", "cmpb ", "cmpeqb ", "cmprb ", "cntz ", "crop ", "darn ", "dcbf ", "dcbst ", "dcbt ", "dcbtst ", "dcbz ", "div ", "dive ", "exts ", "extswsl", "icbi ", "icbt ", "isel ", "isync ", "ld ", "st ", "maddhd ", - "maddhdu", "maddld ", "mcrxr ", "mcrxrx ", "mfcr ", "mfmsr ", "mfspr ", "mod ", - "mtcrf ", "mtmsr ", "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", "or ", "popcnt ", - "prty ", "rfid ", "rlc ", "rlcl ", "rlcr ", "sc ", "setb ", "shl
", - "shr ", "sync ", "tlbie ", "trap ", "xor ", "ffail ", "?62 ", "?63 " + "maddhdu", "maddld ", "mcrxrx ", "mfcr ", "mfmsr ", "mfspr ", "mod ", "mtcrf ", + "mtmsr ", "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", "or ", "popcnt ", "prty ", + "rfid ", "rlc ", "rlcl ", "rlcr ", "sc ", "setb ", "shl ", "shr ", + "sync ", "tlbie ", "trap ", "xor ", "ffail ", "?61 ", "?62 ", "?63 " }; const char *spr_names[13] = From cce34039c3f9170c818c303aa7b9b31ffa783d6d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 3 Aug 2020 14:31:58 +1000 Subject: [PATCH 06/14] core: Implement the setb instruction Signed-off-by: Paul Mackerras --- decode1.vhdl | 2 +- execute1.vhdl | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/decode1.vhdl b/decode1.vhdl index 1c0ee3d..1199bae 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -289,7 +289,7 @@ architecture behaviour of decode1 is 2#0101111010# => (ALU, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntw 2#0010111010# => (ALU, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyd 2#0010011010# => (ALU, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyw - -- 2#0010000000# setb + 2#0010000000# => (ALU, OP_SETB, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- setb 2#0111110010# => (LDST, OP_TLBIE, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- slbia 2#0000011011# => (ALU, OP_SHL, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- sld 2#0000011000# => (ALU, OP_SHL, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- slw diff --git a/execute1.vhdl b/execute1.vhdl index 0bce696..c5e1e3e 
100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -875,6 +875,15 @@ begin set_carry(v.e, rotator_carry, rotator_carry); end if; result_en := '1'; + when OP_SETB => + crbit := to_integer(unsigned(insn_bfa(e_in.insn))) * 4; + result := (others => '0'); + if cr_in(31 - crbit) = '1' then + result := (others => '1'); + elsif cr_in(30 - crbit) = '1' then + result(0) := '1'; + end if; + result_en := '1'; when OP_ISYNC => v.f.redirect := '1'; From b739372f7e3d12499a12002e44ca16d006b0d1e1 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 3 Aug 2020 14:45:19 +1000 Subject: [PATCH 07/14] core: Implement the bpermd instruction Signed-off-by: Paul Mackerras --- decode1.vhdl | 2 +- execute1.vhdl | 2 +- logical.vhdl | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 1199bae..159df1d 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -158,7 +158,7 @@ architecture behaviour of decode1 is 2#1011001010# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addzeo 2#0000011100# => (ALU, OP_AND, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- and 2#0000111100# => (ALU, OP_AND, NONE, RB, RS, RA, '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- andc - -- 2#0011111100# bperm + 2#0011111100# => (ALU, OP_BPERM, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- bperm 2#0000000000# => (ALU, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmp 2#0111111100# => (ALU, OP_CMPB, NONE, RB, RS, RA, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpb -- 2#0011100000# cmpeqb diff --git a/execute1.vhdl b/execute1.vhdl index c5e1e3e..cb9b13d 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -633,7 +633,7 @@ begin end if; end
if; end if; - when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS => + when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS | OP_BPERM => result := logical_result; result_en := '1'; when OP_B => diff --git a/logical.vhdl b/logical.vhdl index 0f53544..2df66dc 100644 --- a/logical.vhdl +++ b/logical.vhdl @@ -35,11 +35,13 @@ architecture behaviour of logical is signal par0, par1 : std_ulogic; signal popcnt : std_ulogic_vector(63 downto 0); signal parity : std_ulogic_vector(63 downto 0); + signal permute : std_ulogic_vector(7 downto 0); begin logical_0: process(all) variable rb_adj, tmp : std_ulogic_vector(63 downto 0); variable negative : std_ulogic; + variable j : integer; begin -- population counts for i in 0 to 31 loop @@ -81,6 +83,16 @@ begin parity(32) <= par1; end if; + -- bit permutation; the index selects an RB bit in big-endian order, hence the "not" + for i in 0 to 7 loop + j := i * 8; + if rs(j+7 downto j+6) = "00" then + permute(i) <= rb(to_integer(unsigned(not rs(j+5 downto j)))); + else + permute(i) <= '0'; + end if; + end loop; + rb_adj := rb; if invert_in = '1' then rb_adj := not rb; @@ -106,6 +118,8 @@ begin tmp := parity; when OP_CMPB => tmp := ppc_cmpb(rs, rb); + when OP_BPERM => + tmp := std_ulogic_vector(resize(unsigned(permute), 64)); when others => -- EXTS -- note datalen is a 1-hot encoding From 8edfbf638b8c4704e1184618405061590cee684b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 3 Aug 2020 22:30:23 +1000 Subject: [PATCH 08/14] core: Implement the cmpeqb and cmprb instructions Signed-off-by: Paul Mackerras --- decode1.vhdl | 4 ++-- execute1.vhdl | 16 ++++++++++++++++ ppc_fx_insns.vhdl | 30 ++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 159df1d..a95cfad 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -161,9 +161,9 @@ architecture behaviour of decode1 is 2#0011111100# => (ALU, OP_BPERM, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0',
'0'), -- bperm 2#0000000000# => (ALU, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmp 2#0111111100# => (ALU, OP_CMPB, NONE, RB, RS, RA, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpb - -- 2#0011100000# cmpeqb + 2#0011100000# => (ALU, OP_CMPEQB, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpeqb 2#0000100000# => (ALU, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpl - -- 2#0011000000# cmprb + 2#0011000000# => (ALU, OP_CMPRB, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmprb 2#0000111010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- cntlzd 2#0000011010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- cntlzw 2#1000111010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- cnttzd diff --git a/execute1.vhdl b/execute1.vhdl index cb9b13d..b836e33 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -633,6 +633,22 @@ begin end if; end if; end if; + when OP_CMPRB => + newcrf := ppc_cmprb(a_in, b_in, insn_l(e_in.insn)); + bf := insn_bf(e_in.insn); + crnum := to_integer(unsigned(bf)); + v.e.write_cr_enable := '1'; + v.e.write_cr_mask := num_to_fxm(crnum); + v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & + newcrf & newcrf & newcrf & newcrf; + when OP_CMPEQB => + newcrf := ppc_cmpeqb(a_in, b_in); + bf := insn_bf(e_in.insn); + crnum := to_integer(unsigned(bf)); + v.e.write_cr_enable := '1'; + v.e.write_cr_mask := num_to_fxm(crnum); + v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & + newcrf & newcrf & newcrf & newcrf; when OP_AND | OP_OR 
| OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS | OP_BPERM => result := logical_result; result_en := '1'; diff --git a/ppc_fx_insns.vhdl b/ppc_fx_insns.vhdl index 5fdf1c7..c34a884 100644 --- a/ppc_fx_insns.vhdl +++ b/ppc_fx_insns.vhdl @@ -87,6 +87,8 @@ package ppc_fx_insns is so: std_ulogic) return std_ulogic_vector; function ppc_cmpb (rs, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; + function ppc_cmpeqb (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; + function ppc_cmprb (ra, rb: std_ulogic_vector(63 downto 0); l: std_ulogic) return std_ulogic_vector; function ppc_divw (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; function ppc_divdu (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; @@ -746,6 +748,34 @@ package body ppc_fx_insns is return ret; end; + function ppc_cmpeqb (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is + variable match: std_ulogic; + variable j: integer; + begin + match := '0'; + for i in 0 to 7 loop + j := i * 8; + if ra(7 downto 0) = rb(j + 7 downto j) then + match := '1'; + end if; + end loop; + return '0' & match & "00"; + end; + + function ppc_cmprb (ra, rb: std_ulogic_vector(63 downto 0); l: std_ulogic) return std_ulogic_vector is + variable match: std_ulogic; + variable v: unsigned(7 downto 0); + begin + match := '0'; + v := unsigned(ra(7 downto 0)); + if v >= unsigned(rb(7 downto 0)) and v <= unsigned(rb(15 downto 8)) then + match := '1'; + elsif l = '1' and v >= unsigned(rb(23 downto 16)) and v <= unsigned(rb(31 downto 24)) then + match := '1'; + end if; + return '0' & match & "00"; + end; + -- Not synthesizable function ppc_divw (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is variable tmp: signed(31 downto 0); From 290b05f97da6255734b4ff3c7c7a913cf99301f9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 4 Aug 2020 20:02:30 +1000 Subject: [PATCH 09/14] core: Implement the maddhd, maddhdu and maddld instructions 
These instructions use major opcode 4 and have a third GPR input operand, so we need a decode table for major opcode 4 and some plumbing to get the RC register operand read. The multiply-add instructions use the same insn_type_t values as the regular multiply instructions, and we distinguish in execute1 by looking at the major opcode. This turns out to be convenient because we don't have to add any cases in the code that handles the output of the multiplier, and it frees up some insn_type_t values. Signed-off-by: Paul Mackerras --- decode1.vhdl | 30 ++++++++++++++++++++++++++++++ decode2.vhdl | 5 ++++- decode_types.vhdl | 4 ++-- execute1.vhdl | 15 ++++++++++++++- insn_helpers.vhdl | 6 ++++++ scripts/fmt_log/fmt_log.c | 10 +++++----- 6 files changed, 61 insertions(+), 9 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index a95cfad..eceee40 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -34,6 +34,8 @@ architecture behaviour of decode1 is subtype major_opcode_t is unsigned(5 downto 0); type major_rom_array_t is array(0 to 63) of decode_rom_t; type minor_valid_array_t is array(0 to 1023) of std_ulogic; + type minor_valid_array_2t is array(0 to 2047) of std_ulogic; + type op_4_subop_array_t is array(0 to 63) of decode_rom_t; type op_19_subop_array_t is array(0 to 7) of decode_rom_t; type op_30_subop_array_t is array(0 to 15) of decode_rom_t; type op_31_subop_array_t is array(0 to 1023) of decode_rom_t; @@ -85,6 +87,24 @@ architecture behaviour of decode1 is others => illegal_inst ); + -- indexed by bits 5..0 and 10..6 of instruction word + constant decode_op_4_valid : minor_valid_array_2t := ( + 2#11000000000# to 2#11000011111# => '1', -- maddhd + 2#11000100000# to 2#11000111111# => '1', -- maddhdu + 2#11001100000# to 2#11001111111# => '1', -- maddld + others => '0' + ); + + -- indexed by bits 5..0 of instruction word + constant decode_op_4_array : op_4_subop_array_t := ( + -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk 
sgl + -- op in out A out in out len ext pipe + 2#110000# => (ALU, OP_MUL_H64, RA, RB, RCR, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- maddhd + 2#110001# => (ALU, OP_MUL_H64, RA, RB, RCR, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- maddhdu + 2#110011# => (ALU, OP_MUL_L64, RA, RB, RCR, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- maddld + others => decode_rom_init + ); + -- indexed by bits 10..1 of instruction word constant decode_op_19_valid : minor_valid_array_t := ( -- addpcis, 5 upper bits are part of constant @@ -390,6 +410,7 @@ begin variable v : Decode1ToDecode2Type; variable f : Decode1ToFetch1Type; variable majorop : major_opcode_t; + variable minor4op : std_ulogic_vector(10 downto 0); variable op_19_bits: std_ulogic_vector(2 downto 0); variable sprn : spr_num_t; variable br_nia : std_ulogic_vector(61 downto 0); @@ -418,6 +439,15 @@ begin end if; v.decode := fetch_fail_inst; + elsif majorop = "000100" then + -- major opcode 4, mostly VMX/VSX stuff but also some integer ops (madd*) + minor4op := f_in.insn(5 downto 0) & f_in.insn(10 downto 6); + if decode_op_4_valid(to_integer(unsigned(minor4op))) = '1' then + v.decode := decode_op_4_array(to_integer(unsigned(f_in.insn(5 downto 0)))); + else + v.decode := illegal_inst; + end if; + elsif majorop = "011111" then -- major opcode 31, lots of things v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1)))); diff --git a/decode2.vhdl b/decode2.vhdl index 62c574c..b1531f1 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -135,6 +135,8 @@ architecture behaviour of decode2 is case t is when RS => return ('1', gpr_to_gspr(insn_rs(insn_in)), reg_data); + when RCR => + return ('1', gpr_to_gspr(insn_rcreg(insn_in)), reg_data); when NONE => return ('0', (others => '0'), (others => '0')); end case; @@ -282,7 +284,8 @@ begin else gpr_to_gspr(insn_ra(d_in.insn)); 
r_out.read2_reg <= d_in.ispr2 when d_in.decode.input_reg_b = SPR else gpr_to_gspr(insn_rb(d_in.insn)); - r_out.read3_reg <= insn_rs(d_in.insn); + r_out.read3_reg <= insn_rcreg(d_in.insn) when d_in.decode.input_reg_c = RCR + else insn_rs(d_in.insn); c_out.read <= d_in.decode.input_cr; diff --git a/decode_types.vhdl b/decode_types.vhdl index fac593e..e5ae8c1 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -9,7 +9,7 @@ package decode_types is OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST, OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS, OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC, - OP_LOAD, OP_STORE, OP_MADDHD, OP_MADDHDU, OP_MADDLD, + OP_LOAD, OP_STORE, OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MOD, OP_MTCRF, OP_MTMSRD, OP_MTSPR, OP_MUL_L64, OP_MUL_H64, OP_MUL_H32, OP_OR, @@ -23,7 +23,7 @@ package decode_types is type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA); type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR); - type input_reg_c_t is (NONE, RS); + type input_reg_c_t is (NONE, RS, RCR); type output_reg_a_t is (NONE, RT, RA, SPR); type rc_t is (NONE, ONE, RC); type carry_in_t is (ZERO, CA, ONE); diff --git a/execute1.vhdl b/execute1.vhdl index b836e33..a620a50 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -309,6 +309,7 @@ begin variable taken_branch : std_ulogic; variable abs_branch : std_ulogic; variable spr_val : std_ulogic_vector(63 downto 0); + variable addend : std_ulogic_vector(127 downto 0); begin result := (others => '0'); result_with_carry := (others => '0'); @@ -408,8 +409,20 @@ begin x_to_divider.is_modulus <= '1'; end if; + addend := (others => '0'); + if e_in.insn(26) = '0' then + -- integer multiply-add, major op 4 (if it is a multiply) + addend(63 downto 0) := c_in; + if e_in.is_signed = '1' then + addend(127 downto 64) := (others => c_in(63)); + end if; + end if; + if (sign1 xor sign2) = '1' then + addend := not addend; + end if; 
+ x_to_multiply.not_result <= sign1 xor sign2; - x_to_multiply.addend <= (others => sign1 xor sign2); + x_to_multiply.addend <= addend; x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus); if e_in.is_32bit = '0' then -- 64-bit forms diff --git a/insn_helpers.vhdl b/insn_helpers.vhdl index acd2f72..592acb0 100644 --- a/insn_helpers.vhdl +++ b/insn_helpers.vhdl @@ -6,6 +6,7 @@ package insn_helpers is function insn_rt (insn_in : std_ulogic_vector) return std_ulogic_vector; function insn_ra (insn_in : std_ulogic_vector) return std_ulogic_vector; function insn_rb (insn_in : std_ulogic_vector) return std_ulogic_vector; + function insn_rcreg (insn_in : std_ulogic_vector) return std_ulogic_vector; function insn_si (insn_in : std_ulogic_vector) return std_ulogic_vector; function insn_ui (insn_in : std_ulogic_vector) return std_ulogic_vector; function insn_l (insn_in : std_ulogic_vector) return std_ulogic; @@ -59,6 +60,11 @@ package body insn_helpers is return insn_in(15 downto 11); end; + function insn_rcreg (insn_in : std_ulogic_vector) return std_ulogic_vector is + begin + return insn_in(10 downto 6); + end; + function insn_si (insn_in : std_ulogic_vector) return std_ulogic_vector is begin return insn_in(15 downto 0); diff --git a/scripts/fmt_log/fmt_log.c b/scripts/fmt_log/fmt_log.c index 3a003f7..9b6775b 100644 --- a/scripts/fmt_log/fmt_log.c +++ b/scripts/fmt_log/fmt_log.c @@ -90,11 +90,11 @@ const char *ops[64] = "illegal", "nop ", "add ", "and ", "attn ", "b ", "bc ", "bcreg ", "bperm ", "cmp ", "cmpb ", "cmpeqb ", "cmprb ", "cntz ", "crop ", "darn ", "dcbf ", "dcbst ", "dcbt ", "dcbtst ", "dcbz ", "div ", "dive ", "exts ", - "extswsl", "icbi ", "icbt ", "isel ", "isync ", "ld ", "st ", "maddhd ", - "maddhdu", "maddld ", "mcrxrx ", "mfcr ", "mfmsr ", "mfspr ", "mod ", "mtcrf ", - "mtmsr ", "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", "or ", "popcnt ", "prty ", - "rfid ", "rlc ", "rlcl ", "rlcr ", "sc ", "setb ", "shl ", "shr ", - "sync ", 
"tlbie ", "trap ", "xor ", "ffail ", "?61 ", "?62 ", "?63 " + "extswsl", "icbi ", "icbt ", "isel ", "isync ", "ld ", "st ", "mcrxrx ", + "mfcr ", "mfmsr ", "mfspr ", "mod ", "mtcrf ", "mtmsr ", "mtspr ", "mull64 ", + "mulh64 ", "mulh32 ", "or ", "popcnt ", "prty ", "rfid ", "rlc ", "rlcl ", + "rlcr ", "sc ", "setb ", "shl ", "shr ", "sync ", "tlbie ", "trap ", + "xor ", "ffail ", "?58 ", "?59 ", "?60 ", "?61 ", "?62 ", "?63 " }; const char *spr_names[13] = From 1a7aebeef80be3ae31e844aa83c9d7521cce8bed Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 5 Aug 2020 15:28:45 +1000 Subject: [PATCH 10/14] Add random number generator and implement the darn instruction This adds a true random number generator for the Xilinx FPGAs which uses a set of chaotic ring oscillators to generate random bits and then passes them through a Linear Hybrid Cellular Automaton (LHCA) to remove bias, as described in "High Speed True Random Number Generators in Xilinx FPGAs" by Catalin Baetoniu of Xilinx Inc., in: https://pdfs.semanticscholar.org/83ac/9e9c1bb3dad5180654984604c8d5d8137412.pdf This requires adding a .xdc file to tell vivado that the combinatorial loops that form the ring oscillators are intentional. The same code should work on other FPGAs as well if their tools can be told to accept the combinatorial loops. For simulation, the random.vhdl module gets compiled in, which uses the pseudorand() function to generate random numbers. Synthesis using yosys uses nonrandom.vhdl, which always signals an error, causing darn to return 0xffff_ffff_ffff_ffff. This adds an implementation of the darn instruction. Darn can return either raw or conditioned random numbers. On Xilinx FPGAs, reading a raw random number gives the output of the ring oscillators, and reading a conditioned random number gives the output of the LHCA. 
Signed-off-by: Paul Mackerras --- Makefile | 6 +++-- decode1.vhdl | 2 +- execute1.vhdl | 32 ++++++++++++++++++++++++++ fpga/fpga-random.vhdl | 53 +++++++++++++++++++++++++++++++++++++++++++ fpga/fpga-random.xdc | 3 +++ microwatt.core | 2 ++ nonrandom.vhdl | 22 ++++++++++++++++++ random.vhdl | 30 ++++++++++++++++++++++++ 8 files changed, 147 insertions(+), 3 deletions(-) create mode 100644 fpga/fpga-random.vhdl create mode 100644 fpga/fpga-random.xdc create mode 100644 nonrandom.vhdl create mode 100644 random.vhdl diff --git a/Makefile b/Makefile index 096be56..b584895 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,8 @@ uart_files = $(wildcard uart16550/*.v) soc_sim_files = $(soc_files) sim_console.vhdl sim_pp_uart.vhdl sim_bram_helpers.vhdl \ sim_bram.vhdl sim_jtag_socket.vhdl sim_jtag.vhdl dmi_dtm_xilinx.vhdl \ - sim_16550_uart.vhdl + sim_16550_uart.vhdl \ + random.vhdl glibc_random.vhdl glibc_random_helpers.vhdl soc_sim_c_files = sim_vhpi_c.c sim_bram_helpers_c.c sim_console_c.c \ sim_jtag_socket_c.c @@ -177,7 +178,8 @@ toplevel=fpga/top-generic.vhdl dmi_dtm=dmi_dtm_dummy.vhdl fpga_files = $(core_files) $(soc_files) fpga/soc_reset.vhdl \ - fpga/pp_fifo.vhd fpga/pp_soc_uart.vhd fpga/main_bram.vhdl + fpga/pp_fifo.vhd fpga/pp_soc_uart.vhd fpga/main_bram.vhdl \ + nonrandom.vhdl synth_files = $(core_files) $(soc_files) $(fpga_files) $(clkgen) $(toplevel) $(dmi_dtm) diff --git a/decode1.vhdl b/decode1.vhdl index eceee40..a58525e 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -188,7 +188,7 @@ architecture behaviour of decode1 is 2#0000011010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- cntlzw 2#1000111010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- cnttzd 2#1000011010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- cnttzw - -- 
2#1011110011# darn + 2#1011110011# => (ALU, OP_DARN, NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- darn 2#0001010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbf 2#0000110110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbst 2#0100010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbt diff --git a/execute1.vhdl b/execute1.vhdl index a620a50..a53024f 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -97,6 +97,11 @@ architecture behaviour of execute1 is signal x_to_divider: Execute1ToDividerType; signal divider_to_x: DividerToExecute1Type; + -- random number generator signals + signal random_raw : std_ulogic_vector(63 downto 0); + signal random_cond : std_ulogic_vector(63 downto 0); + signal random_err : std_ulogic; + -- signals for logging signal exception_log : std_ulogic; signal irq_valid_log : std_ulogic; @@ -185,6 +190,11 @@ architecture behaviour of execute1 is return msr_out; end; + -- Tell vivado to keep the hierarchy for the random module so that the + -- net names in the xdc file match. 
+ attribute keep_hierarchy : string; + attribute keep_hierarchy of random_0 : label is "yes"; + begin rotator_0: entity work.rotator @@ -238,6 +248,14 @@ begin d_out => divider_to_x ); + random_0: entity work.random + port map ( + clk => clk, + data => random_cond, + raw => random_raw, + err => random_err + ); + dbg_msr_out <= ctrl.msr; log_rd_addr <= r.log_addr_spr; @@ -776,6 +794,20 @@ begin v.e.write_cr_mask := num_to_fxm(crnum); v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf; + when OP_DARN => + if random_err = '0' then + case e_in.insn(17 downto 16) is + when "00" => + result := x"00000000" & random_cond(31 downto 0); + when "10" => + result := random_raw; + when others => + result := random_cond; + end case; + else + result := (others => '1'); + end if; + result_en := '1'; when OP_MFMSR => result := ctrl.msr; result_en := '1'; diff --git a/fpga/fpga-random.vhdl b/fpga/fpga-random.vhdl new file mode 100644 index 0000000..7897c05 --- /dev/null +++ b/fpga/fpga-random.vhdl @@ -0,0 +1,53 @@ +-- Random number generator for Microwatt +-- Based on https://pdfs.semanticscholar.org/83ac/9e9c1bb3dad5180654984604c8d5d8137412.pdf +-- "High Speed True Random Number Generators in Xilinx FPGAs" +-- by Catalin Baetoniu, Xilinx Inc. 
+ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library work; + +entity random is + port ( + clk : in std_ulogic; + data : out std_ulogic_vector(63 downto 0); + raw : out std_ulogic_vector(63 downto 0); + err : out std_ulogic + ); +end entity random; + +architecture behaviour of random is + signal ringosc : std_ulogic_vector(63 downto 0); + signal ro_reg : std_ulogic_vector(63 downto 0); + signal lhca : std_ulogic_vector(63 downto 0); + + constant lhca_diag : std_ulogic_vector(63 downto 0) := x"fffffffffffffffb"; + +begin + random_osc : process(all) + begin + -- chaotic set of ring oscillators + ringosc(0) <= ringosc(63) xor ringosc(0) xor ringosc(1); + for i in 1 to 62 loop + ringosc(i) <= ringosc(i-1) xor ringosc(i) xor ringosc(i+1); + end loop; + ringosc(63) <= not (ringosc(62) xor ringosc(63) xor ringosc(0)); + end process; + + lhca_update : process(clk) + begin + if rising_edge(clk) then + ro_reg <= ringosc; + raw <= ro_reg; + -- linear hybrid cellular automaton + -- used to even out the statistics of the ring oscillators + lhca <= ('0' & lhca(63 downto 1)) xor (lhca and lhca_diag) xor + (lhca(62 downto 0) & '0') xor ro_reg; + end if; + end process; + + data <= lhca; + err <= '0'; +end behaviour; diff --git a/fpga/fpga-random.xdc b/fpga/fpga-random.xdc new file mode 100644 index 0000000..ba69f87 --- /dev/null +++ b/fpga/fpga-random.xdc @@ -0,0 +1,3 @@ +set_property ALLOW_COMBINATORIAL_LOOPS TRUE [get_nets soc0/processor/execute1_0/random_0/ro_reg*] +set_property ALLOW_COMBINATORIAL_LOOPS TRUE [get_nets soc0/processor/execute1_0/random_0/p_*] +set_property ALLOW_COMBINATORIAL_LOOPS TRUE [get_nets soc0/processor/execute1_0/random_0/D*] diff --git a/microwatt.core b/microwatt.core index 15786fe..9c91620 100644 --- a/microwatt.core +++ b/microwatt.core @@ -64,6 +64,8 @@ filesets: xilinx_specific: files: - xilinx-mult.vhdl : {file_type : vhdlSource-2008} + - fpga/fpga-random.vhdl : {file_type : vhdlSource-2008} + - fpga/fpga-random.xdc 
: {file_type : xdc} debug_xilinx: files: diff --git a/nonrandom.vhdl b/nonrandom.vhdl new file mode 100644 index 0000000..16f81da --- /dev/null +++ b/nonrandom.vhdl @@ -0,0 +1,22 @@ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library work; + +entity random is + port ( + clk : in std_ulogic; + data : out std_ulogic_vector(63 downto 0); + raw : out std_ulogic_vector(63 downto 0); + err : out std_ulogic + ); +end entity random; + +architecture behaviour of random is + +begin + data <= (others => '1'); + raw <= (others => '1'); + err <= '1'; +end behaviour; diff --git a/random.vhdl b/random.vhdl new file mode 100644 index 0000000..063c30e --- /dev/null +++ b/random.vhdl @@ -0,0 +1,30 @@ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library work; +use work.glibc_random.all; + +entity random is + port ( + clk : in std_ulogic; + data : out std_ulogic_vector(63 downto 0); + raw : out std_ulogic_vector(63 downto 0); + err : out std_ulogic + ); +end entity random; + +architecture behaviour of random is +begin + err <= '0'; + + process(clk) + variable rand : std_ulogic_vector(63 downto 0); + begin + if rising_edge(clk) then + rand := pseudorand(64); + data <= rand; + raw <= rand; + end if; + end process; +end behaviour; From 5fafdc56efa6dd479fade2dd8ad1b5ff9526541e Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 6 Aug 2020 19:15:02 +1000 Subject: [PATCH 11/14] core: Implement the addex instruction The addex instruction is like adde but uses the XER[OV] bit for the carry in and out rather than XER[CA]. 
Signed-off-by: Paul Mackerras --- decode1.vhdl | 1 + decode_types.vhdl | 2 +- execute1.vhdl | 10 +++++++++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index a58525e..caff3d8 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -172,6 +172,7 @@ architecture behaviour of decode1 is 2#1000001010# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addco 2#0010001010# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- adde 2#1010001010# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addeo + 2#0010101010# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '0', '0', OV, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addex 2#0011101010# => (ALU, OP_ADD, RA, CONST_M1, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addme 2#1011101010# => (ALU, OP_ADD, RA, CONST_M1, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addmeo 2#0011001010# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addze diff --git a/decode_types.vhdl b/decode_types.vhdl index e5ae8c1..7a60eac 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -26,7 +26,7 @@ package decode_types is type input_reg_c_t is (NONE, RS, RCR); type output_reg_a_t is (NONE, RT, RA, SPR); type rc_t is (NONE, ONE, RC); - type carry_in_t is (ZERO, CA, ONE); + type carry_in_t is (ZERO, CA, OV, ONE); constant SH_OFFSET : integer := 0; constant MB_OFFSET : integer := 1; diff --git a/execute1.vhdl b/execute1.vhdl index a53024f..256cb5e 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -164,6 +164,8 @@ architecture behaviour of execute1 is return '0'; when CA => return xerc.ca; + when OV => + return xerc.ov; when ONE => 
return '1'; end case; @@ -594,7 +596,13 @@ begin carry_64 := result_with_carry(64); if e_in.insn_type = OP_ADD then if e_in.output_carry = '1' then - set_carry(v.e, carry_32, carry_64); + if e_in.input_carry /= OV then + set_carry(v.e, carry_32, carry_64); + else + v.e.xerc.ov := carry_64; + v.e.xerc.ov32 := carry_32; + v.e.write_xerc_enable := '1'; + end if; end if; if e_in.oe = '1' then set_ov(v.e, From 7246bd6f678259500a4c68d5da818f7857247c20 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 6 Aug 2020 19:24:40 +1000 Subject: [PATCH 12/14] core: Implement the reserved no-op instructions These are no-ops that are reserved for future use as performance hints, so we just need to treat them as no-ops. Signed-off-by: Paul Mackerras --- decode1.vhdl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/decode1.vhdl b/decode1.vhdl index caff3d8..4e20706 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -302,6 +302,15 @@ architecture behaviour of decode1 is 2#0111011100# => (ALU, OP_AND, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nand 2#0001101000# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- neg 2#1001101000# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nego + -- next 8 are reserved no-op instructions + 2#1000010010# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop + 2#1000110010# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop + 2#1001010010# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop + 2#1001110010# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', 
'0', '0', NONE, '0', '0'), -- nop + 2#1010010010# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop + 2#1010110010# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop + 2#1011010010# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop + 2#1011110010# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop 2#0001111100# => (ALU, OP_OR, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nor 2#0110111100# => (ALU, OP_OR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- or 2#0110011100# => (ALU, OP_OR, NONE, RB, RS, RA, '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- orc From 7052ceef4accebf06d3c344e478a1ad77c9098a2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 6 Aug 2020 20:31:09 +1000 Subject: [PATCH 13/14] core: Implement the wait instruction as a no-op Signed-off-by: Paul Mackerras --- decode1.vhdl | 1 + 1 file changed, 1 insertion(+) diff --git a/decode1.vhdl b/decode1.vhdl index 4e20706..69b50a6 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -364,6 +364,7 @@ architecture behaviour of decode1 is 2#0000000100# => (ALU, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- tw 2#0100110010# => (LDST, OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- tlbie 2#0100010010# => (LDST, OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- tlbiel + 2#0000011110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', 
ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- wait 2#0100111100# => (ALU, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- xor others => illegal_inst ); From 83816cb9e331dcdffbb0e6faa801816b1d30350c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 7 Aug 2020 09:57:19 +1000 Subject: [PATCH 14/14] core: Implement BCD Assist instructions addg6s, cdtbcd, cbcdtd To avoid adding too much logic, this moves the adder used by OP_ADD out of the case statement in execute1.vhdl so that the result can be used by OP_ADDG6S as well. Signed-off-by: Paul Mackerras --- decode1.vhdl | 3 ++ decode_types.vhdl | 1 + execute1.vhdl | 44 +++++++++++++++------- logical.vhdl | 77 +++++++++++++++++++++++++++++++++++++++ scripts/fmt_log/fmt_log.c | 2 +- 5 files changed, 112 insertions(+), 15 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 69b50a6..21fea4a 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -173,6 +173,7 @@ architecture behaviour of decode1 is 2#0010001010# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- adde 2#1010001010# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addeo 2#0010101010# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '0', '0', OV, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addex + 2#0001001010# => (ALU, OP_ADDG6S, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- addg6s 2#0011101010# => (ALU, OP_ADD, RA, CONST_M1, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addme 2#1011101010# => (ALU, OP_ADD, RA, CONST_M1, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addmeo 2#0011001010# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', 
'0', '0', '0', RC, '0', '0'), -- addze @@ -180,6 +181,8 @@ architecture behaviour of decode1 is 2#0000011100# => (ALU, OP_AND, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- and 2#0000111100# => (ALU, OP_AND, NONE, RB, RS, RA, '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- andc 2#0011111100# => (ALU, OP_BPERM, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- bperm + 2#0100111010# => (ALU, OP_BCD, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cbcdtd + 2#0100011010# => (ALU, OP_BCD, NONE, NONE, RS, RA, '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cdtbcd 2#0000000000# => (ALU, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmp 2#0111111100# => (ALU, OP_CMPB, NONE, RB, RS, RA, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpb 2#0011100000# => (ALU, OP_CMPEQB, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpeqb diff --git a/decode_types.vhdl b/decode_types.vhdl index 7a60eac..ef654c3 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -18,6 +18,7 @@ package decode_types is OP_SHL, OP_SHR, OP_SYNC, OP_TLBIE, OP_TRAP, OP_XOR, + OP_BCD, OP_ADDG6S, OP_FETCH_FAILED ); type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA); diff --git a/execute1.vhdl b/execute1.vhdl index 256cb5e..1b83997 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -295,7 +295,7 @@ begin variable a_inv : std_ulogic_vector(63 downto 0); variable result : std_ulogic_vector(63 downto 0); variable newcrf : std_ulogic_vector(3 downto 0); - variable result_with_carry : std_ulogic_vector(64 downto 0); + variable sum_with_carry : std_ulogic_vector(64 downto 0); variable result_en : 
std_ulogic; variable crnum : crnum_t; variable crbit : integer range 0 to 31; @@ -332,7 +332,7 @@ begin variable addend : std_ulogic_vector(127 downto 0); begin result := (others => '0'); - result_with_carry := (others => '0'); + sum_with_carry := (others => '0'); result_en := '0'; newcrf := (others => '0'); is_branch := '0'; @@ -395,6 +395,15 @@ begin v.cntz_in_progress := '0'; v.mul_finish := '0'; + -- Main adder + if e_in.invert_a = '0' then + a_inv := a_in; + else + a_inv := not a_in; + end if; + sum_with_carry := ppc_adde(a_inv, b_in, + decode_input_carry(e_in.input_carry, v.e.xerc)); + -- signals to multiply and divide units sign1 := '0'; sign2 := '0'; @@ -584,16 +593,9 @@ begin when OP_NOP => -- Do nothing when OP_ADD | OP_CMP | OP_TRAP => - if e_in.invert_a = '0' then - a_inv := a_in; - else - a_inv := not a_in; - end if; - result_with_carry := ppc_adde(a_inv, b_in, - decode_input_carry(e_in.input_carry, v.e.xerc)); - result := result_with_carry(63 downto 0); + result := sum_with_carry(63 downto 0); carry_32 := result(32) xor a_inv(32) xor b_in(32); - carry_64 := result_with_carry(64); + carry_64 := sum_with_carry(64); if e_in.insn_type = OP_ADD then if e_in.output_carry = '1' then if e_in.input_carry /= OV then @@ -606,8 +608,8 @@ begin end if; if e_in.oe = '1' then set_ov(v.e, - calc_ov(a_inv(63), b_in(63), carry_64, result_with_carry(63)), - calc_ov(a_inv(31), b_in(31), carry_32, result_with_carry(31))); + calc_ov(a_inv(63), b_in(63), carry_64, sum_with_carry(63)), + calc_ov(a_inv(31), b_in(31), carry_32, sum_with_carry(31))); end if; result_en := '1'; else @@ -672,6 +674,19 @@ begin end if; end if; end if; + when OP_ADDG6S => + result := (others => '0'); + for i in 0 to 14 loop + lo := i * 4; + hi := (i + 1) * 4; + if (a_in(hi) xor b_in(hi) xor sum_with_carry(hi)) = '0' then + result(lo + 3 downto lo) := "0110"; + end if; + end loop; + if sum_with_carry(64) = '0' then + result(63 downto 60) := "0110"; + end if; + result_en := '1'; when OP_CMPRB => 
newcrf := ppc_cmprb(a_in, b_in, insn_l(e_in.insn)); bf := insn_bf(e_in.insn); @@ -688,7 +703,8 @@ begin v.e.write_cr_mask := num_to_fxm(crnum); v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf; - when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS | OP_BPERM => + when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS | + OP_BPERM | OP_BCD => result := logical_result; result_en := '1'; when OP_B => diff --git a/logical.vhdl b/logical.vhdl index 2df66dc..d008e47 100644 --- a/logical.vhdl +++ b/logical.vhdl @@ -37,6 +37,72 @@ architecture behaviour of logical is signal parity : std_ulogic_vector(63 downto 0); signal permute : std_ulogic_vector(7 downto 0); + function bcd_to_dpd(bcd: std_ulogic_vector(11 downto 0)) return std_ulogic_vector is + variable dpd: std_ulogic_vector(9 downto 0); + variable a, b, c, d, e, f, g, h, i, j, k, m: std_ulogic; + begin + -- The following equations are copied from PowerISA v3.0B Book 1 appendix B + a := bcd(11); + b := bcd(10); + c := bcd(9); + d := bcd(8); + e := bcd(7); + f := bcd(6); + g := bcd(5); + h := bcd(4); + i := bcd(3); + j := bcd(2); + k := bcd(1); + m := bcd(0); + dpd(9) := (f and a and i and not e) or (j and a and not i) or (b and not a); + dpd(8) := (g and a and i and not e) or (k and a and not i) or (c and not a); + dpd(7) := d; + dpd(6) := (j and not a and e and not i) or (f and not i and not e) or + (f and not a and not e) or (e and i); + dpd(5) := (k and not a and e and not i) or (g and not i and not e) or + (g and not a and not e) or (a and i); + dpd(4) := h; + dpd(3) := a or e or i; + dpd(2) := (not e and j and not i) or (e and i) or a; + dpd(1) := (not a and k and not i) or (a and i) or e; + dpd(0) := m; + return dpd; + end; + + function dpd_to_bcd(dpd: std_ulogic_vector(9 downto 0)) return std_ulogic_vector is + variable bcd: std_ulogic_vector(11 downto 0); + variable p, q, r, s, t, u, v, w, x, y: std_ulogic; + begin + -- The following 
equations are copied from PowerISA v3.0B Book 1 appendix B + p := dpd(9); + q := dpd(8); + r := dpd(7); + s := dpd(6); + t := dpd(5); + u := dpd(4); + v := dpd(3); + w := dpd(2); + x := dpd(1); + y := dpd(0); + bcd(11) := (not s and v and w) or (t and v and w and s) or (v and w and not x); + bcd(10) := (p and s and x and not t) or (p and not w) or (p and not v); + bcd(9) := (q and s and x and not t) or (q and not w) or (q and not v); + bcd(8) := r; + bcd(7) := (v and not w and x) or (s and v and w and x) or (not t and v and w and x); + bcd(6) := (p and t and v and w and x and not s) or (s and not x and v) or + (s and not v); + bcd(5) := (q and t and w and v and x and not s) or (t and not x and v) or + (t and not v); + bcd(4) := u; + bcd(3) := (t and v and w and x) or (s and v and w and x) or (v and not w and not x); + bcd(2) := (p and not s and not t and w and v) or (s and v and not w and x) or + (p and w and not x and v) or (w and not v); + bcd(1) := (q and not s and not t and v and w) or (t and v and not w and x) or + (q and v and w and not x) or (x and not v); + bcd(0) := y; + return bcd; + end; + begin logical_0: process(all) variable rb_adj, tmp : std_ulogic_vector(63 downto 0); @@ -120,6 +186,17 @@ begin tmp := ppc_cmpb(rs, rb); when OP_BPERM => tmp := std_ulogic_vector(resize(unsigned(permute), 64)); + when OP_BCD => + -- invert_in is abused to indicate direction of conversion + if invert_in = '0' then + -- cbcdtd + tmp := x"000" & bcd_to_dpd(rs(55 downto 44)) & bcd_to_dpd(rs(43 downto 32)) & + x"000" & bcd_to_dpd(rs(23 downto 12)) & bcd_to_dpd(rs(11 downto 0)); + else + -- cdtbcd + tmp := x"00" & dpd_to_bcd(rs(51 downto 42)) & dpd_to_bcd(rs(41 downto 32)) & + x"00" & dpd_to_bcd(rs(19 downto 10)) & dpd_to_bcd(rs(9 downto 0)); + end if; when others => -- EXTS -- note datalen is a 1-hot encoding diff --git a/scripts/fmt_log/fmt_log.c b/scripts/fmt_log/fmt_log.c index 9b6775b..146346d 100644 --- a/scripts/fmt_log/fmt_log.c +++ b/scripts/fmt_log/fmt_log.c @@ 
-94,7 +94,7 @@ const char *ops[64] = "mfcr ", "mfmsr ", "mfspr ", "mod ", "mtcrf ", "mtmsr ", "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", "or ", "popcnt ", "prty ", "rfid ", "rlc ", "rlcl ", "rlcr ", "sc ", "setb ", "shl ", "shr ", "sync ", "tlbie ", "trap ", - "xor ", "ffail ", "?58 ", "?59 ", "?60 ", "?61 ", "?62 ", "?63 " + "xor ", "bcd ", "addg6s ", "ffail ", "?60 ", "?61 ", "?62 ", "?63 " }; const char *spr_names[13] =