diff --git a/bitsort.vhdl b/bitsort.vhdl
index f2aeddb..01b34b5 100644
--- a/bitsort.vhdl
+++ b/bitsort.vhdl
@@ -1,5 +1,6 @@
 -- Implements instructions that involve sorting bits,
 -- that is, cfuged, pextd and pdepd.
+-- Also does bperm, which is somewhat different.
 --
 -- cfuged: Sort the bits in the mask in RB into 0s at the left, 1s at the right
 -- and move the bits in RS in the same fashion to give the result
@@ -7,6 +8,7 @@
 -- corresponding bit in RB is 1
 -- pdepd: Inverse of pextd; take the low-order bits of RS and spread them out
 -- to the bit positions which have a 1 in RB
+-- bperm: Select 8 arbitrary bits
 
 -- NB opc is bits 7-6 of the instruction:
 -- 00 = pdepd, 01 = pextd, 10 = cfuged
@@ -27,6 +29,8 @@ entity bit_sorter is
         go : in std_ulogic;
         opc : in std_ulogic_vector(1 downto 0);
         done : out std_ulogic;
+        do_bperm : in std_ulogic;
+        bperm_done : out std_ulogic;
         result : out std_ulogic_vector(63 downto 0)
         );
 end entity bit_sorter;
@@ -45,6 +49,13 @@ architecture behaviour of bit_sorter is
     signal sr_vl : std_ulogic_vector(63 downto 0);
     signal sr_vr : std_ulogic_vector(63 downto 0);
 
+    signal is_bperm : std_ulogic;
+    signal bpc : unsigned(2 downto 0);
+    signal bp_done : std_ulogic;
+    signal bperm_res : std_ulogic_vector(7 downto 0);
+    signal rs_sr : std_ulogic_vector(63 downto 0);
+    signal rb_bp : std_ulogic_vector(63 downto 0);
+
 begin
     bsort_r: process(clk)
     begin
@@ -96,7 +107,49 @@ begin
         end if;
     end process;
 
+    -- bit permutation
+    -- Bit 7 of the result is computed combinationally from the low byte of
+    -- rs_sr, which holds the current index.  rb_bp(63 - index) is selected;
+    -- an index of 64 or more (either of the byte's top two bits set) must
+    -- give a 0 result bit rather than aliasing onto index mod 64.
+    bperm_res(7) <= 'X' when is_X(rs_sr) else
+                    '0' when rs_sr(7 downto 6) /= "00" else
+                    rb_bp(to_integer(unsigned(not rs_sr(5 downto 0))));
+
+    -- One index byte is consumed per cycle: rs_sr shifts right by 8 bits and
+    -- bperm_res shifts right by one.  bp_done is set on the seventh shift and
+    -- cleared again, together with is_bperm, on the following clock.
+    bperm_r: process(clk)
+    begin
+        if rising_edge(clk) then
+            if rst = '1' then
+                is_bperm <= '0';
+                bp_done <= '0';
+                bperm_res(6 downto 0) <= (others => '0');
+                bpc <= to_unsigned(0, 3);
+            elsif do_bperm = '1' then
+                is_bperm <= '1';
+                bp_done <= '0';
+                bperm_res(6 downto 0) <= (others => '0');
+                bpc <= to_unsigned(0, 3);
+                rs_sr <= rs;
+                rb_bp <= rb;
+            elsif bp_done = '1' then
+                is_bperm <= '0';
+                bp_done <= '0';
+            elsif is_bperm = '1' then
+                bperm_res(6 downto 0) <= bperm_res(7 downto 1);
+                rs_sr <= x"00" & rs_sr(63 downto 8);
+                if bpc = "110" then
+                    bp_done <= '1';
+                end if;
+                bpc <= bpc + 1;
+            end if;
+        end if;
+    end process;
+
     done <= sd;
-    result <= val;
+    bperm_done <= bp_done;
+    result <= val when is_bperm = '0' else (56x"0" & bperm_res);
 
 end behaviour;
diff --git a/decode2.vhdl b/decode2.vhdl
index fd7434c..432426d 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -227,7 +227,6 @@ architecture behaviour of decode2 is
         OP_PRTY => "001",
         OP_CMPB => "001",
         OP_EXTS => "001",
-        OP_BPERM => "001",
         OP_BREV => "001",
         OP_BCD => "001",
         OP_MTSPR => "001",
@@ -256,6 +255,7 @@ architecture behaviour of decode2 is
         OP_DIVE => "101",
         OP_MOD => "101",
         OP_BSORT => "100",
+        OP_BPERM => "100",
         OP_ADDG6S => "001", -- misc_result
         OP_ISEL => "010",
         OP_DARN => "011",
diff --git a/execute1.vhdl b/execute1.vhdl
index f218ab8..08bc694 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -116,6 +116,7 @@ architecture behaviour of execute1 is
         start_mul : std_ulogic;
         start_div : std_ulogic;
         start_bsort : std_ulogic;
+        start_bperm : std_ulogic;
         do_trace : std_ulogic;
         ciabr_trace : std_ulogic;
         fp_intr : std_ulogic;
@@ -150,6 +151,7 @@ architecture behaviour of execute1 is
         mul_finish : std_ulogic;
         div_in_progress : std_ulogic;
         bsort_in_progress : std_ulogic;
+        bperm_in_progress : std_ulogic;
         no_instr_avail : std_ulogic;
         instr_dispatch : std_ulogic;
         ext_interrupt : std_ulogic;
@@ -174,7 +176,7 @@ architecture behaviour of execute1 is
          spr_select => spr_id_init, pmu_spr_num => 5x"0",
          redir_to_next => '0', advance_nia => '0', lr_from_next => '0',
          mul_in_progress => '0', mul_finish => '0', div_in_progress => '0',
-         bsort_in_progress => '0',
+         bsort_in_progress => '0', bperm_in_progress => '0',
          no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0',
          taken_branch_event => '0', br_mispredict => '0',
          msr => 64x"0",
@@ -245,6 +247,8 @@ architecture behaviour of execute1 is
     -- bit-sort unit signals
     signal bsort_start : std_ulogic;
     signal bsort_done : std_ulogic;
+    signal bperm_start : std_ulogic;
+    signal bperm_done : std_ulogic;
 
     -- random number generator signals
     signal random_raw : std_ulogic_vector(63 downto 0);
@@ -515,6 +519,8 @@ begin
             go => bsort_start,
             opc => e_in.insn(7 downto 6),
             done => bsort_done,
+            do_bperm => bperm_start,
+            bperm_done => bperm_done,
             result => bsort_result
             );
 
@@ -1228,7 +1234,7 @@ begin
             when OP_CMPRB =>
             when OP_CMPEQB =>
             when OP_LOGIC | OP_XOR | OP_PRTY | OP_CMPB | OP_EXTS |
-                OP_BPERM | OP_BREV | OP_BCD =>
+                OP_BREV | OP_BCD =>
 
             when OP_B =>
                 v.take_branch := '1';
@@ -1433,6 +1439,11 @@ begin
                 slow_op := '1';
                 owait := '1';
 
+            when OP_BPERM =>
+                v.start_bperm := '1';
+                slow_op := '1';
+                owait := '1';
+
             when OP_MUL_L64 =>
                 if e_in.is_32bit = '1' then
                     v.se.mult_32s := '1';
@@ -1718,6 +1729,7 @@ begin
         x_to_divider.valid <= actions.start_div;
         v.div_in_progress := actions.start_div;
         v.bsort_in_progress := actions.start_bsort;
+        v.bperm_in_progress := actions.start_bperm;
         v.br_mispredict := v.e.redirect and actions.direct_branch;
         v.advance_nia := actions.advance_nia;
         v.redir_to_next := actions.redir_to_next;
@@ -1728,7 +1740,8 @@ begin
                 -- multiply is happening in order to stop following
                 -- instructions from using the wrong XER value
                 -- (and for simplicity in the OE=0 case).
-                v.busy := actions.start_div or actions.start_mul or actions.start_bsort;
+                v.busy := actions.start_div or actions.start_mul or
+                          actions.start_bsort or actions.start_bperm;
 
                 -- instruction for other units, i.e. LDST
                 if e_in.unit = LDST then
@@ -1740,6 +1753,7 @@ begin
             end if;
             is_scv := go and actions.se.scv_trap;
             bsort_start <= go and actions.start_bsort;
+            bperm_start <= go and actions.start_bperm;
             pmu_trace <= go and actions.do_trace;
 
             if not HAS_FPU and ex1.div_in_progress = '1' then
@@ -1780,6 +1794,13 @@ begin
             v.e.write_data := alu_result;
             bypass_valid := bsort_done;
         end if;
+        if ex1.bperm_in_progress = '1' then
+            v.bperm_in_progress := not bperm_done;
+            v.e.valid := bperm_done;
+            v.busy := not bperm_done;
+            v.e.write_data := alu_result;
+            bypass_valid := bperm_done;
+        end if;
 
         if v.e.write_xerc_enable = '1' and v.e.valid = '1' then
             v.xerc := v.e.xerc;
diff --git a/logical.vhdl b/logical.vhdl
index 2d139f8..792a896 100644
--- a/logical.vhdl
+++ b/logical.vhdl
@@ -23,7 +23,6 @@ architecture behaviour of logical is
 
     signal par0, par1 : std_ulogic;
     signal parity : std_ulogic_vector(63 downto 0);
-    signal permute : std_ulogic_vector(7 downto 0);
 
     function bcd_to_dpd(bcd: std_ulogic_vector(11 downto 0)) return std_ulogic_vector is
         variable dpd: std_ulogic_vector(9 downto 0);
@@ -109,16 +108,6 @@ begin
             parity(32) <= par1;
         end if;
 
-        -- bit permutation
-        for i in 0 to 7 loop
-            j := i * 8;
-            if rs(j+7 downto j+6) = "00" then
-                permute(i) <= rb(to_integer(unsigned(not rs(j+5 downto j))));
-            else
-                permute(i) <= '0';
-            end if;
-        end loop;
-
         rb_adj := rb;
         if invert_in = '1' then
             rb_adj := not rb;
@@ -157,8 +146,6 @@ begin
                 tmp := parity;
             when OP_CMPB =>
                 tmp := ppc_cmpb(rs, rb);
-            when OP_BPERM =>
-                tmp := std_ulogic_vector(resize(unsigned(permute), 64));
             when OP_BCD =>
                 -- invert_in is abused to indicate direction of conversion
                 if invert_in = '0' then