core: Change bperm to a simpler and slower implementation

This does bperm in the bitsort unit instead of the logical unit, and
no longer tries to do it in a single cycle with eight 64-to-1
multiplexers.  Instead it is now a state machine in the bitsort unit,
takes 8 cycles, and only has one 64-to-1 multiplexer.  This helps
improve timing and reduces LUT usage.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/436/head
Paul Mackerras 3 weeks ago
parent f6a839a86b
commit 23ff954059

@ -1,5 +1,6 @@
-- Implements instructions that involve sorting bits,
-- that is, cfuged, pextd and pdepd.
-- Also does bperm, which is somewhat different.
--
-- cfuged: Sort the bits in the mask in RB into 0s at the left, 1s at the right
-- and move the bits in RS in the same fashion to give the result
@ -7,6 +8,7 @@
-- corresponding bit in RB is 1
-- pdepd: Inverse of pextd; take the low-order bits of RS and spread them out
-- to the bit positions which have a 1 in RB
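-- bperm: Select 8 arbitrary bits of RB, one per byte of RS, where each byte
-- of RS supplies the index of the bit to pick; the 8 selected bits
-- form the low-order byte of the result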

-- NB opc is bits 7-6 of the instruction:
-- 00 = pdepd, 01 = pextd, 10 = cfuged
@ -27,6 +29,8 @@ entity bit_sorter is
go : in std_ulogic;
opc : in std_ulogic_vector(1 downto 0);
done : out std_ulogic;
do_bperm : in std_ulogic;
bperm_done : out std_ulogic;
result : out std_ulogic_vector(63 downto 0)
);
end entity bit_sorter;
@ -45,6 +49,13 @@ architecture behaviour of bit_sorter is
signal sr_vl : std_ulogic_vector(63 downto 0);
signal sr_vr : std_ulogic_vector(63 downto 0);

signal is_bperm : std_ulogic;
signal bpc : unsigned(2 downto 0);
signal bp_done : std_ulogic;
signal bperm_res : std_ulogic_vector(7 downto 0);
signal rs_sr : std_ulogic_vector(63 downto 0);
signal rb_bp : std_ulogic_vector(63 downto 0);

begin
bsort_r: process(clk)
begin
@ -96,7 +107,41 @@ begin
end if;
end process;

-- bit permutation
--
-- The low byte of rs_sr is the index for the bit currently being selected.
-- The index uses big-endian bit numbering, hence the inversion of the low
-- six bits before indexing rb_bp.  An index byte of 64 or more (either of
-- its top two bits set) must yield 0 rather than aliasing onto
-- RB bit (index mod 64); the previous single-cycle implementation in the
-- logical unit made the same check.
bperm_res(7) <= 'X' when is_X(rs_sr) else
                rb_bp(to_integer(unsigned(not rs_sr(5 downto 0))))
                    when rs_sr(7 downto 6) = "00" else
                '0';

-- Sequencer for bperm.
--
-- do_bperm latches rs and rb and clears the step counter.  On each following
-- clock with is_bperm set, the bit selected for the current index byte
-- (bperm_res(7)) is shifted down into bperm_res(6 downto 0) and rs_sr is
-- shifted right by one byte to expose the next index.  bp_done is raised
-- after the shift performed with bpc = 6, i.e. after seven shifts; at that
-- point bperm_res(6 downto 0) holds the bits for index bytes 6..0 and
-- bperm_res(7) is the combinatorial lookup for index byte 7.  The cycle in
-- which bp_done is set performs no shift and returns the unit to idle.
bperm_r: process(clk)
begin
    if rising_edge(clk) then
        if rst = '1' then
            is_bperm <= '0';
            bp_done <= '0';
            bperm_res(6 downto 0) <= (others => '0');
            bpc <= to_unsigned(0, 3);
        elsif do_bperm = '1' then
            -- start a new operation; takes priority over one in flight
            is_bperm <= '1';
            bp_done <= '0';
            bperm_res(6 downto 0) <= (others => '0');
            bpc <= to_unsigned(0, 3);
            rs_sr <= rs;
            rb_bp <= rb;
        elsif bp_done = '1' then
            -- result has been presented for one cycle; go idle
            is_bperm <= '0';
            bp_done <= '0';
        elsif is_bperm = '1' then
            -- bank the current bit and move on to the next index byte
            bperm_res(6 downto 0) <= bperm_res(7 downto 1);
            rs_sr <= x"00" & rs_sr(63 downto 8);
            if bpc = "110" then
                bp_done <= '1';
            end if;
            bpc <= bpc + 1;
        end if;
    end if;
end process;

done <= sd;
result <= val;
bperm_done <= bp_done;
result <= val when is_bperm = '0' else (56x"0" & bperm_res);

end behaviour;

@ -227,7 +227,6 @@ architecture behaviour of decode2 is
OP_PRTY => "001",
OP_CMPB => "001",
OP_EXTS => "001",
OP_BPERM => "001",
OP_BREV => "001",
OP_BCD => "001",
OP_MTSPR => "001",
@ -256,6 +255,7 @@ architecture behaviour of decode2 is
OP_DIVE => "101",
OP_MOD => "101",
OP_BSORT => "100",
OP_BPERM => "100",
OP_ADDG6S => "001", -- misc_result
OP_ISEL => "010",
OP_DARN => "011",

@ -116,6 +116,7 @@ architecture behaviour of execute1 is
start_mul : std_ulogic;
start_div : std_ulogic;
start_bsort : std_ulogic;
start_bperm : std_ulogic;
do_trace : std_ulogic;
ciabr_trace : std_ulogic;
fp_intr : std_ulogic;
@ -150,6 +151,7 @@ architecture behaviour of execute1 is
mul_finish : std_ulogic;
div_in_progress : std_ulogic;
bsort_in_progress : std_ulogic;
bperm_in_progress : std_ulogic;
no_instr_avail : std_ulogic;
instr_dispatch : std_ulogic;
ext_interrupt : std_ulogic;
@ -174,7 +176,7 @@ architecture behaviour of execute1 is
spr_select => spr_id_init, pmu_spr_num => 5x"0",
redir_to_next => '0', advance_nia => '0', lr_from_next => '0',
mul_in_progress => '0', mul_finish => '0', div_in_progress => '0',
bsort_in_progress => '0',
bsort_in_progress => '0', bperm_in_progress => '0',
no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0',
taken_branch_event => '0', br_mispredict => '0',
msr => 64x"0",
@ -245,6 +247,8 @@ architecture behaviour of execute1 is
-- bit-sort unit signals
signal bsort_start : std_ulogic;
signal bsort_done : std_ulogic;
signal bperm_start : std_ulogic;
signal bperm_done : std_ulogic;

-- random number generator signals
signal random_raw : std_ulogic_vector(63 downto 0);
@ -515,6 +519,8 @@ begin
go => bsort_start,
opc => e_in.insn(7 downto 6),
done => bsort_done,
do_bperm => bperm_start,
bperm_done => bperm_done,
result => bsort_result
);

@ -1228,7 +1234,7 @@ begin
when OP_CMPRB =>
when OP_CMPEQB =>
when OP_LOGIC | OP_XOR | OP_PRTY | OP_CMPB | OP_EXTS |
OP_BPERM | OP_BREV | OP_BCD =>
OP_BREV | OP_BCD =>

when OP_B =>
v.take_branch := '1';
@ -1433,6 +1439,11 @@ begin
slow_op := '1';
owait := '1';

when OP_BPERM =>
v.start_bperm := '1';
slow_op := '1';
owait := '1';

when OP_MUL_L64 =>
if e_in.is_32bit = '1' then
v.se.mult_32s := '1';
@ -1718,6 +1729,7 @@ begin
x_to_divider.valid <= actions.start_div;
v.div_in_progress := actions.start_div;
v.bsort_in_progress := actions.start_bsort;
v.bperm_in_progress := actions.start_bperm;
v.br_mispredict := v.e.redirect and actions.direct_branch;
v.advance_nia := actions.advance_nia;
v.redir_to_next := actions.redir_to_next;
@ -1728,7 +1740,8 @@ begin
-- multiply is happening in order to stop following
-- instructions from using the wrong XER value
-- (and for simplicity in the OE=0 case).
v.busy := actions.start_div or actions.start_mul or actions.start_bsort;
v.busy := actions.start_div or actions.start_mul or
actions.start_bsort or actions.start_bperm;

-- instruction for other units, i.e. LDST
if e_in.unit = LDST then
@ -1740,6 +1753,7 @@ begin
end if;
is_scv := go and actions.se.scv_trap;
bsort_start <= go and actions.start_bsort;
bperm_start <= go and actions.start_bperm;
pmu_trace <= go and actions.do_trace;

if not HAS_FPU and ex1.div_in_progress = '1' then
@ -1780,6 +1794,13 @@ begin
v.e.write_data := alu_result;
bypass_valid := bsort_done;
end if;
if ex1.bperm_in_progress = '1' then
v.bperm_in_progress := not bperm_done;
v.e.valid := bperm_done;
v.busy := not bperm_done;
v.e.write_data := alu_result;
bypass_valid := bperm_done;
end if;

if v.e.write_xerc_enable = '1' and v.e.valid = '1' then
v.xerc := v.e.xerc;

@ -23,7 +23,6 @@ architecture behaviour of logical is

signal par0, par1 : std_ulogic;
signal parity : std_ulogic_vector(63 downto 0);
signal permute : std_ulogic_vector(7 downto 0);

function bcd_to_dpd(bcd: std_ulogic_vector(11 downto 0)) return std_ulogic_vector is
variable dpd: std_ulogic_vector(9 downto 0);
@ -109,16 +108,6 @@ begin
parity(32) <= par1;
end if;

-- bit permutation
for i in 0 to 7 loop
j := i * 8;
if rs(j+7 downto j+6) = "00" then
permute(i) <= rb(to_integer(unsigned(not rs(j+5 downto j))));
else
permute(i) <= '0';
end if;
end loop;

rb_adj := rb;
if invert_in = '1' then
rb_adj := not rb;
@ -157,8 +146,6 @@ begin
tmp := parity;
when OP_CMPB =>
tmp := ppc_cmpb(rs, rb);
when OP_BPERM =>
tmp := std_ulogic_vector(resize(unsigned(permute), 64));
when OP_BCD =>
-- invert_in is abused to indicate direction of conversion
if invert_in = '0' then

Loading…
Cancel
Save