countzero: Faster algorithm for count leading/trailing zeroes

This uses an algorithm for counting leading/trailing zeroes that is
faster on FPGAs, which makes timing easier.  cntlz* and cnttz*
still take two cycles, though.

For count trailing zeroes, we compute x & -x, which for non-zero x
has a single 1 bit in the position of the least-significant 1 bit
in x.  This one-hot representation can then be converted to a bit
number with six 32-input OR gates.  For count leading zeroes, we
simply do a bit-reversal on x and then use the same algorithm.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/233/head
Paul Mackerras 5 years ago
parent 1f2058a0ed
commit 03a3a5d326

@ -15,123 +15,81 @@ entity zero_counter is
end entity zero_counter; end entity zero_counter;


architecture behaviour of zero_counter is architecture behaviour of zero_counter is
-- NOTE(review): this listing appears to be a two-column diff; on each line the
-- text on the left looks like the removed declarations (intermediate_result
-- record, signals r / r_in) and the text on the right the added bit_reverse
-- function -- confirm against the repository.
--
-- bit_reverse(a): returns a vector indexed (a'left downto a'right) in which
-- element (a'left + a'right - i) holds a(i), i.e. the elements of a in
-- mirrored order.  The argument is unconstrained, so any width is accepted.
type intermediate_result is record -- Reverse the order of bits in a word
v16: std_ulogic_vector(15 downto 0); function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is
sel_hi: std_ulogic_vector(1 downto 0); variable ret: std_ulogic_vector(a'left downto a'right);
is_32bit: std_ulogic; begin
-- The loop range is ascending (a'right to a'left), so it only iterates when
-- a'right <= a'left, i.e. for a "downto" vector.
count_right: std_ulogic; for i in a'right to a'left loop
end record; ret(a'left + a'right - i) := a(i);

end loop;
signal r, r_in : intermediate_result; return ret;
end;


-- NOTE(review): this listing appears to be a two-column diff; on each line the
-- text on the left looks like the removed encoder function and the text on the
-- right the added bit_number function -- confirm against the repository.
--
-- bit_number(a): for a 64-bit input, returns a 6-bit value whose bit i is the
-- OR of every input bit whose index has bit i set.  When exactly one input bit
-- is set the result is therefore that bit's index; when a is all zeroes the
-- result is "000000".
-- Return the index of the leftmost or rightmost 1 in a set of 4 bits. -- If there is only one bit set in a doubleword, return its bit number
-- Assumes v is not "0000"; if it is, return (right ? "11" : "00"). -- (counting from the right). Each bit of the result is obtained by
function encoder(v: std_ulogic_vector(3 downto 0); right: std_ulogic) return std_ulogic_vector is -- ORing together 32 bits of the input:
-- bit 0 = a[1] or a[3] or a[5] or ...
-- bit 1 = a[2] or a[3] or a[6] or a[7] or ...
-- bit 2 = a[4..7] or a[12..15] or ...
-- bit 3 = a[8..15] or a[24..31] or ...
-- bit 4 = a[16..31] or a[48..63]
-- bit 5 = a[32..63] ORed together
function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
variable ret: std_ulogic_vector(5 downto 0);
variable stride: natural;
variable bit: std_ulogic;
variable k: natural;
begin begin
-- stride is the group size for result bit i: 2, 4, 8, 16, 32, 64.
if right = '0' then stride := 2;
if v(3) = '1' then for i in 0 to 5 loop
return "11"; bit := '0';
-- Walk the 64/stride groups; k is the lowest index of group j.
elsif v(2) = '1' then for j in 0 to (64 / stride) - 1 loop
return "10"; k := j * stride;
-- Only the upper half of each group contributes: those are the indices
-- with bit i set.
elsif v(1) = '1' then bit := bit or (or a(k + stride - 1 downto k + (stride / 2)));
return "01"; end loop;
else ret(i) := bit;
return "00"; stride := stride * 2;
end if; end loop;
else return ret;
if v(0) = '1' then
return "00";
elsif v(1) = '1' then
return "01";
elsif v(2) = '1' then
return "10";
else
return "11";
end if;
end if;
end; end;


-- Internal signals of the count-zeroes datapath.
-- inp: operand after optional bit reversal; in 32-bit mode its upper word is
-- driven with all ones.
signal inp : std_ulogic_vector(63 downto 0);
-- sum: 65-bit value of (not inp) + 1; bit 64 is the carry out of the addition.
signal sum : std_ulogic_vector(64 downto 0);
-- msb_r: sum(64) captured on the rising clock edge.
signal msb_r : std_ulogic;
-- onehot: sum(63 downto 0) and inp.
signal onehot : std_ulogic_vector(63 downto 0);
-- onehot_r: onehot captured on the rising clock edge.
signal onehot_r : std_ulogic_vector(63 downto 0);
-- bitnum: bit_number(onehot_r), the low six bits of the result.
signal bitnum : std_ulogic_vector(5 downto 0);

begin begin
-- NOTE(review): this listing appears to be a two-column diff; on each line the
-- text on the left looks like the removed zerocounter_0 process and the text
-- on the right the added countzero_r process -- confirm against the repository.
--
-- countzero_r: pipeline register.  On each rising edge of clk it captures the
-- carry bit sum(64) into msb_r and the onehot vector into onehot_r, so the
-- conversion to a bit number is evaluated from registered values.
zerocounter_0: process(clk) countzero_r: process(clk)
begin begin
if rising_edge(clk) then if rising_edge(clk) then
r <= r_in; msb_r <= sum(64);
onehot_r <= onehot;
end if; end if;
end process; end process;


-- NOTE(review): this listing appears to be a two-column diff; on each line the
-- text on the left looks like the removed zerocounter_1 process and the text
-- on the right the added countzero process -- confirm against the repository.
--
-- countzero (added version): combinational logic on both sides of the
-- pipeline register.
--   Before the register: build inp from rs (bit-reversed when count_right is
--   '0'), form sum = (not inp) + 1 and onehot = sum(63 downto 0) and inp.
--   After the register: convert onehot_r to a bit number and assemble result.
zerocounter_1: process(all) countzero: process(all)
variable v: intermediate_result;
variable y, z: std_ulogic_vector(3 downto 0);
variable sel: std_ulogic_vector(5 downto 0);
variable v4: std_ulogic_vector(3 downto 0);

begin begin
-- Test 4 groups of 16 bits each.
-- The top 2 groups are considered to be zero in 32-bit mode.
z(0) := or (rs(15 downto 0));
z(1) := or (rs(31 downto 16));
z(2) := or (rs(47 downto 32));
z(3) := or (rs(63 downto 48));
-- 64-bit mode: use all of rs, reversed when count_right is '0' so that the
-- same lowest-set-bit logic serves both directions.
if is_32bit = '0' then if is_32bit = '0' then
v.sel_hi := encoder(z, count_right); if count_right = '0' then
inp <= bit_reverse(rs);
else else
v.sel_hi(1) := '0'; inp <= rs;
end if;
else
-- 32-bit mode: the upper word is forced to all ones, so when the low word is
-- zero the lowest set bit of inp is bit 32.
inp(63 downto 32) <= x"FFFFFFFF";
if count_right = '0' then if count_right = '0' then
v.sel_hi(0) := z(1); inp(31 downto 0) <= bit_reverse(rs(31 downto 0));
else else
v.sel_hi(0) := not z(0); inp(31 downto 0) <= rs(31 downto 0);
end if; end if;
end if; end if;


-- Added column below: sum is (not inp) + 1 computed in 65 bits, so its low 64
-- bits are the two's-complement negation of inp and sum(64) is set only when
-- inp is all zeroes.  ANDing the low 64 bits with inp keeps just the
-- least-significant 1 bit of inp.
-- Select the leftmost/rightmost non-zero group of 16 bits sum <= std_ulogic_vector(unsigned('0' & not inp) + 1);
case v.sel_hi is onehot <= sum(63 downto 0) and inp;
when "00" =>
v.v16 := rs(15 downto 0);
when "01" =>
v.v16 := rs(31 downto 16);
when "10" =>
v.v16 := rs(47 downto 32);
when others =>
v.v16 := rs(63 downto 48);
end case;

-- Latch this and do the rest in the next cycle, for the sake of timing
v.is_32bit := is_32bit;
v.count_right := count_right;
r_in <= v;
sel(5 downto 4) := r.sel_hi;


-- Test 4 groups of 4 bits -- The following occurs after a clock edge
y(0) := or (r.v16(3 downto 0)); bitnum <= bit_number(onehot_r);
y(1) := or (r.v16(7 downto 4));
y(2) := or (r.v16(11 downto 8));
y(3) := or (r.v16(15 downto 12));
sel(3 downto 2) := encoder(y, r.count_right);

-- Select the leftmost/rightmost non-zero group of 4 bits
case sel(3 downto 2) is
when "00" =>
v4 := r.v16(3 downto 0);
when "01" =>
v4 := r.v16(7 downto 4);
when "10" =>
v4 := r.v16(11 downto 8);
when others =>
v4 := r.v16(15 downto 12);
end case;

sel(1 downto 0) := encoder(v4, r.count_right);

-- sel is now the index of the leftmost/rightmost 1 bit in rs
if v4 = "0000" then
-- operand is zero, return 32 for 32-bit, else 64
result <= x"00000000000000" & '0' & not r.is_32bit & r.is_32bit & "00000";
elsif r.count_right = '0' then
-- return (63 - sel), trimmed to 5 bits in 32-bit mode
result <= x"00000000000000" & "00" & (not sel(5) and not r.is_32bit) & not sel(4 downto 0);
else
result <= x"00000000000000" & "00" & sel;
end if;


-- Added column: result is 56 zero bits, a '0', msb_r and the 6-bit bitnum.
-- msb_r (the registered sum(64)) lands in bit 6, which makes the result 64
-- when inp was all zeroes (bitnum is then 0 because onehot was 0).
result <= x"00000000000000" & "0" & msb_r & bitnum;
end process; end process;
end behaviour; end behaviour;

Loading…
Cancel
Save