Merge pull request #345 from antonblanchard/popcnt-go-fast
popcnt* timing improvements from Paul
pull/349/head
						commit
						cef3660e74
					
				@ -0,0 +1,130 @@
 | 
			
		||||
library ieee;
 | 
			
		||||
use ieee.std_logic_1164.all;
 | 
			
		||||
use ieee.numeric_std.all;
 | 
			
		||||
 | 
			
		||||
library work;
 | 
			
		||||
use work.helpers.all;
 | 
			
		||||
 | 
			
		||||
-- bit_counter: combined count-zeroes / population-count unit.
--
-- The architecture registers intermediate values on the rising edge of clk,
-- so `result` reflects the operand and control values that were presented
-- before the most recent clock edge.
entity bit_counter is
    port (
        -- Clock; internal pipeline registers update on the rising edge.
        clk : in std_logic;

        -- 64-bit source operand.
        rs : in std_ulogic_vector(63 downto 0);

        -- '0': the operand is passed through bit_reverse before counting;
        -- otherwise it is used as-is.
        count_right : in std_ulogic;

        -- Registered internally; '0' selects the count-zeroes result,
        -- any other value selects the popcnt result.
        do_popcnt : in std_ulogic;

        -- For count-zeroes: '0' uses all 64 bits of rs, otherwise only
        -- rs(31 downto 0) is examined (upper half is forced to ones).
        is_32bit : in std_ulogic;

        -- Registered internally; bits 3 and 2 choose the popcnt grouping
        -- (per byte, per 32-bit word, or the whole 64-bit operand).
        datalen : in std_ulogic_vector(3 downto 0);

        -- Zero-extended count(s), selected by the registered do_popcnt.
        result : out std_ulogic_vector(63 downto 0)
    );
end entity bit_counter;
 | 
			
		||||
 | 
			
		||||
-- Two-phase implementation: a combinational first phase feeds registers
-- (msb_r, onehot_r, pc8_r, dlen_r, pcnt_r); a combinational second phase
-- turns the registered values into the final result.
architecture behaviour of bit_counter is
    -- Count-leading/trailing-zeroes datapath
    signal inp      : std_ulogic_vector(63 downto 0);
    signal sum      : std_ulogic_vector(64 downto 0);
    signal msb_r    : std_ulogic;
    signal onehot   : std_ulogic_vector(63 downto 0);
    signal onehot_r : std_ulogic_vector(63 downto 0);
    signal bitnum   : std_ulogic_vector(5 downto 0);
    signal cntz     : std_ulogic_vector(63 downto 0);

    -- Population-count datapath: partial sums widen by one bit per level.
    signal dlen_r : std_ulogic_vector(3 downto 0);
    signal pcnt_r : std_ulogic;

    subtype twobit is unsigned(1 downto 0);
    type twobit32 is array(0 to 31) of twobit;
    signal pc2 : twobit32;          -- counts per 2-bit group

    subtype threebit is unsigned(2 downto 0);
    type threebit16 is array(0 to 15) of threebit;
    signal pc4 : threebit16;        -- counts per 4-bit group

    subtype fourbit is unsigned(3 downto 0);
    type fourbit8 is array(0 to 7) of fourbit;
    signal pc8   : fourbit8;        -- counts per byte
    signal pc8_r : fourbit8;        -- registered copy of pc8

    subtype sixbit is unsigned(5 downto 0);
    type sixbit2 is array(0 to 1) of sixbit;
    signal pc32 : sixbit2;          -- counts per 32-bit word

    signal popcnt : std_ulogic_vector(63 downto 0);

begin
    ------------------------------------------------------------------
    -- Count zeroes, first phase (before the clock edge)
    ------------------------------------------------------------------

    -- Select the word to scan. When count_right is '0' the operand goes
    -- through bit_reverse (from work.helpers; presumably mirrors the bit
    -- order so the same lowest-set-bit logic serves both directions --
    -- confirm against the helper). In 32-bit mode the upper half is
    -- forced to ones, so bit 32 is the first set bit found when the
    -- lower word is all zeroes.
    countzero_in: process(all)
    begin
        if count_right = '0' then
            if is_32bit = '0' then
                inp <= bit_reverse(rs);
            else
                inp(63 downto 32) <= x"FFFFFFFF";
                inp(31 downto 0)  <= bit_reverse(rs(31 downto 0));
            end if;
        else
            if is_32bit = '0' then
                inp <= rs;
            else
                inp(63 downto 32) <= x"FFFFFFFF";
                inp(31 downto 0)  <= rs(31 downto 0);
            end if;
        end if;
    end process;

    -- sum is (not inp) + 1 with a carry-out bit; the carry (bit 64) is
    -- set only when inp is all zeroes.
    sum <= std_ulogic_vector(unsigned('0' & not inp) + 1);

    -- ANDing with inp keeps just the least-significant set bit of inp.
    onehot <= sum(63 downto 0) and inp;

    countzero_r: process(clk)
    begin
        if rising_edge(clk) then
            msb_r    <= sum(64);
            onehot_r <= onehot;
        end if;
    end process;

    ------------------------------------------------------------------
    -- Count zeroes, second phase (after the clock edge)
    ------------------------------------------------------------------

    -- bit_number comes from work.helpers; presumably it encodes the
    -- position of the single set bit -- confirm against the helper.
    bitnum <= bit_number(onehot_r);

    -- msb_r supplies bit 6 of the count (the all-zero-input case).
    cntz <= 57x"0" & msb_r & bitnum;

    ------------------------------------------------------------------
    -- Population count, first phase (before the clock edge)
    ------------------------------------------------------------------

    pc2_stage: for pair in 0 to 31 generate
        pc2(pair) <= unsigned("0" & rs(pair * 2 downto pair * 2)) +
                     unsigned("0" & rs(pair * 2 + 1 downto pair * 2 + 1));
    end generate pc2_stage;

    pc4_stage: for nibble in 0 to 15 generate
        pc4(nibble) <= ('0' & pc2(nibble * 2)) + ('0' & pc2(nibble * 2 + 1));
    end generate pc4_stage;

    pc8_stage: for byte in 0 to 7 generate
        pc8(byte) <= ('0' & pc4(byte * 2)) + ('0' & pc4(byte * 2 + 1));
    end generate pc8_stage;

    popcnt_r: process(clk)
    begin
        if rising_edge(clk) then
            pc8_r  <= pc8;
            dlen_r <= datalen;
            pcnt_r <= do_popcnt;
        end if;
    end process;

    ------------------------------------------------------------------
    -- Population count, second phase (after the clock edge)
    ------------------------------------------------------------------

    pc32_stage: for half in 0 to 1 generate
        pc32(half) <= ("00" & pc8_r(half * 4))     + ("00" & pc8_r(half * 4 + 1)) +
                      ("00" & pc8_r(half * 4 + 2)) + ("00" & pc8_r(half * 4 + 3));
    end generate pc32_stage;

    -- Place the counts in the result according to the registered length.
    popcnt_sel: process(all)
        variable packed : std_ulogic_vector(63 downto 0);
    begin
        packed := (others => '0');
        if dlen_r(3) = '0' then
            if dlen_r(2) = '0' then
                -- popcntb: one count in the low nibble of every byte
                for byte in 0 to 7 loop
                    packed(byte * 8 + 3 downto byte * 8) := std_ulogic_vector(pc8_r(byte));
                end loop;
            else
                -- popcntw: one count in the low 6 bits of every word
                for word in 0 to 1 loop
                    packed(word * 32 + 5 downto word * 32) := std_ulogic_vector(pc32(word));
                end loop;
            end if;
        else
            -- whole-operand count: sum of both word counts, 7 bits
            packed(6 downto 0) := std_ulogic_vector(('0' & pc32(0)) + ('0' & pc32(1)));
        end if;
        popcnt <= packed;
    end process;

    ------------------------------------------------------------------
    -- Output select, driven by the registered do_popcnt
    ------------------------------------------------------------------
    with pcnt_r select result <=
        cntz   when '0',
        popcnt when others;

end behaviour;
 | 
			
		||||
@ -1,60 +0,0 @@
 | 
			
		||||
library ieee;
 | 
			
		||||
use ieee.std_logic_1164.all;
 | 
			
		||||
use ieee.numeric_std.all;
 | 
			
		||||
 | 
			
		||||
library work;
 | 
			
		||||
use work.helpers.all;
 | 
			
		||||
 | 
			
		||||
-- zero_counter: count-zeroes unit.
--
-- The architecture registers intermediate values on the rising edge of clk,
-- so `result` reflects the operand and control values that were presented
-- before the most recent clock edge.
entity zero_counter is
    port (
        -- Clock; internal pipeline registers update on the rising edge.
        clk : in std_logic;

        -- 64-bit source operand.
        rs : in std_ulogic_vector(63 downto 0);

        -- '0': the operand is passed through bit_reverse before counting;
        -- otherwise it is used as-is.
        count_right : in std_ulogic;

        -- '0' uses all 64 bits of rs, otherwise only rs(31 downto 0) is
        -- examined (upper half is forced to ones).
        is_32bit : in std_ulogic;

        -- Zero-extended 7-bit count.
        result : out std_ulogic_vector(63 downto 0)
    );
end entity zero_counter;
 | 
			
		||||
 | 
			
		||||
-- Two-phase implementation: the lowest set bit of the (optionally
-- reversed) operand is isolated before the clock edge, and converted
-- to a count after it.
architecture behaviour of zero_counter is
    signal inp      : std_ulogic_vector(63 downto 0);
    signal sum      : std_ulogic_vector(64 downto 0);
    signal msb_r    : std_ulogic;
    signal onehot   : std_ulogic_vector(63 downto 0);
    signal onehot_r : std_ulogic_vector(63 downto 0);
    signal bitnum   : std_ulogic_vector(5 downto 0);

begin
    -- Select the word to scan. When count_right is '0' the operand goes
    -- through bit_reverse (from work.helpers; presumably mirrors the bit
    -- order -- confirm against the helper). In 32-bit mode the upper half
    -- is forced to ones, so bit 32 is the first set bit found when the
    -- lower word is all zeroes.
    countzero_in: process(all)
    begin
        if count_right = '0' then
            if is_32bit = '0' then
                inp <= bit_reverse(rs);
            else
                inp(63 downto 32) <= x"FFFFFFFF";
                inp(31 downto 0)  <= bit_reverse(rs(31 downto 0));
            end if;
        else
            if is_32bit = '0' then
                inp <= rs;
            else
                inp(63 downto 32) <= x"FFFFFFFF";
                inp(31 downto 0)  <= rs(31 downto 0);
            end if;
        end if;
    end process;

    -- sum is (not inp) + 1 with a carry-out bit; the carry (bit 64) is
    -- set only when inp is all zeroes.
    sum <= std_ulogic_vector(unsigned('0' & not inp) + 1);

    -- ANDing with inp keeps just the least-significant set bit of inp.
    onehot <= sum(63 downto 0) and inp;

    countzero_r: process(clk)
    begin
        if rising_edge(clk) then
            msb_r    <= sum(64);
            onehot_r <= onehot;
        end if;
    end process;

    -- The following occurs after a clock edge.
    -- bit_number comes from work.helpers; presumably it encodes the
    -- position of the single set bit -- confirm against the helper.
    bitnum <= bit_number(onehot_r);

    -- msb_r supplies bit 6 of the count (the all-zero-input case).
    result <= x"00000000000000" & "0" & msb_r & bitnum;

end behaviour;
 | 
			
		||||
					Loading…
					
					
				
		Reference in New Issue