From c7025f9f284ec51e7d97f678bde59afa39c0f349 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 15 Oct 2019 10:29:53 +1100 Subject: [PATCH 1/2] divider: Add an output register This puts the output of the divider through a register. With the addition of the logic to detect overflow, the combinatorial output logic of the divider was becoming a critical path. Adding the output register adds a cycle to the latency of the divider but helps make timing at 100MHz on the A7-100. This also makes the valid, write_reg_enable and write_cr_enable fields of the output be registered, which eliminates warnings about register/latch pins with no clock. Signed-off-by: Paul Mackerras --- divider.vhdl | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/divider.vhdl b/divider.vhdl index a2a35b0..f9b34c8 100644 --- a/divider.vhdl +++ b/divider.vhdl @@ -22,6 +22,7 @@ architecture behaviour of divider is signal quot : std_ulogic_vector(63 downto 0); signal result : std_ulogic_vector(63 downto 0); signal sresult : std_ulogic_vector(63 downto 0); + signal oresult : std_ulogic_vector(63 downto 0); signal qbit : std_ulogic; signal running : std_ulogic; signal signcheck : std_ulogic; @@ -35,6 +36,7 @@ architecture behaviour of divider is signal write_reg : std_ulogic_vector(4 downto 0); signal overflow : std_ulogic; signal did_ovf : std_ulogic; + signal cr_data : std_ulogic_vector(2 downto 0); begin divider_0: process(clk) @@ -106,8 +108,8 @@ begin divider_1: process(all) begin - d_out <= DividerToWritebackInit; d_out.write_reg_nr <= write_reg; + d_out.write_cr_mask <= num_to_fxm(0); if is_modulus = '1' then result <= dend(128 downto 65); @@ -132,27 +134,36 @@ begin did_ovf <= overflow or (or (sresult(63 downto 32))); end if; if did_ovf = '1' then - d_out.write_reg_data <= (others => '0'); + oresult <= (others => '0'); elsif (is_32bit = '1') and (is_modulus = '0') then -- 32-bit divisions set the top 32 bits of the result to 0 - d_out.write_reg_data <= x"00000000" & sresult(31 downto 0); + oresult <= x"00000000" & sresult(31 downto 0); else - d_out.write_reg_data <= sresult; + oresult <= sresult; end if; - if count = "1000000" then - d_out.valid <= '1'; - d_out.write_reg_enable <= '1'; - if rc = '1' then - d_out.write_cr_enable <= '1'; - d_out.write_cr_mask <= num_to_fxm(0); - if (did_ovf = '1') or (or (sresult) = '0') then - d_out.write_cr_data <= x"20000000"; - elsif (sresult(63) = '1') and not ((is_32bit = '1') and (is_modulus = '0')) then - d_out.write_cr_data <= x"80000000"; - else - d_out.write_cr_data <= x"40000000"; - end if; + if (did_ovf = '1') or (or (sresult) = '0') then + cr_data <= "001"; + elsif (sresult(63) = '1') and not ((is_32bit = '1') and (is_modulus = '0')) then + cr_data <= "100"; + else + cr_data <= "010"; + end if; + end process; + + divider_out: process(clk) + begin + if rising_edge(clk) then + d_out.write_reg_data <= oresult; + d_out.write_cr_data <= cr_data & '0' & x"0000000"; + if count = "1000000" then + d_out.valid <= '1'; + d_out.write_reg_enable <= '1'; + d_out.write_cr_enable <= rc; + else + d_out.valid <= '0'; + d_out.write_reg_enable <= '0'; + d_out.write_cr_enable <= '0'; end if; end if; end process; From 82c19d4e7aedaa95cdb1793fd2ee67b38ff79e5b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 15 Oct 2019 14:59:15 +1100 Subject: [PATCH 2/2] divider: Reduce delay in detecting 32-bit overflow Timing analysis showed that even with the output register, timing was still a bit tight in the output stage, where the carry has to propagate all the way through the 64-bit negater, and we were then testing the top 33 bits to determine if a 32-bit operation had overflowed. Instead of detecting overflow at the end, we watch for any 1 bits getting shifted into the top 32 bits of the quotient register as we are doing the division. That is relatively easy to do and simplifies the output stage. Signed-off-by: Paul Mackerras --- divider.vhdl | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/divider.vhdl b/divider.vhdl index f9b34c8..cfadc51 100644 --- a/divider.vhdl +++ b/divider.vhdl @@ -35,6 +35,7 @@ architecture behaviour of divider is signal rc : std_ulogic; signal write_reg : std_ulogic_vector(4 downto 0); signal overflow : std_ulogic; + signal ovf32 : std_ulogic; signal did_ovf : std_ulogic; signal cr_data : std_ulogic_vector(2 downto 0); @@ -66,6 +67,7 @@ begin count <= "1111111"; running <= '1'; overflow <= '0'; + ovf32 <= '0'; signcheck <= d_in.is_signed and (d_in.dividend(63) or d_in.divisor(63)); elsif signcheck = '1' then signcheck <= '0'; @@ -86,16 +88,19 @@ begin end if; overflow <= quot(63); if dend(128) = '1' or unsigned(dend(127 downto 64)) >= div then + ovf32 <= ovf32 or quot(31); dend <= std_ulogic_vector(unsigned(dend(127 downto 64)) - div) & dend(63 downto 0) & '0'; quot <= quot(62 downto 0) & '1'; count <= count + 1; elsif dend(128 downto 57) = x"000000000000000000" and count(6 downto 3) /= "0111" then -- consume 8 bits of zeroes in one cycle + ovf32 <= or (ovf32 & quot(31 downto 24)); dend <= dend(120 downto 0) & x"00"; quot <= quot(55 downto 0) & x"00"; count <= count + 8; else + ovf32 <= ovf32 or quot(31); dend <= dend(127 downto 0) & '0'; quot <= quot(62 downto 0) & '0'; count <= count + 1; @@ -125,13 +130,11 @@ begin if is_32bit = '0' then did_ovf <= overflow or (is_signed and (sresult(63) xor neg_result)); elsif is_signed = '1' then - if overflow = '1' or - (sresult(63 downto 31) /= x"00000000" & '0' and - sresult(63 downto 31) /= x"ffffffff" & '1') then + if ovf32 = '1' or sresult(32) /= sresult(31) then did_ovf <= '1'; end if; else - did_ovf <= overflow or (or (sresult(63 downto 32))); + did_ovf <= ovf32; end if; if did_ovf = '1' then oresult <= (others => '0');