From 9cce9362519b69106936ce437bd1dfe4f27dee4b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 28 Jul 2020 16:07:25 +1000
Subject: [PATCH] FPU: Implement fdiv[s]

This implements floating-point division A/B by a process that starts
with normalizing both inputs if necessary.  Then an estimate of 1/B
from a lookup table is refined by 3 Newton-Raphson iterations and then
multiplied by A to get a quotient.  The remainder is calculated as
A - R * B (where R is the result, i.e. the quotient) and the remainder
is compared to 0 and to B to see whether the quotient needs to be
incremented by 1.  The calculations of 1 / B are done with 56 fraction
bits and intermediate results are truncated rather than rounded,
meaning that the final estimate of 1 / B is always correct or a little
bit low, never too high, and thus the calculated quotient is correct
or 1 unit too low.  Doing the estimate of 1 / B with sufficient
precision that the quotient is always correct to the last bit without
needing any adjustment would require many more bits of precision.

This implements fdivs by computing a double-precision quotient and
then rounding it to single precision.  It would be possible to
optimize this by e.g. doing only 2 iterations of Newton-Raphson and
then doing the remainder calculation and adjustment at single
precision rather than double precision.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl               |   2 +
 fpu.vhdl                   | 292 +++++++++++++++++++++++++++++++++++--
 tests/fpu/fpu.c            |  39 +++++
 tests/test_fpu.bin         | Bin 24272 -> 24416 bytes
 tests/test_fpu.console_out |   1 +
 5 files changed, 323 insertions(+), 11 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 721c478..ddcbb3c 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -416,6 +416,7 @@ architecture behaviour of decode1 is
         --             unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
         --                          op                               in   out   A   out  in    out  len        ext                                pipe
         2#01110#  =>  (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fcfid[u]s
+        2#10010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fdivs
         2#10100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsubs
         2#10101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fadds
         2#11001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmuls
@@ -469,6 +470,7 @@ architecture behaviour of decode1 is
     constant decode_op_63h_array : op_63_subop_array_1_t := (
         --            unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
         --                         op                               in   out   A   out  in    out  len        ext                                pipe
+        2#0010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fdiv
         2#0100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsub
         2#0101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fadd
         2#1001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmul
diff --git a/fpu.vhdl b/fpu.vhdl
index 209daa0..2584e1c 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -40,10 +40,12 @@ architecture behaviour of fpu is
                      DO_FMR, DO_FMRG,
                      DO_FCFID, DO_FCTI,
                      DO_FRSP, DO_FRI,
-                     DO_FADD, DO_FMUL,
+                     DO_FADD, DO_FMUL, DO_FDIV,
                      FRI_1,
                      ADD_SHIFT, ADD_2, ADD_3,
                      MULT_1,
+                     LOOKUP,
+                     DIV_2, DIV_3, DIV_4, DIV_5, DIV_6,
                      INT_SHIFT, INT_ROUND, INT_ISHIFT,
                      INT_FINAL, INT_CHECK, INT_OFLOW,
                      FINISH, NORMALIZE,
@@ -51,6 +53,7 @@ architecture behaviour of fpu is
                      ROUNDING, ROUNDING_2, ROUNDING_3,
                      DENORM,
                      RENORM_A, RENORM_A2,
+                     RENORM_B, RENORM_B2,
                      RENORM_C, RENORM_C2);
 
     type reg_type is record
@@ -72,6 +75,7 @@ architecture behaviour of fpu is
         r            : std_ulogic_vector(63 downto 0);  -- 10.54 format
         x            : std_ulogic;
         p            : std_ulogic_vector(63 downto 0);  -- 8.56 format
+        y            : std_ulogic_vector(63 downto 0);  -- 8.56 format
         result_sign  : std_ulogic;
         result_class : fp_number_class;
         result_exp   : signed(EXP_BITS-1 downto 0);
@@ -91,8 +95,11 @@ architecture behaviour of fpu is
         add_bsmall   : std_ulogic;
         is_multiply  : std_ulogic;
         first        : std_ulogic;
+        count        : unsigned(1 downto 0);
     end record;
 
+    type lookup_table is array(0 to 255) of std_ulogic_vector(17 downto 0);
+
     signal r, rin : reg_type;
 
     signal fp_result     : std_ulogic_vector(63 downto 0);
@@ -114,7 +121,9 @@ architecture behaviour of fpu is
     signal multiply_to_f : MultiplyOutputType;
     signal msel_1        : std_ulogic_vector(1 downto 0);
     signal msel_2        : std_ulogic_vector(1 downto 0);
+    signal msel_add      : std_ulogic_vector(1 downto 0);
     signal msel_inv      : std_ulogic;
+    signal inverse_est   : std_ulogic_vector(18 downto 0);
 
     -- opsel values
     constant AIN_R    : std_ulogic_vector(1 downto 0) := "00";
@@ -134,11 +143,61 @@ architecture behaviour of fpu is
     -- msel values
     constant MUL1_A : std_ulogic_vector(1 downto 0) := "00";
     constant MUL1_B : std_ulogic_vector(1 downto 0) := "01";
+    constant MUL1_Y : std_ulogic_vector(1 downto 0) := "10";
     constant MUL1_R : std_ulogic_vector(1 downto 0) := "11";
 
     constant MUL2_C   : std_ulogic_vector(1 downto 0) := "00";
+    constant MUL2_LUT : std_ulogic_vector(1 downto 0) := "01";
+    constant MUL2_P   : std_ulogic_vector(1 downto 0) := "10";
     constant MUL2_R   : std_ulogic_vector(1 downto 0) := "11";
 
+    constant MULADD_ZERO : std_ulogic_vector(1 downto 0) := "00";
+    constant MULADD_CONST : std_ulogic_vector(1 downto 0) := "01";
+    constant MULADD_A     : std_ulogic_vector(1 downto 0) := "10";
+
+    -- Inverse lookup table, indexed by the top 8 fraction bits
+    -- Output range is [0.5, 1) in 0.19 format, though the top
+    -- bit isn't stored since it is always 1.
+    -- Each output value is the inverse of the center of the input
+    -- range for the value, i.e. entry 0 is 1 / (1 + 1/512),
+    -- entry 1 is 1 / (1 + 3/512), etc.
+    constant inverse_table : lookup_table := (
+        -- 1/x lookup table (never written, so declared as a constant ROM)
+        -- Unit bit is assumed to be 1, so input range is [1, 2)
+        18x"3fc01", 18x"3f411", 18x"3ec31", 18x"3e460", 18x"3dc9f", 18x"3d4ec", 18x"3cd49", 18x"3c5b5",
+        18x"3be2f", 18x"3b6b8", 18x"3af4f", 18x"3a7f4", 18x"3a0a7", 18x"39968", 18x"39237", 18x"38b14",
+        18x"383fe", 18x"37cf5", 18x"375f9", 18x"36f0a", 18x"36828", 18x"36153", 18x"35a8a", 18x"353ce",
+        18x"34d1e", 18x"3467a", 18x"33fe3", 18x"33957", 18x"332d7", 18x"32c62", 18x"325f9", 18x"31f9c",
+        18x"3194a", 18x"31303", 18x"30cc7", 18x"30696", 18x"30070", 18x"2fa54", 18x"2f443", 18x"2ee3d",
+        18x"2e841", 18x"2e250", 18x"2dc68", 18x"2d68b", 18x"2d0b8", 18x"2caee", 18x"2c52e", 18x"2bf79",
+        18x"2b9cc", 18x"2b429", 18x"2ae90", 18x"2a900", 18x"2a379", 18x"29dfb", 18x"29887", 18x"2931b",
+        18x"28db8", 18x"2885e", 18x"2830d", 18x"27dc4", 18x"27884", 18x"2734d", 18x"26e1d", 18x"268f6",
+        18x"263d8", 18x"25ec1", 18x"259b3", 18x"254ac", 18x"24fad", 18x"24ab7", 18x"245c8", 18x"240e1",
+        18x"23c01", 18x"23729", 18x"23259", 18x"22d90", 18x"228ce", 18x"22413", 18x"21f60", 18x"21ab4",
+        18x"2160f", 18x"21172", 18x"20cdb", 18x"2084b", 18x"203c2", 18x"1ff40", 18x"1fac4", 18x"1f64f",
+        18x"1f1e1", 18x"1ed79", 18x"1e918", 18x"1e4be", 18x"1e069", 18x"1dc1b", 18x"1d7d4", 18x"1d392",
+        18x"1cf57", 18x"1cb22", 18x"1c6f3", 18x"1c2ca", 18x"1bea7", 18x"1ba8a", 18x"1b672", 18x"1b261",
+        18x"1ae55", 18x"1aa50", 18x"1a64f", 18x"1a255", 18x"19e60", 18x"19a70", 18x"19686", 18x"192a2",
+        18x"18ec3", 18x"18ae9", 18x"18715", 18x"18345", 18x"17f7c", 18x"17bb7", 18x"177f7", 18x"1743d",
+        18x"17087", 18x"16cd7", 18x"1692c", 18x"16585", 18x"161e4", 18x"15e47", 18x"15ab0", 18x"1571d",
+        18x"1538e", 18x"15005", 18x"14c80", 18x"14900", 18x"14584", 18x"1420d", 18x"13e9b", 18x"13b2d",
+        18x"137c3", 18x"1345e", 18x"130fe", 18x"12da2", 18x"12a4a", 18x"126f6", 18x"123a7", 18x"1205c",
+        18x"11d15", 18x"119d2", 18x"11694", 18x"11359", 18x"11023", 18x"10cf1", 18x"109c2", 18x"10698",
+        18x"10372", 18x"10050", 18x"0fd31", 18x"0fa17", 18x"0f700", 18x"0f3ed", 18x"0f0de", 18x"0edd3",
+        18x"0eacb", 18x"0e7c7", 18x"0e4c7", 18x"0e1ca", 18x"0ded2", 18x"0dbdc", 18x"0d8eb", 18x"0d5fc",
+        18x"0d312", 18x"0d02b", 18x"0cd47", 18x"0ca67", 18x"0c78a", 18x"0c4b1", 18x"0c1db", 18x"0bf09",
+        18x"0bc3a", 18x"0b96e", 18x"0b6a5", 18x"0b3e0", 18x"0b11e", 18x"0ae5f", 18x"0aba3", 18x"0a8eb",
+        18x"0a636", 18x"0a383", 18x"0a0d4", 18x"09e28", 18x"09b80", 18x"098da", 18x"09637", 18x"09397",
+        18x"090fb", 18x"08e61", 18x"08bca", 18x"08936", 18x"086a5", 18x"08417", 18x"0818c", 18x"07f04",
+        18x"07c7e", 18x"079fc", 18x"0777c", 18x"074ff", 18x"07284", 18x"0700d", 18x"06d98", 18x"06b26",
+        18x"068b6", 18x"0664a", 18x"063e0", 18x"06178", 18x"05f13", 18x"05cb1", 18x"05a52", 18x"057f5",
+        18x"0559a", 18x"05342", 18x"050ed", 18x"04e9a", 18x"04c4a", 18x"049fc", 18x"047b0", 18x"04567",
+        18x"04321", 18x"040dd", 18x"03e9b", 18x"03c5c", 18x"03a1f", 18x"037e4", 18x"035ac", 18x"03376",
+        18x"03142", 18x"02f11", 18x"02ce2", 18x"02ab5", 18x"0288b", 18x"02663", 18x"0243d", 18x"02219",
+        18x"01ff7", 18x"01dd8", 18x"01bbb", 18x"019a0", 18x"01787", 18x"01570", 18x"0135b", 18x"01149",
+        18x"00f39", 18x"00d2a", 18x"00b1e", 18x"00914", 18x"0070c", 18x"00506", 18x"00302", 18x"00100"
+        );
+
     -- Left and right shifter with 120 bit input and 64 bit output.
     -- Shifts inp left by shift bits and returns the upper 64 bits of
     -- the result.  The shift parameter is interpreted as a signed
@@ -359,6 +418,14 @@ begin
         end if;
     end process;
 
+    -- Synchronous read of the 1/B estimate: index by B's top 8 fraction bits, prepend the implied leading 1
+    lut_access: process(clk)
+    begin
+        if rising_edge(clk) then    -- registered, so inverse_est lags r.b by one clock
+            inverse_est <= '1' & inverse_table(to_integer(unsigned(r.b.mantissa(53 downto 46))));
+        end if;
+    end process;
+
     e_out.busy <= r.busy;
     e_out.exception <= r.fpscr(FPSCR_FEX);
     e_out.interrupt <= r.do_intr;
@@ -391,6 +458,7 @@ begin
         variable update_fx   : std_ulogic;
         variable arith_done  : std_ulogic;
         variable invalid     : std_ulogic;
+        variable zero_divide : std_ulogic;
         variable mant_nz     : std_ulogic;
         variable min_exp     : signed(EXP_BITS-1 downto 0);
         variable max_exp     : signed(EXP_BITS-1 downto 0);
@@ -408,9 +476,14 @@ begin
         variable qnan_result : std_ulogic;
         variable longmask    : std_ulogic;
         variable set_a       : std_ulogic;
+        variable set_b       : std_ulogic;
         variable set_c       : std_ulogic;
         variable px_nz       : std_ulogic;
         variable maddend     : std_ulogic_vector(127 downto 0);
+        variable set_y       : std_ulogic;
+        variable pcmpb_eq    : std_ulogic;
+        variable pcmpb_lt    : std_ulogic;
+        variable pshift      : std_ulogic;
     begin
         v := r;
         illegal := '0';
@@ -478,8 +551,16 @@ begin
             exp_huge := '1';
         end if;
 
-        -- Compare P with zero
+        -- Compare P with zero and with B
         px_nz := or (r.p(57 downto 4));
+        pcmpb_eq := '0';
+        if r.p(59 downto 4) = r.b.mantissa(55 downto 0) then
+            pcmpb_eq := '1';
+        end if;
+        pcmpb_lt := '0';
+        if unsigned(r.p(59 downto 4)) < unsigned(r.b.mantissa(55 downto 0)) then
+            pcmpb_lt := '1';
+        end if;
 
         v.writing_back := '0';
         v.instr_done := '0';
@@ -498,18 +579,22 @@ begin
         update_fx := '0';
         arith_done := '0';
         invalid := '0';
+        zero_divide := '0';
         renormalize := '0';
         set_x := '0';
         qnan_result := '0';
         longmask := r.single_prec;
         set_a := '0';
+        set_b := '0';
         set_c := '0';
         f_to_multiply.is_32bit <= '0';
         f_to_multiply.valid <= '0';
         msel_1 <= MUL1_A;
         msel_2 <= MUL2_C;
+        msel_add <= MULADD_ZERO;
         msel_inv <= '0';
-
+        set_y := '0';
+        pshift := '0';
         case r.state is
             when IDLE =>
                 if e_in.valid = '1' then
@@ -550,6 +635,8 @@ begin
                         when "01111" =>
                             v.round_mode := "001";
                             v.state := DO_FCTI;
+                        when "10010" =>
+                            v.state := DO_FDIV;
                         when "10100" | "10101" =>
                             v.state := DO_FADD;
                         when "11001" =>
@@ -897,6 +984,63 @@ begin
                     arith_done := '1';
                 end if;
 
+            when DO_FDIV =>
+                -- Division A / B.  The sign and exponent set here are those of a
+                -- finite quotient; the special-operand branch below overrides them.
+                opsel_a <= AIN_A;
+                v.result_class := r.a.class;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                v.result_sign := r.a.negative xor r.b.negative;
+                v.result_exp := r.a.exponent - r.b.exponent;
+                v.count := "00";
+                if r.a.class = FINITE and r.b.class = FINITE then
+                    -- Renormalize denorm operands
+                    if r.a.mantissa(54) = '0' then
+                        v.state := RENORM_A;
+                    elsif r.b.mantissa(54) = '0' then
+                        opsel_a <= AIN_B;
+                        v.state := RENORM_B;
+                    else
+                        v.first := '1';
+                        v.state := DIV_2;
+                    end if;
+                else
+                    if (r.a.class = NAN and r.a.mantissa(53) = '0') or
+                        (r.b.class = NAN and r.b.mantissa(53) = '0') then
+                        -- Signalling NAN
+                        v.fpscr(FPSCR_VXSNAN) := '1';
+                        invalid := '1';
+                    end if;
+                    if r.a.class = NAN then
+                        -- result is A
+                        v.result_sign := r.a.negative;
+                    elsif r.b.class = NAN then
+                        v.result_class := NAN;
+                        v.result_sign := r.b.negative;
+                        opsel_a <= AIN_B;
+                    elsif r.b.class = INFINITY then
+                        if r.a.class = INFINITY then
+                            v.fpscr(FPSCR_VXIDI) := '1';    -- infinity / infinity
+                            qnan_result := '1';
+                        else
+                            v.result_class := ZERO;         -- finite or zero / infinity
+                        end if;
+                    elsif r.b.class = ZERO then
+                        if r.a.class = ZERO then
+                            v.fpscr(FPSCR_VXZDZ) := '1';    -- zero / zero
+                            qnan_result := '1';
+                        else
+                            if r.a.class = FINITE then
+                                zero_divide := '1';         -- not raised for infinity / zero
+                            end if;
+                            v.result_class := INFINITY;
+                        end if;
+                    -- else r.b.class = FINITE, result_class = r.a.class
+                    end if;
+                    arith_done := '1';
+                end if;
+
             when RENORM_A =>
                 renormalize := '1';
                 v.state := RENORM_A2;
@@ -904,14 +1048,33 @@ begin
             when RENORM_A2 =>
                 set_a := '1';
                 v.result_exp := new_exp;
-                opsel_a <= AIN_C;
-                if r.c.mantissa(54) = '1' then
-                    v.first := '1';
-                    v.state := MULT_1;
+                if r.insn(4) = '1' then     -- this arm continues to MULT_1, so C is checked next
+                    opsel_a <= AIN_C;
+                    if r.c.mantissa(54) = '1' then
+                        v.first := '1';
+                        v.state := MULT_1;
+                    else
+                        v.state := RENORM_C;
+                    end if;
                 else
-                    v.state := RENORM_C;
+                    opsel_a <= AIN_B;       -- this arm continues to DIV_2, so B is checked next
+                    if r.b.mantissa(54) = '1' then
+                        v.first := '1';
+                        v.state := DIV_2;
+                    else
+                        v.state := RENORM_B;
+                    end if;
                 end if;
 
+            when RENORM_B =>
+                renormalize := '1';
+                v.state := RENORM_B2;
+
+            when RENORM_B2 =>
+                set_b := '1';
+                v.result_exp := r.result_exp + r.shift;
+                v.state := LOOKUP;
+
             when RENORM_C =>
                 renormalize := '1';
                 v.state := RENORM_C2;
@@ -982,6 +1145,82 @@ begin
                     v.state := FINISH;
                 end if;
 
+            when LOOKUP =>
+                opsel_a <= AIN_B;
+                -- wait one cycle for the clocked inverse_table[B] read to reach inverse_est
+                v.first := '1';
+                v.state := DIV_2;
+
+            when DIV_2 =>
+                -- Y = inverse_table[B] (when count=0) or the previous P; compute P = 2 - B * Y
+                msel_1 <= MUL1_B;
+                msel_add <= MULADD_CONST;
+                msel_inv <= '1';
+                if r.count = 0 then
+                    msel_2 <= MUL2_LUT;
+                else
+                    msel_2 <= MUL2_P;
+                end if;
+                set_y := r.first;       -- Y latches the second multiplier operand chosen above
+                pshift := '1';          -- P is taken from product bits 119..56
+                f_to_multiply.valid <= r.first;
+                if multiply_to_f.valid = '1' then
+                    v.first := '1';
+                    v.count := r.count + 1;
+                    v.state := DIV_3;
+                end if;
+
+            when DIV_3 =>
+                -- compute P = P * Y; the following DIV_2 or DIV_4 copies this P into Y
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_P;
+                f_to_multiply.valid <= r.first;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    v.first := '1';
+                    if r.count = 3 then     -- DIV_2 has completed three times: go form the quotient
+                        v.state := DIV_4;
+                    else
+                        v.state := DIV_2;
+                    end if;
+                end if;
+
+            when DIV_4 =>
+                -- Y = P (final estimate of 1/B); compute R = P = A * Y (quotient)
+                msel_1 <= MUL1_A;
+                msel_2 <= MUL2_P;
+                set_y := r.first;
+                f_to_multiply.valid <= r.first;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    opsel_r <= RES_MULT;    -- NOTE(review): presumably loads the product into R; selector is outside this hunk
+                    v.first := '1';
+                    v.state := DIV_5;
+                end if;
+
+            when DIV_5 =>
+                -- compute P = A - B * R (remainder)
+                msel_1 <= MUL1_B;
+                msel_2 <= MUL2_R;
+                msel_add <= MULADD_A;
+                msel_inv <= '1';
+                f_to_multiply.valid <= r.first;     -- pshift stays '0': P is product bits 63..0
+                if multiply_to_f.valid = '1' then
+                    v.state := DIV_6;
+                end if;
+
+            when DIV_6 =>
+                -- test if remainder is 0 or >= B
+                if pcmpb_lt = '1' then
+                    -- quotient is correct, set X if remainder non-zero
+                    v.x := r.p(58) or px_nz;        -- px_nz covers P bits 57..4
+                else
+                    -- quotient needs to be incremented by 1
+                    carry_in <= '1';
+                    v.x := not pcmpb_eq;            -- remainder == B means the incremented quotient is exact
+                end if;
+                v.state := FINISH;
+
             when INT_SHIFT =>
                 opsel_r <= RES_SHIFT;
                 set_x := '1';
@@ -1218,6 +1457,9 @@ begin
 
         end case;
 
+        if zero_divide = '1' then
+            v.fpscr(FPSCR_ZX) := '1';
+        end if;
         if qnan_result = '1' then
             invalid := '1';
             v.result_class := NAN;
@@ -1227,7 +1469,9 @@ begin
         end if;
         if arith_done = '1' then
             -- Enabled invalid exception doesn't write result or FPRF
-            if (invalid and r.fpscr(FPSCR_VE)) = '0' then
+            -- Neither does enabled zero-divide exception
+            if (invalid and r.fpscr(FPSCR_VE)) = '0' and
+                (zero_divide and r.fpscr(FPSCR_ZE)) = '0' then
                 v.writing_back := '1';
                 v.update_fprf := '1';
             end if;
@@ -1236,30 +1480,52 @@ begin
             update_fx := '1';
         end if;
 
-        -- Multiplier data path
+        -- Multiplier and divide/square root data path
         case msel_1 is
             when MUL1_A =>
                 f_to_multiply.data1 <= r.a.mantissa(61 downto 0) & "00";
             when MUL1_B =>
                 f_to_multiply.data1 <= r.b.mantissa(61 downto 0) & "00";
+            when MUL1_Y =>
+                f_to_multiply.data1 <= r.y;
             when others =>
                 f_to_multiply.data1 <= r.r(61 downto 0) & "00";
         end case;
         case msel_2 is
             when MUL2_C =>
                 f_to_multiply.data2 <= r.c.mantissa(61 downto 0) & "00";
+            when MUL2_LUT =>
+                f_to_multiply.data2 <= x"00" & inverse_est & '0' & x"000000000";
+            when MUL2_P =>
+                f_to_multiply.data2 <= r.p;
             when others =>
                 f_to_multiply.data2 <= r.r(61 downto 0) & "00";
         end case;
         maddend := (others => '0');
+        case msel_add is
+            when MULADD_CONST =>
+                -- addend is 2.0 in 16.112 format
+                maddend(113) := '1';                -- 2.0
+            when MULADD_A =>
+                -- addend is A in 16.112 format
+                maddend(121 downto 58) := r.a.mantissa;
+            when others =>
+        end case;
         if msel_inv = '1' then
             f_to_multiply.addend <= not maddend;
         else
             f_to_multiply.addend <= maddend;
         end if;
         f_to_multiply.not_result <= msel_inv;
+        if set_y = '1' then
+            v.y := f_to_multiply.data2;
+        end if;
         if multiply_to_f.valid = '1' then
-            v.p := multiply_to_f.result(63 downto 0);
+            if pshift = '0' then
+                v.p := multiply_to_f.result(63 downto 0);
+            else
+                v.p := multiply_to_f.result(119 downto 56);
+            end if;
         end if;
 
         -- Data path.
@@ -1378,6 +1644,10 @@ begin
             v.a.exponent := new_exp;
             v.a.mantissa := shift_res;
         end if;
+        if set_b = '1' then
+            v.b.exponent := new_exp;
+            v.b.mantissa := shift_res;
+        end if;
         if set_c = '1' then
             v.c.exponent := new_exp;
             v.c.mantissa := shift_res;
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index 305359a..cbb0ee2 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -1007,6 +1007,7 @@ struct mulvals {
 	{ 0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000 },
 	{ 0xbf4fff801fffffff, 0x6d7fffff8000007f, 0xecdfff7fa001fffe },
 	{ 0x3fbd50275a65ed80, 0x0010000000000000, 0x0001d50275a65ed8 },
+	{ 0x3fe95d8937acf1ce, 0x0000000000000001, 0x0000000000000001 },
 };
 
 int test15(long arg)
@@ -1073,6 +1074,43 @@ int fpu_test_16(void)
 	return trapit(0, test16);
 }
 
+struct divvals {
+	unsigned long val_a;	/* dividend, raw IEEE 754 double bits */
+	unsigned long val_b;	/* divisor, raw IEEE 754 double bits */
+	unsigned long quot;	/* expected val_a / val_b, round-to-nearest */
+} divvals[] = {
+	{ 0x3ff0000000000000, 0x0000000000000000, 0x7ff0000000000000 },	/* 1.0 / +0.0 = +inf */
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 },	/* 1.0 / 1.0 = 1.0 */
+	{ 0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000 },	/* -1.0 / 1.0 = -1.0 */
+	{ 0x4000000000000000, 0x4008000000000000, 0x3fe5555555555555 },	/* 2.0 / 3.0, inexact */
+	{ 0xc01fff0007ffffff, 0xc03ffffffdffffbf, 0x3fcfff0009fff041 },
+};
+
+int test17(long arg)	/* fdiv on each divvals entry; returns 0 or 1 + index of first mismatch */
+{
+	long i;
+	unsigned long result;
+	struct divvals *vp = divvals;
+
+	set_fpscr(FPS_RN_NEAR);
+	for (i = 0; i < sizeof(divvals) / sizeof(divvals[0]); ++i, ++vp) {
+		asm("lfd 5,0(%0); lfd 6,8(%0); fdiv 7,5,6; stfd 7,0(%1)"
+		    : : "b" (&vp->val_a), "b" (&result) : "memory");
+		if (result != vp->quot) {
+			print_hex(i, 2, " ");
+			print_hex(result, 16, " ");
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
+int fpu_test_17(void)
+{
+	enable_fp();
+	return trapit(0, test17);
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -1114,6 +1152,7 @@ int main(void)
 	do_test(14, fpu_test_14);
 	do_test(15, fpu_test_15);
 	do_test(16, fpu_test_16);
+	do_test(17, fpu_test_17);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index 1e0e29e0c174fe7fcc993071b35fef6086d1a0c0..dc5af293a27cd0d08eda8272695afbe7e9ba89c5 100755
GIT binary patch
delta 2394
zcmai0drVVj6hHUUmSS3Mh4QGSFAxE-77@6tC>0swp~xH>r&FuwoO{I?XK+Jcrkb)u
z10J{F!a&Vt*s>*<?&abbd$`D^jwSvf@r67l;A5~Uc250vzFVTsnRt?ubI$Mkedjy(
zeCOV_^BbYNP0$0lX8{zg`YE(%3)!_b_4;rTJoO1PMA%oqG)<UxXxG;h3Ws;vvj7Hv
z0q{sxs3`$*>Y8Qj%oWCpc^t%Yvx3|X;{JT5^P#l>*6X5db<Me10m^u-q^26+?K2Hy
ztYAj)G+>8rs_1LNQQbl^{vv&eC?2!md*KOTd@ny~7(2qJ8?+WEIK*rL#@bzjcY}4*
zp*A8@3$y}|GZ^GP9Wgg7u=9XRLG|Cl_an-M1{4fwh7+++@SVK>Kn`Quasb+0SFq6F
z5|6~<4#P{z^<L@IA^bSXf$@>iLJQ7{Tp%>!&d3E~#4<b|St8Cfp(ARuvTnIo`Wr{0
zGO;^KBedexr~<`Wue4r<HbiTMbyyXhuY7r>SL#B2Oy=&GGUaiPS1RNI*2VBW@5el)
zY+UV?4r8h@4Z95rv>G)+8BQ}&R)a#nJ-~{o$ZUQInl)Z2iFld9EWBu(jJu2r#iKXz
zMPq!ZrxL=-J@ZjJ$*))v1qHj~OxQiiq&1QMPB`xhc>3^M_#|;}Bqo?vi288cVM?B;
zjf8^z)C+k%p{j!V|Gmi}`ZnU9rhM^713nR(Eb4aS^RWvf>o%54vD9xXtQuhKK(-EQ
zBqE-S%@-90j5NP01aP~#cp~R1@9?q@WGjN49&@s&iNNs|ny_7%YuRP2+xVeW@lS7c
z;qCt=9J1sK1DI<4ftN9AO*U}uom;F}fNJTp(DKSsut|NGZd)g+_hOSRm-BjTL7x60
zUI1<OW!g}1%L8;Uq;P8hYwc-#OPf7ebn0-(zF63YbK+!db6j#*#RMo2fVng{VxMbQ
zKCrbyej~bFRuCxNN5|CAaV*}6O?I=xH}*VE*dG|Y8GI)42Y8dZD1NZ^_VIM613Tj)
zv<=iN9SDKn=nhhGBrbef*SO$ql@3=|I-y*C;G<*M6Ep;e$9yL>Aq7+8UBVW8CEj1X
zj|$MhHaG8j3;F!u^ilqHxKcQlNVoPVj-EKPX$>G-<~__&Q=Y3gc%E7Ex&*6g9>JL?
zbr4B_D!Jf&RNN*!G#Q2c_+-MPi(7vE@x^5k=W)du<UEflZsf237T1WiQ*H{+;#-Mn
z!W#TMF-<IJ_75d)4ykSp-pD|JUA%lSL|xf>jqkL|<HOkM?5}9OrrBP<mvG)!CH1F$
z?D9_#c2InylH<Jp#|S?xV{hKWItlxbu*v<b_;nf#H?@lM`H)ku^?V%e9bQR9bVgZ)
zF28<Ss1U=y1bP~RvZY>0kY7cc#9!TQXmd;nYm`6E;PK(pj_Ui7zsvEAplxjKk-%EC
z$_hnWe3<2QX?@N1!A}%W`SAkS>ch93;gdNMLi4VHq^#hIdn?Cu55{=dxxf%SDn11u
zsTQDiIxoWKk4;@ERBt-VSRch)bfm<AymO3|a)5+mZqgdUhRJf_SQ+Va=NW6^013xl
zARQ*mN|p=9DoJk>7S90^ZtF-j7Z^JWfd9;}iR~BAoaPXZcVh{eyo*?sW)@$*h)rZR
z^k5sAh8{dmro9L6k{Reh^Gvg->&4k*q+VP>#?y;cWPH8YM5eVD+sK^m#q(sK5ATvm
z?4x~?lp;VWdFe(~fZUcooSklri4LKv`WZ8k;bJ9CAxw?0rCU^`<dyZ~yXkgS1-YC0
zu`NAW)k1D(KVC~uP@N*T;CD1;*j2a5bzj0<a^n>Mr!L`(8FuA7ay3|=VNq=$@Ax2o
zkzrQtA$QLZ-pw$M<L7-vVdCX`;aLig@X)LTCDn|!%s68rc^}XZI_GoABc-v1)+@Lo
zGgsv$*L{uL*{Qt!$0_Xm<Ni}|teddYgz=qR=8;|}OpOMqSed5+Sc7Y&DTWQ?MPH{;
z5Qy;Pc&mO+mG~6ym*SFU-eAnfx03LTd}vw-^KyVBk<?8ZjaQ|qN%KY-;|Ledt|9dh
zmcsLraBL50g?~=gZY4G^N%EJv9a<cnugB?mYW#9h5;{v%$#P2KsojfY>8Jk#EEJff
v{aeQFFqYFkXPAWn`!&a3F~32rJvs78X6^F99Qhd3=NqtFQ2H&452^nJR9`?N

delta 2173
zcmah~e@s(X6u$4#*5X=hf%2o2(w2fyp@1;@pr}=pwXgv?wrmC!#BG>4BFo(BR%kWV
zfEk2(wuBfRV$`T{22xFMQG;1x!Zr>5w?*er3InDL62<CtcjvZ7jWh8ihjY$%zkA;I
z&O5-!1#b8}rzFI-gpl(3OY_RNWBh>llrg+#gD-tC@9FWa&F8ZEJAYW<aCR7q2$}qu
zkUBv}Je33!MNbI3cAL@)R)?__MPcp`a(}uu{KA<Be2RFzqGv^s2RlwzdfbHUxz_!J
z74Ast5@1NNhz}lsImH?b{&Hm{&ktx}Urc%=yUU*BgrS%m)#GGh@2C18rGwVVf5SQy
zAdj`kXYhoOWs_m<>e$jqPsbsv2=(uT1F;QU0&uE))p-rEUku$pw2ac7%Lo~?9tMZX
z%AeH0Zq>))Mz8Sw6r6}RgE20FtAizRRa^~p#8vUXtc4qKmAo+t%<-R#?d!b41DK7s
zz(~A|YlJ)TcF`uUusH;6O^|aRLTf^q*to$f{07QI%ihEqvA51EI9LH)iR{jS#5cq>
z8@<9=$WvQDr<QRt$W~j#S5e?D=vRNi$?<0zR3=HdZ{eLJzo;vY*r6^(0~s2Pqy~%2
z(NL~2@K;sPq*>29qT!0h6unAC?7i57jUtfV>cqj$Pd4+;W++ZB6FrK>9wV9r*q^N6
zyM1sfc}<+Bxj{(Aaq39@1f_>c6vQJSJx*K3m&C$)Z4;`!tgVP<Jn>^%!8oEY$CP4<
zTp2^`9HCY(^lDSUl_KNYI$%#qr`pqeL~#By>UKQ(U+=}bGEN8Ebw4s)OZ6rd<34^!
zD>&j7PS0y_RTI6SffoHoyt5m|^(z?9WH6!DBZBe;UakUK^=j}MY;p;)h4$rWgrmg+
zR}J~>o+Q;2*|;DqG67~&OTm>Y;}p=I8alp^YU2B%U@FxN!^T*78@{E2a6$GRCSleX
zlRY$_Ef=Y)o^iW!Nkd5AnI~8=uGsM>d2UE^UYeEL44<U=-Cd|vMt0iRyt*-GbG<Xi
z*4CQEune5{%Q!>GS&C<bl!SOMa-`VjNT|<}(78bBJul%z3p<d+k=BslgD7qj^rr*o
zgH;P(E>3mt)#7IFzFOR6<Ph=-amSJLytuESJ>x#-fE}6n_&awdGoK&$&Oep;d4&65
z_=`*g>CL~6Mo3+af3usq(1)gUW63c*Cp`w=YQ))G#W<fnoHyIOU5Fj!zjQI2&3^##
z<`DMPQ*02ib%>>n(~52PX@w0(#rSO3aMnt;4o@FmfJDfR*KtFBrFk9~sY5bb4j%Gy
z7Dj49e<ews6zX%_&no|9&fA<^({oE8y7GD*Dc>#yN1jzK?J-QA6cN{nD$*haUtUZa
zLn3gAWhApGd}3>1nBw^whVrUZ;YS&>2+3?Gq&<g;kovP1Z{XbC-zg1X%}jxg98z<G
z(rN~vG0cYEh1d*6W(=!A-|`2g#~6Udutt`QP^x1QjbScy8Dd5Tpt0D6u0iZNA@Ev(
zo*y0obAg$^eiPPXsJR851zO&93r=D1jKUQR$3|fq!|*6Xy{_ehqhQ9M3qU0XdjOg+
zYz;sshVB5I!Y~+sD;OpMFpWVm22q7tUKoSQLbc>q9yN?XQ=vLBi0Q-_r5dIUO@}Uv
zAY=x53sWSyn3asfr9y+`JxpuIVY<*HX~T429JH2n$!Sb;|AcajL2@0_j0yM@Q;~>}
za}%)7Vi0FxdKWHQQY6)wot}bmi&p#@rY=}5B+X}!>&8lP8_a?_(un0bEDysrAypl~
zEQtR@8Hw3S^dMrQJMd6g3NwO8Y!egm4LlG^v)eE;&OCb&Hv9dEWg*5cGJ6xf53yR<
zTvQ<@5_}CfTeMJ<h1r(7l-f8Xtcubpm$&kzQDmiLIewt6`I=1UJxbeH2Q=20Ey-=f
zybM6gK)0dS!t&xps;)Uo8N!UQJ?MvFM{$h!+<i)&{@&u<Vt-$8EeAR8XZovcX1Rav
Nt-VtJ!FLCx{{h#p-|+wd

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index 04c6c08..a8e2dcb 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -14,3 +14,4 @@ test 13:PASS
 test 14:PASS
 test 15:PASS
 test 16:PASS
+test 17:PASS