From 9cce9362519b69106936ce437bd1dfe4f27dee4b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 28 Jul 2020 16:07:25 +1000 Subject: [PATCH] FPU: Implement fdiv[s] This implements floating-point division A/B by a process that starts with normalizing both inputs if necessary. Then an estimate of 1/B from a lookup table is refined by 3 Newton-Raphson iterations and then multiplied by A to get a quotient. The remainder is calculated as A - R * B (where R is the result, i.e. the quotient) and the remainder is compared to 0 and to B to see whether the quotient needs to be incremented by 1. The calculations of 1 / B are done with 56 fraction bits and intermediate results are truncated rather than rounded, meaning that the final estimate of 1 / B is always correct or a little bit low, never too high, and thus the calculated quotient is correct or 1 unit too low. Doing the estimate of 1 / B with sufficient precision that the quotient is always correct to the last bit without needing any adjustment would require many more bits of precision. This implements fdivs by computing a double-precision quotient and then rounding it to single precision. It would be possible to optimize this by e.g. doing only 2 iterations of Newton-Raphson and then doing the remainder calculation and adjustment at single precision rather than double precision. Signed-off-by: Paul Mackerras --- decode1.vhdl | 2 + fpu.vhdl | 292 +++++++++++++++++++++++++++++++++++-- tests/fpu/fpu.c | 39 +++++ tests/test_fpu.bin | Bin 24272 -> 24416 bytes tests/test_fpu.console_out | 1 + 5 files changed, 323 insertions(+), 11 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 721c478..ddcbb3c 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -416,6 +416,7 @@ architecture behaviour of decode1 is -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl -- op in out A out in out len ext pipe 2#01110# => (FPU, OP_FPOP_I, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fcfid[u]s + 2#10010# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fdivs 2#10100# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fsubs 2#10101# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fadds 2#11001# => (FPU, OP_FPOP, FRA, NONE, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fmuls @@ -469,6 +470,7 @@ architecture behaviour of decode1 is constant decode_op_63h_array : op_63_subop_array_1_t := ( -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl -- op in out A out in out len ext pipe + 2#0010# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fdiv 2#0100# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fsub 2#0101# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fadd 2#1001# => (FPU, OP_FPOP, FRA, NONE, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fmul diff --git a/fpu.vhdl b/fpu.vhdl index 209daa0..2584e1c 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -40,10 +40,12 @@ architecture behaviour of fpu is DO_FMR, DO_FMRG, DO_FCFID, DO_FCTI, DO_FRSP, DO_FRI, - DO_FADD, DO_FMUL, + DO_FADD, DO_FMUL, DO_FDIV, FRI_1, ADD_SHIFT, ADD_2, ADD_3, MULT_1, + LOOKUP, + DIV_2, DIV_3, DIV_4, DIV_5, DIV_6, INT_SHIFT, INT_ROUND, INT_ISHIFT, INT_FINAL, INT_CHECK, INT_OFLOW, FINISH, NORMALIZE, @@ -51,6 +53,7 @@ architecture behaviour of fpu is ROUNDING, ROUNDING_2, ROUNDING_3, DENORM, RENORM_A, RENORM_A2, + RENORM_B, RENORM_B2, RENORM_C, RENORM_C2); type reg_type is record @@ -72,6 +75,7 @@ architecture behaviour of fpu is r : std_ulogic_vector(63 downto 0); -- 10.54 format x : std_ulogic; p : std_ulogic_vector(63 downto 0); -- 8.56 format + y : std_ulogic_vector(63 downto 0); -- 8.56 format result_sign : std_ulogic; result_class : fp_number_class; result_exp : signed(EXP_BITS-1 downto 0); @@ -91,8 +95,11 @@ architecture behaviour of fpu is add_bsmall : std_ulogic; is_multiply : std_ulogic; first : std_ulogic; + count : unsigned(1 downto 0); end record; + type lookup_table is array(0 to 255) of std_ulogic_vector(17 downto 0); + signal r, rin : reg_type; signal fp_result : std_ulogic_vector(63 downto 0); @@ -114,7 +121,9 @@ architecture behaviour of fpu is signal multiply_to_f : MultiplyOutputType; signal msel_1 : std_ulogic_vector(1 downto 0); signal msel_2 : std_ulogic_vector(1 downto 0); + signal msel_add : std_ulogic_vector(1 downto 0); signal msel_inv : std_ulogic; + signal inverse_est : std_ulogic_vector(18 downto 0); -- opsel values constant AIN_R : std_ulogic_vector(1 downto 0) := "00"; @@ -134,11 +143,61 @@ architecture behaviour of fpu is -- msel values constant MUL1_A : std_ulogic_vector(1 downto 0) := "00"; constant MUL1_B : std_ulogic_vector(1 downto 0) := "01"; + constant MUL1_Y : std_ulogic_vector(1 downto 0) := "10"; constant MUL1_R : std_ulogic_vector(1 downto 0) := "11"; constant MUL2_C : std_ulogic_vector(1 downto 0) := "00"; + constant MUL2_LUT : std_ulogic_vector(1 downto 0) := "01"; + constant MUL2_P : std_ulogic_vector(1 downto 0) := "10"; constant MUL2_R : std_ulogic_vector(1 downto 0) := "11"; + constant MULADD_ZERO : std_ulogic_vector(1 downto 0) := "00"; + constant MULADD_CONST : std_ulogic_vector(1 downto 0) := "01"; + constant MULADD_A : std_ulogic_vector(1 downto 0) := "10"; + + -- Inverse lookup table, indexed by the top 8 fraction bits + -- Output range is [0.5, 1) in 0.19 format, though the top + -- bit isn't stored since it is always 1. + -- Each output value is the inverse of the center of the input + -- range for the value, i.e. entry 0 is 1 / (1 + 1/512), + -- entry 1 is 1 / (1 + 3/512), etc. + signal inverse_table : lookup_table := ( + -- 1/x lookup table + -- Unit bit is assumed to be 1, so input range is [1, 2) + 18x"3fc01", 18x"3f411", 18x"3ec31", 18x"3e460", 18x"3dc9f", 18x"3d4ec", 18x"3cd49", 18x"3c5b5", + 18x"3be2f", 18x"3b6b8", 18x"3af4f", 18x"3a7f4", 18x"3a0a7", 18x"39968", 18x"39237", 18x"38b14", + 18x"383fe", 18x"37cf5", 18x"375f9", 18x"36f0a", 18x"36828", 18x"36153", 18x"35a8a", 18x"353ce", + 18x"34d1e", 18x"3467a", 18x"33fe3", 18x"33957", 18x"332d7", 18x"32c62", 18x"325f9", 18x"31f9c", + 18x"3194a", 18x"31303", 18x"30cc7", 18x"30696", 18x"30070", 18x"2fa54", 18x"2f443", 18x"2ee3d", + 18x"2e841", 18x"2e250", 18x"2dc68", 18x"2d68b", 18x"2d0b8", 18x"2caee", 18x"2c52e", 18x"2bf79", + 18x"2b9cc", 18x"2b429", 18x"2ae90", 18x"2a900", 18x"2a379", 18x"29dfb", 18x"29887", 18x"2931b", + 18x"28db8", 18x"2885e", 18x"2830d", 18x"27dc4", 18x"27884", 18x"2734d", 18x"26e1d", 18x"268f6", + 18x"263d8", 18x"25ec1", 18x"259b3", 18x"254ac", 18x"24fad", 18x"24ab7", 18x"245c8", 18x"240e1", + 18x"23c01", 18x"23729", 18x"23259", 18x"22d90", 18x"228ce", 18x"22413", 18x"21f60", 18x"21ab4", + 18x"2160f", 18x"21172", 18x"20cdb", 18x"2084b", 18x"203c2", 18x"1ff40", 18x"1fac4", 18x"1f64f", + 18x"1f1e1", 18x"1ed79", 18x"1e918", 18x"1e4be", 18x"1e069", 18x"1dc1b", 18x"1d7d4", 18x"1d392", + 18x"1cf57", 18x"1cb22", 18x"1c6f3", 18x"1c2ca", 18x"1bea7", 18x"1ba8a", 18x"1b672", 18x"1b261", + 18x"1ae55", 18x"1aa50", 18x"1a64f", 18x"1a255", 18x"19e60", 18x"19a70", 18x"19686", 18x"192a2", + 18x"18ec3", 18x"18ae9", 18x"18715", 18x"18345", 18x"17f7c", 18x"17bb7", 18x"177f7", 18x"1743d", + 18x"17087", 18x"16cd7", 18x"1692c", 18x"16585", 18x"161e4", 18x"15e47", 18x"15ab0", 18x"1571d", + 18x"1538e", 18x"15005", 18x"14c80", 18x"14900", 18x"14584", 18x"1420d", 18x"13e9b", 18x"13b2d", + 18x"137c3", 18x"1345e", 18x"130fe", 18x"12da2", 18x"12a4a", 18x"126f6", 18x"123a7", 18x"1205c", + 18x"11d15", 18x"119d2", 18x"11694", 18x"11359", 18x"11023", 18x"10cf1", 18x"109c2", 18x"10698", + 18x"10372", 18x"10050", 18x"0fd31", 18x"0fa17", 18x"0f700", 18x"0f3ed", 18x"0f0de", 18x"0edd3", + 18x"0eacb", 18x"0e7c7", 18x"0e4c7", 18x"0e1ca", 18x"0ded2", 18x"0dbdc", 18x"0d8eb", 18x"0d5fc", + 18x"0d312", 18x"0d02b", 18x"0cd47", 18x"0ca67", 18x"0c78a", 18x"0c4b1", 18x"0c1db", 18x"0bf09", + 18x"0bc3a", 18x"0b96e", 18x"0b6a5", 18x"0b3e0", 18x"0b11e", 18x"0ae5f", 18x"0aba3", 18x"0a8eb", + 18x"0a636", 18x"0a383", 18x"0a0d4", 18x"09e28", 18x"09b80", 18x"098da", 18x"09637", 18x"09397", + 18x"090fb", 18x"08e61", 18x"08bca", 18x"08936", 18x"086a5", 18x"08417", 18x"0818c", 18x"07f04", + 18x"07c7e", 18x"079fc", 18x"0777c", 18x"074ff", 18x"07284", 18x"0700d", 18x"06d98", 18x"06b26", + 18x"068b6", 18x"0664a", 18x"063e0", 18x"06178", 18x"05f13", 18x"05cb1", 18x"05a52", 18x"057f5", + 18x"0559a", 18x"05342", 18x"050ed", 18x"04e9a", 18x"04c4a", 18x"049fc", 18x"047b0", 18x"04567", + 18x"04321", 18x"040dd", 18x"03e9b", 18x"03c5c", 18x"03a1f", 18x"037e4", 18x"035ac", 18x"03376", + 18x"03142", 18x"02f11", 18x"02ce2", 18x"02ab5", 18x"0288b", 18x"02663", 18x"0243d", 18x"02219", + 18x"01ff7", 18x"01dd8", 18x"01bbb", 18x"019a0", 18x"01787", 18x"01570", 18x"0135b", 18x"01149", + 18x"00f39", 18x"00d2a", 18x"00b1e", 18x"00914", 18x"0070c", 18x"00506", 18x"00302", 18x"00100" + ); + -- Left and right shifter with 120 bit input and 64 bit output. -- Shifts inp left by shift bits and returns the upper 64 bits of -- the result. The shift parameter is interpreted as a signed @@ -359,6 +418,14 @@ begin end if; end process; + -- synchronous reads from lookup table + lut_access: process(clk) + begin + if rising_edge(clk) then + inverse_est <= '1' & inverse_table(to_integer(unsigned(r.b.mantissa(53 downto 46)))); + end if; + end process; + e_out.busy <= r.busy; e_out.exception <= r.fpscr(FPSCR_FEX); e_out.interrupt <= r.do_intr; @@ -391,6 +458,7 @@ begin variable update_fx : std_ulogic; variable arith_done : std_ulogic; variable invalid : std_ulogic; + variable zero_divide : std_ulogic; variable mant_nz : std_ulogic; variable min_exp : signed(EXP_BITS-1 downto 0); variable max_exp : signed(EXP_BITS-1 downto 0); @@ -408,9 +476,14 @@ begin variable qnan_result : std_ulogic; variable longmask : std_ulogic; variable set_a : std_ulogic; + variable set_b : std_ulogic; variable set_c : std_ulogic; variable px_nz : std_ulogic; variable maddend : std_ulogic_vector(127 downto 0); + variable set_y : std_ulogic; + variable pcmpb_eq : std_ulogic; + variable pcmpb_lt : std_ulogic; + variable pshift : std_ulogic; begin v := r; illegal := '0'; @@ -478,8 +551,16 @@ begin exp_huge := '1'; end if; - -- Compare P with zero + -- Compare P with zero and with B px_nz := or (r.p(57 downto 4)); + pcmpb_eq := '0'; + if r.p(59 downto 4) = r.b.mantissa(55 downto 0) then + pcmpb_eq := '1'; + end if; + pcmpb_lt := '0'; + if unsigned(r.p(59 downto 4)) < unsigned(r.b.mantissa(55 downto 0)) then + pcmpb_lt := '1'; + end if; v.writing_back := '0'; v.instr_done := '0'; @@ -498,18 +579,22 @@ begin update_fx := '0'; arith_done := '0'; invalid := '0'; + zero_divide := '0'; renormalize := '0'; set_x := '0'; qnan_result := '0'; longmask := r.single_prec; set_a := '0'; + set_b := '0'; set_c := '0'; f_to_multiply.is_32bit <= '0'; f_to_multiply.valid <= '0'; msel_1 <= MUL1_A; msel_2 <= MUL2_C; + msel_add <= MULADD_ZERO; msel_inv <= '0'; - + set_y := '0'; + pshift := '0'; case r.state is when IDLE => if e_in.valid = '1' then @@ -550,6 +635,8 @@ begin when "01111" => v.round_mode := "001"; v.state := DO_FCTI; + when "10010" => + v.state := DO_FDIV; when "10100" | "10101" => v.state := DO_FADD; when "11001" => @@ -897,6 +984,63 @@ begin arith_done := '1'; end if; + when DO_FDIV => + opsel_a <= AIN_A; + v.result_sign := r.a.negative; + v.result_class := r.a.class; + v.result_exp := r.a.exponent; + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; + v.result_sign := r.a.negative xor r.b.negative; + v.result_exp := r.a.exponent - r.b.exponent; + v.count := "00"; + if r.a.class = FINITE and r.b.class = FINITE then + -- Renormalize denorm operands + if r.a.mantissa(54) = '0' then + v.state := RENORM_A; + elsif r.b.mantissa(54) = '0' then + opsel_a <= AIN_B; + v.state := RENORM_B; + else + v.first := '1'; + v.state := DIV_2; + end if; + else + if (r.a.class = NAN and r.a.mantissa(53) = '0') or + (r.b.class = NAN and r.b.mantissa(53) = '0') then + -- Signalling NAN + v.fpscr(FPSCR_VXSNAN) := '1'; + invalid := '1'; + end if; + if r.a.class = NAN then + -- result is A + v.result_sign := r.a.negative; + elsif r.b.class = NAN then + v.result_class := NAN; + v.result_sign := r.b.negative; + opsel_a <= AIN_B; + elsif r.b.class = INFINITY then + if r.a.class = INFINITY then + v.fpscr(FPSCR_VXIDI) := '1'; + qnan_result := '1'; + else + v.result_class := ZERO; + end if; + elsif r.b.class = ZERO then + if r.a.class = ZERO then + v.fpscr(FPSCR_VXZDZ) := '1'; + qnan_result := '1'; + else + if r.a.class = FINITE then + zero_divide := '1'; + end if; + v.result_class := INFINITY; + end if; + -- else r.b.class = FINITE, result_class = r.a.class + end if; + arith_done := '1'; + end if; + when RENORM_A => renormalize := '1'; v.state := RENORM_A2; @@ -904,14 +1048,33 @@ begin when RENORM_A2 => set_a := '1'; v.result_exp := new_exp; - opsel_a <= AIN_C; - if r.c.mantissa(54) = '1' then - v.first := '1'; - v.state := MULT_1; + if r.insn(4) = '1' then + opsel_a <= AIN_C; + if r.c.mantissa(54) = '1' then + v.first := '1'; + v.state := MULT_1; + else + v.state := RENORM_C; + end if; else - v.state := RENORM_C; + opsel_a <= AIN_B; + if r.b.mantissa(54) = '1' then + v.first := '1'; + v.state := DIV_2; + else + v.state := RENORM_B; + end if; end if; + when RENORM_B => + renormalize := '1'; + v.state := RENORM_B2; + + when RENORM_B2 => + set_b := '1'; + v.result_exp := r.result_exp + r.shift; + v.state := LOOKUP; + when RENORM_C => renormalize := '1'; v.state := RENORM_C2; @@ -982,6 +1145,82 @@ begin v.state := FINISH; end if; + when LOOKUP => + opsel_a <= AIN_B; + -- wait one cycle for inverse_table[B] lookup + v.first := '1'; + v.state := DIV_2; + + when DIV_2 => + -- compute Y = inverse_table[B] (when count=0); P = 2 - B * Y + msel_1 <= MUL1_B; + msel_add <= MULADD_CONST; + msel_inv <= '1'; + if r.count = 0 then + msel_2 <= MUL2_LUT; + else + msel_2 <= MUL2_P; + end if; + set_y := r.first; + pshift := '1'; + f_to_multiply.valid <= r.first; + if multiply_to_f.valid = '1' then + v.first := '1'; + v.count := r.count + 1; + v.state := DIV_3; + end if; + + when DIV_3 => + -- compute Y = P = P * Y + msel_1 <= MUL1_Y; + msel_2 <= MUL2_P; + f_to_multiply.valid <= r.first; + pshift := '1'; + if multiply_to_f.valid = '1' then + v.first := '1'; + if r.count = 3 then + v.state := DIV_4; + else + v.state := DIV_2; + end if; + end if; + + when DIV_4 => + -- compute R = P = A * Y (quotient) + msel_1 <= MUL1_A; + msel_2 <= MUL2_P; + set_y := r.first; + f_to_multiply.valid <= r.first; + pshift := '1'; + if multiply_to_f.valid = '1' then + opsel_r <= RES_MULT; + v.first := '1'; + v.state := DIV_5; + end if; + + when DIV_5 => + -- compute P = A - B * R (remainder) + msel_1 <= MUL1_B; + msel_2 <= MUL2_R; + msel_add <= MULADD_A; + msel_inv <= '1'; + f_to_multiply.valid <= r.first; + if multiply_to_f.valid = '1' then + v.state := DIV_6; + end if; + + when DIV_6 => + -- test if remainder is 0 or >= B + if pcmpb_lt = '1' then + -- quotient is correct, set X if remainder non-zero + v.x := r.p(58) or px_nz; + else + -- quotient needs to be incremented by 1 + carry_in <= '1'; + v.x := not pcmpb_eq; + end if; + v.state := FINISH; + when INT_SHIFT => opsel_r <= RES_SHIFT; set_x := '1'; @@ -1218,6 +1457,9 @@ begin end case; + if zero_divide = '1' then + v.fpscr(FPSCR_ZX) := '1'; + end if; if qnan_result = '1' then invalid := '1'; v.result_class := NAN; @@ -1227,7 +1469,9 @@ begin end if; if arith_done = '1' then -- Enabled invalid exception doesn't write result or FPRF - if (invalid and r.fpscr(FPSCR_VE)) = '0' then + -- Neither does enabled zero-divide exception + if (invalid and r.fpscr(FPSCR_VE)) = '0' and + (zero_divide and r.fpscr(FPSCR_ZE)) = '0' then v.writing_back := '1'; v.update_fprf := '1'; end if; @@ -1236,30 +1480,52 @@ begin update_fx := '1'; end if; - -- Multiplier data path + -- Multiplier and divide/square root data path case msel_1 is when MUL1_A => f_to_multiply.data1 <= r.a.mantissa(61 downto 0) & "00"; when MUL1_B => f_to_multiply.data1 <= r.b.mantissa(61 downto 0) & "00"; + when MUL1_Y => + f_to_multiply.data1 <= r.y; when others => f_to_multiply.data1 <= r.r(61 downto 0) & "00"; end case; case msel_2 is when MUL2_C => f_to_multiply.data2 <= r.c.mantissa(61 downto 0) & "00"; + when MUL2_LUT => + f_to_multiply.data2 <= x"00" & inverse_est & '0' & x"000000000"; + when MUL2_P => + f_to_multiply.data2 <= r.p; when others => f_to_multiply.data2 <= r.r(61 downto 0) & "00"; end case; maddend := (others => '0'); + case msel_add is + when MULADD_CONST => + -- addend is 2.0 in 16.112 format + maddend(113) := '1'; -- 2.0 + when MULADD_A => + -- addend is A in 16.112 format + maddend(121 downto 58) := r.a.mantissa; + when others => + end case; if msel_inv = '1' then f_to_multiply.addend <= not maddend; else f_to_multiply.addend <= maddend; end if; f_to_multiply.not_result <= msel_inv; + if set_y = '1' then + v.y := f_to_multiply.data2; + end if; if multiply_to_f.valid = '1' then - v.p := multiply_to_f.result(63 downto 0); + if pshift = '0' then + v.p := multiply_to_f.result(63 downto 0); + else + v.p := multiply_to_f.result(119 downto 56); + end if; end if; -- Data path. @@ -1378,6 +1644,10 @@ begin v.a.exponent := new_exp; v.a.mantissa := shift_res; end if; + if set_b = '1' then + v.b.exponent := new_exp; + v.b.mantissa := shift_res; + end if; if set_c = '1' then v.c.exponent := new_exp; v.c.mantissa := shift_res; diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 305359a..cbb0ee2 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -1007,6 +1007,7 @@ struct mulvals { { 0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000 }, { 0xbf4fff801fffffff, 0x6d7fffff8000007f, 0xecdfff7fa001fffe }, { 0x3fbd50275a65ed80, 0x0010000000000000, 0x0001d50275a65ed8 }, + { 0x3fe95d8937acf1ce, 0x0000000000000001, 0x0000000000000001 }, }; int test15(long arg) @@ -1073,6 +1074,43 @@ int fpu_test_16(void) return trapit(0, test16); } +struct divvals { + unsigned long val_a; + unsigned long val_b; + unsigned long prod; +} divvals[] = { + { 0x3ff0000000000000, 0x0000000000000000, 0x7ff0000000000000 }, + { 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 }, + { 0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000 }, + { 0x4000000000000000, 0x4008000000000000, 0x3fe5555555555555 }, + { 0xc01fff0007ffffff, 0xc03ffffffdffffbf, 0x3fcfff0009fff041 }, +}; + +int test17(long arg) +{ + long i; + unsigned long result; + struct divvals *vp = divvals; + + set_fpscr(FPS_RN_NEAR); + for (i = 0; i < sizeof(divvals) / sizeof(divvals[0]); ++i, ++vp) { + asm("lfd 5,0(%0); lfd 6,8(%0); fdiv 7,5,6; stfd 7,0(%1)" + : : "b" (&vp->val_a), "b" (&result) : "memory"); + if (result != vp->prod) { + print_hex(i, 2, " "); + print_hex(result, 16, " "); + return i + 1; + } + } + return 0; +} + +int fpu_test_17(void) +{ + enable_fp(); + return trapit(0, test17); +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -1114,6 +1152,7 @@ int main(void) do_test(14, fpu_test_14); do_test(15, fpu_test_15); do_test(16, fpu_test_16); + do_test(17, fpu_test_17); return fail; } diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 1e0e29e0c174fe7fcc993071b35fef6086d1a0c0..dc5af293a27cd0d08eda8272695afbe7e9ba89c5 100755 GIT binary patch delta 2394 zcmai0drVVj6hHUUmSS3Mh4QGSFAxE-77@6tC>0swp~xH>r&FuwoO{I?XK+Jcrkb)u z10J{F!a&Vt*s>*Y8Qj%oWCpc^t%Yvx3|X;{JT5^P#l>*6X5db8rs_1LNQQbl^{vv&eC?2!md*KOTd@ny~7(2qJ8?+WEIK*rL#@bzjcY}4* zp*A8@3$y}|GZ^GP9Wgg7u=9XRLG|Cl_an-M1{4fwh7+++@SVK>Kn`Quasb+0SFq6F z5|6~<4#P{z^iLJQ7{Tp%>!&d3E~#4SL#B2Oy=&GGUaiPS1RNI*2VBW@5el) zY+UV?4r8h@4Z95rv>G)+8BQ}&R)a#nJ-~{o$ZUQInl)Z2iFld9EWBu(jJu2r#iKXz zMPq!ZrxL=-J@ZjJ$*))v1qHj~OxQiiq&1QMPB`xhc>3^M_#|;}Bqo?vi288cVM?B; zjf8^z)C+k%p{j!V|Gmi}`ZnU9rhM^713nR(Eb4aS^RWvf>o%54vD9xXtQuhKK(-EQ zBqE-S%@-90j5NP01aP~#cp~R1@9?q@WGjN49&@s&iNNs|ny_7%YuRP2+xVeW@lS7c z;qCt=9J1sK1DI<4ftN9AO*U}uom;F}fNJTp(DKSsut|NGZd)g+_hOSRm-BjTL7x60 zUI1Aq7+8UBVW8CEj1X zj|$MhHaG8j3;F!u^ilqHxKcQlNVoPVj-EKPX$>G-<~__&Q=Y3gc%E7Ex&*6g9>JL? zbr4B_D!Jf&RNN*!G#Q2c_+-MPi(7vE@x^5k=W)dujqkL|0kY7cc#9!TQXmd;nYm`6E;PK(pj_Ui7zsvEAplxjKk-%EC z$_hnWe3<2QX?@N1!A}%W`SAkS>ch93;gdNMLi4VHq^#hIdn?Cu55{=dxxf%SDn11u zsTQDiIxoWKk4;@ERBt-VSRch)bfm7S90^ZtF-j7Z^JWfd9;}iR~BAoaPXZcVh{eyo*?sW)@$*h)rZR z^k5sAh8{dmro9L6k{Reh^Gvg->&4k*q+VP>#?y;cWPH8YM5eVD+sK^m#q(sK5ATvm z?4x~?lp;VWdFe(~fZUcooSklri4LKv`WZ8k;bJ9CAxw?0rCU^`Mr!L`(8FuA7ay3|=VNq=$@Ax2o zkzrQtA$QLZ-pw$M8Jk#EEJff v{aeQFFqYFkXPAWn`!&a3F~32rJvs78X6^F99Qhd3=NqtFQ2H&452^nJR9`?N delta 2173 zcmah~e@s(X6u$4#*5X=hf%2o2(w2fyp@1;@pr}=pwXgv?wrmC!#BG>4BFo(BR%kWV zfEk2(wuBfRV$`T{22xFMQG;1x!Zr>5w?*er3InDL62llrg+#gD-tC@9FWa&F8ZEJAYWe$jqPsbsv2=(uT1F;QU0&uE))p-rEUku$pw2ac7%Lo~?9tMZX z%AeH0Zq>))Mz8Sw6r6}RgE20FtAizRRa^~p#8vUXtc4qKmAo+t%<-R#?d!b41DK7s zz(~A|YlJ)TcF`uUusH;6O^|aRLTf^q*to$f{07QI%ihEqvA51EI9LH)iR{jS#5cq> z8@<9=$WvQDr29qT!0h6unAC?7i57jUtfV>cqj$Pd4+;W++ZB6FrK>9wV9r*q^N6 zyM1sfc}<+Bxj{(Aaq39@1f_>c6vQJSJx*K3m&C$)Z4;`!tgVP^%!8oEY$CP4< zTp2^`9HCY(^lDSUl_KNYI$%#qr`pqeL~#By>UKQ(U+=}bGEN8Ebw4s)OZ6rd<34^! zD>&j7PS0y_RTI6SffoHoyt5m|^(z?9WH6!DBZBe;UakUK^=j}MY;p;)h4$rWgrmg+ zR}J~>o+Q;2*|;DqG67~&OTm>Y;}p=I8alp^YU2B%U@FxN!^T*78@{E2a6$GRCSleX zlRY$_Ef=Y)o^iW!Nkd5AnI~8=uGsM>d2UE^UYeEL44GS&C3mt)#7IFzFOR6x#-fE}6n_&awdGoK&$&Oep;d4&65 z_=`*g>CL~6Mo3+af3usq(1)gUW63c*Cp`w=YQ))G#W>n(~52PX@w0(#rSO3aMnt;4o@FmfJDfR*KtFBrFk9~sY5bb4j%Gy z7Dj49e#yN1jzK?J-QA6cN{nD$*haUtUZa zLn3gAWhApGd}3>1nBw^whVrUZ;YS&>2+3?Gq&q9yN?XQ=vLBi0Q-_r5dIUO@}Uv zAY=x53sWSyn3asfr9y+`JxpuIVY<*HX~T429JH2n$!Sb;|AcajL2@0_j0yM@Q;~>} za}%)7Vi0FxdKWHQQY6)wot}bmi&p#@rY=}5B+X}!>&8lP8_a?_(un0bEDysrAypl~ zEQtR@8Hw3S^dMrQJMd6g3NwO8Y!egm4LlG^v)eE;&OCb&Hv9dEWg*5cGJ6xf53yR< zTvQ<@5_}CfTeMJ