diff --git a/fpu.vhdl b/fpu.vhdl
index eaa4cf2..afac4c0 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -169,9 +169,7 @@ architecture behaviour of fpu is
         oe           : std_ulogic;
         xerc         : xer_common_t;
         xerc_result  : xer_common_t;
-        res_negate   : std_ulogic;
-        res_subtract : std_ulogic;
-        res_rmode    : std_ulogic_vector(2 downto 0);
+        res_sign     : std_ulogic;
     end record;
 
     type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0);
@@ -609,20 +607,13 @@ architecture behaviour of fpu is
 
     -- Construct a DP floating-point result from components
     function pack_dp(negative: std_ulogic; class: fp_number_class; exp: signed(EXP_BITS-1 downto 0);
-                     mantissa: std_ulogic_vector; single_prec: std_ulogic; quieten_nan: std_ulogic;
-                     negate: std_ulogic; is_subtract: std_ulogic; round_mode: std_ulogic_vector)
+                     mantissa: std_ulogic_vector; single_prec: std_ulogic; quieten_nan: std_ulogic)
         return std_ulogic_vector is
         variable dp_result : std_ulogic_vector(63 downto 0);
-        variable sign : std_ulogic;
     begin
         dp_result := (others => '0');
-        sign := negative;
         case class is
             when ZERO =>
-                if is_subtract = '1' then
-                    -- set result sign depending on rounding mode
-                    sign := round_mode(0) and round_mode(1);
-                end if;
             when FINITE =>
                 if mantissa(UNIT_BIT) = '1' then
                     -- normalized number
@@ -642,7 +633,7 @@ architecture behaviour of fpu is
                     dp_result(28 downto 0) := mantissa(SP_LSB - 1 downto DP_LSB);
                 end if;
         end case;
-        dp_result(63) := sign xor negate;
+        dp_result(63) := negative;
         return dp_result;
     end;
 
@@ -860,6 +851,7 @@ begin
         variable opcbits     : std_ulogic_vector(4 downto 0);
         variable int_result  : std_ulogic;
         variable illegal     : std_ulogic;
+        variable rsign       : std_ulogic;
     begin
         v := r;
         v.complete := '0';
@@ -1825,8 +1817,17 @@ begin
 
             when RENORM_B2 =>
                 set_b := '1';
-                re_sel2 <= REXP2_NE;
-                re_set_result <= '1';
+                -- For fdiv, we need to increase result_exp by shift rather
+                -- than decreasing it as for fre/frsqrte and fsqrt.
+                -- We do that by negating r.shift in this cycle and then
+                -- setting result_exp to new_exp in the next cycle
+                if r.use_a = '1' then
+                    rs_sel1 <= RSH1_S;
+                    rs_neg1 <= '1';
+                else
+                    re_sel2 <= REXP2_NE;
+                    re_set_result <= '1';
+                end if;
                 v.opsel_a := AIN_B;
                 v.state := LOOKUP;
 
@@ -2046,6 +2047,12 @@ begin
             when LOOKUP =>
                 -- r.opsel_a = AIN_B
                 -- wait one cycle for inverse_table[B] lookup
+                -- if this is a division, compute exponent
+                -- (see comment on RENORM_B2 above)
+                if r.use_a = '1' then
+                    re_sel2 <= REXP2_NE;
+                    re_set_result <= '1';
+                end if;
                 v.first := '1';
                 if r.insn(4) = '0' then
                     if r.insn(3) = '0' then
@@ -2590,7 +2597,6 @@ begin
                 arith_done := '1';
 
             when NAN_RESULT =>
-                v.negate := '0';
                 if (r.use_a = '1' and r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or
                     (r.use_b = '1' and r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or
                     (r.use_c = '1' and r.c.class = NAN and r.c.mantissa(QNAN_BIT) = '0') then
@@ -3158,14 +3164,14 @@ begin
 
         end case;
 
+        rsign := v.result_sign;
         if zero_divide = '1' then
             v.fpscr(FPSCR_ZX) := '1';
         end if;
         if qnan_result = '1' then
             invalid := '1';
             v.result_class := NAN;
-            v.result_sign := '0';
-            v.negate := '0';
+            rsign := '0';
             misc_sel <= "0001";
             opsel_r <= RES_MISC;
             arith_done := '1';
@@ -3181,6 +3187,12 @@ begin
                 v.writing_fpr := '1';
                 v.update_fprf := '1';
             end if;
+            if v.is_subtract = '1' and v.result_class = ZERO then
+                rsign := r.round_mode(0) and r.round_mode(1);
+            end if;
+            if v.negate = '1' and v.result_class /= NAN then
+                rsign := not rsign;
+            end if;
             v.instr_done := '1';
             update_fx := '1';
         end if;
@@ -3516,7 +3528,7 @@ begin
         end if;
 
         if r.update_fprf = '1' then
-            v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.result_sign, r.result_class,
+            v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.res_sign, r.result_class,
                                                              r.r(UNIT_BIT) and not r.denorm);
         end if;
 
@@ -3541,9 +3553,7 @@ begin
                 v.int_result := int_result;
                 v.illegal := illegal;
                 v.nsnan_result := v.quieten_nan;
-                v.res_negate := v.negate;
-                v.res_subtract := v.is_subtract;
-                v.res_rmode := r.round_mode;
+                v.res_sign := rsign;
                 if r.integer_op = '1' then
                     v.cr_mask := num_to_fxm(0);
                 elsif r.is_cmp = '0' then
@@ -3574,9 +3584,8 @@ begin
         if r.int_result = '1' then
             fp_result <= r.r;
         else
-            fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r,
-                                 r.sp_result, r.nsnan_result,
-                                 r.res_negate, r.res_subtract, r.res_rmode);
+            fp_result <= pack_dp(r.res_sign, r.result_class, r.result_exp, r.r,
+                                 r.sp_result, r.nsnan_result);
         end if;
 
         rin <= v;
diff --git a/scripts/run_test.sh b/scripts/run_test.sh
index 1a032ba..fc3505f 100755
--- a/scripts/run_test.sh
+++ b/scripts/run_test.sh
@@ -21,7 +21,7 @@ cd $TMPDIR
 
 cp ${MICROWATT_DIR}/tests/${TEST}.bin main_ram.bin
 
-${MICROWATT_DIR}/core_tb | sed 's/.*: //' | egrep '^(GPR[0-9]|LR |CTR |XER |CR [0-9])' | sort | grep -v GPR31 > test.out || true
+${MICROWATT_DIR}/core_tb | sed 's/.*: //' | grep -E '^(GPR[0-9]|LR |CTR |XER |CR [0-9])' | sort | grep -v GPR31 > test.out || true
 
 grep -v "^$" ${MICROWATT_DIR}/tests/${TEST}.out | sort | grep -v GPR31 > exp.out
 
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index 773c05d..059d83b 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -459,6 +459,7 @@ int test6(long arg)
 	unsigned long results[6];
 	unsigned long v;
 
+	set_fpscr(0);
 	for (i = 0; i < sizeof(sp_dp_equiv) / sizeof(sp_dp_equiv[0]); ++i) {
 		v = sp_dp_equiv[i].dp;
 		asm("lfd%U0%X0 3,%0; fmr 6,3; fneg 7,3; stfd 6,0(%1); stfd 7,8(%1)"
@@ -474,6 +475,8 @@ int test6(long arg)
 		    results[4] != (v & ~SIGN) ||
 		    results[5] != (v | SIGN))
 			return i + 1;
+		if (get_fpscr() != 0)
+			return i + 0x101;
 	}
 	return 0;
 }
@@ -484,6 +487,98 @@ int fpu_test_6(void)
 	return trapit(0, test6);
 }
 
+unsigned long expected_fprf(unsigned long result, bool single)
+{
+	/*
+	 * Work out the FPRF code for a value held in DP format.  With
+	 * "single" set, exponent fields up to 0x380 are not normalized.
+	 */
+	static const unsigned long codes[4][2] = {
+		{ 0x05, 0x09 },		/* +Infinity, -Infinity */
+		{ 0x04, 0x08 },		/* +normalized, -normalized */
+		{ 0x14, 0x18 },		/* +denorm, -denorm */
+		{ 0x02, 0x12 },		/* +zero, -zero */
+	};
+	const unsigned long neg = (result >> 63) & 1;
+	const unsigned long biased_exp = (result >> 52) & 0x7ff;
+	const unsigned long fraction = result & ((1ul << 52) - 1);
+	const unsigned long denorm_top = single ? 0x380 : 0;
+	int kind;
+
+	if (biased_exp == 0x7ff && fraction != 0)
+		return 0x11;		/* NaN, whatever the sign bit */
+
+	if (biased_exp == 0x7ff)
+		kind = 0;
+	else if (biased_exp > denorm_top)
+		kind = 1;
+	else if (fraction != 0 || biased_exp > 0)
+		kind = 2;
+	else
+		kind = 3;
+	return codes[kind][neg];
+}
+
+unsigned long expected_fprf_sp(unsigned long result)
+{
+	/*
+	 * Work out the FPRF code for a value held in SP format:
+	 * sign in bit 31, exponent in bits 30..23, fraction below.
+	 */
+	static const unsigned long codes[4][2] = {
+		{ 0x05, 0x09 },		/* +Infinity, -Infinity */
+		{ 0x04, 0x08 },		/* +normalized, -normalized */
+		{ 0x14, 0x18 },		/* +denorm, -denorm */
+		{ 0x02, 0x12 },		/* +zero, -zero */
+	};
+	const unsigned long neg = (result >> 31) & 1;
+	const unsigned long biased_exp = (result >> 23) & 0xff;
+	const unsigned long fraction = result & ((1ul << 23) - 1);
+	int kind;
+
+	if (biased_exp == 0xff && fraction != 0)
+		return 0x11;		/* NaN, whatever the sign bit */
+
+	if (biased_exp == 0xff)
+		kind = 0;
+	else if (biased_exp > 0)
+		kind = 1;
+	else if (fraction != 0)
+		kind = 2;
+	else
+		kind = 3;
+
+	return codes[kind][neg];
+}
+
+int check_fprf(unsigned long result, bool single, unsigned long fpscr)
+{
+	/* Compare the 5-bit field at fpscr >> 12 with the expected FPRF code */
+	const unsigned long want = expected_fprf(result, single);
+	const unsigned long got = (fpscr >> 12) & 0x1f;
+	if (got != want) {
+		print_string("\r\n");
+		print_hex(result, 16, " ");
+		print_hex(fpscr, 8, " ");
+		print_hex(want, 2, " ");
+	}
+	return got != want;	/* 0 on match, 1 after printing a mismatch */
+}
+
+int check_fprf_sp(unsigned long result, unsigned long fpscr)
+{
+	/* Compare the 5-bit field at fpscr >> 12 with the expected SP FPRF code */
+	const unsigned long want = expected_fprf_sp(result);
+	const unsigned long got = (fpscr >> 12) & 0x1f;
+	if (got != want) {
+		print_string("\r\n");
+		print_hex(result, 16, " ");
+		print_hex(fpscr, 8, " ");
+		print_hex(want, 2, " ");
+	}
+	return got != want;	/* 0 on match, 1 after printing a mismatch */
+}
+
 struct int_fp_equiv {
 	long		ival;
 	unsigned long	fp;
@@ -522,12 +617,15 @@ int test7(long arg)
 {
 	long i;
 	unsigned long results[4];
+	unsigned long fpscr;
 
 	for (i = 0; i < sizeof(intvals) / sizeof(intvals[0]); ++i) {
+		set_fpscr(0);
 		asm("lfd%U0%X0 3,%0; fcfid 6,3; fcfidu 7,3; stfd 6,0(%1); stfd 7,8(%1)"
 		    : : "m" (intvals[i].ival), "b" (results) : "memory");
 		asm("fcfids 9,3; stfd 9,16(%0); fcfidus 10,3; stfd 10,24(%0)"
 		    : : "b" (results) : "memory");
+		fpscr = get_fpscr();
 		if (results[0] != intvals[i].fp ||
 		    results[1] != intvals[i].fp_u ||
 		    results[2] != intvals[i].fp_s ||
@@ -539,6 +637,8 @@ int test7(long arg)
 			print_hex(results[3], 16, " ");
 			return i + 1;
 		}
+		if (check_fprf(results[3], true, fpscr))
+			return i + 0x101;
 	}
 	return 0;
 }
@@ -582,16 +682,20 @@ int test8(long arg)
 {
 	long i;
 	unsigned long result;
+	unsigned long fpscr;
 
 	for (i = 0; i < sizeof(roundvals) / sizeof(roundvals[0]); ++i) {
 		asm("lfd 3,0(%0); lfd 4,8(%0); mtfsf 0,3,1,0; frsp 6,4; stfd 6,0(%1)"
 		    : : "b" (&roundvals[i]), "b" (&result) : "memory");
+		fpscr = get_fpscr();
 		if (result != roundvals[i].spval) {
 			print_string("\r\n");
 			print_hex(i, 4, " ");
 			print_hex(result, 16, " ");
 			return i + 1;
 		}
+		if (check_fprf(result, true, fpscr))
+			return i + 0x101;
 	}
 	return 0;
 }
@@ -796,6 +900,7 @@ int test11(long arg)
 	long i;
 	unsigned long results[4];
 	struct frivals *vp = frivals;
+	unsigned long fpscr;
 
 	for (i = 0; i < sizeof(frivals) / sizeof(frivals[0]); ++i, ++vp) {
 		set_fpscr(FPS_RN_FLOOR);
@@ -807,6 +912,7 @@ int test11(long arg)
 		asm("frip 5,3; stfd 5,16(%0)" : : "b" (results) : "memory");
 		set_fpscr(FPS_RN_CEIL);
 		asm("frim 5,3; stfd 5,24(%0)" : : "b" (results) : "memory");
+		fpscr = get_fpscr();
 		if (results[0] != vp->nval || results[1] != vp->zval ||
 		    results[2] != vp->pval || results[3] != vp->mval) {
 			print_hex(i, 2, "\r\n");
@@ -816,6 +922,8 @@ int test11(long arg)
 			print_hex(results[3], 16, " ");
 			return i + 1;
 		}
+		if (check_fprf(results[3], false, fpscr))
+			return i + 0x101;
 	}
 	return 0;
 }
@@ -903,17 +1011,21 @@ int test13(long arg)
 	long i;
 	unsigned long results[2];
 	struct addvals *vp = addvals;
+	unsigned long fpscr;
 
 	set_fpscr(FPS_RN_NEAR);
 	for (i = 0; i < sizeof(addvals) / sizeof(addvals[0]); ++i, ++vp) {
 		asm("lfd 5,0(%0); lfd 6,8(%0); fadd 7,5,6; fsub 8,5,6; stfd 7,0(%1); stfd 8,8(%1)"
 		    : : "b" (&vp->val_a), "b" (results) : "memory");
+		fpscr = get_fpscr();
 		if (results[0] != vp->sum || results[1] != vp->diff) {
 			print_hex(i, 2, " ");
 			print_hex(results[0], 16, " ");
 			print_hex(results[1], 16, "\r\n");
 			return i + 1;
 		}
+		if (check_fprf(results[1], false, fpscr))
+			return i + 0x101;
 	}
 	return 0;
 }
@@ -976,18 +1088,22 @@ int test14(long arg)
 	long i;
 	unsigned long results[2];
 	struct addvals *vp = sp_addvals;
+	unsigned long fpscr;
 
 	set_fpscr(FPS_RN_NEAR);
 	for (i = 0; i < sizeof(sp_addvals) / sizeof(sp_addvals[0]); ++i, ++vp) {
 		asm("lfd 5,0(%0); frsp 5,5; lfd 6,8(%0); frsp 6,6; "
 		    "fadds 7,5,6; fsubs 8,5,6; stfd 7,0(%1); stfd 8,8(%1)"
 		    : : "b" (&vp->val_a), "b" (results) : "memory");
+		fpscr = get_fpscr();
 		if (results[0] != vp->sum || results[1] != vp->diff) {
 			print_hex(i, 2, " ");
 			print_hex(results[0], 16, " ");
 			print_hex(results[1], 16, "\r\n");
 			return i + 1;
 		}
+		if (check_fprf(results[1], true, fpscr))
+			return i + 0x101;
 	}
 	return 0;
 }
@@ -1017,16 +1133,20 @@ int test15(long arg)
 	long i;
 	unsigned long result;
 	struct mulvals *vp = mulvals;
+	unsigned long fpscr;
 
 	set_fpscr(FPS_RN_NEAR);
 	for (i = 0; i < sizeof(mulvals) / sizeof(mulvals[0]); ++i, ++vp) {
 		asm("lfd 5,0(%0); lfd 6,8(%0); fmul 7,5,6; stfd 7,0(%1)"
 		    : : "b" (&vp->val_a), "b" (&result) : "memory");
+		fpscr = get_fpscr();
 		if (result != vp->prod) {
 			print_hex(i, 2, " ");
 			print_hex(result, 16, " ");
 			return i + 1;
 		}
+		if (check_fprf(result, false, fpscr))
+			return i + 0x101;
 	}
 	return 0;
 }
@@ -1056,16 +1176,20 @@ int test16(long arg)
 	long i;
 	unsigned int result;
 	struct mulvals_sp *vp = mulvals_sp;
+	unsigned long fpscr;
 
 	set_fpscr(FPS_RN_NEAR);
 	for (i = 0; i < sizeof(mulvals_sp) / sizeof(mulvals_sp[0]); ++i, ++vp) {
 		asm("lfs 5,0(%0); lfs 6,4(%0); fmuls 7,5,6; stfs 7,0(%1)"
 		    : : "b" (&vp->val_a), "b" (&result) : "memory");
+		fpscr = get_fpscr();
 		if (result != vp->prod) {
 			print_hex(i, 2, " ");
 			print_hex(result, 8, " ");
 			return i + 1;
 		}
+		if (check_fprf_sp(result, fpscr))
+			return i + 0x101;
 	}
 	return 0;
 }
@@ -1086,6 +1210,10 @@ struct divvals {
 	{ 0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000 },
 	{ 0x4000000000000000, 0x4008000000000000, 0x3fe5555555555555 },
 	{ 0xc01fff0007ffffff, 0xc03ffffffdffffbf, 0x3fcfff0009fff041 },
+	{ 0x0010000000000000, 0x0018000000000000, 0x3fe5555555555555 },
+	{ 0x0008000000000000, 0x0018000000000000, 0x3fd5555555555555 },
+	{ 0x0010000000000000, 0x0000c00000000000, 0x4035555555555555 },
+	{ 0x0004000000000000, 0x0000300000000000, 0x4035555555555555 },
 };
 
 int test17(long arg)
@@ -1093,16 +1221,20 @@ int test17(long arg)
 	long i;
 	unsigned long result;
 	struct divvals *vp = divvals;
+	unsigned long fpscr;
 
 	set_fpscr(FPS_RN_NEAR);
 	for (i = 0; i < sizeof(divvals) / sizeof(divvals[0]); ++i, ++vp) {
 		asm("lfd 5,0(%0); lfd 6,8(%0); fdiv 7,5,6; stfd 7,0(%1)"
 		    : : "b" (&vp->val_a), "b" (&result) : "memory");
+		fpscr = get_fpscr();
 		if (result != vp->prod) {
 			print_hex(i, 2, " ");
 			print_hex(result, 16, " ");
 			return i + 1;
 		}
+		if (check_fprf(result, false, fpscr))
+			return i + 0x101;
 	}
 	return 0;
 }
@@ -1123,6 +1255,9 @@ struct recipvals {
 	{ 0xbff0000000000000, 0xbfeff00400000000 },
 	{ 0x4008000000000000, 0x3fd54e3800000000 },
 	{ 0xc03ffffffdffffbf, 0xbfa0040000000000 },
+	{ 0x0008100000000000, 0x7fdfb0c400000000 },
+	{ 0x0004080000000000, 0x7fefb0c400000000 },
+	{ 0x0002040000000000, 0x7ff0000000000000 },
 };
 
 int test18(long arg)
@@ -1130,16 +1265,20 @@ int test18(long arg)
 	long i;
 	unsigned long result;
 	struct recipvals *vp = recipvals;
+	unsigned long fpscr;
 
 	set_fpscr(FPS_RN_NEAR);
 	for (i = 0; i < sizeof(recipvals) / sizeof(recipvals[0]); ++i, ++vp) {
 		asm("lfd 6,0(%0); fre 7,6; stfd 7,0(%1)"
 		    : : "b" (&vp->val), "b" (&result) : "memory");
+		fpscr = get_fpscr();
 		if (result != vp->inv) {
 			print_hex(i, 2, " ");
 			print_hex(result, 16, " ");
 			return i + 1;
 		}
+		if (check_fprf(result, false, fpscr))
+			return i + 0x101;
 	}
 	return 0;
 }
@@ -1273,16 +1412,20 @@ int test21(long arg)
 	long i;
 	unsigned long result;
 	struct isqrtvals *vp = isqrtvals;
+	unsigned long fpscr;
 
 	set_fpscr(FPS_RN_NEAR);
 	for (i = 0; i < sizeof(isqrtvals) / sizeof(isqrtvals[0]); ++i, ++vp) {
 		asm("lfd 6,0(%0); frsqrte 7,6; stfd 7,0(%1)"
 		    : : "b" (&vp->val), "b" (&result) : "memory");
+		fpscr = get_fpscr();
 		if (result != vp->inv) {
 			print_hex(i, 2, " ");
 			print_hex(result, 16, " ");
 			return i + 1;
 		}
+		if (check_fprf(result, false, fpscr))
+			return i + 0x101;
 	}
 	return 0;
 }
@@ -1320,16 +1463,20 @@ int test22(long arg)
 	long i;
 	unsigned long result;
 	struct sqrtvals *vp = sqrtvals;
+	unsigned long fpscr;
 
 	set_fpscr(FPS_RN_NEAR);
 	for (i = 0; i < sizeof(sqrtvals) / sizeof(sqrtvals[0]); ++i, ++vp) {
 		asm("lfd 6,0(%0); fsqrt 7,6; stfd 7,0(%1)"
 		    : : "b" (&vp->val), "b" (&result) : "memory");
+		fpscr = get_fpscr();
 		if (result != vp->inv) {
 			print_hex(i, 2, " ");
 			print_hex(result, 16, " ");
 			return i + 1;
 		}
+		if (check_fprf(result, false, fpscr))
+			return i + 0x101;
 	}
 	return 0;
 }
@@ -1384,6 +1531,7 @@ int test23(long arg)
 	long i;
 	unsigned long results[4];
 	struct fmavals *vp = fmavals;
+	unsigned long fpscr;
 
 	set_fpscr(FPS_RN_NEAR);
 	for (i = 0; i < sizeof(fmavals) / sizeof(fmavals[0]); ++i, ++vp) {
@@ -1391,6 +1539,7 @@ int test23(long arg)
 		    : : "b" (&vp->ra), "b" (results) : "memory");
 		asm("fmsub 1,6,7,8; fnmadd 2,6,7,8; fnmsub 3,6,7,8; stfd 1,8(%0); stfd 2,16(%0); stfd 3,24(%0)"
 		    : : "b" (results) : "memory");
+		fpscr = get_fpscr();
 		if (results[0] != vp->fma || results[1] != vp->fms ||
 		    results[2] != vp->nfma || results[3] != vp->nfms) {
 			print_hex(i, 2, " ");
@@ -1400,6 +1549,8 @@ int test23(long arg)
 			print_hex(results[3], 16, "\r\n");
 			return i + 1;
 		}
+		if (check_fprf(results[3], false, fpscr))
+			return i + 0x101;
 	}
 	return 0;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index 3c1021b..e4e2116 100755
Binary files a/tests/test_fpu.bin and b/tests/test_fpu.bin differ