diff --git a/Makefile b/Makefile
index a830a5f..ed74176 100644
--- a/Makefile
+++ b/Makefile
@@ -58,7 +58,7 @@ icache_tb.o: common.o wishbone_types.o icache.o wishbone_bram_wrapper.o
 dcache.o: utils.o common.o wishbone_types.o plru.o cache_ram.o utils.o
 dcache_tb.o: common.o wishbone_types.o dcache.o wishbone_bram_wrapper.o
 insn_helpers.o:
-loadstore1.o: common.o helpers.o
+loadstore1.o: common.o helpers.o decode_types.o
 logical.o: decode_types.o
 multiply_tb.o: decode_types.o common.o glibc_random.o ppc_fx_insns.o multiply.o
 multiply.o: common.o decode_types.o
@@ -131,7 +131,7 @@ dmi_dtm_tb: dmi_dtm_tb.o sim_vhpi_c.o sim_bram_helpers_c.o
 tests = $(sort $(patsubst tests/%.out,%,$(wildcard tests/*.out)))
 tests_console = $(sort $(patsubst tests/%.console_out,%,$(wildcard tests/*.console_out)))
 
-check: $(tests) $(test_console) test_micropython test_micropython_long
+check: $(tests) $(tests_console) test_micropython test_micropython_long
 
 check_light: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 test_micropython test_micropython_long $(tests_console)
 
diff --git a/README.md b/README.md
index 8bf4622..98f2140 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,10 @@ You can try out Microwatt/Micropython without hardware by using the ghdl simulat
 
 - Build micropython. If you aren't building on a ppc64le box you
   will need a cross compiler. If it isn't available on your distro
-  grab the powerpc64le-power8 toolchain from https://toolchains.bootlin.com
+  grab the powerpc64le-power8 toolchain from https://toolchains.bootlin.com.
+  You may need to set the `CROSS_COMPILE` environment variable
+  to the prefix used by your cross compiler.  The default is
+  `powerpc64le-linux-gnu-`.
 
 ```
 git clone https://github.com/micropython/micropython.git
diff --git a/common.vhdl b/common.vhdl
index d10d857..ed97e0c 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -7,6 +7,15 @@ use work.decode_types.all;
 
 package common is
 
+    -- MSR bit numbers
+    constant MSR_SF  : integer := (63 - 0);     -- Sixty-Four bit mode
+    constant MSR_EE  : integer := (63 - 48);    -- External interrupt Enable
+    constant MSR_PR  : integer := (63 - 49);    -- PRoblem state
+    constant MSR_IR  : integer := (63 - 58);    -- Instruction Relocation
+    constant MSR_DR  : integer := (63 - 59);    -- Data Relocation
+    constant MSR_RI  : integer := (63 - 62);    -- Recoverable Interrupt
+    constant MSR_LE  : integer := (63 - 63);    -- Little Endian
+
     -- SPR numbers
     subtype spr_num_t is integer range 0 to 1023;
 
@@ -109,6 +118,7 @@ package common is
 
     type Decode2ToExecute1Type is record
 	valid: std_ulogic;
+        unit : unit_t;
 	insn_type: insn_type_t;
 	nia: std_ulogic_vector(63 downto 0);
 	write_reg: gspr_index_t;
@@ -141,7 +151,7 @@ package common is
         reserve : std_ulogic;                           -- set for larx/stcx
     end record;
     constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
-	(valid => '0', insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
+	(valid => '0', unit => NONE, insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
          lr => '0', rc => '0', oe => '0', invert_a => '0',
 	 invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
 	 is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0',
@@ -204,7 +214,7 @@ package common is
 
     type Execute1ToLoadstore1Type is record
 	valid : std_ulogic;
-	load : std_ulogic;				-- is this a load or store
+        op : insn_type_t;                               -- what ld/st op to do
 	addr1 : std_ulogic_vector(63 downto 0);
 	addr2 : std_ulogic_vector(63 downto 0);
 	data : std_ulogic_vector(63 downto 0);		-- data to write, unused for read
@@ -219,13 +229,14 @@ package common is
         reserve : std_ulogic;                           -- set for larx/stcx.
         rc : std_ulogic;                                -- set for stcx.
     end record;
-    constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', load => '0', ci => '0', byte_reverse => '0',
+    constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0',
                                                                      sign_extend => '0', update => '0', xerc => xerc_init,
                                                                      reserve => '0', rc => '0', others => (others => '0'));
 
     type Loadstore1ToDcacheType is record
 	valid : std_ulogic;
 	load : std_ulogic;
+        dcbz : std_ulogic;
 	nc : std_ulogic;
         reserve : std_ulogic;
 	addr : std_ulogic_vector(63 downto 0);
diff --git a/dcache.vhdl b/dcache.vhdl
index 7e553bf..7d61a85 100644
--- a/dcache.vhdl
+++ b/dcache.vhdl
@@ -581,8 +581,12 @@ begin
 		wr_data  <= r0.data;
 		wr_sel   <= r0.byte_sel;
 	    else
-		-- Otherwise, we might be doing a reload
-		wr_data <= wishbone_in.dat;
+		-- Otherwise, we might be doing a reload or a DCBZ
+                if r1.req.dcbz = '1' then
+                    wr_data <= (others => '0');
+                else
+                    wr_data <= wishbone_in.dat;
+                end if;
 		wr_sel  <= (others => '1');
 		wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS));
 	    end if;
@@ -593,7 +597,8 @@ begin
 	    if reloading and wishbone_in.ack = '1' and r1.store_way = i then
 		do_write <= '1';
 	    end if;
-	    if req_op = OP_STORE_HIT and req_hit_way = i and cancel_store = '0' then
+	    if req_op = OP_STORE_HIT and req_hit_way = i and cancel_store = '0' and
+                r1.req.dcbz = '0' then
 		assert not reloading report "Store hit while in state:" &
 		    state_t'image(r1.state)
 		    severity FAILURE;
@@ -718,18 +723,54 @@ begin
 			r1.wb.we <= '0';
 			r1.state <= NC_LOAD_WAIT_ACK;
 
-		    when OP_STORE_HIT | OP_STORE_MISS =>
-                        r1.wb.sel <= r0.byte_sel;
-                        r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000";
-			r1.wb.dat <= r0.data;
-                        if cancel_store = '0' then
+                    when OP_STORE_HIT | OP_STORE_MISS =>
+                        if r0.dcbz = '0' then
+                            r1.wb.sel <= r0.byte_sel;
+                            r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000";
+                            r1.wb.dat <= r0.data;
+                            if cancel_store = '0' then
+                                r1.wb.cyc <= '1';
+                                r1.wb.stb <= '1';
+                                r1.wb.we <= '1';
+                                r1.state <= STORE_WAIT_ACK;
+                            else
+                                r1.stcx_fail <= '1';
+                                r1.state <= IDLE;
+                            end if;
+                        else
+                            -- dcbz is handled much like a load miss except
+                            -- that we are writing to memory instead of reading
+                            r1.store_index <= req_index;
+                            r1.store_row <= get_row(req_laddr);
+
+                            if req_op = OP_STORE_HIT then
+                                r1.store_way <= req_hit_way;
+                            else
+                                r1.store_way <= replace_way;
+
+                                -- Force misses on the victim way while zeroing
+                                cache_valids(req_index)(replace_way) <= '0';
+
+                                -- Store new tag in selected way
+                                for i in 0 to NUM_WAYS-1 loop
+                                    if i = replace_way then
+                                        tagset := cache_tags(req_index);
+                                        write_tag(i, tagset, req_tag);
+                                        cache_tags(req_index) <= tagset;
+                                    end if;
+                                end loop;
+                            end if;
+
+                            -- Set up for wishbone writes
+                            r1.wb.adr <= req_laddr(r1.wb.adr'left downto 0);
+                            r1.wb.sel <= (others => '1');
+                            r1.wb.we <= '1';
+                            r1.wb.dat <= (others => '0');
                             r1.wb.cyc <= '1';
                             r1.wb.stb <= '1';
-                            r1.wb.we <= '1';
-                            r1.state <= STORE_WAIT_ACK;
-                        else
-                            r1.stcx_fail <= '1';
-                            r1.state <= IDLE;
+
+                            -- Handle the rest like a load miss
+                            r1.state <= RELOAD_WAIT_ACK;
                         end if;
 
 		    -- OP_NONE and OP_BAD do nothing
@@ -766,7 +807,7 @@ begin
 			-- not idle, which we don't currently know how to deal
 			-- with.
 			--
-			if r1.store_row = get_row(r1.req.addr) then
+			if r1.store_row = get_row(r1.req.addr) and r1.req.dcbz = '0' then
 			    r1.slow_data <= wishbone_in.dat;
 			end if;
 
diff --git a/decode1.vhdl b/decode1.vhdl
index 8c7d5f2..785b669 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -164,7 +164,7 @@ architecture behaviour of decode1 is
 		2#0000110110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbst
 		2#0100010110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbt
 		2#0011110110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbtst
-		-- 2#1111110110# dcbz
+		2#1111110110#  =>       (LDST,   OP_DCBZ,      RA_OR_ZERO, RB,          NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- dcbz
 		2#0110001001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divdeu
 		2#1110001001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divdeuo
 		2#0110001011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divweu
diff --git a/decode2.vhdl b/decode2.vhdl
index ff773aa..edcc50c 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -304,6 +304,7 @@ begin
 
 		-- execute unit
 		v.e.nia := d_in.nia;
+                v.e.unit := d_in.decode.unit;
 		v.e.insn_type := d_in.decode.insn_type;
 		v.e.read_reg1 := decoded_reg_a.reg;
 		v.e.read_data1 := decoded_reg_a.data;
diff --git a/execute1.vhdl b/execute1.vhdl
index e32285d..8286d30 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -50,6 +50,11 @@ architecture behaviour of execute1 is
 	slow_op_oe : std_ulogic;
 	slow_op_xerc : xer_common_t;
     end record;
+    constant reg_type_init : reg_type :=   -- value loaded into r while rst = '1'
+        (e => Execute1ToWritebackInit, lr_update => '0',
+         mul_in_progress => '0', div_in_progress => '0', cntz_in_progress => '0',
+         slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init,
+         others => (others => '0'));   -- every remaining field is cleared to all zeros
 
     signal r, rin : reg_type;
 
@@ -73,6 +78,28 @@ architecture behaviour of execute1 is
     signal x_to_divider: Execute1ToDividerType;
     signal divider_to_x: DividerToExecute1Type;
 
+    type privilege_level is (USER, SUPER);   -- privilege an instruction type requires
+    type op_privilege_array is array(insn_type_t) of privilege_level;
+    constant op_privilege: op_privilege_array := (   -- looked up by instr_is_privileged
+        OP_ATTN => SUPER,
+        OP_MFMSR => SUPER,
+        OP_MTMSRD => SUPER,
+        OP_RFID => SUPER,
+        others => USER   -- includes mfspr/mtspr, which instr_is_privileged decides per instruction
+        );
+
+    function instr_is_privileged(op: insn_type_t; insn: std_ulogic_vector(31 downto 0))
+        return boolean is
+    begin
+        -- Decide whether executing "op" requires supervisor state.
+        -- mfspr/mtspr are not SUPER in the op_privilege table; for them
+        -- the answer is given by bit 20 of the instruction word.
+        if op_privilege(op) /= SUPER and (op = OP_MFSPR or op = OP_MTSPR) then
+            return insn(20) = '1';
+        end if;
+        return op_privilege(op) = SUPER;
+    end;
+
     procedure set_carry(e: inout Execute1ToWritebackType;
 			carry32 : in std_ulogic;
 			carry : in std_ulogic) is
@@ -126,11 +153,11 @@ architecture behaviour of execute1 is
 	--  tion MSR bits are not saved or restored.
 	--  Full function MSR bits lie in the range 0:32, 37:41, and
 	--  48:63, and partial function MSR bits lie in the range
-	--  33:36 and 42:47.
+	--  33:36 and 42:47. (Note this is IBM bit numbering).
 	msr_out := (others => '0');
-	msr_out(32 downto 0) := msr(32 downto 0);
-	msr_out(41 downto 37) := msr(41 downto 37);
-	msr_out(63 downto 48) := msr(63 downto 48);
+	msr_out(63 downto 31) := msr(63 downto 31);
+	msr_out(26 downto 22) := msr(26 downto 22);
+	msr_out(15 downto 0)  := msr(15 downto 0);
 	return msr_out;
     end;
 
@@ -195,14 +222,20 @@ begin
     execute1_0: process(clk)
     begin
 	if rising_edge(clk) then
-	    r <= rin;
-	    ctrl <= ctrl_tmp;
-	    assert not (r.lr_update = '1' and e_in.valid = '1')
-		report "LR update collision with valid in EX1"
-		severity failure;
-	    if r.lr_update = '1' then
-		report "LR update to " & to_hstring(r.next_lr);
-	    end if;
+            if rst = '1' then
+                r <= reg_type_init;
+                ctrl.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0');
+                ctrl.irq_state <= WRITE_SRR0;
+            else
+                r <= rin;
+                ctrl <= ctrl_tmp;
+                assert not (r.lr_update = '1' and e_in.valid = '1')
+                    report "LR update collision with valid in EX1"
+                    severity failure;
+                if r.lr_update = '1' then
+                    report "LR update to " & to_hstring(r.next_lr);
+                end if;
+            end if;
 	end if;
     end process;
 
@@ -372,7 +405,7 @@ begin
 	ctrl_tmp.dec <= std_ulogic_vector(unsigned(ctrl.dec) - 1);
 
 	irq_valid := '0';
-	if ctrl.msr(63 - 48) = '1' then
+	if ctrl.msr(MSR_EE) = '1' then
 	    if ctrl.dec(63) = '1' then
 		ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#900#, 64));
 		report "IRQ valid: DEC";
@@ -409,21 +442,37 @@ begin
 	    v.e.exc_write_reg := fast_spr_num(SPR_SRR1);
 	    v.e.exc_write_data := ctrl.srr1;
             v.e.exc_write_enable := '1';
-	    ctrl_tmp.msr(63 - 48) <= '0'; -- clear EE
+            ctrl_tmp.msr(MSR_SF) <= '1';
+            ctrl_tmp.msr(MSR_EE) <= '0';
+            ctrl_tmp.msr(MSR_PR) <= '0';
+            ctrl_tmp.msr(MSR_IR) <= '0';
+            ctrl_tmp.msr(MSR_DR) <= '0';
+            ctrl_tmp.msr(MSR_RI) <= '0';
+            ctrl_tmp.msr(MSR_LE) <= '1';
 	    f_out.redirect <= '1';
 	    f_out.redirect_nia <= ctrl.irq_nia;
 	    v.e.valid := e_in.valid;
 	    report "Writing SRR1: " & to_hstring(ctrl.srr1);
 
-	elsif irq_valid = '1' then
+	elsif irq_valid = '1' and e_in.valid = '1' then
 	    -- we need two cycles to write srr0 and 1
 	    -- will need more when we have to write DSISR, DAR and HIER
             -- Don't deliver the interrupt until we have a valid instruction
             -- coming in, so we have a valid NIA to put in SRR0.
-	    exception := e_in.valid;
+	    exception := '1';
 	    ctrl_tmp.srr1 <= msr_copy(ctrl.msr);
 
-	elsif e_in.valid = '1' then
+        elsif e_in.valid = '1' and ctrl.msr(MSR_PR) = '1' and
+            instr_is_privileged(e_in.insn_type, e_in.insn) then
+            -- generate a program interrupt
+            exception := '1';
+            ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#700#, 64));
+            ctrl_tmp.srr1 <= msr_copy(ctrl.msr);
+            -- set bit 45 to indicate privileged instruction type interrupt
+            ctrl_tmp.srr1(63 - 45) <= '1';
+            report "privileged instruction";
+            
+	elsif e_in.valid = '1' and e_in.unit = ALU then
 
 	    report "execute nia " & to_hstring(e_in.nia);
 
@@ -555,7 +604,7 @@ begin
 	    when OP_B =>
 		f_out.redirect <= '1';
 		if (insn_aa(e_in.insn)) then
-		    f_out.redirect_nia <= std_ulogic_vector(signed(b_in));
+		    f_out.redirect_nia <= b_in;
 		else
 		    f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
 		end if;
@@ -571,7 +620,7 @@ begin
 		if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then
 		    f_out.redirect <= '1';
 		    if (insn_aa(e_in.insn)) then
-			f_out.redirect_nia <= std_ulogic_vector(signed(b_in));
+			f_out.redirect_nia <= b_in;
 		    else
 			f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
 		    end if;
@@ -594,7 +643,17 @@ begin
 	    when OP_RFID =>
 		f_out.redirect <= '1';
 		f_out.redirect_nia <= a_in(63 downto 2) & "00"; -- srr0
-		ctrl_tmp.msr <= msr_copy(std_ulogic_vector(signed(b_in))); -- srr1
+                -- Can't use msr_copy here because the partial function MSR
+                -- bits should be left unchanged, not zeroed.
+                ctrl_tmp.msr(63 downto 31) <= b_in(63 downto 31);
+                ctrl_tmp.msr(26 downto 22) <= b_in(26 downto 22);
+                ctrl_tmp.msr(15 downto 0)  <= b_in(15 downto 0);
+                if b_in(MSR_PR) = '1' then
+                    ctrl_tmp.msr(MSR_EE) <= '1';
+                    ctrl_tmp.msr(MSR_IR) <= '1';
+                    ctrl_tmp.msr(MSR_DR) <= '1';
+                end if;
+
 	    when OP_CMPB =>
 		result := ppc_cmpb(c_in, b_in);
 		result_en := '1';
@@ -668,7 +727,7 @@ begin
 		    end loop;
 		end if;
 	    when OP_MFMSR =>
-		result := msr_copy(ctrl.msr);
+		result := ctrl.msr;
 		result_en := '1';
 	    when OP_MFSPR =>
 		report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
@@ -724,9 +783,23 @@ begin
 		    v.e.write_cr_mask := num_to_fxm(crnum);
 		end if;
 		v.e.write_cr_data := c_in(31 downto 0);
-	    when OP_MTMSRD =>
-		-- FIXME handle just the bits we need to.
-		ctrl_tmp.msr <= msr_copy(c_in);
+            when OP_MTMSRD =>
+                if e_in.insn(16) = '1' then
+                    -- just update EE and RI
+                    ctrl_tmp.msr(MSR_EE) <= c_in(MSR_EE);
+                    ctrl_tmp.msr(MSR_RI) <= c_in(MSR_RI);
+                else
+                    -- Architecture says to leave out bits 3 (HV), 51 (ME)
+                    -- and 63 (LE) (IBM bit numbering)
+                    ctrl_tmp.msr(63 downto 61) <= c_in(63 downto 61);
+                    ctrl_tmp.msr(59 downto 13) <= c_in(59 downto 13);
+                    ctrl_tmp.msr(11 downto 1)  <= c_in(11 downto 1);
+                    if c_in(MSR_PR) = '1' then
+                        ctrl_tmp.msr(MSR_EE) <= '1';
+                        ctrl_tmp.msr(MSR_IR) <= '1';
+                        ctrl_tmp.msr(MSR_DR) <= '1';
+                    end if;
+                end if;
 	    when OP_MTSPR =>
 		report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
 		    "=" & to_hstring(c_in);
@@ -781,11 +854,6 @@ begin
 		stall_out <= '1';
 		x_to_divider.valid <= '1';
 
-            when OP_LOAD | OP_STORE =>
-                -- loadstore/dcache has its own port to writeback
-                v.e.valid := '0';
-                lv.valid := '1';
-
             when others =>
 		terminate_out <= '1';
 		report "illegal";
@@ -811,6 +879,14 @@ begin
 		report "Delayed LR update to " & to_hstring(next_nia);
 		stall_out <= '1';
 	    end if;
+
+        elsif e_in.valid = '1' then
+            -- instruction for other units, i.e. LDST
+            v.e.valid := '0';
+            if e_in.unit = LDST then
+                lv.valid := '1';
+            end if;
+
 	elsif r.lr_update = '1' then
 	    result_en := '1';
 	    result := r.next_lr;
@@ -877,9 +953,7 @@ begin
 	v.e.write_enable := result_en;
 
         -- Outputs to loadstore1 (async)
-        if e_in.insn_type = OP_LOAD then
-            lv.load := '1';
-        end if;
+        lv.op := e_in.insn_type;
         lv.addr1 := a_in;
         lv.addr2 := b_in;
         lv.data := c_in;
diff --git a/hello_world/Makefile b/hello_world/Makefile
index 674095e..a609199 100644
--- a/hello_world/Makefile
+++ b/hello_world/Makefile
@@ -1,7 +1,7 @@
 ARCH = $(shell uname -m)
 ifneq ("$(ARCH)", "ppc64")
 ifneq ("$(ARCH)", "ppc64le")
-	CROSS_COMPILE ?= powerpc64le-linux-
+	CROSS_COMPILE ?= powerpc64le-linux-gnu-
 endif
 endif
 
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index 518feee..90650db 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -3,6 +3,7 @@ use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;
 
 library work;
+use work.decode_types.all;
 use work.common.all;
 use work.helpers.all;
 
@@ -41,7 +42,8 @@ architecture behave of loadstore1 is
 
     type reg_stage_t is record
         -- latch most of the input request
-	load         : std_ulogic;
+        load         : std_ulogic;
+        dcbz         : std_ulogic;
 	addr         : std_ulogic_vector(63 downto 0);
 	store_data   : std_ulogic_vector(63 downto 0);
 	load_data    : std_ulogic_vector(63 downto 0);
@@ -146,59 +148,63 @@ begin
         two_dwords := or (r.second_bytes);
 
         -- load data formatting
-        if r.load = '1' then
-            byte_offset := unsigned(r.addr(2 downto 0));
-            brev_lenm1 := "000";
-            if r.byte_reverse = '1' then
-                brev_lenm1 := unsigned(r.length(2 downto 0)) - 1;
-            end if;
+        byte_offset := unsigned(r.addr(2 downto 0));
+        brev_lenm1 := "000";
+        if r.byte_reverse = '1' then
+            brev_lenm1 := unsigned(r.length(2 downto 0)) - 1;
+        end if;
 
-            -- shift and byte-reverse data bytes
-            for i in 0 to 7 loop
-                kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
-                use_second(i) := kk(3);
-                j := to_integer(kk(2 downto 0)) * 8;
-                data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j);
-            end loop;
-
-            -- Work out the sign bit for sign extension.
-            -- Assumes we are not doing both sign extension and byte reversal,
-            -- in that for unaligned loads crossing two dwords we end up
-            -- using a bit from the second dword, whereas for a byte-reversed
-            -- (i.e. big-endian) load the sign bit would be in the first dword.
-            negative := (r.length(3) and data_permuted(63)) or
-                        (r.length(2) and data_permuted(31)) or
-                        (r.length(1) and data_permuted(15)) or
-                        (r.length(0) and data_permuted(7));
-
-            -- trim and sign-extend
-            for i in 0 to 7 loop
-                if i < to_integer(unsigned(r.length)) then
-                    if two_dwords = '1' then
-                        trim_ctl(i) := '1' & not use_second(i);
-                    else
-                        trim_ctl(i) := not use_second(i) & '0';
-                    end if;
+        -- shift and byte-reverse data bytes
+        for i in 0 to 7 loop
+            kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
+            use_second(i) := kk(3);
+            j := to_integer(kk(2 downto 0)) * 8;
+            data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j);
+        end loop;
+
+        -- Work out the sign bit for sign extension.
+        -- Assumes we are not doing both sign extension and byte reversal,
+        -- in that for unaligned loads crossing two dwords we end up
+        -- using a bit from the second dword, whereas for a byte-reversed
+        -- (i.e. big-endian) load the sign bit would be in the first dword.
+        negative := (r.length(3) and data_permuted(63)) or
+                    (r.length(2) and data_permuted(31)) or
+                    (r.length(1) and data_permuted(15)) or
+                    (r.length(0) and data_permuted(7));
+
+        -- trim and sign-extend
+        for i in 0 to 7 loop
+            if i < to_integer(unsigned(r.length)) then
+                if two_dwords = '1' then
+                    trim_ctl(i) := '1' & not use_second(i);
                 else
-                    trim_ctl(i) := '0' & (negative and r.sign_extend);
+                    trim_ctl(i) := not use_second(i) & '0';
                 end if;
-                case trim_ctl(i) is
-                    when "11" =>
-                        data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8);
-                    when "10" =>
-                        data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8);
-                    when "01" =>
-                        data_trimmed(i * 8 + 7 downto i * 8) := x"FF";
-                    when others =>
-                        data_trimmed(i * 8 + 7 downto i * 8) := x"00";
-                end case;
-            end loop;
-        end if;
+            else
+                trim_ctl(i) := '0' & (negative and r.sign_extend);
+            end if;
+            case trim_ctl(i) is
+                when "11" =>
+                    data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8);
+                when "10" =>
+                    data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8);
+                when "01" =>
+                    data_trimmed(i * 8 + 7 downto i * 8) := x"FF";
+                when others =>
+                    data_trimmed(i * 8 + 7 downto i * 8) := x"00";
+            end case;
+        end loop;
 
         case r.state is
         when IDLE =>
             if l_in.valid = '1' then
-                v.load := l_in.load;
+                v.load := '0';
+                v.dcbz := '0';
+                if l_in.op = OP_LOAD then
+                    v.load := '1';
+                elsif l_in.op = OP_DCBZ then
+                    v.dcbz := '1';
+                end if;
                 v.addr := lsu_sum;
                 v.write_reg := l_in.write_reg;
                 v.length := l_in.length;
@@ -229,18 +235,16 @@ begin
                 v.addr := lsu_sum;
 
                 -- Do byte reversing and rotating for stores in the first cycle
-                if v.load = '0' then
-                    byte_offset := unsigned(lsu_sum(2 downto 0));
-                    brev_lenm1 := "000";
-                    if l_in.byte_reverse = '1' then
-                        brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
-                    end if;
-                    for i in 0 to 7 loop
-                        k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
-                        j := to_integer(k) * 8;
-                        v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
-                    end loop;
+                byte_offset := unsigned(lsu_sum(2 downto 0));
+                brev_lenm1 := "000";
+                if l_in.byte_reverse = '1' then
+                    brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
                 end if;
+                for i in 0 to 7 loop
+                    k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
+                    j := to_integer(k) * 8;
+                    v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
+                end loop;
 
                 req := '1';
                 stall := '1';
@@ -293,6 +297,7 @@ begin
         -- Update outputs to dcache
         d_out.valid <= req;
         d_out.load <= v.load;
+        d_out.dcbz <= v.dcbz;
         d_out.nc <= v.nc;
         d_out.reserve <= v.reserve;
         d_out.addr <= addr;
diff --git a/rust_lib_demo/Makefile b/rust_lib_demo/Makefile
index 26aebf8..fdbb18b 100644
--- a/rust_lib_demo/Makefile
+++ b/rust_lib_demo/Makefile
@@ -1,7 +1,7 @@
 ARCH = $(shell uname -m)
 ifneq ("$(ARCH)", "ppc64")
 ifneq ("$(ARCH)", "ppc64le")
-	CROSS_COMPILE ?= powerpc64le-linux-
+	CROSS_COMPILE ?= powerpc64le-linux-gnu-
 endif
 endif
 
diff --git a/tests/Makefile.test b/tests/Makefile.test
index 9676370..250135d 100644
--- a/tests/Makefile.test
+++ b/tests/Makefile.test
@@ -1,7 +1,7 @@
 ARCH = $(shell uname -m)
 ifneq ("$(ARCH)", "ppc64")
 ifneq ("$(ARCH)", "ppc64le")
-        CROSS_COMPILE ?= powerpc64le-linux-
+        CROSS_COMPILE ?= powerpc64le-linux-gnu-
         endif
         endif
 
diff --git a/tests/privileged/Makefile b/tests/privileged/Makefile
new file mode 100644
index 0000000..7c24998
--- /dev/null
+++ b/tests/privileged/Makefile
@@ -0,0 +1,3 @@
+TEST=privileged
+
+include ../Makefile.test
diff --git a/tests/privileged/head.S b/tests/privileged/head.S
new file mode 100644
index 0000000..9b76234
--- /dev/null
+++ b/tests/privileged/head.S
@@ -0,0 +1,91 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * 	http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define STACK_TOP 0x8000
+
+/* Load the 64-bit constant e into register r, assembled from its four 16-bit parts */
+#define LOAD_IMM64(r, e)			\
+	lis     r,(e)@highest;			\
+	ori     r,r,(e)@higher;			\
+	rldicr  r,r, 32, 31;			\
+	oris    r,r, (e)@h;			\
+	ori     r,r, (e)@l;
+
+	.section ".head","ax"
+
+	/*
+	 * Microwatt currently enters in LE mode at 0x0, so we don't need to
+	 * do any endian fix ups
+	 */
+	. = 0
+.global _start
+_start:
+	b	boot_entry
+
+.global boot_entry
+boot_entry:
+	/* set up the stack pointer (r1) 0x100 bytes below STACK_TOP */
+	LOAD_IMM64(%r1, STACK_TOP - 0x100)
+	LOAD_IMM64(%r12, main)	/* r12 = address of main */
+	mtctr	%r12		/* call main() through CTR ... */
+	bctrl			/* ... with LR set to the attn below */
+	attn // terminate on exit
+	b .			/* branch-to-self: spin here should execution continue past attn */
+
+	/* Call a function with a specified MSR value: r3 = arg, r4 = fn, r5 = msr (see the C prototype) */
+	.global	call_with_msr
+call_with_msr:
+	mtsrr0	%r4		/* SRR0 = fn: the address rfid jumps to */
+	mr	%r12,%r4	/* r12 = fn too; presumably for the ELFv2 global-entry convention - confirm */
+	mtsrr1	%r5		/* SRR1 = msr: the MSR value rfid installs */
+	rfid			/* LR is left untouched, so a blr in fn or in a vector returns to our caller */
+
+#define EXCEPTION(nr)		/* stub at vector nr: put nr in r3 and return through LR */ \
+	.= nr			;\
+	li	%r3,nr		;\
+	blr
+
+	EXCEPTION(0x300)
+	EXCEPTION(0x380)
+	EXCEPTION(0x400)
+	EXCEPTION(0x480)
+	EXCEPTION(0x500)
+	EXCEPTION(0x600)
+	EXCEPTION(0x700)
+	EXCEPTION(0x800)
+	EXCEPTION(0x900)
+	EXCEPTION(0x980)
+	EXCEPTION(0xa00)
+	EXCEPTION(0xb00)
+
+	/*
+	 * System call - used to exit from tests where MSR[PR]
+	 * may have been set.
+	 */
+	. = 0xc00
+	blr
+
+	EXCEPTION(0xd00)
+	EXCEPTION(0xe00)
+	EXCEPTION(0xe20)
+	EXCEPTION(0xe40)
+	EXCEPTION(0xe60)
+	EXCEPTION(0xe80)
+	EXCEPTION(0xf00)
+	EXCEPTION(0xf20)
+	EXCEPTION(0xf40)
+	EXCEPTION(0xf60)
+	EXCEPTION(0xf80)
diff --git a/tests/privileged/powerpc.lds b/tests/privileged/powerpc.lds
new file mode 100644
index 0000000..8c8c65b
--- /dev/null
+++ b/tests/privileged/powerpc.lds
@@ -0,0 +1,13 @@
+SECTIONS
+{
+	_start = .;
+	. = 0;			/* .head (entry point and exception vectors) is linked at address 0 */
+	.head : {
+		KEEP(*(.head))	/* keep .head even if the linker garbage-collects unused sections */
+ 	}
+	. = 0x2000;		/* code is placed above the vector area */
+	.text : { *(.text) }
+	. = 0x4000;		/* initialised data starts here, .bss follows it */
+	.data : { *(.data) }
+	.bss : { *(.bss) }
+}
diff --git a/tests/privileged/privileged.c b/tests/privileged/privileged.c
new file mode 100644
index 0000000..073dc07
--- /dev/null
+++ b/tests/privileged/privileged.c
@@ -0,0 +1,152 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "console.h"
+
+#define MSR_EE	0x8000	/* external interrupt enable */
+#define MSR_PR	0x4000	/* problem (unprivileged) state */
+#define MSR_IR	0x0020	/* instruction relocate */
+#define MSR_DR	0x0010	/* data relocate */
+/* Implemented in assembly: enters fn via rfid with the MSR loaded from msr. */
+extern int call_with_msr(unsigned long arg, int (*fn)(unsigned long), unsigned long msr);
+
+#define SRR0	26	/* SPR number passed to mfspr()/mtspr() for SRR0 */
+#define SRR1	27	/* SPR number passed to mfspr()/mtspr() for SRR1 */
+
+/* Read SPR number sprnum; the "i" constraint needs a compile-time constant. */
+static inline unsigned long mfspr(int sprnum)
+{
+	unsigned long val;	/* unsigned, matching the return type */
+	__asm__ volatile("mfspr %0,%1" : "=r" (val) : "i" (sprnum));
+	return val;
+}
+
+static inline void mtspr(int sprnum, unsigned long val)
+{	/* Write val to SPR number sprnum; "i" needs a compile-time constant. */
+	__asm__ volatile("mtspr %0,%1" : : "i" (sprnum), "r" (val));
+}
+
+void print_string(const char *str)
+{	/* Send each character before the terminating NUL to putchar(). */
+	while (*str != '\0')
+		putchar(*str++);
+}
+
+/* Print the low ndigits hex digits of val, zero padded, in lower case. */
+void print_hex(unsigned long val, int ndigits)
+{
+	static const char hexdigits[] = "0123456789abcdef";
+	int shift = (ndigits - 1) * 4;
+
+	/* Walk the nibbles from most to least significant. */
+	while (shift >= 0) {
+		putchar(hexdigits[(val >> shift) & 0xf]);
+		shift -= 4;
+	}
+}
+
+/* Print "test NN:" with i as two decimal digits; requires i < 100. */
+void print_test_number(int i)
+{
+	print_string("test ");
+	putchar('0' + i / 10);
+	putchar('0' + i % 10);
+	putchar(':');
+}
+
+int priv_fn_1(unsigned long x)
+{
+	__asm__ volatile("attn");		/* instruction under test: attn */
+	__asm__ volatile("li 3,0; sc");	/* if it did not trap: r3 = 0, exit through sc */
+	return 0;
+}
+
+int priv_fn_2(unsigned long x)
+{
+	__asm__ volatile("mfmsr 3");	/* instruction under test: read the MSR into r3 */
+	__asm__ volatile("sc");		/* if it did not trap: exit through sc, r3 as mfmsr left it */
+	return 0;
+}
+
+int priv_fn_3(unsigned long x)
+{
+	__asm__ volatile("mtmsrd 3");	/* instruction under test: write r3 (presumably still x) to the MSR */
+	__asm__ volatile("li 3,0; sc");	/* if it did not trap: r3 = 0, exit through sc */
+	return 0;
+}
+
+int priv_fn_4(unsigned long x)
+{
+	__asm__ volatile("rfid");		/* instruction under test: rfid */
+	__asm__ volatile("li 3,0; sc");	/* if it did not trap: r3 = 0, exit through sc */
+	return 0;
+}
+
+int priv_fn_5(unsigned long x)
+{
+	__asm__ volatile("mfsrr0 3");	/* instruction under test: read SRR0 into r3 */
+	__asm__ volatile("sc");		/* if it did not trap: exit through sc, r3 as mfsrr0 left it */
+	return 0;
+}
+
+int priv_fn_6(unsigned long x)
+{
+	__asm__ volatile("mtsrr0 3");	/* instruction under test: write r3 (presumably still x) to SRR0 */
+	__asm__ volatile("sc");		/* if it did not trap: exit through sc with r3 unchanged */
+	return 0;
+}
+
+int priv_test(int (*fn)(unsigned long))
+{
+	unsigned long msr, expected_srr1;
+	int vec;
+
+	__asm__ volatile ("mtdec %0" : : "r" (0x7fffffff));	/* presumably keeps the decrementer quiet */
+	__asm__ volatile ("mfmsr %0" : "=r" (msr));
+	/* Enter fn with MSR[PR] set; it must come back through the 0x700 vector. */
+	vec = call_with_msr(0, fn, msr | MSR_PR);
+	if (vec != 0x700)
+		return vec | 1;
+	/* SRR1 must hold the MSR plus PR/EE/IR/DR and 0x40000 (presumably the privileged-insn flag). */
+	expected_srr1 = msr | MSR_PR | MSR_EE | MSR_IR | MSR_DR | 0x40000;
+	if (mfspr(SRR1) != expected_srr1)
+		return 2;
+	return 0;
+}
+
+int fail = 0;	/* set to 1 by do_test() on any failing test; main() returns it */
+
+void do_test(int num, int (*fn)(unsigned long))
+{
+	int ret;
+
+	print_test_number(num);
+	if ((ret = priv_test(fn)) == 0) {
+		print_string("PASS\r\n");
+		return;
+	}
+	/* Failure: record it and dump the result code with SRR0/SRR1. */
+	fail = 1;
+	print_string("FAIL ");
+	print_hex(ret, 4);
+	print_string(" SRR0=");
+	print_hex(mfspr(SRR0), 16);
+	print_string(" SRR1=");
+	print_hex(mfspr(SRR1), 16);
+	print_string("\r\n");
+}
+
+int main(void)
+{
+	/* Test number N exercises priv_fn_N, in this order. */
+	static int (*const tests[])(unsigned long) = {
+		priv_fn_1, priv_fn_2, priv_fn_3, priv_fn_4, priv_fn_5, priv_fn_6,
+	};
+	int i;
+
+	potato_uart_init();
+	for (i = 0; i < (int)(sizeof(tests) / sizeof(tests[0])); i++)
+		do_test(i + 1, tests[i]);
+	return fail;
+}
diff --git a/tests/test_privileged.bin b/tests/test_privileged.bin
new file mode 100755
index 0000000..5b8ce63
Binary files /dev/null and b/tests/test_privileged.bin differ
diff --git a/tests/test_privileged.console_out b/tests/test_privileged.console_out
new file mode 100644
index 0000000..a49bb9b
--- /dev/null
+++ b/tests/test_privileged.console_out
@@ -0,0 +1,6 @@
+test 01:PASS
+test 02:PASS
+test 03:PASS
+test 04:PASS
+test 05:PASS
+test 06:PASS
diff --git a/tests/update_console_tests b/tests/update_console_tests
index 11306bb..94e74d1 100755
--- a/tests/update_console_tests
+++ b/tests/update_console_tests
@@ -3,7 +3,7 @@
 # Script to update console related tests from source
 #
 
-for i in sc illegal decrementer xics ; do
+for i in sc illegal decrementer xics privileged ; do
     cd $i
     make
     cd -
diff --git a/wishbone_debug_master.vhdl b/wishbone_debug_master.vhdl
index 11b9ee3..ddf6923 100644
--- a/wishbone_debug_master.vhdl
+++ b/wishbone_debug_master.vhdl
@@ -49,6 +49,7 @@ architecture behaviour of wishbone_debug_master is
     
     type state_t is (IDLE, WB_CYCLE, DMI_WAIT);
     signal state : state_t;
+    signal do_inc : std_ulogic;
 
 begin
 
@@ -84,16 +85,16 @@ begin
 		reg_addr <= (others => '0');
 		reg_ctrl <= (others => '0');
 	    else 	    -- Standard register writes
-		if dmi_req and dmi_wr then
+                if do_inc = '1' then
+		    -- Address register auto-increment
+		    reg_addr <= std_ulogic_vector(unsigned(reg_addr) +
+						  decode_autoinc(reg_ctrl(10 downto 9)));
+                elsif dmi_req and dmi_wr then
 		    if dmi_addr = DBG_WB_ADDR then
 			reg_addr <= dmi_din;
 		    elsif dmi_addr = DBG_WB_CTRL then
 			reg_ctrl <= dmi_din(10 downto 0);
 		    end if;
-                elsif state = WB_CYCLE and (wb_in.ack and reg_ctrl(8))= '1'  then
-		    -- Address register auto-increment
-		    reg_addr <= std_ulogic_vector(unsigned(reg_addr) +
-						  decode_autoinc(reg_ctrl(10 downto 9)));
 		end if;
 	    end if;
 	end if;
@@ -145,6 +146,7 @@ begin
 	    if (rst) then
 		state <= IDLE;
 		wb_out.stb <= '0';
+                do_inc <= '0';
 	    else
 		case state is
 		when IDLE =>
@@ -162,11 +164,13 @@ begin
 			--
 			wb_out.stb <= '0';
 			state <= DMI_WAIT;
+                        do_inc <= reg_ctrl(8);
 		    end if;
 		when DMI_WAIT =>
 		    if dmi_req = '0' then
 			state <= IDLE;
 		    end if;
+                    do_inc <= '0';
 		end case;
 	    end if;
 	end if;