diff --git a/dcache.vhdl b/dcache.vhdl
index 5eb659d..f4403e4 100644
--- a/dcache.vhdl
+++ b/dcache.vhdl
@@ -264,6 +264,23 @@ architecture rtl of dcache is
     -- subsequent load requests to the same line can be completed as
     -- soon as the necessary data comes in from memory, without
     -- waiting for the whole line to be read.
+    --
+    -- Aligned loads and stores of a doubleword or less are atomic
+    -- because they are done in a single wishbone operation.
+    -- For quadword atomic loads and stores we rely on the wishbone
+    -- arbiter not interrupting access to a target once it has first
+    -- given access; i.e. once we have the main wishbone, no other
+    -- master gets access until we drop cyc.
+    --
+    -- Note on loads potentially hitting the victim line that is
+    -- currently being replaced: the new tag is available starting
+    -- with the 3rd cycle of RELOAD_WAIT_ACK state. As long as the
+    -- first read on the wishbone takes at least one cycle (i.e. the
+    -- ack doesn't arrive in the same cycle as stb was asserted),
+    -- r1.full will be true at least until that 3rd cycle and so a load
+    -- following a load miss can't hit on the old tag of the victim
+    -- line. As long as ack is not generated combinationally from
+    -- stb, this will be fine.

     -- Stage 0 register, basically contains just the latched request
     type reg_stage_0_t is record
@@ -307,12 +324,16 @@ architecture rtl of dcache is
         full             : std_ulogic;          -- have uncompleted request
         mmu_req          : std_ulogic;          -- request is from MMU
         req              : mem_access_request_t;
+        atomic_more      : std_ulogic;          -- atomic request isn't finished

         -- Cache hit state
         hit_way          : way_t;
         hit_load_valid   : std_ulogic;
         hit_index        : index_t;
         cache_hit        : std_ulogic;
+        prev_hit         : std_ulogic;
+        prev_way         : way_t;
+        prev_hit_reload  : std_ulogic;

         -- TLB hit state
         tlb_hit          : std_ulogic;
@@ -389,6 +410,7 @@ architecture rtl of dcache is
     signal req_same_tag : std_ulogic;
     signal req_go : std_ulogic;
     signal req_nc : std_ulogic;
+    signal req_hit_reload : std_ulogic;

     signal early_req_row : row_t;
     signal early_rd_valid : std_ulogic;
@@ -927,6 +949,7 @@ begin
         variable fwd_match : std_ulogic;
         variable snp_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0);
         variable snoop_match : std_ulogic;
+        variable hit_reload : std_ulogic;
     begin
         -- Extract line, row and tag from request
         rindex := get_index(r0.req.addr);
@@ -1071,6 +1094,7 @@ begin
            assert not is_X(rindex);
            assert not is_X(r1.store_index);
        end if;
+        hit_reload := '0';
        if r1.state = RELOAD_WAIT_ACK and rel_match = '1' and
            rindex = r1.store_index then
            -- Ignore is_hit from above, because a load miss writes the new tag
@@ -1085,11 +1109,23 @@ begin
                r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) or
                use_forward_rl;
            hit_way := replace_way;
+            hit_reload := is_hit;
+        elsif r0.req.load = '1' and r0.req.atomic_qw = '1' and r0.req.atomic_first = '0' and
+            r0.req.nc = '0' and perm_attr.nocache = '0' and r1.prev_hit = '1' then
+            -- For the second half of an atomic quadword load, just use the
+            -- same way as the first half, without considering whether the line
+            -- is valid; it is as if we had read the second dword at the same
+            -- time as the first dword, and the line was valid back then.
+            -- (Cases where the line is currently being reloaded are handled above.)
+            -- NB lq to noncacheable isn't required to be atomic per the ISA.
+            is_hit := '1';
+            hit_way := r1.prev_way;
        end if;

        -- The way that matched on a hit
        req_hit_way <= hit_way;
        req_is_hit <= is_hit;
+        req_hit_reload <= hit_reload;

        -- work out whether we have permission for this access
        -- NB we don't yet implement AMR, thus no KUAP
@@ -1418,6 +1454,8 @@ begin
                r1.acks_pending <= to_unsigned(0, 3);
                r1.stalled <= '0';
                r1.dec_acks <= '0';
+                r1.prev_hit <= '0';
+                r1.prev_hit_reload <= '0';

                reservation.valid <= '0';
                reservation.addr <= (others => '0');
@@ -1443,9 +1481,7 @@ begin
            if req_go = '1' and access_ok = '1' and r0.req.load = '1' and
                r0.req.reserve = '1' and r0.req.atomic_first = '1' then
                reservation.addr <= ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS);
-                if req_is_hit = '1' then
-                    reservation.valid <= not req_snoop_hit;
-                end if;
+                reservation.valid <= req_is_hit and not req_snoop_hit;
            end if;

            -- Do invalidations from snooped stores to memory
@@ -1488,8 +1524,8 @@ begin
            req.flush := r0.req.flush;
            req.touch := r0.req.touch;
            req.reserve := r0.req.reserve;
-            req.first_dw := r0.req.atomic_first;
-            req.last_dw := r0.req.atomic_last;
+            req.first_dw := not r0.req.atomic_qw or r0.req.atomic_first;
+            req.last_dw := not r0.req.atomic_qw or r0.req.atomic_last;
            req.real_addr := ra;
            -- Force data to 0 for dcbz
            if r0.req.dcbz = '1' then
@@ -1528,6 +1564,11 @@ begin
            if req_op_load_miss = '1' or (r0.req.dcbz = '1' and req_is_hit = '0') then
                r1.choose_victim <= '1';
            end if;
+            if req_go = '1' then
+                r1.prev_hit <= req_is_hit;
+                r1.prev_way <= req_hit_way;
+                r1.prev_hit_reload <= req_hit_reload;
+            end if;

            -- Update count of pending acks
            acks := r1.acks_pending;
@@ -1549,6 +1590,7 @@ begin
                    r1.wb.sel <= req.byte_sel;
                    r1.wb.dat <= req.data;
                    r1.dcbz <= req.dcbz;
+                    r1.atomic_more <= not req.last_dw;

                    -- Keep track of our index and way for subsequent stores.
                    r1.store_index <= get_index(req.real_addr);
@@ -1659,7 +1701,7 @@ begin
                        assert not is_X(r1.req.real_addr);
                    end if;
                    if r1.full = '1' and r1.req.same_tag = '1' and
-                        ((r1.dcbz = '1' and req.dcbz = '1') or r1.req.op_lmiss = '1') and
+                        ((r1.dcbz = '1' and r1.req.dcbz = '1') or r1.req.op_lmiss = '1') and
                        r1.store_row = get_row(r1.req.real_addr) then
                        r1.full <= '0';
                        r1.slow_valid <= '1';
@@ -1668,12 +1710,9 @@ begin
                        else
                            r1.mmu_done <= '1';
                        end if;
-                        -- NB: for lqarx, set the reservation on the first
-                        -- dword so that a snooped store between the two
-                        -- dwords will kill the reservation.
-                        if req.reserve = '1' and req.first_dw = '1' then
+                        -- NB: for lqarx, set the reservation on the first dword
+                        if r1.req.reserve = '1' and r1.req.first_dw = '1' then
                            reservation.valid <= '1';
-                            reservation.addr <= req.real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS);
                        end if;
                    end if;

@@ -1690,6 +1729,10 @@ begin
                        cache_valids(to_integer(r1.store_index))(to_integer(r1.store_way)) <= '1';

                        ev.dcache_refill <= not r1.dcbz;
+                        -- Second half of a lq/lqarx can assume a hit on this line now
+                        -- if the first half hit this line.
+                        r1.prev_hit <= r1.prev_hit_reload;
+                        r1.prev_way <= r1.store_way;
                        r1.state <= IDLE;
                    end if;

@@ -1703,6 +1746,10 @@ begin
                    if wishbone_in.stall = '0' then
                        -- See if there is another store waiting to be done
                        -- which is in the same real page.
+                        -- This could be either in r1.req or in r0.
+                        -- Ignore store-conditionals, they have to go through
+                        -- DO_STCX state, unless they are the second half of a
+                        -- successful stqcx, which is handled here.
                        if req.valid = '1' then
                            r1.wb.adr(SET_SIZE_BITS - ROW_OFF_BITS - 1 downto 0) <=
                                req.real_addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS);
@@ -1710,28 +1757,33 @@ begin
                            r1.wb.sel <= req.byte_sel;
                        end if;
                        assert not is_X(acks);
-                        if acks < 7 and req.same_tag = '1' and req.dcbz = '0' and
-                            req.op_store = '1' then
-                            r1.wb.stb <= '1';
-                            stbs_done := false;
-                            r1.store_way <= req.hit_way;
-                            r1.store_row <= get_row(req.real_addr);
-                            r1.write_bram <= req.is_hit;
-                            r1.full <= '0';
-                            r1.slow_valid <= '1';
-                            -- Store requests never come from the MMU
-                            r1.ls_valid <= '1';
-                            stbs_done := false;
+                        r1.wb.stb <= '0';
+                        if req.op_store = '1' and req.same_tag = '1' and req.dcbz = '0' and
+                            (req.reserve = '0' or r1.atomic_more = '1') then
+                            if acks < 7 then
+                                r1.wb.stb <= '1';
+                                stbs_done := false;
+                                r1.store_way <= req.hit_way;
+                                r1.store_row <= get_row(req.real_addr);
+                                r1.write_bram <= req.is_hit;
+                                r1.atomic_more <= not req.last_dw;
+                                r1.full <= '0';
+                                r1.slow_valid <= '1';
+                                -- Store requests never come from the MMU
+                                r1.ls_valid <= '1';
+                            end if;
                        else
-                            r1.wb.stb <= '0';
                            stbs_done := true;
+                            if req.valid = '1' then
+                                r1.atomic_more <= '0';
+                            end if;
                        end if;
                    end if;

                    -- Got ack ? See if complete.
-                    if wishbone_in.ack = '1' then
+                    if stbs_done and r1.atomic_more = '0' then
                        assert not is_X(acks);
-                        if stbs_done and acks = 1 then
+                        if acks = 0 or (wishbone_in.ack = '1' and acks = 1) then
                            r1.state <= IDLE;
                            r1.wb.cyc <= '0';
                            r1.wb.stb <= '0';
@@ -1770,31 +1822,30 @@ begin
                        r1.wb.cyc <= '0';
                        r1.wb.stb <= '0';
                        reservation.valid <= '0';
+                        -- If this is the first half of a stqcx., the second half
+                        -- will fail also because the reservation is not valid.
+                        r1.state <= IDLE;
                    elsif r1.wb.cyc = '0' then
                        -- Right address and have reservation, so start the
                        -- wishbone cycle
                        r1.wb.we <= '1';
                        r1.wb.cyc <= '1';
                        r1.wb.stb <= '1';
-                    else
-                        if wishbone_in.stall = '0' then
-                            -- Store has been accepted, so now we can write the
-                            -- cache data RAM
-                            r1.write_bram <= req.is_hit;
-                            r1.wb.stb <= '0';
-                        end if;
-                        if wishbone_in.ack = '1' then
-                            r1.state <= IDLE;
-                            r1.wb.cyc <= '0';
-                            r1.wb.stb <= '0';
-                            r1.full <= '0';
-                            r1.slow_valid <= '1';
-                            r1.ls_valid <= '1';
-                            -- For stqcx., kill the reservation on the last dword
-                            if r1.req.last_dw = '1' then
-                                reservation.valid <= '0';
-                            end if;
-                        end if;
+                    elsif r1.wb.stb = '1' and wishbone_in.stall = '0' then
+                        -- Store has been accepted, so now we can write the
+                        -- cache data RAM and complete the request
+                        r1.write_bram <= r1.req.is_hit;
+                        r1.wb.stb <= '0';
+                        r1.full <= '0';
+                        r1.slow_valid <= '1';
+                        r1.ls_valid <= '1';
+                        reservation.valid <= '0';
+                        -- For a stqcx, STORE_WAIT_ACK will issue the second half
+                        -- without checking the reservation, which is what we want
+                        -- given that the first half has gone out.
+                        -- With r1.atomic_more set, STORE_WAIT_ACK won't exit to
+                        -- IDLE state until it sees the second half.
+                        r1.state <= STORE_WAIT_ACK;
                    end if;

                when FLUSH_CYCLE =>
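
Not part of the patch: the comment added at the top of this diff relies on the
system's wishbone arbiter holding the bus for the dcache as long as cyc stays
asserted. The fragment below is a minimal, hypothetical illustration of that
grant-hold behaviour, not microwatt's actual wishbone_arbiter.vhdl; the entity
name grant_hold_arbiter and its ports cyc_in and grant are invented for the
example, and it uses fixed priority rather than round robin to keep it short.

library ieee;
use ieee.std_logic_1164.all;

entity grant_hold_arbiter is
    port (
        clk    : in  std_ulogic;
        rst    : in  std_ulogic;
        cyc_in : in  std_ulogic_vector(1 downto 0);   -- cyc from each master
        grant  : out std_ulogic_vector(1 downto 0)    -- one-hot grant to masters
    );
end entity grant_hold_arbiter;

architecture rtl of grant_hold_arbiter is
    signal grant_r : std_ulogic_vector(1 downto 0) := "00";
begin
    grant <= grant_r;

    process(clk)
    begin
        if rising_edge(clk) then
            if rst = '1' then
                grant_r <= "00";
            elsif (grant_r and cyc_in) = "00" then
                -- No granted master is still holding cyc, so a new grant
                -- may be chosen (fixed priority here for brevity).
                if cyc_in(0) = '1' then
                    grant_r <= "01";
                elsif cyc_in(1) = '1' then
                    grant_r <= "10";
                else
                    grant_r <= "00";
                end if;
            end if;
            -- While the granted master keeps cyc asserted, grant_r is held,
            -- so back-to-back beats under one cyc are never interleaved.
        end if;
    end process;
end architecture rtl;

Because the grant can only move once the granted master has dropped cyc, the
two doubleword beats of a quadword lq/stq/lqarx/stqcx. issued under a single
cyc can never be interleaved with another master's access, which is what makes
the single-cyc quadword sequence atomic.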