From bdd4d041629f92484806812e54744ed5d8413c55 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 11 Jun 2022 19:20:57 +1000
Subject: [PATCH 01/30] Simplify flow control in the dcache and loadstore units

Simplify the flow control by stalling the whole upstream pipeline when
a stage can't proceed, instead of trying to let each stage progress
independently when it can.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 dcache.vhdl     |   6 +-
 loadstore1.vhdl | 334 ++++++++++++++++++++++++------------------------
 2 files changed, 173 insertions(+), 167 deletions(-)

diff --git a/dcache.vhdl b/dcache.vhdl
index 8f7af52..2d5ebe3 100644
--- a/dcache.vhdl
+++ b/dcache.vhdl
@@ -588,7 +588,7 @@ begin
             end if;
             if rst = '1' then
                 r0_full <= '0';
-            elsif (r1.full = '0' and d_in.hold = '0') or r0_full = '0' then
+            elsif r1.full = '0' and d_in.hold = '0' then
                 r0 <= r;
                 r0_full <= r.req.valid;
             elsif r0.d_valid = '0' then
@@ -605,9 +605,9 @@ begin
     m_out.stall <= '0';
 
     -- Hold off the request in r0 when r1 has an uncompleted request
-    r0_stall <= r0_full and (r1.full or d_in.hold);
+    r0_stall <= r1.full or d_in.hold;
     r0_valid <= r0_full and not r1.full and not d_in.hold;
-    stall_out <= r0_stall;
+    stall_out <= r1.full;
 
     events <= ev;
 
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index 6c4b0db..ea7baec 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -43,9 +43,7 @@ architecture behave of loadstore1 is
 
     -- State machine for unaligned loads/stores
     type state_t is (IDLE,              -- ready for instruction
-                     MMU_LOOKUP,        -- waiting for MMU to look up translation
-                     TLBIE_WAIT,        -- waiting for MMU to finish doing a tlbie
-                     FINISH_LFS         -- write back converted SP data for lfs*
+                     MMU_WAIT           -- waiting for MMU to finish doing something
                      );
 
     type byte_index_t is array(0 to 7) of unsigned(2 downto 0);
@@ -63,9 +61,7 @@ architecture behave of loadstore1 is
         write_spr    : std_ulogic;
         mmu_op       : std_ulogic;
         instr_fault  : std_ulogic;
-        load_zero    : std_ulogic;
         do_update    : std_ulogic;
-        noop         : std_ulogic;
         mode_32bit   : std_ulogic;
 	addr         : std_ulogic_vector(63 downto 0);
         byte_sel     : std_ulogic_vector(7 downto 0);
@@ -93,11 +89,12 @@ architecture behave of loadstore1 is
         align_intr   : std_ulogic;
         dword_index  : std_ulogic;
         two_dwords   : std_ulogic;
+        incomplete   : std_ulogic;
         nia          : std_ulogic_vector(63 downto 0);
     end record;
     constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0', tlbie => '0',
                                           dcbz => '0', read_spr => '0', write_spr => '0', mmu_op => '0',
-                                          instr_fault => '0', load_zero => '0', do_update => '0', noop => '0',
+                                          instr_fault => '0', do_update => '0',
                                           mode_32bit => '0', addr => (others => '0'),
                                           byte_sel => x"00", second_bytes => x"00",
                                           store_data => (others => '0'), instr_tag => instr_tag_init,
@@ -108,11 +105,12 @@ architecture behave of loadstore1 is
                                           atomic => '0', atomic_last => '0', rc => '0', nc => '0',
                                           virt_mode => '0', priv_mode => '0', load_sp => '0',
                                           sprn => 10x"0", is_slbia => '0', align_intr => '0',
-                                          dword_index => '0', two_dwords => '0',
+                                          dword_index => '0', two_dwords => '0', incomplete => '0',
                                           nia => (others => '0'));
 
     type reg_stage1_t is record
         req : request_t;
+        busy : std_ulogic;
         issued : std_ulogic;
         addr0 : std_ulogic_vector(63 downto 0);
     end record;
@@ -121,6 +119,7 @@ architecture behave of loadstore1 is
         req        : request_t;
         byte_index : byte_index_t;
         use_second : std_ulogic_vector(7 downto 0);
+        busy       : std_ulogic;
         wait_dc    : std_ulogic;
         wait_mmu   : std_ulogic;
         one_cycle  : std_ulogic;
@@ -130,6 +129,7 @@ architecture behave of loadstore1 is
 
     type reg_stage3_t is record
         state        : state_t;
+        complete     : std_ulogic;
         instr_tag    : instr_tag_t;
         write_enable : std_ulogic;
 	write_reg    : gspr_index_t;
@@ -137,7 +137,6 @@ architecture behave of loadstore1 is
         rc           : std_ulogic;
         xerc         : xer_common_t;
         store_done   : std_ulogic;
-        convert_lfs  : std_ulogic;
         load_data    : std_ulogic_vector(63 downto 0);
         dar          : std_ulogic_vector(63 downto 0);
         dsisr        : std_ulogic_vector(31 downto 0);
@@ -157,6 +156,7 @@ architecture behave of loadstore1 is
     signal r2, r2in : reg_stage2_t;
     signal r3, r3in : reg_stage3_t;
 
+    signal flush    : std_ulogic;
     signal busy     : std_ulogic;
     signal complete : std_ulogic;
     signal in_progress : std_ulogic;
@@ -166,12 +166,9 @@ architecture behave of loadstore1 is
     signal load_dp_data  : std_ulogic_vector(63 downto 0);
     signal store_data    : std_ulogic_vector(63 downto 0);
 
-    signal stage1_issue_enable : std_ulogic;
     signal stage1_req          : request_t;
     signal stage1_dcreq        : std_ulogic;
     signal stage1_dreq         : std_ulogic;
-    signal stage2_busy_next    : std_ulogic;
-    signal stage3_busy_next    : std_ulogic;
 
     -- Generate byte enables from sizes
     function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
@@ -274,7 +271,11 @@ begin
     begin
         if rising_edge(clk) then
             if rst = '1' then
+                r1.busy <= '0';
+                r1.issued <= '0';
                 r1.req.valid <= '0';
+                r1.req.dc_req <= '0';
+                r1.req.incomplete <= '0';
                 r1.req.tlbie <= '0';
                 r1.req.is_slbia <= '0';
                 r1.req.instr_fault <= '0';
@@ -284,6 +285,7 @@ begin
                 r1.req.xerc <= xerc_init;
 
                 r2.req.valid <= '0';
+                r2.busy <= '0';
                 r2.req.tlbie <= '0';
                 r2.req.is_slbia <= '0';
                 r2.req.instr_fault <= '0';
@@ -301,8 +303,8 @@ begin
                 r3.state <= IDLE;
                 r3.write_enable <= '0';
                 r3.interrupt <= '0';
+                r3.complete <= '0';
                 r3.stage1_en <= '1';
-                r3.convert_lfs <= '0';
                 r3.events.load_complete <= '0';
                 r3.events.store_complete <= '0';
                 flushing <= '0';
@@ -311,7 +313,7 @@ begin
                 r2 <= r2in;
                 r3 <= r3in;
                 flushing <= (flushing or (r1in.req.valid and r1in.req.align_intr)) and
-                            not r3in.interrupt;
+                            not flush;
             end if;
             stage1_dreq <= stage1_dcreq;
             if d_in.valid = '1' then
@@ -321,7 +323,7 @@ begin
                 assert r2.req.valid = '1' and r2.req.dc_req = '1' and r3.state = IDLE severity failure;
             end if;
             if m_in.done = '1' or m_in.err = '1' then
-                assert r2.req.valid = '1' and (r3.state = MMU_LOOKUP or r3.state = TLBIE_WAIT) severity failure;
+                assert r2.req.valid = '1' and r3.state = MMU_WAIT severity failure;
             end if;
         end if;
     end process;
@@ -507,6 +509,7 @@ begin
             when others =>
         end case;
         v.dc_req := l_in.valid and (v.load or v.store or v.dcbz) and not v.align_intr;
+        v.incomplete := v.dc_req and v.two_dwords;
 
         -- Work out controls for load and store formatting
         brev_lenm1 := "000";
@@ -518,17 +521,10 @@ begin
         req_in <= v;
     end process;
 
-    busy <= r1.req.valid and ((r1.req.dc_req and not r1.issued) or
-                              (r1.issued and d_in.error) or
-                              stage2_busy_next or
-                              (r1.req.dc_req and r1.req.two_dwords and not r1.req.dword_index));
-    complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or
-                (r2.wait_mmu and m_in.done) or r3.convert_lfs;
+    busy <= dc_stall or d_in.error or r1.busy or r2.busy;
+    complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or r3.complete;
     in_progress <= r1.req.valid or (r2.req.valid and not complete);
 
-    stage1_issue_enable <= r3.stage1_en and not (r1.req.valid and r1.req.mmu_op) and
-                           not (r2.req.valid and r2.req.mmu_op);
-
     -- Processing done in the first cycle of a load/store instruction
     loadstore1_1: process(all)
         variable v     : reg_stage1_t;
@@ -538,10 +534,11 @@ begin
     begin
         v := r1;
         issue := '0';
+        dcreq := '0';
 
-        if busy = '0' then
+        if r1.busy = '0' then
             req := req_in;
-            v.issued := '0';
+            req.valid := l_in.valid;
             if flushing = '1' then
                 -- Make this a no-op request rather than simply invalid.
                 -- It will never get to stage 3 since there is a request ahead of
@@ -554,37 +551,49 @@ begin
             end if;
         else
             req := r1.req;
-        end if;
-
-        if r1.req.valid = '1' then
             if r1.req.dc_req = '1' and r1.issued = '0' then
                 issue := '1';
-            elsif r1.issued = '1' and d_in.error = '1' then
-                v.issued := '0';
-            elsif stage2_busy_next = '0' then
-                -- we can change what's in r1 next cycle because the current thing
-                -- in r1 will go into r2
-                if r1.req.dc_req = '1' and r1.req.two_dwords = '1' and r1.req.dword_index = '0' then
-                    -- construct the second request for a misaligned access
-                    req.dword_index := '1';
-                    req.addr := std_ulogic_vector(unsigned(r1.req.addr(63 downto 3)) + 1) & "000";
-                    if r1.req.mode_32bit = '1' then
-                        req.addr(32) := '0';
-                    end if;
-                    req.byte_sel := r1.req.second_bytes;
-                    issue := '1';
+            elsif r1.req.incomplete = '1' then
+                -- construct the second request for a misaligned access
+                req.dword_index := '1';
+                req.incomplete := '0';
+                req.addr := std_ulogic_vector(unsigned(r1.req.addr(63 downto 3)) + 1) & "000";
+                if r1.req.mode_32bit = '1' then
+                    req.addr(32) := '0';
                 end if;
+                req.byte_sel := r1.req.second_bytes;
+                issue := '1';
+            else
+                -- For the lfs conversion cycle, leave the request valid
+                -- for another cycle but with req.dc_req = 0.
+                -- For an MMU request last cycle, we have nothing
+                -- to do in this cycle, so make it invalid.
+                if r1.req.load_sp = '0' then
+                    req.valid := '0';
+                end if;
+                req.dc_req := '0';
             end if;
         end if;
-        if r3in.interrupt = '1' then
-            req.valid := '0';
-            issue := '0';
-        end if;
 
-        v.req := req;
-        dcreq := issue and stage1_issue_enable and not d_in.error and not dc_stall;
-        if issue = '1' then
-            v.issued := dcreq;
+        if flush = '1' then
+            v.req.valid := '0';
+            v.req.dc_req := '0';
+            v.req.incomplete := '0';
+            v.issued := '0';
+            v.busy := '0';
+        elsif (dc_stall or d_in.error or r2.busy) = '0' then
+            -- we can change what's in r1 next cycle because the current thing
+            -- in r1 will go into r2
+            v.req := req;
+            dcreq := issue;
+            v.issued := issue;
+            v.busy := (issue and (req.incomplete or req.load_sp)) or (req.valid and req.mmu_op);
+        else
+            -- pipeline is stalled
+            if r1.issued = '1' and d_in.error = '1' then
+                v.issued := '0';
+                v.busy := '1';
+            end if;
         end if;
 
         stage1_req <= req;
@@ -602,6 +611,7 @@ begin
         variable kk : unsigned(3 downto 0);
         variable idx : unsigned(2 downto 0);
         variable byte_offset : unsigned(2 downto 0);
+        variable interrupt : std_ulogic;
     begin
         v := r2;
 
@@ -614,44 +624,61 @@ begin
             store_data(i * 8 + 7 downto i * 8) <= r1.req.store_data(j + 7 downto j);
         end loop;
 
-        if stage3_busy_next = '0' and
-            (r1.req.valid = '0' or r1.issued = '1' or r1.req.dc_req = '0') then
-            v.req := r1.req;
-            v.addr0 := r1.addr0;
-            v.req.store_data := store_data;
-            v.wait_dc := r1.req.valid and r1.req.dc_req and not r1.req.load_sp and
-                         not (r1.req.two_dwords and not r1.req.dword_index);
-            v.wait_mmu := r1.req.valid and r1.req.mmu_op;
-            v.one_cycle := r1.req.valid and (r1.req.noop or r1.req.read_spr or
-                                             (r1.req.write_spr and not r1.req.mmu_op) or
-                                             r1.req.load_zero or r1.req.do_update);
-            if r1.req.read_spr = '1' then
-                v.wr_sel := "00";
-            elsif r1.req.do_update = '1' or r1.req.store = '1' then
-                v.wr_sel := "01";
-            elsif r1.req.load_sp = '1' then
-                v.wr_sel := "10";
+        if (dc_stall or d_in.error or r2.busy) = '0' then
+            if r1.req.valid = '0' or r1.issued = '1' or r1.req.dc_req = '0' then
+                v.req := r1.req;
+                v.addr0 := r1.addr0;
+                v.req.store_data := store_data;
+                v.wait_dc := r1.req.valid and r1.req.dc_req and not r1.req.load_sp and
+                             not r1.req.incomplete;
+                v.wait_mmu := r1.req.valid and r1.req.mmu_op;
+                v.busy := r1.req.valid and r1.req.mmu_op;
+                v.one_cycle := r1.req.valid and not (r1.req.dc_req or r1.req.mmu_op);
+                if r1.req.read_spr = '1' then
+                    v.wr_sel := "00";
+                elsif r1.req.do_update = '1' or r1.req.store = '1' then
+                    v.wr_sel := "01";
+                elsif r1.req.load_sp = '1' then
+                    v.wr_sel := "10";
+                else
+                    v.wr_sel := "11";
+                end if;
+
+                -- Work out load formatter controls for next cycle
+                for i in 0 to 7 loop
+                    idx := to_unsigned(i, 3) xor r1.req.brev_mask;
+                    kk := ('0' & idx) + ('0' & byte_offset);
+                    v.use_second(i) := kk(3);
+                    v.byte_index(i) := kk(2 downto 0);
+                end loop;
             else
-                v.wr_sel := "11";
+                v.req.valid := '0';
+                v.wait_dc := '0';
+                v.wait_mmu := '0';
+                v.one_cycle := '0';
+            end if;
+        end if;
+        if r2.wait_mmu = '1' and m_in.done = '1' then
+            if r2.req.mmu_op = '1' then
+                v.req.valid := '0';
+                v.busy := '0';
             end if;
-
-            -- Work out load formatter controls for next cycle
-            for i in 0 to 7 loop
-                idx := to_unsigned(i, 3) xor r1.req.brev_mask;
-                kk := ('0' & idx) + ('0' & byte_offset);
-                v.use_second(i) := kk(3);
-                v.byte_index(i) := kk(2 downto 0);
-            end loop;
-        elsif stage3_busy_next = '0' then
-            v.req.valid := '0';
-            v.wait_dc := '0';
             v.wait_mmu := '0';
         end if;
+        if r2.busy = '1' and r2.wait_mmu = '0' then
+            v.busy := '0';
+        end if;
 
-        stage2_busy_next <= r1.req.valid and stage3_busy_next;
-
-        if r3in.interrupt = '1' then
+        interrupt := (r2.req.valid and r2.req.align_intr) or
+                     (d_in.error and d_in.cache_paradox) or m_in.err;
+        if interrupt = '1' then
             v.req.valid := '0';
+            v.busy := '0';
+            v.wait_dc := '0';
+            v.wait_mmu := '0';
+        elsif d_in.error = '1' then
+            v.wait_mmu := '1';
+            v.busy := '1';
         end if;
 
         r2in <= v;
@@ -671,7 +698,6 @@ begin
         variable write_data    : std_ulogic_vector(63 downto 0);
         variable do_update     : std_ulogic;
         variable done          : std_ulogic;
-        variable part_done     : std_ulogic;
         variable exception     : std_ulogic;
         variable data_permuted : std_ulogic_vector(63 downto 0);
         variable data_trimmed  : std_ulogic_vector(63 downto 0);
@@ -687,13 +713,12 @@ begin
         mmureq := '0';
         mmu_mtspr := '0';
         done := '0';
-        part_done := '0';
         exception := '0';
         dsisr := (others => '0');
         write_enable := '0';
         sprval := (others => '0');
         do_update := '0';
-        v.convert_lfs := '0';
+        v.complete := '0';
         v.srr1 := (others => '0');
         v.events := (others => '0');
 
@@ -775,94 +800,74 @@ begin
                 -- generate alignment interrupt
                 exception := '1';
             end if;
-            if r2.req.load_zero = '1' then
-                write_enable := '1';
-            end if;
             if r2.req.do_update = '1' then
                 do_update := '1';
             end if;
-        end if;
-
-        case r3.state is
-        when IDLE =>
-            if d_in.valid = '1' then
-                if r2.req.two_dwords = '0' or r2.req.dword_index = '1' then
-                    write_enable := r2.req.load and not r2.req.load_sp;
-                    if HAS_FPU and r2.req.load_sp = '1' then
-                        -- SP to DP conversion takes a cycle
-                        v.state := FINISH_LFS;
-                        v.convert_lfs := '1';
-                    else
-                        -- stores write back rA update
-                        do_update := r2.req.update and r2.req.store;
-                    end if;
-                else
-                    part_done := '1';
-                end if;
+            if r2.req.load_sp = '1' and r2.req.dc_req = '0' then
+                write_enable := '1';
             end if;
-            if d_in.error = '1' then
-                if d_in.cache_paradox = '1' then
-                    -- signal an interrupt straight away
-                    exception := '1';
-                    dsisr(63 - 38) := not r2.req.load;
-                    -- XXX there is no architected bit for this
-                    -- (probably should be a machine check in fact)
-                    dsisr(63 - 35) := d_in.cache_paradox;
+            if r2.req.write_spr = '1' and r2.req.mmu_op = '0' then
+                if r2.req.sprn(0) = '0' then
+                    v.dsisr := r2.req.store_data(31 downto 0);
                 else
-                    -- Look up the translation for TLB miss
-                    -- and also for permission error and RC error
-                    -- in case the PTE has been updated.
-                    mmureq := '1';
-                    v.state := MMU_LOOKUP;
-                    v.stage1_en := '0';
+                    v.dar := r2.req.store_data;
                 end if;
             end if;
-            if r2.req.valid = '1' then
-                if r2.req.mmu_op = '1' then
-                    -- send request (tlbie, mtspr, itlb miss) to MMU
-                    mmureq := not r2.req.write_spr;
-                    mmu_mtspr := r2.req.write_spr;
-                    if r2.req.instr_fault = '1' then
-                        v.state := MMU_LOOKUP;
-                        v.events.itlb_miss := '1';
-                    else
-                        v.state := TLBIE_WAIT;
-                    end if;
-                elsif r2.req.write_spr = '1' then
-                    if r2.req.sprn(0) = '0' then
-                        v.dsisr := r2.req.store_data(31 downto 0);
-                    else
-                        v.dar := r2.req.store_data;
-                    end if;
-                end if;
+        end if;
+
+        if r3.state = IDLE and r2.req.valid = '1' and r2.req.mmu_op = '1' then
+            -- send request (tlbie, mtspr, itlb miss) to MMU
+            mmureq := not r2.req.write_spr;
+            mmu_mtspr := r2.req.write_spr;
+            if r2.req.instr_fault = '1' then
+                v.events.itlb_miss := '1';
             end if;
+            v.state := MMU_WAIT;
+        end if;
 
-        when MMU_LOOKUP =>
-            if m_in.done = '1' then
-                if r2.req.instr_fault = '0' then
-                    -- retry the request now that the MMU has installed a TLB entry
-                    req := '1';
-                    v.stage1_en := '1';
-                    v.state := IDLE;
-                end if;
+        if d_in.valid = '1' then
+            if r2.req.incomplete = '0' then
+                write_enable := r2.req.load and not r2.req.load_sp;
+                -- stores write back rA update
+                do_update := r2.req.update and r2.req.store;
             end if;
-            if m_in.err = '1' then
+        end if;
+        if d_in.error = '1' then
+            if d_in.cache_paradox = '1' then
+                -- signal an interrupt straight away
                 exception := '1';
-                dsisr(63 - 33) := m_in.invalid;
-                dsisr(63 - 36) := m_in.perm_error;
-                dsisr(63 - 38) := r2.req.store or r2.req.dcbz;
-                dsisr(63 - 44) := m_in.badtree;
-                dsisr(63 - 45) := m_in.rc_error;
+                dsisr(63 - 38) := not r2.req.load;
+                -- XXX there is no architected bit for this
+                -- (probably should be a machine check in fact)
+                dsisr(63 - 35) := d_in.cache_paradox;
+            else
+                -- Look up the translation for TLB miss
+                -- and also for permission error and RC error
+                -- in case the PTE has been updated.
+                mmureq := '1';
+                v.state := MMU_WAIT;
+                v.stage1_en := '0';
             end if;
+        end if;
 
-        when TLBIE_WAIT =>
-
-        when FINISH_LFS =>
-            write_enable := '1';
-
-        end case;
+        if m_in.done = '1' then
+            if r2.req.dc_req = '1' then
+                -- retry the request now that the MMU has installed a TLB entry
+                req := '1';
+            else
+                v.complete := '1';
+            end if;
+        end if;
+        if m_in.err = '1' then
+            exception := '1';
+            dsisr(63 - 33) := m_in.invalid;
+            dsisr(63 - 36) := m_in.perm_error;
+            dsisr(63 - 38) := r2.req.store or r2.req.dcbz;
+            dsisr(63 - 44) := m_in.badtree;
+            dsisr(63 - 45) := m_in.rc_error;
+        end if;
 
-        if complete = '1' or exception = '1' then
+        if (m_in.done or m_in.err) = '1' then
             v.stage1_en := '1';
             v.state := IDLE;
         end if;
@@ -915,7 +920,7 @@ begin
         end case;
 
         -- Update outputs to dcache
-        if stage1_issue_enable = '1' then
+        if r3.stage1_en = '1' then
             d_out.valid <= stage1_dcreq;
             d_out.load <= stage1_req.load;
             d_out.dcbz <= stage1_req.dcbz;
@@ -945,7 +950,7 @@ begin
         else
             d_out.data <= r2.req.store_data;
         end if;
-        d_out.hold <= r2.req.valid and r2.req.load_sp and d_in.valid;
+        d_out.hold <= '0';
 
         -- Update outputs to MMU
         m_out.valid <= mmureq;
@@ -980,8 +985,7 @@ begin
 
         events <= r3.events;
 
-        -- Busy calculation.
-        stage3_busy_next <= r2.req.valid and not (complete or part_done or exception);
+        flush <= exception;
 
         -- Update registers
         r3in <= v;
@@ -1001,7 +1005,9 @@ begin
                             d_out.valid &
                             m_in.done &
                             r2.req.dword_index &
-                            std_ulogic_vector(to_unsigned(state_t'pos(r3.state), 3));
+                            r2.req.valid &
+                            r2.wait_dc &
+                            std_ulogic_vector(to_unsigned(state_t'pos(r3.state), 1));
             end if;
         end process;
         log_out <= log_data;

From 204fedc63f7831e35cea09688b6e5249de8938da Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 29 Jun 2022 20:02:36 +1000
Subject: [PATCH 02/30] Move XER low bits out of register file

Besides the overflow and status carry bits, XER has 18 bits which need
to retain the value written by mtxer, in case software wants to
emulate the move-assist instructions (lswi, lswx, stswi, stswx).
Until now these bits (and others) have been stored in the GPR file as
a "fast" SPR, but this causes complications because XER is not really
a fast SPR.

Instead, we now store these 18 bits in the 'ctrl' signal, which exists
in execute1.  This will enable us to simplify the data path in future,
and has the added bonus that with a little bit of plumbing, we can get
the full XER value printed when dumping registers at the end of a
simulation.

Therefore this changes scripts/run_test.sh to remove the greps which
exclude XER from the comparison of actual and expected register
results.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl         |  7 ++++---
 core.vhdl           |  7 ++++---
 cr_file.vhdl        | 10 ++++++++++
 execute1.vhdl       | 42 +++++++++++++++++++-----------------------
 register_file.vhdl  |  1 -
 scripts/run_test.sh |  4 ++--
 6 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 14a8801..bab5aed 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -114,7 +114,7 @@ package common is
 
     -- The XER is split: the common bits (CA, OV, SO, OV32 and CA32) are
     -- in the CR file as a kind of CR extension (with a separate write
-    -- control). The rest is stored as a fast SPR.
+    -- control). The rest is stored in ctrl_t (effectively in execute1).
     type xer_common_t is record
 	ca : std_ulogic;
 	ca32 : std_ulogic;
@@ -192,7 +192,10 @@ package common is
 	dec: std_ulogic_vector(63 downto 0);
 	msr: std_ulogic_vector(63 downto 0);
         cfar: std_ulogic_vector(63 downto 0);
+        xer_low: std_ulogic_vector(17 downto 0);
     end record;
+    constant ctrl_t_init : ctrl_t :=
+        (xer_low => 18x"0", others => (others => '0'));
 
     type Fetch1ToIcacheType is record
 	req: std_ulogic;
@@ -739,8 +742,6 @@ package body common is
            n := 10;
        when SPR_HSPRG1 =>
            n := 11;
-       when SPR_XER =>
-           n := 12;
        when SPR_TAR =>
            n := 13;
        when others =>
diff --git a/core.vhdl b/core.vhdl
index b18f09a..070a1f1 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -145,7 +145,7 @@ architecture behave of core is
     signal dbg_gpr_addr : gspr_index_t;
     signal dbg_gpr_data : std_ulogic_vector(63 downto 0);
 
-    signal msr : std_ulogic_vector(63 downto 0);
+    signal ctrl_debug : ctrl_t;
 
     -- PMU event bus
     signal icache_events    : IcacheEventType;
@@ -333,6 +333,7 @@ begin
             d_out => cr_file_to_decode2,
             w_in => writeback_to_cr_file,
             sim_dump => sim_cr_dump,
+            ctrl => ctrl_debug,
             log_out => log_data(183 downto 171)
             );
 
@@ -359,7 +360,7 @@ begin
             bypass_data => execute1_bypass,
             bypass_cr_data => execute1_cr_bypass,
 	    icache_inval => ex1_icache_inval,
-            dbg_msr_out => msr,
+            dbg_ctrl_out => ctrl_debug,
             wb_events => writeback_events,
             ls_events => loadstore_events,
             dc_events => dcache_events,
@@ -482,7 +483,7 @@ begin
 	    terminate => terminate,
 	    core_stopped => dbg_core_is_stopped,
 	    nia => fetch1_to_icache.nia,
-            msr => msr,
+            msr => ctrl_debug.msr,
             dbg_gpr_req => dbg_gpr_req,
             dbg_gpr_ack => dbg_gpr_ack,
             dbg_gpr_addr => dbg_gpr_addr,
diff --git a/cr_file.vhdl b/cr_file.vhdl
index e9788cb..d1aedba 100644
--- a/cr_file.vhdl
+++ b/cr_file.vhdl
@@ -18,6 +18,7 @@ entity cr_file is
         d_out : out CrFileToDecode2Type;
 
         w_in  : in WritebackToCrFileType;
+        ctrl  : in ctrl_t;
 
         -- debug
         sim_dump : in std_ulogic;
@@ -84,9 +85,18 @@ begin
 
     sim_dump_test: if SIM generate
         dump_cr: process(all)
+            variable xer : std_ulogic_vector(31 downto 0);
         begin
             if sim_dump = '1' then
                 report "CR 00000000" & to_hstring(crs);
+                xer := (others => '0');
+                xer(31) := xerc.so;
+                xer(30) := xerc.ov;
+                xer(29) := xerc.ca;
+                xer(19) := xerc.ov32;
+                xer(18) := xerc.ca32;
+                xer(17 downto 0) := ctrl.xer_low;
+                report "XER 00000000" & to_hstring(xer);
                 assert false report "end of test" severity failure;
             end if;
         end process;
diff --git a/execute1.vhdl b/execute1.vhdl
index 955a1da..b955b75 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -41,7 +41,7 @@ entity execute1 is
         bypass_data : out bypass_data_t;
         bypass_cr_data : out cr_bypass_data_t;
 
-        dbg_msr_out : out std_ulogic_vector(63 downto 0);
+        dbg_ctrl_out : out ctrl_t;
 
 	icache_inval : out std_ulogic;
 	terminate_out : out std_ulogic;
@@ -99,8 +99,8 @@ architecture behaviour of execute1 is
     signal mshort_p : std_ulogic_vector(31 downto 0) := (others => '0');
 
     signal valid_in : std_ulogic;
-    signal ctrl: ctrl_t;
-    signal ctrl_tmp: ctrl_t;
+    signal ctrl: ctrl_t := ctrl_t_init;
+    signal ctrl_tmp: ctrl_t := ctrl_t_init;
     signal right_shift, rot_clear_left, rot_clear_right: std_ulogic;
     signal rot_sign_ext: std_ulogic;
     signal rotator_result: std_ulogic_vector(63 downto 0);
@@ -249,6 +249,13 @@ architecture behaviour of execute1 is
         return x(n - 1) = '1';
     end;
 
+    function assemble_xer(xerc: xer_common_t; xer_low: std_ulogic_vector)
+        return std_ulogic_vector is
+    begin
+        return 32x"0" & xerc.so & xerc.ov & xerc.ca & "000000000" &
+            xerc.ov32 & xerc.ca32 & xer_low(17 downto 0);
+    end;
+
     -- Tell vivado to keep the hierarchy for the random module so that the
     -- net names in the xdc file match.
     attribute keep_hierarchy : string;
@@ -336,7 +343,7 @@ begin
             );
     end generate;
 
-    dbg_msr_out <= ctrl.msr;
+    dbg_ctrl_out <= ctrl;
     log_rd_addr <= r.log_addr_spr;
 
     a_in <= e_in.read_data1;
@@ -402,9 +409,7 @@ begin
 	if rising_edge(clk) then
             if rst = '1' then
                 r <= reg_type_init;
-                ctrl.tb <= (others => '0');
-                ctrl.dec <= (others => '0');
-                ctrl.cfar <= (others => '0');
+                ctrl <= ctrl_t_init;
                 ctrl.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0');
             else
                 r <= rin;
@@ -1043,19 +1048,11 @@ begin
 		    "=" & to_hstring(a_in);
 		if is_fast_spr(e_in.read_reg1) = '1' then
 		    spr_val := a_in;
-                    if decode_spr_num(e_in.insn) = SPR_XER then
-			-- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer
-			spr_val(63 downto 32) := (others => '0');
-			spr_val(63-32) := xerc_in.so;
-			spr_val(63-33) := xerc_in.ov;
-			spr_val(63-34) := xerc_in.ca;
-			spr_val(63-35 downto 63-43) := "000000000";
-			spr_val(63-44) := xerc_in.ov32;
-			spr_val(63-45) := xerc_in.ca32;
-                    end if;
 		else
                     spr_val := c_in;
                     case decode_spr_num(e_in.insn) is
+                    when SPR_XER =>
+                        spr_val := assemble_xer(xerc_in, ctrl.xer_low);
 		    when SPR_TB =>
 			spr_val := ctrl.tb;
 		    when SPR_TBU =>
@@ -1118,17 +1115,16 @@ begin
 	    when OP_MTSPR =>
 		report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
 		    "=" & to_hstring(c_in);
-		if is_fast_spr(e_in.write_reg) then
-		    if decode_spr_num(e_in.insn) = SPR_XER then
+		if is_fast_spr(e_in.write_reg) = '0' then
+		    -- slow spr
+		    case decode_spr_num(e_in.insn) is
+                    when SPR_XER =>
 			v.e.xerc.so := c_in(63-32);
 			v.e.xerc.ov := c_in(63-33);
 			v.e.xerc.ca := c_in(63-34);
 			v.e.xerc.ov32 := c_in(63-44);
 			v.e.xerc.ca32 := c_in(63-45);
-		    end if;
-		else
-		    -- slow spr
-		    case decode_spr_num(e_in.insn) is
+                        ctrl_tmp.xer_low <= c_in(17 downto 0);
 		    when SPR_DEC =>
 			ctrl_tmp.dec <= c_in;
                     when 724 =>     -- LOG_ADDR SPR
diff --git a/register_file.vhdl b/register_file.vhdl
index b5e7246..ab35855 100644
--- a/register_file.vhdl
+++ b/register_file.vhdl
@@ -143,7 +143,6 @@ begin
 
                 report "LR " & to_hstring(registers(to_integer(unsigned(fast_spr_num(SPR_LR)))));
                 report "CTR " & to_hstring(registers(to_integer(unsigned(fast_spr_num(SPR_CTR)))));
-                report "XER " & to_hstring(registers(to_integer(unsigned(fast_spr_num(SPR_XER)))));
                 sim_dump_done <= '1';
             else
                 sim_dump_done <= '0';
diff --git a/scripts/run_test.sh b/scripts/run_test.sh
index 9fcb7ce..185c3a6 100755
--- a/scripts/run_test.sh
+++ b/scripts/run_test.sh
@@ -21,9 +21,9 @@ cd $TMPDIR
 
 cp ${MICROWATT_DIR}/tests/${TEST}.bin main_ram.bin
 
-${MICROWATT_DIR}/core_tb | sed 's/.*: //' | egrep '^(GPR[0-9]|LR |CTR |XER |CR [0-9])' | sort | grep -v GPR31 | grep -v XER > test.out || true
+${MICROWATT_DIR}/core_tb | sed 's/.*: //' | egrep '^(GPR[0-9]|LR |CTR |XER |CR [0-9])' | sort | grep -v GPR31 > test.out || true
 
-grep -v "^$" ${MICROWATT_DIR}/tests/${TEST}.out | sort | grep -v GPR31 | grep -v XER > exp.out
+grep -v "^$" ${MICROWATT_DIR}/tests/${TEST}.out | sort | grep -v GPR31 > exp.out
 
 cp test.out /tmp
 cp exp.out /tmp

From 813e2317bf1f1c10d988f660c0a4282da316a3b9 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 18 Jun 2022 16:24:30 +1000
Subject: [PATCH 03/30] execute1: Restructure to separate out execution of side
 effects

We now have a record that represents the actions taken in executing an
instruction, and a process that computes that for the incoming
instruction.  We no longer have 'current' or 'r.cur_instr'; instead
things like the destination register are put into r.e in the first
cycle of an instruction and not reinitialized in subsequent busy
cycles.

For mfspr and mtspr, we now decode "slow" SPR numbers (those SPRs that
are not stored in the register file) to a new "spr_selector" record
in decode1 (excluding those in the loadstore unit).  With this, the
result for mfspr is determined in the data path.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl   |  25 +-
 cr_file.vhdl  |   6 +-
 decode1.vhdl  |  38 ++-
 decode2.vhdl  |  20 +-
 execute1.vhdl | 903 ++++++++++++++++++++++++++++----------------------
 5 files changed, 579 insertions(+), 413 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index bab5aed..7ecf4e2 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -124,6 +124,23 @@ package common is
     end record;
     constant xerc_init : xer_common_t := (others => '0');
 
+    subtype spr_selector is std_ulogic_vector(2 downto 0);
+    type spr_id is record
+        sel   : spr_selector;
+        valid : std_ulogic;
+        ispmu : std_ulogic;
+    end record;
+    constant spr_id_init : spr_id := (sel => "000", others => '0');
+
+    constant SPRSEL_TB   : spr_selector := 3x"0";
+    constant SPRSEL_TBU  : spr_selector := 3x"1";
+    constant SPRSEL_DEC  : spr_selector := 3x"2";
+    constant SPRSEL_PVR  : spr_selector := 3x"3";
+    constant SPRSEL_LOGA : spr_selector := 3x"4";
+    constant SPRSEL_LOGD : spr_selector := 3x"5";
+    constant SPRSEL_CFAR : spr_selector := 3x"6";
+    constant SPRSEL_XER  : spr_selector := 3x"7";
+
     -- FPSCR bit numbers
     constant FPSCR_FX     : integer := 63 - 32;
     constant FPSCR_FEX    : integer := 63 - 33;
@@ -235,11 +252,13 @@ package common is
 	decode: decode_rom_t;
         br_pred: std_ulogic; -- Branch was predicted to be taken
         big_endian: std_ulogic;
+        spr_info : spr_id;
     end record;
     constant Decode1ToDecode2Init : Decode1ToDecode2Type :=
         (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'),
          ispr1 => (others => '0'), ispr2 => (others => '0'), ispro => (others => '0'),
-         decode => decode_rom_init, br_pred => '0', big_endian => '0');
+         decode => decode_rom_init, br_pred => '0', big_endian => '0',
+         spr_info => spr_id_init);
 
     type Decode1ToFetch1Type is record
         redirect     : std_ulogic;
@@ -299,6 +318,7 @@ package common is
         sub_select : std_ulogic_vector(2 downto 0);     -- sub-result selection
         repeat : std_ulogic;                            -- set if instruction is cracked into two ops
         second : std_ulogic;                            -- set if this is the second op
+        spr_select : spr_id;
     end record;
     constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
 	(valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init,
@@ -311,7 +331,8 @@ package common is
          read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'),
          cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'),
          result_sel => "000", sub_select => "000",
-         repeat => '0', second => '0', others => (others => '0'));
+         repeat => '0', second => '0', spr_select => spr_id_init,
+         others => (others => '0'));
 
     type MultiplyInputType is record
 	valid: std_ulogic;
diff --git a/cr_file.vhdl b/cr_file.vhdl
index d1aedba..940b95b 100644
--- a/cr_file.vhdl
+++ b/cr_file.vhdl
@@ -66,7 +66,11 @@ begin
                 crs <= crs_updated;
             end if;
             if w_in.write_xerc_enable = '1' then
-                report "Writing XERC";
+                report "Writing XERC SO=" & std_ulogic'image(xerc_updated.so) &
+                    " OV=" & std_ulogic'image(xerc_updated.ov) &
+                    " CA=" & std_ulogic'image(xerc_updated.ca) &
+                    " OV32=" & std_ulogic'image(xerc_updated.ov32) &
+                    " CA32=" & std_ulogic'image(xerc_updated.ca32);
                 xerc <= xerc_updated;
             end if;
         end if;
diff --git a/decode1.vhdl b/decode1.vhdl
index baf4347..fb92b9e 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -519,6 +519,40 @@ architecture behaviour of decode1 is
     constant nop_instr      : decode_rom_t := (ALU,  NONE, OP_NOP,          NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE);
     constant fetch_fail_inst: decode_rom_t := (LDST, NONE, OP_FETCH_FAILED, NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE);
 
+    function map_spr(sprn : spr_num_t) return spr_id is
+        variable i : spr_id;
+    begin
+        i.sel := "000";
+        i.valid := '1';
+        i.ispmu := '0';
+        case sprn is
+            when SPR_TB =>
+                i.sel := SPRSEL_TB;
+            when SPR_TBU =>
+                i.sel := SPRSEL_TBU;
+            when SPR_DEC =>
+                i.sel := SPRSEL_DEC;
+            when SPR_PVR =>
+                i.sel := SPRSEL_PVR;
+            when 724 =>     -- LOG_ADDR SPR
+                i.sel := SPRSEL_LOGA;
+            when 725 =>     -- LOG_DATA SPR
+                i.sel := SPRSEL_LOGD;
+            when SPR_UPMC1 | SPR_UPMC2 | SPR_UPMC3 | SPR_UPMC4 | SPR_UPMC5 | SPR_UPMC6 |
+                SPR_UMMCR0 | SPR_UMMCR1 | SPR_UMMCR2 | SPR_UMMCRA | SPR_USIER | SPR_USIAR | SPR_USDAR |
+                SPR_PMC1 | SPR_PMC2 | SPR_PMC3 | SPR_PMC4 | SPR_PMC5 | SPR_PMC6 |
+                SPR_MMCR0 | SPR_MMCR1 | SPR_MMCR2 | SPR_MMCRA | SPR_SIER | SPR_SIAR | SPR_SDAR =>
+                i.ispmu := '1';
+            when SPR_CFAR =>
+                i.sel := SPRSEL_CFAR;
+            when SPR_XER =>
+                i.sel := SPRSEL_XER;
+            when others =>
+                i.valid := '0';
+        end case;
+        return i;
+    end;
+
 begin
     decode1_0: process(clk)
     begin
@@ -586,6 +620,9 @@ begin
         majorop := unsigned(f_in.insn(31 downto 26));
         v.decode := major_decode_rom_array(to_integer(majorop));
 
+        sprn := decode_spr_num(f_in.insn);
+        v.spr_info := map_spr(sprn);
+
         case to_integer(unsigned(majorop)) is
         when 4 =>
             -- major opcode 4, mostly VMX/VSX stuff but also some integer ops (madd*)
@@ -598,7 +635,6 @@ begin
             v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1))));
 
             -- Work out ispr1/ispro independent of v.decode since they seem to be critical path
-            sprn := decode_spr_num(f_in.insn);
             v.ispr1 := fast_spr_num(sprn);
             v.ispro := fast_spr_num(sprn);
 
diff --git a/decode2.vhdl b/decode2.vhdl
index 5aa1a6f..8998f2b 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -228,13 +228,6 @@ architecture behaviour of decode2 is
         OP_SHR      => "010",
         OP_EXTSWSLI => "010",
         OP_MUL_L64  => "011",           -- muldiv_result
-        OP_MUL_H64  => "011",
-        OP_MUL_H32  => "011",
-        OP_DIV      => "011",
-        OP_DIVE     => "011",
-        OP_MOD      => "011",
-        OP_CNTZ     => "100",           -- countbits_result
-        OP_POPCNT   => "100",
         OP_MFSPR    => "101",           -- spr_result
         OP_B        => "110",           -- next_nia
         OP_BC       => "110",
@@ -440,6 +433,8 @@ begin
             decoded_reg_o.reg(0) := not r.repeat;
         end if;
 
+        v.e.spr_select := d_in.spr_info;
+
         r_out.read1_enable <= decoded_reg_a.reg_valid and d_in.valid;
         r_out.read1_reg    <= decoded_reg_a.reg;
         r_out.read2_enable <= decoded_reg_b.reg_valid and d_in.valid;
@@ -496,6 +491,17 @@ begin
                 v.e.result_sel := "000";        -- select adder output
             end if;
         end if;
+        if op = OP_MFSPR then
+            if is_fast_spr(d_in.ispr1) = '1' then
+                v.e.result_sel := "000";        -- adder_result, effectively a_in
+            elsif d_in.spr_info.valid = '0' then
+                -- Privileged mfspr to invalid/unimplemented SPR numbers
+                -- writes the contents of RT back to RT (i.e. it's a no-op)
+                v.e.result_sel := "001";        -- logical_result
+            elsif d_in.spr_info.ispmu = '1' then
+                v.e.result_sel := "100";        -- pmuspr_result
+            end if;
+        end if;
 
         -- See if any of the operands can get their value via the bypass path.
         case gpr_a_bypass is
diff --git a/execute1.vhdl b/execute1.vhdl
index b955b75..21f6f8f 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -62,7 +62,6 @@ end entity execute1;
 architecture behaviour of execute1 is
     type reg_type is record
 	e : Execute1ToWritebackType;
-        cur_instr : Decode2ToExecute1Type;
         busy: std_ulogic;
         terminate: std_ulogic;
         intr_pending : std_ulogic;
@@ -70,6 +69,8 @@ architecture behaviour of execute1 is
         trace_next : std_ulogic;
         prev_op : insn_type_t;
         br_taken : std_ulogic;
+        oe : std_ulogic;
+        mul_select : std_ulogic_vector(1 downto 0);
 	mul_in_progress : std_ulogic;
         mul_finish : std_ulogic;
         div_in_progress : std_ulogic;
@@ -83,15 +84,42 @@ architecture behaviour of execute1 is
     end record;
     constant reg_type_init : reg_type :=
         (e => Execute1ToWritebackInit,
-         cur_instr => Decode2ToExecute1Init,
          busy => '0', terminate => '0', intr_pending => '0',
          fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, br_taken => '0',
+         oe => '0', mul_select => "00",
          mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0',
          no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0',
          taken_branch_event => '0', br_mispredict => '0',
          others => (others => '0'));
 
+    type actions_type is record
+	e : Execute1ToWritebackType;
+        complete : std_ulogic;
+        exception : std_ulogic;
+        trap : std_ulogic;
+        terminate : std_ulogic;
+        write_msr : std_ulogic;
+        new_msr : std_ulogic_vector(63 downto 0);
+        write_xerlow : std_ulogic;
+        write_pmuspr : std_ulogic;
+        write_dec : std_ulogic;
+        write_loga : std_ulogic;
+        inc_loga : std_ulogic;
+        write_cfar : std_ulogic;
+        take_branch : std_ulogic;
+        direct_branch : std_ulogic;
+        start_mul : std_ulogic;
+        start_div : std_ulogic;
+        start_cntz : std_ulogic;
+        do_trace : std_ulogic;
+        fp_intr : std_ulogic;
+        icache_inval : std_ulogic;
+    end record;
+    constant actions_type_init : actions_type :=
+        (e => Execute1ToWritebackInit, new_msr => (others => '0'), others => '0');
+
     signal r, rin : reg_type;
+    signal actions : actions_type;
 
     signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0);
     signal cr_in : std_ulogic_vector(31 downto 0);
@@ -112,9 +140,9 @@ architecture behaviour of execute1 is
     signal adder_result: std_ulogic_vector(63 downto 0);
     signal misc_result: std_ulogic_vector(63 downto 0);
     signal muldiv_result: std_ulogic_vector(63 downto 0);
+    signal shortmul_result: std_ulogic_vector(63 downto 0);
     signal spr_result: std_ulogic_vector(63 downto 0);
     signal next_nia : std_ulogic_vector(63 downto 0);
-    signal current: Decode2ToExecute1Type;
 
     signal carry_32 : std_ulogic;
     signal carry_64 : std_ulogic;
@@ -369,7 +397,7 @@ begin
                        br_taken_complete => r.taken_branch_event,
                        br_mispredict => r.br_mispredict,
                        others => '0');
-    x_to_pmu.nia <= current.nia;
+    x_to_pmu.nia <= e_in.nia;
     x_to_pmu.addr <= (others => '0');
     x_to_pmu.addr_v <= '0';
     x_to_pmu.spr_num <= e_in.insn(20 downto 16);
@@ -381,7 +409,7 @@ begin
     -- (SO, OV[32] and CA[32]) are only modified by instructions that are
     -- handled here, we can just forward the result being sent to
     -- writeback.
-    xerc_in <= r.e.xerc when r.e.write_xerc_enable = '1' or r.busy = '1' else e_in.xerc;
+    xerc_in <= r.e.xerc when (r.e.write_xerc_enable and r.e.valid) = '1' else e_in.xerc;
 
     with e_in.unit select busy_out <=
         l_in.busy or r.busy or fp_in.busy when LDST,
@@ -391,15 +419,24 @@ begin
 
     terminate_out <= r.terminate;
 
-    current <= e_in when r.busy = '0' else r.cur_instr;
+    -- Slow SPR read mux
+    with e_in.spr_select.sel select spr_result <=
+        ctrl.tb when SPRSEL_TB,
+        32x"0" & ctrl.tb(63 downto 32) when SPRSEL_TBU,
+        ctrl.dec when SPRSEL_DEC,
+        32x"0" & PVR_MICROWATT when SPRSEL_PVR,
+        log_wr_addr & r.log_addr_spr when SPRSEL_LOGA,
+        log_rd_data when SPRSEL_LOGD,
+        ctrl.cfar when SPRSEL_CFAR,
+        assemble_xer(xerc_in, ctrl.xer_low) when others;
 
     -- Result mux
-    with current.result_sel select alu_result <=
+    with e_in.result_sel select alu_result <=
         adder_result       when "000",
         logical_result     when "001",
         rotator_result     when "010",
-        muldiv_result      when "011",
-        countbits_result   when "100",
+        shortmul_result    when "011",
+        pmu_to_x.spr_val   when "100",
         spr_result         when "101",
         next_nia           when "110",
         misc_result        when others;
@@ -545,13 +582,10 @@ begin
             x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
         end if;
 
-        case current.sub_select(1 downto 0) is
+        shortmul_result <= std_ulogic_vector(resize(signed(mshort_p), 64));
+        case r.mul_select is
             when "00" =>
-                if HAS_SHORT_MULT and r.mul_in_progress = '0' then
-                    muldiv_result <= std_ulogic_vector(resize(signed(mshort_p), 64));
-                else
-                    muldiv_result <= multiply_to_x.result(63 downto 0);
-                end if;
+                muldiv_result <= multiply_to_x.result(63 downto 0);
             when "01" =>
                 muldiv_result <= multiply_to_x.result(127 downto 64);
             when "10" =>
@@ -562,7 +596,7 @@ begin
         end case;
 
         -- Compute misc_result
-        case current.sub_select is
+        case e_in.sub_select is
             when "000" =>
                 misc_result <= (others => '0');
             when "001" =>
@@ -684,7 +718,7 @@ begin
         bf := insn_bf(e_in.insn);
         crnum := to_integer(unsigned(bf));
         newcrf := (others => '0');
-        case current.sub_select is
+        case e_in.sub_select is
             when "000" =>
                 -- CMP and CMPL instructions
                 if e_in.is_signed = '1' then
@@ -697,7 +731,7 @@ begin
             when "010" =>
                 newcrf := ppc_cmpeqb(a_in, b_in);
             when "011" =>
-                if current.insn(1) = '1' then
+                if e_in.insn(1) = '1' then
                     -- CR logical instructions
                     j := (7 - crnum) * 4;
                     newcrf := cr_in(j + 3 downto j);
@@ -728,7 +762,7 @@ begin
                 newcrf := xerc_in.ov & xerc_in.ov32 & xerc_in.ca & xerc_in.ca32;
             when others =>
         end case;
-        if current.insn_type = OP_MTCRF then
+        if e_in.insn_type = OP_MTCRF then
             if e_in.insn(20) = '0' then
                 -- mtcrf
                 write_cr_mask <= insn_fxm(e_in.insn);
@@ -737,201 +771,86 @@ begin
                 crnum := fxm_to_num(insn_fxm(e_in.insn));
                 write_cr_mask <= num_to_fxm(crnum);
             end if;
-            write_cr_data <= c_in(31 downto 0);
         else
             write_cr_mask <= num_to_fxm(crnum);
-            write_cr_data <= newcrf & newcrf & newcrf & newcrf &
-                             newcrf & newcrf & newcrf & newcrf;
         end if;
+        for i in 0 to 7 loop
+            if write_cr_mask(i) = '0' then
+                write_cr_data(i*4 + 3 downto i*4) <= cr_in(i*4 + 3 downto i*4);
+            elsif e_in.insn_type = OP_MTCRF then
+                write_cr_data(i*4 + 3 downto i*4) <= c_in(i*4 + 3 downto i*4);
+            else
+                write_cr_data(i*4 + 3 downto i*4) <= newcrf;
+            end if;
+        end loop;
 
     end process;
 
-    execute1_1: process(all)
-	variable v : reg_type;
+    execute1_actions: process(all)
+        variable v: actions_type;
 	variable bo, bi : std_ulogic_vector(4 downto 0);
-	variable overflow : std_ulogic;
-        variable lv : Execute1ToLoadstore1Type;
-	variable irq_valid : std_ulogic;
-	variable exception : std_ulogic;
         variable illegal : std_ulogic;
-        variable is_branch : std_ulogic;
-        variable is_direct_branch : std_ulogic;
-        variable taken_branch : std_ulogic;
-        variable abs_branch : std_ulogic;
-        variable spr_val : std_ulogic_vector(63 downto 0);
-        variable do_trace : std_ulogic;
-        variable hold_wr_data : std_ulogic;
-        variable fv : Execute1ToFPUType;
+        variable privileged : std_ulogic;
+        variable slow_op : std_ulogic;
     begin
-        is_branch := '0';
-        is_direct_branch := '0';
-        taken_branch := '0';
-        abs_branch := '0';
-        hold_wr_data := '0';
-
-	v := r;
-	v.e := Execute1ToWritebackInit;
+        v := actions_type_init;
+        v.e.write_data := alu_result;
+        v.e.write_reg := e_in.write_reg;
+        v.e.write_enable := e_in.write_reg_enable;
+        v.e.rc := e_in.rc;
+        v.e.write_cr_data := write_cr_data;
+        v.e.write_cr_mask := write_cr_mask;
+        v.e.write_cr_enable := e_in.output_cr;
+        v.e.write_xerc_enable := e_in.output_xer;
+        v.e.xerc := xerc_in;
+        v.new_msr := ctrl.msr;
+        v.e.write_xerc_enable := e_in.output_xer;
         v.e.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) &
                           not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF);
-        v.e.xerc := xerc_in;
-
-        lv := Execute1ToLoadstore1Init;
-        fv := Execute1ToFPUInit;
-
-        x_to_multiply.valid <= '0';
-        x_to_divider.valid <= '0';
-	v.mul_in_progress := '0';
-        v.div_in_progress := '0';
-        v.cntz_in_progress := '0';
-        v.mul_finish := '0';
-        v.ext_interrupt := '0';
-        v.taken_branch_event := '0';
-        v.br_mispredict := '0';
-
-        x_to_pmu.mfspr <= '0';
-        x_to_pmu.mtspr <= '0';
-        x_to_pmu.tbbits(3) <= ctrl.tb(63 - 47);
-        x_to_pmu.tbbits(2) <= ctrl.tb(63 - 51);
-        x_to_pmu.tbbits(1) <= ctrl.tb(63 - 55);
-        x_to_pmu.tbbits(0) <= ctrl.tb(63 - 63);
-        x_to_pmu.pmm_msr <= ctrl.msr(MSR_PMM);
-        x_to_pmu.pr_msr <= ctrl.msr(MSR_PR);
-
-        spr_result <= (others => '0');
-        spr_val := (others => '0');
-
-	ctrl_tmp <= ctrl;
-	-- FIXME: run at 512MHz not core freq
-	ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
-	ctrl_tmp.dec <= std_ulogic_vector(unsigned(ctrl.dec) - 1);
-
-        irq_valid := ctrl.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in);
-
-	v.terminate := '0';
-	icache_inval <= '0';
-	v.busy := '0';
-
-	-- Next insn adder used in a couple of places
-	next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4);
-
-	-- rotator control signals
-	right_shift <= '1' when e_in.insn_type = OP_SHR else '0';
-	rot_clear_left <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0';
-	rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0';
-        rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0';
-
-        do_popcnt <= '1' when e_in.insn_type = OP_POPCNT else '0';
+        v.e.intr_vec := 16#700#;
+        v.e.mode_32bit := not ctrl.msr(MSR_SF);
+        v.e.instr_tag := e_in.instr_tag;
+        v.e.last_nia := e_in.nia;
+        v.e.br_offset := 64x"4";
+
+        -- Note the difference between v.exception and v.trap:
+        -- v.exception signals a condition that prevents execution of the
+        -- instruction, and hence shouldn't depend on operand data, so as to
+        -- avoid timing chains through both data and control paths.
+        -- v.trap also means we want to generate an interrupt, but doesn't
+        -- cancel instruction execution (hence we need to avoid setting any
+        -- side-effect flags or write enables when generating a trap).
+        -- With v.trap = 1 we will assert both r.e.valid and r.e.interrupt
+        -- to writeback, and it will complete the instruction and take
+        -- an interrupt.  It is OK for v.trap to depend on operand data.
 
         illegal := '0';
-        if r.intr_pending = '1' then
-            v.e.srr1 := r.e.srr1;
-            v.e.intr_vec := r.e.intr_vec;
-        end if;
-        if valid_in = '1' then
-            v.e.last_nia := e_in.nia;
-        else
-            v.e.last_nia := r.e.last_nia;
-        end if;
-
-        v.e.mode_32bit := not ctrl.msr(MSR_SF);
-        v.e.instr_tag := current.instr_tag;
+        privileged := '0';
+        slow_op := '0';
 
-        do_trace := valid_in and ctrl.msr(MSR_SE);
-        if valid_in = '1' then
-            v.cur_instr := e_in;
-            v.prev_op := e_in.insn_type;
+        if ctrl.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then
+            privileged := '1';
         end if;
 
-        -- Determine if there is any interrupt to be taken
-        -- before/instead of executing this instruction
-        exception := r.intr_pending;
-        if valid_in = '1' and e_in.second = '0' and r.intr_pending = '0' then
-            if HAS_FPU and r.fp_exception_next = '1' then
-                -- This is used for FP-type program interrupts that
-                -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero.
-                exception := '1';
-                v.e.intr_vec := 16#700#;
-                v.e.srr1(47 - 43) := '1';
-                v.e.srr1(47 - 47) := '1';
-            elsif r.trace_next = '1' then
-                -- Generate a trace interrupt rather than executing the next instruction
-                -- or taking any asynchronous interrupt
-                exception := '1';
-                v.e.intr_vec := 16#d00#;
-                v.e.srr1(47 - 33) := '1';
-                if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or
-                    r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then
-                    v.e.srr1(47 - 35) := '1';
-                elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then
-                    v.e.srr1(47 - 36) := '1';
-                end if;
-
-            elsif irq_valid = '1' then
-                -- Don't deliver the interrupt until we have a valid instruction
-                -- coming in, so we have a valid NIA to put in SRR0.
-                if pmu_to_x.intr = '1' then
-                    v.e.intr_vec := 16#f00#;
-                    report "IRQ valid: PMU";
-                elsif ctrl.dec(63) = '1' then
-                    v.e.intr_vec := 16#900#;
-                    report "IRQ valid: DEC";
-                elsif ext_irq_in = '1' then
-                    v.e.intr_vec := 16#500#;
-                    report "IRQ valid: External";
-                    v.ext_interrupt := '1';
-                end if;
-                exception := '1';
-
-            elsif ctrl.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then
-                -- generate a program interrupt
-                exception := '1';
-                v.e.intr_vec := 16#700#;
-                -- set bit 45 to indicate privileged instruction type interrupt
-                v.e.srr1(47 - 45) := '1';
-                report "privileged instruction";
-
-            elsif not HAS_FPU and e_in.fac = FPU then
-                -- make lfd/stfd/lfs/stfs etc. illegal in no-FPU implementations
-                illegal := '1';
-
-            elsif HAS_FPU and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then
-                -- generate a floating-point unavailable interrupt
-                exception := '1';
-                v.e.intr_vec := 16#800#;
-                report "FP unavailable interrupt";
-            end if;
+        if (not HAS_FPU and e_in.fac = FPU) or e_in.unit = NONE then
+            -- illegal: FP ops (lfd/stfd/lfs/stfs etc.) without an FPU, or unit = NONE
+            illegal := '1';
         end if;
-        if exception = '1' and l_in.in_progress = '1' then
-            -- We can't send this interrupt to writeback yet because there are
-            -- still instructions in loadstore1 that haven't completed.
-            v.intr_pending := '1';
-            v.busy := '1';
-        end if;
-        if l_in.interrupt = '1' then
-            v.intr_pending := '0';
-        end if;
-
-        v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or r.busy or fp_in.busy);
-        v.instr_dispatch := valid_in and not exception and not illegal;
-
-	if valid_in = '1' and exception = '0' and illegal = '0' and e_in.unit = ALU then
-	    v.e.valid := '1';
-
-	    case_0: case e_in.insn_type is
 
+        v.do_trace := ctrl.msr(MSR_SE);
+        case_0: case e_in.insn_type is
 	    when OP_ILLEGAL =>
-		-- we need two cycles to write srr0 and 1
-		-- will need more when we have to write HEIR
 		illegal := '1';
 	    when OP_SC =>
 		-- check bit 1 of the instruction is 1 so we know this is sc;
                 -- 0 would mean scv, so generate an illegal instruction interrupt
-		-- we need two cycles to write srr0 and 1
                 if e_in.insn(1) = '1' then
-                    exception := '1';
+                    v.trap := '1';
                     v.e.intr_vec := 16#C00#;
                     v.e.last_nia := next_nia;
-                    report "sc";
+                    if e_in.valid = '1' then
+                        report "sc";
+                    end if;
                 else
                     illegal := '1';
                 end if;
@@ -940,12 +859,14 @@ begin
                 -- if not then it is illegal
                 if e_in.insn(10 downto 1) = "0100000000" then
                     v.terminate := '1';
-                    report "ATTN";
+                    if e_in.valid = '1' then
+                        report "ATTN";
+                    end if;
                 else
                     illegal := '1';
                 end if;
 	    when OP_NOP | OP_DCBF | OP_DCBST | OP_DCBT | OP_DCBTST | OP_ICBT =>
-		-- Do nothing
+            -- Do nothing
 	    when OP_ADD =>
                 if e_in.output_carry = '1' then
                     if e_in.input_carry /= OV then
@@ -966,27 +887,34 @@ begin
                 v.e.srr1(47 - 46) := '1';
                 if or (trapval and insn_to(e_in.insn)) = '1' then
                     -- generate trap-type program interrupt
-                    exception := '1';
-                    report "trap";
+                    v.trap := '1';
+                    if e_in.valid = '1' then
+                        report "trap";
+                    end if;
                 end if;
             when OP_ADDG6S =>
             when OP_CMPRB =>
             when OP_CMPEQB =>
             when OP_AND | OP_OR | OP_XOR | OP_PRTY | OP_CMPB | OP_EXTS |
-                    OP_BPERM | OP_BCD =>
+                OP_BPERM | OP_BCD =>
 
 	    when OP_B =>
-                is_branch := '1';
-                taken_branch := '1';
-                is_direct_branch := '1';
-                abs_branch := e_in.br_abs;
+                v.take_branch := '1';
+                v.direct_branch := '1';
+                v.e.br_last := '1';
+                v.e.br_taken := '1';
+                v.e.br_offset := b_in;
+                v.e.abs_br := insn_aa(e_in.insn);
+                if e_in.br_pred = '0' then
+                    -- should never happen
+                    v.e.redirect := '1';
+                end if;
                 if ctrl.msr(MSR_BE) = '1' then
-                    do_trace := '1';
+                    v.do_trace := '1';
                 end if;
-                v.taken_branch_event := '1';
-            when OP_BC | OP_BCREG =>
+                v.write_cfar := '1';
+            when OP_BC =>
                 -- read_data1 is CTR
-		-- for OP_BCREG, read_data2 is target register (CTR, LR or TAR)
                 -- If this instruction updates both CTR and LR, then it is
                 -- doubled; the first instruction decrements CTR and determines
                 -- whether the branch is taken, and the second does the
@@ -994,21 +922,52 @@ begin
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
                 if e_in.second = '0' then
-                    taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
+                    v.take_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
                 else
-                    taken_branch := r.br_taken;
+                    v.take_branch := r.br_taken;
+                end if;
+                if v.take_branch = '1' then
+                    v.e.br_offset := b_in;
+                    v.e.abs_br := insn_aa(e_in.insn);
                 end if;
-                v.br_taken := taken_branch;
-                v.taken_branch_event := taken_branch;
-                abs_branch := e_in.br_abs;
                 if e_in.repeat = '0' or e_in.second = '1' then
-                    is_branch := '1';
-                    if e_in.insn_type = OP_BC then
-                        is_direct_branch := '1';
+                    -- Mispredicted branches cause a redirect
+                    if v.take_branch /= e_in.br_pred then
+                        v.e.redirect := '1';
                     end if;
+                    v.direct_branch := '1';
+                    v.e.br_last := '1';
+                    v.e.br_taken := v.take_branch;
                     if ctrl.msr(MSR_BE) = '1' then
-                        do_trace := '1';
+                        v.do_trace := '1';
                     end if;
+                    v.write_cfar := v.take_branch;
+                end if;
+            when OP_BCREG =>
+                -- read_data1 is CTR, read_data2 is target register (CTR, LR or TAR)
+                -- If this instruction updates both CTR and LR, then it is
+                -- doubled; the first instruction decrements CTR and determines
+                -- whether the branch is taken, and the second does the
+                -- redirect and the LR update.
+		bo := insn_bo(e_in.insn);
+		bi := insn_bi(e_in.insn);
+                if e_in.second = '0' then
+                    v.take_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
+                else
+                    v.take_branch := r.br_taken;
+                end if;
+                if v.take_branch = '1' then
+                    v.e.br_offset := b_in;
+                    v.e.abs_br := '1';
+                end if;
+                if e_in.repeat = '0' or e_in.second = '1' then
+                    -- Indirect branches are never predicted taken
+                    v.e.redirect := v.take_branch;
+                    v.e.br_taken := v.take_branch;
+                    if ctrl.msr(MSR_BE) = '1' then
+                        v.do_trace := '1';
+                    end if;
+                    v.write_cfar := v.take_branch;
                 end if;
 
 	    when OP_RFID =>
@@ -1016,131 +975,115 @@ begin
                                   not a_in(MSR_LE) & not a_in(MSR_SF);
                 -- Can't use msr_copy here because the partial function MSR
                 -- bits should be left unchanged, not zeroed.
-                ctrl_tmp.msr(63 downto 31) <= a_in(63 downto 31);
-                ctrl_tmp.msr(26 downto 22) <= a_in(26 downto 22);
-                ctrl_tmp.msr(15 downto 0)  <= a_in(15 downto 0);
+                v.new_msr(63 downto 31) := a_in(63 downto 31);
+                v.new_msr(26 downto 22) := a_in(26 downto 22);
+                v.new_msr(15 downto 0)  := a_in(15 downto 0);
                 if a_in(MSR_PR) = '1' then
-                    ctrl_tmp.msr(MSR_EE) <= '1';
-                    ctrl_tmp.msr(MSR_IR) <= '1';
-                    ctrl_tmp.msr(MSR_DR) <= '1';
+                    v.new_msr(MSR_EE) := '1';
+                    v.new_msr(MSR_IR) := '1';
+                    v.new_msr(MSR_DR) := '1';
                 end if;
-                -- mark this as a branch so CFAR gets updated
-                is_branch := '1';
-                taken_branch := '1';
-                abs_branch := '1';
+                v.write_msr := '1';
+                v.e.br_offset := b_in;
+                v.e.abs_br := '1';
+                v.e.redirect := '1';
+                v.write_cfar := '1';
                 if HAS_FPU then
-                    v.fp_exception_next := fp_in.exception and
-                                           (a_in(MSR_FE0) or a_in(MSR_FE1));
+                    v.fp_intr := fp_in.exception and
+                                 (a_in(MSR_FE0) or a_in(MSR_FE1));
                 end if;
-                do_trace := '0';
+                v.do_trace := '0';
 
             when OP_CNTZ | OP_POPCNT =>
-                v.e.valid := '0';
-                v.cntz_in_progress := '1';
-                v.busy := '1';
+                slow_op := '1';
+                v.start_cntz := '1';
 	    when OP_ISEL =>
             when OP_CROP =>
             when OP_MCRXRX =>
             when OP_DARN =>
 	    when OP_MFMSR =>
 	    when OP_MFSPR =>
-		report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
-		    "=" & to_hstring(a_in);
 		if is_fast_spr(e_in.read_reg1) = '1' then
-		    spr_val := a_in;
-		else
-                    spr_val := c_in;
-                    case decode_spr_num(e_in.insn) is
-                    when SPR_XER =>
-                        spr_val := assemble_xer(xerc_in, ctrl.xer_low);
-		    when SPR_TB =>
-			spr_val := ctrl.tb;
-		    when SPR_TBU =>
-                        spr_val(63 downto 32) := (others => '0');
-			spr_val(31 downto 0)  := ctrl.tb(63 downto 32);
-		    when SPR_DEC =>
-			spr_val := ctrl.dec;
-                    when SPR_CFAR =>
-                        spr_val := ctrl.cfar;
-                    when SPR_PVR =>
-                        spr_val(63 downto 32) := (others => '0');
-                        spr_val(31 downto 0) := PVR_MICROWATT;
-                    when 724 =>     -- LOG_ADDR SPR
-                        spr_val := log_wr_addr & r.log_addr_spr;
-                    when 725 =>     -- LOG_DATA SPR
-                        spr_val := log_rd_data;
-                        v.log_addr_spr := std_ulogic_vector(unsigned(r.log_addr_spr) + 1);
-                    when SPR_UPMC1 | SPR_UPMC2 | SPR_UPMC3 | SPR_UPMC4 | SPR_UPMC5 | SPR_UPMC6 |
-                        SPR_UMMCR0 | SPR_UMMCR1 | SPR_UMMCR2 | SPR_UMMCRA | SPR_USIER | SPR_USIAR | SPR_USDAR |
-                        SPR_PMC1 | SPR_PMC2 | SPR_PMC3 | SPR_PMC4 | SPR_PMC5 | SPR_PMC6 |
-                        SPR_MMCR0 | SPR_MMCR1 | SPR_MMCR2 | SPR_MMCRA | SPR_SIER | SPR_SIAR | SPR_SDAR =>
-                        x_to_pmu.mfspr <= '1';
-                        spr_val := pmu_to_x.spr_val;
-                    when others =>
-                        -- mfspr from unimplemented SPRs should be a nop in
-                        -- supervisor mode and a program interrupt for user mode
-                        if is_fast_spr(e_in.read_reg1) = '0' and ctrl.msr(MSR_PR) = '1' then
-                            illegal := '1';
-                        end if;
+                    if e_in.valid = '1' then
+                        report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
+                            "=" & to_hstring(a_in);
+                    end if;
+		elsif e_in.spr_select.valid = '1' then
+                    if e_in.valid = '1' then
+                        report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
+                            "=" & to_hstring(spr_result);
+                    end if;
+                    case e_in.spr_select.sel is
+                       when SPRSEL_LOGD =>
+                           v.inc_loga := '1';
+                           when others =>
                     end case;
+                else
+                    -- mfspr from unimplemented SPRs should be a nop in
+                    -- supervisor mode and a program interrupt for user mode
+                    if e_in.valid = '1' then
+                        report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
+                            " invalid";
+                    end if;
+                    if ctrl.msr(MSR_PR) = '1' then
+                        illegal := '1';
+                    end if;
                 end if;
-                spr_result <= spr_val;
 
 	    when OP_MFCR =>
 	    when OP_MTCRF =>
             when OP_MTMSRD =>
+                v.write_msr := '1';
                 if e_in.insn(16) = '1' then
                     -- just update EE and RI
-                    ctrl_tmp.msr(MSR_EE) <= c_in(MSR_EE);
-                    ctrl_tmp.msr(MSR_RI) <= c_in(MSR_RI);
+                    v.new_msr(MSR_EE) := c_in(MSR_EE);
+                    v.new_msr(MSR_RI) := c_in(MSR_RI);
                 else
                     -- Architecture says to leave out bits 3 (HV), 51 (ME)
                     -- and 63 (LE) (IBM bit numbering)
                     if e_in.is_32bit = '0' then
-                        ctrl_tmp.msr(63 downto 61) <= c_in(63 downto 61);
-                        ctrl_tmp.msr(59 downto 32) <= c_in(59 downto 32);
+                        v.new_msr(63 downto 61) := c_in(63 downto 61);
+                        v.new_msr(59 downto 32) := c_in(59 downto 32);
                     end if;
-                    ctrl_tmp.msr(31 downto 13) <= c_in(31 downto 13);
-                    ctrl_tmp.msr(11 downto 1)  <= c_in(11 downto 1);
+                    v.new_msr(31 downto 13) := c_in(31 downto 13);
+                    v.new_msr(11 downto 1)  := c_in(11 downto 1);
                     if c_in(MSR_PR) = '1' then
-                        ctrl_tmp.msr(MSR_EE) <= '1';
-                        ctrl_tmp.msr(MSR_IR) <= '1';
-                        ctrl_tmp.msr(MSR_DR) <= '1';
+                        v.new_msr(MSR_EE) := '1';
+                        v.new_msr(MSR_IR) := '1';
+                        v.new_msr(MSR_DR) := '1';
                     end if;
                     if HAS_FPU then
-                        v.fp_exception_next := fp_in.exception and
-                                               (c_in(MSR_FE0) or c_in(MSR_FE1));
+                        v.fp_intr := fp_in.exception and
+                                     (c_in(MSR_FE0) or c_in(MSR_FE1));
                     end if;
                 end if;
 	    when OP_MTSPR =>
-		report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
-		    "=" & to_hstring(c_in);
-		if is_fast_spr(e_in.write_reg) = '0' then
-		    -- slow spr
-		    case decode_spr_num(e_in.insn) is
-                    when SPR_XER =>
-			v.e.xerc.so := c_in(63-32);
-			v.e.xerc.ov := c_in(63-33);
-			v.e.xerc.ca := c_in(63-34);
-			v.e.xerc.ov32 := c_in(63-44);
-			v.e.xerc.ca32 := c_in(63-45);
-                        ctrl_tmp.xer_low <= c_in(17 downto 0);
-		    when SPR_DEC =>
-			ctrl_tmp.dec <= c_in;
-                    when 724 =>     -- LOG_ADDR SPR
-                        v.log_addr_spr := c_in(31 downto 0);
-                    when SPR_UPMC1 | SPR_UPMC2 | SPR_UPMC3 | SPR_UPMC4 | SPR_UPMC5 | SPR_UPMC6 |
-                        SPR_UMMCR0 | SPR_UMMCR2 | SPR_UMMCRA |
-                        SPR_PMC1 | SPR_PMC2 | SPR_PMC3 | SPR_PMC4 | SPR_PMC5 | SPR_PMC6 |
-                        SPR_MMCR0 | SPR_MMCR1 | SPR_MMCR2 | SPR_MMCRA | SPR_SIER | SPR_SIAR | SPR_SDAR =>
-                        x_to_pmu.mtspr <= '1';
-		    when others =>
-                        -- mtspr to unimplemented SPRs should be a nop in
-                        -- supervisor mode and a program interrupt for user mode
-                        if ctrl.msr(MSR_PR) = '1' then
-                            illegal := '1';
-                        end if;
-		    end case;
+                if e_in.valid = '1' then
+                    report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
+                        "=" & to_hstring(c_in);
+                end if;
+                v.write_pmuspr := e_in.spr_select.ispmu;
+                if e_in.spr_select.valid = '1' and e_in.spr_select.ispmu = '0' then
+                    case e_in.spr_select.sel is
+                        when SPRSEL_XER =>
+                            v.e.xerc.so := c_in(63-32);
+                            v.e.xerc.ov := c_in(63-33);
+                            v.e.xerc.ca := c_in(63-34);
+                            v.e.xerc.ov32 := c_in(63-44);
+                            v.e.xerc.ca32 := c_in(63-45);
+                            v.write_xerlow := '1';
+                        when SPRSEL_DEC =>
+                            v.write_dec := '1';
+                        when SPRSEL_LOGA =>
+                            v.write_loga := '1';
+                        when others =>
+                    end case;
+		elsif is_fast_spr(e_in.write_reg) = '0' then
+                    -- mtspr to unimplemented SPRs should be a nop in
+                    -- supervisor mode and a program interrupt for user mode
+                    if ctrl.msr(MSR_PR) = '1' then
+                        illegal := '1';
+                    end if;
 		end if;
 	    when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR | OP_EXTSWSLI =>
 		if e_in.output_carry = '1' then
@@ -1150,13 +1093,12 @@ begin
 
 	    when OP_ISYNC =>
 		v.e.redirect := '1';
-                v.e.br_offset := std_ulogic_vector(to_unsigned(4, 64));
 
 	    when OP_ICBI =>
-		icache_inval <= '1';
+		v.icache_inval := '1';
 
-	    when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 =>
-                if HAS_SHORT_MULT and e_in.insn_type = OP_MUL_L64 and e_in.insn(26) = '1' and
+	    when OP_MUL_L64 =>
+                if HAS_SHORT_MULT and e_in.insn(26) = '1' and
                     fits_in_n_bits(a_in, 16) and fits_in_n_bits(b_in, 16) then
                     -- Operands fit into 16 bits, so use short multiplier
                     if e_in.oe = '1' then
@@ -1165,54 +1107,230 @@ begin
                     end if;
                 else
                     -- Use standard multiplier
-                    v.e.valid := '0';
-                    v.mul_in_progress := '1';
-                    v.busy := '1';
-                    x_to_multiply.valid <= '1';
+                    v.start_mul := '1';
+                    slow_op := '1';
                 end if;
 
+	    when OP_MUL_H64 | OP_MUL_H32 =>
+                v.start_mul := '1';
+                slow_op := '1';
+
 	    when OP_DIV | OP_DIVE | OP_MOD =>
-		v.e.valid := '0';
-		v.div_in_progress := '1';
-		v.busy := '1';
-		x_to_divider.valid <= '1';
+                v.start_div := '1';
+                slow_op := '1';
+
+            when OP_FETCH_FAILED =>
+                -- Handling an ITLB miss doesn't count as having executed an instruction
+                v.do_trace := '0';
 
             when others =>
-		v.terminate := '1';
-		report "illegal";
-	    end case;
-
-            -- Mispredicted branches cause a redirect
-            if is_branch = '1' then
-                if taken_branch = '1' then
-                    ctrl_tmp.cfar <= e_in.nia;
+                if e_in.valid = '1' and e_in.unit = ALU then
+                    report "unhandled insn_type " & insn_type_t'image(e_in.insn_type);
                 end if;
-                if taken_branch = '1' then
-                    v.e.br_offset := b_in;
-                    v.e.abs_br := abs_branch;
-                else
-                    v.e.br_offset := std_ulogic_vector(to_unsigned(4, 64));
+        end case;
+
+        if privileged = '1' then
+            -- generate a program interrupt
+            v.exception := '1';
+            -- set bit 45 to indicate privileged instruction type interrupt
+            v.e.srr1(47 - 45) := '1';
+            if e_in.valid = '1' then
+                report "privileged instruction";
+            end if;
+
+        elsif illegal = '1' then
+            v.exception := '1';
+            -- Since we aren't doing Hypervisor emulation assist (0xe40) we
+            -- set bit 44 to indicate we have an illegal
+            v.e.srr1(47 - 44) := '1';
+            if e_in.valid = '1' then
+                report "illegal instruction";
+            end if;
+
+        elsif HAS_FPU and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then
+            -- generate a floating-point unavailable interrupt
+            v.exception := '1';
+            v.e.intr_vec := 16#800#;
+            if e_in.valid = '1' then
+                report "FP unavailable interrupt";
+            end if;
+        end if;
+
+        if e_in.unit = ALU then
+            v.complete := e_in.valid and not v.exception and not slow_op;
+        end if;
+
+        actions <= v;
+    end process;
+
+    execute1_1: process(all)
+	variable v : reg_type;
+	variable overflow : std_ulogic;
+        variable lv : Execute1ToLoadstore1Type;
+	variable irq_valid : std_ulogic;
+	variable exception : std_ulogic;
+        variable fv : Execute1ToFPUType;
+        variable go : std_ulogic;
+    begin
+	v := r;
+        if r.busy = '0' then
+            v.e := actions.e;
+            v.oe := e_in.oe;
+            v.mul_select := e_in.sub_select(1 downto 0);
+        end if;
+
+        lv := Execute1ToLoadstore1Init;
+        fv := Execute1ToFPUInit;
+
+        x_to_multiply.valid <= '0';
+        x_to_divider.valid <= '0';
+	v.mul_in_progress := '0';
+        v.div_in_progress := '0';
+        v.cntz_in_progress := '0';
+        v.mul_finish := '0';
+        v.ext_interrupt := '0';
+        v.taken_branch_event := '0';
+        v.br_mispredict := '0';
+
+        x_to_pmu.mfspr <= '0';
+        x_to_pmu.mtspr <= '0';
+        x_to_pmu.tbbits(3) <= ctrl.tb(63 - 47);
+        x_to_pmu.tbbits(2) <= ctrl.tb(63 - 51);
+        x_to_pmu.tbbits(1) <= ctrl.tb(63 - 55);
+        x_to_pmu.tbbits(0) <= ctrl.tb(63 - 63);
+        x_to_pmu.pmm_msr <= ctrl.msr(MSR_PMM);
+        x_to_pmu.pr_msr <= ctrl.msr(MSR_PR);
+
+	ctrl_tmp <= ctrl;
+	-- FIXME: run at 512MHz not core freq
+	ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
+	ctrl_tmp.dec <= std_ulogic_vector(unsigned(ctrl.dec) - 1);
+
+        irq_valid := ctrl.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in);
+
+	v.terminate := '0';
+	icache_inval <= '0';
+	v.busy := '0';
+
+	-- Next insn adder used in a couple of places
+	next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4);
+
+	-- rotator control signals
+	right_shift <= '1' when e_in.insn_type = OP_SHR else '0';
+	rot_clear_left <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0';
+	rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0';
+        rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0';
+
+        do_popcnt <= '1' when e_in.insn_type = OP_POPCNT else '0';
+
+        if r.intr_pending = '1' then
+            v.e.srr1 := r.e.srr1;
+            v.e.intr_vec := r.e.intr_vec;
+        end if;
+
+        if valid_in = '1' then
+            v.prev_op := e_in.insn_type;
+        end if;
+
+        -- Determine if there is any interrupt to be taken
+        -- before/instead of executing this instruction
+        exception := r.intr_pending or (valid_in and actions.exception);
+        if valid_in = '1' and e_in.second = '0' and r.intr_pending = '0' then
+            if HAS_FPU and r.fp_exception_next = '1' then
+                -- This is used for FP-type program interrupts that
+                -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero.
+                exception := '1';
+                v.e.intr_vec := 16#700#;
+                v.e.srr1 := (others => '0');
+                v.e.srr1(47 - 43) := '1';
+                v.e.srr1(47 - 47) := '1';
+            elsif r.trace_next = '1' then
+                -- Generate a trace interrupt rather than executing the next instruction
+                -- or taking any asynchronous interrupt
+                exception := '1';
+                v.e.intr_vec := 16#d00#;
+                v.e.srr1 := (others => '0');
+                v.e.srr1(47 - 33) := '1';
+                if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or
+                    r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then
+                    v.e.srr1(47 - 35) := '1';
+                elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then
+                    v.e.srr1(47 - 36) := '1';
                 end if;
-                if taken_branch /= e_in.br_pred then
-                    v.e.redirect := '1';
-                    v.br_mispredict := is_direct_branch;
+
+            elsif irq_valid = '1' then
+                -- Don't deliver the interrupt until we have a valid instruction
+                -- coming in, so we have a valid NIA to put in SRR0.
+                if pmu_to_x.intr = '1' then
+                    v.e.intr_vec := 16#f00#;
+                    report "IRQ valid: PMU";
+                elsif ctrl.dec(63) = '1' then
+                    v.e.intr_vec := 16#900#;
+                    report "IRQ valid: DEC";
+                elsif ext_irq_in = '1' then
+                    v.e.intr_vec := 16#500#;
+                    report "IRQ valid: External";
+                    v.ext_interrupt := '1';
                 end if;
-                v.e.br_last := is_direct_branch;
-                v.e.br_taken := taken_branch;
+                v.e.srr1 := (others => '0');
+                exception := '1';
+
             end if;
+        end if;
+        if exception = '1' and l_in.in_progress = '1' then
+            -- We can't send this interrupt to writeback yet because there are
+            -- still instructions in loadstore1 that haven't completed.
+            v.intr_pending := '1';
+            v.busy := '1';
+        end if;
+
+        v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or r.busy or fp_in.busy);
+
+        go := valid_in and not exception;
+        v.instr_dispatch := go;
+
+	if go = '1' then
+            v.e.valid := actions.complete;
+            v.taken_branch_event := actions.take_branch;
+            v.br_taken := actions.take_branch;
+            v.trace_next := actions.do_trace;
+            v.fp_exception_next := actions.fp_intr;
+            v.cntz_in_progress := actions.start_cntz;
+
+            if actions.write_msr = '1' then
+                ctrl_tmp.msr <= actions.new_msr;
+            end if;
+            if actions.write_xerlow = '1' then
+                ctrl_tmp.xer_low <= c_in(17 downto 0);
+            end if;
+            if actions.write_dec = '1' then
+                ctrl_tmp.dec <= c_in;
+            end if;
+            if actions.write_cfar = '1' then
+                ctrl_tmp.cfar <= e_in.nia;
+            end if;
+            if actions.write_loga = '1' then
+                v.log_addr_spr := c_in(31 downto 0);
+            elsif actions.inc_loga = '1' then
+                v.log_addr_spr := std_ulogic_vector(unsigned(r.log_addr_spr) + 1);
+            end if;
+            x_to_pmu.mtspr <= actions.write_pmuspr;
+            icache_inval <= actions.icache_inval;
+            x_to_multiply.valid <= actions.start_mul;
+            v.mul_in_progress := actions.start_mul;
+            x_to_divider.valid <= actions.start_div;
+            v.div_in_progress := actions.start_div;
+            v.terminate := actions.terminate;
+            v.br_mispredict := v.e.redirect and actions.direct_branch;
+            v.busy := actions.start_cntz or actions.start_mul or actions.start_div;
+            exception := actions.trap;
 
-        elsif valid_in = '1' and exception = '0' and illegal = '0' then
             -- instruction for other units, i.e. LDST
             if e_in.unit = LDST then
                 lv.valid := '1';
-            elsif e_in.unit = NONE then
-                illegal := '1';
-            elsif HAS_FPU and e_in.unit = FPU then
-                fv.valid := '1';
             end if;
-            -- Handling an ITLB miss doesn't count as having executed an instruction
-            if e_in.insn_type = OP_FETCH_FAILED then
-                do_trace := '0';
+            if HAS_FPU and e_in.unit = FPU then
+                fv.valid := '1';
             end if;
         end if;
 
@@ -1222,38 +1340,44 @@ begin
         if r.cntz_in_progress = '1' then
             -- cnt[lt]z and popcnt* always take two cycles
             v.e.valid := '1';
-	elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then
-	    if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or
-	       (r.div_in_progress = '1' and divider_to_x.valid = '1') then
-		if r.mul_in_progress = '1' then
-                    overflow := '0';
-		else
-		    overflow := divider_to_x.overflow;
-		end if;
-                if r.mul_in_progress = '1' and current.oe = '1' then
+            v.e.write_data := countbits_result;
+        end if;
+	if r.div_in_progress = '1' then
+	    if divider_to_x.valid = '1' then
+                v.e.write_data := muldiv_result;
+                overflow := divider_to_x.overflow;
+                -- We must test oe because the RC update code in writeback
+                -- will use the xerc value to set CR0:SO so we must not clobber
+                -- xerc if OE wasn't set.
+                if r.oe = '1' then
+                    v.e.xerc.ov := overflow;
+                    v.e.xerc.ov32 := overflow;
+                    if overflow = '1' then
+                        v.e.xerc.so := '1';
+                    end if;
+                end if;
+                v.e.valid := '1';
+	    else
+		v.busy := '1';
+		v.div_in_progress := '1';
+	    end if;
+        end if;
+	if r.mul_in_progress = '1' then
+	    if multiply_to_x.valid = '1' then
+                v.e.write_data := muldiv_result;
+                if r.oe = '1' then
                     -- have to wait until next cycle for overflow indication
                     v.mul_finish := '1';
                     v.busy := '1';
                 else
-                    -- We must test oe because the RC update code in writeback
-                    -- will use the xerc value to set CR0:SO so we must not clobber
-                    -- xerc if OE wasn't set.
-                    if current.oe = '1' then
-                        v.e.xerc.ov := overflow;
-                        v.e.xerc.ov32 := overflow;
-                        if overflow = '1' then
-                            v.e.xerc.so := '1';
-                        end if;
-                    end if;
                     v.e.valid := '1';
                 end if;
 	    else
 		v.busy := '1';
-		v.mul_in_progress := r.mul_in_progress;
-		v.div_in_progress := r.div_in_progress;
+		v.mul_in_progress := '1';
 	    end if;
-        elsif r.mul_finish = '1' then
-            hold_wr_data := '1';
+        end if;
+        if r.mul_finish = '1' then
             v.e.xerc.ov := multiply_to_x.overflow;
             v.e.xerc.ov32 := multiply_to_x.overflow;
             if multiply_to_x.overflow = '1' then
@@ -1262,24 +1386,11 @@ begin
             v.e.valid := '1';
 	end if;
 
-        if illegal = '1' then
-            exception := '1';
-            v.e.intr_vec := 16#700#;
-            -- Since we aren't doing Hypervisor emulation assist (0xe40) we
-            -- set bit 44 to indicate we have an illegal
-            v.e.srr1(47 - 44) := '1';
-            report "illegal";
-        end if;
-
         v.e.interrupt := exception and not (l_in.in_progress or l_in.interrupt);
         if v.e.interrupt = '1' then
             v.intr_pending := '0';
         end if;
 
-        if do_trace = '1' then
-            v.trace_next := '1';
-        end if;
-
  	if interrupt_in = '1' then
             ctrl_tmp.msr(MSR_SF) <= '1';
             ctrl_tmp.msr(MSR_EE) <= '0';
@@ -1298,32 +1409,13 @@ begin
             v.intr_pending := '0';
         end if;
 
-        if hold_wr_data = '0' then
-            v.e.write_data := alu_result;
-        else
-            v.e.write_data := r.e.write_data;
-        end if;
-        v.e.write_reg := current.write_reg;
-	v.e.write_enable := current.write_reg_enable and v.e.valid and not exception;
-        v.e.rc := current.rc and v.e.valid and not exception;
-        v.e.write_cr_data := write_cr_data;
-        v.e.write_cr_mask := write_cr_mask;
-        v.e.write_cr_enable := current.output_cr and v.e.valid and not exception;
-        v.e.write_xerc_enable := current.output_xer and v.e.valid and not exception;
-
-        bypass_data.tag.valid <= current.instr_tag.valid and current.write_reg_enable and v.e.valid;
-        bypass_data.tag.tag <= current.instr_tag.tag;
+        bypass_data.tag.valid <= v.e.write_enable and v.e.valid;
+        bypass_data.tag.tag <= v.e.instr_tag.tag;
         bypass_data.data <= v.e.write_data;
 
-        bypass_cr_data.tag.valid <= current.instr_tag.valid and current.output_cr and v.e.valid;
-        bypass_cr_data.tag.tag <= current.instr_tag.tag;
-        for i in 0 to 7 loop
-            if v.e.write_cr_mask(i) = '1' then
-                bypass_cr_data.data(i*4 + 3 downto i*4) <= v.e.write_cr_data(i*4 + 3 downto i*4);
-            else
-                bypass_cr_data.data(i*4 + 3 downto i*4) <= cr_in(i*4 + 3 downto i*4);
-            end if;
-        end loop;
+        bypass_cr_data.tag.valid <= v.e.write_cr_enable and v.e.valid;
+        bypass_cr_data.tag.tag <= v.e.instr_tag.tag;
+        bypass_cr_data.data <= v.e.write_cr_data;
 
         -- Outputs to loadstore1 (async)
         lv.op := e_in.insn_type;
@@ -1373,6 +1465,13 @@ begin
 	-- update outputs
         l_out <= lv;
 	e_out <= r.e;
+        if r.e.valid = '0' then
+            e_out.write_enable <= '0';
+            e_out.write_cr_enable <= '0';
+            e_out.write_xerc_enable <= '0';
+            e_out.redirect <= '0';
+            e_out.br_last <= '0';
+        end if;
         e_out.msr <= msr_copy(ctrl.msr);
         fp_out <= fv;
 
@@ -1394,7 +1493,7 @@ begin
                             "000" &
                             r.e.write_enable &
                             r.e.valid &
-                            (r.e.redirect or r.e.interrupt) &
+                            ((r.e.redirect and r.e.valid) or r.e.interrupt) &
                             r.busy &
                             flush_in;
             end if;

From 521a5403a9b04c49a4f724f67e67b93ae7f6fb44 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 18 Jun 2022 17:29:43 +1000
Subject: [PATCH 04/30] execute1: Rename 'r' to 'ex1'

Maybe this will give us slightly better names in critical path reports
and the like.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 execute1.vhdl | 97 ++++++++++++++++++++++++++-------------------------
 1 file changed, 49 insertions(+), 48 deletions(-)

diff --git a/execute1.vhdl b/execute1.vhdl
index 21f6f8f..7bd0913 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -118,7 +118,7 @@ architecture behaviour of execute1 is
     constant actions_type_init : actions_type :=
         (e => Execute1ToWritebackInit, new_msr => (others => '0'), others => '0');
 
-    signal r, rin : reg_type;
+    signal ex1, ex1in : reg_type;
     signal actions : actions_type;
 
     signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0);
@@ -372,7 +372,7 @@ begin
     end generate;
 
     dbg_ctrl_out <= ctrl;
-    log_rd_addr <= r.log_addr_spr;
+    log_rd_addr <= ex1.log_addr_spr;
 
     a_in <= e_in.read_data1;
     b_in <= e_in.read_data2;
@@ -391,11 +391,11 @@ begin
                        dtlb_miss_resolved => dc_events.dtlb_miss_resolved,
                        icache_miss => ic_events.icache_miss,
                        itlb_miss_resolved => ic_events.itlb_miss_resolved,
-                       no_instr_avail => r.no_instr_avail,
-                       dispatch => r.instr_dispatch,
-                       ext_interrupt => r.ext_interrupt,
-                       br_taken_complete => r.taken_branch_event,
-                       br_mispredict => r.br_mispredict,
+                       no_instr_avail => ex1.no_instr_avail,
+                       dispatch => ex1.instr_dispatch,
+                       ext_interrupt => ex1.ext_interrupt,
+                       br_taken_complete => ex1.taken_branch_event,
+                       br_mispredict => ex1.br_mispredict,
                        others => '0');
     x_to_pmu.nia <= e_in.nia;
     x_to_pmu.addr <= (others => '0');
@@ -409,15 +409,15 @@ begin
     -- (SO, OV[32] and CA[32]) are only modified by instructions that are
     -- handled here, we can just forward the result being sent to
     -- writeback.
-    xerc_in <= r.e.xerc when (r.e.write_xerc_enable and r.e.valid) = '1' else e_in.xerc;
+    xerc_in <= ex1.e.xerc when (ex1.e.write_xerc_enable and ex1.e.valid) = '1' else e_in.xerc;
 
     with e_in.unit select busy_out <=
-        l_in.busy or r.busy or fp_in.busy when LDST,
-        l_in.busy or l_in.in_progress or r.busy or fp_in.busy when others;
+        l_in.busy or ex1.busy or fp_in.busy when LDST,
+        l_in.busy or l_in.in_progress or ex1.busy or fp_in.busy when others;
 
     valid_in <= e_in.valid and not busy_out and not flush_in;
 
-    terminate_out <= r.terminate;
+    terminate_out <= ex1.terminate;
 
     -- Slow SPR read mux
     with e_in.spr_select.sel select spr_result <=
@@ -425,7 +425,7 @@ begin
         32x"0" & ctrl.tb(63 downto 32) when SPRSEL_TBU,
         ctrl.dec when SPRSEL_DEC,
         32x"0" & PVR_MICROWATT when SPRSEL_PVR,
-        log_wr_addr & r.log_addr_spr when SPRSEL_LOGA,
+        log_wr_addr & ex1.log_addr_spr when SPRSEL_LOGA,
         log_rd_data when SPRSEL_LOGD,
         ctrl.cfar when SPRSEL_CFAR,
         assemble_xer(xerc_in, ctrl.xer_low) when others;
@@ -445,16 +445,16 @@ begin
     begin
 	if rising_edge(clk) then
             if rst = '1' then
-                r <= reg_type_init;
+                ex1 <= reg_type_init;
                 ctrl <= ctrl_t_init;
                 ctrl.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0');
             else
-                r <= rin;
+                ex1 <= ex1in;
                 ctrl <= ctrl_tmp;
                 if valid_in = '1' then
                     report "execute " & to_hstring(e_in.nia) & " op=" & insn_type_t'image(e_in.insn_type) &
-                        " wr=" & to_hstring(rin.e.write_reg) & " we=" & std_ulogic'image(rin.e.write_enable) &
-                        " tag=" & integer'image(rin.e.instr_tag.tag) & std_ulogic'image(rin.e.instr_tag.valid);
+                        " wr=" & to_hstring(ex1in.e.write_reg) & " we=" & std_ulogic'image(ex1in.e.write_enable) &
+                        " tag=" & integer'image(ex1in.e.instr_tag.tag) & std_ulogic'image(ex1in.e.instr_tag.valid);
                 end if;
             end if;
 	end if;
@@ -583,7 +583,7 @@ begin
         end if;
 
         shortmul_result <= std_ulogic_vector(resize(signed(mshort_p), 64));
-        case r.mul_select is
+        case ex1.mul_select is
             when "00" =>
                 muldiv_result <= multiply_to_x.result(63 downto 0);
             when "01" =>
@@ -820,7 +820,7 @@ begin
         -- v.trap also means we want to generate an interrupt, but doesn't
         -- cancel instruction execution (hence we need to avoid setting any
         -- side-effect flags or write enables when generating a trap).
-        -- With v.trap = 1 we will assert both r.e.valid and r.e.interrupt
+        -- With v.trap = 1 we will assert both ex1.e.valid and ex1.e.interrupt
         -- to writeback, and it will complete the instruction and take
         -- and interrupt.  It is OK for v.trap to depend on operand data.
 
@@ -924,7 +924,7 @@ begin
                 if e_in.second = '0' then
                     v.take_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
                 else
-                    v.take_branch := r.br_taken;
+                    v.take_branch := ex1.br_taken;
                 end if;
                 if v.take_branch = '1' then
                     v.e.br_offset := b_in;
@@ -954,7 +954,7 @@ begin
                 if e_in.second = '0' then
                     v.take_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
                 else
-                    v.take_branch := r.br_taken;
+                    v.take_branch := ex1.br_taken;
                 end if;
                 if v.take_branch = '1' then
                     v.e.br_offset := b_in;
@@ -1172,8 +1172,8 @@ begin
         variable fv : Execute1ToFPUType;
         variable go : std_ulogic;
     begin
-	v := r;
-        if r.busy = '0' then
+	v := ex1;
+        if ex1.busy = '0' then
             v.e := actions.e;
             v.oe := e_in.oe;
             v.mul_select := e_in.sub_select(1 downto 0);
@@ -1223,9 +1223,9 @@ begin
 
         do_popcnt <= '1' when e_in.insn_type = OP_POPCNT else '0';
 
-        if r.intr_pending = '1' then
-            v.e.srr1 := r.e.srr1;
-            v.e.intr_vec := r.e.intr_vec;
+        if ex1.intr_pending = '1' then
+            v.e.srr1 := ex1.e.srr1;
+            v.e.intr_vec := ex1.e.intr_vec;
         end if;
 
         if valid_in = '1' then
@@ -1234,9 +1234,9 @@ begin
 
         -- Determine if there is any interrupt to be taken
         -- before/instead of executing this instruction
-        exception := r.intr_pending or (valid_in and actions.exception);
-        if valid_in = '1' and e_in.second = '0' and r.intr_pending = '0' then
-            if HAS_FPU and r.fp_exception_next = '1' then
+        exception := ex1.intr_pending or (valid_in and actions.exception);
+        if valid_in = '1' and e_in.second = '0' and ex1.intr_pending = '0' then
+            if HAS_FPU and ex1.fp_exception_next = '1' then
                 -- This is used for FP-type program interrupts that
                 -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero.
                 exception := '1';
@@ -1244,17 +1244,18 @@ begin
                 v.e.srr1 := (others => '0');
                 v.e.srr1(47 - 43) := '1';
                 v.e.srr1(47 - 47) := '1';
-            elsif r.trace_next = '1' then
+            elsif ex1.trace_next = '1' then
                 -- Generate a trace interrupt rather than executing the next instruction
                 -- or taking any asynchronous interrupt
                 exception := '1';
                 v.e.intr_vec := 16#d00#;
                 v.e.srr1 := (others => '0');
                 v.e.srr1(47 - 33) := '1';
-                if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or
-                    r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then
+                if ex1.prev_op = OP_LOAD or ex1.prev_op = OP_ICBI or ex1.prev_op = OP_ICBT or
+                    ex1.prev_op = OP_DCBT or ex1.prev_op = OP_DCBST or ex1.prev_op = OP_DCBF then
                     v.e.srr1(47 - 35) := '1';
-                elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then
+                elsif ex1.prev_op = OP_STORE or ex1.prev_op = OP_DCBZ or
+                    ex1.prev_op = OP_DCBTST then
                     v.e.srr1(47 - 36) := '1';
                 end if;
 
@@ -1284,7 +1285,7 @@ begin
             v.busy := '1';
         end if;
 
-        v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or r.busy or fp_in.busy);
+        v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or ex1.busy or fp_in.busy);
 
         go := valid_in and not exception;
         v.instr_dispatch := go;
@@ -1312,7 +1313,7 @@ begin
             if actions.write_loga = '1' then
                 v.log_addr_spr := c_in(31 downto 0);
             elsif actions.inc_loga = '1' then
-                v.log_addr_spr := std_ulogic_vector(unsigned(r.log_addr_spr) + 1);
+                v.log_addr_spr := std_ulogic_vector(unsigned(ex1.log_addr_spr) + 1);
             end if;
             x_to_pmu.mtspr <= actions.write_pmuspr;
             icache_inval <= actions.icache_inval;
@@ -1334,22 +1335,22 @@ begin
             end if;
         end if;
 
-        -- The following cases all occur when r.busy = 1 and therefore
+        -- The following cases all occur when ex1.busy = 1 and therefore
         -- valid_in = 0.  Hence they don't happen in the same cycle as any of
         -- the cases above which depend on valid_in = 1.
-        if r.cntz_in_progress = '1' then
+        if ex1.cntz_in_progress = '1' then
             -- cnt[lt]z and popcnt* always take two cycles
             v.e.valid := '1';
             v.e.write_data := countbits_result;
         end if;
-	if r.div_in_progress = '1' then
+	if ex1.div_in_progress = '1' then
 	    if divider_to_x.valid = '1' then
                 v.e.write_data := muldiv_result;
                 overflow := divider_to_x.overflow;
                 -- We must test oe because the RC update code in writeback
                 -- will use the xerc value to set CR0:SO so we must not clobber
                 -- xerc if OE wasn't set.
-                if r.oe = '1' then
+                if ex1.oe = '1' then
                     v.e.xerc.ov := overflow;
                     v.e.xerc.ov32 := overflow;
                     if overflow = '1' then
@@ -1362,10 +1363,10 @@ begin
 		v.div_in_progress := '1';
 	    end if;
         end if;
-	if r.mul_in_progress = '1' then
+	if ex1.mul_in_progress = '1' then
 	    if multiply_to_x.valid = '1' then
                 v.e.write_data := muldiv_result;
-                if r.oe = '1' then
+                if ex1.oe = '1' then
                     -- have to wait until next cycle for overflow indication
                     v.mul_finish := '1';
                     v.busy := '1';
@@ -1377,7 +1378,7 @@ begin
 		v.mul_in_progress := '1';
 	    end if;
         end if;
-        if r.mul_finish = '1' then
+        if ex1.mul_finish = '1' then
             v.e.xerc.ov := multiply_to_x.overflow;
             v.e.xerc.ov32 := multiply_to_x.overflow;
             if multiply_to_x.overflow = '1' then
@@ -1460,12 +1461,12 @@ begin
         fv.out_cr := e_in.output_cr;
 
 	-- Update registers
-	rin <= v;
+	ex1in <= v;
 
 	-- update outputs
         l_out <= lv;
-	e_out <= r.e;
-        if r.e.valid = '0' then
+	e_out <= ex1.e;
+        if ex1.e.valid = '0' then
             e_out.write_enable <= '0';
             e_out.write_cr_enable <= '0';
             e_out.write_xerc_enable <= '0';
@@ -1491,10 +1492,10 @@ begin
                             irq_valid_log &
                             interrupt_in &
                             "000" &
-                            r.e.write_enable &
-                            r.e.valid &
-                            ((r.e.redirect and r.e.valid) or r.e.interrupt) &
-                            r.busy &
+                            ex1.e.write_enable &
+                            ex1.e.valid &
+                            ((ex1.e.redirect and ex1.e.valid) or ex1.e.interrupt) &
+                            ex1.busy &
                             flush_in;
             end if;
         end process;

From 3510071d9a8dde12056f90dacb15c34eb6601971 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 30 Jun 2022 20:33:33 +1000
Subject: [PATCH 05/30] Add a second execute stage to the pipeline

This adds a second execute stage to the pipeline, in order to match up
the length of the pipeline through loadstore and dcache with the
length through execute1.  This will ultimately enable us to get rid of
the 1-cycle bubble that we currently have when issuing ALU
instructions after one or more LSU instructions.

Most ALU instructions execute in the first stage, except for
count-zeroes and popcount instructions (which take two cycles and do
some of their work in the second stage) and mfspr/mtspr to "slow" SPRs
(TB, DEC, PVR, LOGA/LOGD, CFAR).  Multiply and divide/mod instructions
take several cycles but the instruction stays in the first stage (ex1)
and ex1.busy is asserted until the operation is complete.

There is currently a bypass from the first stage but not the second
stage.  Performance is down somewhat because of that and because this
doesn't yet eliminate the bubble between LSU and ALU instructions.

The forwarding of XER common bits has been changed somewhat because
now there is another pipeline stage between ex1 and the committed
state in cr_file.  The simplest thing for now is to record the last
value written and use that, unless there has been a flush, in which
case the committed state (obtained via e_in.xerc) is used.

Note that this fixes what was previously a benign bug in control.vhdl,
where it was possible for control to forget an instruction's dependency
on a value from a previous instruction (a GPR or the CR) if this
instruction writes the value and the instruction gets to the point
where it could issue but is blocked by the busy signal from execute1.
In that situation, control may incorrectly not indicate that a bypass
should be used.  That didn't matter previously because, for ALU and
FPU instructions, there was only one previous instruction in flight
and once the current instruction could issue, the previous instruction
was completing and the correct value would be obtained from
register_file or cr_file.  For loadstore instructions there could be
two being executed, but because there are no bypass paths, failing to
indicate use of a bypass path is fine.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl       |   6 +-
 control.vhdl      |   5 +-
 countbits_tb.vhdl |   1 +
 decode2.vhdl      |   1 -
 divider.vhdl      |   2 +-
 execute1.vhdl     | 570 +++++++++++++++++++++++++++-------------------
 6 files changed, 339 insertions(+), 246 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 7ecf4e2..6cbf181 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -356,6 +356,7 @@ package common is
 
     type Execute1ToDividerType is record
 	valid: std_ulogic;
+        flush: std_ulogic;
 	dividend: std_ulogic_vector(63 downto 0);
 	divisor: std_ulogic_vector(63 downto 0);
 	is_signed: std_ulogic;
@@ -364,9 +365,8 @@ package common is
 	is_modulus: std_ulogic;
         neg_result: std_ulogic;
     end record;
-    constant Execute1ToDividerInit: Execute1ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0',
-                                                              is_extended => '0', is_modulus => '0',
-                                                              neg_result => '0', others => (others => '0'));
+    constant Execute1ToDividerInit: Execute1ToDividerType := (
+        dividend => 64x"0", divisor => 64x"0", others => '0');
 
     type PMUEventType is record
         no_instr_avail      : std_ulogic;
diff --git a/control.vhdl b/control.vhdl
index 1d55517..0bbe9ad 100644
--- a/control.vhdl
+++ b/control.vhdl
@@ -104,7 +104,8 @@ begin
                         tag_regs(i).wr_cr <= '0';
                         report "tag " & integer'image(i) & " not valid";
                     end if;
-                    if gpr_write_valid = '1' and tag_regs(i).reg = gpr_write_in then
+                    if instr_tag.valid = '1' and gpr_write_valid = '1' and
+                        tag_regs(i).reg = gpr_write_in then
                         tag_regs(i).recent <= '0';
                         if tag_regs(i).recent = '1' and tag_regs(i).wr_gpr = '1' then
                             report "tag " & integer'image(i) & " not recent";
@@ -126,7 +127,7 @@ begin
                 curr_cr_tag <= 0;
             else
                 curr_tag <= next_tag;
-                if cr_write_valid = '1' then
+                if instr_tag.valid = '1' and cr_write_valid = '1' then
                     curr_cr_tag <= instr_tag.tag;
                 end if;
             end if;
diff --git a/countbits_tb.vhdl b/countbits_tb.vhdl
index c00a6b6..c945c57 100644
--- a/countbits_tb.vhdl
+++ b/countbits_tb.vhdl
@@ -26,6 +26,7 @@ begin
     bitcounter_0: entity work.bit_counter
         port map (
             clk => clk,
+            stall => '0',
             rs => rs,
             result => res,
             count_right => count_right,
diff --git a/decode2.vhdl b/decode2.vhdl
index 8998f2b..af0c27d 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -228,7 +228,6 @@ architecture behaviour of decode2 is
         OP_SHR      => "010",
         OP_EXTSWSLI => "010",
         OP_MUL_L64  => "011",           -- muldiv_result
-        OP_MFSPR    => "101",           -- spr_result
         OP_B        => "110",           -- next_nia
         OP_BC       => "110",
         OP_BCREG    => "110",
diff --git a/divider.vhdl b/divider.vhdl
index 3f9b312..55e3c5d 100644
--- a/divider.vhdl
+++ b/divider.vhdl
@@ -36,7 +36,7 @@ begin
     divider_0: process(clk)
     begin
         if rising_edge(clk) then
-            if rst = '1' then
+            if rst = '1' or d_in.flush = '1' then
                 dend <= (others => '0');
                 div <= (others => '0');
                 quot <= (others => '0');
diff --git a/execute1.vhdl b/execute1.vhdl
index 7bd0913..ebcdfeb 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -60,65 +60,90 @@ entity execute1 is
 end entity execute1;
 
 architecture behaviour of execute1 is
-    type reg_type is record
+    type side_effect_type is record
+        terminate : std_ulogic;
+        icache_inval : std_ulogic;
+        write_msr : std_ulogic;
+        write_xerlow : std_ulogic;
+        write_dec : std_ulogic;
+        write_cfar : std_ulogic;
+        write_loga : std_ulogic;
+        inc_loga : std_ulogic;
+        write_pmuspr : std_ulogic;
+    end record;
+    constant side_effect_init : side_effect_type := (others => '0');
+
+    type actions_type is record
+        e : Execute1ToWritebackType;
+        se : side_effect_type;
+        complete : std_ulogic;
+        exception : std_ulogic;
+        trap : std_ulogic;
+        new_msr : std_ulogic_vector(63 downto 0);
+        take_branch : std_ulogic;
+        direct_branch : std_ulogic;
+        start_mul : std_ulogic;
+        start_div : std_ulogic;
+        do_trace : std_ulogic;
+        fp_intr : std_ulogic;
+        res2_sel : std_ulogic_vector(1 downto 0);
+        bypass_valid : std_ulogic;
+    end record;
+    constant actions_type_init : actions_type :=
+        (e => Execute1ToWritebackInit, se => side_effect_init,
+         new_msr => (others => '0'), res2_sel => "00", others => '0');
+
+    type reg_stage1_type is record
 	e : Execute1ToWritebackType;
+        se : side_effect_type;
         busy: std_ulogic;
-        terminate: std_ulogic;
-        intr_pending : std_ulogic;
         fp_exception_next : std_ulogic;
         trace_next : std_ulogic;
         prev_op : insn_type_t;
         br_taken : std_ulogic;
         oe : std_ulogic;
         mul_select : std_ulogic_vector(1 downto 0);
+        res2_sel : std_ulogic_vector(1 downto 0);
+        spr_select : spr_id;
+        pmu_spr_num : std_ulogic_vector(4 downto 0);
 	mul_in_progress : std_ulogic;
         mul_finish : std_ulogic;
         div_in_progress : std_ulogic;
-        cntz_in_progress : std_ulogic;
         no_instr_avail : std_ulogic;
         instr_dispatch : std_ulogic;
         ext_interrupt : std_ulogic;
         taken_branch_event : std_ulogic;
         br_mispredict : std_ulogic;
-        log_addr_spr : std_ulogic_vector(31 downto 0);
+        msr : std_ulogic_vector(63 downto 0);
+        xerc : xer_common_t;
+        xerc_valid : std_ulogic;
     end record;
-    constant reg_type_init : reg_type :=
-        (e => Execute1ToWritebackInit,
-         busy => '0', terminate => '0', intr_pending => '0',
+    constant reg_stage1_type_init : reg_stage1_type :=
+        (e => Execute1ToWritebackInit, se => side_effect_init,
+         busy => '0',
          fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, br_taken => '0',
-         oe => '0', mul_select => "00",
-         mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0',
+         oe => '0', mul_select => "00", res2_sel => "00",
+         spr_select => spr_id_init, pmu_spr_num => 5x"0",
+         mul_in_progress => '0', mul_finish => '0', div_in_progress => '0',
          no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0',
          taken_branch_event => '0', br_mispredict => '0',
-         others => (others => '0'));
+         msr => 64x"0",
+         xerc => xerc_init, xerc_valid => '0');
 
-    type actions_type is record
+    type reg_stage2_type is record
 	e : Execute1ToWritebackType;
-        complete : std_ulogic;
-        exception : std_ulogic;
-        trap : std_ulogic;
-        terminate : std_ulogic;
-        write_msr : std_ulogic;
-        new_msr : std_ulogic_vector(63 downto 0);
-        write_xerlow : std_ulogic;
-        write_pmuspr : std_ulogic;
-        write_dec : std_ulogic;
-        write_loga : std_ulogic;
-        inc_loga : std_ulogic;
-        write_cfar : std_ulogic;
-        take_branch : std_ulogic;
-        direct_branch : std_ulogic;
-        start_mul : std_ulogic;
-        start_div : std_ulogic;
-        start_cntz : std_ulogic;
-        do_trace : std_ulogic;
-        fp_intr : std_ulogic;
-        icache_inval : std_ulogic;
+        se : side_effect_type;
+        ext_interrupt : std_ulogic;
+        taken_branch_event : std_ulogic;
+        br_mispredict : std_ulogic;
+        log_addr_spr : std_ulogic_vector(31 downto 0);
     end record;
-    constant actions_type_init : actions_type :=
-        (e => Execute1ToWritebackInit, new_msr => (others => '0'), others => '0');
+    constant reg_stage2_type_init : reg_stage2_type :=
+        (e => Execute1ToWritebackInit, se => side_effect_init,
+         log_addr_spr => 32x"0", others => '0');
 
-    signal ex1, ex1in : reg_type;
+    signal ex1, ex1in : reg_stage1_type;
+    signal ex2, ex2in : reg_stage2_type;
     signal actions : actions_type;
 
     signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0);
@@ -142,7 +167,9 @@ architecture behaviour of execute1 is
     signal muldiv_result: std_ulogic_vector(63 downto 0);
     signal shortmul_result: std_ulogic_vector(63 downto 0);
     signal spr_result: std_ulogic_vector(63 downto 0);
+    signal ex_result: std_ulogic_vector(63 downto 0);
     signal next_nia : std_ulogic_vector(63 downto 0);
+    signal s1_sel : std_ulogic_vector(2 downto 0);
 
     signal carry_32 : std_ulogic;
     signal carry_64 : std_ulogic;
@@ -372,7 +399,7 @@ begin
     end generate;
 
     dbg_ctrl_out <= ctrl;
-    log_rd_addr <= ex1.log_addr_spr;
+    log_rd_addr <= ex2.log_addr_spr;
 
     a_in <= e_in.read_data1;
     b_in <= e_in.read_data2;
@@ -393,15 +420,15 @@ begin
                        itlb_miss_resolved => ic_events.itlb_miss_resolved,
                        no_instr_avail => ex1.no_instr_avail,
                        dispatch => ex1.instr_dispatch,
-                       ext_interrupt => ex1.ext_interrupt,
-                       br_taken_complete => ex1.taken_branch_event,
-                       br_mispredict => ex1.br_mispredict,
+                       ext_interrupt => ex2.ext_interrupt,
+                       br_taken_complete => ex2.taken_branch_event,
+                       br_mispredict => ex2.br_mispredict,
                        others => '0');
     x_to_pmu.nia <= e_in.nia;
     x_to_pmu.addr <= (others => '0');
     x_to_pmu.addr_v <= '0';
-    x_to_pmu.spr_num <= e_in.insn(20 downto 16);
-    x_to_pmu.spr_val <= c_in;
+    x_to_pmu.spr_num <= ex1.pmu_spr_num;
+    x_to_pmu.spr_val <= ex1.e.write_data;
     x_to_pmu.run <= '1';
 
     -- XER forwarding. To avoid having to track XER hazards, we use
@@ -409,35 +436,23 @@ begin
     -- (SO, OV[32] and CA[32]) are only modified by instructions that are
     -- handled here, we can just forward the result being sent to
     -- writeback.
-    xerc_in <= ex1.e.xerc when (ex1.e.write_xerc_enable and ex1.e.valid) = '1' else e_in.xerc;
+    xerc_in <= ex1.xerc when ex1.xerc_valid = '1' else e_in.xerc;
 
     with e_in.unit select busy_out <=
-        l_in.busy or ex1.busy or fp_in.busy when LDST,
+        l_in.busy or ex1.e.valid or ex1.busy or fp_in.busy when LDST,
+        l_in.busy or l_in.in_progress or ex1.e.valid or ex1.busy or fp_in.busy when FPU,
         l_in.busy or l_in.in_progress or ex1.busy or fp_in.busy when others;
 
-    valid_in <= e_in.valid and not busy_out and not flush_in;
+    valid_in <= e_in.valid and not (busy_out or flush_in or ex1.e.redirect or ex1.e.interrupt);
 
-    terminate_out <= ex1.terminate;
-
-    -- Slow SPR read mux
-    with e_in.spr_select.sel select spr_result <=
-        ctrl.tb when SPRSEL_TB,
-        32x"0" & ctrl.tb(63 downto 32) when SPRSEL_TBU,
-        ctrl.dec when SPRSEL_DEC,
-        32x"0" & PVR_MICROWATT when SPRSEL_PVR,
-        log_wr_addr & ex1.log_addr_spr when SPRSEL_LOGA,
-        log_rd_data when SPRSEL_LOGD,
-        ctrl.cfar when SPRSEL_CFAR,
-        assemble_xer(xerc_in, ctrl.xer_low) when others;
-
-    -- Result mux
-    with e_in.result_sel select alu_result <=
+    -- First stage result mux
+    s1_sel <= e_in.result_sel when ex1.busy = '0' else "100";
+    with s1_sel select alu_result <=
         adder_result       when "000",
         logical_result     when "001",
         rotator_result     when "010",
         shortmul_result    when "011",
-        pmu_to_x.spr_val   when "100",
-        spr_result         when "101",
+        muldiv_result      when "100",
         next_nia           when "110",
         misc_result        when others;
 
@@ -445,22 +460,31 @@ begin
     begin
 	if rising_edge(clk) then
             if rst = '1' then
-                ex1 <= reg_type_init;
+                ex1 <= reg_stage1_type_init;
+                ex2 <= reg_stage2_type_init;
                 ctrl <= ctrl_t_init;
                 ctrl.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0');
+                ex1.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0');
             else
                 ex1 <= ex1in;
+                ex2 <= ex2in;
                 ctrl <= ctrl_tmp;
                 if valid_in = '1' then
                     report "execute " & to_hstring(e_in.nia) & " op=" & insn_type_t'image(e_in.insn_type) &
                         " wr=" & to_hstring(ex1in.e.write_reg) & " we=" & std_ulogic'image(ex1in.e.write_enable) &
                         " tag=" & integer'image(ex1in.e.instr_tag.tag) & std_ulogic'image(ex1in.e.instr_tag.valid);
                 end if;
+                -- We mustn't get stalled on a cycle where execute2 is
+                -- completing an instruction or generating an interrupt
+                if ex2.e.valid = '1' or ex2.e.interrupt = '1' then
+                    assert (l_in.busy or fp_in.busy) = '0'
+                        severity failure;
+                end if;
             end if;
 	end if;
     end process;
 
-    -- Data path for integer instructions
+    -- Data path for integer instructions (first execute stage)
     execute1_dp: process(all)
 	variable a_inv : std_ulogic_vector(63 downto 0);
 	variable b_or_m1 : std_ulogic_vector(63 downto 0);
@@ -543,6 +567,7 @@ begin
         if e_in.insn_type = OP_MOD then
             x_to_divider.is_modulus <= '1';
         end if;
+        x_to_divider.flush <= flush_in;
 
         addend := (others => '0');
         if e_in.insn(26) = '0' then
@@ -638,7 +663,7 @@ begin
                 misc_result <= darn;
             when "100" =>
                 -- mfmsr
-		misc_result <= ctrl.msr;
+		misc_result <= ex1.msr;
             when "101" =>
 		if e_in.insn(20) = '0' then
 		    -- mfcr
@@ -792,6 +817,7 @@ begin
         variable illegal : std_ulogic;
         variable privileged : std_ulogic;
         variable slow_op : std_ulogic;
+        variable owait : std_ulogic;
     begin
         v := actions_type_init;
         v.e.write_data := alu_result;
@@ -803,12 +829,11 @@ begin
         v.e.write_cr_enable := e_in.output_cr;
         v.e.write_xerc_enable := e_in.output_xer;
         v.e.xerc := xerc_in;
-        v.new_msr := ctrl.msr;
-        v.e.write_xerc_enable := e_in.output_xer;
-        v.e.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) &
-                          not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF);
+        v.new_msr := ex1.msr;
+        v.e.redir_mode := ex1.msr(MSR_IR) & not ex1.msr(MSR_PR) &
+                          not ex1.msr(MSR_LE) & not ex1.msr(MSR_SF);
         v.e.intr_vec := 16#700#;
-        v.e.mode_32bit := not ctrl.msr(MSR_SF);
+        v.e.mode_32bit := not ex1.msr(MSR_SF);
         v.e.instr_tag := e_in.instr_tag;
         v.e.last_nia := e_in.nia;
         v.e.br_offset := 64x"4";
@@ -827,8 +852,9 @@ begin
         illegal := '0';
         privileged := '0';
         slow_op := '0';
+        owait := '0';
 
-        if ctrl.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then
+        if ex1.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then
             privileged := '1';
         end if;
 
@@ -837,7 +863,7 @@ begin
             illegal := '1';
         end if;
 
-        v.do_trace := ctrl.msr(MSR_SE);
+        v.do_trace := ex1.msr(MSR_SE);
         case_0: case e_in.insn_type is
 	    when OP_ILLEGAL =>
 		illegal := '1';
@@ -858,7 +884,7 @@ begin
                 -- check bits 1-10 of the instruction to make sure it's attn
                 -- if not then it is illegal
                 if e_in.insn(10 downto 1) = "0100000000" then
-                    v.terminate := '1';
+                    v.se.terminate := '1';
                     if e_in.valid = '1' then
                         report "ATTN";
                     end if;
@@ -909,10 +935,10 @@ begin
                     -- should never happen
                     v.e.redirect := '1';
                 end if;
-                if ctrl.msr(MSR_BE) = '1' then
+                if ex1.msr(MSR_BE) = '1' then
                     v.do_trace := '1';
                 end if;
-                v.write_cfar := '1';
+                v.se.write_cfar := '1';
             when OP_BC =>
                 -- read_data1 is CTR
                 -- If this instruction updates both CTR and LR, then it is
@@ -938,10 +964,10 @@ begin
                     v.direct_branch := '1';
                     v.e.br_last := '1';
                     v.e.br_taken := v.take_branch;
-                    if ctrl.msr(MSR_BE) = '1' then
+                    if ex1.msr(MSR_BE) = '1' then
                         v.do_trace := '1';
                     end if;
-                    v.write_cfar := v.take_branch;
+                    v.se.write_cfar := v.take_branch;
                 end if;
             when OP_BCREG =>
                 -- read_data1 is CTR, read_data2 is target register (CTR, LR or TAR)
@@ -964,10 +990,10 @@ begin
                     -- Indirect branches are never predicted taken
                     v.e.redirect := v.take_branch;
                     v.e.br_taken := v.take_branch;
-                    if ctrl.msr(MSR_BE) = '1' then
+                    if ex1.msr(MSR_BE) = '1' then
                         v.do_trace := '1';
                     end if;
-                    v.write_cfar := v.take_branch;
+                    v.se.write_cfar := v.take_branch;
                 end if;
 
 	    when OP_RFID =>
@@ -983,11 +1009,11 @@ begin
                     v.new_msr(MSR_IR) := '1';
                     v.new_msr(MSR_DR) := '1';
                 end if;
-                v.write_msr := '1';
+                v.se.write_msr := '1';
                 v.e.br_offset := b_in;
                 v.e.abs_br := '1';
                 v.e.redirect := '1';
-                v.write_cfar := '1';
+                v.se.write_cfar := '1';
                 if HAS_FPU then
                     v.fp_intr := fp_in.exception and
                                  (a_in(MSR_FE0) or a_in(MSR_FE1));
@@ -995,8 +1021,8 @@ begin
                 v.do_trace := '0';
 
             when OP_CNTZ | OP_POPCNT =>
+                v.res2_sel := "01";
                 slow_op := '1';
-                v.start_cntz := '1';
 	    when OP_ISEL =>
             when OP_CROP =>
             when OP_MCRXRX =>
@@ -1010,14 +1036,19 @@ begin
                     end if;
 		elsif e_in.spr_select.valid = '1' then
                     if e_in.valid = '1' then
-                        report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
-                            "=" & to_hstring(spr_result);
+                        report "MFSPR to slow SPR " & integer'image(decode_spr_num(e_in.insn));
+                    end if;
+                    slow_op := '1';
+                    if e_in.spr_select.ispmu = '0' then
+                        case e_in.spr_select.sel is
+                            when SPRSEL_LOGD =>
+                                v.se.inc_loga := '1';
+                            when others =>
+                        end case;
+                        v.res2_sel := "10";
+                    else
+                        v.res2_sel := "11";
                     end if;
-                    case e_in.spr_select.sel is
-                       when SPRSEL_LOGD =>
-                           v.inc_loga := '1';
-                           when others =>
-                    end case;
                 else
                     -- mfspr from unimplemented SPRs should be a nop in
                     -- supervisor mode and a program interrupt for user mode
@@ -1025,7 +1056,7 @@ begin
                         report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
                             " invalid";
                     end if;
-                    if ctrl.msr(MSR_PR) = '1' then
+                    if ex1.msr(MSR_PR) = '1' then
                         illegal := '1';
                     end if;
                 end if;
@@ -1033,7 +1064,7 @@ begin
 	    when OP_MFCR =>
 	    when OP_MTCRF =>
             when OP_MTMSRD =>
-                v.write_msr := '1';
+                v.se.write_msr := '1';
                 if e_in.insn(16) = '1' then
                     -- just update EE and RI
                     v.new_msr(MSR_EE) := c_in(MSR_EE);
@@ -1062,7 +1093,7 @@ begin
                     report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
                         "=" & to_hstring(c_in);
                 end if;
-                v.write_pmuspr := e_in.spr_select.ispmu;
+                v.se.write_pmuspr := e_in.spr_select.ispmu;
                 if e_in.spr_select.valid = '1' and e_in.spr_select.ispmu = '0' then
                     case e_in.spr_select.sel is
                         when SPRSEL_XER =>
@@ -1071,17 +1102,17 @@ begin
                             v.e.xerc.ca := c_in(63-34);
                             v.e.xerc.ov32 := c_in(63-44);
                             v.e.xerc.ca32 := c_in(63-45);
-                            v.write_xerlow := '1';
+                            v.se.write_xerlow := '1';
                         when SPRSEL_DEC =>
-                            v.write_dec := '1';
+                            v.se.write_dec := '1';
                         when SPRSEL_LOGA =>
-                            v.write_loga := '1';
+                            v.se.write_loga := '1';
                         when others =>
                     end case;
 		elsif is_fast_spr(e_in.write_reg) = '0' then
                     -- mtspr to unimplemented SPRs should be a nop in
                     -- supervisor mode and a program interrupt for user mode
-                    if ctrl.msr(MSR_PR) = '1' then
+                    if ex1.msr(MSR_PR) = '1' then
                         illegal := '1';
                     end if;
 		end if;
@@ -1095,7 +1126,7 @@ begin
 		v.e.redirect := '1';
 
 	    when OP_ICBI =>
-		v.icache_inval := '1';
+		v.se.icache_inval := '1';
 
 	    when OP_MUL_L64 =>
                 if HAS_SHORT_MULT and e_in.insn(26) = '1' and
@@ -1109,15 +1140,18 @@ begin
                     -- Use standard multiplier
                     v.start_mul := '1';
                     slow_op := '1';
+                    owait := '1';
                 end if;
 
 	    when OP_MUL_H64 | OP_MUL_H32 =>
                 v.start_mul := '1';
                 slow_op := '1';
+                owait := '1';
 
 	    when OP_DIV | OP_DIVE | OP_MOD =>
                 v.start_div := '1';
                 slow_op := '1';
+                owait := '1';
 
             when OP_FETCH_FAILED =>
                 -- Handling an ITLB miss doesn't count as having executed an instruction
@@ -1147,7 +1181,7 @@ begin
                 report "illegal instruction";
             end if;
 
-        elsif HAS_FPU and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then
+        elsif HAS_FPU and ex1.msr(MSR_FP) = '0' and e_in.fac = FPU then
             -- generate a floating-point unavailable interrupt
             v.exception := '1';
             v.e.intr_vec := 16#800#;
@@ -1157,26 +1191,33 @@ begin
         end if;
 
         if e_in.unit = ALU then
-            v.complete := e_in.valid and not v.exception and not slow_op;
+            v.complete := e_in.valid and not v.exception and not owait;
+            v.bypass_valid := e_in.valid and not v.exception and not slow_op;
         end if;
 
         actions <= v;
     end process;
 
+    -- First execute stage
     execute1_1: process(all)
-	variable v : reg_type;
+	variable v : reg_stage1_type;
 	variable overflow : std_ulogic;
         variable lv : Execute1ToLoadstore1Type;
 	variable irq_valid : std_ulogic;
 	variable exception : std_ulogic;
         variable fv : Execute1ToFPUType;
         variable go : std_ulogic;
+        variable bypass_valid : std_ulogic;
     begin
 	v := ex1;
-        if ex1.busy = '0' then
+        if (ex1.busy or l_in.busy or fp_in.busy) = '0' then
             v.e := actions.e;
+            v.e.valid := '0';
             v.oe := e_in.oe;
+            v.spr_select := e_in.spr_select;
+            v.pmu_spr_num := e_in.insn(20 downto 16);
             v.mul_select := e_in.sub_select(1 downto 0);
+            v.se := side_effect_init;
         end if;
 
         lv := Execute1ToLoadstore1Init;
@@ -1184,33 +1225,13 @@ begin
 
         x_to_multiply.valid <= '0';
         x_to_divider.valid <= '0';
-	v.mul_in_progress := '0';
-        v.div_in_progress := '0';
-        v.cntz_in_progress := '0';
-        v.mul_finish := '0';
         v.ext_interrupt := '0';
         v.taken_branch_event := '0';
         v.br_mispredict := '0';
+        v.busy := '0';
+        bypass_valid := '0';
 
-        x_to_pmu.mfspr <= '0';
-        x_to_pmu.mtspr <= '0';
-        x_to_pmu.tbbits(3) <= ctrl.tb(63 - 47);
-        x_to_pmu.tbbits(2) <= ctrl.tb(63 - 51);
-        x_to_pmu.tbbits(1) <= ctrl.tb(63 - 55);
-        x_to_pmu.tbbits(0) <= ctrl.tb(63 - 63);
-        x_to_pmu.pmm_msr <= ctrl.msr(MSR_PMM);
-        x_to_pmu.pr_msr <= ctrl.msr(MSR_PR);
-
-	ctrl_tmp <= ctrl;
-	-- FIXME: run at 512MHz not core freq
-	ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
-	ctrl_tmp.dec <= std_ulogic_vector(unsigned(ctrl.dec) - 1);
-
-        irq_valid := ctrl.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in);
-
-	v.terminate := '0';
-	icache_inval <= '0';
-	v.busy := '0';
+        irq_valid := ex1.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in);
 
 	-- Next insn adder used in a couple of places
 	next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4);
@@ -1223,19 +1244,14 @@ begin
 
         do_popcnt <= '1' when e_in.insn_type = OP_POPCNT else '0';
 
-        if ex1.intr_pending = '1' then
-            v.e.srr1 := ex1.e.srr1;
-            v.e.intr_vec := ex1.e.intr_vec;
-        end if;
-
         if valid_in = '1' then
             v.prev_op := e_in.insn_type;
         end if;
 
         -- Determine if there is any interrupt to be taken
         -- before/instead of executing this instruction
-        exception := ex1.intr_pending or (valid_in and actions.exception);
-        if valid_in = '1' and e_in.second = '0' and ex1.intr_pending = '0' then
+        exception := valid_in and actions.exception;
+        if valid_in = '1' and e_in.second = '0' then
             if HAS_FPU and ex1.fp_exception_next = '1' then
                 -- This is used for FP-type program interrupts that
                 -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero.
@@ -1278,54 +1294,37 @@ begin
 
             end if;
         end if;
-        if exception = '1' and l_in.in_progress = '1' then
-            -- We can't send this interrupt to writeback yet because there are
-            -- still instructions in loadstore1 that haven't completed.
-            v.intr_pending := '1';
-            v.busy := '1';
-        end if;
 
-        v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or ex1.busy or fp_in.busy);
+        v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or
+                                 ex1.busy or fp_in.busy);
 
         go := valid_in and not exception;
         v.instr_dispatch := go;
 
 	if go = '1' then
+            v.se := actions.se;
             v.e.valid := actions.complete;
+            bypass_valid := actions.bypass_valid;
             v.taken_branch_event := actions.take_branch;
             v.br_taken := actions.take_branch;
             v.trace_next := actions.do_trace;
             v.fp_exception_next := actions.fp_intr;
-            v.cntz_in_progress := actions.start_cntz;
-
-            if actions.write_msr = '1' then
-                ctrl_tmp.msr <= actions.new_msr;
-            end if;
-            if actions.write_xerlow = '1' then
-                ctrl_tmp.xer_low <= c_in(17 downto 0);
-            end if;
-            if actions.write_dec = '1' then
-                ctrl_tmp.dec <= c_in;
-            end if;
-            if actions.write_cfar = '1' then
-                ctrl_tmp.cfar <= e_in.nia;
-            end if;
-            if actions.write_loga = '1' then
-                v.log_addr_spr := c_in(31 downto 0);
-            elsif actions.inc_loga = '1' then
-                v.log_addr_spr := std_ulogic_vector(unsigned(ex1.log_addr_spr) + 1);
-            end if;
-            x_to_pmu.mtspr <= actions.write_pmuspr;
-            icache_inval <= actions.icache_inval;
+            v.res2_sel := actions.res2_sel;
+            v.msr := actions.new_msr;
             x_to_multiply.valid <= actions.start_mul;
             v.mul_in_progress := actions.start_mul;
             x_to_divider.valid <= actions.start_div;
             v.div_in_progress := actions.start_div;
-            v.terminate := actions.terminate;
             v.br_mispredict := v.e.redirect and actions.direct_branch;
-            v.busy := actions.start_cntz or actions.start_mul or actions.start_div;
             exception := actions.trap;
 
+            -- Go busy while division is happening because the
+            -- divider is not pipelined.  Also go busy while a
+            -- multiply is happening in order to stop following
+            -- instructions from using the wrong XER value
+            -- (and for simplicity in the OE=0 case).
+            v.busy := actions.start_div or actions.start_mul;
+
             -- instruction for other units, i.e. LDST
             if e_in.unit = LDST then
                 lv.valid := '1';
@@ -1335,86 +1334,74 @@ begin
             end if;
         end if;
 
-        -- The following cases all occur when ex1.busy = 1 and therefore
-        -- valid_in = 0.  Hence they don't happen in the same cycle as any of
-        -- the cases above which depend on valid_in = 1.
-        if ex1.cntz_in_progress = '1' then
-            -- cnt[lt]z and popcnt* always take two cycles
-            v.e.valid := '1';
-            v.e.write_data := countbits_result;
-        end if;
-	if ex1.div_in_progress = '1' then
-	    if divider_to_x.valid = '1' then
-                v.e.write_data := muldiv_result;
-                overflow := divider_to_x.overflow;
-                -- We must test oe because the RC update code in writeback
-                -- will use the xerc value to set CR0:SO so we must not clobber
-                -- xerc if OE wasn't set.
-                if ex1.oe = '1' then
-                    v.e.xerc.ov := overflow;
-                    v.e.xerc.ov32 := overflow;
-                    if overflow = '1' then
-                        v.e.xerc.so := '1';
-                    end if;
+        if ex1.div_in_progress = '1' then
+            v.div_in_progress := not divider_to_x.valid;
+            v.busy := not divider_to_x.valid;
+            if divider_to_x.valid = '1' and ex1.oe = '1' then
+                v.e.xerc.ov := divider_to_x.overflow;
+                v.e.xerc.ov32 := divider_to_x.overflow;
+                if divider_to_x.overflow = '1' then
+                    v.e.xerc.so := '1';
                 end if;
-                v.e.valid := '1';
-	    else
-		v.busy := '1';
-		v.div_in_progress := '1';
-	    end if;
+            end if;
+            v.e.valid := divider_to_x.valid;
+            v.e.write_data := alu_result;
+            bypass_valid := v.e.valid;
         end if;
-	if ex1.mul_in_progress = '1' then
-	    if multiply_to_x.valid = '1' then
-                v.e.write_data := muldiv_result;
-                if ex1.oe = '1' then
-                    -- have to wait until next cycle for overflow indication
-                    v.mul_finish := '1';
-                    v.busy := '1';
-                else
-                    v.e.valid := '1';
-                end if;
-	    else
-		v.busy := '1';
-		v.mul_in_progress := '1';
-	    end if;
+        if ex1.mul_in_progress = '1' then
+            v.mul_in_progress := not multiply_to_x.valid;
+            v.mul_finish := multiply_to_x.valid and ex1.oe;
+            v.e.valid := multiply_to_x.valid and not ex1.oe;
+            v.busy := not v.e.valid;
+            v.e.write_data := alu_result;
+            bypass_valid := v.e.valid;
         end if;
         if ex1.mul_finish = '1' then
+            v.mul_finish := '0';
             v.e.xerc.ov := multiply_to_x.overflow;
             v.e.xerc.ov32 := multiply_to_x.overflow;
             if multiply_to_x.overflow = '1' then
                 v.e.xerc.so := '1';
             end if;
             v.e.valid := '1';
-	end if;
+        end if;
 
-        v.e.interrupt := exception and not (l_in.in_progress or l_in.interrupt);
-        if v.e.interrupt = '1' then
-            v.intr_pending := '0';
+        if v.e.write_xerc_enable = '1' and v.e.valid = '1' then
+            v.xerc := v.e.xerc;
+            v.xerc_valid := '1';
         end if;
 
- 	if interrupt_in = '1' then
-            ctrl_tmp.msr(MSR_SF) <= '1';
-            ctrl_tmp.msr(MSR_EE) <= '0';
-            ctrl_tmp.msr(MSR_PR) <= '0';
-            ctrl_tmp.msr(MSR_SE) <= '0';
-            ctrl_tmp.msr(MSR_BE) <= '0';
-            ctrl_tmp.msr(MSR_FP) <= '0';
-            ctrl_tmp.msr(MSR_FE0) <= '0';
-            ctrl_tmp.msr(MSR_FE1) <= '0';
-            ctrl_tmp.msr(MSR_IR) <= '0';
-            ctrl_tmp.msr(MSR_DR) <= '0';
-            ctrl_tmp.msr(MSR_RI) <= '0';
-            ctrl_tmp.msr(MSR_LE) <= '1';
+        if (ex1.busy or l_in.busy or fp_in.busy) = '0' then
+            v.e.interrupt := exception;
+        end if;
+        if v.e.valid = '0' then
+            v.e.redirect := '0';
+            v.e.br_last := '0';
+        end if;
+        if flush_in = '1' then
+            v.e.valid := '0';
+            v.e.interrupt := '0';
+            v.e.redirect := '0';
+            v.e.br_last := '0';
+            v.busy := '0';
+            v.div_in_progress := '0';
+            v.mul_in_progress := '0';
+            v.mul_finish := '0';
+            v.xerc_valid := '0';
+        end if;
+        if flush_in = '1' or interrupt_in = '1' then
+            v.msr := ctrl_tmp.msr;
+        end if;
+        if interrupt_in = '1' then
             v.trace_next := '0';
             v.fp_exception_next := '0';
-            v.intr_pending := '0';
         end if;
 
-        bypass_data.tag.valid <= v.e.write_enable and v.e.valid;
+        bypass_data.tag.valid <= v.e.write_enable and bypass_valid;
         bypass_data.tag.tag <= v.e.instr_tag.tag;
-        bypass_data.data <= v.e.write_data;
+        bypass_data.data <= alu_result;
 
-        bypass_cr_data.tag.valid <= v.e.write_cr_enable and v.e.valid;
+        bypass_cr_data.tag.valid <= v.e.write_cr_enable and bypass_valid;
         bypass_cr_data.tag.tag <= v.e.instr_tag.tag;
         bypass_cr_data.data <= v.e.write_cr_data;
 
@@ -1427,7 +1414,7 @@ begin
         lv.data := c_in;
         lv.write_reg := e_in.write_reg;
         lv.length := e_in.data_len;
-        lv.byte_reverse := e_in.byte_reverse xnor ctrl.msr(MSR_LE);
+        lv.byte_reverse := e_in.byte_reverse xnor ex1.msr(MSR_LE);
         lv.sign_extend := e_in.sign_extend;
         lv.update := e_in.update;
         lv.xerc := xerc_in;
@@ -1439,9 +1426,9 @@ begin
             e_in.insn(5 downto 1) = "10101" then
             lv.ci := '1';
         end if;
-        lv.virt_mode := ctrl.msr(MSR_DR);
-        lv.priv_mode := not ctrl.msr(MSR_PR);
-        lv.mode_32bit := not ctrl.msr(MSR_SF);
+        lv.virt_mode := ex1.msr(MSR_DR);
+        lv.priv_mode := not ex1.msr(MSR_PR);
+        lv.mode_32bit := not ex1.msr(MSR_SF);
         lv.is_32bit := e_in.is_32bit;
         lv.repeat := e_in.repeat;
         lv.second := e_in.second;
@@ -1452,7 +1439,7 @@ begin
         fv.insn := e_in.insn;
         fv.itag := e_in.instr_tag;
         fv.single := e_in.is_32bit;
-        fv.fe_mode := ctrl.msr(MSR_FE0) & ctrl.msr(MSR_FE1);
+        fv.fe_mode := ex1.msr(MSR_FE0) & ex1.msr(MSR_FE1);
         fv.fra := a_in;
         fv.frb := b_in;
         fv.frc := c_in;
@@ -1465,19 +1452,124 @@ begin
 
 	-- update outputs
         l_out <= lv;
-	e_out <= ex1.e;
-        if ex1.e.valid = '0' then
-            e_out.write_enable <= '0';
-            e_out.write_cr_enable <= '0';
-            e_out.write_xerc_enable <= '0';
-            e_out.redirect <= '0';
-            e_out.br_last <= '0';
+        fp_out <= fv;
+        irq_valid_log <= irq_valid;
+    end process;
+
+    -- Slow SPR read mux
+    with ex1.spr_select.sel select spr_result <=
+        ctrl.tb when SPRSEL_TB,
+        32x"0" & ctrl.tb(63 downto 32) when SPRSEL_TBU,
+        ctrl.dec when SPRSEL_DEC,
+        32x"0" & PVR_MICROWATT when SPRSEL_PVR,
+        log_wr_addr & ex2.log_addr_spr when SPRSEL_LOGA,
+        log_rd_data when SPRSEL_LOGD,
+        ctrl.cfar when SPRSEL_CFAR,
+        assemble_xer(ex1.e.xerc, ctrl.xer_low) when others;
+
+    -- Second stage result mux
+    with ex1.res2_sel select ex_result <=
+        countbits_result  when "01",
+        spr_result        when "10",
+        pmu_to_x.spr_val  when "11",
+        ex1.e.write_data  when others;
+
+    -- Second execute stage control
+    execute2_1: process(all)
+	variable v : reg_stage2_type;
+	variable overflow : std_ulogic;
+        variable lv : Execute1ToLoadstore1Type;
+        variable fv : Execute1ToFPUType;
+        variable k : integer;
+        variable go : std_ulogic;
+    begin
+	v := ex2;
+        if (l_in.busy or fp_in.busy) = '0' then
+            v.e := ex1.e;
+            v.se := ex1.se;
+            v.e.write_data := ex_result;
+            v.ext_interrupt := ex1.ext_interrupt;
+            v.taken_branch_event := ex1.taken_branch_event;
+            v.br_mispredict := ex1.br_mispredict;
+        end if;
+
+	ctrl_tmp <= ctrl;
+	-- FIXME: run at 512MHz not core freq
+	ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
+	ctrl_tmp.dec <= std_ulogic_vector(unsigned(ctrl.dec) - 1);
+
+        x_to_pmu.mfspr <= '0';
+        x_to_pmu.mtspr <= '0';
+        x_to_pmu.tbbits(3) <= ctrl.tb(63 - 47);
+        x_to_pmu.tbbits(2) <= ctrl.tb(63 - 51);
+        x_to_pmu.tbbits(1) <= ctrl.tb(63 - 55);
+        x_to_pmu.tbbits(0) <= ctrl.tb(63 - 63);
+        x_to_pmu.pmm_msr <= ctrl.msr(MSR_PMM);
+        x_to_pmu.pr_msr <= ctrl.msr(MSR_PR);
+
+        if v.e.valid = '0' or flush_in = '1' then
+            v.e.write_enable := '0';
+            v.e.write_cr_enable := '0';
+            v.e.write_xerc_enable := '0';
+            v.e.redirect := '0';
+            v.e.br_last := '0';
+            v.se := side_effect_init;
+            v.taken_branch_event := '0';
+            v.br_mispredict := '0';
+        end if;
+        if flush_in = '1' then
+            v.e.valid := '0';
+            v.e.interrupt := '0';
+            v.ext_interrupt := '0';
+        end if;
+
+	if (l_in.busy or fp_in.busy) = '0' then
+            if ex1.se.write_msr = '1' then
+                ctrl_tmp.msr <= ex1.msr;
+            end if;
+            if ex1.se.write_xerlow = '1' then
+                ctrl_tmp.xer_low <= ex1.e.write_data(17 downto 0);
+            end if;
+            if ex1.se.write_dec = '1' then
+                ctrl_tmp.dec <= ex1.e.write_data;
+            end if;
+            if ex1.se.write_cfar = '1' then
+                ctrl_tmp.cfar <= ex1.e.last_nia;
+            end if;
+            if ex1.se.write_loga = '1' then
+                v.log_addr_spr := ex1.e.write_data(31 downto 0);
+            elsif ex1.se.inc_loga = '1' then
+                v.log_addr_spr := std_ulogic_vector(unsigned(ex2.log_addr_spr) + 1);
+            end if;
+            x_to_pmu.mtspr <= ex1.se.write_pmuspr;
         end if;
+
+ 	if interrupt_in = '1' then
+            ctrl_tmp.msr(MSR_SF) <= '1';
+            ctrl_tmp.msr(MSR_EE) <= '0';
+            ctrl_tmp.msr(MSR_PR) <= '0';
+            ctrl_tmp.msr(MSR_SE) <= '0';
+            ctrl_tmp.msr(MSR_BE) <= '0';
+            ctrl_tmp.msr(MSR_FP) <= '0';
+            ctrl_tmp.msr(MSR_FE0) <= '0';
+            ctrl_tmp.msr(MSR_FE1) <= '0';
+            ctrl_tmp.msr(MSR_IR) <= '0';
+            ctrl_tmp.msr(MSR_DR) <= '0';
+            ctrl_tmp.msr(MSR_RI) <= '0';
+            ctrl_tmp.msr(MSR_LE) <= '1';
+        end if;
+
+	-- Update registers
+	ex2in <= v;
+
+	-- update outputs
+	e_out <= ex2.e;
         e_out.msr <= msr_copy(ctrl.msr);
-        fp_out <= fv;
 
-        exception_log <= exception;
-        irq_valid_log <= irq_valid;
+        terminate_out <= ex2.se.terminate;
+        icache_inval <= ex2.se.icache_inval;
+
+        exception_log <= v.e.interrupt;
     end process;
 
     e1_log: if LOG_LENGTH > 0 generate
@@ -1492,9 +1584,9 @@ begin
                             irq_valid_log &
                             interrupt_in &
                             "000" &
-                            ex1.e.write_enable &
-                            ex1.e.valid &
-                            ((ex1.e.redirect and ex1.e.valid) or ex1.e.interrupt) &
+                            ex2.e.write_enable &
+                            ex2.e.valid &
+                            (ex2.e.redirect or ex2.e.interrupt) &
                             ex1.busy &
                             flush_in;
             end if;

From 4b6148ada6a58adb48167733b492c73c505b6930 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 28 Jun 2022 08:40:42 +1000
Subject: [PATCH 06/30] Add a bypass path from the execute2 stage

This enables some instructions to issue earlier and thus improves
performance, at the cost of some extra multiplexers in decode2.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 control.vhdl  | 50 ++++++++++++++++++++++++++++++--------------------
 core.vhdl     |  6 ++++++
 decode2.vhdl  | 34 +++++++++++++++++++++++-----------
 execute1.vhdl | 16 ++++++++++++++++
 4 files changed, 75 insertions(+), 31 deletions(-)

diff --git a/control.vhdl b/control.vhdl
index 0bbe9ad..17a288b 100644
--- a/control.vhdl
+++ b/control.vhdl
@@ -36,6 +36,8 @@ entity control is
 
         execute_next_tag    : in instr_tag_t;
         execute_next_cr_tag : in instr_tag_t;
+        execute2_next_tag    : in instr_tag_t;
+        execute2_next_cr_tag : in instr_tag_t;
 
         cr_read_in          : in std_ulogic;
         cr_write_in         : in std_ulogic;
@@ -44,10 +46,10 @@ entity control is
         stall_out           : out std_ulogic;
         stopped_out         : out std_ulogic;
 
-        gpr_bypass_a        : out std_ulogic;
-        gpr_bypass_b        : out std_ulogic;
-        gpr_bypass_c        : out std_ulogic;
-        cr_bypass           : out std_ulogic;
+        gpr_bypass_a        : out std_ulogic_vector(1 downto 0);
+        gpr_bypass_b        : out std_ulogic_vector(1 downto 0);
+        gpr_bypass_c        : out std_ulogic_vector(1 downto 0);
+        cr_bypass           : out std_ulogic_vector(1 downto 0);
 
         instr_tag_out       : out instr_tag_t
         );
@@ -142,11 +144,11 @@ begin
         variable tag_s : instr_tag_t;
         variable tag_t : instr_tag_t;
         variable incr_tag : tag_number_t;
-        variable byp_a : std_ulogic;
-        variable byp_b : std_ulogic;
-        variable byp_c : std_ulogic;
+        variable byp_a : std_ulogic_vector(1 downto 0);
+        variable byp_b : std_ulogic_vector(1 downto 0);
+        variable byp_c : std_ulogic_vector(1 downto 0);
         variable tag_cr : instr_tag_t;
-        variable byp_cr : std_ulogic;
+        variable byp_cr : std_ulogic_vector(1 downto 0);
     begin
         tag_a := instr_tag_init;
         for i in tag_number_t loop
@@ -179,26 +181,32 @@ begin
             tag_c.valid := '0';
         end if;
 
-        byp_a := '0';
+        byp_a := "00";
         if EX1_BYPASS and tag_match(execute_next_tag, tag_a) then
-            byp_a := '1';
+            byp_a := "10";
+        elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_a) then
+            byp_a := "11";
         end if;
-        byp_b := '0';
+        byp_b := "00";
         if EX1_BYPASS and tag_match(execute_next_tag, tag_b) then
-            byp_b := '1';
+            byp_b := "10";
+        elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_b) then
+            byp_b := "11";
         end if;
-        byp_c := '0';
+        byp_c := "00";
         if EX1_BYPASS and tag_match(execute_next_tag, tag_c) then
-            byp_c := '1';
+            byp_c := "10";
+        elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_c) then
+            byp_c := "11";
         end if;
 
         gpr_bypass_a <= byp_a;
         gpr_bypass_b <= byp_b;
         gpr_bypass_c <= byp_c;
 
-        gpr_tag_stall <= (tag_a.valid and not byp_a) or
-                         (tag_b.valid and not byp_b) or
-                         (tag_c.valid and not byp_c);
+        gpr_tag_stall <= (tag_a.valid and not byp_a(1)) or
+                         (tag_b.valid and not byp_b(1)) or
+                         (tag_c.valid and not byp_c(1));
 
         incr_tag := curr_tag;
         instr_tag.tag <= curr_tag;
@@ -215,13 +223,15 @@ begin
         if tag_match(tag_cr, complete_in) then
             tag_cr.valid := '0';
         end if;
-        byp_cr := '0';
+        byp_cr := "00";
         if EX1_BYPASS and tag_match(execute_next_cr_tag, tag_cr) then
-            byp_cr := '1';
+            byp_cr := "10";
+        elsif EX1_BYPASS and tag_match(execute2_next_cr_tag, tag_cr) then
+            byp_cr := "11";
         end if;
 
         cr_bypass <= byp_cr;
-        cr_tag_stall <= tag_cr.valid and not byp_cr;
+        cr_tag_stall <= tag_cr.valid and not byp_cr(1);
     end process;
 
     control1 : process(all)
diff --git a/core.vhdl b/core.vhdl
index 070a1f1..84604c6 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -79,6 +79,8 @@ architecture behave of core is
     signal execute1_to_writeback: Execute1ToWritebackType;
     signal execute1_bypass: bypass_data_t;
     signal execute1_cr_bypass: cr_bypass_data_t;
+    signal execute2_bypass: bypass_data_t;
+    signal execute2_cr_bypass: cr_bypass_data_t;
 
     -- load store signals
     signal execute1_to_loadstore1: Execute1ToLoadstore1Type;
@@ -298,6 +300,8 @@ begin
             c_out => decode2_to_cr_file,
             execute_bypass => execute1_bypass,
             execute_cr_bypass => execute1_cr_bypass,
+            execute2_bypass => execute2_bypass,
+            execute2_cr_bypass => execute2_cr_bypass,
             log_out => log_data(119 downto 110)
             );
     decode2_busy_in <= ex1_busy_out;
@@ -359,6 +363,8 @@ begin
             e_out => execute1_to_writeback,
             bypass_data => execute1_bypass,
             bypass_cr_data => execute1_cr_bypass,
+            bypass2_data => execute2_bypass,
+            bypass2_cr_data => execute2_cr_bypass,
 	    icache_inval => ex1_icache_inval,
             dbg_ctrl_out => ctrl_debug,
             wb_events => writeback_events,
diff --git a/decode2.vhdl b/decode2.vhdl
index af0c27d..c290c98 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -39,6 +39,8 @@ entity decode2 is
 
         execute_bypass    : in bypass_data_t;
         execute_cr_bypass : in cr_bypass_data_t;
+        execute2_bypass    : in bypass_data_t;
+        execute2_cr_bypass : in cr_bypass_data_t;
 
         log_out : out std_ulogic_vector(9 downto 0)
 	);
@@ -273,19 +275,19 @@ architecture behaviour of decode2 is
 
     signal gpr_a_read_valid : std_ulogic;
     signal gpr_a_read       : gspr_index_t;
-    signal gpr_a_bypass     : std_ulogic;
+    signal gpr_a_bypass     : std_ulogic_vector(1 downto 0);
 
     signal gpr_b_read_valid : std_ulogic;
     signal gpr_b_read       : gspr_index_t;
-    signal gpr_b_bypass     : std_ulogic;
+    signal gpr_b_bypass     : std_ulogic_vector(1 downto 0);
 
     signal gpr_c_read_valid : std_ulogic;
     signal gpr_c_read       : gspr_index_t;
-    signal gpr_c_bypass     : std_ulogic;
+    signal gpr_c_bypass     : std_ulogic_vector(1 downto 0);
 
     signal cr_read_valid   : std_ulogic;
     signal cr_write_valid  : std_ulogic;
-    signal cr_bypass       : std_ulogic;
+    signal cr_bypass       : std_ulogic_vector(1 downto 0);
 
     signal instr_tag       : instr_tag_t;
 
@@ -321,6 +323,8 @@ begin
 
             execute_next_tag     => execute_bypass.tag,
             execute_next_cr_tag  => execute_cr_bypass.tag,
+            execute2_next_tag    => execute2_bypass.tag,
+            execute2_next_cr_tag => execute2_cr_bypass.tag,
 
             cr_read_in           => cr_read_valid,
             cr_write_in          => cr_write_valid,
@@ -504,27 +508,35 @@ begin
 
         -- See if any of the operands can get their value via the bypass path.
         case gpr_a_bypass is
-            when '1' =>
+            when "10" =>
                 v.e.read_data1 := execute_bypass.data;
+            when "11" =>
+                v.e.read_data1 := execute2_bypass.data;
             when others =>
                 v.e.read_data1 := decoded_reg_a.data;
         end case;
         case gpr_b_bypass is
-            when '1' =>
+            when "10" =>
                 v.e.read_data2 := execute_bypass.data;
+            when "11" =>
+                v.e.read_data2 := execute2_bypass.data;
             when others =>
                 v.e.read_data2 := decoded_reg_b.data;
         end case;
         case gpr_c_bypass is
-            when '1' =>
+            when "10" =>
                 v.e.read_data3 := execute_bypass.data;
+            when "11" =>
+                v.e.read_data3 := execute2_bypass.data;
             when others =>
                 v.e.read_data3 := decoded_reg_c.data;
         end case;
 
         v.e.cr := c_in.read_cr_data;
-        if cr_bypass = '1' then
+        if cr_bypass = "10" then
             v.e.cr := execute_cr_bypass.data;
+        elsif cr_bypass = "11" then
+            v.e.cr := execute2_cr_bypass.data;
         end if;
 
         -- issue control
@@ -577,9 +589,9 @@ begin
                             r.e.valid &
                             stopped_out &
                             stall_out &
-                            gpr_a_bypass &
-                            gpr_b_bypass &
-                            gpr_c_bypass;
+                            (gpr_a_bypass(1) or gpr_a_bypass(0)) &
+                            (gpr_b_bypass(1) or gpr_b_bypass(0)) &
+                            (gpr_c_bypass(1) or gpr_c_bypass(0));
             end if;
         end process;
         log_out <= log_data;
diff --git a/execute1.vhdl b/execute1.vhdl
index ebcdfeb..ebc24c5 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -40,6 +40,8 @@ entity execute1 is
 	e_out : out Execute1ToWritebackType;
         bypass_data : out bypass_data_t;
         bypass_cr_data : out cr_bypass_data_t;
+        bypass2_data : out bypass_data_t;
+        bypass2_cr_data : out cr_bypass_data_t;
 
         dbg_ctrl_out : out ctrl_t;
 
@@ -1482,6 +1484,7 @@ begin
         variable fv : Execute1ToFPUType;
         variable k : integer;
         variable go : std_ulogic;
+        variable bypass_valid : std_ulogic;
     begin
 	v := ex2;
         if (l_in.busy or fp_in.busy) = '0' then
@@ -1559,6 +1562,19 @@ begin
             ctrl_tmp.msr(MSR_LE) <= '1';
         end if;
 
+        bypass_valid := ex1.e.valid;
+        if (ex2.busy or l_in.busy or fp_in.busy) = '1' and ex1.res2_sel(1) = '1' then
+            bypass_valid := '0';
+        end if;
+
+        bypass2_data.tag.valid <= ex1.e.write_enable and bypass_valid;
+        bypass2_data.tag.tag <= ex1.e.instr_tag.tag;
+        bypass2_data.data <= ex_result;
+
+        bypass2_cr_data.tag.valid <= ex1.e.write_cr_enable and bypass_valid;
+        bypass2_cr_data.tag.tag <= ex1.e.instr_tag.tag;
+        bypass2_cr_data.data <= ex1.e.write_cr_data;
+
 	-- Update registers
 	ex2in <= v;
 

From e030a500e85ad0e22e47dfb7af087e7fef9df20d Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 27 Jun 2022 18:53:04 +1000
Subject: [PATCH 07/30] Allow integer instructions and load/store instructions
 to execute together

Execute1 and loadstore1 now send each other stall signals that
indicate that a valid instruction in stage 2 can't complete in this
cycle, and hence any valid instruction in stage 1 in the other unit
can't move to stage 2.  With this in place, an ALU instruction can
move into stage 1 while an LSU instruction is in stage 2.

Since the FPU doesn't yet have a way to stall completion, we can't yet
start FPU instructions while any LSU or ALU instruction is in
progress.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl     |  5 +++--
 countbits.vhdl  |  5 +++--
 execute1.vhdl   | 22 +++++++++++++---------
 loadstore1.vhdl |  6 +++---
 4 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 6cbf181..ac733db 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -461,6 +461,7 @@ package common is
         is_32bit : std_ulogic;
         repeat : std_ulogic;
         second : std_ulogic;
+        e2stall : std_ulogic;
         msr : std_ulogic_vector(63 downto 0);
     end record;
     constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type :=
@@ -473,13 +474,13 @@ package common is
          write_reg => (others => '0'),
          length => (others => '0'),
          mode_32bit => '0', is_32bit => '0',
-         repeat => '0', second => '0',
+         repeat => '0', second => '0', e2stall => '0',
          msr => (others => '0'));
 
     type Loadstore1ToExecute1Type is record
         busy : std_ulogic;
+        l2stall : std_ulogic;
         in_progress : std_ulogic;
-        interrupt : std_ulogic;
     end record;
 
     type Loadstore1ToDcacheType is record
diff --git a/countbits.vhdl b/countbits.vhdl
index b16baa0..87417a9 100644
--- a/countbits.vhdl
+++ b/countbits.vhdl
@@ -9,6 +9,7 @@ entity bit_counter is
     port (
         clk         : in std_logic;
         rs          : in std_ulogic_vector(63 downto 0);
+        stall       : in std_ulogic;
         count_right : in std_ulogic;
         do_popcnt   : in std_ulogic;
         is_32bit    : in std_ulogic;
@@ -49,7 +50,7 @@ architecture behaviour of bit_counter is
 begin
     countzero_r: process(clk)
     begin
-        if rising_edge(clk) then
+        if rising_edge(clk) and stall = '0' then
             inp_r <= inp;
             sum_r <= sum;
         end if;
@@ -88,7 +89,7 @@ begin
 
     popcnt_r: process(clk)
     begin
-        if rising_edge(clk) then
+        if rising_edge(clk) and stall = '0' then
             for i in 0 to 7 loop
                 pc8_r(i) <= pc8(i);
             end loop;
diff --git a/execute1.vhdl b/execute1.vhdl
index ebc24c5..e4db56f 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -204,6 +204,8 @@ architecture behaviour of execute1 is
     signal exception_log : std_ulogic;
     signal irq_valid_log : std_ulogic;
 
+    signal stage2_stall : std_ulogic;
+
     type privilege_level is (USER, SUPER);
     type op_privilege_array is array(insn_type_t) of privilege_level;
     constant op_privilege: op_privilege_array := (
@@ -351,6 +353,7 @@ begin
 	port map (
             clk => clk,
 	    rs => c_in,
+            stall => stage2_stall,
 	    count_right => e_in.insn(10),
 	    is_32bit => e_in.is_32bit,
             do_popcnt => do_popcnt,
@@ -436,14 +439,13 @@ begin
     -- XER forwarding. To avoid having to track XER hazards, we use
     -- the previously latched value.  Since the XER common bits
     -- (SO, OV[32] and CA[32]) are only modified by instructions that are
-    -- handled here, we can just forward the result being sent to
-    -- writeback.
+    -- handled here, we can just use the result most recently sent to
+    -- writeback, unless a pipeline flush has happened in the meantime.
     xerc_in <= ex1.xerc when ex1.xerc_valid = '1' else e_in.xerc;
 
     with e_in.unit select busy_out <=
-        l_in.busy or ex1.e.valid or ex1.busy or fp_in.busy when LDST,
         l_in.busy or l_in.in_progress or ex1.e.valid or ex1.busy or fp_in.busy when FPU,
-        l_in.busy or l_in.in_progress or ex1.busy or fp_in.busy when others;
+        l_in.busy or ex1.busy or fp_in.busy when others;
 
     valid_in <= e_in.valid and not (busy_out or flush_in or ex1.e.redirect or ex1.e.interrupt);
 
@@ -479,8 +481,7 @@ begin
                 -- We mustn't get stalled on a cycle where execute2 is
                 -- completing an instruction or generating an interrupt
                 if ex2.e.valid = '1' or ex2.e.interrupt = '1' then
-                    assert (l_in.busy or fp_in.busy) = '0'
-                        severity failure;
+                    assert stage2_stall = '0' severity failure;
                 end if;
             end if;
 	end if;
@@ -1434,6 +1435,7 @@ begin
         lv.is_32bit := e_in.is_32bit;
         lv.repeat := e_in.repeat;
         lv.second := e_in.second;
+        lv.e2stall := '0';
 
         -- Outputs to FPU
         fv.op := e_in.insn_type;
@@ -1476,6 +1478,8 @@ begin
         pmu_to_x.spr_val  when "11",
         ex1.e.write_data  when others;
 
+    stage2_stall <= l_in.l2stall or fp_in.busy;
+
     -- Second execute stage control
     execute2_1: process(all)
 	variable v : reg_stage2_type;
@@ -1487,7 +1491,7 @@ begin
         variable bypass_valid : std_ulogic;
     begin
 	v := ex2;
-        if (l_in.busy or fp_in.busy) = '0' then
+        if stage2_stall = '0' then
             v.e := ex1.e;
             v.se := ex1.se;
             v.e.write_data := ex_result;
@@ -1526,7 +1530,7 @@ begin
             v.ext_interrupt := '0';
         end if;
 
-	if (l_in.busy or fp_in.busy) = '0' then
+	if stage2_stall = '0' then
             if ex1.se.write_msr = '1' then
                 ctrl_tmp.msr <= ex1.msr;
             end if;
@@ -1563,7 +1567,7 @@ begin
         end if;
 
         bypass_valid := ex1.e.valid;
-        if (ex2.busy or l_in.busy or fp_in.busy) = '1' and ex1.res2_sel(1) = '1' then
+        if stage2_stall = '1' and ex1.res2_sel(1) = '1' then
             bypass_valid := '0';
         end if;
 
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index ea7baec..bd62f0b 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -624,7 +624,7 @@ begin
             store_data(i * 8 + 7 downto i * 8) <= r1.req.store_data(j + 7 downto j);
         end loop;
 
-        if (dc_stall or d_in.error or r2.busy) = '0' then
+        if (dc_stall or d_in.error or r2.busy or l_in.e2stall) = '0' then
             if r1.req.valid = '0' or r1.issued = '1' or r1.req.dc_req = '0' then
                 v.req := r1.req;
                 v.addr0 := r1.addr0;
@@ -950,7 +950,7 @@ begin
         else
             d_out.data <= r2.req.store_data;
         end if;
-        d_out.hold <= '0';
+        d_out.hold <= l_in.e2stall;
 
         -- Update outputs to MMU
         m_out.valid <= mmureq;
@@ -980,8 +980,8 @@ begin
 
         -- update busy signal back to execute1
         e_out.busy <= busy;
+        e_out.l2stall <= dc_stall or d_in.error or r2.busy;
         e_out.in_progress <= in_progress;
-        e_out.interrupt <= r3.interrupt;
 
         events <= r3.events;
 

From ef122868d55d4681c4823ea9705179a60fc04da6 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 28 Jun 2022 18:18:08 +1000
Subject: [PATCH 08/30] Do CR0 setting for Rc=1 instructions in execute2
 instead of writeback

This lets us forward the CR0 result to following instructions that
use CR, meaning they get to issue one cycle earlier.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 execute1.vhdl  | 67 +++++++++++++++++++++++++++++++++++++++++---------
 writeback.vhdl | 23 +----------------
 2 files changed, 56 insertions(+), 34 deletions(-)

diff --git a/execute1.vhdl b/execute1.vhdl
index e4db56f..75e8275 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -169,7 +169,6 @@ architecture behaviour of execute1 is
     signal muldiv_result: std_ulogic_vector(63 downto 0);
     signal shortmul_result: std_ulogic_vector(63 downto 0);
     signal spr_result: std_ulogic_vector(63 downto 0);
-    signal ex_result: std_ulogic_vector(63 downto 0);
     signal next_nia : std_ulogic_vector(63 downto 0);
     signal s1_sel : std_ulogic_vector(2 downto 0);
 
@@ -799,8 +798,10 @@ begin
                 crnum := fxm_to_num(insn_fxm(e_in.insn));
                 write_cr_mask <= num_to_fxm(crnum);
             end if;
-        else
+        elsif e_in.output_cr = '1' then
             write_cr_mask <= num_to_fxm(crnum);
+        else
+            write_cr_mask <= (others => '0');
         end if;
         for i in 0 to 7 loop
             if write_cr_mask(i) = '0' then
@@ -1471,13 +1472,6 @@ begin
         ctrl.cfar when SPRSEL_CFAR,
         assemble_xer(ex1.e.xerc, ctrl.xer_low) when others;
 
-    -- Second stage result mux
-    with ex1.res2_sel select ex_result <=
-        countbits_result  when "01",
-        spr_result        when "10",
-        pmu_to_x.spr_val  when "11",
-        ex1.e.write_data  when others;
-
     stage2_stall <= l_in.l2stall or fp_in.busy;
 
     -- Second execute stage control
@@ -1489,12 +1483,18 @@ begin
         variable k : integer;
         variable go : std_ulogic;
         variable bypass_valid : std_ulogic;
+        variable rcresult : std_ulogic_vector(63 downto 0);
+        variable sprres : std_ulogic_vector(63 downto 0);
+        variable ex_result : std_ulogic_vector(63 downto 0);
+        variable cr_res : std_ulogic_vector(31 downto 0);
+        variable cr_mask : std_ulogic_vector(7 downto 0);
+        variable sign, zero : std_ulogic;
+        variable rcnz_hi, rcnz_lo : std_ulogic;
     begin
 	v := ex2;
         if stage2_stall = '0' then
             v.e := ex1.e;
             v.se := ex1.se;
-            v.e.write_data := ex_result;
             v.ext_interrupt := ex1.ext_interrupt;
             v.taken_branch_event := ex1.taken_branch_event;
             v.br_mispredict := ex1.br_mispredict;
@@ -1530,7 +1530,49 @@ begin
             v.ext_interrupt := '0';
         end if;
 
+        -- This is split like this because mfspr doesn't have an Rc bit,
+        -- and we don't want the zero-detect logic to be after the
+        -- SPR mux for timing reasons.
+        if ex1.res2_sel(0) = '0' then
+            rcresult := ex1.e.write_data;
+            sprres := spr_result;
+        else
+            rcresult := countbits_result;
+            sprres := pmu_to_x.spr_val;
+        end if;
+        if ex1.res2_sel(1) = '0' then
+            ex_result := rcresult;
+        else
+            ex_result := sprres;
+        end if;
+
+        cr_res := ex1.e.write_cr_data;
+        cr_mask := ex1.e.write_cr_mask;
+        if ex1.e.rc = '1' and ex1.e.write_enable = '1' then
+            rcnz_lo := or (rcresult(31 downto 0));
+            if ex1.e.mode_32bit = '0' then
+                rcnz_hi := or (rcresult(63 downto 32));
+                zero := not (rcnz_hi or rcnz_lo);
+                sign := ex_result(63);
+            else
+                zero := not rcnz_lo;
+                sign := ex_result(31);
+            end if;
+            cr_res(31) := sign;
+            cr_res(30) := not (sign or zero);
+            cr_res(29) := zero;
+            cr_res(28) := ex1.xerc.so;
+            cr_mask(7) := '1';
+        end if;
+
 	if stage2_stall = '0' then
+            v.e.write_data := ex_result;
+            v.e.write_cr_data := cr_res;
+            v.e.write_cr_mask := cr_mask;
+            if ex1.e.rc = '1' and ex1.e.write_enable = '1' and v.e.valid = '1' then
+                v.e.write_cr_enable := '1';
+            end if;
+
             if ex1.se.write_msr = '1' then
                 ctrl_tmp.msr <= ex1.msr;
             end if;
@@ -1575,9 +1617,10 @@ begin
         bypass2_data.tag.tag <= ex1.e.instr_tag.tag;
         bypass2_data.data <= ex_result;
 
-        bypass2_cr_data.tag.valid <= ex1.e.write_cr_enable and bypass_valid;
+        bypass2_cr_data.tag.valid <= (ex1.e.write_cr_enable or (ex1.e.rc and ex1.e.write_enable))
+                                     and bypass_valid;
         bypass2_cr_data.tag.tag <= ex1.e.instr_tag.tag;
-        bypass2_cr_data.data <= ex1.e.write_cr_data;
+        bypass2_cr_data.data <= cr_res;
 
 	-- Update registers
 	ex2in <= v;
diff --git a/writeback.vhdl b/writeback.vhdl
index a99d4d2..db30164 100644
--- a/writeback.vhdl
+++ b/writeback.vhdl
@@ -66,7 +66,7 @@ begin
                     to_integer(unsigned(w))) <= 1 severity failure;
 
             w(0) := e_in.write_cr_enable;
-            x(0) := (e_in.write_enable and e_in.rc);
+            x(0) := l_in.rc;
             y(0) := fp_in.write_cr_enable;
             assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) +
                     to_integer(unsigned(y))) <= 1 severity failure;
@@ -80,9 +80,6 @@ begin
     writeback_1: process(all)
         variable v    : reg_type;
         variable f    : WritebackToFetch1Type;
-        variable cf: std_ulogic_vector(3 downto 0);
-        variable zero : std_ulogic;
-        variable sign : std_ulogic;
         variable scf  : std_ulogic_vector(3 downto 0);
         variable vec  : integer range 0 to 16#fff#;
         variable srr1 : std_ulogic_vector(15 downto 0);
@@ -186,24 +183,6 @@ begin
                 c_out.write_cr_data(31 downto 28) <= scf;
             end if;
 
-            -- Perform CR0 update for RC forms
-            -- Note that loads never have a form with an RC bit, therefore this can test e_in.write_data
-            if e_in.rc = '1' and e_in.write_enable = '1' then
-                zero := not (or e_in.write_data(31 downto 0));
-                if e_in.mode_32bit = '0' then
-                    sign := e_in.write_data(63);
-                    zero := zero and not (or e_in.write_data(63 downto 32));
-                else
-                    sign := e_in.write_data(31);
-                end if;
-                c_out.write_cr_enable <= '1';
-                c_out.write_cr_mask <= num_to_fxm(0);
-                cf(3) := sign;
-                cf(2) := not sign and not zero;
-                cf(1) := zero;
-                cf(0) := e_in.xerc.so;
-                c_out.write_cr_data(31 downto 28) <= cf;
-            end if;
         end if;
 
         -- Outputs to fetch1

From 9a8a8e50f8e886a90315091fe8d9e584c8429493 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 2 Jul 2022 14:17:18 +1000
Subject: [PATCH 09/30] FPU: Add stage-2 stall ability to FPU

This makes the FPU able to stall other units at execute stage 2 and be
stalled by other units (specifically the LSU).

This means that the completion and writeback for an instruction can
now end up being deferred until the second cycle of a following
instruction, i.e. the cycle when the state machine has gone through
IDLE state into one of the DO_* states, which means we need to latch
the destination FPR number, CR mask, etc. from the previous
instruction so that we present the correct information to writeback.

The advantage of this is that we can get rid of the in_progress signal
from the LSU.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl     |   5 +-
 core.vhdl       |   1 +
 execute1.vhdl   |  14 ++--
 fpu.vhdl        | 169 ++++++++++++++++++++++++++++++------------------
 loadstore1.vhdl |   3 -
 5 files changed, 118 insertions(+), 74 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index ac733db..ea6a8d8 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -480,7 +480,6 @@ package common is
     type Loadstore1ToExecute1Type is record
         busy : std_ulogic;
         l2stall : std_ulogic;
-        in_progress : std_ulogic;
     end record;
 
     type Loadstore1ToDcacheType is record
@@ -640,16 +639,18 @@ package common is
         frt     : gspr_index_t;
         rc      : std_ulogic;
         out_cr  : std_ulogic;
+        stall   : std_ulogic;
     end record;
     constant Execute1ToFPUInit : Execute1ToFPUType := (valid => '0', op => OP_ILLEGAL, nia => (others => '0'),
                                                        itag => instr_tag_init,
                                                        insn  => (others => '0'), fe_mode => "00", rc => '0',
                                                        fra => (others => '0'), frb => (others => '0'),
                                                        frc => (others => '0'), frt => (others => '0'),
-                                                       single => '0', out_cr => '0');
+                                                       single => '0', out_cr => '0', stall => '0');
 
     type FPUToExecute1Type is record
         busy      : std_ulogic;
+        f2stall   : std_ulogic;
         exception : std_ulogic;
     end record;
     constant FPUToExecute1Init : FPUToExecute1Type := (others => '0');
diff --git a/core.vhdl b/core.vhdl
index 84604c6..23f7e82 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -384,6 +384,7 @@ begin
             port map (
                 clk => clk,
                 rst => rst_fpu,
+                flush_in => flush,
                 e_in => execute1_to_fpu,
                 e_out => fpu_to_execute1,
                 w_out => fpu_to_writeback
diff --git a/execute1.vhdl b/execute1.vhdl
index 75e8275..57f90b0 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -442,9 +442,9 @@ begin
     -- writeback, unless a pipeline flush has happened in the meantime.
     xerc_in <= ex1.xerc when ex1.xerc_valid = '1' else e_in.xerc;
 
-    with e_in.unit select busy_out <=
-        l_in.busy or l_in.in_progress or ex1.e.valid or ex1.busy or fp_in.busy when FPU,
-        l_in.busy or ex1.busy or fp_in.busy when others;
+    -- N.B. the busy signal from each source includes the
+    -- stage2 stall from that source in it.
+    busy_out <= l_in.busy or ex1.busy or fp_in.busy;
 
     valid_in <= e_in.valid and not (busy_out or flush_in or ex1.e.redirect or ex1.e.interrupt);
 
@@ -1299,8 +1299,7 @@ begin
             end if;
         end if;
 
-        v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or
-                                 ex1.busy or fp_in.busy);
+        v.no_instr_avail := not (e_in.valid or l_in.busy or ex1.busy or fp_in.busy);
 
         go := valid_in and not exception;
         v.instr_dispatch := go;
@@ -1436,7 +1435,7 @@ begin
         lv.is_32bit := e_in.is_32bit;
         lv.repeat := e_in.repeat;
         lv.second := e_in.second;
-        lv.e2stall := '0';
+        lv.e2stall := fp_in.f2stall;
 
         -- Outputs to FPU
         fv.op := e_in.insn_type;
@@ -1451,6 +1450,7 @@ begin
         fv.frt := e_in.write_reg;
         fv.rc := e_in.rc;
         fv.out_cr := e_in.output_cr;
+        fv.stall := l_in.l2stall;
 
 	-- Update registers
 	ex1in <= v;
@@ -1472,7 +1472,7 @@ begin
         ctrl.cfar when SPRSEL_CFAR,
         assemble_xer(ex1.e.xerc, ctrl.xer_low) when others;
 
-    stage2_stall <= l_in.l2stall or fp_in.busy;
+    stage2_stall <= l_in.l2stall or fp_in.f2stall;
 
     -- Second execute stage control
     execute2_1: process(all)
diff --git a/fpu.vhdl b/fpu.vhdl
index fad09cc..a20a7a0 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -15,6 +15,7 @@ entity fpu is
     port (
         clk : in std_ulogic;
         rst : in std_ulogic;
+        flush_in : in std_ulogic;
 
         e_in  : in  Execute1ToFPUType;
         e_out : out FPUToExecute1Type;
@@ -35,7 +36,7 @@ architecture behaviour of fpu is
         mantissa : std_ulogic_vector(63 downto 0);      -- 10.54 format
     end record;
 
-    type state_t is (IDLE,
+    type state_t is (IDLE, DO_ILLEGAL,
                      DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF,
                      DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT,
                      DO_FCFID, DO_FCTI,
@@ -71,7 +72,9 @@ architecture behaviour of fpu is
     type reg_type is record
         state        : state_t;
         busy         : std_ulogic;
+        f2stall      : std_ulogic;
         instr_done   : std_ulogic;
+        complete     : std_ulogic;
         do_intr      : std_ulogic;
         illegal      : std_ulogic;
         op           : insn_type_t;
@@ -83,7 +86,9 @@ architecture behaviour of fpu is
         rc           : std_ulogic;
         is_cmp       : std_ulogic;
         single_prec  : std_ulogic;
+        sp_result    : std_ulogic;
         fpscr        : std_ulogic_vector(31 downto 0);
+        comm_fpscr   : std_ulogic_vector(31 downto 0);  -- committed FPSCR value
         a            : fpu_reg_type;
         b            : fpu_reg_type;
         c            : fpu_reg_type;
@@ -96,13 +101,17 @@ architecture behaviour of fpu is
         result_class : fp_number_class;
         result_exp   : signed(EXP_BITS-1 downto 0);
         shift        : signed(EXP_BITS-1 downto 0);
-        writing_back : std_ulogic;
+        writing_fpr  : std_ulogic;
+        write_reg    : gspr_index_t;
+        complete_tag : instr_tag_t;
+        writing_cr   : std_ulogic;
         int_result   : std_ulogic;
         cr_result    : std_ulogic_vector(3 downto 0);
         cr_mask      : std_ulogic_vector(7 downto 0);
         old_exc      : std_ulogic_vector(4 downto 0);
         update_fprf  : std_ulogic;
         quieten_nan  : std_ulogic;
+        nsnan_result : std_ulogic;
         tiny         : std_ulogic;
         denorm       : std_ulogic;
         round_mode   : std_ulogic_vector(2 downto 0);
@@ -542,17 +551,30 @@ begin
     fpu_0: process(clk)
     begin
         if rising_edge(clk) then
-            if rst = '1' then
+            if rst = '1' or flush_in = '1' then
                 r.state <= IDLE;
                 r.busy <= '0';
+                r.f2stall <= '0';
                 r.instr_done <= '0';
+                r.complete <= '0';
+                r.illegal <= '0';
                 r.do_intr <= '0';
+                r.writing_fpr <= '0';
+                r.writing_cr <= '0';
                 r.fpscr <= (others => '0');
-                r.writing_back <= '0';
-                r.dest_fpr <= (others =>'0');
+                r.write_reg <= (others =>'0');
+                r.complete_tag.valid <= '0';
                 r.cr_mask <= (others =>'0');
                 r.cr_result <= (others =>'0');
                 r.instr_tag.valid <= '0';
+                if rst = '1' then
+                    r.fpscr <= (others => '0');
+                    r.comm_fpscr <= (others => '0');
+                elsif r.do_intr = '0' then
+                    -- flush_in = 1 and not due to us generating an interrupt,
+                    -- roll back to committed fpscr
+                    r.fpscr <= r.comm_fpscr;
+                end if;
             else
                 assert not (r.state /= IDLE and e_in.valid = '1') severity failure;
                 r <= rin;
@@ -577,14 +599,19 @@ begin
     end process;
 
     e_out.busy <= r.busy;
+    e_out.f2stall <= r.f2stall;
     e_out.exception <= r.fpscr(FPSCR_FEX);
 
-    w_out.valid <= r.instr_done and not r.do_intr;
-    w_out.instr_tag <= r.instr_tag;
-    w_out.write_enable <= r.writing_back;
-    w_out.write_reg <= r.dest_fpr;
+    -- Note that the cycle where r.complete = 1 for an instruction can be as
+    -- late as the second cycle of the following instruction (i.e. in the state
+    -- following IDLE state).  Hence it is important that none of the fields of
+    -- r that are used below are modified in IDLE state.
+    w_out.valid <= r.complete;
+    w_out.instr_tag <= r.complete_tag;
+    w_out.write_enable <= r.writing_fpr and r.complete;
+    w_out.write_reg <= r.write_reg;
     w_out.write_data <= fp_result;
-    w_out.write_cr_enable <= r.instr_done and (r.rc or r.is_cmp);
+    w_out.write_cr_enable <= r.writing_cr and r.complete;
     w_out.write_cr_mask <= r.cr_mask;
     w_out.write_cr_data <= r.cr_result & r.cr_result & r.cr_result & r.cr_result &
                            r.cr_result & r.cr_result & r.cr_result & r.cr_result;
@@ -599,7 +626,6 @@ begin
         variable bdec        : fpu_reg_type;
         variable cdec        : fpu_reg_type;
         variable fpscr_mask  : std_ulogic_vector(31 downto 0);
-        variable illegal     : std_ulogic;
         variable j, k        : integer;
         variable flm         : std_ulogic_vector(7 downto 0);
         variable int_input   : std_ulogic;
@@ -644,12 +670,22 @@ begin
         variable maddend     : std_ulogic_vector(127 downto 0);
         variable sum         : std_ulogic_vector(63 downto 0);
         variable round_inc   : std_ulogic_vector(63 downto 0);
+        variable int_result  : std_ulogic;
+        variable illegal     : std_ulogic;
     begin
         v := r;
-        illegal := '0';
-        v.busy := '0';
+        v.complete := '0';
+        v.do_intr := '0';
         int_input := '0';
 
+        if r.complete = '1' or r.do_intr = '1' then
+            v.instr_done := '0';
+            v.writing_fpr := '0';
+            v.writing_cr := '0';
+            v.comm_fpscr := r.fpscr;
+            v.illegal := '0';
+        end if;
+
         -- capture incoming instruction
         if e_in.valid = '1' then
             v.insn := e_in.insn;
@@ -660,14 +696,8 @@ begin
             v.dest_fpr := e_in.frt;
             v.single_prec := e_in.single;
             v.longmask := e_in.single;
-            v.int_result := '0';
             v.rc := e_in.rc;
             v.is_cmp := e_in.out_cr;
-            if e_in.out_cr = '0' then
-                v.cr_mask := num_to_fxm(1);
-            else
-                v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(e_in.insn))));
-            end if;
             int_input := '0';
             if e_in.op = OP_FPOP_I then
                 int_input := '1';
@@ -741,8 +771,6 @@ begin
             pcmpb_lt := '1';
         end if;
 
-        v.writing_back := '0';
-        v.instr_done := '0';
         v.update_fprf := '0';
         v.shift := to_signed(0, EXP_BITS);
         v.first := '0';
@@ -777,6 +805,8 @@ begin
         pshift := '0';
         renorm_sqrt := '0';
         shiftin := '0';
+        int_result := '0';
+        illegal := '0';
         case r.state is
             when IDLE =>
                 v.use_a := '0';
@@ -785,6 +815,7 @@ begin
                 v.invalid := '0';
                 v.negate := '0';
                 if e_in.valid = '1' then
+                    v.busy := '1';
                     case e_in.insn(5 downto 1) is
                         when "00000" =>
                             if e_in.insn(8) = '1' then
@@ -876,13 +907,17 @@ begin
                             end if;
                             v.state := DO_FMADD;
                         when others =>
-                            illegal := '1';
+                            v.state := DO_ILLEGAL;
                     end case;
                 end if;
                 v.x := '0';
                 v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX);
                 set_s := '1';
 
+            when DO_ILLEGAL =>
+                illegal := '1';
+                v.instr_done := '1';
+
             when DO_MCRFS =>
                 j := to_integer(unsigned(insn_bfa(r.insn)));
                 for i in 0 to 7 loop
@@ -894,11 +929,9 @@ begin
                 end loop;
                 v.fpscr := r.fpscr and (fpscr_mask or x"6007F8FF");
                 v.instr_done := '1';
-                v.state := IDLE;
 
             when DO_FTDIV =>
                 v.instr_done := '1';
-                v.state := IDLE;
                 v.cr_result := "0000";
                 if r.a.class = INFINITY or r.b.class = ZERO or r.b.class = INFINITY or
                     (r.b.class = FINITE and r.b.mantissa(53) = '0') then
@@ -917,7 +950,6 @@ begin
 
             when DO_FTSQRT =>
                 v.instr_done := '1';
-                v.state := IDLE;
                 v.cr_result := "0000";
                 if r.b.class = ZERO or r.b.class = INFINITY or
                     (r.b.class = FINITE and r.b.mantissa(53) = '0') then
@@ -932,7 +964,6 @@ begin
                 -- fcmp[uo]
                 -- r.opsel_a = AIN_B
                 v.instr_done := '1';
-                v.state := IDLE;
                 update_fx := '1';
                 v.result_exp := r.b.exponent;
                 if (r.a.class = NAN and r.a.mantissa(53) = '0') or
@@ -993,7 +1024,6 @@ begin
                     end if;
                 end loop;
                 v.instr_done := '1';
-                v.state := IDLE;
 
             when DO_MTFSFI =>
                 -- mtfsfi
@@ -1007,20 +1037,17 @@ begin
                     end loop;
                 end if;
                 v.instr_done := '1';
-                v.state := IDLE;
 
             when DO_FMRG =>
                 -- fmrgew, fmrgow
                 opsel_r <= RES_MISC;
                 misc_sel <= "01" & r.insn(8) & '0';
-                v.int_result := '1';
-                v.writing_back := '1';
+                int_result := '1';
+                v.writing_fpr := '1';
                 v.instr_done := '1';
-                v.state := IDLE;
 
             when DO_MFFS =>
-                v.int_result := '1';
-                v.writing_back := '1';
+                v.writing_fpr := '1';
                 opsel_r <= RES_MISC;
                 case r.insn(20 downto 16) is
                     when "00000" =>
@@ -1044,10 +1071,11 @@ begin
                         -- mffsl
                         fpscr_mask := x"0007F0FF";
                     when others =>
-                        illegal := '1';
+                        v.illegal := '1';
+                        v.writing_fpr := '0';
                 end case;
+                int_result := '1';
                 v.instr_done := '1';
-                v.state := IDLE;
 
             when DO_MTFSF =>
                 if r.insn(25) = '1' then
@@ -1064,7 +1092,6 @@ begin
                     end if;
                 end loop;
                 v.instr_done := '1';
-                v.state := IDLE;
 
             when DO_FMR =>
                 -- r.opsel_a = AIN_B
@@ -1082,9 +1109,8 @@ begin
                 else
                     v.result_sign := r.a.negative;     -- fcpsgn
                 end if;
-                v.writing_back := '1';
+                v.writing_fpr := '1';
                 v.instr_done := '1';
-                v.state := IDLE;
 
             when DO_FRI =>    -- fri[nzpm]
                 -- r.opsel_a = AIN_B
@@ -1153,7 +1179,7 @@ begin
                     invalid := '1';
                 end if;
 
-                v.int_result := '1';
+                int_result := '1';
                 case r.b.class is
                     when ZERO =>
                         arith_done := '1';
@@ -1671,7 +1697,6 @@ begin
                 end if;
                 v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result;
                 v.instr_done := '1';
-                v.state := IDLE;
 
             when MULT_1 =>
                 f_to_multiply.valid <= r.first;
@@ -1849,7 +1874,6 @@ begin
                 v.cr_result(1) := exp_tiny or exp_huge;
                 if exp_tiny = '1' or exp_huge = '1' or r.a.class = ZERO or r.first = '0' then
                     v.instr_done := '1';
-                    v.state := IDLE;
                 else
                     v.shift := r.a.exponent;
                     v.doing_ftdiv := "10";
@@ -2054,6 +2078,7 @@ begin
                     when others =>      -- fctidu[z]
                         need_check := r.r(63);
                 end case;
+                int_result := '1';
                 if need_check = '1' then
                     v.state := INT_CHECK;
                 else
@@ -2080,6 +2105,7 @@ begin
                         v.fpscr(FPSCR_XX) := '1';
                     end if;
                 end if;
+                int_result := '1';
                 arith_done := '1';
 
             when INT_OFLOW =>
@@ -2090,6 +2116,7 @@ begin
                 end if;
                 v.fpscr(FPSCR_VXCVI) := '1';
                 invalid := '1';
+                int_result := '1';
                 arith_done := '1';
 
             when FRI_1 =>
@@ -2306,11 +2333,10 @@ begin
             -- Neither does enabled zero-divide exception
             if (v.invalid and r.fpscr(FPSCR_VE)) = '0' and
                 (zero_divide and r.fpscr(FPSCR_ZE)) = '0' then
-                v.writing_back := '1';
+                v.writing_fpr := '1';
                 v.update_fprf := '1';
             end if;
             v.instr_done := '1';
-            v.state := IDLE;
             update_fx := '1';
         end if;
 
@@ -2530,12 +2556,6 @@ begin
             v.shift := resize(signed('0' & clz) - 9, EXP_BITS);
         end if;
 
-        if r.int_result = '1' then
-            fp_result <= r.r;
-        else
-            fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r,
-                                 r.single_prec, r.quieten_nan);
-        end if;
         if r.update_fprf = '1' then
             v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.result_sign, r.result_class,
                                                              r.r(54) and not r.denorm);
@@ -2549,24 +2569,49 @@ begin
             (v.fpscr(FPSCR_VX downto FPSCR_XX) and not r.old_exc) /= "00000" then
             v.fpscr(FPSCR_FX) := '1';
         end if;
-        if r.rc = '1' then
-            v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX);
-        end if;
 
-        v.illegal := illegal;
-        if illegal = '1' then
-            v.instr_done := '0';
-            v.do_intr := '1';
-            v.writing_back := '0';
-            v.busy := '0';
-            v.state := IDLE;
+        if v.instr_done = '1' then
+            if r.state /= IDLE then
+                v.state := IDLE;
+                v.busy := '0';
+                v.f2stall := '0';
+                if r.rc = '1' then
+                    v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX);
+                end if;
+                v.sp_result := r.single_prec;
+                v.int_result := int_result;
+                v.illegal := illegal;
+                v.nsnan_result := v.quieten_nan;
+                if r.is_cmp = '0' then
+                    v.cr_mask := num_to_fxm(1);
+                else
+                    v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(r.insn))));
+                end if;
+                v.writing_cr := r.is_cmp or r.rc;
+                v.write_reg := r.dest_fpr;
+                v.complete_tag := r.instr_tag;
+            end if;
+            if e_in.stall = '0' then
+                v.complete := not v.illegal;
+                v.do_intr := (v.fpscr(FPSCR_FEX) and r.fe_mode) or v.illegal;
+            end if;
+            -- N.B. We rely on execute1 to prevent any new instruction
+            -- coming in while e_in.stall = 1, without us needing to
+            -- have busy asserted.
         else
-            v.do_intr := v.instr_done and v.fpscr(FPSCR_FEX) and r.fe_mode;
-            if v.state /= IDLE or v.do_intr = '1' then
-                v.busy := '1';
+            if r.state /= IDLE and e_in.stall = '0' then
+                v.f2stall := '1';
             end if;
         end if;
 
+        -- This mustn't depend on any fields of r that are modified in IDLE state.
+        if r.int_result = '1' then
+            fp_result <= r.r;
+        else
+            fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r,
+                                 r.sp_result, r.nsnan_result);
+        end if;
+
         rin <= v;
     end process;
 
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index bd62f0b..ff2633b 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -159,7 +159,6 @@ architecture behave of loadstore1 is
     signal flush    : std_ulogic;
     signal busy     : std_ulogic;
     signal complete : std_ulogic;
-    signal in_progress : std_ulogic;
     signal flushing : std_ulogic;
 
     signal store_sp_data : std_ulogic_vector(31 downto 0);
@@ -523,7 +522,6 @@ begin
 
     busy <= dc_stall or d_in.error or r1.busy or r2.busy;
     complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or r3.complete;
-    in_progress <= r1.req.valid or (r2.req.valid and not complete);
 
     -- Processing done in the first cycle of a load/store instruction
     loadstore1_1: process(all)
@@ -981,7 +979,6 @@ begin
         -- update busy signal back to execute1
         e_out.busy <= busy;
         e_out.l2stall <= dc_stall or d_in.error or r2.busy;
-        e_out.in_progress <= in_progress;
 
         events <= r3.events;
 

From ebe1caab85c35497e733c566fc9750813f505e5d Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 2 Jul 2022 22:23:35 +1000
Subject: [PATCH 10/30] decode1: Reduce number of single-issue instructions

This reduces the set of instructions marked as single-issue to just
attn and mtspr to "slow" SPRs (those that are not stored in the
register file).

The instructions that were previously single-issue are: isync, dcbf,
dcbst, dcbt, dcbtst, eieio, icbi, mfmsr, mtmsr, mtmsrd, mfspr to slow
SPRs, sync, tlbsync and wait.  The synchronization instructions are
mostly no-ops anyway due to the in-order nature of the core, and the
cache-management instructions are unimplemented (except for icbi).
The MSR ops don't need to be single-issue due to the in-order core and
the fact that MSR updates are effective on the following instruction.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index fb92b9e..3f3109f 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -170,7 +170,7 @@ architecture behaviour of decode1 is
         -- bclr, bcctr, bctar
         2#100#    =>       (ALU, NONE, OP_BCREG,     SPR,        SPR,         NONE, SPR,  '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
         -- isync
-        2#111#    =>       (ALU, NONE, OP_ISYNC,     NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE),
+        2#111#    =>       (ALU, NONE, OP_ISYNC,     NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         -- rfid
         2#101#    =>       (ALU, NONE, OP_RFID,      SPR,        SPR,         NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         others   => illegal_inst
@@ -223,10 +223,10 @@ architecture behaviour of decode1 is
         2#1000111010#  =>       (ALU,  NONE, OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- cnttzd
         2#1000011010#  =>       (ALU,  NONE, OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- cnttzw
         2#1011110011#  =>       (ALU,  NONE, OP_DARN,      NONE,       NONE,        NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- darn
-        2#0001010110#  =>       (ALU,  NONE, OP_DCBF,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- dcbf
-        2#0000110110#  =>       (ALU,  NONE, OP_DCBST,     NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- dcbst
-        2#0100010110#  =>       (ALU,  NONE, OP_DCBT,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- dcbt
-        2#0011110110#  =>       (ALU,  NONE, OP_DCBTST,    NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- dcbtst
+        2#0001010110#  =>       (ALU,  NONE, OP_DCBF,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbf
+        2#0000110110#  =>       (ALU,  NONE, OP_DCBST,     NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbst
+        2#0100010110#  =>       (ALU,  NONE, OP_DCBT,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbt
+        2#0011110110#  =>       (ALU,  NONE, OP_DCBTST,    NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbtst
         2#1111110110#  =>       (LDST, NONE, OP_DCBZ,      RA_OR_ZERO, RB,          NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbz
         2#0110001001#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- divdeu
         2#1110001001#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- divdeuo
@@ -247,7 +247,7 @@ architecture behaviour of decode1 is
         2#1100110110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dss
         2#0101010110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dst
         2#0101110110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dstst
-        2#1101010110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- eieio
+        2#1101010110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- eieio
         2#0100011100#  =>       (ALU,  NONE, OP_XOR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- eqv
         2#1110111010#  =>       (ALU,  NONE, OP_EXTS,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- extsb
         2#1110011010#  =>       (ALU,  NONE, OP_EXTS,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- extsh
@@ -327,8 +327,8 @@ architecture behaviour of decode1 is
         2#1100001001#  =>       (ALU,  NONE, OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- modsd
         2#1100001011#  =>       (ALU,  NONE, OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', NONE, '0', '0', NONE), -- modsw
         2#0010010000#  =>       (ALU,  NONE, OP_MTCRF,     NONE,       NONE,        RS,   NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtcrf/mtocrf
-        2#0010010010#  =>       (ALU,  NONE, OP_MTMSRD,    NONE,       NONE,        RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1', NONE), -- mtmsr
-        2#0010110010#  =>       (ALU,  NONE, OP_MTMSRD,    NONE,       NONE,        RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- mtmsrd # ignore top bits and d
+        2#0010010010#  =>       (ALU,  NONE, OP_MTMSRD,    NONE,       NONE,        RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- mtmsr
+        2#0010110010#  =>       (ALU,  NONE, OP_MTMSRD,    NONE,       NONE,        RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtmsrd # ignore top bits and d
         2#0111010011#  =>       (ALU,  NONE, OP_MTSPR,     NONE,       NONE,        RS,   SPR,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtspr
         2#0001001001#  =>       (ALU,  NONE, OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- mulhd
         2#0000001001#  =>       (ALU,  NONE, OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- mulhdu
@@ -409,13 +409,13 @@ architecture behaviour of decode1 is
         2#1011101000#  =>       (ALU,  NONE, OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- subfmeo
         2#0011001000#  =>       (ALU,  NONE, OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- subfze
         2#1011001000#  =>       (ALU,  NONE, OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- subfzeo
-        2#1001010110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- sync
+        2#1001010110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sync
         2#0001000100#  =>       (ALU,  NONE, OP_TRAP,      RA,         RB,          NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- td
         2#0000000100#  =>       (ALU,  NONE, OP_TRAP,      RA,         RB,          NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- tw
         2#0100110010#  =>       (LDST, NONE, OP_TLBIE,     NONE,       RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- tlbie
         2#0100010010#  =>       (LDST, NONE, OP_TLBIE,     NONE,       RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- tlbiel
-        2#1000110110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- tlbsync
-        2#0000011110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- wait
+        2#1000110110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- tlbsync
+        2#0000011110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- wait
         2#0100111100#  =>       (ALU,  NONE, OP_XOR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- xor
         others => illegal_inst
 	);
@@ -640,9 +640,9 @@ begin
 
             if std_match(f_in.insn(10 downto 1), "01-1010011") then
                 -- mfspr or mtspr
-                -- Make slow SPRs single issue
                 if is_fast_spr(v.ispr1) = '0' then
-                    vi.force_single := '1';
+                    -- Make mtspr to slow SPRs single issue
+                    vi.force_single := f_in.insn(8);
                     -- send MMU-related SPRs to loadstore1
                     case sprn is
                         when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PTCR =>

From 0bd1e24024879ae6a30f29ae8a6a47e169551096 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 4 Jul 2022 18:23:03 +1000
Subject: [PATCH 11/30] decode2: Rename 'r' to 'dc2'

Also get rid of a couple of unused variables.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode2.vhdl | 43 +++++++++++++++++++------------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/decode2.vhdl b/decode2.vhdl
index c290c98..1d4ce57 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -52,7 +52,7 @@ architecture behaviour of decode2 is
         repeat : std_ulogic;
     end record;
 
-    signal r, rin : reg_type;
+    signal dc2, dc2in : reg_type;
 
     signal deferred : std_ulogic;
 
@@ -302,7 +302,7 @@ begin
 
             complete_in => complete_in,
             valid_in    => control_valid_in,
-            repeated    => r.repeat,
+            repeated    => dc2.repeat,
             busy_in     => busy_in,
             deferred    => deferred,
             flush_in    => flush_in,
@@ -341,16 +341,16 @@ begin
             instr_tag_out => instr_tag
             );
 
-    deferred <= r.e.valid and busy_in;
+    deferred <= dc2.e.valid and busy_in;
 
     decode2_0: process(clk)
     begin
         if rising_edge(clk) then
             if rst = '1' or flush_in = '1' or deferred = '0' then
-                if rin.e.valid = '1' then
-                    report "execute " & to_hstring(rin.e.nia);
+                if dc2in.e.valid = '1' then
+                    report "execute " & to_hstring(dc2in.e.nia);
                 end if;
-                r <= rin;
+                dc2 <= dc2in;
             end if;
         end if;
     end process;
@@ -359,8 +359,6 @@ begin
 
     decode2_1: process(all)
         variable v : reg_type;
-        variable mul_a : std_ulogic_vector(63 downto 0);
-        variable mul_b : std_ulogic_vector(63 downto 0);
         variable decoded_reg_a : decode_input_reg_t;
         variable decoded_reg_b : decode_input_reg_t;
         variable decoded_reg_c : decode_input_reg_t;
@@ -368,13 +366,10 @@ begin
         variable length : std_ulogic_vector(3 downto 0);
         variable op : insn_type_t;
     begin
-        v := r;
+        v := dc2;
 
         v.e := Decode2ToExecute1Init;
 
-        mul_a := (others => '0');
-        mul_b := (others => '0');
-
         --v.e.input_cr := d_in.decode.input_cr;
         v.e.output_cr := d_in.decode.output_cr;
 
@@ -409,21 +404,21 @@ begin
 
         if d_in.decode.repeat /= NONE then
             v.e.repeat := '1';
-            v.e.second := r.repeat;
+            v.e.second := dc2.repeat;
             case d_in.decode.repeat is
                 when DRSE =>
                     -- do RS|1,RS for LE; RS,RS|1 for BE
-                    if r.repeat = d_in.big_endian then
+                    if dc2.repeat = d_in.big_endian then
                         decoded_reg_c.reg(0) := '1';
                     end if;
                 when DRTE =>
                     -- do RT|1,RT for LE; RT,RT|1 for BE
-                    if r.repeat = d_in.big_endian then
+                    if dc2.repeat = d_in.big_endian then
                         decoded_reg_o.reg(0) := '1';
                     end if;
                 when DUPD =>
                     -- update-form loads, 2nd instruction writes RA
-                    if r.repeat = '1' then
+                    if dc2.repeat = '1' then
                         decoded_reg_o.reg := decoded_reg_a.reg;
                     end if;
                 when others =>
@@ -431,9 +426,9 @@ begin
         elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then
             -- bcl/bclrl/bctarl that needs to write both CTR and LR has to be doubled
             v.e.repeat := '1';
-            v.e.second := r.repeat;
+            v.e.second := dc2.repeat;
             -- first one does CTR, second does LR
-            decoded_reg_o.reg(0) := not r.repeat;
+            decoded_reg_o.reg(0) := not dc2.repeat;
         end if;
 
         v.e.spr_select := d_in.spr_info;
@@ -487,7 +482,7 @@ begin
         v.e.result_sel := result_select(op);
         v.e.sub_select := subresult_select(op);
         if op = OP_BC or op = OP_BCREG then
-            if d_in.insn(23) = '0' and r.repeat = '0' and
+            if d_in.insn(23) = '0' and dc2.repeat = '0' and
                 not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then
                 -- decrement CTR if BO(2) = 0 and not bcctr
                 v.e.addm1 := '1';
@@ -562,7 +557,7 @@ begin
 
         v.e.valid := control_valid_out;
         if control_valid_out = '1' then
-            v.repeat := v.e.repeat and not r.repeat;
+            v.repeat := v.e.repeat and not dc2.repeat;
         end if;
 
         stall_out <= control_stall_out or v.repeat;
@@ -573,10 +568,10 @@ begin
         end if;
 
         -- Update registers
-        rin <= v;
+        dc2in <= v;
 
         -- Update outputs
-        e_out <= r.e;
+        e_out <= dc2.e;
     end process;
 
     d2_log: if LOG_LENGTH > 0 generate
@@ -585,8 +580,8 @@ begin
         dec2_log : process(clk)
         begin
             if rising_edge(clk) then
-                log_data <= r.e.nia(5 downto 2) &
-                            r.e.valid &
+                log_data <= dc2.e.nia(5 downto 2) &
+                            dc2.e.valid &
                             stopped_out &
                             stall_out &
                             (gpr_a_bypass(1) or gpr_a_bypass(0)) &

From c9e838b6560fb7981062fef2762762e9cf4e748f Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 4 Jun 2022 17:37:48 +1000
Subject: [PATCH 12/30] Remove support for lq, stq, lqarx and stqcx.

They are optional in SFFS (scalar fixed-point and floating-point
subset), are not needed for running Linux, and add complexity, so
remove them.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl                       |  17 ---
 decode2.vhdl                       |  10 --
 decode_types.vhdl                  |   2 -
 loadstore1.vhdl                    |  11 --
 tests/modes/head.S                 |  60 ----------
 tests/modes/modes.c                | 171 -----------------------------
 tests/reservation/head.S           |  28 -----
 tests/reservation/reservation.c    |  62 -----------
 tests/test_modes.bin               | Bin 20520 -> 20520 bytes
 tests/test_modes.console_out       |   2 -
 tests/test_reservation.bin         | Bin 11604 -> 10888 bytes
 tests/test_reservation.console_out |   1 -
 12 files changed, 364 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 3f3109f..b807054 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -93,7 +93,6 @@ architecture behaviour of decode1 is
         43 =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhau
         40 =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhz
         41 =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhzu
-        56 =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_DQ,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRTE), -- lq
         32 =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwz
         33 =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lwzu
          7 =>       (ALU,  NONE, OP_MUL_L64,   RA,         CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- mulli
@@ -310,7 +309,6 @@ architecture behaviour of decode1 is
         2#1100110101#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhzcix
         2#0100110111#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhzux
         2#0100010111#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhzx
-        2#0100010100#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', DRTE), -- lqarx
         2#0000010100#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), -- lwarx
         2#0101110101#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lwaux
         2#0101010101#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwax
@@ -393,7 +391,6 @@ architecture behaviour of decode1 is
         2#1011010110#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0', NONE), -- sthcx
         2#0110110111#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- sthux
         2#0110010111#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sthx
-        2#0010110110#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0', DRSE), -- stqcx
         2#1010010110#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stwbrx
         2#1110010101#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stwcix
         2#0010010110#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0', NONE), -- stwcx
@@ -452,7 +449,6 @@ architecture behaviour of decode1 is
         --                                op                                           in   out   A   out  in    out  len        ext                                 pipe
         0     =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, CONST_DS,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- std
         1     =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, CONST_DS,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stdu
-        2     =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, CONST_DS,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRSE), -- stq
         others   => decode_rom_init
         );
 
@@ -652,13 +648,6 @@ begin
                     end case;
                 end if;
             end if;
-            if std_match(f_in.insn(10 downto 1), "0100010100") then
-                -- lqarx, illegal if RA = RT or RB = RT
-                if f_in.insn(25 downto 21) = f_in.insn(20 downto 16) or
-                    f_in.insn(25 downto 21) = f_in.insn(15 downto 11) then
-                    vi.override := '1';
-                end if;
-            end if;
 
         when 16 =>
             -- CTR may be needed as input to bc
@@ -722,12 +711,6 @@ begin
         when 30 =>
             v.decode := decode_op_30_array(to_integer(unsigned(f_in.insn(4 downto 1))));
 
-        when 56 =>
-            -- lq, illegal if RA = RT
-            if f_in.insn(25 downto 21) = f_in.insn(20 downto 16) then
-                vi.override := '1';
-            end if;
-
         when 58 =>
             v.decode := decode_op_58_array(to_integer(unsigned(f_in.insn(1 downto 0))));
 
diff --git a/decode2.vhdl b/decode2.vhdl
index 1d4ce57..371c48c 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -406,16 +406,6 @@ begin
             v.e.repeat := '1';
             v.e.second := dc2.repeat;
             case d_in.decode.repeat is
-                when DRSE =>
-                    -- do RS|1,RS for LE; RS,RS|1 for BE
-                    if dc2.repeat = d_in.big_endian then
-                        decoded_reg_c.reg(0) := '1';
-                    end if;
-                when DRTE =>
-                    -- do RT|1,RT for LE; RT,RT|1 for BE
-                    if dc2.repeat = d_in.big_endian then
-                        decoded_reg_o.reg(0) := '1';
-                    end if;
                 when DUPD =>
                     -- update-form loads, 2nd instruction writes RA
                     if dc2.repeat = '1' then
diff --git a/decode_types.vhdl b/decode_types.vhdl
index 885cc91..514bc08 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -53,8 +53,6 @@ package decode_types is
     type length_t is (NONE, is1B, is2B, is4B, is8B);
 
     type repeat_t is (NONE,      -- instruction is not repeated
-                      DRSE,      -- double RS, endian twist
-                      DRTE,      -- double RT, endian twist
                       DUPD);     -- update-form load
 
     type decode_rom_t is record
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index ff2633b..7fad454 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -458,17 +458,6 @@ begin
         -- check alignment for larx/stcx
         misaligned := or (addr_mask and addr(2 downto 0));
         v.align_intr := l_in.reserve and misaligned;
-        if l_in.repeat = '1' and l_in.second = '0' and l_in.update = '0' and addr(3) = '1' then
-            -- length is really 16 not 8
-            -- Make misaligned lq cause an alignment interrupt in LE mode,
-            -- in order to avoid the case with RA = RT + 1 where the second half
-            -- faults but the first doesn't (and updates RT+1, destroying RA).
-            -- The equivalent BE case doesn't occur because RA = RT is illegal.
-            misaligned := '1';
-            if l_in.reserve = '1' or (l_in.op = OP_LOAD and l_in.byte_reverse = '0') then
-                v.align_intr := '1';
-            end if;
-        end if;
 
         v.atomic := not misaligned;
         v.atomic_last := not misaligned and (l_in.second or not l_in.repeat);
diff --git a/tests/modes/head.S b/tests/modes/head.S
index 8b00bdd..d9e69dc 100644
--- a/tests/modes/head.S
+++ b/tests/modes/head.S
@@ -230,63 +230,3 @@ restore:
 	ld	%r0,16(%r1)
 	mtlr	%r0
 	blr
-
-	.global	do_lq
-do_lq:
-	lq	%r6,0(%r3)
-	std	%r6,0(%r4)
-	std	%r7,8(%r4)
-	li	%r3,0
-	blr
-
-	.global	do_lq_np	/* "non-preferred" form of lq */
-do_lq_np:
-	mr	%r7,%r3
-	lq	%r6,0(%r7)
-	std	%r6,0(%r4)
-	std	%r7,8(%r4)
-	li	%r3,0
-	blr
-
-	.global	do_lq_bad	/* illegal form of lq */
-do_lq_bad:
-	mr	%r6,%r3
-	.long	0xe0c60000	/* lq %r6,0(%r6) */
-	std	%r6,0(%r4)
-	std	%r7,8(%r4)
-	li	%r3,0
-	blr
-
-	.global	do_stq
-do_stq:
-	ld	%r8,0(%r4)
-	ld	%r9,8(%r4)
-	stq	%r8,0(%r3)
-	li	%r3,0
-	blr
-
-	/* big-endian versions of the above */
-	.global	do_lq_be
-do_lq_be:
-	.long	0x0000c3e0
-	.long	0x0000c4f8
-	.long	0x0800e4f8
-	.long	0x00006038
-	.long	0x2000804e
-
-	.global	do_lq_np_be	/* "non-preferred" form of lq */
-do_lq_np_be:
-	.long	0x781b677c
-	.long	0x0000c7e0
-	.long	0x0000c4f8
-	.long	0x0800e4f8
-	.long	0x00006038
-	.long	0x2000804e
-
-	.global	do_stq_be
-do_stq_be:
-	.long	0x000004e9
-	.long	0x080024e9
-	.long	0x020003f9
-	.long	0x00006038
-	.long	0x2000804e
diff --git a/tests/modes/modes.c b/tests/modes/modes.c
index b94bb47..fa4872c 100644
--- a/tests/modes/modes.c
+++ b/tests/modes/modes.c
@@ -12,14 +12,6 @@
 extern unsigned long callit(unsigned long arg1, unsigned long arg2,
 			    unsigned long fn, unsigned long msr);
 
-extern void do_lq(void *src, unsigned long *regs);
-extern void do_lq_np(void *src, unsigned long *regs);
-extern void do_lq_bad(void *src, unsigned long *regs);
-extern void do_stq(void *dst, unsigned long *regs);
-extern void do_lq_be(void *src, unsigned long *regs);
-extern void do_lq_np_be(void *src, unsigned long *regs);
-extern void do_stq_be(void *dst, unsigned long *regs);
-
 static inline void do_tlbie(unsigned long rb, unsigned long rs)
 {
 	__asm__ volatile("tlbie %0,%1" : : "r" (rb), "r" (rs) : "memory");
@@ -302,167 +294,6 @@ int mode_test_6(void)
 	return 0;
 }
 
-int mode_test_7(void)
-{
-	unsigned long quad[4] __attribute__((__aligned__(16)));
-	unsigned long regs[2];
-	unsigned long ret, msr;
-
-	/*
-	 * Test lq/stq in LE mode
-	 */
-	msr = MSR_SF | MSR_LE;
-	quad[0] = 0x123456789abcdef0ul;
-	quad[1] = 0xfafa5959bcbc3434ul;
-	ret = callit((unsigned long)quad, (unsigned long)regs,
-		     (unsigned long)&do_lq, msr);
-	if (ret)
-		return ret | 1;
-	if (regs[0] != quad[1] || regs[1] != quad[0])
-		return 2;
-	/* unaligned may give alignment interrupt */
-	quad[2] = 0x0011223344556677ul;
-	ret = callit((unsigned long)&quad[1], (unsigned long)regs,
-		     (unsigned long)&do_lq, msr);
-	if (ret == 0) {
-		if (regs[0] != quad[2] || regs[1] != quad[1])
-			return 3;
-	} else if (ret == 0x600) {
-		if (mfspr(SPRG0) != (unsigned long) &do_lq ||
-		    mfspr(DAR) != (unsigned long) &quad[1])
-			return ret | 4;
-	} else
-		return ret | 5;
-
-	/* try stq */
-	regs[0] = 0x5238523852385238ul;
-	regs[1] = 0x5239523952395239ul;
-	ret = callit((unsigned long)quad, (unsigned long)regs,
-		     (unsigned long)&do_stq, msr);
-	if (ret)
-		return ret | 5;
-	if (quad[0] != regs[1] || quad[1] != regs[0])
-		return 6;
-	regs[0] = 0x0172686966746564ul;
-	regs[1] = 0xfe8d0badd00dabcdul;
-	ret = callit((unsigned long)quad + 1, (unsigned long)regs,
-		     (unsigned long)&do_stq, msr);
-	if (ret)
-		return ret | 7;
-	if (((quad[0] >> 8) | (quad[1] << 56)) != regs[1] ||
-	    ((quad[1] >> 8) | (quad[2] << 56)) != regs[0])
-		return 8;
-
-	/* try lq non-preferred form */
-	quad[0] = 0x56789abcdef01234ul;
-	quad[1] = 0x5959bcbc3434fafaul;
-	ret = callit((unsigned long)quad, (unsigned long)regs,
-		     (unsigned long)&do_lq_np, msr);
-	if (ret)
-		return ret | 9;
-	if (regs[0] != quad[1] || regs[1] != quad[0])
-		return 10;
-	/* unaligned should give alignment interrupt in uW implementation */
-	quad[2] = 0x6677001122334455ul;
-	ret = callit((unsigned long)&quad[1], (unsigned long)regs,
-		     (unsigned long)&do_lq_np, msr);
-	if (ret == 0x600) {
-		if (mfspr(SPRG0) != (unsigned long) &do_lq_np + 4 ||
-		    mfspr(DAR) != (unsigned long) &quad[1])
-			return ret | 11;
-	} else
-		return 12;
-
-	/* make sure lq with rt = ra causes an illegal instruction interrupt */
-	ret = callit((unsigned long)quad, (unsigned long)regs,
-		     (unsigned long)&do_lq_bad, msr);
-	if (ret != 0x700)
-		return 13;
-	if (mfspr(SPRG0) != (unsigned long)&do_lq_bad + 4 ||
-	    !(mfspr(SPRG3) & 0x80000))
-		return 14;
-	return 0;
-}
-
-int mode_test_8(void)
-{
-	unsigned long quad[4] __attribute__((__aligned__(16)));
-	unsigned long regs[2];
-	unsigned long ret, msr;
-
-	/*
-	 * Test lq/stq in BE mode
-	 */
-	msr = MSR_SF;
-	quad[0] = 0x123456789abcdef0ul;
-	quad[1] = 0xfafa5959bcbc3434ul;
-	ret = callit((unsigned long)quad, (unsigned long)regs,
-		     (unsigned long)&do_lq_be, msr);
-	if (ret)
-		return ret | 1;
-	if (regs[0] != quad[0] || regs[1] != quad[1]) {
-		print_hex(regs[0], 16);
-		print_string(" ");
-		print_hex(regs[1], 16);
-		print_string(" ");
-		return 2;
-	}
-	/* don't expect alignment interrupt */
-	quad[2] = 0x0011223344556677ul;
-	ret = callit((unsigned long)&quad[1], (unsigned long)regs,
-		     (unsigned long)&do_lq_be, msr);
-	if (ret == 0) {
-		if (regs[0] != quad[1] || regs[1] != quad[2])
-			return 3;
-	} else
-		return ret | 5;
-
-	/* try stq */
-	regs[0] = 0x5238523852385238ul;
-	regs[1] = 0x5239523952395239ul;
-	ret = callit((unsigned long)quad, (unsigned long)regs,
-		     (unsigned long)&do_stq_be, msr);
-	if (ret)
-		return ret | 5;
-	if (quad[0] != regs[0] || quad[1] != regs[1])
-		return 6;
-	regs[0] = 0x0172686966746564ul;
-	regs[1] = 0xfe8d0badd00dabcdul;
-	ret = callit((unsigned long)quad + 1, (unsigned long)regs,
-		     (unsigned long)&do_stq_be, msr);
-	if (ret)
-		return ret | 7;
-	if (((quad[0] >> 8) | (quad[1] << 56)) != regs[0] ||
-	    ((quad[1] >> 8) | (quad[2] << 56)) != regs[1]) {
-			print_hex(quad[0], 16);
-			print_string(" ");
-			print_hex(quad[1], 16);
-			print_string(" ");
-			print_hex(quad[2], 16);
-			print_string(" ");
-		return 8;
-	}
-
-	/* try lq non-preferred form */
-	quad[0] = 0x56789abcdef01234ul;
-	quad[1] = 0x5959bcbc3434fafaul;
-	ret = callit((unsigned long)quad, (unsigned long)regs,
-		     (unsigned long)&do_lq_np_be, msr);
-	if (ret)
-		return ret | 9;
-	if (regs[0] != quad[0] || regs[1] != quad[1])
-		return 10;
-	/* unaligned should not give alignment interrupt in uW implementation */
-	quad[2] = 0x6677001122334455ul;
-	ret = callit((unsigned long)&quad[1], (unsigned long)regs,
-		     (unsigned long)&do_lq_np_be, msr);
-	if (ret)
-		return ret | 11;
-	if (regs[0] != quad[1] || regs[1] != quad[2])
-		return 12;
-	return 0;
-}
-
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -507,8 +338,6 @@ int main(void)
 	do_test(4, mode_test_4);
 	do_test(5, mode_test_5);
 	do_test(6, mode_test_6);
-	do_test(7, mode_test_7);
-	do_test(8, mode_test_8);
 
 	return fail;
 }
diff --git a/tests/reservation/head.S b/tests/reservation/head.S
index 4ff85ce..ce258b5 100644
--- a/tests/reservation/head.S
+++ b/tests/reservation/head.S
@@ -155,31 +155,3 @@ call_ret:
 	ld	%r31,248(%r1)
 	addi	%r1,%r1,256
 	blr
-
-	.global	do_lqarx
-do_lqarx:
-	/* r3 = src, r4 = regs */
-	lqarx	%r10,0,%r3
-	std	%r10,0(%r4)
-	std	%r11,8(%r4)
-	li	%r3,0
-	blr
-
-	.global do_lqarx_bad
-do_lqarx_bad:
-	/* r3 = src, r4 = regs */
-	.long	0x7d405228	/* lqarx %r10,0,%r10 */
-	std	%r10,0(%r4)
-	std	%r11,8(%r4)
-	li	%r3,0
-	blr
-
-	.global do_stqcx
-do_stqcx:
-	/* r3 = dest, r4 = regs, return CR */
-	ld	%r10,0(%r4)
-	ld	%r11,8(%r4)
-	stqcx.	%r10,0,%r3
-	mfcr	%r3
-	oris	%r3,%r3,1	/* to distinguish from trap number */
-	blr
diff --git a/tests/reservation/reservation.c b/tests/reservation/reservation.c
index a3d5a7a..79bbc1f 100644
--- a/tests/reservation/reservation.c
+++ b/tests/reservation/reservation.c
@@ -7,10 +7,6 @@
 extern unsigned long callit(unsigned long arg1, unsigned long arg2,
 			    unsigned long (*fn)(unsigned long, unsigned long));
 
-extern unsigned long do_lqarx(unsigned long src, unsigned long regs);
-extern unsigned long do_lqarx_bad(unsigned long src, unsigned long regs);
-extern unsigned long do_stqcx(unsigned long dst, unsigned long regs);
-
 #define DSISR	18
 #define DAR	19
 #define SRR0	26
@@ -184,63 +180,6 @@ int resv_test_2(void)
 	return 0;
 }
 
-/* test lqarx/stqcx */
-int resv_test_3(void)
-{
-	unsigned long x[4] __attribute__((__aligned__(16)));
-	unsigned long y[2], regs[2];
-	unsigned long ret, offset;
-	int count;
-
-	x[0] = 0x7766554433221100ul;
-	x[1] = 0xffeeddccbbaa9988ul;
-	y[0] = 0x0badcafef00dd00dul;
-	y[1] = 0xdeadbeef07070707ul;
-	for (count = 0; count < 1000; ++count) {
-		ret = callit((unsigned long)x, (unsigned long)regs, do_lqarx);
-		if (ret)
-			return ret | 1;
-		ret = callit((unsigned long)x, (unsigned long)y, do_stqcx);
-		if (ret < 0x10000)
-			return ret | 2;
-		if (ret & 0x20000000)
-			break;
-	}
-	if (count == 1000)
-		return 3;
-	if (x[0] != y[1] || x[1] != y[0])
-		return 4;
-	if (regs[1] != 0x7766554433221100ul || regs[0] != 0xffeeddccbbaa9988ul)
-		return 5;
-	ret = callit((unsigned long)x, (unsigned long)regs, do_stqcx);
-	if (ret < 0x10000 || (ret & 0x20000000))
-		return ret | 12;
-	/* test alignment interrupts */
-	for (offset = 0; offset < 16; ++offset) {
-		ret = callit((unsigned long)x + offset, (unsigned long)regs, do_lqarx);
-		if (ret == 0 && (offset & 15) != 0)
-			return 6;
-		if (ret == 0x600) {
-			if ((offset & 15) == 0)
-				return ret + 7;
-		} else if (ret)
-			return ret;
-		ret = callit((unsigned long)x + offset, (unsigned long)y, do_stqcx);
-		if (ret >= 0x10000 && (offset & 15) != 0)
-			return 8;
-		if (ret == 0x600) {
-			if ((offset & 15) == 0)
-				return ret + 9;
-		} else if (ret < 0x10000)
-			return ret;
-	}
-	/* test illegal interrupt for bad lqarx case */
-	ret = callit((unsigned long)x, (unsigned long)regs, do_lqarx_bad);
-	if (ret != 0x700 || !(mfspr(SRR1) & 0x80000))
-		return ret + 10;
-	return 0;
-}
-
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -265,7 +204,6 @@ int main(void)
 
 	do_test(1, resv_test_1);
 	do_test(2, resv_test_2);
-	do_test(3, resv_test_3);
 
 	return fail;
 }
diff --git a/tests/test_modes.bin b/tests/test_modes.bin
index 7e6b8f5d03126023a7ce4e75f157bd289d1e387c..24e39813fa353af0d03f534c01389d4c3f97d9f5 100755
GIT binary patch
delta 1111
zcmYjOZD?Cn7=G_rcALg5i}*p)&h+*+Ka!c1AWOK}a`mQZK1}yfaaHhF+RWNXk?x0(
zwl~YQL{NXVdHtbmXgj53qew?9i1;IH{uujJWgAQs<|pk$8MfQ9we0www6MMKa-Z{@
z=RD^*@2$>A)fs83z-ros@t>8&y!*VX68eMnIPtI0PiVYkLo@7So`3OdxQjjg1pXL)
zfqnfC%8_^EQ3quDn{YgGgta*EO2o_ll<`(%FP@3mv3kIHxH^*8+#uC|;2rhnIPbl_
z@vszvlI8&3zSNXA-<Kf+0J*-~B{KwhO>if-xdV{X-rU9^=+H(MrGxE~bO<|QoljkB
zg3NEvj2C9fxX@NKeeG{7z8-6q*i9J+^$L4s74PbE?A!`|_Tu%@0>v=Mf}*U{`LBP^
z?Xa829`IfXl&H9#^TF6Rr1`*7r*+~B8^4#_U}+a;`NX?Im*W%rVbTz~8tFX5&2^8O
zLeo!J@Rx~~8vMxyFMI^QJd$Tx8Q+Vvq1$jvaqKYCEI5R38{Y8JKv}!+Q0OLOJL$Qb
zLuGAtdo9ztkM^f&Ei#G`d}Ix-Ea5MPxA{XsO<CPhw8tkNTTR8!92{nlae&wFq{O?(
zB}s!XtX(SU^j)T=qoO2>0TkL!nB=5=)$(!?z=3adcX|w6f$RYF*t`wQTdx^Ks3|(g
zS^T1VU&&Svj6n_bOS|c-*hyy}BR2}e?(YH61y@OAi)EOM3tr6l6HEP6oN>~g#g(Wy
zBfQV2mEx48t~T&dqk-AQ?3fJ#(Z{(RU__l;9KM?9@{%p**=<`zZ&3miBd`_ssDY;w
zN|*-J9NfyKuKF5TI)i>!xnL(OK{@y64$D-)Is)JeR(gCG>RC{$|8Y)Z0XXbZo{XRB
zbwNTFkMUD$)V4Kb$-QiT4ZD)hI()>Gsc*00cv6+;34XbbX9-><`29LwNvaJINC56T
z*qTy3euBsEaBh)8l<<fVw~YO%PWd>Y%w0T{ipVnr58cCWQ>{B|1UK<Us?BqgP_EAT
zXyYt^Lf>`bzQbMV_SP2qufz8_zbtlpn&a@flyHUU9YdH(tARRgiMXQN)M84*4<1Dk
wk2I*S;&l4?efAB`1tUtRE$ThQ1?f6bg8P_y|0d_div#Y{yQde9Rr>t@1HPzfVE_OC

delta 3286
zcmZ`+e{56N6+V7(V&|6^3R?+b@RGQzUlv;DN9i+(6Fbo2B^9Klv{3DjxTztjI+RYS
zYPZjQCd#BTwNk<xQaiDhRBFUFw!w(BPNGs8P)emDr4p*pR7#SiT?}+fHR$q-+<oW0
z=Mfk@=_>br_r7z!bM86cbB-Qy96jRbi%ZVXiC4am_0!`-XJ*|rHA~c{%9Pp`FWIXR
zeS8Oe5Af0d!-<?}H+j<%WzA)5jGrc&J%cetZnceQTZ&|DZ%KUo<J<6iv5|HXrM={y
z%OWXr_++$bmFJ*;O8RuA@wNYNdhI=Oa8^t#%!rQ`;!i12AtoE2d|cE~YPf#Xr6fjO
zqw!O}R$>ksZJYU0iN+Vc(u>G3+b>FIWh$MUpO=Fn+OO{3twd#ts{GEWQ%bD1R*Cws
zKk<!dA+RSZuUt(%VF$Ih4Y@kCE-F>yE#-Q$>mG`P$dUF@7?H{WYV~PTq9+sqCIjR3
zd8}8E9Ar{8Wo-DPx(vCUTP$)f@3F`+nfSj?>wY+oEpy&WJt6kvhs`Y8k0x}>5j$r`
zFTj!dgd7y&z52Hve_v&X>Jv(ov)~~m!C7Dj9BC^J%!c43fb#*I;r$5K-(IyM#{M)%
zts?4dL|q_&=al=hsS_Dck_2CadQ)qXS{pTIjvSa-0wC?cx<QNMune{l5X-UIj$z*i
zEM$`%#`@<DD=VEH7D>(ulkDE3#F*%FF(q0GvP|^NB{}$qd_Jcn3dfY#x!)^Mof(I1
z#)*<ntq5#gq(qrv+mc0svlF3-X*Y;SBkqJ}IFcaMr-eYk4?@yNGqAh_r<Xz%nh$vV
zDbC_#V~1T!s39|p8G@*#7}pXCP>E(z$_2H1w;V(PnJiI6vH5wckbZOJW+4@TWG+&A
z>V{~^)J~qArDHd4t)0VIzr4rF(h3B|SD7JEQ>I%iP(&t)OyN}7&I;IQPI=`n7#YM>
zIZ4#z<Er?|wlHR2mx?(@o3f+jfY=e&310_CdhxOqKq1?7l~+3X{rhdB9+Q2eHnYf%
zb|l@@{L<87Pndvzu_#)M@!HkKaFtJrUN$aQJs?eaj5XEO(v7sSt$JT!$U~9re2Q?R
zO&R;EON^h^xQ(mT-m2+Bifml`L~n2@J98<waxLu-Jxxkr$W@+P6r%1>p7qvQ)SDmp
zv=Oc<`JX6o8yrI$Si&%DHn)>3L0~bslMrxQ+)WG)P2buYTx^ygv};ci>p*lGHK3;{
zQm(B+!*cmUMX;{W>T#HJTtq3=*DdNMP@VCGzw7<@8^Mc=pR=(f{z{NtR+;k!6k+iY
zVpwJI-~hM)_>NdSe7GYP4;GB;|6qbTnQ|336~iKs1!GL&IGUD8;jl^S2-X?FxPVNt
z6BZ11!h(>g?7NA@FHDEjmJtiC>}(ySJ%b2X{?ND(0LytToLRkSIrn1C0>DmL0G3C_
z0>F*W0stF#Y<}+jY?}Zp6+3sqx87|!k4|&yxxb{5=i{E?iUc?JT4aSaR2osx^gn^*
zg(01a_SkLL^9XvAuje|d7b!Ncr^?1Afcb6ghc<Q&m>Yq%E(yKGQjhoQhwYTIdMuBQ
z)gz<9EFdVFy9&pBU*B4E8S&^d#UqT5OldmL&hUBkWQ6paMRx`Os|Tl;)nhpc6q8vz
zEFN4uAAV&y%A_(MsjRe}a3%|ECtaqKn>EyiHGkiC;VkiEd{$kuAzt#DCd9M#c{E<<
z?`~JQ*FM8Xw|_yje7>-R@0+gXkyt<EYS(^i^w+E`K9pni@+R0iV9&uzJ$~}Tn)MDx
zpK-Cad}XnlB9q_O;{z}iFG*_N_46lJK9uW_ye?zi#)DGfr19sCgHmk5c;?~Hbrp_L
zJ?wlMzY?wtX1TSq--#9n%M~~?!4S^8o+|4<0i1h~FZ~<LM+VZbI@**(EeeLuMR4v-
z;Db$F_{yZzPcUb^75Gag{-TMq9maQcw;O*_H#j<ti|SUX>u1J~0_%3Z5bIS>pgM)<
zsn157p?$sTfW786X^r-(EC{^s!g>hn{q~xx%e}W(eGzLW&$;<!>e=ZFMk27T(2S=q
z)r7yw0pm#E)!Pv7ro(l;5=HW8oew1xDugxYQ44=E0{A<I!Iy`}k&=kwCC}qmU7PyH
z)fGviTw`HVc`*fLaz`-|$DE{Hew30lj4xA;+jzKcgHFStAl`1{@m6Y5CAk@An-d^4
zx0bMn^}!wPq;s|+ar%4Bi4;1{ya>EkfIXu7-{$ivAqUw0@1B;EeW|jvIp0p^oIl@A
z==^dli$Z|{QjAM=)oYjzZGBq^cVM;5eWh4bx0CT;z5EE0<E`Wb5gmAtd9r;!y%Y9=
ziv&Ad`)-q0GkPWCCzRyhzl8m9WXb5Q_h$3UvN2Hq7fHTiJQG|eX;+L_f)(4mSA}>E
zo7CcwSA<{N?4hkW_zIE_qZgwC`yXUsvNbnM?>gR*T0C7i*P7>{rb}Pug!8fG#W;>}
z?O(=duq@{SrblOte_~1wqO&u`4;soIkbzcWx$0XX-e7~<O+)WG_QF*7l4*$zwc?C+
z&1h;UD~@A2c1?&BV19V}4Pf;~vY|X@2(zK<#$dz7`(`lx;5#A4_|#)&AozK>*$iYY
zpd!>qhv8@}DfVOb>3@Xi;*)q}j4i-?#zT$k*K}ZZepU$ljs$E5yxHSNxciKKjb%#j
zoDi?`7M>yu+zibE<2>=hISXNQ7=La2@rJ|mLNLM)pH5*M2gXU@hp}H_oLvwij(-#G
Jbsuc?{TF)MbVmRH

diff --git a/tests/test_modes.console_out b/tests/test_modes.console_out
index 25e791c..a49bb9b 100644
--- a/tests/test_modes.console_out
+++ b/tests/test_modes.console_out
@@ -4,5 +4,3 @@ test 03:PASS
 test 04:PASS
 test 05:PASS
 test 06:PASS
-test 07:PASS
-test 08:PASS
diff --git a/tests/test_reservation.bin b/tests/test_reservation.bin
index 1e305f43af8cbe4213e597cdfc50f1eab71abb5d..1cb62505581b238325722ae999b1b70778e0ffa1 100755
GIT binary patch
delta 986
zcmYjPO-vI}5T30qv;^ytVggah>;D!O0})MYsa;e;ZPju>V+<OQi`>+M*-Fc<q%o3U
zQhXdRk%S}ugn;(o0TPshCLWZ8=*?)nfPWW5EI;y`SL)9sllk8FX1<x7eM{$VozAL;
zVfW1StKyTUir3bW>9)rtC)koD5WT`2dB&U(#tadF3Hrie*Oo761ZMklrSCO>4<W_|
zLIfN^^_54fTpvoQKNd6vZsb?Lu{Z#9BVGn@$tf6Ag84qJ5(ZlZNCk4Gy{RlX0tDNr
z3nq=%JuBAm{N`eQIp!10f5TW@02XQ&AtGhjpd!Fr$OvKSGGgo#$YBDMxTHxu^F~V;
z8?kV#$nzS`k75)dMISnAXp4uK?wKHZoA!7T!ZAAIsTFGIq9@`I)!u*zLVgi9P=g9@
zH@37!&;n(^YWe7Mw8?9$zAr!&u=yeEKVHO*_)%l@yf<N}Qt6b}X(uQJx^tvwi^bKA
zPnr?*z&K*YF1AQ2{qFS&AL(wNkDl=Dw+wMPvwkE<hkgClzb62TEYlI6jmlCc)MbEZ
z8Q3M<jj|J4x8Z<%e_@P#PpH97&<z0Up1u@c&s5d%G(Z{sBDt!SFo9SPdh(GGQlHN#
zX_zNrD2Y*@zd<wR{YjXI@WL*P#Eg07B&y+5hPAx_Vb0w-8cS#J&v0G<5)^gkU0zd;
zcRC|MUc<E?w3D=1cdit0LMdRw^H|ad{^0-I4p4&S+Z=u8-)Bcz#$rWT60RbC9K*jo
zZbG58b>sl<#v*Ue<y>(WW^u`qHKUpfFD%pjuZNx@pq~P8%QRExHu54mSvPUG<0oTF
zc;Y6ab2$+o{8>(*aZW++L{7!Xtp&Xk{b`=wtUqXRBOFD~(zSY*X%^8mb;v#wzIoWB
zCuLWOC;)V<(`&N5vKit0I%C~<32jzV(K-;Pld{|Y70C=I(f$n5wZWKPU;reH3i?aB
dD%T5})X`ug;wED<3UcE$d<y9oW0yFA_8W*pG0p%0

delta 1756
zcmZWpZA@EL7=G`irDX#w5fj)ZUfKe^lSOpP(yny9o#>j*2+<LVX4;0afQbn%n6<3+
z7BUm|k9hWD%Wz+A%Yq--6cds$gA#Qri8JAciTiYaz<r2rEHDkn=j|=d=t<tZ?|shm
zp7-mVJJ34y@?cV>x--2yU50Okks)eYaMJb#qPSn9?DM`C{*Qq(1J2AFl}4tP;%SB2
zw=?(C-(6FspJ*e>!(~MjeN$G?OX&_vsfLNN;+yI!^XfKJqr<yL9Oq1;@t~BMphne!
z<o7jZ`5$}J$s6yR2KCL)-I7}f8Ni+<^3<oP-w4chndQ_UQONX<INH;LRP7&iQZzYX
zH`(utybtr&<@v?nYv5mya*INjgPUEn(Kjgj4TXk+Hfr>B!{$myK%hd8?-af{&=hn7
z9QWsO_F;S<w2Msff%{Q&$1W|@)T$@GcIYYhTD^nH^u(NFsqSOpoAGnO>4K|4g~r1u
za^berLX(#){_t77W8i?E@B#m5xfWXd&caJ(^^TGYdZLB#%UbBS&n?{H&^sWBHsHw{
zC%8!_#*^@WcGKz<S8a8tJ-*bPK_nw(f4Dp`6l|fs96x#e9kVa*W6XNe#gsKvYL8EH
zp;}kZEIKEp3|5oey&uIE<eQV55(1as|CWRMlXuJsGw`qAN8m^luJ4BCARI7xXTp4B
zDcDb*^mfWdn6nieg=El0;atb1Jl05w6MjCJi@J6rUzEe+ORnrnqcG#7`bx%0vA1|y
zUL>~6q%MC2vMvNr4()_4&X0>g;6OvrMTK$y-9>A>(~A}!2kroV5jZNFX`{%D4gS=k
zr_Y0fvCOmp<L^;?@7QI8qQLCnG_}Ss@>t4ruZ>~mu>_O$fpl9MAp@kzOBP}HYz#BD
z_It3;U0JG7$ydlDaB){OvqzzLcxbUdQjBpJoUpZ&ter?9qmfQ;@}E?l>3MnWPbsg9
zAKaDpaXUqiid*hxrBj4Fk10=zZJw}ts#hHHR5kplP!wmxTV~C41GVQ~&AB|b0hOG|
z+YNY0V>XJ4Zcl}<dDp3@d&LH?TOI8cuX<Og<GrHSTdORHqu#m_+^0sq6JtB~_}7T9
zyz5H>9KxC11g-BCwwm4lK?Z7uf}JWwmr<1u@6jlMcLa0d%MpO)xA{-0Dp7Lf>uDbH
z#}%tHL`CAYT2Cc~G;$^&sSj<G{`FbP(6|<&LZ>)byT+t(Bg9FtxU#?aNsECSOChO+
zRfIi!zx$#yW1C-{ns_Ld%I5weT7dnEu}4&QfcaUe1^E2om$ZyC>z>Kk&6Km%FP$kH
z>DGQ8D<C6KOuBfw?x_`=4sAXlr87{W=?Je|sWy|iQd9Yym6{tp^44>oi1t3g7>8+~
zLlG}hQgIwVV`nbCSmHrABg21J!v0k>OH9{!h0Awj<LF%}r*H}{7X&9$a!$%N2Cx`2
zAY;Ii(7YNUTOry#aoe{()BtlIWd5F%J!n2fiy{;`Xdkk@S8zx@tdvFGj+726iDjh}
zxpMi^#h(i!%e|nThxo-B-D97H)h1f?I{PYx=(y<9J%vrM<`=}UURB-+v-7@`-JsDk
zr5X-w78iA|ejMNcgIJs>Ctrbky@vrTAH;wR2z~YA>&CFd2rp)5As2wv<9J?-aUIit
HJt*~Wb@w*g

diff --git a/tests/test_reservation.console_out b/tests/test_reservation.console_out
index 623335d..0c39ae3 100644
--- a/tests/test_reservation.console_out
+++ b/tests/test_reservation.console_out
@@ -1,3 +1,2 @@
 test 01:PASS
 test 02:PASS
-test 03:PASS

From 2f45e545ed86795c0f282204a27f97887329051f Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 8 Jul 2022 14:07:28 +1000
Subject: [PATCH 13/30] decode2: Rework to make the stall_out signal come from
 a register

At present the busy/stall signal going to decode1 depends on whether
control thinks it can issue the current instruction, and that depends
on completion and bypass signals coming from execute1 and writeback.

To improve the timing of stall_out, this rearranges decode2 so that
stall_out is asserted when we have a valid instruction that couldn't
be issued in the previous cycle.  This means that decode1 could give
us a new instruction when we haven't issued the previous instruction.

This in turn means that we can only use d_in in the first cycle of
processing an instruction.  After the first cycle, we get register
addresses etc. from dc2 rather than d_in.

Then, to avoid the need to read register operands from register_file
in each cycle until the instruction issues, we bring the bypass path
for data being written to the register file into decode2 explicitly
rather than having it in register_file.

A new process called decode2_addrs calls decode_input_reg_* and
decode_output_reg and sets up the register file addresses.
This was split out (and decode_input_reg_* reworked) to
try to reduce the number of passes through the decode2_1 process that
need to be done in simulation.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl        |   1 +
 control.vhdl       |  31 ++--
 core.vhdl          |   5 +
 decode2.vhdl       | 430 +++++++++++++++++++++++++--------------------
 register_file.vhdl |  14 +-
 writeback.vhdl     |   7 +
 6 files changed, 269 insertions(+), 219 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index ea6a8d8..54a87d2 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -288,6 +288,7 @@ package common is
         write_reg_enable: std_ulogic;
 	read_reg1: gspr_index_t;
 	read_reg2: gspr_index_t;
+	read_reg3: gspr_index_t;
 	read_data1: std_ulogic_vector(63 downto 0);
 	read_data2: std_ulogic_vector(63 downto 0);
 	read_data3: std_ulogic_vector(63 downto 0);
diff --git a/control.vhdl b/control.vhdl
index 17a288b..e6855c2 100644
--- a/control.vhdl
+++ b/control.vhdl
@@ -15,9 +15,7 @@ entity control is
 
         complete_in         : in instr_tag_t;
         valid_in            : in std_ulogic;
-        repeated            : in std_ulogic;
         flush_in            : in std_ulogic;
-        busy_in             : in std_ulogic;
         deferred            : in std_ulogic;
         sgl_pipe_in         : in std_ulogic;
         stop_mark_in        : in std_ulogic;
@@ -43,7 +41,6 @@ entity control is
         cr_write_in         : in std_ulogic;
 
         valid_out           : out std_ulogic;
-        stall_out           : out std_ulogic;
         stopped_out         : out std_ulogic;
 
         gpr_bypass_a        : out std_ulogic_vector(1 downto 0);
@@ -157,9 +154,6 @@ begin
                 tag_a.tag := i;
             end if;
         end loop;
-        if tag_match(tag_a, complete_in) then
-            tag_a.valid := '0';
-        end if;
         tag_b := instr_tag_init;
         for i in tag_number_t loop
             if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_b_read_in then
@@ -167,9 +161,6 @@ begin
                 tag_b.tag := i;
             end if;
         end loop;
-        if tag_match(tag_b, complete_in) then
-            tag_b.valid := '0';
-        end if;
         tag_c := instr_tag_init;
         for i in tag_number_t loop
             if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_c_read_in then
@@ -177,26 +168,29 @@ begin
                 tag_c.tag := i;
             end if;
         end loop;
-        if tag_match(tag_c, complete_in) then
-            tag_c.valid := '0';
-        end if;
 
         byp_a := "00";
         if EX1_BYPASS and tag_match(execute_next_tag, tag_a) then
-            byp_a := "10";
+            byp_a := "01";
         elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_a) then
+            byp_a := "10";
+        elsif tag_match(complete_in, tag_a) then
             byp_a := "11";
         end if;
         byp_b := "00";
         if EX1_BYPASS and tag_match(execute_next_tag, tag_b) then
-            byp_b := "10";
+            byp_b := "01";
         elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_b) then
+            byp_b := "10";
+        elsif tag_match(complete_in, tag_b) then
             byp_b := "11";
         end if;
         byp_c := "00";
         if EX1_BYPASS and tag_match(execute_next_tag, tag_c) then
-            byp_c := "10";
+            byp_c := "01";
         elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_c) then
+            byp_c := "10";
+        elsif tag_match(complete_in, tag_c) then
             byp_c := "11";
         end if;
 
@@ -204,9 +198,9 @@ begin
         gpr_bypass_b <= byp_b;
         gpr_bypass_c <= byp_c;
 
-        gpr_tag_stall <= (tag_a.valid and not byp_a(1)) or
-                         (tag_b.valid and not byp_b(1)) or
-                         (tag_c.valid and not byp_c(1));
+        gpr_tag_stall <= (tag_a.valid and not (or (byp_a))) or
+                         (tag_b.valid and not (or (byp_b))) or
+                         (tag_c.valid and not (or (byp_c)));
 
         incr_tag := curr_tag;
         instr_tag.tag <= curr_tag;
@@ -331,7 +325,6 @@ begin
 
         -- update outputs
         valid_out <= valid_tmp;
-        stall_out <= stall_tmp or deferred;
 
         -- update registers
         rin_int <= v_int;
diff --git a/core.vhdl b/core.vhdl
index 23f7e82..ba8f0cc 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -100,6 +100,9 @@ architecture behave of core is
     signal fpu_to_execute1: FPUToExecute1Type;
     signal fpu_to_writeback: FPUToWritebackType;
 
+    -- Writeback signals
+    signal writeback_bypass: bypass_data_t;
+
     -- local signals
     signal fetch1_stall_in : std_ulogic;
     signal icache_stall_out : std_ulogic;
@@ -302,6 +305,7 @@ begin
             execute_cr_bypass => execute1_cr_bypass,
             execute2_bypass => execute2_bypass,
             execute2_cr_bypass => execute2_cr_bypass,
+            writeback_bypass => writeback_bypass,
             log_out => log_data(119 downto 110)
             );
     decode2_busy_in <= ex1_busy_out;
@@ -463,6 +467,7 @@ begin
             w_out => writeback_to_register_file,
             c_out => writeback_to_cr_file,
             f_out => writeback_to_fetch1,
+            wb_bypass => writeback_bypass,
             events => writeback_events,
             interrupt_out => do_interrupt,
             complete_out => complete
diff --git a/decode2.vhdl b/decode2.vhdl
index 371c48c..41f3e09 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -41,6 +41,7 @@ entity decode2 is
         execute_cr_bypass : in cr_bypass_data_t;
         execute2_bypass    : in bypass_data_t;
         execute2_cr_bypass : in cr_bypass_data_t;
+        writeback_bypass  : in bypass_data_t;
 
         log_out : out std_ulogic_vector(9 downto 0)
 	);
@@ -49,8 +50,16 @@ end entity decode2;
 architecture behaviour of decode2 is
     type reg_type is record
         e : Decode2ToExecute1Type;
-        repeat : std_ulogic;
+        repeat : repeat_t;
+        busy : std_ulogic;
+        sgl_pipe : std_ulogic;
+        reg_a_valid : std_ulogic;
+        reg_b_valid : std_ulogic;
+        reg_c_valid : std_ulogic;
+        reg_o_valid : std_ulogic;
     end record;
+    constant reg_type_init : reg_type :=
+        (e => Decode2ToExecute1Init, repeat => NONE, others => '0');
 
     signal dc2, dc2in : reg_type;
 
@@ -61,20 +70,21 @@ architecture behaviour of decode2 is
         reg       : gspr_index_t;
         data      : std_ulogic_vector(63 downto 0);
     end record;
+    constant decode_input_reg_init : decode_input_reg_t := ('0', (others => '0'), (others => '0'));
 
     type decode_output_reg_t is record
         reg_valid : std_ulogic;
         reg       : gspr_index_t;
     end record;
+    constant decode_output_reg_init : decode_output_reg_t := ('0', (others => '0'));
 
     function decode_input_reg_a (t : input_reg_a_t; insn_in : std_ulogic_vector(31 downto 0);
-                                 reg_data : std_ulogic_vector(63 downto 0);
                                  ispr : gspr_index_t;
                                  instr_addr : std_ulogic_vector(63 downto 0))
         return decode_input_reg_t is
     begin
         if t = RA or (t = RA_OR_ZERO and insn_ra(insn_in) /= "00000") then
-            return ('1', gpr_to_gspr(insn_ra(insn_in)), reg_data);
+            return ('1', gpr_to_gspr(insn_ra(insn_in)), (others => '0'));
         elsif t = SPR then
             -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR.
             -- If it's all 0, we don't treat it as a dependency as slow SPRs
@@ -83,27 +93,26 @@ architecture behaviour of decode2 is
             assert is_fast_spr(ispr) =  '1' or ispr = "0000000"
                 report "Decode A says SPR but ISPR is invalid:" &
                 to_hstring(ispr) severity failure;
-            return (is_fast_spr(ispr), ispr, reg_data);
+            return (is_fast_spr(ispr), ispr, (others => '0'));
         elsif t = CIA then
             return ('0', (others => '0'), instr_addr);
         elsif HAS_FPU and t = FRA then
-            return ('1', fpr_to_gspr(insn_fra(insn_in)), reg_data);
+            return ('1', fpr_to_gspr(insn_fra(insn_in)), (others => '0'));
         else
             return ('0', (others => '0'), (others => '0'));
         end if;
     end;
 
     function decode_input_reg_b (t : input_reg_b_t; insn_in : std_ulogic_vector(31 downto 0);
-                                 reg_data : std_ulogic_vector(63 downto 0);
                                  ispr : gspr_index_t) return decode_input_reg_t is
         variable ret : decode_input_reg_t;
     begin
         case t is
             when RB =>
-                ret := ('1', gpr_to_gspr(insn_rb(insn_in)), reg_data);
+                ret := ('1', gpr_to_gspr(insn_rb(insn_in)), (others => '0'));
             when FRB =>
                 if HAS_FPU then
-                    ret := ('1', fpr_to_gspr(insn_frb(insn_in)), reg_data);
+                    ret := ('1', fpr_to_gspr(insn_frb(insn_in)), (others => '0'));
                 else
                     ret := ('0', (others => '0'), (others => '0'));
                 end if;
@@ -138,7 +147,7 @@ architecture behaviour of decode2 is
                 assert is_fast_spr(ispr) = '1' or ispr = "0000000"
                     report "Decode B says SPR but ISPR is invalid:" &
                     to_hstring(ispr) severity failure;
-                ret := (is_fast_spr(ispr), ispr, reg_data);
+                ret := (is_fast_spr(ispr), ispr, (others => '0'));
             when NONE =>
                 ret := ('0', (others => '0'), (others => '0'));
         end case;
@@ -146,23 +155,23 @@ architecture behaviour of decode2 is
         return ret;
     end;
 
-    function decode_input_reg_c (t : input_reg_c_t; insn_in : std_ulogic_vector(31 downto 0);
-                                 reg_data : std_ulogic_vector(63 downto 0)) return decode_input_reg_t is
+    function decode_input_reg_c (t : input_reg_c_t; insn_in : std_ulogic_vector(31 downto 0))
+        return decode_input_reg_t is
     begin
         case t is
             when RS =>
-                return ('1', gpr_to_gspr(insn_rs(insn_in)), reg_data);
+                return ('1', gpr_to_gspr(insn_rs(insn_in)), (others => '0'));
             when RCR =>
-                return ('1', gpr_to_gspr(insn_rcreg(insn_in)), reg_data);
+                return ('1', gpr_to_gspr(insn_rcreg(insn_in)), (others => '0'));
             when FRS =>
                 if HAS_FPU then
-                    return ('1', fpr_to_gspr(insn_frt(insn_in)), reg_data);
+                    return ('1', fpr_to_gspr(insn_frt(insn_in)), (others => '0'));
                 else
                     return ('0', (others => '0'), (others => '0'));
                 end if;
             when FRC =>
                 if HAS_FPU then
-                    return ('1', fpr_to_gspr(insn_frc(insn_in)), reg_data);
+                    return ('1', fpr_to_gspr(insn_frc(insn_in)), (others => '0'));
                 else
                     return ('0', (others => '0'), (others => '0'));
                 end if;
@@ -264,10 +273,14 @@ architecture behaviour of decode2 is
         others     => "000"
         );
 
+    signal decoded_reg_a : decode_input_reg_t;
+    signal decoded_reg_b : decode_input_reg_t;
+    signal decoded_reg_c : decode_input_reg_t;
+    signal decoded_reg_o : decode_output_reg_t;
+
     -- issue control signals
     signal control_valid_in : std_ulogic;
     signal control_valid_out : std_ulogic;
-    signal control_stall_out : std_ulogic;
     signal control_sgl_pipe : std_logic;
 
     signal gpr_write_valid : std_ulogic;
@@ -302,8 +315,6 @@ begin
 
             complete_in => complete_in,
             valid_in    => control_valid_in,
-            repeated    => dc2.repeat,
-            busy_in     => busy_in,
             deferred    => deferred,
             flush_in    => flush_in,
             sgl_pipe_in => control_sgl_pipe,
@@ -331,7 +342,6 @@ begin
             cr_bypass            => cr_bypass,
 
             valid_out   => control_valid_out,
-            stall_out   => control_stall_out,
             stopped_out => stopped_out,
 
             gpr_bypass_a => gpr_a_bypass,
@@ -346,9 +356,12 @@ begin
     decode2_0: process(clk)
     begin
         if rising_edge(clk) then
-            if rst = '1' or flush_in = '1' or deferred = '0' then
+            if rst = '1' or flush_in = '1' then
+                dc2 <= reg_type_init;
+            elsif deferred = '0' then
                 if dc2in.e.valid = '1' then
-                    report "execute " & to_hstring(dc2in.e.nia);
+                    report "execute " & to_hstring(dc2in.e.nia) &
+                        " tag=" & integer'image(dc2in.e.instr_tag.tag) & std_ulogic'image(dc2in.e.instr_tag.valid);
                 end if;
                 dc2 <= dc2in;
             end if;
@@ -357,205 +370,246 @@ begin
 
     c_out.read <= d_in.decode.input_cr;
 
+    decode2_addrs: process(all)
+    begin
+        decoded_reg_a <= decode_input_reg_init;
+        decoded_reg_b <= decode_input_reg_init;
+        decoded_reg_c <= decode_input_reg_init;
+        decoded_reg_o <= decode_output_reg_init;
+        if d_in.valid = '1' then
+            decoded_reg_a <= decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, d_in.ispr1, d_in.nia);
+            decoded_reg_b <= decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, d_in.ispr2);
+            decoded_reg_c <= decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn);
+            decoded_reg_o <= decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispro);
+        end if;
+
+        r_out.read1_enable <= decoded_reg_a.reg_valid;
+        r_out.read1_reg    <= decoded_reg_a.reg;
+        r_out.read2_enable <= decoded_reg_b.reg_valid;
+        r_out.read2_reg    <= decoded_reg_b.reg;
+        r_out.read3_enable <= decoded_reg_c.reg_valid;
+        r_out.read3_reg    <= decoded_reg_c.reg;
+
+    end process;
+
     decode2_1: process(all)
         variable v : reg_type;
-        variable decoded_reg_a : decode_input_reg_t;
-        variable decoded_reg_b : decode_input_reg_t;
-        variable decoded_reg_c : decode_input_reg_t;
-        variable decoded_reg_o : decode_output_reg_t;
         variable length : std_ulogic_vector(3 downto 0);
         variable op : insn_type_t;
+        variable valid_in : std_ulogic;
     begin
         v := dc2;
 
-        v.e := Decode2ToExecute1Init;
-
-        --v.e.input_cr := d_in.decode.input_cr;
-        v.e.output_cr := d_in.decode.output_cr;
+        valid_in := d_in.valid or dc2.busy;
 
-        -- Work out whether XER common bits are set
-        v.e.output_xer := d_in.decode.output_carry;
-        case d_in.decode.insn_type is
-            when OP_ADD | OP_MUL_L64 | OP_DIV | OP_DIVE =>
-                -- OE field is valid in OP_ADD/OP_MUL_L64 with major opcode 31 only
-                if d_in.insn(31 downto 26) = "011111" and insn_oe(d_in.insn) = '1' then
-                    v.e.oe := '1';
-                    v.e.output_xer := '1';
-                end if;
-            when OP_MTSPR =>
-                if decode_spr_num(d_in.insn) = SPR_XER then
-                    v.e.output_xer := '1';
-                end if;
-            when others =>
-        end case;
+        if dc2.busy = '0' then
+            v.e := Decode2ToExecute1Init;
 
-        decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1,
-                                             d_in.nia);
-        decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data, d_in.ispr2);
-        decoded_reg_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn, r_in.read3_data);
-        decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispro);
+            v.sgl_pipe := d_in.decode.sgl_pipe;
 
-        if d_in.decode.lr = '1' then
-            v.e.lr := insn_lk(d_in.insn);
-            -- b and bc have even major opcodes; bcreg is considered absolute
-            v.e.br_abs := insn_aa(d_in.insn) or d_in.insn(26);
-        end if;
-        op := d_in.decode.insn_type;
+            v.e.input_cr := d_in.decode.input_cr;
+            v.e.output_cr := d_in.decode.output_cr;
 
-        if d_in.decode.repeat /= NONE then
-            v.e.repeat := '1';
-            v.e.second := dc2.repeat;
-            case d_in.decode.repeat is
-                when DUPD =>
-                    -- update-form loads, 2nd instruction writes RA
-                    if dc2.repeat = '1' then
-                        decoded_reg_o.reg := decoded_reg_a.reg;
+            -- Work out whether XER common bits are set
+            v.e.output_xer := d_in.decode.output_carry;
+            case d_in.decode.insn_type is
+                when OP_ADD | OP_MUL_L64 | OP_DIV | OP_DIVE =>
+                    -- OE field is valid in OP_ADD/OP_MUL_L64 with major opcode 31 only
+                    if d_in.insn(31 downto 26) = "011111" and insn_oe(d_in.insn) = '1' then
+                        v.e.oe := '1';
+                        v.e.output_xer := '1';
+                    end if;
+                when OP_MTSPR =>
+                    if decode_spr_num(d_in.insn) = SPR_XER then
+                        v.e.output_xer := '1';
                     end if;
                 when others =>
             end case;
-        elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then
-            -- bcl/bclrl/bctarl that needs to write both CTR and LR has to be doubled
-            v.e.repeat := '1';
-            v.e.second := dc2.repeat;
-            -- first one does CTR, second does LR
-            decoded_reg_o.reg(0) := not dc2.repeat;
-        end if;
 
-        v.e.spr_select := d_in.spr_info;
+            v.reg_a_valid := decoded_reg_a.reg_valid;
+            v.reg_b_valid := decoded_reg_b.reg_valid;
+            v.reg_c_valid := decoded_reg_c.reg_valid;
+            v.reg_o_valid := decoded_reg_o.reg_valid;
 
-        r_out.read1_enable <= decoded_reg_a.reg_valid and d_in.valid;
-        r_out.read1_reg    <= decoded_reg_a.reg;
-        r_out.read2_enable <= decoded_reg_b.reg_valid and d_in.valid;
-        r_out.read2_reg    <= decoded_reg_b.reg;
-        r_out.read3_enable <= decoded_reg_c.reg_valid and d_in.valid;
-        r_out.read3_reg    <= decoded_reg_c.reg;
+            if d_in.decode.lr = '1' then
+                v.e.lr := insn_lk(d_in.insn);
+                -- b and bc have even major opcodes; bcreg is considered absolute
+                v.e.br_abs := insn_aa(d_in.insn) or d_in.insn(26);
+            end if;
+            op := d_in.decode.insn_type;
+
+            v.repeat := d_in.decode.repeat;
+            if d_in.decode.repeat /= NONE then
+                v.e.repeat := '1';
+            elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then
+                -- bcl/bclrl/bctarl that needs to write both CTR and LR has to be doubled
+                v.e.repeat := '1';
+            end if;
 
-        case d_in.decode.length is
-            when is1B =>
-                length := "0001";
-            when is2B =>
-                length := "0010";
-            when is4B =>
-                length := "0100";
-            when is8B =>
-                length := "1000";
-            when NONE =>
-                length := "0000";
-        end case;
+            v.e.spr_select := d_in.spr_info;
+
+            case d_in.decode.length is
+                when is1B =>
+                    length := "0001";
+                when is2B =>
+                    length := "0010";
+                when is4B =>
+                    length := "0100";
+                when is8B =>
+                    length := "1000";
+                when NONE =>
+                    length := "0000";
+            end case;
 
-        -- execute unit
-        v.e.nia := d_in.nia;
-        v.e.unit := d_in.decode.unit;
-        v.e.fac := d_in.decode.facility;
-        v.e.instr_tag := instr_tag;
-        v.e.read_reg1 := decoded_reg_a.reg;
-        v.e.read_reg2 := decoded_reg_b.reg;
-        v.e.write_reg := decoded_reg_o.reg;
-        v.e.write_reg_enable := decoded_reg_o.reg_valid;
-        v.e.rc := decode_rc(d_in.decode.rc, d_in.insn);
-        v.e.xerc := c_in.read_xerc_data;
-        v.e.invert_a := d_in.decode.invert_a;
-        v.e.addm1 := '0';
-        v.e.insn_type := op;
-        v.e.invert_out := d_in.decode.invert_out;
-        v.e.input_carry := d_in.decode.input_carry;
-        v.e.output_carry := d_in.decode.output_carry;
-        v.e.is_32bit := d_in.decode.is_32bit;
-        v.e.is_signed := d_in.decode.is_signed;
-        v.e.insn := d_in.insn;
-        v.e.data_len := length;
-        v.e.byte_reverse := d_in.decode.byte_reverse;
-        v.e.sign_extend := d_in.decode.sign_extend;
-        v.e.update := d_in.decode.update;
-        v.e.reserve := d_in.decode.reserve;
-        v.e.br_pred := d_in.br_pred;
-        v.e.result_sel := result_select(op);
-        v.e.sub_select := subresult_select(op);
-        if op = OP_BC or op = OP_BCREG then
-            if d_in.insn(23) = '0' and dc2.repeat = '0' and
-                not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then
-                -- decrement CTR if BO(2) = 0 and not bcctr
-                v.e.addm1 := '1';
-                v.e.result_sel := "000";        -- select adder output
+            -- execute unit
+            v.e.nia := d_in.nia;
+            v.e.unit := d_in.decode.unit;
+            v.e.fac := d_in.decode.facility;
+            v.e.read_reg1 := decoded_reg_a.reg;
+            v.e.read_reg2 := decoded_reg_b.reg;
+            v.e.read_reg3 := decoded_reg_c.reg;
+            v.e.write_reg := decoded_reg_o.reg;
+            v.e.write_reg_enable := decoded_reg_o.reg_valid;
+            v.e.rc := decode_rc(d_in.decode.rc, d_in.insn);
+            v.e.xerc := c_in.read_xerc_data;
+            v.e.invert_a := d_in.decode.invert_a;
+            v.e.addm1 := '0';
+            v.e.insn_type := op;
+            v.e.invert_out := d_in.decode.invert_out;
+            v.e.input_carry := d_in.decode.input_carry;
+            v.e.output_carry := d_in.decode.output_carry;
+            v.e.is_32bit := d_in.decode.is_32bit;
+            v.e.is_signed := d_in.decode.is_signed;
+            v.e.insn := d_in.insn;
+            v.e.data_len := length;
+            v.e.byte_reverse := d_in.decode.byte_reverse;
+            v.e.sign_extend := d_in.decode.sign_extend;
+            v.e.update := d_in.decode.update;
+            v.e.reserve := d_in.decode.reserve;
+            v.e.br_pred := d_in.br_pred;
+            v.e.result_sel := result_select(op);
+            v.e.sub_select := subresult_select(op);
+            if op = OP_BC or op = OP_BCREG then
+                if d_in.insn(23) = '0' and
+                    not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then
+                    -- decrement CTR if BO(2) = 0 and not bcctr
+                    v.e.addm1 := '1';
+                    v.e.result_sel := "000";        -- select adder output
+                end if;
             end if;
-        end if;
-        if op = OP_MFSPR then
-            if is_fast_spr(d_in.ispr1) = '1' then
-                v.e.result_sel := "000";        -- adder_result, effectively a_in
-            elsif d_in.spr_info.valid = '0' then
-                -- Privileged mfspr to invalid/unimplemented SPR numbers
-                -- writes the contents of RT back to RT (i.e. it's a no-op)
-                v.e.result_sel := "001";        -- logical_result
-            elsif d_in.spr_info.ispmu = '1' then
-                v.e.result_sel := "100";        -- pmuspr_result
+            if op = OP_MFSPR then
+                if is_fast_spr(d_in.ispr1) = '1' then
+                    v.e.result_sel := "000";        -- adder_result, effectively a_in
+                elsif d_in.spr_info.valid = '0' then
+                    -- Privileged mfspr to invalid/unimplemented SPR numbers
+                    -- writes the contents of RT back to RT (i.e. it's a no-op)
+                    v.e.result_sel := "001";        -- logical_result
+                elsif d_in.spr_info.ispmu = '1' then
+                    v.e.result_sel := "100";        -- pmuspr_result
+                end if;
             end if;
-        end if;
 
-        -- See if any of the operands can get their value via the bypass path.
-        case gpr_a_bypass is
-            when "10" =>
-                v.e.read_data1 := execute_bypass.data;
-            when "11" =>
-                v.e.read_data1 := execute2_bypass.data;
-            when others =>
-                v.e.read_data1 := decoded_reg_a.data;
-        end case;
-        case gpr_b_bypass is
-            when "10" =>
-                v.e.read_data2 := execute_bypass.data;
-            when "11" =>
-                v.e.read_data2 := execute2_bypass.data;
-            when others =>
-                v.e.read_data2 := decoded_reg_b.data;
-        end case;
-        case gpr_c_bypass is
-            when "10" =>
-                v.e.read_data3 := execute_bypass.data;
-            when "11" =>
-                v.e.read_data3 := execute2_bypass.data;
-            when others =>
-                v.e.read_data3 := decoded_reg_c.data;
-        end case;
-
-        v.e.cr := c_in.read_cr_data;
-        if cr_bypass = "10" then
-            v.e.cr := execute_cr_bypass.data;
-        elsif cr_bypass = "11" then
-            v.e.cr := execute2_cr_bypass.data;
+        elsif dc2.e.valid = '1' then
+            -- dc2.busy = 1 and dc2.e.valid = 1, thus this must be a repeated instruction.
+            -- Set up for the second iteration (if deferred = 1 this will all be ignored)
+            v.e.second := '1';
+            case dc2.repeat is
+                when DUPD =>
+                    -- update-form loads, 2nd instruction writes RA
+                    v.e.write_reg := dc2.e.read_reg1;
+                when NONE =>
+                    -- bcl/bclrl/bctarl that needs to write both CTR and LR
+                    v.e.write_reg(0) := '0';    -- point to LR
+                    v.e.result_sel := "110";    -- select NIA (to go to LR)
+                when others =>
+            end case;
         end if;
 
         -- issue control
-        control_valid_in <= d_in.valid;
-        control_sgl_pipe <= d_in.decode.sgl_pipe;
+        control_valid_in <= valid_in;
+        control_sgl_pipe <= v.sgl_pipe;
 
-        gpr_write_valid <= v.e.write_reg_enable;
-        gpr_write <= decoded_reg_o.reg;
+        gpr_write_valid <= v.reg_o_valid;
+        gpr_write <= v.e.write_reg;
 
-        gpr_a_read_valid <= decoded_reg_a.reg_valid;
-        gpr_a_read <= decoded_reg_a.reg;
+        gpr_a_read_valid <= v.reg_a_valid;
+        gpr_a_read <= v.e.read_reg1;
 
-        gpr_b_read_valid <= decoded_reg_b.reg_valid;
-        gpr_b_read <= decoded_reg_b.reg;
+        gpr_b_read_valid <= v.reg_b_valid;
+        gpr_b_read <= v.e.read_reg2;
 
-        gpr_c_read_valid <= decoded_reg_c.reg_valid;
-        gpr_c_read <= decoded_reg_c.reg;
+        gpr_c_read_valid <= v.reg_c_valid;
+        gpr_c_read <= v.e.read_reg3;
 
-        cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn);
+        cr_write_valid <= v.e.output_cr or v.e.rc;
         -- Since ops that write CR only write some of the fields,
         -- any op that writes CR effectively also reads it.
-        cr_read_valid <= cr_write_valid or d_in.decode.input_cr;
+        cr_read_valid <= cr_write_valid or v.e.input_cr;
 
-        v.e.valid := control_valid_out;
-        if control_valid_out = '1' then
-            v.repeat := v.e.repeat and not dc2.repeat;
+        -- See if any of the operands can get their value via the bypass path.
+        if dc2.busy = '0' or gpr_a_bypass /= "00" then
+            case gpr_a_bypass is
+                when "01" =>
+                    v.e.read_data1 := execute_bypass.data;
+                when "10" =>
+                    v.e.read_data1 := execute2_bypass.data;
+                when "11" =>
+                    v.e.read_data1 := writeback_bypass.data;
+                when others =>
+                    if decoded_reg_a.reg_valid = '1' then
+                        v.e.read_data1 := r_in.read1_data;
+                    else
+                        v.e.read_data1 := decoded_reg_a.data;
+                    end if;
+            end case;
+        end if;
+        if dc2.busy = '0' or gpr_b_bypass /= "00" then
+            case gpr_b_bypass is
+                when "01" =>
+                    v.e.read_data2 := execute_bypass.data;
+                when "10" =>
+                    v.e.read_data2 := execute2_bypass.data;
+                when "11" =>
+                    v.e.read_data2 := writeback_bypass.data;
+                when others =>
+                    if decoded_reg_b.reg_valid = '1' then
+                        v.e.read_data2 := r_in.read2_data;
+                    else
+                        v.e.read_data2 := decoded_reg_b.data;
+                    end if;
+            end case;
+        end if;
+        if dc2.busy = '0' or gpr_c_bypass /= "00" then
+            case gpr_c_bypass is
+                when "01" =>
+                    v.e.read_data3 := execute_bypass.data;
+                when "10" =>
+                    v.e.read_data3 := execute2_bypass.data;
+                when "11" =>
+                    v.e.read_data3 := writeback_bypass.data;
+                when others =>
+                    if decoded_reg_c.reg_valid = '1' then
+                        v.e.read_data3 := r_in.read3_data;
+                    else
+                        v.e.read_data3 := decoded_reg_c.data;
+                    end if;
+            end case;
         end if;
 
-        stall_out <= control_stall_out or v.repeat;
+        case cr_bypass is
+            when "10" =>
+                v.e.cr := execute_cr_bypass.data;
+            when "11" =>
+                v.e.cr := execute2_cr_bypass.data;
+            when others =>
+                v.e.cr := c_in.read_cr_data;
+        end case;
 
-        if rst = '1' or flush_in = '1' then
-            v.e := Decode2ToExecute1Init;
-            v.repeat := '0';
-        end if;
+        v.e.valid := control_valid_out;
+        v.e.instr_tag := instr_tag;
+        v.busy := valid_in and (not control_valid_out or (v.e.repeat and not v.e.second));
+
+        stall_out <= dc2.busy or deferred;
 
         -- Update registers
         dc2in <= v;
@@ -574,9 +628,9 @@ begin
                             dc2.e.valid &
                             stopped_out &
                             stall_out &
-                            (gpr_a_bypass(1) or gpr_a_bypass(0)) &
-                            (gpr_b_bypass(1) or gpr_b_bypass(0)) &
-                            (gpr_c_bypass(1) or gpr_c_bypass(0));
+                            (gpr_a_bypass(1) xor gpr_a_bypass(0)) &
+                            (gpr_b_bypass(1) xor gpr_b_bypass(0)) &
+                            (gpr_c_bypass(1) xor gpr_c_bypass(0));
             end if;
         end process;
         log_out <= log_data;
diff --git a/register_file.vhdl b/register_file.vhdl
index ab35855..0235dfc 100644
--- a/register_file.vhdl
+++ b/register_file.vhdl
@@ -100,18 +100,8 @@ begin
         d_out.read2_data <= rd_port_b;
         d_out.read3_data <= registers(to_integer(unsigned(c_addr)));
 
-        -- Forward any written data
-        if w_in.write_enable = '1' then
-            if a_addr = w_addr then
-                d_out.read1_data <= w_in.write_data;
-            end if;
-            if b_addr = w_addr then
-                d_out.read2_data <= w_in.write_data;
-            end if;
-            if c_addr = w_addr then
-                d_out.read3_data <= w_in.write_data;
-            end if;
-        end if;
+        -- Forwarding of written data is now done explicitly with a bypass path
+        -- from writeback to decode2.
     end process register_read_0;
 
     -- Latch read data and ack if dbg read requested and B port not busy
diff --git a/writeback.vhdl b/writeback.vhdl
index db30164..0d6f41d 100644
--- a/writeback.vhdl
+++ b/writeback.vhdl
@@ -19,6 +19,8 @@ entity writeback is
         c_out        : out WritebackToCrFileType;
         f_out        : out WritebackToFetch1Type;
 
+        wb_bypass    : out bypass_data_t;
+
         -- PMU event bus
         events       : out WritebackEventType;
 
@@ -215,6 +217,11 @@ begin
         f_out <= f;
         flush_out <= f_out.redirect;
 
+        -- Register write data bypass to decode2
+        wb_bypass.tag.tag <= complete_out.tag;
+        wb_bypass.tag.valid <= complete_out.valid and w_out.write_enable;
+        wb_bypass.data <= w_out.write_data;
+
         rin <= v;
     end process;
 end;

From 2da08bcf2e64e5f77ce8b4098ae27101dceef6cc Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 8 Jul 2022 16:37:12 +1000
Subject: [PATCH 14/30] decode1: Remove stash buffer

Now that the timing of the busy signal from decode2 doesn't depend on
register numbers or downstream instruction completion, we no longer
need the stash buffer on the output of decode1.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl | 24 ++++--------------------
 1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index b807054..5bc023b 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -30,7 +30,6 @@ end entity decode1;
 
 architecture behaviour of decode1 is
     signal r, rin : Decode1ToDecode2Type;
-    signal s      : Decode1ToDecode2Type;
     signal f, fin : Decode1ToFetch1Type;
 
     constant illegal_inst : decode_rom_t :=
@@ -46,7 +45,6 @@ architecture behaviour of decode1 is
         (override => '0', override_decode => illegal_inst, override_unit => '0', force_single => '0');
 
     signal ri, ri_in : reg_internal_t;
-    signal si        : reg_internal_t;
 
     type br_predictor_t is record
         br_nia    : std_ulogic_vector(61 downto 0);
@@ -555,26 +553,12 @@ begin
         if rising_edge(clk) then
             if rst = '1' then
                 r <= Decode1ToDecode2Init;
-                s <= Decode1ToDecode2Init;
                 ri <= reg_internal_t_init;
-                si <= reg_internal_t_init;
             elsif flush_in = '1' then
                 r.valid <= '0';
-                s.valid <= '0';
-            elsif s.valid = '1' then
-                if stall_in = '0' then
-                    r <= s;
-                    ri <= si;
-                    s.valid <= '0';
-                end if;
-            else
-                s <= rin;
-                si <= ri_in;
-                s.valid <= rin.valid and r.valid and stall_in;
-                if r.valid = '0' or stall_in = '0' then
-                    r <= rin;
-                    ri <= ri_in;
-                end if;
+            elsif stall_in = '0' then
+                r <= rin;
+                ri <= ri_in;
             end if;
             if rst = '1' then
                 br.br_nia <= (others => '0');
@@ -585,7 +569,7 @@ begin
             end if;
         end if;
     end process;
-    busy_out <= s.valid;
+    busy_out <= stall_in;
 
     decode1_1: process(all)
         variable v : Decode1ToDecode2Type;

From e598c2aef8067f2fdbcb0f2eab3d945e3eca1335 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 9 Jul 2022 11:55:13 +1000
Subject: [PATCH 15/30] control: Reimplement serialization using tags

This lets us get rid of r_int and its 'outstanding' counter.  We now
test more directly for excess completions by checking that we don't
get duplicate completions for the same tag.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 control.vhdl | 116 +++++++++++----------------------------------------
 decode2.vhdl |  12 ++++--
 2 files changed, 32 insertions(+), 96 deletions(-)

diff --git a/control.vhdl b/control.vhdl
index e6855c2..e5ad1c7 100644
--- a/control.vhdl
+++ b/control.vhdl
@@ -17,7 +17,7 @@ entity control is
         valid_in            : in std_ulogic;
         flush_in            : in std_ulogic;
         deferred            : in std_ulogic;
-        sgl_pipe_in         : in std_ulogic;
+        serialize           : in std_ulogic;
         stop_mark_in        : in std_ulogic;
 
         gpr_write_valid_in  : in std_ulogic;
@@ -53,16 +53,6 @@ entity control is
 end entity control;
 
 architecture rtl of control is
-    type state_type is (IDLE, WAIT_FOR_PREV_TO_COMPLETE, WAIT_FOR_CURR_TO_COMPLETE);
-
-    type reg_internal_type is record
-        state : state_type;
-        outstanding : integer range -1 to PIPELINE_DEPTH+2;
-    end record;
-    constant reg_internal_init : reg_internal_type := (state => IDLE, outstanding => 0);
-
-    signal r_int, rin_int : reg_internal_type := reg_internal_init;
-
     signal gpr_write_valid : std_ulogic;
     signal cr_write_valid  : std_ulogic;
 
@@ -71,6 +61,7 @@ architecture rtl of control is
         reg    : gspr_index_t;
         recent : std_ulogic;
         wr_cr  : std_ulogic;
+        valid  : std_ulogic;
     end record;
 
     type tag_regs_array is array(tag_number_t) of tag_register;
@@ -80,27 +71,29 @@ architecture rtl of control is
 
     signal gpr_tag_stall : std_ulogic;
     signal cr_tag_stall  : std_ulogic;
+    signal serial_stall  : std_ulogic;
 
     signal curr_tag : tag_number_t;
     signal next_tag : tag_number_t;
 
     signal curr_cr_tag : tag_number_t;
+    signal prev_tag : tag_number_t;
 
 begin
     control0: process(clk)
     begin
         if rising_edge(clk) then
-            assert rin_int.outstanding >= 0 and rin_int.outstanding <= (PIPELINE_DEPTH+1)
-                report "Outstanding bad " & integer'image(rin_int.outstanding) severity failure;
-            r_int <= rin_int;
             for i in tag_number_t loop
                 if rst = '1' or flush_in = '1' then
                     tag_regs(i).wr_gpr <= '0';
                     tag_regs(i).wr_cr <= '0';
+                    tag_regs(i).valid <= '0';
                 else
                     if complete_in.valid = '1' and i = complete_in.tag then
+                        assert tag_regs(i).valid = '1' report "spurious completion" severity failure;
                         tag_regs(i).wr_gpr <= '0';
                         tag_regs(i).wr_cr <= '0';
+                        tag_regs(i).valid <= '0';
                         report "tag " & integer'image(i) & " not valid";
                     end if;
                     if instr_tag.valid = '1' and gpr_write_valid = '1' and
@@ -115,6 +108,7 @@ begin
                         tag_regs(i).reg <= gpr_write_in;
                         tag_regs(i).recent <= gpr_write_valid;
                         tag_regs(i).wr_cr <= cr_write_valid;
+                        tag_regs(i).valid <= '1';
                         if gpr_write_valid = '1' then
                             report "tag " & integer'image(i) & " valid for gpr " & to_hstring(gpr_write_in);
                         end if;
@@ -124,11 +118,15 @@ begin
             if rst = '1' then
                 curr_tag <= 0;
                 curr_cr_tag <= 0;
+                prev_tag <= 0;
             else
                 curr_tag <= next_tag;
                 if instr_tag.valid = '1' and cr_write_valid = '1' then
                     curr_cr_tag <= instr_tag.tag;
                 end if;
+                if valid_out = '1' then
+                    prev_tag <= instr_tag.tag;
+                end if;
             end if;
         end if;
     end process;
@@ -146,6 +144,7 @@ begin
         variable byp_c : std_ulogic_vector(1 downto 0);
         variable tag_cr : instr_tag_t;
         variable byp_cr : std_ulogic_vector(1 downto 0);
+        variable tag_prev : instr_tag_t;
     begin
         tag_a := instr_tag_init;
         for i in tag_number_t loop
@@ -226,107 +225,40 @@ begin
 
         cr_bypass <= byp_cr;
         cr_tag_stall <= tag_cr.valid and not byp_cr(1);
+
+        tag_prev.tag := prev_tag;
+        tag_prev.valid := tag_regs(prev_tag).valid;
+        if tag_match(tag_prev, complete_in) then
+            tag_prev.valid := '0';
+        end if;
+        serial_stall <= tag_prev.valid;
     end process;
 
     control1 : process(all)
-        variable v_int : reg_internal_type;
         variable valid_tmp : std_ulogic;
-        variable stall_tmp : std_ulogic;
     begin
-        v_int := r_int;
-
         -- asynchronous
         valid_tmp := valid_in and not flush_in;
-        stall_tmp := '0';
-
-        if flush_in = '1' then
-            v_int.outstanding := 0;
-        elsif complete_in.valid = '1' then
-            v_int.outstanding := r_int.outstanding - 1;
-        end if;
-        if r_int.outstanding >= PIPELINE_DEPTH + 1 then
-            valid_tmp := '0';
-            stall_tmp := '1';
-        end if;
 
         if rst = '1' then
             gpr_write_valid <= '0';
             cr_write_valid <= '0';
-            v_int := reg_internal_init;
             valid_tmp := '0';
         end if;
 
         -- Handle debugger stop
-        stopped_out <= '0';
-        if stop_mark_in = '1' and v_int.outstanding = 0 then
-            stopped_out <= '1';
-        end if;
-
-        -- state machine to handle instructions that must be single
-        -- through the pipeline.
-        case r_int.state is
-            when IDLE =>
-                if valid_tmp = '1' then
-                    if (sgl_pipe_in = '1') then
-                        if v_int.outstanding /= 0 then
-                            v_int.state := WAIT_FOR_PREV_TO_COMPLETE;
-                            stall_tmp := '1';
-                        else
-                            -- send insn out and wait on it to complete
-                            v_int.state := WAIT_FOR_CURR_TO_COMPLETE;
-                        end if;
-                    else
-                        -- let it go out if there are no GPR or CR hazards
-                        stall_tmp := gpr_tag_stall or cr_tag_stall;
-                    end if;
-                end if;
-
-            when WAIT_FOR_PREV_TO_COMPLETE =>
-                if v_int.outstanding = 0 then
-                    -- send insn out and wait on it to complete
-                    v_int.state := WAIT_FOR_CURR_TO_COMPLETE;
-                else
-                    stall_tmp := '1';
-                end if;
-
-            when WAIT_FOR_CURR_TO_COMPLETE =>
-                if v_int.outstanding = 0 then
-                    v_int.state := IDLE;
-                    -- XXX Don't replicate this
-                    if valid_tmp = '1' then
-                        if (sgl_pipe_in = '1') then
-                            if v_int.outstanding /= 0 then
-                                v_int.state := WAIT_FOR_PREV_TO_COMPLETE;
-                                stall_tmp := '1';
-                            else
-                                -- send insn out and wait on it to complete
-                                v_int.state := WAIT_FOR_CURR_TO_COMPLETE;
-                            end if;
-                        else
-                            -- let it go out if there are no GPR or CR hazards
-                            stall_tmp := gpr_tag_stall or cr_tag_stall;
-                        end if;
-                    end if;
-                else
-                    stall_tmp := '1';
-                end if;
-        end case;
+        stopped_out <= stop_mark_in and not serial_stall;
 
-        if stall_tmp = '1' then
+        -- Don't let it go out if there are GPR or CR hazards
+        -- or we are waiting for the previous instruction to complete
+        if (gpr_tag_stall or cr_tag_stall or (serialize and serial_stall)) = '1' then
             valid_tmp := '0';
         end if;
 
         gpr_write_valid <= gpr_write_valid_in and valid_tmp;
         cr_write_valid <= cr_write_in and valid_tmp;
 
-        if valid_tmp = '1' and deferred = '0' then
-            v_int.outstanding := v_int.outstanding + 1;
-        end if;
-
         -- update outputs
         valid_out <= valid_tmp;
-
-        -- update registers
-        rin_int <= v_int;
     end process;
 end;
diff --git a/decode2.vhdl b/decode2.vhdl
index 41f3e09..500e4f5 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -53,6 +53,7 @@ architecture behaviour of decode2 is
         repeat : repeat_t;
         busy : std_ulogic;
         sgl_pipe : std_ulogic;
+        prev_sgl : std_ulogic;
         reg_a_valid : std_ulogic;
         reg_b_valid : std_ulogic;
         reg_c_valid : std_ulogic;
@@ -281,7 +282,7 @@ architecture behaviour of decode2 is
     -- issue control signals
     signal control_valid_in : std_ulogic;
     signal control_valid_out : std_ulogic;
-    signal control_sgl_pipe : std_logic;
+    signal control_serialize : std_logic;
 
     signal gpr_write_valid : std_ulogic;
     signal gpr_write : gspr_index_t;
@@ -317,7 +318,7 @@ begin
             valid_in    => control_valid_in,
             deferred    => deferred,
             flush_in    => flush_in,
-            sgl_pipe_in => control_sgl_pipe,
+            serialize   => control_serialize,
             stop_mark_in => d_in.stop_mark,
 
             gpr_write_valid_in => gpr_write_valid,
@@ -405,7 +406,10 @@ begin
         if dc2.busy = '0' then
             v.e := Decode2ToExecute1Init;
 
-            v.sgl_pipe := d_in.decode.sgl_pipe;
+            if d_in.valid = '1' then
+                v.prev_sgl := dc2.sgl_pipe;
+                v.sgl_pipe := d_in.decode.sgl_pipe;
+            end if;
 
             v.e.input_cr := d_in.decode.input_cr;
             v.e.output_cr := d_in.decode.output_cr;
@@ -527,7 +531,7 @@ begin
 
         -- issue control
         control_valid_in <= valid_in;
-        control_sgl_pipe <= v.sgl_pipe;
+        control_serialize <= v.sgl_pipe or v.prev_sgl;
 
         gpr_write_valid <= v.reg_o_valid;
         gpr_write <= v.e.write_reg;

From 7c240a664bb68bc1d9c35254fe06e07436eb1318 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 9 Jul 2022 13:17:18 +1000
Subject: [PATCH 16/30] fetch1: Fix debug stop again

This fixes a bug which prevents the core from stopping properly.  The
same bug was previously fixed in commit e41cb01bca99 ("fetch1: Fix
debug stop", 2020-12-19) and reintroduced by commit 0fb207be6069
("fetch1: Implement a simple branch target cache", 2020-12-19).

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fetch1.vhdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fetch1.vhdl b/fetch1.vhdl
index 4c4a6a8..af1dd6b 100644
--- a/fetch1.vhdl
+++ b/fetch1.vhdl
@@ -93,7 +93,7 @@ begin
             end if;
             -- always send the up-to-date stop mark and req
             r.stop_mark <= stop_in;
-            r.req <= not rst;
+            r.req <= not rst and not stop_in;
 	end if;
     end process;
     log_out <= log_nia;

From d1850fea29a88bcb4f7789da1e4e50550c2eb9ec Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 9 Jul 2022 18:29:48 +1000
Subject: [PATCH 17/30] Track hazards explicitly for XER overflow bits

This provides a mechanism for tracking updates to the XER overflow
bits (SO, OV, OV32) and stalling instructions which need current
values of those bits (mfxer, integer compare instructions, integer
Rc=1 instructions, addex) or which write carry bits (since all the
XER common bits are written together, if we are writing CA/CA32 we
need up-to-date values of SO/OV/OV32).

This will enable updates to SO/OV/OV32 to be done at other places
besides the ex1 stage.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 control.vhdl  | 26 +++++++++++++++++++++++++-
 decode2.vhdl  | 36 +++++++++++++++++++++++++++++++++---
 execute1.vhdl | 20 +++++++++++++-------
 3 files changed, 71 insertions(+), 11 deletions(-)

diff --git a/control.vhdl b/control.vhdl
index e5ad1c7..e8c8068 100644
--- a/control.vhdl
+++ b/control.vhdl
@@ -39,6 +39,8 @@ entity control is
 
         cr_read_in          : in std_ulogic;
         cr_write_in         : in std_ulogic;
+        ov_read_in          : in std_ulogic;
+        ov_write_in         : in std_ulogic;
 
         valid_out           : out std_ulogic;
         stopped_out         : out std_ulogic;
@@ -55,12 +57,14 @@ end entity control;
 architecture rtl of control is
     signal gpr_write_valid : std_ulogic;
     signal cr_write_valid  : std_ulogic;
+    signal ov_write_valid  : std_ulogic;
 
     type tag_register is record
         wr_gpr : std_ulogic;
         reg    : gspr_index_t;
         recent : std_ulogic;
         wr_cr  : std_ulogic;
+        wr_ov  : std_ulogic;
         valid  : std_ulogic;
     end record;
 
@@ -71,12 +75,14 @@ architecture rtl of control is
 
     signal gpr_tag_stall : std_ulogic;
     signal cr_tag_stall  : std_ulogic;
+    signal ov_tag_stall  : std_ulogic;
     signal serial_stall  : std_ulogic;
 
     signal curr_tag : tag_number_t;
     signal next_tag : tag_number_t;
 
     signal curr_cr_tag : tag_number_t;
+    signal curr_ov_tag : tag_number_t;
     signal prev_tag : tag_number_t;
 
 begin
@@ -87,12 +93,14 @@ begin
                 if rst = '1' or flush_in = '1' then
                     tag_regs(i).wr_gpr <= '0';
                     tag_regs(i).wr_cr <= '0';
+                    tag_regs(i).wr_ov <= '0';
                     tag_regs(i).valid <= '0';
                 else
                     if complete_in.valid = '1' and i = complete_in.tag then
                         assert tag_regs(i).valid = '1' report "spurious completion" severity failure;
                         tag_regs(i).wr_gpr <= '0';
                         tag_regs(i).wr_cr <= '0';
+                        tag_regs(i).wr_ov <= '0';
                         tag_regs(i).valid <= '0';
                         report "tag " & integer'image(i) & " not valid";
                     end if;
@@ -108,6 +116,7 @@ begin
                         tag_regs(i).reg <= gpr_write_in;
                         tag_regs(i).recent <= gpr_write_valid;
                         tag_regs(i).wr_cr <= cr_write_valid;
+                        tag_regs(i).wr_ov <= ov_write_valid;
                         tag_regs(i).valid <= '1';
                         if gpr_write_valid = '1' then
                             report "tag " & integer'image(i) & " valid for gpr " & to_hstring(gpr_write_in);
@@ -118,12 +127,16 @@ begin
             if rst = '1' then
                 curr_tag <= 0;
                 curr_cr_tag <= 0;
+                curr_ov_tag <= 0;
                 prev_tag <= 0;
             else
                 curr_tag <= next_tag;
                 if instr_tag.valid = '1' and cr_write_valid = '1' then
                     curr_cr_tag <= instr_tag.tag;
                 end if;
+                if instr_tag.valid = '1' and ov_write_valid = '1' then
+                    curr_ov_tag <= instr_tag.tag;
+                end if;
                 if valid_out = '1' then
                     prev_tag <= instr_tag.tag;
                 end if;
@@ -144,6 +157,7 @@ begin
         variable byp_c : std_ulogic_vector(1 downto 0);
         variable tag_cr : instr_tag_t;
         variable byp_cr : std_ulogic_vector(1 downto 0);
+        variable tag_ov : instr_tag_t;
         variable tag_prev : instr_tag_t;
     begin
         tag_a := instr_tag_init;
@@ -226,6 +240,14 @@ begin
         cr_bypass <= byp_cr;
         cr_tag_stall <= tag_cr.valid and not byp_cr(1);
 
+        -- OV hazards
+        tag_ov.tag := curr_ov_tag;
+        tag_ov.valid := ov_read_in and tag_regs(curr_ov_tag).wr_ov;
+        if tag_match(tag_ov, complete_in) then
+            tag_ov.valid := '0';
+        end if;
+        ov_tag_stall <= tag_ov.valid;
+
         tag_prev.tag := prev_tag;
         tag_prev.valid := tag_regs(prev_tag).valid;
         if tag_match(tag_prev, complete_in) then
@@ -251,12 +273,14 @@ begin
 
         -- Don't let it go out if there are GPR or CR hazards
         -- or we are waiting for the previous instruction to complete
-        if (gpr_tag_stall or cr_tag_stall or (serialize and serial_stall)) = '1' then
+        if (gpr_tag_stall or cr_tag_stall or ov_tag_stall or
+            (serialize and serial_stall)) = '1' then
             valid_tmp := '0';
         end if;
 
         gpr_write_valid <= gpr_write_valid_in and valid_tmp;
         cr_write_valid <= cr_write_in and valid_tmp;
+        ov_write_valid <= ov_write_in and valid_tmp;
 
         -- update outputs
         valid_out <= valid_tmp;
diff --git a/decode2.vhdl b/decode2.vhdl
index 500e4f5..a043ef9 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -58,6 +58,8 @@ architecture behaviour of decode2 is
         reg_b_valid : std_ulogic;
         reg_c_valid : std_ulogic;
         reg_o_valid : std_ulogic;
+        input_ov  : std_ulogic;
+        output_ov : std_ulogic;
     end record;
     constant reg_type_init : reg_type :=
         (e => Decode2ToExecute1Init, repeat => NONE, others => '0');
@@ -303,6 +305,9 @@ architecture behaviour of decode2 is
     signal cr_write_valid  : std_ulogic;
     signal cr_bypass       : std_ulogic_vector(1 downto 0);
 
+    signal ov_read_valid   : std_ulogic;
+    signal ov_write_valid  : std_ulogic;
+
     signal instr_tag       : instr_tag_t;
 
 begin
@@ -342,6 +347,9 @@ begin
             cr_write_in          => cr_write_valid,
             cr_bypass            => cr_bypass,
 
+            ov_read_in           => ov_read_valid,
+            ov_write_in          => ov_write_valid,
+
             valid_out   => control_valid_out,
             stopped_out => stopped_out,
 
@@ -414,19 +422,39 @@ begin
             v.e.input_cr := d_in.decode.input_cr;
             v.e.output_cr := d_in.decode.output_cr;
 
-            -- Work out whether XER common bits are set
+            -- Work out whether XER SO/OV/OV32 bits are set
+            -- or used by this instruction
+            v.e.rc := decode_rc(d_in.decode.rc, d_in.insn);
             v.e.output_xer := d_in.decode.output_carry;
+            v.input_ov := d_in.decode.output_carry;
+            v.output_ov := '0';
+            if d_in.decode.input_carry = OV then
+                v.input_ov := '1';
+                v.output_ov := '1';
+            end if;
+            if v.e.rc = '1' and d_in.decode.facility /= FPU then
+                v.input_ov := '1';
+            end if;
             case d_in.decode.insn_type is
                 when OP_ADD | OP_MUL_L64 | OP_DIV | OP_DIVE =>
                     -- OE field is valid in OP_ADD/OP_MUL_L64 with major opcode 31 only
                     if d_in.insn(31 downto 26) = "011111" and insn_oe(d_in.insn) = '1' then
                         v.e.oe := '1';
                         v.e.output_xer := '1';
+                        v.output_ov := '1';
+                        v.input_ov := '1';      -- need SO state if setting OV to 0
+                    end if;
+                when OP_MFSPR =>
+                    if decode_spr_num(d_in.insn) = SPR_XER then
+                        v.input_ov := '1';
                     end if;
                 when OP_MTSPR =>
                     if decode_spr_num(d_in.insn) = SPR_XER then
                         v.e.output_xer := '1';
+                        v.output_ov := '1';
                     end if;
+                when OP_CMP | OP_MCRXRX =>
+                    v.input_ov := '1';
                 when others =>
             end case;
 
@@ -474,8 +502,6 @@ begin
             v.e.read_reg3 := decoded_reg_c.reg;
             v.e.write_reg := decoded_reg_o.reg;
             v.e.write_reg_enable := decoded_reg_o.reg_valid;
-            v.e.rc := decode_rc(d_in.decode.rc, d_in.insn);
-            v.e.xerc := c_in.read_xerc_data;
             v.e.invert_a := d_in.decode.invert_a;
             v.e.addm1 := '0';
             v.e.insn_type := op;
@@ -550,6 +576,9 @@ begin
         -- any op that writes CR effectively also reads it.
         cr_read_valid <= cr_write_valid or v.e.input_cr;
 
+        ov_read_valid <= v.input_ov;
+        ov_write_valid <= v.output_ov;
+
         -- See if any of the operands can get their value via the bypass path.
         if dc2.busy = '0' or gpr_a_bypass /= "00" then
             case gpr_a_bypass is
@@ -608,6 +637,7 @@ begin
             when others =>
                 v.e.cr := c_in.read_cr_data;
         end case;
+        v.e.xerc := c_in.read_xerc_data;
 
         v.e.valid := control_valid_out;
         v.e.instr_tag := instr_tag;
diff --git a/execute1.vhdl b/execute1.vhdl
index 57f90b0..6fadc8c 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -435,12 +435,18 @@ begin
     x_to_pmu.spr_val <= ex1.e.write_data;
     x_to_pmu.run <= '1';
 
-    -- XER forwarding. To avoid having to track XER hazards, we use
-    -- the previously latched value.  Since the XER common bits
-    -- (SO, OV[32] and CA[32]) are only modified by instructions that are
-    -- handled here, we can just use the result most recently sent to
-    -- writeback, unless a pipeline flush has happened in the meantime.
-    xerc_in <= ex1.xerc when ex1.xerc_valid = '1' else e_in.xerc;
+    -- XER forwarding.  The CA and CA32 bits are only modified by instructions
+    -- that are handled here, so for them we can just use the result most
+    -- recently sent to writeback, unless a pipeline flush has happened in the
+    -- meantime.
+    -- Hazards for SO/OV/OV32 are handled by control.vhdl as there may be other
+    -- units writing to them.  No forwarding is done because performance of
+    -- instructions that alter them is not considered significant.
+    xerc_in.so <= e_in.xerc.so;
+    xerc_in.ov <= e_in.xerc.ov;
+    xerc_in.ov32 <= e_in.xerc.ov32;
+    xerc_in.ca <= ex1.xerc.ca when ex1.xerc_valid = '1' else e_in.xerc.ca;
+    xerc_in.ca32 <= ex1.xerc.ca32 when ex1.xerc_valid = '1' else e_in.xerc.ca32;
 
     -- N.B. the busy signal from each source includes the
     -- stage2 stall from that source in it.
@@ -1561,7 +1567,7 @@ begin
             cr_res(31) := sign;
             cr_res(30) := not (sign or zero);
             cr_res(29) := zero;
-            cr_res(28) := ex1.xerc.so;
+            cr_res(28) := ex1.e.xerc.so;
             cr_mask(7) := '1';
         end if;
 

From 23d5c4edc50bf64a7e675220c338671059ede0bf Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 2 May 2022 09:39:26 +1000
Subject: [PATCH 18/30] FPU: Convert internal R, A, B, and C registers to 8.56
 format

This changes the representation of the R, A, B and C registers in the
FPU from 10.54 format (10 bits to the left of the binary point and 54
bits to the right) to 8.56 format, to match the representation used in
the P and Y registers and the multiplier operands.  This eliminates
the need for shifting when R, A, B or C is an input to the multiplier
and will make it easier to implement integer division in the FPU.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 220 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 123 insertions(+), 97 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index a20a7a0..27587f7 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -28,12 +28,20 @@ architecture behaviour of fpu is
     type fp_number_class is (ZERO, FINITE, INFINITY, NAN);
 
     constant EXP_BITS : natural := 13;
+    constant UNIT_BIT : natural := 56;
+    constant QNAN_BIT : natural := UNIT_BIT - 1;
+    constant SP_LSB   : natural := UNIT_BIT - 23;
+    constant SP_GBIT  : natural := SP_LSB - 1;
+    constant SP_RBIT  : natural := SP_LSB - 2;
+    constant DP_LSB   : natural := UNIT_BIT - 52;
+    constant DP_GBIT  : natural := DP_LSB - 1;
+    constant DP_RBIT  : natural := DP_LSB - 2;
 
     type fpu_reg_type is record
         class    : fp_number_class;
         negative : std_ulogic;
         exponent : signed(EXP_BITS-1 downto 0);         -- unbiased
-        mantissa : std_ulogic_vector(63 downto 0);      -- 10.54 format
+        mantissa : std_ulogic_vector(63 downto 0);      -- 8.56 format
     end record;
 
     type state_t is (IDLE, DO_ILLEGAL,
@@ -92,7 +100,7 @@ architecture behaviour of fpu is
         a            : fpu_reg_type;
         b            : fpu_reg_type;
         c            : fpu_reg_type;
-        r            : std_ulogic_vector(63 downto 0);  -- 10.54 format
+        r            : std_ulogic_vector(63 downto 0);  -- 8.56 format
         s            : std_ulogic_vector(55 downto 0);  -- extended fraction
         x            : std_ulogic;
         p            : std_ulogic_vector(63 downto 0);  -- 8.56 format
@@ -170,7 +178,7 @@ architecture behaviour of fpu is
     constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00";
     constant BIN_R    : std_ulogic_vector(1 downto 0) := "01";
     constant BIN_RND  : std_ulogic_vector(1 downto 0) := "10";
-    constant BIN_PS6  : std_ulogic_vector(1 downto 0) := "11";
+    constant BIN_PS8  : std_ulogic_vector(1 downto 0) := "11";
 
     constant RES_SUM   : std_ulogic_vector(1 downto 0) := "00";
     constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01";
@@ -432,7 +440,8 @@ architecture behaviour of fpu is
             if exp_nz = '0' then
                 r.exponent := to_signed(-1022, EXP_BITS);
             end if;
-            r.mantissa := "000000000" & exp_nz & fpr(51 downto 0) & "00";
+            r.mantissa := std_ulogic_vector(shift_left(resize(unsigned(exp_nz & fpr(51 downto 0)), 64),
+                                                       UNIT_BIT - 52));
             cls := exp_ao & exp_nz & frac_nz;
             case cls is
                 when "000"  => r.class := ZERO;
@@ -465,22 +474,22 @@ architecture behaviour of fpu is
         case class is
             when ZERO =>
             when FINITE =>
-                if mantissa(54) = '1' then
+                if mantissa(UNIT_BIT) = '1' then
                     -- normalized number
                     result(62 downto 52) := std_ulogic_vector(resize(exp, 11) + 1023);
                 end if;
-                result(51 downto 29) := mantissa(53 downto 31);
+                result(51 downto 29) := mantissa(UNIT_BIT - 1 downto SP_LSB);
                 if single_prec = '0' then
-                    result(28 downto 0) := mantissa(30 downto 2);
+                    result(28 downto 0) := mantissa(SP_LSB - 1 downto DP_LSB);
                 end if;
             when INFINITY =>
                 result(62 downto 52) := "11111111111";
             when NAN =>
                 result(62 downto 52) := "11111111111";
-                result(51) := quieten_nan or mantissa(53);
-                result(50 downto 29) := mantissa(52 downto 31);
+                result(51) := quieten_nan or mantissa(QNAN_BIT);
+                result(50 downto 29) := mantissa(QNAN_BIT - 1 downto SP_LSB);
                 if single_prec = '0' then
-                    result(28 downto 0) := mantissa(30 downto 2);
+                    result(28 downto 0) := mantissa(SP_LSB - 1 downto DP_LSB);
                 end if;
         end case;
         return result;
@@ -488,8 +497,8 @@ architecture behaviour of fpu is
 
     -- Determine whether to increment when rounding
     -- Returns rounding_inc & inexact
-    -- Assumes x includes the bottom 29 bits of the mantissa already
-    -- if single_prec = 1 (usually arranged by setting set_x = 1 earlier).
+    -- If single_prec = 1, assumes x includes the bottom 31 (== SP_LSB - 2)
+    -- bits of the mantissa already (usually arranged by setting set_x = 1 earlier).
     function fp_rounding(mantissa: std_ulogic_vector(63 downto 0); x: std_ulogic;
                          single_prec: std_ulogic; rn: std_ulogic_vector(2 downto 0);
                          sign: std_ulogic)
@@ -499,11 +508,11 @@ architecture behaviour of fpu is
         variable lsb : std_ulogic;
     begin
         if single_prec = '0' then
-            grx := mantissa(1 downto 0) & x;
-            lsb := mantissa(2);
+            grx := mantissa(DP_GBIT downto DP_RBIT) & (x or (or mantissa(DP_RBIT - 1 downto 0)));
+            lsb := mantissa(DP_LSB);
         else
-            grx := mantissa(30 downto 29) & x;
-            lsb := mantissa(31);
+            grx := mantissa(SP_GBIT downto SP_RBIT) & x;
+            lsb := mantissa(SP_LSB);
         end if;
         ret(1) := '0';
         ret(0) := or (grx);
@@ -589,11 +598,11 @@ begin
     begin
         if rising_edge(clk) then
             if r.is_sqrt = '1' then
-                addrhi := r.b.mantissa(55 downto 54);
+                addrhi := r.b.mantissa(UNIT_BIT + 1 downto UNIT_BIT);
             else
                 addrhi := "00";
             end if;
-            addr := addrhi & r.b.mantissa(53 downto 46);
+            addr := addrhi & r.b.mantissa(UNIT_BIT - 1 downto UNIT_BIT - 8);
             inverse_est <= '1' & inverse_table(to_integer(unsigned(addr)));
         end if;
     end process;
@@ -670,6 +679,8 @@ begin
         variable maddend     : std_ulogic_vector(127 downto 0);
         variable sum         : std_ulogic_vector(63 downto 0);
         variable round_inc   : std_ulogic_vector(63 downto 0);
+        variable rbit_inc    : std_ulogic;
+        variable mult_mask   : std_ulogic;
         variable int_result  : std_ulogic;
         variable illegal     : std_ulogic;
     begin
@@ -729,8 +740,8 @@ begin
             end if;
         end if;
 
-        r_hi_nz <= or (r.r(55 downto 31));
-        r_lo_nz <= or (r.r(30 downto 2));
+        r_hi_nz <= or (r.r(UNIT_BIT + 1 downto SP_LSB));
+        r_lo_nz <= or (r.r(SP_LSB - 1 downto DP_LSB));
         s_nz <= or (r.s);
 
         if r.single_prec = '0' then
@@ -761,13 +772,13 @@ begin
         end if;
 
         -- Compare P with zero and with B
-        px_nz := or (r.p(57 downto 4));
+        px_nz := or (r.p(UNIT_BIT + 1 downto 4));
         pcmpb_eq := '0';
-        if r.p(59 downto 4) = r.b.mantissa(55 downto 0) then
+        if r.p(59 downto 4) = r.b.mantissa(UNIT_BIT + 1 downto DP_RBIT) then
             pcmpb_eq := '1';
         end if;
         pcmpb_lt := '0';
-        if unsigned(r.p(59 downto 4)) < unsigned(r.b.mantissa(55 downto 0)) then
+        if unsigned(r.p(59 downto 4)) < unsigned(r.b.mantissa(UNIT_BIT + 1 downto DP_RBIT)) then
             pcmpb_lt := '1';
         end if;
 
@@ -805,6 +816,8 @@ begin
         pshift := '0';
         renorm_sqrt := '0';
         shiftin := '0';
+        rbit_inc := '0';
+        mult_mask := '0';
         int_result := '0';
         illegal := '0';
         case r.state is
@@ -870,7 +883,7 @@ begin
                             v.state := DO_FCTI;
                         when "10010" =>
                             v.opsel_a := AIN_A;
-                            if v.b.mantissa(54) = '0' and v.a.mantissa(54) = '1' then
+                            if v.b.mantissa(UNIT_BIT) = '0' and v.a.mantissa(UNIT_BIT) = '1' then
                                 v.opsel_a := AIN_B;
                             end if;
                             v.state := DO_FDIV;
@@ -889,7 +902,7 @@ begin
                         when "11001" =>
                             v.is_multiply := '1';
                             v.opsel_a := AIN_A;
-                            if v.c.mantissa(54) = '0' and v.a.mantissa(54) = '1' then
+                            if v.c.mantissa(UNIT_BIT) = '0' and v.a.mantissa(UNIT_BIT) = '1' then
                                 v.opsel_a := AIN_C;
                             end if;
                             v.state := DO_FMUL;
@@ -898,9 +911,9 @@ begin
                             v.opsel_a := AIN_B;
                             v.state := DO_FRSQRTE;
                         when "11100" | "11101" | "11110" | "11111" =>
-                            if v.a.mantissa(54) = '0' then
+                            if v.a.mantissa(UNIT_BIT) = '0' then
                                 v.opsel_a := AIN_A;
-                            elsif v.c.mantissa(54) = '0' then
+                            elsif v.c.mantissa(UNIT_BIT) = '0' then
                                 v.opsel_a := AIN_C;
                             else
                                 v.opsel_a := AIN_B;
@@ -934,7 +947,7 @@ begin
                 v.instr_done := '1';
                 v.cr_result := "0000";
                 if r.a.class = INFINITY or r.b.class = ZERO or r.b.class = INFINITY or
-                    (r.b.class = FINITE and r.b.mantissa(53) = '0') then
+                    (r.b.class = FINITE and r.b.mantissa(UNIT_BIT) = '0') then
                     v.cr_result(2) := '1';
                 end if;
                 if r.a.class = NAN or r.a.class = INFINITY or
@@ -952,7 +965,7 @@ begin
                 v.instr_done := '1';
                 v.cr_result := "0000";
                 if r.b.class = ZERO or r.b.class = INFINITY or
-                    (r.b.class = FINITE and r.b.mantissa(53) = '0') then
+                    (r.b.class = FINITE and r.b.mantissa(UNIT_BIT) = '0') then
                     v.cr_result(2) := '1';
                 end if;
                 if r.b.class = NAN or r.b.class = INFINITY or r.b.class = ZERO
@@ -966,8 +979,8 @@ begin
                 v.instr_done := '1';
                 update_fx := '1';
                 v.result_exp := r.b.exponent;
-                if (r.a.class = NAN and r.a.mantissa(53) = '0') or
-                    (r.b.class = NAN and r.b.mantissa(53) = '0') then
+                if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or
+                    (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') then
                     -- Signalling NAN
                     v.fpscr(FPSCR_VXSNAN) := '1';
                     if r.insn(6) = '1' and r.fpscr(FPSCR_VE) = '0' then
@@ -1119,7 +1132,7 @@ begin
                 v.result_exp := r.b.exponent;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
-                if r.b.class = NAN and r.b.mantissa(53) = '0' then
+                if r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0' then
                     -- Signalling NAN
                     v.fpscr(FPSCR_VXSNAN) := '1';
                     invalid := '1';
@@ -1190,7 +1203,7 @@ begin
                         elsif r.b.exponent >= to_signed(52, EXP_BITS) then
                             -- integer already, no rounding required,
                             -- shift into final position
-                            v.shift := r.b.exponent - to_signed(54, EXP_BITS);
+                            v.shift := r.b.exponent - to_signed(UNIT_BIT, EXP_BITS);
                             if r.insn(8) = '1' and r.b.negative = '1' then
                                 v.state := INT_OFLOW;
                             else
@@ -1214,7 +1227,7 @@ begin
                     v.result_sign := '1';
                 end if;
                 v.result_class := r.b.class;
-                v.result_exp := to_signed(54, EXP_BITS);
+                v.result_exp := to_signed(UNIT_BIT, EXP_BITS);
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
                 if r.b.class = ZERO then
@@ -1286,9 +1299,9 @@ begin
                 if r.a.class = FINITE and r.c.class = FINITE then
                     v.result_exp := r.a.exponent + r.c.exponent;
                     -- Renormalize denorm operands
-                    if r.a.mantissa(54) = '0' then
+                    if r.a.mantissa(UNIT_BIT) = '0' then
                         v.state := RENORM_A;
-                    elsif r.c.mantissa(54) = '0' then
+                    elsif r.c.mantissa(UNIT_BIT) = '0' then
                         v.state := RENORM_C;
                     else
                         f_to_multiply.valid <= '1';
@@ -1325,9 +1338,9 @@ begin
                 v.count := "00";
                 if r.a.class = FINITE and r.b.class = FINITE then
                     -- Renormalize denorm operands
-                    if r.a.mantissa(54) = '0' then
+                    if r.a.mantissa(UNIT_BIT) = '0' then
                         v.state := RENORM_A;
-                    elsif r.b.mantissa(54) = '0' then
+                    elsif r.b.mantissa(UNIT_BIT) = '0' then
                         v.state := RENORM_B;
                     else
                         v.first := '1';
@@ -1384,7 +1397,7 @@ begin
                         if r.b.negative = '1' then
                             v.fpscr(FPSCR_VXSQRT) := '1';
                             qnan_result := '1';
-                        elsif r.b.mantissa(54) = '0' then
+                        elsif r.b.mantissa(UNIT_BIT) = '0' then
                             v.state := RENORM_B;
                         elsif r.b.exponent(0) = '0' then
                             v.state := SQRT_1;
@@ -1416,7 +1429,7 @@ begin
                 case r.b.class is
                     when FINITE =>
                         v.result_exp := - r.b.exponent;
-                        if r.b.mantissa(54) = '0' then
+                        if r.b.mantissa(UNIT_BIT) = '0' then
                             v.state := RENORM_B;
                         else
                             v.state := FRE_1;
@@ -1446,7 +1459,7 @@ begin
                         if r.b.negative = '1' then
                             v.fpscr(FPSCR_VXSQRT) := '1';
                             qnan_result := '1';
-                        elsif r.b.mantissa(54) = '0' then
+                        elsif r.b.mantissa(UNIT_BIT) = '0' then
                             v.state := RENORM_B;
                         elsif r.b.exponent(0) = '0' then
                             v.state := RSQRT_1;
@@ -1488,9 +1501,9 @@ begin
                     mulexp := r.a.exponent + r.c.exponent;
                     v.result_exp := mulexp;
                     -- Make sure A and C are normalized
-                    if r.a.mantissa(54) = '0' then
+                    if r.a.mantissa(UNIT_BIT) = '0' then
                         v.state := RENORM_A;
-                    elsif r.c.mantissa(54) = '0' then
+                    elsif r.c.mantissa(UNIT_BIT) = '0' then
                         v.state := RENORM_C;
                     elsif r.b.class = ZERO then
                         -- no addend, degenerates to multiply
@@ -1559,7 +1572,7 @@ begin
                 set_a := '1';
                 v.result_exp := new_exp;
                 if r.insn(4) = '1' then
-                    if r.c.mantissa(54) = '1' then
+                    if r.c.mantissa(UNIT_BIT) = '1' then
                         if r.insn(3) = '0' or r.b.class = ZERO then
                             v.first := '1';
                             v.state := MULT_1;
@@ -1575,7 +1588,7 @@ begin
                         v.state := RENORM_C;
                     end if;
                 else
-                    if r.b.mantissa(54) = '1' then
+                    if r.b.mantissa(UNIT_BIT) = '1' then
                         v.first := '1';
                         v.state := DIV_2;
                     else
@@ -1654,7 +1667,7 @@ begin
                     opsel_ainv <= '1';
                     carry_in <= '1';
                     v.state := FINISH;
-                elsif r.r(55) = '1' then
+                elsif r.r(UNIT_BIT + 1) = '1' then
                     -- sum overflowed, shift right
                     opsel_r <= RES_SHIFT;
                     set_x := '1';
@@ -1663,10 +1676,10 @@ begin
                     else
                         v.state := ROUNDING;
                     end if;
-                elsif r.r(54) = '1' then
+                elsif r.r(UNIT_BIT) = '1' then
                     set_x := '1';
                     v.state := ROUNDING;
-                elsif (r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then
+                elsif (r_hi_nz or r_lo_nz or (or (r.r(DP_LSB - 1 downto 0)))) = '0' then
                     -- r.x must be zero at this point
                     v.result_class := ZERO;
                     if r.is_subtract = '1' then
@@ -1753,12 +1766,12 @@ begin
                     opsel_s <= S_NEG;
                     set_s := '1';
                 end if;
-                v.shift := to_signed(56, EXP_BITS);
+                v.shift := to_signed(UNIT_BIT, EXP_BITS);
                 v.state := FMADD_6;
 
             when FMADD_6 =>
-                -- r.shift = 56 (or 0, but only if r is now nonzero)
-                if (r.r(56) or r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then
+                -- r.shift = UNIT_BIT (or 0, but only if r is now nonzero)
+                if (r.r(UNIT_BIT + 2) or r_hi_nz or r_lo_nz or (or (r.r(DP_LSB - 1 downto 0)))) = '0' then
                     if s_nz = '0' then
                         -- must be a subtraction, and r.x must be zero
                         v.result_class := ZERO;
@@ -1771,7 +1784,7 @@ begin
                         set_s := '1';
                         -- stay in state FMADD_6
                     end if;
-                elsif r.r(56 downto 54) = "001" then
+                elsif r.r(UNIT_BIT + 2 downto UNIT_BIT) = "001" then
                     v.state := FINISH;
                 else
                     renormalize := '1';
@@ -1835,6 +1848,7 @@ begin
                 set_y := r.first;
                 f_to_multiply.valid <= r.first;
                 pshift := '1';
+                mult_mask := '1';
                 if multiply_to_f.valid = '1' then
                     opsel_r <= RES_MULT;
                     v.first := '1';
@@ -1853,13 +1867,15 @@ begin
                 end if;
 
             when DIV_6 =>
+                -- r.opsel_a = AIN_R
                 -- test if remainder is 0 or >= B
                 if pcmpb_lt = '1' then
                     -- quotient is correct, set X if remainder non-zero
-                    v.x := r.p(58) or px_nz;
+                    v.x := r.p(UNIT_BIT + 2) or px_nz;
                 else
-                    -- quotient needs to be incremented by 1
-                    carry_in <= '1';
+                    -- quotient needs to be incremented by 1 in R-bit position
+                    rbit_inc := '1';
+                    opsel_b <= BIN_RND;
                     v.x := not pcmpb_eq;
                 end if;
                 v.state := FINISH;
@@ -1913,6 +1929,7 @@ begin
                 msel_2 <= MUL2_R;
                 set_y := r.first;
                 pshift := '1';
+                mult_mask := '1';
                 if multiply_to_f.valid = '1' then
                     -- put result into R
                     opsel_r <= RES_MULT;
@@ -1957,6 +1974,7 @@ begin
                 set_y := r.first;
                 -- wait for second multiply (should be here already)
                 pshift := '1';
+                mult_mask := '1';
                 if multiply_to_f.valid = '1' then
                     -- put result into R
                     opsel_r <= RES_MULT;
@@ -2001,11 +2019,8 @@ begin
                 end if;
 
             when SQRT_10 =>
-                -- Add the bottom 8 bits of P, sign-extended,
-                -- divided by 4, onto R.
-                -- The division by 4 is because R is 10.54 format
-                -- whereas P is 8.56 format.
-                opsel_b <= BIN_PS6;
+                -- Add the bottom 8 bits of P, sign-extended, onto R.
+                opsel_b <= BIN_PS8;
                 sqrt_exp := r.b.exponent(EXP_BITS-1) & r.b.exponent(EXP_BITS-1 downto 1);
                 v.result_exp := sqrt_exp;
                 v.shift := to_signed(1, EXP_BITS);
@@ -2030,7 +2045,7 @@ begin
                 -- test if remainder is 0 or >= B = 2*R + 1
                 if pcmpb_lt = '1' then
                     -- square root is correct, set X if remainder non-zero
-                    v.x := r.p(58) or px_nz;
+                    v.x := r.p(UNIT_BIT + 2) or px_nz;
                 else
                     -- square root needs to be incremented by 1
                     carry_in <= '1';
@@ -2043,10 +2058,10 @@ begin
                 opsel_r <= RES_SHIFT;
                 set_x := '1';
                 v.state := INT_ROUND;
-                v.shift := to_signed(-2, EXP_BITS);
+                v.shift := to_signed(52 - UNIT_BIT, EXP_BITS);
 
             when INT_ROUND =>
-                -- r.shift = -2
+                -- r.shift = -4 (== 52 - UNIT_BIT)
                 opsel_r <= RES_SHIFT;
                 round := fp_rounding(r.r, r.x, '0', r.round_mode, r.result_sign);
                 v.fpscr(FPSCR_FR downto FPSCR_FI) := round;
@@ -2059,7 +2074,7 @@ begin
                 end if;
 
             when INT_ISHIFT =>
-                -- r.shift = b.exponent - 54;
+                -- r.shift = b.exponent - UNIT_BIT;
                 opsel_r <= RES_SHIFT;
                 v.state := INT_FINAL;
 
@@ -2129,7 +2144,7 @@ begin
                 if r.is_multiply = '1' and px_nz = '1' then
                     v.x := '1';
                 end if;
-                if r.r(63 downto 54) /= "0000000001" then
+                if r.r(63 downto UNIT_BIT) /= std_ulogic_vector(to_unsigned(1, 64 - UNIT_BIT)) then
                     renormalize := '1';
                     v.state := NORMALIZE;
                 else
@@ -2172,7 +2187,7 @@ begin
                     -- if denormalized, have to normalize before rounding
                     v.fpscr(FPSCR_UX) := '1';
                     v.result_exp := r.result_exp + bias_exp;
-                    if r.r(54) = '0' then
+                    if r.r(UNIT_BIT) = '0' then
                         renormalize := '1';
                         v.state := NORMALIZE;
                     else
@@ -2215,7 +2230,7 @@ begin
                     v.shift := to_signed(-1, EXP_BITS);
                     v.state := ROUNDING_2;
                 else
-                    if r.r(54) = '0' then
+                    if r.r(UNIT_BIT) = '0' then
                         -- result after masking could be zero, or could be a
                         -- denormalized result that needs to be renormalized
                         renormalize := '1';
@@ -2235,14 +2250,14 @@ begin
                 -- Check for overflow during rounding
                 -- r.shift = -1
                 v.x := '0';
-                if r.r(55) = '1' then
+                if r.r(UNIT_BIT + 1) = '1' then
                     opsel_r <= RES_SHIFT;
                     if exp_huge = '1' then
                         v.state := ROUND_OFLOW;
                     else
                         arith_done := '1';
                     end if;
-                elsif r.r(54) = '0' then
+                elsif r.r(UNIT_BIT) = '0' then
                     -- Do CLZ so we can renormalize the result
                     renormalize := '1';
                     v.state := ROUNDING_3;
@@ -2278,9 +2293,9 @@ begin
                 arith_done := '1';
 
             when NAN_RESULT =>
-                if (r.use_a = '1' and r.a.class = NAN and r.a.mantissa(53) = '0') or
-                    (r.use_b = '1' and r.b.class = NAN and r.b.mantissa(53) = '0') or
-                    (r.use_c = '1' and r.c.class = NAN and r.c.mantissa(53) = '0') then
+                if (r.use_a = '1' and r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or
+                    (r.use_b = '1' and r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or
+                    (r.use_c = '1' and r.c.class = NAN and r.c.mantissa(QNAN_BIT) = '0') then
                     -- Signalling NAN
                     v.fpscr(FPSCR_VXSNAN) := '1';
                     invalid := '1';
@@ -2343,39 +2358,41 @@ begin
         -- Multiplier and divide/square root data path
         case msel_1 is
             when MUL1_A =>
-                f_to_multiply.data1 <= r.a.mantissa(61 downto 0) & "00";
+                f_to_multiply.data1 <= r.a.mantissa;
             when MUL1_B =>
-                f_to_multiply.data1 <= r.b.mantissa(61 downto 0) & "00";
+                f_to_multiply.data1 <= r.b.mantissa;
             when MUL1_Y =>
                 f_to_multiply.data1 <= r.y;
             when others =>
-                f_to_multiply.data1 <= r.r(61 downto 0) & "00";
+                f_to_multiply.data1 <= r.r;
         end case;
         case msel_2 is
             when MUL2_C =>
-                f_to_multiply.data2 <= r.c.mantissa(61 downto 0) & "00";
+                f_to_multiply.data2 <= r.c.mantissa;
             when MUL2_LUT =>
-                f_to_multiply.data2 <= x"00" & inverse_est & '0' & x"000000000";
+                f_to_multiply.data2 <= std_ulogic_vector(shift_left(resize(unsigned(inverse_est), 64),
+                                                                    UNIT_BIT - 19));
             when MUL2_P =>
                 f_to_multiply.data2 <= r.p;
             when others =>
-                f_to_multiply.data2 <= r.r(61 downto 0) & "00";
+                f_to_multiply.data2 <= r.r;
         end case;
         maddend := (others => '0');
         case msel_add is
             when MULADD_CONST =>
                 -- addend is 2.0 or 1.5 in 16.112 format
                 if r.is_sqrt = '0' then
-                    maddend(113) := '1';                -- 2.0
+                    maddend(2*UNIT_BIT + 1) := '1';                       -- 2.0
                 else
-                    maddend(112 downto 111) := "11";    -- 1.5
+                    maddend(2*UNIT_BIT downto 2*UNIT_BIT - 1) := "11";    -- 1.5
                 end if;
             when MULADD_A =>
                 -- addend is A in 16.112 format
-                maddend(121 downto 58) := r.a.mantissa;
+                maddend(UNIT_BIT + 63 downto UNIT_BIT) := r.a.mantissa;
             when MULADD_RS =>
                 -- addend is concatenation of R and S in 16.112 format
-                maddend := "000000" & r.r & r.s & "00";
+                maddend(UNIT_BIT + 63 downto UNIT_BIT) := r.r;
+                maddend(UNIT_BIT - 1 downto 0) := r.s;
             when others =>
         end case;
         if msel_inv = '1' then
@@ -2391,7 +2408,7 @@ begin
             if pshift = '0' then
                 v.p := multiply_to_f.result(63 downto 0);
             else
-                v.p := multiply_to_f.result(119 downto 56);
+                v.p := multiply_to_f.result(UNIT_BIT + 63 downto UNIT_BIT);
             end if;
         end if;
 
@@ -2433,11 +2450,15 @@ begin
             when BIN_R =>
                 in_b0 := r.r;
             when BIN_RND =>
-                round_inc := (31 => r.single_prec, 2 => not r.single_prec, others => '0');
+                if rbit_inc = '0' then
+                    round_inc := (SP_LSB => r.single_prec, DP_LSB => not r.single_prec, others => '0');
+                else
+                    round_inc := (DP_RBIT => '1', others => '0');
+                end if;
                 in_b0 := round_inc;
             when others =>
-                -- BIN_PS6, 6 LSBs of P/4 sign-extended to 64
-                in_b0 := std_ulogic_vector(resize(signed(r.p(7 downto 2)), 64));
+                -- BIN_PS8, 8 LSBs of P sign-extended to 64
+                in_b0 := std_ulogic_vector(resize(signed(r.p(7 downto 0)), 64));
         end case;
         if opsel_binv = '1' then
             in_b0 := not in_b0;
@@ -2451,9 +2472,9 @@ begin
         end if;
         sum := std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in);
         if opsel_mask = '1' then
-            sum(1 downto 0) := "00";
+            sum(DP_LSB - 1 downto 0) := "0000";
             if r.single_prec = '1' then
-                sum(30 downto 2) := (others => '0');
+                sum(SP_LSB - 1 downto DP_LSB) := (others => '0');
             end if;
         end if;
         case opsel_r is
@@ -2462,20 +2483,25 @@ begin
             when RES_SHIFT =>
                 result <= shift_res;
             when RES_MULT =>
-                result <= multiply_to_f.result(121 downto 58);
+                result <= multiply_to_f.result(UNIT_BIT + 63 downto UNIT_BIT);
+                if mult_mask = '1' then
+                    -- trim to 54 fraction bits if mult_mask = 1, for quotient when dividing
+                    result(UNIT_BIT - 55 downto 0) <= (others => '0');
+                end if;
             when others =>
+                misc := (others => '0');
                 case misc_sel is
                     when "0000" =>
                         misc := x"00000000" & (r.fpscr and fpscr_mask);
                     when "0001" =>
                         -- generated QNaN mantissa
-                        misc := x"0020000000000000";
+                        misc(QNAN_BIT) := '1';
                     when "0010" =>
                         -- mantissa of max representable DP number
-                        misc := x"007ffffffffffffc";
+                        misc(UNIT_BIT downto DP_LSB) := (others => '1');
                     when "0011" =>
                         -- mantissa of max representable SP number
-                        misc := x"007fffff80000000";
+                        misc(UNIT_BIT downto SP_LSB) := (others => '1');
                     when "0100" =>
                         -- fmrgow result
                         misc := r.a.mantissa(31 downto 0) & r.b.mantissa(31 downto 0);
@@ -2483,7 +2509,8 @@ begin
                         -- fmrgew result
                         misc := r.a.mantissa(63 downto 32) & r.b.mantissa(63 downto 32);
                     when "0111" =>
-                        misc := 10x"000" & inverse_est & 35x"000000000";
+                        misc := std_ulogic_vector(shift_left(resize(unsigned(inverse_est), 64),
+                                                             UNIT_BIT - 19));
                     when "1000" =>
                         -- max positive result for fctiw[z]
                         misc := x"000000007fffffff";
@@ -2509,7 +2536,6 @@ begin
                         -- max negative result for fctidu[z]
                         misc := x"0000000000000000";
                     when others =>
-                        misc := x"0000000000000000";
                 end case;
                 result <= misc;
         end case;
@@ -2519,7 +2545,7 @@ begin
                 when S_NEG =>
                     v.s := std_ulogic_vector(unsigned(not r.s) + (not r.x));
                 when S_MULT =>
-                    v.s := multiply_to_f.result(57 downto 2);
+                    v.s := multiply_to_f.result(55 downto 0);
                 when S_SHIFT =>
                     v.s := shift_res(63 downto 8);
                     if shift_res(7 downto 0) /= x"00" then
@@ -2553,12 +2579,12 @@ begin
                 -- make denormalized value end up with even exponent
                 clz(0) := '1';
             end if;
-            v.shift := resize(signed('0' & clz) - 9, EXP_BITS);
+            v.shift := resize(signed('0' & clz) - (63 - UNIT_BIT), EXP_BITS);
         end if;
 
         if r.update_fprf = '1' then
             v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.result_sign, r.result_class,
-                                                             r.r(54) and not r.denorm);
+                                                             r.r(UNIT_BIT) and not r.denorm);
         end if;
 
         v.fpscr(FPSCR_VX) := (or (v.fpscr(FPSCR_VXSNAN downto FPSCR_VXVC))) or

From a95f8aab385b05ba6f5b7bedd6fbe1e97669ebdb Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 7 May 2022 18:28:33 +1000
Subject: [PATCH 19/30] FPU: Add integer division logic to FPU

This adds logic to the FPU to accomplish 64-bit integer divisions.
No instruction actually uses this yet.

The algorithm used is to obtain an estimate of the reciprocal of the
divisor using the lookup table and refine it by one to three
iterations of the Newton-Raphson algorithm (the number of iterations
depends on the number of significant bits in the dividend).  Then the
reciprocal is multiplied by the dividend to get the quotient estimate.
The remainder is calculated as dividend - quotient * divisor.  If the
remainder is greater than or equal to the divisor, the quotient is
incremented, or if a modulo operation is being done, the divisor is
subtracted from the remainder.  The inverse estimate after refinement
is good enough that the quotient estimate is always equal to or one
less than the true quotient.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl   |  34 ++--
 execute1.vhdl |   1 +
 fpu.vhdl      | 525 +++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 541 insertions(+), 19 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 54a87d2..aa7b830 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -627,27 +627,29 @@ package common is
          srr1 => (others => '0'), msr => (others => '0'));
 
     type Execute1ToFPUType is record
-        valid   : std_ulogic;
-        op      : insn_type_t;
-        nia     : std_ulogic_vector(63 downto 0);
-        itag    : instr_tag_t;
-        insn    : std_ulogic_vector(31 downto 0);
-        single  : std_ulogic;
-        fe_mode : std_ulogic_vector(1 downto 0);
-        fra     : std_ulogic_vector(63 downto 0);
-        frb     : std_ulogic_vector(63 downto 0);
-        frc     : std_ulogic_vector(63 downto 0);
-        frt     : gspr_index_t;
-        rc      : std_ulogic;
-        out_cr  : std_ulogic;
-        stall   : std_ulogic;
+        valid     : std_ulogic;
+        op        : insn_type_t;
+        nia       : std_ulogic_vector(63 downto 0);
+        itag      : instr_tag_t;
+        insn      : std_ulogic_vector(31 downto 0);
+        single    : std_ulogic;
+        is_signed : std_ulogic;
+        fe_mode   : std_ulogic_vector(1 downto 0);
+        fra       : std_ulogic_vector(63 downto 0);
+        frb       : std_ulogic_vector(63 downto 0);
+        frc       : std_ulogic_vector(63 downto 0);
+        frt       : gspr_index_t;
+        rc        : std_ulogic;
+        out_cr    : std_ulogic;
+        stall     : std_ulogic;
     end record;
     constant Execute1ToFPUInit : Execute1ToFPUType := (valid => '0', op => OP_ILLEGAL, nia => (others => '0'),
                                                        itag => instr_tag_init,
-                                                       insn  => (others => '0'), fe_mode => "00", rc => '0',
+                                                       insn => (others => '0'), fe_mode => "00", rc => '0',
                                                        fra => (others => '0'), frb => (others => '0'),
                                                        frc => (others => '0'), frt => (others => '0'),
-                                                       single => '0', out_cr => '0', stall => '0');
+                                                       single => '0', is_signed => '0', out_cr => '0',
+                                                       stall => '0');
 
     type FPUToExecute1Type is record
         busy      : std_ulogic;
diff --git a/execute1.vhdl b/execute1.vhdl
index 6fadc8c..2121963 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -1449,6 +1449,7 @@ begin
         fv.insn := e_in.insn;
         fv.itag := e_in.instr_tag;
         fv.single := e_in.is_32bit;
+        fv.is_signed := e_in.is_signed;
         fv.fe_mode := ex1.msr(MSR_FE0) & ex1.msr(MSR_FE1);
         fv.fra := a_in;
         fv.frb := b_in;
diff --git a/fpu.vhdl b/fpu.vhdl
index 27587f7..18d3a5a 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -75,7 +75,19 @@ architecture behaviour of fpu is
                      RENORM_A, RENORM_A2,
                      RENORM_B, RENORM_B2,
                      RENORM_C, RENORM_C2,
-                     NAN_RESULT, EXC_RESULT);
+                     NAN_RESULT, EXC_RESULT,
+                     DO_IDIVMOD,
+                     IDIV_NORMB, IDIV_NORMB2, IDIV_NORMB3,
+                     IDIV_CLZA, IDIV_CLZA2, IDIV_CLZA3,
+                     IDIV_NR0, IDIV_NR1, IDIV_NR2, IDIV_USE0_5,
+                     IDIV_DODIV,
+                     IDIV_DIV, IDIV_DIV2, IDIV_DIV3, IDIV_DIV4, IDIV_DIV5,
+                     IDIV_DIV6, IDIV_DIV7, IDIV_DIV8, IDIV_DIV9,
+                     IDIV_EXT_TBH, IDIV_EXT_TBH2, IDIV_EXT_TBH3,
+                     IDIV_EXT_TBH4, IDIV_EXT_TBH5,
+                     IDIV_EXTDIV, IDIV_EXTDIV1, IDIV_EXTDIV2, IDIV_EXTDIV3,
+                     IDIV_EXTDIV4, IDIV_EXTDIV5, IDIV_EXTDIV6,
+                     IDIV_MODADJ, IDIV_MODSUB, IDIV_DIVADJ, IDIV_OVFCHK, IDIV_DONE, IDIV_ZERO);
 
     type reg_type is record
         state        : state_t;
@@ -139,6 +151,14 @@ architecture behaviour of fpu is
         invalid      : std_ulogic;
         negate       : std_ulogic;
         longmask     : std_ulogic;
+        divext       : std_ulogic;
+        divmod       : std_ulogic;
+        is_signed    : std_ulogic;
+        int_ovf      : std_ulogic;
+        div_close    : std_ulogic;
+        inc_quot     : std_ulogic;
+        a_hi         : std_ulogic_vector(7 downto 0);
+        a_lo         : std_ulogic_vector(55 downto 0);
     end record;
 
     type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0);
@@ -159,6 +179,7 @@ architecture behaviour of fpu is
     signal lost_bits     : std_ulogic;
     signal r_hi_nz       : std_ulogic;
     signal r_lo_nz       : std_ulogic;
+    signal r_gt_1        : std_ulogic;
     signal s_nz          : std_ulogic;
     signal misc_sel      : std_ulogic_vector(3 downto 0);
     signal f_to_multiply : MultiplyInputType;
@@ -663,7 +684,12 @@ begin
         variable msb         : std_ulogic;
         variable is_add      : std_ulogic;
         variable set_a       : std_ulogic;
+        variable set_a_exp   : std_ulogic;
+        variable set_a_mant  : std_ulogic;
+        variable set_a_hi    : std_ulogic;
+        variable set_a_lo    : std_ulogic;
         variable set_b       : std_ulogic;
+        variable set_b_mant  : std_ulogic;
         variable set_c       : std_ulogic;
         variable set_y       : std_ulogic;
         variable set_s       : std_ulogic;
@@ -671,10 +697,13 @@ begin
         variable px_nz       : std_ulogic;
         variable pcmpb_eq    : std_ulogic;
         variable pcmpb_lt    : std_ulogic;
+        variable pcmpc_eq    : std_ulogic;
+        variable pcmpc_lt    : std_ulogic;
         variable pshift      : std_ulogic;
         variable renorm_sqrt : std_ulogic;
         variable sqrt_exp    : signed(EXP_BITS-1 downto 0);
         variable shiftin     : std_ulogic;
+        variable shiftin0    : std_ulogic;
         variable mulexp      : signed(EXP_BITS-1 downto 0);
         variable maddend     : std_ulogic_vector(127 downto 0);
         variable sum         : std_ulogic_vector(63 downto 0);
@@ -722,6 +751,11 @@ begin
             v.is_sqrt := '0';
             v.add_bsmall := '0';
             v.doing_ftdiv := "00";
+            v.divext := e_in.insn(8) and not e_in.insn(7);
+            v.divmod := not e_in.insn(8);
+            v.is_signed := e_in.is_signed;
+            v.int_ovf := '0';
+            v.div_close := '0';
 
             adec := decode_dp(e_in.fra, int_input);
             bdec := decode_dp(e_in.frb, int_input);
@@ -738,10 +772,14 @@ begin
             if (adec.exponent + cdec.exponent + 1) >= bdec.exponent then
                 v.madd_cmp := '1';
             end if;
+
+            v.a_hi := 8x"0";
+            v.a_lo := 56x"0";
         end if;
 
         r_hi_nz <= or (r.r(UNIT_BIT + 1 downto SP_LSB));
         r_lo_nz <= or (r.r(SP_LSB - 1 downto DP_LSB));
+        r_gt_1 <= or (r.r(63 downto 1));
         s_nz <= or (r.s);
 
         if r.single_prec = '0' then
@@ -781,6 +819,14 @@ begin
         if unsigned(r.p(59 downto 4)) < unsigned(r.b.mantissa(UNIT_BIT + 1 downto DP_RBIT)) then
             pcmpb_lt := '1';
         end if;
+        pcmpc_eq := '0';
+        if r.p = r.c.mantissa then
+            pcmpc_eq := '1';
+        end if;
+        pcmpc_lt := '0';
+        if unsigned(r.p) < unsigned(r.c.mantissa) then
+            pcmpc_lt := '1';
+        end if;
 
         v.update_fprf := '0';
         v.shift := to_signed(0, EXP_BITS);
@@ -803,7 +849,12 @@ begin
         set_x := '0';
         qnan_result := '0';
         set_a := '0';
+        set_a_exp := '0';
+        set_a_mant := '0';
+        set_a_hi := '0';
+        set_a_lo := '0';
         set_b := '0';
+        set_b_mant := '0';
         set_c := '0';
         set_s := '0';
         f_to_multiply.is_32bit <= '0';
@@ -816,6 +867,7 @@ begin
         pshift := '0';
         renorm_sqrt := '0';
         shiftin := '0';
+        shiftin0 := '0';
         rbit_inc := '0';
         mult_mask := '0';
         int_result := '0';
@@ -866,6 +918,10 @@ begin
                             else
                                 v.state := DO_FRI;
                             end if;
+                        when "01001" =>
+                            -- integer divides and mods, major opcode 31
+                            v.opsel_a := AIN_B;
+                            v.state := DO_IDIVMOD;
                         when "01100" =>
                             v.opsel_a := AIN_B;
                             v.state := DO_FRSP;
@@ -2327,6 +2383,451 @@ begin
                 end case;
                 arith_done := '1';
 
+            when DO_IDIVMOD =>
+                -- r.opsel_a = AIN_B
+                v.result_sign := r.is_signed and (r.a.negative xor (r.b.negative and not r.divmod));
+                if r.b.class = ZERO then
+                    -- B is zero, signal overflow
+                    v.int_ovf := '1';
+                    v.state := IDIV_ZERO;
+                elsif r.a.class = ZERO then
+                    -- A is zero, result is zero (both for div and for mod)
+                    v.state := IDIV_ZERO;
+                else
+                    -- take absolute value for signed division, and
+                    -- normalize and round up B to 8.56 format, like fcfid[u]
+                    if r.is_signed = '1' and r.b.negative = '1' then
+                        opsel_ainv <= '1';
+                        carry_in <= '1';
+                    end if;
+                    v.result_class := FINITE;
+                    v.result_exp := to_signed(UNIT_BIT, EXP_BITS);
+                    v.state := IDIV_NORMB;
+                end if;
+            when IDIV_NORMB =>
+                -- do count-leading-zeroes on B (now in R)
+                renormalize := '1';
+                -- save the original value of B or |B| in C
+                set_c := '1';
+                v.state := IDIV_NORMB2;
+            when IDIV_NORMB2 =>
+                -- get B into the range [1, 2) in 8.56 format
+                set_x := '1';           -- record if any 1 bits shifted out
+                opsel_r <= RES_SHIFT;
+                v.state := IDIV_NORMB3;
+            when IDIV_NORMB3 =>
+                -- add the X bit onto R to round up B
+                carry_in <= r.x;
+                -- prepare to do count-leading-zeroes on A
+                v.opsel_a := AIN_A;
+                v.state := IDIV_CLZA;
+            when IDIV_CLZA =>
+                set_b := '1';           -- put R back into B
+                -- r.opsel_a = AIN_A
+                if r.is_signed = '1' and r.a.negative = '1' then
+                    opsel_ainv <= '1';
+                    carry_in <= '1';
+                end if;
+                v.result_exp := to_signed(UNIT_BIT, EXP_BITS);
+                v.opsel_a := AIN_C;
+                v.state := IDIV_CLZA2;
+            when IDIV_CLZA2 =>
+                -- r.opsel_a = AIN_C
+                renormalize := '1';
+                -- write the dividend back into A in case we negated it
+                set_a_mant := '1';
+                -- while doing the count-leading-zeroes on A,
+                -- also compute A - B to tell us whether A >= B
+                -- (using the original value of B, which is now in C)
+                opsel_b <= BIN_R;
+                opsel_ainv <= '1';
+                carry_in <= '1';
+                v.state := IDIV_CLZA3;
+            when IDIV_CLZA3 =>
+                -- save the exponent of A (but don't overwrite the mantissa)
+                v.a.exponent := new_exp;
+                v.div_close := '0';
+                if new_exp = r.b.exponent then
+                    v.div_close := '1';
+                end if;
+                v.state := IDIV_NR0;
+                if new_exp > r.b.exponent or (v.div_close = '1' and r.r(63) = '0') then
+                    -- A >= B, overflow if extended division
+                    if r.divext = '1' then
+                        v.int_ovf := '1';
+                        -- return 0 in overflow cases
+                        v.state := IDIV_ZERO;
+                    end if;
+                else
+                    -- A < B, result is zero for normal division
+                    if r.divmod = '0' and r.divext = '0' then
+                        v.state := IDIV_ZERO;
+                    end if;
+                end if;
+            when IDIV_NR0 =>
+                -- reduce number of Newton-Raphson iterations for small A
+                if r.divext = '1' or new_exp >= to_signed(32, EXP_BITS) then
+                    v.count := "00";
+                elsif new_exp >= to_signed(16, EXP_BITS) then
+                    v.count := "01";
+                else
+                    v.count := "10";
+                end if;
+                -- first NR iteration does Y = LUT; P = 2 - B * LUT
+                msel_1 <= MUL1_B;
+                msel_add <= MULADD_CONST;
+                msel_inv <= '1';
+                msel_2 <= MUL2_LUT;
+                set_y := '1';
+                if r.b.mantissa(UNIT_BIT + 1) = '1' then
+                    -- rounding up of the mantissa caused overflow, meaning the
+                    -- normalized B is 2.0.  Since this is outside the range
+                    -- of the LUT, just use 0.5 as the estimated inverse.
+                    v.state := IDIV_USE0_5;
+                else
+                    -- start the first multiply now
+                    f_to_multiply.valid <= '1';
+                    -- note we don't set v.first, thus the following IDIV_NR1
+                    -- state doesn't start a multiply (we already did that)
+                    v.state := IDIV_NR1;
+                end if;
+            when IDIV_NR1 =>
+                -- subsequent NR iterations do Y = P; P = 2 - B * P
+                msel_1 <= MUL1_B;
+                msel_add <= MULADD_CONST;
+                msel_inv <= '1';
+                msel_2 <= MUL2_P;
+                set_y := r.first;
+                pshift := '1';
+                f_to_multiply.valid <= r.first;
+                if multiply_to_f.valid = '1' then
+                    v.first := '1';
+                    v.count := r.count + 1;
+                    v.state := IDIV_NR2;
+                end if;
+            when IDIV_NR2 =>
+                -- compute P = Y * P
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_P;
+                f_to_multiply.valid <= r.first;
+                pshift := '1';
+                v.opsel_a := AIN_A;
+                v.shift := to_signed(64, EXP_BITS);
+                -- Get 0.5 into R in case the inverse estimate turns out to be
+                -- less than 0.5, in which case we want to use 0.5, to avoid
+                -- infinite loops in some cases.
+                opsel_r <= RES_MISC;
+                misc_sel <= "0001";
+                if multiply_to_f.valid = '1' then
+                    v.first := '1';
+                    if r.count = "11" then
+                        v.state := IDIV_DODIV;
+                    else
+                        v.state := IDIV_NR1;
+                    end if;
+                end if;
+            when IDIV_USE0_5 =>
+                -- Get 0.5 into R; it turns out the generated
+                -- QNaN mantissa is actually what we want
+                opsel_r <= RES_MISC;
+                misc_sel <= "0001";
+                v.opsel_a := AIN_A;
+                v.shift := to_signed(64, EXP_BITS);
+                v.state := IDIV_DODIV;
+            when IDIV_DODIV =>
+                -- r.opsel_a = AIN_A
+                -- r.shift = 64
+                -- inverse estimate is in P or in R; copy it to Y
+                if r.b.mantissa(UNIT_BIT + 1) = '1' or
+                    (r.p(UNIT_BIT) = '0' and r.p(UNIT_BIT - 1) = '0') then
+                    msel_2 <= MUL2_R;
+                else
+                    msel_2 <= MUL2_P;
+                end if;
+                set_y := '1';
+                -- shift_res is 0 because r.shift = 64;
+                -- put that into B, which now holds the quotient
+                set_b_mant := '1';
+                if r.divext = '0' then
+                    v.shift := to_signed(-UNIT_BIT, EXP_BITS);
+                    v.first := '1';
+                    v.state := IDIV_DIV;
+                elsif r.div_close = '0' then
+                    v.shift := to_signed(64 - UNIT_BIT, EXP_BITS);
+                    v.state := IDIV_EXTDIV;
+                else
+                    -- handle top bit of quotient specially
+                    -- for this we need the divisor left-justified in B
+                    v.opsel_a := AIN_C;
+                    v.state := IDIV_EXT_TBH;
+                end if;
+            when IDIV_DIV =>
+                -- Dividing A by C, r.shift = -56; A is in R
+                -- Put A into the bottom 64 bits of Ahi/A/Alo
+                set_a_mant := r.first;
+                set_a_lo := r.first;
+                -- compute R = R * Y (quotient estimate)
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_R;
+                f_to_multiply.valid <= r.first;
+                pshift := '1';
+                opsel_r <= RES_MULT;
+                v.shift := - r.b.exponent;
+                if multiply_to_f.valid = '1' then
+                    v.state := IDIV_DIV2;
+                end if;
+            when IDIV_DIV2 =>
+                -- r.shift = - b.exponent
+                -- shift the quotient estimate right by b.exponent bits
+                opsel_r <= RES_SHIFT;
+                v.first := '1';
+                v.state := IDIV_DIV3;
+            when IDIV_DIV3 =>
+                -- quotient (so far) is in R; multiply by C and subtract from A
+                msel_1 <= MUL1_R;
+                msel_2 <= MUL2_C;
+                msel_add <= MULADD_A;
+                msel_inv <= '1';
+                f_to_multiply.valid <= r.first;
+                -- store the current quotient estimate in B
+                set_b_mant := r.first;
+                opsel_r <= RES_MULT;
+                opsel_s <= S_MULT;
+                set_s := '1';
+                if multiply_to_f.valid = '1' then
+                    v.state := IDIV_DIV4;
+                end if;
+            when IDIV_DIV4 =>
+                -- remainder is in R/S and P
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_P;
+                v.inc_quot := not pcmpc_lt and not r.divmod;
+                if r.divmod = '0' then
+                    v.opsel_a := AIN_B;
+                end if;
+                v.shift := to_signed(UNIT_BIT, EXP_BITS);
+                if pcmpc_lt = '1' or pcmpc_eq = '1' then
+                    if r.divmod = '0' then
+                        v.state := IDIV_DIVADJ;
+                    elsif pcmpc_eq = '1' then
+                        v.state := IDIV_ZERO;
+                    else
+                        v.state := IDIV_MODADJ;
+                    end if;
+                else
+                    -- need to do another iteration, compute P * Y
+                    f_to_multiply.valid <= '1';
+                    v.state := IDIV_DIV5;
+                end if;
+            when IDIV_DIV5 =>
+                pshift := '1';
+                opsel_r <= RES_MULT;
+                v.shift := - r.b.exponent;
+                if multiply_to_f.valid = '1' then
+                    v.state := IDIV_DIV6;
+                end if;
+            when IDIV_DIV6 =>
+                -- r.shift = - b.exponent
+                -- shift the quotient estimate right by b.exponent bits
+                opsel_r <= RES_SHIFT;
+                v.opsel_a := AIN_B;
+                v.first := '1';
+                v.state := IDIV_DIV7;
+            when IDIV_DIV7 =>
+                -- r.opsel_a = AIN_B
+                -- add shifted quotient delta onto the total quotient
+                opsel_b <= BIN_R;
+                v.first := '1';
+                v.state := IDIV_DIV8;
+            when IDIV_DIV8 =>
+                -- quotient (so far) is in R; multiply by C and subtract from A
+                msel_1 <= MUL1_R;
+                msel_2 <= MUL2_C;
+                msel_add <= MULADD_A;
+                msel_inv <= '1';
+                f_to_multiply.valid <= r.first;
+                -- store the current quotient estimate in B
+                set_b_mant := r.first;
+                opsel_r <= RES_MULT;
+                opsel_s <= S_MULT;
+                set_s := '1';
+                if multiply_to_f.valid = '1' then
+                    v.state := IDIV_DIV9;
+                end if;
+            when IDIV_DIV9 =>
+                -- remainder is in R/S and P
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_P;
+                v.inc_quot := not pcmpc_lt and not r.divmod;
+                if r.divmod = '0' then
+                    v.opsel_a := AIN_B;
+                end if;
+                v.shift := to_signed(UNIT_BIT, EXP_BITS);
+                if r.divmod = '0' then
+                    v.state := IDIV_DIVADJ;
+                elsif pcmpc_eq = '1' then
+                    v.state := IDIV_ZERO;
+                else
+                    v.state := IDIV_MODADJ;
+                end if;
+            when IDIV_EXT_TBH =>
+                -- r.opsel_a = AIN_C; get divisor into R and prepare to shift left
+                v.shift := to_signed(63, EXP_BITS) - r.b.exponent;
+                v.opsel_a := AIN_A;
+                v.state := IDIV_EXT_TBH2;
+            when IDIV_EXT_TBH2 =>
+                -- r.opsel_a = AIN_A; divisor is in R
+                -- r.shift = 63 - b.exponent; shift and put into B
+                set_b_mant := '1';
+                v.shift := to_signed(64 - UNIT_BIT, EXP_BITS);
+                v.state := IDIV_EXT_TBH3;
+            when IDIV_EXT_TBH3 =>
+                -- Dividing (A << 64) by C
+                -- r.shift = 8
+                -- Put A in the top 64 bits of Ahi/A/Alo
+                set_a_hi := '1';
+                set_a_mant := '1';
+                v.shift := to_signed(64, EXP_BITS) - r.b.exponent;
+                v.state := IDIV_EXT_TBH4;
+            when IDIV_EXT_TBH4 =>
+                -- dividend (A) is in R
+                -- r.shift = 64 - B.exponent, so is at least 1
+                opsel_r <= RES_SHIFT;
+                -- top bit of A gets lost in the shift, so handle it specially
+                v.opsel_a := AIN_B;
+                v.shift := to_signed(63, EXP_BITS);
+                v.state := IDIV_EXT_TBH5;
+            when IDIV_EXT_TBH5 =>
+                -- r.opsel_a = AIN_B, r.shift = 63
+                -- shifted dividend is in R, subtract left-justified divisor
+                opsel_b <= BIN_R;
+                opsel_ainv <= '1';
+                carry_in <= '1';
+                -- and put 1<<63 into B as the divisor (S is still 0)
+                shiftin0 := '1';
+                set_b_mant := '1';
+                v.first := '1';
+                v.state := IDIV_EXTDIV2;
+            when IDIV_EXTDIV =>
+                -- Dividing (A << 64) by C
+                -- r.shift = 8
+                -- Put A in the top 64 bits of Ahi/A/Alo
+                set_a_hi := '1';
+                set_a_mant := '1';
+                v.shift := to_signed(64, EXP_BITS) - r.b.exponent;
+                v.state := IDIV_EXTDIV1;
+            when IDIV_EXTDIV1 =>
+                -- dividend is in R
+                -- r.shift = 64 - B.exponent
+                opsel_r <= RES_SHIFT;
+                v.first := '1';
+                v.state := IDIV_EXTDIV2;
+            when IDIV_EXTDIV2 =>
+                -- shifted remainder is in R; compute R = R * Y (quotient estimate)
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_R;
+                f_to_multiply.valid <= r.first;
+                pshift := '1';
+                v.opsel_a := AIN_B;
+                opsel_r <= RES_MULT;
+                if multiply_to_f.valid = '1' then
+                    v.first := '1';
+                    v.state := IDIV_EXTDIV3;
+                end if;
+            when IDIV_EXTDIV3 =>
+                -- r.opsel_a = AIN_B
+                -- delta quotient is in R; add it to B
+                opsel_b <= BIN_R;
+                v.first := '1';
+                v.state := IDIV_EXTDIV4;
+            when IDIV_EXTDIV4 =>
+                -- quotient is in R; put it in B and compute remainder
+                set_b_mant := r.first;
+                msel_1 <= MUL1_R;
+                msel_2 <= MUL2_C;
+                msel_add <= MULADD_A;
+                msel_inv <= '1';
+                f_to_multiply.valid <= r.first;
+                opsel_r <= RES_MULT;
+                opsel_s <= S_MULT;
+                set_s := '1';
+                v.shift := to_signed(UNIT_BIT, EXP_BITS) - r.b.exponent;
+                if multiply_to_f.valid = '1' then
+                    v.state := IDIV_EXTDIV5;
+                end if;
+            when IDIV_EXTDIV5 =>
+                -- r.shift = 56 - r.b.exponent
+                -- remainder is in R/S; shift it right r.b.exponent bits
+                opsel_r <= RES_SHIFT;
+                -- test LS 64b of remainder in P against divisor in C
+                v.inc_quot := not pcmpc_lt;
+                v.opsel_a := AIN_B;
+                v.state := IDIV_EXTDIV6;
+            when IDIV_EXTDIV6 =>
+                -- r.opsel_a = AIN_B
+                -- shifted remainder is in R, see if it is > 1
+                -- and compute R = R * Y if so
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_R;
+                pshift := '1';
+                if r_gt_1 = '1' then
+                    f_to_multiply.valid <= '1';
+                    v.state := IDIV_EXTDIV2;
+                else
+                    v.state := IDIV_DIVADJ;
+                end if;
+            when IDIV_MODADJ =>
+                -- r.shift = 56
+                -- result is in R/S
+                opsel_r <= RES_SHIFT;
+                if pcmpc_lt = '0' then
+                    v.opsel_a := AIN_C;
+                    v.state := IDIV_MODSUB;
+                elsif r.result_sign = '0' then
+                    v.state := IDIV_DONE;
+                else
+                    v.state := IDIV_DIVADJ;
+                end if;
+            when IDIV_MODSUB =>
+                -- r.opsel_a = AIN_C
+                -- Subtract divisor from remainder
+                opsel_ainv <= '1';
+                carry_in <= '1';
+                opsel_b <= BIN_R;
+                if r.result_sign = '0' then
+                    v.state := IDIV_DONE;
+                else
+                    v.state := IDIV_DIVADJ;
+                end if;
+            when IDIV_DIVADJ =>
+                -- result (so far) is on the A input of the adder
+                -- set carry to increment quotient if needed
+                -- and also negate R if the answer is negative
+                opsel_ainv <= r.result_sign;
+                carry_in <= r.inc_quot xor r.result_sign;
+                if r.is_signed = '0' then
+                    v.state := IDIV_DONE;
+                else
+                    v.state := IDIV_OVFCHK;
+                end if;
+            when IDIV_OVFCHK =>
+                v.int_ovf := r.r(63) xor r.result_sign;
+                if v.int_ovf = '1' then
+                    v.state := IDIV_ZERO;
+                else
+                    v.state := IDIV_DONE;
+                end if;
+            when IDIV_DONE =>
+                int_result := '1';
+                v.writing_fpr := '1';
+                v.instr_done := '1';
+            when IDIV_ZERO =>
+                opsel_r <= RES_MISC;
+                misc_sel <= "0101";
+                int_result := '1';
+                v.writing_fpr := '1';
+                v.instr_done := '1';
+
         end case;
 
         if zero_divide = '1' then
@@ -2388,7 +2889,9 @@ begin
                 end if;
             when MULADD_A =>
                 -- addend is A in 16.112 format
+                maddend(127 downto UNIT_BIT + 64) := r.a_hi;
                 maddend(UNIT_BIT + 63 downto UNIT_BIT) := r.a.mantissa;
+                maddend(UNIT_BIT - 1 downto 0) := r.a_lo;
             when MULADD_RS =>
                 -- addend is concatenation of R and S in 16.112 format
                 maddend(UNIT_BIT + 63 downto UNIT_BIT) := r.r;
@@ -2465,7 +2968,8 @@ begin
         end if;
         in_b <= in_b0;
         if r.shift >= to_signed(-64, EXP_BITS) and r.shift <= to_signed(63, EXP_BITS) then
-            shift_res := shifter_64(r.r & (shiftin or r.s(55)) & r.s(54 downto 0),
+            shift_res := shifter_64(r.r(63 downto 1) & (shiftin0 or r.r(0)) &
+                                    (shiftin or r.s(55)) & r.s(54 downto 0),
                                     std_ulogic_vector(r.shift(6 downto 0)));
         else
             shift_res := (others => '0');
@@ -2556,12 +3060,27 @@ begin
             end case;
         end if;
 
-        if set_a = '1' then
+        if set_a = '1' or set_a_exp = '1' then
             v.a.exponent := new_exp;
+        end if;
+        if set_a = '1' or set_a_mant = '1' then
             v.a.mantissa := shift_res;
         end if;
+        if e_in.valid = '1' then
+            v.a_hi := (others => '0');
+            v.a_lo := (others => '0');
+        else
+            if set_a_hi = '1' then
+                v.a_hi := r.r(63 downto 56);
+            end if;
+            if set_a_lo = '1' then
+                v.a_lo := r.r(55 downto 0);
+            end if;
+        end if;
         if set_b = '1' then
             v.b.exponent := new_exp;
+        end if;
+        if set_b = '1' or set_b_mant = '1' then
             v.b.mantissa := shift_res;
         end if;
         if set_c = '1' then

From 34330552e8f1d78c1dac1e7a154dcee6a991c74a Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 7 May 2022 22:34:23 +1000
Subject: [PATCH 20/30] FPU: Add logic for 32-bit integer division

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 77 +++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 63 insertions(+), 14 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index 18d3a5a..b8cea39 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -80,7 +80,7 @@ architecture behaviour of fpu is
                      IDIV_NORMB, IDIV_NORMB2, IDIV_NORMB3,
                      IDIV_CLZA, IDIV_CLZA2, IDIV_CLZA3,
                      IDIV_NR0, IDIV_NR1, IDIV_NR2, IDIV_USE0_5,
-                     IDIV_DODIV,
+                     IDIV_DODIV, IDIV_SH32,
                      IDIV_DIV, IDIV_DIV2, IDIV_DIV3, IDIV_DIV4, IDIV_DIV5,
                      IDIV_DIV6, IDIV_DIV7, IDIV_DIV8, IDIV_DIV9,
                      IDIV_EXT_TBH, IDIV_EXT_TBH2, IDIV_EXT_TBH3,
@@ -445,17 +445,20 @@ architecture behaviour of fpu is
 
     -- Split a DP floating-point number into components and work out its class.
     -- If is_int = 1, the input is considered an integer
-    function decode_dp(fpr: std_ulogic_vector(63 downto 0); is_int: std_ulogic) return fpu_reg_type is
+    function decode_dp(fpr: std_ulogic_vector(63 downto 0); is_int: std_ulogic;
+                       is_32bint: std_ulogic; is_signed: std_ulogic) return fpu_reg_type is
         variable r       : fpu_reg_type;
         variable exp_nz  : std_ulogic;
         variable exp_ao  : std_ulogic;
         variable frac_nz : std_ulogic;
+        variable low_nz  : std_ulogic;
         variable cls     : std_ulogic_vector(2 downto 0);
     begin
         r.negative := fpr(63);
         exp_nz := or (fpr(62 downto 52));
         exp_ao := and (fpr(62 downto 52));
         frac_nz := or (fpr(51 downto 0));
+        low_nz := or (fpr(31 downto 0));
         if is_int = '0' then
             r.exponent := signed(resize(unsigned(fpr(62 downto 52)), EXP_BITS)) - to_signed(1023, EXP_BITS);
             if exp_nz = '0' then
@@ -472,6 +475,16 @@ architecture behaviour of fpu is
                 when "110"  => r.class := INFINITY;
                 when others => r.class := NAN;
             end case;
+        elsif is_32bint = '1' then
+            r.negative := fpr(31);
+            r.mantissa(31 downto 0) := fpr(31 downto 0);
+            r.mantissa(63 downto 32) := (others => (is_signed and fpr(31)));
+            r.exponent := (others => '0');
+            if low_nz = '1' then
+                r.class := FINITE;
+            else
+                r.class := ZERO;
+            end if;
         else
             r.mantissa := fpr;
             r.exponent := (others => '0');
@@ -659,6 +672,7 @@ begin
         variable j, k        : integer;
         variable flm         : std_ulogic_vector(7 downto 0);
         variable int_input   : std_ulogic;
+        variable is_32bint   : std_ulogic;
         variable mask        : std_ulogic_vector(63 downto 0);
         variable in_a0       : std_ulogic_vector(63 downto 0);
         variable in_b0       : std_ulogic_vector(63 downto 0);
@@ -710,6 +724,8 @@ begin
         variable round_inc   : std_ulogic_vector(63 downto 0);
         variable rbit_inc    : std_ulogic;
         variable mult_mask   : std_ulogic;
+        variable sign_bit    : std_ulogic;
+        variable rnd_b32     : std_ulogic;
         variable int_result  : std_ulogic;
         variable illegal     : std_ulogic;
     begin
@@ -717,6 +733,7 @@ begin
         v.complete := '0';
         v.do_intr := '0';
         int_input := '0';
+        is_32bint := '0';
 
         if r.complete = '1' or r.do_intr = '1' then
             v.instr_done := '0';
@@ -735,12 +752,25 @@ begin
             v.fe_mode := or (e_in.fe_mode);
             v.dest_fpr := e_in.frt;
             v.single_prec := e_in.single;
-            v.longmask := e_in.single;
+            v.is_signed := e_in.is_signed;
             v.rc := e_in.rc;
             v.is_cmp := e_in.out_cr;
-            int_input := '0';
-            if e_in.op = OP_FPOP_I then
+            v.longmask := '0';
+            v.divext := '0';
+            v.divmod := '0';
+            if e_in.op = OP_FPOP or e_in.op = OP_FPOP_I then
+                v.longmask := e_in.single;
+                if e_in.op = OP_FPOP_I then
+                    int_input := '1';
+                end if;
+            else -- OP_DIV, OP_DIVE, OP_MOD
                 int_input := '1';
+                is_32bint := e_in.single;
+                if e_in.op = OP_DIVE then
+                    v.divext := '1';
+                elsif e_in.op = OP_MOD then
+                    v.divmod := '1';
+                end if;
             end if;
             v.quieten_nan := '1';
             v.tiny := '0';
@@ -751,15 +781,12 @@ begin
             v.is_sqrt := '0';
             v.add_bsmall := '0';
             v.doing_ftdiv := "00";
-            v.divext := e_in.insn(8) and not e_in.insn(7);
-            v.divmod := not e_in.insn(8);
-            v.is_signed := e_in.is_signed;
             v.int_ovf := '0';
             v.div_close := '0';
 
-            adec := decode_dp(e_in.fra, int_input);
-            bdec := decode_dp(e_in.frb, int_input);
-            cdec := decode_dp(e_in.frc, int_input);
+            adec := decode_dp(e_in.fra, int_input, is_32bint, e_in.is_signed);
+            bdec := decode_dp(e_in.frb, int_input, is_32bint, e_in.is_signed);
+            cdec := decode_dp(e_in.frc, int_input, '0', '0');
             v.a := adec;
             v.b := bdec;
             v.c := cdec;
@@ -870,6 +897,7 @@ begin
         shiftin0 := '0';
         rbit_inc := '0';
         mult_mask := '0';
+        rnd_b32 := '0';
         int_result := '0';
         illegal := '0';
         case r.state is
@@ -918,7 +946,7 @@ begin
                             else
                                 v.state := DO_FRI;
                             end if;
-                        when "01001" =>
+                        when "01001" | "01011" =>
                             -- integer divides and mods, major opcode 31
                             v.opsel_a := AIN_B;
                             v.state := DO_IDIVMOD;
@@ -2552,6 +2580,10 @@ begin
                     v.shift := to_signed(-UNIT_BIT, EXP_BITS);
                     v.first := '1';
                     v.state := IDIV_DIV;
+                elsif r.single_prec = '1' then
+                    -- divwe[u][o], shift A left 32 bits
+                    v.shift := to_signed(32, EXP_BITS);
+                    v.state := IDIV_SH32;
                 elsif r.div_close = '0' then
                     v.shift := to_signed(64 - UNIT_BIT, EXP_BITS);
                     v.state := IDIV_EXTDIV;
@@ -2561,6 +2593,12 @@ begin
                     v.opsel_a := AIN_C;
                     v.state := IDIV_EXT_TBH;
                 end if;
+            when IDIV_SH32 =>
+                -- r.shift = 32, R contains the dividend
+                opsel_r <= RES_SHIFT;
+                v.shift := to_signed(-UNIT_BIT, EXP_BITS);
+                v.first := '1';
+                v.state := IDIV_DIV;
             when IDIV_DIV =>
                 -- Dividing A by C, r.shift = -56; A is in R
                 -- Put A into the bottom 64 bits of Ahi/A/Alo
@@ -2805,13 +2843,22 @@ begin
                 -- and also negate R if the answer is negative
                 opsel_ainv <= r.result_sign;
                 carry_in <= r.inc_quot xor r.result_sign;
+                rnd_b32 := '1';
+                if r.divmod = '0' then
+                    opsel_b <= BIN_RND;
+                end if;
                 if r.is_signed = '0' then
                     v.state := IDIV_DONE;
                 else
                     v.state := IDIV_OVFCHK;
                 end if;
             when IDIV_OVFCHK =>
-                v.int_ovf := r.r(63) xor r.result_sign;
+                if r.single_prec = '0' then
+                    sign_bit := r.r(63);
+                else
+                    sign_bit := r.r(31);
+                end if;
+                v.int_ovf := sign_bit xor r.result_sign;
                 if v.int_ovf = '1' then
                     v.state := IDIV_ZERO;
                 else
@@ -2953,7 +3000,9 @@ begin
             when BIN_R =>
                 in_b0 := r.r;
             when BIN_RND =>
-                if rbit_inc = '0' then
+                if rnd_b32 = '1' then
+                    round_inc := (32 => r.result_sign and r.single_prec, others => '0');
+                elsif rbit_inc = '0' then
                     round_inc := (SP_LSB => r.single_prec, DP_LSB => not r.single_prec, others => '0');
                 else
                     round_inc := (DP_RBIT => '1', others => '0');

From 73cc5167ec1ea591d9da43f2e392b5202f045f32 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 9 May 2022 19:18:42 +1000
Subject: [PATCH 21/30] Use FPU for division instructions if we have an FPU

- Arrange for XER to be written for OE=1 forms
- Arrange for condition codes to be set for RC=1 forms
  (including correct handling for 32-bit mode)
- Don't instantiate the divider if we have an FPU.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl     |   7 ++++
 decode1.vhdl    |  52 +++++++++++++++---------
 execute1.vhdl   |  29 ++++++++-----
 fpu.vhdl        |  53 +++++++++++++++++++++++-
 tests/fpu/fpu.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++
 writeback.vhdl  |   7 ++++
 6 files changed, 221 insertions(+), 33 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index aa7b830..f846fb4 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -640,7 +640,10 @@ package common is
         frc       : std_ulogic_vector(63 downto 0);
         frt       : gspr_index_t;
         rc        : std_ulogic;
+        m32b      : std_ulogic;
         out_cr    : std_ulogic;
+        oe        : std_ulogic;
+        xerc      : xer_common_t;
         stall     : std_ulogic;
     end record;
     constant Execute1ToFPUInit : Execute1ToFPUType := (valid => '0', op => OP_ILLEGAL, nia => (others => '0'),
@@ -649,6 +652,7 @@ package common is
                                                        fra => (others => '0'), frb => (others => '0'),
                                                        frc => (others => '0'), frt => (others => '0'),
                                                        single => '0', is_signed => '0', out_cr => '0',
+                                                       m32b => '0', oe => '0', xerc => xerc_init,
                                                        stall => '0');
 
     type FPUToExecute1Type is record
@@ -668,6 +672,8 @@ package common is
         write_cr_enable : std_ulogic;
         write_cr_mask   : std_ulogic_vector(7 downto 0);
         write_cr_data   : std_ulogic_vector(31 downto 0);
+        write_xerc      : std_ulogic;
+        xerc            : xer_common_t;
         intr_vec        : intr_vector_t;
         srr0            : std_ulogic_vector(63 downto 0);
         srr1            : std_ulogic_vector(15 downto 0);
@@ -677,6 +683,7 @@ package common is
          write_enable => '0', write_reg => (others => '0'),
          write_cr_enable => '0', write_cr_mask => (others => '0'),
          write_cr_data => (others => '0'),
+         write_xerc => '0', xerc => xerc_init,
          intr_vec => 0, srr1 => (others => '0'),
          others => (others => '0'));
 
diff --git a/decode1.vhdl b/decode1.vhdl
index 5bc023b..2e2a8e3 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -35,6 +35,18 @@ architecture behaviour of decode1 is
     constant illegal_inst : decode_rom_t :=
         (NONE, NONE, OP_ILLEGAL,   NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE);
 
+    -- Select the execution unit for integer divide and modulo instructions.
+    -- hf = true (we have an FPU): FPU; hf = false: the dedicated ALU divider.
+    function divider_unit(hf : boolean) return unit_t is
+    begin
+        if hf then
+            return FPU;
+        else
+            return ALU;
+        end if;
+    end;
+    constant DVU : unit_t := divider_unit(HAS_FPU);  -- unit named in the div*/mod* decode entries
+
     type reg_internal_t is record
         override : std_ulogic;
         override_decode: decode_rom_t;
@@ -225,22 +237,22 @@ architecture behaviour of decode1 is
         2#0100010110#  =>       (ALU,  NONE, OP_DCBT,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbt
         2#0011110110#  =>       (ALU,  NONE, OP_DCBTST,    NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbtst
         2#1111110110#  =>       (LDST, NONE, OP_DCBZ,      RA_OR_ZERO, RB,          NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbz
-        2#0110001001#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- divdeu
-        2#1110001001#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- divdeuo
-        2#0110001011#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- divweu
-        2#1110001011#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- divweuo
-        2#0110101001#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- divde
-        2#1110101001#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- divdeo
-        2#0110101011#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- divwe
-        2#1110101011#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- divweo
-        2#0111001001#  =>       (ALU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- divdu
-        2#1111001001#  =>       (ALU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- divduo
-        2#0111001011#  =>       (ALU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- divwu
-        2#1111001011#  =>       (ALU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- divwuo
-        2#0111101001#  =>       (ALU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- divd
-        2#1111101001#  =>       (ALU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- divdo
-        2#0111101011#  =>       (ALU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- divw
-        2#1111101011#  =>       (ALU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- divwo
+        2#0110001001#  =>       (DVU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- divdeu
+        2#1110001001#  =>       (DVU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- divdeuo
+        2#0110001011#  =>       (DVU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- divweu
+        2#1110001011#  =>       (DVU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- divweuo
+        2#0110101001#  =>       (DVU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- divde
+        2#1110101001#  =>       (DVU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- divdeo
+        2#0110101011#  =>       (DVU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- divwe
+        2#1110101011#  =>       (DVU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- divweo
+        2#0111001001#  =>       (DVU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- divdu
+        2#1111001001#  =>       (DVU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- divduo
+        2#0111001011#  =>       (DVU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- divwu
+        2#1111001011#  =>       (DVU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- divwuo
+        2#0111101001#  =>       (DVU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- divd
+        2#1111101001#  =>       (DVU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- divdo
+        2#0111101011#  =>       (DVU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- divw
+        2#1111101011#  =>       (DVU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- divwo
         2#1100110110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dss
         2#0101010110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dst
         2#0101110110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dstst
@@ -318,10 +330,10 @@ architecture behaviour of decode1 is
         2#0000010011#  =>       (ALU,  NONE, OP_MFCR,      NONE,       NONE,        NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mfcr/mfocrf
         2#0001010011#  =>       (ALU,  NONE, OP_MFMSR,     NONE,       NONE,        NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- mfmsr
         2#0101010011#  =>       (ALU,  NONE, OP_MFSPR,     SPR,        NONE,        RS,   RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mfspr
-        2#0100001001#  =>       (ALU,  NONE, OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- modud
-        2#0100001011#  =>       (ALU,  NONE, OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- moduw
-        2#1100001001#  =>       (ALU,  NONE, OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- modsd
-        2#1100001011#  =>       (ALU,  NONE, OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', NONE, '0', '0', NONE), -- modsw
+        2#0100001001#  =>       (DVU,  NONE, OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- modud
+        2#0100001011#  =>       (DVU,  NONE, OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- moduw
+        2#1100001001#  =>       (DVU,  NONE, OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- modsd
+        2#1100001011#  =>       (DVU,  NONE, OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', NONE, '0', '0', NONE), -- modsw
         2#0010010000#  =>       (ALU,  NONE, OP_MTCRF,     NONE,       NONE,        RS,   NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtcrf/mtocrf
         2#0010010010#  =>       (ALU,  NONE, OP_MTMSRD,    NONE,       NONE,        RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- mtmsr
         2#0010110010#  =>       (ALU,  NONE, OP_MTMSRD,    NONE,       NONE,        RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtmsrd # ignore top bits and d
diff --git a/execute1.vhdl b/execute1.vhdl
index 2121963..2efe439 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -188,7 +188,7 @@ architecture behaviour of execute1 is
 
     -- divider signals
     signal x_to_divider: Execute1ToDividerType;
-    signal divider_to_x: DividerToExecute1Type;
+    signal divider_to_x: DividerToExecute1Type := DividerToExecute1Init;
 
     -- random number generator signals
     signal random_raw  : std_ulogic_vector(63 downto 0);
@@ -367,13 +367,15 @@ begin
             m_out => multiply_to_x
             );
 
-    divider_0: entity work.divider
-        port map (
-            clk => clk,
-            rst => rst,
-            d_in => x_to_divider,
-            d_out => divider_to_x
-            );
+    divider_0: if not HAS_FPU generate  -- standalone divider is only instantiated when HAS_FPU is false
+        div_0: entity work.divider
+            port map (
+                clk => clk,
+                rst => rst,
+                d_in => x_to_divider,  -- divide request from execute1
+                d_out => divider_to_x  -- divide result back to execute1
+                );
+    end generate;  -- when not generated, divider_to_x has no driver and keeps its declared initial value
 
     random_0: entity work.random
         port map (
@@ -1159,9 +1161,11 @@ begin
                 owait := '1';
 
 	    when OP_DIV | OP_DIVE | OP_MOD =>
-                v.start_div := '1';
-                slow_op := '1';
-                owait := '1';
+                if not HAS_FPU then
+                    v.start_div := '1';
+                    slow_op := '1';
+                    owait := '1';
+                end if;
 
             when OP_FETCH_FAILED =>
                 -- Handling an ITLB miss doesn't count as having executed an instruction
@@ -1457,6 +1461,9 @@ begin
         fv.frt := e_in.write_reg;
         fv.rc := e_in.rc;
         fv.out_cr := e_in.output_cr;
+        fv.m32b := not ex1.msr(MSR_SF);
+        fv.oe := e_in.oe;
+        fv.xerc := xerc_in;
         fv.stall := l_in.l2stall;
 
 	-- Update registers
diff --git a/fpu.vhdl b/fpu.vhdl
index b8cea39..90e04b3 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -125,6 +125,7 @@ architecture behaviour of fpu is
         write_reg    : gspr_index_t;
         complete_tag : instr_tag_t;
         writing_cr   : std_ulogic;
+        writing_xer  : std_ulogic;
         int_result   : std_ulogic;
         cr_result    : std_ulogic_vector(3 downto 0);
         cr_mask      : std_ulogic_vector(7 downto 0);
@@ -151,6 +152,7 @@ architecture behaviour of fpu is
         invalid      : std_ulogic;
         negate       : std_ulogic;
         longmask     : std_ulogic;
+        integer_op   : std_ulogic;
         divext       : std_ulogic;
         divmod       : std_ulogic;
         is_signed    : std_ulogic;
@@ -159,6 +161,10 @@ architecture behaviour of fpu is
         inc_quot     : std_ulogic;
         a_hi         : std_ulogic_vector(7 downto 0);
         a_lo         : std_ulogic_vector(55 downto 0);
+        m32b         : std_ulogic;
+        oe           : std_ulogic;
+        xerc         : xer_common_t;
+        xerc_result  : xer_common_t;
     end record;
 
     type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0);
@@ -604,6 +610,7 @@ begin
                 r.do_intr <= '0';
                 r.writing_fpr <= '0';
                 r.writing_cr <= '0';
+                r.writing_xer <= '0';
                 r.fpscr <= (others => '0');
                 r.write_reg <= (others =>'0');
                 r.complete_tag.valid <= '0';
@@ -658,6 +665,8 @@ begin
     w_out.write_cr_mask <= r.cr_mask;
     w_out.write_cr_data <= r.cr_result & r.cr_result & r.cr_result & r.cr_result &
                            r.cr_result & r.cr_result & r.cr_result & r.cr_result;
+    w_out.write_xerc <= r.writing_xer and r.complete;
+    w_out.xerc <= r.xerc_result;
     w_out.interrupt <= r.do_intr;
     w_out.intr_vec <= 16#700#;
     w_out.srr0 <= r.nia;
@@ -739,6 +748,7 @@ begin
             v.instr_done := '0';
             v.writing_fpr := '0';
             v.writing_cr := '0';
+            v.writing_xer := '0';
             v.comm_fpscr := r.fpscr;
             v.illegal := '0';
         end if;
@@ -755,7 +765,11 @@ begin
             v.is_signed := e_in.is_signed;
             v.rc := e_in.rc;
             v.is_cmp := e_in.out_cr;
+            v.oe := e_in.oe;
+            v.m32b := e_in.m32b;
+            v.xerc := e_in.xerc;
             v.longmask := '0';
+            v.integer_op := '0';
             v.divext := '0';
             v.divmod := '0';
             if e_in.op = OP_FPOP or e_in.op = OP_FPOP_I then
@@ -764,6 +778,7 @@ begin
                     int_input := '1';
                 end if;
             else -- OP_DIV, OP_DIVE, OP_MOD
+                v.integer_op := '1';
                 int_input := '1';
                 is_32bint := e_in.single;
                 if e_in.op = OP_DIVE then
@@ -2865,12 +2880,44 @@ begin
                     v.state := IDIV_DONE;
                 end if;
             when IDIV_DONE =>
+                v.xerc_result := v.xerc;
+                if r.oe = '1' then
+                    v.xerc_result.ov := '0';
+                    v.xerc_result.ov32 := '0';
+                    v.writing_xer := '1';
+                end if;
+                if r.m32b = '0' then
+                    v.cr_result(3) := r.r(63);
+                    v.cr_result(2 downto 1) := "00";
+                    if r.r = 64x"0" then
+                        v.cr_result(1) := '1';
+                    else
+                        v.cr_result(2) := not r.r(63);
+                    end if;
+                else
+                    v.cr_result(3) := r.r(31);
+                    v.cr_result(2 downto 1) := "00";
+                    if r.r(31 downto 0) = 32x"0" then
+                        v.cr_result(1) := '1';
+                    else
+                        v.cr_result(2) := not r.r(31);
+                    end if;
+                end if;
+                v.cr_result(0) := v.xerc.so;
                 int_result := '1';
                 v.writing_fpr := '1';
                 v.instr_done := '1';
             when IDIV_ZERO =>
                 opsel_r <= RES_MISC;
                 misc_sel <= "0101";
+                v.xerc_result := v.xerc;
+                if r.oe = '1' then
+                    v.xerc_result.ov := r.int_ovf;
+                    v.xerc_result.ov32 := r.int_ovf;
+                    v.xerc_result.so := r.xerc.so or r.int_ovf;
+                    v.writing_xer := '1';
+                end if;
+                v.cr_result := "001" & v.xerc_result.so;
                 int_result := '1';
                 v.writing_fpr := '1';
                 v.instr_done := '1';
@@ -3169,14 +3216,16 @@ begin
                 v.state := IDLE;
                 v.busy := '0';
                 v.f2stall := '0';
-                if r.rc = '1' then
+                if r.rc = '1' and (r.op = OP_FPOP or r.op = OP_FPOP_I) then
                     v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX);
                 end if;
                 v.sp_result := r.single_prec;
                 v.int_result := int_result;
                 v.illegal := illegal;
                 v.nsnan_result := v.quieten_nan;
-                if r.is_cmp = '0' then
+                if r.integer_op = '1' then
+                    v.cr_mask := num_to_fxm(0);
+                elsif r.is_cmp = '0' then
                     v.cr_mask := num_to_fxm(1);
                 else
                     v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(r.insn))));
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index 500e92d..773c05d 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -1410,6 +1410,110 @@ int fpu_test_23(void)
 	return trapit(0, test23);
 }
 
+struct idiv_tests {  /* operands and expected results for the 64-bit divide/modulo test (fpu_test_24) */
+	unsigned long denom;  /* first source operand of every instruction, i.e. the dividend despite the name */
+	unsigned long divisor;  /* second source operand of every instruction */
+	unsigned long divd;  /* expected result of divd */
+	unsigned long divdu;  /* expected result of divdu */
+	unsigned long divde;  /* expected result of divde */
+	unsigned long divdeu;  /* expected result of divdeu */
+	unsigned long modsd;  /* expected result of modsd */
+	unsigned long modud;  /* expected result of modud */
+} idiv_tests[] = {
+	{ 0, 0,			0, 0, 0, 0, 0, 0 },  /* zero divisor: every result is expected to be 0 */
+	{ 0x56789a, 0x1234,	0x4c0, 0x4c0, 0, 0, 0x19a, 0x19a },
+	{ 2, 3,			0, 0, 0, 0xaaaaaaaaaaaaaaaa, 2, 2 },
+	{ 31, 157,		0, 0, 0x328c3ab35cf15328, 0x328c3ab35cf15328, 31, 31 },
+	{ -4329874, 43879,	-98, 0x17e5a119b9170, 0, 0, -29732, 39518 },  /* negative dividend: signed and unsigned results differ */
+	{ -4329874, -43879,	98, 0, 0, 0xffffffffffbe99d4, -29732, -4329874 },
+	{ 0x8000000000000000ul, -1, 0, 0, 0, 0x8000000000000000ul, 0, 0x8000000000000000ul },  /* most negative value / -1: divd is expected to give 0 */
+};
+
+int fpu_test_24(void)  /* 64-bit integer divide/modulo test; returns 0 on success, else 1 + index of the first failing idiv_tests row */
+{
+	long i;
+	unsigned long a, b, results[6];  /* results[] order: divd, divdu, divde, divdeu, modsd, modud */
+
+	for (i = 0; i < sizeof(idiv_tests) / sizeof(idiv_tests[0]); ++i) {
+		a = idiv_tests[i].denom;  /* first source operand (the dividend) */
+		b = idiv_tests[i].divisor;  /* second source operand */
+		asm("divd %0,%1,%2" : "=r" (results[0]) : "r" (a), "r" (b));
+		asm("divdu %0,%1,%2" : "=r" (results[1]) : "r" (a), "r" (b));
+		asm("divde %0,%1,%2" : "=r" (results[2]) : "r" (a), "r" (b));
+		asm("divdeu %0,%1,%2" : "=r" (results[3]) : "r" (a), "r" (b));
+		asm("modsd %0,%1,%2" : "=r" (results[4]) : "r" (a), "r" (b));
+		asm("modud %0,%1,%2" : "=r" (results[5]) : "r" (a), "r" (b));
+		if (results[0] != idiv_tests[i].divd ||  /* any mismatch fails the whole row */
+		    results[1] != idiv_tests[i].divdu ||
+		    results[2] != idiv_tests[i].divde ||
+		    results[3] != idiv_tests[i].divdeu ||
+		    results[4] != idiv_tests[i].modsd ||
+		    results[5] != idiv_tests[i].modud) {
+			print_hex(i, 2, " ");  /* failing row index, then the six actual results */
+			print_hex(results[0], 16, " ");
+			print_hex(results[1], 16, " ");
+			print_hex(results[2], 16, " ");
+			print_hex(results[3], 16, " ");
+			print_hex(results[4], 16, " ");
+			print_hex(results[5], 16, "\r\n");
+			return i + 1;  /* non-zero so that row 0 is distinguishable from success */
+		}
+	}
+	return 0;  /* every row matched */
+}
+
+struct wdiv_tests {  /* operands and expected results for the 32-bit divide/modulo test (fpu_test_25) */
+	unsigned int denom;  /* first source operand of every instruction, i.e. the dividend despite the name */
+	unsigned int divisor;  /* second source operand of every instruction */
+	unsigned int divw;  /* expected result of divw */
+	unsigned int divwu;  /* expected result of divwu */
+	unsigned int divwe;  /* expected result of divwe */
+	unsigned int divweu;  /* expected result of divweu */
+	unsigned int modsw;  /* expected result of modsw */
+	unsigned int moduw;  /* expected result of moduw */
+} wdiv_tests[] = {
+	{ 0, 0,			0, 0, 0, 0, 0, 0 },  /* zero divisor: every result is expected to be 0 */
+	{ 0x56789a, 0x1234,	0x4c0, 0x4c0, 0, 0, 0x19a, 0x19a },
+	{ 2, 3,			0, 0, 0, 0xaaaaaaaa, 2, 2 },
+	{ 31, 157,		0, 0, 0x328c3ab3, 0x328c3ab3, 31, 31 },
+	{ -4329874, 43879,	-98, 0x17df7, 0, 0, -29732, 17165 },  /* negative dividend: signed and unsigned results differ */
+	{ -4329874, -43879,	98, 0, 0, 0xffbe99a9, -29732, -4329874 },
+	{ 0x80000000u, -1,	0, 0, 0, 0x80000000u, 0, 0x80000000u },  /* most negative value / -1: divw is expected to give 0 */
+};
+
+int fpu_test_25(void)  /* 32-bit integer divide/modulo test; returns 0 on success, else 1 + index of the first failing wdiv_tests row */
+{
+	long i;
+	unsigned int a, b, results[6];  /* results[] order: divw, divwu, divwe, divweu, modsw, moduw; only 32 bits of each result are kept */
+
+	for (i = 0; i < sizeof(wdiv_tests) / sizeof(wdiv_tests[0]); ++i) {
+		a = wdiv_tests[i].denom;  /* first source operand (the dividend) */
+		b = wdiv_tests[i].divisor;  /* second source operand */
+		asm("divw %0,%1,%2" : "=r" (results[0]) : "r" (a), "r" (b));
+		asm("divwu %0,%1,%2" : "=r" (results[1]) : "r" (a), "r" (b));
+		asm("divwe %0,%1,%2" : "=r" (results[2]) : "r" (a), "r" (b));
+		asm("divweu %0,%1,%2" : "=r" (results[3]) : "r" (a), "r" (b));
+		asm("modsw %0,%1,%2" : "=r" (results[4]) : "r" (a), "r" (b));
+		asm("moduw %0,%1,%2" : "=r" (results[5]) : "r" (a), "r" (b));
+		if (results[0] != wdiv_tests[i].divw ||  /* any mismatch fails the whole row */
+		    results[1] != wdiv_tests[i].divwu ||
+		    results[2] != wdiv_tests[i].divwe ||
+		    results[3] != wdiv_tests[i].divweu ||
+		    results[4] != wdiv_tests[i].modsw ||
+		    results[5] != wdiv_tests[i].moduw) {
+			print_hex(i, 2, " ");  /* failing row index, then the six actual results */
+			print_hex(results[0], 8, " ");
+			print_hex(results[1], 8, " ");
+			print_hex(results[2], 8, " ");
+			print_hex(results[3], 8, " ");
+			print_hex(results[4], 8, " ");
+			print_hex(results[5], 8, "\r\n");
+			return i + 1;  /* non-zero so that row 0 is distinguishable from success */
+		}
+	}
+	return 0;  /* every row matched */
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -1458,6 +1562,8 @@ int main(void)
 	do_test(21, fpu_test_21);
 	do_test(22, fpu_test_22);
 	do_test(23, fpu_test_23);
+	do_test(24, fpu_test_24);
+	do_test(25, fpu_test_25);
 
 	return fail;
 }
diff --git a/writeback.vhdl b/writeback.vhdl
index 0d6f41d..5b384c6 100644
--- a/writeback.vhdl
+++ b/writeback.vhdl
@@ -73,6 +73,8 @@ begin
             assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) +
                     to_integer(unsigned(y))) <= 1 severity failure;
 
+            assert (e_in.write_xerc_enable and fp_in.write_xerc) /= '1' severity failure;
+
             assert not (e_in.valid = '1' and e_in.instr_tag.valid = '0') severity failure;
             assert not (l_in.valid = '1' and l_in.instr_tag.valid = '0') severity failure;
             assert not (fp_in.valid = '1' and fp_in.instr_tag.valid = '0') severity failure;
@@ -168,6 +170,11 @@ begin
                 c_out.write_cr_data <= fp_in.write_cr_data;
             end if;
 
+            if fp_in.write_xerc = '1' then
+                c_out.write_xerc_enable <= '1';
+                c_out.write_xerc_data <= fp_in.xerc;
+            end if;
+
             if l_in.write_enable = '1' then
                 w_out.write_reg <= l_in.write_reg;
                 w_out.write_data <= l_in.write_data;

From bc4d02cb0dcc5b502a45651953ac7bd34521f0b9 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 12 Jul 2022 08:52:05 +1000
Subject: [PATCH 22/30] Start removing SPRs from register file

This starts the process of removing SPRs from the register file by
moving SRR0/1, SPRG0-3, HSRR0/1 and HSPRG0/1 out of the register file
and putting them into execute1.  They are stored in a pair of small
RAM arrays, referred to as "even" and "odd".  The reason for having
two arrays is so that two values can be read and written in each
cycle.  For example, SRR0 and SRR1 can be written in parallel by an
interrupt and read in parallel by the rfid instruction.

The addresses in the RAM which will be accessed are determined in the
decode2 stage.  We have one write address for both sides, but two read
addresses, since in future we will want to be able to read CTR at the
same time as either LR or TAR.

We now have a connection from writeback to execute1 which carries the
partial SRR1 value for an interrupt.  SRR0 comes from the execute
pipeline; we no longer need to carry instruction addresses along the
LSU and FPU pipelines.  Since SRR0 and SRR1 can be written in the same
cycle now, we don't need the little state machine in writeback any
more.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl     |  64 +++++++++++++--------
 core.vhdl       |   6 +-
 decode1.vhdl    |  61 +++++++++++++++-----
 decode2.vhdl    |  19 +++++++
 execute1.vhdl   | 146 ++++++++++++++++++++++++++++++++++++++++++------
 fpu.vhdl        |   3 -
 loadstore1.vhdl |   8 +--
 writeback.vhdl  |  43 ++------------
 8 files changed, 242 insertions(+), 108 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index f846fb4..74341d1 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -124,6 +124,28 @@ package common is
     end record;
     constant xerc_init : xer_common_t := (others => '0');
 
+    -- Some SPRs are stored in a pair of small RAMs ("even" and "odd") in execute1
+    -- Even half (entry numbers within the even RAM):
+    subtype ramspr_index is natural range 0 to 7;  -- entry number; the same range is used for both halves
+    constant RAMSPR_SRR0   : ramspr_index := 0;
+    constant RAMSPR_HSRR0  : ramspr_index := 1;
+    constant RAMSPR_SPRG0  : ramspr_index := 2;
+    constant RAMSPR_SPRG2  : ramspr_index := 3;
+    constant RAMSPR_HSPRG0 : ramspr_index := 4;
+    -- Odd half (entry numbers within the odd RAM; each shares its number with the even-half partner above):
+    constant RAMSPR_SRR1   : ramspr_index := 0;
+    constant RAMSPR_HSRR1  : ramspr_index := 1;
+    constant RAMSPR_SPRG1  : ramspr_index := 2;
+    constant RAMSPR_SPRG3  : ramspr_index := 3;
+    constant RAMSPR_HSPRG1 : ramspr_index := 4;
+
+    type ram_spr_info is record  -- where (if anywhere) an SPR number lives in the SPR RAMs
+        index : ramspr_index;  -- entry number within the selected half
+        isodd : std_ulogic;  -- '1' = odd half, '0' = even half
+        valid : std_ulogic;  -- '1' if the SPR is held in the SPR RAMs at all
+    end record;
+    constant ram_spr_info_init: ram_spr_info := (index => 0, others => '0');  -- default: not a RAM SPR
+
     subtype spr_selector is std_ulogic_vector(2 downto 0);
     type spr_id is record
         sel   : spr_selector;
@@ -253,12 +275,13 @@ package common is
         br_pred: std_ulogic; -- Branch was predicted to be taken
         big_endian: std_ulogic;
         spr_info : spr_id;
+        ram_spr : ram_spr_info;
     end record;
     constant Decode1ToDecode2Init : Decode1ToDecode2Type :=
         (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'),
          ispr1 => (others => '0'), ispr2 => (others => '0'), ispro => (others => '0'),
          decode => decode_rom_init, br_pred => '0', big_endian => '0',
-         spr_info => spr_id_init);
+         spr_info => spr_id_init, ram_spr => ram_spr_info_init);
 
     type Decode1ToFetch1Type is record
         redirect     : std_ulogic;
@@ -320,6 +343,13 @@ package common is
         repeat : std_ulogic;                            -- set if instruction is cracked into two ops
         second : std_ulogic;                            -- set if this is the second op
         spr_select : spr_id;
+        spr_is_ram : std_ulogic;
+        ramspr_even_rdaddr : ramspr_index;
+        ramspr_odd_rdaddr  : ramspr_index;
+        ramspr_rd_odd      : std_ulogic;
+        ramspr_wraddr      : ramspr_index;
+        ramspr_write_even  : std_ulogic;
+        ramspr_write_odd   : std_ulogic;
     end record;
     constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
 	(valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init,
@@ -333,6 +363,9 @@ package common is
          cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'),
          result_sel => "000", sub_select => "000",
          repeat => '0', second => '0', spr_select => spr_id_init,
+         spr_is_ram => '0',
+         ramspr_even_rdaddr => 0, ramspr_odd_rdaddr => 0, ramspr_rd_odd => '0',
+         ramspr_wraddr => 0, ramspr_write_even => '0', ramspr_write_odd => '0',
          others => (others => '0'));
 
     type MultiplyInputType is record
@@ -574,7 +607,6 @@ package common is
         store_done : std_ulogic;
         interrupt : std_ulogic;
         intr_vec : intr_vector_t;
-        srr0: std_ulogic_vector(63 downto 0);
         srr1: std_ulogic_vector(15 downto 0);
     end record;
     constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType :=
@@ -582,7 +614,7 @@ package common is
          write_reg => (others => '0'), write_data => (others => '0'),
          xerc => xerc_init, rc => '0', store_done => '0',
          interrupt => '0', intr_vec => 0,
-         srr0 => (others => '0'), srr1 => (others => '0'));
+         srr1 => (others => '0'));
 
     type Loadstore1EventType is record
         load_complete  : std_ulogic;
@@ -675,7 +707,6 @@ package common is
         write_xerc      : std_ulogic;
         xerc            : xer_common_t;
         intr_vec        : intr_vector_t;
-        srr0            : std_ulogic_vector(63 downto 0);
         srr1            : std_ulogic_vector(15 downto 0);
     end record;
     constant FPUToWritebackInit : FPUToWritebackType :=
@@ -731,6 +762,11 @@ package common is
 							       write_cr_mask => (others => '0'),
 							       write_cr_data => (others => '0'));
 
+    type WritebackToExecute1Type is record  -- interrupt notification sent from writeback to execute1
+        intr : std_ulogic;  -- '1' when an interrupt is being taken
+        srr1 : std_ulogic_vector(15 downto 0);  -- partial SRR1 value for the interrupt
+    end record;
+
     type WritebackEventType is record
         instr_complete : std_ulogic;
         fp_complete    : std_ulogic;
@@ -755,26 +791,6 @@ package body common is
            n := 0;              -- N.B. decode2 relies on this specific value
        when SPR_CTR =>
            n := 1;              -- N.B. decode2 relies on this specific value
-       when SPR_SRR0 =>
-           n := 2;
-       when SPR_SRR1 =>
-           n := 3;
-       when SPR_HSRR0 =>
-           n := 4;
-       when SPR_HSRR1 =>
-           n := 5;
-       when SPR_SPRG0 =>
-           n := 6;
-       when SPR_SPRG1 =>
-           n := 7;
-       when SPR_SPRG2 =>
-           n := 8;
-       when SPR_SPRG3 | SPR_SPRG3U =>
-           n := 9;
-       when SPR_HSPRG0 =>
-           n := 10;
-       when SPR_HSPRG1 =>
-           n := 11;
        when SPR_TAR =>
            n := 13;
        when others =>
diff --git a/core.vhdl b/core.vhdl
index ba8f0cc..b2f2704 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -102,6 +102,7 @@ architecture behave of core is
 
     -- Writeback signals
     signal writeback_bypass: bypass_data_t;
+    signal wb_interrupt: WritebackToExecute1Type;
 
     -- local signals
     signal fetch1_stall_in : std_ulogic;
@@ -122,7 +123,6 @@ architecture behave of core is
     signal complete: instr_tag_t;
     signal terminate: std_ulogic;
     signal core_rst: std_ulogic;
-    signal do_interrupt: std_ulogic;
 
     -- Delayed/Latched resets and alt_reset
     signal rst_fetch1  : std_ulogic;
@@ -361,7 +361,7 @@ begin
             l_in => loadstore1_to_execute1,
             fp_in => fpu_to_execute1,
             ext_irq_in => ext_irq,
-            interrupt_in => do_interrupt,
+            interrupt_in => wb_interrupt,
             l_out => execute1_to_loadstore1,
             fp_out => execute1_to_fpu,
             e_out => execute1_to_writeback,
@@ -469,7 +469,7 @@ begin
             f_out => writeback_to_fetch1,
             wb_bypass => writeback_bypass,
             events => writeback_events,
-            interrupt_out => do_interrupt,
+            interrupt_out => wb_interrupt,
             complete_out => complete
             );
 
diff --git a/decode1.vhdl b/decode1.vhdl
index 2e2a8e3..fd01d61 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -181,7 +181,7 @@ architecture behaviour of decode1 is
         -- isync
         2#111#    =>       (ALU, NONE, OP_ISYNC,     NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         -- rfid
-        2#101#    =>       (ALU, NONE, OP_RFID,      SPR,        SPR,         NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
+        2#101#    =>       (ALU, NONE, OP_RFID,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         others   => illegal_inst
         );
 
@@ -525,6 +525,42 @@ architecture behaviour of decode1 is
     constant nop_instr      : decode_rom_t := (ALU,  NONE, OP_NOP,          NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE);
     constant fetch_fail_inst: decode_rom_t := (LDST, NONE, OP_FETCH_FAILED, NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE);
 
+    function decode_ram_spr(sprn : spr_num_t) return ram_spr_info is  -- map an SPR number to its half and entry in the SPR RAMs
+        variable ret : ram_spr_info;
+    begin
+        ret := (index => 0, isodd => '0', valid => '1');  -- default: valid, even half; overridden per case below
+        case sprn is
+            when SPR_SRR0 =>
+                ret.index := RAMSPR_SRR0;
+            when SPR_SRR1 =>
+                ret.index := RAMSPR_SRR1;
+                ret.isodd := '1';
+            when SPR_HSRR0 =>
+                ret.index := RAMSPR_HSRR0;
+            when SPR_HSRR1 =>
+                ret.index := RAMSPR_HSRR1;
+                ret.isodd := '1';
+            when SPR_SPRG0 =>
+                ret.index := RAMSPR_SPRG0;
+            when SPR_SPRG1 =>
+                ret.index := RAMSPR_SPRG1;
+                ret.isodd := '1';
+            when SPR_SPRG2 =>
+                ret.index := RAMSPR_SPRG2;
+            when SPR_SPRG3 | SPR_SPRG3U =>  -- both SPR numbers select the same RAM entry
+                ret.index := RAMSPR_SPRG3;
+                ret.isodd := '1';
+            when SPR_HSPRG0 =>
+                ret.index := RAMSPR_HSPRG0;
+            when SPR_HSPRG1 =>
+                ret.index := RAMSPR_HSPRG1;
+                ret.isodd := '1';
+            when others =>
+                ret.valid := '0';  -- not held in the SPR RAMs; index/isodd keep their defaults
+        end case;
+        return ret;
+    end;
+
     function map_spr(sprn : spr_num_t) return spr_id is
         variable i : spr_id;
     begin
@@ -614,6 +650,7 @@ begin
 
         sprn := decode_spr_num(f_in.insn);
         v.spr_info := map_spr(sprn);
+        v.ram_spr := decode_ram_spr(sprn);
 
         case to_integer(unsigned(majorop)) is
         when 4 =>
@@ -632,17 +669,17 @@ begin
 
             if std_match(f_in.insn(10 downto 1), "01-1010011") then
                 -- mfspr or mtspr
-                if is_fast_spr(v.ispr1) = '0' then
-                    -- Make mtspr to slow SPRs single issue
+                -- Make mtspr to slow SPRs single issue
+                if v.spr_info.valid = '1' then
                     vi.force_single := f_in.insn(8);
-                    -- send MMU-related SPRs to loadstore1
-                    case sprn is
-                        when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PTCR =>
-                            vi.override_decode.unit := LDST;
-                            vi.override_unit := '1';
-                        when others =>
-                    end case;
                 end if;
+                -- send MMU-related SPRs to loadstore1
+                case sprn is
+                    when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PTCR =>
+                        vi.override_decode.unit := LDST;
+                        vi.override_unit := '1';
+                    when others =>
+                end case;
             end if;
 
         when 16 =>
@@ -690,10 +727,6 @@ begin
                 else
                     v.ispr2 := fast_spr_num(SPR_TAR);
                 end if;
-            else
-                -- Could be OP_RFID
-                v.ispr1 := fast_spr_num(SPR_SRR1);
-                v.ispr2 := fast_spr_num(SPR_SRR0);
             end if;
 
         when 24 =>
diff --git a/decode2.vhdl b/decode2.vhdl
index a043ef9..c76b7f5 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -480,6 +480,23 @@ begin
 
             v.e.spr_select := d_in.spr_info;
 
+            case op is
+                when OP_MFSPR =>
+                    v.e.ramspr_even_rdaddr := d_in.ram_spr.index;
+                    v.e.ramspr_odd_rdaddr := d_in.ram_spr.index;
+                    v.e.ramspr_rd_odd := d_in.ram_spr.isodd;
+                    v.e.spr_is_ram := d_in.ram_spr.valid;
+                when OP_MTSPR =>
+                    v.e.ramspr_wraddr := d_in.ram_spr.index;
+                    v.e.ramspr_write_even := d_in.ram_spr.valid and not d_in.ram_spr.isodd;
+                    v.e.ramspr_write_odd := d_in.ram_spr.valid and d_in.ram_spr.isodd;
+                    v.e.spr_is_ram := d_in.ram_spr.valid;
+                when OP_RFID =>
+                    v.e.ramspr_even_rdaddr := RAMSPR_SRR0;
+                    v.e.ramspr_odd_rdaddr := RAMSPR_SRR1;
+                when others =>
+            end case;
+
             case d_in.decode.length is
                 when is1B =>
                     length := "0001";
@@ -530,6 +547,8 @@ begin
             if op = OP_MFSPR then
                 if is_fast_spr(d_in.ispr1) = '1' then
                     v.e.result_sel := "000";        -- adder_result, effectively a_in
+                elsif d_in.ram_spr.valid = '1' then
+                    v.e.result_sel := "101";        -- ramspr_result
                 elsif d_in.spr_info.valid = '0' then
                     -- Privileged mfspr to invalid/unimplemented SPR numbers
                     -- writes the contents of RT back to RT (i.e. it's a no-op)
diff --git a/execute1.vhdl b/execute1.vhdl
index 2efe439..b0b2f98 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -31,7 +31,7 @@ entity execute1 is
         fp_in : in FPUToExecute1Type;
 
 	ext_irq_in : std_ulogic;
-        interrupt_in : std_ulogic;
+        interrupt_in : WritebackToExecute1Type;
 
 	-- asynchronous
         l_out : out Execute1ToLoadstore1Type;
@@ -72,6 +72,8 @@ architecture behaviour of execute1 is
         write_loga : std_ulogic;
         inc_loga : std_ulogic;
         write_pmuspr : std_ulogic;
+        ramspr_write_even : std_ulogic;
+        ramspr_write_odd : std_ulogic;
     end record;
     constant side_effect_init : side_effect_type := (others => '0');
 
@@ -119,6 +121,7 @@ architecture behaviour of execute1 is
         msr : std_ulogic_vector(63 downto 0);
         xerc : xer_common_t;
         xerc_valid : std_ulogic;
+        ramspr_wraddr : ramspr_index;
     end record;
     constant reg_stage1_type_init : reg_stage1_type :=
         (e => Execute1ToWritebackInit, se => side_effect_init,
@@ -130,7 +133,8 @@ architecture behaviour of execute1 is
          no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0',
          taken_branch_event => '0', br_mispredict => '0',
          msr => 64x"0",
-         xerc => xerc_init, xerc_valid => '0');
+         xerc => xerc_init, xerc_valid => '0',
+         ramspr_wraddr => 0);
 
     type reg_stage2_type is record
 	e : Execute1ToWritebackType;
@@ -203,6 +207,20 @@ architecture behaviour of execute1 is
     signal exception_log : std_ulogic;
     signal irq_valid_log : std_ulogic;
 
+    -- SPR-related signals
+    type ramspr_half_t is array(ramspr_index) of std_ulogic_vector(63 downto 0);
+    signal even_sprs : ramspr_half_t := (others => (others => '0'));
+    signal odd_sprs : ramspr_half_t := (others => (others => '0'));
+    signal ramspr_even : std_ulogic_vector(63 downto 0);
+    signal ramspr_odd : std_ulogic_vector(63 downto 0);
+    signal ramspr_result : std_ulogic_vector(63 downto 0);
+    signal ramspr_rd_odd : std_ulogic;
+    signal ramspr_wr_addr : ramspr_index;
+    signal ramspr_even_wr_data : std_ulogic_vector(63 downto 0);
+    signal ramspr_even_wr_enab : std_ulogic;
+    signal ramspr_odd_wr_data : std_ulogic_vector(63 downto 0);
+    signal ramspr_odd_wr_enab : std_ulogic;
+
     signal stage2_stall : std_ulogic;
 
     type privilege_level is (USER, SUPER);
@@ -289,6 +307,18 @@ architecture behaviour of execute1 is
 	return msr_out;
     end;
 
+    function intr_srr1(msr: std_ulogic_vector; flags: std_ulogic_vector)
+        return std_ulogic_vector is
+        variable srr1: std_ulogic_vector(63 downto 0);
+    begin
+        srr1(63 downto 31) := msr(63 downto 31);
+        srr1(30 downto 27) := flags(14 downto 11);
+        srr1(26 downto 22) := msr(26 downto 22);
+        srr1(21 downto 16) := flags(5 downto 0);
+        srr1(15 downto  0) := msr(15 downto 0);
+        return srr1;
+    end;
+
     -- Work out whether a signed value fits into n bits,
     -- that is, see if it is in the range -2^(n-1) .. 2^(n-1) - 1
     function fits_in_n_bits(val: std_ulogic_vector; n: integer) return boolean is
@@ -456,6 +486,78 @@ begin
 
     valid_in <= e_in.valid and not (busy_out or flush_in or ex1.e.redirect or ex1.e.interrupt);
 
+    -- SPRs stored in two small RAM arrays (two so that we can read and write
+    -- two SPRs in each cycle).
+
+    ramspr_read: process(all)
+        variable even_rd_data, odd_rd_data : std_ulogic_vector(63 downto 0);
+        variable wr_addr : ramspr_index;
+        variable even_wr_enab, odd_wr_enab : std_ulogic;
+        variable even_wr_data, odd_wr_data : std_ulogic_vector(63 downto 0);
+        variable doit : std_ulogic;
+    begin
+        -- Read address mux and async RAM reading
+        even_rd_data := even_sprs(e_in.ramspr_even_rdaddr);
+        odd_rd_data := odd_sprs(e_in.ramspr_odd_rdaddr);
+
+        -- Write address and data muxes
+        doit := ex1.e.valid and not stage2_stall and not flush_in;
+        even_wr_enab := (ex1.se.ramspr_write_even and doit) or interrupt_in.intr;
+        odd_wr_enab  := (ex1.se.ramspr_write_odd and doit) or interrupt_in.intr;
+        if interrupt_in.intr = '1' then
+            wr_addr := RAMSPR_SRR0;
+        else
+            wr_addr := ex1.ramspr_wraddr;
+        end if;
+        if interrupt_in.intr = '1' then
+            even_wr_data := ex2.e.last_nia;
+            odd_wr_data := intr_srr1(ctrl.msr, interrupt_in.srr1);
+        else
+            even_wr_data := ex1.e.write_data;
+            odd_wr_data := ex1.e.write_data;
+        end if;
+        ramspr_wr_addr <= wr_addr;
+        ramspr_even_wr_data <= even_wr_data;
+        ramspr_even_wr_enab <= even_wr_enab;
+        ramspr_odd_wr_data <= odd_wr_data;
+        ramspr_odd_wr_enab <= odd_wr_enab;
+
+        -- SPR RAM read with write data bypass
+        -- We assume no instruction executes in the cycle immediately following
+        -- an interrupt, so we don't need to bypass interrupt data
+        if ex1.se.ramspr_write_even = '1' and e_in.ramspr_even_rdaddr = ex1.ramspr_wraddr then
+            ramspr_even <= ex1.e.write_data;
+        else
+            ramspr_even <= even_rd_data;
+        end if;
+        if ex1.se.ramspr_write_odd = '1' and e_in.ramspr_odd_rdaddr = ex1.ramspr_wraddr then
+            ramspr_odd <= ex1.e.write_data;
+        else
+            ramspr_odd <= odd_rd_data;
+        end if;
+        if e_in.ramspr_rd_odd = '0' then
+            ramspr_result <= ramspr_even;
+        else
+            ramspr_result <= ramspr_odd;
+        end if;
+    end process;
+
+    ramspr_write: process(clk)
+    begin
+        if rising_edge(clk) then
+            if ramspr_even_wr_enab = '1' then
+                even_sprs(ramspr_wr_addr) <= ramspr_even_wr_data;
+                report "writing even spr " & integer'image(ramspr_wr_addr) & " data=" &
+                    to_hstring(ramspr_even_wr_data);
+            end if;
+            if ramspr_odd_wr_enab = '1' then
+                odd_sprs(ramspr_wr_addr) <= ramspr_odd_wr_data;
+                report "writing odd spr " & integer'image(ramspr_wr_addr) & " data=" &
+                    to_hstring(ramspr_odd_wr_data);
+            end if;
+        end if;
+    end process;
+
     -- First stage result mux
     s1_sel <= e_in.result_sel when ex1.busy = '0' else "100";
     with s1_sel select alu_result <=
@@ -464,6 +566,7 @@ begin
         rotator_result     when "010",
         shortmul_result    when "011",
         muldiv_result      when "100",
+        ramspr_result      when "101",
         next_nia           when "110",
         misc_result        when others;
 
@@ -830,6 +933,7 @@ begin
         variable privileged : std_ulogic;
         variable slow_op : std_ulogic;
         variable owait : std_ulogic;
+        variable srr1 : std_ulogic_vector(63 downto 0);
     begin
         v := actions_type_init;
         v.e.write_data := alu_result;
@@ -850,6 +954,9 @@ begin
         v.e.last_nia := e_in.nia;
         v.e.br_offset := 64x"4";
 
+        v.se.ramspr_write_even := e_in.ramspr_write_even;
+        v.se.ramspr_write_odd := e_in.ramspr_write_odd;
+
         -- Note the difference between v.exception and v.trap:
         -- v.exception signals a condition that prevents execution of the
         -- instruction, and hence shouldn't depend on operand data, so as to
@@ -1009,26 +1116,27 @@ begin
                 end if;
 
 	    when OP_RFID =>
-                v.e.redir_mode := (a_in(MSR_IR) or a_in(MSR_PR)) & not a_in(MSR_PR) &
-                                  not a_in(MSR_LE) & not a_in(MSR_SF);
+                srr1 := ramspr_odd;
+                v.e.redir_mode := (srr1(MSR_IR) or srr1(MSR_PR)) & not srr1(MSR_PR) &
+                                  not srr1(MSR_LE) & not srr1(MSR_SF);
                 -- Can't use msr_copy here because the partial function MSR
                 -- bits should be left unchanged, not zeroed.
-                v.new_msr(63 downto 31) := a_in(63 downto 31);
-                v.new_msr(26 downto 22) := a_in(26 downto 22);
-                v.new_msr(15 downto 0)  := a_in(15 downto 0);
-                if a_in(MSR_PR) = '1' then
+                v.new_msr(63 downto 31) := srr1(63 downto 31);
+                v.new_msr(26 downto 22) := srr1(26 downto 22);
+                v.new_msr(15 downto 0)  := srr1(15 downto 0);
+                if srr1(MSR_PR) = '1' then
                     v.new_msr(MSR_EE) := '1';
                     v.new_msr(MSR_IR) := '1';
                     v.new_msr(MSR_DR) := '1';
                 end if;
                 v.se.write_msr := '1';
-                v.e.br_offset := b_in;
+                v.e.br_offset := ramspr_even;
                 v.e.abs_br := '1';
                 v.e.redirect := '1';
                 v.se.write_cfar := '1';
                 if HAS_FPU then
                     v.fp_intr := fp_in.exception and
-                                 (a_in(MSR_FE0) or a_in(MSR_FE1));
+                                 (srr1(MSR_FE0) or srr1(MSR_FE1));
                 end if;
                 v.do_trace := '0';
 
@@ -1041,10 +1149,10 @@ begin
             when OP_DARN =>
 	    when OP_MFMSR =>
 	    when OP_MFSPR =>
-		if is_fast_spr(e_in.read_reg1) = '1' then
+		if is_fast_spr(e_in.read_reg1) = '1' or e_in.spr_is_ram = '1' then
                     if e_in.valid = '1' then
                         report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
-                            "=" & to_hstring(a_in);
+                            "=" & to_hstring(alu_result);
                     end if;
 		elsif e_in.spr_select.valid = '1' then
                     if e_in.valid = '1' then
@@ -1121,7 +1229,9 @@ begin
                             v.se.write_loga := '1';
                         when others =>
                     end case;
-		elsif is_fast_spr(e_in.write_reg) = '0' then
+                end if;
+		if e_in.spr_select.valid = '0' and is_fast_spr(e_in.write_reg) = '0' and
+                    e_in.spr_is_ram = '0' then
                     -- mtspr to unimplemented SPRs should be a nop in
                     -- supervisor mode and a program interrupt for user mode
                     if ex1.msr(MSR_PR) = '1' then
@@ -1232,6 +1342,7 @@ begin
             v.pmu_spr_num := e_in.insn(20 downto 16);
             v.mul_select := e_in.sub_select(1 downto 0);
             v.se := side_effect_init;
+            v.ramspr_wraddr := e_in.ramspr_wraddr;
         end if;
 
         lv := Execute1ToLoadstore1Init;
@@ -1402,10 +1513,10 @@ begin
             v.mul_finish := '0';
             v.xerc_valid := '0';
         end if;
-        if flush_in = '1' or interrupt_in = '1' then
+        if flush_in = '1' or interrupt_in.intr = '1' then
             v.msr := ctrl_tmp.msr;
         end if;
-        if interrupt_in = '1' then
+        if interrupt_in.intr = '1' then
             v.trace_next := '0';
             v.fp_exception_next := '0';
         end if;
@@ -1449,7 +1560,6 @@ begin
 
         -- Outputs to FPU
         fv.op := e_in.insn_type;
-        fv.nia := e_in.nia;
         fv.insn := e_in.insn;
         fv.itag := e_in.instr_tag;
         fv.single := e_in.is_32bit;
@@ -1607,7 +1717,7 @@ begin
             x_to_pmu.mtspr <= ex1.se.write_pmuspr;
         end if;
 
- 	if interrupt_in = '1' then
+ 	if interrupt_in.intr = '1' then
             ctrl_tmp.msr(MSR_SF) <= '1';
             ctrl_tmp.msr(MSR_EE) <= '0';
             ctrl_tmp.msr(MSR_PR) <= '0';
@@ -1659,7 +1769,7 @@ begin
                             ctrl.msr(MSR_IR) & ctrl.msr(MSR_DR) &
                             exception_log &
                             irq_valid_log &
-                            interrupt_in &
+                            interrupt_in.intr &
                             "000" &
                             ex2.e.write_enable &
                             ex2.e.valid &
diff --git a/fpu.vhdl b/fpu.vhdl
index 90e04b3..2dd221e 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -99,7 +99,6 @@ architecture behaviour of fpu is
         illegal      : std_ulogic;
         op           : insn_type_t;
         insn         : std_ulogic_vector(31 downto 0);
-        nia          : std_ulogic_vector(63 downto 0);
         instr_tag    : instr_tag_t;
         dest_fpr     : gspr_index_t;
         fe_mode      : std_ulogic;
@@ -669,7 +668,6 @@ begin
     w_out.xerc <= r.xerc_result;
     w_out.interrupt <= r.do_intr;
     w_out.intr_vec <= 16#700#;
-    w_out.srr0 <= r.nia;
     w_out.srr1 <= (47-44 => r.illegal, 47-43 => not r.illegal, others => '0');
 
     fpu_1: process(all)
@@ -756,7 +754,6 @@ begin
         -- capture incoming instruction
         if e_in.valid = '1' then
             v.insn := e_in.insn;
-            v.nia := e_in.nia;
             v.op := e_in.op;
             v.instr_tag := e_in.itag;
             v.fe_mode := or (e_in.fe_mode);
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index 7fad454..b556211 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -90,7 +90,6 @@ architecture behave of loadstore1 is
         dword_index  : std_ulogic;
         two_dwords   : std_ulogic;
         incomplete   : std_ulogic;
-        nia          : std_ulogic_vector(63 downto 0);
     end record;
     constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0', tlbie => '0',
                                           dcbz => '0', read_spr => '0', write_spr => '0', mmu_op => '0',
@@ -105,8 +104,7 @@ architecture behave of loadstore1 is
                                           atomic => '0', atomic_last => '0', rc => '0', nc => '0',
                                           virt_mode => '0', priv_mode => '0', load_sp => '0',
                                           sprn => 10x"0", is_slbia => '0', align_intr => '0',
-                                          dword_index => '0', two_dwords => '0', incomplete => '0',
-                                          nia => (others => '0'));
+                                          dword_index => '0', two_dwords => '0', incomplete => '0');
 
     type reg_stage1_t is record
         req : request_t;
@@ -146,7 +144,6 @@ architecture behave of loadstore1 is
         stage1_en    : std_ulogic;
         interrupt    : std_ulogic;
         intr_vec     : integer range 0 to 16#fff#;
-        nia          : std_ulogic_vector(63 downto 0);
         srr1         : std_ulogic_vector(15 downto 0);
         events       : Loadstore1EventType;
     end record;
@@ -412,7 +409,6 @@ begin
         v.virt_mode := l_in.virt_mode;
         v.priv_mode := l_in.priv_mode;
         v.sprn := sprn;
-        v.nia := l_in.nia;
 
         lsu_sum := std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2));
 
@@ -866,7 +862,6 @@ begin
         -- or ISI or ISegI for instruction fetch exceptions
         v.interrupt := exception;
         if exception = '1' then
-            v.nia := r2.req.nia;
             if r2.req.align_intr = '1' then
                 v.intr_vec := 16#600#;
                 v.dar := r2.req.addr;
@@ -962,7 +957,6 @@ begin
         l_out.store_done <= d_in.store_done;
         l_out.interrupt <= r3.interrupt;
         l_out.intr_vec <= r3.intr_vec;
-        l_out.srr0 <= r3.nia;
         l_out.srr1 <= r3.srr1;
 
         -- update busy signal back to execute1
diff --git a/writeback.vhdl b/writeback.vhdl
index 5b384c6..2f6af2c 100644
--- a/writeback.vhdl
+++ b/writeback.vhdl
@@ -25,20 +25,12 @@ entity writeback is
         events       : out WritebackEventType;
 
         flush_out    : out std_ulogic;
-        interrupt_out: out std_ulogic;
+        interrupt_out: out WritebackToExecute1Type;
         complete_out : out instr_tag_t
         );
 end entity writeback;
 
 architecture behaviour of writeback is
-    type irq_state_t is (WRITE_SRR0, WRITE_SRR1);
-
-    type reg_type is record
-        state : irq_state_t;
-        srr1  : std_ulogic_vector(63 downto 0);
-    end record;
-
-    signal r, rin : reg_type;
 
 begin
     writeback_0: process(clk)
@@ -47,13 +39,6 @@ begin
         variable w : std_ulogic_vector(0 downto 0);
     begin
         if rising_edge(clk) then
-            if rst = '1' then
-                r.state <= WRITE_SRR0;
-                r.srr1 <= (others => '0');
-            else
-                r <= rin;
-            end if;
-
             -- Do consistency checks only on the clock edge
             x(0) := e_in.valid;
             y(0) := l_in.valid;
@@ -82,7 +67,6 @@ begin
     end process;
 
     writeback_1: process(all)
-        variable v    : reg_type;
         variable f    : WritebackToFetch1Type;
         variable scf  : std_ulogic_vector(3 downto 0);
         variable vec  : integer range 0 to 16#fff#;
@@ -92,9 +76,7 @@ begin
         w_out <= WritebackToRegisterFileInit;
         c_out <= WritebackToCrFileInit;
         f := WritebackToFetch1Init;
-        interrupt_out <= '0';
         vec := 0;
-        v := r;
 
         complete_out <= instr_tag_init;
         if e_in.valid = '1' then
@@ -108,37 +90,21 @@ begin
         events.fp_complete <= fp_in.valid;
 
         intr := e_in.interrupt or l_in.interrupt or fp_in.interrupt;
+        interrupt_out.intr <= intr;
 
-        if r.state = WRITE_SRR1 then
-            w_out.write_reg <= fast_spr_num(SPR_SRR1);
-            w_out.write_data <= r.srr1;
-            w_out.write_enable <= '1';
-            interrupt_out <= '1';
-            v.state := WRITE_SRR0;
-
-        elsif intr = '1' then
-            w_out.write_reg <= fast_spr_num(SPR_SRR0);
-            w_out.write_enable <= '1';
-            v.state := WRITE_SRR1;
+        if intr = '1' then
             srr1 := (others => '0');
             if e_in.interrupt = '1' then
                 vec := e_in.intr_vec;
-                w_out.write_data <= e_in.last_nia;
                 srr1 := e_in.srr1;
             elsif l_in.interrupt = '1' then
                 vec := l_in.intr_vec;
-                w_out.write_data <= l_in.srr0;
                 srr1 := l_in.srr1;
             elsif fp_in.interrupt = '1' then
                 vec := fp_in.intr_vec;
-                w_out.write_data <= fp_in.srr0;
                 srr1 := fp_in.srr1;
             end if;
-            v.srr1(63 downto 31) := e_in.msr(63 downto 31);
-            v.srr1(30 downto 27) := srr1(14 downto 11);
-            v.srr1(26 downto 22) := e_in.msr(26 downto 22);
-            v.srr1(21 downto 16) := srr1(5 downto 0);
-            v.srr1(15 downto 0) := e_in.msr(15 downto 0);
+            interrupt_out.srr1 <= srr1;
 
         else
             if e_in.write_enable = '1' then
@@ -229,6 +195,5 @@ begin
         wb_bypass.tag.valid <= complete_out.valid and w_out.write_enable;
         wb_bypass.data <= w_out.write_data;
 
-        rin <= v;
     end process;
 end;

From 337b1042501a84b3f28b11e94e650800177a63ce Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 12 Jul 2022 11:20:17 +1000
Subject: [PATCH 23/30] Move LR, CTR and TAR out of the register file

By putting CTR on the odd side and LR and TAR on the even side, we can
read and write CTR for bdnz-style instructions in parallel with
reading LR or TAR for indirect branches and writing LR for branches
with LK=1.  Thus we don't need to double up any of these instructions,
giving a simplification in decode2.

We now have logic for printing LR and CTR at the end of a simulation
in execute1, in addition to the similar logic in register_file and
cr_file.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl        |  27 +++--------
 core.vhdl          |   6 ++-
 decode1.vhdl       |  45 ++++-------------
 decode2.vhdl       |  57 +++++++++++++---------
 execute1.vhdl      | 117 +++++++++++++++++++++++----------------------
 register_file.vhdl |   3 --
 6 files changed, 118 insertions(+), 137 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 74341d1..7df451b 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -132,12 +132,15 @@ package common is
     constant RAMSPR_SPRG0  : ramspr_index := 2;
     constant RAMSPR_SPRG2  : ramspr_index := 3;
     constant RAMSPR_HSPRG0 : ramspr_index := 4;
+    constant RAMSPR_LR     : ramspr_index := 5;         -- must equal RAMSPR_CTR
+    constant RAMSPR_TAR    : ramspr_index := 6;
     -- Odd half:
     constant RAMSPR_SRR1   : ramspr_index := 0;
     constant RAMSPR_HSRR1  : ramspr_index := 1;
     constant RAMSPR_SPRG1  : ramspr_index := 2;
     constant RAMSPR_SPRG3  : ramspr_index := 3;
     constant RAMSPR_HSPRG1 : ramspr_index := 4;
+    constant RAMSPR_CTR    : ramspr_index := 5;         -- must equal RAMSPR_LR
 
     type ram_spr_info is record
         index : ramspr_index;
@@ -322,7 +325,6 @@ package common is
 	rc: std_ulogic;
 	oe: std_ulogic;
 	invert_a: std_ulogic;
-        addm1 : std_ulogic;
 	invert_out: std_ulogic;
 	input_carry: carry_in_t;
 	output_carry: std_ulogic;
@@ -350,11 +352,12 @@ package common is
         ramspr_wraddr      : ramspr_index;
         ramspr_write_even  : std_ulogic;
         ramspr_write_odd   : std_ulogic;
+        dec_ctr : std_ulogic;
     end record;
     constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
 	(valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init,
          write_reg_enable => '0',
-         lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0',
+         lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0',
 	 invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0',
          output_cr => '0', output_xer => '0',
 	 is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0',
@@ -366,6 +369,7 @@ package common is
          spr_is_ram => '0',
          ramspr_even_rdaddr => 0, ramspr_odd_rdaddr => 0, ramspr_rd_odd => '0',
          ramspr_wraddr => 0, ramspr_write_even => '0', ramspr_write_odd => '0',
+         dec_ctr => '0',
          others => (others => '0'));
 
     type MultiplyInputType is record
@@ -780,25 +784,8 @@ package body common is
 	return to_integer(unsigned(insn(15 downto 11) & insn(20 downto 16)));
     end;
     function fast_spr_num(spr: spr_num_t) return gspr_index_t is
-       variable n : integer range 0 to 31;
-       -- tmp variable introduced as workaround for VCS compilation
-       -- simulation was failing with subtype constraint mismatch error
-       -- see GitHub PR #173
-       variable tmp : std_ulogic_vector(4 downto 0);
     begin
-       case spr is
-       when SPR_LR =>
-           n := 0;              -- N.B. decode2 relies on this specific value
-       when SPR_CTR =>
-           n := 1;              -- N.B. decode2 relies on this specific value
-       when SPR_TAR =>
-           n := 13;
-       when others =>
-           n := 0;
-           return "0000000";
-       end case;
-       tmp := std_ulogic_vector(to_unsigned(n, 5));
-       return "01" & tmp;
+        return "0000000";
     end;
 
     function gspr_to_gpr(i: gspr_index_t) return gpr_index_t is
diff --git a/core.vhdl b/core.vhdl
index b2f2704..82c66b4 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -138,6 +138,7 @@ architecture behave of core is
     signal rst_dbg     : std_ulogic;
     signal alt_reset_d : std_ulogic;
 
+    signal sim_ex_dump: std_ulogic;
     signal sim_cr_dump: std_ulogic;
 
     -- Debug actions
@@ -326,7 +327,7 @@ begin
             dbg_gpr_addr => dbg_gpr_addr,
             dbg_gpr_data => dbg_gpr_data,
 	    sim_dump => terminate,
-	    sim_dump_done => sim_cr_dump,
+	    sim_dump_done => sim_ex_dump,
             log_out => log_data(255 downto 184)
 	    );
 
@@ -347,6 +348,7 @@ begin
 
     execute1_0: entity work.execute1
         generic map (
+            SIM => SIM,
             EX1_BYPASS => EX1_BYPASS,
             HAS_FPU => HAS_FPU,
             HAS_SHORT_MULT => HAS_SHORT_MULT,
@@ -376,6 +378,8 @@ begin
             dc_events => dcache_events,
             ic_events => icache_events,
             terminate_out => terminate,
+            sim_dump => sim_ex_dump,
+            sim_dump_done => sim_cr_dump,
             log_out => log_data(134 downto 120),
             log_rd_addr => log_rd_addr,
             log_rd_data => log_rd_data,
diff --git a/decode1.vhdl b/decode1.vhdl
index fd01d61..b6cea31 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -89,8 +89,8 @@ architecture behaviour of decode1 is
         28 =>       (ALU,  NONE, OP_AND,       NONE,       CONST_UI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0', NONE), -- andi.
         29 =>       (ALU,  NONE, OP_AND,       NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0', NONE), -- andis.
          0 =>       (ALU,  NONE, OP_ATTN,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- attn
-        18 =>       (ALU,  NONE, OP_B,         NONE,       CONST_LI,    NONE, SPR,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- b
-        16 =>       (ALU,  NONE, OP_BC,        SPR,        CONST_BD,    NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- bc
+        18 =>       (ALU,  NONE, OP_B,         NONE,       CONST_LI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- b
+        16 =>       (ALU,  NONE, OP_BC,        NONE,       CONST_BD,    NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- bc
         11 =>       (ALU,  NONE, OP_CMP,       RA,         CONST_SI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- cmpi
         10 =>       (ALU,  NONE, OP_CMP,       RA,         CONST_UI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpli
         34 =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lbz
@@ -177,7 +177,7 @@ architecture behaviour of decode1 is
         -- addpcis
         2#001#    =>       (ALU, NONE, OP_ADD,       CIA,        CONST_DXHI4, NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         -- bclr, bcctr, bctar
-        2#100#    =>       (ALU, NONE, OP_BCREG,     SPR,        SPR,         NONE, SPR,  '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
+        2#100#    =>       (ALU, NONE, OP_BCREG,     NONE,       NONE,        NONE, SPR,  '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
         -- isync
         2#111#    =>       (ALU, NONE, OP_ISYNC,     NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         -- rfid
@@ -530,6 +530,13 @@ architecture behaviour of decode1 is
     begin
         ret := (index => 0, isodd => '0', valid => '1');
         case sprn is
+            when SPR_LR =>
+                ret.index := RAMSPR_LR;
+            when SPR_CTR =>
+                ret.index := RAMSPR_CTR;
+                ret.isodd := '1';
+            when SPR_TAR =>
+                ret.index := RAMSPR_TAR;
             when SPR_SRR0 =>
                 ret.index := RAMSPR_SRR0;
             when SPR_SRR1 =>
@@ -683,13 +690,6 @@ begin
             end if;
 
         when 16 =>
-            -- CTR may be needed as input to bc
-            if f_in.insn(23) = '0' then
-                v.ispr1 := fast_spr_num(SPR_CTR);
-                v.ispro := fast_spr_num(SPR_CTR);
-            elsif f_in.insn(0) = '1' then
-                v.ispro := fast_spr_num(SPR_LR);
-            end if;
             -- Predict backward branches as taken, forward as untaken
             v.br_pred := f_in.insn(15);
             br_offset := resize(signed(f_in.insn(15 downto 2)), 24);
@@ -698,37 +698,12 @@ begin
             -- Unconditional branches are always taken
             v.br_pred := '1';
             br_offset := signed(f_in.insn(25 downto 2));
-            if f_in.insn(0) = '1' then
-                v.ispro := fast_spr_num(SPR_LR);
-            end if;
 
         when 19 =>
             vi.override := not decode_op_19_valid(to_integer(unsigned(f_in.insn(5 downto 1) & f_in.insn(10 downto 6))));
             op_19_bits := f_in.insn(5) & f_in.insn(3) & f_in.insn(2);
             v.decode := decode_op_19_array(to_integer(unsigned(op_19_bits)));
 
-            -- Work out ispr1/ispr2 independent of v.decode since they seem to be critical path
-            if f_in.insn(2) = '0' then
-                -- Could be OP_BCREG: bclr, bcctr, bctar
-                -- Branch uses CTR as condition when BO(2) is 0. This is
-                -- also used to indicate that CTR is modified (they go
-                -- together).
-                -- bcctr doesn't update CTR or use it in the branch condition
-                if f_in.insn(23) = '0' and (f_in.insn(10) = '0' or f_in.insn(6) = '1') then
-                    v.ispr1 := fast_spr_num(SPR_CTR);
-                    v.ispro := fast_spr_num(SPR_CTR);
-                elsif f_in.insn(0) = '1' then
-                    v.ispro := fast_spr_num(SPR_LR);
-                end if;
-                if f_in.insn(10) = '0' then
-                    v.ispr2 := fast_spr_num(SPR_LR);
-                elsif f_in.insn(6) = '0' then
-                    v.ispr2 := fast_spr_num(SPR_CTR);
-                else
-                    v.ispr2 := fast_spr_num(SPR_TAR);
-                end if;
-            end if;
-
         when 24 =>
             -- ori, special-case the standard NOP
             if std_match(f_in.insn, "01100000000000000000000000000000") then
diff --git a/decode2.vhdl b/decode2.vhdl
index c76b7f5..928ec94 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -406,6 +406,7 @@ begin
         variable length : std_ulogic_vector(3 downto 0);
         variable op : insn_type_t;
         variable valid_in : std_ulogic;
+        variable decctr : std_ulogic;
     begin
         v := dc2;
 
@@ -470,17 +471,45 @@ begin
             end if;
             op := d_in.decode.insn_type;
 
+            -- Does this instruction decrement CTR?
+            -- bc, bclr, bctar with BO(2) = 0 do, but not bcctr.
+            decctr := '0';
+            if d_in.insn(23) = '0' and
+                (op = OP_BC or
+                 (op = OP_BCREG and not (d_in.insn(10) = '1' and d_in.insn(6) = '0'))) then
+                decctr := '1';
+            end if;
+            v.e.dec_ctr := decctr;
+
             v.repeat := d_in.decode.repeat;
             if d_in.decode.repeat /= NONE then
                 v.e.repeat := '1';
-            elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then
-                -- bcl/bclrl/bctarl that needs to write both CTR and LR has to be doubled
-                v.e.repeat := '1';
             end if;
 
             v.e.spr_select := d_in.spr_info;
 
+            if decctr = '1' then
+                -- read and write CTR
+                v.e.ramspr_odd_rdaddr := RAMSPR_CTR;
+                v.e.ramspr_wraddr := RAMSPR_CTR;
+                v.e.ramspr_write_odd := '1';
+            end if;
+            if v.e.lr = '1' then
+                -- write LR
+                v.e.ramspr_wraddr := RAMSPR_LR;
+                v.e.ramspr_write_even := '1';
+            end if;
+
             case op is
+                when OP_BCREG =>
+                    if d_in.insn(10) = '0' then
+                        v.e.ramspr_even_rdaddr := RAMSPR_LR;
+                    elsif d_in.insn(6) = '0' then
+                        v.e.ramspr_odd_rdaddr := RAMSPR_CTR;
+                        v.e.ramspr_rd_odd := '1';
+                    else
+                        v.e.ramspr_even_rdaddr := RAMSPR_TAR;
+                    end if;
                 when OP_MFSPR =>
                     v.e.ramspr_even_rdaddr := d_in.ram_spr.index;
                     v.e.ramspr_odd_rdaddr := d_in.ram_spr.index;
@@ -520,7 +549,6 @@ begin
             v.e.write_reg := decoded_reg_o.reg;
             v.e.write_reg_enable := decoded_reg_o.reg_valid;
             v.e.invert_a := d_in.decode.invert_a;
-            v.e.addm1 := '0';
             v.e.insn_type := op;
             v.e.invert_out := d_in.decode.invert_out;
             v.e.input_carry := d_in.decode.input_carry;
@@ -536,14 +564,6 @@ begin
             v.e.br_pred := d_in.br_pred;
             v.e.result_sel := result_select(op);
             v.e.sub_select := subresult_select(op);
-            if op = OP_BC or op = OP_BCREG then
-                if d_in.insn(23) = '0' and
-                    not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then
-                    -- decrement CTR if BO(2) = 0 and not bcctr
-                    v.e.addm1 := '1';
-                    v.e.result_sel := "000";        -- select adder output
-                end if;
-            end if;
             if op = OP_MFSPR then
                 if is_fast_spr(d_in.ispr1) = '1' then
                     v.e.result_sel := "000";        -- adder_result, effectively a_in
@@ -562,16 +582,9 @@ begin
             -- dc2.busy = 1 and dc2.e.valid = 1, thus this must be a repeated instruction.
             -- Set up for the second iteration (if deferred = 1 this will all be ignored)
             v.e.second := '1';
-            case dc2.repeat is
-                when DUPD =>
-                    -- update-form loads, 2nd instruction writes RA
-                    v.e.write_reg := dc2.e.read_reg1;
-                when NONE =>
-                    -- bcl/bclrl/bctarl that needs to write both CTR and LR
-                    v.e.write_reg(0) := '0';    -- point to LR
-                    v.e.result_sel := "110";    -- select NIA (to go to LR)
-                when others =>
-            end case;
+            -- DUPD is the only possibility here:
+            -- update-form loads, 2nd instruction writes RA
+            v.e.write_reg := dc2.e.read_reg1;
         end if;
 
         -- issue control
diff --git a/execute1.vhdl b/execute1.vhdl
index b0b2f98..5ee830b 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -12,6 +12,7 @@ use work.ppc_fx_insns.all;
 
 entity execute1 is
     generic (
+        SIM : boolean := false;
         EX1_BYPASS : boolean := true;
         HAS_FPU : boolean := true;
         HAS_SHORT_MULT : boolean := false;
@@ -54,6 +55,10 @@ entity execute1 is
         dc_events    : in DcacheEventType;
         ic_events    : in IcacheEventType;
 
+        -- debug
+        sim_dump      : in std_ulogic;
+        sim_dump_done : out std_ulogic;
+
         log_out : out std_ulogic_vector(14 downto 0);
         log_rd_addr : out std_ulogic_vector(31 downto 0);
         log_rd_data : in std_ulogic_vector(63 downto 0);
@@ -92,10 +97,12 @@ architecture behaviour of execute1 is
         fp_intr : std_ulogic;
         res2_sel : std_ulogic_vector(1 downto 0);
         bypass_valid : std_ulogic;
+        ramspr_odd_data : std_ulogic_vector(63 downto 0);
     end record;
     constant actions_type_init : actions_type :=
         (e => Execute1ToWritebackInit, se => side_effect_init,
-         new_msr => (others => '0'), res2_sel => "00", others => '0');
+         new_msr => (others => '0'), res2_sel => "00",
+         ramspr_odd_data => 64x"0", others => '0');
 
     type reg_stage1_type is record
 	e : Execute1ToWritebackType;
@@ -104,7 +111,6 @@ architecture behaviour of execute1 is
         fp_exception_next : std_ulogic;
         trace_next : std_ulogic;
         prev_op : insn_type_t;
-        br_taken : std_ulogic;
         oe : std_ulogic;
         mul_select : std_ulogic_vector(1 downto 0);
         res2_sel : std_ulogic_vector(1 downto 0);
@@ -122,11 +128,12 @@ architecture behaviour of execute1 is
         xerc : xer_common_t;
         xerc_valid : std_ulogic;
         ramspr_wraddr : ramspr_index;
+        ramspr_odd_data : std_ulogic_vector(63 downto 0);
     end record;
     constant reg_stage1_type_init : reg_stage1_type :=
         (e => Execute1ToWritebackInit, se => side_effect_init,
          busy => '0',
-         fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, br_taken => '0',
+         fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL,
          oe => '0', mul_select => "00", res2_sel => "00",
          spr_select => spr_id_init, pmu_spr_num => 5x"0",
          mul_in_progress => '0', mul_finish => '0', div_in_progress => '0',
@@ -134,7 +141,7 @@ architecture behaviour of execute1 is
          taken_branch_event => '0', br_mispredict => '0',
          msr => 64x"0",
          xerc => xerc_init, xerc_valid => '0',
-         ramspr_wraddr => 0);
+         ramspr_wraddr => 0, ramspr_odd_data => 64x"0");
 
     type reg_stage2_type is record
 	e : Execute1ToWritebackType;
@@ -514,7 +521,7 @@ begin
             odd_wr_data := intr_srr1(ctrl.msr, interrupt_in.srr1);
         else
             even_wr_data := ex1.e.write_data;
-            odd_wr_data := ex1.e.write_data;
+            odd_wr_data := ex1.ramspr_odd_data;
         end if;
         ramspr_wr_addr <= wr_addr;
         ramspr_even_wr_data <= even_wr_data;
@@ -531,7 +538,7 @@ begin
             ramspr_even <= even_rd_data;
         end if;
         if ex1.se.ramspr_write_odd = '1' and e_in.ramspr_odd_rdaddr = ex1.ramspr_wraddr then
-            ramspr_odd <= ex1.e.write_data;
+            ramspr_odd <= ex1.ramspr_odd_data;
         else
             ramspr_odd <= odd_rd_data;
         end if;
@@ -600,7 +607,6 @@ begin
     -- Data path for integer instructions (first execute stage)
     execute1_dp: process(all)
 	variable a_inv : std_ulogic_vector(63 downto 0);
-	variable b_or_m1 : std_ulogic_vector(63 downto 0);
 	variable sum_with_carry : std_ulogic_vector(64 downto 0);
         variable sign1, sign2 : std_ulogic;
         variable abs1, abs2 : signed(63 downto 0);
@@ -635,12 +641,7 @@ begin
         else
             a_inv := not a_in;
         end if;
-        if e_in.addm1 = '0' then
-            b_or_m1 := b_in;
-        else
-            b_or_m1 := (others => '1');
-        end if;
-        sum_with_carry := ppc_adde(a_inv, b_or_m1,
+        sum_with_carry := ppc_adde(a_inv, b_in,
                                    decode_input_carry(e_in.input_carry, xerc_in));
         adder_result <= sum_with_carry(63 downto 0);
         carry_32 <= sum_with_carry(32) xor a_inv(32) xor b_in(32);
@@ -956,6 +957,10 @@ begin
 
         v.se.ramspr_write_even := e_in.ramspr_write_even;
         v.se.ramspr_write_odd := e_in.ramspr_write_odd;
+        v.ramspr_odd_data := c_in;
+        if e_in.dec_ctr = '1' then
+            v.ramspr_odd_data := std_ulogic_vector(unsigned(ramspr_odd) - 1);
+        end if;
 
         -- Note the difference between v.exception and v.trap:
         -- v.exception signals a condition that prevents execution of the
@@ -1059,61 +1064,42 @@ begin
                 end if;
                 v.se.write_cfar := '1';
             when OP_BC =>
-                -- read_data1 is CTR
-                -- If this instruction updates both CTR and LR, then it is
-                -- doubled; the first instruction decrements CTR and determines
-                -- whether the branch is taken, and the second does the
-                -- redirect and the LR update.
+                -- If CTR is being decremented, it is in ramspr_odd.
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
-                if e_in.second = '0' then
-                    v.take_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
-                else
-                    v.take_branch := ex1.br_taken;
-                end if;
+                v.take_branch := ppc_bc_taken(bo, bi, cr_in, ramspr_odd);
                 if v.take_branch = '1' then
                     v.e.br_offset := b_in;
                     v.e.abs_br := insn_aa(e_in.insn);
                 end if;
-                if e_in.repeat = '0' or e_in.second = '1' then
-                    -- Mispredicted branches cause a redirect
-                    if v.take_branch /= e_in.br_pred then
-                        v.e.redirect := '1';
-                    end if;
-                    v.direct_branch := '1';
-                    v.e.br_last := '1';
-                    v.e.br_taken := v.take_branch;
-                    if ex1.msr(MSR_BE) = '1' then
-                        v.do_trace := '1';
-                    end if;
-                    v.se.write_cfar := v.take_branch;
+                -- Mispredicted branches cause a redirect
+                if v.take_branch /= e_in.br_pred then
+                    v.e.redirect := '1';
+                end if;
+                v.direct_branch := '1';
+                v.e.br_last := '1';
+                v.e.br_taken := v.take_branch;
+                if ex1.msr(MSR_BE) = '1' then
+                    v.do_trace := '1';
                 end if;
+                v.se.write_cfar := v.take_branch;
             when OP_BCREG =>
-                -- read_data1 is CTR, read_data2 is target register (CTR, LR or TAR)
-                -- If this instruction updates both CTR and LR, then it is
-                -- doubled; the first instruction decrements CTR and determines
-                -- whether the branch is taken, and the second does the
-                -- redirect and the LR update.
+                -- If CTR is being decremented, it is in ramspr_odd.
+                -- The target address is in ramspr_result (LR, CTR or TAR).
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
-                if e_in.second = '0' then
-                    v.take_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
-                else
-                    v.take_branch := ex1.br_taken;
-                end if;
+                v.take_branch := ppc_bc_taken(bo, bi, cr_in, ramspr_odd);
                 if v.take_branch = '1' then
-                    v.e.br_offset := b_in;
+                    v.e.br_offset := ramspr_result;
                     v.e.abs_br := '1';
                 end if;
-                if e_in.repeat = '0' or e_in.second = '1' then
-                    -- Indirect branches are never predicted taken
-                    v.e.redirect := v.take_branch;
-                    v.e.br_taken := v.take_branch;
-                    if ex1.msr(MSR_BE) = '1' then
-                        v.do_trace := '1';
-                    end if;
-                    v.se.write_cfar := v.take_branch;
+                -- Indirect branches are never predicted taken
+                v.e.redirect := v.take_branch;
+                v.e.br_taken := v.take_branch;
+                if ex1.msr(MSR_BE) = '1' then
+                    v.do_trace := '1';
                 end if;
+                v.se.write_cfar := v.take_branch;
 
 	    when OP_RFID =>
                 srr1 := ramspr_odd;
@@ -1130,7 +1116,7 @@ begin
                     v.new_msr(MSR_DR) := '1';
                 end if;
                 v.se.write_msr := '1';
-                v.e.br_offset := ramspr_even;
+                v.e.br_offset := ramspr_result;
                 v.e.abs_br := '1';
                 v.e.redirect := '1';
                 v.se.write_cfar := '1';
@@ -1343,6 +1329,7 @@ begin
             v.mul_select := e_in.sub_select(1 downto 0);
             v.se := side_effect_init;
             v.ramspr_wraddr := e_in.ramspr_wraddr;
+            v.ramspr_odd_data := actions.ramspr_odd_data;
         end if;
 
         lv := Execute1ToLoadstore1Init;
@@ -1430,7 +1417,6 @@ begin
             v.e.valid := actions.complete;
             bypass_valid := actions.bypass_valid;
             v.taken_branch_event := actions.take_branch;
-            v.br_taken := actions.take_branch;
             v.trace_next := actions.do_trace;
             v.fp_exception_next := actions.fp_intr;
             v.res2_sel := actions.res2_sel;
@@ -1759,6 +1745,25 @@ begin
         exception_log <= v.e.interrupt;
     end process;
 
+    sim_dump_test: if SIM generate
+        dump_exregs: process(all)
+            variable xer : std_ulogic_vector(63 downto 0);
+        begin
+            if sim_dump = '1' then
+                report "LR " & to_hstring(even_sprs(RAMSPR_LR));
+                report "CTR " & to_hstring(odd_sprs(RAMSPR_CTR));
+                sim_dump_done <= '1';
+            else
+                sim_dump_done <= '0';
+            end if;
+        end process;
+    end generate;
+
+    -- Keep GHDL synthesis happy
+    sim_dump_test_synth: if not SIM generate
+        sim_dump_done <= '0';
+    end generate;
+
     e1_log: if LOG_LENGTH > 0 generate
         signal log_data : std_ulogic_vector(14 downto 0);
     begin
diff --git a/register_file.vhdl b/register_file.vhdl
index 0235dfc..ed856cb 100644
--- a/register_file.vhdl
+++ b/register_file.vhdl
@@ -130,9 +130,6 @@ begin
                 loop_0: for i in 0 to 31 loop
                     report "GPR" & integer'image(i) & " " & to_hstring(registers(i));
                 end loop loop_0;
-
-                report "LR " & to_hstring(registers(to_integer(unsigned(fast_spr_num(SPR_LR)))));
-                report "CTR " & to_hstring(registers(to_integer(unsigned(fast_spr_num(SPR_CTR)))));
                 sim_dump_done <= '1';
             else
                 sim_dump_done <= '0';

From fdb3ef6874fb34e67e8d6f136440378c706069e9 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 19 Feb 2022 19:03:49 +1100
Subject: [PATCH 24/30] Finish off taking SPRs out of register file

With this, the register file now contains 64 entries, for 32 GPRs and
32 FPRs, rather than the 128 it had previously.  Several things get
simplified - decode1 no longer has to work out the ispr{1,2,o} values,
decode_input_reg_{a,b} and decode_output_reg no longer have the
t = SPR case, etc.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl        | 45 ++++++-------------------------------------
 decode1.vhdl       | 10 +++-------
 decode2.vhdl       | 48 ++++++++++------------------------------------
 decode_types.vhdl  |  6 +++---
 execute1.vhdl      |  5 ++---
 loadstore1.vhdl    |  2 +-
 logical.vhdl       |  2 +-
 register_file.vhdl | 26 +++++++++++--------------
 8 files changed, 37 insertions(+), 107 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 7df451b..06b62e0 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -86,30 +86,19 @@ package common is
     -- GPR indices in the register file (GPR only)
     subtype gpr_index_t is std_ulogic_vector(4 downto 0);
 
-    -- Extended GPR index (can hold an SPR or a FPR)
-    subtype gspr_index_t is std_ulogic_vector(6 downto 0);
+    -- Extended GPR index (can hold a GPR or a FPR)
+    subtype gspr_index_t is std_ulogic_vector(5 downto 0);
 
     -- FPR indices
     subtype fpr_index_t is std_ulogic_vector(4 downto 0);
 
-    -- Some SPRs are stored in the register file, they use the magic
-    -- GPR numbers above 31.
+    -- FPRs are stored in the register file, using GSPR
+    -- numbers from 32 to 63.
     --
-    -- The function fast_spr_num() returns the corresponding fast
-    -- pseudo-GPR number for a given SPR number. The result MSB
-    -- indicates if this is indeed a fast SPR. If clear, then
-    -- the SPR is not stored in the GPR file.
-    --
-    -- FPRs are also stored in the register file, using GSPR
-    -- numbers from 64 to 95.
-    --
-    function fast_spr_num(spr: spr_num_t) return gspr_index_t;
 
     -- Indices conversion functions
     function gspr_to_gpr(i: gspr_index_t) return gpr_index_t;
     function gpr_to_gspr(i: gpr_index_t) return gspr_index_t;
-    function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t;
-    function is_fast_spr(s: gspr_index_t) return std_ulogic;
     function fpr_to_gspr(f: fpr_index_t) return gspr_index_t;
 
     -- The XER is split: the common bits (CA, OV, SO, OV32 and CA32) are
@@ -271,9 +260,6 @@ package common is
 	stop_mark : std_ulogic;
 	nia: std_ulogic_vector(63 downto 0);
 	insn: std_ulogic_vector(31 downto 0);
-	ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr
-	ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR)
-	ispro: gspr_index_t; -- (G)SPR written with LR or CTR
 	decode: decode_rom_t;
         br_pred: std_ulogic; -- Branch was predicted to be taken
         big_endian: std_ulogic;
@@ -282,7 +268,6 @@ package common is
     end record;
     constant Decode1ToDecode2Init : Decode1ToDecode2Type :=
         (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'),
-         ispr1 => (others => '0'), ispr2 => (others => '0'), ispro => (others => '0'),
          decode => decode_rom_init, br_pred => '0', big_endian => '0',
          spr_info => spr_id_init, ram_spr => ram_spr_info_init);
 
@@ -783,10 +768,6 @@ package body common is
     begin
 	return to_integer(unsigned(insn(15 downto 11) & insn(20 downto 16)));
     end;
-    function fast_spr_num(spr: spr_num_t) return gspr_index_t is
-    begin
-        return "0000000";
-    end;
 
     function gspr_to_gpr(i: gspr_index_t) return gpr_index_t is
     begin
@@ -795,26 +776,12 @@ package body common is
 
     function gpr_to_gspr(i: gpr_index_t) return gspr_index_t is
     begin
-	return "00" & i;
-    end;
-
-    function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t is
-    begin
-	if s(5) = '1' then
-	    return s;
-	else
-	    return gpr_to_gspr(g);
-	end if;
-    end;
-
-    function is_fast_spr(s: gspr_index_t) return std_ulogic is
-    begin
-	return s(5);
+	return "0" & i;
     end;
 
     function fpr_to_gspr(f: fpr_index_t) return gspr_index_t is
     begin
-        return "10" & f;
+        return "1" & f;
     end;
 
     function tag_match(tag1 : instr_tag_t; tag2 : instr_tag_t) return boolean is
diff --git a/decode1.vhdl b/decode1.vhdl
index b6cea31..af8cd6c 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -177,7 +177,7 @@ architecture behaviour of decode1 is
         -- addpcis
         2#001#    =>       (ALU, NONE, OP_ADD,       CIA,        CONST_DXHI4, NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         -- bclr, bcctr, bctar
-        2#100#    =>       (ALU, NONE, OP_BCREG,     NONE,       NONE,        NONE, SPR,  '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
+        2#100#    =>       (ALU, NONE, OP_BCREG,     NONE,       NONE,        NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
         -- isync
         2#111#    =>       (ALU, NONE, OP_ISYNC,     NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         -- rfid
@@ -329,7 +329,7 @@ architecture behaviour of decode1 is
         2#1001000000#  =>       (ALU,  NONE, OP_MCRXRX,    NONE,       NONE,        NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mcrxrx
         2#0000010011#  =>       (ALU,  NONE, OP_MFCR,      NONE,       NONE,        NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mfcr/mfocrf
         2#0001010011#  =>       (ALU,  NONE, OP_MFMSR,     NONE,       NONE,        NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- mfmsr
-        2#0101010011#  =>       (ALU,  NONE, OP_MFSPR,     SPR,        NONE,        RS,   RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mfspr
+        2#0101010011#  =>       (ALU,  NONE, OP_MFSPR,     NONE,       NONE,        RS,   RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mfspr
         2#0100001001#  =>       (DVU,  NONE, OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- modud
         2#0100001011#  =>       (DVU,  NONE, OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- moduw
         2#1100001001#  =>       (DVU,  NONE, OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- modsd
@@ -337,7 +337,7 @@ architecture behaviour of decode1 is
         2#0010010000#  =>       (ALU,  NONE, OP_MTCRF,     NONE,       NONE,        RS,   NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtcrf/mtocrf
         2#0010010010#  =>       (ALU,  NONE, OP_MTMSRD,    NONE,       NONE,        RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- mtmsr
         2#0010110010#  =>       (ALU,  NONE, OP_MTMSRD,    NONE,       NONE,        RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtmsrd # ignore top bits and d
-        2#0111010011#  =>       (ALU,  NONE, OP_MTSPR,     NONE,       NONE,        RS,   SPR,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtspr
+        2#0111010011#  =>       (ALU,  NONE, OP_MTSPR,     NONE,       NONE,        RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtspr
         2#0001001001#  =>       (ALU,  NONE, OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- mulhd
         2#0000001001#  =>       (ALU,  NONE, OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- mulhdu
         2#0001001011#  =>       (ALU,  NONE, OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- mulhw
@@ -670,10 +670,6 @@ begin
             -- major opcode 31, lots of things
             v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1))));
 
-            -- Work out ispr1/ispro independent of v.decode since they seem to be critical path
-            v.ispr1 := fast_spr_num(sprn);
-            v.ispro := fast_spr_num(sprn);
-
             if std_match(f_in.insn(10 downto 1), "01-1010011") then
                 -- mfspr or mtspr
                 -- Make mtspr to slow SPRs single issue
diff --git a/decode2.vhdl b/decode2.vhdl
index 928ec94..5a8c2b7 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -82,21 +82,11 @@ architecture behaviour of decode2 is
     constant decode_output_reg_init : decode_output_reg_t := ('0', (others => '0'));
 
     function decode_input_reg_a (t : input_reg_a_t; insn_in : std_ulogic_vector(31 downto 0);
-                                 ispr : gspr_index_t;
                                  instr_addr : std_ulogic_vector(63 downto 0))
         return decode_input_reg_t is
     begin
         if t = RA or (t = RA_OR_ZERO and insn_ra(insn_in) /= "00000") then
             return ('1', gpr_to_gspr(insn_ra(insn_in)), (others => '0'));
-        elsif t = SPR then
-            -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR.
-            -- If it's all 0, we don't treat it as a dependency as slow SPRs
-            -- operations are single issue.
-            --
-            assert is_fast_spr(ispr) =  '1' or ispr = "0000000"
-                report "Decode A says SPR but ISPR is invalid:" &
-                to_hstring(ispr) severity failure;
-            return (is_fast_spr(ispr), ispr, (others => '0'));
         elsif t = CIA then
             return ('0', (others => '0'), instr_addr);
         elsif HAS_FPU and t = FRA then
@@ -106,8 +96,8 @@ architecture behaviour of decode2 is
         end if;
     end;
 
-    function decode_input_reg_b (t : input_reg_b_t; insn_in : std_ulogic_vector(31 downto 0);
-                                 ispr : gspr_index_t) return decode_input_reg_t is
+    function decode_input_reg_b (t : input_reg_b_t; insn_in : std_ulogic_vector(31 downto 0))
+        return decode_input_reg_t is
         variable ret : decode_input_reg_t;
     begin
         case t is
@@ -143,14 +133,6 @@ architecture behaviour of decode2 is
                 ret := ('0', (others => '0'), x"00000000000000" & "00" & insn_in(1) & insn_in(15 downto 11));
             when CONST_SH32 =>
                 ret := ('0', (others => '0'), x"00000000000000" & "000" & insn_in(15 downto 11));
-            when SPR =>
-                -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR.
-                -- If it's all 0, we don't treat it as a dependency as slow SPRs
-                -- operations are single issue.
-                assert is_fast_spr(ispr) = '1' or ispr = "0000000"
-                    report "Decode B says SPR but ISPR is invalid:" &
-                    to_hstring(ispr) severity failure;
-                ret := (is_fast_spr(ispr), ispr, (others => '0'));
             when NONE =>
                 ret := ('0', (others => '0'), (others => '0'));
         end case;
@@ -183,8 +165,8 @@ architecture behaviour of decode2 is
         end case;
     end;
 
-    function decode_output_reg (t : output_reg_a_t; insn_in : std_ulogic_vector(31 downto 0);
-                                ispr : gspr_index_t) return decode_output_reg_t is
+    function decode_output_reg (t : output_reg_a_t; insn_in : std_ulogic_vector(31 downto 0))
+        return decode_output_reg_t is
     begin
         case t is
             when RT =>
@@ -195,18 +177,10 @@ architecture behaviour of decode2 is
                 if HAS_FPU then
                     return ('1', fpr_to_gspr(insn_frt(insn_in)));
                 else
-                    return ('0', "0000000");
+                    return ('0', "000000");
                 end if;
-            when SPR =>
-                -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR.
-                -- If it's all 0, we don't treat it as a dependency as slow SPRs
-                -- operations are single issue.
-                assert is_fast_spr(ispr) = '1' or ispr = "0000000"
-                    report "Decode B says SPR but ISPR is invalid:" &
-                    to_hstring(ispr) severity failure;
-                return (is_fast_spr(ispr), ispr);
             when NONE =>
-                return ('0', "0000000");
+                return ('0', "000000");
         end case;
     end;
 
@@ -386,10 +360,10 @@ begin
         decoded_reg_c <= decode_input_reg_init;
         decoded_reg_o <= decode_output_reg_init;
         if d_in.valid = '1' then
-            decoded_reg_a <= decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, d_in.ispr1, d_in.nia);
-            decoded_reg_b <= decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, d_in.ispr2);
+            decoded_reg_a <= decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, d_in.nia);
+            decoded_reg_b <= decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn);
             decoded_reg_c <= decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn);
-            decoded_reg_o <= decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispro);
+            decoded_reg_o <= decode_output_reg (d_in.decode.output_reg_a, d_in.insn);
         end if;
 
         r_out.read1_enable <= decoded_reg_a.reg_valid;
@@ -565,9 +539,7 @@ begin
             v.e.result_sel := result_select(op);
             v.e.sub_select := subresult_select(op);
             if op = OP_MFSPR then
-                if is_fast_spr(d_in.ispr1) = '1' then
-                    v.e.result_sel := "000";        -- adder_result, effectively a_in
-                elsif d_in.ram_spr.valid = '1' then
+                if d_in.ram_spr.valid = '1' then
                     v.e.result_sel := "101";        -- ramspr_result
                 elsif d_in.spr_info.valid = '0' then
                     -- Privileged mfspr to invalid/unimplemented SPR numbers
diff --git a/decode_types.vhdl b/decode_types.vhdl
index 514bc08..9ee329d 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -22,11 +22,11 @@ package decode_types is
                          OP_BCD, OP_ADDG6S,
                          OP_FETCH_FAILED
 			 );
-    type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA, FRA);
+    type input_reg_a_t is (NONE, RA, RA_OR_ZERO, CIA, FRA);
     type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD,
-                           CONST_DXHI4, CONST_DS, CONST_DQ, CONST_M1, CONST_SH, CONST_SH32, SPR, FRB);
+                           CONST_DXHI4, CONST_DS, CONST_DQ, CONST_M1, CONST_SH, CONST_SH32, FRB);
     type input_reg_c_t is (NONE, RS, RCR, FRC, FRS);
-    type output_reg_a_t is (NONE, RT, RA, SPR, FRT);
+    type output_reg_a_t is (NONE, RT, RA, FRT);
     type rc_t is (NONE, ONE, RC);
     type carry_in_t is (ZERO, CA, OV, ONE);
 
diff --git a/execute1.vhdl b/execute1.vhdl
index 5ee830b..dc68806 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -1135,7 +1135,7 @@ begin
             when OP_DARN =>
 	    when OP_MFMSR =>
 	    when OP_MFSPR =>
-		if is_fast_spr(e_in.read_reg1) = '1' or e_in.spr_is_ram = '1' then
+		if e_in.spr_is_ram = '1' then
                     if e_in.valid = '1' then
                         report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
                             "=" & to_hstring(alu_result);
@@ -1216,8 +1216,7 @@ begin
                         when others =>
                     end case;
                 end if;
-		if e_in.spr_select.valid = '0' and is_fast_spr(e_in.write_reg) = '0' and
-                    e_in.spr_is_ram = '0' then
+		if e_in.spr_select.valid = '0' and e_in.spr_is_ram = '0' then
                     -- mtspr to unimplemented SPRs should be a nop in
                     -- supervisor mode and a program interrupt for user mode
                     if ex1.msr(MSR_PR) = '1' then
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index b556211..9dab15b 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -97,7 +97,7 @@ architecture behave of loadstore1 is
                                           mode_32bit => '0', addr => (others => '0'),
                                           byte_sel => x"00", second_bytes => x"00",
                                           store_data => (others => '0'), instr_tag => instr_tag_init,
-                                          write_reg => 7x"00", length => x"0",
+                                          write_reg => 6x"00", length => x"0",
                                           elt_length => x"0", byte_reverse => '0', brev_mask => "000",
                                           sign_extend => '0', update => '0',
                                           xerc => xerc_init, reserve => '0',
diff --git a/logical.vhdl b/logical.vhdl
index 60309ac..77ef29c 100644
--- a/logical.vhdl
+++ b/logical.vhdl
@@ -167,7 +167,7 @@ begin
 		end if;
 		tmp(7 downto 0) := rs(7 downto 0);
             when others =>
-                -- e.g. OP_MTSPR
+                -- e.g. OP_MFSPR
                 tmp := rs;
         end case;
 
diff --git a/register_file.vhdl b/register_file.vhdl
index ed856cb..dcce0a4 100644
--- a/register_file.vhdl
+++ b/register_file.vhdl
@@ -34,7 +34,7 @@ entity register_file is
 end entity register_file;
 
 architecture behaviour of register_file is
-    type regfile is array(0 to 127) of std_ulogic_vector(63 downto 0);
+    type regfile is array(0 to 63) of std_ulogic_vector(63 downto 0);
     signal registers : regfile := (others => (others => '0'));
     signal rd_port_b : std_ulogic_vector(63 downto 0);
     signal dbg_data : std_ulogic_vector(63 downto 0);
@@ -47,15 +47,11 @@ begin
         if rising_edge(clk) then
             if w_in.write_enable = '1' then
                 w_addr := w_in.write_reg;
-                if HAS_FPU and w_addr(6) = '1' then
+                if HAS_FPU and w_addr(5) = '1' then
                     report "Writing FPR " & to_hstring(w_addr(4 downto 0)) & " " & to_hstring(w_in.write_data);
                 else
-                    w_addr(6) := '0';
-                    if w_addr(5) = '0' then
-                        report "Writing GPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data);
-                    else
-                        report "Writing GSPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data);
-                    end if;
+                    w_addr(5) := '0';
+                    report "Writing GPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data);
                 end if;
                 assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure;
                 registers(to_integer(unsigned(w_addr))) <= w_in.write_data;
@@ -73,11 +69,11 @@ begin
         c_addr := d_in.read3_reg;
         w_addr := w_in.write_reg;
         if not HAS_FPU then
-            -- Make it obvious that we only want 64 GSPRs for a no-FPU implementation
-            a_addr(6) := '0';
-            b_addr(6) := '0';
-            c_addr(6) := '0';
-            w_addr(6) := '0';
+            -- Make it obvious that we only want 32 GSPRs for a no-FPU implementation
+            a_addr(5) := '0';
+            b_addr(5) := '0';
+            c_addr(5) := '0';
+            w_addr(5) := '0';
         end if;
         if d_in.read1_enable = '1' then
             report "Reading GPR " & to_hstring(a_addr) & " " & to_hstring(registers(to_integer(unsigned(a_addr))));
@@ -93,7 +89,7 @@ begin
         if d_in.read2_enable = '0' and dbg_gpr_req = '1' and dbg_ack = '0' then
             b_addr := dbg_gpr_addr;
             if not HAS_FPU then
-                b_addr(6) := '0';
+                b_addr(5) := '0';
             end if;
         end if;
         rd_port_b <= registers(to_integer(unsigned(b_addr)));
@@ -150,7 +146,7 @@ begin
             if rising_edge(clk) then
                 log_data <= w_in.write_data &
                             w_in.write_enable &
-                            w_in.write_reg;
+                            '0' & w_in.write_reg;
             end if;
         end process;
         log_out <= log_data;

From d0f319290fd22724a06b6db628aa7ee3458ca1bc Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 25 Feb 2022 16:46:34 +1100
Subject: [PATCH 25/30] Restore debug access to SPRs

This provides access to the SPRs via the JTAG DMI interface.  For now
they are still accessed as if they were GPR/FPRs using the same
numbering as before (GPRs at 0 - 0x1f, SPRs at 0x20 - 0x2d, FPRs at
0x40 - 0x5f).

For XER, debug reads now report the full value, not just the bits that
were previously stored in the register file.  The "slow" SPR mux is
not used for debug reads.

Decode2 determines on each cycle whether a debug SPR access will
happen next cycle, based on whether there is a request and whether the
current instruction reads the SPR RAM.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl                 |  2 +
 core.vhdl                   | 14 +++++++
 core_debug.vhdl             | 84 +++++++++++++++++++++++++++++++++----
 decode2.vhdl                | 29 ++++++++++++-
 execute1.vhdl               | 24 +++++++++++
 scripts/mw_debug/mw_debug.c |  2 +-
 6 files changed, 144 insertions(+), 11 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 06b62e0..d743c2d 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -337,6 +337,7 @@ package common is
         ramspr_wraddr      : ramspr_index;
         ramspr_write_even  : std_ulogic;
         ramspr_write_odd   : std_ulogic;
+        dbg_spr_access : std_ulogic;
         dec_ctr : std_ulogic;
     end record;
     constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
@@ -354,6 +355,7 @@ package common is
          spr_is_ram => '0',
          ramspr_even_rdaddr => 0, ramspr_odd_rdaddr => 0, ramspr_rd_odd => '0',
          ramspr_wraddr => 0, ramspr_write_even => '0', ramspr_write_odd => '0',
+         dbg_spr_access => '0',
          dec_ctr => '0',
          others => (others => '0'));
 
diff --git a/core.vhdl b/core.vhdl
index 82c66b4..a91b729 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -150,6 +150,10 @@ architecture behave of core is
     signal dbg_gpr_ack : std_ulogic;
     signal dbg_gpr_addr : gspr_index_t;
     signal dbg_gpr_data : std_ulogic_vector(63 downto 0);
+    signal dbg_spr_req : std_ulogic;
+    signal dbg_spr_ack : std_ulogic;
+    signal dbg_spr_addr : std_ulogic_vector(7 downto 0);
+    signal dbg_spr_data : std_ulogic_vector(63 downto 0);
 
     signal ctrl_debug : ctrl_t;
 
@@ -307,6 +311,8 @@ begin
             execute2_bypass => execute2_bypass,
             execute2_cr_bypass => execute2_cr_bypass,
             writeback_bypass => writeback_bypass,
+            dbg_spr_req => dbg_spr_req,
+            dbg_spr_addr => dbg_spr_addr,
             log_out => log_data(119 downto 110)
             );
     decode2_busy_in <= ex1_busy_out;
@@ -378,6 +384,10 @@ begin
             dc_events => dcache_events,
             ic_events => icache_events,
             terminate_out => terminate,
+            dbg_spr_req => dbg_spr_req,
+            dbg_spr_ack => dbg_spr_ack,
+            dbg_spr_addr => dbg_spr_addr,
+            dbg_spr_data => dbg_spr_data,
             sim_dump => sim_ex_dump,
             sim_dump_done => sim_cr_dump,
             log_out => log_data(134 downto 120),
@@ -504,6 +514,10 @@ begin
             dbg_gpr_ack => dbg_gpr_ack,
             dbg_gpr_addr => dbg_gpr_addr,
             dbg_gpr_data => dbg_gpr_data,
+            dbg_spr_req => dbg_spr_req,
+            dbg_spr_ack => dbg_spr_ack,
+            dbg_spr_addr => dbg_spr_addr,
+            dbg_spr_data => dbg_spr_data,
             log_data => log_data,
             log_read_addr => log_rd_addr,
             log_read_data => log_rd_data,
diff --git a/core_debug.vhdl b/core_debug.vhdl
index ff99df4..a1d4a94 100644
--- a/core_debug.vhdl
+++ b/core_debug.vhdl
@@ -33,12 +33,18 @@ entity core_debug is
         nia             : in std_ulogic_vector(63 downto 0);
         msr             : in std_ulogic_vector(63 downto 0);
 
-        -- GSPR register read port
+        -- GPR/FPR register read port
         dbg_gpr_req     : out std_ulogic;
         dbg_gpr_ack     : in std_ulogic;
         dbg_gpr_addr    : out gspr_index_t;
         dbg_gpr_data    : in std_ulogic_vector(63 downto 0);
 
+        -- SPR register read port
+        dbg_spr_req     : out std_ulogic;
+        dbg_spr_ack     : in std_ulogic;
+        dbg_spr_addr    : out std_ulogic_vector(7 downto 0);
+        dbg_spr_data    : in std_ulogic_vector(63 downto 0);
+
         -- Core logging data
         log_data        : in std_ulogic_vector(255 downto 0);
         log_read_addr   : in std_ulogic_vector(31 downto 0);
@@ -105,7 +111,10 @@ architecture behave of core_debug is
     signal do_icreset   : std_ulogic;
     signal terminated   : std_ulogic;
     signal do_gspr_rd   : std_ulogic;
-    signal gspr_index   : gspr_index_t;
+    signal gspr_index   : std_ulogic_vector(7 downto 0);
+    signal gspr_data    : std_ulogic_vector(63 downto 0);
+
+    signal spr_index_valid : std_ulogic;
 
     signal log_dmi_addr        : std_ulogic_vector(31 downto 0) := (others => '0');
     signal log_dmi_data        : std_ulogic_vector(63 downto 0) := (others => '0');
@@ -119,9 +128,7 @@ architecture behave of core_debug is
 begin
        -- Single cycle register accesses on DMI except for GSPR data
     dmi_ack <= dmi_req when dmi_addr /= DBG_CORE_GSPR_DATA
-               else dbg_gpr_ack;
-    dbg_gpr_req <= dmi_req when dmi_addr = DBG_CORE_GSPR_DATA
-                   else '0';
+               else dbg_gpr_ack or dbg_spr_ack;
 
     -- Status register read composition
     stat_reg <= (2 => terminated,
@@ -129,12 +136,16 @@ begin
                  0 => stopping,
                  others => '0');
 
+    gspr_data <= dbg_gpr_data when gspr_index(5) = '0' else
+                 dbg_spr_data when spr_index_valid = '1' else
+                 (others => '0');
+
     -- DMI read data mux
     with dmi_addr select dmi_dout <=
         stat_reg        when DBG_CORE_STAT,
         nia             when DBG_CORE_NIA,
         msr             when DBG_CORE_MSR,
-        dbg_gpr_data    when DBG_CORE_GSPR_DATA,
+        gspr_data       when DBG_CORE_GSPR_DATA,
         log_write_addr & log_dmi_addr when DBG_CORE_LOG_ADDR,
         log_dmi_data    when DBG_CORE_LOG_DATA,
         log_dmi_trigger when DBG_CORE_LOG_TRIGGER,
@@ -191,7 +202,7 @@ begin
                                 terminated <= '0';
                             end if;
                         elsif dmi_addr = DBG_CORE_GSPR_INDEX then
-                            gspr_index <= dmi_din(gspr_index_t'left downto 0);
+                            gspr_index <= dmi_din(7 downto 0);
                         elsif dmi_addr = DBG_CORE_LOG_ADDR then
                             log_dmi_addr <= dmi_din(31 downto 0);
                             do_dmi_log_rd <= '1';
@@ -226,7 +237,64 @@ begin
         end if;
     end process;
 
-    dbg_gpr_addr <= gspr_index;
+    gspr_access: process(clk)
+        variable valid : std_ulogic;
+        variable sel : spr_selector;
+        variable isram : std_ulogic;
+        variable raddr : ramspr_index;
+        variable odd : std_ulogic;
+    begin
+        if rising_edge(clk) then
+            if rst = '1' or dmi_req = '0' or dmi_addr /= DBG_CORE_GSPR_DATA then
+                dbg_gpr_req <= '0';
+                dbg_spr_req <= '0';
+            else
+                dbg_gpr_req <= not gspr_index(5);
+                dbg_spr_req <= gspr_index(5);
+            end if;
+
+            -- Map 0 - 0x1f to GPRs, 0x20 - 0x3f to SPRs, and 0x40 - 0x5f to FPRs
+            dbg_gpr_addr <= gspr_index(6) & gspr_index(4 downto 0);
+
+            -- For SPRs, use the same mapping as when the fast SPRs were in the GPR file
+            valid := '1';
+            sel := "000";
+            isram := '1';
+            raddr := 0;
+            odd := '0';
+            case gspr_index(4 downto 0) is
+                when 5x"00" =>
+                    raddr := RAMSPR_LR;
+                when 5x"01" =>
+                    odd := '1';
+                    raddr := RAMSPR_CTR;
+                when 5x"02" | 5x"03" =>
+                    odd := gspr_index(0);
+                    raddr := RAMSPR_SRR0;
+                when 5x"04" | 5x"05" =>
+                    odd := gspr_index(0);
+                    raddr := RAMSPR_HSRR0;
+                when 5x"06" | 5x"07" =>
+                    odd := gspr_index(0);
+                    raddr := RAMSPR_SPRG0;
+                when 5x"08" | 5x"09" =>
+                    odd := gspr_index(0);
+                    raddr := RAMSPR_SPRG2;
+                when 5x"0a" | 5x"0b" =>
+                    odd := gspr_index(0);
+                    raddr := RAMSPR_HSPRG0;
+                when 5x"0c" =>
+                    isram := '0';
+                    sel := SPRSEL_XER;
+                when 5x"0d" =>
+                    raddr := RAMSPR_TAR;
+                when others =>
+                    valid := '0';
+            end case;
+            dbg_spr_addr <= isram & sel & std_ulogic_vector(to_unsigned(raddr, 3)) & odd;
+            spr_index_valid <= valid;
+        end if;
+    end process;
 
     -- Core control signals generated by the debug module
     core_stop <= stopping and not do_step;
diff --git a/decode2.vhdl b/decode2.vhdl
index 5a8c2b7..d91bec5 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -43,6 +43,10 @@ entity decode2 is
         execute2_cr_bypass : in cr_bypass_data_t;
         writeback_bypass  : in bypass_data_t;
 
+        -- Access to SPRs from core_debug module
+        dbg_spr_req  : in std_ulogic;
+        dbg_spr_addr : in std_ulogic_vector(7 downto 0);
+
         log_out : out std_ulogic_vector(9 downto 0)
 	);
 end entity decode2;
@@ -60,6 +64,7 @@ architecture behaviour of decode2 is
         reg_o_valid : std_ulogic;
         input_ov  : std_ulogic;
         output_ov : std_ulogic;
+        read_rspr : std_ulogic;
     end record;
     constant reg_type_init : reg_type :=
         (e => Decode2ToExecute1Init, repeat => NONE, others => '0');
@@ -347,6 +352,13 @@ begin
                         " tag=" & integer'image(dc2in.e.instr_tag.tag) & std_ulogic'image(dc2in.e.instr_tag.valid);
                 end if;
                 dc2 <= dc2in;
+            elsif dc2.read_rspr = '0' then
+                -- Update debug SPR access signals even when stalled
+                -- if the instruction in dc2.e doesn't read any SPRs.
+                dc2.e.dbg_spr_access <= dc2in.e.dbg_spr_access;
+                dc2.e.ramspr_even_rdaddr <= dc2in.e.ramspr_even_rdaddr;
+                dc2.e.ramspr_odd_rdaddr <= dc2in.e.ramspr_odd_rdaddr;
+                dc2.e.ramspr_rd_odd <= dc2in.e.ramspr_rd_odd;
             end if;
         end if;
     end process;
@@ -381,6 +393,7 @@ begin
         variable op : insn_type_t;
         variable valid_in : std_ulogic;
         variable decctr : std_ulogic;
+        variable sprs_busy : std_ulogic;
     begin
         v := dc2;
 
@@ -389,6 +402,8 @@ begin
         if dc2.busy = '0' then
             v.e := Decode2ToExecute1Init;
 
+            sprs_busy := '0';
+
             if d_in.valid = '1' then
                 v.prev_sgl := dc2.sgl_pipe;
                 v.sgl_pipe := d_in.decode.sgl_pipe;
@@ -467,6 +482,7 @@ begin
                 v.e.ramspr_odd_rdaddr := RAMSPR_CTR;
                 v.e.ramspr_wraddr := RAMSPR_CTR;
                 v.e.ramspr_write_odd := '1';
+                sprs_busy := '1';
             end if;
             if v.e.lr = '1' then
                 -- write LR
@@ -484,11 +500,13 @@ begin
                     else
                         v.e.ramspr_even_rdaddr := RAMSPR_TAR;
                     end if;
+                    sprs_busy := '1';
                 when OP_MFSPR =>
                     v.e.ramspr_even_rdaddr := d_in.ram_spr.index;
                     v.e.ramspr_odd_rdaddr := d_in.ram_spr.index;
                     v.e.ramspr_rd_odd := d_in.ram_spr.isodd;
                     v.e.spr_is_ram := d_in.ram_spr.valid;
+                    sprs_busy := d_in.ram_spr.valid;
                 when OP_MTSPR =>
                     v.e.ramspr_wraddr := d_in.ram_spr.index;
                     v.e.ramspr_write_even := d_in.ram_spr.valid and not d_in.ram_spr.isodd;
@@ -497,8 +515,10 @@ begin
                 when OP_RFID =>
                     v.e.ramspr_even_rdaddr := RAMSPR_SRR0;
                     v.e.ramspr_odd_rdaddr := RAMSPR_SRR1;
+                    sprs_busy := '1';
                 when others =>
             end case;
+            v.read_rspr := sprs_busy and d_in.valid;
 
             case d_in.decode.length is
                 when is1B =>
@@ -545,8 +565,6 @@ begin
                     -- Privileged mfspr to invalid/unimplemented SPR numbers
                     -- writes the contents of RT back to RT (i.e. it's a no-op)
                     v.e.result_sel := "001";        -- logical_result
-                elsif d_in.spr_info.ispmu = '1' then
-                    v.e.result_sel := "100";        -- pmuspr_result
                 end if;
             end if;
 
@@ -649,6 +667,13 @@ begin
 
         stall_out <= dc2.busy or deferred;
 
+        v.e.dbg_spr_access := dbg_spr_req and not v.read_rspr;
+        if v.e.dbg_spr_access = '1' then
+            v.e.ramspr_even_rdaddr := to_integer(unsigned(dbg_spr_addr(3 downto 1)));
+            v.e.ramspr_odd_rdaddr := to_integer(unsigned(dbg_spr_addr(3 downto 1)));
+            v.e.ramspr_rd_odd := dbg_spr_addr(0);
+        end if;
+
         -- Update registers
         dc2in <= v;
 
diff --git a/execute1.vhdl b/execute1.vhdl
index dc68806..20efef6 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -55,6 +55,12 @@ entity execute1 is
         dc_events    : in DcacheEventType;
         ic_events    : in IcacheEventType;
 
+        -- Access to SPRs from core_debug module
+        dbg_spr_req   : in std_ulogic;
+        dbg_spr_ack   : out std_ulogic;
+        dbg_spr_addr  : in std_ulogic_vector(7 downto 0);
+        dbg_spr_data  : out std_ulogic_vector(63 downto 0);
+
         -- debug
         sim_dump      : in std_ulogic;
         sim_dump_done : out std_ulogic;
@@ -604,6 +610,24 @@ begin
 	end if;
     end process;
 
+    ex_dbg_spr: process(clk)
+    begin
+        if rising_edge(clk) then
+            if rst = '0' and dbg_spr_req = '1' then
+                if e_in.dbg_spr_access = '1' and dbg_spr_ack = '0' then
+                    if dbg_spr_addr(7) = '1' then
+                        dbg_spr_data <= ramspr_result;
+                    else
+                        dbg_spr_data <= assemble_xer(xerc_in, ctrl.xer_low);
+                    end if;
+                    dbg_spr_ack <= '1';
+                end if;
+            else
+                dbg_spr_ack <= '0';
+            end if;
+        end if;
+    end process;
+
     -- Data path for integer instructions (first execute stage)
     execute1_dp: process(all)
 	variable a_inv : std_ulogic_vector(63 downto 0);
diff --git a/scripts/mw_debug/mw_debug.c b/scripts/mw_debug/mw_debug.c
index 6271760..ef5b1ec 100644
--- a/scripts/mw_debug/mw_debug.c
+++ b/scripts/mw_debug/mw_debug.c
@@ -548,7 +548,7 @@ static const char *fast_spr_names[] =
 {
 	"lr", "ctr", "srr0", "srr1", "hsrr0", "hsrr1",
 	"sprg0", "sprg1", "sprg2", "sprg3",
-	"hsprg0", "hsprg1", "xer"
+	"hsprg0", "hsprg1", "xer", "tar",
 };
 
 static void gpr_read(uint64_t reg, uint64_t count)

From af814a0d5eedf433c52fc9674b1aa1241069f9be Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 24 Feb 2022 11:37:17 +1100
Subject: [PATCH 26/30] Provide debug access to SPRs in loadstore1 and mmu

They are accessible at GSPR indices 0x3c (PID), 0x3d (PTCR),
0x3e (DSISR) and 0x3f (DAR).

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl                 |  4 ++-
 core.vhdl                   | 12 ++++++++
 core_debug.vhdl             | 29 +++++++++++++-----
 loadstore1.vhdl             | 61 +++++++++++++++++++++++++++++--------
 mmu.vhdl                    | 11 +++----
 scripts/mw_debug/mw_debug.c |  8 ++++-
 6 files changed, 96 insertions(+), 29 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index d743c2d..39ebfb1 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -547,7 +547,9 @@ package common is
         iside : std_ulogic;
         load  : std_ulogic;
         priv  : std_ulogic;
-        sprn  : std_ulogic_vector(9 downto 0);
+        ric   : std_ulogic_vector(1 downto 0);
+        sprnf : std_ulogic;
+        sprnt : std_ulogic;
         addr  : std_ulogic_vector(63 downto 0);
         rs    : std_ulogic_vector(63 downto 0);
     end record;
diff --git a/core.vhdl b/core.vhdl
index a91b729..641c12d 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -154,6 +154,10 @@ architecture behave of core is
     signal dbg_spr_ack : std_ulogic;
     signal dbg_spr_addr : std_ulogic_vector(7 downto 0);
     signal dbg_spr_data : std_ulogic_vector(63 downto 0);
+    signal dbg_ls_spr_req : std_ulogic;
+    signal dbg_ls_spr_ack : std_ulogic;
+    signal dbg_ls_spr_addr : std_ulogic_vector(1 downto 0);
+    signal dbg_ls_spr_data : std_ulogic_vector(63 downto 0);
 
     signal ctrl_debug : ctrl_t;
 
@@ -432,6 +436,10 @@ begin
             m_in => mmu_to_loadstore1,
             dc_stall => dcache_stall_out,
             events => loadstore_events,
+            dbg_spr_req => dbg_ls_spr_req,
+            dbg_spr_ack => dbg_ls_spr_ack,
+            dbg_spr_addr => dbg_ls_spr_addr,
+            dbg_spr_data => dbg_ls_spr_data,
             log_out => log_data(149 downto 140)
             );
 
@@ -518,6 +526,10 @@ begin
             dbg_spr_ack => dbg_spr_ack,
             dbg_spr_addr => dbg_spr_addr,
             dbg_spr_data => dbg_spr_data,
+            dbg_ls_spr_req => dbg_ls_spr_req,
+            dbg_ls_spr_ack => dbg_ls_spr_ack,
+            dbg_ls_spr_addr => dbg_ls_spr_addr,
+            dbg_ls_spr_data => dbg_ls_spr_data,
             log_data => log_data,
             log_read_addr => log_rd_addr,
             log_read_data => log_rd_data,
diff --git a/core_debug.vhdl b/core_debug.vhdl
index a1d4a94..c060f74 100644
--- a/core_debug.vhdl
+++ b/core_debug.vhdl
@@ -39,12 +39,18 @@ entity core_debug is
         dbg_gpr_addr    : out gspr_index_t;
         dbg_gpr_data    : in std_ulogic_vector(63 downto 0);
 
-        -- SPR register read port
+        -- SPR register read port for SPRs in execute1
         dbg_spr_req     : out std_ulogic;
         dbg_spr_ack     : in std_ulogic;
         dbg_spr_addr    : out std_ulogic_vector(7 downto 0);
         dbg_spr_data    : in std_ulogic_vector(63 downto 0);
 
+        -- SPR register read port for SPRs in loadstore1 and mmu
+        dbg_ls_spr_req  : out std_ulogic;
+        dbg_ls_spr_ack  : in std_ulogic;
+        dbg_ls_spr_addr : out std_ulogic_vector(1 downto 0);
+        dbg_ls_spr_data : in std_ulogic_vector(63 downto 0);
+
         -- Core logging data
         log_data        : in std_ulogic_vector(255 downto 0);
         log_read_addr   : in std_ulogic_vector(31 downto 0);
@@ -128,7 +134,7 @@ architecture behave of core_debug is
 begin
        -- Single cycle register accesses on DMI except for GSPR data
     dmi_ack <= dmi_req when dmi_addr /= DBG_CORE_GSPR_DATA
-               else dbg_gpr_ack or dbg_spr_ack;
+               else dbg_gpr_ack or dbg_spr_ack or dbg_ls_spr_ack;
 
     -- Status register read composition
     stat_reg <= (2 => terminated,
@@ -137,6 +143,7 @@ begin
                  others => '0');
 
     gspr_data <= dbg_gpr_data when gspr_index(5) = '0' else
+                 dbg_ls_spr_data when dbg_ls_spr_req = '1' else
                  dbg_spr_data when spr_index_valid = '1' else
                  (others => '0');
 
@@ -245,16 +252,22 @@ begin
         variable odd : std_ulogic;
     begin
         if rising_edge(clk) then
-            if rst = '1' or dmi_req = '0' or dmi_addr /= DBG_CORE_GSPR_DATA then
-                dbg_gpr_req <= '0';
-                dbg_spr_req <= '0';
-            else
-                dbg_gpr_req <= not gspr_index(5);
-                dbg_spr_req <= gspr_index(5);
+            dbg_gpr_req <= '0';
+            dbg_spr_req <= '0';
+            dbg_ls_spr_req <= '0';
+            if rst = '0' and dmi_req = '1' and dmi_addr = DBG_CORE_GSPR_DATA then
+                if gspr_index(5) = '0' then
+                    dbg_gpr_req <= '1';
+                elsif gspr_index(4 downto 2) = "111" then
+                    dbg_ls_spr_req <= '1';
+                else
+                    dbg_spr_req <= '1';
+                end if;
             end if;
 
             -- Map 0 - 0x1f to GPRs, 0x20 - 0x3f to SPRs, and 0x40 - 0x5f to FPRs
             dbg_gpr_addr <= gspr_index(6) & gspr_index(4 downto 0);
+            dbg_ls_spr_addr <= gspr_index(1 downto 0);
 
             -- For SPRs, use the same mapping as when the fast SPRs were in the GPR file
             valid := '1';
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index 9dab15b..92ebeec 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -35,6 +35,12 @@ entity loadstore1 is
 
         events  : out Loadstore1EventType;
 
+        -- Access to SPRs from core_debug module
+        dbg_spr_req   : in std_ulogic;
+        dbg_spr_ack   : out std_ulogic;
+        dbg_spr_addr  : in std_ulogic_vector(1 downto 0);
+        dbg_spr_data  : out std_ulogic_vector(63 downto 0);
+
         log_out : out std_ulogic_vector(9 downto 0)
         );
 end loadstore1;
@@ -123,6 +129,8 @@ architecture behave of loadstore1 is
         one_cycle  : std_ulogic;
         wr_sel     : std_ulogic_vector(1 downto 0);
         addr0      : std_ulogic_vector(63 downto 0);
+        sprsel     : std_ulogic_vector(1 downto 0);
+        dbg_spr_rd : std_ulogic;
     end record;
 
     type reg_stage3_t is record
@@ -146,6 +154,8 @@ architecture behave of loadstore1 is
         intr_vec     : integer range 0 to 16#fff#;
         srr1         : std_ulogic_vector(15 downto 0);
         events       : Loadstore1EventType;
+        dbg_spr      : std_ulogic_vector(63 downto 0);
+        dbg_spr_ack  : std_ulogic;
     end record;
 
     signal req_in   : request_t;
@@ -664,6 +674,20 @@ begin
             v.busy := '1';
         end if;
 
+        v.dbg_spr_rd := dbg_spr_req and not (v.req.valid and v.req.read_spr);
+        if v.dbg_spr_rd = '0' then
+            v.sprsel(1) := v.req.sprn(1);
+            if v.req.sprn(1) = '1' then
+                -- DSISR and DAR
+                v.sprsel(0) := v.req.sprn(0);
+            else
+                -- PID and PTCR
+                v.sprsel(0) := v.req.sprn(8);
+            end if;
+        else
+            v.sprsel := dbg_spr_addr;
+        end if;
+
         r2in <= v;
     end process;
 
@@ -763,21 +787,26 @@ begin
             v.load_data := data_permuted;
         end if;
 
+        -- SPR mux
+        if r2.sprsel(1) = '1' then
+            if r2.sprsel(0) = '0' then
+                sprval := x"00000000" & r3.dsisr;
+            else
+                sprval := r3.dar;
+            end if;
+        else
+            sprval := m_in.sprval;
+        end if;
+        if dbg_spr_req = '0' then
+            v.dbg_spr_ack := '0';
+        elsif r2.dbg_spr_rd = '1' and r3.dbg_spr_ack = '0' then
+            v.dbg_spr := sprval;
+            v.dbg_spr_ack := '1';
+        end if;
+
         if r2.req.valid = '1' then
             if r2.req.read_spr = '1' then
                 write_enable := '1';
-                -- partial decode on SPR number should be adequate given
-                -- the restricted set that get sent down this path
-                if r2.req.sprn(8) = '0' and r2.req.sprn(5) = '0' then
-                    if r2.req.sprn(0) = '0' then
-                        sprval := x"00000000" & r3.dsisr;
-                    else
-                        sprval := r3.dar;
-                    end if;
-                else
-                    -- reading one of the SPRs in the MMU
-                    sprval := m_in.sprval;
-                end if;
             end if;
             if r2.req.align_intr = '1' then
                 -- generate alignment interrupt
@@ -940,8 +969,10 @@ begin
         m_out.load <= r2.req.load;
         m_out.priv <= r2.req.priv_mode;
         m_out.tlbie <= r2.req.tlbie;
+        m_out.ric <= r2.req.sprn(3 downto 2);
         m_out.mtspr <= mmu_mtspr;
-        m_out.sprn <= r2.req.sprn;
+        m_out.sprnf <= r2.sprsel(0);
+        m_out.sprnt <= r2.req.sprn(8);
         m_out.addr <= r2.req.addr;
         m_out.slbia <= r2.req.is_slbia;
         m_out.rs <= r2.req.store_data;
@@ -967,6 +998,10 @@ begin
 
         flush <= exception;
 
+        -- SPR values for core_debug
+        dbg_spr_data <= r3.dbg_spr;
+        dbg_spr_ack <= r3.dbg_spr_ack;
+
         -- Update registers
         r3in <= v;
 
diff --git a/mmu.vhdl b/mmu.vhdl
index d80caf4..d95cd3c 100644
--- a/mmu.vhdl
+++ b/mmu.vhdl
@@ -81,8 +81,8 @@ architecture behave of mmu is
 
 begin
     -- Multiplex internal SPR values back to loadstore1, selected
-    -- by l_in.sprn.
-    l_out.sprval <= r.ptcr when l_in.sprn(8) = '1' else x"00000000" & r.pid;
+    -- by l_in.sprnf.
+    l_out.sprval <= r.ptcr when l_in.sprnf = '1' else x"00000000" & r.pid;
 
     mmu_0: process(clk)
     begin
@@ -259,9 +259,8 @@ begin
                     -- RB[IS] != 0 or RB[AP] != 0, or for slbia
                     v.inval_all := l_in.slbia or l_in.addr(11) or l_in.addr(10) or
                                    l_in.addr(7) or l_in.addr(6) or l_in.addr(5);
-                    -- The RIC field of the tlbie instruction comes across on the
-                    -- sprn bus as bits 2--3.  RIC=2 flushes process table caches.
-                    if l_in.sprn(3) = '1' then
+                    -- RIC=2 or 3 flushes process table caches.
+                    if l_in.ric(1) = '1' then
                         v.pt0_valid := '0';
                         v.pt3_valid := '0';
                         v.ptb_valid := '0';
@@ -291,7 +290,7 @@ begin
                 -- Move to PID needs to invalidate L1 TLBs and cached
                 -- pgtbl0 value.  Move to PTCR does that plus
                 -- invalidating the cached pgtbl3 and prtbl values as well.
-                if l_in.sprn(8) = '0' then
+                if l_in.sprnt = '0' then
                     v.pid := l_in.rs(31 downto 0);
                 else
                     v.ptcr := l_in.rs;
diff --git a/scripts/mw_debug/mw_debug.c b/scripts/mw_debug/mw_debug.c
index ef5b1ec..81e8094 100644
--- a/scripts/mw_debug/mw_debug.c
+++ b/scripts/mw_debug/mw_debug.c
@@ -551,6 +551,10 @@ static const char *fast_spr_names[] =
 	"hsprg0", "hsprg1", "xer", "tar",
 };
 
+static const char *ldst_spr_names[] = {
+	"pidr", "ptcr", "dsisr", "dar"
+};
+
 static void gpr_read(uint64_t reg, uint64_t count)
 {
 	uint64_t data;
@@ -566,8 +570,10 @@ static void gpr_read(uint64_t reg, uint64_t count)
 			printf("r%"PRId64, reg);
 		else if ((reg - 32) < sizeof(fast_spr_names) / sizeof(fast_spr_names[0]))
 			printf("%s", fast_spr_names[reg - 32]);
-		else if (reg < 64)
+		else if (reg < 60)
 			printf("gspr%"PRId64, reg);
+		else if (reg < 64)
+			printf("%s", ldst_spr_names[reg - 60]);
 		else
 			printf("FPR%"PRId64, reg - 64);
 		printf(":\t%016"PRIx64"\n", data);

From 047be5c0c3b2f12c9321412518e17b7267fe14ea Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 23 Mar 2022 18:02:28 +0000
Subject: [PATCH 27/30] loadstore1: Do SPR reading in stage 2 rather than stage
 3

This eliminates one leg of the output value multiplexer, and seems
to improve timing slightly on the A7-100.

Since SPR values are written in stage 3 and read in stage 2, an mfspr
immediately following an mtspr to the same SPR would otherwise read
the old value.  To avoid this, we make mtspr to the load/store SPRs
single-issue in decode1.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl    |   4 ++
 loadstore1.vhdl | 114 ++++++++++++++++++++++++------------------------
 2 files changed, 62 insertions(+), 56 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index af8cd6c..5ee7b57 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -681,6 +681,10 @@ begin
                     when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PTCR =>
                         vi.override_decode.unit := LDST;
                         vi.override_unit := '1';
+                        -- make mtspr to loadstore SPRs single-issue
+                        if f_in.insn(8) = '1' then
+                            vi.force_single := '1';
+                        end if;
                     when others =>
                 end case;
             end if;
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index 92ebeec..0a2f088 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -90,7 +90,8 @@ architecture behave of loadstore1 is
         virt_mode    : std_ulogic;
         priv_mode    : std_ulogic;
         load_sp      : std_ulogic;
-        sprn         : std_ulogic_vector(9 downto 0);
+        sprsel       : std_ulogic_vector(1 downto 0);
+        ric          : std_ulogic_vector(1 downto 0);
         is_slbia     : std_ulogic;
         align_intr   : std_ulogic;
         dword_index  : std_ulogic;
@@ -109,7 +110,7 @@ architecture behave of loadstore1 is
                                           xerc => xerc_init, reserve => '0',
                                           atomic => '0', atomic_last => '0', rc => '0', nc => '0',
                                           virt_mode => '0', priv_mode => '0', load_sp => '0',
-                                          sprn => 10x"0", is_slbia => '0', align_intr => '0',
+                                          sprsel => "00", ric => "00", is_slbia => '0', align_intr => '0',
                                           dword_index => '0', two_dwords => '0', incomplete => '0');
 
     type reg_stage1_t is record
@@ -130,7 +131,8 @@ architecture behave of loadstore1 is
         wr_sel     : std_ulogic_vector(1 downto 0);
         addr0      : std_ulogic_vector(63 downto 0);
         sprsel     : std_ulogic_vector(1 downto 0);
-        dbg_spr_rd : std_ulogic;
+        dbg_spr    : std_ulogic_vector(63 downto 0);
+        dbg_spr_ack: std_ulogic;
     end record;
 
     type reg_stage3_t is record
@@ -154,8 +156,6 @@ architecture behave of loadstore1 is
         intr_vec     : integer range 0 to 16#fff#;
         srr1         : std_ulogic_vector(15 downto 0);
         events       : Loadstore1EventType;
-        dbg_spr      : std_ulogic_vector(63 downto 0);
-        dbg_spr_ack  : std_ulogic;
     end record;
 
     signal req_in   : request_t;
@@ -287,7 +287,8 @@ begin
                 r1.req.instr_fault <= '0';
                 r1.req.load <= '0';
                 r1.req.priv_mode <= '0';
-                r1.req.sprn <= (others => '0');
+                r1.req.sprsel <= "00";
+                r1.req.ric <= "00";
                 r1.req.xerc <= xerc_init;
 
                 r2.req.valid <= '0';
@@ -297,7 +298,8 @@ begin
                 r2.req.instr_fault <= '0';
                 r2.req.load <= '0';
                 r2.req.priv_mode <= '0';
-                r2.req.sprn <= (others => '0');
+                r2.req.sprsel <= "00";
+                r2.req.ric <= "00";
                 r2.req.xerc <= xerc_init;
 
                 r2.wait_dc <= '0';
@@ -418,7 +420,14 @@ begin
         v.nc := l_in.ci;
         v.virt_mode := l_in.virt_mode;
         v.priv_mode := l_in.priv_mode;
-        v.sprn := sprn;
+        v.ric := l_in.insn(19 downto 18);
+        if sprn(1) = '1' then
+            -- DSISR and DAR
+            v.sprsel := '1' & sprn(0);
+        else
+            -- PID and PTCR
+            v.sprsel := '0' & sprn(8);
+        end if;
 
         lsu_sum := std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2));
 
@@ -494,7 +503,7 @@ begin
                 v.read_spr := '1';
             when OP_MTSPR =>
                 v.write_spr := '1';
-                v.mmu_op := sprn(8) or sprn(5);
+                v.mmu_op := not sprn(1);
             when OP_FETCH_FAILED =>
                 -- send it to the MMU to do the radix walk
                 v.instr_fault := '1';
@@ -605,6 +614,9 @@ begin
         variable idx : unsigned(2 downto 0);
         variable byte_offset : unsigned(2 downto 0);
         variable interrupt : std_ulogic;
+        variable dbg_spr_rd : std_ulogic;
+        variable sprsel : std_ulogic_vector(1 downto 0);
+        variable sprval : std_ulogic_vector(63 downto 0);
     begin
         v := r2;
 
@@ -617,6 +629,28 @@ begin
             store_data(i * 8 + 7 downto i * 8) <= r1.req.store_data(j + 7 downto j);
         end loop;
 
+        dbg_spr_rd := dbg_spr_req and not (r1.req.valid and r1.req.read_spr);
+        if dbg_spr_rd = '0' then
+            sprsel := r1.req.sprsel;
+        else
+            sprsel := dbg_spr_addr;
+        end if;
+        if sprsel(1) = '1' then
+            if sprsel(0) = '0' then
+                sprval := x"00000000" & r3.dsisr;
+            else
+                sprval := r3.dar;
+            end if;
+        else
+            sprval := m_in.sprval;
+        end if;
+        if dbg_spr_req = '0' then
+            v.dbg_spr_ack := '0';
+        elsif dbg_spr_rd = '1' and r2.dbg_spr_ack = '0' then
+            v.dbg_spr := sprval;
+            v.dbg_spr_ack := '1';
+        end if;
+
         if (dc_stall or d_in.error or r2.busy or l_in.e2stall) = '0' then
             if r1.req.valid = '0' or r1.issued = '1' or r1.req.dc_req = '0' then
                 v.req := r1.req;
@@ -627,14 +661,15 @@ begin
                 v.wait_mmu := r1.req.valid and r1.req.mmu_op;
                 v.busy := r1.req.valid and r1.req.mmu_op;
                 v.one_cycle := r1.req.valid and not (r1.req.dc_req or r1.req.mmu_op);
-                if r1.req.read_spr = '1' then
+                if r1.req.do_update = '1' or r1.req.store = '1' or r1.req.read_spr = '1' then
                     v.wr_sel := "00";
-                elsif r1.req.do_update = '1' or r1.req.store = '1' then
-                    v.wr_sel := "01";
                 elsif r1.req.load_sp = '1' then
-                    v.wr_sel := "10";
+                    v.wr_sel := "01";
                 else
-                    v.wr_sel := "11";
+                    v.wr_sel := "10";
+                end if;
+                if r1.req.read_spr = '1' then
+                    v.addr0 := sprval;
                 end if;
 
                 -- Work out load formatter controls for next cycle
@@ -674,21 +709,11 @@ begin
             v.busy := '1';
         end if;
 
-        v.dbg_spr_rd := dbg_spr_req and not (v.req.valid and v.req.read_spr);
-        if v.dbg_spr_rd = '0' then
-            v.sprsel(1) := v.req.sprn(1);
-            if v.req.sprn(1) = '1' then
-                -- DSISR and DAR
-                v.sprsel(0) := v.req.sprn(0);
-            else
-                -- PID and PTCR
-                v.sprsel(0) := v.req.sprn(8);
-            end if;
-        else
-            v.sprsel := dbg_spr_addr;
-        end if;
-
         r2in <= v;
+
+        -- SPR values for core_debug
+        dbg_spr_data <= r2.dbg_spr;
+        dbg_spr_ack <= r2.dbg_spr_ack;
     end process;
 
     -- Processing done in the third cycle of a load/store instruction.
@@ -787,22 +812,6 @@ begin
             v.load_data := data_permuted;
         end if;
 
-        -- SPR mux
-        if r2.sprsel(1) = '1' then
-            if r2.sprsel(0) = '0' then
-                sprval := x"00000000" & r3.dsisr;
-            else
-                sprval := r3.dar;
-            end if;
-        else
-            sprval := m_in.sprval;
-        end if;
-        if dbg_spr_req = '0' then
-            v.dbg_spr_ack := '0';
-        elsif r2.dbg_spr_rd = '1' and r3.dbg_spr_ack = '0' then
-            v.dbg_spr := sprval;
-            v.dbg_spr_ack := '1';
-        end if;
 
         if r2.req.valid = '1' then
             if r2.req.read_spr = '1' then
@@ -819,7 +828,7 @@ begin
                 write_enable := '1';
             end if;
             if r2.req.write_spr = '1' and r2.req.mmu_op = '0' then
-                if r2.req.sprn(0) = '0' then
+                if r2.req.sprsel(0) = '0' then
                     v.dsisr := r2.req.store_data(31 downto 0);
                 else
                     v.dar := r2.req.store_data;
@@ -917,12 +926,9 @@ begin
 
         case r2.wr_sel is
         when "00" =>
-            -- mfspr result
-            write_data := sprval;
-        when "01" =>
             -- update reg
             write_data := r2.addr0;
-        when "10" =>
+        when "01" =>
             -- lfs result
             write_data := load_dp_data;
         when others =>
@@ -969,10 +975,10 @@ begin
         m_out.load <= r2.req.load;
         m_out.priv <= r2.req.priv_mode;
         m_out.tlbie <= r2.req.tlbie;
-        m_out.ric <= r2.req.sprn(3 downto 2);
+        m_out.ric <= r2.req.ric;
         m_out.mtspr <= mmu_mtspr;
-        m_out.sprnf <= r2.sprsel(0);
-        m_out.sprnt <= r2.req.sprn(8);
+        m_out.sprnf <= r1.req.sprsel(0);
+        m_out.sprnt <= r2.req.sprsel(0);
         m_out.addr <= r2.req.addr;
         m_out.slbia <= r2.req.is_slbia;
         m_out.rs <= r2.req.store_data;
@@ -998,10 +1004,6 @@ begin
 
         flush <= exception;
 
-        -- SPR values for core_debug
-        dbg_spr_data <= r3.dbg_spr;
-        dbg_spr_ack <= r3.dbg_spr_ack;
-
         -- Update registers
         r3in <= v;
 

From 06c13d4988fee4ec1f5bf089ad71f2acc2883818 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 21 Feb 2022 19:29:09 +1100
Subject: [PATCH 28/30] decode1: Work out register addresses in decode1

This adds some relatively simple logic to decode1 to compute the
GPR/FPR addresses that an instruction will access.  It always computes
three addresses regardless of whether the instruction will actually
use all of them.  The main things it computes are whether the
instruction uses the RS field or the RC field for the 3rd operand, and
whether the operands are FPRs or GPRs (it is possible for RS to be an
FPR but RA and RB to be GPRs, as for example with stfdx).

At the moment all we do with these computed register addresses is to
assert that they are identical to the ones coming from decode2 one
cycle later.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl        |  6 ++++++
 core.vhdl          |  4 ++++
 decode1.vhdl       | 37 +++++++++++++++++++++++++++++++++++++
 register_file.vhdl | 19 +++++++++++++++++++
 4 files changed, 66 insertions(+)

diff --git a/common.vhdl b/common.vhdl
index 39ebfb1..0349a6e 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -276,6 +276,12 @@ package common is
         redirect_nia : std_ulogic_vector(63 downto 0);
     end record;
 
+    type Decode1ToRegisterFileType is record
+        reg_1_addr : gspr_index_t;
+        reg_2_addr : gspr_index_t;
+        reg_3_addr : gspr_index_t;
+    end record;
+
     type bypass_data_t is record
         tag  : instr_tag_t;
         data : std_ulogic_vector(63 downto 0);
diff --git a/core.vhdl b/core.vhdl
index 641c12d..764141a 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -63,6 +63,7 @@ architecture behave of core is
     -- decode signals
     signal decode1_to_decode2: Decode1ToDecode2Type;
     signal decode1_to_fetch1: Decode1ToFetch1Type;
+    signal decode1_to_register_file: Decode1ToRegisterFileType;
     signal decode2_to_execute1: Decode2ToExecute1Type;
 
     -- register file signals
@@ -285,6 +286,7 @@ begin
             f_in => icache_to_decode1,
             d_out => decode1_to_decode2,
             f_out => decode1_to_fetch1,
+            r_out => decode1_to_register_file,
             log_out => log_data(109 downto 97)
             );
 
@@ -329,6 +331,8 @@ begin
             )
         port map (
             clk => clk,
+            stall => decode2_stall_out,
+            d1_in => decode1_to_register_file,
             d_in => decode2_to_register_file,
             d_out => register_file_to_decode2,
             w_in => writeback_to_register_file,
diff --git a/decode1.vhdl b/decode1.vhdl
index 5ee7b57..36d511b 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -5,6 +5,7 @@ use ieee.numeric_std.all;
 library work;
 use work.common.all;
 use work.decode_types.all;
+use work.insn_helpers.all;
 
 entity decode1 is
     generic (
@@ -24,6 +25,7 @@ entity decode1 is
         f_in      : in IcacheToDecode1Type;
         f_out     : out Decode1ToFetch1Type;
         d_out     : out Decode1ToDecode2Type;
+        r_out     : out Decode1ToRegisterFileType;
         log_out   : out std_ulogic_vector(12 downto 0)
 	);
 end entity decode1;
@@ -628,6 +630,7 @@ begin
 
     decode1_1: process(all)
         variable v : Decode1ToDecode2Type;
+        variable vr : Decode1ToRegisterFileType;
         variable vi : reg_internal_t;
         variable majorop : major_opcode_t;
         variable minor4op : std_ulogic_vector(10 downto 0);
@@ -636,6 +639,8 @@ begin
         variable br_target : std_ulogic_vector(61 downto 0);
         variable br_offset : signed(23 downto 0);
         variable bv : br_predictor_t;
+        variable fprs, fprabc : std_ulogic;
+        variable in3rc : std_ulogic;
     begin
         v := Decode1ToDecode2Init;
         vi := reg_internal_t_init;
@@ -646,6 +651,10 @@ begin
         v.stop_mark := f_in.stop_mark;
         v.big_endian := f_in.big_endian;
 
+        fprs := '0';
+        fprabc := '0';
+        in3rc := '0';
+
         if f_in.valid = '1' then
             report "Decode insn " & to_hstring(f_in.insn) & " at " & to_hstring(f_in.nia);
         end if;
@@ -665,6 +674,7 @@ begin
             minor4op := f_in.insn(5 downto 0) & f_in.insn(10 downto 6);
             vi.override := not decode_op_4_valid(to_integer(unsigned(minor4op)));
             v.decode := decode_op_4_array(to_integer(unsigned(f_in.insn(5 downto 0))));
+            in3rc := '1';
 
         when 31 =>
             -- major opcode 31, lots of things
@@ -688,6 +698,10 @@ begin
                     when others =>
                 end case;
             end if;
+            if HAS_FPU and std_match(f_in.insn(10 downto 1), "1----10111") then
+                -- lower half of column 23 has FP loads and stores
+                fprs := '1';
+            end if;
 
         when 16 =>
             -- Predict backward branches as taken, forward as untaken
@@ -715,6 +729,12 @@ begin
         when 30 =>
             v.decode := decode_op_30_array(to_integer(unsigned(f_in.insn(4 downto 1))));
 
+        when 52 | 53 | 54 | 55 =>
+            -- stfd[u] and stfs[u]
+            if HAS_FPU then
+                fprs := '1';
+            end if;
+
         when 58 =>
             v.decode := decode_op_58_array(to_integer(unsigned(f_in.insn(1 downto 0))));
 
@@ -725,6 +745,9 @@ begin
                 if f_in.insn(5) = '0' and not std_match(f_in.insn(10 downto 1), "11-1001110") then
                     vi.override := '1';
                 end if;
+                in3rc := '1';
+                fprabc := '1';
+                fprs := '1';
             end if;
 
         when 62 =>
@@ -738,11 +761,23 @@ begin
                 else
                     v.decode := decode_op_63h_array(to_integer(unsigned(f_in.insn(4 downto 1))));
                 end if;
+                in3rc := '1';
+                fprabc := '1';
+                fprs := '1';
             end if;
 
         when others =>
         end case;
 
+        -- Work out GPR/FPR read addresses
+        vr.reg_1_addr := fprabc & insn_ra(f_in.insn);
+        vr.reg_2_addr := fprabc & insn_rb(f_in.insn);
+        if in3rc = '1' then
+            vr.reg_3_addr := fprabc & insn_rcreg(f_in.insn);
+        else
+            vr.reg_3_addr := fprs & insn_rs(f_in.insn);
+        end if;
+
         if f_in.fetch_failed = '1' then
             v.valid := '1';
             vi.override := '1';
@@ -788,6 +823,8 @@ begin
         f_out.redirect <= br.predict;
         f_out.redirect_nia <= br_target & "00";
         flush_out <= bv.predict or br.predict;
+
+        r_out <= vr;
     end process;
 
     d1_log: if LOG_LENGTH > 0 generate
diff --git a/register_file.vhdl b/register_file.vhdl
index dcce0a4..bc40c3f 100644
--- a/register_file.vhdl
+++ b/register_file.vhdl
@@ -14,7 +14,9 @@ entity register_file is
         );
     port(
         clk           : in std_logic;
+        stall         : in std_ulogic;
 
+        d1_in         : in Decode1ToRegisterFileType;
         d_in          : in Decode2ToRegisterFileType;
         d_out         : out RegisterFileToDecode2Type;
 
@@ -39,9 +41,13 @@ architecture behaviour of register_file is
     signal rd_port_b : std_ulogic_vector(63 downto 0);
     signal dbg_data : std_ulogic_vector(63 downto 0);
     signal dbg_ack : std_ulogic;
+    signal addr_1_reg : gspr_index_t;
+    signal addr_2_reg : gspr_index_t;
+    signal addr_3_reg : gspr_index_t;
 begin
     -- synchronous writes
     register_write_0: process(clk)
+        variable a_addr, b_addr, c_addr : gspr_index_t;
         variable w_addr : gspr_index_t;
     begin
         if rising_edge(clk) then
@@ -56,6 +62,19 @@ begin
                 assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure;
                 registers(to_integer(unsigned(w_addr))) <= w_in.write_data;
             end if;
+
+            a_addr := d1_in.reg_1_addr;
+            b_addr := d1_in.reg_2_addr;
+            c_addr := d1_in.reg_3_addr;
+
+            if stall = '0' then
+                addr_1_reg <= a_addr;
+                addr_2_reg <= b_addr;
+                addr_3_reg <= c_addr;
+            end if;
+            assert (d_in.read1_enable = '0') or (d_in.read1_reg = addr_1_reg) severity failure;
+            assert (d_in.read2_enable = '0') or (d_in.read2_reg = addr_2_reg) severity failure;
+            assert (d_in.read3_enable = '0') or (d_in.read3_reg = addr_3_reg) severity failure;
         end if;
     end process register_write_0;
 

From 1d7de2f1dae295364848940f31c991c8b095f4aa Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 22 Feb 2022 09:30:05 +1100
Subject: [PATCH 29/30] register_file: Make read access to register file
 synchronous

With this, the register RAM is read synchronously using the addresses
supplied by decode1.  That means the register RAM can now be block RAM
rather than LUT RAM.

Debug accesses are done via the B port on cycles when decode1
indicates either that there is no valid instruction or that the
instruction doesn't use a [F]RB operand.

We latch the addresses being read in each cycle and reuse the same
addresses in the next cycle if stalled.  Data that is being written is
also latched, and a multiplexer on each read port then supplies the
latched write data if the read address for that port equalled the
write address in the cycle when the read was issued.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl        |   3 ++
 decode1.vhdl       |  14 ++++++
 register_file.vhdl | 117 +++++++++++++++++++++++++++++++--------------
 3 files changed, 99 insertions(+), 35 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 0349a6e..4d6cb91 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -280,6 +280,9 @@ package common is
         reg_1_addr : gspr_index_t;
         reg_2_addr : gspr_index_t;
         reg_3_addr : gspr_index_t;
+        read_1_enable : std_ulogic;
+        read_2_enable : std_ulogic;
+        read_3_enable : std_ulogic;
     end record;
 
     type bypass_data_t is record
diff --git a/decode1.vhdl b/decode1.vhdl
index 36d511b..cc93dfc 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -641,6 +641,7 @@ begin
         variable bv : br_predictor_t;
         variable fprs, fprabc : std_ulogic;
         variable in3rc : std_ulogic;
+        variable may_read_rb : std_ulogic;
     begin
         v := Decode1ToDecode2Init;
         vi := reg_internal_t_init;
@@ -654,6 +655,7 @@ begin
         fprs := '0';
         fprabc := '0';
         in3rc := '0';
+        may_read_rb := '0';
 
         if f_in.valid = '1' then
             report "Decode insn " & to_hstring(f_in.insn) & " at " & to_hstring(f_in.nia);
@@ -675,10 +677,16 @@ begin
             vi.override := not decode_op_4_valid(to_integer(unsigned(minor4op)));
             v.decode := decode_op_4_array(to_integer(unsigned(f_in.insn(5 downto 0))));
             in3rc := '1';
+            may_read_rb := '1';
+
+        when 23 =>
+            -- rlwnm[.]
+            may_read_rb := '1';
 
         when 31 =>
             -- major opcode 31, lots of things
             v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1))));
+            may_read_rb := '1';
 
             if std_match(f_in.insn(10 downto 1), "01-1010011") then
                 -- mfspr or mtspr
@@ -728,6 +736,7 @@ begin
 
         when 30 =>
             v.decode := decode_op_30_array(to_integer(unsigned(f_in.insn(4 downto 1))));
+            may_read_rb := f_in.insn(4);
 
         when 52 | 53 | 54 | 55 =>
             -- stfd[u] and stfs[u]
@@ -748,6 +757,7 @@ begin
                 in3rc := '1';
                 fprabc := '1';
                 fprs := '1';
+                may_read_rb := '1';
             end if;
 
         when 62 =>
@@ -764,6 +774,7 @@ begin
                 in3rc := '1';
                 fprabc := '1';
                 fprs := '1';
+                may_read_rb := '1';
             end if;
 
         when others =>
@@ -777,6 +788,9 @@ begin
         else
             vr.reg_3_addr := fprs & insn_rs(f_in.insn);
         end if;
+        vr.read_1_enable := f_in.valid and not f_in.fetch_failed;
+        vr.read_2_enable := f_in.valid and not f_in.fetch_failed and may_read_rb;
+        vr.read_3_enable := f_in.valid and not f_in.fetch_failed;
 
         if f_in.fetch_failed = '1' then
             v.valid := '1';
diff --git a/register_file.vhdl b/register_file.vhdl
index bc40c3f..a8ddee2 100644
--- a/register_file.vhdl
+++ b/register_file.vhdl
@@ -38,17 +38,27 @@ end entity register_file;
 architecture behaviour of register_file is
     type regfile is array(0 to 63) of std_ulogic_vector(63 downto 0);
     signal registers : regfile := (others => (others => '0'));
-    signal rd_port_b : std_ulogic_vector(63 downto 0);
     signal dbg_data : std_ulogic_vector(63 downto 0);
     signal dbg_ack : std_ulogic;
+    signal dbg_gpr_done : std_ulogic;
     signal addr_1_reg : gspr_index_t;
     signal addr_2_reg : gspr_index_t;
     signal addr_3_reg : gspr_index_t;
+    signal rd_2 : std_ulogic;
+    signal fwd_1 : std_ulogic;
+    signal fwd_2 : std_ulogic;
+    signal fwd_3 : std_ulogic;
+    signal data_1 : std_ulogic_vector(63 downto 0);
+    signal data_2 : std_ulogic_vector(63 downto 0);
+    signal data_3 : std_ulogic_vector(63 downto 0);
+    signal prev_write_data : std_ulogic_vector(63 downto 0);
+
 begin
-    -- synchronous writes
+    -- synchronous reads and writes
     register_write_0: process(clk)
         variable a_addr, b_addr, c_addr : gspr_index_t;
         variable w_addr : gspr_index_t;
+        variable b_enable : std_ulogic;
     begin
         if rising_edge(clk) then
             if w_in.write_enable = '1' then
@@ -66,57 +76,94 @@ begin
             a_addr := d1_in.reg_1_addr;
             b_addr := d1_in.reg_2_addr;
             c_addr := d1_in.reg_3_addr;
-
-            if stall = '0' then
+            b_enable := d1_in.read_2_enable;
+            if stall = '1' then
+                a_addr := addr_1_reg;
+                b_addr := addr_2_reg;
+                c_addr := addr_3_reg;
+                b_enable := rd_2;
+            else
                 addr_1_reg <= a_addr;
                 addr_2_reg <= b_addr;
                 addr_3_reg <= c_addr;
+                rd_2 <= b_enable;
             end if;
+
+            fwd_1 <= '0';
+            fwd_2 <= '0';
+            fwd_3 <= '0';
+            if w_in.write_enable = '1' then
+                if w_addr = a_addr then
+                    fwd_1 <= '1';
+                end if;
+                if w_addr = b_addr then
+                    fwd_2 <= '1';
+                end if;
+                if w_addr = c_addr then
+                    fwd_3 <= '1';
+                end if;
+            end if;
+
+            -- Do debug reads to GPRs and FPRs using the B port when it is not in use
+            if dbg_gpr_req = '1' then
+                if b_enable = '0' then
+                    b_addr := dbg_gpr_addr(5 downto 0);
+                    dbg_gpr_done <= '1';
+                end if;
+            else
+                dbg_gpr_done <= '0';
+            end if;
+
+            if not HAS_FPU then
+                -- Make it obvious that we only want 32 GSPRs for a no-FPU implementation
+                a_addr(5) := '0';
+                b_addr(5) := '0';
+                c_addr(5) := '0';
+            end if;
+            data_1 <= registers(to_integer(unsigned(a_addr)));
+            data_2 <= registers(to_integer(unsigned(b_addr)));
+            data_3 <= registers(to_integer(unsigned(c_addr)));
+
+            prev_write_data <= w_in.write_data;
+
             assert (d_in.read1_enable = '0') or (d_in.read1_reg = addr_1_reg) severity failure;
             assert (d_in.read2_enable = '0') or (d_in.read2_reg = addr_2_reg) severity failure;
             assert (d_in.read3_enable = '0') or (d_in.read3_reg = addr_3_reg) severity failure;
         end if;
     end process register_write_0;
 
-    -- asynchronous reads
+    -- asynchronous forwarding of write data
     register_read_0: process(all)
-        variable a_addr, b_addr, c_addr : gspr_index_t;
-        variable w_addr : gspr_index_t;
+        variable out_data_1 : std_ulogic_vector(63 downto 0);
+        variable out_data_2 : std_ulogic_vector(63 downto 0);
+        variable out_data_3 : std_ulogic_vector(63 downto 0);
     begin
-        a_addr := d_in.read1_reg;
-        b_addr := d_in.read2_reg;
-        c_addr := d_in.read3_reg;
-        w_addr := w_in.write_reg;
-        if not HAS_FPU then
-            -- Make it obvious that we only want 32 GSPRs for a no-FPU implementation
-            a_addr(5) := '0';
-            b_addr(5) := '0';
-            c_addr(5) := '0';
-            w_addr(5) := '0';
+        out_data_1 := data_1;
+        out_data_2 := data_2;
+        out_data_3 := data_3;
+        if fwd_1 = '1' then
+            out_data_1 := prev_write_data;
         end if;
+        if fwd_2 = '1' then
+            out_data_2 := prev_write_data;
+        end if;
+        if fwd_3 = '1' then
+            out_data_3 := prev_write_data;
+        end if;
+
         if d_in.read1_enable = '1' then
-            report "Reading GPR " & to_hstring(a_addr) & " " & to_hstring(registers(to_integer(unsigned(a_addr))));
+            report "Reading GPR " & to_hstring(addr_1_reg) & " " & to_hstring(out_data_1);
         end if;
         if d_in.read2_enable = '1' then
-            report "Reading GPR " & to_hstring(b_addr) & " " & to_hstring(registers(to_integer(unsigned(b_addr))));
+            report "Reading GPR " & to_hstring(addr_2_reg) & " " & to_hstring(out_data_2);
         end if;
         if d_in.read3_enable = '1' then
-            report "Reading GPR " & to_hstring(c_addr) & " " & to_hstring(registers(to_integer(unsigned(c_addr))));
-        end if;
-        d_out.read1_data <= registers(to_integer(unsigned(a_addr)));
-        -- B read port is multiplexed with reads from the debug circuitry
-        if d_in.read2_enable = '0' and dbg_gpr_req = '1' and dbg_ack = '0' then
-            b_addr := dbg_gpr_addr;
-            if not HAS_FPU then
-                b_addr(5) := '0';
-            end if;
+            report "Reading GPR " & to_hstring(addr_3_reg) & " " & to_hstring(out_data_3);
         end if;
-        rd_port_b <= registers(to_integer(unsigned(b_addr)));
-        d_out.read2_data <= rd_port_b;
-        d_out.read3_data <= registers(to_integer(unsigned(c_addr)));
 
-        -- Forwarding of written data is now done explicitly with a bypass path
-        -- from writeback to decode2.
+        d_out.read1_data <= out_data_1;
+        d_out.read2_data <= out_data_2;
+        d_out.read3_data <= out_data_3;
     end process register_read_0;
 
     -- Latch read data and ack if dbg read requested and B port not busy
@@ -124,8 +171,8 @@ begin
     begin
         if rising_edge(clk) then
             if dbg_gpr_req = '1' then
-                if d_in.read2_enable = '0' and dbg_ack = '0' then
-                    dbg_data <= rd_port_b;
+                if dbg_ack = '0' and dbg_gpr_done = '1' then
+                    dbg_data <= data_2;
                     dbg_ack <= '1';
                 end if;
             else

From d6121cd636bd5321e57f8fc76ec35b8621241117 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 14 Jul 2022 15:47:21 +1000
Subject: [PATCH 30/30] Use register addresses from decode1 for dependency
 tracking

This improves timing a little because the register addresses now come
directly from a latch instead of being calculated by
decode_input_reg_*.  The asserts that check that the two are the same
are now in decode2 rather than register_file.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl        |  9 +++++----
 decode1.vhdl       |  4 ++++
 decode2.vhdl       | 14 ++++++++------
 register_file.vhdl |  4 ----
 4 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 4d6cb91..cc49e8f 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -265,11 +265,15 @@ package common is
         big_endian: std_ulogic;
         spr_info : spr_id;
         ram_spr : ram_spr_info;
+        reg_a : gspr_index_t;
+        reg_b : gspr_index_t;
+        reg_c : gspr_index_t;
     end record;
     constant Decode1ToDecode2Init : Decode1ToDecode2Type :=
         (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'),
          decode => decode_rom_init, br_pred => '0', big_endian => '0',
-         spr_info => spr_id_init, ram_spr => ram_spr_info_init);
+         spr_info => spr_id_init, ram_spr => ram_spr_info_init,
+         reg_a => (others => '0'), reg_b => (others => '0'), reg_c => (others => '0'));
 
     type Decode1ToFetch1Type is record
         redirect     : std_ulogic;
@@ -449,11 +453,8 @@ package common is
 
     type Decode2ToRegisterFileType is record
 	read1_enable : std_ulogic;
-	read1_reg : gspr_index_t;
 	read2_enable : std_ulogic;
-	read2_reg : gspr_index_t;
 	read3_enable : std_ulogic;
-	read3_reg : gspr_index_t;
     end record;
 
     type RegisterFileToDecode2Type is record
diff --git a/decode1.vhdl b/decode1.vhdl
index cc93dfc..de9b836 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -792,6 +792,10 @@ begin
         vr.read_2_enable := f_in.valid and not f_in.fetch_failed and may_read_rb;
         vr.read_3_enable := f_in.valid and not f_in.fetch_failed;
 
+        v.reg_a := vr.reg_1_addr;
+        v.reg_b := vr.reg_2_addr;
+        v.reg_c := vr.reg_3_addr;
+
         if f_in.fetch_failed = '1' then
             v.valid := '1';
             vi.override := '1';
diff --git a/decode2.vhdl b/decode2.vhdl
index d91bec5..e24ebb5 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -360,6 +360,11 @@ begin
                 dc2.e.ramspr_odd_rdaddr <= dc2in.e.ramspr_odd_rdaddr;
                 dc2.e.ramspr_rd_odd <= dc2in.e.ramspr_rd_odd;
             end if;
+            if d_in.valid = '1' then
+                assert decoded_reg_a.reg_valid = '0' or decoded_reg_a.reg = d_in.reg_a severity failure;
+                assert decoded_reg_b.reg_valid = '0' or decoded_reg_b.reg = d_in.reg_b severity failure;
+                assert decoded_reg_c.reg_valid = '0' or decoded_reg_c.reg = d_in.reg_c severity failure;
+            end if;
         end if;
     end process;
 
@@ -379,11 +384,8 @@ begin
         end if;
 
         r_out.read1_enable <= decoded_reg_a.reg_valid;
-        r_out.read1_reg    <= decoded_reg_a.reg;
         r_out.read2_enable <= decoded_reg_b.reg_valid;
-        r_out.read2_reg    <= decoded_reg_b.reg;
         r_out.read3_enable <= decoded_reg_c.reg_valid;
-        r_out.read3_reg    <= decoded_reg_c.reg;
 
     end process;
 
@@ -537,9 +539,9 @@ begin
             v.e.nia := d_in.nia;
             v.e.unit := d_in.decode.unit;
             v.e.fac := d_in.decode.facility;
-            v.e.read_reg1 := decoded_reg_a.reg;
-            v.e.read_reg2 := decoded_reg_b.reg;
-            v.e.read_reg3 := decoded_reg_c.reg;
+            v.e.read_reg1 := d_in.reg_a;
+            v.e.read_reg2 := d_in.reg_b;
+            v.e.read_reg3 := d_in.reg_c;
             v.e.write_reg := decoded_reg_o.reg;
             v.e.write_reg_enable := decoded_reg_o.reg_valid;
             v.e.invert_a := d_in.decode.invert_a;
diff --git a/register_file.vhdl b/register_file.vhdl
index a8ddee2..753ce80 100644
--- a/register_file.vhdl
+++ b/register_file.vhdl
@@ -125,10 +125,6 @@ begin
             data_3 <= registers(to_integer(unsigned(c_addr)));
 
             prev_write_data <= w_in.write_data;
-
-            assert (d_in.read1_enable = '0') or (d_in.read1_reg = addr_1_reg) severity failure;
-            assert (d_in.read2_enable = '0') or (d_in.read2_reg = addr_2_reg) severity failure;
-            assert (d_in.read3_enable = '0') or (d_in.read3_reg = addr_3_reg) severity failure;
         end if;
     end process register_write_0;