Merge pull request #382 from paulusmack/master

Decode in block RAM and other improvements
4 years ago · ff63ffdbfd
parent 0073d23e73 20f49f06f8
commit ff63ffdbfd
28 changed files with 2779 additions and 1450 deletions
--- a/8
+++ b/8
@ -56,13 +56,13 @@ all = core_tb icache_tb dcache_tb dmi_dtm_tb \
 all: $(all)

 core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
-	utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \
+	utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl predecode.vhdl \
 	decode1.vhdl helpers.vhdl insn_helpers.vhdl \
 	control.vhdl decode2.vhdl register_file.vhdl \
 	cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
-	logical.vhdl countbits.vhdl multiply.vhdl divider.vhdl execute1.vhdl \
-	loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \
-	core.vhdl fpu.vhdl pmu.vhdl
+	logical.vhdl countbits.vhdl multiply.vhdl multiply-32s.vhdl divider.vhdl \
+	execute1.vhdl loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl \
+	core_debug.vhdl core.vhdl fpu.vhdl pmu.vhdl

 soc_files = wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \
 	wishbone_debug_master.vhdl xics.vhdl syscon.vhdl gpio.vhdl soc.vhdl \
--- a/cache_ram.vhdl
+++ b/cache_ram.vhdl
@ -7,6 +7,7 @@ entity cache_ram is
    generic(
        ROW_BITS : integer := 16;
        WIDTH    : integer := 64;
+        BYTEWID  : integer := 8;
        TRACE    : boolean := false;
        ADD_BUF  : boolean := false
        );
@ -16,7 +17,7 @@ entity cache_ram is
        rd_en   : in  std_logic;
        rd_addr : in  std_logic_vector(ROW_BITS - 1 downto 0);
        rd_data : out std_logic_vector(WIDTH - 1 downto 0);
-        wr_sel  : in  std_logic_vector(WIDTH/8 - 1 downto 0);
+        wr_sel  : in  std_logic_vector(WIDTH/BYTEWID - 1 downto 0);
        wr_addr : in  std_logic_vector(ROW_BITS - 1 downto 0);
        wr_data : in  std_logic_vector(WIDTH - 1 downto 0)
        );
@ -38,7 +39,7 @@ begin
        variable lbit : integer range 0 to WIDTH - 1;
        variable mbit : integer range 0 to WIDTH - 1;
        variable widx : integer range 0 to SIZE - 1;
-        constant sel0 : std_logic_vector(WIDTH/8 - 1 downto 0)
+        constant sel0 : std_logic_vector(WIDTH/BYTEWID - 1 downto 0)
            := (others => '0');
    begin
        if rising_edge(clk) then
@ -49,9 +50,9 @@ begin
                        " dat:" & to_hstring(wr_data);
                end if;
            end if;
-            for i in 0 to WIDTH/8-1 loop
-                lbit := i * 8;
-                mbit := lbit + 7;
+            for i in 0 to WIDTH/BYTEWID-1 loop
+                lbit := i * BYTEWID;
+                mbit := lbit + BYTEWID - 1;
                widx := to_integer(unsigned(wr_addr));
                if wr_sel(i) = '1' then
                    ram(widx)(mbit downto lbit) <= wr_data(mbit downto lbit);
--- a/common.vhdl
+++ b/common.vhdl
@ -246,10 +246,13 @@ package common is
        fetch_failed: std_ulogic;
 	nia: std_ulogic_vector(63 downto 0);
 	insn: std_ulogic_vector(31 downto 0);
+        icode: insn_code;
        big_endian: std_ulogic;
        next_predicted: std_ulogic;
        next_pred_ntaken: std_ulogic;
    end record;
+    constant IcacheToDecode1Init : IcacheToDecode1Type :=
+        (nia => (others => '0'), insn => (others => '0'), icode => INSN_illegal, others => '0');

    type IcacheEventType is record
        icache_miss : std_ulogic;
@ -317,6 +320,9 @@ package common is
 	read_data1: std_ulogic_vector(63 downto 0);
 	read_data2: std_ulogic_vector(63 downto 0);
 	read_data3: std_ulogic_vector(63 downto 0);
+        reg_valid1: std_ulogic;
+        reg_valid2: std_ulogic;
+        reg_valid3: std_ulogic;
 	cr: std_ulogic_vector(31 downto 0);
 	xerc: xer_common_t;
 	lr: std_ulogic;
@ -363,6 +369,7 @@ package common is
 	 is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0',
         byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'),
         read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'),
+         reg_valid1 => '0', reg_valid2 => '0', reg_valid3 => '0',
         cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'),
         result_sel => "000", sub_select => "000",
         repeat => '0', second => '0', spr_select => spr_id_init,
@ -378,12 +385,11 @@ package common is
 	data1: std_ulogic_vector(63 downto 0);
 	data2: std_ulogic_vector(63 downto 0);
        addend: std_ulogic_vector(127 downto 0);
-	is_32bit: std_ulogic;
-        not_result: std_ulogic;
+        is_signed: std_ulogic;
+        subtract: std_ulogic;   -- 0 => addend + data1 * data2, 1 => addend - data1 * data2
    end record;
-    constant MultiplyInputInit : MultiplyInputType := (valid => '0',
-                                                       is_32bit => '0', not_result => '0',
-                                                       others => (others => '0'));
+    constant MultiplyInputInit : MultiplyInputType := (data1 => 64x"0", data2 => 64x"0",
+                                                       addend => 128x"0", others => '0');

    type MultiplyOutputType is record
 	valid: std_ulogic;
@ -476,7 +482,6 @@ package common is
    type Execute1ToLoadstore1Type is record
 	valid : std_ulogic;
        op : insn_type_t;                               -- what ld/st or m[tf]spr or TLB op to do
-        nia : std_ulogic_vector(63 downto 0);
        insn : std_ulogic_vector(31 downto 0);
        instr_tag : instr_tag_t;
 	addr1 : std_ulogic_vector(63 downto 0);
@ -504,7 +509,7 @@ package common is
        (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0',
         sign_extend => '0', update => '0', xerc => xerc_init,
         reserve => '0', rc => '0', virt_mode => '0', priv_mode => '0',
-         nia => (others => '0'), insn => (others => '0'),
+         insn => (others => '0'),
         instr_tag => instr_tag_init,
         addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'),
         write_reg => (others => '0'),
@ -525,8 +530,6 @@ package common is
        dcbz : std_ulogic;
 	nc : std_ulogic;
        reserve : std_ulogic;
-        atomic : std_ulogic;                            -- part of a multi-transfer atomic op
-        atomic_last : std_ulogic;
        virt_mode : std_ulogic;
        priv_mode : std_ulogic;
 	addr : std_ulogic_vector(63 downto 0);
@ -674,6 +677,9 @@ package common is
        fra       : std_ulogic_vector(63 downto 0);
        frb       : std_ulogic_vector(63 downto 0);
        frc       : std_ulogic_vector(63 downto 0);
+        valid_a   : std_ulogic;
+        valid_b   : std_ulogic;
+        valid_c   : std_ulogic;
        frt       : gspr_index_t;
        rc        : std_ulogic;
        m32b      : std_ulogic;
@ -687,6 +693,7 @@ package common is
                                                       insn => (others => '0'), fe_mode => "00", rc => '0',
                                                       fra => (others => '0'), frb => (others => '0'),
                                                       frc => (others => '0'), frt => (others => '0'),
+                                                       valid_a => '0', valid_b => '0', valid_c => '0',
                                                       single => '0', is_signed => '0', out_cr => '0',
                                                       m32b => '0', oe => '0', xerc => xerc_init,
                                                       stall => '0');
--- a/core.vhdl
+++ b/core.vhdl
@ -13,7 +13,6 @@ entity core is
        EX1_BYPASS : boolean := true;
        HAS_FPU : boolean := true;
        HAS_BTC : boolean := true;
-        HAS_SHORT_MULT : boolean := false;
 	ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0');
        LOG_LENGTH : natural := 512;
        ICACHE_NUM_LINES : natural := 64;
@ -246,6 +245,7 @@ begin
    icache_0: entity work.icache
        generic map(
            SIM => SIM,
+            HAS_FPU => HAS_FPU,
            LINE_SIZE => 64,
            NUM_LINES => ICACHE_NUM_LINES,
            NUM_WAYS => ICACHE_NUM_WAYS,
@ -266,7 +266,7 @@ begin
            wishbone_in => wishbone_insn_in,
            wb_snoop_in => wb_snoop_in,
            events => icache_events,
-            log_out => log_data(96 downto 43)
+            log_out => log_data(100 downto 43)
            );

    icache_stall_in <= decode1_busy;
@ -287,7 +287,7 @@ begin
            d_out => decode1_to_decode2,
            f_out => decode1_to_fetch1,
            r_out => decode1_to_register_file,
-            log_out => log_data(109 downto 97)
+            log_out => log_data(113 downto 101)
            );

    decode1_stall_in <= decode2_stall_out;
@ -319,7 +319,7 @@ begin
            writeback_bypass => writeback_bypass,
            dbg_spr_req => dbg_spr_req,
            dbg_spr_addr => dbg_spr_addr,
-            log_out => log_data(119 downto 110)
+            log_out => log_data(123 downto 114)
            );
    decode2_busy_in <= ex1_busy_out;

@ -365,7 +365,6 @@ begin
            SIM => SIM,
            EX1_BYPASS => EX1_BYPASS,
            HAS_FPU => HAS_FPU,
-            HAS_SHORT_MULT => HAS_SHORT_MULT,
            LOG_LENGTH => LOG_LENGTH
            )
        port map (
@ -398,7 +397,7 @@ begin
            dbg_spr_data => dbg_spr_data,
            sim_dump => sim_ex_dump,
            sim_dump_done => sim_cr_dump,
-            log_out => log_data(134 downto 120),
+            log_out => log_data(135 downto 124),
            log_rd_addr => log_rd_addr,
            log_rd_data => log_rd_data,
            log_wr_addr => log_wr_addr
@ -500,7 +499,7 @@ begin
            );

    log_data(150) <= '0';
-    log_data(139 downto 135) <= "00000";
+    log_data(139 downto 136) <= "0000";

    debug_0: entity work.core_debug
        generic map (
--- a/core_debug.vhdl
+++ b/core_debug.vhdl
@ -175,7 +175,8 @@ begin
                gspr_index <= (others => '0');
            else
                if do_log_trigger = '1' or log_trigger_delay /= 0 then
-                    if log_trigger_delay = 255 then
+                    if log_trigger_delay = 255 or
+                        (LOG_LENGTH < 1024 and log_trigger_delay = LOG_LENGTH / 4) then
                        log_dmi_trigger(1) <= '1';
                        log_trigger_delay <= 0;
                    else
--- a/dcache.vhdl
+++ b/dcache.vhdl
@ -1004,10 +1004,10 @@ begin
            -- XXX or if r0.req.nc = '1'
            if r0.req.load = '1' then
                -- load with reservation
-                set_rsrv <= r0.req.atomic_last;
+                set_rsrv <= '1';
            else
                -- store conditional
-                clear_rsrv <= r0.req.atomic_last;
+                clear_rsrv <= '1';
                if reservation.valid = '0' or
                    r0.req.addr(63 downto LINE_OFF_BITS) /= reservation.addr then
                    cancel_store <= '1';
--- a/decode1.vhdl
+++ b/decode1.vhdl
--- a/decode2.vhdl
+++ b/decode2.vhdl
@ -58,10 +58,6 @@ architecture behaviour of decode2 is
        busy : std_ulogic;
        sgl_pipe : std_ulogic;
        prev_sgl : std_ulogic;
-        reg_a_valid : std_ulogic;
-        reg_b_valid : std_ulogic;
-        reg_c_valid : std_ulogic;
-        reg_o_valid : std_ulogic;
        input_ov  : std_ulogic;
        output_ov : std_ulogic;
        read_rspr : std_ulogic;
@ -192,7 +188,7 @@ architecture behaviour of decode2 is
    function decode_rc (t : rc_t; insn_in : std_ulogic_vector(31 downto 0)) return std_ulogic is
    begin
        case t is
-            when RC =>
+            when RC | RCOE =>
                return insn_rc(insn_in);
            when ONE =>
                return '1';
@ -393,6 +389,7 @@ begin
        variable v : reg_type;
        variable length : std_ulogic_vector(3 downto 0);
        variable op : insn_type_t;
+        variable unit : unit_t;
        variable valid_in : std_ulogic;
        variable decctr : std_ulogic;
        variable sprs_busy : std_ulogic;
@ -405,6 +402,7 @@ begin
            v.e := Decode2ToExecute1Init;

            sprs_busy := '0';
+            unit := d_in.decode.unit;

            if d_in.valid = '1' then
                v.prev_sgl := dc2.sgl_pipe;
@ -429,32 +427,40 @@ begin
            end if;
            case d_in.decode.insn_type is
                when OP_ADD | OP_MUL_L64 | OP_DIV | OP_DIVE =>
-                    -- OE field is valid in OP_ADD/OP_MUL_L64 with major opcode 31 only
-                    if d_in.insn(31 downto 26) = "011111" and insn_oe(d_in.insn) = '1' then
+                    if d_in.decode.rc = RCOE and insn_oe(d_in.insn) = '1' then
                        v.e.oe := '1';
                        v.e.output_xer := '1';
                        v.output_ov := '1';
                        v.input_ov := '1';      -- need SO state if setting OV to 0
                    end if;
                when OP_MFSPR =>
-                    if decode_spr_num(d_in.insn) = SPR_XER then
-                        v.input_ov := '1';
-                    end if;
+                    case decode_spr_num(d_in.insn) is
+                        when SPR_XER =>
+                            v.input_ov := '1';
+                        when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PTCR =>
+                            unit := LDST;
+                        when others =>
+                    end case;
                when OP_MTSPR =>
-                    if decode_spr_num(d_in.insn) = SPR_XER then
-                        v.e.output_xer := '1';
-                        v.output_ov := '1';
+                    case decode_spr_num(d_in.insn) is
+                        when SPR_XER =>
+                            v.e.output_xer := '1';
+                            v.output_ov := '1';
+                        when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PTCR =>
+                            unit := LDST;
+                            if d_in.valid = '1' then
+                                v.sgl_pipe := '1';
+                            end if;
+                        when others =>
+                    end case;
+                    if d_in.spr_info.valid = '1' and d_in.valid = '1' then
+                        v.sgl_pipe := '1';
                    end if;
                when OP_CMP | OP_MCRXRX =>
                    v.input_ov := '1';
                when others =>
            end case;

-            v.reg_a_valid := decoded_reg_a.reg_valid;
-            v.reg_b_valid := decoded_reg_b.reg_valid;
-            v.reg_c_valid := decoded_reg_c.reg_valid;
-            v.reg_o_valid := decoded_reg_o.reg_valid;
-
            if d_in.decode.lr = '1' then
                v.e.lr := insn_lk(d_in.insn);
                -- b and bc have even major opcodes; bcreg is considered absolute
@ -537,11 +543,14 @@ begin

            -- execute unit
            v.e.nia := d_in.nia;
-            v.e.unit := d_in.decode.unit;
+            v.e.unit := unit;
            v.e.fac := d_in.decode.facility;
            v.e.read_reg1 := d_in.reg_a;
            v.e.read_reg2 := d_in.reg_b;
            v.e.read_reg3 := d_in.reg_c;
+            v.e.reg_valid1 := decoded_reg_a.reg_valid;
+            v.e.reg_valid2 := decoded_reg_b.reg_valid;
+            v.e.reg_valid3 := decoded_reg_c.reg_valid;
            v.e.write_reg := decoded_reg_o.reg;
            v.e.write_reg_enable := decoded_reg_o.reg_valid;
            v.e.invert_a := d_in.decode.invert_a;
@ -583,16 +592,16 @@ begin
        control_valid_in <= valid_in;
        control_serialize <= v.sgl_pipe or v.prev_sgl;

-        gpr_write_valid <= v.reg_o_valid;
+        gpr_write_valid <= v.e.write_reg_enable;
        gpr_write <= v.e.write_reg;

-        gpr_a_read_valid <= v.reg_a_valid;
+        gpr_a_read_valid <= v.e.reg_valid1;
        gpr_a_read <= v.e.read_reg1;

-        gpr_b_read_valid <= v.reg_b_valid;
+        gpr_b_read_valid <= v.e.reg_valid2;
        gpr_b_read <= v.e.read_reg2;

-        gpr_c_read_valid <= v.reg_c_valid;
+        gpr_c_read_valid <= v.e.reg_valid3;
        gpr_c_read <= v.e.read_reg3;

        cr_write_valid <= v.e.output_cr or v.e.rc;
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@ -4,14 +4,16 @@ use ieee.std_logic_1164.all;
 package decode_types is
    type insn_type_t is (OP_ILLEGAL, OP_NOP, OP_ADD,
 			 OP_AND, OP_ATTN, OP_B, OP_BC, OP_BCREG,
-			 OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB,
+			 OP_BCD, OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB,
 			 OP_CNTZ, OP_CROP,
 			 OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST,
-			 OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS, OP_EXTSWSLI,
-                         OP_FPOP, OP_FPOP_I,
-                         OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC,
+			 OP_DCBZ, OP_ICBI, OP_ICBT,
+                         OP_FP_CMP, OP_FP_ARITH, OP_FP_MOVE, OP_FP_MISC,
+                         OP_DIV, OP_DIVE, OP_MOD,
+                         OP_EXTS, OP_EXTSWSLI,
+                         OP_ISEL, OP_ISYNC,
 			 OP_LOAD, OP_STORE,
-			 OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MOD,
+			 OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR,
 			 OP_MTCRF, OP_MTMSRD, OP_MTSPR, OP_MUL_L64,
 			 OP_MUL_H64, OP_MUL_H32, OP_OR,
 			 OP_POPCNT, OP_PRTY, OP_RFID,
@ -19,15 +21,349 @@ package decode_types is
 			 OP_SHL, OP_SHR,
 			 OP_SYNC, OP_TLBIE, OP_TRAP,
 			 OP_XOR,
-                         OP_BCD, OP_ADDG6S,
+                         OP_ADDG6S,
                         OP_FETCH_FAILED
 			 );
+
+    -- The following list is ordered in such a way that an instruction's place
+    -- in the list tells us some things about which registers it accesses.
+    -- In other words we can decide whether an instruction accesses FPRs and
+    -- whether it has an RB operand by doing simple comparisons of the
+    -- insn_code for the instruction with a few constants.
+    type insn_code is (
+        -- The following instructions don't have an RB operand or access FPRs
+        INSN_illegal, -- 0
+        INSN_fetch_fail,
+        INSN_addi,
+        INSN_addic,
+        INSN_addic_dot,
+        INSN_addis,
+        INSN_addme,
+        INSN_addpcis,
+        INSN_addze,
+        INSN_andi_dot,
+        INSN_andis_dot, -- 10
+        INSN_attn,
+        INSN_b,
+        INSN_bc,
+        INSN_bcctr,
+        INSN_bclr,
+        INSN_bctar,
+        INSN_cbcdtd,
+        INSN_cdtbcd,
+        INSN_cmpi,
+        INSN_cmpli, -- 20
+        INSN_cntlzw,
+        INSN_cntlzd,
+        INSN_cnttzw,
+        INSN_cnttzd,
+        INSN_crand,
+        INSN_crandc,
+        INSN_creqv,
+        INSN_crnand,
+        INSN_crnor,
+        INSN_cror, -- 30
+        INSN_crorc,
+        INSN_crxor,
+        INSN_darn,
+        INSN_eieio,
+        INSN_extsb,
+        INSN_extsh,
+        INSN_extsw,
+        INSN_extswsli,
+        INSN_isync,
+        INSN_lbz, -- 40
+        INSN_lbzu,
+        INSN_ld,
+        INSN_ldu,
+        INSN_lha,
+        INSN_lhau,
+        INSN_lhz,
+        INSN_lhzu,
+        INSN_lwa,
+        INSN_lwz,
+        INSN_lwzu, -- 50
+        INSN_mcrf,
+        INSN_mcrfs,
+        INSN_mcrxrx,
+        INSN_mfcr,
+        INSN_mfmsr,
+        INSN_mfspr,
+        INSN_mtcrf,
+        INSN_mtfsb,
+        INSN_mtfsfi,
+        INSN_mtmsr, -- 60
+        INSN_mtmsrd,
+        INSN_mtspr,
+        INSN_mulli,
+        INSN_neg,
+        INSN_nop,
+        INSN_ori,
+        INSN_oris,
+        INSN_popcntb,
+        INSN_popcntw,
+        INSN_popcntd, -- 70
+        INSN_prtyw,
+        INSN_prtyd,
+        INSN_rfid,
+        INSN_rldic,
+        INSN_rldicl,
+        INSN_rldicr,
+        INSN_rldimi,
+        INSN_rlwimi,
+        INSN_rlwinm,
+        INSN_sc, -- 80
+        INSN_setb,
+        INSN_slbia,
+        INSN_sradi,
+        INSN_srawi,
+        INSN_stb,
+        INSN_stbu,
+        INSN_std,
+        INSN_stdu,
+        INSN_sth,
+        INSN_sthu, -- 90
+        INSN_stw,
+        INSN_stwu,
+        INSN_subfic,
+        INSN_subfme,
+        INSN_subfze,
+        INSN_sync,
+        INSN_tdi,
+        INSN_tlbsync,
+        INSN_twi,
+        INSN_wait, -- 100
+        INSN_xori,
+        INSN_xoris,
+
+        -- pad to 112 to simplify comparison logic
+        INSN_103,
+        INSN_104, INSN_105, INSN_106, INSN_107,
+        INSN_108, INSN_109, INSN_110, INSN_111,
+
+        -- The following instructions have an RB operand but don't access FPRs
+        INSN_add,
+        INSN_addc,
+        INSN_adde,
+        INSN_addex,
+        INSN_addg6s,
+        INSN_and,
+        INSN_andc,
+        INSN_bperm,
+        INSN_cmp, -- 120
+        INSN_cmpb,
+        INSN_cmpeqb,
+        INSN_cmpl,
+        INSN_cmprb,
+        INSN_dcbf,
+        INSN_dcbst,
+        INSN_dcbt,
+        INSN_dcbtst,
+        INSN_dcbz,
+        INSN_divd, -- 130
+        INSN_divdu,
+        INSN_divde,
+        INSN_divdeu,
+        INSN_divw,
+        INSN_divwu,
+        INSN_divwe,
+        INSN_divweu,
+        INSN_eqv,
+        INSN_icbi,
+        INSN_icbt, -- 140
+        INSN_isel,
+        INSN_lbarx,
+        INSN_lbzcix,
+        INSN_lbzux,
+        INSN_lbzx,
+        INSN_ldarx,
+        INSN_ldbrx,
+        INSN_ldcix,
+        INSN_ldx,
+        INSN_ldux, -- 150
+        INSN_lharx,
+        INSN_lhax,
+        INSN_lhaux,
+        INSN_lhbrx,
+        INSN_lhzcix,
+        INSN_lhzx,
+        INSN_lhzux,
+        INSN_lwarx,
+        INSN_lwax,
+        INSN_lwaux, -- 160
+        INSN_lwbrx,
+        INSN_lwzcix,
+        INSN_lwzx,
+        INSN_lwzux,
+        INSN_modsd,
+        INSN_modsw,
+        INSN_moduw,
+        INSN_modud,
+        INSN_mulhw,
+        INSN_mulhwu, -- 170
+        INSN_mulhd,
+        INSN_mulhdu,
+        INSN_mullw,
+        INSN_mulld,
+        INSN_nand,
+        INSN_nor,
+        INSN_or,
+        INSN_orc,
+        INSN_rldcl,
+        INSN_rldcr, -- 180
+        INSN_rlwnm,
+        INSN_slw,
+        INSN_sld,
+        INSN_sraw,
+        INSN_srad,
+        INSN_srw,
+        INSN_srd,
+        INSN_stbcix,
+        INSN_stbcx,
+        INSN_stbx, -- 190
+        INSN_stbux,
+        INSN_stdbrx,
+        INSN_stdcix,
+        INSN_stdcx,
+        INSN_stdx,
+        INSN_stdux,
+        INSN_sthbrx,
+        INSN_sthcix,
+        INSN_sthcx,
+        INSN_sthx, -- 200
+        INSN_sthux,
+        INSN_stwbrx,
+        INSN_stwcix,
+        INSN_stwcx,
+        INSN_stwx,
+        INSN_stwux,
+        INSN_subf,
+        INSN_subfc,
+        INSN_subfe,
+        INSN_td, -- 210
+        INSN_tlbie,
+        INSN_tlbiel,
+        INSN_tw,
+        INSN_xor,
+
+        -- pad to 224 to simplify comparison logic
+        INSN_215,
+        INSN_216, INSN_217, INSN_218, INSN_219,
+        INSN_220, INSN_221, INSN_222, INSN_223,
+
+        -- The following instructions have a third input addressed by RC
+        INSN_maddld,
+        INSN_maddhd,
+        INSN_maddhdu,
+
+        -- pad to 256 to simplify comparison logic
+        INSN_227,
+        INSN_228, INSN_229, INSN_230, INSN_231,
+        INSN_232, INSN_233, INSN_234, INSN_235,
+        INSN_236, INSN_237, INSN_238, INSN_239,
+        INSN_240, INSN_241, INSN_242, INSN_243,
+        INSN_244, INSN_245, INSN_246, INSN_247,
+        INSN_248, INSN_249, INSN_250, INSN_251,
+        INSN_252, INSN_253, INSN_254, INSN_255,
+
+        -- The following instructions access floating-point registers
+        -- These ones have an FRS operand, but RA/RB are GPRs
+        INSN_stfd,
+        INSN_stfdu,
+        INSN_stfs,
+        INSN_stfsu,
+        INSN_stfdux, -- 260
+        INSN_stfdx,
+        INSN_stfiwx,
+        INSN_stfsux,
+        INSN_stfsx,
+        -- These ones don't actually have an FRS operand (rather an FRT destination)
+        -- but are here so that all FP instructions are >= INSN_first_frs.
+        INSN_lfd,
+        INSN_lfdu,
+        INSN_lfs,
+        INSN_lfsu,
+        INSN_lfdx,
+        INSN_lfdux, -- 270
+        INSN_lfiwax,
+        INSN_lfiwzx,
+        INSN_lfsx,
+        INSN_lfsux,
+        INSN_275, -- padding
+
+        -- The following instructions access FRA and/or FRB operands
+        INSN_fabs,
+        INSN_fadd,
+        INSN_fadds,
+        INSN_fcfid,
+        INSN_fcfids, -- 280
+        INSN_fcfidu,
+        INSN_fcfidus,
+        INSN_fcmpo,
+        INSN_fcmpu,
+        INSN_fcpsgn,
+        INSN_fctid,
+        INSN_fctidz,
+        INSN_fctidu,
+        INSN_fctiduz,
+        INSN_fctiw, -- 290
+        INSN_fctiwz,
+        INSN_fctiwu,
+        INSN_fctiwuz,
+        INSN_fdiv,
+        INSN_fdivs,
+        INSN_fmr,
+        INSN_fmrgew,
+        INSN_fmrgow,
+        INSN_fnabs,
+        INSN_fneg, -- 300
+        INSN_fre,
+        INSN_fres,
+        INSN_frim,
+        INSN_frin,
+        INSN_frip,
+        INSN_friz,
+        INSN_frsp,
+        INSN_frsqrte,
+        INSN_frsqrtes,
+        INSN_fsqrt, -- 310
+        INSN_fsqrts,
+        INSN_fsub,
+        INSN_fsubs,
+        INSN_ftdiv,
+        INSN_ftsqrt,
+        INSN_mffs,
+        INSN_mtfsf,
+
+        -- pad to 320
+        INSN_318, INSN_319,
+
+        -- The following instructions access FRA, FRB (possibly) and FRC operands
+        INSN_fmul, -- 320
+        INSN_fmuls,
+        INSN_fmadd,
+        INSN_fmadds,
+        INSN_fmsub,
+        INSN_fmsubs,
+        INSN_fnmadd,
+        INSN_fnmadds,
+        INSN_fnmsub,
+        INSN_fnmsubs,
+        INSN_fsel  -- 330
+        );
+
+    constant INSN_first_rb : insn_code := INSN_add;
+    constant INSN_first_rc : insn_code := INSN_maddld;
+    constant INSN_first_frs : insn_code := INSN_stfd;
+    constant INSN_first_frab : insn_code := INSN_fabs;
+    constant INSN_first_frabc : insn_code := INSN_fmul;
+
    type input_reg_a_t is (NONE, RA, RA_OR_ZERO, CIA, FRA);
    type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD,
                           CONST_DXHI4, CONST_DS, CONST_DQ, CONST_M1, CONST_SH, CONST_SH32, FRB);
    type input_reg_c_t is (NONE, RS, RCR, FRC, FRS);
    type output_reg_a_t is (NONE, RT, RA, FRT);
-    type rc_t is (NONE, ONE, RC);
+    type rc_t is (NONE, ONE, RC, RCOE);
    type carry_in_t is (ZERO, CA, OV, ONE);

    constant SH_OFFSET : integer := 0;
--- a/execute1.vhdl
+++ b/execute1.vhdl
@ -15,7 +15,6 @@ entity execute1 is
        SIM : boolean := false;
        EX1_BYPASS : boolean := true;
        HAS_FPU : boolean := true;
-        HAS_SHORT_MULT : boolean := false;
        -- Non-zero to enable log data collection
        LOG_LENGTH : natural := 0
        );
@ -65,7 +64,7 @@ entity execute1 is
        sim_dump      : in std_ulogic;
        sim_dump_done : out std_ulogic;

-        log_out : out std_ulogic_vector(14 downto 0);
+        log_out : out std_ulogic_vector(11 downto 0);
        log_rd_addr : out std_ulogic_vector(31 downto 0);
        log_rd_data : in std_ulogic_vector(63 downto 0);
        log_wr_addr : in std_ulogic_vector(31 downto 0)
@ -85,6 +84,7 @@ architecture behaviour of execute1 is
        write_pmuspr : std_ulogic;
        ramspr_write_even : std_ulogic;
        ramspr_write_odd : std_ulogic;
+        mult_32s : std_ulogic;
    end record;
    constant side_effect_init : side_effect_type := (others => '0');

@ -203,6 +203,8 @@ architecture behaviour of execute1 is
    -- multiply signals
    signal x_to_multiply: MultiplyInputType;
    signal multiply_to_x: MultiplyOutputType;
+    signal x_to_mult_32s: MultiplyInputType;
+    signal mult_32s_to_x: MultiplyOutputType;

    -- divider signals
    signal x_to_divider: Execute1ToDividerType;
@ -411,6 +413,14 @@ begin
            m_out => multiply_to_x
            );

+    mult_32s_0: entity work.multiply_32s
+        port map (
+            clk => clk,
+            stall => stage2_stall,
+            m_in => x_to_mult_32s,
+            m_out => mult_32s_to_x
+            );
+
    divider_0: if not HAS_FPU generate
        div_0: entity work.divider
            port map (
@ -437,17 +447,6 @@ begin
            p_out => pmu_to_x
            );

-    short_mult_0: if HAS_SHORT_MULT generate
-    begin
-        short_mult: entity work.short_multiply
-        port map (
-            clk => clk,
-            a_in => a_in(15 downto 0),
-            b_in => b_in(15 downto 0),
-            m_out => mshort_p
-            );
-    end generate;
-
    dbg_ctrl_out <= ctrl;
    log_rd_addr <= ex2.log_addr_spr;

@ -684,78 +683,82 @@ begin
        overflow_32 <= calc_ov(a_inv(31), b_in(31), carry_32, sum_with_carry(31));
        overflow_64 <= calc_ov(a_inv(63), b_in(63), carry_64, sum_with_carry(63));

-        -- signals to multiply and divide units
-        sign1 := '0';
-        sign2 := '0';
-        if e_in.is_signed = '1' then
-            if e_in.is_32bit = '1' then
-                sign1 := a_in(31);
-                sign2 := b_in(31);
-            else
-                sign1 := a_in(63);
-                sign2 := b_in(63);
-            end if;
-        end if;
-        -- take absolute values
-        if sign1 = '0' then
-            abs1 := signed(a_in);
-        else
-            abs1 := - signed(a_in);
-        end if;
-        if sign2 = '0' then
-            abs2 := signed(b_in);
-        else
-            abs2 := - signed(b_in);
-        end if;
-
-        -- Interface to multiply and divide units
-        x_to_divider.is_signed <= e_in.is_signed;
-	x_to_divider.is_32bit <= e_in.is_32bit;
-        x_to_divider.is_extended <= '0';
-        x_to_divider.is_modulus <= '0';
-        if e_in.insn_type = OP_MOD then
-            x_to_divider.is_modulus <= '1';
-        end if;
-        x_to_divider.flush <= flush_in;
-
+        -- signals to multiplier
        addend := (others => '0');
-        if e_in.insn(26) = '0' then
+        if e_in.reg_valid3 = '1' then
            -- integer multiply-add, major op 4 (if it is a multiply)
            addend(63 downto 0) := c_in;
            if e_in.is_signed = '1' then
                addend(127 downto 64) := (others => c_in(63));
            end if;
        end if;
-        if (sign1 xor sign2) = '1' then
-            addend := not addend;
-        end if;
-
-	x_to_multiply.is_32bit <= e_in.is_32bit;
-        x_to_multiply.not_result <= sign1 xor sign2;
+        x_to_multiply.data1 <= std_ulogic_vector(a_in);
+        x_to_multiply.data2 <= std_ulogic_vector(b_in);
+        x_to_multiply.is_signed <= e_in.is_signed;
+        x_to_multiply.subtract <= '0';
        x_to_multiply.addend <= addend;
-        x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus);
-        if e_in.is_32bit = '0' then
-            -- 64-bit forms
-            x_to_multiply.data1 <= std_ulogic_vector(abs1);
-            x_to_multiply.data2 <= std_ulogic_vector(abs2);
-            if e_in.insn_type = OP_DIVE then
-                x_to_divider.is_extended <= '1';
+
+        -- Interface to divide unit
+        if not HAS_FPU then
+            sign1 := '0';
+            sign2 := '0';
+            if e_in.is_signed = '1' then
+                if e_in.is_32bit = '1' then
+                    sign1 := a_in(31);
+                    sign2 := b_in(31);
+                else
+                    sign1 := a_in(63);
+                    sign2 := b_in(63);
+                end if;
            end if;
-            x_to_divider.dividend <= std_ulogic_vector(abs1);
-            x_to_divider.divisor <= std_ulogic_vector(abs2);
-        else
-            -- 32-bit forms
-            x_to_multiply.data1 <= x"00000000" & std_ulogic_vector(abs1(31 downto 0));
-            x_to_multiply.data2 <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
+            -- take absolute values
+            if sign1 = '0' then
+                abs1 := signed(a_in);
+            else
+                abs1 := - signed(a_in);
+            end if;
+            if sign2 = '0' then
+                abs2 := signed(b_in);
+            else
+                abs2 := - signed(b_in);
+            end if;
+
+            x_to_divider.is_signed <= e_in.is_signed;
+            x_to_divider.is_32bit <= e_in.is_32bit;
            x_to_divider.is_extended <= '0';
-            if e_in.insn_type = OP_DIVE then   -- extended forms
-                x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000";
+            x_to_divider.is_modulus <= '0';
+            if e_in.insn_type = OP_MOD then
+                x_to_divider.is_modulus <= '1';
+            end if;
+            x_to_divider.flush <= flush_in;
+            x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus);
+            if e_in.is_32bit = '0' then
+                -- 64-bit forms
+                if e_in.insn_type = OP_DIVE then
+                    x_to_divider.is_extended <= '1';
+                end if;
+                x_to_divider.dividend <= std_ulogic_vector(abs1);
+                x_to_divider.divisor <= std_ulogic_vector(abs2);
            else
-                x_to_divider.dividend <= x"00000000" & std_ulogic_vector(abs1(31 downto 0));
+                -- 32-bit forms
+                x_to_divider.is_extended <= '0';
+                if e_in.insn_type = OP_DIVE then   -- extended forms
+                    x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000";
+                else
+                    x_to_divider.dividend <= x"00000000" & std_ulogic_vector(abs1(31 downto 0));
+                end if;
+                x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
            end if;
-            x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
        end if;

+        -- signals to 32-bit multiplier
+        x_to_mult_32s.data1 <= 32x"0" & a_in(31 downto 0);
+        x_to_mult_32s.data2 <= 32x"0" & b_in(31 downto 0);
+        x_to_mult_32s.is_signed <= e_in.is_signed;
+        -- The following are unused, but set here to avoid X states
+        x_to_mult_32s.subtract <= '0';
+        x_to_mult_32s.addend <= (others => '0');
+
        shortmul_result <= std_ulogic_vector(resize(signed(mshort_p), 64));
        case ex1.mul_select is
            when "00" =>
@ -1271,13 +1274,10 @@ begin
 		v.se.icache_inval := '1';

 	    when OP_MUL_L64 =>
-                if HAS_SHORT_MULT and e_in.insn(26) = '1' and
-                    fits_in_n_bits(a_in, 16) and fits_in_n_bits(b_in, 16) then
-                    -- Operands fit into 16 bits, so use short multiplier
-                    if e_in.oe = '1' then
-                        -- Note 16x16 multiply can't overflow, even for mullwo
-                        set_ov(v.e, '0', '0');
-                    end if;
+                if e_in.is_32bit = '1' then
+                    v.se.mult_32s := '1';
+                    v.res2_sel := "00";
+                    slow_op := '1';
                else
                    -- Use standard multiplier
                    v.start_mul := '1';
@ -1285,11 +1285,16 @@ begin
                    owait := '1';
                end if;

-	    when OP_MUL_H64 | OP_MUL_H32 =>
+	    when OP_MUL_H64 =>
                v.start_mul := '1';
                slow_op := '1';
                owait := '1';

+            when OP_MUL_H32 =>
+                v.se.mult_32s := '1';
+                v.res2_sel := "01";
+                slow_op := '1';
+
 	    when OP_DIV | OP_DIVE | OP_MOD =>
                if not HAS_FPU then
                    v.start_div := '1';
@ -1370,6 +1375,7 @@ begin
        fv := Execute1ToFPUInit;

        x_to_multiply.valid <= '0';
+        x_to_mult_32s.valid <= '0';
        x_to_divider.valid <= '0';
        v.ext_interrupt := '0';
        v.taken_branch_event := '0';
@ -1456,6 +1462,7 @@ begin
            v.res2_sel := actions.res2_sel;
            v.msr := actions.new_msr;
            x_to_multiply.valid <= actions.start_mul;
+            x_to_mult_32s.valid <= actions.se.mult_32s;
            v.mul_in_progress := actions.start_mul;
            x_to_divider.valid <= actions.start_div;
            v.div_in_progress := actions.start_div;
@ -1481,7 +1488,7 @@ begin
            end if;
        end if;

-        if ex1.div_in_progress = '1' then
+        if not HAS_FPU and ex1.div_in_progress = '1' then
            v.div_in_progress := not divider_to_x.valid;
            v.busy := not divider_to_x.valid;
            if divider_to_x.valid = '1' and ex1.oe = '1' then
@ -1554,7 +1561,6 @@ begin

        -- Outputs to loadstore1 (async)
        lv.op := e_in.insn_type;
-        lv.nia := e_in.nia;
        lv.instr_tag := e_in.instr_tag;
        lv.addr1 := a_in;
        lv.addr2 := b_in;
@ -1568,11 +1574,9 @@ begin
        lv.reserve := e_in.reserve;
        lv.rc := e_in.rc;
        lv.insn := e_in.insn;
-        -- decode l*cix and st*cix instructions here
-        if e_in.insn(31 downto 26) = "011111" and e_in.insn(10 downto 9) = "11" and
-            e_in.insn(5 downto 1) = "10101" then
-            lv.ci := '1';
-        end if;
+        -- invert_a field is overloaded for load/store instructions
+        -- to mark l*cix and st*cix
+        lv.ci := e_in.invert_a;
        lv.virt_mode := ex1.msr(MSR_DR);
        lv.priv_mode := not ex1.msr(MSR_PR);
        lv.mode_32bit := not ex1.msr(MSR_SF);
@ -1591,6 +1595,9 @@ begin
        fv.fra := a_in;
        fv.frb := b_in;
        fv.frc := c_in;
+        fv.valid_a := e_in.reg_valid1;
+        fv.valid_b := e_in.reg_valid2;
+        fv.valid_c := e_in.reg_valid3;
        fv.frt := e_in.write_reg;
        fv.rc := e_in.rc;
        fv.out_cr := e_in.output_cr;
@ -1624,11 +1631,6 @@ begin
    -- Second execute stage control
    execute2_1: process(all)
 	variable v : reg_stage2_type;
-	variable overflow : std_ulogic;
-        variable lv : Execute1ToLoadstore1Type;
-        variable fv : Execute1ToFPUType;
-        variable k : integer;
-        variable go : std_ulogic;
        variable bypass_valid : std_ulogic;
        variable rcresult : std_ulogic_vector(63 downto 0);
        variable sprres : std_ulogic_vector(63 downto 0);
@ -1647,6 +1649,14 @@ begin
            v.br_mispredict := ex1.br_mispredict;
        end if;

+        if ex1.se.mult_32s = '1' and ex1.oe = '1' then
+            v.e.xerc.ov := mult_32s_to_x.overflow;
+            v.e.xerc.ov32 := mult_32s_to_x.overflow;
+            if mult_32s_to_x.overflow = '1' then
+                v.e.xerc.so := '1';
+            end if;
+        end if;
+
 	ctrl_tmp <= ctrl;
 	-- FIXME: run at 512MHz not core freq
 	ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
@ -1667,24 +1677,34 @@ begin
            v.e.write_xerc_enable := '0';
            v.e.redirect := '0';
            v.e.br_last := '0';
-            v.se := side_effect_init;
            v.taken_branch_event := '0';
            v.br_mispredict := '0';
        end if;
        if flush_in = '1' then
            v.e.valid := '0';
            v.e.interrupt := '0';
+            v.se := side_effect_init;
            v.ext_interrupt := '0';
        end if;

        -- This is split like this because mfspr doesn't have an Rc bit,
        -- and we don't want the zero-detect logic to be after the
        -- SPR mux for timing reasons.
-        if ex1.res2_sel(0) = '0' then
+        if ex1.se.mult_32s = '1' then
+            if ex1.res2_sel(0) = '0' then
+                rcresult := mult_32s_to_x.result(63 downto 0);
+            else
+                rcresult := mult_32s_to_x.result(63 downto 32) &
+                            mult_32s_to_x.result(63 downto 32);
+            end if;
+        elsif ex1.res2_sel(0) = '0' then
            rcresult := ex1.e.write_data;
-            sprres := spr_result;
        else
            rcresult := countbits_result;
+        end if;
+        if ex1.res2_sel(0) = '0' then
+            sprres := spr_result;
+        else
            sprres := pmu_to_x.spr_val;
        end if;
        if ex1.res2_sel(1) = '0' then
@ -1708,7 +1728,7 @@ begin
            cr_res(31) := sign;
            cr_res(30) := not (sign or zero);
            cr_res(29) := zero;
-            cr_res(28) := ex1.e.xerc.so;
+            cr_res(28) := v.e.xerc.so;
            cr_mask(7) := '1';
        end if;

@ -1802,7 +1822,7 @@ begin
    end generate;

    e1_log: if LOG_LENGTH > 0 generate
-        signal log_data : std_ulogic_vector(14 downto 0);
+        signal log_data : std_ulogic_vector(11 downto 0);
    begin
        ex1_log : process(clk)
        begin
@ -1812,7 +1832,6 @@ begin
                            exception_log &
                            irq_valid_log &
                            interrupt_in.intr &
-                            "000" &
                            ex2.e.write_enable &
                            ex2.e.valid &
                            (ex2.e.redirect or ex2.e.interrupt) &
--- a/fpga/top-arty.vhdl
+++ b/fpga/top-arty.vhdl
@ -16,7 +16,6 @@ entity toplevel is
        CLK_FREQUENCY      : positive := 100000000;
        HAS_FPU            : boolean  := true;
        HAS_BTC            : boolean  := true;
-        HAS_SHORT_MULT     : boolean  := false;
        USE_LITEDRAM       : boolean  := false;
        NO_BRAM            : boolean  := false;
        DISABLE_FLATTEN_CORE : boolean := false;
@ -199,7 +198,6 @@ begin
            CLK_FREQ           => CLK_FREQUENCY,
            HAS_FPU            => HAS_FPU,
            HAS_BTC            => HAS_BTC,
-            HAS_SHORT_MULT     => HAS_SHORT_MULT,
            HAS_DRAM           => USE_LITEDRAM,
            DRAM_SIZE          => 256 * 1024 * 1024,
            DRAM_INIT_SIZE     => PAYLOAD_SIZE,
--- a/fpga/top-generic.vhdl
+++ b/fpga/top-generic.vhdl
@ -13,7 +13,6 @@ entity toplevel is
 	CLK_FREQUENCY : positive := 100000000;
        HAS_FPU       : boolean  := true;
        HAS_BTC       : boolean  := false;
-        HAS_SHORT_MULT: boolean  := false;
        ICACHE_NUM_LINES : natural := 64;
        LOG_LENGTH    : natural := 512;
 	DISABLE_FLATTEN_CORE : boolean := false;
@ -75,7 +74,6 @@ begin
 	    CLK_FREQ      => CLK_FREQUENCY,
            HAS_FPU       => HAS_FPU,
            HAS_BTC       => HAS_BTC,
-            HAS_SHORT_MULT => HAS_SHORT_MULT,
 	    ICACHE_NUM_LINES => ICACHE_NUM_LINES,
            LOG_LENGTH    => LOG_LENGTH,
 	    DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE,
--- a/fpga/top-nexys-video.vhdl
+++ b/fpga/top-nexys-video.vhdl
@ -16,7 +16,6 @@ entity toplevel is
 	CLK_FREQUENCY : positive := 100000000;
        HAS_FPU       : boolean  := true;
        HAS_BTC       : boolean  := true;
-        HAS_SHORT_MULT: boolean  := false;
 	USE_LITEDRAM  : boolean  := false;
 	NO_BRAM       : boolean  := false;
 	DISABLE_FLATTEN_CORE : boolean := false;
@ -175,7 +174,6 @@ begin
 	    CLK_FREQ      => CLK_FREQUENCY,
            HAS_FPU       => HAS_FPU,
            HAS_BTC       => HAS_BTC,
-            HAS_SHORT_MULT=> HAS_SHORT_MULT,
 	    HAS_DRAM      => USE_LITEDRAM,
 	    DRAM_SIZE     => 512 * 1024 * 1024,
            DRAM_INIT_SIZE => PAYLOAD_SIZE,
--- a/fpga/top-orangecrab0.2.vhdl
+++ b/fpga/top-orangecrab0.2.vhdl
@ -188,7 +188,6 @@ begin
            HAS_UART1          => HAS_UART1,
            HAS_SD_CARD        => USE_LITESDCARD,
            ICACHE_NUM_LINES   => ICACHE_NUM_LINES,
-            HAS_SHORT_MULT     => true,
            NGPIO              => NGPIO
            )
        port map (
--- a/fpga/top-wukong-v2.vhdl
+++ b/fpga/top-wukong-v2.vhdl
@ -16,7 +16,6 @@ entity toplevel is
        CLK_FREQUENCY      : positive := 100000000;
        HAS_FPU            : boolean  := true;
        HAS_BTC            : boolean  := true;
-        HAS_SHORT_MULT     : boolean  := false;
        USE_LITEDRAM       : boolean  := false;
        NO_BRAM            : boolean  := false;
        DISABLE_FLATTEN_CORE : boolean := false;
@ -175,7 +174,6 @@ begin
            CLK_FREQ           => CLK_FREQUENCY,
            HAS_FPU            => HAS_FPU,
            HAS_BTC            => HAS_BTC,
-            HAS_SHORT_MULT     => HAS_SHORT_MULT,
            HAS_DRAM           => USE_LITEDRAM,
            DRAM_SIZE          => 256 * 1024 * 1024,
            DRAM_INIT_SIZE     => PAYLOAD_SIZE,
--- a/fpu.vhdl
+++ b/fpu.vhdl
--- a/icache.vhdl
+++ b/icache.vhdl
@ -23,6 +23,7 @@ use ieee.numeric_std.all;
 library work;
 use work.utils.all;
 use work.common.all;
+use work.decode_types.all;
 use work.wishbone_types.all;

 -- 64 bit direct mapped icache. All instructions are 4B aligned.
@ -30,6 +31,7 @@ use work.wishbone_types.all;
 entity icache is
    generic (
        SIM : boolean := false;
+        HAS_FPU : boolean := true;
        -- Line size in bytes
        LINE_SIZE : positive := 64;
        -- BRAM organisation: We never access more than wishbone_data_bits at
@ -69,7 +71,7 @@ entity icache is
        wb_snoop_in  : in wishbone_master_out := wishbone_master_out_init;

        events       : out IcacheEventType;
-        log_out      : out std_ulogic_vector(53 downto 0)
+        log_out      : out std_ulogic_vector(57 downto 0)
        );
 end entity icache;

@ -122,8 +124,20 @@ architecture rtl of icache is
    subtype way_t is integer range 0 to NUM_WAYS-1;
    subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);

+    -- We store a pre-decoded 10-bit insn_code along with the bottom 26 bits of
+    -- each instruction, giving a total of 36 bits per instruction, which
+    -- fits neatly into the block RAMs available on FPGAs.
+    -- For illegal instructions, the top 4 bits are ones and the bottom 6 bits
+    -- are the instruction's primary opcode, so we have the whole instruction
+    -- word available (e.g. to put in HEIR).  For other instructions, the
+    -- primary opcode is not stored but could be determined from the insn_code.
+    constant PREDECODE_BITS : natural := 10;
+    constant INSN_IMAGE_BITS : natural := 26;
+    constant ICWORDLEN : natural := PREDECODE_BITS + INSN_IMAGE_BITS;
+    constant ROW_WIDTH : natural := INSN_PER_ROW * ICWORDLEN;
+
    -- The cache data BRAM organized as described above for each way
-    subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
+    subtype cache_row_t is std_ulogic_vector(ROW_WIDTH-1 downto 0);

    -- The cache tags LUTRAM has a row per set. Vivado is a pain and will
    -- not handle a clean (commented) definition of the cache tags as a 3d
@ -184,6 +198,8 @@ architecture rtl of icache is
        wb               : wishbone_master_out;
 	store_way        : way_t;
        store_index      : index_t;
+        recv_row         : row_t;
+        recv_valid       : std_ulogic;
 	store_row        : row_t;
        store_tag        : cache_tag_t;
        store_valid      : std_ulogic;
@ -214,7 +230,9 @@ architecture rtl of icache is

    -- Cache RAM interface
    type cache_ram_out_t is array(way_t) of cache_row_t;
-    signal cache_out   : cache_ram_out_t;
+    signal cache_out     : cache_ram_out_t;
+    signal cache_wr_data : std_ulogic_vector(ROW_WIDTH - 1 downto 0);
+    signal wb_rd_data    : std_ulogic_vector(ROW_SIZE_BITS - 1 downto 0);

    -- PLRU output interface
    type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0);
@ -226,6 +244,8 @@ architecture rtl of icache is
    signal snoop_index : index_t;
    signal snoop_hits  : cache_way_valids_t;

+    signal log_insn : std_ulogic_vector(35 downto 0);
+
    -- Return the cache line index (tag index) for an address
    function get_index(addr: std_ulogic_vector) return index_t is
    begin
@ -293,7 +313,7 @@ architecture rtl of icache is
 	variable word: integer range 0 to INSN_PER_ROW-1;
    begin
        word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
-	return data(31+word*32 downto word*32);
+	return data(word * ICWORDLEN + ICWORDLEN - 1 downto word * ICWORDLEN);
    end;

    -- Get the tag value from the address
@ -327,6 +347,34 @@ architecture rtl of icache is

 begin

+    -- byte-swap read data if big endian
+    process(all)
+        variable j: integer;
+    begin
+        if r.store_tag(TAG_BITS - 1) = '0' then
+            wb_rd_data <= wishbone_in.dat;
+        else
+            for ii in 0 to (wishbone_in.dat'length / 8) - 1 loop
+                j := ((ii / 4) * 4) + (3 - (ii mod 4));
+                wb_rd_data(ii * 8 + 7 downto ii * 8) <= wishbone_in.dat(j * 8 + 7 downto j * 8);
+            end loop;
+        end if;
+    end process;
+
+    predecoder_0: entity work.predecoder
+        generic map (
+            HAS_FPU => HAS_FPU,
+            WIDTH => INSN_PER_ROW,
+            ICODE_LEN => PREDECODE_BITS,
+            IMAGE_LEN => INSN_IMAGE_BITS
+            )
+        port map (
+            clk => clk,
+            valid_in => wishbone_in.ack,
+            insns_in => wb_rd_data,
+            icodes_out => cache_wr_data
+            );
+
    assert LINE_SIZE mod ROW_SIZE = 0;
    assert ispow2(LINE_SIZE)    report "LINE_SIZE not power of 2" severity FAILURE;
    assert ispow2(NUM_LINES)    report "NUM_LINES not power of 2" severity FAILURE;
@ -367,13 +415,13 @@ begin
 	signal rd_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
 	signal wr_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
 	signal dout     : cache_row_t;
-	signal wr_sel   : std_ulogic_vector(ROW_SIZE-1 downto 0);
-        signal wr_dat   : std_ulogic_vector(wishbone_in.dat'left downto 0);
+	signal wr_sel   : std_ulogic_vector(0 downto 0);
    begin
 	way: entity work.cache_ram
 	    generic map (
 		ROW_BITS => ROW_BITS,
-		WIDTH => ROW_SIZE_BITS
+		WIDTH => ROW_WIDTH,
+                BYTEWID => ROW_WIDTH
 		)
 	    port map (
 		clk     => clk,
@ -382,31 +430,19 @@ begin
 		rd_data => dout,
 		wr_sel  => wr_sel,
 		wr_addr => wr_addr,
-		wr_data => wr_dat
+		wr_data => cache_wr_data
 		);
 	process(all)
-            variable j: integer;
 	begin
-            -- byte-swap read data if big endian
-            if r.store_tag(TAG_BITS - 1) = '0' then
-                wr_dat <= wishbone_in.dat;
-            else
-                for ii in 0 to (wishbone_in.dat'length / 8) - 1 loop
-                    j := ((ii / 4) * 4) + (3 - (ii mod 4));
-                    wr_dat(ii * 8 + 7 downto ii * 8) <= wishbone_in.dat(j * 8 + 7 downto j * 8);
-                end loop;
-            end if;
 	    do_read <= not stall_in;
 	    do_write <= '0';
-	    if wishbone_in.ack = '1' and replace_way = i then
+	    if r.recv_valid = '1' and r.store_way = i then
 		do_write <= '1';
 	    end if;
 	    cache_out(i) <= dout;
 	    rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
 	    wr_addr <= std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
-            for ii in 0 to ROW_SIZE-1 loop
-                wr_sel(ii) <= do_write;
-            end loop;
+            wr_sel(0) <= do_write;
 	end process;
    end generate;
    
@ -515,6 +551,8 @@ begin
    icache_comb : process(all)
 	variable is_hit  : std_ulogic;
 	variable hit_way : way_t;
+        variable insn    : std_ulogic_vector(ICWORDLEN - 1 downto 0);
+        variable icode   : insn_code;
    begin
 	-- Extract line, row and tag from request
 	if not is_X(i_in.nia) then
@ -575,11 +613,19 @@ begin
 	--       I prefer not to do just yet as it would force fetch2 to know about
 	--       some of the cache geometry information.
 	--
+        insn := (others => '0');
+        icode := INSN_illegal;
 	if r.hit_valid = '1' then
-	    i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
-	else
-            i_out.insn <= (others => '0');
+            insn := read_insn_word(r.hit_nia, cache_out(r.hit_way));
+            -- Currently we use only the top bit for indicating illegal
+            -- instructions because we know that insn_codes fit into 9 bits.
+            if insn(ICWORDLEN - 1) = '0' then
+                icode := insn_code'val(to_integer(unsigned(insn(ICWORDLEN-1 downto INSN_IMAGE_BITS))));
+            end if;
 	end if;
+        i_out.insn <= insn(31 downto 0);
+        i_out.icode <= icode;
+        log_insn <= cache_wr_data(35 downto 0);
 	i_out.valid <= r.hit_valid;
 	i_out.nia <= r.hit_nia;
 	i_out.stop_mark <= r.hit_smark;
@ -640,9 +686,11 @@ begin
        variable snoop_addr : real_addr_t;
        variable snoop_tag : cache_tag_t;
        variable snoop_cache_tags : cache_tags_set_t;
+        variable replace_way : way_t;
    begin
        if rising_edge(clk) then
            ev.icache_miss <= '0';
+            r.recv_valid <= '0';
 	    -- On reset, clear all valid bits to force misses
            if rst = '1' then
 		for i in index_t loop
@ -714,13 +762,13 @@ begin
                            " IR:" & std_ulogic'image(i_in.virt_mode) &
 			    " SM:" & std_ulogic'image(i_in.stop_mark) &
 			    " idx:" & integer'image(req_index) &
-			    " way:" & integer'image(replace_way) &
 			    " tag:" & to_hstring(req_tag) &
                            " RA:" & to_hstring(real_addr);
                        ev.icache_miss <= '1';

 			-- Keep track of our index and way for subsequent stores
 			r.store_index <= req_index;
+                        r.recv_row <= get_row(req_raddr);
 			r.store_row <= get_row(req_raddr);
                        r.store_tag <= req_tag;
                        r.store_valid <= '1';
@ -740,6 +788,7 @@ begin
 		when CLR_TAG | WAIT_ACK =>
                    if r.state = CLR_TAG then
                        -- Get victim way from plru
+                        replace_way := to_integer(unsigned(plru_victim(r.store_index)));
 			r.store_way <= replace_way;

 			-- Force misses on that way while reloading that line
@ -757,6 +806,19 @@ begin
                        r.state <= WAIT_ACK;
                    end if;

+                    -- If we are writing in this cycle, mark row valid and see if we are done
+                    if r.recv_valid = '1' then
+                        r.rows_valid(r.store_row mod ROW_PER_LINE) <= not inval_in;
+			if is_last_row(r.store_row, r.end_row_ix) then
+			    -- Cache line is now valid
+			    cache_valids(r.store_index)(r.store_way) <= r.store_valid and not inval_in;
+			    -- We are done
+			    r.state <= IDLE;
+			end if;
+			-- Advance store row counter to the current receive row
+			r.store_row <= r.recv_row;
+                    end if;
+
 		    -- If we are still sending requests, was one accepted ?
 		    if wishbone_in.stall = '0' and r.wb.stb = '1' then
 			-- That was the last word ? We are done sending. Clear stb.
@ -777,33 +839,27 @@ begin

 		    -- Incoming acks processing
 		    if wishbone_in.ack = '1' then
-                        r.rows_valid(r.store_row mod ROW_PER_LINE) <= not inval_in;
 			-- Check for completion
-			if is_last_row(r.store_row, r.end_row_ix) then
+			if is_last_row(r.recv_row, r.end_row_ix) then
 			    -- Complete wishbone cycle
 			    r.wb.cyc <= '0';
-
-			    -- Cache line is now valid
-			    cache_valids(r.store_index)(replace_way) <= r.store_valid and not inval_in;
-
-			    -- We are done
-			    r.state <= IDLE;
 			end if;
+                        r.recv_valid <= '1';

-			-- Increment store row counter
-			r.store_row <= next_row(r.store_row);
+			-- Increment receive row counter
+			r.recv_row <= next_row(r.recv_row);
 		    end if;

                when STOP_RELOAD =>
                    -- Wait for all outstanding requests to be satisfied, then
                    -- go to IDLE state.
-                    if get_row_of_line(r.store_row) = get_row_of_line(get_row(wb_to_addr(r.wb.adr))) then
+                    if get_row_of_line(r.recv_row) = get_row_of_line(get_row(wb_to_addr(r.wb.adr))) then
                        r.wb.cyc <= '0';
                        r.state <= IDLE;
                    end if;
                    if wishbone_in.ack = '1' then
-			-- Increment store row counter
+			-- Increment receive row counter
-			r.store_row <= next_row(r.store_row);
+			r.recv_row <= next_row(r.recv_row);
 		    end if;
 		end case;
 	    end if;
@ -819,7 +875,7 @@ begin

    icache_log: if LOG_LENGTH > 0 generate
        -- Output data to logger
-        signal log_data    : std_ulogic_vector(53 downto 0);
+        signal log_data    : std_ulogic_vector(57 downto 0);
    begin
        data_log: process(clk)
            variable lway: way_t;
@ -832,7 +888,7 @@ begin
                    wstate := '1';
                end if;
                log_data <= i_out.valid &
-                            i_out.insn &
+                            log_insn &
                            wishbone_in.ack &
                            r.wb.adr(2 downto 0) &
                            r.wb.stb & r.wb.cyc &
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@ -83,8 +83,6 @@ architecture behave of loadstore1 is
 	update       : std_ulogic;
 	xerc         : xer_common_t;
        reserve      : std_ulogic;
-        atomic       : std_ulogic;
-        atomic_last  : std_ulogic;
        rc           : std_ulogic;
        nc           : std_ulogic;              -- non-cacheable access
        virt_mode    : std_ulogic;
@ -108,7 +106,7 @@ architecture behave of loadstore1 is
                                          elt_length => x"0", byte_reverse => '0', brev_mask => "000",
                                          sign_extend => '0', update => '0',
                                          xerc => xerc_init, reserve => '0',
-                                          atomic => '0', atomic_last => '0', rc => '0', nc => '0',
+                                          rc => '0', nc => '0',
                                          virt_mode => '0', priv_mode => '0', load_sp => '0',
                                          sprsel => "00", ric => "00", is_slbia => '0', align_intr => '0',
                                          dword_index => '0', two_dwords => '0', incomplete => '0');
@ -439,15 +437,9 @@ begin

        addr := lsu_sum;
        if l_in.second = '1' then
-            if l_in.update = '0' then
-                -- for the second half of a 16-byte transfer,
-                -- use the previous address plus 8.
-                addr := std_ulogic_vector(unsigned(r1.addr0(63 downto 3)) + 1) & r1.addr0(2 downto 0);
-            else
-                -- for an update-form load, use the previous address
-                -- as the value to write back to RA.
-                addr := r1.addr0;
-            end if;
+            -- for an update-form load, use the previous address
+            -- as the value to write back to RA.
+            addr := r1.addr0;
        end if;
        if l_in.mode_32bit = '1' then
            addr(63 downto 32) := (others => '0');
@ -474,14 +466,12 @@ begin
        misaligned := or (addr_mask and addr(2 downto 0));
        v.align_intr := l_in.reserve and misaligned;

-        v.atomic := not misaligned;
-        v.atomic_last := not misaligned and (l_in.second or not l_in.repeat);
-
        case l_in.op is
            when OP_STORE =>
                v.store := '1';
            when OP_LOAD =>
-                if l_in.update = '0' or l_in.second = '0' then
+                -- Note: only RA updates have l_in.second = 1
+                if l_in.second = '0' then
                    v.load := '1';
                    if HAS_FPU and l_in.is_32bit = '1' then
                        -- Allow an extra cycle for SP->DP precision conversion
@ -507,7 +497,6 @@ begin
            when OP_FETCH_FAILED =>
                -- send it to the MMU to do the radix walk
                v.instr_fault := '1';
-                v.addr := l_in.nia;
                v.mmu_op := '1';
            when others =>
        end case;
@ -953,8 +942,6 @@ begin
            d_out.dcbz <= stage1_req.dcbz;
            d_out.nc <= stage1_req.nc;
            d_out.reserve <= stage1_req.reserve;
-            d_out.atomic <= stage1_req.atomic;
-            d_out.atomic_last <= stage1_req.atomic_last;
            d_out.addr <= stage1_req.addr;
            d_out.byte_sel <= stage1_req.byte_sel;
            d_out.virt_mode <= stage1_req.virt_mode;
@ -965,8 +952,6 @@ begin
            d_out.dcbz <= r2.req.dcbz;
            d_out.nc <= r2.req.nc;
            d_out.reserve <= r2.req.reserve;
-            d_out.atomic <= r2.req.atomic;
-            d_out.atomic_last <= r2.req.atomic_last;
            d_out.addr <= r2.req.addr;
            d_out.byte_sel <= r2.req.byte_sel;
            d_out.virt_mode <= r2.req.virt_mode;
--- a/microwatt.core
+++ b/microwatt.core
@ -9,6 +9,7 @@ filesets:
      - wishbone_types.vhdl
      - common.vhdl
      - fetch1.vhdl
+      - predecode.vhdl
      - decode1.vhdl
      - helpers.vhdl
      - decode2.vhdl
@ -65,6 +66,7 @@ filesets:
  xilinx_specific:
    files:
      - xilinx-mult.vhdl : {file_type : vhdlSource-2008}
+      - xilinx-mult-32s.vhdl : {file_type : vhdlSource-2008}
      - fpga/fpga-random.vhdl : {file_type : vhdlSource-2008}
      - fpga/fpga-random.xdc : {file_type : xdc}

@ -144,7 +146,6 @@ targets:
      - uart_is_16550
      - has_fpu
      - has_btc
-      - has_short_mult
    tools:
      vivado: {part : xc7a100tcsg324-1}
    toplevel : toplevel
@ -250,7 +251,6 @@ targets:
      - uart_is_16550
      - has_fpu
      - has_btc
-      - has_short_mult
    generate: [litedram_nexys_video, liteeth_nexys_video, litesdcard_nexys_video]
    tools:
      vivado: {part : xc7a200tsbg484-1}
@ -271,7 +271,6 @@ targets:
      - has_uart1
      - has_fpu=false
      - has_btc=false
-      - has_short_mult
      - use_litesdcard
    tools:
      vivado: {part : xc7a35ticsg324-1L}
@ -294,7 +293,6 @@ targets:
      - has_uart1
      - has_fpu=false
      - has_btc=false
-      - has_short_mult
    generate: [litedram_arty, liteeth_arty, litesdcard_arty]
    tools:
      vivado: {part : xc7a35ticsg324-1L}
@ -315,7 +313,6 @@ targets:
      - has_uart1
      - has_fpu
      - has_btc
-      - has_short_mult
      - use_litesdcard
    tools:
      vivado: {part : xc7a100ticsg324-1L}
@ -338,7 +335,6 @@ targets:
      - has_uart1
      - has_fpu
      - has_btc
-      - has_short_mult
    generate: [litedram_arty, liteeth_arty, litesdcard_arty]
    tools:
      vivado: {part : xc7a100ticsg324-1L}
@ -360,7 +356,6 @@ targets:
      - uart_is_16550
      - has_fpu
      - has_btc
-      - has_short_mult
    generate: [litesdcard_wukong-v2]
    tools:
      vivado: {part : xc7a100tfgg676-1}
@ -382,7 +377,6 @@ targets:
      - uart_is_16550
      - has_fpu
      - has_btc
-      - has_short_mult
    generate: [litedram_wukong-v2, liteeth_wukong-v2, litesdcard_wukong-v2]
    tools:
      vivado: {part : xc7a100tfgg676-1}
@ -498,12 +492,6 @@ parameters:
    paramtype   : generic
    default     : true

-  has_short_mult:
-    datatype    : bool
-    description : Include a 16 bit x 16 bit single-cycle multiplier in the core
-    paramtype   : generic
-    default     : false
-
  disable_flatten_core:
    datatype    : bool
    description : Prevent Vivado from flattening the main core components
--- a/multiply-32s.vhdl
+++ b/multiply-32s.vhdl
@ -0,0 +1,56 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+
+-- Signed 33b x 33b multiplier giving 64-bit product, with no addend,
+-- with fixed 1-cycle latency.  Each operand is the low 32 bits of its input,
+-- extended to 33 bits by its bit 31 if m_in.is_signed = '1', else by '0'.
+entity multiply_32s is
+    port (
+        clk   : in std_logic;
+        stall : in std_ulogic;    -- while '1', the result register is not updated
+
+        m_in  : in MultiplyInputType;
+        m_out : out MultiplyOutputType
+        );
+end entity multiply_32s;
+
+architecture behaviour of multiply_32s is
+    type reg_type is record
+        valid     : std_ulogic;
+        data      : signed(65 downto 0);    -- full 66-bit product of the two 33-bit operands
+    end record;
+    constant reg_type_init : reg_type := (valid => '0', data => (others => '0'));
+
+    signal r, rin : reg_type := reg_type_init;
+begin
+    multiply_0: process(clk)    -- the single result register
+    begin
+        if rising_edge(clk) and stall = '0' then
+            r <= rin;
+        end if;
+    end process;
+
+    multiply_1: process(all)    -- combinational: multiply the inputs, format the registered product
+        variable v : reg_type;
+        variable d : std_ulogic_vector(63 downto 0);
+	variable ov : std_ulogic;
+    begin
+        v.valid := m_in.valid;
+        v.data := signed((m_in.is_signed and m_in.data1(31)) & m_in.data1(31 downto 0)) *
+                  signed((m_in.is_signed and m_in.data2(31)) & m_in.data2(31 downto 0));
+
+        d := std_ulogic_vector(r.data(63 downto 0));    -- low 64 bits of the registered product
+        -- Overflow if bits 63..31 are not all equal, i.e. d is not a sign-extended 32-bit value
+        ov := (or d(63 downto 31)) and not (and d(63 downto 31));
+
+        m_out.result <= 64x"0" & d;    -- upper 64 bits of the result are always zero
+        m_out.overflow <= ov;
+        m_out.valid <= r.valid;
+
+        rin <= v;
+    end process;
+end architecture behaviour;
--- a/multiply.vhdl
+++ b/multiply.vhdl
@ -7,7 +7,7 @@ use work.common.all;

 entity multiply is
    generic (
-        PIPELINE_DEPTH : natural := 4
+        PIPELINE_DEPTH : natural := 3
        );
    port (
        clk   : in std_logic;
@ -23,11 +23,8 @@ architecture behaviour of multiply is
    type multiply_pipeline_stage is record
        valid     : std_ulogic;
        data      : unsigned(127 downto 0);
-	is_32bit  : std_ulogic;
-        not_res   : std_ulogic;
    end record;
    constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0',
-								     is_32bit => '0', not_res => '0',
 								     data => (others => '0'));

    type multiply_pipeline_type is array(0 to PIPELINE_DEPTH-1) of multiply_pipeline_stage;
@ -52,31 +49,29 @@ begin

    multiply_1: process(all)
        variable v : reg_type;
+        variable a, b : std_ulogic_vector(64 downto 0);
+        variable prod : std_ulogic_vector(129 downto 0);
        variable d : std_ulogic_vector(127 downto 0);
        variable d2 : std_ulogic_vector(63 downto 0);
 	variable ov : std_ulogic;
    begin
        v := r;
+        a := (m.is_signed and m.data1(63)) & m.data1;
+        b := (m.is_signed and m.data2(63)) & m.data2;
+        prod := std_ulogic_vector(signed(a) * signed(b));
        v.multiply_pipeline(0).valid := m.valid;
-        v.multiply_pipeline(0).data := (unsigned(m.data1) * unsigned(m.data2)) + unsigned(m.addend);
-        v.multiply_pipeline(0).is_32bit := m.is_32bit;
-        v.multiply_pipeline(0).not_res := m.not_result;
+        if m.subtract = '1' then
+            v.multiply_pipeline(0).data := unsigned(m.addend) - unsigned(prod(127 downto 0));
+        else
+            v.multiply_pipeline(0).data := unsigned(m.addend) + unsigned(prod(127 downto 0));
+        end if;

        loop_0: for i in 1 to PIPELINE_DEPTH-1 loop
            v.multiply_pipeline(i) := r.multiply_pipeline(i-1);
        end loop;

        d := std_ulogic_vector(v.multiply_pipeline(PIPELINE_DEPTH-1).data);
-        if v.multiply_pipeline(PIPELINE_DEPTH-1).not_res = '1' then
-            d := not d;
-        end if;
-
-        ov := '0';
-        if v.multiply_pipeline(PIPELINE_DEPTH-1).is_32bit = '1' then
-            ov := (or d(63 downto 31)) and not (and d(63 downto 31));
-        else
-            ov := (or d(127 downto 63)) and not (and d(127 downto 63));
-        end if;
+        ov := (or d(127 downto 63)) and not (and d(127 downto 63));
        ovf_in <= ov;

        m_out.result <= d;
--- a/multiply_tb.vhdl
+++ b/multiply_tb.vhdl
@ -26,15 +26,6 @@ architecture behave of multiply_tb is
    signal m1               : MultiplyInputType := MultiplyInputInit;
    signal m2               : MultiplyOutputType;

-    function absval(x: std_ulogic_vector) return std_ulogic_vector is
-    begin
-        if x(x'left) = '1' then
-            return std_ulogic_vector(- signed(x));
-        else
-            return x;
-        end if;
-    end;
-
 begin
    multiply_0: entity work.multiply
        generic map (PIPELINE_DEPTH => pipeline_depth)
@ -51,7 +42,6 @@ begin
    stim_process: process
        variable ra, rb, rt, behave_rt: std_ulogic_vector(63 downto 0);
        variable si: std_ulogic_vector(15 downto 0);
-        variable sign: std_ulogic;
        variable rnd : RandomPType;
    begin
        rnd.InitSeed(stim_process'path_name);
@ -102,11 +92,11 @@ begin

                    behave_rt := ppc_mulld(ra, rb);

-                    m1.data1 <= absval(ra);
-                    m1.data2 <= absval(rb);
-                    sign := ra(63) xor rb(63);
-                    m1.not_result <= sign;
-                    m1.addend <= (others => sign);
+                    m1.data1 <= ra;
+                    m1.data2 <= rb;
+                    m1.is_signed <= '1';
+                    m1.subtract <= '0';
+                    m1.addend <= (others => '0');
                    m1.valid <= '1';

                    wait for clk_period;
@ -128,7 +118,8 @@ begin

                    m1.data1 <= ra;
                    m1.data2 <= rb;
-                    m1.not_result <= '0';
+                    m1.is_signed <= '0';
+                    m1.subtract <= '0';
                    m1.addend <= (others => '0');
                    m1.valid <= '1';

@ -149,11 +140,11 @@ begin

                    behave_rt := ppc_mulhd(ra, rb);

-                    m1.data1 <= absval(ra);
-                    m1.data2 <= absval(rb);
-                    sign := ra(63) xor rb(63);
-                    m1.not_result <= sign;
-                    m1.addend <= (others => sign);
+                    m1.data1 <= ra;
+                    m1.data2 <= rb;
+                    m1.is_signed <= '1';
+                    m1.subtract <= '0';
+                    m1.addend <= (others => '0');
                    m1.valid <= '1';

                    wait for clk_period;
@ -173,13 +164,13 @@ begin

                    behave_rt := ppc_mullw(ra, rb);

-                    m1.data1 <= (others => '0');
-                    m1.data1(31 downto 0) <= absval(ra(31 downto 0));
-                    m1.data2 <= (others => '0');
-                    m1.data2(31 downto 0) <= absval(rb(31 downto 0));
-                    sign := ra(31) xor rb(31);
-                    m1.not_result <= sign;
-                    m1.addend <= (others => sign);
+                    m1.data1 <= (others => ra(31));
+                    m1.data1(31 downto 0) <= ra(31 downto 0);
+                    m1.data2 <= (others => rb(31));
+                    m1.data2(31 downto 0) <= rb(31 downto 0);
+                    m1.is_signed <= '1';
+                    m1.subtract <= '0';
+                    m1.addend <= (others => '0');
                    m1.valid <= '1';

                    wait for clk_period;
@ -199,13 +190,13 @@ begin

                    behave_rt := ppc_mulhw(ra, rb);

-                    m1.data1 <= (others => '0');
-                    m1.data1(31 downto 0) <= absval(ra(31 downto 0));
-                    m1.data2 <= (others => '0');
-                    m1.data2(31 downto 0) <= absval(rb(31 downto 0));
-                    sign := ra(31) xor rb(31);
-                    m1.not_result <= sign;
-                    m1.addend <= (others => sign);
+                    m1.data1 <= (others => ra(31));
+                    m1.data1(31 downto 0) <= ra(31 downto 0);
+                    m1.data2 <= (others => rb(31));
+                    m1.data2(31 downto 0) <= rb(31 downto 0);
+                    m1.is_signed <= '1';
+                    m1.subtract <= '0';
+                    m1.addend <= (others => '0');
                    m1.valid <= '1';

                    wait for clk_period;
@ -229,7 +220,8 @@ begin
                    m1.data1(31 downto 0) <= ra(31 downto 0);
                    m1.data2 <= (others => '0');
                    m1.data2(31 downto 0) <= rb(31 downto 0);
-                    m1.not_result <= '0';
+                    m1.is_signed <= '0';
+                    m1.subtract <= '0';
                    m1.addend <= (others => '0');
                    m1.valid <= '1';

@ -250,12 +242,12 @@ begin

                    behave_rt := ppc_mulli(ra, si);

-                    m1.data1 <= absval(ra);
-                    m1.data2 <= (others => '0');
-                    m1.data2(15 downto 0) <= absval(si);
-                    sign := ra(63) xor si(15);
-                    m1.not_result <= sign;
-                    m1.addend <= (others => sign);
+                    m1.data1 <= ra;
+                    m1.data2 <= (others => si(15));
+                    m1.data2(15 downto 0) <= si;
+                    m1.is_signed <= '1';
+                    m1.subtract <= '0';
+                    m1.addend <= (others => '0');
                    m1.valid <= '1';

                    wait for clk_period;
--- a/predecode.vhdl
+++ b/predecode.vhdl
@ -0,0 +1,592 @@
+-- Instruction pre-decoder for microwatt
+-- One cycle latency.  Does 'WIDTH' instructions in parallel.
+
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+use work.decode_types.all;
+use work.insn_helpers.all;
+
+entity predecoder is  -- Registered pre-decoder: turns raw instruction words into insn_code values
+    generic (
+        HAS_FPU   : boolean := true;   -- when false, codes at or above INSN_first_frs are flagged illegal
+        WIDTH     : natural := 2;      -- number of 32-bit instruction words handled in parallel
+        ICODE_LEN : natural := 10;     -- width of the instruction-code field in each output slot
+        IMAGE_LEN : natural := 26      -- bit position where the code field starts within an output slot
+        );
+    port (
+        clk        : in  std_ulogic;
+        valid_in   : in  std_ulogic;   -- registered with the instructions; when low, every slot gets the illegal marker
+        insns_in   : in  std_ulogic_vector(WIDTH * 32 - 1 downto 0);   -- word i occupies bits i*32+31 downto i*32
+        icodes_out : out std_ulogic_vector(WIDTH * (ICODE_LEN + IMAGE_LEN) - 1 downto 0)   -- one (ICODE_LEN + IMAGE_LEN)-bit slot per word
+        );
+end entity predecoder;
+
+architecture behaviour of predecoder is
+
+    type predecoder_rom_t is array(0 to 2047) of insn_code;  -- 2048-entry lookup table, indexed by an 11-bit address
+
+    constant major_predecode_rom : predecoder_rom_t := (  -- indexed by insn(31..26) & insn(4..0)
+        2#001100_00000# to 2#001100_11111# =>  INSN_addic,
+        2#001101_00000# to 2#001101_11111# =>  INSN_addic_dot,
+        2#001110_00000# to 2#001110_11111# =>  INSN_addi,
+        2#001111_00000# to 2#001111_11111# =>  INSN_addis,
+        2#010011_00100# to 2#010011_00101# =>  INSN_addpcis,
+        2#011100_00000# to 2#011100_11111# =>  INSN_andi_dot,
+        2#011101_00000# to 2#011101_11111# =>  INSN_andis_dot,
+        2#000000_00000#                    =>  INSN_attn,
+        2#010010_00000# to 2#010010_11111# =>  INSN_b,
+        2#010000_00000# to 2#010000_11111# =>  INSN_bc,
+        2#001011_00000# to 2#001011_11111# =>  INSN_cmpi,
+        2#001010_00000# to 2#001010_11111# =>  INSN_cmpli,
+        2#100010_00000# to 2#100010_11111# =>  INSN_lbz,
+        2#100011_00000# to 2#100011_11111# =>  INSN_lbzu,
+        2#110010_00000# to 2#110010_11111# =>  INSN_lfd,
+        2#110011_00000# to 2#110011_11111# =>  INSN_lfdu,
+        2#110000_00000# to 2#110000_11111# =>  INSN_lfs,
+        2#110001_00000# to 2#110001_11111# =>  INSN_lfsu,
+        2#101010_00000# to 2#101010_11111# =>  INSN_lha,
+        2#101011_00000# to 2#101011_11111# =>  INSN_lhau,
+        2#101000_00000# to 2#101000_11111# =>  INSN_lhz,
+        2#101001_00000# to 2#101001_11111# =>  INSN_lhzu,
+        2#100000_00000# to 2#100000_11111# =>  INSN_lwz,
+        2#100001_00000# to 2#100001_11111# =>  INSN_lwzu,
+        2#000111_00000# to 2#000111_11111# =>  INSN_mulli,
+        2#011000_00000# to 2#011000_11111# =>  INSN_ori,
+        2#011001_00000# to 2#011001_11111# =>  INSN_oris,
+        2#010100_00000# to 2#010100_11111# =>  INSN_rlwimi,
+        2#010101_00000# to 2#010101_11111# =>  INSN_rlwinm,
+        2#010111_00000# to 2#010111_11111# =>  INSN_rlwnm,
+        2#010001_00000# to 2#010001_11111# =>  INSN_sc,
+        2#100110_00000# to 2#100110_11111# =>  INSN_stb,
+        2#100111_00000# to 2#100111_11111# =>  INSN_stbu,
+        2#110110_00000# to 2#110110_11111# =>  INSN_stfd,
+        2#110111_00000# to 2#110111_11111# =>  INSN_stfdu,
+        2#110100_00000# to 2#110100_11111# =>  INSN_stfs,
+        2#110101_00000# to 2#110101_11111# =>  INSN_stfsu,
+        2#101100_00000# to 2#101100_11111# =>  INSN_sth,
+        2#101101_00000# to 2#101101_11111# =>  INSN_sthu,
+        2#100100_00000# to 2#100100_11111# =>  INSN_stw,
+        2#100101_00000# to 2#100101_11111# =>  INSN_stwu,
+        2#001000_00000# to 2#001000_11111# =>  INSN_subfic,
+        2#000010_00000# to 2#000010_11111# =>  INSN_tdi,
+        2#000011_00000# to 2#000011_11111# =>  INSN_twi,
+        2#011010_00000# to 2#011010_11111# =>  INSN_xori,
+        2#011011_00000# to 2#011011_11111# =>  INSN_xoris,
+        -- major opcode 4: only the madd* entries are listed; predecode_1 additionally requires insn(5) = '1'
+        2#000100_10000#                    =>  INSN_maddhd,
+        2#000100_10001#                    =>  INSN_maddhdu,
+        2#000100_10011#                    =>  INSN_maddld,
+        -- major opcode 30
+        2#011110_01000# to 2#011110_01001# =>  INSN_rldic,
+        2#011110_01010# to 2#011110_01011# =>  INSN_rldic,
+        2#011110_00000# to 2#011110_00001# =>  INSN_rldicl,
+        2#011110_00010# to 2#011110_00011# =>  INSN_rldicl,
+        2#011110_00100# to 2#011110_00101# =>  INSN_rldicr,
+        2#011110_00110# to 2#011110_00111# =>  INSN_rldicr,
+        2#011110_01100# to 2#011110_01101# =>  INSN_rldimi,
+        2#011110_01110# to 2#011110_01111# =>  INSN_rldimi,
+        2#011110_10000# to 2#011110_10001# =>  INSN_rldcl,
+        2#011110_10010# to 2#011110_10011# =>  INSN_rldcr,
+        -- major opcode 58: selected by insn(1..0); one entry for each of the eight values of insn(4..2)
+        2#111010_00000#                    =>  INSN_ld,
+        2#111010_00001#                    =>  INSN_ldu,
+        2#111010_00010#                    =>  INSN_lwa,
+        2#111010_00100#                    =>  INSN_ld,
+        2#111010_00101#                    =>  INSN_ldu,
+        2#111010_00110#                    =>  INSN_lwa,
+        2#111010_01000#                    =>  INSN_ld,
+        2#111010_01001#                    =>  INSN_ldu,
+        2#111010_01010#                    =>  INSN_lwa,
+        2#111010_01100#                    =>  INSN_ld,
+        2#111010_01101#                    =>  INSN_ldu,
+        2#111010_01110#                    =>  INSN_lwa,
+        2#111010_10000#                    =>  INSN_ld,
+        2#111010_10001#                    =>  INSN_ldu,
+        2#111010_10010#                    =>  INSN_lwa,
+        2#111010_10100#                    =>  INSN_ld,
+        2#111010_10101#                    =>  INSN_ldu,
+        2#111010_10110#                    =>  INSN_lwa,
+        2#111010_11000#                    =>  INSN_ld,
+        2#111010_11001#                    =>  INSN_ldu,
+        2#111010_11010#                    =>  INSN_lwa,
+        2#111010_11100#                    =>  INSN_ld,
+        2#111010_11101#                    =>  INSN_ldu,
+        2#111010_11110#                    =>  INSN_lwa,
+        -- major opcode 59: predecode_1 uses these entries only when insn(5) = '1'
+        2#111011_00100# to 2#111011_00101# =>  INSN_fdivs,
+        2#111011_01000# to 2#111011_01001# =>  INSN_fsubs,
+        2#111011_01010# to 2#111011_01011# =>  INSN_fadds,
+        2#111011_01100# to 2#111011_01101# =>  INSN_fsqrts,
+        2#111011_10000# to 2#111011_10001# =>  INSN_fres,
+        2#111011_10010# to 2#111011_10011# =>  INSN_fmuls,
+        2#111011_10100# to 2#111011_10101# =>  INSN_frsqrtes,
+        2#111011_11000# to 2#111011_11001# =>  INSN_fmsubs,
+        2#111011_11010# to 2#111011_11011# =>  INSN_fmadds,
+        2#111011_11100# to 2#111011_11101# =>  INSN_fnmsubs,
+        2#111011_11110# to 2#111011_11111# =>  INSN_fnmadds,
+        -- major opcode 62: selected by insn(1..0); one entry for each of the eight values of insn(4..2)
+        2#111110_00000#                    =>  INSN_std,
+        2#111110_00001#                    =>  INSN_stdu,
+        2#111110_00100#                    =>  INSN_std,
+        2#111110_00101#                    =>  INSN_stdu,
+        2#111110_01000#                    =>  INSN_std,
+        2#111110_01001#                    =>  INSN_stdu,
+        2#111110_01100#                    =>  INSN_std,
+        2#111110_01101#                    =>  INSN_stdu,
+        2#111110_10000#                    =>  INSN_std,
+        2#111110_10001#                    =>  INSN_stdu,
+        2#111110_10100#                    =>  INSN_std,
+        2#111110_10101#                    =>  INSN_stdu,
+        2#111110_11000#                    =>  INSN_std,
+        2#111110_11001#                    =>  INSN_stdu,
+        2#111110_11100#                    =>  INSN_std,
+        2#111110_11101#                    =>  INSN_stdu,
+        -- major opcode 63: predecode_1 uses these entries only when insn(5) = '1'
+        2#111111_00100# to 2#111111_00101# =>  INSN_fdiv,
+        2#111111_01000# to 2#111111_01001# =>  INSN_fsub,
+        2#111111_01010# to 2#111111_01011# =>  INSN_fadd,
+        2#111111_01100# to 2#111111_01101# =>  INSN_fsqrt,
+        2#111111_01110# to 2#111111_01111# =>  INSN_fsel,
+        2#111111_10000# to 2#111111_10001# =>  INSN_fre,
+        2#111111_10010# to 2#111111_10011# =>  INSN_fmul,
+        2#111111_10100# to 2#111111_10101# =>  INSN_frsqrte,
+        2#111111_11000# to 2#111111_11001# =>  INSN_fmsub,
+        2#111111_11010# to 2#111111_11011# =>  INSN_fmadd,
+        2#111111_11100# to 2#111111_11101# =>  INSN_fnmsub,
+        2#111111_11110# to 2#111111_11111# =>  INSN_fnmadd,
+        others                             =>  INSN_illegal
+        );
+
+    constant row_predecode_rom : predecoder_rom_t := (
+        -- Major opcode 31
+        -- Address bits are 0, insn(10:1)
+        2#0_01000_01010#  =>  INSN_add,
+        2#0_11000_01010#  =>  INSN_add, -- addo
+        2#0_00000_01010#  =>  INSN_addc,
+        2#0_10000_01010#  =>  INSN_addc, -- addco
+        2#0_00100_01010#  =>  INSN_adde,
+        2#0_10100_01010#  =>  INSN_adde, -- addeo
+        2#0_00101_01010#  =>  INSN_addex,
+        2#0_00010_01010#  =>  INSN_addg6s,
+        2#0_00111_01010#  =>  INSN_addme,
+        2#0_10111_01010#  =>  INSN_addme, -- addmeo
+        2#0_00110_01010#  =>  INSN_addze,
+        2#0_10110_01010#  =>  INSN_addze, -- addzeo
+        2#0_00000_11100#  =>  INSN_and,
+        2#0_00001_11100#  =>  INSN_andc,
+        2#0_00111_11100#  =>  INSN_bperm,
+        2#0_01001_11010#  =>  INSN_cbcdtd,
+        2#0_01000_11010#  =>  INSN_cdtbcd,
+        2#0_00000_00000#  =>  INSN_cmp,
+        2#0_01111_11100#  =>  INSN_cmpb,
+        2#0_00111_00000#  =>  INSN_cmpeqb,
+        2#0_00001_00000#  =>  INSN_cmpl,
+        2#0_00110_00000#  =>  INSN_cmprb,
+        2#0_00001_11010#  =>  INSN_cntlzd,
+        2#0_00000_11010#  =>  INSN_cntlzw,
+        2#0_10001_11010#  =>  INSN_cnttzd,
+        2#0_10000_11010#  =>  INSN_cnttzw,
+        2#0_10111_10011#  =>  INSN_darn,
+        2#0_00010_10110#  =>  INSN_dcbf,
+        2#0_00001_10110#  =>  INSN_dcbst,
+        2#0_01000_10110#  =>  INSN_dcbt,
+        2#0_00111_10110#  =>  INSN_dcbtst,
+        2#0_11111_10110#  =>  INSN_dcbz,
+        2#0_01100_01001#  =>  INSN_divdeu,
+        2#0_11100_01001#  =>  INSN_divdeu, -- divdeuo
+        2#0_01100_01011#  =>  INSN_divweu,
+        2#0_11100_01011#  =>  INSN_divweu, -- divweuo
+        2#0_01101_01001#  =>  INSN_divde,
+        2#0_11101_01001#  =>  INSN_divde, -- divdeo
+        2#0_01101_01011#  =>  INSN_divwe,
+        2#0_11101_01011#  =>  INSN_divwe, -- divweo
+        2#0_01110_01001#  =>  INSN_divdu,
+        2#0_11110_01001#  =>  INSN_divdu, -- divduo
+        2#0_01110_01011#  =>  INSN_divwu,
+        2#0_11110_01011#  =>  INSN_divwu, -- divwuo
+        2#0_01111_01001#  =>  INSN_divd,
+        2#0_11111_01001#  =>  INSN_divd, -- divdo
+        2#0_01111_01011#  =>  INSN_divw,
+        2#0_11111_01011#  =>  INSN_divw, -- divwo
+        2#0_11001_10110#  =>  INSN_nop, -- dss
+        2#0_01010_10110#  =>  INSN_nop, -- dst
+        2#0_01011_10110#  =>  INSN_nop, -- dstst
+        2#0_11010_10110#  =>  INSN_eieio,
+        2#0_01000_11100#  =>  INSN_eqv,
+        2#0_11101_11010#  =>  INSN_extsb,
+        2#0_11100_11010#  =>  INSN_extsh,
+        2#0_11110_11010#  =>  INSN_extsw,
+        2#0_11011_11010#  =>  INSN_extswsli,
+        2#0_11011_11011#  =>  INSN_extswsli,
+        2#0_11110_10110#  =>  INSN_icbi,
+        2#0_00000_10110#  =>  INSN_icbt,
+        2#0_00000_01111#  =>  INSN_isel,
+        2#0_00001_01111#  =>  INSN_isel,
+        2#0_00010_01111#  =>  INSN_isel,
+        2#0_00011_01111#  =>  INSN_isel,
+        2#0_00100_01111#  =>  INSN_isel,
+        2#0_00101_01111#  =>  INSN_isel,
+        2#0_00110_01111#  =>  INSN_isel,
+        2#0_00111_01111#  =>  INSN_isel,
+        2#0_01000_01111#  =>  INSN_isel,
+        2#0_01001_01111#  =>  INSN_isel,
+        2#0_01010_01111#  =>  INSN_isel,
+        2#0_01011_01111#  =>  INSN_isel,
+        2#0_01100_01111#  =>  INSN_isel,
+        2#0_01101_01111#  =>  INSN_isel,
+        2#0_01110_01111#  =>  INSN_isel,
+        2#0_01111_01111#  =>  INSN_isel,
+        2#0_10000_01111#  =>  INSN_isel,
+        2#0_10001_01111#  =>  INSN_isel,
+        2#0_10010_01111#  =>  INSN_isel,
+        2#0_10011_01111#  =>  INSN_isel,
+        2#0_10100_01111#  =>  INSN_isel,
+        2#0_10101_01111#  =>  INSN_isel,
+        2#0_10110_01111#  =>  INSN_isel,
+        2#0_10111_01111#  =>  INSN_isel,
+        2#0_11000_01111#  =>  INSN_isel,
+        2#0_11001_01111#  =>  INSN_isel,
+        2#0_11010_01111#  =>  INSN_isel,
+        2#0_11011_01111#  =>  INSN_isel,
+        2#0_11100_01111#  =>  INSN_isel,
+        2#0_11101_01111#  =>  INSN_isel,
+        2#0_11110_01111#  =>  INSN_isel,
+        2#0_11111_01111#  =>  INSN_isel,
+        2#0_00001_10100#  =>  INSN_lbarx,
+        2#0_11010_10101#  =>  INSN_lbzcix,
+        2#0_00011_10111#  =>  INSN_lbzux,
+        2#0_00010_10111#  =>  INSN_lbzx,
+        2#0_00010_10100#  =>  INSN_ldarx,
+        2#0_10000_10100#  =>  INSN_ldbrx,
+        2#0_11011_10101#  =>  INSN_ldcix,
+        2#0_00001_10101#  =>  INSN_ldux,
+        2#0_00000_10101#  =>  INSN_ldx,
+        2#0_10010_10111#  =>  INSN_lfdx,
+        2#0_10011_10111#  =>  INSN_lfdux,
+        2#0_11010_10111#  =>  INSN_lfiwax,
+        2#0_11011_10111#  =>  INSN_lfiwzx,
+        2#0_10000_10111#  =>  INSN_lfsx,
+        2#0_10001_10111#  =>  INSN_lfsux,
+        2#0_00011_10100#  =>  INSN_lharx,
+        2#0_01011_10111#  =>  INSN_lhaux,
+        2#0_01010_10111#  =>  INSN_lhax,
+        2#0_11000_10110#  =>  INSN_lhbrx,
+        2#0_11001_10101#  =>  INSN_lhzcix,
+        2#0_01001_10111#  =>  INSN_lhzux,
+        2#0_01000_10111#  =>  INSN_lhzx,
+        2#0_00000_10100#  =>  INSN_lwarx,
+        2#0_01011_10101#  =>  INSN_lwaux,
+        2#0_01010_10101#  =>  INSN_lwax,
+        2#0_10000_10110#  =>  INSN_lwbrx,
+        2#0_11000_10101#  =>  INSN_lwzcix,
+        2#0_00001_10111#  =>  INSN_lwzux,
+        2#0_00000_10111#  =>  INSN_lwzx,
+        2#0_10010_00000#  =>  INSN_mcrxrx,
+        2#0_00000_10011#  =>  INSN_mfcr,
+        2#0_00010_10011#  =>  INSN_mfmsr,
+        2#0_01010_10011#  =>  INSN_mfspr,
+        2#0_01000_01001#  =>  INSN_modud,
+        2#0_01000_01011#  =>  INSN_moduw,
+        2#0_11000_01001#  =>  INSN_modsd,
+        2#0_11000_01011#  =>  INSN_modsw,
+        2#0_00100_10000#  =>  INSN_mtcrf,
+        2#0_00100_10010#  =>  INSN_mtmsr,
+        2#0_00101_10010#  =>  INSN_mtmsrd,
+        2#0_01110_10011#  =>  INSN_mtspr,
+        2#0_00010_01001#  =>  INSN_mulhd,
+        2#0_00000_01001#  =>  INSN_mulhdu,
+        2#0_00010_01011#  =>  INSN_mulhw,
+        2#0_00000_01011#  =>  INSN_mulhwu,
+        -- next 4 have reserved bit set
+        2#0_10010_01001#  =>  INSN_mulhd,
+        2#0_10000_01001#  =>  INSN_mulhdu,
+        2#0_10010_01011#  =>  INSN_mulhw,
+        2#0_10000_01011#  =>  INSN_mulhwu,
+        2#0_00111_01001#  =>  INSN_mulld,
+        2#0_10111_01001#  =>  INSN_mulld, -- mulldo
+        2#0_00111_01011#  =>  INSN_mullw,
+        2#0_10111_01011#  =>  INSN_mullw, -- mullwo
+        2#0_01110_11100#  =>  INSN_nand,
+        2#0_00011_01000#  =>  INSN_neg,
+        2#0_10011_01000#  =>  INSN_neg, -- nego
+        -- next 8 are reserved no-op instructions
+        2#0_10000_10010#  =>  INSN_nop,
+        2#0_10001_10010#  =>  INSN_nop,
+        2#0_10010_10010#  =>  INSN_nop,
+        2#0_10011_10010#  =>  INSN_nop,
+        2#0_10100_10010#  =>  INSN_nop,
+        2#0_10101_10010#  =>  INSN_nop,
+        2#0_10110_10010#  =>  INSN_nop,
+        2#0_10111_10010#  =>  INSN_nop,
+        2#0_00011_11100#  =>  INSN_nor,
+        2#0_01101_11100#  =>  INSN_or,
+        2#0_01100_11100#  =>  INSN_orc,
+        2#0_00011_11010#  =>  INSN_popcntb,
+        2#0_01111_11010#  =>  INSN_popcntd,
+        2#0_01011_11010#  =>  INSN_popcntw,
+        2#0_00101_11010#  =>  INSN_prtyd,
+        2#0_00100_11010#  =>  INSN_prtyw,
+        2#0_00100_00000#  =>  INSN_setb,
+        2#0_01111_10010#  =>  INSN_slbia,
+        2#0_00000_11011#  =>  INSN_sld,
+        2#0_00000_11000#  =>  INSN_slw,
+        2#0_11000_11010#  =>  INSN_srad,
+        2#0_11001_11010#  =>  INSN_sradi,
+        2#0_11001_11011#  =>  INSN_sradi,
+        2#0_11000_11000#  =>  INSN_sraw,
+        2#0_11001_11000#  =>  INSN_srawi,
+        2#0_10000_11011#  =>  INSN_srd,
+        2#0_10000_11000#  =>  INSN_srw,
+        2#0_11110_10101#  =>  INSN_stbcix,
+        2#0_10101_10110#  =>  INSN_stbcx,
+        2#0_00111_10111#  =>  INSN_stbux,
+        2#0_00110_10111#  =>  INSN_stbx,
+        2#0_10100_10100#  =>  INSN_stdbrx,
+        2#0_11111_10101#  =>  INSN_stdcix,
+        2#0_00110_10110#  =>  INSN_stdcx,
+        2#0_00101_10101#  =>  INSN_stdux,
+        2#0_00100_10101#  =>  INSN_stdx,
+        2#0_10110_10111#  =>  INSN_stfdx,
+        2#0_10111_10111#  =>  INSN_stfdux,
+        2#0_11110_10111#  =>  INSN_stfiwx,
+        2#0_10100_10111#  =>  INSN_stfsx,
+        2#0_10101_10111#  =>  INSN_stfsux,
+        2#0_11100_10110#  =>  INSN_sthbrx,
+        2#0_11101_10101#  =>  INSN_sthcix,
+        2#0_10110_10110#  =>  INSN_sthcx,
+        2#0_01101_10111#  =>  INSN_sthux,
+        2#0_01100_10111#  =>  INSN_sthx,
+        2#0_10100_10110#  =>  INSN_stwbrx,
+        2#0_11100_10101#  =>  INSN_stwcix,
+        2#0_00100_10110#  =>  INSN_stwcx,
+        2#0_00101_10111#  =>  INSN_stwux,
+        2#0_00100_10111#  =>  INSN_stwx,
+        2#0_00001_01000#  =>  INSN_subf,
+        2#0_10001_01000#  =>  INSN_subf, -- subfo
+        2#0_00000_01000#  =>  INSN_subfc,
+        2#0_10000_01000#  =>  INSN_subfc, -- subfco
+        2#0_00100_01000#  =>  INSN_subfe,
+        2#0_10100_01000#  =>  INSN_subfe, -- subfeo
+        2#0_00111_01000#  =>  INSN_subfme,
+        2#0_10111_01000#  =>  INSN_subfme, -- subfmeo
+        2#0_00110_01000#  =>  INSN_subfze,
+        2#0_10110_01000#  =>  INSN_subfze, -- subfzeo
+        2#0_10010_10110#  =>  INSN_sync,
+        2#0_00010_00100#  =>  INSN_td,
+        2#0_00000_00100#  =>  INSN_tw,
+        2#0_01001_10010#  =>  INSN_tlbie,
+        2#0_01000_10010#  =>  INSN_tlbiel,
+        2#0_10001_10110#  =>  INSN_tlbsync,
+        2#0_00000_11110#  =>  INSN_wait,
+        2#0_01001_11100#  =>  INSN_xor,
+
+        -- Major opcode 19
+        -- Columns with insn(4) = '1' are all illegal and not mapped here; to
+        -- fit into 2048 entries, insn(4) is dropped, so that columns 16-23
+        -- directly follow columns 0-7; in other words the address bits are
+        -- 1, insn(10..6), 1, insn(5), insn(3..1)
+        -- Columns 16-17 here are opcode 19 columns 0-1
+        -- Columns 24-31 here are opcode 19 columns 16-23
+        2#1_10000_11000#  =>  INSN_bcctr,
+        2#1_00000_11000#  =>  INSN_bclr,
+        2#1_10001_11000#  =>  INSN_bctar,
+        2#1_01000_10001#  =>  INSN_crand,
+        2#1_00100_10001#  =>  INSN_crandc,
+        2#1_01001_10001#  =>  INSN_creqv,
+        2#1_00111_10001#  =>  INSN_crnand,
+        2#1_00001_10001#  =>  INSN_crnor,
+        2#1_01110_10001#  =>  INSN_cror,
+        2#1_01101_10001#  =>  INSN_crorc,
+        2#1_00110_10001#  =>  INSN_crxor,
+        2#1_00100_11110#  =>  INSN_isync,
+        2#1_00000_10000#  =>  INSN_mcrf,
+        2#1_00000_11010#  =>  INSN_rfid,
+
+        -- Major opcode 59
+        -- Address bits are 1, insn(10..6), 1, 0, insn(3..1)
+        -- Only column 14 is valid here; columns 16-31 are handled in the major table
+        -- Column 14 is mapped to column 22.
+        -- Columns 20-23 here are opcode 59 columns 12-15
+        2#1_11010_10110#  =>  INSN_fcfids,
+        2#1_11110_10110#  =>  INSN_fcfidus,
+
+        -- Major opcode 63
+        -- Columns 0-15 are mapped here; columns 16-31 are in the major table.
+        -- Address bits are 1, insn(10:6), 0, insn(4:1)
+        -- Columns 0-15 here are opcode 63 columns 0-15
+        2#1_00000_00000#  =>  INSN_fcmpu,
+        2#1_00001_00000#  =>  INSN_fcmpo,
+        2#1_00010_00000#  =>  INSN_mcrfs,
+        2#1_00100_00000#  =>  INSN_ftdiv,
+        2#1_00101_00000#  =>  INSN_ftsqrt,
+        2#1_00001_00110#  =>  INSN_mtfsb,
+        2#1_00010_00110#  =>  INSN_mtfsb,
+        2#1_00100_00110#  =>  INSN_mtfsfi,
+        2#1_11010_00110#  =>  INSN_fmrgow,
+        2#1_11110_00110#  =>  INSN_fmrgew,
+        2#1_10010_00111#  =>  INSN_mffs,
+        2#1_10110_00111#  =>  INSN_mtfsf,
+        2#1_00000_01000#  =>  INSN_fcpsgn,
+        2#1_00001_01000#  =>  INSN_fneg,
+        2#1_00010_01000#  =>  INSN_fmr,
+        2#1_00100_01000#  =>  INSN_fnabs,
+        2#1_01000_01000#  =>  INSN_fabs,
+        2#1_01100_01000#  =>  INSN_frin,
+        2#1_01101_01000#  =>  INSN_friz,
+        2#1_01110_01000#  =>  INSN_frip,
+        2#1_01111_01000#  =>  INSN_frim,
+        2#1_00000_01100#  =>  INSN_frsp,
+        2#1_00000_01110#  =>  INSN_fctiw,
+        2#1_00100_01110#  =>  INSN_fctiwu,
+        2#1_11001_01110#  =>  INSN_fctid,
+        2#1_11010_01110#  =>  INSN_fcfid,
+        2#1_11101_01110#  =>  INSN_fctidu,
+        2#1_11110_01110#  =>  INSN_fcfidu,
+        2#1_00000_01111#  =>  INSN_fctiwz,
+        2#1_00100_01111#  =>  INSN_fctiwuz,
+        2#1_11001_01111#  =>  INSN_fctidz,
+        2#1_11101_01111#  =>  INSN_fctiduz,
+
+        others            =>  INSN_illegal
+        );
+
+    constant IOUT_LEN : natural := ICODE_LEN + IMAGE_LEN;  -- width of one output slot in icodes_out
+
+    type predec_t is record  -- per-instruction state registered by predecode_0
+        image         : std_ulogic_vector(31 downto 0);  -- the raw instruction word
+        maj_predecode : unsigned(ICODE_LEN - 1 downto 0);  -- position number of the code read from major_predecode_rom
+        row_predecode : unsigned(ICODE_LEN - 1 downto 0);  -- position number of the code read from row_predecode_rom
+    end record;
+
+    subtype index_t is integer range 0 to WIDTH-1;
+    type predec_array is array(index_t) of predec_t;
+
+    signal pred : predec_array;
+    signal valid : std_ulogic;  -- valid_in delayed by one clock, aligned with pred
+
+begin
+    predecode_0: process(clk)  -- Clocked stage: register each word and look it up in both ROMs
+        variable majaddr  : std_ulogic_vector(10 downto 0);  -- index into major_predecode_rom
+        variable rowaddr  : std_ulogic_vector(10 downto 0);  -- index into row_predecode_rom
+        variable iword    : std_ulogic_vector(31 downto 0);  -- instruction word currently being processed
+        variable majcode  : insn_code;
+        variable rowcode  : insn_code;
+    begin
+        if rising_edge(clk) then
+            valid <= valid_in;
+            for i in index_t loop
+                iword := insns_in(i * 32 + 31 downto i * 32);
+                pred(i).image <= iword;
+
+                if is_X(iword) then  -- word contains metavalues: propagate X instead of indexing the ROMs
+                    pred(i).maj_predecode <= (others => 'X');
+                    pred(i).row_predecode <= (others => 'X');
+                else
+                    majaddr := iword(31 downto 26) & iword(4 downto 0);  -- major opcode & insn(4..0)
+
+                    -- row_predecode_rom is used for op 19, 31, 59, 63
+                    -- addr bit 10 is 0 for op 31, 1 for 19, 59, 63
+                    rowaddr(10) := iword(31) or not iword(29);
+                    rowaddr(9 downto 5) := iword(10 downto 6);
+                    if iword(28) = '0' then
+                        -- op 19 and op 59
+                        rowaddr(4 downto 3) := '1' & iword(5);
+                    else
+                        -- op 31 and 63; for 63 we only use this when iword(5) = '0'
+                        rowaddr(4 downto 3) := iword(5 downto 4);
+                    end if;
+                    rowaddr(2 downto 0) := iword(3 downto 1);
+
+                    majcode := major_predecode_rom(to_integer(unsigned(majaddr)));  -- both tables are always read;
+                    pred(i).maj_predecode <= to_unsigned(insn_code'pos(majcode), ICODE_LEN);
+                    rowcode := row_predecode_rom(to_integer(unsigned(rowaddr)));  -- predecode_1 chooses between the results
+                    pred(i).row_predecode <= to_unsigned(insn_code'pos(rowcode), ICODE_LEN);
+                end if;
+            end loop;
+        end if;
+    end process;
+
+    predecode_1: process(all)  -- Combinational stage: choose major or row code, flag illegals, pack the outputs
+        variable iword    : std_ulogic_vector(31 downto 0);
+        variable use_row  : std_ulogic;  -- '1' selects row_predecode instead of maj_predecode
+        variable illegal  : std_ulogic;  -- '1' forces the illegal marker for this slot
+        variable ici      : std_ulogic_vector(IOUT_LEN - 1 downto 0);  -- output slot being assembled
+        variable icode    : unsigned(ICODE_LEN - 1 downto 0);
+    begin
+        for i in index_t loop
+            iword := pred(i).image;
+            icode := pred(i).maj_predecode;  -- default to the major-table result
+            use_row := '0';
+            illegal := '0';
+
+            case iword(31 downto 26) is
+                when "000100" => -- 4
+                    -- major opcode 4, mostly VMX/VSX stuff but also some integer ops (madd*)
+                    illegal := not iword(5);
+
+                when "010011" => -- 19
+                    -- Columns 8-15 and 24-31 don't have any valid instructions
+                    -- (where insn(5..1) is the column number).
+                    -- addpcis (column 2) is in the major table
+                    -- Other valid columns are mapped to columns in the second
+                    -- half of the row table: columns 0-1 are mapped to 16-17
+                    -- and 16-23 are mapped to 24-31.
+                    illegal := iword(4);
+                    use_row := iword(5) or (not iword(3) and not iword(2));
+
+                when "011000" => -- 24
+                    -- ori, special-case the standard NOP
+                    if std_match(iword, "01100000000000000000000000000000") then
+                        icode := to_unsigned(insn_code'pos(INSN_nop), ICODE_LEN);
+                    end if;
+
+                when "011111" => -- 31
+                    -- major opcode 31, lots of things
+                    -- Use the first half of the row table for all columns
+                    use_row := '1';
+
+                when "111011" => -- 59
+                    -- floating point operations, mostly single-precision
+                    -- Columns 0-11 are illegal; columns 12-15 are mapped
+                    -- to columns 20-23 in the second half of the row table,
+                    -- and columns 16-31 are in the major table.
+                    illegal := not iword(5) and (not iword(4) or not iword(3));
+                    use_row := not iword(5);
+
+                when "111111" => -- 63
+                    -- floating point operations, general and double-precision
+                    -- Use columns 0-15 of the second half of the row table
+                    -- for columns 0-15, and the major table for columns 16-31.
+                    use_row := not iword(5);
+
+                when others =>  -- every other major opcode keeps the major-table result
+            end case;
+            if use_row = '1' then
+                icode := pred(i).row_predecode;
+            end if;
+
+            -- Mark FP instructions as illegal if we don't have an FPU
+            if not HAS_FPU and not is_X(icode) and
+                to_integer(icode) >= insn_code'pos(INSN_first_frs) then
+                illegal := '1';
+            end if;
+
+            ici(31 downto 0) := iword;  -- start from the raw image ...
+            ici(IOUT_LEN - 1 downto 32) := (others => '0');  -- ... zero-extended to the slot width
+            if valid = '0' or illegal = '1' or is_X(icode) or
+                icode = to_unsigned(insn_code'pos(INSN_illegal), ICODE_LEN) then
+                -- Since an insn_code currently fits in 9 bits, use just
+                -- the most significant bit of ici to indicate illegal insns.
+                ici(IOUT_LEN - 1) := '1';  -- the other bits keep the zero-extended image
+            else
+                ici(IOUT_LEN - 1 downto IMAGE_LEN) := std_ulogic_vector(icode);  -- code overwrites the image from bit IMAGE_LEN upward
+            end if;
+            icodes_out(i * IOUT_LEN + IOUT_LEN - 1 downto i * IOUT_LEN) <= ici;
+        end loop;
+    end process;
+
+end architecture behaviour;
--- a/scripts/fmt_log/fmt_log.c
+++ b/scripts/fmt_log/fmt_log.c
@ -22,7 +22,7 @@ struct log_entry {
 	u64	ic_wb_adr: 3;
 	u64	ic_wb_ack: 1;

-	u64	ic_insn: 32;
+	u64	ic_insn: 36;
 	u64	ic_valid: 1;
 	u64	d1_valid: 1;
 	u64	d1_unit: 2;
@ -39,9 +39,8 @@ struct log_entry {
 	u64	e1_stall_out: 1;
 	u64	e1_redirect: 1;
 	u64	e1_valid: 1;
-	u64	e1_write_enable: 1;
-	u64	e1_unused: 3;

+	u64	e1_write_enable: 1;
 	u64	e1_irq_state: 1;
 	u64	e1_irq: 1;
 	u64	e1_exception: 1;
@ -49,7 +48,7 @@ struct log_entry {
 	u64	e1_msr_ir: 1;
 	u64	e1_msr_pr: 1;
 	u64	e1_msr_ee: 1;
-	u64	pad1: 5;
+	u64	pad1: 4;
 	u64	ls_state: 3;
 	u64	ls_dw_done: 1;
 	u64	ls_min_done: 1;
@ -88,13 +87,13 @@ const char *units[4] = { "--", "al", "ls", "fp" };
 const char *ops[64] =
 {
 	"illegal", "nop    ", "add    ", "and    ", "attn   ", "b      ", "bc     ", "bcreg  ",
-	"bperm  ", "cmp    ", "cmpb   ", "cmpeqb ", "cmprb  ", "cntz   ", "crop   ", "darn   ",
-	"dcbf   ", "dcbst  ", "dcbt   ", "dcbtst ", "dcbz   ", "div    ", "dive   ", "exts   ",
-	"extswsl", "fpop   ", "fpopi  ", "icbi   ", "icbt   ", "isel   ", "isync  ", "ld     ",
-	"st     ", "mcrxrx ", "mfcr   ", "mfmsr  ", "mfspr  ", "mod    ", "mtcrf  ", "mtmsr  ",
-	"mtspr  ", "mull64 ", "mulh64 ", "mulh32 ", "or     ", "popcnt ", "prty   ", "rfid   ",
-	"rlc    ", "rlcl   ", "rlcr   ", "sc     ", "setb   ", "shl    ", "shr    ", "sync   ",
-	"tlbie  ", "trap   ", "xor    ", "bcd    ", "addg6s ", "ffail  ", "?62    ", "?63    "
+	"bcd    ", "bperm  ", "cmp    ", "cmpb   ", "cmpeqb ", "cmprb  ", "cntz   ", "crop   ",
+	"darn   ", "dcbf   ", "dcbst  ", "dcbt   ", "dcbtst ", "dcbz   ", "icbi   ", "icbt   ",
+	"fpcmp  ", "fparith", "fpmove ", "fpmisc ", "div    ", "dive   ", "mod    ", "exts   ",
+	"extswsl", "isel   ", "isync  ", "ld     ", "st     ", "mcrxrx ", "mfcr   ", "mfmsr  ",
+	"mfspr  ", "mtcrf  ", "mtmsr  ", "mtspr  ", "mull64 ", "mulh64 ", "mulh32 ", "or     ",
+	"popcnt ", "prty   ", "rfid   ", "rlc    ", "rlcl   ", "rlcr   ", "sc     ", "setb   ",
+	"shl    ", "shr    ", "sync   ", "tlbie  ", "trap   ", "xor    ", "addg6s ", "ffail  ",
 };

 const char *spr_names[13] =
@ -134,9 +133,9 @@ int main(int ac, char **av)
 		full_nia[log.nia_lo & 0xf] = (log.nia_hi? 0xc000000000000000: 0) |
 			(log.nia_lo << 2);
 		if (lineno % 20 == 1) {
-			printf("        fetch1 NIA      icache                         decode1       decode2   execute1         loadstore  dcache       CR   GSPR\n");
-			printf("     ----------------   TAHW S -WB-- pN --insn--    pN un op         pN byp    FR IIE MSR  WC   SD MM CE   SRTO DE -WB-- c ms reg val\n");
-			printf("                        LdMy t csnSa IA             IA it            IA abc    le srx EPID em   tw rd mx   tAwp vr csnSa 0 k\n");
+			printf("        fetch1 NIA      icache                             decode1       decode2   execute1         loadstore  dcache       CR   GSPR\n");
+			printf("     ----------------   TAHW S -WB-- pN  ic --insn--    pN un op         pN byp    FR IIE MSR  WC   SD MM CE   SRTO DE -WB-- c ms reg val\n");
+			printf("                        LdMy t csnSa IA                 IA it            IA abc    le srx EPID em   tw rd mx   tAwp vr csnSa 0 k\n");
 		}
 		printf("%4ld %c0000%.11llx %c ", lineno,
 		       (log.nia_hi? 'c': '0'),
@ -154,12 +153,16 @@ int main(int ac, char **av)
 		       FLAG(ic_wb_stall, 'S'),
 		       FLAG(ic_wb_ack, 'a'),
 		       PNIA(ic_part_nia));
-		if (log.ic_valid)
-			printf("%.8x", log.ic_insn);
-		else if (log.ic_fetch_failed)
-			printf("!!!!!!!!");
+		if (log.ic_valid) {	/* ic_insn is a 36-bit field; its top bit (35) flags an illegal insn */
+			if (log.ic_insn & (1ull << 35))	/* 1ull: a 35-bit shift of 1ul is undefined where long is 32 bits */
+				printf("ill %.8lx", (unsigned long)(log.ic_insn & 0xffffffffull));
+			else	/* upper field (bits 35:26) in decimal under "ic", low 26 bits in hex */
+				printf("%3lu x%.7lx", (unsigned long)(log.ic_insn >> 26),
+				       (unsigned long)(log.ic_insn & 0x3ffffff));
+		} else if (log.ic_fetch_failed)
+			printf("    !!!!!!!!");	/* padded to 12 columns, same width as the valid cases */
 		else
-			printf("--------");
+			printf("--- --------");
 		printf(" %c%c %.2llx ",
 		       FLAG(ic_valid, '>'),
 		       FLAG(d2_stall_out, '|'),
--- a/soc.vhdl
+++ b/soc.vhdl
@ -59,7 +59,6 @@ entity soc is
 	SIM                : boolean;
        HAS_FPU            : boolean := true;
        HAS_BTC            : boolean := true;
-        HAS_SHORT_MULT     : boolean := false;
 	DISABLE_FLATTEN_CORE : boolean := false;
        ALT_RESET_ADDRESS  : std_logic_vector(63 downto 0) := (23 downto 0 => '0', others => '1');
 	HAS_DRAM           : boolean  := false;
@ -335,7 +334,6 @@ begin
 	    SIM => SIM,
            HAS_FPU => HAS_FPU,
            HAS_BTC => HAS_BTC,
-            HAS_SHORT_MULT => HAS_SHORT_MULT,
 	    DISABLE_FLATTEN => DISABLE_FLATTEN_CORE,
 	    ALT_RESET_ADDRESS => ALT_RESET_ADDRESS,
            LOG_LENGTH => LOG_LENGTH,
--- a/writeback.vhdl
+++ b/writeback.vhdl
@ -92,21 +92,20 @@ begin
        intr := e_in.interrupt or l_in.interrupt or fp_in.interrupt;
        interrupt_out.intr <= intr;

-        if intr = '1' then
-            srr1 := (others => '0');
-            if e_in.interrupt = '1' then
-                vec := e_in.intr_vec;
-                srr1 := e_in.srr1;
-            elsif l_in.interrupt = '1' then
-                vec := l_in.intr_vec;
-                srr1 := l_in.srr1;
-            elsif fp_in.interrupt = '1' then
-                vec := fp_in.intr_vec;
-                srr1 := fp_in.srr1;
-            end if;
-            interrupt_out.srr1 <= srr1;
+        srr1 := (others => '0');
+        if e_in.interrupt = '1' then
+            vec := e_in.intr_vec;
+            srr1 := e_in.srr1;
+        elsif l_in.interrupt = '1' then
+            vec := l_in.intr_vec;
+            srr1 := l_in.srr1;
+        elsif fp_in.interrupt = '1' then
+            vec := fp_in.intr_vec;
+            srr1 := fp_in.srr1;
+        end if;
+        interrupt_out.srr1 <= srr1;

-        else
+        if intr = '0' then
            if e_in.write_enable = '1' then
                w_out.write_reg <= e_in.write_reg;
                w_out.write_data <= e_in.write_data;
--- a/xilinx-mult-32s.vhdl
+++ b/xilinx-mult-32s.vhdl
@ -0,0 +1,295 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+
+library unisim;
+use unisim.vcomponents.all;
+
+-- Signed 33b x 33b multiplier giving 64-bit product, with no addend.
+-- The 32-bit operands are sign-extended when m_in.is_signed = '1', otherwise zero-extended.
+entity multiply_32s is
+    port (
+        clk   : in std_logic;
+        stall : in std_ulogic;  -- while '1' the output registers (m_out.valid, low product bits) hold
+
+        m_in  : in MultiplyInputType;   -- this unit reads data1(31:0), data2(31:0), is_signed and valid
+        m_out : out MultiplyOutputType  -- this unit drives result, overflow and valid
+        );
+end entity multiply_32s;
+
+architecture behaviour of multiply_32s is
+    signal clocken : std_ulogic;  -- register enable for the DSP slices: valid input and not stalled
+    signal data1 : std_ulogic_vector(52 downto 0);  -- operand 1 extended to 53 bits (23 low + 30 high)
+    signal data2 : std_ulogic_vector(34 downto 0);  -- operand 2 extended to 35 bits (17 low + 18 high)
+    signal m00_p, m01_p : std_ulogic_vector(47 downto 0);  -- P outputs of the data1(22:0) stages
+    signal m00_pc : std_ulogic_vector(47 downto 0);  -- cascade m00 -> m01
+    signal m10_p, m11_p : std_ulogic_vector(47 downto 0);  -- P outputs of the data1(52:23) stages
+    signal m10_pc : std_ulogic_vector(47 downto 0);  -- cascade m10 -> m11
+    signal p0_pat, p0_patb : std_ulogic;  -- pattern-detect outputs of m10
+    signal p1_pat, p1_patb : std_ulogic;  -- pattern-detect outputs of m11
+    signal product_lo : std_ulogic_vector(22 downto 0);  -- registered bits 22:0 of the product
+
+begin
+    -- sign extend if signed; otherwise the extension bits are '0' (is_signed and bit 31)
+    data1(31 downto 0)  <= m_in.data1(31 downto 0);
+    data1(52 downto 32) <= (others => m_in.is_signed and m_in.data1(31));
+    data2(31 downto 0)  <= m_in.data2(31 downto 0);
+    data2(34 downto 32) <= (others => m_in.is_signed and m_in.data2(31));
+
+    clocken <= m_in.valid and not stall;
+    -- m00: data1(22:0) * data2(16:0); every *REG generic is 0 and every CE is '0', so no registers are used
+    m00: DSP48E1
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 0,
+            INMODEREG => 0,
+            MREG => 0,
+            OPMODEREG => 0,
+            PREG => 0
+            )
+        port map (
+            A => "0000000" & data1(22 downto 0),  -- low 23 bits of operand 1, zero-padded to 30 bits
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => '0' & data2(16 downto 0),  -- low 17 bits of operand 2, zero-padded to 18 bits
+            BCIN => (others => '0'),
+            C => (others => '0'),  -- no addend
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => '0',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => '0',
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "0110101",  -- NOTE(review): per UG479 this selects P = multiplier output + C - confirm
+            P => m00_p,  -- bits 16:0 become product bits 16:0 (see product_lo)
+            PCIN => (others => '0'),
+            PCOUT => m00_pc,  -- cascaded into m01
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+    -- m01: data1(22:0) * data2(34:17) plus the cascade from m00; no registers are used here either
+    m01: DSP48E1
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 0,
+            INMODEREG => 0,
+            MREG => 0,
+            OPMODEREG => 0,
+            PREG => 0
+            )
+        port map (
+            A => "0000000" & data1(22 downto 0),  -- low 23 bits of operand 1, zero-padded to 30 bits
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => data2(34 downto 17),  -- high 18 bits of operand 2, including the extension bits
+            BCIN => (others => '0'),
+            C => (others => '0'),
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => '0',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => '0',
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "1010101",  -- NOTE(review): per UG479 this adds PCIN shifted right 17 bits to the product - confirm
+            P => m01_p,  -- bits 5:0 become product bits 22:17; bits 38:6 feed m10's C input
+            PCIN => m00_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+    -- m10: data1(52:23) * data2(16:0) plus the upper bits of m01's sum; its pattern detector feeds overflow
+    m10: DSP48E1
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 1,  -- C input is registered, enabled by CEC => clocken below
+            INMODEREG => 0,
+            MASK => x"fffffffe00ff",  -- only bits 16:8 are '0'; NOTE(review): '1' mask bits are presumably ignored by the detector - confirm
+            OPMODEREG => 0,
+            PREG => 0,
+            USE_PATTERN_DETECT => "PATDET"
+            )
+        port map (
+            A => data1(52 downto 23),  -- high 30 bits of operand 1, including the extension bits
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => '0' & data2(16 downto 0),  -- low 17 bits of operand 2, zero-padded to 18 bits
+            BCIN => (others => '0'),
+            C => std_ulogic_vector(resize(signed(m01_p(38 downto 6)), 48)),  -- m01 sum without its low 6 bits, sign-extended to 48 bits
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => clocken,
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => clocken,  -- MREG is not set in this generic map; NOTE(review): presumably relies on the primitive's default MREG = 1 - confirm
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "0110101",  -- same OPMODE as m00 (product + C)
+            P => m10_p,  -- bits 16:0 drive m_out.result(39:23)
+            PATTERNDETECT => p0_pat,
+            PATTERNBDETECT => p0_patb,
+            PCIN => (others => '0'),
+            PCOUT => m10_pc,  -- cascaded into m11
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+    -- m11: data1(52:23) * data2(34:17) plus the cascade from m10; its pattern detector also feeds overflow
+    m11: DSP48E1
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 0,
+            INMODEREG => 0,
+            MASK => x"fffffc000000",  -- only bits 25:0 are '0'; NOTE(review): '1' mask bits are presumably ignored by the detector - confirm
+            OPMODEREG => 0,
+            PREG => 0,
+            USE_PATTERN_DETECT => "PATDET"
+            )
+        port map (
+            A => data1(52 downto 23),  -- high 30 bits of operand 1, including the extension bits
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => data2(34 downto 17),  -- high 18 bits of operand 2, including the extension bits
+            BCIN => (others => '0'),
+            C => (others => '0'),
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => '0',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => clocken,  -- MREG is not set in this generic map; NOTE(review): presumably relies on the primitive's default MREG = 1 - confirm
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "1010101",  -- same OPMODE as m01 (product + shifted PCIN)
+            P => m11_p,  -- bits 23:0 drive m_out.result(63:40)
+            PATTERNDETECT => p1_pat,
+            PATTERNBDETECT => p1_patb,
+            PCIN => m10_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+    -- Assemble the 64-bit product; bits 127:64 of the result port are always zero.
+    m_out.result(127 downto 64) <= (others => '0');
+    m_out.result(63 downto 40) <= m11_p(23 downto 0);
+    m_out.result(39 downto 23) <= m10_p(16 downto 0);
+    m_out.result(22 downto 0)  <= product_lo;
+    -- NOTE(review): with an all-zero PATTERN this flags a product that does not fit in 32 signed bits - confirm
+    m_out.overflow <= not ((p0_pat and p1_pat) or (p0_patb and p1_patb));
+    -- Output register stage: captures valid and the low product bits, holding them while stall = '1'.
+    process(clk)
+    begin
+        if rising_edge(clk) and stall = '0' then
+            m_out.valid <= m_in.valid;
+            product_lo <= m01_p(5 downto 0) & m00_p(16 downto 0);  -- product bits 22:17 and 16:0
+        end if;
+    end process;
+
+end architecture behaviour;
--- a/xilinx-mult.vhdl
+++ b/xilinx-mult.vhdl
@ -18,26 +18,32 @@ entity multiply is
 end entity multiply;

 architecture behaviour of multiply is
+    signal d1sign : std_ulogic_vector(13 downto 0);
+    signal d2sign : std_ulogic_vector(4 downto 0);
    signal m00_p, m01_p, m02_p, m03_p : std_ulogic_vector(47 downto 0);
-    signal m00_pc : std_ulogic_vector(47 downto 0);
+    signal m00_pc, m02_pc : std_ulogic_vector(47 downto 0);
    signal m10_p, m11_p, m12_p, m13_p : std_ulogic_vector(47 downto 0);
-    signal m11_pc, m12_pc, m13_pc : std_ulogic_vector(47 downto 0);
+    signal m10_pc, m12_pc : std_ulogic_vector(47 downto 0);
    signal m20_p, m21_p, m22_p, m23_p : std_ulogic_vector(47 downto 0);
+    signal m20_pc, m22_pc : std_ulogic_vector(47 downto 0);
+    signal pp0, pp1 : std_ulogic_vector(127 downto 0);
+    signal pp23 : std_ulogic_vector(127 downto 0);
+    signal sumlo : std_ulogic_vector(8 downto 0);
    signal s0_pc, s1_pc : std_ulogic_vector(47 downto 0);
+    signal s0_carry, p0_carry : std_ulogic_vector(3 downto 0);
    signal product : std_ulogic_vector(127 downto 0);
    signal addend : std_ulogic_vector(127 downto 0);
-    signal s0_carry, p0_carry : std_ulogic_vector(3 downto 0);
-    signal p0_mask : std_ulogic_vector(47 downto 0);
    signal p0_pat, p0_patb : std_ulogic;
    signal p1_pat, p1_patb : std_ulogic;

-    signal req_32bit, r32_1 : std_ulogic;
    signal rnot_1 : std_ulogic;
    signal valid_1 : std_ulogic;
-    signal overflow, ovf_in : std_ulogic;
+    signal overflow : std_ulogic;

 begin
-    addend <= m_in.addend;
+    addend <= m_in.addend when m_in.subtract = '0' else not m_in.addend;
+    d1sign <= (others => m_in.data1(63) and m_in.is_signed);
+    d2sign <= (others => m_in.data2(63) and m_in.is_signed);

    m00: DSP48E1
        generic map (
@ -55,12 +61,12 @@ begin
            PREG => 1
            )
        port map (
-            A => "0000000" & m_in.data1(22 downto 0),
+            A => 6x"0" & m_in.data1(23 downto 0),
            ACIN => (others => '0'),
            ALUMODE => "0000",
            B => '0' & m_in.data2(16 downto 0),
            BCIN => (others => '0'),
-            C => "00000000000000" & addend(33 downto 0),
+            C => 14x"0" & addend(33 downto 0),
            CARRYCASCIN => '0',
            CARRYIN => '0',
            CARRYINSEL => "000",
@ -106,12 +112,14 @@ begin
            BREG => 0,
            CARRYINREG => 0,
            CARRYINSELREG => 0,
+            CREG => 0,
            INMODEREG => 0,
+            MREG => 1,
            OPMODEREG => 0,
            PREG => 0
            )
        port map (
-            A => "0000000" & m_in.data1(22 downto 0),
+            A => 6x"0" & m_in.data1(23 downto 0),
            ACIN => (others => '0'),
            ALUMODE => "0000",
            B => '0' & m_in.data2(33 downto 17),
@ -126,7 +134,7 @@ begin
            CEALUMODE => '0',
            CEB1 => '0',
            CEB2 => '0',
-            CEC => '1',
+            CEC => '0',
            CECARRYIN => '0',
            CECTRL => '0',
            CED => '0',
@ -168,12 +176,12 @@ begin
            PREG => 1
            )
        port map (
-            A => "0000000" & m_in.data1(22 downto 0),
+            A => 6x"0" & m_in.data1(23 downto 0),
            ACIN => (others => '0'),
            ALUMODE => "0000",
            B => '0' & m_in.data2(50 downto 34),
            BCIN => (others => '0'),
-            C => x"0000000" & "000" & addend(50 downto 34),
+            C => 24x"0" & addend(57 downto 34),
            CARRYCASCIN => '0',
            CARRYIN => '0',
            CARRYINSEL => "000",
@ -197,6 +205,7 @@ begin
            OPMODE => "0110101",
            P => m02_p,
            PCIN => (others => '0'),
+            PCOUT => m02_pc,
            RSTA => '0',
            RSTALLCARRYIN => '0',
            RSTALUMODE => '0',
@ -220,17 +229,17 @@ begin
            CARRYINSELREG => 0,
            CREG => 0,
            INMODEREG => 0,
-            MREG => 0,
+            MREG => 1,
            OPMODEREG => 0,
-            PREG => 1
+            PREG => 0
            )
        port map (
-            A => "0000000" & m_in.data1(22 downto 0),
+            A => 6x"0" & m_in.data1(23 downto 0),
            ACIN => (others => '0'),
            ALUMODE => "0000",
-            B => "00000" & m_in.data2(63 downto 51),
+            B => d2sign & m_in.data2(63 downto 51),
            BCIN => (others => '0'),
-            C => x"000000" & '0' & addend(73 downto 51),
+            C => (others => '0'),
            CARRYCASCIN => '0',
            CARRYIN => '0',
            CARRYINSEL => "000",
@ -245,15 +254,15 @@ begin
            CECTRL => '0',
            CED => '0',
            CEINMODE => '0',
-            CEM => '0',
-            CEP => m_in.valid,
+            CEM => m_in.valid,
+            CEP => '0',
            CLK => clk,
            D => (others => '0'),
            INMODE => "00000",
            MULTSIGNIN => '0',
-            OPMODE => "0110101",
+            OPMODE => "1010101",
            P => m03_p,
-            PCIN => (others => '0'),
+            PCIN => m02_pc,
            RSTA => '0',
            RSTALLCARRYIN => '0',
            RSTALUMODE => '0',
@ -277,16 +286,17 @@ begin
            CARRYINSELREG => 0,
            CREG => 0,
            INMODEREG => 0,
+            MREG => 0,
            OPMODEREG => 0,
-            PREG => 0
+            PREG => 1
            )
        port map (
-            A => "0000000000000" & m_in.data1(39 downto 23),
+            A => 6x"0" & m_in.data1(47 downto 24),
            ACIN => (others => '0'),
            ALUMODE => "0000",
            B => '0' & m_in.data2(16 downto 0),
            BCIN => (others => '0'),
-            C => x"000" & "00" & m01_p(39 downto 6),
+            C => (others => '0'),
            CARRYCASCIN => '0',
            CARRYIN => '0',
            CARRYINSEL => "000",
@ -301,15 +311,16 @@ begin
            CECTRL => '0',
            CED => '0',
            CEINMODE => '0',
-            CEM => m_in.valid,
-            CEP => '0',
+            CEM => '0',
+            CEP => m_in.valid,
            CLK => clk,
            D => (others => '0'),
            INMODE => "00000",
            MULTSIGNIN => '0',
-            OPMODE => "0110101",
+            OPMODE => "0000101",
            P => m10_p,
            PCIN => (others => '0'),
+            PCOUT => m10_pc,
            RSTA => '0',
            RSTALLCARRYIN => '0',
            RSTALUMODE => '0',
@ -333,16 +344,17 @@ begin
            CARRYINSELREG => 0,
            CREG => 0,
            INMODEREG => 0,
+            MREG => 1,
            OPMODEREG => 0,
            PREG => 0
            )
        port map (
-            A => "0000000000000" & m_in.data1(39 downto 23),
+            A => 6x"0" & m_in.data1(47 downto 24),
            ACIN => (others => '0'),
            ALUMODE => "0000",
            B => '0' & m_in.data2(33 downto 17),
            BCIN => (others => '0'),
-            C => x"000" & "00" & m02_p(39 downto 6),
+            C => (others => '0'),
            CARRYCASCIN => '0',
            CARRYIN => '0',
            CARRYINSEL => "000",
@ -363,10 +375,9 @@ begin
            D => (others => '0'),
            INMODE => "00000",
            MULTSIGNIN => '0',
-            OPMODE => "0110101",
+            OPMODE => "1010101",
            P => m11_p,
-            PCIN => (others => '0'),
-            PCOUT => m11_pc,
+            PCIN => m10_pc,
            RSTA => '0',
            RSTALLCARRYIN => '0',
            RSTALUMODE => '0',
@ -390,16 +401,17 @@ begin
            CARRYINSELREG => 0,
            CREG => 0,
            INMODEREG => 0,
+            MREG => 0,
            OPMODEREG => 0,
-            PREG => 0
+            PREG => 1
            )
        port map (
-            A => "0000000000000" & m_in.data1(39 downto 23),
+            A => 6x"0" & m_in.data1(47 downto 24),
            ACIN => (others => '0'),
            ALUMODE => "0000",
            B => '0' & m_in.data2(50 downto 34),
            BCIN => (others => '0'),
-            C => x"0000" & '0' & m03_p(36 downto 6),
+            C => 24x"0" & addend(81 downto 58),
            CARRYCASCIN => '0',
            CARRYIN => '0',
            CARRYINSEL => "000",
@ -414,8 +426,8 @@ begin
            CECTRL => '0',
            CED => '0',
            CEINMODE => '0',
-            CEM => m_in.valid,
-            CEP => '0',
+            CEM => '0',
+            CEP => m_in.valid,
            CLK => clk,
            D => (others => '0'),
            INMODE => "00000",
@ -445,17 +457,19 @@ begin
            BREG => 0,
            CARRYINREG => 0,
            CARRYINSELREG => 0,
+            CREG => 0,
            INMODEREG => 0,
+            MREG => 1,
            OPMODEREG => 0,
            PREG => 0
            )
        port map (
-            A => "0000000000000" & m_in.data1(39 downto 23),
+            A => 6x"0" & m_in.data1(47 downto 24),
            ACIN => (others => '0'),
            ALUMODE => "0000",
-            B => "00000" & m_in.data2(63 downto 51),
+            B => d2sign & m_in.data2(63 downto 51),
            BCIN => (others => '0'),
-            C => x"0000000" & "000" & addend(90 downto 74),
+            C => (others => '0'),
            CARRYCASCIN => '0',
            CARRYIN => '0',
            CARRYINSEL => "000",
@ -465,7 +479,7 @@ begin
            CEALUMODE => '0',
            CEB1 => '0',
            CEB2 => '0',
-            CEC => '1',
+            CEC => '0',
            CECARRYIN => '0',
            CECTRL => '0',
            CED => '0',
@ -476,10 +490,9 @@ begin
            D => (others => '0'),
            INMODE => "00000",
            MULTSIGNIN => '0',
-            OPMODE => "0110101",
+            OPMODE => "1010101",
            P => m13_p,
-            PCIN => (others => '0'),
-            PCOUT => m13_pc,
+            PCIN => m12_pc,
            RSTA => '0',
            RSTALLCARRYIN => '0',
            RSTALUMODE => '0',
@ -501,12 +514,14 @@ begin
            BREG => 0,
            CARRYINREG => 0,
            CARRYINSELREG => 0,
+            CREG => 0,
            INMODEREG => 0,
+            MREG => 0,
            OPMODEREG => 0,
-            PREG => 0
+            PREG => 1
            )
        port map (
-            A => "000000" & m_in.data1(63 downto 40),
+            A => d1sign & m_in.data1(63 downto 48),
            ACIN => (others => '0'),
            ALUMODE => "0000",
            B => '0' & m_in.data2(16 downto 0),
@ -521,20 +536,21 @@ begin
            CEALUMODE => '0',
            CEB1 => '0',
            CEB2 => '0',
-            CEC => '1',
+            CEC => '0',
            CECARRYIN => '0',
            CECTRL => '0',
            CED => '0',
            CEINMODE => '0',
-            CEM => m_in.valid,
-            CEP => '0',
+            CEM => '0',
+            CEP => m_in.valid,
            CLK => clk,
            D => (others => '0'),
            INMODE => "00000",
            MULTSIGNIN => '0',
-            OPMODE => "0010101",
+            OPMODE => "0000101",
            P => m20_p,
-            PCIN => m11_pc,
+            PCIN => (others => '0'),
+            PCOUT => m20_pc,
            RSTA => '0',
            RSTALLCARRYIN => '0',
            RSTALUMODE => '0',
@ -556,12 +572,14 @@ begin
            BREG => 0,
            CARRYINREG => 0,
            CARRYINSELREG => 0,
+            CREG => 0,
            INMODEREG => 0,
+            MREG => 1,
            OPMODEREG => 0,
            PREG => 0
            )
        port map (
-            A => "000000" & m_in.data1(63 downto 40),
+            A => d1sign & m_in.data1(63 downto 48),
            ACIN => (others => '0'),
            ALUMODE => "0000",
            B => '0' & m_in.data2(33 downto 17),
@ -576,7 +594,7 @@ begin
            CEALUMODE => '0',
            CEB1 => '0',
            CEB2 => '0',
-            CEC => '1',
+            CEC => '0',
            CECARRYIN => '0',
            CECTRL => '0',
            CED => '0',
@ -587,9 +605,9 @@ begin
            D => (others => '0'),
            INMODE => "00000",
            MULTSIGNIN => '0',
-            OPMODE => "0010101",
+            OPMODE => "1010101",
            P => m21_p,
-            PCIN => m12_pc,
+            PCIN => m20_pc,
            RSTA => '0',
            RSTALLCARRYIN => '0',
            RSTALUMODE => '0',
@ -611,17 +629,19 @@ begin
            BREG => 0,
            CARRYINREG => 0,
            CARRYINSELREG => 0,
+            CREG => 0,
            INMODEREG => 0,
+            MREG => 0,
            OPMODEREG => 0,
-            PREG => 0
+            PREG => 1
            )
        port map (
-            A => "000000" & m_in.data1(63 downto 40),
+            A => d1sign & m_in.data1(63 downto 48),
            ACIN => (others => '0'),
            ALUMODE => "0000",
            B => '0' & m_in.data2(50 downto 34),
            BCIN => (others => '0'),
-            C => (others => '0'),
+            C => "00" & addend(127 downto 82),
            CARRYCASCIN => '0',
            CARRYIN => '0',
            CARRYINSEL => "000",
@ -631,20 +651,21 @@ begin
            CEALUMODE => '0',
            CEB1 => '0',
            CEB2 => '0',
-            CEC => '1',
+            CEC => '0',
            CECARRYIN => '0',
            CECTRL => '0',
            CED => '0',
            CEINMODE => '0',
-            CEM => m_in.valid,
-            CEP => '0',
+            CEM => '0',
+            CEP => m_in.valid,
            CLK => clk,
            D => (others => '0'),
            INMODE => "00000",
            MULTSIGNIN => '0',
-            OPMODE => "0010101",
+            OPMODE => "0110101",
            P => m22_p,
-            PCIN => m13_pc,
+            PCIN => (others => '0'),
+            PCOUT => m22_pc,
            RSTA => '0',
            RSTALLCARRYIN => '0',
            RSTALUMODE => '0',
@ -666,17 +687,19 @@ begin
            BREG => 0,
            CARRYINREG => 0,
            CARRYINSELREG => 0,
+            CREG => 0,
            INMODEREG => 0,
+            MREG => 1,
            OPMODEREG => 0,
            PREG => 0
            )
        port map (
-            A => "000000" & m_in.data1(63 downto 40),
+            A => d1sign & m_in.data1(63 downto 48),
            ACIN => (others => '0'),
            ALUMODE => "0000",
-            B => "00000" & m_in.data2(63 downto 51),
+            B => d2sign & m_in.data2(63 downto 51),
            BCIN => (others => '0'),
-            C => x"00" & "000" & addend(127 downto 91),
+            C => (others => '0'),
            CARRYCASCIN => '0',
            CARRYIN => '0',
            CARRYINSEL => "000",
@ -686,7 +709,7 @@ begin
            CEALUMODE => '0',
            CEB1 => '0',
            CEB2 => '0',
-            CEC => '1',
+            CEC => '0',
            CECARRYIN => '0',
            CECTRL => '0',
            CED => '0',
@ -697,9 +720,9 @@ begin
            D => (others => '0'),
            INMODE => "00000",
            MULTSIGNIN => '0',
-            OPMODE => "0110101",
+            OPMODE => "1010101",
            P => m23_p,
-            PCIN => (others => '0'),
+            PCIN => m22_pc,
            RSTA => '0',
            RSTALLCARRYIN => '0',
            RSTALUMODE => '0',
@ -712,6 +735,17 @@ begin
            RSTP => '0'
            );

+    pp0 <= std_ulogic_vector(resize(signed(m13_p(37 downto 0) & m12_p(16 downto 0) &
+                                           m01_p(40 downto 0) & m00_p(16 downto 0)), 128));
+    pp1 <= m23_p(28 downto 0) & m22_p(16 downto 0) & m11_p(40 downto 0) & m10_p(16 downto 0) & 24x"0";
+    -- pp2 <= std_ulogic_vector(resize(signed(m03_p(37 downto 0) & m02_p(16 downto 0) & 34x"0"), 128));
+    -- pp3 <= std_ulogic_vector(resize(signed(m21_p(34 downto 0) & m20_p(16 downto 0) & 48x"0"), 128));
+
+    pp23 <= std_ulogic_vector(resize(resize(signed(m03_p(37 downto 0) & m02_p(16 downto 0) & 34x"0"), 100) +
+                                     signed(m21_p(34 downto 0) & m20_p(16 downto 0) & 48x"0"), 128));
+
+    sumlo <= std_ulogic_vector(unsigned('0' & pp0(31 downto 24)) + unsigned('0' & pp1(31 downto 24)));
+
    s0: DSP48E1
        generic map (
            ACASCREG => 0,
@ -725,16 +759,16 @@ begin
            INMODEREG => 0,
            MREG => 0,
            OPMODEREG => 0,
-            PREG => 1,
+            PREG => 0,
            USE_MULT => "none"
            )
        port map (
-            A => m22_p(5 downto 0) & x"0000" & m10_p(34 downto 27),
+            A => pp0(79 downto 50),
            ACIN => (others => '0'),
            ALUMODE => "0000",
-            B => m10_p(26 downto 9),
+            B => pp0(49 downto 32),
            BCIN => (others => '0'),
-            C => m20_p(39 downto 0) & m02_p(5 downto 0) & "00",
+            C => pp1(79 downto 32),
            CARRYCASCIN => '0',
            CARRYIN => '0',
            CARRYINSEL => "000",
@ -751,7 +785,7 @@ begin
            CED => '0',
            CEINMODE => '0',
            CEM => '0',
-            CEP => valid_1,
+            CEP => '0',
            CLK => clk,
            D => (others => '0'),
            INMODE => "00000",
@ -773,43 +807,43 @@ begin

    s1: DSP48E1
        generic map (
-            ACASCREG => 1,
+            ACASCREG => 0,
            ALUMODEREG => 0,
-            AREG => 1,
-            BCASCREG => 1,
-            BREG => 1,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
            CARRYINREG => 0,
            CARRYINSELREG => 0,
-            CREG => 1,
+            CREG => 0,
            INMODEREG => 0,
            MREG => 0,
            OPMODEREG => 0,
-            PREG => 0,
+            PREG => 1,
            USE_MULT => "none"
            )
        port map (
-            A => x"000" & m22_p(41 downto 24),
+            A => pp0(127 downto 98),
            ACIN => (others => '0'),
            ALUMODE => "0000",
-            B => m22_p(23 downto 6),
+            B => pp0(97 downto 80),
            BCIN => (others => '0'),
-            C => m23_p(36 downto 0) & x"00" & "0" & m20_p(41 downto 40),
+            C => pp1(127 downto 80),
            CARRYCASCIN => '0',
            CARRYIN => s0_carry(3),
            CARRYINSEL => "000",
            CEA1 => '0',
-            CEA2 => valid_1,
+            CEA2 => '0',
            CEAD => '0',
            CEALUMODE => '0',
            CEB1 => '0',
-            CEB2 => valid_1,
-            CEC => valid_1,
+            CEB2 => '0',
+            CEC => '0',
            CECARRYIN => '0',
            CECTRL => '0',
            CED => '0',
            CEINMODE => '0',
            CEM => '0',
-            CEP => '0',
+            CEP => valid_1,
            CLK => clk,
            D => (others => '0'),
            INMODE => "00000",
@ -829,52 +863,48 @@ begin
            RSTP => '0'
            );

-    -- mask is 0 for 32-bit ops, 0x0000ffffffff for 64-bit
-    p0_mask(47 downto 31) <= (others => '0');
-    p0_mask(30 downto 0) <= (others => not r32_1);
-
    p0: DSP48E1
        generic map (
-            ACASCREG => 1,
-            ALUMODEREG => 1,
-            AREG => 1,
-            BCASCREG => 1,
-            BREG => 1,
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
            CARRYINREG => 0,
            CARRYINSELREG => 0,
-            CREG => 1,
+            CREG => 0,
            INMODEREG => 0,
+            MASK => x"00007fffffff",
            MREG => 0,
            OPMODEREG => 0,
-            PREG => 0,
-            SEL_MASK => "C",
+            PREG => 1,
            USE_MULT => "none",
            USE_PATTERN_DETECT => "PATDET"
            )
        port map (
-            A => m21_p(22 downto 0) & m03_p(5 downto 0) & '0',
+            A => pp23(79 downto 50),
            ACIN => (others => '0'),
            ALUMODE => "00" & rnot_1 & '0',
-            B => (others => '0'),
+            B => pp23(49 downto 32),
            BCIN => (others => '0'),
-            C => p0_mask,
+            C => (others => '0'),
            CARRYCASCIN => '0',
-            CARRYIN => '0',
+            CARRYIN => sumlo(8),
            CARRYINSEL => "000",
            CARRYOUT => p0_carry,
            CEA1 => '0',
-            CEA2 => valid_1,
+            CEA2 => '0',
            CEAD => '0',
-            CEALUMODE => valid_1,
+            CEALUMODE => '0',
            CEB1 => '0',
-            CEB2 => valid_1,
-            CEC => valid_1,
+            CEB2 => '0',
+            CEC => '0',
            CECARRYIN => '0',
            CECTRL => '0',
            CED => '0',
            CEINMODE => '0',
            CEM => '0',
-            CEP => '0',
+            CEP => valid_1,
            CLK => clk,
            D => (others => '0'),
            INMODE => "00000",
@ -915,10 +945,10 @@ begin
            USE_PATTERN_DETECT => "PATDET"
            )
        port map (
-            A => x"0000000" & '0' & m21_p(41),
+            A => pp23(127 downto 98),
            ACIN => (others => '0'),
            ALUMODE => "00" & rnot_1 & '0',
-            B => m21_p(40 downto 23),
+            B => pp23(97 downto 80),
            BCIN => (others => '0'),
            C => (others => '0'),
            CARRYCASCIN => '0',
@ -930,7 +960,7 @@ begin
            CEALUMODE => valid_1,
            CEB1 => '0',
            CEB2 => valid_1,
-            CEC => '0',
+            CEC => valid_1,
            CECARRYIN => '0',
            CECTRL => '0',
            CED => '0',
@ -958,39 +988,26 @@ begin
            RSTP => '0'
            );

-    mult_out: process(all)
-        variable ov : std_ulogic;
-    begin
-        -- set overflow if the high bits are neither all zeroes nor all ones
-        if req_32bit = '0' then
-            ov := not ((p1_pat and p0_pat) or (p1_patb and p0_patb));
-        else
-            ov := not ((p1_pat and p0_pat and not product(31)) or
-                       (p1_patb and p0_patb and product(31)));
-        end if;
-        ovf_in <= ov;
-
-        m_out.result <= product;
-        m_out.overflow <= overflow;
-    end process;
-
    -- Clocked registers for the multiplier outputs and control pipeline.
    -- All assignments below take effect on the rising edge of clk and read the
    -- signal values from before that edge, so m_out.valid follows m_in.valid
    -- by two rising edges (m_in.valid -> valid_1 -> m_out.valid).
    process(clk)
    begin
        if rising_edge(clk) then
-            if rnot_1 = '0' then
-                product(31 downto 0) <= m10_p(8 downto 0) & m01_p(5 downto 0) & m00_p(16 downto 0);
-            else
-                product(31 downto 0) <= not (m10_p(8 downto 0) & m01_p(5 downto 0) & m00_p(16 downto 0));
            -- Added version: the low 32 bits of product are captured only while
            -- valid_1 = '1'; otherwise they keep their previous value.
            -- The value is sumlo(7 downto 0) & pp0(23 downto 0), bitwise
            -- inverted when rnot_1 = '1'.
+            if valid_1 = '1' then
+                if rnot_1 = '0' then
+                    product(31 downto 0) <= sumlo(7 downto 0) & pp0(23 downto 0);
+                else
+                    product(31 downto 0) <= not (sumlo(7 downto 0) & pp0(23 downto 0));
+                end if;
            end if;
            m_out.valid <= valid_1;
            valid_1 <= m_in.valid;
-            req_32bit <= r32_1;
-            r32_1 <= m_in.is_32bit;
-            rnot_1 <= m_in.not_result;
-            overflow <= ovf_in;
            -- rnot_1 is the registered copy of m_in.subtract; besides selecting
            -- the inversion above it also appears in the ALUMODE inputs
            -- ("00" & rnot_1 & '0') of the DSP48E1 instances.
            -- overflow is set unless both *_pat signals are set or both *_patb
            -- signals are set.
            -- NOTE(review): p0/p1 pat and patb presumably come from the DSP48E1
            -- pattern-detect outputs; the removed mult_out process described
            -- the same expression as "high bits neither all zeroes nor all
            -- ones" -- port connections are not visible here, confirm.
+            rnot_1 <= m_in.subtract;
+            overflow <= not ((p1_pat and p0_pat) or (p1_patb and p0_patb));
        end if;
    end process;

    -- Drive the result and overflow outputs directly from the registers
    -- (the added replacement for the removed mult_out process).
+    m_out.result <= product;
+    m_out.overflow <= overflow;
+
 end architecture behaviour;

 library ieee;