From fa9df33f7ea750de5d11078b0bc7586dbfac86a4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 28 Sep 2023 21:58:15 +1000 Subject: [PATCH] Implement cfuged, pdepd and pextd This implements the cfuged, pdepd and pextd instructions in a new unit called bit_sorter (so called because cfuged and pextd can be viewed as sorting the bits of the mask). The cnt* instructions and the popcnt* instructions now use the same OP_COUNTB insn_type so as to free up an insn_type value to use for the new instructions. The new instructions are implemented using a slow and simple algorithm that takes 64 cycles to compute the result. The ex1 stage is stalled while this happens, as for a 64-bit multiply, or for a divide when there is no FPU. Signed-off-by: Paul Mackerras --- Makefile | 2 +- bitsort.vhdl | 102 ++++++++++++++++++++++++++++++++++++++ decode1.vhdl | 17 ++++--- decode2.vhdl | 13 ++--- decode_types.vhdl | 57 +++++++++++---------- execute1.vhdl | 76 +++++++++++++++++++++------- microwatt.core | 1 + predecode.vhdl | 3 ++ scripts/fmt_log/fmt_log.c | 4 +- 9 files changed, 213 insertions(+), 62 deletions(-) create mode 100644 bitsort.vhdl diff --git a/Makefile b/Makefile index fb591a4..01eab73 100644 --- a/Makefile +++ b/Makefile @@ -74,7 +74,7 @@ core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \ cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \ logical.vhdl countbits.vhdl multiply.vhdl multiply-32s.vhdl divider.vhdl \ execute1.vhdl loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl \ - core_debug.vhdl core.vhdl fpu.vhdl pmu.vhdl + core_debug.vhdl core.vhdl fpu.vhdl pmu.vhdl bitsort.vhdl soc_files = wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \ wishbone_debug_master.vhdl xics.vhdl syscon.vhdl gpio.vhdl soc.vhdl \ diff --git a/bitsort.vhdl b/bitsort.vhdl new file mode 100644 index 0000000..f2aeddb --- /dev/null +++ b/bitsort.vhdl @@ -0,0 +1,102 @@ +-- Implements instructions that involve sorting bits, +-- that is, 
cfuged, pextd and pdepd. +-- +-- cfuged: Sort the bits in the mask in RB into 0s at the left, 1s at the right +-- and move the bits in RS in the same fashion to give the result +-- pextd: Like cfuged but only use the bits of RS where the +-- corresponding bit in RB is 1 +-- pdepd: Inverse of pextd; take the low-order bits of RS and spread them out +-- to the bit positions which have a 1 in RB + +-- NB opc is bits 7-6 of the instruction: +-- 00 = pdepd, 01 = pextd, 10 = cfuged + +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library work; +use work.helpers.all; + +entity bit_sorter is + port ( + clk : in std_ulogic; + rst : in std_ulogic; + rs : in std_ulogic_vector(63 downto 0); + rb : in std_ulogic_vector(63 downto 0); + go : in std_ulogic; + opc : in std_ulogic_vector(1 downto 0); + done : out std_ulogic; + result : out std_ulogic_vector(63 downto 0) + ); +end entity bit_sorter; + +architecture behaviour of bit_sorter is + + signal val : std_ulogic_vector(63 downto 0); + signal st : std_ulogic; + signal sd : std_ulogic; + signal opr : std_ulogic_vector(1 downto 0); + signal bc : unsigned(5 downto 0); + signal jl : unsigned(5 downto 0); + signal jr : unsigned(5 downto 0); + signal sr_ml : std_ulogic_vector(63 downto 0); + signal sr_mr : std_ulogic_vector(63 downto 0); + signal sr_vl : std_ulogic_vector(63 downto 0); + signal sr_vr : std_ulogic_vector(63 downto 0); + +begin + bsort_r: process(clk) + begin + if rising_edge(clk) then + sd <= '0'; + if rst = '1' then + st <= '0'; + opr <= "00"; + val <= (others => '0'); + elsif go = '1' then + st <= '1'; + sr_ml <= rb; + sr_mr <= rb; + sr_vl <= rs; + sr_vr <= rs; + opr <= opc; + val <= (others => '0'); + bc <= to_unsigned(0, 6); + jl <= to_unsigned(63, 6); + jr <= to_unsigned(0, 6); + elsif st = '1' then + if bc = 6x"3f" then + st <= '0'; + sd <= '1'; + end if; + bc <= bc + 1; + if sr_ml(63) = '0' and opr(1) = '1' then + -- cfuged + val(to_integer(jl)) <= sr_vl(63); + jl <= jl - 1; + 
end if; + if sr_mr(0) = '1' then + if opr = "00" then + -- pdepd + val(to_integer(bc)) <= sr_vr(0); + else + -- cfuged or pextd + val(to_integer(jr)) <= sr_vr(0); + end if; + jr <= jr + 1; + end if; + sr_vl <= sr_vl(62 downto 0) & '0'; + if opr /= "00" or sr_mr(0) = '1' then + sr_vr <= '0' & sr_vr(63 downto 1); + end if; + sr_ml <= sr_ml(62 downto 0) & '0'; + sr_mr <= '0' & sr_mr(63 downto 1); + end if; + end if; + end process; + + done <= sd; + result <= val; + +end behaviour; diff --git a/decode1.vhdl b/decode1.vhdl index 75bb9c3..86fb5cf 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -106,6 +106,7 @@ architecture behaviour of decode1 is INSN_brd => (ALU, NONE, OP_BREV, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_cbcdtd => (ALU, NONE, OP_BCD, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_cdtbcd => (ALU, NONE, OP_BCD, NONE, NONE, RS, RA, '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_cfuged => (ALU, NONE, OP_BSORT, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_cmp => (ALU, NONE, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), INSN_cmpb => (ALU, NONE, OP_CMPB, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_cmpeqb => (ALU, NONE, OP_CMPEQB, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), @@ -113,10 +114,10 @@ architecture behaviour of decode1 is INSN_cmpl => (ALU, NONE, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_cmpli => (ALU, NONE, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', 
'0', NONE), INSN_cmprb => (ALU, NONE, OP_CMPRB, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_cntlzd => (ALU, NONE, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), - INSN_cntlzw => (ALU, NONE, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), - INSN_cnttzd => (ALU, NONE, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), - INSN_cnttzw => (ALU, NONE, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), + INSN_cntlzd => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), + INSN_cntlzw => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), + INSN_cnttzd => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), + INSN_cnttzw => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), INSN_crand => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_crandc => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_creqv => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), @@ -281,6 +282,8 @@ architecture behaviour of decode1 is INSN_ori => (ALU, NONE, OP_LOGIC, NONE, CONST_UI, RS, RA, '0', '0', '1', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), INSN_oris => 
(ALU, NONE, OP_LOGIC, NONE, CONST_UI_HI, RS, RA, '0', '0', '1', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), INSN_paddi => (ALU, NONE, OP_ADD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_pdepd => (ALU, NONE, OP_BSORT, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_pextd => (ALU, NONE, OP_BSORT, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_plbz => (LDST, NONE, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_pld => (LDST, NONE, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_plfd => (LDST, FPU, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), @@ -296,9 +299,9 @@ architecture behaviour of decode1 is INSN_pstfs => (LDST, FPU, OP_STORE, RA0_OR_CIA, CONST_PSI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), INSN_psth => (LDST, NONE, OP_STORE, RA0_OR_CIA, CONST_PSI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_pstw => (LDST, NONE, OP_STORE, RA0_OR_CIA, CONST_PSI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_popcntb => (ALU, NONE, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_popcntd => (ALU, NONE, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_popcntw => (ALU, NONE, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, 
'0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_popcntb => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_popcntd => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_popcntw => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_prtyd => (ALU, NONE, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_prtyw => (ALU, NONE, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_rfid => (ALU, NONE, OP_RFID, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), diff --git a/decode2.vhdl b/decode2.vhdl index 94fb6a7..a747495 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -232,12 +232,13 @@ architecture behaviour of decode2 is ); constant subresult_select : mux_select_array_t := ( - OP_MUL_L64 => "000", -- muldiv_result - OP_MUL_H64 => "001", - OP_MUL_H32 => "010", - OP_DIV => "011", - OP_DIVE => "011", - OP_MOD => "011", + OP_MUL_L64 => "000", -- multicyc_result + OP_MUL_H64 => "010", + OP_MUL_H32 => "001", + OP_DIV => "101", + OP_DIVE => "101", + OP_MOD => "101", + OP_BSORT => "100", OP_ADDG6S => "001", -- misc_result OP_ISEL => "010", OP_DARN => "011", diff --git a/decode_types.vhdl b/decode_types.vhdl index 4f81a36..dc104cd 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -6,7 +6,7 @@ package decode_types is OP_ATTN, OP_B, OP_BC, OP_BCREG, OP_BCD, OP_BPERM, OP_BREV, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB, - OP_CNTZ, OP_CROP, + OP_COUNTB, OP_CROP, OP_DARN, OP_DCBF, OP_DCBST, OP_XCBT, OP_DCBTST, OP_DCBZ, OP_ICBI, OP_FP_CMP, OP_FP_ARITH, OP_FP_MOVE, OP_FP_MISC, @@ -18,7 +18,8 @@ package 
decode_types is OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MTCRF, OP_MTMSRD, OP_MTSPR, OP_MUL_L64, OP_MUL_H64, OP_MUL_H32, - OP_POPCNT, OP_PRTY, OP_RFID, + OP_BSORT, + OP_PRTY, OP_RFID, OP_RLC, OP_RLCL, OP_RLCR, OP_SC, OP_SETB, OP_SHL, OP_SHR, OP_SYNC, OP_TLBIE, OP_TRAP, @@ -179,11 +180,12 @@ package decode_types is INSN_and, INSN_andc, INSN_bperm, + INSN_cfuged, INSN_cmp, INSN_cmpb, INSN_cmpeqb, - INSN_cmpl, - INSN_cmprb, -- 140 + INSN_cmpl, -- 140 + INSN_cmprb, INSN_dcbf, INSN_dcbst, INSN_dcbt, @@ -192,8 +194,8 @@ package decode_types is INSN_divd, INSN_divdu, INSN_divde, - INSN_divdeu, - INSN_divw, -- 150 + INSN_divdeu, -- 150 + INSN_divw, INSN_divwu, INSN_divwe, INSN_divweu, @@ -202,8 +204,8 @@ package decode_types is INSN_icbt, INSN_isel, INSN_lbarx, - INSN_lbzcix, - INSN_lbzux, -- 160 + INSN_lbzcix, -- 160 + INSN_lbzux, INSN_lbzx, INSN_ldarx, INSN_ldbrx, @@ -212,8 +214,8 @@ package decode_types is INSN_ldux, INSN_lharx, INSN_lhax, - INSN_lhaux, - INSN_lhbrx, -- 170 + INSN_lhaux, -- 170 + INSN_lhbrx, INSN_lhzcix, INSN_lhzx, INSN_lhzux, @@ -222,8 +224,8 @@ package decode_types is INSN_lwaux, INSN_lwbrx, INSN_lwzcix, - INSN_lwzx, - INSN_lwzux, -- 180 + INSN_lwzx, -- 180 + INSN_lwzux, INSN_modsd, INSN_modsw, INSN_moduw, @@ -232,51 +234,54 @@ package decode_types is INSN_mulhwu, INSN_mulhd, INSN_mulhdu, - INSN_mullw, - INSN_mulld, -- 190 + INSN_mullw, -- 190 + INSN_mulld, INSN_nand, INSN_nor, INSN_or, INSN_orc, + INSN_pdepd, + INSN_pextd, INSN_rldcl, INSN_rldcr, - INSN_rlwnm, + INSN_rlwnm, -- 200 INSN_slw, INSN_sld, - INSN_sraw, -- 200 + INSN_sraw, INSN_srad, INSN_srw, INSN_srd, INSN_stbcix, INSN_stbcx, INSN_stbx, - INSN_stbux, + INSN_stbux, -- 210 INSN_stdbrx, INSN_stdcix, - INSN_stdcx, -- 210 + INSN_stdcx, INSN_stdx, INSN_stdux, INSN_sthbrx, INSN_sthcix, INSN_sthcx, INSN_sthx, - INSN_sthux, + INSN_sthux, -- 220 INSN_stwbrx, INSN_stwcix, - INSN_stwcx, -- 220 + INSN_stwcx, INSN_stwx, INSN_stwux, INSN_subf, INSN_subfc, INSN_subfe, INSN_td, - INSN_tlbie, + 
INSN_tlbie, -- 230 INSN_tlbiel, INSN_tw, - INSN_xor, -- 230 + INSN_xor, - -- pad to 232 to simplify comparison logic - INSN_231, + -- pad to 240 to simplify comparison logic + INSN_234, INSN_235, + INSN_236, INSN_237, INSN_238, INSN_239, -- The following instructions have a third input addressed by RC INSN_maddld, @@ -284,9 +289,7 @@ package decode_types is INSN_maddhdu, -- pad to 256 to simplify comparison logic - INSN_235, - INSN_236, INSN_237, INSN_238, INSN_239, - INSN_240, INSN_241, INSN_242, INSN_243, + INSN_243, INSN_244, INSN_245, INSN_246, INSN_247, INSN_248, INSN_249, INSN_250, INSN_251, INSN_252, INSN_253, INSN_254, INSN_255, diff --git a/execute1.vhdl b/execute1.vhdl index 9b55195..2cc9c35 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -113,6 +113,7 @@ architecture behaviour of execute1 is direct_branch : std_ulogic; start_mul : std_ulogic; start_div : std_ulogic; + start_bsort : std_ulogic; do_trace : std_ulogic; fp_intr : std_ulogic; res2_sel : std_ulogic_vector(1 downto 0); @@ -134,7 +135,7 @@ architecture behaviour of execute1 is prev_op : insn_type_t; prev_prefixed : std_ulogic; oe : std_ulogic; - mul_select : std_ulogic_vector(1 downto 0); + mul_select : std_ulogic_vector(2 downto 0); res2_sel : std_ulogic_vector(1 downto 0); spr_select : spr_id; pmu_spr_num : std_ulogic_vector(4 downto 0); @@ -144,6 +145,7 @@ architecture behaviour of execute1 is mul_in_progress : std_ulogic; mul_finish : std_ulogic; div_in_progress : std_ulogic; + bsort_in_progress : std_ulogic; no_instr_avail : std_ulogic; instr_dispatch : std_ulogic; ext_interrupt : std_ulogic; @@ -164,10 +166,11 @@ architecture behaviour of execute1 is busy => '0', fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, prev_prefixed => '0', - oe => '0', mul_select => "00", res2_sel => "00", + oe => '0', mul_select => "000", res2_sel => "00", spr_select => spr_id_init, pmu_spr_num => 5x"0", redir_to_next => '0', advance_nia => '0', lr_from_next => '0', mul_in_progress => '0', 
mul_finish => '0', div_in_progress => '0', + bsort_in_progress => '0', no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0', taken_branch_event => '0', br_mispredict => '0', msr => 64x"0", @@ -209,7 +212,8 @@ architecture behaviour of execute1 is signal alu_result: std_ulogic_vector(63 downto 0); signal adder_result: std_ulogic_vector(63 downto 0); signal misc_result: std_ulogic_vector(63 downto 0); - signal muldiv_result: std_ulogic_vector(63 downto 0); + signal multicyc_result: std_ulogic_vector(63 downto 0); + signal bsort_result: std_ulogic_vector(63 downto 0); signal spr_result: std_ulogic_vector(63 downto 0); signal next_nia : std_ulogic_vector(63 downto 0); signal s1_sel : std_ulogic_vector(2 downto 0); @@ -234,6 +238,10 @@ architecture behaviour of execute1 is signal x_to_divider: Execute1ToDividerType; signal divider_to_x: DividerToExecute1Type := DividerToExecute1Init; + -- bit-sort unit signals + signal bsort_start : std_ulogic; + signal bsort_done : std_ulogic; + -- random number generator signals signal random_raw : std_ulogic_vector(63 downto 0); signal random_cond : std_ulogic_vector(63 downto 0); @@ -493,6 +501,18 @@ begin ); end generate; + bsort_0: entity work.bit_sorter + port map ( + clk => clk, + rst => rst, + rs => c_in, + rb => b_in, + go => bsort_start, + opc => e_in.insn(7 downto 6), + done => bsort_done, + result => bsort_result + ); + random_0: entity work.random port map ( clk => clk, @@ -664,7 +684,7 @@ begin adder_result when "000", logical_result when "001", rotator_result when "010", - muldiv_result when "100", + multicyc_result when "100", ramspr_result when "101", misc_result when others; @@ -845,17 +865,21 @@ begin x_to_mult_32s.subtract <= '0'; x_to_mult_32s.addend <= (others => '0'); - case ex1.mul_select is - when "00" => - muldiv_result <= multiply_to_x.result(63 downto 0); - when "01" => - muldiv_result <= multiply_to_x.result(127 downto 64); - when "10" => - muldiv_result <= multiply_to_x.result(63 downto 32) & 
- multiply_to_x.result(63 downto 32); - when others => - muldiv_result <= divider_to_x.write_reg_data; - end case; + if ex1.mul_select(2) = '0' then + case ex1.mul_select(1 downto 0) is + when "00" => + multicyc_result <= multiply_to_x.result(63 downto 0); + when "01" => + multicyc_result <= multiply_to_x.result(63 downto 32) & + multiply_to_x.result(63 downto 32); + when others => + multicyc_result <= multiply_to_x.result(127 downto 64); + end case; + elsif ex1.mul_select(0) = '1' and not HAS_FPU then + multicyc_result <= divider_to_x.write_reg_data; + else + multicyc_result <= bsort_result; + end if; -- Compute misc_result case e_in.sub_select is @@ -1266,7 +1290,7 @@ begin end if; v.do_trace := '0'; - when OP_CNTZ | OP_POPCNT => + when OP_COUNTB => v.res2_sel := "01"; slow_op := '1'; when OP_ISEL => @@ -1388,6 +1412,11 @@ begin when OP_ICBI => v.se.icache_inval := '1'; + when OP_BSORT => + v.start_bsort := '1'; + slow_op := '1'; + owait := '1'; + when OP_MUL_L64 => if e_in.is_32bit = '1' then v.se.mult_32s := '1'; @@ -1565,7 +1594,7 @@ begin v.oe := e_in.oe; v.spr_select := e_in.spr_select; v.pmu_spr_num := e_in.insn(20 downto 16); - v.mul_select := e_in.sub_select(1 downto 0); + v.mul_select := e_in.sub_select; v.se := side_effect_init; v.ramspr_wraddr := e_in.ramspr_wraddr; v.lr_from_next := e_in.lr; @@ -1596,7 +1625,7 @@ begin rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0'; rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0'; - do_popcnt <= '1' when e_in.insn_type = OP_POPCNT else '0'; + do_popcnt <= '1' when e_in.insn_type = OP_COUNTB and e_in.insn(7 downto 6) = "11" else '0'; if valid_in = '1' then v.prev_op := e_in.insn_type; @@ -1671,6 +1700,7 @@ begin v.mul_in_progress := actions.start_mul; x_to_divider.valid <= actions.start_div; v.div_in_progress := actions.start_div; + v.bsort_in_progress := actions.start_bsort; v.br_mispredict := v.e.redirect and actions.direct_branch; v.advance_nia := 
actions.advance_nia; v.redir_to_next := actions.redir_to_next; @@ -1681,7 +1711,7 @@ begin -- multiply is happening in order to stop following -- instructions from using the wrong XER value -- (and for simplicity in the OE=0 case). - v.busy := actions.start_div or actions.start_mul; + v.busy := actions.start_div or actions.start_mul or actions.start_bsort; -- instruction for other units, i.e. LDST if e_in.unit = LDST then @@ -1692,6 +1722,7 @@ begin end if; end if; is_scv := go and actions.se.scv_trap; + bsort_start <= go and actions.start_bsort; if not HAS_FPU and ex1.div_in_progress = '1' then v.div_in_progress := not divider_to_x.valid; @@ -1724,6 +1755,13 @@ begin end if; v.e.valid := '1'; end if; + if ex1.bsort_in_progress = '1' then + v.bsort_in_progress := not bsort_done; + v.e.valid := bsort_done; + v.busy := not bsort_done; + v.e.write_data := alu_result; + bypass_valid := bsort_done; + end if; if v.e.write_xerc_enable = '1' and v.e.valid = '1' then v.xerc := v.e.xerc; diff --git a/microwatt.core b/microwatt.core index dad180f..f56bee0 100644 --- a/microwatt.core +++ b/microwatt.core @@ -20,6 +20,7 @@ filesets: - sim_console.vhdl - logical.vhdl - countbits.vhdl + - bitsort.vhdl - control.vhdl - execute1.vhdl - fpu.vhdl diff --git a/predecode.vhdl b/predecode.vhdl index 1846e3c..65cb751 100644 --- a/predecode.vhdl +++ b/predecode.vhdl @@ -219,6 +219,7 @@ architecture behaviour of predecoder is 2#0_00101_11011# => INSN_brd, 2#0_01001_11010# => INSN_cbcdtd, 2#0_01000_11010# => INSN_cdtbcd, + 2#0_00110_11100# => INSN_cfuged, 2#0_00000_00000# => INSN_cmp, 2#0_01111_11100# => INSN_cmpb, 2#0_00111_00000# => INSN_cmpeqb, @@ -363,6 +364,8 @@ architecture behaviour of predecoder is 2#0_00011_11100# => INSN_nor, 2#0_01101_11100# => INSN_or, 2#0_01100_11100# => INSN_orc, + 2#0_00100_11100# => INSN_pdepd, + 2#0_00101_11100# => INSN_pextd, 2#0_00011_11010# => INSN_popcntb, 2#0_01111_11010# => INSN_popcntd, 2#0_01011_11010# => INSN_popcntw, diff --git 
a/scripts/fmt_log/fmt_log.c b/scripts/fmt_log/fmt_log.c index 226cfbe..aa0573a 100644 --- a/scripts/fmt_log/fmt_log.c +++ b/scripts/fmt_log/fmt_log.c @@ -87,11 +87,11 @@ const char *units[4] = { "al", "ls", "fp", "3?" }; const char *ops[64] = { "illegal", "nop ", "add ", "attn ", "b ", "bc ", "bcreg ", "bcd ", - "bperm ", "brev ", "cmp ", "cmpb ", "cmpeqb ", "cmprb ", "cntz ", "crop ", + "bperm ", "brev ", "cmp ", "cmpb ", "cmpeqb ", "cmprb ", "countb ", "crop ", "darn ", "dcbf ", "dcbst ", "xcbt ", "dcbtst ", "dcbz ", "icbi ", "fpcmp ", "fparith", "fpmove ", "fpmisc ", "div ", "dive ", "mod ", "exts ", "extswsl", "isel ", "isync ", "logic ", "ld ", "st ", "mcrxrx ", "mfcr ", "mfmsr ", - "mfspr ", "mtcrf ", "mtmsr ", "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", "popcnt ", + "mfspr ", "mtcrf ", "mtmsr ", "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", "bsort ", "prty ", "rfid ", "rlc ", "rlcl ", "rlcr ", "sc ", "setb ", "shl ", "shr ", "sync ", "tlbie ", "trap ", "xor ", "addg6s ", "wait ", "ffail ", };