From 06ff486567aa2c7a9602cc55f9e8851491fa6a83 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 7 Sep 2023 22:22:31 +1000 Subject: [PATCH 01/11] icache: Restore primary opcode to instruction word The icache stores a predecoded insn_code value for each instruction, and so as to fit in 36 bits, omits the primary opcode (the most significant 6 bits) of each instruction. Previously, for valid instructions, the primary opcode field of the instruction delivered to decode1 was a part-representation of the insn_code value rather than the actual primary opcode. This adds a lookup table to compute the primary opcode from the insn_code and deliver it in the instruction words supplied to decode1. In order that each insn_code can be associated with a single primary opcode value, the various no-operation instructions with primary opcode 31 (the reserved no-ops and dss, dst and dstst) have been given a new insn_code, INSN_rnop, leaving INSN_nop for the preferred no-op (ori r0,r0,0). Signed-off-by: Paul Mackerras --- decode1.vhdl | 1 + decode_types.vhdl | 323 ++++++++++++++++++++++++++++++++++++++++++++-- icache.vhdl | 1 + predecode.vhdl | 22 ++-- 4 files changed, 325 insertions(+), 22 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index e090d66..4163584 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -310,6 +310,7 @@ architecture behaviour of decode1 is INSN_rlwimi => (ALU, NONE, OP_RLC, RA, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), INSN_rlwinm => (ALU, NONE, OP_RLC, NONE, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), INSN_rlwnm => (ALU, NONE, OP_RLC, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), + INSN_rnop => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_sc => (ALU, NONE, OP_SC, NONE, 
NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_setb => (ALU, NONE, OP_SETB, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_slbia => (LDST, NONE, OP_TLBIE, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), diff --git a/decode_types.vhdl b/decode_types.vhdl index 9e7ef84..cfa4792 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -110,11 +110,12 @@ package decode_types is INSN_rldimi, INSN_rlwimi, INSN_rlwinm, + INSN_rnop, INSN_sc, INSN_setb, INSN_slbia, - INSN_sradi, - INSN_srawi, -- 80 + INSN_sradi, -- 80 + INSN_srawi, INSN_stbu, INSN_std, INSN_stdu, @@ -123,25 +124,27 @@ package decode_types is INSN_subfic, INSN_subfme, INSN_subfze, - INSN_sync, - INSN_tdi, -- 90 + INSN_sync, -- 90 + INSN_tdi, INSN_tlbsync, INSN_twi, INSN_wait, INSN_xori, INSN_xoris, + -- pad to 104 + INSN_061, INSN_062, INSN_063, INSN_064, INSN_065, INSN_066, INSN_067, -- Non-prefixed instructions that have a MLS:D prefixed form and -- their corresponding prefixed instructions. 
-- The non-prefixed versions have even indexes so that we can -- convert them to the prefixed version by setting bit 0 - INSN_addi, -- 96 + INSN_addi, -- 104 INSN_paddi, INSN_lbz, INSN_plbz, - INSN_lha, -- 100 + INSN_lha, INSN_plha, - INSN_lhz, + INSN_lhz, -- 110 INSN_plhz, INSN_lwz, INSN_plwz, @@ -149,11 +152,11 @@ package decode_types is INSN_pstb, INSN_sth, INSN_psth, - INSN_stw, -- 110 + INSN_stw, INSN_pstw, -- Slots for non-prefixed opcodes that are 8LS:D when prefixed - INSN_lhzu, -- 112 + INSN_lhzu, -- 120 INSN_plwa, INSN_op57, INSN_pld, @@ -161,8 +164,7 @@ package decode_types is INSN_pstd, -- pad to 128 to simplify comparison logic - INSN_076, INSN_077, - INSN_078, INSN_079, INSN_07a, INSN_07b, INSN_07c, INSN_07d, INSN_07e, INSN_07f, + INSN_07e, INSN_07f, -- The following instructions have an RB operand but don't access FPRs INSN_add, @@ -475,7 +477,306 @@ package decode_types is update => '0', reserve => '0', is_32bit => '0', is_signed => '0', rc => NONE, lr => '0', sgl_pipe => '0', repeat => NONE); + -- This function maps from insn_code values to primary opcode. + -- With this, we don't have to store the primary opcode of each instruction + -- in the icache if we are storing its insn_code. 
+ function recode_primary_opcode(icode: insn_code) return std_ulogic_vector; + end decode_types; package body decode_types is + + function recode_primary_opcode(icode: insn_code) return std_ulogic_vector is + begin + case icode is + when INSN_addic => return "001100"; + when INSN_addic_dot => return "001101"; + when INSN_addi => return "001110"; + when INSN_addis => return "001111"; + when INSN_addpcis => return "010011"; + when INSN_andi_dot => return "011100"; + when INSN_andis_dot => return "011101"; + when INSN_attn => return "000000"; + when INSN_b => return "010010"; + when INSN_bc => return "010000"; + when INSN_brh => return "011111"; + when INSN_brw => return "011111"; + when INSN_brd => return "011111"; + when INSN_cmpi => return "001011"; + when INSN_cmpli => return "001010"; + when INSN_lbz => return "100010"; + when INSN_lbzu => return "100011"; + when INSN_lfd => return "110010"; + when INSN_lfdu => return "110011"; + when INSN_lfs => return "110000"; + when INSN_lfsu => return "110001"; + when INSN_lha => return "101010"; + when INSN_lhau => return "101011"; + when INSN_lhz => return "101000"; + when INSN_lhzu => return "101001"; + when INSN_lwz => return "100000"; + when INSN_lwzu => return "100001"; + when INSN_mulli => return "000111"; + when INSN_nop => return "011000"; + when INSN_ori => return "011000"; + when INSN_oris => return "011001"; + when INSN_rlwimi => return "010100"; + when INSN_rlwinm => return "010101"; + when INSN_rlwnm => return "010111"; + when INSN_sc => return "010001"; + when INSN_stb => return "100110"; + when INSN_stbu => return "100111"; + when INSN_stfd => return "110110"; + when INSN_stfdu => return "110111"; + when INSN_stfs => return "110100"; + when INSN_stfsu => return "110101"; + when INSN_sth => return "101100"; + when INSN_sthu => return "101101"; + when INSN_stw => return "100100"; + when INSN_stwu => return "100101"; + when INSN_subfic => return "001000"; + when INSN_tdi => return "000010"; + when INSN_twi => 
return "000011"; + when INSN_xori => return "011010"; + when INSN_xoris => return "011011"; + when INSN_maddhd => return "000100"; + when INSN_maddhdu => return "000100"; + when INSN_maddld => return "000100"; + when INSN_rldic => return "011110"; + when INSN_rldicl => return "011110"; + when INSN_rldicr => return "011110"; + when INSN_rldimi => return "011110"; + when INSN_rldcl => return "011110"; + when INSN_rldcr => return "011110"; + when INSN_ld => return "111010"; + when INSN_ldu => return "111010"; + when INSN_lwa => return "111010"; + when INSN_fdivs => return "111011"; + when INSN_fsubs => return "111011"; + when INSN_fadds => return "111011"; + when INSN_fsqrts => return "111011"; + when INSN_fres => return "111011"; + when INSN_fmuls => return "111011"; + when INSN_frsqrtes => return "111011"; + when INSN_fmsubs => return "111011"; + when INSN_fmadds => return "111011"; + when INSN_fnmsubs => return "111011"; + when INSN_fnmadds => return "111011"; + when INSN_std => return "111110"; + when INSN_stdu => return "111110"; + when INSN_fdiv => return "111111"; + when INSN_fsub => return "111111"; + when INSN_fadd => return "111111"; + when INSN_fsqrt => return "111111"; + when INSN_fsel => return "111111"; + when INSN_fre => return "111111"; + when INSN_fmul => return "111111"; + when INSN_frsqrte => return "111111"; + when INSN_fmsub => return "111111"; + when INSN_fmadd => return "111111"; + when INSN_fnmsub => return "111111"; + when INSN_fnmadd => return "111111"; + when INSN_prefix => return "000001"; + when INSN_op57 => return "111001"; + when INSN_op61 => return "111101"; + when INSN_add => return "011111"; + when INSN_addc => return "011111"; + when INSN_adde => return "011111"; + when INSN_addex => return "011111"; + when INSN_addg6s => return "011111"; + when INSN_addme => return "011111"; + when INSN_addze => return "011111"; + when INSN_and => return "011111"; + when INSN_andc => return "011111"; + when INSN_bperm => return "011111"; + when 
INSN_cbcdtd => return "011111"; + when INSN_cdtbcd => return "011111"; + when INSN_cmp => return "011111"; + when INSN_cmpb => return "011111"; + when INSN_cmpeqb => return "011111"; + when INSN_cmpl => return "011111"; + when INSN_cmprb => return "011111"; + when INSN_cntlzd => return "011111"; + when INSN_cntlzw => return "011111"; + when INSN_cnttzd => return "011111"; + when INSN_cnttzw => return "011111"; + when INSN_darn => return "011111"; + when INSN_dcbf => return "011111"; + when INSN_dcbst => return "011111"; + when INSN_dcbt => return "011111"; + when INSN_dcbtst => return "011111"; + when INSN_dcbz => return "011111"; + when INSN_divdeu => return "011111"; + when INSN_divweu => return "011111"; + when INSN_divde => return "011111"; + when INSN_divwe => return "011111"; + when INSN_divdu => return "011111"; + when INSN_divwu => return "011111"; + when INSN_divd => return "011111"; + when INSN_divw => return "011111"; + when INSN_eieio => return "011111"; + when INSN_eqv => return "011111"; + when INSN_extsb => return "011111"; + when INSN_extsh => return "011111"; + when INSN_extsw => return "011111"; + when INSN_extswsli => return "011111"; + when INSN_icbi => return "011111"; + when INSN_icbt => return "011111"; + when INSN_isel => return "011111"; + when INSN_lbarx => return "011111"; + when INSN_lbzcix => return "011111"; + when INSN_lbzux => return "011111"; + when INSN_lbzx => return "011111"; + when INSN_ldarx => return "011111"; + when INSN_ldbrx => return "011111"; + when INSN_ldcix => return "011111"; + when INSN_ldux => return "011111"; + when INSN_ldx => return "011111"; + when INSN_lfdx => return "011111"; + when INSN_lfdux => return "011111"; + when INSN_lfiwax => return "011111"; + when INSN_lfiwzx => return "011111"; + when INSN_lfsx => return "011111"; + when INSN_lfsux => return "011111"; + when INSN_lharx => return "011111"; + when INSN_lhaux => return "011111"; + when INSN_lhax => return "011111"; + when INSN_lhbrx => return 
"011111"; + when INSN_lhzcix => return "011111"; + when INSN_lhzux => return "011111"; + when INSN_lhzx => return "011111"; + when INSN_lwarx => return "011111"; + when INSN_lwaux => return "011111"; + when INSN_lwax => return "011111"; + when INSN_lwbrx => return "011111"; + when INSN_lwzcix => return "011111"; + when INSN_lwzux => return "011111"; + when INSN_lwzx => return "011111"; + when INSN_mcrxrx => return "011111"; + when INSN_mfcr => return "011111"; + when INSN_mfmsr => return "011111"; + when INSN_mfspr => return "011111"; + when INSN_modud => return "011111"; + when INSN_moduw => return "011111"; + when INSN_modsd => return "011111"; + when INSN_modsw => return "011111"; + when INSN_mtcrf => return "011111"; + when INSN_mtmsr => return "011111"; + when INSN_mtmsrd => return "011111"; + when INSN_mtspr => return "011111"; + when INSN_mulhd => return "011111"; + when INSN_mulhdu => return "011111"; + when INSN_mulhw => return "011111"; + when INSN_mulhwu => return "011111"; + when INSN_mulld => return "011111"; + when INSN_mullw => return "011111"; + when INSN_nand => return "011111"; + when INSN_neg => return "011111"; + when INSN_rnop => return "011111"; + when INSN_nor => return "011111"; + when INSN_or => return "011111"; + when INSN_orc => return "011111"; + when INSN_popcntb => return "011111"; + when INSN_popcntd => return "011111"; + when INSN_popcntw => return "011111"; + when INSN_prtyd => return "011111"; + when INSN_prtyw => return "011111"; + when INSN_setb => return "011111"; + when INSN_slbia => return "011111"; + when INSN_sld => return "011111"; + when INSN_slw => return "011111"; + when INSN_srad => return "011111"; + when INSN_sradi => return "011111"; + when INSN_sraw => return "011111"; + when INSN_srawi => return "011111"; + when INSN_srd => return "011111"; + when INSN_srw => return "011111"; + when INSN_stbcix => return "011111"; + when INSN_stbcx => return "011111"; + when INSN_stbux => return "011111"; + when INSN_stbx => return 
"011111"; + when INSN_stdbrx => return "011111"; + when INSN_stdcix => return "011111"; + when INSN_stdcx => return "011111"; + when INSN_stdux => return "011111"; + when INSN_stdx => return "011111"; + when INSN_stfdx => return "011111"; + when INSN_stfdux => return "011111"; + when INSN_stfiwx => return "011111"; + when INSN_stfsx => return "011111"; + when INSN_stfsux => return "011111"; + when INSN_sthbrx => return "011111"; + when INSN_sthcix => return "011111"; + when INSN_sthcx => return "011111"; + when INSN_sthux => return "011111"; + when INSN_sthx => return "011111"; + when INSN_stwbrx => return "011111"; + when INSN_stwcix => return "011111"; + when INSN_stwcx => return "011111"; + when INSN_stwux => return "011111"; + when INSN_stwx => return "011111"; + when INSN_subf => return "011111"; + when INSN_subfc => return "011111"; + when INSN_subfe => return "011111"; + when INSN_subfme => return "011111"; + when INSN_subfze => return "011111"; + when INSN_sync => return "011111"; + when INSN_td => return "011111"; + when INSN_tw => return "011111"; + when INSN_tlbie => return "011111"; + when INSN_tlbiel => return "011111"; + when INSN_tlbsync => return "011111"; + when INSN_wait => return "011111"; + when INSN_xor => return "011111"; + when INSN_bcctr => return "010011"; + when INSN_bclr => return "010011"; + when INSN_bctar => return "010011"; + when INSN_crand => return "010011"; + when INSN_crandc => return "010011"; + when INSN_creqv => return "010011"; + when INSN_crnand => return "010011"; + when INSN_crnor => return "010011"; + when INSN_cror => return "010011"; + when INSN_crorc => return "010011"; + when INSN_crxor => return "010011"; + when INSN_isync => return "010011"; + when INSN_mcrf => return "010011"; + when INSN_rfid => return "010011"; + when INSN_fcfids => return "111011"; + when INSN_fcfidus => return "111011"; + when INSN_fcmpu => return "111111"; + when INSN_fcmpo => return "111111"; + when INSN_mcrfs => return "111111"; + when 
INSN_ftdiv => return "111111"; + when INSN_ftsqrt => return "111111"; + when INSN_mtfsb => return "111111"; + when INSN_mtfsfi => return "111111"; + when INSN_fmrgow => return "111111"; + when INSN_fmrgew => return "111111"; + when INSN_mffs => return "111111"; + when INSN_mtfsf => return "111111"; + when INSN_fcpsgn => return "111111"; + when INSN_fneg => return "111111"; + when INSN_fmr => return "111111"; + when INSN_fnabs => return "111111"; + when INSN_fabs => return "111111"; + when INSN_frin => return "111111"; + when INSN_friz => return "111111"; + when INSN_frip => return "111111"; + when INSN_frim => return "111111"; + when INSN_frsp => return "111111"; + when INSN_fctiw => return "111111"; + when INSN_fctiwu => return "111111"; + when INSN_fctid => return "111111"; + when INSN_fcfid => return "111111"; + when INSN_fctidu => return "111111"; + when INSN_fcfidu => return "111111"; + when INSN_fctiwz => return "111111"; + when INSN_fctiwuz => return "111111"; + when INSN_fctidz => return "111111"; + when INSN_fctiduz => return "111111"; + when others => return "XXXXXX"; + end case; + end; + end decode_types; diff --git a/icache.vhdl b/icache.vhdl index e01eb35..21a7a24 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -621,6 +621,7 @@ begin insn := (others => '0'); elsif insn(ICWORDLEN - 1) = '0' then icode := insn_code'val(to_integer(unsigned(insn(ICWORDLEN-1 downto INSN_IMAGE_BITS)))); + insn(31 downto 26) := recode_primary_opcode(icode); end if; end if; i_out.insn <= insn(31 downto 0); diff --git a/predecode.vhdl b/predecode.vhdl index 27f80e1..0ab7427 100644 --- a/predecode.vhdl +++ b/predecode.vhdl @@ -220,9 +220,9 @@ architecture behaviour of predecoder is 2#0_11111_01001# => INSN_divd, -- divdo 2#0_01111_01011# => INSN_divw, 2#0_11111_01011# => INSN_divw, -- divwo - 2#0_11001_10110# => INSN_nop, -- dss - 2#0_01010_10110# => INSN_nop, -- dst - 2#0_01011_10110# => INSN_nop, -- dstst + 2#0_11001_10110# => INSN_rnop, -- dss + 2#0_01010_10110# => INSN_rnop, 
-- dst + 2#0_01011_10110# => INSN_rnop, -- dstst 2#0_11010_10110# => INSN_eieio, 2#0_01000_11100# => INSN_eqv, 2#0_11101_11010# => INSN_extsb, @@ -322,14 +322,14 @@ architecture behaviour of predecoder is 2#0_00011_01000# => INSN_neg, 2#0_10011_01000# => INSN_neg, -- nego -- next 8 are reserved no-op instructions - 2#0_10000_10010# => INSN_nop, - 2#0_10001_10010# => INSN_nop, - 2#0_10010_10010# => INSN_nop, - 2#0_10011_10010# => INSN_nop, - 2#0_10100_10010# => INSN_nop, - 2#0_10101_10010# => INSN_nop, - 2#0_10110_10010# => INSN_nop, - 2#0_10111_10010# => INSN_nop, + 2#0_10000_10010# => INSN_rnop, + 2#0_10001_10010# => INSN_rnop, + 2#0_10010_10010# => INSN_rnop, + 2#0_10011_10010# => INSN_rnop, + 2#0_10100_10010# => INSN_rnop, + 2#0_10101_10010# => INSN_rnop, + 2#0_10110_10010# => INSN_rnop, + 2#0_10111_10010# => INSN_rnop, 2#0_00011_11100# => INSN_nor, 2#0_01101_11100# => INSN_or, 2#0_01100_11100# => INSN_orc, From 1c4b5def36c77bee61342593386f8b5110d02805 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 26 Jul 2023 16:33:27 +1000 Subject: [PATCH 02/11] Improve timing of redirect_nia going from writeback to fetch1 This gets rid of the adder in writeback that computes redirect_nia. Instead, the main adder in the ALU is used to compute the branch target for relative branches. We now decode b and bc differently depending on the AA field, generating INSN_brel, INSN_babs, INSN_bcrel or INSN_bcabs as appropriate. Each one has a separate entry in the decode table in decode1; the *rel versions use CIA as the A input. The bclr/bcctr/bctar and rfid instructions now select ramspr_result for the main result mux to get the redirect address into ex1.e.write_data. For branches which are predicted taken but not actually taken, we need to redirect to the following instruction. We also need to do that for isync. We do this in the execute2 stage since whether or not to do it depends on the branch result. 
The next_nia computation is moved to the execute2 stage and comes in via a new leg on the secondary result multiplexer, making next_nia available ultimately in ex2.e.write_data. This also means that the next_nia leg of the primary result multiplexer is gone. Incrementing last_nia by 4 for sc (so that SRR0 points to the following instruction) is also moved to execute2. Writing CIA+4 to LR was previously done through the main result multiplexer. Now it comes in explicitly in the ramspr write logic. Overall this removes the br_offset field, the uses of abs_br in execute1 and writeback (the abs_br field itself stays declared in common.vhdl), and the logic to add br_offset and next_nia, and one leg of the primary result multiplexer, at the cost of a few extra control signals between execute1 and execute2 and some multiplexing for the ramspr write side and an extra input on the secondary result multiplexer. Signed-off-by: Paul Mackerras --- common.vhdl | 3 +-- decode1.vhdl | 10 +++++---- decode2.vhdl | 5 ++--- decode_types.vhdl | 46 ++++++++++++++++++++------------------ execute1.vhdl | 56 ++++++++++++++++++++++++++--------------------- predecode.vhdl | 34 ++++++++++++++++++++++++++-- writeback.vhdl | 6 +---- 7 files changed, 98 insertions(+), 62 deletions(-) diff --git a/common.vhdl b/common.vhdl index 59c855e..a46eff5 100644 --- a/common.vhdl +++ b/common.vhdl @@ -658,7 +658,6 @@ package common is redirect: std_ulogic; redir_mode: std_ulogic_vector(3 downto 0); last_nia: std_ulogic_vector(63 downto 0); - br_offset: std_ulogic_vector(63 downto 0); br_last: std_ulogic; br_taken: std_ulogic; abs_br: std_ulogic; @@ -672,7 +671,7 @@ package common is write_data => (others => '0'), write_cr_mask => (others => '0'), write_cr_data => (others => '0'), write_reg => (others => '0'), interrupt => '0', intr_vec => 0, redirect => '0', redir_mode => "0000", - last_nia => (others => '0'), br_offset => (others => '0'), + last_nia => (others => '0'), br_last => '0', br_taken => '0', abs_br => '0', srr1 => (others => '0'), msr => (others => '0')); diff --git a/decode1.vhdl 
b/decode1.vhdl index 4163584..40e8aef 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -94,8 +94,10 @@ architecture behaviour of decode1 is INSN_andi_dot => (ALU, NONE, OP_LOGIC, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), INSN_andis_dot => (ALU, NONE, OP_LOGIC, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), INSN_attn => (ALU, NONE, OP_ATTN, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), - INSN_b => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), - INSN_bc => (ALU, NONE, OP_BC, NONE, CONST_BD, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), + INSN_brel => (ALU, NONE, OP_B, CIA, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), + INSN_babs => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), + INSN_bcrel => (ALU, NONE, OP_BC, CIA, CONST_BD, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), + INSN_bcabs => (ALU, NONE, OP_BC, NONE, CONST_BD, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), INSN_bcctr => (ALU, NONE, OP_BCREG, NONE, NONE, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), INSN_bclr => (ALU, NONE, OP_BCREG, NONE, NONE, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), INSN_bctar => (ALU, NONE, OP_BCREG, NONE, NONE, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), @@ -597,11 +599,11 @@ begin -- count cache or link 
stack. br_offset := (others => '0'); case icode is - when INSN_b => + when INSN_brel | INSN_babs => -- Unconditional branches are always taken v.br_pred := '1'; br_offset := signed(f_in.insn(25 downto 2)); - when INSN_bc => + when INSN_bcrel | INSN_bcabs => -- Predict backward branches as taken, forward as untaken v.br_pred := f_in.insn(15); br_offset := resize(signed(f_in.insn(15 downto 2)), 24); diff --git a/decode2.vhdl b/decode2.vhdl index 1f3e7ff..80dfabd 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -221,9 +221,8 @@ architecture behaviour of decode2 is OP_SHR => "010", OP_EXTSWSLI => "010", OP_MUL_L64 => "011", -- muldiv_result - OP_B => "110", -- next_nia - OP_BC => "110", - OP_BCREG => "110", + OP_BCREG => "101", -- ramspr_result + OP_RFID => "101", OP_ADDG6S => "111", -- misc_result OP_ISEL => "111", OP_DARN => "111", diff --git a/decode_types.vhdl b/decode_types.vhdl index cfa4792..5b21fff 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -47,14 +47,16 @@ package decode_types is INSN_andi_dot, -- 10 INSN_andis_dot, INSN_attn, - INSN_b, - INSN_bc, + INSN_brel, + INSN_babs, + INSN_bcrel, + INSN_bcabs, INSN_bcctr, INSN_bclr, INSN_bctar, - INSN_brh, + INSN_brh, -- 20 INSN_brw, - INSN_brd, -- 20 + INSN_brd, INSN_cbcdtd, INSN_cdtbcd, INSN_cmpi, @@ -62,9 +64,9 @@ package decode_types is INSN_cntlzw, INSN_cntlzd, INSN_cnttzw, - INSN_cnttzd, + INSN_cnttzd, -- 30 INSN_crand, - INSN_crandc, -- 30 + INSN_crandc, INSN_creqv, INSN_crnand, INSN_crnor, @@ -72,9 +74,9 @@ package decode_types is INSN_crorc, INSN_crxor, INSN_darn, - INSN_eieio, + INSN_eieio, -- 40 INSN_extsb, - INSN_extsh, -- 40 + INSN_extsh, INSN_extsw, INSN_extswsli, INSN_isync, @@ -82,9 +84,9 @@ package decode_types is INSN_ld, INSN_ldu, INSN_lhau, - INSN_lwa, + INSN_lwa, -- 50 INSN_lwzu, - INSN_mcrf, -- 50 + INSN_mcrf, INSN_mcrxrx, INSN_mfcr, INSN_mfmsr, @@ -92,9 +94,9 @@ package decode_types is INSN_mtcrf, INSN_mtmsr, INSN_mtmsrd, - INSN_mtspr, + INSN_mtspr, -- 60 INSN_mulli, - INSN_neg, -- 
60 + INSN_neg, INSN_nop, INSN_ori, INSN_oris, @@ -102,9 +104,9 @@ package decode_types is INSN_popcntw, INSN_popcntd, INSN_prtyw, - INSN_prtyd, + INSN_prtyd, -- 70 INSN_rfid, - INSN_rldic, -- 70 + INSN_rldic, INSN_rldicl, INSN_rldicr, INSN_rldimi, @@ -112,9 +114,9 @@ package decode_types is INSN_rlwinm, INSN_rnop, INSN_sc, - INSN_setb, + INSN_setb, -- 80 INSN_slbia, - INSN_sradi, -- 80 + INSN_sradi, INSN_srawi, INSN_stbu, INSN_std, @@ -122,9 +124,9 @@ package decode_types is INSN_sthu, INSN_stwu, INSN_subfic, - INSN_subfme, + INSN_subfme, -- 90 INSN_subfze, - INSN_sync, -- 90 + INSN_sync, INSN_tdi, INSN_tlbsync, INSN_twi, @@ -132,7 +134,7 @@ package decode_types is INSN_xori, INSN_xoris, -- pad to 104 - INSN_061, INSN_062, INSN_063, INSN_064, INSN_065, INSN_066, INSN_067, + INSN_063, INSN_064, INSN_065, INSN_066, INSN_067, -- Non-prefixed instructions that have a MLS:D prefixed form and -- their corresponding prefixed instructions. @@ -497,8 +499,10 @@ package body decode_types is when INSN_andi_dot => return "011100"; when INSN_andis_dot => return "011101"; when INSN_attn => return "000000"; - when INSN_b => return "010010"; - when INSN_bc => return "010000"; + when INSN_brel => return "010010"; + when INSN_babs => return "010010"; + when INSN_bcrel => return "010000"; + when INSN_bcabs => return "010000"; when INSN_brh => return "011111"; when INSN_brw => return "011111"; when INSN_brd => return "011111"; diff --git a/execute1.vhdl b/execute1.vhdl index 7c1ff8f..dacd66c 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -95,6 +95,7 @@ architecture behaviour of execute1 is exception : std_ulogic; trap : std_ulogic; advance_nia : std_ulogic; + redir_to_next : std_ulogic; new_msr : std_ulogic_vector(63 downto 0); take_branch : std_ulogic; direct_branch : std_ulogic; @@ -124,6 +125,9 @@ architecture behaviour of execute1 is res2_sel : std_ulogic_vector(1 downto 0); spr_select : spr_id; pmu_spr_num : std_ulogic_vector(4 downto 0); + redir_to_next : std_ulogic; + 
advance_nia : std_ulogic; + lr_from_next : std_ulogic; mul_in_progress : std_ulogic; mul_finish : std_ulogic; div_in_progress : std_ulogic; @@ -145,6 +149,7 @@ architecture behaviour of execute1 is prev_prefixed => '0', oe => '0', mul_select => "00", res2_sel => "00", spr_select => spr_id_init, pmu_spr_num => 5x"0", + redir_to_next => '0', advance_nia => '0', lr_from_next => '0', mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0', taken_branch_event => '0', br_mispredict => '0', @@ -510,6 +515,7 @@ begin variable wr_addr : ramspr_index; variable even_wr_enab, odd_wr_enab : std_ulogic; variable even_wr_data, odd_wr_data : std_ulogic_vector(63 downto 0); + variable ramspr_even_data : std_ulogic_vector(63 downto 0); variable doit : std_ulogic; begin -- Read address mux and async RAM reading @@ -533,11 +539,16 @@ begin else wr_addr := ex1.ramspr_wraddr; end if; + if ex1.lr_from_next = '1' then + ramspr_even_data := next_nia; + else + ramspr_even_data := ex1.e.write_data; + end if; if interrupt_in.intr = '1' then even_wr_data := ex2.e.last_nia; odd_wr_data := intr_srr1(ctrl.msr, interrupt_in.srr1); else - even_wr_data := ex1.e.write_data; + even_wr_data := ramspr_even_data; odd_wr_data := ex1.ramspr_odd_data; end if; ramspr_wr_addr <= wr_addr; @@ -550,7 +561,7 @@ begin -- We assume no instruction executes in the cycle immediately following -- an interrupt, so we don't need to bypass interrupt data if ex1.se.ramspr_write_even = '1' and e_in.ramspr_even_rdaddr = ex1.ramspr_wraddr then - ramspr_even <= ex1.e.write_data; + ramspr_even <= ramspr_even_data; else ramspr_even <= even_rd_data; end if; @@ -593,7 +604,6 @@ begin shortmul_result when "011", muldiv_result when "100", ramspr_result when "101", - next_nia when "110", misc_result when others; execute1_0: process(clk) @@ -1016,7 +1026,6 @@ begin v.e.mode_32bit := not ex1.msr(MSR_SF); v.e.instr_tag := e_in.instr_tag; v.e.last_nia := 
e_in.nia; - v.e.br_offset := 64x"4"; v.se.ramspr_write_even := e_in.ramspr_write_even; v.se.ramspr_write_odd := e_in.ramspr_write_odd; @@ -1114,8 +1123,6 @@ begin v.direct_branch := '1'; v.e.br_last := '1'; v.e.br_taken := '1'; - v.e.br_offset := b_in; - v.e.abs_br := insn_aa(e_in.insn); if e_in.br_pred = '0' then -- should never happen v.e.redirect := '1'; @@ -1129,14 +1136,13 @@ begin bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); v.take_branch := ppc_bc_taken(bo, bi, cr_in, ramspr_odd); - if v.take_branch = '1' then - v.e.br_offset := b_in; - v.e.abs_br := insn_aa(e_in.insn); - end if; -- Mispredicted branches cause a redirect if v.take_branch /= e_in.br_pred then v.e.redirect := '1'; end if; + if v.take_branch = '0' then + v.redir_to_next := '1'; + end if; v.direct_branch := '1'; v.e.br_last := '1'; v.e.br_taken := v.take_branch; @@ -1150,10 +1156,6 @@ begin bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); v.take_branch := ppc_bc_taken(bo, bi, cr_in, ramspr_odd); - if v.take_branch = '1' then - v.e.br_offset := ramspr_result; - v.e.abs_br := '1'; - end if; -- Indirect branches are never predicted taken v.e.redirect := v.take_branch; v.e.br_taken := v.take_branch; @@ -1177,8 +1179,6 @@ begin v.new_msr(MSR_DR) := '1'; end if; v.se.write_msr := '1'; - v.e.br_offset := ramspr_result; - v.e.abs_br := '1'; v.e.redirect := '1'; v.se.write_cfar := '1'; if HAS_FPU then @@ -1292,6 +1292,7 @@ begin when OP_ISYNC => v.e.redirect := '1'; + v.redir_to_next := '1'; when OP_ICBI => v.se.icache_inval := '1'; @@ -1406,6 +1407,7 @@ begin v.mul_select := e_in.sub_select(1 downto 0); v.se := side_effect_init; v.ramspr_wraddr := e_in.ramspr_wraddr; + v.lr_from_next := e_in.lr; v.ramspr_odd_data := actions.ramspr_odd_data; end if; @@ -1423,9 +1425,6 @@ begin irq_valid := ex1.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in); - -- Next insn adder used in a couple of places - next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4); - -- rotator control signals 
right_shift <= '1' when e_in.insn_type = OP_SHR else '0'; rot_clear_left <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0'; @@ -1507,10 +1506,9 @@ begin x_to_divider.valid <= actions.start_div; v.div_in_progress := actions.start_div; v.br_mispredict := v.e.redirect and actions.direct_branch; + v.advance_nia := actions.advance_nia; + v.redir_to_next := actions.redir_to_next; exception := actions.trap; - if actions.advance_nia = '1' then - v.e.last_nia := next_nia; - end if; -- Go busy while division is happening because the -- divider is not pipelined. Also go busy while a @@ -1681,6 +1679,9 @@ begin variable sign, zero : std_ulogic; variable rcnz_hi, rcnz_lo : std_ulogic; begin + -- Next insn adder used in a couple of places + next_nia <= std_ulogic_vector(unsigned(ex1.e.last_nia) + 4); + v := ex2; if stage2_stall = '0' then v.e := ex1.e; @@ -1688,6 +1689,9 @@ begin v.ext_interrupt := ex1.ext_interrupt; v.taken_branch_event := ex1.taken_branch_event; v.br_mispredict := ex1.br_mispredict; + if ex1.advance_nia = '1' then + v.e.last_nia := next_nia; + end if; end if; if ex1.se.mult_32s = '1' and ex1.oe = '1' then @@ -1748,10 +1752,12 @@ begin else sprres := pmu_to_x.spr_val; end if; - if ex1.res2_sel(1) = '0' then - ex_result := rcresult; - else + if ex1.res2_sel(1) = '1' then ex_result := sprres; + elsif ex1.redir_to_next = '1' then + ex_result := next_nia; + else + ex_result := rcresult; end if; cr_res := ex1.e.write_cr_data; diff --git a/predecode.vhdl b/predecode.vhdl index 0ab7427..d3ca015 100644 --- a/predecode.vhdl +++ b/predecode.vhdl @@ -38,8 +38,38 @@ architecture behaviour of predecoder is 2#011100_00000# to 2#011100_11111# => INSN_andi_dot, 2#011101_00000# to 2#011101_11111# => INSN_andis_dot, 2#000000_00000# => INSN_attn, - 2#010010_00000# to 2#010010_11111# => INSN_b, - 2#010000_00000# to 2#010000_11111# => INSN_bc, + 2#010010_00000# to 2#010010_00001# => INSN_brel, + 2#010010_00010# to 2#010010_00011# => INSN_babs, + 
2#010010_00100# to 2#010010_00101# => INSN_brel, + 2#010010_00110# to 2#010010_00111# => INSN_babs, + 2#010010_01000# to 2#010010_01001# => INSN_brel, + 2#010010_01010# to 2#010010_01011# => INSN_babs, + 2#010010_01100# to 2#010010_01101# => INSN_brel, + 2#010010_01110# to 2#010010_01111# => INSN_babs, + 2#010010_10000# to 2#010010_10001# => INSN_brel, + 2#010010_10010# to 2#010010_10011# => INSN_babs, + 2#010010_10100# to 2#010010_10101# => INSN_brel, + 2#010010_10110# to 2#010010_10111# => INSN_babs, + 2#010010_11000# to 2#010010_11001# => INSN_brel, + 2#010010_11010# to 2#010010_11011# => INSN_babs, + 2#010010_11100# to 2#010010_11101# => INSN_brel, + 2#010010_11110# to 2#010010_11111# => INSN_babs, + 2#010000_00000# to 2#010000_00001# => INSN_bcrel, + 2#010000_00010# to 2#010000_00011# => INSN_bcabs, + 2#010000_00100# to 2#010000_00101# => INSN_bcrel, + 2#010000_00110# to 2#010000_00111# => INSN_bcabs, + 2#010000_01000# to 2#010000_01001# => INSN_bcrel, + 2#010000_01010# to 2#010000_01011# => INSN_bcabs, + 2#010000_01100# to 2#010000_01101# => INSN_bcrel, + 2#010000_01110# to 2#010000_01111# => INSN_bcabs, + 2#010000_10000# to 2#010000_10001# => INSN_bcrel, + 2#010000_10010# to 2#010000_10011# => INSN_bcabs, + 2#010000_10100# to 2#010000_10101# => INSN_bcrel, + 2#010000_10110# to 2#010000_10111# => INSN_bcabs, + 2#010000_11000# to 2#010000_11001# => INSN_bcrel, + 2#010000_11010# to 2#010000_11011# => INSN_bcabs, + 2#010000_11100# to 2#010000_11101# => INSN_bcrel, + 2#010000_11110# to 2#010000_11111# => INSN_bcabs, 2#001011_00000# to 2#001011_11111# => INSN_cmpi, 2#001010_00000# to 2#001010_11111# => INSN_cmpli, 2#100010_00000# to 2#100010_11111# => INSN_lbz, diff --git a/writeback.vhdl b/writeback.vhdl index 7fef5c3..2eb9998 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -174,11 +174,7 @@ begin f.big_endian := '0'; f.mode_32bit := '0'; else - if e_in.abs_br = '1' then - f.redirect_nia := e_in.br_offset; - else - f.redirect_nia := 
std_ulogic_vector(unsigned(e_in.last_nia) + unsigned(e_in.br_offset)); - end if; + f.redirect_nia := e_in.write_data; -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1 f.virt_mode := e_in.redir_mode(3); f.priv_mode := e_in.redir_mode(2); From 2dceb288309f92cf468fc68c76b94d30576a2091 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 27 Jul 2023 14:22:57 +1000 Subject: [PATCH 03/11] Improve timing of redirect_nia going from decode1 to fetch1 This moves the addition that computes the branch target address for statically predicted taken branches before a clock edge, so the redirect_nia signal going to fetch1 comes from a clean latch. The address generation logic is also simplified somewhat, and conditional absolute branches to negative addresses are no longer predicted taken (this should have no impact on performance as such branches are basically never used). Signed-off-by: Paul Mackerras --- decode1.vhdl | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 40e8aef..151977d 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -35,8 +35,7 @@ architecture behaviour of decode1 is signal f, fin : Decode1ToFetch1Type; type br_predictor_t is record - br_nia : std_ulogic_vector(61 downto 0); - br_offset : signed(23 downto 0); + br_target : signed(61 downto 0); predict : std_ulogic; end record; @@ -479,8 +478,6 @@ begin end if; end if; if rst = '1' then - br.br_nia <= (others => '0'); - br.br_offset <= (others => '0'); br.predict <= '0'; else br <= br_in; @@ -502,8 +499,8 @@ begin decode1_1: process(all) variable v : Decode1ToDecode2Type; variable vr : Decode1ToRegisterFileType; - variable br_target : std_ulogic_vector(61 downto 0); - variable br_offset : signed(23 downto 0); + variable br_nia : std_ulogic_vector(61 downto 0); + variable br_offset : std_ulogic_vector(23 downto 0); variable bv : br_predictor_t; variable icode : insn_code; variable sprn : spr_num_t; @@ -597,31 +594,28 @@ 
begin -- Branch predictor -- Note bclr, bcctr and bctar not predicted as we have no -- count cache or link stack. - br_offset := (others => '0'); + br_offset := f_in.insn(25 downto 2); case icode is when INSN_brel | INSN_babs => -- Unconditional branches are always taken v.br_pred := '1'; - br_offset := signed(f_in.insn(25 downto 2)); - when INSN_bcrel | INSN_bcabs => - -- Predict backward branches as taken, forward as untaken + when INSN_bcrel => + -- Predict backward relative branches as taken, others as untaken v.br_pred := f_in.insn(15); - br_offset := resize(signed(f_in.insn(15 downto 2)), 24); + br_offset(23 downto 14) := (others => '1'); when others => end case; - bv.br_nia := f_in.nia(63 downto 2); + br_nia := f_in.nia(63 downto 2); if f_in.insn(1) = '1' then - bv.br_nia := (others => '0'); + br_nia := (others => '0'); end if; - bv.br_offset := br_offset; + bv.br_target := signed(br_nia) + signed(br_offset); if f_in.next_predicted = '1' then v.br_pred := '1'; elsif f_in.next_pred_ntaken = '1' then v.br_pred := '0'; end if; bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out and not f_in.next_predicted; - -- after a clock edge... - br_target := std_ulogic_vector(signed(br.br_nia) + br.br_offset); -- Work out GPR/FPR read addresses -- Note that for prefixed instructions we are working this out based @@ -668,7 +662,7 @@ begin d_out.decode <= decode; r_out <= vr; f_out.redirect <= br.predict; - f_out.redirect_nia <= br_target & "00"; + f_out.redirect_nia <= std_ulogic_vector(br.br_target) & "00"; flush_out <= bv.predict or br.predict; end process; From e92d49375f92e930498e6915a7940d584245dcaa Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 29 Jul 2023 10:22:56 +1000 Subject: [PATCH 04/11] fetch1: Reorganize fetch1 to provide an asynchronous early next NIA to icache This adds a next_nia field to the Fetch1ToIcacheType record, which provides an indication of what will be in the nia field on the next non-stalled cycle. 
This is intended to be as fast as possible, being a selection from two redirect addresses (from writeback and decode1) or an internal register (r_int.next_nia). Reset addresses and predicted branch targets come through this internal register. The rearrangement here has the side effect that we can now use the BTC on the first instruction after a taken branch, whereas previously the BTC was only active starting with the second instruction after a taken branch. This provides a slight improvement in performance. This also fixes a buglet in icache where it would assert its stall output when i_in.req was false. Signed-off-by: Paul Mackerras --- common.vhdl | 1 + fetch1.vhdl | 127 +++++++++++++++++++++++++++++----------------------- icache.vhdl | 2 +- 3 files changed, 73 insertions(+), 57 deletions(-) diff --git a/common.vhdl b/common.vhdl index a46eff5..776d74c 100644 --- a/common.vhdl +++ b/common.vhdl @@ -238,6 +238,7 @@ package common is predicted : std_ulogic; pred_ntaken : std_ulogic; nia: std_ulogic_vector(63 downto 0); + next_nia: std_ulogic_vector(63 downto 0); end record; type IcacheToDecode1Type is record diff --git a/fetch1.vhdl b/fetch1.vhdl index 6803fb6..4980b05 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -40,8 +40,7 @@ architecture behaviour of fetch1 is type reg_internal_t is record mode_32bit: std_ulogic; rd_is_niap4: std_ulogic; - predicted_taken: std_ulogic; - predicted_nia: std_ulogic_vector(63 downto 0); + next_nia: std_ulogic_vector(63 downto 0); end record; signal r, r_next : Fetch1ToIcacheType; signal r_int, r_next_int : reg_internal_t; @@ -55,6 +54,7 @@ architecture behaviour of fetch1 is constant BTC_WIDTH : integer := BTC_TAG_BITS + BTC_TARGET_BITS + 2; type btc_mem_type is array (0 to BTC_SIZE - 1) of std_ulogic_vector(BTC_WIDTH - 1 downto 0); + signal btc_rd_addr : unsigned(BTC_ADDR_BITS - 1 downto 0); signal btc_rd_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0) := (others => '0'); signal btc_rd_valid : std_ulogic := '0'; @@ -64,7 +64,7 
@@ begin begin if rising_edge(clk) then log_nia <= r.nia(63) & r.nia(43 downto 2); - if r /= r_next then + if r /= r_next and advance_nia = '1' then report "fetch1 rst:" & std_ulogic'image(rst) & " IR:" & std_ulogic'image(r_next.virt_mode) & " P:" & std_ulogic'image(r_next.priv_mode) & @@ -73,25 +73,16 @@ begin " R:" & std_ulogic'image(w_in.redirect) & std_ulogic'image(d_in.redirect) & " S:" & std_ulogic'image(stall_in) & " T:" & std_ulogic'image(stop_in) & - " nia:" & to_hstring(r_next.nia); + " nia:" & to_hstring(r_next.nia) & + " req:" & std_ulogic'image(r_next.req); end if; - if rst = '1' or w_in.redirect = '1' or d_in.redirect = '1' or stall_in = '0' then - r.virt_mode <= r_next.virt_mode; - r.priv_mode <= r_next.priv_mode; - r.big_endian <= r_next.big_endian; - r_int.mode_32bit <= r_next_int.mode_32bit; - end if; if advance_nia = '1' then - r.predicted <= r_next.predicted; - r.pred_ntaken <= r_next.pred_ntaken; - r.nia <= r_next.nia; - r_int.predicted_taken <= r_next_int.predicted_taken; - r_int.predicted_nia <= r_next_int.predicted_nia; - r_int.rd_is_niap4 <= r_next_int.rd_is_niap4; + r <= r_next; + r_int <= r_next_int; end if; -- always send the up-to-date stop mark and req r.stop_mark <= stop_in; - r.req <= not rst and not stop_in; + r.req <= r_next.req; end if; end process; log_out <= log_nia; @@ -119,15 +110,13 @@ begin variable raddr : unsigned(BTC_ADDR_BITS - 1 downto 0); begin if rising_edge(clk) then - raddr := unsigned(r.nia(BTC_ADDR_BITS + 1 downto 2)) + - to_unsigned(2, BTC_ADDR_BITS); if advance_nia = '1' then - if is_X(raddr) then + if is_X(btc_rd_addr) then btc_rd_data <= (others => 'X'); btc_rd_valid <= 'X'; else - btc_rd_data <= btc_memory(to_integer(raddr)); - btc_rd_valid <= btc_valids(to_integer(raddr)); + btc_rd_data <= btc_memory(to_integer(btc_rd_addr)); + btc_rd_valid <= btc_valids(to_integer(btc_rd_addr)); end if; end if; if btc_wr = '1' then @@ -147,67 +136,93 @@ begin comb : process(all) variable v : Fetch1ToIcacheType; variable 
v_int : reg_internal_t; + variable next_nia : std_ulogic_vector(63 downto 0); + variable m32 : std_ulogic; begin v := r; v_int := r_int; v.predicted := '0'; v.pred_ntaken := '0'; - v_int.predicted_taken := '0'; - v_int.rd_is_niap4 := '0'; + v.req := not (rst or stop_in); + -- reduce metavalue warnings in sim + if is_X(rst) then + v.req := '0'; + end if; + + -- Combinatorial computation of the CIA for the next cycle. + -- Needs to be simple so the result can be used for RAM + -- and TLB access in the icache. + -- If we are stalled, this still advances, and the assumption + -- is that it will not be used. + m32 := r_int.mode_32bit; + if w_in.redirect = '1' then + next_nia := w_in.redirect_nia(63 downto 2) & "00"; + m32 := w_in.mode_32bit; + v.virt_mode := w_in.virt_mode; + v.priv_mode := w_in.priv_mode; + v.big_endian := w_in.big_endian; + v_int.mode_32bit := w_in.mode_32bit; + elsif d_in.redirect = '1' then + next_nia := d_in.redirect_nia(63 downto 2) & "00"; + else + next_nia := r_int.next_nia; + end if; + if m32 = '1' then + next_nia(63 downto 32) := (others => '0'); + end if; + v.nia := next_nia; + + v_int.next_nia := std_ulogic_vector(unsigned(next_nia) + 4); + + -- Use v_int.next_nia as the BTC read address before it gets possibly + -- overridden with the reset address or the predicted branch target + -- address, in order to improve timing. If it gets overridden then + -- rd_is_niap4 gets cleared to indicate that the BTC data doesn't apply. 
+ btc_rd_addr <= unsigned(v_int.next_nia(BTC_ADDR_BITS + 1 downto 2)); + v_int.rd_is_niap4 := '1'; - if rst = '1' then + if rst /= '0' then if alt_reset_in = '1' then - v.nia := ALT_RESET_ADDRESS; + v_int.next_nia := ALT_RESET_ADDRESS; else - v.nia := RESET_ADDRESS; + v_int.next_nia := RESET_ADDRESS; end if; v.virt_mode := '0'; v.priv_mode := '1'; v.big_endian := '0'; v_int.mode_32bit := '0'; - v_int.predicted_nia := (others => '0'); - elsif w_in.redirect = '1' then - v.nia := w_in.redirect_nia(63 downto 2) & "00"; - if w_in.mode_32bit = '1' then - v.nia(63 downto 32) := (others => '0'); - end if; - v.virt_mode := w_in.virt_mode; - v.priv_mode := w_in.priv_mode; - v.big_endian := w_in.big_endian; - v_int.mode_32bit := w_in.mode_32bit; - elsif d_in.redirect = '1' then - v.nia := d_in.redirect_nia(63 downto 2) & "00"; - if r_int.mode_32bit = '1' then - v.nia(63 downto 32) := (others => '0'); - end if; - elsif r_int.predicted_taken = '1' then - v.nia := r_int.predicted_nia; - elsif r.req = '1' then - v_int.rd_is_niap4 := '1'; - v.nia := std_ulogic_vector(unsigned(r.nia) + 4); - if r_int.mode_32bit = '1' then - v.nia(63 downto 32) := x"00000000"; - end if; - if btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and + v_int.rd_is_niap4 := '0'; + end if; + + -- If there is a valid entry in the BTC which corresponds to the next instruction, + -- use that to predict the address of the instruction after that. 
+ if rst = '0' and w_in.redirect = '0' and d_in.redirect = '0' and + btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and btc_rd_data(BTC_WIDTH - 2) = r.virt_mode and btc_rd_data(BTC_WIDTH - 3 downto BTC_TARGET_BITS) - = v.nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then - v_int.predicted_taken := btc_rd_data(BTC_WIDTH - 1); - v.predicted := btc_rd_data(BTC_WIDTH - 1); - v.pred_ntaken := not btc_rd_data(BTC_WIDTH - 1); + = r_int.next_nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then + v.predicted := btc_rd_data(BTC_WIDTH - 1); + v.pred_ntaken := not btc_rd_data(BTC_WIDTH - 1); + if btc_rd_data(BTC_WIDTH - 1) = '1' then + v_int.next_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00"; + v_int.rd_is_niap4 := '0'; end if; end if; - v_int.predicted_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00"; -- If the last NIA value went down with a stop mark, it didn't get -- executed, and hence we shouldn't increment NIA. advance_nia <= rst or w_in.redirect or d_in.redirect or (not r.stop_mark and not stall_in); + -- reduce metavalue warnings in sim + if is_X(rst) then + advance_nia <= '1'; + end if; r_next <= v; r_next_int <= v_int; -- Update outputs to the icache i_out <= r; + i_out.next_nia <= next_nia; end process; diff --git a/icache.vhdl b/icache.vhdl index 21a7a24..89204a6 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -636,7 +636,7 @@ begin i_out.next_pred_ntaken <= r.pred_ntaken; -- Stall fetch1 if we have a miss on cache or TLB or a protection fault - stall_out <= not (is_hit and access_ok); + stall_out <= i_in.req and not (is_hit and access_ok); -- Wishbone requests output (from the cache miss reload machine) wishbone_out <= r.wb; From f34a54d295c12cbc4600cf08057844b8882dfd5b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 29 Jul 2023 19:54:44 +1000 Subject: [PATCH 05/11] fetch1: Streamline next NIA generation further This reduces the number of possible sources for the next NIA from 4 down to 3, by routing 
interrupt vector addresses through the r_int.next_nia register, as is already done for reset. This adds one extra cycle of latency when taking interrupts. During this extra cycle, i_out.req is 0. Writeback now no longer combines redirects (branches, rfid, isync) with interrupts; they are presented separately to fetch1. Signed-off-by: Paul Mackerras --- common.vhdl | 5 ++++- fetch1.vhdl | 16 +++++++++++----- writeback.vhdl | 29 ++++++++++------------------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/common.vhdl b/common.vhdl index 776d74c..041a5f2 100644 --- a/common.vhdl +++ b/common.vhdl @@ -758,11 +758,14 @@ package common is br_nia : std_ulogic_vector(63 downto 0); br_last : std_ulogic; br_taken : std_ulogic; + interrupt : std_ulogic; + intr_vec : std_ulogic_vector(11 downto 0); end record; constant WritebackToFetch1Init : WritebackToFetch1Type := (redirect => '0', virt_mode => '0', priv_mode => '0', big_endian => '0', mode_32bit => '0', redirect_nia => (others => '0'), - br_last => '0', br_taken => '0', br_nia => (others => '0')); + br_last => '0', br_taken => '0', br_nia => (others => '0'), + interrupt => '0', intr_vec => x"000"); type WritebackToRegisterFileType is record write_reg : gspr_index_t; diff --git a/fetch1.vhdl b/fetch1.vhdl index 4980b05..b6c2205 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -70,6 +70,7 @@ begin " P:" & std_ulogic'image(r_next.priv_mode) & " E:" & std_ulogic'image(r_next.big_endian) & " 32:" & std_ulogic'image(r_next_int.mode_32bit) & + " I:" & std_ulogic'image(w_in.interrupt) & " R:" & std_ulogic'image(w_in.redirect) & std_ulogic'image(d_in.redirect) & " S:" & std_ulogic'image(stall_in) & " T:" & std_ulogic'image(stop_in) & @@ -143,7 +144,7 @@ begin v_int := r_int; v.predicted := '0'; v.pred_ntaken := '0'; - v.req := not (rst or stop_in); + v.req := not (rst or w_in.interrupt or stop_in); -- reduce metavalue warnings in sim if is_X(rst) then v.req := '0'; @@ -175,8 +176,8 @@ begin v_int.next_nia := 
std_ulogic_vector(unsigned(next_nia) + 4); -- Use v_int.next_nia as the BTC read address before it gets possibly - -- overridden with the reset address or the predicted branch target - -- address, in order to improve timing. If it gets overridden then + -- overridden with the reset or interrupt address or the predicted branch + -- target address, in order to improve timing. If it gets overridden then -- rd_is_niap4 gets cleared to indicate that the BTC data doesn't apply. btc_rd_addr <= unsigned(v_int.next_nia(BTC_ADDR_BITS + 1 downto 2)); v_int.rd_is_niap4 := '1'; @@ -187,6 +188,10 @@ begin else v_int.next_nia := RESET_ADDRESS; end if; + elsif w_in.interrupt = '1' then + v_int.next_nia := 52x"0" & w_in.intr_vec(11 downto 2) & "00"; + end if; + if rst /= '0' or w_in.interrupt = '1' then v.virt_mode := '0'; v.priv_mode := '1'; v.big_endian := '0'; @@ -196,7 +201,7 @@ begin -- If there is a valid entry in the BTC which corresponds to the next instruction, -- use that to predict the address of the instruction after that. - if rst = '0' and w_in.redirect = '0' and d_in.redirect = '0' and + if rst = '0' and w_in.interrupt = '0' and w_in.redirect = '0' and d_in.redirect = '0' and btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and btc_rd_data(BTC_WIDTH - 2) = r.virt_mode and btc_rd_data(BTC_WIDTH - 3 downto BTC_TARGET_BITS) @@ -211,7 +216,8 @@ begin -- If the last NIA value went down with a stop mark, it didn't get -- executed, and hence we shouldn't increment NIA. 
- advance_nia <= rst or w_in.redirect or d_in.redirect or (not r.stop_mark and not stall_in); + advance_nia <= rst or w_in.interrupt or w_in.redirect or d_in.redirect or + (not r.stop_mark and not stall_in); -- reduce metavalue warnings in sim if is_X(rst) then advance_nia <= '1'; diff --git a/writeback.vhdl b/writeback.vhdl index 2eb9998..6a86fb7 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -160,30 +160,21 @@ begin end if; -- Outputs to fetch1 + f.interrupt := intr; + f.intr_vec := std_ulogic_vector(to_unsigned(vec, 12)); f.redirect := e_in.redirect; + f.redirect_nia := e_in.write_data; f.br_nia := e_in.last_nia; - f.br_last := e_in.br_last; + f.br_last := e_in.br_last and not intr; f.br_taken := e_in.br_taken; - if intr = '1' then - f.redirect := '1'; - f.br_last := '0'; - f.redirect_nia := std_ulogic_vector(to_unsigned(vec, 64)); - f.virt_mode := '0'; - f.priv_mode := '1'; - -- XXX need an interrupt LE bit here, e.g. from LPCR - f.big_endian := '0'; - f.mode_32bit := '0'; - else - f.redirect_nia := e_in.write_data; - -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1 - f.virt_mode := e_in.redir_mode(3); - f.priv_mode := e_in.redir_mode(2); - f.big_endian := e_in.redir_mode(1); - f.mode_32bit := e_in.redir_mode(0); - end if; + -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1 + f.virt_mode := e_in.redir_mode(3); + f.priv_mode := e_in.redir_mode(2); + f.big_endian := e_in.redir_mode(1); + f.mode_32bit := e_in.redir_mode(0); f_out <= f; - flush_out <= f_out.redirect; + flush_out <= f_out.redirect or intr; -- Register write data bypass to decode2 wb_bypass.tag.tag <= complete_out.tag; From 723008b8c21c662878346d5598c9624218dbcdef Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 31 Jul 2023 10:00:05 +1000 Subject: [PATCH 06/11] icache: Read iTLB using early next NIA from fetch1 Using i_in.next_nia means that we can read the iTLB RAM arrays synchronously rather than asynchronously, which gives more opportunity for using block RAMs 
in FPGA implementations. The reading is gated by the stall signals because the next_nia can advance when stalled, but we need the iTLB entry for the instruction that i_in.nia points to. If we are stalled because of an iTLB miss, that means we don't see the new iTLB entry when it is written. Instead we save the new entry directly when it arrives and use it instead of the values read from the iTLB RAM. Signed-off-by: Paul Mackerras --- icache.vhdl | 80 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 58 insertions(+), 22 deletions(-) diff --git a/icache.vhdl b/icache.vhdl index 89204a6..15ed7bd 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -176,8 +176,16 @@ architecture rtl of icache is signal itlb_valids : tlb_valids_t; signal itlb_tags : tlb_tags_t; signal itlb_ptes : tlb_ptes_t; - attribute ram_style of itlb_tags : signal is "distributed"; - attribute ram_style of itlb_ptes : signal is "distributed"; + + -- Values read from above arrays on a clock edge + signal itlb_valid : std_ulogic; + signal itlb_ttag : tlb_tag_t; + signal itlb_pte : tlb_pte_t; + + -- Values captured from a write to a TLB + signal itlb_bypass_valid : std_ulogic; + signal itlb_bypass_ra : std_ulogic_vector(REAL_ADDR_BITS - TLB_LG_PGSZ - 1 downto 0); + signal itlb_bypass_priv : std_ulogic; -- Privilege bit from PTE EAA field signal eaa_priv : std_ulogic; @@ -491,33 +499,61 @@ begin end process; end generate; + -- Read TLB using the NIA for the next cycle + itlb_read : process(clk) + variable tlb_req_index : std_ulogic_vector(TLB_BITS - 1 downto 0); + begin + if rising_edge(clk) then + if flush_in = '1' or i_in.req = '0' or (stall_in = '0' and stall_out = '0') then + tlb_req_index := hash_ea(i_in.next_nia); + if is_X(tlb_req_index) then + itlb_pte <= (others => 'X'); + itlb_ttag <= (others => 'X'); + itlb_valid <= 'X'; + else + itlb_pte <= itlb_ptes(to_integer(unsigned(tlb_req_index))); + itlb_ttag <= itlb_tags(to_integer(unsigned(tlb_req_index))); + itlb_valid <= 
itlb_valids(to_integer(unsigned(tlb_req_index))); + end if; + end if; + end if; + end process; + + -- Store TLB data being written for use in servicing the current request + itlb_bypass: process(clk) + begin + if rising_edge(clk) then + if rst = '1' then + itlb_bypass_valid <= '0'; + itlb_bypass_ra <= (others => '0'); + itlb_bypass_priv <= '0'; + elsif flush_in = '1' or i_in.req = '0' or stall_out = '0' then + itlb_bypass_valid <= '0'; + elsif m_in.tlbld = '1' then + assert i_in.nia(63 downto TLB_LG_PGSZ) = m_in.addr(63 downto TLB_LG_PGSZ); + itlb_bypass_valid <= '1'; + itlb_bypass_ra <= m_in.pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ); + itlb_bypass_priv <= m_in.pte(3); + end if; + end if; + end process; + -- TLB hit detection and real address generation itlb_lookup : process(all) - variable pte : tlb_pte_t; - variable ttag : tlb_tag_t; - variable tlb_req_index : std_ulogic_vector(TLB_BITS - 1 downto 0); begin - tlb_req_index := hash_ea(i_in.nia); - if is_X(tlb_req_index) then - pte := (others => 'X'); - ttag := (others => 'X'); - else - pte := itlb_ptes(to_integer(unsigned(tlb_req_index))); - ttag := itlb_tags(to_integer(unsigned(tlb_req_index))); - end if; - if i_in.virt_mode = '1' then - real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & + if itlb_bypass_valid = '1' then + real_addr <= itlb_bypass_ra & i_in.nia(TLB_LG_PGSZ - 1 downto 0); + ra_valid <= '1'; + eaa_priv <= itlb_bypass_priv; + elsif i_in.virt_mode = '1' then + real_addr <= itlb_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & i_in.nia(TLB_LG_PGSZ - 1 downto 0); - if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then - if is_X(tlb_req_index) then - ra_valid <= 'X'; - else - ra_valid <= itlb_valids(to_integer(unsigned(tlb_req_index))); - end if; + if itlb_ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then + ra_valid <= itlb_valid; else ra_valid <= '0'; end if; - eaa_priv <= pte(3); + eaa_priv <= itlb_pte(3); else real_addr <= addr_to_real(i_in.nia); ra_valid <= '1'; From 
963c22595571249131012acbe5a488f6b414552a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 1 Aug 2023 17:47:22 +1000 Subject: [PATCH 07/11] icache: Read icache tag RAM synchronously This uses the next_nia provided to us by fetch1 to enable the icache tag RAM to be read synchronously (using a clock edge), which should enable block RAMs to be used on FPGAs rather than LUT RAM or flip-flops. We define a separate RAM per way to avoid any problems with the tools trying to inference byte write enables for writing to a single way. Since next_nia can move on, we only get one shot at reading the cache tag RAM entry for the current access. If it is a miss, then the state machine will read the cache line from RAM, and we can consider the access to be a hit once the state machine has brought in the doubleword we need. The TLB hit/miss check has been modified to check r.store_tag rather than the tag read from the tag RAM for this case. However, it is also possible that stall_in will be asserted for the whole time until the cache line refill is completed. To handle this case, we remember (in r.stalled_hit) that we detected a hit while stalled, and use that hit once stall_in is deasserted. This avoids doing an unnecessary second reload of the same cache line. The r.stalled_hit flag gets cleared in CLR_TAG state since that is when cache tags can be overwritten, meaning that a previously detected hit might no longer be valid. There is also the case where the tag read from the tag RAM is the one we are looking for, and is the same index as the line that is starting to be reloaded by the state machine. If the icache gets stalled for long enough that the line reload finishes, it would then be possible for the access to be detected as a hit even though the cache line has been overwritten.
To counter this, we detect the case where the cache tag RAM entry being read is the same as the entry being written and set a 'tag_overwrite' flag bit to indicate that one of the tags in cache_tags_set is no longer valid. For snooping writes to memory, we have a second read port on the cache tag RAM. These tags are also read synchronously, so the logic for clearing cache line valid bits on a snoop has been adjusted (the tag comparisons and valid bit clearing now happen in the same cycle). This also simplifies the expression for 'insn' by removing a dependency on r.hit_valid, fixes the instruction value sent to the log, and deasserts stall_out when flush_in is true. Signed-off-by: Paul Mackerras --- icache.vhdl | 189 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 114 insertions(+), 75 deletions(-) diff --git a/icache.vhdl b/icache.vhdl index 15ed7bd..4bd4491 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -139,28 +139,24 @@ architecture rtl of icache is -- The cache data BRAM organized as described above for each way subtype cache_row_t is std_ulogic_vector(ROW_WIDTH-1 downto 0); - -- The cache tags LUTRAM has a row per set. Vivado is a pain and will - -- not handle a clean (commented) definition of the cache tags as a 3d - -- memory. 
For now, work around it by putting all the tags + -- We define a cache tag RAM per way, accessed synchronously subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0); --- type cache_tags_set_t is array(way_t) of cache_tag_t; --- type cache_tags_array_t is array(index_t) of cache_tags_set_t; - constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS; - subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0); - type cache_tags_array_t is array(index_t) of cache_tags_set_t; + type cache_tags_set_t is array(way_t) of cache_tag_t; + type cache_tags_array_t is array(index_t) of cache_tag_t; + + -- Set of cache tags read on the last clock edge + signal cache_tags_set : cache_tags_set_t; + -- Set of cache tags for snooping writes to memory + signal snoop_tags_set : cache_tags_set_t; + -- Flags indicating write-hit-read on the cache tags + signal tag_overwrite : std_ulogic_vector(NUM_WAYS - 1 downto 0); -- The cache valid bits subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0); type cache_valids_t is array(index_t) of cache_way_valids_t; type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic; - - -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs - signal cache_tags : cache_tags_array_t; signal cache_valids : cache_valids_t; - attribute ram_style : string; - attribute ram_style of cache_tags : signal is "distributed"; - -- L1 ITLB. 
constant TLB_BITS : natural := log2(TLB_SIZE); constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS); @@ -216,6 +212,9 @@ architecture rtl of icache is end_row_ix : row_in_line_t; rows_valid : row_per_line_valid_t; + stalled_hit : std_ulogic; -- remembers hit while stalled + stalled_way : way_sig_t; + -- TLB miss state fetch_failed : std_ulogic; end record; @@ -248,9 +247,11 @@ architecture rtl of icache is signal plru_victim : way_sig_t; -- Memory write snoop signals - signal snoop_valid : std_ulogic; - signal snoop_index : index_sig_t; - signal snoop_hits : cache_way_valids_t; + signal snoop_valid : std_ulogic; + signal snoop_index : index_sig_t; + signal snoop_tag : cache_tag_t; + signal snoop_index2 : index_sig_t; + signal snoop_hits : cache_way_valids_t; signal log_insn : std_ulogic_vector(35 downto 0); @@ -329,19 +330,6 @@ architecture rtl of icache is return endian & addr(addr'left downto SET_SIZE_BITS); end; - -- Read a tag from a tag memory row - function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is - begin - return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS); - end; - - -- Write a tag to tag memory row - procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t; - tag: cache_tag_t) is - begin - tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; - end; - -- Simple hash for direct-mapped TLB index function hash_ea(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0); @@ -423,7 +411,9 @@ begin signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0); signal dout : cache_row_t; signal wr_sel : std_ulogic_vector(0 downto 0); + signal ic_tags : cache_tags_array_t; begin + -- Cache data RAMs, one per way way: entity work.cache_ram generic map ( ROW_BITS => ROW_BITS, @@ -451,6 +441,47 @@ begin wr_addr <= std_ulogic_vector(r.store_row); wr_sel(0) <= do_write; end process; + + -- Cache tag RAMs, one per way, are read and written 
synchronously. + -- They are instantiated like this instead of trying to describe them as + -- a single array in order to avoid problems with writing a single way. + process(clk) + variable replace_way : way_sig_t; + variable snoop_addr : real_addr_t; + begin + replace_way := to_unsigned(0, WAY_BITS); + if NUM_WAYS > 1 then + -- Get victim way from plru + replace_way := plru_victim; + end if; + if rising_edge(clk) then + -- Read tags using NIA for next cycle + if flush_in = '1' or i_in.req = '0' or (stall_in = '0' and stall_out = '0') then + cache_tags_set(i) <= ic_tags(to_integer(get_index(i_in.next_nia))); + -- Check for simultaneous write to the same location + tag_overwrite(i) <= '0'; + if r.state = CLR_TAG and r.store_index = get_index(i_in.next_nia) and + to_unsigned(i, WAY_BITS) = replace_way then + tag_overwrite(i) <= '1'; + end if; + end if; + + -- Second read port for snooping writes to memory + if (wb_snoop_in.cyc and wb_snoop_in.stb and wb_snoop_in.we) = '1' then + snoop_addr := addr_to_real(wb_to_addr(wb_snoop_in.adr)); + snoop_tags_set(i) <= ic_tags(to_integer(get_index(snoop_addr))); + end if; + + -- Write one tag when in CLR_TAG state + if r.state = CLR_TAG and to_unsigned(i, WAY_BITS) = replace_way then + ic_tags(to_integer(r.store_index)) <= r.store_tag; + end if; + + if rst = '1' then + tag_overwrite(i) <= '0'; + end if; + end if; + end process; end generate; -- Generate PLRUs @@ -616,17 +647,24 @@ begin end if; for i in way_t loop if i_in.req = '1' and - (cache_valids(to_integer(req_index))(i) = '1' or - (r.state = WAIT_ACK and - req_index = r.store_index and - to_unsigned(i, WAY_BITS) = r.store_way and - r.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) = '1')) then - if read_tag(i, cache_tags(to_integer(req_index))) = req_tag then - hit_way := to_unsigned(i, WAY_BITS); - is_hit := '1'; - end if; + cache_valids(to_integer(req_index))(i) = '1' and + tag_overwrite(i) = '0' and + cache_tags_set(i) = req_tag then + hit_way := 
to_unsigned(i, WAY_BITS); + is_hit := '1'; end if; end loop; + if r.state = WAIT_ACK and r.store_valid = '1' and + req_index = r.store_index and + req_tag = r.store_tag and + r.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) = '1' then + is_hit := '1'; + hit_way := r.store_way; + end if; + if r.stalled_hit = '1' then + is_hit := '1'; + hit_way := r.stalled_way; + end if; -- Generate the "hit" and "miss" signals for the synchronous blocks if i_in.req = '1' and access_ok = '1' and flush_in = '0' and rst = '0' then @@ -646,20 +684,22 @@ begin -- I prefer not to do just yet as it would force fetch2 to know about -- some of the cache geometry information. -- - insn := (others => '0'); icode := INSN_illegal; - if r.hit_valid = '1' then - assert not is_X(r.hit_way) severity failure; + if is_X(r.hit_way) then + insn := (others => 'X'); + else insn := read_insn_word(r.hit_nia, cache_out(to_integer(r.hit_way))); - -- Currently we use only the top bit for indicating illegal - -- instructions because we know that insn_codes fit into 9 bits. - if is_X(insn) then - insn := (others => '0'); - elsif insn(ICWORDLEN - 1) = '0' then - icode := insn_code'val(to_integer(unsigned(insn(ICWORDLEN-1 downto INSN_IMAGE_BITS)))); - insn(31 downto 26) := recode_primary_opcode(icode); - end if; - end if; + end if; + assert not (r.hit_valid = '1' and is_X(r.hit_way)) severity failure; + -- Currently we use only the top bit for indicating illegal + -- instructions because we know that insn_codes fit into 9 bits. 
+ if is_X(insn) then + insn := (others => '0'); + elsif insn(ICWORDLEN - 1) = '0' then + icode := insn_code'val(to_integer(unsigned(insn(ICWORDLEN-1 downto INSN_IMAGE_BITS)))); + insn(31 downto 26) := recode_primary_opcode(icode); + end if; + i_out.insn <= insn(31 downto 0); i_out.icode <= icode; log_insn <= insn; @@ -672,7 +712,7 @@ begin i_out.next_pred_ntaken <= r.pred_ntaken; -- Stall fetch1 if we have a miss on cache or TLB or a protection fault - stall_out <= i_in.req and not (is_hit and access_ok); + stall_out <= i_in.req and not (is_hit and access_ok) and not flush_in; -- Wishbone requests output (from the cache miss reload machine) wishbone_out <= r.wb; @@ -684,9 +724,17 @@ begin if rising_edge(clk) then -- keep outputs to fetch2 unchanged on a stall -- except that flush or reset sets valid to 0 - if stall_in = '1' then - if rst = '1' or flush_in = '1' then - r.hit_valid <= '0'; + if rst = '1' or flush_in = '1' then + r.hit_valid <= '0'; + r.stalled_hit <= '0'; + r.stalled_way <= to_unsigned(0, WAY_BITS); + elsif stall_in = '1' then + if r.state = CLR_TAG then + r.stalled_hit <= '0'; + elsif req_is_hit = '1' then + -- if we have a hit while stalled, remember it + r.stalled_hit <= '1'; + r.stalled_way <= req_hit_way; end if; else -- On a hit, latch the request for the next cycle, when the BRAM data @@ -706,6 +754,7 @@ begin " way:" & to_hstring(req_hit_way) & " RA:" & to_hstring(real_addr); end if; + r.stalled_hit <= '0'; end if; if stall_in = '0' then -- Send stop marks and NIA down regardless of validity @@ -726,7 +775,6 @@ begin variable tagset : cache_tags_set_t; variable tag : cache_tag_t; variable snoop_addr : real_addr_t; - variable snoop_tag : cache_tag_t; variable snoop_cache_tags : cache_tags_set_t; variable replace_way : way_sig_t; begin @@ -759,15 +807,14 @@ begin snoop_valid <= wb_snoop_in.cyc and wb_snoop_in.stb and wb_snoop_in.we; snoop_addr := addr_to_real(wb_to_addr(wb_snoop_in.adr)); snoop_index <= get_index(snoop_addr); - snoop_tag := 
get_tag(snoop_addr, '0'); + snoop_tag <= get_tag(snoop_addr, '0'); snoop_hits <= (others => '0'); + + -- On the next cycle, match up tags with the snooped address + -- to see if any ways need to be invalidated if snoop_valid = '1' then - if is_X(snoop_addr) then - report "metavalue in snoop_addr" severity FAILURE; - end if; - snoop_cache_tags := cache_tags(to_integer(get_index(snoop_addr))); for i in way_t loop - tag := read_tag(i, snoop_cache_tags); + tag := snoop_tags_set(i); -- Ignore endian bit in comparison tag(TAG_BITS - 1) := '0'; if tag = snoop_tag then @@ -775,6 +822,7 @@ begin end if; end loop; end if; + snoop_index2 <= snoop_index; -- Process cache invalidations if inval_in = '1' then @@ -783,12 +831,12 @@ begin end loop; r.store_valid <= '0'; else - -- Do invalidations from snooped stores to memory, one - -- cycle after the address appears on wb_snoop_in. + -- Do invalidations from snooped stores to memory, + -- two cycles after the address appears on wb_snoop_in. for i in way_t loop if snoop_hits(i) = '1' then - assert not is_X(snoop_index) severity failure; - cache_valids(to_integer(snoop_index))(i) <= '0'; + assert not is_X(snoop_index2) severity failure; + cache_valids(to_integer(snoop_index2))(i) <= '0'; end if; end loop; end if; @@ -846,15 +894,6 @@ begin assert not is_X(replace_way) severity failure; cache_valids(to_integer(r.store_index))(to_integer(replace_way)) <= '0'; - -- Store new tag in selected way - for i in 0 to NUM_WAYS-1 loop - if to_unsigned(i, WAY_BITS) = replace_way then - tagset := cache_tags(to_integer(r.store_index)); - write_tag(i, tagset, r.store_tag); - cache_tags(to_integer(r.store_index)) <= tagset; - end if; - end loop; - r.state <= WAIT_ACK; end if; From 27c50bc31185b0081e522ad79322706192c41d4e Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 5 Aug 2023 14:31:43 +1000 Subject: [PATCH 08/11] Makefile: Remove overriding of ICACHE_NUM_LINES on ECP5 platforms Now that the icache tag RAM is accessed synchronously, the 
free tools recognize it as block RAM on ECP5-based platforms; thus we no longer need to force it to a very small value. Signed-off-by: Paul Mackerras --- Makefile | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Makefile b/Makefile index bf928da..10c8144 100644 --- a/Makefile +++ b/Makefile @@ -166,10 +166,6 @@ RAM_INIT_FILE ?=hello_world/hello_world.hex FPGA_TARGET ?= ORANGE-CRAB-0.21 -# FIXME: icache RAMs aren't being inferrenced as block RAMs on ECP5 -# with yosys, so make it smaller for now as a workaround. -ICACHE_NUM_LINES=4 - clkgen=fpga/clk_gen_ecp5.vhd toplevel=fpga/top-generic.vhdl dmi_dtm=dmi_dtm_dummy.vhdl @@ -227,7 +223,7 @@ LITEDRAM_GHDL_ARG=-gUSE_LITEDRAM=true endif GHDL_IMAGE_GENERICS=-gMEMORY_SIZE=$(MEMORY_SIZE) -gRAM_INIT_FILE=$(RAM_INIT_FILE) \ - -gRESET_LOW=$(RESET_LOW) -gCLK_INPUT=$(CLK_INPUT) -gCLK_FREQUENCY=$(CLK_FREQUENCY) -gICACHE_NUM_LINES=$(ICACHE_NUM_LINES) \ + -gRESET_LOW=$(RESET_LOW) -gCLK_INPUT=$(CLK_INPUT) -gCLK_FREQUENCY=$(CLK_FREQUENCY) \ $(LITEDRAM_GHDL_ARG) From f9e5622327e5d6b0e2e624acead9b71c91948fe7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 15 Aug 2023 20:50:17 +1000 Subject: [PATCH 09/11] Move iTLB from icache to fetch1 This moves the address translation step for instruction fetches one cycle earlier, so that it now happens in the fetch1 stage. There is now a 2-entry mini translation cache ("ERAT", or effective to real address translation cache) which operates on the output of the multiplexer that selects the instruction address for the next cycle. The ERAT consists of two effective address registers and two corresponding real address registers. They store the page number part of the addresses for a 4kB page size, which is the smallest page size supported by the architecture. If the effective address doesn't match either of the EA registers, and address translation is enabled, then i_out.req goes low for two cycles while the iTLB is looked up. 
Experimentally, this delay results in a 0.1% drop in coremark performance; allowing two cycles for the lookup results in better timing. The result from the iTLB is placed into the least recently used ERAT entry and then used to translate the address as normal. If address translation is not enabled then the EA is used directly as the real address. The iTLB structure is the same as it was before; direct mapped, indexed using a hashed EA. The "fetch failed" signal, which indicates a TLB miss or protection violation, is now generated in fetch1 and passed through icache. When it is asserted, fetch1 goes into a stalled state until a PTE arrives from the MMU (which gets put into both the iTLB and the ERAT), or an interrupt or redirect occurs. Any TLB invalidations from the MMU invalidate the whole ERAT. Signed-off-by: Paul Mackerras --- common.vhdl | 8 +- core.vhdl | 10 +-- fetch1.vhdl | 239 ++++++++++++++++++++++++++++++++++++++++++++++++---- icache.vhdl | 153 ++------------------------------- mmu.vhdl | 2 +- 5 files changed, 244 insertions(+), 168 deletions(-) diff --git a/common.vhdl b/common.vhdl index 041a5f2..efcf7b3 100644 --- a/common.vhdl +++ b/common.vhdl @@ -194,6 +194,10 @@ package common is subtype real_addr_t is std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); function addr_to_real(addr: std_ulogic_vector(63 downto 0)) return real_addr_t; + -- Minimum page size + constant MIN_LG_PGSZ : positive := 12; + constant MIN_PAGESZ : positive := 2 ** MIN_LG_PGSZ; + -- Used for tracking instruction completion and pending register writes constant TAG_COUNT : positive := 4; constant TAG_NUMBER_BITS : natural := log2(TAG_COUNT); @@ -231,6 +235,7 @@ package common is type Fetch1ToIcacheType is record req: std_ulogic; + fetch_fail : std_ulogic; virt_mode : std_ulogic; priv_mode : std_ulogic; big_endian : std_ulogic; @@ -239,6 +244,7 @@ package common is pred_ntaken : std_ulogic; nia: std_ulogic_vector(63 downto 0); next_nia: std_ulogic_vector(63 downto 0); + rpn: 
std_ulogic_vector(REAL_ADDR_BITS - MIN_LG_PGSZ - 1 downto 0); end record; type IcacheToDecode1Type is record @@ -607,7 +613,7 @@ package common is data : std_ulogic_vector(63 downto 0); end record; - type MmuToIcacheType is record + type MmuToITLBType is record tlbld : std_ulogic; tlbie : std_ulogic; doall : std_ulogic; diff --git a/core.vhdl b/core.vhdl index a556069..35a860e 100644 --- a/core.vhdl +++ b/core.vhdl @@ -57,7 +57,7 @@ architecture behave of core is signal fetch1_to_icache : Fetch1ToIcacheType; signal writeback_to_fetch1: WritebackToFetch1Type; signal icache_to_decode1 : IcacheToDecode1Type; - signal mmu_to_icache : MmuToIcacheType; + signal mmu_to_itlb : MmuToITLBType; -- decode signals signal decode1_to_decode2: Decode1ToDecode2Type; @@ -223,6 +223,7 @@ begin generic map ( RESET_ADDRESS => (others => '0'), ALT_RESET_ADDRESS => ALT_RESET_ADDRESS, + TLB_SIZE => ICACHE_TLB_SIZE, HAS_BTC => HAS_BTC ) port map ( @@ -231,8 +232,9 @@ begin alt_reset_in => alt_reset_d, stall_in => fetch1_stall_in, flush_in => fetch1_flush, - inval_btc => ex1_icache_inval or mmu_to_icache.tlbie, + inval_btc => ex1_icache_inval or mmu_to_itlb.tlbie, stop_in => dbg_core_stop, + m_in => mmu_to_itlb, d_in => decode1_to_fetch1, w_in => writeback_to_fetch1, i_out => fetch1_to_icache, @@ -249,7 +251,6 @@ begin LINE_SIZE => 64, NUM_LINES => ICACHE_NUM_LINES, NUM_WAYS => ICACHE_NUM_WAYS, - TLB_SIZE => ICACHE_TLB_SIZE, LOG_LENGTH => LOG_LENGTH ) port map( @@ -257,7 +258,6 @@ begin rst => rst_icache, i_in => fetch1_to_icache, i_out => icache_to_decode1, - m_in => mmu_to_icache, flush_in => fetch1_flush, inval_in => dbg_icache_rst or ex1_icache_inval, stall_in => icache_stall_in, @@ -454,7 +454,7 @@ begin l_out => mmu_to_loadstore1, d_out => mmu_to_dcache, d_in => dcache_to_mmu, - i_out => mmu_to_icache + i_out => mmu_to_itlb ); dcache_0: entity work.dcache diff --git a/fetch1.vhdl b/fetch1.vhdl index b6c2205..98116f9 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -3,12 +3,14 @@ use 
ieee.std_logic_1164.all; use ieee.numeric_std.all; library work; +use work.utils.all; use work.common.all; entity fetch1 is generic( RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0'); ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0'); + TLB_SIZE : positive := 64; -- L1 ITLB number of entries (direct mapped) HAS_BTC : boolean := true ); port( @@ -21,6 +23,7 @@ entity fetch1 is inval_btc : in std_ulogic; stop_in : in std_ulogic; alt_reset_in : in std_ulogic; + m_in : in MmuToITLBType; -- redirect from writeback unit w_in : in WritebackToFetch1Type; @@ -40,13 +43,32 @@ architecture behaviour of fetch1 is type reg_internal_t is record mode_32bit: std_ulogic; rd_is_niap4: std_ulogic; + tlbcheck: std_ulogic; + tlbstall: std_ulogic; next_nia: std_ulogic_vector(63 downto 0); end record; + + -- Mini effective to real translation cache + type erat_t is record + epn0: std_ulogic_vector(63 - MIN_LG_PGSZ downto 0); + epn1: std_ulogic_vector(63 - MIN_LG_PGSZ downto 0); + rpn0: std_ulogic_vector(REAL_ADDR_BITS - MIN_LG_PGSZ - 1 downto 0); + rpn1: std_ulogic_vector(REAL_ADDR_BITS - MIN_LG_PGSZ - 1 downto 0); + priv0: std_ulogic; + priv1: std_ulogic; + valid: std_ulogic_vector(1 downto 0); + mru: std_ulogic; -- '1' => entry 1 most recently used + end record; + signal r, r_next : Fetch1ToIcacheType; signal r_int, r_next_int : reg_internal_t; signal advance_nia : std_ulogic; signal log_nia : std_ulogic_vector(42 downto 0); + signal erat : erat_t; + signal erat_hit : std_ulogic; + signal erat_sel : std_ulogic; + constant BTC_ADDR_BITS : integer := 10; constant BTC_TAG_BITS : integer := 62 - BTC_ADDR_BITS; constant BTC_TARGET_BITS : integer := 62; @@ -58,6 +80,41 @@ architecture behaviour of fetch1 is signal btc_rd_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0) := (others => '0'); signal btc_rd_valid : std_ulogic := '0'; + -- L1 ITLB. 
+ constant TLB_BITS : natural := log2(TLB_SIZE); + constant TLB_EA_TAG_BITS : natural := 64 - (MIN_LG_PGSZ + TLB_BITS); + constant TLB_PTE_BITS : natural := 64; + + subtype tlb_index_t is integer range 0 to TLB_SIZE - 1; + type tlb_valids_t is array(tlb_index_t) of std_ulogic; + subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0); + type tlb_tags_t is array(tlb_index_t) of tlb_tag_t; + subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0); + type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t; + + signal itlb_valids : tlb_valids_t; + signal itlb_tags : tlb_tags_t; + signal itlb_ptes : tlb_ptes_t; + + -- Values read from above arrays on a clock edge + signal itlb_valid : std_ulogic; + signal itlb_ttag : tlb_tag_t; + signal itlb_pte : tlb_pte_t; + signal itlb_hit : std_ulogic; + + -- Privilege bit from PTE EAA field + signal eaa_priv : std_ulogic; + + -- Simple hash for direct-mapped TLB index + function hash_ea(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is + variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0); + begin + hash := addr(MIN_LG_PGSZ + TLB_BITS - 1 downto MIN_LG_PGSZ) + xor addr(MIN_LG_PGSZ + 2 * TLB_BITS - 1 downto MIN_LG_PGSZ + TLB_BITS) + xor addr(MIN_LG_PGSZ + 3 * TLB_BITS - 1 downto MIN_LG_PGSZ + 2 * TLB_BITS); + return hash; + end; + begin regs : process(clk) @@ -75,7 +132,8 @@ begin " S:" & std_ulogic'image(stall_in) & " T:" & std_ulogic'image(stop_in) & " nia:" & to_hstring(r_next.nia) & - " req:" & std_ulogic'image(r_next.req); + " req:" & std_ulogic'image(r_next.req) & + " FF:" & std_ulogic'image(r_next.fetch_fail); end if; if advance_nia = '1' then r <= r_next; @@ -84,6 +142,9 @@ begin -- always send the up-to-date stop mark and req r.stop_mark <= stop_in; r.req <= r_next.req; + r.fetch_fail <= r_next.fetch_fail; + r_int.tlbcheck <= r_next_int.tlbcheck; + r_int.tlbstall <= r_next_int.tlbstall; end if; end process; log_out <= log_nia; @@ -134,20 +195,113 @@ begin end process; end generate; + 
erat_sync : process(clk) + begin + if rising_edge(clk) then + if rst /= '0' or m_in.tlbie = '1' then + erat.valid <= "00"; + erat.mru <= '0'; + else + if erat_hit = '1' then + erat.mru <= erat_sel; + end if; + if m_in.tlbld = '1' then + erat.epn0 <= m_in.addr(63 downto MIN_LG_PGSZ); + erat.rpn0 <= m_in.pte(REAL_ADDR_BITS-1 downto MIN_LG_PGSZ); + erat.priv0 <= m_in.pte(3); + erat.valid(0) <= '1'; + erat.valid(1) <= '0'; + erat.mru <= '0'; + elsif r_int.tlbcheck = '1' and itlb_hit = '1' then + if erat.mru = '0' then + erat.epn1 <= r.nia(63 downto MIN_LG_PGSZ); + erat.rpn1 <= itlb_pte(REAL_ADDR_BITS-1 downto MIN_LG_PGSZ); + erat.priv1 <= itlb_pte(3); + erat.valid(1) <= '1'; + else + erat.epn0 <= r.nia(63 downto MIN_LG_PGSZ); + erat.rpn0 <= itlb_pte(REAL_ADDR_BITS-1 downto MIN_LG_PGSZ); + erat.priv0 <= itlb_pte(3); + erat.valid(0) <= '1'; + end if; + erat.mru <= not erat.mru; + end if; + end if; + end if; + end process; + + -- Read TLB using the NIA for the next cycle + itlb_read : process(clk) + variable tlb_req_index : std_ulogic_vector(TLB_BITS - 1 downto 0); + begin + if rising_edge(clk) then + if advance_nia = '1' then + tlb_req_index := hash_ea(r_next.nia); + if is_X(tlb_req_index) then + itlb_pte <= (others => 'X'); + itlb_ttag <= (others => 'X'); + itlb_valid <= 'X'; + else + itlb_pte <= itlb_ptes(to_integer(unsigned(tlb_req_index))); + itlb_ttag <= itlb_tags(to_integer(unsigned(tlb_req_index))); + itlb_valid <= itlb_valids(to_integer(unsigned(tlb_req_index))); + end if; + end if; + end if; + end process; + + -- TLB hit detection + itlb_lookup : process(all) + begin + itlb_hit <= '0'; + if itlb_ttag = r.nia(63 downto MIN_LG_PGSZ + TLB_BITS) then + itlb_hit <= itlb_valid; + end if; + end process; + + -- iTLB update + itlb_update: process(clk) + variable wr_index : std_ulogic_vector(TLB_BITS - 1 downto 0); + begin + if rising_edge(clk) then + wr_index := hash_ea(m_in.addr); + if rst = '1' or (m_in.tlbie = '1' and m_in.doall = '1') then + -- clear all valid bits + 
for i in tlb_index_t loop + itlb_valids(i) <= '0'; + end loop; + elsif m_in.tlbie = '1' then + assert not is_X(wr_index) report "icache index invalid on write" severity FAILURE; + -- clear entry regardless of hit or miss + itlb_valids(to_integer(unsigned(wr_index))) <= '0'; + elsif m_in.tlbld = '1' then + assert not is_X(wr_index) report "icache index invalid on write" severity FAILURE; + itlb_tags(to_integer(unsigned(wr_index))) <= m_in.addr(63 downto MIN_LG_PGSZ + TLB_BITS); + itlb_ptes(to_integer(unsigned(wr_index))) <= m_in.pte; + itlb_valids(to_integer(unsigned(wr_index))) <= '1'; + end if; + --ev.itlb_miss_resolved <= m_in.tlbld and not rst; + end if; + end process; + comb : process(all) variable v : Fetch1ToIcacheType; variable v_int : reg_internal_t; variable next_nia : std_ulogic_vector(63 downto 0); variable m32 : std_ulogic; + variable ehit, esel : std_ulogic; + variable eaa_priv : std_ulogic; begin v := r; v_int := r_int; v.predicted := '0'; v.pred_ntaken := '0'; - v.req := not (rst or w_in.interrupt or stop_in); - -- reduce metavalue warnings in sim - if is_X(rst) then - v.req := '0'; + v.req := not stop_in; + v_int.tlbstall := r_int.tlbcheck; + v_int.tlbcheck := '0'; + + if r_int.tlbcheck = '1' and itlb_hit = '0' then + v.fetch_fail := '1'; end if; -- Combinatorial computation of the CIA for the next cycle. 
@@ -163,8 +317,13 @@ begin v.priv_mode := w_in.priv_mode; v.big_endian := w_in.big_endian; v_int.mode_32bit := w_in.mode_32bit; + v.fetch_fail := '0'; elsif d_in.redirect = '1' then next_nia := d_in.redirect_nia(63 downto 2) & "00"; + v.fetch_fail := '0'; + elsif r_int.tlbstall = '1' then + -- this case is needed so that the correct icache tags are read + next_nia := r.nia; else next_nia := r_int.next_nia; end if; @@ -182,6 +341,52 @@ begin btc_rd_addr <= unsigned(v_int.next_nia(BTC_ADDR_BITS + 1 downto 2)); v_int.rd_is_niap4 := '1'; + -- If the last NIA value went down with a stop mark, it didn't get + -- executed, and hence we shouldn't increment NIA. + advance_nia <= rst or w_in.interrupt or w_in.redirect or d_in.redirect or + (not r.stop_mark and not (r.req and stall_in)); + -- reduce metavalue warnings in sim + if is_X(rst) then + advance_nia <= '1'; + end if; + + -- Translate next_nia to real if possible, otherwise we have to stall + -- and look up the TLB. + ehit := '0'; + esel := '0'; + eaa_priv := '1'; + if next_nia(63 downto MIN_LG_PGSZ) = erat.epn1 and erat.valid(1) = '1' then + ehit := '1'; + esel := '1'; + end if; + if next_nia(63 downto MIN_LG_PGSZ) = erat.epn0 and erat.valid(0) = '1' then + ehit := '1'; + end if; + if v.virt_mode = '0' then + v.rpn := v.nia(REAL_ADDR_BITS - 1 downto MIN_LG_PGSZ); + eaa_priv := '1'; + elsif esel = '1' then + v.rpn := erat.rpn1; + eaa_priv := erat.priv1; + else + v.rpn := erat.rpn0; + eaa_priv := erat.priv0; + end if; + if advance_nia = '1' and ehit = '0' and v.virt_mode = '1' and + r_int.tlbcheck = '0' and v.fetch_fail = '0' then + v_int.tlbstall := '1'; + v_int.tlbcheck := '1'; + end if; + if ehit = '1' or v.virt_mode = '0' then + if eaa_priv = '1' and v.priv_mode = '0' then + v.fetch_fail := '1'; + else + v.fetch_fail := '0'; + end if; + end if; + erat_hit <= ehit and advance_nia; + erat_sel <= esel; + if rst /= '0' then if alt_reset_in = '1' then v_int.next_nia := ALT_RESET_ADDRESS; @@ -192,16 +397,29 @@ begin 
v_int.next_nia := 52x"0" & w_in.intr_vec(11 downto 2) & "00"; end if; if rst /= '0' or w_in.interrupt = '1' then + v.req := '0'; v.virt_mode := '0'; v.priv_mode := '1'; v.big_endian := '0'; v_int.mode_32bit := '0'; v_int.rd_is_niap4 := '0'; + v_int.tlbstall := '0'; + v_int.tlbcheck := '0'; + v.fetch_fail := '0'; + end if; + if v.fetch_fail = '1' then + v_int.tlbstall := '1'; + end if; + if v_int.tlbstall = '1' then + v.req := '0'; end if; -- If there is a valid entry in the BTC which corresponds to the next instruction, -- use that to predict the address of the instruction after that. - if rst = '0' and w_in.interrupt = '0' and w_in.redirect = '0' and d_in.redirect = '0' and + -- (w_in.redirect = '0' and d_in.redirect = '0' and r_int.tlbstall = '0') + -- implies v.nia = r_int.next_nia. + -- r_int.rd_is_niap4 implies r_int.next_nia is the address used to read the BTC. + if v.req = '1' and w_in.redirect = '0' and d_in.redirect = '0' and r_int.tlbstall = '0' and btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and btc_rd_data(BTC_WIDTH - 2) = r.virt_mode and btc_rd_data(BTC_WIDTH - 3 downto BTC_TARGET_BITS) @@ -214,15 +432,6 @@ begin end if; end if; - -- If the last NIA value went down with a stop mark, it didn't get - -- executed, and hence we shouldn't increment NIA. 
- advance_nia <= rst or w_in.interrupt or w_in.redirect or d_in.redirect or - (not r.stop_mark and not stall_in); - -- reduce metavalue warnings in sim - if is_X(rst) then - advance_nia <= '1'; - end if; - r_next <= v; r_next_int <= v_int; diff --git a/icache.vhdl b/icache.vhdl index 4bd4491..cc1b2b3 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -41,10 +41,6 @@ entity icache is NUM_LINES : positive := 32; -- Number of ways NUM_WAYS : positive := 4; - -- L1 ITLB number of entries (direct mapped) - TLB_SIZE : positive := 64; - -- L1 ITLB log_2(page_size) - TLB_LG_PGSZ : positive := 12; -- Non-zero to enable log data collection LOG_LENGTH : natural := 0 ); @@ -55,8 +51,6 @@ entity icache is i_in : in Fetch1ToIcacheType; i_out : out IcacheToDecode1Type; - m_in : in MmuToIcacheType; - stall_in : in std_ulogic; stall_out : out std_ulogic; flush_in : in std_ulogic; @@ -157,35 +151,6 @@ architecture rtl of icache is type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic; signal cache_valids : cache_valids_t; - -- L1 ITLB. 
- constant TLB_BITS : natural := log2(TLB_SIZE); - constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS); - constant TLB_PTE_BITS : natural := 64; - - subtype tlb_index_t is integer range 0 to TLB_SIZE - 1; - type tlb_valids_t is array(tlb_index_t) of std_ulogic; - subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0); - type tlb_tags_t is array(tlb_index_t) of tlb_tag_t; - subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0); - type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t; - - signal itlb_valids : tlb_valids_t; - signal itlb_tags : tlb_tags_t; - signal itlb_ptes : tlb_ptes_t; - - -- Values read from above arrays on a clock edge - signal itlb_valid : std_ulogic; - signal itlb_ttag : tlb_tag_t; - signal itlb_pte : tlb_pte_t; - - -- Values captured from a write to a TLB - signal itlb_bypass_valid : std_ulogic; - signal itlb_bypass_ra : std_ulogic_vector(REAL_ADDR_BITS - TLB_LG_PGSZ - 1 downto 0); - signal itlb_bypass_priv : std_ulogic; - - -- Privilege bit from PTE EAA field - signal eaa_priv : std_ulogic; - -- Cache reload state machine type state_t is (IDLE, STOP_RELOAD, CLR_TAG, WAIT_ACK); @@ -233,9 +198,6 @@ architecture rtl of icache is signal req_raddr : real_addr_t; signal real_addr : real_addr_t; - signal ra_valid : std_ulogic; - signal priv_fault : std_ulogic; - signal access_ok : std_ulogic; -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; @@ -330,16 +292,6 @@ architecture rtl of icache is return endian & addr(addr'left downto SET_SIZE_BITS); end; - -- Simple hash for direct-mapped TLB index - function hash_ea(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is - variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0); - begin - hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ) - xor addr(TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto TLB_LG_PGSZ + TLB_BITS) - xor addr(TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto TLB_LG_PGSZ + 2 * TLB_BITS); - return hash; - end; - begin -- 
byte-swap read data if big endian @@ -530,95 +482,10 @@ begin end process; end generate; - -- Read TLB using the NIA for the next cycle - itlb_read : process(clk) - variable tlb_req_index : std_ulogic_vector(TLB_BITS - 1 downto 0); - begin - if rising_edge(clk) then - if flush_in = '1' or i_in.req = '0' or (stall_in = '0' and stall_out = '0') then - tlb_req_index := hash_ea(i_in.next_nia); - if is_X(tlb_req_index) then - itlb_pte <= (others => 'X'); - itlb_ttag <= (others => 'X'); - itlb_valid <= 'X'; - else - itlb_pte <= itlb_ptes(to_integer(unsigned(tlb_req_index))); - itlb_ttag <= itlb_tags(to_integer(unsigned(tlb_req_index))); - itlb_valid <= itlb_valids(to_integer(unsigned(tlb_req_index))); - end if; - end if; - end if; - end process; - - -- Store TLB data being written for use in servicing the current request - itlb_bypass: process(clk) - begin - if rising_edge(clk) then - if rst = '1' then - itlb_bypass_valid <= '0'; - itlb_bypass_ra <= (others => '0'); - itlb_bypass_priv <= '0'; - elsif flush_in = '1' or i_in.req = '0' or stall_out = '0' then - itlb_bypass_valid <= '0'; - elsif m_in.tlbld = '1' then - assert i_in.nia(63 downto TLB_LG_PGSZ) = m_in.addr(63 downto TLB_LG_PGSZ); - itlb_bypass_valid <= '1'; - itlb_bypass_ra <= m_in.pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ); - itlb_bypass_priv <= m_in.pte(3); - end if; - end if; - end process; - -- TLB hit detection and real address generation itlb_lookup : process(all) begin - if itlb_bypass_valid = '1' then - real_addr <= itlb_bypass_ra & i_in.nia(TLB_LG_PGSZ - 1 downto 0); - ra_valid <= '1'; - eaa_priv <= itlb_bypass_priv; - elsif i_in.virt_mode = '1' then - real_addr <= itlb_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & - i_in.nia(TLB_LG_PGSZ - 1 downto 0); - if itlb_ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then - ra_valid <= itlb_valid; - else - ra_valid <= '0'; - end if; - eaa_priv <= itlb_pte(3); - else - real_addr <= addr_to_real(i_in.nia); - ra_valid <= '1'; - eaa_priv <= '1'; - end if; - - -- 
no IAMR, so no KUEP support for now - priv_fault <= eaa_priv and not i_in.priv_mode; - access_ok <= ra_valid and not priv_fault; - end process; - - -- iTLB update - itlb_update: process(clk) - variable wr_index : std_ulogic_vector(TLB_BITS - 1 downto 0); - begin - if rising_edge(clk) then - wr_index := hash_ea(m_in.addr); - if rst = '1' or (m_in.tlbie = '1' and m_in.doall = '1') then - -- clear all valid bits - for i in tlb_index_t loop - itlb_valids(i) <= '0'; - end loop; - elsif m_in.tlbie = '1' then - assert not is_X(wr_index) report "icache index invalid on write" severity FAILURE; - -- clear entry regardless of hit or miss - itlb_valids(to_integer(unsigned(wr_index))) <= '0'; - elsif m_in.tlbld = '1' then - assert not is_X(wr_index) report "icache index invalid on write" severity FAILURE; - itlb_tags(to_integer(unsigned(wr_index))) <= m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS); - itlb_ptes(to_integer(unsigned(wr_index))) <= m_in.pte; - itlb_valids(to_integer(unsigned(wr_index))) <= '1'; - end if; - ev.itlb_miss_resolved <= m_in.tlbld and not rst; - end if; + real_addr <= i_in.rpn & i_in.nia(MIN_LG_PGSZ - 1 downto 0); end process; -- Cache hit detection, output to fetch2 and other misc logic @@ -667,7 +534,7 @@ begin end if; -- Generate the "hit" and "miss" signals for the synchronous blocks - if i_in.req = '1' and access_ok = '1' and flush_in = '0' and rst = '0' then + if i_in.req = '1' and flush_in = '0' and rst = '0' then req_is_hit <= is_hit; req_is_miss <= not is_hit; else @@ -711,8 +578,8 @@ begin i_out.next_predicted <= r.predicted; i_out.next_pred_ntaken <= r.pred_ntaken; - -- Stall fetch1 if we have a miss on cache or TLB or a protection fault - stall_out <= i_in.req and not (is_hit and access_ok) and not flush_in; + -- Stall fetch1 if we have a cache miss + stall_out <= i_in.req and not is_hit and not flush_in; -- Wishbone requests output (from the cache miss reload machine) wishbone_out <= r.wb; @@ -763,6 +630,7 @@ begin r.big_endian <= 
i_in.big_endian; r.predicted <= i_in.predicted; r.pred_ntaken <= i_in.pred_ntaken; + r.fetch_failed <= i_in.fetch_fail and not flush_in; end if; if i_out.valid = '1' then assert not is_X(i_out.insn) severity failure; @@ -955,13 +823,6 @@ begin end if; end case; end if; - - -- TLB miss and protection fault processing - if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then - r.fetch_failed <= '0'; - elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then - r.fetch_failed <= '1'; - end if; end if; end process; @@ -991,8 +852,8 @@ begin wstate & std_ulogic_vector(resize(lway, 3)) & req_is_hit & req_is_miss & - access_ok & - ra_valid; + '1' & -- was access_ok + '1'; -- was ra_valid end if; end process; log_out <= log_data; diff --git a/mmu.vhdl b/mmu.vhdl index 1774822..fb63cfd 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -20,7 +20,7 @@ entity mmu is d_out : out MmuToDcacheType; d_in : in DcacheToMmuType; - i_out : out MmuToIcacheType + i_out : out MmuToITLBType ); end mmu; From 73b6004ac6ff8787ff05497a6bc5965f0ccea2d3 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 15 Aug 2023 11:30:53 +1000 Subject: [PATCH 10/11] icache: Use next real address to index icache Now that we are translating the fetch effective address to real one cycle earlier, we can use the real address to index the icache array. This has the benefit that the set size can be larger than a page, enabling us to configure the icache to be larger without having to increase its associativity. Previously the set size was limited to the page size to avoid aliasing problems. Thus for example a 32kB icache would need to be 8-way associative, resulting in large numbers of LUTs being used for tag comparisons in FPGA implementations, and poor timing. With this change, a 32kB icache can be 1 or 2-way associative, which means deeper and narrower tag and data RAMs and fewer tag comparators. 
Signed-off-by: Paul Mackerras --- common.vhdl | 1 + fetch1.vhdl | 1 + icache.vhdl | 35 ++++++++++++++++++----------------- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/common.vhdl b/common.vhdl index efcf7b3..eefa2fd 100644 --- a/common.vhdl +++ b/common.vhdl @@ -245,6 +245,7 @@ package common is nia: std_ulogic_vector(63 downto 0); next_nia: std_ulogic_vector(63 downto 0); rpn: std_ulogic_vector(REAL_ADDR_BITS - MIN_LG_PGSZ - 1 downto 0); + next_rpn: std_ulogic_vector(REAL_ADDR_BITS - MIN_LG_PGSZ - 1 downto 0); end record; type IcacheToDecode1Type is record diff --git a/fetch1.vhdl b/fetch1.vhdl index 98116f9..677fa27 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -438,6 +438,7 @@ begin -- Update outputs to the icache i_out <= r; i_out.next_nia <= next_nia; + i_out.next_rpn <= v.rpn; end process; diff --git a/icache.vhdl b/icache.vhdl index cc1b2b3..8dfbd86 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -158,6 +158,7 @@ architecture rtl of icache is -- Cache hit state (Latches for 1 cycle BRAM access) hit_way : way_sig_t; hit_nia : std_ulogic_vector(63 downto 0); + hit_ra : real_addr_t; hit_smark : std_ulogic; hit_valid : std_ulogic; big_endian: std_ulogic; @@ -218,7 +219,7 @@ architecture rtl of icache is signal log_insn : std_ulogic_vector(35 downto 0); -- Return the cache line index (tag index) for an address - function get_index(addr: std_ulogic_vector) return index_sig_t is + function get_index(addr: real_addr_t) return index_sig_t is begin return unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)); end; @@ -400,6 +401,7 @@ begin process(clk) variable replace_way : way_sig_t; variable snoop_addr : real_addr_t; + variable next_raddr : real_addr_t; begin replace_way := to_unsigned(0, WAY_BITS); if NUM_WAYS > 1 then @@ -409,10 +411,11 @@ begin if rising_edge(clk) then -- Read tags using NIA for next cycle if flush_in = '1' or i_in.req = '0' or (stall_in = '0' and stall_out = '0') then - cache_tags_set(i) <= 
ic_tags(to_integer(get_index(i_in.next_nia))); + next_raddr := i_in.next_rpn & i_in.next_nia(MIN_LG_PGSZ - 1 downto 0); + cache_tags_set(i) <= ic_tags(to_integer(get_index(next_raddr))); -- Check for simultaneous write to the same location tag_overwrite(i) <= '0'; - if r.state = CLR_TAG and r.store_index = get_index(i_in.next_nia) and + if r.state = CLR_TAG and r.store_index = get_index(next_raddr) and to_unsigned(i, WAY_BITS) = replace_way then tag_overwrite(i) <= '1'; end if; @@ -459,10 +462,10 @@ begin process(all) begin -- Read PLRU bits from array - if is_X(r.hit_nia) then + if is_X(r.hit_ra) then plru_cur <= (others => 'X'); else - plru_cur <= plru_ram(to_integer(get_index(r.hit_nia))); + plru_cur <= plru_ram(to_integer(get_index(r.hit_ra))); end if; -- PLRU interface @@ -475,35 +478,32 @@ begin begin if rising_edge(clk) then if r.hit_valid = '1' then - assert not is_X(r.hit_nia) severity failure; - plru_ram(to_integer(get_index(r.hit_nia))) <= plru_upd; + assert not is_X(r.hit_ra) severity failure; + plru_ram(to_integer(get_index(r.hit_ra))) <= plru_upd; end if; end if; end process; end generate; - -- TLB hit detection and real address generation - itlb_lookup : process(all) - begin - real_addr <= i_in.rpn & i_in.nia(MIN_LG_PGSZ - 1 downto 0); - end process; - -- Cache hit detection, output to fetch2 and other misc logic icache_comb : process(all) variable is_hit : std_ulogic; variable hit_way : way_sig_t; variable insn : std_ulogic_vector(ICWORDLEN - 1 downto 0); variable icode : insn_code; + variable ra : real_addr_t; begin -- Extract line, row and tag from request - req_index <= get_index(i_in.nia); - req_row <= get_row(i_in.nia); - req_tag <= get_tag(real_addr, i_in.big_endian); + ra := i_in.rpn & i_in.nia(MIN_LG_PGSZ - 1 downto 0); + real_addr <= ra; + req_index <= get_index(ra); + req_row <= get_row(ra); + req_tag <= get_tag(ra, i_in.big_endian); -- Calculate address of beginning of cache row, will be -- used for cache miss processing if needed -- - 
req_raddr <= real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) & + req_raddr <= ra(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) & (ROW_OFF_BITS-1 downto 0 => '0'); -- Test if pending request is a hit on any way @@ -627,6 +627,7 @@ begin -- Send stop marks and NIA down regardless of validity r.hit_smark <= i_in.stop_mark; r.hit_nia <= i_in.nia; + r.hit_ra <= real_addr; r.big_endian <= i_in.big_endian; r.predicted <= i_in.predicted; r.pred_ntaken <= i_in.pred_ntaken; From 73a2fcbc7fd4a782d3922fee0776f3feee52fffd Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 20 Sep 2023 09:41:16 +1000 Subject: [PATCH 11/11] icache_tb: Update for recent icache changes - Provide next_nia before clock edge where req is asserted - Set rpn and next_rpn to zero - There is no longer an input to the icache from the MMU Signed-off-by: Paul Mackerras --- icache_tb.vhdl | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/icache_tb.vhdl b/icache_tb.vhdl index 83a84b3..05f7bd5 100644 --- a/icache_tb.vhdl +++ b/icache_tb.vhdl @@ -15,8 +15,6 @@ architecture behave of icache_tb is signal i_out : Fetch1ToIcacheType; signal i_in : IcacheToDecode1Type; - signal m_out : MmuToIcacheType; - signal wb_bram_in : wishbone_master_out; signal wb_bram_out : wishbone_slave_out; @@ -32,7 +30,6 @@ begin rst => rst, i_in => i_out, i_out => i_in, - m_in => m_out, stall_in => '0', flush_in => '0', inval_in => '0', @@ -77,19 +74,21 @@ begin i_out.priv_mode <= '1'; i_out.virt_mode <= '0'; i_out.big_endian <= '0'; - - m_out.tlbld <= '0'; - m_out.tlbie <= '0'; - m_out.addr <= (others => '0'); - m_out.pte <= (others => '0'); + i_out.fetch_fail <= '0'; + i_out.predicted <= '0'; + i_out.pred_ntaken <= '0'; wait until rising_edge(clk); wait until rising_edge(clk); wait until rising_edge(clk); + + i_out.next_nia <= x"0000000000000004"; + i_out.next_rpn <= (others => '0'); wait until rising_edge(clk); i_out.req <= '1'; i_out.nia <= x"0000000000000004"; + i_out.rpn <= (others => '0'); 
wait for 30*clk_period; wait until rising_edge(clk); @@ -102,6 +101,7 @@ begin severity failure; i_out.req <= '0'; + i_out.next_nia <= x"0000000000000008"; wait until rising_edge(clk); @@ -116,6 +116,8 @@ begin "=" & to_hstring(i_in.insn) & " expected 00000002" severity failure; + + i_out.next_nia <= x"0000000000000040"; wait until rising_edge(clk); -- another miss @@ -133,6 +135,9 @@ begin severity failure; -- test something that aliases + i_out.next_nia <= x"0000000000000100"; + wait until rising_edge(clk); + i_out.req <= '1'; i_out.nia <= x"0000000000000100"; wait until rising_edge(clk);