From 52d8f28d034f2949635a9320685faed902c37cf4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 4 Jan 2025 16:27:10 +1100 Subject: [PATCH 01/12] execute1: Improve timing for execute bypass tag The tags for the bypass data paths back to decode2 don't really need to depend on the stall/busy inputs or on whether an exception might be generated, since the bypass values won't be used until the instruction gets executed. Therefore, this simplifies the expressions for bypass_data.tag.valid and bypass_cr_data.tag.valid. Signed-off-by: Paul Mackerras --- execute1.vhdl | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/execute1.vhdl b/execute1.vhdl index a3b9522..f218ab8 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -1147,7 +1147,7 @@ begin -- side-effect flags or write enables when generating a trap). -- With v.trap = 1 we will assert both ex1.e.valid and ex1.e.interrupt -- to writeback, and it will complete the instruction and take - -- and interrupt. It is OK for v.trap to depend on operand data. + -- an interrupt. It is OK for v.trap to depend on operand data. 
illegal := '0'; privileged := '0'; @@ -1585,7 +1585,7 @@ begin if e_in.unit = ALU then v.complete := e_in.valid and not v.exception and not owait; - v.bypass_valid := e_in.valid and not v.exception and not slow_op; + v.bypass_valid := e_in.valid and not slow_op; end if; actions <= v; @@ -1631,7 +1631,7 @@ begin v.taken_branch_event := '0'; v.br_mispredict := '0'; v.busy := '0'; - bypass_valid := '0'; + bypass_valid := actions.bypass_valid; irq_valid := ex1.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in); @@ -1706,7 +1706,6 @@ begin if go = '1' then v.se := actions.se; v.e.valid := actions.complete; - bypass_valid := actions.bypass_valid; v.taken_branch_event := actions.take_branch; v.trace_next := actions.do_trace or actions.ciabr_trace; v.trace_ciabr := actions.ciabr_trace; @@ -1814,13 +1813,13 @@ begin v.fp_exception_next := '0'; end if; - bypass_data.tag.valid <= v.e.write_enable and bypass_valid; - bypass_data.tag.tag <= v.e.instr_tag.tag; + bypass_data.tag.valid <= e_in.write_reg_enable and bypass_valid; + bypass_data.tag.tag <= e_in.instr_tag.tag; bypass_data.data <= alu_result; - bypass_cr_data.tag.valid <= v.e.write_cr_enable and bypass_valid; - bypass_cr_data.tag.tag <= v.e.instr_tag.tag; - bypass_cr_data.data <= v.e.write_cr_data; + bypass_cr_data.tag.valid <= e_in.output_cr and bypass_valid; + bypass_cr_data.tag.tag <= e_in.instr_tag.tag; + bypass_cr_data.data <= write_cr_data; -- Outputs to loadstore1 (async) lv.op := e_in.insn_type; From f6a839a86b17876f02d38fe89a0554696b82723b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sun, 5 Jan 2025 21:59:45 +1100 Subject: [PATCH 02/12] control: Use a 1-hot encoding for bypass enables Instead of creating a 2-bit encoded bypass selector, we now have a 4-bit encoding where bits 1 to 3 enable separate bypass sources, and bit 0 indicates if any bypass should be used. This results in slightly simpler logic and better timing. 
Signed-off-by: Paul Mackerras --- control.vhdl | 73 ++++++++++++++++++++++---------------- decode2.vhdl | 98 +++++++++++++++++++++++++++------------------------- 2 files changed, 93 insertions(+), 78 deletions(-) diff --git a/control.vhdl b/control.vhdl index e8c8068..b75fcc1 100644 --- a/control.vhdl +++ b/control.vhdl @@ -45,9 +45,13 @@ entity control is valid_out : out std_ulogic; stopped_out : out std_ulogic; - gpr_bypass_a : out std_ulogic_vector(1 downto 0); - gpr_bypass_b : out std_ulogic_vector(1 downto 0); - gpr_bypass_c : out std_ulogic_vector(1 downto 0); + -- Note on gpr_bypass_*: bits 1 to 3 are a 1-hot encoding of which + -- bypass source we may possibly need to use; bit 0 is 1 if the bypass + -- value should be used (i.e. any of bits 1-3 are 1 and the + -- corresponding gpr_x_read_valid_in is also 1). + gpr_bypass_a : out std_ulogic_vector(3 downto 0); + gpr_bypass_b : out std_ulogic_vector(3 downto 0); + gpr_bypass_c : out std_ulogic_vector(3 downto 0); cr_bypass : out std_ulogic_vector(1 downto 0); instr_tag_out : out instr_tag_t @@ -152,9 +156,9 @@ begin variable tag_s : instr_tag_t; variable tag_t : instr_tag_t; variable incr_tag : tag_number_t; - variable byp_a : std_ulogic_vector(1 downto 0); - variable byp_b : std_ulogic_vector(1 downto 0); - variable byp_c : std_ulogic_vector(1 downto 0); + variable byp_a : std_ulogic_vector(3 downto 0); + variable byp_b : std_ulogic_vector(3 downto 0); + variable byp_c : std_ulogic_vector(3 downto 0); variable tag_cr : instr_tag_t; variable byp_cr : std_ulogic_vector(1 downto 0); variable tag_ov : instr_tag_t; @@ -163,57 +167,66 @@ begin tag_a := instr_tag_init; for i in tag_number_t loop if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_a_read_in then - tag_a.valid := gpr_a_read_valid_in; + tag_a.valid := '1'; tag_a.tag := i; end if; end loop; tag_b := instr_tag_init; for i in tag_number_t loop if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = 
gpr_b_read_in then - tag_b.valid := gpr_b_read_valid_in; + tag_b.valid := '1'; tag_b.tag := i; end if; end loop; tag_c := instr_tag_init; for i in tag_number_t loop if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_c_read_in then - tag_c.valid := gpr_c_read_valid_in; + tag_c.valid := '1'; tag_c.tag := i; end if; end loop; - byp_a := "00"; + byp_a := "0000"; if EX1_BYPASS and tag_match(execute_next_tag, tag_a) then - byp_a := "01"; - elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_a) then - byp_a := "10"; - elsif tag_match(complete_in, tag_a) then - byp_a := "11"; + byp_a(1) := '1'; end if; - byp_b := "00"; + if EX1_BYPASS and tag_match(execute2_next_tag, tag_a) then + byp_a(2) := '1'; + end if; + if tag_match(complete_in, tag_a) then + byp_a(3) := '1'; + end if; + byp_a(0) := gpr_a_read_valid_in and (byp_a(1) or byp_a(2) or byp_a(3)); + byp_b := "0000"; if EX1_BYPASS and tag_match(execute_next_tag, tag_b) then - byp_b := "01"; - elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_b) then - byp_b := "10"; - elsif tag_match(complete_in, tag_b) then - byp_b := "11"; + byp_b(1) := '1'; + end if; + if EX1_BYPASS and tag_match(execute2_next_tag, tag_b) then + byp_b(2) := '1'; end if; - byp_c := "00"; + if tag_match(complete_in, tag_b) then + byp_b(3) := '1'; + end if; + byp_b(0) := gpr_b_read_valid_in and (byp_b(1) or byp_b(2) or byp_b(3)); + byp_c := "0000"; if EX1_BYPASS and tag_match(execute_next_tag, tag_c) then - byp_c := "01"; - elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_c) then - byp_c := "10"; - elsif tag_match(complete_in, tag_c) then - byp_c := "11"; + byp_c(1) := '1'; + end if; + if EX1_BYPASS and tag_match(execute2_next_tag, tag_c) then + byp_c(2) := '1'; + end if; + if tag_match(complete_in, tag_c) then + byp_c(3) := '1'; end if; + byp_c(0) := gpr_c_read_valid_in and (byp_c(1) or byp_c(2) or byp_c(3)); gpr_bypass_a <= byp_a; gpr_bypass_b <= byp_b; gpr_bypass_c <= byp_c; - gpr_tag_stall <= (tag_a.valid and 
not (or (byp_a))) or - (tag_b.valid and not (or (byp_b))) or - (tag_c.valid and not (or (byp_c))); + gpr_tag_stall <= (tag_a.valid and gpr_a_read_valid_in and not byp_a(0)) or + (tag_b.valid and gpr_b_read_valid_in and not byp_b(0)) or + (tag_c.valid and gpr_c_read_valid_in and not byp_c(0)); incr_tag := curr_tag; instr_tag.tag <= curr_tag; diff --git a/decode2.vhdl b/decode2.vhdl index cc241a2..fd7434c 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -201,6 +201,23 @@ architecture behaviour of decode2 is end case; end; + function andor (mask_a : std_ulogic; val_a : std_ulogic_vector(63 downto 0); + mask_b : std_ulogic; val_b : std_ulogic_vector(63 downto 0); + mask_c : std_ulogic; val_c : std_ulogic_vector(63 downto 0)) return std_ulogic_vector is + variable t : std_ulogic_vector(63 downto 0) := (others => '0'); + begin + if mask_a = '1' then + t := val_a; + end if; + if mask_b = '1' then + t := t or val_b; + end if; + if mask_c = '1' then + t := t or val_c; + end if; + return t; + end; + -- control signals that are derived from insn_type type mux_select_array_t is array(insn_type_t) of std_ulogic_vector(2 downto 0); @@ -269,15 +286,15 @@ architecture behaviour of decode2 is signal gpr_a_read_valid : std_ulogic; signal gpr_a_read : gspr_index_t; - signal gpr_a_bypass : std_ulogic_vector(1 downto 0); + signal gpr_a_bypass : std_ulogic_vector(3 downto 0); signal gpr_b_read_valid : std_ulogic; signal gpr_b_read : gspr_index_t; - signal gpr_b_bypass : std_ulogic_vector(1 downto 0); + signal gpr_b_bypass : std_ulogic_vector(3 downto 0); signal gpr_c_read_valid : std_ulogic; signal gpr_c_read : gspr_index_t; - signal gpr_c_bypass : std_ulogic_vector(1 downto 0); + signal gpr_c_bypass : std_ulogic_vector(3 downto 0); signal cr_read_valid : std_ulogic; signal cr_write_valid : std_ulogic; @@ -694,53 +711,38 @@ begin ov_write_valid <= v.output_ov; -- See if any of the operands can get their value via the bypass path. 
- if dc2.busy = '0' or gpr_a_bypass /= "00" then - case gpr_a_bypass is - when "01" => - v.e.read_data1 := execute_bypass.data; - when "10" => - v.e.read_data1 := execute2_bypass.data; - when "11" => - v.e.read_data1 := writeback_bypass.data; - when others => - if decoded_reg_a.reg_valid = '1' then - v.e.read_data1 := r_in.read1_data; - else - v.e.read_data1 := decoded_reg_a.data; - end if; - end case; + if gpr_a_bypass(0) = '1' then + v.e.read_data1 := andor(gpr_a_bypass(1), execute_bypass.data, + gpr_a_bypass(2), execute2_bypass.data, + gpr_a_bypass(3), writeback_bypass.data); + elsif dc2.busy = '0' then + if decoded_reg_a.reg_valid = '1' then + v.e.read_data1 := r_in.read1_data; + else + v.e.read_data1 := decoded_reg_a.data; + end if; end if; - if dc2.busy = '0' or gpr_b_bypass /= "00" then - case gpr_b_bypass is - when "01" => - v.e.read_data2 := execute_bypass.data; - when "10" => - v.e.read_data2 := execute2_bypass.data; - when "11" => - v.e.read_data2 := writeback_bypass.data; - when others => - if decoded_reg_b.reg_valid = '1' then - v.e.read_data2 := r_in.read2_data; - else - v.e.read_data2 := decoded_reg_b.data; - end if; - end case; + if gpr_b_bypass(0) = '1' then + v.e.read_data2 := andor(gpr_b_bypass(1), execute_bypass.data, + gpr_b_bypass(2), execute2_bypass.data, + gpr_b_bypass(3), writeback_bypass.data); + elsif dc2.busy = '0' then + if decoded_reg_b.reg_valid = '1' then + v.e.read_data2 := r_in.read2_data; + else + v.e.read_data2 := decoded_reg_b.data; + end if; end if; - if dc2.busy = '0' or gpr_c_bypass /= "00" then - case gpr_c_bypass is - when "01" => - v.e.read_data3 := execute_bypass.data; - when "10" => - v.e.read_data3 := execute2_bypass.data; - when "11" => - v.e.read_data3 := writeback_bypass.data; - when others => - if decoded_reg_c.reg_valid = '1' then - v.e.read_data3 := r_in.read3_data; - else - v.e.read_data3 := decoded_reg_c.data; - end if; - end case; + if gpr_c_bypass(0) = '1' then + v.e.read_data3 := andor(gpr_c_bypass(1), 
execute_bypass.data, + gpr_c_bypass(2), execute2_bypass.data, + gpr_c_bypass(3), writeback_bypass.data); + elsif dc2.busy = '0' then + if decoded_reg_c.reg_valid = '1' then + v.e.read_data3 := r_in.read3_data; + else + v.e.read_data3 := decoded_reg_c.data; + end if; end if; case cr_bypass is From 23ff954059fa90e00861f94e8603aa4a958ff7df Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 7 Jan 2025 14:00:01 +1100 Subject: [PATCH 03/12] core: Change bperm to a simpler and slower implementation This does bperm in the bitsort unit instead of the logical unit, and no longer tries to do it in a single cycle with eight 64-to-1 multiplexers. Instead it is now a state machine in the bitsort unit, takes 8 cycles, and only has one 64-to-1 multiplexer. This helps improve timing and reduces LUT usage. Signed-off-by: Paul Mackerras --- bitsort.vhdl | 47 ++++++++++++++++++++++++++++++++++++++++++++++- decode2.vhdl | 2 +- execute1.vhdl | 27 ++++++++++++++++++++++++--- logical.vhdl | 13 ------------- 4 files changed, 71 insertions(+), 18 deletions(-) diff --git a/bitsort.vhdl b/bitsort.vhdl index f2aeddb..01b34b5 100644 --- a/bitsort.vhdl +++ b/bitsort.vhdl @@ -1,5 +1,6 @@ -- Implements instructions that involve sorting bits, -- that is, cfuged, pextd and pdepd. +-- Also does bperm, which is somewhat different. 
-- -- cfuged: Sort the bits in the mask in RB into 0s at the left, 1s at the right -- and move the bits in RS in the same fashion to give the result @@ -7,6 +8,7 @@ -- corresponding bit in RB is 1 -- pdepd: Inverse of pextd; take the low-order bits of RS and spread them out -- to the bit positions which have a 1 in RB +-- bperm: Select 8 arbitrary bits -- NB opc is bits 7-6 of the instruction: -- 00 = pdepd, 01 = pextd, 10 = cfuged @@ -27,6 +29,8 @@ entity bit_sorter is go : in std_ulogic; opc : in std_ulogic_vector(1 downto 0); done : out std_ulogic; + do_bperm : in std_ulogic; + bperm_done : out std_ulogic; result : out std_ulogic_vector(63 downto 0) ); end entity bit_sorter; @@ -45,6 +49,13 @@ architecture behaviour of bit_sorter is signal sr_vl : std_ulogic_vector(63 downto 0); signal sr_vr : std_ulogic_vector(63 downto 0); + signal is_bperm : std_ulogic; + signal bpc : unsigned(2 downto 0); + signal bp_done : std_ulogic; + signal bperm_res : std_ulogic_vector(7 downto 0); + signal rs_sr : std_ulogic_vector(63 downto 0); + signal rb_bp : std_ulogic_vector(63 downto 0); + begin bsort_r: process(clk) begin @@ -96,7 +107,41 @@ begin end if; end process; + -- bit permutation: an index byte of 64 or more (bits 7:6 non-zero) yields 0, as the old logical.vhdl code did + bperm_res(7) <= 'X' when is_X(rs_sr) else '0' when rs_sr(7 downto 6) /= "00" + else rb_bp(to_integer(unsigned(not rs_sr(5 downto 0)))); + + bperm_r: process(clk) + begin + if rising_edge(clk) then + if rst = '1' then + is_bperm <= '0'; + bp_done <= '0'; + bperm_res(6 downto 0) <= (others => '0'); + bpc <= to_unsigned(0, 3); + elsif do_bperm = '1' then + is_bperm <= '1'; + bp_done <= '0'; + bperm_res(6 downto 0) <= (others => '0'); + bpc <= to_unsigned(0, 3); + rs_sr <= rs; + rb_bp <= rb; + elsif bp_done = '1' then + is_bperm <= '0'; + bp_done <= '0'; + elsif is_bperm = '1' then + bperm_res(6 downto 0) <= bperm_res(7 downto 1); + rs_sr <= x"00" & rs_sr(63 downto 8); + if bpc = "110" then + bp_done <= '1'; + end if; + bpc <= bpc + 1; + end if; + end if; + end process; + done <= sd; - result <= val; + bperm_done <= bp_done; + 
result <= val when is_bperm = '0' else (56x"0" & bperm_res); end behaviour; diff --git a/decode2.vhdl b/decode2.vhdl index fd7434c..432426d 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -227,7 +227,6 @@ architecture behaviour of decode2 is OP_PRTY => "001", OP_CMPB => "001", OP_EXTS => "001", - OP_BPERM => "001", OP_BREV => "001", OP_BCD => "001", OP_MTSPR => "001", @@ -256,6 +255,7 @@ architecture behaviour of decode2 is OP_DIVE => "101", OP_MOD => "101", OP_BSORT => "100", + OP_BPERM => "100", OP_ADDG6S => "001", -- misc_result OP_ISEL => "010", OP_DARN => "011", diff --git a/execute1.vhdl b/execute1.vhdl index f218ab8..08bc694 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -116,6 +116,7 @@ architecture behaviour of execute1 is start_mul : std_ulogic; start_div : std_ulogic; start_bsort : std_ulogic; + start_bperm : std_ulogic; do_trace : std_ulogic; ciabr_trace : std_ulogic; fp_intr : std_ulogic; @@ -150,6 +151,7 @@ architecture behaviour of execute1 is mul_finish : std_ulogic; div_in_progress : std_ulogic; bsort_in_progress : std_ulogic; + bperm_in_progress : std_ulogic; no_instr_avail : std_ulogic; instr_dispatch : std_ulogic; ext_interrupt : std_ulogic; @@ -174,7 +176,7 @@ architecture behaviour of execute1 is spr_select => spr_id_init, pmu_spr_num => 5x"0", redir_to_next => '0', advance_nia => '0', lr_from_next => '0', mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', - bsort_in_progress => '0', + bsort_in_progress => '0', bperm_in_progress => '0', no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0', taken_branch_event => '0', br_mispredict => '0', msr => 64x"0", @@ -245,6 +247,8 @@ architecture behaviour of execute1 is -- bit-sort unit signals signal bsort_start : std_ulogic; signal bsort_done : std_ulogic; + signal bperm_start : std_ulogic; + signal bperm_done : std_ulogic; -- random number generator signals signal random_raw : std_ulogic_vector(63 downto 0); @@ -515,6 +519,8 @@ begin go => bsort_start, opc => 
e_in.insn(7 downto 6), done => bsort_done, + do_bperm => bperm_start, + bperm_done => bperm_done, result => bsort_result ); @@ -1228,7 +1234,7 @@ begin when OP_CMPRB => when OP_CMPEQB => when OP_LOGIC | OP_XOR | OP_PRTY | OP_CMPB | OP_EXTS | - OP_BPERM | OP_BREV | OP_BCD => + OP_BREV | OP_BCD => when OP_B => v.take_branch := '1'; @@ -1433,6 +1439,11 @@ begin slow_op := '1'; owait := '1'; + when OP_BPERM => + v.start_bperm := '1'; + slow_op := '1'; + owait := '1'; + when OP_MUL_L64 => if e_in.is_32bit = '1' then v.se.mult_32s := '1'; @@ -1718,6 +1729,7 @@ begin x_to_divider.valid <= actions.start_div; v.div_in_progress := actions.start_div; v.bsort_in_progress := actions.start_bsort; + v.bperm_in_progress := actions.start_bperm; v.br_mispredict := v.e.redirect and actions.direct_branch; v.advance_nia := actions.advance_nia; v.redir_to_next := actions.redir_to_next; @@ -1728,7 +1740,8 @@ begin -- multiply is happening in order to stop following -- instructions from using the wrong XER value -- (and for simplicity in the OE=0 case). - v.busy := actions.start_div or actions.start_mul or actions.start_bsort; + v.busy := actions.start_div or actions.start_mul or + actions.start_bsort or actions.start_bperm; -- instruction for other units, i.e. 
LDST if e_in.unit = LDST then @@ -1740,6 +1753,7 @@ begin end if; is_scv := go and actions.se.scv_trap; bsort_start <= go and actions.start_bsort; + bperm_start <= go and actions.start_bperm; pmu_trace <= go and actions.do_trace; if not HAS_FPU and ex1.div_in_progress = '1' then @@ -1780,6 +1794,13 @@ begin v.e.write_data := alu_result; bypass_valid := bsort_done; end if; + if ex1.bperm_in_progress = '1' then + v.bperm_in_progress := not bperm_done; + v.e.valid := bperm_done; + v.busy := not bperm_done; + v.e.write_data := alu_result; + bypass_valid := bperm_done; + end if; if v.e.write_xerc_enable = '1' and v.e.valid = '1' then v.xerc := v.e.xerc; diff --git a/logical.vhdl b/logical.vhdl index 2d139f8..792a896 100644 --- a/logical.vhdl +++ b/logical.vhdl @@ -23,7 +23,6 @@ architecture behaviour of logical is signal par0, par1 : std_ulogic; signal parity : std_ulogic_vector(63 downto 0); - signal permute : std_ulogic_vector(7 downto 0); function bcd_to_dpd(bcd: std_ulogic_vector(11 downto 0)) return std_ulogic_vector is variable dpd: std_ulogic_vector(9 downto 0); @@ -109,16 +108,6 @@ begin parity(32) <= par1; end if; - -- bit permutation - for i in 0 to 7 loop - j := i * 8; - if rs(j+7 downto j+6) = "00" then - permute(i) <= rb(to_integer(unsigned(not rs(j+5 downto j)))); - else - permute(i) <= '0'; - end if; - end loop; - rb_adj := rb; if invert_in = '1' then rb_adj := not rb; @@ -157,8 +146,6 @@ begin tmp := parity; when OP_CMPB => tmp := ppc_cmpb(rs, rb); - when OP_BPERM => - tmp := std_ulogic_vector(resize(unsigned(permute), 64)); when OP_BCD => -- invert_in is abused to indicate direction of conversion if invert_in = '0' then From 9a06b0c18295115365e310ab2df2fc73cc8cfa09 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 4 Jan 2025 16:24:41 +1100 Subject: [PATCH 04/12] soc: Implement multiple CPU cores This adds an 'NCPUS' generic parameter to the soc module, which then includes that many CPU cores. 
The cores have separate addresses on the DMI interconnect, meaning that external JTAG debug tools can view and control the state of each core individually. The syscon module has a new 'cpu_ctrl' register, where byte 0 contains individual enable bits for each core, and byte 1 indicates the number of cores. If a core's enable bit is clear, the core is held in reset. On system reset, the enable byte is set to 0x01, so only core 0 is active. Signed-off-by: Paul Mackerras --- include/microwatt_soc.h | 3 +- soc.vhdl | 108 +++++++++++++++++++++------------------- syscon.vhdl | 16 +++++- 3 files changed, 74 insertions(+), 53 deletions(-) diff --git a/include/microwatt_soc.h b/include/microwatt_soc.h index 6717b4b..67ea13d 100644 --- a/include/microwatt_soc.h +++ b/include/microwatt_soc.h @@ -65,7 +65,8 @@ #define SYS_REG_UART_IS_16550 (1ull << 32) #define SYS_REG_GIT_INFO 0x50 #define SYS_REG_GIT_IS_DIRTY (1ull << 63) - +#define SYS_REG_CPU_CTRL 0x58 +#define SYS_REG_CPU_CTRL_ENABLE 0xff /* * Register definitions for the potato UART diff --git a/soc.vhdl b/soc.vhdl index 3e3b438..0ed234d 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -67,6 +67,7 @@ entity soc is RAM_INIT_FILE : string; CLK_FREQ : positive; SIM : boolean; + NCPUS : positive := 1; HAS_FPU : boolean := true; HAS_BTC : boolean := true; DISABLE_FLATTEN_CORE : boolean := false; @@ -148,20 +149,18 @@ end entity soc; architecture behaviour of soc is + subtype cpu_index_t is natural range 0 to NCPUS-1; + type dword_percpu_array is array(cpu_index_t) of std_ulogic_vector(63 downto 0); + -- internal reset signal soc_reset : std_ulogic; -- Wishbone master signals: - signal wishbone_dcore_in : wishbone_slave_out; - signal wishbone_dcore_out : wishbone_master_out; - signal wishbone_icore_in : wishbone_slave_out; - signal wishbone_icore_out : wishbone_master_out; - signal wishbone_debug_in : wishbone_slave_out; - signal wishbone_debug_out : wishbone_master_out; - - -- Arbiter array (ghdl doesnt' support assigning the array - -- 
elements in the entity instantiation) - constant NUM_WB_MASTERS : positive := 4; + signal wishbone_debug_in : wishbone_slave_out; + signal wishbone_debug_out : wishbone_master_out; + + -- Arbiter array + constant NUM_WB_MASTERS : positive := NCPUS * 2 + 2; signal wb_masters_out : wishbone_master_out_vector(0 to NUM_WB_MASTERS-1); signal wb_masters_in : wishbone_slave_out_vector(0 to NUM_WB_MASTERS-1); @@ -180,7 +179,7 @@ architecture behaviour of soc is -- Syscon signals signal dram_at_0 : std_ulogic; - signal do_core_reset : std_ulogic; + signal do_core_reset : std_ulogic_vector(NCPUS-1 downto 0); signal alt_reset : std_ulogic; signal wb_syscon_in : wb_io_master_out; signal wb_syscon_out : wb_io_slave_out; @@ -210,7 +209,7 @@ architecture behaviour of soc is signal wb_xics_ics_out : wb_io_slave_out; signal int_level_in : std_ulogic_vector(15 downto 0); signal ics_to_icp : ics_to_icp_t; - signal core_ext_irq : std_ulogic; + signal core_ext_irq : std_ulogic_vector(NCPUS-1 downto 0) := (others => '0'); -- GPIO signals: signal wb_gpio_in : wb_io_master_out; @@ -233,12 +232,12 @@ architecture behaviour of soc is signal dmi_wb_dout : std_ulogic_vector(63 downto 0); signal dmi_wb_req : std_ulogic; signal dmi_wb_ack : std_ulogic; - signal dmi_core_dout : std_ulogic_vector(63 downto 0); - signal dmi_core_req : std_ulogic; - signal dmi_core_ack : std_ulogic; + signal dmi_core_dout : dword_percpu_array; + signal dmi_core_req : std_ulogic_vector(NCPUS-1 downto 0); + signal dmi_core_ack : std_ulogic_vector(NCPUS-1 downto 0); -- Delayed/latched resets and alt_reset - signal rst_core : std_ulogic; + signal rst_core : std_ulogic_vector(NCPUS-1 downto 0); signal rst_uart : std_ulogic; signal rst_xics : std_ulogic; signal rst_spi : std_ulogic; @@ -270,6 +269,8 @@ architecture behaviour of soc is signal io_cycle_gpio : std_ulogic; signal io_cycle_external : std_ulogic; + signal core_run_out : std_ulogic_vector(NCPUS-1 downto 0); + function wishbone_widen_data(wb : wb_io_master_out) 
return wishbone_master_out is variable wwb : wishbone_master_out; begin @@ -334,7 +335,9 @@ begin resets: process(system_clk) begin if rising_edge(system_clk) then - rst_core <= soc_reset or do_core_reset; + for i in 0 to NCPUS-1 loop + rst_core(i) <= soc_reset or do_core_reset(i); + end loop; rst_uart <= soc_reset; rst_spi <= soc_reset; rst_xics <= soc_reset; @@ -347,11 +350,12 @@ begin end if; end process; - -- Processor core - processor: entity work.core + -- Processor cores + processors: for i in 0 to NCPUS-1 generate + core: entity work.core generic map( SIM => SIM, - CPU_INDEX => 0, + CPU_INDEX => i, HAS_FPU => HAS_FPU, HAS_BTC => HAS_BTC, DISABLE_FLATTEN => DISABLE_FLATTEN_CORE, @@ -367,32 +371,31 @@ begin ) port map( clk => system_clk, - rst => rst_core, + rst => rst_core(i), alt_reset => alt_reset_d, - run_out => run_out, - wishbone_insn_in => wishbone_icore_in, - wishbone_insn_out => wishbone_icore_out, - wishbone_data_in => wishbone_dcore_in, - wishbone_data_out => wishbone_dcore_out, + run_out => core_run_out(i), + wishbone_insn_in => wb_masters_in(i + NCPUS), + wishbone_insn_out => wb_masters_out(i + NCPUS), + wishbone_data_in => wb_masters_in(i), + wishbone_data_out => wb_masters_out(i), wb_snoop_in => wb_snoop, dmi_addr => dmi_addr(3 downto 0), - dmi_dout => dmi_core_dout, + dmi_dout => dmi_core_dout(i), dmi_din => dmi_dout, dmi_wr => dmi_wr, - dmi_ack => dmi_core_ack, - dmi_req => dmi_core_req, - ext_irq => core_ext_irq + dmi_ack => dmi_core_ack(i), + dmi_req => dmi_core_req(i), + ext_irq => core_ext_irq(i) ); + end generate; + + run_out <= or (core_run_out); -- Wishbone bus master arbiter & mux - wb_masters_out <= (0 => wishbone_dcore_out, - 1 => wishbone_icore_out, - 2 => wishbone_widen_data(wishbone_dma_out), - 3 => wishbone_debug_out); - wishbone_dcore_in <= wb_masters_in(0); - wishbone_icore_in <= wb_masters_in(1); - wishbone_dma_in <= wishbone_narrow_data(wb_masters_in(2), wishbone_dma_out.adr); - wishbone_debug_in <= wb_masters_in(3); + 
wb_masters_out(2*NCPUS) <= wishbone_widen_data(wishbone_dma_out); + wb_masters_out(2*NCPUS + 1) <= wishbone_debug_out; + wishbone_dma_in <= wishbone_narrow_data(wb_masters_in(2*NCPUS), wishbone_dma_out.adr); + wishbone_debug_in <= wb_masters_in(2*NCPUS + 1); wishbone_arbiter_0: entity work.wishbone_arbiter generic map( NUM_MASTERS => NUM_WB_MASTERS @@ -780,6 +783,7 @@ begin -- Syscon slave syscon0: entity work.syscon generic map( + NCPUS => NCPUS, HAS_UART => true, HAS_DRAM => HAS_DRAM, BRAM_SIZE => MEMORY_SIZE, @@ -950,7 +954,7 @@ begin wb_in => wb_xics_icp_in, wb_out => wb_xics_icp_out, ics_in => ics_to_icp, - core_irq_out => core_ext_irq + core_irq_out => core_ext_irq(0) ); xics_ics: entity work.xics_ics @@ -1034,15 +1038,15 @@ begin ); -- DMI interconnect - dmi_intercon: process(dmi_addr, dmi_req, - dmi_wb_ack, dmi_wb_dout, - dmi_core_ack, dmi_core_dout) + dmi_intercon: process(all) -- DMI address map (each address is a full 64-bit register) -- -- Offset: Size: Slave: -- 0 4 Wishbone - -- 10 16 Core + -- 10 16 Core 0 + -- 20 16 Core 1 + -- ... 
and so on for NCPUS cores type slave_type is (SLAVE_WB, SLAVE_CORE, @@ -1053,25 +1057,29 @@ begin slave := SLAVE_NONE; if std_match(dmi_addr, "000000--") then slave := SLAVE_WB; - elsif std_match(dmi_addr, "0001----") then + elsif not is_X(dmi_addr) and to_integer(unsigned(dmi_addr(7 downto 4))) <= NCPUS then slave := SLAVE_CORE; end if; -- DMI muxing dmi_wb_req <= '0'; - dmi_core_req <= '0'; + dmi_core_req <= (others => '0'); + dmi_din <= (others => '1'); + dmi_ack <= dmi_req; case slave is when SLAVE_WB => dmi_wb_req <= dmi_req; dmi_ack <= dmi_wb_ack; dmi_din <= dmi_wb_dout; when SLAVE_CORE => - dmi_core_req <= dmi_req; - dmi_ack <= dmi_core_ack; - dmi_din <= dmi_core_dout; + for i in 0 to NCPUS-1 loop + if not is_X(dmi_addr) and to_integer(unsigned(dmi_addr(7 downto 4))) = i + 1 then + dmi_core_req(i) <= dmi_req; + dmi_ack <= dmi_core_ack(i); + dmi_din <= dmi_core_dout(i); + end if; + end loop; when others => - dmi_ack <= dmi_req; - dmi_din <= (others => '1'); end case; -- SIM magic exit diff --git a/syscon.vhdl b/syscon.vhdl index 99fa835..98990d1 100644 --- a/syscon.vhdl +++ b/syscon.vhdl @@ -9,6 +9,7 @@ use work.wishbone_types.all; entity syscon is generic ( + NCPUS : positive := 1; SIG_VALUE : std_ulogic_vector(63 downto 0) := x"f00daa5500010001"; CLK_FREQ : integer; HAS_UART : boolean; @@ -33,7 +34,7 @@ entity syscon is -- System control ports dram_at_0 : out std_ulogic; - core_reset : out std_ulogic; + core_reset : out std_ulogic_vector(NCPUS-1 downto 0); soc_reset : out std_ulogic; alt_reset : out std_ulogic ); @@ -56,6 +57,7 @@ architecture behaviour of syscon is constant SYS_REG_UART0_INFO : std_ulogic_vector(SYS_REG_BITS-1 downto 0) := "001000"; constant SYS_REG_UART1_INFO : std_ulogic_vector(SYS_REG_BITS-1 downto 0) := "001001"; constant SYS_REG_GIT_INFO : std_ulogic_vector(SYS_REG_BITS-1 downto 0) := "001010"; + constant SYS_REG_CPU_CTRL : std_ulogic_vector(SYS_REG_BITS-1 downto 0) := "001011"; -- Muxed reg read signal signal reg_out : 
std_ulogic_vector(63 downto 0); @@ -116,6 +118,7 @@ architecture behaviour of syscon is signal reg_uart0info : std_ulogic_vector(63 downto 0); signal reg_uart1info : std_ulogic_vector(63 downto 0); signal reg_gitinfo : std_ulogic_vector(63 downto 0); + signal reg_cpuctrl : std_ulogic_vector(63 downto 0); signal info_has_dram : std_ulogic; signal info_has_bram : std_ulogic; signal info_has_uart : std_ulogic; @@ -134,7 +137,8 @@ begin -- Generated output signals dram_at_0 <= '1' when BRAM_SIZE = 0 else reg_ctrl(SYS_REG_CTRL_DRAM_AT_0); soc_reset <= reg_ctrl(SYS_REG_CTRL_SOC_RESET); - core_reset <= reg_ctrl(SYS_REG_CTRL_CORE_RESET); + core_reset <= not reg_cpuctrl(NCPUS-1 downto 0) when reg_ctrl(SYS_REG_CTRL_CORE_RESET) = '0' + else (others => '1'); alt_reset <= reg_ctrl(SYS_REG_CTRL_ALT_RESET); @@ -187,6 +191,8 @@ begin 55 downto 0 => GIT_HASH, others => '0'); + reg_cpuctrl(63 downto 8) <= std_ulogic_vector(to_unsigned(NCPUS, 56)); + -- Wishbone response wb_rsp.ack <= wishbone_in.cyc and wishbone_in.stb; with wishbone_in.adr(SYS_REG_BITS downto 1) select reg_out <= @@ -201,6 +207,7 @@ begin reg_uart0info when SYS_REG_UART0_INFO, reg_uart1info when SYS_REG_UART1_INFO, reg_gitinfo when SYS_REG_GIT_INFO, + reg_cpuctrl when SYS_REG_CPU_CTRL, (others => '0') when others; wb_rsp.dat <= reg_out(63 downto 32) when wishbone_in.adr(0) = '1' else reg_out(31 downto 0); @@ -225,6 +232,7 @@ begin if (rst) then reg_ctrl <= (SYS_REG_CTRL_ALT_RESET => ctrl_init_alt_reset, others => '0'); + reg_cpuctrl(7 downto 0) <= x"01"; -- enable cpu 0 only else if wishbone_in.cyc and wishbone_in.stb and wishbone_in.we then -- Change this if CTRL ever has more than 32 bits @@ -233,6 +241,10 @@ begin reg_ctrl(SYS_REG_CTRL_BITS-1 downto 0) <= wishbone_in.dat(SYS_REG_CTRL_BITS-1 downto 0); end if; + if wishbone_in.adr(SYS_REG_BITS downto 1) = SYS_REG_CPU_CTRL and + wishbone_in.adr(0) = '0' and wishbone_in.sel(0) = '1' then + reg_cpuctrl(7 downto 0) <= wishbone_in.dat(7 downto 0); + end if; end if; -- 
Reset auto-clear From e0c5af9bb13ff024ee194e2bd832d53ec9174c43 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 4 Jan 2025 17:16:14 +1100 Subject: [PATCH 05/12] mw_debug: Add -c flag to select which CPU core to address Signed-off-by: Paul Mackerras --- scripts/mw_debug/mw_debug.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/scripts/mw_debug/mw_debug.c b/scripts/mw_debug/mw_debug.c index d7966d9..1a0b96b 100644 --- a/scripts/mw_debug/mw_debug.c +++ b/scripts/mw_debug/mw_debug.c @@ -24,28 +24,30 @@ #define DBG_WB_DATA 0x01 #define DBG_WB_CTRL 0x02 -#define DBG_CORE_CTRL 0x10 +unsigned int core; + +#define DBG_CORE_CTRL (0x10 + (core << 4)) #define DBG_CORE_CTRL_STOP (1 << 0) #define DBG_CORE_CTRL_RESET (1 << 1) #define DBG_CORE_CTRL_ICRESET (1 << 2) #define DBG_CORE_CTRL_STEP (1 << 3) #define DBG_CORE_CTRL_START (1 << 4) -#define DBG_CORE_STAT 0x11 +#define DBG_CORE_STAT (0x11 + (core << 4)) #define DBG_CORE_STAT_STOPPING (1 << 0) #define DBG_CORE_STAT_STOPPED (1 << 1) #define DBG_CORE_STAT_TERM (1 << 2) -#define DBG_CORE_NIA 0x12 -#define DBG_CORE_MSR 0x13 +#define DBG_CORE_NIA (0x12 + (core << 4)) +#define DBG_CORE_MSR (0x13 + (core << 4)) -#define DBG_CORE_GSPR_INDEX 0x14 -#define DBG_CORE_GSPR_DATA 0x15 +#define DBG_CORE_GSPR_INDEX (0x14 + (core << 4)) +#define DBG_CORE_GSPR_DATA (0x15 + (core << 4)) -#define DBG_LOG_ADDR 0x16 -#define DBG_LOG_DATA 0x17 -#define DBG_LOG_TRIGGER 0x18 -#define DBG_LOG_MTRIGGER 0x19 +#define DBG_LOG_ADDR (0x16 + (core << 4)) +#define DBG_LOG_DATA (0x17 + (core << 4)) +#define DBG_LOG_TRIGGER (0x18 + (core << 4)) +#define DBG_LOG_MTRIGGER (0x19 + (core << 4)) static bool debug; @@ -507,7 +509,7 @@ static void core_status(void) statstr2 = " (terminated)"; } else if (stat & DBG_CORE_STAT_TERM) statstr = "odd state (TERM but no STOP)"; - printf("Core: %s%s\n", statstr, statstr2); + printf("Core%u: %s%s\n", core, statstr, statstr2); printf(" NIA: %016" PRIx64 "\n", nia); 
printf(" MSR: %016" PRIx64 "\n", msr); } @@ -792,7 +794,7 @@ static void mtrig_set(uint64_t addr) static void usage(const char *cmd) { - fprintf(stderr, "Usage: %s -b \n", cmd); + fprintf(stderr, "Usage: %s -b [-c core#] \n", cmd); fprintf(stderr, "\n"); fprintf(stderr, " CPU core:\n"); @@ -851,12 +853,20 @@ int main(int argc, char *argv[]) { "target", required_argument, 0, 't' }, { "debug", no_argument, 0, 'd' }, { "frequency", no_argument, 0, 's' }, + { "core", required_argument, 0, 'c' }, { 0, 0, 0, 0 } }; - c = getopt_long(argc, argv, "dhb:t:s:", lopts, &oindex); + c = getopt_long(argc, argv, "dhb:t:s:c:", lopts, &oindex); if (c < 0) break; switch(c) { + case 'c': + core = atoi(optarg); + if (core >= 15) { + fprintf(stderr, "Core number out of range (max 14)\n"); + exit(1); + } + break; case 'h': usage(progname); break; From 49fcbaa5b232ec9d2df1804f9e4261f08cf02580 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 9 Jan 2025 19:27:58 +1100 Subject: [PATCH 06/12] soc: Implement a global timebase across all cores Now all cores see the same timebase value at any given instant. 
Signed-off-by: Paul Mackerras --- common.vhdl | 1 - core.vhdl | 4 ++++ execute1.vhdl | 16 ++++++++-------- soc.vhdl | 18 ++++++++++++++++++ 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/common.vhdl b/common.vhdl index 1c8642b..31793a8 100644 --- a/common.vhdl +++ b/common.vhdl @@ -264,7 +264,6 @@ package common is type ctrl_t is record wait_state: std_ulogic; run: std_ulogic; - tb: std_ulogic_vector(63 downto 0); dec: std_ulogic_vector(63 downto 0); msr: std_ulogic_vector(63 downto 0); cfar: std_ulogic_vector(63 downto 0); diff --git a/core.vhdl b/core.vhdl index 187e176..bf0708e 100644 --- a/core.vhdl +++ b/core.vhdl @@ -31,6 +31,9 @@ entity core is -- Alternate reset (0xffff0000) for use by DRAM init fw alt_reset : in std_ulogic; + -- Global timebase + timebase : in std_ulogic_vector(63 downto 0); + -- Wishbone interface wishbone_insn_in : in wishbone_slave_out; wishbone_insn_out : out wishbone_master_out; @@ -373,6 +376,7 @@ begin port map ( clk => clk, rst => rst_ex1, + timebase => timebase, flush_in => flush, busy_out => ex1_busy_out, e_in => decode2_to_execute1, diff --git a/execute1.vhdl b/execute1.vhdl index 08bc694..c5b2dc4 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -34,6 +34,8 @@ entity execute1 is ext_irq_in : std_ulogic; interrupt_in : WritebackToExecute1Type; + timebase : std_ulogic_vector(63 downto 0); + -- asynchronous l_out : out Execute1ToLoadstore1Type; fp_out : out Execute1ToFPUType; @@ -1901,8 +1903,8 @@ begin -- Slow SPR read mux with ex1.spr_select.sel select spr_result <= - ctrl.tb when SPRSEL_TB, - 32x"0" & ctrl.tb(63 downto 32) when SPRSEL_TBU, + timebase when SPRSEL_TB, + 32x"0" & timebase(63 downto 32) when SPRSEL_TBU, ctrl.dec when SPRSEL_DEC, 32x"0" & PVR_MICROWATT when SPRSEL_PVR, log_wr_addr & ex2.log_addr_spr when SPRSEL_LOGA, @@ -1956,16 +1958,14 @@ begin end if; ctrl_tmp <= ctrl; - -- FIXME: run at 512MHz not core freq - ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1); ctrl_tmp.dec <= 
std_ulogic_vector(unsigned(ctrl.dec) - 1); x_to_pmu.mfspr <= '0'; x_to_pmu.mtspr <= '0'; - x_to_pmu.tbbits(3) <= ctrl.tb(63 - 47); - x_to_pmu.tbbits(2) <= ctrl.tb(63 - 51); - x_to_pmu.tbbits(1) <= ctrl.tb(63 - 55); - x_to_pmu.tbbits(0) <= ctrl.tb(63 - 63); + x_to_pmu.tbbits(3) <= timebase(63 - 47); + x_to_pmu.tbbits(2) <= timebase(63 - 51); + x_to_pmu.tbbits(1) <= timebase(63 - 55); + x_to_pmu.tbbits(0) <= timebase(63 - 63); x_to_pmu.pmm_msr <= ctrl.msr(MSR_PMM); x_to_pmu.pr_msr <= ctrl.msr(MSR_PR); diff --git a/soc.vhdl b/soc.vhdl index 0ed234d..36f34e9 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -271,6 +271,8 @@ architecture behaviour of soc is signal core_run_out : std_ulogic_vector(NCPUS-1 downto 0); + signal timebase : std_ulogic_vector(63 downto 0); + function wishbone_widen_data(wb : wb_io_master_out) return wishbone_master_out is variable wwb : wishbone_master_out; begin @@ -350,6 +352,21 @@ begin end if; end process; + -- Timebase just increments at the system clock frequency. + -- There is currently no way to set it. + -- Ideally it would (appear to) run at 512MHz like IBM POWER systems, + -- but Linux seems to cope OK with it being 100MHz or whatever. + tbase: process(system_clk) + begin + if rising_edge(system_clk) then + if soc_reset = '1' then + timebase <= (others => '0'); + else + timebase <= std_ulogic_vector(unsigned(timebase) + 1); + end if; + end if; + end process; + -- Processor cores processors: for i in 0 to NCPUS-1 generate core: entity work.core @@ -374,6 +391,7 @@ begin rst => rst_core(i), alt_reset => alt_reset_d, run_out => core_run_out(i), + timebase => timebase, wishbone_insn_in => wb_masters_in(i + NCPUS), wishbone_insn_out => wb_masters_out(i + NCPUS), wishbone_data_in => wb_masters_in(i), From 3924ed0f494b01f4006ccdb26d150e807895cd02 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 9 Jan 2025 09:47:29 +1100 Subject: [PATCH 07/12] xics: Implement a presentation controller per CPU core This is mainly in order to get IPIs. 
All external interrupts still go to CPU 0 for now. Signed-off-by: Paul Mackerras --- soc.vhdl | 5 +- xics.vhdl | 199 ++++++++++++++++++++++++++++++------------------------ 2 files changed, 116 insertions(+), 88 deletions(-) diff --git a/soc.vhdl b/soc.vhdl index 36f34e9..fff3591 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -966,13 +966,16 @@ begin end generate; xics_icp: entity work.xics_icp + generic map( + NCPUS => NCPUS + ) port map( clk => system_clk, rst => rst_xics, wb_in => wb_xics_icp_in, wb_out => wb_xics_icp_out, ics_in => ics_to_icp, - core_irq_out => core_ext_irq(0) + core_irq_out => core_ext_irq ); xics_ics: entity work.xics_ics diff --git a/xics.vhdl b/xics.vhdl index 62faf77..4e1f29c 100644 --- a/xics.vhdl +++ b/xics.vhdl @@ -25,6 +25,9 @@ use work.common.all; use work.wishbone_types.all; entity xics_icp is + generic ( + NCPUS : natural := 1 + ); port ( clk : in std_logic; rst : in std_logic; @@ -33,32 +36,41 @@ entity xics_icp is wb_out : out wb_io_slave_out; ics_in : in ics_to_icp_t; - core_irq_out : out std_ulogic + core_irq_out : out std_ulogic_vector(NCPUS-1 downto 0) ); end xics_icp; architecture behaviour of xics_icp is - type reg_internal_t is record + type xics_presentation_t is record xisr : std_ulogic_vector(23 downto 0); cppr : std_ulogic_vector(7 downto 0); mfrr : std_ulogic_vector(7 downto 0); irq : std_ulogic; + end record; + constant xics_presentation_t_init : xics_presentation_t := + (mfrr => x"ff", -- mask everything on reset + irq => '0', + others => (others => '0')); + subtype cpu_index_t is natural range 0 to NCPUS-1; + type xicp_array_t is array(cpu_index_t) of xics_presentation_t; + + type reg_internal_t is record + icp : xicp_array_t; wb_rd_data : std_ulogic_vector(31 downto 0); wb_ack : std_ulogic; end record; constant reg_internal_init : reg_internal_t := (wb_ack => '0', - mfrr => x"ff", -- mask everything on reset - irq => '0', - others => (others => '0')); + wb_rd_data => (others => '0'), + icp => (others => 
xics_presentation_t_init)); signal r, r_next : reg_internal_t; - -- 8 bit offsets for each presentation - constant XIRR_POLL : std_ulogic_vector(7 downto 0) := x"00"; - constant XIRR : std_ulogic_vector(7 downto 0) := x"04"; - constant RESV0 : std_ulogic_vector(7 downto 0) := x"08"; - constant MFRR : std_ulogic_vector(7 downto 0) := x"0c"; + -- 4 bit offsets for each presentation register + constant XIRR_POLL : std_ulogic_vector(3 downto 0) := x"0"; + constant XIRR : std_ulogic_vector(3 downto 0) := x"4"; + constant RESV0 : std_ulogic_vector(3 downto 0) := x"8"; + constant MFRR : std_ulogic_vector(3 downto 0) := x"c"; begin @@ -68,7 +80,9 @@ begin r <= r_next; -- We delay core_irq_out by a cycle to help with timing - core_irq_out <= r.irq; + for i in 0 to NCPUS-1 loop + core_irq_out(i) <= r.icp(i).irq; + end loop; end if; end process; @@ -99,94 +113,105 @@ begin v.wb_ack := '0'; - xirr_accept_rd := '0'; - be_in := bswap(wb_in.dat); be_out := (others => '0'); - if wb_in.cyc = '1' and wb_in.stb = '1' then v.wb_ack := '1'; -- always ack - if wb_in.we = '1' then -- write - -- writes to both XIRR are the same - case wb_in.adr(5 downto 0) & "00" is - when XIRR_POLL => - report "ICP XIRR_POLL write"; - v.cppr := be_in(31 downto 24); - when XIRR => - v.cppr := be_in(31 downto 24); - if wb_in.sel = x"f" then -- 4 byte - report "ICP XIRR write word (EOI) :" & to_hstring(be_in); - elsif wb_in.sel = x"1" then -- 1 byte - report "ICP XIRR write byte (CPPR):" & to_hstring(be_in(31 downto 24)); - else - report "ICP XIRR UNSUPPORTED write ! sel=" & to_hstring(wb_in.sel); - end if; - when MFRR => - v.mfrr := be_in(31 downto 24); - if wb_in.sel = x"f" then -- 4 bytes - report "ICP MFRR write word:" & to_hstring(be_in); - elsif wb_in.sel = x"1" then -- 1 byte - report "ICP MFRR write byte:" & to_hstring(be_in(31 downto 24)); - else - report "ICP MFRR UNSUPPORTED write ! 
sel=" & to_hstring(wb_in.sel); - end if; - when others => - end case; - - else -- read - - case wb_in.adr(5 downto 0) & "00" is - when XIRR_POLL => - report "ICP XIRR_POLL read"; - be_out := r.cppr & r.xisr; - when XIRR => - report "ICP XIRR read"; - be_out := r.cppr & r.xisr; - if wb_in.sel = x"f" then - xirr_accept_rd := '1'; - end if; - when MFRR => - report "ICP MFRR read"; - be_out(31 downto 24) := r.mfrr; - when others => - end case; - end if; end if; - pending_priority := x"ff"; - v.xisr := x"000000"; - v.irq := '0'; + for i in cpu_index_t loop + xirr_accept_rd := '0'; + + if wb_in.cyc = '1' and wb_in.stb = '1' and + to_integer(unsigned(wb_in.adr(5 downto 2))) = i then + if wb_in.we = '1' then -- write + -- writes to both XIRR are the same + case wb_in.adr(1 downto 0) & "00" is + when XIRR_POLL => + report "ICP XIRR_POLL write"; + v.icp(i).cppr := be_in(31 downto 24); + when XIRR => + v.icp(i).cppr := be_in(31 downto 24); + if wb_in.sel = x"f" then -- 4 byte + report "ICP " & natural'image(i) & " XIRR write word (EOI) :" & + to_hstring(be_in); + elsif wb_in.sel = x"1" then -- 1 byte + report "ICP " & natural'image(i) & " XIRR write byte (CPPR):" & + to_hstring(be_in(31 downto 24)); + else + report "ICP " & natural'image(i) & " XIRR UNSUPPORTED write ! sel=" & + to_hstring(wb_in.sel); + end if; + when MFRR => + v.icp(i).mfrr := be_in(31 downto 24); + if wb_in.sel = x"f" then -- 4 bytes + report "ICP " & natural'image(i) & " MFRR write word:" & + to_hstring(be_in); + elsif wb_in.sel = x"1" then -- 1 byte + report "ICP " & natural'image(i) & " MFRR write byte:" & + to_hstring(be_in(31 downto 24)); + else + report "ICP " & natural'image(i) & " MFRR UNSUPPORTED write ! 
sel=" & + to_hstring(wb_in.sel); + end if; + when others => + end case; + + else -- read + + case wb_in.adr(1 downto 0) & "00" is + when XIRR_POLL => + report "ICP XIRR_POLL read"; + be_out := r.icp(i).cppr & r.icp(i).xisr; + when XIRR => + report "ICP XIRR read"; + be_out := r.icp(i).cppr & r.icp(i).xisr; + if wb_in.sel = x"f" then + xirr_accept_rd := '1'; + end if; + when MFRR => + report "ICP MFRR read"; + be_out(31 downto 24) := r.icp(i).mfrr; + when others => + end case; + end if; + end if; - if ics_in.pri /= x"ff" then - v.xisr := x"00001" & ics_in.src; - pending_priority := ics_in.pri; - end if; + pending_priority := x"ff"; + v.icp(i).xisr := x"000000"; + v.icp(i).irq := '0'; - -- Check MFRR - if unsigned(r.mfrr) < unsigned(pending_priority) then -- - v.xisr := x"000002"; -- special XICS MFRR IRQ source number - pending_priority := r.mfrr; - end if; + if i = 0 and ics_in.pri /= x"ff" then + v.icp(i).xisr := x"00001" & ics_in.src; + pending_priority := ics_in.pri; + end if; - -- Accept the interrupt - if xirr_accept_rd = '1' then - report "XICS: ICP ACCEPT" & - " cppr:" & to_hstring(r.cppr) & - " xisr:" & to_hstring(r.xisr) & - " mfrr:" & to_hstring(r.mfrr); - v.cppr := pending_priority; - end if; + -- Check MFRR + if unsigned(r.icp(i).mfrr) < unsigned(pending_priority) then -- + v.icp(i).xisr := x"000002"; -- special XICS MFRR IRQ source number + pending_priority := r.icp(i).mfrr; + end if; + + -- Accept the interrupt + if xirr_accept_rd = '1' then + report "XICS " & natural'image(i) & ": ICP ACCEPT" & + " cppr:" & to_hstring(r.icp(i).cppr) & + " xisr:" & to_hstring(r.icp(i).xisr) & + " mfrr:" & to_hstring(r.icp(i).mfrr); + v.icp(i).cppr := pending_priority; + end if; - v.wb_rd_data := bswap(be_out); + v.wb_rd_data := bswap(be_out); - if unsigned(pending_priority) < unsigned(v.cppr) then - if r.irq = '0' then - report "IRQ set"; + if unsigned(pending_priority) < unsigned(v.icp(i).cppr) then + if r.icp(i).irq = '0' then + report "CPU " & natural'image(i) & " 
IRQ set"; + end if; + v.icp(i).irq := '1'; + elsif r.icp(i).irq = '1' then + report "CPU " & natural'image(i) & " IRQ clr"; end if; - v.irq := '1'; - elsif r.irq = '1' then - report "IRQ clr"; - end if; + end loop; if rst = '1' then v := reg_internal_init; From 9bd6b3d17509c6cb57e9c07f9b034bdecb36009c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 11 Jan 2025 17:04:27 +1100 Subject: [PATCH 08/12] xics: Implement destination server field in interrupt source registers This implements the server field in the XIVEs (external interrupt vector entries, the per-source registers of the ICS), allowing each interrupt source to be directed to a particular CPU. If the CPU number that is written is out of range, CPU 0 is used. Signed-off-by: Paul Mackerras --- common.vhdl | 6 ++-- soc.vhdl | 1 + xics.vhdl | 94 +++++++++++++++++++++++++++++++++-------------------- 3 files changed, 63 insertions(+), 38 deletions(-) diff --git a/common.vhdl b/common.vhdl index 31793a8..46db202 100644 --- a/common.vhdl +++ b/common.vhdl @@ -252,12 +252,14 @@ package common is -- For now, fixed 16 sources, make this either a parametric -- package of some sort or an unconstrainted array. + -- We don't know NCPUS or SRC_NUM here, so make this + -- large enough for 4 cpus and 16 interrupt sources for now. type ics_to_icp_t is record -- Level interrupts only, ICS just keeps prsenting the -- highest priority interrupt. Once handling edge, something -- smarter involving handshake & reject support will be needed - src : std_ulogic_vector(3 downto 0); - pri : std_ulogic_vector(7 downto 0); + src : std_ulogic_vector(15 downto 0); -- 4 bits each for 4 cpus + pri : std_ulogic_vector(31 downto 0); -- 8 bits each for 4 cpus end record; -- This needs to die... 
diff --git a/soc.vhdl b/soc.vhdl index fff3591..b3d03b7 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -980,6 +980,7 @@ begin xics_ics: entity work.xics_ics generic map( + NCPUS => NCPUS, SRC_NUM => 16, PRIO_BITS => 3 ) diff --git a/xics.vhdl b/xics.vhdl index 4e1f29c..b999e65 100644 --- a/xics.vhdl +++ b/xics.vhdl @@ -181,9 +181,9 @@ begin v.icp(i).xisr := x"000000"; v.icp(i).irq := '0'; - if i = 0 and ics_in.pri /= x"ff" then - v.icp(i).xisr := x"00001" & ics_in.src; - pending_priority := ics_in.pri; + if ics_in.pri(8*i + 7 downto 8*i) /= x"ff" then + v.icp(i).xisr := x"00001" & ics_in.src(4*i + 3 downto 4*i); + pending_priority := ics_in.pri(8*i + 7 downto 8*i); end if; -- Check MFRR @@ -235,6 +235,7 @@ use work.helpers.all; entity xics_ics is generic ( + NCPUS : natural := 1; SRC_NUM : integer range 1 to 256 := 16; PRIO_BITS : integer range 1 to 8 := 3 ); @@ -253,10 +254,13 @@ end xics_ics; architecture rtl of xics_ics is constant SRC_NUM_BITS : natural := log2(SRC_NUM); + constant SERVER_NUM_BITS : natural := 2; subtype pri_t is std_ulogic_vector(PRIO_BITS-1 downto 0); + subtype server_t is unsigned(SERVER_NUM_BITS-1 downto 0); type xive_t is record pri : pri_t; + server : server_t; end record; constant pri_masked : pri_t := (others => '1'); @@ -333,6 +337,16 @@ architecture rtl of xics_ics is return p(nbits - 1 downto 0); end function; + function server_check(serv_in: std_ulogic_vector(7 downto 0)) return unsigned is + variable srv : server_t; + begin + srv := to_unsigned(0, SERVER_NUM_BITS); + if to_integer(unsigned(serv_in)) < NCPUS then + srv := unsigned(serv_in(SERVER_NUM_BITS - 1 downto 0)); + end if; + return srv; + end; + -- Register map -- 0 : Config -- 4 : Debug/diagnostics @@ -391,16 +405,14 @@ begin be_out := (others => '0'); if reg_is_xive = '1' then - be_out := int_level_l(reg_idx) & - '0' & - int_level_l(reg_idx) & - '0' & - x"00000" & - prio_unpack(xives(reg_idx).pri); + be_out(31) := int_level_l(reg_idx); + be_out(29) := int_level_l(reg_idx); + 
be_out(8 + SERVER_NUM_BITS - 1 downto 8) := std_ulogic_vector(xives(reg_idx).server); + be_out(7 downto 0) := prio_unpack(xives(reg_idx).pri); elsif reg_is_config = '1' then be_out := get_config; elsif reg_is_debug = '1' then - be_out := x"00000" & icp_out_next.src & icp_out_next.pri; + be_out := icp_out_next.src & icp_out_next.pri(15 downto 0); end if; wb_out.dat <= bswap(be_out); wb_out.ack <= wb_valid; @@ -414,17 +426,20 @@ begin if rising_edge(clk) then if rst = '1' then for i in 0 to SRC_NUM - 1 loop - xives(i) <= (pri => pri_masked); + xives(i) <= (pri => pri_masked, server => to_unsigned(0, SERVER_NUM_BITS)); end loop; elsif wb_valid = '1' and wb_in.we = '1' then -- Byteswapped input be_in := bswap(wb_in.dat); if reg_is_xive then - -- TODO: When adding support for other bits, make sure to - -- properly implement wb_in.sel to allow partial writes. - xives(reg_idx).pri <= prio_pack(be_in(7 downto 0)); - report "ICS irq " & integer'image(reg_idx) & - " set to:" & to_hstring(be_in(7 downto 0)); + if wb_in.sel(3) = '1' then + xives(reg_idx).pri <= prio_pack(be_in(7 downto 0)); + report "ICS irq " & integer'image(reg_idx) & + " set to pri:" & to_hstring(be_in(7 downto 0)); + end if; + if wb_in.sel(2) = '1' then + xives(reg_idx).server <= server_check(be_in(15 downto 8)); + end if; end if; end if; end if; @@ -449,29 +464,36 @@ begin variable pending_pri : pri_vector_t; variable pending_at_pri : std_ulogic_vector(SRC_NUM - 1 downto 0); begin - -- Work out the most-favoured (lowest) priority of the pending interrupts - pending_pri := (others => '0'); - for i in 0 to SRC_NUM - 1 loop - if int_level_l(i) = '1' then - pending_pri := pending_pri or prio_decode(xives(i).pri); - end if; - end loop; - max_pri := priority_encoder(pending_pri, PRIO_BITS); + icp_out_next.src <= (others => '0'); + icp_out_next.pri <= (others => '0'); + for cpu in 0 to NCPUS-1 loop + -- Work out the most-favoured (lowest) priority of the interrupts + -- that are pending and directed to this cpu 
+ pending_pri := (others => '0'); + for i in 0 to SRC_NUM - 1 loop + if int_level_l(i) = '1' and to_integer(xives(i).server) = cpu then + pending_pri := pending_pri or prio_decode(xives(i).pri); + end if; + end loop; + max_pri := priority_encoder(pending_pri, PRIO_BITS); + + -- Work out which interrupts are pending at that priority + pending_at_pri := (others => '0'); + for i in 0 to SRC_NUM - 1 loop + if int_level_l(i) = '1' and xives(i).pri = max_pri and + to_integer(xives(i).server) = cpu then + pending_at_pri(i) := '1'; + end if; + end loop; + max_idx := priority_encoder(pending_at_pri, SRC_NUM_BITS); - -- Work out which interrupts are pending at that priority - pending_at_pri := (others => '0'); - for i in 0 to SRC_NUM - 1 loop - if int_level_l(i) = '1' and xives(i).pri = max_pri then - pending_at_pri(i) := '1'; + if max_pri /= pri_masked then + report "MFI: " & integer'image(to_integer(unsigned(max_idx))) & " pri=" & to_hstring(prio_unpack(max_pri)) & + " srv=" & integer'image(cpu); end if; + icp_out_next.src(4*cpu + 3 downto 4*cpu) <= max_idx; + icp_out_next.pri(8*cpu + 7 downto 8*cpu) <= prio_unpack(max_pri); end loop; - max_idx := priority_encoder(pending_at_pri, SRC_NUM_BITS); - - if max_pri /= pri_masked then - report "MFI: " & integer'image(to_integer(unsigned(max_idx))) & " pri=" & to_hstring(prio_unpack(max_pri)); - end if; - icp_out_next.src <= max_idx; - icp_out_next.pri <= prio_unpack(max_pri); end process; end architecture rtl; From bf55efec6d52884d8d3984b6700e3d063fc4de8c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 10 Jan 2025 11:21:43 +1100 Subject: [PATCH 09/12] Arty A7: Add an option to select the number of CPU cores Timing is currently not very good with 2 cores on the Arty A7-100. 
Signed-off-by: Paul Mackerras --- fpga/top-arty.vhdl | 2 ++ microwatt.core | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/fpga/top-arty.vhdl b/fpga/top-arty.vhdl index c3be9d9..6e524f0 100644 --- a/fpga/top-arty.vhdl +++ b/fpga/top-arty.vhdl @@ -10,6 +10,7 @@ use work.wishbone_types.all; entity toplevel is generic ( + CPUS : natural := 1; MEMORY_SIZE : integer := 16384; RAM_INIT_FILE : string := "firmware.hex"; RESET_LOW : boolean := true; @@ -241,6 +242,7 @@ begin MEMORY_SIZE => BRAM_SIZE, RAM_INIT_FILE => RAM_INIT_FILE, SIM => false, + NCPUS => CPUS, CLK_FREQ => CLK_FREQUENCY, HAS_FPU => HAS_FPU, HAS_BTC => HAS_BTC, diff --git a/microwatt.core b/microwatt.core index f56bee0..ed2aa01 100644 --- a/microwatt.core +++ b/microwatt.core @@ -335,6 +335,7 @@ targets: default_tool: vivado filesets: [core, arty_a7, soc, fpga, debug_xilinx, litedram, liteeth, uart16550, xilinx_specific, litesdcard] parameters: + - cpus - memory_size - ram_init_file - use_litedram=true @@ -496,6 +497,12 @@ generate: parameters: {vendor : xilinx, frequency : 100e6} parameters: + cpus: + datatype : int + description : Number of CPU cores to include in the SoC. + paramtype : generic + default : 1 + memory_size: datatype : int description : On-chip memory size (bytes). If no_bram is set, this is the size carved out for the DRAM payload From d1c7b654bb1c8f2614cd80e3b0e0b70561da93f6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sun, 5 Jan 2025 20:52:06 +1100 Subject: [PATCH 10/12] wishbone_arbiter: Remove early_sel optimization when > 4 masters For the sake of overall timing in larger SoCs, remove the early_sel optimization when there are more than 4 masters. Also make the ack and stall signals to a particular master depend on that master's cyc, not on the busy signal, which can depend on any master's cyc. 
Signed-off-by: Paul Mackerras --- wishbone_arbiter.vhdl | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/wishbone_arbiter.vhdl b/wishbone_arbiter.vhdl index cb632bf..a6daec6 100644 --- a/wishbone_arbiter.vhdl +++ b/wishbone_arbiter.vhdl @@ -4,7 +4,6 @@ use ieee.std_logic_1164.all; library work; use work.wishbone_types.all; --- TODO: Use an array of master/slaves with parametric size entity wishbone_arbiter is generic( NUM_MASTERS : positive := 3 @@ -28,18 +27,23 @@ begin busy <= wb_masters_in(selected).cyc; - wishbone_muxes: process(selected, candidate, busy, wb_slave_in, wb_masters_in) + wishbone_muxes: process(all) variable early_sel : wb_arb_master_t; begin early_sel := selected; - if busy = '0' then + if NUM_MASTERS <= 4 and busy = '0' then early_sel := candidate; end if; wb_slave_out <= wb_masters_in(early_sel); for i in 0 to NUM_MASTERS-1 loop wb_masters_out(i).dat <= wb_slave_in.dat; - wb_masters_out(i).ack <= wb_slave_in.ack when early_sel = i else '0'; - wb_masters_out(i).stall <= wb_slave_in.stall when early_sel = i else '1'; + if early_sel = i and wb_masters_in(i).cyc = '1' then + wb_masters_out(i).ack <= wb_slave_in.ack; + wb_masters_out(i).stall <= wb_slave_in.stall; + else + wb_masters_out(i).ack <= '0'; + wb_masters_out(i).stall <= '1'; + end if; end loop; end process; From d8423568b6e1b1781216af9301f4b51476a9ef37 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 10 Jan 2025 16:53:27 +1100 Subject: [PATCH 11/12] core: Evaluate rotator control signals in decode2 Hopefully this improves timing a bit. 
Signed-off-by: Paul Mackerras --- common.vhdl | 7 +++++++ decode2.vhdl | 8 ++++++++ execute1.vhdl | 21 +++++---------------- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/common.vhdl b/common.vhdl index 46db202..7d31b67 100644 --- a/common.vhdl +++ b/common.vhdl @@ -440,6 +440,11 @@ package common is illegal_form : std_ulogic; uses_tar : std_ulogic; uses_dscr : std_ulogic; + right_shift : std_ulogic; + rot_clear_left : std_ulogic; + rot_clear_right : std_ulogic; + rot_sign_ext : std_ulogic; + do_popcnt : std_ulogic; end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => ALU, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init, @@ -462,6 +467,8 @@ package common is dec_ctr => '0', prefixed => '0', prefix => (others => '0'), illegal_suffix => '0', misaligned_prefix => '0', illegal_form => '0', uses_tar => '0', uses_dscr => '0', + right_shift => '0', rot_clear_left => '0', rot_clear_right => '0', rot_sign_ext => '0', + do_popcnt => '0', others => (others => '0')); type MultiplyInputType is record diff --git a/decode2.vhdl b/decode2.vhdl index 432426d..da2fbb3 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -673,6 +673,14 @@ begin v.e.illegal_suffix := d_in.illegal_suffix; v.e.misaligned_prefix := d_in.misaligned_prefix; + -- rotator control signals + v.e.right_shift := '1' when op = OP_SHR else '0'; + v.e.rot_clear_left := '1' when op = OP_RLC or op = OP_RLCL else '0'; + v.e.rot_clear_right := '1' when op = OP_RLC or op = OP_RLCR else '0'; + v.e.rot_sign_ext := '1' when op = OP_EXTSWSLI else '0'; + + v.e.do_popcnt := '1' when op = OP_COUNTB and d_in.insn(7 downto 6) = "11" else '0'; + -- check for invalid forms that cause an illegal instruction interrupt -- Does RA = RT for a load quadword instr, or RB = RT for lqarx? 
if d_in.decode.repeat = DRTP and diff --git a/execute1.vhdl b/execute1.vhdl index c5b2dc4..b9ad9ad 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -210,12 +210,9 @@ architecture behaviour of execute1 is signal valid_in : std_ulogic; signal ctrl: ctrl_t := ctrl_t_init; signal ctrl_tmp: ctrl_t := ctrl_t_init; - signal right_shift, rot_clear_left, rot_clear_right: std_ulogic; - signal rot_sign_ext: std_ulogic; signal rotator_result: std_ulogic_vector(63 downto 0); signal rotator_carry: std_ulogic; signal logical_result: std_ulogic_vector(63 downto 0); - signal do_popcnt: std_ulogic; signal countbits_result: std_ulogic_vector(63 downto 0); signal alu_result: std_ulogic_vector(63 downto 0); signal adder_result: std_ulogic_vector(63 downto 0); @@ -454,11 +451,11 @@ begin shift => b_in(6 downto 0), insn => e_in.insn, is_32bit => e_in.is_32bit, - right_shift => right_shift, + right_shift => e_in.right_shift, arith => e_in.is_signed, - clear_left => rot_clear_left, - clear_right => rot_clear_right, - sign_ext_rs => rot_sign_ext, + clear_left => e_in.rot_clear_left, + clear_right => e_in.rot_clear_right, + sign_ext_rs => e_in.rot_sign_ext, result => rotator_result, carry_out => rotator_carry ); @@ -482,7 +479,7 @@ begin stall => stage2_stall, count_right => e_in.insn(10), is_32bit => e_in.is_32bit, - do_popcnt => do_popcnt, + do_popcnt => e_in.do_popcnt, datalen => e_in.data_len, result => countbits_result ); @@ -1648,14 +1645,6 @@ begin irq_valid := ex1.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in); - -- rotator control signals - right_shift <= '1' when e_in.insn_type = OP_SHR else '0'; - rot_clear_left <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0'; - rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0'; - rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0'; - - do_popcnt <= '1' when e_in.insn_type = OP_COUNTB and e_in.insn(7 downto 6) = "11" else '0'; - if valid_in = '1' then 
v.prev_op := e_in.insn_type; v.prev_prefixed := e_in.prefixed; From 0a2d3b6f58e24a63916645dc1669fd0c7773869b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 17 Jan 2025 21:47:26 +1100 Subject: [PATCH 12/12] loadstore1: Split DAWR check across a clock edge Instead of doing the address subtractions and subsequent logic for DAWR hit detection in the second cycle of a load or store, this does the subtractions in the first cycle and the remaining logic in the second cycle. This improves timing. Signed-off-by: Paul Mackerras --- loadstore1.vhdl | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 0816931..6d59fb3 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -52,6 +52,8 @@ architecture behave of loadstore1 is MMU_WAIT -- waiting for MMU to finish doing something ); + constant num_dawr : positive := 2; + type byte_index_t is array(0 to 7) of unsigned(2 downto 0); subtype byte_trim_t is std_ulogic_vector(1 downto 0); type trim_ctl_t is array(0 to 7) of byte_trim_t; @@ -130,6 +132,9 @@ architecture behave of loadstore1 is busy : std_ulogic; issued : std_ulogic; addr0 : std_ulogic_vector(63 downto 0); + dawr_ll : std_ulogic_vector(num_dawr-1 downto 0); + dawr_ul : std_ulogic_vector(num_dawr-1 downto 0); + dawr_ud : std_ulogic; end record; type reg_stage2_t is record @@ -147,7 +152,6 @@ architecture behave of loadstore1 is dbg_spr_ack: std_ulogic; end record; - constant num_dawr : positive := 2; type dawr_array_t is array(0 to num_dawr - 1) of std_ulogic_vector(63 downto 3); type dawrx_array_t is array(0 to num_dawr - 1) of std_ulogic_vector(15 downto 0); @@ -335,6 +339,9 @@ begin r1.req.sprsel <= "000"; r1.req.ric <= "00"; r1.req.xerc <= xerc_init; + r1.dawr_ll <= (others => '0'); + r1.dawr_ul <= (others => '0'); + r1.dawr_ud <= '0'; r2.req.valid <= '0'; r2.busy <= '0'; @@ -617,6 +624,9 @@ begin variable req : request_t; variable dcreq : std_ulogic; variable issue : 
std_ulogic; + variable addr : std_ulogic_vector(63 downto 3); + variable addl : unsigned(64 downto 3); + variable addu : unsigned(64 downto 3); begin v := r1; issue := '0'; @@ -661,6 +671,20 @@ begin end if; end if; + -- Do subtractions for DAWR0/1 matches + for i in 0 to 1 loop + addr := req.addr(63 downto 3); + if req.priv_mode = '1' and r3.dawrx(i)(7) = '1' then + -- HRAMMC=1 => trim top bit from address + addr(63) := '0'; + end if; + addl := unsigned('0' & addr) - unsigned('0' & r3.dawr(i)); + addu := unsigned('0' & r3.dawr_uplim(i)) - unsigned('0' & addr); + v.dawr_ll(i) := addl(64); + v.dawr_ul(i) := addu(64); + end loop; + v.dawr_ud := r3.dawr_upd; + if flush = '1' then v.req.valid := '0'; v.req.dc_req := '0'; @@ -702,9 +726,6 @@ begin variable sprsel : std_ulogic_vector(2 downto 0); variable sprval : std_ulogic_vector(63 downto 0); variable dawr_match : std_ulogic; - variable addr : std_ulogic_vector(63 downto 3); - variable addl : unsigned(64 downto 3); - variable addu : unsigned(64 downto 3); begin v := r2; @@ -724,14 +745,7 @@ begin -- Test for DAWR0/1 matches dawr_match := '0'; for i in 0 to 1 loop - addr := r1.req.addr(63 downto 3); - if r1.req.priv_mode = '1' and r3.dawrx(i)(7) = '1' then - -- HRAMMC=1 => trim top bit from address - addr(63) := '0'; - end if; - addl := unsigned('0' & addr) - unsigned('0' & r3.dawr(i)); - addu := unsigned('0' & r3.dawr_uplim(i)) - unsigned('0' & addr); - if addl(64) = '0' and addu(64) = '0' and + if r1.dawr_ll(i) = '0' and r1.dawr_ul(i) = '0' and r1.dawr_ud = '0' and dawrx_match_enable(r3.dawrx(i), r1.req.virt_mode, r1.req.priv_mode, r1.req.store) then dawr_match := r1.req.valid and r1.req.dc_req and not r3.dawr_upd and