diff --git a/bitsort.vhdl b/bitsort.vhdl index f2aeddb..01b34b5 100644 --- a/bitsort.vhdl +++ b/bitsort.vhdl @@ -1,5 +1,6 @@ -- Implements instructions that involve sorting bits, -- that is, cfuged, pextd and pdepd. +-- Also does bperm, which is somewhat different. -- -- cfuged: Sort the bits in the mask in RB into 0s at the left, 1s at the right -- and move the bits in RS in the same fashion to give the result @@ -7,6 +8,7 @@ -- corresponding bit in RB is 1 -- pdepd: Inverse of pextd; take the low-order bits of RS and spread them out -- to the bit positions which have a 1 in RB +-- bperm: Select 8 arbitrary bits of RB, one per index byte of RS (index bytes >= 64 give 0) -- NB opc is bits 7-6 of the instruction: -- 00 = pdepd, 01 = pextd, 10 = cfuged @@ -27,6 +29,8 @@ entity bit_sorter is go : in std_ulogic; opc : in std_ulogic_vector(1 downto 0); done : out std_ulogic; + do_bperm : in std_ulogic; + bperm_done : out std_ulogic; result : out std_ulogic_vector(63 downto 0) ); end entity bit_sorter; @@ -45,6 +49,13 @@ architecture behaviour of bit_sorter is signal sr_vl : std_ulogic_vector(63 downto 0); signal sr_vr : std_ulogic_vector(63 downto 0); + signal is_bperm : std_ulogic; + signal bpc : unsigned(2 downto 0); -- shift-step counter; bp_done is raised when it reads 6 + signal bp_done : std_ulogic; + signal bperm_res : std_ulogic_vector(7 downto 0); + signal rs_sr : std_ulogic_vector(63 downto 0); -- RS, shifted right by one index byte per step + signal rb_bp : std_ulogic_vector(63 downto 0); -- RB as latched when do_bperm is asserted + begin bsort_r: process(clk) begin @@ -96,7 +107,44 @@ begin end if; end process; + -- bit permutation + -- The low byte of rs_sr is the current index byte; it selects bit (63 - index) of RB. + -- An index of 64 or more (bit 7 or 6 set) selects no bit and must yield 0. + bperm_res(7) <= 'X' when is_X(rs_sr) else + '0' when rs_sr(7 downto 6) /= "00" else + rb_bp(to_integer(unsigned(not rs_sr(5 downto 0)))); + + bperm_r: process(clk) + begin + if rising_edge(clk) then + if rst = '1' then + is_bperm <= '0'; + bp_done <= '0'; + bperm_res(6 downto 0) <= (others => '0'); + bpc <= to_unsigned(0, 3); + elsif do_bperm = '1' then + is_bperm <= '1'; + bp_done <= '0'; + bperm_res(6 downto 0) <= (others => '0'); + bpc <= to_unsigned(0, 3); + rs_sr <= rs; + rb_bp <= rb; + elsif bp_done = '1' then + is_bperm <= '0'; + bp_done <= '0'; + elsif is_bperm 
= '1' then + bperm_res(6 downto 0) <= bperm_res(7 downto 1); + rs_sr <= x"00" & rs_sr(63 downto 8); + if bpc = "110" then + bp_done <= '1'; + end if; + bpc <= bpc + 1; + end if; + end if; + end process; + done <= sd; - result <= val; + bperm_done <= bp_done; + result <= val when is_bperm = '0' else (56x"0" & bperm_res); end behaviour; diff --git a/common.vhdl b/common.vhdl index 1c8642b..7d31b67 100644 --- a/common.vhdl +++ b/common.vhdl @@ -252,19 +252,20 @@ package common is -- For now, fixed 16 sources, make this either a parametric -- package of some sort or an unconstrainted array. + -- We don't know NCPUS or SRC_NUM here, so make this + -- large enough for 4 cpus and 16 interrupt sources for now. type ics_to_icp_t is record -- Level interrupts only, ICS just keeps prsenting the -- highest priority interrupt. Once handling edge, something -- smarter involving handshake & reject support will be needed - src : std_ulogic_vector(3 downto 0); - pri : std_ulogic_vector(7 downto 0); + src : std_ulogic_vector(15 downto 0); -- 4 bits each for 4 cpus + pri : std_ulogic_vector(31 downto 0); -- 8 bits each for 4 cpus end record; -- This needs to die... 
type ctrl_t is record wait_state: std_ulogic; run: std_ulogic; - tb: std_ulogic_vector(63 downto 0); dec: std_ulogic_vector(63 downto 0); msr: std_ulogic_vector(63 downto 0); cfar: std_ulogic_vector(63 downto 0); @@ -439,6 +440,11 @@ package common is illegal_form : std_ulogic; uses_tar : std_ulogic; uses_dscr : std_ulogic; + right_shift : std_ulogic; -- right_shift and the rot_* fields below are the rotator control inputs + rot_clear_left : std_ulogic; + rot_clear_right : std_ulogic; + rot_sign_ext : std_ulogic; -- last of the rotator controls; do_popcnt below feeds the bit counter + do_popcnt : std_ulogic; end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => ALU, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init, @@ -461,6 +467,8 @@ package common is dec_ctr => '0', prefixed => '0', prefix => (others => '0'), illegal_suffix => '0', misaligned_prefix => '0', illegal_form => '0', uses_tar => '0', uses_dscr => '0', + right_shift => '0', rot_clear_left => '0', rot_clear_right => '0', rot_sign_ext => '0', + do_popcnt => '0', others => (others => '0')); type MultiplyInputType is record diff --git a/control.vhdl b/control.vhdl index e8c8068..b75fcc1 100644 --- a/control.vhdl +++ b/control.vhdl @@ -45,9 +45,13 @@ entity control is valid_out : out std_ulogic; stopped_out : out std_ulogic; - gpr_bypass_a : out std_ulogic_vector(1 downto 0); - gpr_bypass_b : out std_ulogic_vector(1 downto 0); - gpr_bypass_c : out std_ulogic_vector(1 downto 0); + -- Note on gpr_bypass_*: bits 1 to 3 are a 1-hot encoding of which + -- bypass source we may possibly need to use; bit 0 is 1 if the bypass + -- value should be used (i.e. any of bits 1-3 are 1 and the + -- corresponding gpr_x_read_valid_in is also 1). 
+ gpr_bypass_a : out std_ulogic_vector(3 downto 0); + gpr_bypass_b : out std_ulogic_vector(3 downto 0); + gpr_bypass_c : out std_ulogic_vector(3 downto 0); cr_bypass : out std_ulogic_vector(1 downto 0); instr_tag_out : out instr_tag_t @@ -152,9 +156,9 @@ begin variable tag_s : instr_tag_t; variable tag_t : instr_tag_t; variable incr_tag : tag_number_t; - variable byp_a : std_ulogic_vector(1 downto 0); - variable byp_b : std_ulogic_vector(1 downto 0); - variable byp_c : std_ulogic_vector(1 downto 0); + variable byp_a : std_ulogic_vector(3 downto 0); + variable byp_b : std_ulogic_vector(3 downto 0); + variable byp_c : std_ulogic_vector(3 downto 0); variable tag_cr : instr_tag_t; variable byp_cr : std_ulogic_vector(1 downto 0); variable tag_ov : instr_tag_t; @@ -163,57 +167,66 @@ begin tag_a := instr_tag_init; for i in tag_number_t loop if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_a_read_in then - tag_a.valid := gpr_a_read_valid_in; + tag_a.valid := '1'; tag_a.tag := i; end if; end loop; tag_b := instr_tag_init; for i in tag_number_t loop if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_b_read_in then - tag_b.valid := gpr_b_read_valid_in; + tag_b.valid := '1'; tag_b.tag := i; end if; end loop; tag_c := instr_tag_init; for i in tag_number_t loop if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_c_read_in then - tag_c.valid := gpr_c_read_valid_in; + tag_c.valid := '1'; tag_c.tag := i; end if; end loop; - byp_a := "00"; + byp_a := "0000"; if EX1_BYPASS and tag_match(execute_next_tag, tag_a) then - byp_a := "01"; - elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_a) then - byp_a := "10"; - elsif tag_match(complete_in, tag_a) then - byp_a := "11"; + byp_a(1) := '1'; end if; - byp_b := "00"; + if EX1_BYPASS and tag_match(execute2_next_tag, tag_a) then + byp_a(2) := '1'; -- bit 2: operand A's tag matches execute2_next_tag + end if; + if tag_match(complete_in, tag_a) then + byp_a(3) := '1'; -- bit 3: operand A's tag matches complete_in + end if; + byp_a(0) := 
gpr_a_read_valid_in and (byp_a(1) or byp_a(2) or byp_a(3)); -- bit 0: use a bypass value only if operand A is actually read + byp_b := "0000"; if EX1_BYPASS and tag_match(execute_next_tag, tag_b) then - byp_b := "01"; - elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_b) then - byp_b := "10"; - elsif tag_match(complete_in, tag_b) then - byp_b := "11"; + byp_b(1) := '1'; + end if; + if EX1_BYPASS and tag_match(execute2_next_tag, tag_b) then + byp_b(2) := '1'; end if; - byp_c := "00"; + if tag_match(complete_in, tag_b) then + byp_b(3) := '1'; + end if; + byp_b(0) := gpr_b_read_valid_in and (byp_b(1) or byp_b(2) or byp_b(3)); + byp_c := "0000"; if EX1_BYPASS and tag_match(execute_next_tag, tag_c) then - byp_c := "01"; - elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_c) then - byp_c := "10"; - elsif tag_match(complete_in, tag_c) then - byp_c := "11"; + byp_c(1) := '1'; + end if; + if EX1_BYPASS and tag_match(execute2_next_tag, tag_c) then + byp_c(2) := '1'; + end if; + if tag_match(complete_in, tag_c) then + byp_c(3) := '1'; end if; + byp_c(0) := gpr_c_read_valid_in and (byp_c(1) or byp_c(2) or byp_c(3)); gpr_bypass_a <= byp_a; gpr_bypass_b <= byp_b; gpr_bypass_c <= byp_c; - gpr_tag_stall <= (tag_a.valid and not (or (byp_a))) or - (tag_b.valid and not (or (byp_b))) or - (tag_c.valid and not (or (byp_c))); + gpr_tag_stall <= (tag_a.valid and gpr_a_read_valid_in and not byp_a(0)) or -- stall when an operand that is read has a pending writer and no bypass + (tag_b.valid and gpr_b_read_valid_in and not byp_b(0)) or + (tag_c.valid and gpr_c_read_valid_in and not byp_c(0)); incr_tag := curr_tag; instr_tag.tag <= curr_tag; diff --git a/core.vhdl b/core.vhdl index 187e176..bf0708e 100644 --- a/core.vhdl +++ b/core.vhdl @@ -31,6 +31,9 @@ entity core is -- Alternate reset (0xffff0000) for use by DRAM init fw alt_reset : in std_ulogic; + -- Global timebase + timebase : in std_ulogic_vector(63 downto 0); + -- Wishbone interface wishbone_insn_in : in wishbone_slave_out; wishbone_insn_out : out wishbone_master_out; @@ -373,6 +376,7 @@ begin port map ( clk => clk, rst => rst_ex1, + timebase => timebase, 
flush_in => flush, busy_out => ex1_busy_out, e_in => decode2_to_execute1, diff --git a/decode2.vhdl b/decode2.vhdl index cc241a2..da2fbb3 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -201,6 +201,23 @@ architecture behaviour of decode2 is end case; end; + function andor (mask_a : std_ulogic; val_a : std_ulogic_vector(63 downto 0); + mask_b : std_ulogic; val_b : std_ulogic_vector(63 downto 0); + mask_c : std_ulogic; val_c : std_ulogic_vector(63 downto 0)) return std_ulogic_vector is + variable t : std_ulogic_vector(63 downto 0) := (others => '0'); -- OR of the values whose mask is '1'; stays all-zero if none is + begin + if mask_a = '1' then + t := val_a; + end if; + if mask_b = '1' then + t := t or val_b; + end if; + if mask_c = '1' then + t := t or val_c; + end if; + return t; + end; + -- control signals that are derived from insn_type type mux_select_array_t is array(insn_type_t) of std_ulogic_vector(2 downto 0); @@ -210,7 +227,6 @@ architecture behaviour of decode2 is OP_PRTY => "001", OP_CMPB => "001", OP_EXTS => "001", - OP_BPERM => "001", OP_BREV => "001", OP_BCD => "001", OP_MTSPR => "001", @@ -239,6 +255,7 @@ architecture behaviour of decode2 is OP_DIVE => "101", OP_MOD => "101", OP_BSORT => "100", + OP_BPERM => "100", OP_ADDG6S => "001", -- misc_result OP_ISEL => "010", OP_DARN => "011", @@ -269,15 +286,15 @@ architecture behaviour of decode2 is signal gpr_a_read_valid : std_ulogic; signal gpr_a_read : gspr_index_t; - signal gpr_a_bypass : std_ulogic_vector(1 downto 0); + signal gpr_a_bypass : std_ulogic_vector(3 downto 0); signal gpr_b_read_valid : std_ulogic; signal gpr_b_read : gspr_index_t; - signal gpr_b_bypass : std_ulogic_vector(1 downto 0); + signal gpr_b_bypass : std_ulogic_vector(3 downto 0); signal gpr_c_read_valid : std_ulogic; signal gpr_c_read : gspr_index_t; - signal gpr_c_bypass : std_ulogic_vector(1 downto 0); + signal gpr_c_bypass : std_ulogic_vector(3 downto 0); signal cr_read_valid : std_ulogic; signal cr_write_valid : std_ulogic; @@ -656,6 +673,14 @@ begin v.e.illegal_suffix := 
d_in.illegal_suffix; v.e.misaligned_prefix := d_in.misaligned_prefix; + -- rotator control signals + v.e.right_shift := '1' when op = OP_SHR else '0'; + v.e.rot_clear_left := '1' when op = OP_RLC or op = OP_RLCL else '0'; + v.e.rot_clear_right := '1' when op = OP_RLC or op = OP_RLCR else '0'; + v.e.rot_sign_ext := '1' when op = OP_EXTSWSLI else '0'; + + v.e.do_popcnt := '1' when op = OP_COUNTB and d_in.insn(7 downto 6) = "11" else '0'; + -- check for invalid forms that cause an illegal instruction interrupt -- Does RA = RT for a load quadword instr, or RB = RT for lqarx? if d_in.decode.repeat = DRTP and @@ -694,53 +719,38 @@ begin ov_write_valid <= v.output_ov; -- See if any of the operands can get their value via the bypass path. - if dc2.busy = '0' or gpr_a_bypass /= "00" then - case gpr_a_bypass is - when "01" => - v.e.read_data1 := execute_bypass.data; - when "10" => - v.e.read_data1 := execute2_bypass.data; - when "11" => - v.e.read_data1 := writeback_bypass.data; - when others => - if decoded_reg_a.reg_valid = '1' then - v.e.read_data1 := r_in.read1_data; - else - v.e.read_data1 := decoded_reg_a.data; - end if; - end case; + if gpr_a_bypass(0) = '1' then -- bit 0 set: take the operand from the bypass source(s) flagged in bits 1-3 + v.e.read_data1 := andor(gpr_a_bypass(1), execute_bypass.data, + gpr_a_bypass(2), execute2_bypass.data, + gpr_a_bypass(3), writeback_bypass.data); + elsif dc2.busy = '0' then + if decoded_reg_a.reg_valid = '1' then + v.e.read_data1 := r_in.read1_data; + else + v.e.read_data1 := decoded_reg_a.data; + end if; end if; - if dc2.busy = '0' or gpr_b_bypass /= "00" then - case gpr_b_bypass is - when "01" => - v.e.read_data2 := execute_bypass.data; - when "10" => - v.e.read_data2 := execute2_bypass.data; - when "11" => - v.e.read_data2 := writeback_bypass.data; - when others => - if decoded_reg_b.reg_valid = '1' then - v.e.read_data2 := r_in.read2_data; - else - v.e.read_data2 := decoded_reg_b.data; - end if; - end case; + if gpr_b_bypass(0) = '1' then + v.e.read_data2 := andor(gpr_b_bypass(1), execute_bypass.data, 
+ gpr_b_bypass(2), execute2_bypass.data, + gpr_b_bypass(3), writeback_bypass.data); + elsif dc2.busy = '0' then + if decoded_reg_b.reg_valid = '1' then + v.e.read_data2 := r_in.read2_data; + else + v.e.read_data2 := decoded_reg_b.data; + end if; end if; - if dc2.busy = '0' or gpr_c_bypass /= "00" then - case gpr_c_bypass is - when "01" => - v.e.read_data3 := execute_bypass.data; - when "10" => - v.e.read_data3 := execute2_bypass.data; - when "11" => - v.e.read_data3 := writeback_bypass.data; - when others => - if decoded_reg_c.reg_valid = '1' then - v.e.read_data3 := r_in.read3_data; - else - v.e.read_data3 := decoded_reg_c.data; - end if; - end case; + if gpr_c_bypass(0) = '1' then + v.e.read_data3 := andor(gpr_c_bypass(1), execute_bypass.data, + gpr_c_bypass(2), execute2_bypass.data, + gpr_c_bypass(3), writeback_bypass.data); + elsif dc2.busy = '0' then + if decoded_reg_c.reg_valid = '1' then + v.e.read_data3 := r_in.read3_data; + else + v.e.read_data3 := decoded_reg_c.data; + end if; end if; case cr_bypass is diff --git a/execute1.vhdl b/execute1.vhdl index a3b9522..b9ad9ad 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -34,6 +34,8 @@ entity execute1 is ext_irq_in : std_ulogic; interrupt_in : WritebackToExecute1Type; + timebase : std_ulogic_vector(63 downto 0); + -- asynchronous l_out : out Execute1ToLoadstore1Type; fp_out : out Execute1ToFPUType; @@ -116,6 +118,7 @@ architecture behaviour of execute1 is start_mul : std_ulogic; start_div : std_ulogic; start_bsort : std_ulogic; + start_bperm : std_ulogic; do_trace : std_ulogic; ciabr_trace : std_ulogic; fp_intr : std_ulogic; @@ -150,6 +153,7 @@ architecture behaviour of execute1 is mul_finish : std_ulogic; div_in_progress : std_ulogic; bsort_in_progress : std_ulogic; + bperm_in_progress : std_ulogic; no_instr_avail : std_ulogic; instr_dispatch : std_ulogic; ext_interrupt : std_ulogic; @@ -174,7 +178,7 @@ architecture behaviour of execute1 is spr_select => spr_id_init, pmu_spr_num => 5x"0", redir_to_next => 
'0', advance_nia => '0', lr_from_next => '0', mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', - bsort_in_progress => '0', + bsort_in_progress => '0', bperm_in_progress => '0', no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0', taken_branch_event => '0', br_mispredict => '0', msr => 64x"0", @@ -206,12 +210,9 @@ architecture behaviour of execute1 is signal valid_in : std_ulogic; signal ctrl: ctrl_t := ctrl_t_init; signal ctrl_tmp: ctrl_t := ctrl_t_init; - signal right_shift, rot_clear_left, rot_clear_right: std_ulogic; - signal rot_sign_ext: std_ulogic; signal rotator_result: std_ulogic_vector(63 downto 0); signal rotator_carry: std_ulogic; signal logical_result: std_ulogic_vector(63 downto 0); - signal do_popcnt: std_ulogic; signal countbits_result: std_ulogic_vector(63 downto 0); signal alu_result: std_ulogic_vector(63 downto 0); signal adder_result: std_ulogic_vector(63 downto 0); @@ -245,6 +246,8 @@ architecture behaviour of execute1 is -- bit-sort unit signals signal bsort_start : std_ulogic; signal bsort_done : std_ulogic; + signal bperm_start : std_ulogic; + signal bperm_done : std_ulogic; -- random number generator signals signal random_raw : std_ulogic_vector(63 downto 0); @@ -448,11 +451,11 @@ begin shift => b_in(6 downto 0), insn => e_in.insn, is_32bit => e_in.is_32bit, - right_shift => right_shift, + right_shift => e_in.right_shift, arith => e_in.is_signed, - clear_left => rot_clear_left, - clear_right => rot_clear_right, - sign_ext_rs => rot_sign_ext, + clear_left => e_in.rot_clear_left, + clear_right => e_in.rot_clear_right, + sign_ext_rs => e_in.rot_sign_ext, result => rotator_result, carry_out => rotator_carry ); @@ -476,7 +479,7 @@ begin stall => stage2_stall, count_right => e_in.insn(10), is_32bit => e_in.is_32bit, - do_popcnt => do_popcnt, + do_popcnt => e_in.do_popcnt, datalen => e_in.data_len, result => countbits_result ); @@ -515,6 +518,8 @@ begin go => bsort_start, opc => e_in.insn(7 downto 6), done => 
bsort_done, + do_bperm => bperm_start, + bperm_done => bperm_done, result => bsort_result ); @@ -1147,7 +1152,7 @@ begin -- side-effect flags or write enables when generating a trap). -- With v.trap = 1 we will assert both ex1.e.valid and ex1.e.interrupt -- to writeback, and it will complete the instruction and take - -- and interrupt. It is OK for v.trap to depend on operand data. + -- an interrupt. It is OK for v.trap to depend on operand data. illegal := '0'; privileged := '0'; @@ -1228,7 +1233,7 @@ begin when OP_CMPRB => when OP_CMPEQB => when OP_LOGIC | OP_XOR | OP_PRTY | OP_CMPB | OP_EXTS | - OP_BPERM | OP_BREV | OP_BCD => + OP_BREV | OP_BCD => when OP_B => v.take_branch := '1'; @@ -1433,6 +1438,11 @@ begin slow_op := '1'; owait := '1'; + when OP_BPERM => + v.start_bperm := '1'; -- started like a bsort op; completion is signalled by bperm_done + slow_op := '1'; + owait := '1'; + when OP_MUL_L64 => if e_in.is_32bit = '1' then v.se.mult_32s := '1'; @@ -1585,7 +1595,7 @@ begin if e_in.unit = ALU then v.complete := e_in.valid and not v.exception and not owait; - v.bypass_valid := e_in.valid and not v.exception and not slow_op; + v.bypass_valid := e_in.valid and not slow_op; end if; actions <= v; @@ -1631,18 +1641,10 @@ begin v.taken_branch_event := '0'; v.br_mispredict := '0'; v.busy := '0'; - bypass_valid := '0'; + bypass_valid := actions.bypass_valid; irq_valid := ex1.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in); - -- rotator control signals - right_shift <= '1' when e_in.insn_type = OP_SHR else '0'; - rot_clear_left <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0'; - rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0'; - rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0'; - - do_popcnt <= '1' when e_in.insn_type = OP_COUNTB and e_in.insn(7 downto 6) = "11" else '0'; - if valid_in = '1' then v.prev_op := e_in.insn_type; v.prev_prefixed := e_in.prefixed; @@ -1706,7 +1708,6 @@ begin if go = '1' then v.se := actions.se; v.e.valid := 
actions.complete; - bypass_valid := actions.bypass_valid; v.taken_branch_event := actions.take_branch; v.trace_next := actions.do_trace or actions.ciabr_trace; v.trace_ciabr := actions.ciabr_trace; @@ -1719,6 +1720,7 @@ begin x_to_divider.valid <= actions.start_div; v.div_in_progress := actions.start_div; v.bsort_in_progress := actions.start_bsort; + v.bperm_in_progress := actions.start_bperm; v.br_mispredict := v.e.redirect and actions.direct_branch; v.advance_nia := actions.advance_nia; v.redir_to_next := actions.redir_to_next; @@ -1729,7 +1731,8 @@ begin -- multiply is happening in order to stop following -- instructions from using the wrong XER value -- (and for simplicity in the OE=0 case). - v.busy := actions.start_div or actions.start_mul or actions.start_bsort; + v.busy := actions.start_div or actions.start_mul or + actions.start_bsort or actions.start_bperm; -- instruction for other units, i.e. LDST if e_in.unit = LDST then @@ -1741,6 +1744,7 @@ begin end if; is_scv := go and actions.se.scv_trap; bsort_start <= go and actions.start_bsort; + bperm_start <= go and actions.start_bperm; pmu_trace <= go and actions.do_trace; if not HAS_FPU and ex1.div_in_progress = '1' then @@ -1781,6 +1785,13 @@ begin v.e.write_data := alu_result; bypass_valid := bsort_done; end if; + if ex1.bperm_in_progress = '1' then -- stay busy until bperm_done, then mark the result valid and bypassable + v.bperm_in_progress := not bperm_done; + v.e.valid := bperm_done; + v.busy := not bperm_done; + v.e.write_data := alu_result; + bypass_valid := bperm_done; + end if; if v.e.write_xerc_enable = '1' and v.e.valid = '1' then v.xerc := v.e.xerc; @@ -1814,13 +1825,13 @@ begin v.fp_exception_next := '0'; end if; - bypass_data.tag.valid <= v.e.write_enable and bypass_valid; - bypass_data.tag.tag <= v.e.instr_tag.tag; + bypass_data.tag.valid <= e_in.write_reg_enable and bypass_valid; -- bypass tag is now taken from the incoming instruction (e_in) rather than from v.e + bypass_data.tag.tag <= e_in.instr_tag.tag; bypass_data.data <= alu_result; - bypass_cr_data.tag.valid <= v.e.write_cr_enable and bypass_valid; - bypass_cr_data.tag.tag <= 
v.e.instr_tag.tag; - bypass_cr_data.data <= v.e.write_cr_data; + bypass_cr_data.tag.valid <= e_in.output_cr and bypass_valid; + bypass_cr_data.tag.tag <= e_in.instr_tag.tag; + bypass_cr_data.data <= write_cr_data; -- Outputs to loadstore1 (async) lv.op := e_in.insn_type; @@ -1881,8 +1892,8 @@ begin -- Slow SPR read mux with ex1.spr_select.sel select spr_result <= - ctrl.tb when SPRSEL_TB, - 32x"0" & ctrl.tb(63 downto 32) when SPRSEL_TBU, + timebase when SPRSEL_TB, + 32x"0" & timebase(63 downto 32) when SPRSEL_TBU, ctrl.dec when SPRSEL_DEC, 32x"0" & PVR_MICROWATT when SPRSEL_PVR, log_wr_addr & ex2.log_addr_spr when SPRSEL_LOGA, @@ -1936,16 +1947,14 @@ begin end if; ctrl_tmp <= ctrl; - -- FIXME: run at 512MHz not core freq - ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1); ctrl_tmp.dec <= std_ulogic_vector(unsigned(ctrl.dec) - 1); x_to_pmu.mfspr <= '0'; x_to_pmu.mtspr <= '0'; - x_to_pmu.tbbits(3) <= ctrl.tb(63 - 47); - x_to_pmu.tbbits(2) <= ctrl.tb(63 - 51); - x_to_pmu.tbbits(1) <= ctrl.tb(63 - 55); - x_to_pmu.tbbits(0) <= ctrl.tb(63 - 63); + x_to_pmu.tbbits(3) <= timebase(63 - 47); + x_to_pmu.tbbits(2) <= timebase(63 - 51); + x_to_pmu.tbbits(1) <= timebase(63 - 55); + x_to_pmu.tbbits(0) <= timebase(63 - 63); x_to_pmu.pmm_msr <= ctrl.msr(MSR_PMM); x_to_pmu.pr_msr <= ctrl.msr(MSR_PR); diff --git a/fpga/top-arty.vhdl b/fpga/top-arty.vhdl index c3be9d9..6e524f0 100644 --- a/fpga/top-arty.vhdl +++ b/fpga/top-arty.vhdl @@ -10,6 +10,7 @@ use work.wishbone_types.all; entity toplevel is generic ( + CPUS : natural := 1; MEMORY_SIZE : integer := 16384; RAM_INIT_FILE : string := "firmware.hex"; RESET_LOW : boolean := true; @@ -241,6 +242,7 @@ begin MEMORY_SIZE => BRAM_SIZE, RAM_INIT_FILE => RAM_INIT_FILE, SIM => false, + NCPUS => CPUS, CLK_FREQ => CLK_FREQUENCY, HAS_FPU => HAS_FPU, HAS_BTC => HAS_BTC, diff --git a/include/microwatt_soc.h b/include/microwatt_soc.h index 6717b4b..67ea13d 100644 --- a/include/microwatt_soc.h +++ b/include/microwatt_soc.h @@ -65,7 +65,8 @@ 
#define SYS_REG_UART_IS_16550 (1ull << 32) #define SYS_REG_GIT_INFO 0x50 #define SYS_REG_GIT_IS_DIRTY (1ull << 63) - +#define SYS_REG_CPU_CTRL 0x58 +#define SYS_REG_CPU_CTRL_ENABLE 0xff /* * Register definitions for the potato UART diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 0816931..6d59fb3 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -52,6 +52,8 @@ architecture behave of loadstore1 is MMU_WAIT -- waiting for MMU to finish doing something ); + constant num_dawr : positive := 2; + type byte_index_t is array(0 to 7) of unsigned(2 downto 0); subtype byte_trim_t is std_ulogic_vector(1 downto 0); type trim_ctl_t is array(0 to 7) of byte_trim_t; @@ -130,6 +132,9 @@ architecture behave of loadstore1 is busy : std_ulogic; issued : std_ulogic; addr0 : std_ulogic_vector(63 downto 0); + dawr_ll : std_ulogic_vector(num_dawr-1 downto 0); + dawr_ul : std_ulogic_vector(num_dawr-1 downto 0); + dawr_ud : std_ulogic; end record; type reg_stage2_t is record @@ -147,7 +152,6 @@ architecture behave of loadstore1 is dbg_spr_ack: std_ulogic; end record; - constant num_dawr : positive := 2; type dawr_array_t is array(0 to num_dawr - 1) of std_ulogic_vector(63 downto 3); type dawrx_array_t is array(0 to num_dawr - 1) of std_ulogic_vector(15 downto 0); @@ -335,6 +339,9 @@ begin r1.req.sprsel <= "000"; r1.req.ric <= "00"; r1.req.xerc <= xerc_init; + r1.dawr_ll <= (others => '0'); + r1.dawr_ul <= (others => '0'); + r1.dawr_ud <= '0'; r2.req.valid <= '0'; r2.busy <= '0'; @@ -617,6 +624,9 @@ begin variable req : request_t; variable dcreq : std_ulogic; variable issue : std_ulogic; + variable addr : std_ulogic_vector(63 downto 3); + variable addl : unsigned(64 downto 3); + variable addu : unsigned(64 downto 3); begin v := r1; issue := '0'; @@ -661,6 +671,20 @@ begin end if; end if; + -- Do subtractions for DAWR0/1 matches + for i in 0 to num_dawr - 1 loop + addr := req.addr(63 downto 3); + if req.priv_mode = '1' and r3.dawrx(i)(7) = '1' then + -- HRAMMC=1 => trim top bit from address + 
addr(63) := '0'; + end if; + addl := unsigned('0' & addr) - unsigned('0' & r3.dawr(i)); + addu := unsigned('0' & r3.dawr_uplim(i)) - unsigned('0' & addr); + v.dawr_ll(i) := addl(64); + v.dawr_ul(i) := addu(64); + end loop; + v.dawr_ud := r3.dawr_upd; + if flush = '1' then v.req.valid := '0'; v.req.dc_req := '0'; @@ -702,9 +726,6 @@ begin variable sprsel : std_ulogic_vector(2 downto 0); variable sprval : std_ulogic_vector(63 downto 0); variable dawr_match : std_ulogic; - variable addr : std_ulogic_vector(63 downto 3); - variable addl : unsigned(64 downto 3); - variable addu : unsigned(64 downto 3); begin v := r2; @@ -724,14 +745,7 @@ begin -- Test for DAWR0/1 matches dawr_match := '0'; for i in 0 to 1 loop - addr := r1.req.addr(63 downto 3); - if r1.req.priv_mode = '1' and r3.dawrx(i)(7) = '1' then - -- HRAMMC=1 => trim top bit from address - addr(63) := '0'; - end if; - addl := unsigned('0' & addr) - unsigned('0' & r3.dawr(i)); - addu := unsigned('0' & r3.dawr_uplim(i)) - unsigned('0' & addr); - if addl(64) = '0' and addu(64) = '0' and + if r1.dawr_ll(i) = '0' and r1.dawr_ul(i) = '0' and r1.dawr_ud = '0' and dawrx_match_enable(r3.dawrx(i), r1.req.virt_mode, r1.req.priv_mode, r1.req.store) then dawr_match := r1.req.valid and r1.req.dc_req and not r3.dawr_upd and diff --git a/logical.vhdl b/logical.vhdl index 2d139f8..792a896 100644 --- a/logical.vhdl +++ b/logical.vhdl @@ -23,7 +23,6 @@ architecture behaviour of logical is signal par0, par1 : std_ulogic; signal parity : std_ulogic_vector(63 downto 0); - signal permute : std_ulogic_vector(7 downto 0); function bcd_to_dpd(bcd: std_ulogic_vector(11 downto 0)) return std_ulogic_vector is variable dpd: std_ulogic_vector(9 downto 0); @@ -109,16 +108,6 @@ begin parity(32) <= par1; end if; - -- bit permutation - for i in 0 to 7 loop - j := i * 8; - if rs(j+7 downto j+6) = "00" then - permute(i) <= rb(to_integer(unsigned(not rs(j+5 downto j)))); - else - permute(i) <= '0'; - end if; - end loop; - rb_adj := rb; if invert_in = 
'1' then rb_adj := not rb; @@ -157,8 +146,6 @@ begin tmp := parity; when OP_CMPB => tmp := ppc_cmpb(rs, rb); - when OP_BPERM => - tmp := std_ulogic_vector(resize(unsigned(permute), 64)); when OP_BCD => -- invert_in is abused to indicate direction of conversion if invert_in = '0' then diff --git a/microwatt.core b/microwatt.core index f56bee0..ed2aa01 100644 --- a/microwatt.core +++ b/microwatt.core @@ -335,6 +335,7 @@ targets: default_tool: vivado filesets: [core, arty_a7, soc, fpga, debug_xilinx, litedram, liteeth, uart16550, xilinx_specific, litesdcard] parameters: + - cpus - memory_size - ram_init_file - use_litedram=true @@ -496,6 +497,12 @@ generate: parameters: {vendor : xilinx, frequency : 100e6} parameters: + cpus: + datatype : int + description : Number of CPU cores to include in the SoC. + paramtype : generic + default : 1 + memory_size: datatype : int description : On-chip memory size (bytes). If no_bram is set, this is the size carved out for the DRAM payload diff --git a/scripts/mw_debug/mw_debug.c b/scripts/mw_debug/mw_debug.c index d7966d9..1a0b96b 100644 --- a/scripts/mw_debug/mw_debug.c +++ b/scripts/mw_debug/mw_debug.c @@ -24,28 +24,30 @@ #define DBG_WB_DATA 0x01 #define DBG_WB_CTRL 0x02 -#define DBG_CORE_CTRL 0x10 +unsigned int core; + +#define DBG_CORE_CTRL (0x10 + (core << 4)) #define DBG_CORE_CTRL_STOP (1 << 0) #define DBG_CORE_CTRL_RESET (1 << 1) #define DBG_CORE_CTRL_ICRESET (1 << 2) #define DBG_CORE_CTRL_STEP (1 << 3) #define DBG_CORE_CTRL_START (1 << 4) -#define DBG_CORE_STAT 0x11 +#define DBG_CORE_STAT (0x11 + (core << 4)) #define DBG_CORE_STAT_STOPPING (1 << 0) #define DBG_CORE_STAT_STOPPED (1 << 1) #define DBG_CORE_STAT_TERM (1 << 2) -#define DBG_CORE_NIA 0x12 -#define DBG_CORE_MSR 0x13 +#define DBG_CORE_NIA (0x12 + (core << 4)) +#define DBG_CORE_MSR (0x13 + (core << 4)) -#define DBG_CORE_GSPR_INDEX 0x14 -#define DBG_CORE_GSPR_DATA 0x15 +#define DBG_CORE_GSPR_INDEX (0x14 + (core << 4)) +#define DBG_CORE_GSPR_DATA (0x15 + (core << 4)) 
-#define DBG_LOG_ADDR 0x16 -#define DBG_LOG_DATA 0x17 -#define DBG_LOG_TRIGGER 0x18 -#define DBG_LOG_MTRIGGER 0x19 +#define DBG_LOG_ADDR (0x16 + (core << 4)) +#define DBG_LOG_DATA (0x17 + (core << 4)) +#define DBG_LOG_TRIGGER (0x18 + (core << 4)) +#define DBG_LOG_MTRIGGER (0x19 + (core << 4)) static bool debug; @@ -507,7 +509,7 @@ static void core_status(void) statstr2 = " (terminated)"; } else if (stat & DBG_CORE_STAT_TERM) statstr = "odd state (TERM but no STOP)"; - printf("Core: %s%s\n", statstr, statstr2); + printf("Core%u: %s%s\n", core, statstr, statstr2); printf(" NIA: %016" PRIx64 "\n", nia); printf(" MSR: %016" PRIx64 "\n", msr); } @@ -792,7 +794,7 @@ static void mtrig_set(uint64_t addr) static void usage(const char *cmd) { - fprintf(stderr, "Usage: %s -b \n", cmd); + fprintf(stderr, "Usage: %s -b [-c core#] \n", cmd); fprintf(stderr, "\n"); fprintf(stderr, " CPU core:\n"); @@ -851,12 +853,24 @@ int main(int argc, char *argv[]) { "target", required_argument, 0, 't' }, { "debug", no_argument, 0, 'd' }, { "frequency", no_argument, 0, 's' }, + { "core", required_argument, 0, 'c' }, { 0, 0, 0, 0 } }; - c = getopt_long(argc, argv, "dhb:t:s:", lopts, &oindex); + c = getopt_long(argc, argv, "dhb:t:s:c:", lopts, &oindex); if (c < 0) break; switch(c) { + case 'c': { + /* atoi() cannot report errors: accept only a complete decimal number in 0..14 */ + char *end; + unsigned long val = strtoul(optarg, &end, 10); + if (end == optarg || *end != '\0' || val >= 15) { + fprintf(stderr, "Invalid core number '%s' (valid range is 0-14)\n", optarg); + exit(1); + } + core = val; + break; + } case 'h': usage(progname); break; diff --git a/soc.vhdl b/soc.vhdl index 3e3b438..b3d03b7 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -67,6 +67,7 @@ entity soc is RAM_INIT_FILE : string; CLK_FREQ : positive; SIM : boolean; + NCPUS : positive := 1; HAS_FPU : boolean := true; HAS_BTC : boolean := true; DISABLE_FLATTEN_CORE : boolean := false; @@ -148,20 +149,18 @@ end entity soc; architecture behaviour of soc is + subtype cpu_index_t is natural range 0 to NCPUS-1; + type dword_percpu_array is array(cpu_index_t) of std_ulogic_vector(63 downto 0); + -- internal reset signal soc_reset : 
std_ulogic; -- Wishbone master signals: - signal wishbone_dcore_in : wishbone_slave_out; - signal wishbone_dcore_out : wishbone_master_out; - signal wishbone_icore_in : wishbone_slave_out; - signal wishbone_icore_out : wishbone_master_out; - signal wishbone_debug_in : wishbone_slave_out; - signal wishbone_debug_out : wishbone_master_out; - - -- Arbiter array (ghdl doesnt' support assigning the array - -- elements in the entity instantiation) - constant NUM_WB_MASTERS : positive := 4; + signal wishbone_debug_in : wishbone_slave_out; + signal wishbone_debug_out : wishbone_master_out; + + -- Arbiter array + constant NUM_WB_MASTERS : positive := NCPUS * 2 + 2; signal wb_masters_out : wishbone_master_out_vector(0 to NUM_WB_MASTERS-1); signal wb_masters_in : wishbone_slave_out_vector(0 to NUM_WB_MASTERS-1); @@ -180,7 +179,7 @@ architecture behaviour of soc is -- Syscon signals signal dram_at_0 : std_ulogic; - signal do_core_reset : std_ulogic; + signal do_core_reset : std_ulogic_vector(NCPUS-1 downto 0); signal alt_reset : std_ulogic; signal wb_syscon_in : wb_io_master_out; signal wb_syscon_out : wb_io_slave_out; @@ -210,7 +209,7 @@ architecture behaviour of soc is signal wb_xics_ics_out : wb_io_slave_out; signal int_level_in : std_ulogic_vector(15 downto 0); signal ics_to_icp : ics_to_icp_t; - signal core_ext_irq : std_ulogic; + signal core_ext_irq : std_ulogic_vector(NCPUS-1 downto 0) := (others => '0'); -- GPIO signals: signal wb_gpio_in : wb_io_master_out; @@ -233,12 +232,12 @@ architecture behaviour of soc is signal dmi_wb_dout : std_ulogic_vector(63 downto 0); signal dmi_wb_req : std_ulogic; signal dmi_wb_ack : std_ulogic; - signal dmi_core_dout : std_ulogic_vector(63 downto 0); - signal dmi_core_req : std_ulogic; - signal dmi_core_ack : std_ulogic; + signal dmi_core_dout : dword_percpu_array; + signal dmi_core_req : std_ulogic_vector(NCPUS-1 downto 0); + signal dmi_core_ack : std_ulogic_vector(NCPUS-1 downto 0); -- Delayed/latched resets and alt_reset - signal 
rst_core : std_ulogic; + signal rst_core : std_ulogic_vector(NCPUS-1 downto 0); signal rst_uart : std_ulogic; signal rst_xics : std_ulogic; signal rst_spi : std_ulogic; @@ -270,6 +269,10 @@ architecture behaviour of soc is signal io_cycle_gpio : std_ulogic; signal io_cycle_external : std_ulogic; + signal core_run_out : std_ulogic_vector(NCPUS-1 downto 0); + + signal timebase : std_ulogic_vector(63 downto 0); + function wishbone_widen_data(wb : wb_io_master_out) return wishbone_master_out is variable wwb : wishbone_master_out; begin @@ -334,7 +337,9 @@ begin resets: process(system_clk) begin if rising_edge(system_clk) then - rst_core <= soc_reset or do_core_reset; + for i in 0 to NCPUS-1 loop + rst_core(i) <= soc_reset or do_core_reset(i); + end loop; rst_uart <= soc_reset; rst_spi <= soc_reset; rst_xics <= soc_reset; @@ -347,11 +352,27 @@ begin end if; end process; - -- Processor core - processor: entity work.core + -- Timebase just increments at the system clock frequency. + -- There is currently no way to set it. + -- Ideally it would (appear to) run at 512MHz like IBM POWER systems, + -- but Linux seems to cope OK with it being 100MHz or whatever. 
+ tbase: process(system_clk) + begin + if rising_edge(system_clk) then + if soc_reset = '1' then + timebase <= (others => '0'); + else + timebase <= std_ulogic_vector(unsigned(timebase) + 1); + end if; + end if; + end process; + + -- Processor cores + processors: for i in 0 to NCPUS-1 generate + core: entity work.core generic map( SIM => SIM, - CPU_INDEX => 0, + CPU_INDEX => i, HAS_FPU => HAS_FPU, HAS_BTC => HAS_BTC, DISABLE_FLATTEN => DISABLE_FLATTEN_CORE, @@ -367,32 +388,32 @@ begin ) port map( clk => system_clk, - rst => rst_core, + rst => rst_core(i), alt_reset => alt_reset_d, - run_out => run_out, - wishbone_insn_in => wishbone_icore_in, - wishbone_insn_out => wishbone_icore_out, - wishbone_data_in => wishbone_dcore_in, - wishbone_data_out => wishbone_dcore_out, + run_out => core_run_out(i), + timebase => timebase, + wishbone_insn_in => wb_masters_in(i + NCPUS), + wishbone_insn_out => wb_masters_out(i + NCPUS), + wishbone_data_in => wb_masters_in(i), + wishbone_data_out => wb_masters_out(i), wb_snoop_in => wb_snoop, dmi_addr => dmi_addr(3 downto 0), - dmi_dout => dmi_core_dout, + dmi_dout => dmi_core_dout(i), dmi_din => dmi_dout, dmi_wr => dmi_wr, - dmi_ack => dmi_core_ack, - dmi_req => dmi_core_req, - ext_irq => core_ext_irq + dmi_ack => dmi_core_ack(i), + dmi_req => dmi_core_req(i), + ext_irq => core_ext_irq(i) ); + end generate; + + run_out <= or (core_run_out); -- Wishbone bus master arbiter & mux - wb_masters_out <= (0 => wishbone_dcore_out, - 1 => wishbone_icore_out, - 2 => wishbone_widen_data(wishbone_dma_out), - 3 => wishbone_debug_out); - wishbone_dcore_in <= wb_masters_in(0); - wishbone_icore_in <= wb_masters_in(1); - wishbone_dma_in <= wishbone_narrow_data(wb_masters_in(2), wishbone_dma_out.adr); - wishbone_debug_in <= wb_masters_in(3); + wb_masters_out(2*NCPUS) <= wishbone_widen_data(wishbone_dma_out); + wb_masters_out(2*NCPUS + 1) <= wishbone_debug_out; + wishbone_dma_in <= wishbone_narrow_data(wb_masters_in(2*NCPUS), wishbone_dma_out.adr); + 
wishbone_debug_in <= wb_masters_in(2*NCPUS + 1); wishbone_arbiter_0: entity work.wishbone_arbiter generic map( NUM_MASTERS => NUM_WB_MASTERS @@ -780,6 +801,7 @@ begin -- Syscon slave syscon0: entity work.syscon generic map( + NCPUS => NCPUS, HAS_UART => true, HAS_DRAM => HAS_DRAM, BRAM_SIZE => MEMORY_SIZE, @@ -944,6 +966,9 @@ begin end generate; xics_icp: entity work.xics_icp + generic map( + NCPUS => NCPUS + ) port map( clk => system_clk, rst => rst_xics, @@ -955,6 +980,7 @@ begin xics_ics: entity work.xics_ics generic map( + NCPUS => NCPUS, SRC_NUM => 16, PRIO_BITS => 3 ) @@ -1034,15 +1060,15 @@ begin ); -- DMI interconnect - dmi_intercon: process(dmi_addr, dmi_req, - dmi_wb_ack, dmi_wb_dout, - dmi_core_ack, dmi_core_dout) + dmi_intercon: process(all) -- DMI address map (each address is a full 64-bit register) -- -- Offset: Size: Slave: -- 0 4 Wishbone - -- 10 16 Core + -- 10 16 Core 0 + -- 20 16 Core 1 + -- ... and so on for NCPUS cores type slave_type is (SLAVE_WB, SLAVE_CORE, @@ -1053,25 +1079,29 @@ begin slave := SLAVE_NONE; if std_match(dmi_addr, "000000--") then slave := SLAVE_WB; - elsif std_match(dmi_addr, "0001----") then + elsif not is_X(dmi_addr) and to_integer(unsigned(dmi_addr(7 downto 4))) <= NCPUS then slave := SLAVE_CORE; end if; -- DMI muxing dmi_wb_req <= '0'; - dmi_core_req <= '0'; + dmi_core_req <= (others => '0'); + dmi_din <= (others => '1'); + dmi_ack <= dmi_req; case slave is when SLAVE_WB => dmi_wb_req <= dmi_req; dmi_ack <= dmi_wb_ack; dmi_din <= dmi_wb_dout; when SLAVE_CORE => - dmi_core_req <= dmi_req; - dmi_ack <= dmi_core_ack; - dmi_din <= dmi_core_dout; + for i in 0 to NCPUS-1 loop + if not is_X(dmi_addr) and to_integer(unsigned(dmi_addr(7 downto 4))) = i + 1 then + dmi_core_req(i) <= dmi_req; + dmi_ack <= dmi_core_ack(i); + dmi_din <= dmi_core_dout(i); + end if; + end loop; when others => - dmi_ack <= dmi_req; - dmi_din <= (others => '1'); end case; -- SIM magic exit diff --git a/syscon.vhdl b/syscon.vhdl index 99fa835..98990d1 
100644 --- a/syscon.vhdl +++ b/syscon.vhdl @@ -9,6 +9,7 @@ use work.wishbone_types.all; entity syscon is generic ( + NCPUS : positive := 1; SIG_VALUE : std_ulogic_vector(63 downto 0) := x"f00daa5500010001"; CLK_FREQ : integer; HAS_UART : boolean; @@ -33,7 +34,7 @@ entity syscon is -- System control ports dram_at_0 : out std_ulogic; - core_reset : out std_ulogic; + core_reset : out std_ulogic_vector(NCPUS-1 downto 0); soc_reset : out std_ulogic; alt_reset : out std_ulogic ); @@ -56,6 +57,7 @@ architecture behaviour of syscon is constant SYS_REG_UART0_INFO : std_ulogic_vector(SYS_REG_BITS-1 downto 0) := "001000"; constant SYS_REG_UART1_INFO : std_ulogic_vector(SYS_REG_BITS-1 downto 0) := "001001"; constant SYS_REG_GIT_INFO : std_ulogic_vector(SYS_REG_BITS-1 downto 0) := "001010"; + constant SYS_REG_CPU_CTRL : std_ulogic_vector(SYS_REG_BITS-1 downto 0) := "001011"; -- Muxed reg read signal signal reg_out : std_ulogic_vector(63 downto 0); @@ -116,6 +118,7 @@ architecture behaviour of syscon is signal reg_uart0info : std_ulogic_vector(63 downto 0); signal reg_uart1info : std_ulogic_vector(63 downto 0); signal reg_gitinfo : std_ulogic_vector(63 downto 0); + signal reg_cpuctrl : std_ulogic_vector(63 downto 0); signal info_has_dram : std_ulogic; signal info_has_bram : std_ulogic; signal info_has_uart : std_ulogic; @@ -134,7 +137,8 @@ begin -- Generated output signals dram_at_0 <= '1' when BRAM_SIZE = 0 else reg_ctrl(SYS_REG_CTRL_DRAM_AT_0); soc_reset <= reg_ctrl(SYS_REG_CTRL_SOC_RESET); - core_reset <= reg_ctrl(SYS_REG_CTRL_CORE_RESET); + core_reset <= not reg_cpuctrl(NCPUS-1 downto 0) when reg_ctrl(SYS_REG_CTRL_CORE_RESET) = '0' + else (others => '1'); alt_reset <= reg_ctrl(SYS_REG_CTRL_ALT_RESET); @@ -187,6 +191,8 @@ begin 55 downto 0 => GIT_HASH, others => '0'); + reg_cpuctrl(63 downto 8) <= std_ulogic_vector(to_unsigned(NCPUS, 56)); + -- Wishbone response wb_rsp.ack <= wishbone_in.cyc and wishbone_in.stb; with wishbone_in.adr(SYS_REG_BITS downto 1) select reg_out <= @@ 
-201,6 +207,7 @@ begin reg_uart0info when SYS_REG_UART0_INFO, reg_uart1info when SYS_REG_UART1_INFO, reg_gitinfo when SYS_REG_GIT_INFO, + reg_cpuctrl when SYS_REG_CPU_CTRL, (others => '0') when others; wb_rsp.dat <= reg_out(63 downto 32) when wishbone_in.adr(0) = '1' else reg_out(31 downto 0); @@ -225,6 +232,7 @@ begin if (rst) then reg_ctrl <= (SYS_REG_CTRL_ALT_RESET => ctrl_init_alt_reset, others => '0'); + reg_cpuctrl(7 downto 0) <= x"01"; -- enable cpu 0 only else if wishbone_in.cyc and wishbone_in.stb and wishbone_in.we then -- Change this if CTRL ever has more than 32 bits @@ -233,6 +241,10 @@ begin reg_ctrl(SYS_REG_CTRL_BITS-1 downto 0) <= wishbone_in.dat(SYS_REG_CTRL_BITS-1 downto 0); end if; + if wishbone_in.adr(SYS_REG_BITS downto 1) = SYS_REG_CPU_CTRL and + wishbone_in.adr(0) = '0' and wishbone_in.sel(0) = '1' then + reg_cpuctrl(7 downto 0) <= wishbone_in.dat(7 downto 0); + end if; end if; -- Reset auto-clear diff --git a/wishbone_arbiter.vhdl b/wishbone_arbiter.vhdl index cb632bf..a6daec6 100644 --- a/wishbone_arbiter.vhdl +++ b/wishbone_arbiter.vhdl @@ -4,7 +4,6 @@ use ieee.std_logic_1164.all; library work; use work.wishbone_types.all; --- TODO: Use an array of master/slaves with parametric size entity wishbone_arbiter is generic( NUM_MASTERS : positive := 3 @@ -28,18 +27,23 @@ begin busy <= wb_masters_in(selected).cyc; - wishbone_muxes: process(selected, candidate, busy, wb_slave_in, wb_masters_in) + wishbone_muxes: process(all) variable early_sel : wb_arb_master_t; begin early_sel := selected; - if busy = '0' then + if NUM_MASTERS <= 4 and busy = '0' then early_sel := candidate; end if; wb_slave_out <= wb_masters_in(early_sel); for i in 0 to NUM_MASTERS-1 loop wb_masters_out(i).dat <= wb_slave_in.dat; - wb_masters_out(i).ack <= wb_slave_in.ack when early_sel = i else '0'; - wb_masters_out(i).stall <= wb_slave_in.stall when early_sel = i else '1'; + if early_sel = i and wb_masters_in(i).cyc = '1' then + wb_masters_out(i).ack <= wb_slave_in.ack; + 
wb_masters_out(i).stall <= wb_slave_in.stall; + else + wb_masters_out(i).ack <= '0'; + wb_masters_out(i).stall <= '1'; + end if; end loop; end process; diff --git a/xics.vhdl b/xics.vhdl index 62faf77..b999e65 100644 --- a/xics.vhdl +++ b/xics.vhdl @@ -25,6 +25,9 @@ use work.common.all; use work.wishbone_types.all; entity xics_icp is + generic ( + NCPUS : natural := 1 + ); port ( clk : in std_logic; rst : in std_logic; @@ -33,32 +36,41 @@ entity xics_icp is wb_out : out wb_io_slave_out; ics_in : in ics_to_icp_t; - core_irq_out : out std_ulogic + core_irq_out : out std_ulogic_vector(NCPUS-1 downto 0) ); end xics_icp; architecture behaviour of xics_icp is - type reg_internal_t is record + type xics_presentation_t is record xisr : std_ulogic_vector(23 downto 0); cppr : std_ulogic_vector(7 downto 0); mfrr : std_ulogic_vector(7 downto 0); irq : std_ulogic; + end record; + constant xics_presentation_t_init : xics_presentation_t := + (mfrr => x"ff", -- mask everything on reset + irq => '0', + others => (others => '0')); + subtype cpu_index_t is natural range 0 to NCPUS-1; + type xicp_array_t is array(cpu_index_t) of xics_presentation_t; + + type reg_internal_t is record + icp : xicp_array_t; wb_rd_data : std_ulogic_vector(31 downto 0); wb_ack : std_ulogic; end record; constant reg_internal_init : reg_internal_t := (wb_ack => '0', - mfrr => x"ff", -- mask everything on reset - irq => '0', - others => (others => '0')); + wb_rd_data => (others => '0'), + icp => (others => xics_presentation_t_init)); signal r, r_next : reg_internal_t; - -- 8 bit offsets for each presentation - constant XIRR_POLL : std_ulogic_vector(7 downto 0) := x"00"; - constant XIRR : std_ulogic_vector(7 downto 0) := x"04"; - constant RESV0 : std_ulogic_vector(7 downto 0) := x"08"; - constant MFRR : std_ulogic_vector(7 downto 0) := x"0c"; + -- 4 bit offsets for each presentation register + constant XIRR_POLL : std_ulogic_vector(3 downto 0) := x"0"; + constant XIRR : std_ulogic_vector(3 downto 0) := x"4"; + 
constant RESV0 : std_ulogic_vector(3 downto 0) := x"8"; + constant MFRR : std_ulogic_vector(3 downto 0) := x"c"; begin @@ -68,7 +80,9 @@ begin r <= r_next; -- We delay core_irq_out by a cycle to help with timing - core_irq_out <= r.irq; + for i in 0 to NCPUS-1 loop + core_irq_out(i) <= r.icp(i).irq; + end loop; end if; end process; @@ -99,94 +113,105 @@ begin v.wb_ack := '0'; - xirr_accept_rd := '0'; - be_in := bswap(wb_in.dat); be_out := (others => '0'); - if wb_in.cyc = '1' and wb_in.stb = '1' then v.wb_ack := '1'; -- always ack - if wb_in.we = '1' then -- write - -- writes to both XIRR are the same - case wb_in.adr(5 downto 0) & "00" is - when XIRR_POLL => - report "ICP XIRR_POLL write"; - v.cppr := be_in(31 downto 24); - when XIRR => - v.cppr := be_in(31 downto 24); - if wb_in.sel = x"f" then -- 4 byte - report "ICP XIRR write word (EOI) :" & to_hstring(be_in); - elsif wb_in.sel = x"1" then -- 1 byte - report "ICP XIRR write byte (CPPR):" & to_hstring(be_in(31 downto 24)); - else - report "ICP XIRR UNSUPPORTED write ! sel=" & to_hstring(wb_in.sel); - end if; - when MFRR => - v.mfrr := be_in(31 downto 24); - if wb_in.sel = x"f" then -- 4 bytes - report "ICP MFRR write word:" & to_hstring(be_in); - elsif wb_in.sel = x"1" then -- 1 byte - report "ICP MFRR write byte:" & to_hstring(be_in(31 downto 24)); - else - report "ICP MFRR UNSUPPORTED write ! 
sel=" & to_hstring(wb_in.sel); - end if; - when others => - end case; - - else -- read - - case wb_in.adr(5 downto 0) & "00" is - when XIRR_POLL => - report "ICP XIRR_POLL read"; - be_out := r.cppr & r.xisr; - when XIRR => - report "ICP XIRR read"; - be_out := r.cppr & r.xisr; - if wb_in.sel = x"f" then - xirr_accept_rd := '1'; - end if; - when MFRR => - report "ICP MFRR read"; - be_out(31 downto 24) := r.mfrr; - when others => - end case; - end if; end if; - pending_priority := x"ff"; - v.xisr := x"000000"; - v.irq := '0'; + for i in cpu_index_t loop + xirr_accept_rd := '0'; + + if wb_in.cyc = '1' and wb_in.stb = '1' and + to_integer(unsigned(wb_in.adr(5 downto 2))) = i then + if wb_in.we = '1' then -- write + -- writes to both XIRR are the same + case wb_in.adr(1 downto 0) & "00" is + when XIRR_POLL => + report "ICP XIRR_POLL write"; + v.icp(i).cppr := be_in(31 downto 24); + when XIRR => + v.icp(i).cppr := be_in(31 downto 24); + if wb_in.sel = x"f" then -- 4 byte + report "ICP " & natural'image(i) & " XIRR write word (EOI) :" & + to_hstring(be_in); + elsif wb_in.sel = x"1" then -- 1 byte + report "ICP " & natural'image(i) & " XIRR write byte (CPPR):" & + to_hstring(be_in(31 downto 24)); + else + report "ICP " & natural'image(i) & " XIRR UNSUPPORTED write ! sel=" & + to_hstring(wb_in.sel); + end if; + when MFRR => + v.icp(i).mfrr := be_in(31 downto 24); + if wb_in.sel = x"f" then -- 4 bytes + report "ICP " & natural'image(i) & " MFRR write word:" & + to_hstring(be_in); + elsif wb_in.sel = x"1" then -- 1 byte + report "ICP " & natural'image(i) & " MFRR write byte:" & + to_hstring(be_in(31 downto 24)); + else + report "ICP " & natural'image(i) & " MFRR UNSUPPORTED write ! 
sel=" & + to_hstring(wb_in.sel); + end if; + when others => + end case; + + else -- read + + case wb_in.adr(1 downto 0) & "00" is + when XIRR_POLL => + report "ICP XIRR_POLL read"; + be_out := r.icp(i).cppr & r.icp(i).xisr; + when XIRR => + report "ICP XIRR read"; + be_out := r.icp(i).cppr & r.icp(i).xisr; + if wb_in.sel = x"f" then + xirr_accept_rd := '1'; + end if; + when MFRR => + report "ICP MFRR read"; + be_out(31 downto 24) := r.icp(i).mfrr; + when others => + end case; + end if; + end if; - if ics_in.pri /= x"ff" then - v.xisr := x"00001" & ics_in.src; - pending_priority := ics_in.pri; - end if; + pending_priority := x"ff"; + v.icp(i).xisr := x"000000"; + v.icp(i).irq := '0'; - -- Check MFRR - if unsigned(r.mfrr) < unsigned(pending_priority) then -- - v.xisr := x"000002"; -- special XICS MFRR IRQ source number - pending_priority := r.mfrr; - end if; + if ics_in.pri(8*i + 7 downto 8*i) /= x"ff" then + v.icp(i).xisr := x"00001" & ics_in.src(4*i + 3 downto 4*i); + pending_priority := ics_in.pri(8*i + 7 downto 8*i); + end if; - -- Accept the interrupt - if xirr_accept_rd = '1' then - report "XICS: ICP ACCEPT" & - " cppr:" & to_hstring(r.cppr) & - " xisr:" & to_hstring(r.xisr) & - " mfrr:" & to_hstring(r.mfrr); - v.cppr := pending_priority; - end if; + -- Check MFRR + if unsigned(r.icp(i).mfrr) < unsigned(pending_priority) then -- + v.icp(i).xisr := x"000002"; -- special XICS MFRR IRQ source number + pending_priority := r.icp(i).mfrr; + end if; + + -- Accept the interrupt + if xirr_accept_rd = '1' then + report "XICS " & natural'image(i) & ": ICP ACCEPT" & + " cppr:" & to_hstring(r.icp(i).cppr) & + " xisr:" & to_hstring(r.icp(i).xisr) & + " mfrr:" & to_hstring(r.icp(i).mfrr); + v.icp(i).cppr := pending_priority; + end if; - v.wb_rd_data := bswap(be_out); + v.wb_rd_data := bswap(be_out); - if unsigned(pending_priority) < unsigned(v.cppr) then - if r.irq = '0' then - report "IRQ set"; + if unsigned(pending_priority) < unsigned(v.icp(i).cppr) then + if r.icp(i).irq 
= '0' then + report "CPU " & natural'image(i) & " IRQ set"; + end if; + v.icp(i).irq := '1'; + elsif r.icp(i).irq = '1' then + report "CPU " & natural'image(i) & " IRQ clr"; end if; - v.irq := '1'; - elsif r.irq = '1' then - report "IRQ clr"; - end if; + end loop; if rst = '1' then v := reg_internal_init; @@ -210,6 +235,7 @@ use work.helpers.all; entity xics_ics is generic ( + NCPUS : natural := 1; SRC_NUM : integer range 1 to 256 := 16; PRIO_BITS : integer range 1 to 8 := 3 ); @@ -228,10 +254,13 @@ end xics_ics; architecture rtl of xics_ics is constant SRC_NUM_BITS : natural := log2(SRC_NUM); + constant SERVER_NUM_BITS : natural := 2; subtype pri_t is std_ulogic_vector(PRIO_BITS-1 downto 0); + subtype server_t is unsigned(SERVER_NUM_BITS-1 downto 0); type xive_t is record pri : pri_t; + server : server_t; end record; constant pri_masked : pri_t := (others => '1'); @@ -308,6 +337,16 @@ architecture rtl of xics_ics is return p(nbits - 1 downto 0); end function; + function server_check(serv_in: std_ulogic_vector(7 downto 0)) return unsigned is + variable srv : server_t; + begin + srv := to_unsigned(0, SERVER_NUM_BITS); + if to_integer(unsigned(serv_in)) < NCPUS then + srv := unsigned(serv_in(SERVER_NUM_BITS - 1 downto 0)); + end if; + return srv; + end; + -- Register map -- 0 : Config -- 4 : Debug/diagnostics @@ -366,16 +405,14 @@ begin be_out := (others => '0'); if reg_is_xive = '1' then - be_out := int_level_l(reg_idx) & - '0' & - int_level_l(reg_idx) & - '0' & - x"00000" & - prio_unpack(xives(reg_idx).pri); + be_out(31) := int_level_l(reg_idx); + be_out(29) := int_level_l(reg_idx); + be_out(8 + SERVER_NUM_BITS - 1 downto 8) := std_ulogic_vector(xives(reg_idx).server); + be_out(7 downto 0) := prio_unpack(xives(reg_idx).pri); elsif reg_is_config = '1' then be_out := get_config; elsif reg_is_debug = '1' then - be_out := x"00000" & icp_out_next.src & icp_out_next.pri; + be_out := icp_out_next.src & icp_out_next.pri(15 downto 0); end if; wb_out.dat <= bswap(be_out); 
wb_out.ack <= wb_valid; @@ -389,17 +426,20 @@ begin if rising_edge(clk) then if rst = '1' then for i in 0 to SRC_NUM - 1 loop - xives(i) <= (pri => pri_masked); + xives(i) <= (pri => pri_masked, server => to_unsigned(0, SERVER_NUM_BITS)); end loop; elsif wb_valid = '1' and wb_in.we = '1' then -- Byteswapped input be_in := bswap(wb_in.dat); if reg_is_xive then - -- TODO: When adding support for other bits, make sure to - -- properly implement wb_in.sel to allow partial writes. - xives(reg_idx).pri <= prio_pack(be_in(7 downto 0)); - report "ICS irq " & integer'image(reg_idx) & - " set to:" & to_hstring(be_in(7 downto 0)); + if wb_in.sel(3) = '1' then + xives(reg_idx).pri <= prio_pack(be_in(7 downto 0)); + report "ICS irq " & integer'image(reg_idx) & + " set to pri:" & to_hstring(be_in(7 downto 0)); + end if; + if wb_in.sel(2) = '1' then + xives(reg_idx).server <= server_check(be_in(15 downto 8)); + end if; end if; end if; end if; @@ -424,29 +464,36 @@ begin variable pending_pri : pri_vector_t; variable pending_at_pri : std_ulogic_vector(SRC_NUM - 1 downto 0); begin - -- Work out the most-favoured (lowest) priority of the pending interrupts - pending_pri := (others => '0'); - for i in 0 to SRC_NUM - 1 loop - if int_level_l(i) = '1' then - pending_pri := pending_pri or prio_decode(xives(i).pri); - end if; - end loop; - max_pri := priority_encoder(pending_pri, PRIO_BITS); + icp_out_next.src <= (others => '0'); + icp_out_next.pri <= (others => '0'); + for cpu in 0 to NCPUS-1 loop + -- Work out the most-favoured (lowest) priority of the interrupts + -- that are pending and directed to this cpu + pending_pri := (others => '0'); + for i in 0 to SRC_NUM - 1 loop + if int_level_l(i) = '1' and to_integer(xives(i).server) = cpu then + pending_pri := pending_pri or prio_decode(xives(i).pri); + end if; + end loop; + max_pri := priority_encoder(pending_pri, PRIO_BITS); + + -- Work out which interrupts are pending at that priority + pending_at_pri := (others => '0'); + for i in 0 
to SRC_NUM - 1 loop + if int_level_l(i) = '1' and xives(i).pri = max_pri and + to_integer(xives(i).server) = cpu then + pending_at_pri(i) := '1'; + end if; + end loop; + max_idx := priority_encoder(pending_at_pri, SRC_NUM_BITS); - -- Work out which interrupts are pending at that priority - pending_at_pri := (others => '0'); - for i in 0 to SRC_NUM - 1 loop - if int_level_l(i) = '1' and xives(i).pri = max_pri then - pending_at_pri(i) := '1'; + if max_pri /= pri_masked then + report "MFI: " & integer'image(to_integer(unsigned(max_idx))) & " pri=" & to_hstring(prio_unpack(max_pri)) & + " srv=" & integer'image(cpu); end if; + icp_out_next.src(4*cpu + 3 downto 4*cpu) <= max_idx; + icp_out_next.pri(8*cpu + 7 downto 8*cpu) <= prio_unpack(max_pri); end loop; - max_idx := priority_encoder(pending_at_pri, SRC_NUM_BITS); - - if max_pri /= pri_masked then - report "MFI: " & integer'image(to_integer(unsigned(max_idx))) & " pri=" & to_hstring(prio_unpack(max_pri)); - end if; - icp_out_next.src <= max_idx; - icp_out_next.pri <= prio_unpack(max_pri); end process; end architecture rtl;