Merge pull request #436 from paulusmack/smp

Implement SMP
master
Paul Mackerras 3 days ago committed by GitHub
commit e9b57ca5bf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -1,5 +1,6 @@
-- Implements instructions that involve sorting bits,
-- that is, cfuged, pextd and pdepd.
-- Also does bperm, which is somewhat different.
--
-- cfuged: Sort the bits in the mask in RB into 0s at the left, 1s at the right
-- and move the bits in RS in the same fashion to give the result
@ -7,6 +8,7 @@
-- corresponding bit in RB is 1
-- pdepd: Inverse of pextd; take the low-order bits of RS and spread them out
-- to the bit positions which have a 1 in RB
-- bperm: Select 8 arbitrary bits

-- NB opc is bits 7-6 of the instruction:
-- 00 = pdepd, 01 = pextd, 10 = cfuged
@ -27,6 +29,8 @@ entity bit_sorter is
go : in std_ulogic;
opc : in std_ulogic_vector(1 downto 0);
done : out std_ulogic;
do_bperm : in std_ulogic;
bperm_done : out std_ulogic;
result : out std_ulogic_vector(63 downto 0)
);
end entity bit_sorter;
@ -45,6 +49,13 @@ architecture behaviour of bit_sorter is
signal sr_vl : std_ulogic_vector(63 downto 0);
signal sr_vr : std_ulogic_vector(63 downto 0);

signal is_bperm : std_ulogic;
signal bpc : unsigned(2 downto 0);
signal bp_done : std_ulogic;
signal bperm_res : std_ulogic_vector(7 downto 0);
signal rs_sr : std_ulogic_vector(63 downto 0);
signal rb_bp : std_ulogic_vector(63 downto 0);

begin
bsort_r: process(clk)
begin
@ -96,7 +107,41 @@ begin
end if;
end process;

-- bit permutation
-- bperm_res(7) is the permuted bit for the index byte currently held in
-- rs_sr(7 downto 0).  The low six bits of the index are complemented, so
-- index k selects rb_bp(63 - k).  An index of 64 or more (either of the
-- top two bits of the byte set) selects no bit of RB and must give 0,
-- as the previous combinational implementation in the logical unit did.
-- Metavalues in rs_sr propagate as 'X' rather than tripping to_integer.
bperm_res(7) <= 'X' when is_X(rs_sr) else
                '0' when rs_sr(7 downto 6) /= "00" else
                rb_bp(to_integer(unsigned(not rs_sr(5 downto 0))));

-- Sequencer for the bit-permute operation.  It produces one result bit per
-- clock: bperm_res(7) is driven combinationally from the low byte of rs_sr,
-- and on each active cycle the partial result is shifted right by one bit
-- while rs_sr is shifted right by one byte to expose the next index byte.
-- Branch priority is: reset, start (do_bperm), finish (bp_done), step.
bperm_r: process(clk)
begin
if rising_edge(clk) then
if rst = '1' then
-- Synchronous reset: idle, no result pending
is_bperm <= '0';
bp_done <= '0';
bperm_res(6 downto 0) <= (others => '0');
bpc <= to_unsigned(0, 3);
elsif do_bperm = '1' then
-- Start a new operation: capture both operands and clear the
-- step counter and the registered part of the result
is_bperm <= '1';
bp_done <= '0';
bperm_res(6 downto 0) <= (others => '0');
bpc <= to_unsigned(0, 3);
rs_sr <= rs;
rb_bp <= rb;
elsif bp_done = '1' then
-- bp_done is asserted for a single cycle; drop back to idle after it
is_bperm <= '0';
bp_done <= '0';
elsif is_bperm = '1' then
-- One step: move the bit just selected (bit 7) and the bits gathered
-- so far down one position, and bring the next index byte into
-- rs_sr(7 downto 0)
bperm_res(6 downto 0) <= bperm_res(7 downto 1);
rs_sr <= x"00" & rs_sr(63 downto 8);
-- bpc counts steps from 0; the step taken at bpc = 6 is the seventh
-- shift, after which bits 6..0 are registered and bit 7 is supplied
-- combinationally from the eighth index byte
if bpc = "110" then
bp_done <= '1';
end if;
bpc <= bpc + 1;
end if;
end if;
end process;

done <= sd;
result <= val;
bperm_done <= bp_done;
result <= val when is_bperm = '0' else (56x"0" & bperm_res);

end behaviour;

@ -252,19 +252,20 @@ package common is

-- For now, fixed 16 sources, make this either a parametric
-- package of some sort or an unconstrained array.
-- We don't know NCPUS or SRC_NUM here, so make this
-- large enough for 4 cpus and 16 interrupt sources for now.
type ics_to_icp_t is record
-- Level interrupts only, ICS just keeps presenting the
-- highest priority interrupt. Once handling edge, something
-- smarter involving handshake & reject support will be needed
src : std_ulogic_vector(3 downto 0);
pri : std_ulogic_vector(7 downto 0);
src : std_ulogic_vector(15 downto 0); -- 4 bits each for 4 cpus
pri : std_ulogic_vector(31 downto 0); -- 8 bits each for 4 cpus
end record;

-- This needs to die...
type ctrl_t is record
wait_state: std_ulogic;
run: std_ulogic;
tb: std_ulogic_vector(63 downto 0);
dec: std_ulogic_vector(63 downto 0);
msr: std_ulogic_vector(63 downto 0);
cfar: std_ulogic_vector(63 downto 0);
@ -439,6 +440,11 @@ package common is
illegal_form : std_ulogic;
uses_tar : std_ulogic;
uses_dscr : std_ulogic;
right_shift : std_ulogic;
rot_clear_left : std_ulogic;
rot_clear_right : std_ulogic;
rot_sign_ext : std_ulogic;
do_popcnt : std_ulogic;
end record;
constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
(valid => '0', unit => ALU, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init,
@ -461,6 +467,8 @@ package common is
dec_ctr => '0',
prefixed => '0', prefix => (others => '0'), illegal_suffix => '0',
misaligned_prefix => '0', illegal_form => '0', uses_tar => '0', uses_dscr => '0',
right_shift => '0', rot_clear_left => '0', rot_clear_right => '0', rot_sign_ext => '0',
do_popcnt => '0',
others => (others => '0'));

type MultiplyInputType is record

@ -45,9 +45,13 @@ entity control is
valid_out : out std_ulogic;
stopped_out : out std_ulogic;

gpr_bypass_a : out std_ulogic_vector(1 downto 0);
gpr_bypass_b : out std_ulogic_vector(1 downto 0);
gpr_bypass_c : out std_ulogic_vector(1 downto 0);
-- Note on gpr_bypass_*: bits 1 to 3 are a 1-hot encoding of which
-- bypass source we may possibly need to use; bit 0 is 1 if the bypass
-- value should be used (i.e. any of bits 1-3 are 1 and the
-- corresponding gpr_x_read_valid_in is also 1).
gpr_bypass_a : out std_ulogic_vector(3 downto 0);
gpr_bypass_b : out std_ulogic_vector(3 downto 0);
gpr_bypass_c : out std_ulogic_vector(3 downto 0);
cr_bypass : out std_ulogic_vector(1 downto 0);

instr_tag_out : out instr_tag_t
@ -152,9 +156,9 @@ begin
variable tag_s : instr_tag_t;
variable tag_t : instr_tag_t;
variable incr_tag : tag_number_t;
variable byp_a : std_ulogic_vector(1 downto 0);
variable byp_b : std_ulogic_vector(1 downto 0);
variable byp_c : std_ulogic_vector(1 downto 0);
variable byp_a : std_ulogic_vector(3 downto 0);
variable byp_b : std_ulogic_vector(3 downto 0);
variable byp_c : std_ulogic_vector(3 downto 0);
variable tag_cr : instr_tag_t;
variable byp_cr : std_ulogic_vector(1 downto 0);
variable tag_ov : instr_tag_t;
@ -163,57 +167,66 @@ begin
tag_a := instr_tag_init;
for i in tag_number_t loop
if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_a_read_in then
tag_a.valid := gpr_a_read_valid_in;
tag_a.valid := '1';
tag_a.tag := i;
end if;
end loop;
tag_b := instr_tag_init;
for i in tag_number_t loop
if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_b_read_in then
tag_b.valid := gpr_b_read_valid_in;
tag_b.valid := '1';
tag_b.tag := i;
end if;
end loop;
tag_c := instr_tag_init;
for i in tag_number_t loop
if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_c_read_in then
tag_c.valid := gpr_c_read_valid_in;
tag_c.valid := '1';
tag_c.tag := i;
end if;
end loop;

byp_a := "00";
byp_a := "0000";
if EX1_BYPASS and tag_match(execute_next_tag, tag_a) then
byp_a := "01";
elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_a) then
byp_a := "10";
elsif tag_match(complete_in, tag_a) then
byp_a := "11";
byp_a(1) := '1';
end if;
byp_b := "00";
if EX1_BYPASS and tag_match(execute2_next_tag, tag_a) then
byp_a(2) := '1';
end if;
if tag_match(complete_in, tag_a) then
byp_a(3) := '1';
end if;
byp_a(0) := gpr_a_read_valid_in and (byp_a(1) or byp_a(2) or byp_a(3));
byp_b := "0000";
if EX1_BYPASS and tag_match(execute_next_tag, tag_b) then
byp_b := "01";
elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_b) then
byp_b := "10";
elsif tag_match(complete_in, tag_b) then
byp_b := "11";
byp_b(1) := '1';
end if;
if EX1_BYPASS and tag_match(execute2_next_tag, tag_b) then
byp_b(2) := '1';
end if;
byp_c := "00";
if tag_match(complete_in, tag_b) then
byp_b(3) := '1';
end if;
byp_b(0) := gpr_b_read_valid_in and (byp_b(1) or byp_b(2) or byp_b(3));
byp_c := "0000";
if EX1_BYPASS and tag_match(execute_next_tag, tag_c) then
byp_c := "01";
elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_c) then
byp_c := "10";
elsif tag_match(complete_in, tag_c) then
byp_c := "11";
byp_c(1) := '1';
end if;
if EX1_BYPASS and tag_match(execute2_next_tag, tag_c) then
byp_c(2) := '1';
end if;
if tag_match(complete_in, tag_c) then
byp_c(3) := '1';
end if;
byp_c(0) := gpr_c_read_valid_in and (byp_c(1) or byp_c(2) or byp_c(3));

gpr_bypass_a <= byp_a;
gpr_bypass_b <= byp_b;
gpr_bypass_c <= byp_c;

gpr_tag_stall <= (tag_a.valid and not (or (byp_a))) or
(tag_b.valid and not (or (byp_b))) or
(tag_c.valid and not (or (byp_c)));
gpr_tag_stall <= (tag_a.valid and gpr_a_read_valid_in and not byp_a(0)) or
(tag_b.valid and gpr_b_read_valid_in and not byp_b(0)) or
(tag_c.valid and gpr_c_read_valid_in and not byp_c(0));

incr_tag := curr_tag;
instr_tag.tag <= curr_tag;

@ -31,6 +31,9 @@ entity core is
-- Alternate reset (0xffff0000) for use by DRAM init fw
alt_reset : in std_ulogic;

-- Global timebase
timebase : in std_ulogic_vector(63 downto 0);

-- Wishbone interface
wishbone_insn_in : in wishbone_slave_out;
wishbone_insn_out : out wishbone_master_out;
@ -373,6 +376,7 @@ begin
port map (
clk => clk,
rst => rst_ex1,
timebase => timebase,
flush_in => flush,
busy_out => ex1_busy_out,
e_in => decode2_to_execute1,

@ -201,6 +201,23 @@ architecture behaviour of decode2 is
end case;
end;

-- AND-OR multiplexer for three 64-bit sources.
-- Each value is gated by its own one-bit mask and the gated values are
-- combined: val_a is taken as-is when mask_a is '1', and val_b / val_c are
-- OR-ed in when their masks are '1'.  With no mask set the result is zero.
function andor (mask_a : std_ulogic; val_a : std_ulogic_vector(63 downto 0);
                mask_b : std_ulogic; val_b : std_ulogic_vector(63 downto 0);
                mask_c : std_ulogic; val_c : std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
    variable acc : std_ulogic_vector(63 downto 0);
begin
    -- First source is copied rather than OR-ed into zero
    if mask_a = '1' then
        acc := val_a;
    else
        acc := (others => '0');
    end if;
    -- Remaining sources accumulate onto whatever has been selected so far
    if mask_b = '1' then
        acc := acc or val_b;
    end if;
    if mask_c = '1' then
        acc := acc or val_c;
    end if;
    return acc;
end;

-- control signals that are derived from insn_type
type mux_select_array_t is array(insn_type_t) of std_ulogic_vector(2 downto 0);

@ -210,7 +227,6 @@ architecture behaviour of decode2 is
OP_PRTY => "001",
OP_CMPB => "001",
OP_EXTS => "001",
OP_BPERM => "001",
OP_BREV => "001",
OP_BCD => "001",
OP_MTSPR => "001",
@ -239,6 +255,7 @@ architecture behaviour of decode2 is
OP_DIVE => "101",
OP_MOD => "101",
OP_BSORT => "100",
OP_BPERM => "100",
OP_ADDG6S => "001", -- misc_result
OP_ISEL => "010",
OP_DARN => "011",
@ -269,15 +286,15 @@ architecture behaviour of decode2 is

signal gpr_a_read_valid : std_ulogic;
signal gpr_a_read : gspr_index_t;
signal gpr_a_bypass : std_ulogic_vector(1 downto 0);
signal gpr_a_bypass : std_ulogic_vector(3 downto 0);

signal gpr_b_read_valid : std_ulogic;
signal gpr_b_read : gspr_index_t;
signal gpr_b_bypass : std_ulogic_vector(1 downto 0);
signal gpr_b_bypass : std_ulogic_vector(3 downto 0);

signal gpr_c_read_valid : std_ulogic;
signal gpr_c_read : gspr_index_t;
signal gpr_c_bypass : std_ulogic_vector(1 downto 0);
signal gpr_c_bypass : std_ulogic_vector(3 downto 0);

signal cr_read_valid : std_ulogic;
signal cr_write_valid : std_ulogic;
@ -656,6 +673,14 @@ begin
v.e.illegal_suffix := d_in.illegal_suffix;
v.e.misaligned_prefix := d_in.misaligned_prefix;

-- rotator control signals
v.e.right_shift := '1' when op = OP_SHR else '0';
v.e.rot_clear_left := '1' when op = OP_RLC or op = OP_RLCL else '0';
v.e.rot_clear_right := '1' when op = OP_RLC or op = OP_RLCR else '0';
v.e.rot_sign_ext := '1' when op = OP_EXTSWSLI else '0';

v.e.do_popcnt := '1' when op = OP_COUNTB and d_in.insn(7 downto 6) = "11" else '0';

-- check for invalid forms that cause an illegal instruction interrupt
-- Does RA = RT for a load quadword instr, or RB = RT for lqarx?
if d_in.decode.repeat = DRTP and
@ -694,53 +719,38 @@ begin
ov_write_valid <= v.output_ov;

-- See if any of the operands can get their value via the bypass path.
if dc2.busy = '0' or gpr_a_bypass /= "00" then
case gpr_a_bypass is
when "01" =>
v.e.read_data1 := execute_bypass.data;
when "10" =>
v.e.read_data1 := execute2_bypass.data;
when "11" =>
v.e.read_data1 := writeback_bypass.data;
when others =>
if decoded_reg_a.reg_valid = '1' then
v.e.read_data1 := r_in.read1_data;
else
v.e.read_data1 := decoded_reg_a.data;
end if;
end case;
if gpr_a_bypass(0) = '1' then
v.e.read_data1 := andor(gpr_a_bypass(1), execute_bypass.data,
gpr_a_bypass(2), execute2_bypass.data,
gpr_a_bypass(3), writeback_bypass.data);
elsif dc2.busy = '0' then
if decoded_reg_a.reg_valid = '1' then
v.e.read_data1 := r_in.read1_data;
else
v.e.read_data1 := decoded_reg_a.data;
end if;
end if;
if dc2.busy = '0' or gpr_b_bypass /= "00" then
case gpr_b_bypass is
when "01" =>
v.e.read_data2 := execute_bypass.data;
when "10" =>
v.e.read_data2 := execute2_bypass.data;
when "11" =>
v.e.read_data2 := writeback_bypass.data;
when others =>
if decoded_reg_b.reg_valid = '1' then
v.e.read_data2 := r_in.read2_data;
else
v.e.read_data2 := decoded_reg_b.data;
end if;
end case;
if gpr_b_bypass(0) = '1' then
v.e.read_data2 := andor(gpr_b_bypass(1), execute_bypass.data,
gpr_b_bypass(2), execute2_bypass.data,
gpr_b_bypass(3), writeback_bypass.data);
elsif dc2.busy = '0' then
if decoded_reg_b.reg_valid = '1' then
v.e.read_data2 := r_in.read2_data;
else
v.e.read_data2 := decoded_reg_b.data;
end if;
end if;
if dc2.busy = '0' or gpr_c_bypass /= "00" then
case gpr_c_bypass is
when "01" =>
v.e.read_data3 := execute_bypass.data;
when "10" =>
v.e.read_data3 := execute2_bypass.data;
when "11" =>
v.e.read_data3 := writeback_bypass.data;
when others =>
if decoded_reg_c.reg_valid = '1' then
v.e.read_data3 := r_in.read3_data;
else
v.e.read_data3 := decoded_reg_c.data;
end if;
end case;
if gpr_c_bypass(0) = '1' then
v.e.read_data3 := andor(gpr_c_bypass(1), execute_bypass.data,
gpr_c_bypass(2), execute2_bypass.data,
gpr_c_bypass(3), writeback_bypass.data);
elsif dc2.busy = '0' then
if decoded_reg_c.reg_valid = '1' then
v.e.read_data3 := r_in.read3_data;
else
v.e.read_data3 := decoded_reg_c.data;
end if;
end if;

case cr_bypass is

@ -34,6 +34,8 @@ entity execute1 is
ext_irq_in : std_ulogic;
interrupt_in : WritebackToExecute1Type;

timebase : std_ulogic_vector(63 downto 0);

-- asynchronous
l_out : out Execute1ToLoadstore1Type;
fp_out : out Execute1ToFPUType;
@ -116,6 +118,7 @@ architecture behaviour of execute1 is
start_mul : std_ulogic;
start_div : std_ulogic;
start_bsort : std_ulogic;
start_bperm : std_ulogic;
do_trace : std_ulogic;
ciabr_trace : std_ulogic;
fp_intr : std_ulogic;
@ -150,6 +153,7 @@ architecture behaviour of execute1 is
mul_finish : std_ulogic;
div_in_progress : std_ulogic;
bsort_in_progress : std_ulogic;
bperm_in_progress : std_ulogic;
no_instr_avail : std_ulogic;
instr_dispatch : std_ulogic;
ext_interrupt : std_ulogic;
@ -174,7 +178,7 @@ architecture behaviour of execute1 is
spr_select => spr_id_init, pmu_spr_num => 5x"0",
redir_to_next => '0', advance_nia => '0', lr_from_next => '0',
mul_in_progress => '0', mul_finish => '0', div_in_progress => '0',
bsort_in_progress => '0',
bsort_in_progress => '0', bperm_in_progress => '0',
no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0',
taken_branch_event => '0', br_mispredict => '0',
msr => 64x"0",
@ -206,12 +210,9 @@ architecture behaviour of execute1 is
signal valid_in : std_ulogic;
signal ctrl: ctrl_t := ctrl_t_init;
signal ctrl_tmp: ctrl_t := ctrl_t_init;
signal right_shift, rot_clear_left, rot_clear_right: std_ulogic;
signal rot_sign_ext: std_ulogic;
signal rotator_result: std_ulogic_vector(63 downto 0);
signal rotator_carry: std_ulogic;
signal logical_result: std_ulogic_vector(63 downto 0);
signal do_popcnt: std_ulogic;
signal countbits_result: std_ulogic_vector(63 downto 0);
signal alu_result: std_ulogic_vector(63 downto 0);
signal adder_result: std_ulogic_vector(63 downto 0);
@ -245,6 +246,8 @@ architecture behaviour of execute1 is
-- bit-sort unit signals
signal bsort_start : std_ulogic;
signal bsort_done : std_ulogic;
signal bperm_start : std_ulogic;
signal bperm_done : std_ulogic;

-- random number generator signals
signal random_raw : std_ulogic_vector(63 downto 0);
@ -448,11 +451,11 @@ begin
shift => b_in(6 downto 0),
insn => e_in.insn,
is_32bit => e_in.is_32bit,
right_shift => right_shift,
right_shift => e_in.right_shift,
arith => e_in.is_signed,
clear_left => rot_clear_left,
clear_right => rot_clear_right,
sign_ext_rs => rot_sign_ext,
clear_left => e_in.rot_clear_left,
clear_right => e_in.rot_clear_right,
sign_ext_rs => e_in.rot_sign_ext,
result => rotator_result,
carry_out => rotator_carry
);
@ -476,7 +479,7 @@ begin
stall => stage2_stall,
count_right => e_in.insn(10),
is_32bit => e_in.is_32bit,
do_popcnt => do_popcnt,
do_popcnt => e_in.do_popcnt,
datalen => e_in.data_len,
result => countbits_result
);
@ -515,6 +518,8 @@ begin
go => bsort_start,
opc => e_in.insn(7 downto 6),
done => bsort_done,
do_bperm => bperm_start,
bperm_done => bperm_done,
result => bsort_result
);

@ -1147,7 +1152,7 @@ begin
-- side-effect flags or write enables when generating a trap).
-- With v.trap = 1 we will assert both ex1.e.valid and ex1.e.interrupt
-- to writeback, and it will complete the instruction and take
-- and interrupt. It is OK for v.trap to depend on operand data.
-- an interrupt. It is OK for v.trap to depend on operand data.

illegal := '0';
privileged := '0';
@ -1228,7 +1233,7 @@ begin
when OP_CMPRB =>
when OP_CMPEQB =>
when OP_LOGIC | OP_XOR | OP_PRTY | OP_CMPB | OP_EXTS |
OP_BPERM | OP_BREV | OP_BCD =>
OP_BREV | OP_BCD =>

when OP_B =>
v.take_branch := '1';
@ -1433,6 +1438,11 @@ begin
slow_op := '1';
owait := '1';

when OP_BPERM =>
v.start_bperm := '1';
slow_op := '1';
owait := '1';

when OP_MUL_L64 =>
if e_in.is_32bit = '1' then
v.se.mult_32s := '1';
@ -1585,7 +1595,7 @@ begin

if e_in.unit = ALU then
v.complete := e_in.valid and not v.exception and not owait;
v.bypass_valid := e_in.valid and not v.exception and not slow_op;
v.bypass_valid := e_in.valid and not slow_op;
end if;

actions <= v;
@ -1631,18 +1641,10 @@ begin
v.taken_branch_event := '0';
v.br_mispredict := '0';
v.busy := '0';
bypass_valid := '0';
bypass_valid := actions.bypass_valid;

irq_valid := ex1.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in);

-- rotator control signals
right_shift <= '1' when e_in.insn_type = OP_SHR else '0';
rot_clear_left <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0';
rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0';
rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0';

do_popcnt <= '1' when e_in.insn_type = OP_COUNTB and e_in.insn(7 downto 6) = "11" else '0';

if valid_in = '1' then
v.prev_op := e_in.insn_type;
v.prev_prefixed := e_in.prefixed;
@ -1706,7 +1708,6 @@ begin
if go = '1' then
v.se := actions.se;
v.e.valid := actions.complete;
bypass_valid := actions.bypass_valid;
v.taken_branch_event := actions.take_branch;
v.trace_next := actions.do_trace or actions.ciabr_trace;
v.trace_ciabr := actions.ciabr_trace;
@ -1719,6 +1720,7 @@ begin
x_to_divider.valid <= actions.start_div;
v.div_in_progress := actions.start_div;
v.bsort_in_progress := actions.start_bsort;
v.bperm_in_progress := actions.start_bperm;
v.br_mispredict := v.e.redirect and actions.direct_branch;
v.advance_nia := actions.advance_nia;
v.redir_to_next := actions.redir_to_next;
@ -1729,7 +1731,8 @@ begin
-- multiply is happening in order to stop following
-- instructions from using the wrong XER value
-- (and for simplicity in the OE=0 case).
v.busy := actions.start_div or actions.start_mul or actions.start_bsort;
v.busy := actions.start_div or actions.start_mul or
actions.start_bsort or actions.start_bperm;

-- instruction for other units, i.e. LDST
if e_in.unit = LDST then
@ -1741,6 +1744,7 @@ begin
end if;
is_scv := go and actions.se.scv_trap;
bsort_start <= go and actions.start_bsort;
bperm_start <= go and actions.start_bperm;
pmu_trace <= go and actions.do_trace;

if not HAS_FPU and ex1.div_in_progress = '1' then
@ -1781,6 +1785,13 @@ begin
v.e.write_data := alu_result;
bypass_valid := bsort_done;
end if;
if ex1.bperm_in_progress = '1' then
v.bperm_in_progress := not bperm_done;
v.e.valid := bperm_done;
v.busy := not bperm_done;
v.e.write_data := alu_result;
bypass_valid := bperm_done;
end if;

if v.e.write_xerc_enable = '1' and v.e.valid = '1' then
v.xerc := v.e.xerc;
@ -1814,13 +1825,13 @@ begin
v.fp_exception_next := '0';
end if;

bypass_data.tag.valid <= v.e.write_enable and bypass_valid;
bypass_data.tag.tag <= v.e.instr_tag.tag;
bypass_data.tag.valid <= e_in.write_reg_enable and bypass_valid;
bypass_data.tag.tag <= e_in.instr_tag.tag;
bypass_data.data <= alu_result;

bypass_cr_data.tag.valid <= v.e.write_cr_enable and bypass_valid;
bypass_cr_data.tag.tag <= v.e.instr_tag.tag;
bypass_cr_data.data <= v.e.write_cr_data;
bypass_cr_data.tag.valid <= e_in.output_cr and bypass_valid;
bypass_cr_data.tag.tag <= e_in.instr_tag.tag;
bypass_cr_data.data <= write_cr_data;

-- Outputs to loadstore1 (async)
lv.op := e_in.insn_type;
@ -1881,8 +1892,8 @@ begin

-- Slow SPR read mux
with ex1.spr_select.sel select spr_result <=
ctrl.tb when SPRSEL_TB,
32x"0" & ctrl.tb(63 downto 32) when SPRSEL_TBU,
timebase when SPRSEL_TB,
32x"0" & timebase(63 downto 32) when SPRSEL_TBU,
ctrl.dec when SPRSEL_DEC,
32x"0" & PVR_MICROWATT when SPRSEL_PVR,
log_wr_addr & ex2.log_addr_spr when SPRSEL_LOGA,
@ -1936,16 +1947,14 @@ begin
end if;

ctrl_tmp <= ctrl;
-- FIXME: run at 512MHz not core freq
ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
ctrl_tmp.dec <= std_ulogic_vector(unsigned(ctrl.dec) - 1);

x_to_pmu.mfspr <= '0';
x_to_pmu.mtspr <= '0';
x_to_pmu.tbbits(3) <= ctrl.tb(63 - 47);
x_to_pmu.tbbits(2) <= ctrl.tb(63 - 51);
x_to_pmu.tbbits(1) <= ctrl.tb(63 - 55);
x_to_pmu.tbbits(0) <= ctrl.tb(63 - 63);
x_to_pmu.tbbits(3) <= timebase(63 - 47);
x_to_pmu.tbbits(2) <= timebase(63 - 51);
x_to_pmu.tbbits(1) <= timebase(63 - 55);
x_to_pmu.tbbits(0) <= timebase(63 - 63);
x_to_pmu.pmm_msr <= ctrl.msr(MSR_PMM);
x_to_pmu.pr_msr <= ctrl.msr(MSR_PR);


@ -10,6 +10,7 @@ use work.wishbone_types.all;

entity toplevel is
generic (
CPUS : natural := 1;
MEMORY_SIZE : integer := 16384;
RAM_INIT_FILE : string := "firmware.hex";
RESET_LOW : boolean := true;
@ -241,6 +242,7 @@ begin
MEMORY_SIZE => BRAM_SIZE,
RAM_INIT_FILE => RAM_INIT_FILE,
SIM => false,
NCPUS => CPUS,
CLK_FREQ => CLK_FREQUENCY,
HAS_FPU => HAS_FPU,
HAS_BTC => HAS_BTC,

@ -65,7 +65,8 @@
#define SYS_REG_UART_IS_16550 (1ull << 32)
#define SYS_REG_GIT_INFO 0x50
#define SYS_REG_GIT_IS_DIRTY (1ull << 63)

#define SYS_REG_CPU_CTRL 0x58
#define SYS_REG_CPU_CTRL_ENABLE 0xff

/*
* Register definitions for the potato UART

@ -52,6 +52,8 @@ architecture behave of loadstore1 is
MMU_WAIT -- waiting for MMU to finish doing something
);

constant num_dawr : positive := 2;

type byte_index_t is array(0 to 7) of unsigned(2 downto 0);
subtype byte_trim_t is std_ulogic_vector(1 downto 0);
type trim_ctl_t is array(0 to 7) of byte_trim_t;
@ -130,6 +132,9 @@ architecture behave of loadstore1 is
busy : std_ulogic;
issued : std_ulogic;
addr0 : std_ulogic_vector(63 downto 0);
dawr_ll : std_ulogic_vector(num_dawr-1 downto 0);
dawr_ul : std_ulogic_vector(num_dawr-1 downto 0);
dawr_ud : std_ulogic;
end record;

type reg_stage2_t is record
@ -147,7 +152,6 @@ architecture behave of loadstore1 is
dbg_spr_ack: std_ulogic;
end record;

constant num_dawr : positive := 2;
type dawr_array_t is array(0 to num_dawr - 1) of std_ulogic_vector(63 downto 3);
type dawrx_array_t is array(0 to num_dawr - 1) of std_ulogic_vector(15 downto 0);

@ -335,6 +339,9 @@ begin
r1.req.sprsel <= "000";
r1.req.ric <= "00";
r1.req.xerc <= xerc_init;
r1.dawr_ll <= (others => '0');
r1.dawr_ul <= (others => '0');
r1.dawr_ud <= '0';

r2.req.valid <= '0';
r2.busy <= '0';
@ -617,6 +624,9 @@ begin
variable req : request_t;
variable dcreq : std_ulogic;
variable issue : std_ulogic;
variable addr : std_ulogic_vector(63 downto 3);
variable addl : unsigned(64 downto 3);
variable addu : unsigned(64 downto 3);
begin
v := r1;
issue := '0';
@ -661,6 +671,20 @@ begin
end if;
end if;

-- Do subtractions for DAWR0/1 matches
for i in 0 to 1 loop
addr := req.addr(63 downto 3);
if req.priv_mode = '1' and r3.dawrx(i)(7) = '1' then
-- HRAMMC=1 => trim top bit from address
addr(63) := '0';
end if;
addl := unsigned('0' & addr) - unsigned('0' & r3.dawr(i));
addu := unsigned('0' & r3.dawr_uplim(i)) - unsigned('0' & addr);
v.dawr_ll(i) := addl(64);
v.dawr_ul(i) := addu(64);
end loop;
v.dawr_ud := r3.dawr_upd;

if flush = '1' then
v.req.valid := '0';
v.req.dc_req := '0';
@ -702,9 +726,6 @@ begin
variable sprsel : std_ulogic_vector(2 downto 0);
variable sprval : std_ulogic_vector(63 downto 0);
variable dawr_match : std_ulogic;
variable addr : std_ulogic_vector(63 downto 3);
variable addl : unsigned(64 downto 3);
variable addu : unsigned(64 downto 3);
begin
v := r2;

@ -724,14 +745,7 @@ begin
-- Test for DAWR0/1 matches
dawr_match := '0';
for i in 0 to 1 loop
addr := r1.req.addr(63 downto 3);
if r1.req.priv_mode = '1' and r3.dawrx(i)(7) = '1' then
-- HRAMMC=1 => trim top bit from address
addr(63) := '0';
end if;
addl := unsigned('0' & addr) - unsigned('0' & r3.dawr(i));
addu := unsigned('0' & r3.dawr_uplim(i)) - unsigned('0' & addr);
if addl(64) = '0' and addu(64) = '0' and
if r1.dawr_ll(i) = '0' and r1.dawr_ul(i) = '0' and r1.dawr_ud = '0' and
dawrx_match_enable(r3.dawrx(i), r1.req.virt_mode,
r1.req.priv_mode, r1.req.store) then
dawr_match := r1.req.valid and r1.req.dc_req and not r3.dawr_upd and

@ -23,7 +23,6 @@ architecture behaviour of logical is

signal par0, par1 : std_ulogic;
signal parity : std_ulogic_vector(63 downto 0);
signal permute : std_ulogic_vector(7 downto 0);

function bcd_to_dpd(bcd: std_ulogic_vector(11 downto 0)) return std_ulogic_vector is
variable dpd: std_ulogic_vector(9 downto 0);
@ -109,16 +108,6 @@ begin
parity(32) <= par1;
end if;

-- bit permutation
for i in 0 to 7 loop
j := i * 8;
if rs(j+7 downto j+6) = "00" then
permute(i) <= rb(to_integer(unsigned(not rs(j+5 downto j))));
else
permute(i) <= '0';
end if;
end loop;

rb_adj := rb;
if invert_in = '1' then
rb_adj := not rb;
@ -157,8 +146,6 @@ begin
tmp := parity;
when OP_CMPB =>
tmp := ppc_cmpb(rs, rb);
when OP_BPERM =>
tmp := std_ulogic_vector(resize(unsigned(permute), 64));
when OP_BCD =>
-- invert_in is abused to indicate direction of conversion
if invert_in = '0' then

@ -335,6 +335,7 @@ targets:
default_tool: vivado
filesets: [core, arty_a7, soc, fpga, debug_xilinx, litedram, liteeth, uart16550, xilinx_specific, litesdcard]
parameters:
- cpus
- memory_size
- ram_init_file
- use_litedram=true
@ -496,6 +497,12 @@ generate:
parameters: {vendor : xilinx, frequency : 100e6}

parameters:
cpus:
datatype : int
description : Number of CPU cores to include in the SoC.
paramtype : generic
default : 1

memory_size:
datatype : int
description : On-chip memory size (bytes). If no_bram is set, this is the size carved out for the DRAM payload

@ -24,28 +24,30 @@
#define DBG_WB_DATA 0x01
#define DBG_WB_CTRL 0x02

#define DBG_CORE_CTRL 0x10
unsigned int core;

#define DBG_CORE_CTRL (0x10 + (core << 4))
#define DBG_CORE_CTRL_STOP (1 << 0)
#define DBG_CORE_CTRL_RESET (1 << 1)
#define DBG_CORE_CTRL_ICRESET (1 << 2)
#define DBG_CORE_CTRL_STEP (1 << 3)
#define DBG_CORE_CTRL_START (1 << 4)

#define DBG_CORE_STAT 0x11
#define DBG_CORE_STAT (0x11 + (core << 4))
#define DBG_CORE_STAT_STOPPING (1 << 0)
#define DBG_CORE_STAT_STOPPED (1 << 1)
#define DBG_CORE_STAT_TERM (1 << 2)

#define DBG_CORE_NIA 0x12
#define DBG_CORE_MSR 0x13
#define DBG_CORE_NIA (0x12 + (core << 4))
#define DBG_CORE_MSR (0x13 + (core << 4))

#define DBG_CORE_GSPR_INDEX 0x14
#define DBG_CORE_GSPR_DATA 0x15
#define DBG_CORE_GSPR_INDEX (0x14 + (core << 4))
#define DBG_CORE_GSPR_DATA (0x15 + (core << 4))

#define DBG_LOG_ADDR 0x16
#define DBG_LOG_DATA 0x17
#define DBG_LOG_TRIGGER 0x18
#define DBG_LOG_MTRIGGER 0x19
#define DBG_LOG_ADDR (0x16 + (core << 4))
#define DBG_LOG_DATA (0x17 + (core << 4))
#define DBG_LOG_TRIGGER (0x18 + (core << 4))
#define DBG_LOG_MTRIGGER (0x19 + (core << 4))

static bool debug;

@ -507,7 +509,7 @@ static void core_status(void)
statstr2 = " (terminated)";
} else if (stat & DBG_CORE_STAT_TERM)
statstr = "odd state (TERM but no STOP)";
printf("Core: %s%s\n", statstr, statstr2);
printf("Core%u: %s%s\n", core, statstr, statstr2);
printf(" NIA: %016" PRIx64 "\n", nia);
printf(" MSR: %016" PRIx64 "\n", msr);
}
@ -792,7 +794,7 @@ static void mtrig_set(uint64_t addr)

static void usage(const char *cmd)
{
fprintf(stderr, "Usage: %s -b <jtag|ecp5|sim> <command> <args>\n", cmd);
fprintf(stderr, "Usage: %s -b <jtag|ecp5|sim> [-c core#] <command> <args>\n", cmd);

fprintf(stderr, "\n");
fprintf(stderr, " CPU core:\n");
@ -851,12 +853,20 @@ int main(int argc, char *argv[])
{ "target", required_argument, 0, 't' },
{ "debug", no_argument, 0, 'd' },
{ "frequency", no_argument, 0, 's' },
{ "core", required_argument, 0, 'c' },
{ 0, 0, 0, 0 }
};
c = getopt_long(argc, argv, "dhb:t:s:", lopts, &oindex);
c = getopt_long(argc, argv, "dhb:t:s:c:", lopts, &oindex);
if (c < 0)
break;
switch(c) {
case 'c':
core = atoi(optarg);
if (core >= 15) {
fprintf(stderr, "Core number out of range (max 14)\n");
exit(1);
}
break;
case 'h':
usage(progname);
break;

@ -67,6 +67,7 @@ entity soc is
RAM_INIT_FILE : string;
CLK_FREQ : positive;
SIM : boolean;
NCPUS : positive := 1;
HAS_FPU : boolean := true;
HAS_BTC : boolean := true;
DISABLE_FLATTEN_CORE : boolean := false;
@ -148,20 +149,18 @@ end entity soc;

architecture behaviour of soc is

subtype cpu_index_t is natural range 0 to NCPUS-1;
type dword_percpu_array is array(cpu_index_t) of std_ulogic_vector(63 downto 0);

-- internal reset
signal soc_reset : std_ulogic;

-- Wishbone master signals:
signal wishbone_dcore_in : wishbone_slave_out;
signal wishbone_dcore_out : wishbone_master_out;
signal wishbone_icore_in : wishbone_slave_out;
signal wishbone_icore_out : wishbone_master_out;
signal wishbone_debug_in : wishbone_slave_out;
signal wishbone_debug_out : wishbone_master_out;

-- Arbiter array (ghdl doesnt' support assigning the array
-- elements in the entity instantiation)
constant NUM_WB_MASTERS : positive := 4;
signal wishbone_debug_in : wishbone_slave_out;
signal wishbone_debug_out : wishbone_master_out;

-- Arbiter array
constant NUM_WB_MASTERS : positive := NCPUS * 2 + 2;
signal wb_masters_out : wishbone_master_out_vector(0 to NUM_WB_MASTERS-1);
signal wb_masters_in : wishbone_slave_out_vector(0 to NUM_WB_MASTERS-1);

@ -180,7 +179,7 @@ architecture behaviour of soc is

-- Syscon signals
signal dram_at_0 : std_ulogic;
signal do_core_reset : std_ulogic;
signal do_core_reset : std_ulogic_vector(NCPUS-1 downto 0);
signal alt_reset : std_ulogic;
signal wb_syscon_in : wb_io_master_out;
signal wb_syscon_out : wb_io_slave_out;
@ -210,7 +209,7 @@ architecture behaviour of soc is
signal wb_xics_ics_out : wb_io_slave_out;
signal int_level_in : std_ulogic_vector(15 downto 0);
signal ics_to_icp : ics_to_icp_t;
signal core_ext_irq : std_ulogic;
signal core_ext_irq : std_ulogic_vector(NCPUS-1 downto 0) := (others => '0');

-- GPIO signals:
signal wb_gpio_in : wb_io_master_out;
@ -233,12 +232,12 @@ architecture behaviour of soc is
signal dmi_wb_dout : std_ulogic_vector(63 downto 0);
signal dmi_wb_req : std_ulogic;
signal dmi_wb_ack : std_ulogic;
signal dmi_core_dout : std_ulogic_vector(63 downto 0);
signal dmi_core_req : std_ulogic;
signal dmi_core_ack : std_ulogic;
signal dmi_core_dout : dword_percpu_array;
signal dmi_core_req : std_ulogic_vector(NCPUS-1 downto 0);
signal dmi_core_ack : std_ulogic_vector(NCPUS-1 downto 0);

-- Delayed/latched resets and alt_reset
signal rst_core : std_ulogic;
signal rst_core : std_ulogic_vector(NCPUS-1 downto 0);
signal rst_uart : std_ulogic;
signal rst_xics : std_ulogic;
signal rst_spi : std_ulogic;
@ -270,6 +269,10 @@ architecture behaviour of soc is
signal io_cycle_gpio : std_ulogic;
signal io_cycle_external : std_ulogic;

signal core_run_out : std_ulogic_vector(NCPUS-1 downto 0);

signal timebase : std_ulogic_vector(63 downto 0);

function wishbone_widen_data(wb : wb_io_master_out) return wishbone_master_out is
variable wwb : wishbone_master_out;
begin
@ -334,7 +337,9 @@ begin
resets: process(system_clk)
begin
if rising_edge(system_clk) then
rst_core <= soc_reset or do_core_reset;
for i in 0 to NCPUS-1 loop
rst_core(i) <= soc_reset or do_core_reset(i);
end loop;
rst_uart <= soc_reset;
rst_spi <= soc_reset;
rst_xics <= soc_reset;
@ -347,11 +352,27 @@ begin
end if;
end process;

-- Processor core
processor: entity work.core
-- Timebase just increments at the system clock frequency.
-- There is currently no way to set it.
-- Ideally it would (appear to) run at 512MHz like IBM POWER systems,
-- but Linux seems to cope OK with it being 100MHz or whatever.
tbase: process(system_clk)
begin
    if rising_edge(system_clk) then
        -- Count by default; the reset assignment below overrides this one
        -- because the last assignment to a signal within a process wins.
        timebase <= std_ulogic_vector(unsigned(timebase) + 1);
        if soc_reset = '1' then
            timebase <= (others => '0');
        end if;
    end if;
end process;

-- Processor cores
processors: for i in 0 to NCPUS-1 generate
core: entity work.core
generic map(
SIM => SIM,
CPU_INDEX => 0,
CPU_INDEX => i,
HAS_FPU => HAS_FPU,
HAS_BTC => HAS_BTC,
DISABLE_FLATTEN => DISABLE_FLATTEN_CORE,
@ -367,32 +388,32 @@ begin
)
port map(
clk => system_clk,
rst => rst_core,
rst => rst_core(i),
alt_reset => alt_reset_d,
run_out => run_out,
wishbone_insn_in => wishbone_icore_in,
wishbone_insn_out => wishbone_icore_out,
wishbone_data_in => wishbone_dcore_in,
wishbone_data_out => wishbone_dcore_out,
run_out => core_run_out(i),
timebase => timebase,
wishbone_insn_in => wb_masters_in(i + NCPUS),
wishbone_insn_out => wb_masters_out(i + NCPUS),
wishbone_data_in => wb_masters_in(i),
wishbone_data_out => wb_masters_out(i),
wb_snoop_in => wb_snoop,
dmi_addr => dmi_addr(3 downto 0),
dmi_dout => dmi_core_dout,
dmi_dout => dmi_core_dout(i),
dmi_din => dmi_dout,
dmi_wr => dmi_wr,
dmi_ack => dmi_core_ack,
dmi_req => dmi_core_req,
ext_irq => core_ext_irq
dmi_ack => dmi_core_ack(i),
dmi_req => dmi_core_req(i),
ext_irq => core_ext_irq(i)
);
end generate;

run_out <= or (core_run_out);

-- Wishbone bus master arbiter & mux
wb_masters_out <= (0 => wishbone_dcore_out,
1 => wishbone_icore_out,
2 => wishbone_widen_data(wishbone_dma_out),
3 => wishbone_debug_out);
wishbone_dcore_in <= wb_masters_in(0);
wishbone_icore_in <= wb_masters_in(1);
wishbone_dma_in <= wishbone_narrow_data(wb_masters_in(2), wishbone_dma_out.adr);
wishbone_debug_in <= wb_masters_in(3);
wb_masters_out(2*NCPUS) <= wishbone_widen_data(wishbone_dma_out);
wb_masters_out(2*NCPUS + 1) <= wishbone_debug_out;
wishbone_dma_in <= wishbone_narrow_data(wb_masters_in(2*NCPUS), wishbone_dma_out.adr);
wishbone_debug_in <= wb_masters_in(2*NCPUS + 1);
wishbone_arbiter_0: entity work.wishbone_arbiter
generic map(
NUM_MASTERS => NUM_WB_MASTERS
@ -780,6 +801,7 @@ begin
-- Syscon slave
syscon0: entity work.syscon
generic map(
NCPUS => NCPUS,
HAS_UART => true,
HAS_DRAM => HAS_DRAM,
BRAM_SIZE => MEMORY_SIZE,
@ -944,6 +966,9 @@ begin
end generate;

xics_icp: entity work.xics_icp
generic map(
NCPUS => NCPUS
)
port map(
clk => system_clk,
rst => rst_xics,
@ -955,6 +980,7 @@ begin

xics_ics: entity work.xics_ics
generic map(
NCPUS => NCPUS,
SRC_NUM => 16,
PRIO_BITS => 3
)
@ -1034,15 +1060,15 @@ begin
);

-- DMI interconnect
dmi_intercon: process(dmi_addr, dmi_req,
dmi_wb_ack, dmi_wb_dout,
dmi_core_ack, dmi_core_dout)
dmi_intercon: process(all)

-- DMI address map (each address is a full 64-bit register)
--
-- Offset: Size: Slave:
-- 0 4 Wishbone
-- 10 16 Core
-- 10 16 Core 0
-- 20 16 Core 1
-- ... and so on for NCPUS cores

type slave_type is (SLAVE_WB,
SLAVE_CORE,
@ -1053,25 +1079,29 @@ begin
slave := SLAVE_NONE;
if std_match(dmi_addr, "000000--") then
slave := SLAVE_WB;
elsif std_match(dmi_addr, "0001----") then
elsif not is_X(dmi_addr) and to_integer(unsigned(dmi_addr(7 downto 4))) <= NCPUS then
slave := SLAVE_CORE;
end if;

-- DMI muxing
dmi_wb_req <= '0';
dmi_core_req <= '0';
dmi_core_req <= (others => '0');
dmi_din <= (others => '1');
dmi_ack <= dmi_req;
case slave is
when SLAVE_WB =>
dmi_wb_req <= dmi_req;
dmi_ack <= dmi_wb_ack;
dmi_din <= dmi_wb_dout;
when SLAVE_CORE =>
dmi_core_req <= dmi_req;
dmi_ack <= dmi_core_ack;
dmi_din <= dmi_core_dout;
for i in 0 to NCPUS-1 loop
if not is_X(dmi_addr) and to_integer(unsigned(dmi_addr(7 downto 4))) = i + 1 then
dmi_core_req(i) <= dmi_req;
dmi_ack <= dmi_core_ack(i);
dmi_din <= dmi_core_dout(i);
end if;
end loop;
when others =>
dmi_ack <= dmi_req;
dmi_din <= (others => '1');
end case;

-- SIM magic exit

@ -9,6 +9,7 @@ use work.wishbone_types.all;

entity syscon is
generic (
NCPUS : positive := 1;
SIG_VALUE : std_ulogic_vector(63 downto 0) := x"f00daa5500010001";
CLK_FREQ : integer;
HAS_UART : boolean;
@ -33,7 +34,7 @@ entity syscon is

-- System control ports
dram_at_0 : out std_ulogic;
core_reset : out std_ulogic;
core_reset : out std_ulogic_vector(NCPUS-1 downto 0);
soc_reset : out std_ulogic;
alt_reset : out std_ulogic
);
@ -56,6 +57,7 @@ architecture behaviour of syscon is
constant SYS_REG_UART0_INFO : std_ulogic_vector(SYS_REG_BITS-1 downto 0) := "001000";
constant SYS_REG_UART1_INFO : std_ulogic_vector(SYS_REG_BITS-1 downto 0) := "001001";
constant SYS_REG_GIT_INFO : std_ulogic_vector(SYS_REG_BITS-1 downto 0) := "001010";
constant SYS_REG_CPU_CTRL : std_ulogic_vector(SYS_REG_BITS-1 downto 0) := "001011";

-- Muxed reg read signal
signal reg_out : std_ulogic_vector(63 downto 0);
@ -116,6 +118,7 @@ architecture behaviour of syscon is
signal reg_uart0info : std_ulogic_vector(63 downto 0);
signal reg_uart1info : std_ulogic_vector(63 downto 0);
signal reg_gitinfo : std_ulogic_vector(63 downto 0);
signal reg_cpuctrl : std_ulogic_vector(63 downto 0);
signal info_has_dram : std_ulogic;
signal info_has_bram : std_ulogic;
signal info_has_uart : std_ulogic;
@ -134,7 +137,8 @@ begin
-- Generated output signals
dram_at_0 <= '1' when BRAM_SIZE = 0 else reg_ctrl(SYS_REG_CTRL_DRAM_AT_0);
soc_reset <= reg_ctrl(SYS_REG_CTRL_SOC_RESET);
core_reset <= reg_ctrl(SYS_REG_CTRL_CORE_RESET);
core_reset <= not reg_cpuctrl(NCPUS-1 downto 0) when reg_ctrl(SYS_REG_CTRL_CORE_RESET) = '0'
else (others => '1');
alt_reset <= reg_ctrl(SYS_REG_CTRL_ALT_RESET);


@ -187,6 +191,8 @@ begin
55 downto 0 => GIT_HASH,
others => '0');

reg_cpuctrl(63 downto 8) <= std_ulogic_vector(to_unsigned(NCPUS, 56));

-- Wishbone response
wb_rsp.ack <= wishbone_in.cyc and wishbone_in.stb;
with wishbone_in.adr(SYS_REG_BITS downto 1) select reg_out <=
@ -201,6 +207,7 @@ begin
reg_uart0info when SYS_REG_UART0_INFO,
reg_uart1info when SYS_REG_UART1_INFO,
reg_gitinfo when SYS_REG_GIT_INFO,
reg_cpuctrl when SYS_REG_CPU_CTRL,
(others => '0') when others;
wb_rsp.dat <= reg_out(63 downto 32) when wishbone_in.adr(0) = '1' else
reg_out(31 downto 0);
@ -225,6 +232,7 @@ begin
if (rst) then
reg_ctrl <= (SYS_REG_CTRL_ALT_RESET => ctrl_init_alt_reset,
others => '0');
reg_cpuctrl(7 downto 0) <= x"01"; -- enable cpu 0 only
else
if wishbone_in.cyc and wishbone_in.stb and wishbone_in.we then
-- Change this if CTRL ever has more than 32 bits
@ -233,6 +241,10 @@ begin
reg_ctrl(SYS_REG_CTRL_BITS-1 downto 0) <=
wishbone_in.dat(SYS_REG_CTRL_BITS-1 downto 0);
end if;
if wishbone_in.adr(SYS_REG_BITS downto 1) = SYS_REG_CPU_CTRL and
wishbone_in.adr(0) = '0' and wishbone_in.sel(0) = '1' then
reg_cpuctrl(7 downto 0) <= wishbone_in.dat(7 downto 0);
end if;
end if;

-- Reset auto-clear

@ -4,7 +4,6 @@ use ieee.std_logic_1164.all;
library work;
use work.wishbone_types.all;

-- TODO: Use an array of master/slaves with parametric size
entity wishbone_arbiter is
generic(
NUM_MASTERS : positive := 3
@ -28,18 +27,23 @@ begin

busy <= wb_masters_in(selected).cyc;

wishbone_muxes: process(selected, candidate, busy, wb_slave_in, wb_masters_in)
wishbone_muxes: process(all)
variable early_sel : wb_arb_master_t;
begin
early_sel := selected;
if busy = '0' then
if NUM_MASTERS <= 4 and busy = '0' then
early_sel := candidate;
end if;
wb_slave_out <= wb_masters_in(early_sel);
for i in 0 to NUM_MASTERS-1 loop
wb_masters_out(i).dat <= wb_slave_in.dat;
wb_masters_out(i).ack <= wb_slave_in.ack when early_sel = i else '0';
wb_masters_out(i).stall <= wb_slave_in.stall when early_sel = i else '1';
if early_sel = i and wb_masters_in(i).cyc = '1' then
wb_masters_out(i).ack <= wb_slave_in.ack;
wb_masters_out(i).stall <= wb_slave_in.stall;
else
wb_masters_out(i).ack <= '0';
wb_masters_out(i).stall <= '1';
end if;
end loop;
end process;


@ -25,6 +25,9 @@ use work.common.all;
use work.wishbone_types.all;

entity xics_icp is
generic (
NCPUS : natural := 1
);
port (
clk : in std_logic;
rst : in std_logic;
@ -33,32 +36,41 @@ entity xics_icp is
wb_out : out wb_io_slave_out;

ics_in : in ics_to_icp_t;
core_irq_out : out std_ulogic
core_irq_out : out std_ulogic_vector(NCPUS-1 downto 0)
);
end xics_icp;

architecture behaviour of xics_icp is
type reg_internal_t is record
type xics_presentation_t is record
xisr : std_ulogic_vector(23 downto 0);
cppr : std_ulogic_vector(7 downto 0);
mfrr : std_ulogic_vector(7 downto 0);
irq : std_ulogic;
end record;
constant xics_presentation_t_init : xics_presentation_t :=
(mfrr => x"ff", -- mask everything on reset
irq => '0',
others => (others => '0'));
subtype cpu_index_t is natural range 0 to NCPUS-1;
type xicp_array_t is array(cpu_index_t) of xics_presentation_t;

type reg_internal_t is record
icp : xicp_array_t;
wb_rd_data : std_ulogic_vector(31 downto 0);
wb_ack : std_ulogic;
end record;
constant reg_internal_init : reg_internal_t :=
(wb_ack => '0',
mfrr => x"ff", -- mask everything on reset
irq => '0',
others => (others => '0'));
wb_rd_data => (others => '0'),
icp => (others => xics_presentation_t_init));

signal r, r_next : reg_internal_t;

-- 8 bit offsets for each presentation
constant XIRR_POLL : std_ulogic_vector(7 downto 0) := x"00";
constant XIRR : std_ulogic_vector(7 downto 0) := x"04";
constant RESV0 : std_ulogic_vector(7 downto 0) := x"08";
constant MFRR : std_ulogic_vector(7 downto 0) := x"0c";
-- 4 bit offsets for each presentation register
constant XIRR_POLL : std_ulogic_vector(3 downto 0) := x"0";
constant XIRR : std_ulogic_vector(3 downto 0) := x"4";
constant RESV0 : std_ulogic_vector(3 downto 0) := x"8";
constant MFRR : std_ulogic_vector(3 downto 0) := x"c";

begin

@ -68,7 +80,9 @@ begin
r <= r_next;

-- We delay core_irq_out by a cycle to help with timing
core_irq_out <= r.irq;
for i in 0 to NCPUS-1 loop
core_irq_out(i) <= r.icp(i).irq;
end loop;
end if;
end process;

@ -99,94 +113,105 @@ begin

v.wb_ack := '0';

xirr_accept_rd := '0';

be_in := bswap(wb_in.dat);
be_out := (others => '0');

if wb_in.cyc = '1' and wb_in.stb = '1' then
v.wb_ack := '1'; -- always ack
if wb_in.we = '1' then -- write
-- writes to both XIRR are the same
case wb_in.adr(5 downto 0) & "00" is
when XIRR_POLL =>
report "ICP XIRR_POLL write";
v.cppr := be_in(31 downto 24);
when XIRR =>
v.cppr := be_in(31 downto 24);
if wb_in.sel = x"f" then -- 4 byte
report "ICP XIRR write word (EOI) :" & to_hstring(be_in);
elsif wb_in.sel = x"1" then -- 1 byte
report "ICP XIRR write byte (CPPR):" & to_hstring(be_in(31 downto 24));
else
report "ICP XIRR UNSUPPORTED write ! sel=" & to_hstring(wb_in.sel);
end if;
when MFRR =>
v.mfrr := be_in(31 downto 24);
if wb_in.sel = x"f" then -- 4 bytes
report "ICP MFRR write word:" & to_hstring(be_in);
elsif wb_in.sel = x"1" then -- 1 byte
report "ICP MFRR write byte:" & to_hstring(be_in(31 downto 24));
else
report "ICP MFRR UNSUPPORTED write ! sel=" & to_hstring(wb_in.sel);
end if;
when others =>
end case;

else -- read

case wb_in.adr(5 downto 0) & "00" is
when XIRR_POLL =>
report "ICP XIRR_POLL read";
be_out := r.cppr & r.xisr;
when XIRR =>
report "ICP XIRR read";
be_out := r.cppr & r.xisr;
if wb_in.sel = x"f" then
xirr_accept_rd := '1';
end if;
when MFRR =>
report "ICP MFRR read";
be_out(31 downto 24) := r.mfrr;
when others =>
end case;
end if;
end if;

pending_priority := x"ff";
v.xisr := x"000000";
v.irq := '0';
for i in cpu_index_t loop
xirr_accept_rd := '0';

if wb_in.cyc = '1' and wb_in.stb = '1' and
to_integer(unsigned(wb_in.adr(5 downto 2))) = i then
if wb_in.we = '1' then -- write
-- writes to both XIRR are the same
case wb_in.adr(1 downto 0) & "00" is
when XIRR_POLL =>
report "ICP XIRR_POLL write";
v.icp(i).cppr := be_in(31 downto 24);
when XIRR =>
v.icp(i).cppr := be_in(31 downto 24);
if wb_in.sel = x"f" then -- 4 byte
report "ICP " & natural'image(i) & " XIRR write word (EOI) :" &
to_hstring(be_in);
elsif wb_in.sel = x"1" then -- 1 byte
report "ICP " & natural'image(i) & " XIRR write byte (CPPR):" &
to_hstring(be_in(31 downto 24));
else
report "ICP " & natural'image(i) & " XIRR UNSUPPORTED write ! sel=" &
to_hstring(wb_in.sel);
end if;
when MFRR =>
v.icp(i).mfrr := be_in(31 downto 24);
if wb_in.sel = x"f" then -- 4 bytes
report "ICP " & natural'image(i) & " MFRR write word:" &
to_hstring(be_in);
elsif wb_in.sel = x"1" then -- 1 byte
report "ICP " & natural'image(i) & " MFRR write byte:" &
to_hstring(be_in(31 downto 24));
else
report "ICP " & natural'image(i) & " MFRR UNSUPPORTED write ! sel=" &
to_hstring(wb_in.sel);
end if;
when others =>
end case;

else -- read

case wb_in.adr(1 downto 0) & "00" is
when XIRR_POLL =>
report "ICP XIRR_POLL read";
be_out := r.icp(i).cppr & r.icp(i).xisr;
when XIRR =>
report "ICP XIRR read";
be_out := r.icp(i).cppr & r.icp(i).xisr;
if wb_in.sel = x"f" then
xirr_accept_rd := '1';
end if;
when MFRR =>
report "ICP MFRR read";
be_out(31 downto 24) := r.icp(i).mfrr;
when others =>
end case;
end if;
end if;

if ics_in.pri /= x"ff" then
v.xisr := x"00001" & ics_in.src;
pending_priority := ics_in.pri;
end if;
pending_priority := x"ff";
v.icp(i).xisr := x"000000";
v.icp(i).irq := '0';

-- Check MFRR
if unsigned(r.mfrr) < unsigned(pending_priority) then --
v.xisr := x"000002"; -- special XICS MFRR IRQ source number
pending_priority := r.mfrr;
end if;
if ics_in.pri(8*i + 7 downto 8*i) /= x"ff" then
v.icp(i).xisr := x"00001" & ics_in.src(4*i + 3 downto 4*i);
pending_priority := ics_in.pri(8*i + 7 downto 8*i);
end if;

-- Accept the interrupt
if xirr_accept_rd = '1' then
report "XICS: ICP ACCEPT" &
" cppr:" & to_hstring(r.cppr) &
" xisr:" & to_hstring(r.xisr) &
" mfrr:" & to_hstring(r.mfrr);
v.cppr := pending_priority;
end if;
-- Check MFRR
if unsigned(r.icp(i).mfrr) < unsigned(pending_priority) then --
v.icp(i).xisr := x"000002"; -- special XICS MFRR IRQ source number
pending_priority := r.icp(i).mfrr;
end if;

-- Accept the interrupt
if xirr_accept_rd = '1' then
report "XICS " & natural'image(i) & ": ICP ACCEPT" &
" cppr:" & to_hstring(r.icp(i).cppr) &
" xisr:" & to_hstring(r.icp(i).xisr) &
" mfrr:" & to_hstring(r.icp(i).mfrr);
v.icp(i).cppr := pending_priority;
end if;

v.wb_rd_data := bswap(be_out);
v.wb_rd_data := bswap(be_out);

if unsigned(pending_priority) < unsigned(v.cppr) then
if r.irq = '0' then
report "IRQ set";
if unsigned(pending_priority) < unsigned(v.icp(i).cppr) then
if r.icp(i).irq = '0' then
report "CPU " & natural'image(i) & " IRQ set";
end if;
v.icp(i).irq := '1';
elsif r.icp(i).irq = '1' then
report "CPU " & natural'image(i) & " IRQ clr";
end if;
v.irq := '1';
elsif r.irq = '1' then
report "IRQ clr";
end if;
end loop;

if rst = '1' then
v := reg_internal_init;
@ -210,6 +235,7 @@ use work.helpers.all;

entity xics_ics is
generic (
NCPUS : natural := 1;
SRC_NUM : integer range 1 to 256 := 16;
PRIO_BITS : integer range 1 to 8 := 3
);
@ -228,10 +254,13 @@ end xics_ics;
architecture rtl of xics_ics is

-- log2 of the number of interrupt sources; used as the result width when
-- priority-encoding the set of pending sources into a source index.
constant SRC_NUM_BITS : natural := log2(SRC_NUM);
-- Width of the per-source server (target CPU) field held in each XIVE.
constant SERVER_NUM_BITS : natural := 2;

-- Internal (packed) priority, PRIO_BITS wide; converted to and from the
-- 8-bit register form with prio_pack / prio_unpack.
subtype pri_t is std_ulogic_vector(PRIO_BITS-1 downto 0);
-- Server number; compared against the CPU index when routing interrupts.
subtype server_t is unsigned(SERVER_NUM_BITS-1 downto 0);
-- Per-source interrupt vector entry.
type xive_t is record
-- priority assigned to this source
pri : pri_t;
-- CPU that this source is directed to
server : server_t;
end record;
-- All-ones priority value; every XIVE priority is set to this on reset.
constant pri_masked : pri_t := (others => '1');

@ -308,6 +337,16 @@ architecture rtl of xics_ics is
return p(nbits - 1 downto 0);
end function;

-- Validate a server (target CPU) number written to a XIVE.
--
-- serv_in: the 8-bit server field taken from the written data.
-- Returns the low SERVER_NUM_BITS bits of serv_in when it names an
-- existing CPU (less than NCPUS) and fits in SERVER_NUM_BITS bits;
-- otherwise returns 0.
function server_check(serv_in: std_ulogic_vector(7 downto 0)) return unsigned is
    variable srv : server_t;
begin
    srv := to_unsigned(0, SERVER_NUM_BITS);
    -- The second bound matters only when NCPUS exceeds 2**SERVER_NUM_BITS:
    -- without it a server number such as 5 would pass the NCPUS check and
    -- then be silently truncated to 1, aliasing onto a different CPU.
    if to_integer(unsigned(serv_in)) < NCPUS and
       to_integer(unsigned(serv_in)) < 2**SERVER_NUM_BITS then
        srv := unsigned(serv_in(SERVER_NUM_BITS - 1 downto 0));
    end if;
    return srv;
end;

-- Register map
-- 0 : Config
-- 4 : Debug/diagnostics
@ -366,16 +405,14 @@ begin
be_out := (others => '0');

if reg_is_xive = '1' then
be_out := int_level_l(reg_idx) &
'0' &
int_level_l(reg_idx) &
'0' &
x"00000" &
prio_unpack(xives(reg_idx).pri);
be_out(31) := int_level_l(reg_idx);
be_out(29) := int_level_l(reg_idx);
be_out(8 + SERVER_NUM_BITS - 1 downto 8) := std_ulogic_vector(xives(reg_idx).server);
be_out(7 downto 0) := prio_unpack(xives(reg_idx).pri);
elsif reg_is_config = '1' then
be_out := get_config;
elsif reg_is_debug = '1' then
be_out := x"00000" & icp_out_next.src & icp_out_next.pri;
be_out := icp_out_next.src & icp_out_next.pri(15 downto 0);
end if;
wb_out.dat <= bswap(be_out);
wb_out.ack <= wb_valid;
@ -389,17 +426,20 @@ begin
if rising_edge(clk) then
if rst = '1' then
for i in 0 to SRC_NUM - 1 loop
xives(i) <= (pri => pri_masked);
xives(i) <= (pri => pri_masked, server => to_unsigned(0, SERVER_NUM_BITS));
end loop;
elsif wb_valid = '1' and wb_in.we = '1' then
-- Byteswapped input
be_in := bswap(wb_in.dat);
if reg_is_xive then
-- TODO: When adding support for other bits, make sure to
-- properly implement wb_in.sel to allow partial writes.
xives(reg_idx).pri <= prio_pack(be_in(7 downto 0));
report "ICS irq " & integer'image(reg_idx) &
" set to:" & to_hstring(be_in(7 downto 0));
if wb_in.sel(3) = '1' then
xives(reg_idx).pri <= prio_pack(be_in(7 downto 0));
report "ICS irq " & integer'image(reg_idx) &
" set to pri:" & to_hstring(be_in(7 downto 0));
end if;
if wb_in.sel(2) = '1' then
xives(reg_idx).server <= server_check(be_in(15 downto 8));
end if;
end if;
end if;
end if;
@ -424,29 +464,36 @@ begin
variable pending_pri : pri_vector_t;
variable pending_at_pri : std_ulogic_vector(SRC_NUM - 1 downto 0);
begin
-- Work out the most-favoured (lowest) priority of the pending interrupts
pending_pri := (others => '0');
for i in 0 to SRC_NUM - 1 loop
if int_level_l(i) = '1' then
pending_pri := pending_pri or prio_decode(xives(i).pri);
end if;
end loop;
max_pri := priority_encoder(pending_pri, PRIO_BITS);
icp_out_next.src <= (others => '0');
icp_out_next.pri <= (others => '0');
for cpu in 0 to NCPUS-1 loop
-- Work out the most-favoured (lowest) priority of the interrupts
-- that are pending and directed to this cpu
pending_pri := (others => '0');
for i in 0 to SRC_NUM - 1 loop
if int_level_l(i) = '1' and to_integer(xives(i).server) = cpu then
pending_pri := pending_pri or prio_decode(xives(i).pri);
end if;
end loop;
max_pri := priority_encoder(pending_pri, PRIO_BITS);

-- Work out which interrupts are pending at that priority
pending_at_pri := (others => '0');
for i in 0 to SRC_NUM - 1 loop
if int_level_l(i) = '1' and xives(i).pri = max_pri and
to_integer(xives(i).server) = cpu then
pending_at_pri(i) := '1';
end if;
end loop;
max_idx := priority_encoder(pending_at_pri, SRC_NUM_BITS);

-- Work out which interrupts are pending at that priority
pending_at_pri := (others => '0');
for i in 0 to SRC_NUM - 1 loop
if int_level_l(i) = '1' and xives(i).pri = max_pri then
pending_at_pri(i) := '1';
if max_pri /= pri_masked then
report "MFI: " & integer'image(to_integer(unsigned(max_idx))) & " pri=" & to_hstring(prio_unpack(max_pri)) &
" srv=" & integer'image(cpu);
end if;
icp_out_next.src(4*cpu + 3 downto 4*cpu) <= max_idx;
icp_out_next.pri(8*cpu + 7 downto 8*cpu) <= prio_unpack(max_pri);
end loop;
max_idx := priority_encoder(pending_at_pri, SRC_NUM_BITS);

if max_pri /= pri_masked then
report "MFI: " & integer'image(to_integer(unsigned(max_idx))) & " pri=" & to_hstring(prio_unpack(max_pri));
end if;
icp_out_next.src <= max_idx;
icp_out_next.pri <= prio_unpack(max_pri);
end process;

end architecture rtl;

Loading…
Cancel
Save