Merge pull request #208 from paulusmack/faster

Make the core go faster

Several major improvements in here:
- Simple branch predictor
- Reduced latency for mispredicted branches and interrupts by removing fetch2 stage
- Cache improvements
  o Request critical dword first on refill
  o Handle hits while refilling, including on line being refilled
  o Sizes doubled for both D and I
- Loadstore improvements: can now do one load or store every two cycles in most cases
- Optimized 2-cycle multiplier for Xilinx 7-series parts using DSP slices
- Timing improvements, including:
  o Stash buffer in decode1
  o Reduced width of execute1 result mux
  o Improved SPR decode in decode1
  o Some non-critical operation take a cycle longer so we can break some long combinatorial chains
- Core logging: logs 256 bits of info every cycle into a ring buffer, to help with debugging and performance analysis

This increases the LUT usage for the "synth" + A35 target from 9182 to 10297 = 12%.
pull/211/head
Michael Neuling 3 years ago committed by GitHub
commit b90a0a2139
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -42,7 +42,7 @@ all = core_tb icache_tb dcache_tb multiply_tb dmi_dtm_tb divider_tb \
all: $(all)

core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
fetch2.vhdl utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \
utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \
decode1.vhdl helpers.vhdl insn_helpers.vhdl gpr_hazard.vhdl \
cr_hazard.vhdl control.vhdl decode2.vhdl register_file.vhdl \
cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \

@ -93,10 +93,11 @@ package common is
virt_mode : std_ulogic;
priv_mode : std_ulogic;
stop_mark: std_ulogic;
sequential: std_ulogic;
nia: std_ulogic_vector(63 downto 0);
end record;

type IcacheToFetch2Type is record
type IcacheToDecode1Type is record
valid: std_ulogic;
stop_mark: std_ulogic;
fetch_failed: std_ulogic;
@ -104,16 +105,6 @@ package common is
insn: std_ulogic_vector(31 downto 0);
end record;

type Fetch2ToDecode1Type is record
valid: std_ulogic;
stop_mark : std_ulogic;
fetch_failed: std_ulogic;
nia: std_ulogic_vector(63 downto 0);
insn: std_ulogic_vector(31 downto 0);
end record;
constant Fetch2ToDecode1Init : Fetch2ToDecode1Type := (valid => '0', stop_mark => '0', fetch_failed => '0',
nia => (others => '0'), insn => (others => '0'));

type Decode1ToDecode2Type is record
valid: std_ulogic;
stop_mark : std_ulogic;
@ -122,8 +113,16 @@ package common is
ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr
ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR)
decode: decode_rom_t;
br_pred: std_ulogic; -- Branch was predicted to be taken
end record;
constant Decode1ToDecode2Init : Decode1ToDecode2Type :=
(valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'),
ispr1 => (others => '0'), ispr2 => (others => '0'), decode => decode_rom_init, br_pred => '0');

type Decode1ToFetch1Type is record
redirect : std_ulogic;
redirect_nia : std_ulogic_vector(63 downto 0);
end record;
constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), ispr1 => (others => '0'), ispr2 => (others => '0'), decode => decode_rom_init);

type Decode2ToExecute1Type is record
valid: std_ulogic;
@ -158,23 +157,24 @@ package common is
sign_extend : std_ulogic; -- do we need to sign extend?
update : std_ulogic; -- is this an update instruction?
reserve : std_ulogic; -- set for larx/stcx
br_pred : std_ulogic;
end record;
constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
(valid => '0', unit => NONE, insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
lr => '0', rc => '0', oe => '0', invert_a => '0',
invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0',
is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0',
byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), others => (others => '0'));

type Execute1ToMultiplyType is record
valid: std_ulogic;
insn_type: insn_type_t;
data1: std_ulogic_vector(64 downto 0);
data2: std_ulogic_vector(64 downto 0);
data1: std_ulogic_vector(63 downto 0);
data2: std_ulogic_vector(63 downto 0);
is_32bit: std_ulogic;
neg_result: std_ulogic;
end record;
constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL,
is_32bit => '0',
constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0',
is_32bit => '0', neg_result => '0',
others => (others => '0'));

type Execute1ToDividerType is record
@ -253,6 +253,7 @@ package common is
others => (others => '0'));

type Loadstore1ToExecute1Type is record
busy : std_ulogic;
exception : std_ulogic;
invalid : std_ulogic;
perm_error : std_ulogic;
@ -366,7 +367,7 @@ package common is

type MultiplyToExecute1Type is record
valid: std_ulogic;
write_reg_data: std_ulogic_vector(63 downto 0);
result: std_ulogic_vector(127 downto 0);
overflow : std_ulogic;
end record;
constant MultiplyToExecute1Init : MultiplyToExecute1Type := (valid => '0', overflow => '0',

@ -15,7 +15,8 @@ entity control is
complete_in : in std_ulogic;
valid_in : in std_ulogic;
flush_in : in std_ulogic;
stall_in : in std_ulogic;
busy_in : in std_ulogic;
deferred : in std_ulogic;
sgl_pipe_in : in std_ulogic;
stop_mark_in : in std_ulogic;

@ -23,6 +24,9 @@ entity control is
gpr_write_in : in gspr_index_t;
gpr_bypassable : in std_ulogic;

update_gpr_write_valid : in std_ulogic;
update_gpr_write_reg : in gspr_index_t;

gpr_a_read_valid_in : in std_ulogic;
gpr_a_read_in : in gspr_index_t;

@ -72,7 +76,11 @@ begin
)
port map (
clk => clk,
stall_in => stall_in,
busy_in => busy_in,
deferred => deferred,
complete_in => complete_in,
flush_in => flush_in,
issuing => valid_out,

gpr_write_valid_in => gpr_write_valid,
gpr_write_in => gpr_write_in,
@ -80,6 +88,9 @@ begin
gpr_read_valid_in => gpr_a_read_valid_in,
gpr_read_in => gpr_a_read_in,

ugpr_write_valid => update_gpr_write_valid,
ugpr_write_reg => update_gpr_write_reg,

stall_out => stall_a_out,
use_bypass => gpr_bypass_a
);
@ -90,7 +101,11 @@ begin
)
port map (
clk => clk,
stall_in => stall_in,
busy_in => busy_in,
deferred => deferred,
complete_in => complete_in,
flush_in => flush_in,
issuing => valid_out,

gpr_write_valid_in => gpr_write_valid,
gpr_write_in => gpr_write_in,
@ -98,6 +113,9 @@ begin
gpr_read_valid_in => gpr_b_read_valid_in,
gpr_read_in => gpr_b_read_in,

ugpr_write_valid => update_gpr_write_valid,
ugpr_write_reg => update_gpr_write_reg,

stall_out => stall_b_out,
use_bypass => gpr_bypass_b
);
@ -110,7 +128,11 @@ begin
)
port map (
clk => clk,
stall_in => stall_in,
busy_in => busy_in,
deferred => deferred,
complete_in => complete_in,
flush_in => flush_in,
issuing => valid_out,

gpr_write_valid_in => gpr_write_valid,
gpr_write_in => gpr_write_in,
@ -118,6 +140,9 @@ begin
gpr_read_valid_in => gpr_c_read_valid_in,
gpr_read_in => gpr_c_read_in_fmt,

ugpr_write_valid => update_gpr_write_valid,
ugpr_write_reg => update_gpr_write_reg,

stall_out => stall_c_out,
use_bypass => gpr_bypass_c
);
@ -128,7 +153,11 @@ begin
)
port map (
clk => clk,
stall_in => stall_in,
busy_in => busy_in,
deferred => deferred,
complete_in => complete_in,
flush_in => flush_in,
issuing => valid_out,

cr_read_in => cr_read_in,
cr_write_in => cr_write_valid,
@ -139,7 +168,8 @@ begin
control0: process(clk)
begin
if rising_edge(clk) then
assert r_int.outstanding >= 0 and r_int.outstanding <= (PIPELINE_DEPTH+1) report "Outstanding bad " & integer'image(r_int.outstanding) severity failure;
assert rin_int.outstanding >= 0 and rin_int.outstanding <= (PIPELINE_DEPTH+1)
report "Outstanding bad " & integer'image(rin_int.outstanding) severity failure;
r_int <= rin_int;
end if;
end process;
@ -152,17 +182,18 @@ begin
v_int := r_int;

-- asynchronous
valid_tmp := valid_in and not flush_in and not stall_in;
stall_tmp := stall_in;
valid_tmp := valid_in and not flush_in;
stall_tmp := '0';

if complete_in = '1' then
if flush_in = '1' then
-- expect to see complete_in next cycle
v_int.outstanding := 1;
elsif complete_in = '1' then
v_int.outstanding := r_int.outstanding - 1;
end if;

if rst = '1' then
v_int.state := IDLE;
v_int.outstanding := 0;
stall_tmp := '0';
v_int := reg_internal_init;
valid_tmp := '0';
end if;

@ -227,7 +258,9 @@ begin
end if;

if valid_tmp = '1' then
v_int.outstanding := v_int.outstanding + 1;
if deferred = '0' then
v_int.outstanding := v_int.outstanding + 1;
end if;
gpr_write_valid <= gpr_write_valid_in;
cr_write_valid <= cr_write_in;
else
@ -237,7 +270,7 @@ begin

-- update outputs
valid_out <= valid_tmp;
stall_out <= stall_tmp;
stall_out <= stall_tmp or deferred;

-- update registers
rin_int <= v_int;

@ -11,7 +11,8 @@ entity core is
SIM : boolean := false;
DISABLE_FLATTEN : boolean := false;
EX1_BYPASS : boolean := true;
ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0')
ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0');
LOG_LENGTH : natural := 512
);
port (
clk : in std_ulogic;
@ -41,16 +42,14 @@ entity core is
end core;

architecture behave of core is
-- fetch signals
signal fetch2_to_decode1: Fetch2ToDecode1Type;

-- icache signals
signal fetch1_to_icache : Fetch1ToIcacheType;
signal icache_to_fetch2 : IcacheToFetch2Type;
signal icache_to_decode1 : IcacheToDecode1Type;
signal mmu_to_icache : MmuToIcacheType;

-- decode signals
signal decode1_to_decode2: Decode1ToDecode2Type;
signal decode1_to_fetch1: Decode1ToFetch1Type;
signal decode2_to_execute1: Decode2ToExecute1Type;

-- register file signals
@ -83,16 +82,18 @@ architecture behave of core is
-- local signals
signal fetch1_stall_in : std_ulogic;
signal icache_stall_out : std_ulogic;
signal fetch2_stall_in : std_ulogic;
signal icache_stall_in : std_ulogic;
signal decode1_stall_in : std_ulogic;
signal decode2_stall_in : std_ulogic;
signal decode1_busy : std_ulogic;
signal decode2_busy_in : std_ulogic;
signal decode2_stall_out : std_ulogic;
signal ex1_icache_inval: std_ulogic;
signal ex1_stall_out: std_ulogic;
signal ls1_stall_out: std_ulogic;
signal ex1_busy_out: std_ulogic;
signal dcache_stall_out: std_ulogic;

signal flush: std_ulogic;
signal decode1_flush: std_ulogic;
signal fetch1_flush: std_ulogic;

signal complete: std_ulogic;
signal terminate: std_ulogic;
@ -128,6 +129,12 @@ architecture behave of core is
-- Debug status
signal dbg_core_is_stopped: std_ulogic;

-- Logging signals
signal log_data : std_ulogic_vector(255 downto 0);
signal log_rd_addr : std_ulogic_vector(31 downto 0);
signal log_wr_addr : std_ulogic_vector(31 downto 0);
signal log_rd_data : std_ulogic_vector(63 downto 0);

function keep_h(disable : boolean) return string is
begin
if disable then
@ -139,7 +146,6 @@ architecture behave of core is
attribute keep_hierarchy : string;
attribute keep_hierarchy of fetch1_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of icache_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of fetch2_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of decode1_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of decode2_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN);
@ -180,45 +186,40 @@ begin
rst => rst_fetch1,
alt_reset_in => alt_reset_d,
stall_in => fetch1_stall_in,
flush_in => flush,
flush_in => fetch1_flush,
stop_in => dbg_core_stop,
d_in => decode1_to_fetch1,
e_in => execute1_to_fetch1,
i_out => fetch1_to_icache
i_out => fetch1_to_icache,
log_out => log_data(42 downto 0)
);

fetch1_stall_in <= icache_stall_out or decode2_stall_out;
fetch1_stall_in <= icache_stall_out or decode1_busy;
fetch1_flush <= flush or decode1_flush;

icache_0: entity work.icache
generic map(
SIM => SIM,
LINE_SIZE => 64,
NUM_LINES => 32,
NUM_LINES => 64,
NUM_WAYS => 2
)
port map(
clk => clk,
rst => rst_icache,
i_in => fetch1_to_icache,
i_out => icache_to_fetch2,
i_out => icache_to_decode1,
m_in => mmu_to_icache,
flush_in => flush,
flush_in => fetch1_flush,
inval_in => dbg_icache_rst or ex1_icache_inval,
stall_in => icache_stall_in,
stall_out => icache_stall_out,
wishbone_out => wishbone_insn_out,
wishbone_in => wishbone_insn_in
);

fetch2_0: entity work.fetch2
port map (
clk => clk,
rst => rst_fetch2,
stall_in => fetch2_stall_in,
flush_in => flush,
i_in => icache_to_fetch2,
f_out => fetch2_to_decode1
wishbone_in => wishbone_insn_in,
log_out => log_data(96 downto 43)
);

fetch2_stall_in <= decode2_stall_out;
icache_stall_in <= decode1_busy;

decode1_0: entity work.decode1
port map (
@ -226,8 +227,12 @@ begin
rst => rst_dec1,
stall_in => decode1_stall_in,
flush_in => flush,
f_in => fetch2_to_decode1,
d_out => decode1_to_decode2
flush_out => decode1_flush,
busy_out => decode1_busy,
f_in => icache_to_decode1,
d_out => decode1_to_decode2,
f_out => decode1_to_fetch1,
log_out => log_data(109 downto 97)
);

decode1_stall_in <= decode2_stall_out;
@ -239,7 +244,7 @@ begin
port map (
clk => clk,
rst => rst_dec2,
stall_in => decode2_stall_in,
busy_in => decode2_busy_in,
stall_out => decode2_stall_out,
flush_in => flush,
complete_in => complete,
@ -249,9 +254,10 @@ begin
r_in => register_file_to_decode2,
r_out => decode2_to_register_file,
c_in => cr_file_to_decode2,
c_out => decode2_to_cr_file
c_out => decode2_to_cr_file,
log_out => log_data(119 downto 110)
);
decode2_stall_in <= ex1_stall_out or ls1_stall_out;
decode2_busy_in <= ex1_busy_out;

register_file_0: entity work.register_file
generic map (
@ -267,7 +273,8 @@ begin
dbg_gpr_addr => dbg_gpr_addr,
dbg_gpr_data => dbg_gpr_data,
sim_dump => terminate,
sim_dump_done => sim_cr_dump
sim_dump_done => sim_cr_dump,
log_out => log_data(255 downto 185)
);

cr_file_0: entity work.cr_file
@ -279,7 +286,8 @@ begin
d_in => decode2_to_cr_file,
d_out => cr_file_to_decode2,
w_in => writeback_to_cr_file,
sim_dump => sim_cr_dump
sim_dump => sim_cr_dump,
log_out => log_data(184 downto 172)
);

execute1_0: entity work.execute1
@ -290,7 +298,7 @@ begin
clk => clk,
rst => rst_ex1,
flush_out => flush,
stall_out => ex1_stall_out,
busy_out => ex1_busy_out,
e_in => decode2_to_execute1,
l_in => loadstore1_to_execute1,
ext_irq_in => ext_irq,
@ -299,7 +307,11 @@ begin
e_out => execute1_to_writeback,
icache_inval => ex1_icache_inval,
dbg_msr_out => msr,
terminate_out => terminate
terminate_out => terminate,
log_out => log_data(134 downto 120),
log_rd_addr => log_rd_addr,
log_rd_data => log_rd_data,
log_wr_addr => log_wr_addr
);

loadstore1_0: entity work.loadstore1
@ -314,7 +326,7 @@ begin
m_out => loadstore1_to_mmu,
m_in => mmu_to_loadstore1,
dc_stall => dcache_stall_out,
stall_out => ls1_stall_out
log_out => log_data(149 downto 140)
);

mmu_0: entity work.mmu
@ -331,7 +343,7 @@ begin
dcache_0: entity work.dcache
generic map(
LINE_SIZE => 64,
NUM_LINES => 32,
NUM_LINES => 64,
NUM_WAYS => 2
)
port map (
@ -343,7 +355,8 @@ begin
m_out => dcache_to_mmu,
stall_out => dcache_stall_out,
wishbone_in => wishbone_data_in,
wishbone_out => wishbone_data_out
wishbone_out => wishbone_data_out,
log_out => log_data(171 downto 152)
);

writeback_0: entity work.writeback
@ -356,7 +369,13 @@ begin
complete_out => complete
);

log_data(151 downto 150) <= "00";
log_data(139 downto 135) <= "00000";

debug_0: entity work.core_debug
generic map (
LOG_LENGTH => LOG_LENGTH
)
port map (
clk => clk,
rst => rst_dbg,
@ -377,6 +396,10 @@ begin
dbg_gpr_ack => dbg_gpr_ack,
dbg_gpr_addr => dbg_gpr_addr,
dbg_gpr_data => dbg_gpr_data,
log_data => log_data,
log_read_addr => log_rd_addr,
log_read_data => log_rd_data,
log_write_addr => log_wr_addr,
terminated_out => terminated_out
);


@ -3,9 +3,14 @@ use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library work;
use work.utils.all;
use work.common.all;

entity core_debug is
generic (
-- Length of log buffer
LOG_LENGTH : natural := 512
);
port (
clk : in std_logic;
rst : in std_logic;
@ -34,6 +39,12 @@ entity core_debug is
dbg_gpr_addr : out gspr_index_t;
dbg_gpr_data : in std_ulogic_vector(63 downto 0);

-- Core logging data
log_data : in std_ulogic_vector(255 downto 0);
log_read_addr : in std_ulogic_vector(31 downto 0);
log_read_data : out std_ulogic_vector(63 downto 0);
log_write_addr : out std_ulogic_vector(31 downto 0);

-- Misc
terminated_out : out std_ulogic
);
@ -77,6 +88,12 @@ architecture behave of core_debug is
-- GSPR register data
constant DBG_CORE_GSPR_DATA : std_ulogic_vector(3 downto 0) := "0101";

-- Log buffer address and data registers
constant DBG_CORE_LOG_ADDR : std_ulogic_vector(3 downto 0) := "0110";
constant DBG_CORE_LOG_DATA : std_ulogic_vector(3 downto 0) := "0111";

constant LOG_INDEX_BITS : natural := log2(LOG_LENGTH);

-- Some internal wires
signal stat_reg : std_ulogic_vector(63 downto 0);

@ -89,6 +106,12 @@ architecture behave of core_debug is
signal do_gspr_rd : std_ulogic;
signal gspr_index : gspr_index_t;

signal log_dmi_addr : std_ulogic_vector(31 downto 0) := (others => '0');
signal log_dmi_data : std_ulogic_vector(63 downto 0) := (others => '0');
signal do_dmi_log_rd : std_ulogic;
signal dmi_read_log_data : std_ulogic;
signal dmi_read_log_data_1 : std_ulogic;

begin
-- Single cycle register accesses on DMI except for GSPR data
dmi_ack <= dmi_req when dmi_addr /= DBG_CORE_GSPR_DATA
@ -108,6 +131,8 @@ begin
nia when DBG_CORE_NIA,
msr when DBG_CORE_MSR,
dbg_gpr_data when DBG_CORE_GSPR_DATA,
log_write_addr & log_dmi_addr when DBG_CORE_LOG_ADDR,
log_dmi_data when DBG_CORE_LOG_DATA,
(others => '0') when others;

-- DMI writes
@ -118,6 +143,7 @@ begin
do_step <= '0';
do_reset <= '0';
do_icreset <= '0';
do_dmi_log_rd <= '0';

if (rst) then
stopping <= '0';
@ -151,11 +177,26 @@ begin
end if;
elsif dmi_addr = DBG_CORE_GSPR_INDEX then
gspr_index <= dmi_din(gspr_index_t'left downto 0);
elsif dmi_addr = DBG_CORE_LOG_ADDR then
log_dmi_addr <= dmi_din(31 downto 0);
do_dmi_log_rd <= '1';
end if;
else
report("DMI read from " & to_string(dmi_addr));
end if;

elsif dmi_read_log_data = '0' and dmi_read_log_data_1 = '1' then
-- Increment log_dmi_addr after the end of a read from DBG_CORE_LOG_DATA
log_dmi_addr(LOG_INDEX_BITS + 1 downto 0) <=
std_ulogic_vector(unsigned(log_dmi_addr(LOG_INDEX_BITS+1 downto 0)) + 1);
do_dmi_log_rd <= '1';
end if;
dmi_read_log_data_1 <= dmi_read_log_data;
if dmi_req = '1' and dmi_addr = DBG_CORE_LOG_DATA then
dmi_read_log_data <= '1';
else
dmi_read_log_data <= '0';
end if;

-- Set core stop on terminate. We'll be stopping some time *after*
-- the offending instruction, at least until we can do back flushes
@ -175,5 +216,87 @@ begin
core_rst <= do_reset;
icache_rst <= do_icreset;
terminated_out <= terminated;

-- Logging RAM
maybe_log: if LOG_LENGTH > 0 generate
subtype log_ptr_t is unsigned(LOG_INDEX_BITS - 1 downto 0);
type log_array_t is array(0 to LOG_LENGTH - 1) of std_ulogic_vector(255 downto 0);
signal log_array : log_array_t;
signal log_rd_ptr : log_ptr_t;
signal log_wr_ptr : log_ptr_t;
signal log_toggle : std_ulogic;
signal log_wr_enable : std_ulogic;
signal log_rd_ptr_latched : log_ptr_t;
signal log_rd : std_ulogic_vector(255 downto 0);
signal log_dmi_reading : std_ulogic;
signal log_dmi_read_done : std_ulogic;

function select_dword(data : std_ulogic_vector(255 downto 0);
addr : std_ulogic_vector(31 downto 0)) return std_ulogic_vector is
variable firstbit : integer;
begin
firstbit := to_integer(unsigned(addr(1 downto 0))) * 64;
return data(firstbit + 63 downto firstbit);
end;

attribute ram_style : string;
attribute ram_style of log_array : signal is "block";
attribute ram_decomp : string;
attribute ram_decomp of log_array : signal is "power";

begin
-- Use MSB of read addresses to stop the logging
log_wr_enable <= not (log_read_addr(31) or log_dmi_addr(31));

log_ram: process(clk)
begin
if rising_edge(clk) then
if log_wr_enable = '1' then
log_array(to_integer(log_wr_ptr)) <= log_data;
end if;
log_rd <= log_array(to_integer(log_rd_ptr_latched));
end if;
end process;


log_buffer: process(clk)
variable b : integer;
variable data : std_ulogic_vector(255 downto 0);
begin
if rising_edge(clk) then
if rst = '1' then
log_wr_ptr <= (others => '0');
log_toggle <= '0';
elsif log_wr_enable = '1' then
if log_wr_ptr = to_unsigned(LOG_LENGTH - 1, LOG_INDEX_BITS) then
log_toggle <= not log_toggle;
end if;
log_wr_ptr <= log_wr_ptr + 1;
end if;
if do_dmi_log_rd = '1' then
log_rd_ptr_latched <= unsigned(log_dmi_addr(LOG_INDEX_BITS + 1 downto 2));
else
log_rd_ptr_latched <= unsigned(log_read_addr(LOG_INDEX_BITS + 1 downto 2));
end if;
if log_dmi_read_done = '1' then
log_dmi_data <= select_dword(log_rd, log_dmi_addr);
else
log_read_data <= select_dword(log_rd, log_read_addr);
end if;
log_dmi_read_done <= log_dmi_reading;
log_dmi_reading <= do_dmi_log_rd;
end if;
end process;
log_write_addr(LOG_INDEX_BITS - 1 downto 0) <= std_ulogic_vector(log_wr_ptr);
log_write_addr(LOG_INDEX_BITS) <= '1';
log_write_addr(31 downto LOG_INDEX_BITS + 1) <= (others => '0');
end generate;

no_log: if LOG_LENGTH = 0 generate
begin
log_read_data <= (others => '0');
log_write_addr <= x"00000001";
end generate;

end behave;


@ -18,7 +18,9 @@ entity cr_file is
w_in : in WritebackToCrFileType;

-- debug
sim_dump : in std_ulogic
sim_dump : in std_ulogic;

log_out : out std_ulogic_vector(12 downto 0)
);
end entity cr_file;

@ -27,6 +29,7 @@ architecture behaviour of cr_file is
signal crs_updated : std_ulogic_vector(31 downto 0);
signal xerc : xer_common_t := xerc_init;
signal xerc_updated : xer_common_t;
signal log_data : std_ulogic_vector(12 downto 0);
begin
cr_create_0: process(all)
variable hi, lo : integer := 0;
@ -88,4 +91,14 @@ begin
end process;
end generate;

cr_log: process(clk)
begin
if rising_edge(clk) then
log_data <= w_in.write_cr_enable &
w_in.write_cr_data(31 downto 28) &
w_in.write_cr_mask;
end if;
end process;
log_out <= log_data;

end architecture behaviour;

@ -4,11 +4,15 @@ use ieee.numeric_std.all;

entity cr_hazard is
generic (
PIPELINE_DEPTH : natural := 2
PIPELINE_DEPTH : natural := 1
);
port(
clk : in std_ulogic;
stall_in : in std_ulogic;
busy_in : in std_ulogic;
deferred : in std_ulogic;
complete_in : in std_ulogic;
flush_in : in std_ulogic;
issuing : in std_ulogic;

cr_read_in : in std_ulogic;
cr_write_in : in std_ulogic;
@ -22,7 +26,7 @@ architecture behaviour of cr_hazard is
end record;
constant pipeline_entry_init : pipeline_entry_type := (valid => '0');

type pipeline_t is array(0 to PIPELINE_DEPTH-1) of pipeline_entry_type;
type pipeline_t is array(0 to PIPELINE_DEPTH) of pipeline_entry_type;
constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init);

signal r, rin : pipeline_t := pipeline_t_init;
@ -30,9 +34,7 @@ begin
cr_hazard0: process(clk)
begin
if rising_edge(clk) then
if stall_in = '0' then
r <= rin;
end if;
r <= rin;
end if;
end process;

@ -41,22 +43,23 @@ begin
begin
v := r;

stall_out <= '0';
loop_0: for i in 0 to PIPELINE_DEPTH-1 loop
if (r(i).valid = cr_read_in) then
stall_out <= '1';
end if;
end loop;

v(0).valid := cr_write_in;
loop_1: for i in 0 to PIPELINE_DEPTH-2 loop
-- propagate to next slot
v(i+1) := r(i);
end loop;
-- XXX assumes PIPELINE_DEPTH = 1
if complete_in = '1' then
v(1).valid := '0';
end if;
stall_out <= cr_read_in and (v(0).valid or v(1).valid);

-- asynchronous output
if cr_read_in = '0' then
stall_out <= '0';
-- XXX assumes PIPELINE_DEPTH = 1
if busy_in = '0' then
v(1) := r(0);
v(0).valid := '0';
end if;
if deferred = '0' and issuing = '1' then
v(0).valid := cr_write_in;
end if;
if flush_in = '1' then
v(0).valid := '0';
v(1).valid := '0';
end if;

-- update registers

File diff suppressed because it is too large Load Diff

@ -8,19 +8,24 @@ use work.decode_types.all;

entity decode1 is
port (
clk : in std_ulogic;
rst : in std_ulogic;

stall_in : in std_ulogic;
flush_in : in std_ulogic;

f_in : in Fetch2ToDecode1Type;
d_out : out Decode1ToDecode2Type
clk : in std_ulogic;
rst : in std_ulogic;

stall_in : in std_ulogic;
flush_in : in std_ulogic;
busy_out : out std_ulogic;
flush_out : out std_ulogic;

f_in : in IcacheToDecode1Type;
f_out : out Decode1ToFetch1Type;
d_out : out Decode1ToDecode2Type;
log_out : out std_ulogic_vector(12 downto 0)
);
end entity decode1;

architecture behaviour of decode1 is
signal r, rin : Decode1ToDecode2Type;
signal s : Decode1ToDecode2Type;

subtype major_opcode_t is unsigned(5 downto 0);
type major_rom_array_t is array(0 to 63) of decode_rom_t;
@ -352,24 +357,45 @@ architecture behaviour of decode1 is
constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0');
constant fetch_fail_inst: decode_rom_t := (LDST, OP_FETCH_FAILED, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0');

signal log_data : std_ulogic_vector(12 downto 0);

begin
decode1_0: process(clk)
begin
if rising_edge(clk) then
-- Output state remains unchanged on stall, unless we are flushing
if rst = '1' or flush_in = '1' or stall_in = '0' then
r <= rin;
if rst = '1' then
r <= Decode1ToDecode2Init;
s <= Decode1ToDecode2Init;
elsif flush_in = '1' then
r.valid <= '0';
s.valid <= '0';
elsif s.valid = '1' then
if stall_in = '0' then
r <= s;
s.valid <= '0';
end if;
else
s <= rin;
s.valid <= rin.valid and r.valid and stall_in;
if r.valid = '0' or stall_in = '0' then
r <= rin;
end if;
end if;
end if;
end process;
busy_out <= s.valid;

decode1_1: process(all)
variable v : Decode1ToDecode2Type;
variable f : Decode1ToFetch1Type;
variable majorop : major_opcode_t;
variable op_19_bits: std_ulogic_vector(2 downto 0);
variable sprn : spr_num_t;
variable br_nia : std_ulogic_vector(61 downto 0);
variable br_target : std_ulogic_vector(61 downto 0);
variable br_offset : signed(23 downto 0);
begin
v := r;
v := Decode1ToDecode2Init;

v.valid := f_in.valid;
v.nia := f_in.nia;
@ -395,6 +421,31 @@ begin
-- major opcode 31, lots of things
v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1))));

-- Work out ispr1/ispr2 independent of v.decode since they seem to be critical path
sprn := decode_spr_num(f_in.insn);
v.ispr1 := fast_spr_num(sprn);

if std_match(f_in.insn(10 downto 1), "01-1010011") then
-- mfspr or mtspr
-- Make slow SPRs single issue
if is_fast_spr(v.ispr1) = '0' then
v.decode.sgl_pipe := '1';
-- send MMU-related SPRs to loadstore1
case sprn is
when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PRTBL =>
v.decode.unit := LDST;
when others =>
end case;
end if;
end if;

elsif majorop = "010000" then
-- CTR may be needed as input to bc
v.decode := major_decode_rom_array(to_integer(majorop));
if f_in.insn(23) = '0' then
v.ispr1 := fast_spr_num(SPR_CTR);
end if;

elsif majorop = "010011" then
if decode_op_19_valid(to_integer(unsigned(f_in.insn(10 downto 1)))) = '0' then
report "op 19 illegal subcode";
@ -405,6 +456,27 @@ begin
report "op 19 sub " & to_hstring(op_19_bits);
end if;

-- Work out ispr1/ispr2 independent of v.decode since they seem to be critical path
if f_in.insn(2) = '0' then
-- Could be OP_BCREG: bclr, bcctr, bctar
-- Branch uses CTR as condition when BO(2) is 0. This is
-- also used to indicate that CTR is modified (they go
-- together).
if f_in.insn(23) = '0' then
v.ispr1 := fast_spr_num(SPR_CTR);
end if;
-- TODO: Add TAR
if f_in.insn(10) = '0' then
v.ispr2 := fast_spr_num(SPR_LR);
else
v.ispr2 := fast_spr_num(SPR_CTR);
end if;
else
-- Could be OP_RFID
v.ispr1 := fast_spr_num(SPR_SRR0);
v.ispr2 := fast_spr_num(SPR_SRR1);
end if;

elsif majorop = "011110" then
v.decode := decode_op_30_array(to_integer(unsigned(f_in.insn(4 downto 1))));

@ -422,56 +494,45 @@ begin
v.decode := major_decode_rom_array(to_integer(majorop));
end if;

-- Set ISPR1/ISPR2 when needed
if v.decode.insn_type = OP_BC or v.decode.insn_type = OP_BCREG then
-- Branch uses CTR as condition when BO(2) is 0. This is
-- also used to indicate that CTR is modified (they go
-- together).
--
if f_in.insn(23) = '0' then
v.ispr1 := fast_spr_num(SPR_CTR);
end if;

-- Branch source register is an SPR
if v.decode.insn_type = OP_BCREG then
-- TODO: Add TAR
if f_in.insn(10) = '0' then
v.ispr2 := fast_spr_num(SPR_LR);
else
v.ispr2 := fast_spr_num(SPR_CTR);
end if;
end if;
elsif v.decode.insn_type = OP_MFSPR or v.decode.insn_type = OP_MTSPR then
sprn := decode_spr_num(f_in.insn);
v.ispr1 := fast_spr_num(sprn);
-- Make slow SPRs single issue
if is_fast_spr(v.ispr1) = '0' then
v.decode.sgl_pipe := '1';
-- send MMU-related SPRs to loadstore1
case sprn is
when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PRTBL =>
v.decode.unit := LDST;
when others =>
end case;
end if;
elsif v.decode.insn_type = OP_RFID then
report "PPC RFID";
v.ispr1 := fast_spr_num(SPR_SRR0);
v.ispr2 := fast_spr_num(SPR_SRR1);
-- Branch predictor
-- Note bclr, bcctr and bctar are predicted not taken as we have no
-- count cache or link stack.
br_offset := (others => '0');
if majorop = 18 then
-- Unconditional branches are always taken
v.br_pred := '1';
br_offset := signed(f_in.insn(25 downto 2));
elsif majorop = 16 then
-- Predict backward branches as taken, forward as untaken
v.br_pred := f_in.insn(15);
br_offset := resize(signed(f_in.insn(15 downto 2)), 24);
end if;

if flush_in = '1' then
v.valid := '0';
end if;

if rst = '1' then
v := Decode1ToDecode2Init;
br_nia := f_in.nia(63 downto 2);
if f_in.insn(1) = '1' then
br_nia := (others => '0');
end if;
br_target := std_ulogic_vector(signed(br_nia) + br_offset);
f.redirect := v.br_pred and f_in.valid and not flush_in and not s.valid;
f.redirect_nia := br_target & "00";

-- Update registers
rin <= v;

-- Update outputs
d_out <= r;
f_out <= f;
flush_out <= f.redirect;
end process;

dec1_log : process(clk)
begin
if rising_edge(clk) then
log_data <= std_ulogic_vector(to_unsigned(insn_type_t'pos(r.decode.insn_type), 6)) &
r.nia(5 downto 2) &
std_ulogic_vector(to_unsigned(unit_t'pos(r.decode.unit), 2)) &
r.valid;
end if;
end process;
log_out <= log_data;

end architecture behaviour;

@ -17,7 +17,7 @@ entity decode2 is
rst : in std_ulogic;

complete_in : in std_ulogic;
stall_in : in std_ulogic;
busy_in : in std_ulogic;
stall_out : out std_ulogic;

stopped_out : out std_ulogic;
@ -32,7 +32,9 @@ entity decode2 is
r_out : out Decode2ToRegisterFileType;

c_in : in CrFileToDecode2Type;
c_out : out Decode2ToCrFileType
c_out : out Decode2ToCrFileType;

log_out : out std_ulogic_vector(9 downto 0)
);
end entity decode2;

@ -43,6 +45,10 @@ architecture behaviour of decode2 is

signal r, rin : reg_type;

signal deferred : std_ulogic;

signal log_data : std_ulogic_vector(9 downto 0);

type decode_input_reg_t is record
reg_valid : std_ulogic;
reg : gspr_index_t;
@ -61,8 +67,6 @@ architecture behaviour of decode2 is
return decode_input_reg_t is
begin
if t = RA or (t = RA_OR_ZERO and insn_ra(insn_in) /= "00000") then
assert is_fast_spr(ispr) = '0' report "Decode A says GPR but ISPR says SPR:" &
to_hstring(ispr) severity failure;
return ('1', gpr_to_gspr(insn_ra(insn_in)), reg_data);
elsif t = SPR then
-- ISPR must be either a valid fast SPR number or all 0 for a slow SPR.
@ -87,8 +91,6 @@ architecture behaviour of decode2 is
begin
case t is
when RB =>
assert is_fast_spr(ispr) = '0' report "Decode B says GPR but ISPR says SPR:" &
to_hstring(ispr) severity failure;
ret := ('1', gpr_to_gspr(insn_rb(insn_in)), reg_data);
when CONST_UI =>
ret := ('0', (others => '0'), std_ulogic_vector(resize(unsigned(insn_ui(insn_in)), 64)));
@ -196,6 +198,9 @@ architecture behaviour of decode2 is
signal gpr_write : gspr_index_t;
signal gpr_bypassable : std_ulogic;

signal update_gpr_write_valid : std_ulogic;
signal update_gpr_write_reg : gspr_index_t;

signal gpr_a_read_valid : std_ulogic;
signal gpr_a_read :gspr_index_t;
signal gpr_a_bypass : std_ulogic;
@ -220,7 +225,8 @@ begin

complete_in => complete_in,
valid_in => control_valid_in,
stall_in => stall_in,
busy_in => busy_in,
deferred => deferred,
flush_in => flush_in,
sgl_pipe_in => control_sgl_pipe,
stop_mark_in => d_in.stop_mark,
@ -229,6 +235,9 @@ begin
gpr_write_in => gpr_write,
gpr_bypassable => gpr_bypassable,

update_gpr_write_valid => update_gpr_write_valid,
update_gpr_write_reg => update_gpr_write_reg,

gpr_a_read_valid_in => gpr_a_read_valid,
gpr_a_read_in => gpr_a_read,

@ -250,18 +259,24 @@ begin
gpr_bypass_c => gpr_c_bypass
);

deferred <= r.e.valid and busy_in;

decode2_0: process(clk)
begin
if rising_edge(clk) then
if rin.e.valid = '1' then
report "execute " & to_hstring(rin.e.nia);
if rst = '1' or flush_in = '1' or deferred = '0' then
if rin.e.valid = '1' then
report "execute " & to_hstring(rin.e.nia);
end if;
r <= rin;
end if;
r <= rin;
end if;
end process;

r_out.read1_reg <= gpr_or_spr_to_gspr(insn_ra(d_in.insn), d_in.ispr1);
r_out.read2_reg <= gpr_or_spr_to_gspr(insn_rb(d_in.insn), d_in.ispr2);
r_out.read1_reg <= d_in.ispr1 when d_in.decode.input_reg_a = SPR
else gpr_to_gspr(insn_ra(d_in.insn));
r_out.read2_reg <= d_in.ispr2 when d_in.decode.input_reg_b = SPR
else gpr_to_gspr(insn_rb(d_in.insn));
r_out.read3_reg <= insn_rs(d_in.insn);

c_out.read <= d_in.decode.input_cr;
@ -343,6 +358,7 @@ begin
v.e.sign_extend := d_in.decode.sign_extend;
v.e.update := d_in.decode.update;
v.e.reserve := d_in.decode.reserve;
v.e.br_pred := d_in.br_pred;

-- issue control
control_valid_in <= d_in.valid;
@ -354,6 +370,13 @@ begin
if EX1_BYPASS and d_in.decode.unit = ALU then
gpr_bypassable <= '1';
end if;
update_gpr_write_valid <= d_in.decode.update;
update_gpr_write_reg <= decoded_reg_a.reg;
if v.e.lr = '1' then
-- there are no instructions that have both update=1 and lr=1
update_gpr_write_valid <= '1';
update_gpr_write_reg <= fast_spr_num(SPR_LR);
end if;

gpr_a_read_valid <= decoded_reg_a.reg_valid;
gpr_a_read <= decoded_reg_a.reg;
@ -371,7 +394,7 @@ begin
v.e.insn_type := OP_ILLEGAL;
end if;

if rst = '1' then
if rst = '1' or flush_in = '1' then
v.e := Decode2ToExecute1Init;
end if;

@ -381,4 +404,19 @@ begin
-- Update outputs
e_out <= r.e;
end process;

dec2_log : process(clk)
begin
if rising_edge(clk) then
log_data <= r.e.nia(5 downto 2) &
r.e.valid &
stopped_out &
stall_out &
r.e.bypass_data3 &
r.e.bypass_data2 &
r.e.bypass_data1;
end if;
end process;
log_out <= log_data;

end architecture behaviour;

@ -20,7 +20,7 @@ entity execute1 is

-- asynchronous
flush_out : out std_ulogic;
stall_out : out std_ulogic;
busy_out : out std_ulogic;

e_in : in Decode2ToExecute1Type;
l_in : in Loadstore1ToExecute1Type;
@ -36,34 +36,44 @@ entity execute1 is
dbg_msr_out : out std_ulogic_vector(63 downto 0);

icache_inval : out std_ulogic;
terminate_out : out std_ulogic
terminate_out : out std_ulogic;

log_out : out std_ulogic_vector(14 downto 0);
log_rd_addr : out std_ulogic_vector(31 downto 0);
log_rd_data : in std_ulogic_vector(63 downto 0);
log_wr_addr : in std_ulogic_vector(31 downto 0)
);
end entity execute1;

architecture behaviour of execute1 is
type reg_type is record
e : Execute1ToWritebackType;
busy: std_ulogic;
terminate: std_ulogic;
lr_update : std_ulogic;
next_lr : std_ulogic_vector(63 downto 0);
mul_in_progress : std_ulogic;
div_in_progress : std_ulogic;
cntz_in_progress : std_ulogic;
slow_op_insn : insn_type_t;
slow_op_dest : gpr_index_t;
slow_op_rc : std_ulogic;
slow_op_oe : std_ulogic;
slow_op_xerc : xer_common_t;
ldst_nia : std_ulogic_vector(63 downto 0);
last_nia : std_ulogic_vector(63 downto 0);
log_addr_spr : std_ulogic_vector(31 downto 0);
end record;
constant reg_type_init : reg_type :=
(e => Execute1ToWritebackInit, lr_update => '0',
(e => Execute1ToWritebackInit, busy => '0', lr_update => '0', terminate => '0',
mul_in_progress => '0', div_in_progress => '0', cntz_in_progress => '0',
slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init,
next_lr => (others => '0'), ldst_nia => (others => '0'), others => (others => '0'));
slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init,
next_lr => (others => '0'), last_nia => (others => '0'), others => (others => '0'));

signal r, rin : reg_type;

signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0);

signal valid_in : std_ulogic;
signal ctrl: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0'));
signal ctrl_tmp: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0'));
signal right_shift, rot_clear_left, rot_clear_right: std_ulogic;
@ -72,8 +82,6 @@ architecture behaviour of execute1 is
signal rotator_carry: std_ulogic;
signal logical_result: std_ulogic_vector(63 downto 0);
signal countzero_result: std_ulogic_vector(63 downto 0);
signal popcnt_result: std_ulogic_vector(63 downto 0);
signal parity_result: std_ulogic_vector(63 downto 0);

-- multiply signals
signal x_to_multiply: Execute1ToMultiplyType;
@ -83,6 +91,11 @@ architecture behaviour of execute1 is
signal x_to_divider: Execute1ToDividerType;
signal divider_to_x: DividerToExecute1Type;

-- signals for logging
signal exception_log : std_ulogic;
signal irq_valid_log : std_ulogic;
signal log_data : std_ulogic_vector(14 downto 0);

type privilege_level is (USER, SUPER);
type op_privilege_array is array(insn_type_t) of privilege_level;
constant op_privilege: op_privilege_array := (
@ -193,9 +206,7 @@ begin
invert_in => e_in.invert_a,
invert_out => e_in.invert_out,
result => logical_result,
datalen => e_in.data_len,
popcnt => popcnt_result,
parity => parity_result
datalen => e_in.data_len
);

countzero_0: entity work.zero_counter
@ -223,11 +234,17 @@ begin
);

dbg_msr_out <= ctrl.msr;
log_rd_addr <= r.log_addr_spr;

a_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data1 = '1' else e_in.read_data1;
b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2;
c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3;

busy_out <= l_in.busy or r.busy;
valid_in <= e_in.valid and not busy_out;

terminate_out <= r.terminate;

execute1_0: process(clk)
begin
if rising_edge(clk) then
@ -238,7 +255,7 @@ begin
else
r <= rin;
ctrl <= ctrl_tmp;
assert not (r.lr_update = '1' and e_in.valid = '1')
assert not (r.lr_update = '1' and valid_in = '1')
report "LR update collision with valid in EX1"
severity failure;
if r.lr_update = '1' then
@ -274,7 +291,6 @@ begin
variable sign1, sign2 : std_ulogic;
variable abs1, abs2 : signed(63 downto 0);
variable overflow : std_ulogic;
variable negative : std_ulogic;
variable zerohi, zerolo : std_ulogic;
variable msb_a, msb_b : std_ulogic;
variable a_lt : std_ulogic;
@ -284,11 +300,18 @@ begin
variable exception_nextpc : std_ulogic;
variable trapval : std_ulogic_vector(4 downto 0);
variable illegal : std_ulogic;
variable is_branch : std_ulogic;
variable taken_branch : std_ulogic;
variable abs_branch : std_ulogic;
variable spr_val : std_ulogic_vector(63 downto 0);
begin
result := (others => '0');
result_with_carry := (others => '0');
result_en := '0';
newcrf := (others => '0');
is_branch := '0';
taken_branch := '0';
abs_branch := '0';

v := r;
v.e := Execute1ToWritebackInit;
@ -334,32 +357,7 @@ begin
v.div_in_progress := '0';
v.cntz_in_progress := '0';

-- signals to multiply unit
x_to_multiply <= Execute1ToMultiplyInit;
x_to_multiply.insn_type <= e_in.insn_type;
x_to_multiply.is_32bit <= e_in.is_32bit;

if e_in.is_32bit = '1' then
if e_in.is_signed = '1' then
x_to_multiply.data1 <= (others => a_in(31));
x_to_multiply.data1(31 downto 0) <= a_in(31 downto 0);
x_to_multiply.data2 <= (others => b_in(31));
x_to_multiply.data2(31 downto 0) <= b_in(31 downto 0);
else
x_to_multiply.data1 <= '0' & x"00000000" & a_in(31 downto 0);
x_to_multiply.data2 <= '0' & x"00000000" & b_in(31 downto 0);
end if;
else
if e_in.is_signed = '1' then
x_to_multiply.data1 <= a_in(63) & a_in;
x_to_multiply.data2 <= b_in(63) & b_in;
else
x_to_multiply.data1 <= '0' & a_in;
x_to_multiply.data2 <= '0' & b_in;
end if;
end if;

-- signals to divide unit
-- signals to multiply and divide units
sign1 := '0';
sign2 := '0';
if e_in.is_signed = '1' then
@ -383,15 +381,22 @@ begin
abs2 := - signed(b_in);
end if;

x_to_multiply <= Execute1ToMultiplyInit;
x_to_multiply.is_32bit <= e_in.is_32bit;

x_to_divider <= Execute1ToDividerInit;
x_to_divider.is_signed <= e_in.is_signed;
x_to_divider.is_32bit <= e_in.is_32bit;
if e_in.insn_type = OP_MOD then
x_to_divider.is_modulus <= '1';
end if;

x_to_multiply.neg_result <= sign1 xor sign2;
x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus);
if e_in.is_32bit = '0' then
-- 64-bit forms
x_to_multiply.data1 <= std_ulogic_vector(abs1);
x_to_multiply.data2 <= std_ulogic_vector(abs2);
if e_in.insn_type = OP_DIVE then
x_to_divider.is_extended <= '1';
end if;
@ -399,6 +404,8 @@ begin
x_to_divider.divisor <= std_ulogic_vector(abs2);
else
-- 32-bit forms
x_to_multiply.data1 <= x"00000000" & std_ulogic_vector(abs1(31 downto 0));
x_to_multiply.data2 <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
x_to_divider.is_extended <= '0';
if e_in.insn_type = OP_DIVE then -- extended forms
x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000";
@ -426,9 +433,9 @@ begin
end if;
end if;

terminate_out <= '0';
v.terminate := '0';
icache_inval <= '0';
stall_out <= '0';
v.busy := '0';
f_out <= Execute1ToFetch1TypeInit;
-- send MSR[IR] and ~MSR[PR] up to fetch1
f_out.virt_mode <= ctrl.msr(MSR_IR);
@ -450,6 +457,9 @@ begin
v.e.exc_write_enable := '0';
v.e.exc_write_reg := fast_spr_num(SPR_SRR0);
v.e.exc_write_data := e_in.nia;
if valid_in = '1' then
v.last_nia := e_in.nia;
end if;

if ctrl.irq_state = WRITE_SRR1 then
v.e.exc_write_reg := fast_spr_num(SPR_SRR1);
@ -466,10 +476,10 @@ begin
f_out.virt_mode <= '0';
f_out.priv_mode <= '1';
f_out.redirect_nia <= ctrl.irq_nia;
v.e.valid := e_in.valid;
v.e.valid := '1';
report "Writing SRR1: " & to_hstring(ctrl.srr1);

elsif irq_valid = '1' and e_in.valid = '1' then
elsif irq_valid = '1' and valid_in = '1' then
-- we need two cycles to write srr0 and 1
-- will need more when we have to write HEIR
-- Don't deliver the interrupt until we have a valid instruction
@ -477,7 +487,7 @@ begin
exception := '1';
ctrl_tmp.srr1 <= msr_copy(ctrl.msr);

elsif e_in.valid = '1' and ctrl.msr(MSR_PR) = '1' and
elsif valid_in = '1' and ctrl.msr(MSR_PR) = '1' and
instr_is_privileged(e_in.insn_type, e_in.insn) then
-- generate a program interrupt
exception := '1';
@ -487,12 +497,13 @@ begin
ctrl_tmp.srr1(63 - 45) <= '1';
report "privileged instruction";
elsif e_in.valid = '1' and e_in.unit = ALU then
elsif valid_in = '1' and e_in.unit = ALU then

report "execute nia " & to_hstring(e_in.nia);

v.e.valid := '1';
v.e.write_reg := e_in.write_reg;
v.slow_op_insn := e_in.insn_type;
v.slow_op_dest := gspr_to_gpr(e_in.write_reg);
v.slow_op_rc := e_in.rc;
v.slow_op_oe := e_in.oe;
@ -521,7 +532,7 @@ begin
-- check bits 1-10 of the instruction to make sure it's attn
-- if not then it is illegal
if e_in.insn(10 downto 1) = "0100000000" then
terminate_out <= '1';
v.terminate := '1';
report "ATTN";
else
illegal := '1';
@ -612,16 +623,13 @@ begin
end if;
end if;
end if;
when OP_AND | OP_OR | OP_XOR =>
when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS =>
result := logical_result;
result_en := '1';
when OP_B =>
f_out.redirect <= '1';
if (insn_aa(e_in.insn)) then
f_out.redirect_nia <= b_in;
else
f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
end if;
is_branch := '1';
taken_branch := '1';
abs_branch := insn_aa(e_in.insn);
when OP_BC =>
-- read_data1 is CTR
bo := insn_bo(e_in.insn);
@ -631,14 +639,9 @@ begin
result_en := '1';
v.e.write_reg := fast_spr_num(SPR_CTR);
end if;
if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then
f_out.redirect <= '1';
if (insn_aa(e_in.insn)) then
f_out.redirect_nia <= b_in;
else
f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
end if;
end if;
is_branch := '1';
taken_branch := ppc_bc_taken(bo, bi, e_in.cr, a_in);
abs_branch := insn_aa(e_in.insn);
when OP_BCREG =>
-- read_data1 is CTR
-- read_data2 is target register (CTR, LR or TAR)
@ -649,7 +652,7 @@ begin
result_en := '1';
v.e.write_reg := fast_spr_num(SPR_CTR);
end if;
if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then
if ppc_bc_taken(bo, bi, e_in.cr, a_in) = '1' then
f_out.redirect <= '1';
f_out.redirect_nia <= b_in(63 downto 2) & "00";
end if;
@ -670,27 +673,10 @@ begin
ctrl_tmp.msr(MSR_DR) <= '1';
end if;

when OP_CMPB =>
result := ppc_cmpb(c_in, b_in);
result_en := '1';
when OP_CNTZ =>
v.e.valid := '0';
v.cntz_in_progress := '1';
stall_out <= '1';
when OP_EXTS =>
-- note data_len is a 1-hot encoding