Merge pull request #268 from paulusmack/btc

Implement branch target cache
pull/274/head
Michael Neuling 4 years ago committed by GitHub
commit 9a6a7e9fe5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -155,6 +155,7 @@ package common is
big_endian : std_ulogic;
stop_mark: std_ulogic;
sequential: std_ulogic;
predicted : std_ulogic;
nia: std_ulogic_vector(63 downto 0);
end record;

@ -165,6 +166,7 @@ package common is
nia: std_ulogic_vector(63 downto 0);
insn: std_ulogic_vector(31 downto 0);
big_endian: std_ulogic;
next_predicted: std_ulogic;
end record;

type Decode1ToDecode2Type is record
@ -195,6 +197,7 @@ package common is
insn_type: insn_type_t;
nia: std_ulogic_vector(63 downto 0);
write_reg: gspr_index_t;
write_reg_enable: std_ulogic;
read_reg1: gspr_index_t;
read_reg2: gspr_index_t;
read_data1: std_ulogic_vector(63 downto 0);
@ -210,6 +213,7 @@ package common is
rc: std_ulogic;
oe: std_ulogic;
invert_a: std_ulogic;
addm1 : std_ulogic;
invert_out: std_ulogic;
input_carry: carry_in_t;
output_carry: std_ulogic;
@ -224,18 +228,21 @@ package common is
update : std_ulogic; -- is this an update instruction?
reserve : std_ulogic; -- set for larx/stcx
br_pred : std_ulogic;
result_sel : std_ulogic_vector(2 downto 0); -- select source of result
sub_select : std_ulogic_vector(2 downto 0); -- sub-result selection
repeat : std_ulogic; -- set if instruction is cracked into two ops
second : std_ulogic; -- set if this is the second op
end record;
constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
(valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL,
bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0',
write_reg_enable => '0', bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0',
invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0',
byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'),
read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'),
cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'),
result_sel => "000", sub_select => "000",
repeat => '0', second => '0', others => (others => '0'));

type MultiplyInputType is record
@ -303,10 +310,14 @@ package common is
big_endian: std_ulogic;
mode_32bit: std_ulogic;
redirect_nia: std_ulogic_vector(63 downto 0);
br_nia : std_ulogic_vector(63 downto 0);
br_last : std_ulogic;
br_taken : std_ulogic;
end record;
constant Execute1ToFetch1Init : Execute1ToFetch1Type := (redirect => '0', virt_mode => '0',
priv_mode => '0', big_endian => '0',
mode_32bit => '0', others => (others => '0'));
mode_32bit => '0', br_taken => '0',
br_last => '0', others => (others => '0'));

type Execute1ToLoadstore1Type is record
valid : std_ulogic;
@ -365,7 +376,7 @@ package common is
virt_mode : std_ulogic;
priv_mode : std_ulogic;
addr : std_ulogic_vector(63 downto 0);
data : std_ulogic_vector(63 downto 0);
data : std_ulogic_vector(63 downto 0); -- valid the cycle after .valid = 1
byte_sel : std_ulogic_vector(7 downto 0);
end record;


@ -12,6 +12,7 @@ entity core is
DISABLE_FLATTEN : boolean := false;
EX1_BYPASS : boolean := true;
HAS_FPU : boolean := true;
HAS_BTC : boolean := true;
ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0');
LOG_LENGTH : natural := 512
);
@ -187,7 +188,8 @@ begin
fetch1_0: entity work.fetch1
generic map (
RESET_ADDRESS => (others => '0'),
ALT_RESET_ADDRESS => ALT_RESET_ADDRESS
ALT_RESET_ADDRESS => ALT_RESET_ADDRESS,
HAS_BTC => HAS_BTC
)
port map (
clk => clk,
@ -195,6 +197,7 @@ begin
alt_reset_in => alt_reset_d,
stall_in => fetch1_stall_in,
flush_in => fetch1_flush,
inval_btc => ex1_icache_inval or mmu_to_icache.tlbie,
stop_in => dbg_core_stop,
d_in => decode1_to_fetch1,
e_in => execute1_to_fetch1,

@ -1306,7 +1306,7 @@ begin
req.real_addr := ra;
-- Force data to 0 for dcbz
if r0.req.dcbz = '0' then
req.data := r0.req.data;
req.data := d_in.data;
else
req.data := (others => '0');
end if;

@ -31,6 +31,7 @@ end entity decode1;
architecture behaviour of decode1 is
signal r, rin : Decode1ToDecode2Type;
signal s : Decode1ToDecode2Type;
signal f, fin : Decode1ToFetch1Type;

constant illegal_inst : decode_rom_t :=
(NONE, NONE, OP_ILLEGAL, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE);
@ -47,6 +48,14 @@ architecture behaviour of decode1 is
signal ri, ri_in : reg_internal_t;
signal si : reg_internal_t;

type br_predictor_t is record
br_nia : std_ulogic_vector(61 downto 0);
br_offset : signed(23 downto 0);
predict : std_ulogic;
end record;

signal br, br_in : br_predictor_t;

subtype major_opcode_t is unsigned(5 downto 0);
type major_rom_array_t is array(0 to 63) of decode_rom_t;
type minor_valid_array_t is array(0 to 1023) of std_ulogic;
@ -537,6 +546,13 @@ begin
ri <= ri_in;
end if;
end if;
if rst = '1' then
br.br_nia <= (others => '0');
br.br_offset <= (others => '0');
br.predict <= '0';
else
br <= br_in;
end if;
end if;
end process;
busy_out <= s.valid;
@ -544,14 +560,13 @@ begin
decode1_1: process(all)
variable v : Decode1ToDecode2Type;
variable vi : reg_internal_t;
variable f : Decode1ToFetch1Type;
variable majorop : major_opcode_t;
variable minor4op : std_ulogic_vector(10 downto 0);
variable op_19_bits: std_ulogic_vector(2 downto 0);
variable sprn : spr_num_t;
variable br_nia : std_ulogic_vector(61 downto 0);
variable br_target : std_ulogic_vector(61 downto 0);
variable br_offset : signed(23 downto 0);
variable bv : br_predictor_t;
begin
v := Decode1ToDecode2Init;
vi := reg_internal_t_init;
@ -707,17 +722,22 @@ begin
-- Branch predictor
-- Note bclr, bcctr and bctar are predicted not taken as we have no
-- count cache or link stack.
br_nia := f_in.nia(63 downto 2);
bv.br_nia := f_in.nia(63 downto 2);
if f_in.insn(1) = '1' then
br_nia := (others => '0');
bv.br_nia := (others => '0');
end if;
bv.br_offset := br_offset;
if f_in.next_predicted = '1' then
v.br_pred := '1';
end if;
br_target := std_ulogic_vector(signed(br_nia) + br_offset);
f.redirect := v.br_pred and f_in.valid and not flush_in and not s.valid;
f.redirect_nia := br_target & "00";
bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out and not f_in.next_predicted;
-- after a clock edge...
br_target := std_ulogic_vector(signed(br.br_nia) + br.br_offset);

-- Update registers
rin <= v;
ri_in <= vi;
br_in <= bv;

-- Update outputs
d_out <= r;
@ -729,8 +749,9 @@ begin
if ri.force_single = '1' then
d_out.decode.sgl_pipe <= '1';
end if;
f_out <= f;
flush_out <= f.redirect;
f_out.redirect <= br.predict;
f_out.redirect_nia <= br_target & "00";
flush_out <= bv.predict or br.predict;
end process;

d1_log: if LOG_LENGTH > 0 generate

@ -221,6 +221,59 @@ architecture behaviour of decode2 is
end case;
end;

-- control signals that are derived from insn_type
type mux_select_array_t is array(insn_type_t) of std_ulogic_vector(2 downto 0);

constant result_select : mux_select_array_t := (
OP_AND => "001", -- logical_result
OP_OR => "001",
OP_XOR => "001",
OP_POPCNT => "001",
OP_PRTY => "001",
OP_CMPB => "001",
OP_EXTS => "001",
OP_BPERM => "001",
OP_BCD => "001",
OP_MTSPR => "001",
OP_RLC => "010", -- rotator_result
OP_RLCL => "010",
OP_RLCR => "010",
OP_SHL => "010",
OP_SHR => "010",
OP_EXTSWSLI => "010",
OP_MUL_L64 => "011", -- muldiv_result
OP_MUL_H64 => "011",
OP_MUL_H32 => "011",
OP_DIV => "011",
OP_DIVE => "011",
OP_MOD => "011",
OP_CNTZ => "100", -- countzero_result
OP_MFSPR => "101", -- spr_result
OP_ADDG6S => "111", -- misc_result
OP_ISEL => "111",
OP_DARN => "111",
OP_MFMSR => "111",
OP_MFCR => "111",
OP_SETB => "111",
others => "000" -- default to adder_result
);

constant subresult_select : mux_select_array_t := (
OP_MUL_L64 => "000", -- muldiv_result
OP_MUL_H64 => "001",
OP_MUL_H32 => "010",
OP_DIV => "011",
OP_DIVE => "011",
OP_MOD => "011",
OP_ADDG6S => "001", -- misc_result
OP_ISEL => "010",
OP_DARN => "011",
OP_MFMSR => "100",
OP_MFCR => "101",
OP_SETB => "110",
others => "000"
);

-- issue control signals
signal control_valid_in : std_ulogic;
signal control_valid_out : std_ulogic;
@ -392,6 +445,7 @@ begin
v.e.read_data3 := decoded_reg_c.data;
v.e.bypass_data3 := gpr_c_bypass;
v.e.write_reg := decoded_reg_o.reg;
v.e.write_reg_enable := decoded_reg_o.reg_valid;
v.e.rc := decode_rc(d_in.decode.rc, d_in.insn);
if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then
v.e.oe := decode_oe(d_in.decode.rc, d_in.insn);
@ -400,6 +454,16 @@ begin
v.e.bypass_cr := cr_bypass;
v.e.xerc := c_in.read_xerc_data;
v.e.invert_a := d_in.decode.invert_a;
v.e.addm1 := '0';
if d_in.decode.insn_type = OP_BC or d_in.decode.insn_type = OP_BCREG then
-- add -1 to CTR
v.e.addm1 := '1';
if d_in.insn(23) = '1' or
(d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then
-- don't write decremented CTR if BO(2) = 1 or bcctr
v.e.write_reg_enable := '0';
end if;
end if;
v.e.invert_out := d_in.decode.invert_out;
v.e.input_carry := d_in.decode.input_carry;
v.e.output_carry := d_in.decode.output_carry;
@ -415,12 +479,14 @@ begin
v.e.update := d_in.decode.update;
v.e.reserve := d_in.decode.reserve;
v.e.br_pred := d_in.br_pred;
v.e.result_sel := result_select(d_in.decode.insn_type);
v.e.sub_select := subresult_select(d_in.decode.insn_type);

-- issue control
control_valid_in <= d_in.valid;
control_sgl_pipe <= d_in.decode.sgl_pipe;

gpr_write_valid <= decoded_reg_o.reg_valid;
gpr_write_valid <= v.e.write_reg_enable;
gpr_write <= decoded_reg_o.reg;
gpr_bypassable <= '0';
if EX1_BYPASS and d_in.decode.unit = ALU then

File diff suppressed because it is too large Load Diff

@ -8,7 +8,8 @@ use work.common.all;
entity fetch1 is
generic(
RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0');
ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0')
ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0');
HAS_BTC : boolean := true
);
port(
clk : in std_ulogic;
@ -17,6 +18,7 @@ entity fetch1 is
-- Control inputs:
stall_in : in std_ulogic;
flush_in : in std_ulogic;
inval_btc : in std_ulogic;
stop_in : in std_ulogic;
alt_reset_in : in std_ulogic;

@ -37,10 +39,25 @@ end entity fetch1;
architecture behaviour of fetch1 is
type reg_internal_t is record
mode_32bit: std_ulogic;
rd_is_niap4: std_ulogic;
predicted: std_ulogic;
predicted_nia: std_ulogic_vector(63 downto 0);
end record;
signal r, r_next : Fetch1ToIcacheType;
signal r_int, r_next_int : reg_internal_t;
signal advance_nia : std_ulogic;
signal log_nia : std_ulogic_vector(42 downto 0);

constant BTC_ADDR_BITS : integer := 10;
constant BTC_TAG_BITS : integer := 62 - BTC_ADDR_BITS;
constant BTC_TARGET_BITS : integer := 62;
constant BTC_SIZE : integer := 2 ** BTC_ADDR_BITS;
constant BTC_WIDTH : integer := BTC_TAG_BITS + BTC_TARGET_BITS;
type btc_mem_type is array (0 to BTC_SIZE - 1) of std_ulogic_vector(BTC_WIDTH - 1 downto 0);

signal btc_rd_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0) := (others => '0');
signal btc_rd_valid : std_ulogic := '0';

begin

regs : process(clk)
@ -56,15 +73,70 @@ begin
" R:" & std_ulogic'image(e_in.redirect) & std_ulogic'image(d_in.redirect) &
" S:" & std_ulogic'image(stall_in) &
" T:" & std_ulogic'image(stop_in) &
" nia:" & to_hstring(r_next.nia) &
" SM:" & std_ulogic'image(r_next.stop_mark);
" nia:" & to_hstring(r_next.nia);
end if;
r <= r_next;
r_int <= r_next_int;
if rst = '1' or e_in.redirect = '1' or d_in.redirect = '1' or stall_in = '0' then
r.virt_mode <= r_next.virt_mode;
r.priv_mode <= r_next.priv_mode;
r.big_endian <= r_next.big_endian;
r_int.mode_32bit <= r_next_int.mode_32bit;
end if;
if advance_nia = '1' then
r.predicted <= r_next.predicted;
r.nia <= r_next.nia;
r_int.predicted <= r_next_int.predicted;
r_int.predicted_nia <= r_next_int.predicted_nia;
r_int.rd_is_niap4 <= r_next.sequential;
end if;
r.sequential <= r_next.sequential and advance_nia;
-- always send the up-to-date stop mark and req
r.stop_mark <= stop_in;
r.req <= not rst;
end if;
end process;
log_out <= log_nia;

btc : if HAS_BTC generate
signal btc_memory : btc_mem_type;
attribute ram_style : string;
attribute ram_style of btc_memory : signal is "block";

signal btc_valids : std_ulogic_vector(BTC_SIZE - 1 downto 0);
attribute ram_style of btc_valids : signal is "distributed";

signal btc_wr : std_ulogic;
signal btc_wr_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0);
signal btc_wr_addr : std_ulogic_vector(BTC_ADDR_BITS - 1 downto 0);
signal btc_wr_v : std_ulogic;
begin
btc_wr_data <= e_in.br_nia(63 downto BTC_ADDR_BITS + 2) &
e_in.redirect_nia(63 downto 2);
btc_wr_addr <= e_in.br_nia(BTC_ADDR_BITS + 1 downto 2);
btc_wr <= e_in.br_last;
btc_wr_v <= e_in.br_taken;

btc_ram : process(clk)
variable raddr : unsigned(BTC_ADDR_BITS - 1 downto 0);
begin
if rising_edge(clk) then
raddr := unsigned(r.nia(BTC_ADDR_BITS + 1 downto 2)) +
to_unsigned(2, BTC_ADDR_BITS);
if advance_nia = '1' then
btc_rd_data <= btc_memory(to_integer(raddr));
btc_rd_valid <= btc_valids(to_integer(raddr));
end if;
if btc_wr = '1' then
btc_memory(to_integer(unsigned(btc_wr_addr))) <= btc_wr_data;
end if;
if inval_btc = '1' or rst = '1' then
btc_valids <= (others => '0');
elsif btc_wr = '1' then
btc_valids(to_integer(unsigned(btc_wr_addr))) <= btc_wr_v;
end if;
end if;
end process;
end generate;

comb : process(all)
variable v : Fetch1ToIcacheType;
variable v_int : reg_internal_t;
@ -72,6 +144,8 @@ begin
v := r;
v_int := r_int;
v.sequential := '0';
v.predicted := '0';
v_int.predicted := '0';

if rst = '1' then
if alt_reset_in = '1' then
@ -83,6 +157,7 @@ begin
v.priv_mode := '1';
v.big_endian := '0';
v_int.mode_32bit := '0';
v_int.predicted_nia := (others => '0');
elsif e_in.redirect = '1' then
v.nia := e_in.redirect_nia(63 downto 2) & "00";
if e_in.mode_32bit = '1' then
@ -97,22 +172,26 @@ begin
if r_int.mode_32bit = '1' then
v.nia(63 downto 32) := (others => '0');
end if;
elsif stall_in = '0' then

-- If the last NIA value went down with a stop mark, it didn't get
-- executed, and hence we shouldn't increment NIA.
if r.stop_mark = '0' then
if r_int.mode_32bit = '0' then
v.nia := std_ulogic_vector(unsigned(r.nia) + 4);
else
v.nia := x"00000000" & std_ulogic_vector(unsigned(r.nia(31 downto 0)) + 4);
end if;
v.sequential := '1';
end if;
end if;
elsif r_int.predicted = '1' then
v.nia := r_int.predicted_nia;
v.predicted := '1';
else
v.sequential := '1';
v.nia := std_ulogic_vector(unsigned(r.nia) + 4);
if r_int.mode_32bit = '1' then
v.nia(63 downto 32) := x"00000000";
end if;
if btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and
btc_rd_data(BTC_WIDTH - 1 downto BTC_TARGET_BITS)
= v.nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then
v_int.predicted := '1';
end if;
end if;
v_int.predicted_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00";

v.req := not rst and not stop_in;
v.stop_mark := stop_in;
-- If the last NIA value went down with a stop mark, it didn't get
-- executed, and hence we shouldn't increment NIA.
advance_nia <= rst or e_in.redirect or d_in.redirect or (not r.stop_mark and not stall_in);

r_next <= v;
r_next_int <= v_int;

@ -15,6 +15,7 @@ entity toplevel is
RESET_LOW : boolean := true;
CLK_FREQUENCY : positive := 100000000;
HAS_FPU : boolean := true;
HAS_BTC : boolean := true;
USE_LITEDRAM : boolean := false;
NO_BRAM : boolean := false;
DISABLE_FLATTEN_CORE : boolean := false;
@ -170,6 +171,7 @@ begin
SIM => false,
CLK_FREQ => CLK_FREQUENCY,
HAS_FPU => HAS_FPU,
HAS_BTC => HAS_BTC,
HAS_DRAM => USE_LITEDRAM,
DRAM_SIZE => 256 * 1024 * 1024,
DRAM_INIT_SIZE => PAYLOAD_SIZE,

@ -12,6 +12,7 @@ entity toplevel is
CLK_INPUT : positive := 100000000;
CLK_FREQUENCY : positive := 100000000;
HAS_FPU : boolean := true;
HAS_BTC : boolean := false;
LOG_LENGTH : natural := 512;
DISABLE_FLATTEN_CORE : boolean := false;
UART_IS_16550 : boolean := true
@ -71,6 +72,7 @@ begin
SIM => false,
CLK_FREQ => CLK_FREQUENCY,
HAS_FPU => HAS_FPU,
HAS_BTC => HAS_BTC,
LOG_LENGTH => LOG_LENGTH,
DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE,
UART0_IS_16550 => UART_IS_16550

@ -15,6 +15,7 @@ entity toplevel is
RESET_LOW : boolean := true;
CLK_FREQUENCY : positive := 100000000;
HAS_FPU : boolean := true;
HAS_BTC : boolean := true;
USE_LITEDRAM : boolean := false;
NO_BRAM : boolean := false;
DISABLE_FLATTEN_CORE : boolean := false;
@ -122,6 +123,7 @@ begin
SIM => false,
CLK_FREQ => CLK_FREQUENCY,
HAS_FPU => HAS_FPU,
HAS_BTC => HAS_BTC,
HAS_DRAM => USE_LITEDRAM,
DRAM_SIZE => 512 * 1024 * 1024,
DRAM_INIT_SIZE => PAYLOAD_SIZE,

@ -565,6 +565,7 @@ begin
i_out.stop_mark <= r.hit_smark;
i_out.fetch_failed <= r.fetch_failed;
i_out.big_endian <= r.big_endian;
i_out.next_predicted <= i_in.predicted;

-- Stall fetch1 if we have a miss on cache or TLB or a protection fault
stall_out <= not (is_hit and access_ok);

@ -45,7 +45,6 @@ architecture behave of loadstore1 is

-- State machine for unaligned loads/stores
type state_t is (IDLE, -- ready for instruction
FPR_CONV, -- converting double to float for store
SECOND_REQ, -- send 2nd request of unaligned xfer
ACK_WAIT, -- waiting for ack from dcache
MMU_LOOKUP, -- waiting for MMU to look up translation
@ -54,18 +53,23 @@ architecture behave of loadstore1 is
COMPLETE -- extra cycle to complete an operation
);

type byte_index_t is array(0 to 7) of unsigned(2 downto 0);
subtype byte_trim_t is std_ulogic_vector(1 downto 0);
type trim_ctl_t is array(0 to 7) of byte_trim_t;

type reg_stage_t is record
-- latch most of the input request
load : std_ulogic;
tlbie : std_ulogic;
dcbz : std_ulogic;
mfspr : std_ulogic;
addr : std_ulogic_vector(63 downto 0);
store_data : std_ulogic_vector(63 downto 0);
load_data : std_ulogic_vector(63 downto 0);
write_reg : gspr_index_t;
length : std_ulogic_vector(3 downto 0);
byte_reverse : std_ulogic;
byte_offset : unsigned(2 downto 0);
brev_mask : unsigned(2 downto 0);
sign_extend : std_ulogic;
update : std_ulogic;
update_reg : gpr_index_t;
@ -93,17 +97,16 @@ architecture behave of loadstore1 is
do_update : std_ulogic;
extra_cycle : std_ulogic;
mode_32bit : std_ulogic;
byte_index : byte_index_t;
use_second : std_ulogic_vector(7 downto 0);
trim_ctl : trim_ctl_t;
load_sp : std_ulogic;
ld_sp_data : std_ulogic_vector(31 downto 0);
ld_sp_nz : std_ulogic;
ld_sp_lz : std_ulogic_vector(5 downto 0);
st_sp_data : std_ulogic_vector(31 downto 0);
wr_sel : std_ulogic_vector(1 downto 0);
end record;

type byte_sel_t is array(0 to 7) of std_ulogic;
subtype byte_trim_t is std_ulogic_vector(1 downto 0);
type trim_ctl_t is array(0 to 7) of byte_trim_t;

signal r, rin : reg_stage_t;
signal lsu_sum : std_ulogic_vector(63 downto 0);

@ -296,11 +299,8 @@ begin
variable data_permuted : std_ulogic_vector(63 downto 0);
variable data_trimmed : std_ulogic_vector(63 downto 0);
variable store_data : std_ulogic_vector(63 downto 0);
variable data_in : std_ulogic_vector(63 downto 0);
variable byte_rev : std_ulogic;
variable length : std_ulogic_vector(3 downto 0);
variable use_second : byte_sel_t;
variable trim_ctl : trim_ctl_t;
variable negative : std_ulogic;
variable sprn : std_ulogic_vector(9 downto 0);
variable exception : std_ulogic;
@ -310,37 +310,25 @@ begin
variable mmu_mtspr : std_ulogic;
variable itlb_fault : std_ulogic;
variable misaligned : std_ulogic;
variable fp_reg_conv : std_ulogic;
variable lfs_done : std_ulogic;
begin
v := r;
req := '0';
v.mfspr := '0';
mmu_mtspr := '0';
itlb_fault := '0';
sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10));
dsisr := (others => '0');
mmureq := '0';
fp_reg_conv := '0';
v.wr_sel := "11";

write_enable := '0';
lfs_done := '0';

do_update := r.do_update;
v.do_update := '0';

-- load data formatting
byte_offset := unsigned(r.addr(2 downto 0));
brev_lenm1 := "000";
if r.byte_reverse = '1' then
brev_lenm1 := unsigned(r.length(2 downto 0)) - 1;
end if;

-- shift and byte-reverse data bytes
for i in 0 to 7 loop
kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
use_second(i) := kk(3);
j := to_integer(kk(2 downto 0)) * 8;
j := to_integer(r.byte_index(i)) * 8;
data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j);
end loop;

@ -362,62 +350,32 @@ begin

-- trim and sign-extend
for i in 0 to 7 loop
if i < to_integer(unsigned(r.length)) then
if r.dwords_done = '1' then
trim_ctl(i) := '1' & not use_second(i);
else
trim_ctl(i) := "10";
end if;
else
trim_ctl(i) := '0' & (negative and r.sign_extend);
end if;
case trim_ctl(i) is
case r.trim_ctl(i) is
when "11" =>
data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8);
when "10" =>
data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8);
when "01" =>
data_trimmed(i * 8 + 7 downto i * 8) := x"FF";
data_trimmed(i * 8 + 7 downto i * 8) := (others => negative);
when others =>
data_trimmed(i * 8 + 7 downto i * 8) := x"00";
end case;
end loop;

if HAS_FPU then
-- Single-precision FP conversion
v.st_sp_data := store_sp_data;
-- Single-precision FP conversion for loads
v.ld_sp_data := data_trimmed(31 downto 0);
v.ld_sp_nz := or (data_trimmed(22 downto 0));
v.ld_sp_lz := count_left_zeroes(data_trimmed(22 downto 0));
end if;

-- Byte reversing and rotating for stores.
-- Done in the first cycle (when l_in.valid = 1) for integer stores
-- and DP float stores, and in the second cycle for SP float stores.
store_data := r.store_data;
if l_in.valid = '1' or (HAS_FPU and r.state = FPR_CONV) then
if HAS_FPU and r.state = FPR_CONV then
data_in := x"00000000" & r.st_sp_data;
byte_offset := unsigned(r.addr(2 downto 0));
byte_rev := r.byte_reverse;
length := r.length;
else
data_in := l_in.data;
byte_offset := unsigned(lsu_sum(2 downto 0));
byte_rev := l_in.byte_reverse;
length := l_in.length;
end if;
brev_lenm1 := "000";
if byte_rev = '1' then
brev_lenm1 := unsigned(length(2 downto 0)) - 1;
end if;
for i in 0 to 7 loop
k := (to_unsigned(i, 3) - byte_offset) xor brev_lenm1;
j := to_integer(k) * 8;
store_data(i * 8 + 7 downto i * 8) := data_in(j + 7 downto j);
end loop;
end if;
v.store_data := store_data;
-- Done in the second cycle (the cycle after l_in.valid = 1).
for i in 0 to 7 loop
k := (to_unsigned(i, 3) - r.byte_offset) xor r.brev_mask;
j := to_integer(k) * 8;
store_data(i * 8 + 7 downto i * 8) := r.store_data(j + 7 downto j);
end loop;

-- compute (addr + 8) & ~7 for the second doubleword when unaligned
next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000";
@ -449,20 +407,17 @@ begin
case r.state is
when IDLE =>

when FPR_CONV =>
req := '1';
if r.second_bytes /= "00000000" then
v.state := SECOND_REQ;
else
v.state := ACK_WAIT;
end if;

when SECOND_REQ =>
req := '1';
v.state := ACK_WAIT;
v.last_dword := '0';

when ACK_WAIT =>
-- r.wr_sel gets set one cycle after we come into ACK_WAIT state,
-- which is OK because the dcache always takes at least two cycles.
if r.update = '1' and (r.load = '0' or (HAS_FPU and r.load_sp = '1')) then
v.wr_sel := "01";
end if;
if d_in.error = '1' then
-- dcache will discard the second request if it
-- gets an error on the 1st of two requests
@ -493,9 +448,11 @@ begin
-- SP to DP conversion takes a cycle
-- Write back rA update in this cycle if needed
do_update := r.update;
v.wr_sel := "10";
v.state := FINISH_LFS;
elsif r.extra_cycle = '1' then
-- loads with rA update need an extra cycle
v.wr_sel := "01";
v.state := COMPLETE;
v.do_update := r.update;
else
@ -533,7 +490,6 @@ begin
when TLBIE_WAIT =>

when FINISH_LFS =>
lfs_done := '1';

when COMPLETE =>
exception := r.align_intr;
@ -573,6 +529,12 @@ begin
v.do_update := '0';
v.extra_cycle := '0';

if HAS_FPU and l_in.is_32bit = '1' then
v.store_data := x"00000000" & store_sp_data;
else
v.store_data := l_in.data;
end if;

addr := lsu_sum;
if l_in.second = '1' then
-- for the second half of a 16-byte transfer, use next_addr
@ -621,12 +583,7 @@ begin

case l_in.op is
when OP_STORE =>
if HAS_FPU and l_in.is_32bit = '1' then
v.state := FPR_CONV;
fp_reg_conv := '1';
else
req := '1';
end if;
req := '1';
when OP_LOAD =>
req := '1';
v.load := '1';
@ -647,7 +604,7 @@ begin
v.state := TLBIE_WAIT;
v.wait_mmu := '1';
when OP_MFSPR =>
v.mfspr := '1';
v.wr_sel := "00";
-- partial decode on SPR number should be adequate given
-- the restricted set that get sent down this path
if sprn(9) = '0' and sprn(5) = '0' then
@ -696,9 +653,47 @@ begin
end if;
end if;

v.busy := req or mmureq or mmu_mtspr or fp_reg_conv;
v.busy := req or mmureq or mmu_mtspr;
end if;

-- Work out controls for store formatting
if l_in.valid = '1' then
byte_offset := unsigned(lsu_sum(2 downto 0));
byte_rev := l_in.byte_reverse;
length := l_in.length;
brev_lenm1 := "000";
if byte_rev = '1' then
brev_lenm1 := unsigned(length(2 downto 0)) - 1;
end if;
v.byte_offset := byte_offset;
v.brev_mask := brev_lenm1;
end if;

-- Work out load formatter controls for next cycle
byte_offset := unsigned(v.addr(2 downto 0));
brev_lenm1 := "000";
if v.byte_reverse = '1' then
brev_lenm1 := unsigned(v.length(2 downto 0)) - 1;
end if;

for i in 0 to 7 loop
kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
v.use_second(i) := kk(3);
v.byte_index(i) := kk(2 downto 0);
end loop;

for i in 0 to 7 loop
if i < to_integer(unsigned(v.length)) then
if v.dwords_done = '1' then
v.trim_ctl(i) := '1' & not v.use_second(i);
else
v.trim_ctl(i) := "10";
end if;
else
v.trim_ctl(i) := '0' & v.sign_extend;
end if;
end loop;

-- Update outputs to dcache
d_out.valid <= req and not v.align_intr;
d_out.load <= v.load;
@ -729,23 +724,24 @@ begin
-- Multiplex either cache data to the destination GPR or
-- the address for the rA update.
l_out.valid <= done;
if r.mfspr = '1' then
case r.wr_sel is
when "00" =>
l_out.write_enable <= '1';
l_out.write_reg <= r.write_reg;
l_out.write_data <= r.sprval;
elsif do_update = '1' then
l_out.write_enable <= '1';
when "01" =>
l_out.write_enable <= do_update;
l_out.write_reg <= gpr_to_gspr(r.update_reg);
l_out.write_data <= r.addr;
elsif lfs_done = '1' then
when "10" =>
l_out.write_enable <= '1';
l_out.write_reg <= r.write_reg;
l_out.write_data <= load_dp_data;
else
when others =>
l_out.write_enable <= write_enable;
l_out.write_reg <= r.write_reg;
l_out.write_data <= data_trimmed;
end if;
end case;
l_out.xerc <= r.xerc;
l_out.rc <= r.rc and done;
l_out.store_done <= d_in.store_done;

@ -197,8 +197,7 @@ begin
tmp := x"00" & dpd_to_bcd(rs(51 downto 42)) & dpd_to_bcd(rs(41 downto 32)) &
x"00" & dpd_to_bcd(rs(19 downto 10)) & dpd_to_bcd(rs(9 downto 0));
end if;
when others =>
-- EXTS
when OP_EXTS =>
-- note datalen is a 1-hot encoding
negative := (datalen(0) and rs(7)) or
(datalen(1) and rs(15)) or
@ -211,6 +210,9 @@ begin
tmp(15 downto 8) := rs(15 downto 8);
end if;
tmp(7 downto 0) := rs(7 downto 0);
when others =>
-- e.g. OP_MTSPR
tmp := rs;
end case;

result <= tmp;

@ -134,6 +134,7 @@ targets:
- log_length=2048
- uart_is_16550
- has_fpu
- has_btc
tools:
vivado: {part : xc7a100tcsg324-1}
toplevel : toplevel
@ -218,6 +219,7 @@ targets:
- log_length=2048
- uart_is_16550
- has_fpu
- has_btc
tools:
vivado: {part : xc7a200tsbg484-1}
toplevel : toplevel
@ -235,6 +237,7 @@ targets:
- log_length=2048
- uart_is_16550
- has_fpu
- has_btc
generate: [litedram_nexys_video]
tools:
vivado: {part : xc7a200tsbg484-1}
@ -254,6 +257,7 @@ targets:
- uart_is_16550
- has_uart1
- has_fpu=false
- has_btc=false
tools:
vivado: {part : xc7a35ticsg324-1L}
toplevel : toplevel
@ -273,6 +277,7 @@ targets:
- uart_is_16550
- has_uart1
- has_fpu=false
- has_btc=false
generate: [litedram_arty, liteeth_arty]
tools:
vivado: {part : xc7a35ticsg324-1L}
@ -292,6 +297,7 @@ targets:
- uart_is_16550
- has_uart1
- has_fpu
- has_btc
tools:
vivado: {part : xc7a100ticsg324-1L}
toplevel : toplevel
@ -311,6 +317,7 @@ targets:
- uart_is_16550
- has_uart1
- has_fpu
- has_btc
generate: [litedram_arty, liteeth_arty]
tools:
vivado: {part : xc7a100ticsg324-1L}
@ -329,6 +336,7 @@ targets:
- log_length=512
- uart_is_16550
- has_fpu=false
- has_btc=false
tools:
vivado: {part : xc7a35tcpg236-1}
toplevel : toplevel
@ -395,6 +403,12 @@ parameters:
paramtype : generic
default : true

has_btc:
datatype : bool
description : Include a branch target cache in the core
paramtype : generic
default : true

disable_flatten_core:
datatype : bool
description : Prevent Vivado from flattening the main core components

@ -53,6 +53,7 @@ entity soc is
CLK_FREQ : positive;
SIM : boolean;
HAS_FPU : boolean := true;
HAS_BTC : boolean := true;
DISABLE_FLATTEN_CORE : boolean := false;
HAS_DRAM : boolean := false;
DRAM_SIZE : integer := 0;
@ -255,6 +256,7 @@ begin
generic map(
SIM => SIM,
HAS_FPU => HAS_FPU,
HAS_BTC => HAS_BTC,
DISABLE_FLATTEN => DISABLE_FLATTEN_CORE,
ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1'),
LOG_LENGTH => LOG_LENGTH

Loading…
Cancel
Save