You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
microwatt/loadstore1.vhdl

278 lines
9.3 KiB
VHDL

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
library work;
use work.common.all;
use work.helpers.all;
-- 2 cycle LSU
-- We calculate the address in the first cycle
-- Load/store unit, stage 1.
-- Accepts a load/store request on l_in, issues one or two requests to the
-- dcache on d_out, and presents the result (load data and/or the rA update
-- value) to writeback on l_out.
entity loadstore1 is
port (
clk : in std_ulogic; -- clock; internal registers update on the rising edge
rst : in std_ulogic; -- synchronous reset; only the state machine state is reset (to IDLE)
l_in : in Execute1ToLoadstore1Type; -- incoming request (valid, addr1/addr2, data, length, ...)
l_out : out Loadstore1ToWritebackType; -- result / register write presented to writeback
d_out : out Loadstore1ToDcacheType; -- request presented to the dcache
d_in : in DcacheToLoadstore1Type; -- dcache response (valid, data, store_done)
dc_stall : in std_ulogic; -- dcache stall; not read by the 'behave' architecture
stall_out : out std_ulogic -- '1' while an operation is in progress and not yet completing
);
end loadstore1;
-- Note, we don't currently use the stall output from the dcache because
-- we know it can take two requests without stalling when idle, we are
-- its only user, and we know it never stalls when idle.
architecture behave of loadstore1 is
-- State machine for unaligned loads/stores.
-- An access whose byte selects fit in one doubleword goes
-- IDLE -> LAST_ACK_WAIT; one that spills into the next doubleword goes
-- IDLE -> SECOND_REQ -> FIRST_ACK_WAIT -> LAST_ACK_WAIT.
-- Loads with update add a final LD_UPDATE cycle before returning to IDLE.
type state_t is (IDLE, -- ready for instruction
SECOND_REQ, -- send 2nd request of unaligned xfer
FIRST_ACK_WAIT, -- waiting for 1st ack from dcache
LAST_ACK_WAIT, -- waiting for last ack from dcache
LD_UPDATE -- writing rA with computed addr on load
);
-- Registered state of the unit: a latched copy of the request being
-- processed plus the state machine state.
type reg_stage_t is record
-- latch most of the input request
load : std_ulogic; -- '1' for a load, '0' for a store
addr : std_ulogic_vector(63 downto 0); -- effective address (latched lsu_sum)
data : std_ulogic_vector(63 downto 0); -- store data, already byte-rotated/reversed for stores
write_reg : gpr_index_t; -- destination register reported to writeback for load data
length : std_ulogic_vector(3 downto 0); -- transfer size; see length_to_sel for the recognised encodings
byte_reverse : std_ulogic; -- forwarded to writeback; also reverses store data bytes
sign_extend : std_ulogic; -- forwarded to writeback
update : std_ulogic; -- '1' if the computed address must also be written to update_reg
update_reg : gpr_index_t; -- register that receives the address on an update
xerc : xer_common_t; -- forwarded unchanged to writeback
reserve : std_ulogic; -- forwarded unchanged to the dcache
rc : std_ulogic; -- forwarded to writeback when the operation completes
nc : std_ulogic; -- non-cacheable access
state : state_t; -- current state of the unaligned-access state machine
second_bytes : std_ulogic_vector(7 downto 0); -- byte selects for the second doubleword; all zero if not needed
end record;
-- r is the registered state; rin is its combinationally computed next value.
signal r, rin : reg_stage_t;
-- Effective address: l_in.addr1 + l_in.addr2 when l_in.valid = '1', else zero.
signal lsu_sum : std_ulogic_vector(63 downto 0);
-- Translate a transfer size into a right-justified byte-enable mask.
-- Recognised encodings are the one-hot values "0001", "0010", "0100" and
-- "1000", giving 1, 2, 4 and 8 enabled bytes respectively. Every other
-- input (including metavalues) yields an all-zero mask.
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
begin
    if length = "0001" then
        return "00000001";
    end if;
    if length = "0010" then
        return "00000011";
    end if;
    if length = "0100" then
        return "00001111";
    end if;
    if length = "1000" then
        return "11111111";
    end if;
    -- Unrecognised size: enable nothing
    return "00000000";
end function length_to_sel;
-- Build the byte enables for an access of the given size at the given
-- byte offset within a doubleword.
-- The result is 16 bits wide: bits 7..0 are the selects for the first
-- transfer and bits 15..8 the selects for a second transfer, which is
-- needed when an unaligned access runs past the end of the doubleword.
function xfer_data_sel(size : in std_logic_vector(3 downto 0);
                       address : in std_logic_vector(2 downto 0))
    return std_ulogic_vector is
    variable base_sel : std_ulogic_vector(15 downto 0);
    variable lane     : natural;
begin
    -- Right-justified mask for the size, widened to two doublewords
    base_sel := "00000000" & length_to_sel(size);
    -- Slide the mask up by the starting byte lane
    lane := to_integer(unsigned(address));
    return std_ulogic_vector(shift_left(unsigned(base_sel), lane));
end function xfer_data_sel;
begin
-- Effective address adder (first cycle).
-- lsu_sum is l_in.addr1 + l_in.addr2 while the request is valid and is
-- forced to zero otherwise.
lsu_adder: process(all)
begin
    if l_in.valid = '1' then
        lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2));
    else
        lsu_sum <= (others => '0');
    end if;
end process lsu_adder;
-- Clocked register stage.
-- On each rising clock edge r takes the value of rin, except while rst is
-- '1', when only r.state is forced to IDLE and the other fields of r are
-- left unchanged.
loadstore1_0: process(clk)
begin
    if rising_edge(clk) then
        if rst /= '1' then
            r <= rin;
        else
            r.state <= IDLE;
        end if;
    end if;
end process loadstore1_0;
-- Combinational next-state and output logic for the load/store state machine.
--
-- Starting from the registered state r, this process computes:
--   * rin       - the value r takes at the next rising clock edge
--   * d_out     - the request presented to the dcache this cycle
--   * l_out     - the result / register write presented to writeback
--   * stall_out - '1' in every cycle of an operation except the one in
--                 which it completes (done = '1')
loadstore1_1: process(all)
    variable v            : reg_stage_t;                    -- working copy of r; becomes rin
    variable brev_lenm1   : unsigned(2 downto 0);           -- (length - 1) when byte-reversing a store, else 0
    variable byte_offset  : unsigned(2 downto 0);           -- low three bits of the effective address
    variable j            : integer;                        -- bit index of the destination byte in v.data
    variable k            : unsigned(2 downto 0);           -- destination byte lane (wraps modulo 8)
    variable long_sel     : std_ulogic_vector(15 downto 0); -- byte selects across both possible transfers
    variable byte_sel     : std_ulogic_vector(7 downto 0);  -- byte selects for the request issued this cycle
    variable req          : std_ulogic;                     -- drives d_out.valid
    variable stall        : std_ulogic;                     -- drives stall_out
    variable addr         : std_ulogic_vector(63 downto 0); -- drives d_out.addr
    variable write_enable : std_ulogic;                     -- load data is being written to r.write_reg
    variable do_update    : std_ulogic;                     -- the address is being written to r.update_reg
    variable second_dword : std_ulogic;                     -- drives l_out.second_word
    variable done         : std_ulogic;                     -- drives l_out.valid
begin
    -- Defaults: hold state, no dcache request, no writeback activity
    v := r;
    req := '0';
    stall := '0';
    done := '0';
    byte_sel := (others => '0');
    addr := lsu_sum;
    write_enable := '0';
    do_update := '0';
    second_dword := '0';

    case r.state is
        when IDLE =>
            if l_in.valid = '1' then
                -- Latch the incoming request
                v.load := l_in.load;
                v.addr := lsu_sum;
                v.data := l_in.data;
                v.write_reg := l_in.write_reg;
                v.length := l_in.length;
                v.byte_reverse := l_in.byte_reverse;
                v.sign_extend := l_in.sign_extend;
                v.update := l_in.update;
                v.update_reg := l_in.update_reg;
                v.xerc := l_in.xerc;
                v.reserve := l_in.reserve;
                v.rc := l_in.rc;

                -- XXX Temporary hack. Mark the op as non-cacheable if the
                -- address is of the form 0xc-------
                --
                -- This will have to be replaced by a combination of
                -- implementing the proper HV CI load/store instructions and
                -- having an MMU to get the I bit otherwise.
                if lsu_sum(31 downto 28) = "1100" then
                    v.nc := '1';
                else
                    v.nc := '0';
                end if;

                -- Do length_to_sel and work out if we are doing 2 dwords:
                -- the low byte of long_sel is used now, the high byte is
                -- saved for a possible second request.
                long_sel := xfer_data_sel(l_in.length, lsu_sum(2 downto 0));
                byte_sel := long_sel(7 downto 0);
                v.second_bytes := long_sel(15 downto 8);

                -- Do byte reversing and rotating for stores in the first cycle
                if v.load = '0' then
                    byte_offset := unsigned(lsu_sum(2 downto 0));
                    brev_lenm1 := "000";
                    if l_in.byte_reverse = '1' then
                        brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
                    end if;
                    -- Source byte i lands in lane ((i xor brev_lenm1) +
                    -- byte_offset) mod 8: the xor reverses the byte order
                    -- within the access, the add rotates it to the lanes
                    -- selected by the address.
                    for i in 0 to 7 loop
                        k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
                        j := to_integer(k) * 8;
                        v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
                    end loop;
                end if;

                -- Issue the first (possibly only) dcache request
                req := '1';
                stall := '1';
                if long_sel(15 downto 8) = "00000000" then
                    -- Everything fits in one doubleword
                    v.state := LAST_ACK_WAIT;
                else
                    v.state := SECOND_REQ;
                end if;
            end if;

        when SECOND_REQ =>
            -- compute (addr + 8) & ~7 for the second doubleword when unaligned
            addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000";
            byte_sel := r.second_bytes;
            req := '1';
            stall := '1';
            v.state := FIRST_ACK_WAIT;

        when FIRST_ACK_WAIT =>
            stall := '1';
            if d_in.valid = '1' then
                -- First half of an unaligned load: write it back now
                write_enable := r.load;
                v.state := LAST_ACK_WAIT;
            end if;

        when LAST_ACK_WAIT =>
            stall := '1';
            -- Flag the data as the second doubleword if a second request was made
            second_dword := or (r.second_bytes);
            if d_in.valid = '1' then
                write_enable := r.load;
                if r.load = '1' and r.update = '1' then
                    -- loads with rA update need an extra cycle
                    v.state := LD_UPDATE;
                else
                    -- stores write back rA update in this cycle
                    do_update := r.update;
                    stall := '0';
                    done := '1';
                    v.state := IDLE;
                end if;
            end if;

        when LD_UPDATE =>
            do_update := '1';
            v.state := IDLE;
            done := '1';
    end case;

    -- Update registers
    rin <= v;

    -- Update outputs to dcache
    d_out.valid <= req;
    d_out.load <= v.load;
    d_out.nc <= v.nc;
    d_out.reserve <= v.reserve;
    d_out.addr <= addr;
    d_out.data <= v.data;
    d_out.byte_sel <= byte_sel;

    -- Update outputs to writeback
    -- Multiplex either cache data to the destination GPR or
    -- the address for the rA update.
    l_out.valid <= done;
    if do_update = '1' then
        -- rA update: write the full 64-bit address, unshifted
        l_out.write_enable <= '1';
        l_out.write_reg <= r.update_reg;
        l_out.write_data <= r.addr;
        l_out.write_len <= x"8";
        l_out.write_shift <= "000";
        l_out.sign_extend <= '0';
        l_out.byte_reverse <= '0';
        l_out.second_word <= '0';
        l_out.rc <= '0';
        l_out.store_done <= '0';
    else
        -- Load data (or store completion) straight from the dcache response
        l_out.write_enable <= write_enable;
        l_out.write_reg <= r.write_reg;
        l_out.write_data <= d_in.data;
        l_out.write_len <= r.length;
        l_out.write_shift <= r.addr(2 downto 0);
        l_out.sign_extend <= r.sign_extend;
        l_out.byte_reverse <= r.byte_reverse;
        l_out.second_word <= second_dword;
        l_out.rc <= r.rc and done;
        l_out.store_done <= d_in.store_done;
    end if;
    l_out.xerc <= r.xerc;

    stall_out <= stall;
end process;
end;