|
|
|
library ieee;
|
|
|
|
use ieee.std_logic_1164.all;
|
|
|
|
use ieee.numeric_std.all;
|
|
|
|
|
|
|
|
library work;
|
|
|
|
use work.decode_types.all;
|
|
|
|
use work.common.all;
|
|
|
|
use work.insn_helpers.all;
|
|
|
|
use work.helpers.all;
|
|
|
|
|
|
|
|
-- 2 cycle LSU
|
|
|
|
-- We calculate the address in the first cycle
|
|
|
|
|
|
|
|
-- Load/store unit, first stage.
-- Interface peers are identified by the record type names of the ports
-- (execute1, writeback, dcache, mmu).
entity loadstore1 is
    generic (
        -- When true, the single/double precision conversion logic for
        -- FP loads and stores is instantiated.
        HAS_FPU    : boolean := true;
        -- Non-zero to enable log data collection
        LOG_LENGTH : natural := 0
        );
    port (
        -- Clock and reset (reset is sampled on the rising clock edge)
        clk      : in  std_ulogic;
        rst      : in  std_ulogic;

        -- Request in; results out
        l_in     : in  Execute1ToLoadstore1Type;
        e_out    : out Loadstore1ToExecute1Type;
        l_out    : out Loadstore1ToWritebackType;

        -- Data cache request / response
        d_out    : out Loadstore1ToDcacheType;
        d_in     : in  DcacheToLoadstore1Type;

        -- MMU request / response
        m_out    : out Loadstore1ToMmuType;
        m_in     : in  MmuToLoadstore1Type;

        -- Stall indication from the dcache
        dc_stall : in  std_ulogic;

        -- Log data output
        log_out  : out std_ulogic_vector(9 downto 0)
        );
end entity loadstore1;
|
|
|
|
|
|
|
|
-- Note, we don't currently use the stall output from the dcache because
|
|
|
|
-- we know it can take two requests without stalling when idle, we are
|
|
|
|
-- its only user, and we know it never stalls when idle.
|
|
|
|
|
|
|
|
architecture behave of loadstore1 is
|
|
|
|
|
|
|
|
-- States of the load/store sequencer (handles unaligned transfers that
-- need two dcache requests, MMU lookups and completion cycles)
type state_t is (
    IDLE,        -- ready for instruction
    SECOND_REQ,  -- send 2nd request of unaligned xfer
    ACK_WAIT,    -- waiting for ack from dcache
    MMU_LOOKUP,  -- waiting for MMU to look up translation
    TLBIE_WAIT,  -- waiting for MMU to finish doing a tlbie
    FINISH_LFS,  -- write back converted SP data for lfs*
    COMPLETE     -- extra cycle to complete an operation
    );
|
|
|
|
|
|
|
|
-- Per-byte source index: for each of the 8 result bytes, which of the 8
-- bytes of the dcache data to take.
type byte_index_t is array(0 to 7) of unsigned(2 downto 0);

-- Per-byte 2-bit control code used when trimming / sign-extending load data.
subtype byte_trim_t is std_ulogic_vector(1 downto 0);
type trim_ctl_t is array(0 to 7) of byte_trim_t;
|
|
|
|
|
|
|
|
-- Registered state of the load/store unit.  One instance (r) holds the
-- current state; a second (rin) holds the next-state value.
type reg_stage_t is record
    -- latch most of the input request
    load         : std_ulogic;
    tlbie        : std_ulogic;
    dcbz         : std_ulogic;
    addr         : std_ulogic_vector(63 downto 0);
    store_data   : std_ulogic_vector(63 downto 0);
    -- permuted data captured from the first doubleword of a two-dword load
    load_data    : std_ulogic_vector(63 downto 0);
    write_reg    : gspr_index_t;
    length       : std_ulogic_vector(3 downto 0);
    byte_reverse : std_ulogic;
    -- byte_offset / brev_mask drive the byte rotation and reversal of store data
    byte_offset  : unsigned(2 downto 0);
    brev_mask    : unsigned(2 downto 0);
    sign_extend  : std_ulogic;
    update       : std_ulogic;
    update_reg   : gpr_index_t;
    xerc         : xer_common_t;
    reserve      : std_ulogic;
    -- NOTE(review): atomic / atomic_last are not referenced in the visible
    -- part of this file; they appear to belong to the quadword load/store
    -- support (lq, stq, lqarx, stqcx.), where each instruction is issued as
    -- two iterations from decode2 -- confirm against the rest of the file.
    atomic       : std_ulogic;
    atomic_last  : std_ulogic;
    rc           : std_ulogic;
    nc           : std_ulogic;              -- non-cacheable access
    virt_mode    : std_ulogic;
    priv_mode    : std_ulogic;
    state        : state_t;
    -- progress tracking for transfers that span two doublewords
    dwords_done  : std_ulogic;
    last_dword   : std_ulogic;
    -- byte enables for the first and second doubleword requests
    first_bytes  : std_ulogic_vector(7 downto 0);
    second_bytes : std_ulogic_vector(7 downto 0);
    dar          : std_ulogic_vector(63 downto 0);
    dsisr        : std_ulogic_vector(31 downto 0);
    -- When set, a completed MMU lookup does not retry a dcache request.
    -- NOTE(review): per the commit text that was embedded here, a failed
    -- instruction fetch (icache TLB miss or privilege violation) arrives as
    -- OP_FETCH_FAILED and raises an Instruction Storage Interrupt (0x400);
    -- confirm that this flag is what records it.
    instr_fault  : std_ulogic;
    align_intr   : std_ulogic;
    sprval       : std_ulogic_vector(63 downto 0);
    -- busy is cleared when the awaited dcache / MMU response arrives
    busy         : std_ulogic;
    wait_dcache  : std_ulogic;
    wait_mmu     : std_ulogic;
    do_update    : std_ulogic;
    extra_cycle  : std_ulogic;
    -- when set, the upper 32 bits of the effective address are forced to 0
    mode_32bit   : std_ulogic;
    -- per-byte source selection used to permute dcache load data
    byte_index   : byte_index_t;
    use_second   : std_ulogic_vector(7 downto 0);
    -- per-byte trim / sign-extend control for load data
    trim_ctl     : trim_ctl_t;
    -- single-precision FP load handling
    load_sp      : std_ulogic;
    ld_sp_data   : std_ulogic_vector(31 downto 0);
    ld_sp_nz     : std_ulogic;
    ld_sp_lz     : std_ulogic_vector(5 downto 0);
    wr_sel       : std_ulogic_vector(1 downto 0);
end record;
|
|
|
|
|
|
|
|
-- Current registered state and its combinatorially computed next value
signal r   : reg_stage_t;
signal rin : reg_stage_t;

-- Effective address sum (addr1 + addr2) of the incoming request
signal lsu_sum : std_ulogic_vector(63 downto 0);

-- Single <-> double precision conversion results
-- (driven only when HAS_FPU is true)
signal store_sp_data : std_ulogic_vector(31 downto 0);
signal load_dp_data  : std_ulogic_vector(63 downto 0);
|
|
|
|
|
|
|
|
-- Generate byte enables from sizes.
-- 'length' is a one-hot transfer size of 1, 2, 4 or 8 bytes; the result has
-- that many low-order bits set.  Any other encoding yields all zeroes.
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
begin
    if length = "0001" then
        return "00000001";
    elsif length = "0010" then
        return "00000011";
    elsif length = "0100" then
        return "00001111";
    elsif length = "1000" then
        return "11111111";
    end if;
    -- unsupported / non one-hot size: no bytes enabled
    return "00000000";
end function length_to_sel;
|
|
|
|
|
|
|
|
-- Calculate byte enables.
-- Returns 16 bits: the byte selects for 'size' bytes shifted up by the byte
-- offset given in 'address'.  The 16 bits give the select signals for two
-- transfers, to account for unaligned loads or stores.
function xfer_data_sel(size : in std_logic_vector(3 downto 0);
                       address : in std_logic_vector(2 downto 0))
    return std_ulogic_vector is
    variable base_sel : unsigned(15 downto 0);
    variable offset   : natural;
begin
    -- zero-extend the 8-bit select for an aligned access to 16 bits
    base_sel := resize(unsigned(length_to_sel(size)), 16);
    -- then slide it up by the byte offset within the doubleword
    offset := to_integer(unsigned(address));
    return std_ulogic_vector(shift_left(base_sel, offset));
end function xfer_data_sel;
|
|
|
|
|
|
|
|
-- 23-bit right shifter for DP -> SP float conversions.
-- Implemented as two stages: shift(1 downto 0) selects a shift of 0..3 bits,
-- then shift(4 downto 2) selects a further shift of 0, 4, 8, 12, 16 or 20
-- bits (all encodings above "100" shift by 20).  Vacated bits are zero.
function shifter_23r(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
    return std_ulogic_vector is
    constant ZPAD   : std_ulogic_vector(19 downto 0) := (others => '0');
    variable fine   : std_ulogic_vector(22 downto 0);
    variable coarse : std_ulogic_vector(22 downto 0);
begin
    -- stage 1: 0 to 3 bit positions
    case shift(1 downto 0) is
        when "00"   => fine := frac;
        when "01"   => fine := ZPAD(0 downto 0) & frac(22 downto 1);
        when "10"   => fine := ZPAD(1 downto 0) & frac(22 downto 2);
        when others => fine := ZPAD(2 downto 0) & frac(22 downto 3);
    end case;

    -- stage 2: multiples of 4 bit positions
    case shift(4 downto 2) is
        when "000"  => coarse := fine;
        when "001"  => coarse := ZPAD(3 downto 0) & fine(22 downto 4);
        when "010"  => coarse := ZPAD(7 downto 0) & fine(22 downto 8);
        when "011"  => coarse := ZPAD(11 downto 0) & fine(22 downto 12);
        when "100"  => coarse := ZPAD(15 downto 0) & fine(22 downto 16);
        when others => coarse := ZPAD(19 downto 0) & fine(22 downto 20);
    end case;

    return coarse;
end function shifter_23r;
|
|
|
|
|
|
|
|
-- 23-bit left shifter for SP -> DP float conversions.
-- Implemented as two stages: shift(1 downto 0) selects a shift of 0..3 bits,
-- then shift(4 downto 2) selects a further shift of 0, 4, 8, 12, 16 or 20
-- bits (all encodings above "100" shift by 20).  Vacated bits are zero.
function shifter_23l(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
    return std_ulogic_vector is
    constant ZPAD   : std_ulogic_vector(19 downto 0) := (others => '0');
    variable fine   : std_ulogic_vector(22 downto 0);
    variable coarse : std_ulogic_vector(22 downto 0);
begin
    -- stage 1: 0 to 3 bit positions
    case shift(1 downto 0) is
        when "00"   => fine := frac;
        when "01"   => fine := frac(21 downto 0) & ZPAD(0 downto 0);
        when "10"   => fine := frac(20 downto 0) & ZPAD(1 downto 0);
        when others => fine := frac(19 downto 0) & ZPAD(2 downto 0);
    end case;

    -- stage 2: multiples of 4 bit positions
    case shift(4 downto 2) is
        when "000"  => coarse := fine;
        when "001"  => coarse := fine(18 downto 0) & ZPAD(3 downto 0);
        when "010"  => coarse := fine(14 downto 0) & ZPAD(7 downto 0);
        when "011"  => coarse := fine(10 downto 0) & ZPAD(11 downto 0);
        when "100"  => coarse := fine(6 downto 0) & ZPAD(15 downto 0);
        when others => coarse := fine(2 downto 0) & ZPAD(19 downto 0);
    end case;

    return coarse;
end function shifter_23l;
|
|
|
|
|
|
|
|
begin
|
|
|
|
-- Calculate the address in the first cycle: the sum of the two address
-- operands of the incoming request, forced to zero when no request is valid.
lsu_sum <= (others => '0') when l_in.valid /= '1'
           else std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2));
|
|
|
|
|
|
|
|
-- State register.  On each rising clock edge r takes the next-state value
-- rin, except under synchronous reset, where only the state, busy and
-- do_update fields are forced and every other field keeps its value.
loadstore1_0: process(clk)
begin
    if rising_edge(clk) then
        if rst /= '1' then
            r <= rin;
        else
            r.busy <= '0';
            r.do_update <= '0';
            r.state <= IDLE;
        end if;
    end if;
end process loadstore1_0;
|
|
|
|
|
|
|
|
ls_fp_conv: if HAS_FPU generate
|
|
|
|
-- Convert DP data to SP for stfs.
-- The sign bit is always copied.  For a DP biased exponent above 896 the
-- top exponent bit, the low 7 exponent bits and the top 23 fraction bits
-- are copied.  For exponents 874..896 the result is an SP denormal: the
-- fraction with its implicit leading 1 is shifted right.  Anything smaller
-- gives a signed zero.
dp_to_sp: process(all)
    variable dp_exp  : unsigned(10 downto 0);
    variable mant    : std_ulogic_vector(22 downto 0);
    variable rshift  : unsigned(4 downto 0);
    variable sp_word : std_ulogic_vector(31 downto 0);
begin
    dp_exp := unsigned(l_in.data(62 downto 52));

    -- default result: signed zero
    sp_word := (others => '0');
    sp_word(31) := l_in.data(63);

    if dp_exp > 896 then
        sp_word(30) := l_in.data(62);
        sp_word(29 downto 0) := l_in.data(58 downto 29);
    elsif dp_exp >= 874 then
        -- denormalization required
        mant := '1' & l_in.data(51 downto 30);
        -- shift amount is (896 - exponent), computed modulo 32
        rshift := 0 - dp_exp(4 downto 0);
        sp_word(22 downto 0) := shifter_23r(mant, rshift);
    end if;

    store_sp_data <= sp_word;
end process dp_to_sp;
|
|
|
|
|
|
|
|
-- Convert SP data to DP for lfs.
-- Works from the registered SP word (r.ld_sp_data) plus the precomputed
-- fraction-nonzero flag (r.ld_sp_nz) and leading-zero count (r.ld_sp_lz).
sp_to_dp: process(all)
    variable sp_exp     : unsigned(7 downto 0);
    variable dp_exp     : unsigned(10 downto 0);
    variable exp_any    : std_ulogic;   -- some SP exponent bit is set
    variable exp_all    : std_ulogic;   -- every SP exponent bit is set
    variable mant       : std_ulogic_vector(22 downto 0);
    variable norm_shift : unsigned(4 downto 0);
    variable dp_word    : std_ulogic_vector(63 downto 0);
begin
    mant       := r.ld_sp_data(22 downto 0);
    sp_exp     := unsigned(r.ld_sp_data(30 downto 23));
    exp_any    := or (r.ld_sp_data(30 downto 23));
    exp_all    := and (r.ld_sp_data(30 downto 23));
    norm_shift := (others => '0');

    if exp_all = '1' then
        -- infinity or NaN
        dp_exp := (others => '1');
    elsif exp_any = '1' then
        -- finite normalized value: rebias the exponent
        dp_exp := resize(sp_exp, 11) + 896;
    elsif r.ld_sp_nz = '0' then
        -- zero
        dp_exp := (others => '0');
    else
        -- denormalized SP operand, need to normalize
        dp_exp := 896 - resize(unsigned(r.ld_sp_lz), 11);
        norm_shift := unsigned(r.ld_sp_lz(4 downto 0)) + 1;
    end if;

    -- assemble sign, exponent, (shifted) fraction and zero padding
    dp_word := (others => '0');
    dp_word(63) := r.ld_sp_data(31);
    dp_word(62 downto 52) := std_ulogic_vector(dp_exp);
    dp_word(51 downto 29) := shifter_23l(mant, norm_shift);

    load_dp_data <= dp_word;
end process sp_to_dp;
|
|
|
|
end generate;
|
|
|
|
|
|
|
|
loadstore1_1: process(all)
|
|
|
|
variable v : reg_stage_t;
|
|
|
|
variable brev_lenm1 : unsigned(2 downto 0);
|
|
|
|
variable byte_offset : unsigned(2 downto 0);
|
|
|
|
variable j : integer;
|
|
|
|
variable k : unsigned(2 downto 0);
|
|
|
|
variable kk : unsigned(3 downto 0);
|
|
|
|
variable long_sel : std_ulogic_vector(15 downto 0);
|
|
|
|
variable byte_sel : std_ulogic_vector(7 downto 0);
|
|
|
|
variable req : std_ulogic;
|
|
|
|
variable busy : std_ulogic;
|
|
|
|
variable addr : std_ulogic_vector(63 downto 0);
|
|
|
|
variable maddr : std_ulogic_vector(63 downto 0);
|
|
|
|
variable wdata : std_ulogic_vector(63 downto 0);
|
|
|
|
variable write_enable : std_ulogic;
|
|
|
|
variable do_update : std_ulogic;
|
|
|
|
variable done : std_ulogic;
|
|
|
|
variable data_permuted : std_ulogic_vector(63 downto 0);
|
|
|
|
variable data_trimmed : std_ulogic_vector(63 downto 0);
|
|
|
|
variable store_data : std_ulogic_vector(63 downto 0);
|
|
|
|
variable byte_rev : std_ulogic;
|
|
|
|
variable length : std_ulogic_vector(3 downto 0);
|
|
|
|
variable negative : std_ulogic;
|
|
|
|
variable sprn : std_ulogic_vector(9 downto 0);
|
|
|
|
variable exception : std_ulogic;
|
|
|
|
variable next_addr : std_ulogic_vector(63 downto 0);
|
|
|
|
variable mmureq : std_ulogic;
|
|
|
|
variable dsisr : std_ulogic_vector(31 downto 0);
|
-- MMU: Implement radix page table machinery
-- This adds the necessary machinery to the MMU for it to do radix page
-- table walks. The core elements are a shifter that can shift the
-- address right by between 0 and 47 bits, a mask generator that can
-- generate a mask of between 5 and 16 bits, a final mask generator,
-- and new states in the state machine.
-- (The final mask generator is used for transferring bits of the
-- original address into the resulting TLB entry when the leaf PTE
-- corresponds to a page size larger than 4kB.)
-- The hardware does not implement a partition table or a process table.
-- Software is expected to load the appropriate process table entry
-- into a new SPR called PGTBL0, SPR 720. The contents should be
-- formatted as described in Book III section 5.7.6.2 of the Power ISA
-- v3.0B. PGTBL0 is set to 0 on hard reset. At present, the top two bits
-- of the address (the quadrant) are ignored.
-- There is currently no caching of any step in the translation process
-- or of the final result, other than the entry created in the dTLB.
-- That entry is a 4k page entry even if the leaf PTE found in the walk
-- corresponds to a larger page size.
-- This implementation can handle almost any page table layout and any
-- page size. The RTS field (in PGTBL0) can have any value between 0
-- and 31, corresponding to a total address space size between 2^31
-- and 2^62 bytes. The RPDS field of PGTBL0 can be any value between
-- 5 and 16, except that a value of 0 is taken to disable radix page
-- table walking (for use when one is using software loading of TLB
-- entries). The NLS field of the page directory entries can have any
-- value between 5 and 16. The minimum page size is 4kB, meaning that
-- the sum of RPDS and the NLS values of the PDEs found on the path to
-- a leaf PTE must be less than or equal to RTS + 31 - 12.
-- The PGTBL0 SPR is in the mmu module; thus this adds a path for
-- loadstore1 to read and write SPRs in mmu. This adds code in dcache
-- to service doubleword read requests from the MMU, as well as requests
-- to write dTLB entries.
-- Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
-- 4 years ago
|
|
|
variable mmu_mtspr : std_ulogic;
|
-- Add TLB to icache
-- This adds a direct-mapped TLB to the icache, with 64 entries by default.
-- Execute1 now sends a "virt_mode" signal from MSR[IR] to fetch1 along
-- with redirects to indicate whether instruction addresses should be
-- translated through the TLB, and fetch1 sends that on to icache.
-- Similarly a "priv_mode" signal is sent to indicate the privilege
-- mode for instruction fetches. This means that changes to MSR[IR]
-- or MSR[PR] don't take effect until the next redirect, meaning an
-- isync, rfid, branch, etc.
-- The icache uses a hash of the effective address (i.e. next instruction
-- address) to index the TLB. The hash is an XOR of three fields of the
-- address; with a 64-entry TLB, the fields are bits 12--17, 18--23 and
-- 24--29 of the address. TLB invalidations simply invalidate the
-- indexed TLB entry without checking the contents.
-- If the icache detects a TLB miss with virt_mode=1, it will send a
-- fetch_failed indication through fetch2 to decode1, which will turn it
-- into a special OP_FETCH_FAILED opcode with unit=LDST. That will get
-- sent down to loadstore1 which will currently just raise an Instruction
-- Storage Interrupt (0x400) exception.
-- One bit in the PTE obtained from the TLB is used to check whether an
-- instruction access is allowed -- the privilege bit (bit 3). If bit 3
-- is 1 and priv_mode=0, then a fetch_failed indication is sent down to
-- fetch2 and to decode1, which generates an OP_FETCH_FAILED. Any PTEs
-- with PTE bit 0 (EAA[3]) clear or bit 8 (R) clear should not be put
-- into the iTLB since such PTEs would not allow execution by any
-- context.
-- Tlbie operations get sent from mmu to icache over a new connection.
-- Unfortunately the privileged instruction tests are broken for now.
-- Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
-- 4 years ago
|
|
|
variable itlb_fault : std_ulogic;
|
|
|
|
variable misaligned : std_ulogic;
|
|
|
|
begin
|
|
|
|
v := r;
|
|
|
|
req := '0';
|
-- MMU: Implement radix page table machinery
-- This adds the necessary machinery to the MMU for it to do radix page
-- table walks. The core elements are a shifter that can shift the
-- address right by between 0 and 47 bits, a mask generator that can
-- generate a mask of between 5 and 16 bits, a final mask generator,
-- and new states in the state machine.
-- (The final mask generator is used for transferring bits of the
-- original address into the resulting TLB entry when the leaf PTE
-- corresponds to a page size larger than 4kB.)
-- The hardware does not implement a partition table or a process table.
-- Software is expected to load the appropriate process table entry
-- into a new SPR called PGTBL0, SPR 720. The contents should be
-- formatted as described in Book III section 5.7.6.2 of the Power ISA
-- v3.0B. PGTBL0 is set to 0 on hard reset. At present, the top two bits
-- of the address (the quadrant) are ignored.
-- There is currently no caching of any step in the translation process
-- or of the final result, other than the entry created in the dTLB.
-- That entry is a 4k page entry even if the leaf PTE found in the walk
-- corresponds to a larger page size.
-- This implementation can handle almost any page table layout and any
-- page size. The RTS field (in PGTBL0) can have any value between 0
-- and 31, corresponding to a total address space size between 2^31
-- and 2^62 bytes. The RPDS field of PGTBL0 can be any value between
-- 5 and 16, except that a value of 0 is taken to disable radix page
-- table walking (for use when one is using software loading of TLB
-- entries). The NLS field of the page directory entries can have any
-- value between 5 and 16. The minimum page size is 4kB, meaning that
-- the sum of RPDS and the NLS values of the PDEs found on the path to
-- a leaf PTE must be less than or equal to RTS + 31 - 12.
-- The PGTBL0 SPR is in the mmu module; thus this adds a path for
-- loadstore1 to read and write SPRs in mmu. This adds code in dcache
-- to service doubleword read requests from the MMU, as well as requests
-- to write dTLB entries.
-- Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
-- 4 years ago
|
|
|
mmu_mtspr := '0';
|
-- Add TLB to icache
-- This adds a direct-mapped TLB to the icache, with 64 entries by default.
-- Execute1 now sends a "virt_mode" signal from MSR[IR] to fetch1 along
-- with redirects to indicate whether instruction addresses should be
-- translated through the TLB, and fetch1 sends that on to icache.
-- Similarly a "priv_mode" signal is sent to indicate the privilege
-- mode for instruction fetches. This means that changes to MSR[IR]
-- or MSR[PR] don't take effect until the next redirect, meaning an
-- isync, rfid, branch, etc.
-- The icache uses a hash of the effective address (i.e. next instruction
-- address) to index the TLB. The hash is an XOR of three fields of the
-- address; with a 64-entry TLB, the fields are bits 12--17, 18--23 and
-- 24--29 of the address. TLB invalidations simply invalidate the
-- indexed TLB entry without checking the contents.
-- If the icache detects a TLB miss with virt_mode=1, it will send a
-- fetch_failed indication through fetch2 to decode1, which will turn it
-- into a special OP_FETCH_FAILED opcode with unit=LDST. That will get
-- sent down to loadstore1 which will currently just raise an Instruction
-- Storage Interrupt (0x400) exception.
-- One bit in the PTE obtained from the TLB is used to check whether an
-- instruction access is allowed -- the privilege bit (bit 3). If bit 3
-- is 1 and priv_mode=0, then a fetch_failed indication is sent down to
-- fetch2 and to decode1, which generates an OP_FETCH_FAILED. Any PTEs
-- with PTE bit 0 (EAA[3]) clear or bit 8 (R) clear should not be put
-- into the iTLB since such PTEs would not allow execution by any
-- context.
-- Tlbie operations get sent from mmu to icache over a new connection.
-- Unfortunately the privileged instruction tests are broken for now.
-- Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
-- 4 years ago
|
|
|
itlb_fault := '0';
|
|
|
|
sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10));
|
|
|
|
dsisr := (others => '0');
|
|
|
|
mmureq := '0';
|
|
|
|
v.wr_sel := "11";
|
|
|
|
|
|
|
|
write_enable := '0';
|
|
|
|
|
|
|
|
do_update := r.do_update;
|
|
|
|
v.do_update := '0';
|
|
|
|
|
|
|
|
-- load data formatting
|
|
|
|
-- shift and byte-reverse data bytes
|
|
|
|
for i in 0 to 7 loop
|
|
|
|
j := to_integer(r.byte_index(i)) * 8;
|
|
|
|
data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j);
|
|
|
|
end loop;
|
|
|
|
|
|
|
|
-- Work out the sign bit for sign extension.
|
|
|
|
-- For unaligned loads crossing two dwords, the sign bit is in the
|
|
|
|
-- first dword for big-endian (byte_reverse = 1), or the second dword
|
|
|
|
-- for little-endian.
|
|
|
|
if r.dwords_done = '1' and r.byte_reverse = '1' then
|
|
|
|
negative := (r.length(3) and r.load_data(63)) or
|
|
|
|
(r.length(2) and r.load_data(31)) or
|
|
|
|
(r.length(1) and r.load_data(15)) or
|
|
|
|
(r.length(0) and r.load_data(7));
|
|
|
|
else
|
|
|
|
negative := (r.length(3) and data_permuted(63)) or
|
|
|
|
(r.length(2) and data_permuted(31)) or
|
|
|
|
(r.length(1) and data_permuted(15)) or
|
|
|
|
(r.length(0) and data_permuted(7));
|
|
|
|
end if;
|
|
|
|
|
|
|
|
-- trim and sign-extend
|
|
|
|
for i in 0 to 7 loop
|
|
|
|
case r.trim_ctl(i) is
|
|
|
|
when "11" =>
|
|
|
|
data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8);
|
|
|
|
when "10" =>
|
|
|
|
data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8);
|
|
|
|
when "01" =>
|
|
|
|
data_trimmed(i * 8 + 7 downto i * 8) := (others => negative);
|
|
|
|
when others =>
|
|
|
|
data_trimmed(i * 8 + 7 downto i * 8) := x"00";
|
|
|
|
end case;
|
|
|
|
end loop;
|
|
|
|
|
|
|
|
if HAS_FPU then
|
|
|
|
-- Single-precision FP conversion for loads
|
|
|
|
v.ld_sp_data := data_trimmed(31 downto 0);
|
|
|
|
v.ld_sp_nz := or (data_trimmed(22 downto 0));
|
|
|
|
v.ld_sp_lz := count_left_zeroes(data_trimmed(22 downto 0));
|
|
|
|
end if;
|
|
|
|
|
|
|
|
-- Byte reversing and rotating for stores.
|
|
|
|
-- Done in the second cycle (the cycle after l_in.valid = 1).
|
|
|
|
for i in 0 to 7 loop
|
|
|
|
k := (to_unsigned(i, 3) - r.byte_offset) xor r.brev_mask;
|
|
|
|
j := to_integer(k) * 8;
|
|
|
|
store_data(i * 8 + 7 downto i * 8) := r.store_data(j + 7 downto j);
|
|
|
|
end loop;
|
|
|
|
|
|
|
|
-- compute (addr + 8) & ~7 for the second doubleword when unaligned
|
|
|
|
next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000";
|
|
|
|
|
|
|
|
-- Busy calculation.
|
|
|
|
-- We need to minimize the delay from clock to busy valid because it
|
|
|
|
-- gates the start of execution of the next instruction.
|
|
|
|
busy := r.busy and not ((r.wait_dcache and d_in.valid) or (r.wait_mmu and m_in.done));
|
|
|
|
v.busy := busy;
|
|
|
|
|
|
|
|
done := '0';
|
|
|
|
if r.state /= IDLE and busy = '0' then
|
|
|
|
done := '1';
|
|
|
|
end if;
|
|
|
|
exception := '0';
|
|
|
|
|
|
|
|
if r.dwords_done = '1' or r.state = SECOND_REQ then
|
|
|
|
addr := next_addr;
|
|
|
|
byte_sel := r.second_bytes;
|
|
|
|
else
|
|
|
|
addr := r.addr;
|
|
|
|
byte_sel := r.first_bytes;
|
|
|
|
end if;
|
|
|
|
if r.mode_32bit = '1' then
|
|
|
|
addr(63 downto 32) := (others => '0');
|
|
|
|
end if;
|
|
|
|
maddr := addr;
|
|
|
|
|
|
|
|
case r.state is
|
|
|
|
when IDLE =>
|
|
|
|
|
|
|
|
when SECOND_REQ =>
|
|
|
|
req := '1';
|
|
|
|
v.state := ACK_WAIT;
|
|
|
|
v.last_dword := '0';
|
|
|
|
|
|
|
|
when ACK_WAIT =>
|
|
|
|
-- r.wr_sel gets set one cycle after we come into ACK_WAIT state,
|
|
|
|
-- which is OK because the dcache always takes at least two cycles.
|
|
|
|
if r.update = '1' and (r.load = '0' or (HAS_FPU and r.load_sp = '1')) then
|
|
|
|
v.wr_sel := "01";
|
|
|
|
end if;
|
|
|
|
if d_in.error = '1' then
|
|
|
|
-- dcache will discard the second request if it
|
|
|
|
-- gets an error on the 1st of two requests
|
|
|
|
if d_in.cache_paradox = '1' then
|
|
|
|
-- signal an interrupt straight away
|
|
|
|
exception := '1';
|
|
|
|
dsisr(63 - 38) := not r.load;
|
|
|
|
-- XXX there is no architected bit for this
|
|
|
|
dsisr(63 - 35) := d_in.cache_paradox;
|
|
|
|
else
|
|
|
|
-- Look up the translation for TLB miss
|
|
|
|
-- and also for permission error and RC error
|
|
|
|
-- in case the PTE has been updated.
|
|
|
|
mmureq := '1';
|
|
|
|
v.state := MMU_LOOKUP;
|
|
|
|
end if;
|
|
|
|
end if;
|
|
|
|
if d_in.valid = '1' then
|
|
|
|
if r.last_dword = '0' then
|
|
|
|
v.dwords_done := '1';
|
|
|
|
v.last_dword := '1';
|
|
|
|
if r.load = '1' then
|
|
|
|
v.load_data := data_permuted;
|
|
|
|
end if;
|
|
|
|
else
|
|
|
|
write_enable := r.load and not r.load_sp;
|
|
|
|
if HAS_FPU and r.load_sp = '1' then
|
|
|
|
-- SP to DP conversion takes a cycle
|
|
|
|
-- Write back rA update in this cycle if needed
|
|
|
|
do_update := r.update;
|
|
|
|
v.wr_sel := "10";
|
|
|
|
v.state := FINISH_LFS;
|
|
|
|
elsif r.extra_cycle = '1' then
|
|
|
|
-- loads with rA update need an extra cycle
|
|
|
|
v.wr_sel := "01";
|
|
|
|
v.state := COMPLETE;
|
|
|
|
v.do_update := r.update;
|
|
|
|
else
|
|
|
|
-- stores write back rA update in this cycle
|
|
|
|
do_update := r.update;
|
|
|
|
end if;
|
|
|
|
v.busy := '0';
|
|
|
|
end if;
|
|
|
|
end if;
|
|
|
|
-- r.wait_dcache gets set one cycle after we come into ACK_WAIT state,
|
|
|
|
-- which is OK because the dcache always takes at least two cycles.
|
|
|
|
v.wait_dcache := r.last_dword and not r.extra_cycle;
|
|
|
|
|
|
|
|
when MMU_LOOKUP =>
|
|
|
|
if m_in.done = '1' then
|
|
|
|
if r.instr_fault = '0' then
|
|
|
|
-- retry the request now that the MMU has installed a TLB entry
|
|
|
|
req := '1';
|
|
|
|
if r.last_dword = '0' then
|
|
|
|
v.state := SECOND_REQ;
|
|
|
|
else
|
|
|
|
v.state := ACK_WAIT;
|
|
|
|
end if;
|
|
|
|
end if;
|
|
|
|
end if;
|
|
|
|
if m_in.err = '1' then
|
|
|
|
exception := '1';
|
|
|
|
dsisr(63 - 33) := m_in.invalid;
|
|
|
|
dsisr(63 - 36) := m_in.perm_error;
|
|
|
|
dsisr(63 - 38) := not r.load;
|
|
|
|
dsisr(63 - 44) := m_in.badtree;
|
|
|
|
dsisr(63 - 45) := m_in.rc_error;
|
|
|
|
end if;
|
|
|
|
|
|
|
|
when TLBIE_WAIT =>
|
|
|
|
|
|
|
|
when FINISH_LFS =>
|
|
|
|
|
|
|
|
when COMPLETE =>
|
|
|
|
exception := r.align_intr;
|
|
|
|
|
|
|
|
end case;
|
|
|
|
|
|
|
|
if done = '1' or exception = '1' then
|
|
|
|
v.state := IDLE;
|
|
|
|
v.busy := '0';
|
|
|
|
end if;
|
|
|
|
|
|
|
|
-- Note that l_in.valid is gated with busy inside execute1
|
|
|
|
if l_in.valid = '1' then
|
|
|
|
v.mode_32bit := l_in.mode_32bit;
|
|
|
|
v.load := '0';
|
|
|
|
v.dcbz := '0';
|
|
|
|
v.tlbie := '0';
|
|
|
|
v.instr_fault := '0';
|
|
|
|
v.align_intr := '0';
|
|
|
|
v.dwords_done := '0';
|
|
|
|
v.last_dword := '1';
|
|
|
|
v.write_reg := l_in.write_reg;
|
|
|
|
v.length := l_in.length;
|
|
|
|
v.byte_reverse := l_in.byte_reverse;
|
|
|
|
v.sign_extend := l_in.sign_extend;
|
|
|
|
v.update := l_in.update;
|
|
|
|
v.update_reg := l_in.update_reg;
|
|
|
|
v.xerc := l_in.xerc;
|
|
|
|
v.reserve := l_in.reserve;
|
|
|
|
v.rc := l_in.rc;
|
|
|
|
v.nc := l_in.ci;
|
|
|
|
v.virt_mode := l_in.virt_mode;
|
|
|
|
v.priv_mode := l_in.priv_mode;
|
|
|
|
v.load_sp := '0';
|
|
|
|
v.wait_dcache := '0';
|
|
|
|
v.wait_mmu := '0';
|
|
|
|
v.do_update := '0';
|
|
|
|
v.extra_cycle := '0';
|
|
|
|
|
|
|
|
if HAS_FPU and l_in.is_32bit = '1' then
|
|
|
|
v.store_data := x"00000000" & store_sp_data;
|
|
|
|
else
|
|
|
|
v.store_data := l_in.data;
|
|
|
|
end if;
|
|
|
|
|
|
|
|
addr := lsu_sum;
|
core: Implement quadword loads and stores
This implements the lq, stq, lqarx and stqcx. instructions.
These instructions all access two consecutive GPRs; for example the
"lq %r6,0(%r3)" instruction will load the doubleword at the address
in R3 into R7 and the doubleword at address R3 + 8 into R6. To cope
with having two GPR sources or destinations, the instruction gets
repeated at the decode2 stage, that is, for each lq/stq/lqarx/stqcx.
coming in from decode1, two instructions get sent out to execute1.
For these instructions, the RS or RT register gets modified on one
of the iterations by setting the LSB of the register number. In LE
mode, the first iteration uses RS|1 or RT|1 and the second iteration
uses RS or RT. In BE mode, this is done the other way around. In
order for decode2 to know what endianness is currently in use, we
pass the big_endian flag down from icache through decode1 to decode2.
This is always in sync with what execute1 is using because only rfid
or an interrupt can change MSR[LE], and those operations all cause
a flush and redirect.
There is now an extra column in the decode tables in decode1 to
indicate whether the instruction needs to be repeated. Decode1 also
enforces the rule that lq with RA = RT and lqarx with RA = RT or
RB = RT are illegal.
Decode2 now passes a 'repeat' flag and a 'second' flag to execute1,
and execute1 passes them on to loadstore1. The 'repeat' flag is set
for both iterations of a repeated instruction, and 'second' is set
on the second iteration. Execute1 does not take asynchronous or
trace interrupts on the second iteration of a repeated instruction.
Loadstore1 uses 'next_addr' for the second iteration of a repeated
load/store so that we access the second doubleword of the memory
operand. Thus loadstore1 accesses the doublewords in increasing
memory order. For 16-byte loads this means that the first iteration
writes GPR RT|1. It is possible that RA = RT|1 (this is a legal
but non-preferred form), meaning that if the memory operand was
misaligned, the first iteration would overwrite RA but then the
second iteration might take a page fault, leading to corrupted state.
To avoid that possibility, 16-byte loads in LE mode take an
alignment interrupt if the operand is not 16-byte aligned. (This
is the case anyway for lqarx, and we enforce it for lq as well.)
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
|
|
|
if l_in.second = '1' then
|
|
|
|
-- for the second half of a 16-byte transfer, use next_addr
|
|
|
|
addr := next_addr;
|
|
|
|
end if;
|
|
|
|
if l_in.mode_32bit = '1' then
|
|
|
|
addr(63 downto 32) := (others => '0');
|
|
|
|
end if;
|
core: Implement quadword loads and stores
This implements the lq, stq, lqarx and stqcx. instructions.
These instructions all access two consecutive GPRs; for example the
"lq %r6,0(%r3)" instruction will load the doubleword at the address
in R3 into R7 and the doubleword at address R3 + 8 into R6. To cope
with having two GPR sources or destinations, the instruction gets
repeated at the decode2 stage, that is, for each lq/stq/lqarx/stqcx.
coming in from decode1, two instructions get sent out to execute1.
For these instructions, the RS or RT register gets modified on one
of the iterations by setting the LSB of the register number. In LE
mode, the first iteration uses RS|1 or RT|1 and the second iteration
uses RS or RT. In BE mode, this is done the other way around. In
order for decode2 to know what endianness is currently in use, we
pass the big_endian flag down from icache through decode1 to decode2.
This is always in sync with what execute1 is using because only rfid
or an interrupt can change MSR[LE], and those operations all cause
a flush and redirect.
There is now an extra column in the decode tables in decode1 to
indicate whether the instruction needs to be repeated. Decode1 also
enforces the rule that lq with RA = RT and lqarx with RA = RT or
RB = RT are illegal.
Decode2 now passes a 'repeat' flag and a 'second' flag to execute1,
and execute1 passes them on to loadstore1. The 'repeat' flag is set
for both iterations of a repeated instruction, and 'second' is set
on the second iteration. Execute1 does not take asynchronous or
trace interrupts on the second iteration of a repeated instruction.
Loadstore1 uses 'next_addr' for the second iteration of a repeated
load/store so that we access the second doubleword of the memory
operand. Thus loadstore1 accesses the doublewords in increasing
memory order. For 16-byte loads this means that the first iteration
writes GPR RT|1. It is possible that RA = RT|1 (this is a legal
but non-preferred form), meaning that if the memory operand was
misaligned, the first iteration would overwrite RA but then the
second iteration might take a page fault, leading to corrupted state.
To avoid that possibility, 16-byte loads in LE mode take an
alignment interrupt if the operand is not 16-byte aligned. (This
is the case anyway for lqarx, and we enforce it for lq as well.)
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
|
|
|
v.addr := addr;
|
|
|
|
maddr := l_in.addr2; -- address from RB for tlbie
|
|
|
|
|
|
|
|
-- XXX Temporary hack. Mark the op as non-cachable if the address
|
|
|
|
-- is the form 0xc------- for a real-mode access.
|
core: Implement quadword loads and stores
This implements the lq, stq, lqarx and stqcx. instructions.
These instructions all access two consecutive GPRs; for example the
"lq %r6,0(%r3)" instruction will load the doubleword at the address
in R3 into R7 and the doubleword at address R3 + 8 into R6. To cope
with having two GPR sources or destinations, the instruction gets
repeated at the decode2 stage, that is, for each lq/stq/lqarx/stqcx.
coming in from decode1, two instructions get sent out to execute1.
For these instructions, the RS or RT register gets modified on one
of the iterations by setting the LSB of the register number. In LE
mode, the first iteration uses RS|1 or RT|1 and the second iteration
uses RS or RT. In BE mode, this is done the other way around. In
order for decode2 to know what endianness is currently in use, we
pass the big_endian flag down from icache through decode1 to decode2.
This is always in sync with what execute1 is using because only rfid
or an interrupt can change MSR[LE], and those operations all cause
a flush and redirect.
There is now an extra column in the decode tables in decode1 to
indicate whether the instruction needs to be repeated. Decode1 also
enforces the rule that lq with RA = RT and lqarx with RA = RT or
RB = RT are illegal.
Decode2 now passes a 'repeat' flag and a 'second' flag to execute1,
and execute1 passes them on to loadstore1. The 'repeat' flag is set
for both iterations of a repeated instruction, and 'second' is set
on the second iteration. Execute1 does not take asynchronous or
trace interrupts on the second iteration of a repeated instruction.
Loadstore1 uses 'next_addr' for the second iteration of a repeated
load/store so that we access the second doubleword of the memory
operand. Thus loadstore1 accesses the doublewords in increasing
memory order. For 16-byte loads this means that the first iteration
writes GPR RT|1. It is possible that RA = RT|1 (this is a legal
but non-preferred form), meaning that if the memory operand was
misaligned, the first iteration would overwrite RA but then the
second iteration might take a page fault, leading to corrupted state.
To avoid that possibility, 16-byte loads in LE mode take an
alignment interrupt if the operand is not 16-byte aligned. (This
is the case anyway for lqarx, and we enforce it for lq as well.)
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
|
|
|
if addr(31 downto 28) = "1100" and l_in.virt_mode = '0' then
|
|