@ -34,6 +34,7 @@ architecture behave of mmu is
PROC_TBL_READ,
PROC_TBL_WAIT,
SEGMENT_CHECK,
TLBWAIT,
RADIX_LOOKUP,
RADIX_READ_WAIT,
RADIX_LOAD_TLB,
@ -50,7 +51,7 @@ architecture behave of mmu is
inval_all : std_ulogic;
-- config SPRs
ptcr : std_ulogic_vector(63 downto 0);
pid : std_ulogic_vector(31 downto 0);
pid : std_ulogic_vector(11 downto 0);
-- internal state
state : state_t;
done : std_ulogic;
@ -70,6 +71,9 @@ architecture behave of mmu is
segerror : std_ulogic;
perm_err : std_ulogic;
rc_error : std_ulogic;
wr_tlbram : std_ulogic;
tlbie_req : std_ulogic;
is_mtspr : std_ulogic;
end record;
signal r, rin : reg_stage_t;
@ -78,10 +82,406 @@ architecture behave of mmu is
signal mask : std_ulogic_vector(15 downto 0);
signal finalmask : std_ulogic_vector(43 downto 0);
-- Small page (4k) TLB, 256 entries, 4-way set associative.
-- This is implemented using a 512 x 64 bit RAM, divided
-- into 64 blocks of 8 words, each block containing a set of
-- 4 entries.
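-- In each block, word 0 contains one 16-bit field for each of
-- the 4 entries: a valid bit (bit 15), one bit that is currently
-- always written as 0 (bit 14), 2 bits of address tag (bits 13:12)
-- and the 12-bit PID (bits 11:0).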
-- (This allows us to do invalidate-all or invalidate-by-PID
-- in 64 cycles instead of 256.)
-- Word 1 contains 32 bits of address tag for entries 0 and 1,
-- and word 2 contains the same for entries 2 and 3.
-- Words 4 to 7 contain the PTE value for entries 0 to 3.
-- Word 3 is currently unused.
-- EAs are expected to be in a 4PB (52-bit) space per PID
-- (ignoring the quadrant bits); anything outside that
-- doesn't get cached.
-- Geometry of the TLB RAM.
-- TLB_WIDTH is the width in bits of one RAM word.
constant TLB_WIDTH : natural := 64;
-- TLB_DEPTH is the number of TLB entries; the RAM itself holds
-- 2 * TLB_DEPTH words (see tlb_t below).
constant TLB_DEPTH : natural := 256;
-- Number of bits in the set index (hash) that selects a block of 8 words.
constant TLB_HASH_BITS : natural := 6;
-- A RAM address is the set index followed by a 3-bit word-within-block select.
constant TLB_ADDR_BITS : natural := TLB_HASH_BITS + 3;
subtype tlb_word_t is std_ulogic_vector(TLB_WIDTH - 1 downto 0);
type tlb_t is array(0 to 2 * TLB_DEPTH - 1) of tlb_word_t;
-- The TLB storage array.
signal tlb : tlb_t;
subtype tlb_index_t is integer range 0 to 2**TLB_HASH_BITS - 1;
-- RAM port signals (see the mmu_tlb_ram process):
-- tlb_doread loads tlb_rddata from tlb(tlb_rdaddr) on the next clock edge;
-- tlb_rdren copies tlb_rddata into tlb_rdreg on the next clock edge.
signal tlb_doread : std_ulogic;
signal tlb_rdren : std_ulogic;
signal tlb_rdaddr : std_ulogic_vector(TLB_ADDR_BITS - 1 downto 0);
signal tlb_rddata : std_ulogic_vector(TLB_WIDTH - 1 downto 0);
signal tlb_rdreg : std_ulogic_vector(TLB_WIDTH - 1 downto 0);
-- tlb_wren has one write-enable bit per 16-bit lane of the addressed word.
signal tlb_wren : std_ulogic_vector(3 downto 0);
signal tlb_wraddr : std_ulogic_vector(TLB_ADDR_BITS - 1 downto 0);
signal tlb_wrdata : std_ulogic_vector(TLB_WIDTH - 1 downto 0);
-- States of the TLB search/update/invalidate state machine (mmu_tlb_1):
-- SEARCH1..SEARCH4 step through words 0..2 of a block looking for a match,
-- RDPTE fetches the PTE word of the matching way,
-- WAITW waits for the main MMU state machine, WRPTE1/WRPTE2 write a new entry,
-- INVAL1/INVAL2 walk over all sets for invalidate-by-PID / invalidate-all.
type tlb_state_t is (IDLE,
SEARCH1, SEARCH2, SEARCH3, SEARCH4,
RDPTE,
WAITW, WRPTE1, WRPTE2,
INVAL1, INVAL2);
-- Registered state of the TLB state machine.
type mmu_tlb_reg_t is record
state : tlb_state_t;
-- bits 51 downto 12 of the address being looked up or invalidated
addr : std_ulogic_vector(39 downto 0);
-- set when the address has bits outside the cacheable 52-bit space
bad_ea : std_ulogic;
pid : std_ulogic_vector(11 downto 0);
-- set index used for RAM reads and for PLRU accesses
hash_4k : std_ulogic_vector(TLB_HASH_BITS - 1 downto 0);
is_tlbie : std_ulogic;
-- per-way flags for ways that still match after the comparisons done so far
may_hit : std_ulogic_vector(3 downto 0);
hit : std_ulogic;
miss : std_ulogic;
hit_way : std_ulogic_vector(1 downto 0);
-- way that will be overwritten if a new entry is written
repl_way : std_ulogic_vector(1 downto 0);
-- when set, tlb_plru_ram is updated on the next clock edge
update_plru : std_ulogic;
tlbie_done : std_ulogic;
inval_all : std_ulogic;
-- set index used for RAM writes
wr_hash : std_ulogic_vector(TLB_HASH_BITS - 1 downto 0);
end record;
-- Reset value for the TLB state machine registers.
constant mmu_tlb_reg_init : mmu_tlb_reg_t := (
state => IDLE, addr => 40x"0", pid => 12x"0",
hash_4k => (others => '0'), wr_hash => (others => '0'),
may_hit => "0000", hit_way => "00", repl_way => "00",
others => '0');
-- tr is the current registered state, trin the next state computed by mmu_tlb_1.
signal tr, trin : mmu_tlb_reg_t;
-- TLB PLRU array
-- One 3-bit PLRU state per set, indexed by the set hash.
type tlb_plru_array is array(tlb_index_t) of std_ulogic_vector(2 downto 0);
signal tlb_plru_ram : tlb_plru_array;
-- PLRU state read for set tr.hash_4k, its updated value, and the victim way.
signal tlb_plru_cur : std_ulogic_vector(2 downto 0);
signal tlb_plru_upd : std_ulogic_vector(2 downto 0);
signal tlb_plru_victim : std_ulogic_vector(1 downto 0);
-- Compute the TLB set index for a 4k page.
-- The index is the XOR of three 6-bit slices of the effective address
-- (bits 17:12, 23:18 and 51:46) with the low 6 bits of the PID.
function addr_hash_4k(ea: std_ulogic_vector(63 downto 0);
                      pid: std_ulogic_vector(11 downto 0)) return std_ulogic_vector is
    variable folded : std_ulogic_vector(TLB_HASH_BITS - 1 downto 0);
begin
    -- Make this a bit different to the hashes used in the dcache and icache
    folded := ea(17 downto 12);
    folded := folded xor ea(23 downto 18);
    folded := folded xor ea(51 downto 46);
    folded := folded xor pid(5 downto 0);
    return folded;
end;
-- Return the 2-bit index of the lowest-numbered '0' bit among x(0), x(1)
-- and x(2).  If none of those three bits is '0' the result is "11";
-- x(3) itself is never examined.
function find_first_zero(x: std_ulogic_vector(3 downto 0)) return std_ulogic_vector is
    variable way : std_ulogic_vector(1 downto 0);
begin
    if x(0) = '0' then
        way := "00";
    elsif x(1) = '0' then
        way := "01";
    elsif x(2) = '0' then
        way := "10";
    else
        -- fall-through case: ways 0 to 2 are all non-zero
        way := "11";
    end if;
    return way;
end;
-- Decide whether an access is permitted by a PTE.
--   pte   : the 64-bit PTE value
--   priv  : '1' for a privileged access
--   iside : '1' for an instruction fetch, '0' for a data access
--   store : '1' for a store (only meaningful for data accesses)
-- Returns '1' when the access is allowed, '0' otherwise.
function check_perm(pte: std_ulogic_vector(63 downto 0); priv: std_ulogic;
                    iside: std_ulogic; store: std_ulogic) return std_ulogic is
begin
    -- PTE bit 3 restricts the page to privileged accesses
    if priv /= '1' and pte(3) /= '0' then
        return '0';
    end if;
    if iside = '0' then
        -- data access: bit 1 allows everything, bit 2 allows loads only
        return pte(1) or (pte(2) and not store);
    end if;
    -- no IAMR, so no KUEP support for now
    -- deny execute permission if cache inhibited
    return pte(0) and not pte(5);
end;
begin
-- Synchronous reads and writes to TLB array
-- Reads are two-stage: tlb_doread loads tlb_rddata from the array, and
-- tlb_rdren copies tlb_rddata into tlb_rdreg.  When both are asserted in
-- the same cycle, tlb_rdreg receives the previously read word while
-- tlb_rddata receives the newly addressed one.
-- Writes are enabled per 16-bit lane by the four bits of tlb_wren.
mmu_tlb_ram: process(clk)
begin
if rising_edge(clk) then
-- second read stage: hold the last word read
if tlb_rdren = '1' then
tlb_rdreg <= tlb_rddata;
end if;
-- first read stage: fetch the addressed word
if tlb_doread = '1' then
tlb_rddata <= tlb(to_integer(unsigned(tlb_rdaddr)));
end if;
if tlb_wren /= "0000" then
-- lane i covers bits i*16+15 downto i*16 of the addressed word
for i in 0 to 3 loop
if tlb_wren(i) = '1' then
tlb(to_integer(unsigned(tlb_wraddr)))(i*16 + 15 downto i*16) <=
tlb_wrdata(i*16 + 15 downto i*16);
end if;
end loop;
end if;
end if;
end process;
-- TLB PLRU
-- Instance of work.plrufn for a 4-way set (BITS => 2).
-- NOTE(review): plrufn is defined outside this view; from the port names it
-- presumably derives tree_out from tree_in for an access to way acc, and
-- reports the least-recently-used way on lru -- confirm against plrufn.
-- In this file tlb_plru_upd is written back to tlb_plru_ram and
-- tlb_plru_victim is used as the replacement way when all ways are valid.
tlb_plru : entity work.plrufn
generic map (
BITS => 2
)
port map (
acc => tr.hit_way,
tree_in => tlb_plru_cur,
tree_out => tlb_plru_upd,
lru => tlb_plru_victim
);
-- Read the PLRU state of the set selected by tr.hash_4k.
-- If the index contains metavalues, drive X's instead of indexing the array.
tlb_plru_cur <= (others => 'X') when is_X(tr.hash_4k) else
                tlb_plru_ram(to_integer(unsigned(tr.hash_4k)));
-- Write the updated PLRU state back to the set selected by tr.hash_4k
-- whenever the TLB state machine requested a PLRU update.
process(clk)
    variable set_idx : tlb_index_t;
begin
    if rising_edge(clk) then
        if tr.update_plru = '1' then
            -- the set index must be fully defined before it is used
            assert not is_X(tr.hash_4k) severity failure;
            set_idx := to_integer(unsigned(tr.hash_4k));
            tlb_plru_ram(set_idx) <= tlb_plru_upd;
        end if;
    end if;
end process;
-- State machine for doing TLB searches, updates and invalidations
-- This process is the state register: on each rising clock edge it loads
-- the next state trin, or the reset value while rst is asserted.
mmu_tlb_0: process(clk)
begin
    if rising_edge(clk) then
        tr <= trin;
        -- synchronous reset overrides the normal update
        if rst = '1' then
            tr <= mmu_tlb_reg_init;
        end if;
    end if;
end process;
-- Combinational logic of the TLB state machine.
-- Starting from the registered state tr, this process computes the next
-- state in the variable tv (assigned to trin at the end) and drives the
-- TLB RAM controls (tlb_doread, tlb_rdren, tlb_wren, tlb_wrdata and the
-- read/write addresses) for the current cycle.
mmu_tlb_1: process(all)
-- next value of tr
variable tv : mmu_tlb_reg_t;
-- tlbie scope field, taken from l_in.addr(11 downto 10)
variable isf : std_ulogic_vector(1 downto 0);
-- set in SEARCH3 when way 0 or way 1 still matches
variable is_hit : std_ulogic;
-- per-way flags derived from word 0 (valid bits, or PID matches in INVAL2)
variable valids : std_ulogic_vector(3 downto 0);
-- word-within-block select; forms the low 3 bits of both RAM addresses
variable idx : std_ulogic_vector(2 downto 0);
-- 16-bit word-0 field written for a new entry
variable wdat : std_ulogic_vector(15 downto 0);
begin
-- defaults: hold state, no RAM read or write, write data all zeroes,
-- address word 0 of the block
tv := tr;
tlb_doread <= '0';
tlb_rdren <= '0';
tlb_wren <= "0000";
tlb_wrdata <= (others => '0');
is_hit := '0';
idx := "000";
tv.update_plru := '0';
case tr.state is
when IDLE =>
-- capture the request fields every cycle; they are only acted on
-- when l_in.valid is set
tv.addr := l_in.addr(51 downto 12);
tv.pid := (others => '0');
if l_in.tlbie = '1' then
-- PID for tlbie comes from RS
tv.pid := l_in.rs(43 downto 32);
elsif l_in.addr(63) = '0' then
-- we currently only implement quadrants 0 and 3
tv.pid := r.pid;
end if;
-- bad_ea: any of address bits 61:52 set, or bits 63 and 62 differ;
-- never set for tlbie
tv.bad_ea := (or (l_in.addr(61 downto 52)) or (l_in.addr(63) xor l_in.addr(62)))
and not l_in.tlbie;
tv.hash_4k := addr_hash_4k(l_in.addr, tv.pid);
tv.wr_hash := tv.hash_4k;
tv.is_tlbie := l_in.tlbie;
if l_in.valid = '1' then
-- new request: clear the result flags of the previous one
tv.hit := '0';
tv.miss := '0';
tv.tlbie_done := '0';
tv.inval_all := '0';
if l_in.tlbie = '1' then
-- decode what type of tlbie this is
isf := l_in.addr(11 downto 10);
if l_in.slbia = '1' or l_in.ric(0) = '1' then
-- no effect on this TLB (flushes L1 TLBs below)
tv.tlbie_done := '1';
elsif isf(1) = '1' then
-- invalidate all
-- INVAL2 will write zeroes to word 0 of every set, starting at set 0
tv.inval_all := '1';
tv.wr_hash := (others => '0');
tv.state := INVAL2;
elsif isf(0) = '1' then
-- invalidate PID
-- start reading word 0 of set 0
tv.hash_4k := (others => '0');
tlb_doread <= '1';
tv.state := INVAL1;
else
-- invalidate single page
-- start reading word 0 of the hashed set
tlb_doread <= '1';
tv.state := SEARCH1;
end if;
else
-- ordinary lookup: start reading word 0 of the hashed set
tlb_doread <= '1';
tv.state := SEARCH1;
end if;
end if;
when SEARCH1 =>
-- next read word 1 of group
-- (tlb_rdren moves word 0, read in the previous cycle, into tlb_rdreg)
idx := "001";
tlb_doread <= '1';
tlb_rdren <= '1';
if tr.bad_ea = '0' then
tv.state := SEARCH2;
else
-- address outside the cacheable range: report a miss immediately
tv.miss := '1';
tv.tlbie_done := tr.is_tlbie;
tv.state := IDLE;
end if;
when SEARCH2 =>
-- tlb_rdreg contains word 0, check for hits/misses
-- a way may hit if it is valid and its PID and 2-bit tag field
-- match tr.pid and tr.addr(7 downto 6)
valids := "0000";
tv.may_hit := "0000";
for i in 0 to 3 loop
valids(i) := tlb_rdreg(i*16 + 15);
if tlb_rdreg(i*16 + 15) = '1' and
tlb_rdreg(i*16 + 11 downto i*16) = tr.pid and
tlb_rdreg(i*16 + 13 downto i*16 + 12) = tr.addr(7 downto 6) then
tv.may_hit(i) := '1';
end if;
end loop;
-- work out which way to replace in case of a miss
-- (prefer an invalid way; otherwise take the PLRU victim)
if valids = "1111" then
tv.repl_way := tlb_plru_victim;
else
tv.repl_way := find_first_zero(valids);
end if;
-- next read word 2 of group
idx := "010";
if tv.may_hit = "0000" then
-- no candidate way: miss
tv.miss := '1';
if tr.is_tlbie = '0' then
tv.state := WAITW;
else
-- nothing to invalidate
tv.tlbie_done := '1';
tv.state := IDLE;
end if;
else
tlb_doread <= '1';
tlb_rdren <= '1';
tv.state := SEARCH3;
end if;
when SEARCH3 =>
-- tlb_rdreg contains word 1
-- compare the 32-bit tags of ways 0 and 1 with tr.addr(39 downto 8)
for i in 0 to 1 loop
if tr.may_hit(i) = '1' then
if tlb_rdreg(i*32 + 31 downto i*32) /= tr.addr(39 downto 8) then
tv.may_hit(i) := '0';
end if;
end if;
end loop;
if tv.may_hit(0) = '1' then
tv.hit_way := "00";
is_hit := '1';
elsif tv.may_hit(1) = '1' then
tv.hit_way := "01";
is_hit := '1';
end if;
if tr.is_tlbie = '1' then
-- tlbie always goes on to check ways 2 and 3 as well
tlb_rdren <= '1';
tv.state := SEARCH4;
elsif is_hit = '1' then
-- hit in way 0 or 1: read its PTE from word 4 + hit_way
tv.state := RDPTE;
idx := '1' & tv.hit_way;
tlb_doread <= '1';
elsif tv.may_hit = "0000" then
tv.miss := '1';
tv.state := WAITW;
else
-- way 2 or 3 may still match: move word 2 into tlb_rdreg
tlb_rdren <= '1';
tv.state := SEARCH4;
end if;
when SEARCH4 =>
-- tlb_rdreg contains word 2
-- compare the 32-bit tags of ways 2 and 3 with tr.addr(39 downto 8)
for i in 0 to 1 loop
if tr.may_hit(i+2) = '1' then
if tlb_rdreg(i*32 + 31 downto i*32) /= tr.addr(39 downto 8) then
tv.may_hit(i+2) := '0';
end if;
end if;
end loop;
if tr.is_tlbie = '1' then
-- write zeroes to word 0 where hit(s) detected
tlb_wren <= tv.may_hit;
tv.tlbie_done := '1';
tv.state := IDLE;
elsif tv.may_hit = "0000" then
tv.miss := '1';
tv.state := WAITW;
else
-- hit in way 2 if may_hit(2) is set, otherwise in way 3;
-- read its PTE from word 4 + hit_way
tv.hit_way := '1' & not tv.may_hit(2);
idx := '1' & tv.hit_way;
tlb_doread <= '1';
tv.state := RDPTE;
end if;
when RDPTE =>
-- a later rewrite of this translation replaces the way that hit
tv.repl_way := tr.hit_way;
-- move the PTE word into tlb_rdreg and signal the hit
tlb_rdren <= '1';
tv.hit := '1';
tv.update_plru := '1';
tv.state := WAITW;
when WAITW =>
-- wait here until the main MMU state machine either asks for the
-- entry to be written (r.wr_tlbram) or finishes (r.done / r.err)
-- word-0 field for the entry: valid bit, a '0' bit, 2 tag bits, PID
wdat := "10" & tr.addr(7 downto 6) & tr.pid;
tlb_wrdata <= wdat & wdat & wdat & wdat;
if r.wr_tlbram = '1' then
-- write one 16b section of word 0
tlb_wren(to_integer(unsigned(tr.repl_way))) <= '1';
-- mark the written way as accessed in the PLRU state
tv.hit_way := tv.repl_way;
tv.update_plru := '1';
tv.state := WRPTE1;
elsif r.done = '1' or r.err = '1' then
tv.state := IDLE;
end if;
when WRPTE1 =>
-- write the 32-bit tag into word 1 (ways 0/1) or word 2 (ways 2/3);
-- odd ways use the upper 32 bits, even ways the lower 32 bits
tlb_wrdata <= tr.addr(39 downto 8) & tr.addr(39 downto 8);
if tr.repl_way(0) = '1' then
tlb_wren <= "1100";
else
tlb_wren <= "0011";
end if;
idx := '0' & tr.repl_way(1) & not tr.repl_way(1);
tv.state := WRPTE2;
when WRPTE2 =>
-- write the PTE held in r.pde into word 4 + repl_way
tlb_wrdata <= r.pde;
tlb_wren <= "1111";
idx := '1' & tr.repl_way;
tv.state := IDLE;
when INVAL1 =>
-- invalidate-by-PID: word 0 of set 0 moves into tlb_rdreg while
-- word 0 of set 1 is being read; writes start at set 0
tv.hash_4k := 6x"01";
tv.wr_hash := (others => '0');
tlb_doread <= '1';
tlb_rdren <= '1';
tv.state := INVAL2;
when INVAL2 =>
-- one set per cycle: write zeroes (the default tlb_wrdata) into
-- word 0 of set tr.wr_hash
if tr.inval_all = '1' then
-- clear all four entries of the set
tlb_wren <= "1111";
else
-- clear only the valid entries whose PID matches, and keep the
-- read pipeline running one set ahead
valids := "0000";
for i in 0 to 3 loop
if tlb_rdreg(i*16 + 15) = '1' and
tlb_rdreg(i*16 + 11 downto i*16) = tr.pid then
valids(i) := '1';
end if;
end loop;
tlb_wren <= valids;
tlb_doread <= '1';
tlb_rdren <= '1';
end if;
tv.wr_hash := std_ulogic_vector(unsigned(tr.wr_hash) + 1);
tv.hash_4k := std_ulogic_vector(unsigned(tv.hash_4k) + 1);
-- finished once the last set has been written
if tr.wr_hash = 6x"3f" then
tv.tlbie_done := '1';
tv.state := IDLE;
end if;
end case;
-- reads use the next-state hash so that a read can be issued in the same
-- cycle the hash is computed; writes use the registered write hash
tlb_rdaddr <= tv.hash_4k & idx;
tlb_wraddr <= tr.wr_hash & idx;
trin <= tv;
end process;
-- Multiplex internal SPR values back to loadstore1, selected
-- by l_in.sprnf.
l_out.sprval <= r.ptcr when l_in.sprnf = '1' else x"00000000" & r.pid;
l_out.sprval <= r.ptcr when l_in.sprnf = '1' else x"0000000000000" & r.pid;
mmu_0: process(clk)
begin
@ -94,6 +494,7 @@ begin
r.pt3_valid <= '0';
r.ptcr <= (others => '0');
r.pid <= (others => '0');
r.wr_tlbram <= '0';
else
if rin.valid = '1' then
report "MMU got tlb miss for " & to_hstring(rin.addr);
@ -194,12 +595,12 @@ begin
variable v : reg_stage_t;
variable dcreq : std_ulogic;
variable tlb_load : std_ulogic;
variable tlbie_req : std_ulogic;
variable ptbl_rd : std_ulogic;
variable prtbl_rd : std_ulogic;
variable pt_valid : std_ulogic;
variable effpid : std_ulogic_vector(31 downto 0);
variable effpid : std_ulogic_vector(11 downto 0);
variable prtable_addr : std_ulogic_vector(63 downto 0);
variable six : std_ulogic_vector(5 downto 0);
variable rts : unsigned(5 downto 0);
variable mbits : unsigned(5 downto 0);
variable pgtable_addr : std_ulogic_vector(63 downto 0);
@ -223,7 +624,7 @@ begin
v.perm_err := '0';
v.rc_error := '0';
tlb_load := '0';
tlbie_req := '0';
v.tlbie_req := '0';
v.inval_all := '0';
ptbl_rd := '0';
prtbl_rd := '0';
@ -244,7 +645,8 @@ begin
pt_valid := r.pt3_valid;
end if;
-- rts == radix tree size, # address bits being translated
rts := unsigned('0' & pgtbl(62 downto 61) & pgtbl(7 downto 5));
six := '0' & pgtbl(62 downto 61) & pgtbl(7 downto 5);
rts := unsigned(six);
-- mbits == # address bits to index top level of tree
mbits := unsigned('0' & pgtbl(4 downto 0));
-- set v.shift to rts so that we can use finalmask for the segment check
@ -268,6 +670,7 @@ begin
v.pt3_valid := '0';
v.ptb_valid := '0';
end if;
v.tlbie_req := '1';
v.state := DO_TLBIE;
else
v.valid := '1';
@ -289,12 +692,13 @@ begin
end if;
end if;
end if;
v.is_mtspr := l_in.mtspr;
if l_in.mtspr = '1' then
-- Move to PID needs to invalidate L1 TLBs and cached
-- pgtbl0 value. Move to PTCR does that plus
-- invalidating the cached pgtbl3 and prtbl values as well.
if l_in.sprnt = '0' then
v.pid := l_in.rs(31 downto 0);
v.pid := l_in.rs(11 downto 0);
else
v.ptcr := l_in.rs;
v.pt3_valid := '0';
@ -302,12 +706,14 @@ begin
end if;
v.pt0_valid := '0';
v.inval_all := '1';
v.tlbie_req := '1';
v.state := DO_TLBIE;
end if;
when DO_TLBIE =>
tlbie_req := '1';
if r.is_mtspr = '1' or tr.tlbie_done = '1' then
v.state := RADIX_FINISH;
end if;
when PART_TBL_READ =>
dcreq := '1';
@ -340,7 +746,8 @@ begin
v.pt0_valid := '1';
end if;
-- rts == radix tree size, # address bits being translated
rts := unsigned('0' & data(62 downto 61) & data(7 downto 5));
six := '0' & data(62 downto 61) & data(7 downto 5);
rts := unsigned(six);
-- mbits == # address bits to index top level of tree
mbits := unsigned('0' & data(4 downto 0));
-- set v.shift to rts so that we can use finalmask for the segment check
@ -369,9 +776,32 @@ begin
elsif mbits < 5 or mbits > 16 or mbits > (r.shift + (31 - 12)) then
v.state := RADIX_FINISH;
v.badtree := '1';
elsif tr.miss = '1' then
v.state := RADIX_LOOKUP;
else
v.state := TLBWAIT;
end if;
when TLBWAIT =>
v.pde := tlb_rdreg;
if tr.hit = '1' then
-- PTE from the TLB entry is in tlb_rdreg
-- Check permissions; if the access is not permitted,
-- reread the PTE from memory to verify, because increasing
-- permission on a PTE doesn't require tlbie.
-- Note that R must be set in the PTE, otherwise it
-- wouldn't have been written to the TLB.
perm_ok := check_perm(tlb_rdreg, r.priv, r.iside, r.store);
rc_ok := tlb_rdreg(7) or not r.store;
if perm_ok = '1' and rc_ok = '1' then
v.shift := to_unsigned(0, 6);
v.state := RADIX_LOAD_TLB;
else
v.state := RADIX_LOOKUP;
end if;
elsif tr.miss = '1' then
v.state := RADIX_LOOKUP;
end if;
when RADIX_LOOKUP =>
dcreq := '1';
@ -385,19 +815,15 @@ begin
-- test leaf bit
if data(62) = '1' then
-- check permissions and RC bits
perm_ok := '0';
if r.priv = '1' or data(3) = '0' then
if r.iside = '0' then
perm_ok := data(1) or (data(2) and not r.store);
else
-- no IAMR, so no KUEP support for now
-- deny execute permission if cache inhibited
perm_ok := data(0) and not data(5);
end if;
end if;
perm_ok := check_perm(data, r.priv, r.iside, r.store);
rc_ok := data(8) and (data(7) or not r.store);
if perm_ok = '1' and rc_ok = '1' then
v.state := RADIX_LOAD_TLB;
-- only cache 4k PTEs in our TLB, and only if the
-- address is within the standard 52 bit EA space
if r.shift = 0 then
v.wr_tlbram := '1';
end if;
else
v.state := RADIX_FINISH;
v.perm_err := not perm_ok;
@ -432,6 +858,7 @@ begin
v.state := RADIX_FINISH;
when RADIX_FINISH =>
v.wr_tlbram := '0';
v.state := IDLE;
end case;
@ -442,13 +869,13 @@ begin
end if;
if r.addr(63) = '1' then
effpid := x"00000000";
effpid := (others => '0');
else
effpid := r.pid;
end if;
prtable_addr := x"00" & r.prtbl(55 downto 36) &
((r.prtbl(35 downto 12) and not finalmask(23 downto 0)) or
(effpid(31 downto 8) and finalmask(23 downto 0))) &
prtable_addr := x"00" & r.prtbl(55 downto 16) &
((r.prtbl(15 downto 12) and not finalmask(3 downto 0)) or
(effpid(11 downto 8) and finalmask(3 downto 0))) &
effpid(7 downto 0) & "0000";
pgtable_addr := x"00" & r.pgbase(55 downto 19) &
@ -462,7 +889,7 @@ begin
rin <= v;
-- drive outputs
if tlbie_req = '1' then
if r.tlbie_req = '1' then
addr := r.addr;
tlb_data := (others => '0');
elsif tlb_load = '1' then
@ -488,14 +915,14 @@ begin
l_out.rc_error <= r.rc_error;
d_out.valid <= dcreq;
d_out.tlbie <= tlbie_req;
d_out.tlbie <= r.tlbie_req;
d_out.doall <= r.inval_all;
d_out.tlbld <= tlb_load and not r.iside;
d_out.addr <= addr;
d_out.pte <= tlb_data;
i_out.tlbld <= tlb_load and r.iside;
i_out.tlbie <= tlbie_req;
i_out.tlbie <= r.tlbie_req;
i_out.doall <= r.inval_all;
i_out.addr <= addr;
i_out.pte <= tlb_data;