From 9c66ab9153b7817cae02e0777e11384154924213 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 11 Mar 2026 15:06:12 +1100 Subject: [PATCH] MMU: Implement a page-walk cache This adds a page-walk cache (PWC) which stores PDEs from the page tables at the 2MB, 1GB and 512GB levels, provided they point to tables with 512 entries and map addresses below 4PB. The PWC also stores PTEs for 2MB large pages. It uses a 512 x 64b block RAM structured as 64 sets, each set using 8 words of RAM and storing 4 ways. The valid bit, page size, leaf indication (PTE vs. PDE), and PID for all 4 ways are stored in the first 64b word so that invalidate-all and invalidate-by-PID can be done in 64 cycles. The MMU test (tests/mmu/mmu.c) is modified to use a three-level tree mapping a total of 512GB, where the 1G and 2M levels can be cached in the PWC. Signed-off-by: Paul Mackerras --- mmu.vhdl | 833 ++++++++++++++++++++++++++++++++++++++++----- tests/mmu/mmu.c | 28 +- tests/test_mmu.bin | Bin 24608 -> 24616 bytes 3 files changed, 772 insertions(+), 89 deletions(-) diff --git a/mmu.vhdl b/mmu.vhdl index e58f809..e476caa 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -30,7 +30,6 @@ architecture behave of mmu is DO_TLBIE, PART_TBL_READ, PART_TBL_WAIT, - PART_TBL_DONE, PROC_TBL_READ, PROC_TBL_WAIT, SEGMENT_CHECK, @@ -71,9 +70,13 @@ architecture behave of mmu is segerror : std_ulogic; perm_err : std_ulogic; rc_error : std_ulogic; - wr_tlbram : std_ulogic; tlbie_req : std_ulogic; is_mtspr : std_ulogic; + rereadpte : std_ulogic; + -- communication with TLB and PWC + wr_tlbram : std_ulogic; + wr_pwcram : std_ulogic; + pwc_level : std_ulogic_vector(1 downto 0); end record; signal r, rin : reg_stage_t; @@ -151,6 +154,112 @@ architecture behave of mmu is signal tlb_plru_upd : std_ulogic_vector(2 downto 0); signal tlb_plru_victim : std_ulogic_vector(1 downto 0); + -- Page walk cache, 256 entries, 4-way set associative + -- (also stores large page PTEs). 
+ -- This is implemented using a 512 x 64 bit RAM, divided + -- into 64 blocks of 8 words, each block containing a set of + -- 4 entries. It caches PDEs at the 2MB, 1GB and 512GB levels + -- (plus 2MB large-page PTEs) for a 52-bit address space, giving + -- 31-6, 22-6, and 13-6 bits of address tag respectively + -- (the -6 is because of the 6-bit index). + -- In each block, word 0 holds four 16-bit fields, one per entry: a valid + -- bit, a leaf (PTE vs. PDE) bit, a 2-bit size field and the 12-bit PID. + -- (This allows us to do invalidate-all or invalidate-by-PID + -- in 64 cycles instead of 256.) + -- For 2MB entries, word 1 contains two 32-bit fields containing + -- address tags (25 bits) for entries 0 and 1, and word 2 has + -- the tags for entries 2 and 3. + -- For 1GB entries, word 3 contains four 16-bit fields containing + -- address tags (16 bits) for entries 0 - 3. For 512GB entries, + -- word 3 is used similarly but there are only 7 bits per tag. + -- Words 4 to 7 contain the PTE/PDE value for entries 0 to 3. + -- Words 1 - 3 are arranged in this way so that any entry can be + -- written in 3 cycles without disturbing other entries. + -- EAs are expected to be in a 4PB (52-bit) space per PID + -- (ignoring the quadrant bits); anything outside that + -- doesn't get cached. 
+ constant PWC_WIDTH : natural := 64; + constant PWC_DEPTH : natural := 256; + constant PWC_HASH_BITS : natural := 6; + constant PWC_ADDR_BITS : natural := PWC_HASH_BITS + 3; + subtype pwc_word_t is std_ulogic_vector(PWC_WIDTH - 1 downto 0); + type pwc_t is array(0 to 2 * PWC_DEPTH - 1) of pwc_word_t; + signal pwc : pwc_t; + subtype pwc_index_t is integer range 0 to 2**PWC_HASH_BITS - 1; + + signal pwc_doread : std_ulogic; + signal pwc_rdren : std_ulogic; + signal pwc_rdaddr : std_ulogic_vector(PWC_ADDR_BITS - 1 downto 0); + signal pwc_rddata : std_ulogic_vector(PWC_WIDTH - 1 downto 0); + signal pwc_rdreg : std_ulogic_vector(PWC_WIDTH - 1 downto 0); + signal pwc_wren : std_ulogic_vector(3 downto 0); + signal pwc_wraddr : std_ulogic_vector(PWC_ADDR_BITS - 1 downto 0); + signal pwc_wrdata : std_ulogic_vector(PWC_WIDTH - 1 downto 0); + + type pwc_state_t is (IDLE, + SEARCH1, + SEARCH_2M_0, SEARCH_2M_1, SEARCH_2M_2, + SEARCH_1G_0, SEARCH_1G_3, + SEARCH_HT_0, SEARCH_HT_3, + RDPDE, + WAITW, WRPTE1_2M, WRPTE1_W3, WRPTE2, + INVAL1, INVAL2, + INVAL_2M, INVAL_2M_0, INVAL_2M_1, INVAL_2M_2); + + type mmu_pwc_reg_t is record + state : pwc_state_t; + next_state : pwc_state_t; + addr : std_ulogic_vector(30 downto 0); + pid : std_ulogic_vector(11 downto 0); + bad_ea : std_ulogic; + hash_2M : std_ulogic_vector(PWC_HASH_BITS - 1 downto 0); + hash_1G : std_ulogic_vector(PWC_HASH_BITS - 1 downto 0); + hash_512G : std_ulogic_vector(PWC_HASH_BITS - 1 downto 0); + is_tlbie : std_ulogic; + may_hit_2M : std_ulogic_vector(3 downto 0); + may_hit_1G : std_ulogic_vector(3 downto 0); + may_hit_512G : std_ulogic_vector(3 downto 0); + missed_2M : std_ulogic; + missed_1G : std_ulogic; + missed_512G : std_ulogic; + hit : std_ulogic; + miss : std_ulogic; + hit_size : std_ulogic_vector(1 downto 0); + sel_way : std_ulogic_vector(1 downto 0); + repl_way_2M : std_ulogic_vector(1 downto 0); + repl_way_1G : std_ulogic_vector(1 downto 0); + repl_way_HT : std_ulogic_vector(1 downto 0); + wr_leaf : 
std_ulogic; + wr_level : std_ulogic_vector(1 downto 0); + update_plru : std_ulogic; + tlbie_done : std_ulogic; + inval_all : std_ulogic; + inval_pdes : std_ulogic; + inval_pid : std_ulogic; + rd_hash : std_ulogic_vector(PWC_HASH_BITS - 1 downto 0); + reg_hash : std_ulogic_vector(PWC_HASH_BITS - 1 downto 0); + end record; + + constant mmu_pwc_reg_init : mmu_pwc_reg_t := ( + state => INVAL2, next_state => IDLE, inval_all => '1', + addr => 31x"0", pid => 12x"0", + hash_2M => (others => '0'), hash_1G => (others => '0'), + hash_512G => (others => '0'), + rd_hash => (others => '0'), reg_hash => (others => '0'), + may_hit_2M => "0000", may_hit_1G => "0000", may_hit_512G => "0000", + sel_way => "00", hit_size => "00", + repl_way_2M => "00", repl_way_1G => "00", repl_way_HT => "00", + wr_level => "00", + others => '0'); + signal pr, prin : mmu_pwc_reg_t; + + -- PWC PLRU array + type pwc_plru_array is array(pwc_index_t) of std_ulogic_vector(2 downto 0); + signal pwc_plru_ram : pwc_plru_array; + signal pwc_plru_cur : std_ulogic_vector(2 downto 0); + signal pwc_plru_upd : std_ulogic_vector(2 downto 0); + signal pwc_plru_victim : std_ulogic_vector(1 downto 0); + function addr_hash_4k(ea: std_ulogic_vector(63 downto 0); pid: std_ulogic_vector(11 downto 0)) return std_ulogic_vector is variable h : std_ulogic_vector(TLB_HASH_BITS - 1 downto 0); @@ -161,24 +270,56 @@ architecture behave of mmu is return h; end; - function find_first_zero(x: std_ulogic_vector(3 downto 0)) return std_ulogic_vector is + function addr_hash_2M(ea: std_ulogic_vector(63 downto 0); + pid: std_ulogic_vector(11 downto 0)) return std_ulogic_vector is + variable h : std_ulogic_vector(PWC_HASH_BITS - 1 downto 0); + begin + h := ea(26 downto 21) xor ea(32 downto 27) xor ea(51 downto 46) xor + pid(5 downto 0) xor pid(11 downto 6) xor 6x"09"; + return h; + end; + + function addr_hash_1G(ea: std_ulogic_vector(63 downto 0); + pid: std_ulogic_vector(11 downto 0)) return std_ulogic_vector is + variable h : 
std_ulogic_vector(PWC_HASH_BITS - 1 downto 0); + begin + h := ea(35 downto 30) xor ea(41 downto 36) xor ea(51 downto 46) xor + pid(5 downto 0) xor pid(11 downto 6) xor 6x"12"; + return h; + end; + + function addr_hash_512G(ea: std_ulogic_vector(63 downto 0); + pid: std_ulogic_vector(11 downto 0)) return std_ulogic_vector is + variable h : std_ulogic_vector(PWC_HASH_BITS - 1 downto 0); + begin + h := ea(44 downto 39) xor ea(51 downto 46) xor + pid(5 downto 0) xor pid(11 downto 6) xor 6x"24"; + return h; + end; + + function find_first_one(x: std_ulogic_vector(3 downto 0)) return std_ulogic_vector is begin for i in 0 to 2 loop - if x(i) = '0' then + if x(i) = '1' then return std_ulogic_vector(to_unsigned(i, 2)); end if; end loop; return "11"; end; - function check_perm(pte: std_ulogic_vector(63 downto 0); priv: std_ulogic; - iside: std_ulogic; store: std_ulogic) return std_ulogic is + function check_perm_c(pte: std_ulogic_vector(63 downto 0); priv: std_ulogic; + iside: std_ulogic; store: std_ulogic; cbit : std_ulogic) + return std_ulogic is variable ok: std_ulogic; begin ok := '0'; if priv = '1' or pte(3) = '0' then if iside = '0' then - ok := pte(1) or (pte(2) and not store); + if store = '0' then + ok := pte(1) or pte(2); -- loads need R or W permission + else + ok := pte(1) and cbit; -- stores need W and C + end if; else -- no IAMR, so no KUEP support for now -- deny execute permission if cache inhibited @@ -344,7 +485,7 @@ begin if valids = "1111" then tv.repl_way := tlb_plru_victim; else - tv.repl_way := find_first_zero(valids); + tv.repl_way := find_first_one(not valids); end if; -- next read word 2 of group idx := "010"; @@ -479,6 +620,520 @@ begin trin <= tv; end process; + -- Synchronous reads and writes to PWC array + mmu_pwc_ram: process(clk) + begin + if rising_edge(clk) then + if pwc_rdren = '1' then + pwc_rdreg <= pwc_rddata; + end if; + if pwc_doread = '1' then + pwc_rddata <= pwc(to_integer(unsigned(pwc_rdaddr))); + end if; + if pwc_wren /= "0000" then 
+ for i in 0 to 3 loop + if pwc_wren(i) = '1' then + pwc(to_integer(unsigned(pwc_wraddr)))(i*16 + 15 downto i*16) <= + pwc_wrdata(i*16 + 15 downto i*16); + end if; + end loop; + end if; + end if; + end process; + + -- PWC PLRU + pwc_plru : entity work.plrufn + generic map ( + BITS => 2 + ) + port map ( + acc => pr.sel_way, + tree_in => pwc_plru_cur, + tree_out => pwc_plru_upd, + lru => pwc_plru_victim + ); + + process(clk) + begin + if rising_edge(clk) then + if is_X(pr.rd_hash) then + pwc_plru_cur <= (others => 'X'); + else + pwc_plru_cur <= pwc_plru_ram(to_integer(unsigned(pr.rd_hash))); + end if; + if pr.update_plru = '1' then + assert not is_X(pr.rd_hash) severity failure; + pwc_plru_ram(to_integer(unsigned(pr.rd_hash))) <= pwc_plru_upd; + end if; + end if; + end process; + + -- State machine for doing PWC searches, updates and invalidations + mmu_pwc_0: process(clk) + begin + if rising_edge(clk) then + if rst = '1' then + pr <= mmu_pwc_reg_init; + else + pr <= prin; + end if; + end if; + end process; + + mmu_pwc_1: process(all) + variable pv : mmu_pwc_reg_t; + variable isf : std_ulogic_vector(1 downto 0); + variable ap : std_ulogic_vector(2 downto 0); + variable is_hit : std_ulogic; + variable valids : std_ulogic_vector(3 downto 0); + variable idx : std_ulogic_vector(2 downto 0); + variable wdat : std_ulogic_vector(15 downto 0); + variable rway : std_ulogic_vector(1 downto 0); + variable wr_hash : std_ulogic_vector(5 downto 0); + begin + pv := pr; + pwc_doread <= '0'; + pwc_rdren <= '0'; + pwc_wren <= "0000"; + pwc_wrdata <= (others => '0'); + is_hit := '0'; + idx := "000"; + wr_hash := (others => '0'); + pv.update_plru := '0'; + case pr.state is + when IDLE => + pv.state := IDLE; + pv.next_state := IDLE; + pv.addr := l_in.addr(51 downto 21); + pv.pid := (others => '0'); + if l_in.tlbie = '1' then + -- PID for tlbie comes from RS + pv.pid := l_in.rs(43 downto 32); + elsif l_in.addr(63) = '0' then + -- we currently only implement quadrants 0 and 3 + pv.pid := 
r.pid; + end if; + pv.bad_ea := (or (l_in.addr(61 downto 52)) or (l_in.addr(63) xor l_in.addr(62))) + and not l_in.tlbie; + pv.hash_2M := addr_hash_2M(l_in.addr, pv.pid); + pv.hash_1G := addr_hash_1G(l_in.addr, pv.pid); + pv.hash_512G := addr_hash_512G(l_in.addr, pv.pid); + pv.rd_hash := pv.hash_2M; + pv.is_tlbie := l_in.tlbie; + pv.missed_2M := '0'; + pv.missed_1G := '0'; + pv.missed_512G := '0'; + if l_in.valid = '1' then + pv.hit := '0'; + pv.miss := '0'; + pv.tlbie_done := '0'; + pv.inval_all := '0'; + pv.inval_pdes := '0'; + pv.inval_pid := '0'; + if l_in.tlbie = '1' then + -- decode what type of tlbie this is + isf := l_in.addr(11 downto 10); + pv.inval_pdes := (l_in.ric(0) or l_in.ric(1)); + if l_in.slbia = '1' then + -- no effect on this PWC (flushes L1 TLBs below) + pv.tlbie_done := '1'; + elsif isf(1) = '1' and pv.inval_pdes = '1' then + -- invalidate everything in this cache + pv.inval_all := '1'; + pv.rd_hash := (others => '0'); + pv.reg_hash := (others => '0'); + pv.state := INVAL2; + elsif isf(1) = '1' or isf(0) = '1' then + -- invalidate PTEs but not PDEs, or invalidate by PID + -- in these cases we need to read word 0 of each group + pv.inval_pid := not isf(1); + pv.rd_hash := (others => '0'); + pwc_doread <= '1'; + pv.state := INVAL1; + else + -- invalidate single page + ap := l_in.addr(7 downto 5); -- actual page size + if ap = "001" then -- 2MB page + pwc_doread <= '1'; + pv.state := INVAL_2M; + else + -- 4k, 64k, 1G or unrecognized + pv.tlbie_done := '1'; + end if; + end if; + else + -- first read word 0 of 2M group + pwc_doread <= '1'; + pv.state := SEARCH1; + pv.next_state := SEARCH_2M_0; + end if; + end if; + + when SEARCH1 => + -- next read word 0 of 1G group + pv.rd_hash := pr.hash_1G; + pwc_doread <= '1'; + pwc_rdren <= '1'; + if pr.bad_ea = '0' then + pv.state := SEARCH_2M_0; + else + pv.miss := '1'; + pv.state := IDLE; + end if; + + when SEARCH_2M_0 => + -- pwc_rdreg contains 2M group word 0, check for hits/misses + pv.may_hit_2M := 
"0000"; + valids := "0000"; + for i in 0 to 3 loop + valids(i) := pwc_rdreg(i*16 + 15); + if pwc_rdreg(i*16 + 15) = '1' and + pwc_rdreg(i*16 + 11 downto i*16) = pr.pid and + pwc_rdreg(i*16 + 13 downto i*16 + 12) = "00" then + pv.may_hit_2M(i) := '1'; + end if; + end loop; + if valids = "1111" then + pv.repl_way_2M := pwc_plru_victim; + else + pv.repl_way_2M := find_first_one(not valids); + end if; + -- if any 2M hits are possible, read word 1 of 2M group next + if pv.may_hit_2M /= "0000" then + pv.rd_hash := pr.hash_2M; + idx := "001"; + pv.next_state := SEARCH_2M_1; + else + -- otherwise read word 0 of 512G group next + pv.missed_2M := '1'; + pv.rd_hash := pr.hash_512G; + pv.next_state := SEARCH_HT_0; + end if; + pv.state := SEARCH_1G_0; + pwc_doread <= '1'; + pwc_rdren <= '1'; + when SEARCH_2M_1 => + -- pwc_rdreg contains 2M group word 1 + for i in 0 to 1 loop + if pwc_rdreg(i*32 + 31 downto i*32 + 7) /= pr.addr(30 downto 6) then + pv.may_hit_2M(i) := '0'; + end if; + end loop; + if pv.may_hit_2M = "0000" then + pv.missed_2M := '1'; + end if; + -- decide what to read next based on whether 1G hits are still possible + if pr.missed_1G = '0' then + pv.rd_hash := pr.hash_1G; + idx := "011"; + pv.next_state := SEARCH_1G_3; + else + pv.rd_hash := pr.hash_512G; + pv.next_state := SEARCH_HT_0; + end if; + pv.state := pr.next_state; -- will be SEARCH_2M_2 + pwc_doread <= '1'; + pwc_rdren <= '1'; + when SEARCH_2M_2 => + -- pwc_rdreg contains 2M group word 2 + for i in 0 to 1 loop + if pwc_rdreg(i*32 + 31 downto i*32 + 7) /= pr.addr(30 downto 6) then + pv.may_hit_2M(i+2) := '0'; + end if; + end loop; + -- Can now decide hit/miss for 2M entries + if pv.may_hit_2M /= "0000" then + pv.sel_way := find_first_one(pv.may_hit_2M); + pv.hit_size := "00"; + pv.rd_hash := pr.hash_2M; + idx := '1' & pv.sel_way; + pv.state := RDPDE; + else + pv.missed_2M := '1'; + pv.rd_hash := pr.hash_512G; + if pr.missed_1G = '0' then + pv.next_state := SEARCH_HT_0; + else + idx := "011"; + 
pv.next_state := SEARCH_HT_3; + end if; + pv.state := pr.next_state; + pwc_rdren <= '1'; + end if; + pwc_doread <= '1'; + + when SEARCH_1G_0 => + -- pwc_rdreg contains 1G group word 0, check for hits/misses + pv.may_hit_1G := "0000"; + valids := "0000"; + for i in 0 to 3 loop + valids(i) := pwc_rdreg(i*16 + 15); + if pwc_rdreg(i*16 + 15) = '1' and + pwc_rdreg(i*16 + 11 downto i*16) = pr.pid and + pwc_rdreg(i*16 + 13 downto i*16 + 12) = "01" then + pv.may_hit_1G(i) := '1'; + end if; + end loop; + if valids = "1111" then + pv.repl_way_1G := pwc_plru_victim; + else + pv.repl_way_1G := find_first_one(not valids); + end if; + if pv.may_hit_1G = "0000" then + pv.missed_1G := '1'; + end if; + if pr.missed_2M = '0' then + -- If 2M hits are still possible, read word 2 of 2M group next + pv.rd_hash := pr.hash_2M; + idx := "010"; + pv.next_state := SEARCH_2M_2; + elsif pv.missed_1G = '0' then + -- otherwise, if any 1G hits are possible, read word 3 of 1G group next + pv.rd_hash := pr.hash_1G; + idx := "011"; + pv.next_state := SEARCH_1G_3; + else + -- otherwise read word 0 of 512G group + pv.rd_hash := pr.hash_512G; + pv.next_state := SEARCH_HT_0; + end if; + pv.state := pr.next_state; + pwc_doread <= '1'; + pwc_rdren <= '1'; + when SEARCH_1G_3 => + -- pwc_rdreg contains 1G group word 3 + for i in 0 to 3 loop + if pwc_rdreg(i*16 + 15 downto i*16) /= pr.addr(30 downto 15) then + pv.may_hit_1G(i) := '0'; + end if; + end loop; + -- Can now decide hit/miss for 1G entries + if pv.may_hit_1G /= "0000" then + pv.sel_way := find_first_one(pv.may_hit_1G); + pv.hit_size := "01"; + pv.rd_hash := pr.hash_1G; + idx := '1' & pv.sel_way; + pv.state := RDPDE; + pwc_doread <= '1'; + else + pv.missed_1G := '1'; + if pr.missed_512G = '0' then + pv.state := pr.next_state; + pwc_rdren <= '1'; + else + pv.miss := '1'; + pv.state := WAITW; + end if; + end if; + + when SEARCH_HT_0 => + -- pwc_rdreg contains 512G group (half TB) word 0, check for hits/misses + pv.may_hit_512G := "0000"; + valids := 
"0000"; + for i in 0 to 3 loop + valids(i) := pwc_rdreg(i*16 + 15); + if pwc_rdreg(i*16 + 15) = '1' and + pwc_rdreg(i*16 + 11 downto i*16) = pr.pid and + pwc_rdreg(i*16 + 13 downto i*16 + 12) = "10" then + pv.may_hit_512G(i) := '1'; + end if; + end loop; + if valids = "1111" then + pv.repl_way_HT := pwc_plru_victim; + else + pv.repl_way_HT := find_first_one(not valids); + end if; + -- if any 512G hits are possible, read word 3 of 512G group next + if pv.may_hit_512G /= "0000" then + pv.rd_hash := pr.hash_512G; + idx := "011"; + pv.next_state := SEARCH_HT_3; + pwc_doread <= '1'; + else + pv.missed_512G := '1'; + end if; + if pv.missed_512G = '1' and pr.missed_1G = '1' then + pv.miss := '1'; + pv.state := WAITW; + else + pv.state := pr.next_state; + pwc_rdren <= '1'; + end if; + when SEARCH_HT_3 => + -- pwc_rdreg contains 512G group word 3 + for i in 0 to 3 loop + if pwc_rdreg(i*16 + 15 downto i*16 + 9) /= pr.addr(30 downto 24) then + pv.may_hit_512G(i) := '0'; + end if; + end loop; + -- Can now decide hit/miss for 512G entries + if pv.may_hit_512G /= "0000" then + pv.sel_way := find_first_one(pv.may_hit_512G); + pv.hit_size := "10"; + pv.rd_hash := pr.hash_512G; + idx := '1' & pv.sel_way; + pv.state := RDPDE; + pwc_doread <= '1'; + else + pv.miss := '1'; + pv.state := WAITW; + end if; + + when RDPDE => + pwc_rdren <= '1'; + pv.hit := '1'; + pv.update_plru := '1'; + pv.state := WAITW; + when WAITW => + pwc_wrdata <= r.pde; + pv.wr_leaf := r.pde(62); + pv.wr_level := r.pwc_level; + rway := "00"; + if r.rereadpte = '1' then + -- rewriting a 2M PTE with changed permissions + rway := pr.sel_way; + wr_hash := pr.hash_2M; + else + -- choose way according to which group is to be written + case r.pwc_level is + when "00" => -- 2M + rway := pr.repl_way_2M; + wr_hash := pr.hash_2M; + when "01" => + rway := pr.repl_way_1G; + wr_hash := pr.hash_1G; + when others => + rway := pr.repl_way_HT; + wr_hash := pr.hash_512G; + end case; + end if; + if r.wr_pwcram = '1' then + -- write 
PDE to one of words 4-7 + pwc_wren <= "1111"; + idx := '1' & rway; + pv.rd_hash := wr_hash; + pv.sel_way := rway; + pv.update_plru := '1'; + if r.pwc_level = "00" then + pv.state := WRPTE1_2M; + else + pv.state := WRPTE1_W3; + end if; + elsif r.done = '1' or r.err = '1' then + pv.state := IDLE; + end if; + when WRPTE1_2M => + pwc_wrdata <= pr.addr & '0' & pr.addr & '0'; + wr_hash := pr.rd_hash; + if pr.sel_way(0) = '1' then + pwc_wren <= "1100"; + else + pwc_wren <= "0011"; + end if; + idx := '0' & pr.sel_way(1) & not pr.sel_way(1); + pv.state := WRPTE2; + when WRPTE1_W3 => + pwc_wrdata <= pr.addr(30 downto 15) & pr.addr(30 downto 15) & + pr.addr(30 downto 15) & pr.addr(30 downto 15); + wr_hash := pr.rd_hash; + pwc_wren(to_integer(unsigned(pr.sel_way))) <= '1'; + idx := "011"; + pv.state := WRPTE2; + when WRPTE2 => + -- word 0 gets valid, leaf bit, page size, PID + wdat := '1' & pr.wr_leaf & pr.wr_level & pr.pid; + pwc_wrdata <= wdat & wdat & wdat & wdat; + -- write one 16b section of word 0 + wr_hash := pr.rd_hash; + pwc_wren(to_integer(unsigned(pr.sel_way))) <= '1'; + if pr.wr_leaf = '1' then + pv.state := IDLE; + else + pv.state := WAITW; + end if; + + when INVAL1 => + pv.rd_hash := 6x"01"; + pwc_doread <= '1'; + pwc_rdren <= '1'; + pv.state := INVAL2; + when INVAL2 => + if pr.inval_all = '1' then + pwc_wren <= "1111"; + pv.reg_hash := pr.rd_hash; + else + valids := "0000"; + for i in 0 to 3 loop + if pwc_rdreg(i*16 + 15) = '1' and + (pwc_rdreg(i*16 + 14) = '1' or pr.inval_pdes = '1') and + (pwc_rdreg(i*16 + 11 downto i*16) = pr.pid or pr.inval_pid = '0') then + valids(i) := '1'; + end if; + end loop; + pwc_wren <= valids; + pwc_doread <= '1'; + pwc_rdren <= '1'; + end if; + wr_hash := pr.reg_hash; + pv.rd_hash := std_ulogic_vector(unsigned(pv.rd_hash) + 1); + if pr.reg_hash = 6x"3f" then + pv.tlbie_done := '1'; + pv.state := IDLE; + end if; + + when INVAL_2M => + -- next read word 1 of 2M group + idx := "001"; + pwc_doread <= '1'; + pwc_rdren <= '1'; + pv.state 
:= INVAL_2M_0; + when INVAL_2M_0 => + -- pwc_rdreg contains 2M group word 0 + pv.may_hit_2M := "0000"; + for i in 0 to 3 loop + if pwc_rdreg(i*16 + 15 downto i*16 + 12) = "1100" and + pwc_rdreg(i*16 + 11 downto i*16) = pr.pid then + pv.may_hit_2M(i) := '1'; + end if; + end loop; + -- next read word 2 of 2M group + idx := "010"; + pwc_doread <= '1'; + pwc_rdren <= '1'; + pv.state := INVAL_2M_1; + when INVAL_2M_1 => + -- pwc_rdreg contains 2M group word 1 (tags for ways 0 and 1) + for i in 0 to 1 loop + if pwc_rdreg(i*32 + 31 downto i*32 + 7) /= pr.addr(30 downto 6) then + pv.may_hit_2M(i) := '0'; + end if; + end loop; + pwc_rdren <= '1'; + pv.state := INVAL_2M_2; + when INVAL_2M_2 => + -- pwc_rdreg contains 2M group word 2 (tags for ways 2 and 3); compare with pr.addr like word 1 + for i in 0 to 1 loop + if pwc_rdreg(i*32 + 31 downto i*32 + 7) /= pr.addr(30 downto 6) then + pv.may_hit_2M(i+2) := '0'; + end if; + end loop; + wr_hash := pr.hash_2M; + pwc_wren <= pv.may_hit_2M; -- zero the word 0 field of each matching way + pv.tlbie_done := '1'; + pv.state := IDLE; + + end case; + if r.done = '1' or r.err = '1' then + pv.state := IDLE; + end if; + if pwc_rdren = '1' then + pv.reg_hash := pr.rd_hash; + end if; + pwc_rdaddr <= pv.rd_hash & idx; + pwc_wraddr <= wr_hash & idx; + prin <= pv; + end process; + -- Multiplex internal SPR values back to loadstore1, selected -- by l_in.sprnf. 
l_out.sprval <= r.ptcr when l_in.sprnf = '1' else x"0000000000000" & r.pid; @@ -514,6 +1169,9 @@ begin report "send load addr=" & to_hstring(d_out.addr) & " addrsh=" & to_hstring(addrsh) & " mask=" & to_hstring(mask); end if; + if l_in.valid = '1' or l_in.mtspr = '1' then + assert r.state = IDLE severity failure; + end if; r <= rin; end if; end if; @@ -612,6 +1270,7 @@ begin variable rc_ok : std_ulogic; variable addr : std_ulogic_vector(63 downto 0); variable data : std_ulogic_vector(63 downto 0); + variable tlbdone, pwcdone : std_ulogic; begin v := r; v.valid := '0'; @@ -623,6 +1282,7 @@ begin v.segerror := '0'; v.perm_err := '0'; v.rc_error := '0'; + v.wr_pwcram := '0'; tlb_load := '0'; v.tlbie_req := '0'; v.inval_all := '0'; @@ -635,24 +1295,17 @@ begin data(i * 8 + 7 downto i * 8) := d_in.data((7 - i) * 8 + 7 downto (7 - i) * 8); end loop; + if r.addr(63) = '0' then + pgtbl := r.pgtbl0; + pt_valid := r.pt0_valid; + else + pgtbl := r.pgtbl3; + pt_valid := r.pt3_valid; + end if; + case r.state is when IDLE => - if l_in.addr(63) = '0' then - pgtbl := r.pgtbl0; - pt_valid := r.pt0_valid; - else - pgtbl := r.pgtbl3; - pt_valid := r.pt3_valid; - end if; - -- rts == radix tree size, # address bits being translated - six := '0' & pgtbl(62 downto 61) & pgtbl(7 downto 5); - rts := unsigned(six); - -- mbits == # address bits to index top level of tree - mbits := unsigned('0' & pgtbl(4 downto 0)); - -- set v.shift to rts so that we can use finalmask for the segment check - v.shift := rts; - v.mask_size := mbits(4 downto 0); - v.pgbase := pgtbl(55 downto 8) & x"00"; + v.rereadpte := '0'; if l_in.valid = '1' then v.addr := l_in.addr; @@ -677,18 +1330,9 @@ begin if r.ptb_valid = '0' then -- need to fetch process table base from partition table v.state := PART_TBL_READ; - elsif pt_valid = '0' then - -- need to fetch process table entry - -- set v.shift so we can use finalmask for generating - -- the process table entry address - v.shift := unsigned('0' & r.prtbl(4 downto 0)); 
- v.state := PROC_TBL_READ; - elsif mbits = 0 then - -- Use RPDS = 0 to disable radix tree walks - v.state := RADIX_FINISH; - v.invalid := '1'; else - v.state := SEGMENT_CHECK; + -- wait for TLB and PWC to do their stuff + v.state := TLBWAIT; end if; end if; end if; @@ -711,7 +1355,7 @@ begin end if; when DO_TLBIE => - if r.is_mtspr = '1' or tr.tlbie_done = '1' then + if r.is_mtspr = '1' or (tr.tlbie_done = '1' and pr.tlbie_done = '1') then v.state := RADIX_FINISH; end if; @@ -724,12 +1368,61 @@ begin if d_in.done = '1' then v.prtbl := data; v.ptb_valid := '1'; - v.state := PART_TBL_DONE; + v.state := TLBWAIT; end if; - when PART_TBL_DONE => - v.shift := unsigned('0' & r.prtbl(4 downto 0)); - v.state := PROC_TBL_READ; + when TLBWAIT => + -- If we have a TLB hit, or a PWC hit that is a + -- large-page PTE, check permissions; + -- if the access is not permitted, we will need to reread + -- the PTE from memory to verify, because increasing + -- permission on a PTE doesn't require tlbie. + -- (Note that R must be set in the PTE, otherwise it + -- wouldn't have been written to the TLB.) 
+ tlbdone := tr.hit or tr.miss; + pwcdone := pr.hit or pr.miss; + if tr.hit = '1' and r.rereadpte = '0' then + v.pde := tlb_rdreg; + if check_perm_c(tlb_rdreg, r.priv, r.iside, r.store, tlb_rdreg(7)) = '1' then + v.shift := to_unsigned(0, 6); + v.state := RADIX_LOAD_TLB; + else + v.rereadpte := '1'; + end if; + elsif pr.hit = '1' and pr.hit_size = "00" and pwc_rdreg(62) = '1' and r.rereadpte = '0' then + v.pde := pwc_rdreg; + if check_perm_c(pwc_rdreg, r.priv, r.iside, r.store, pwc_rdreg(7)) = '1' then + -- Large-page (2M) PTE from PWC is in pwc_rdreg + v.shift := to_unsigned(9, 6); + v.state := RADIX_LOAD_TLB; + else + v.rereadpte := '1'; + end if; + elsif pr.hit = '1' and pwc_rdreg(62) = '0' and tlbdone = '1' then + v.pde := pwc_rdreg; + -- PDE from PWC is in pwc_rdreg + -- multiply pr.hit_size by 9 to get shift + six := '0' & pr.hit_size & '0' & pr.hit_size; + v.shift := unsigned(six); + v.mask_size := to_unsigned(9, 5); + v.pgbase := pwc_rdreg(55 downto 8) & x"00"; + v.state := RADIX_LOOKUP; + elsif tlbdone = '1' and pwcdone = '1' then + if pt_valid = '0' then + -- need to fetch process table entry + -- set v.shift so we can use finalmask for generating + -- the process table entry address + v.shift := unsigned('0' & r.prtbl(4 downto 0)); + v.state := PROC_TBL_READ; + else + -- rts == radix tree size, # address bits being translated + six := '0' & pgtbl(62 downto 61) & pgtbl(7 downto 5); + rts := unsigned(six); + -- set v.shift to rts so that we can use finalmask for the segment check + v.shift := rts; + v.state := SEGMENT_CHECK; + end if; + end if; when PROC_TBL_READ => dcreq := '1'; @@ -748,18 +1441,9 @@ begin -- rts == radix tree size, # address bits being translated six := '0' & data(62 downto 61) & data(7 downto 5); rts := unsigned(six); - -- mbits == # address bits to index top level of tree - mbits := unsigned('0' & data(4 downto 0)); -- set v.shift to rts so that we can use finalmask for the segment check v.shift := rts; - v.mask_size := mbits(4 downto 
0); - v.pgbase := data(55 downto 8) & x"00"; - if mbits = 0 then - v.state := RADIX_FINISH; - v.invalid := '1'; - else - v.state := SEGMENT_CHECK; - end if; + v.state := SEGMENT_CHECK; end if; if d_in.err = '1' then v.state := RADIX_FINISH; @@ -767,39 +1451,22 @@ begin end if; when SEGMENT_CHECK => - mbits := '0' & r.mask_size; + mbits := unsigned('0' & pgtbl(4 downto 0)); + v.mask_size := unsigned(pgtbl(4 downto 0)); + v.pgbase := pgtbl(55 downto 8) & x"00"; v.shift := r.shift + (31 - 12) - mbits; nonzero := or(r.addr(61 downto 31) and not finalmask(30 downto 0)); - if r.addr(63) /= r.addr(62) or nonzero = '1' then + if mbits = 0 then + -- Use RPDS = 0 to disable radix tree walks + v.state := RADIX_FINISH; + v.invalid := '1'; + elsif r.addr(63) /= r.addr(62) or nonzero = '1' then v.state := RADIX_FINISH; v.segerror := '1'; elsif mbits < 5 or mbits > 16 or mbits > (r.shift + (31 - 12)) then v.state := RADIX_FINISH; v.badtree := '1'; - elsif tr.miss = '1' then - v.state := RADIX_LOOKUP; else - v.state := TLBWAIT; - end if; - - when TLBWAIT => - v.pde := tlb_rdreg; - if tr.hit = '1' then - -- PTE from the TLB entry is in tlb_rdreg - -- Check permissions; if the access is not permitted, - -- reread the PTE from memory to verify, because increasing - -- permission on a PTE doesn't require tlbie. - -- Note that R must be set in the PTE, otherwise it - -- wouldn't have been written to the TLB. 
- perm_ok := check_perm(tlb_rdreg, r.priv, r.iside, r.store); - rc_ok := tlb_rdreg(7) or not r.store; - if perm_ok = '1' and rc_ok = '1' then - v.shift := to_unsigned(0, 6); - v.state := RADIX_LOAD_TLB; - else - v.state := RADIX_LOOKUP; - end if; - elsif tr.miss = '1' then v.state := RADIX_LOOKUP; end if; @@ -815,7 +1482,7 @@ begin -- test leaf bit if data(62) = '1' then -- check permissions and RC bits - perm_ok := check_perm(data, r.priv, r.iside, r.store); + perm_ok := check_perm_c(data, r.priv, r.iside, r.store, '1'); rc_ok := data(8) and (data(7) or not r.store); if perm_ok = '1' and rc_ok = '1' then v.state := RADIX_LOAD_TLB; @@ -824,6 +1491,11 @@ begin if r.shift = 0 then v.wr_tlbram := '1'; end if; + -- 2M PTEs can be cached in the PWC + if r.shift = 9 then + v.pwc_level := "00"; + v.wr_pwcram := '1'; + end if; else v.state := RADIX_FINISH; v.perm_err := not perm_ok; @@ -836,10 +1508,17 @@ begin v.state := RADIX_FINISH; v.badtree := '1'; else - v.shift := v.shift - mbits; + v.shift := r.shift - mbits; v.mask_size := mbits(4 downto 0); v.pgbase := data(55 downto 8) & x"00"; v.state := RADIX_LOOKUP; + -- Write entry to PWC if it is one of the supported sizes + -- i.e. 2M, 1G or 512G + if (r.shift = 9 or r.shift = 18 or r.shift = 27) and + mbits = 9 and r.rereadpte = '0' then + v.wr_pwcram := '1'; + v.pwc_level := std_ulogic_vector(r.shift(4 downto 3) - 1); + end if; end if; end if; else diff --git a/tests/mmu/mmu.c b/tests/mmu/mmu.c index 6301068..80477df 100644 --- a/tests/mmu/mmu.c +++ b/tests/mmu/mmu.c @@ -115,6 +115,7 @@ void zero_memory(void *ptr, unsigned long nbytes) * 8kB PGD level pointing to 4kB PTE pages. 
*/ unsigned long *pgdir = (unsigned long *) 0x10000; +unsigned long *pmdir = (unsigned long *) 0x11000; unsigned long *proc_tbl = (unsigned long *) 0x12000; unsigned long *part_tbl = (unsigned long *) 0x13000; unsigned long free_ptr = 0x14000; @@ -129,17 +130,20 @@ void init_mmu(void) zero_memory(proc_tbl, 512 * sizeof(unsigned long)); mtspr(PTCR, (unsigned long)part_tbl); mtspr(PID, 1); - zero_memory(pgdir, 1024 * sizeof(unsigned long)); - /* RTS = 0 (2GB address space), RPDS = 10 (1024-entry top level) */ - store_pte(&proc_tbl[2 * 1], (unsigned long) pgdir | 10); + zero_memory(pgdir, 512 * sizeof(unsigned long)); + store_pte(&pgdir[0], 0x8000000000000000ul | (unsigned long) pmdir | 9); + zero_memory(pmdir, 512 * sizeof(unsigned long)); + /* RTS = 8 (512GB address space), RPDS = 9 (512-entry top level) */ + /* we only use the first 1GB of the space */ + store_pte(&proc_tbl[2 * 1], (unsigned long) pgdir | 0xa000000000000009ul); do_tlbie(0xc00, 0); /* invalidate all TLB entries */ } -static unsigned long *read_pgd(unsigned long i) +static unsigned long *read_pmd(unsigned long i) { unsigned long ret; - __asm__ volatile("ldbrx %0,%1,%2" : "=r" (ret) : "b" (pgdir), + __asm__ volatile("ldbrx %0,%1,%2" : "=r" (ret) : "b" (pmdir), "r" (i * sizeof(unsigned long))); return (unsigned long *) (ret & 0x00ffffffffffff00); } @@ -150,14 +154,14 @@ void map(void *ea, void *pa, unsigned long perm_attr) unsigned long i, j; unsigned long *ptep; - i = (epn >> 9) & 0x3ff; + i = (epn >> 9) & 0x1ff; j = epn & 0x1ff; - if (pgdir[i] == 0) { + if (pmdir[i] == 0) { zero_memory((void *)free_ptr, 512 * sizeof(unsigned long)); - store_pte(&pgdir[i], 0x8000000000000000 | free_ptr | 9); + store_pte(&pmdir[i], 0x8000000000000000 | free_ptr | 9); free_ptr += 512 * sizeof(unsigned long); } - ptep = read_pgd(i); + ptep = read_pmd(i); store_pte(&ptep[j], 0xc000000000000000 | ((unsigned long)pa & 0x00fffffffffff000) | perm_attr); eas_mapped[neas_mapped++] = ea; } @@ -168,11 +172,11 @@ void unmap(void 
*ea) unsigned long i, j; unsigned long *ptep; - i = (epn >> 9) & 0x3ff; + i = (epn >> 9) & 0x1ff; j = epn & 0x1ff; - if (pgdir[i] == 0) + if (pmdir[i] == 0) return; - ptep = read_pgd(i); + ptep = read_pmd(i); ptep[j] = 0; do_tlbie(((unsigned long)ea & ~0xfff), 1ul << 32); } diff --git a/tests/test_mmu.bin b/tests/test_mmu.bin index 369ca8bb8630e55ba38706e358347866bd8d7cda..a91fef91e48563b360f14f7c6aca3e243965ce22 100755 GIT binary patch delta 991 zcmXYve@K%-7{~8kO=qO8AJ)p$&5{&Tm!{ox)BP}~bE70nGtA|T3bbD)INREk_aB2r z@#KGH{Us(uKa40!FqDeGLMSjOC?wi4$p}FyyPh}i;NH8>_xav)&)vN@N9o`wy&R-1 z_qxC8V>Ty~$~X<2+;Jf3-Ud=h2yy@6Y1^lGk6BN2Y!Ke>D3MgHus%OE*CGsi=K?-G z@vsN69$${oiiw-)#9UdZ-2~0~)79!A(*~Qv8Jt%usn-&-L0WKlHNo^q=e{hzHfW7x z?$`P?=ppD``pa=idPLD(gczi-%Bv<>hEAKq5AFh^#5$nAAeVLwPZUH^+6R+6>!=B` z3T@KjD>zwrpZY;pw1?)vnWAcX3r327CPB?EPF+yLUf_45p^0XI*YHM4c@AC1Jn$C3 zp(#*R(j?7~!%)dFxM;S*OOsxTAA?I}*^p7X3-dRY zw$nnGEPX<~5GwQFUO&r@;ryUEk5)mi`3CKPEK3vWy_Qon9u_TjY0HnW!25YiE64p_x%Thi}yR zdpd9wwFfv?yn58*|9iW9N*+aHs4Yey)t-+Bt+6KuuGPxyIzAldJ;o^f4IVYiEF?G@ z`O|)x`30{)*4N3bO>hkI9CD-JSmY1&GII%5A*&i>W)d8Syy}q5as;m=Ziz9NwWkG) zjWSzWS}r3VZErm?n-!dZ{2k*T1t%hpAC}pKU^Q}Ulgvg0uSOnik=bp*Ymn>Rcuc`- zk>`*@g4Zc!goi_wzoO_~=ys;*rlvW+ijP7Z3EI_`3$+Qv7Gpu&ioG7Y6P`J@(N>5! zv*}$(a&60r{=#_>BT7?TVeF?-8&`z#I1A~QtT$*yf*#Wy89w1Uz6PjUlc=Uj9NY5v E52s0pH~;_u delta 939 zcmXYuTS!z<6o&UYlR8S-XcsjxFC{jVIeBZVGdeR~Q>P1LUchFWDJs&6>OpO6&d7PF z2f7lY$l>);ett6utq3pDd?gRUr_5JJL>#V)ky;0~Hg$@}q z2mN17!9TXJvZ*LMyXVLze-tZYjJf`*@w+C~Hmivx@+O?xlP67{!OiJgcSi`r!9}0f z#5{aB#N%Dwyt*lux%e*;Ya171(49UTn;`RO**%x6sucz~X+B!E&l{LACybRJ!;TEI zv~&q4GIm4n&~kqT<8XB=St57n&!LF=FFo1`$4?sQM$*zDYT*yW!M*m(FTG7v6 z(|ccz6RNO3=RG*EIX6v1v{}Se=pT<&%X#K5DK0_KL!ds^shwgNs6yo)1VngtliLywKgZ|Berv3L8HA? 
zJHiF~W9URr$v1qIMu)TPxcz6|r&TKp5nu(4!Y*320X*Y3!`U~W+z z`@W=p`rfD0d*6OjE1F_`USJ71+8|WbZJr~qEGJzb#PO1}x<#T^a`j0Uy#e}ajwRAg zuV@#%{X#P|c=#o{^QJ$uzMG^CQ>=Lz>6R65i&v|oX?oI(-*A6vIvvzs8tZE~tnki% zTC&Zoqp@CfE4*Fs8sdjl3U3u0M9iuc-Y7Vjc$Bzaa0v1J8ihLq>xn086>bq6O8lWt z;i-bdm`mb%j4O-xt*ux1^73jM>&SN3qwod6;l$4=|55OI;u}X5{!Xxg*z8nzKyUG_5QZv7(s!SQ}(Oh%1dPrNTQWR=qU6Q>UO2SQ4u4?F( S<5VP`2NQxTI)s6?iTwxQ#(_Zq