diff --git a/mmu.vhdl b/mmu.vhdl index 91429f9..e58f809 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -34,6 +34,7 @@ architecture behave of mmu is PROC_TBL_READ, PROC_TBL_WAIT, SEGMENT_CHECK, + TLBWAIT, RADIX_LOOKUP, RADIX_READ_WAIT, RADIX_LOAD_TLB, @@ -50,7 +51,7 @@ architecture behave of mmu is inval_all : std_ulogic; -- config SPRs ptcr : std_ulogic_vector(63 downto 0); - pid : std_ulogic_vector(31 downto 0); + pid : std_ulogic_vector(11 downto 0); -- internal state state : state_t; done : std_ulogic; @@ -70,6 +71,9 @@ architecture behave of mmu is segerror : std_ulogic; perm_err : std_ulogic; rc_error : std_ulogic; + wr_tlbram : std_ulogic; + tlbie_req : std_ulogic; + is_mtspr : std_ulogic; end record; signal r, rin : reg_stage_t; @@ -78,10 +82,406 @@ architecture behave of mmu is signal mask : std_ulogic_vector(15 downto 0); signal finalmask : std_ulogic_vector(43 downto 0); + -- Small page (4k) TLB, 256 entries, 4-way set associative. + -- This is implemented using a 512 x 64 bit RAM, divided + -- into 64 blocks of 8 words, each block containing a set of + -- 4 entries. + -- In each block, word 0 contains a valid bit, 12-bit PID, + -- and 2 bits of address tag (plus 1 unused bit) for each of the 4 entries. + -- (This allows us to do invalidate-all or invalidate-by-PID + -- in 64 cycles instead of 256.) + -- Word 1 contains 32 bits of address tag for entries 0 and 1, + -- and word 2 contains the same for entries 2 and 3. + -- Words 4 to 7 contain the PTE value for entries 0 to 3. + -- Word 3 is currently unused. + -- EAs are expected to be in a 4PB (52-bit) space per PID + -- (ignoring the quadrant bits); anything outside that + -- doesn't get cached. 
+ constant TLB_WIDTH : natural := 64; + constant TLB_DEPTH : natural := 256; + constant TLB_HASH_BITS : natural := 6; + constant TLB_ADDR_BITS : natural := TLB_HASH_BITS + 3; + subtype tlb_word_t is std_ulogic_vector(TLB_WIDTH - 1 downto 0); + type tlb_t is array(0 to 2 * TLB_DEPTH - 1) of tlb_word_t; + signal tlb : tlb_t; + subtype tlb_index_t is integer range 0 to 2**TLB_HASH_BITS - 1; + + signal tlb_doread : std_ulogic; + signal tlb_rdren : std_ulogic; + signal tlb_rdaddr : std_ulogic_vector(TLB_ADDR_BITS - 1 downto 0); + signal tlb_rddata : std_ulogic_vector(TLB_WIDTH - 1 downto 0); + signal tlb_rdreg : std_ulogic_vector(TLB_WIDTH - 1 downto 0); + signal tlb_wren : std_ulogic_vector(3 downto 0); + signal tlb_wraddr : std_ulogic_vector(TLB_ADDR_BITS - 1 downto 0); + signal tlb_wrdata : std_ulogic_vector(TLB_WIDTH - 1 downto 0); + + type tlb_state_t is (IDLE, + SEARCH1, SEARCH2, SEARCH3, SEARCH4, + RDPTE, + WAITW, WRPTE1, WRPTE2, + INVAL1, INVAL2); + type mmu_tlb_reg_t is record + state : tlb_state_t; + addr : std_ulogic_vector(39 downto 0); + bad_ea : std_ulogic; + pid : std_ulogic_vector(11 downto 0); + hash_4k : std_ulogic_vector(TLB_HASH_BITS - 1 downto 0); + is_tlbie : std_ulogic; + may_hit : std_ulogic_vector(3 downto 0); + hit : std_ulogic; + miss : std_ulogic; + hit_way : std_ulogic_vector(1 downto 0); + repl_way : std_ulogic_vector(1 downto 0); + update_plru : std_ulogic; + tlbie_done : std_ulogic; + inval_all : std_ulogic; + wr_hash : std_ulogic_vector(TLB_HASH_BITS - 1 downto 0); + end record; + constant mmu_tlb_reg_init : mmu_tlb_reg_t := ( + state => IDLE, addr => 40x"0", pid => 12x"0", + hash_4k => (others => '0'), wr_hash => (others => '0'), + may_hit => "0000", hit_way => "00", repl_way => "00", + others => '0'); + signal tr, trin : mmu_tlb_reg_t; + + -- TLB PLRU array: 3 bits of PLRU state for each of the 2**TLB_HASH_BITS sets + type tlb_plru_array is array(tlb_index_t) of std_ulogic_vector(2 downto 0); + signal tlb_plru_ram : tlb_plru_array; + signal tlb_plru_cur : std_ulogic_vector(2 downto 0); + signal 
tlb_plru_upd : std_ulogic_vector(2 downto 0); + signal tlb_plru_victim : std_ulogic_vector(1 downto 0); + + function addr_hash_4k(ea: std_ulogic_vector(63 downto 0); + pid: std_ulogic_vector(11 downto 0)) return std_ulogic_vector is + variable h : std_ulogic_vector(TLB_HASH_BITS - 1 downto 0); + begin + -- Make this a bit different to the hashes used in the dcache and icache + h := ea(17 downto 12) xor ea(23 downto 18) xor ea(51 downto 46) xor + pid(5 downto 0); + return h; + end; + + function find_first_zero(x: std_ulogic_vector(3 downto 0)) return std_ulogic_vector is + begin + for i in 0 to 2 loop + if x(i) = '0' then + return std_ulogic_vector(to_unsigned(i, 2)); + end if; + end loop; + return "11"; + end; + + function check_perm(pte: std_ulogic_vector(63 downto 0); priv: std_ulogic; + iside: std_ulogic; store: std_ulogic) return std_ulogic is + variable ok: std_ulogic; + begin + ok := '0'; + if priv = '1' or pte(3) = '0' then + if iside = '0' then + ok := pte(1) or (pte(2) and not store); + else + -- no IAMR, so no KUEP support for now + -- deny execute permission if cache inhibited + ok := pte(0) and not pte(5); + end if; + end if; + return ok; + end; + begin + -- Synchronous reads and writes to TLB array; each tlb_wren bit enables writing one 16-bit lane of the addressed word + mmu_tlb_ram: process(clk) + begin + if rising_edge(clk) then + if tlb_rdren = '1' then + tlb_rdreg <= tlb_rddata; + end if; + if tlb_doread = '1' then + tlb_rddata <= tlb(to_integer(unsigned(tlb_rdaddr))); + end if; + if tlb_wren /= "0000" then + for i in 0 to 3 loop + if tlb_wren(i) = '1' then + tlb(to_integer(unsigned(tlb_wraddr)))(i*16 + 15 downto i*16) <= + tlb_wrdata(i*16 + 15 downto i*16); + end if; + end loop; + end if; + end if; + end process; + + -- TLB PLRU + tlb_plru : entity work.plrufn + generic map ( + BITS => 2 + ) + port map ( + acc => tr.hit_way, + tree_in => tlb_plru_cur, + tree_out => tlb_plru_upd, + lru => tlb_plru_victim + ); + + process(all) + begin + if is_X(tr.hash_4k) then + tlb_plru_cur <= (others => 'X'); + else + tlb_plru_cur <= 
tlb_plru_ram(to_integer(unsigned(tr.hash_4k))); + end if; + end process; + + process(clk) + begin + if rising_edge(clk) then + if tr.update_plru = '1' then + assert not is_X(tr.hash_4k) severity failure; + tlb_plru_ram(to_integer(unsigned(tr.hash_4k))) <= tlb_plru_upd; + end if; + end if; + end process; + + -- State machine for doing TLB searches, updates and invalidations + mmu_tlb_0: process(clk) + begin + if rising_edge(clk) then + if rst = '1' then + tr <= mmu_tlb_reg_init; + else + tr <= trin; + end if; + end if; + end process; + + mmu_tlb_1: process(all) + variable tv : mmu_tlb_reg_t; + variable isf : std_ulogic_vector(1 downto 0); + variable is_hit : std_ulogic; + variable valids : std_ulogic_vector(3 downto 0); + variable idx : std_ulogic_vector(2 downto 0); + variable wdat : std_ulogic_vector(15 downto 0); + begin + tv := tr; + tlb_doread <= '0'; + tlb_rdren <= '0'; + tlb_wren <= "0000"; + tlb_wrdata <= (others => '0'); + is_hit := '0'; + idx := "000"; + tv.update_plru := '0'; + case tr.state is + when IDLE => + tv.addr := l_in.addr(51 downto 12); + tv.pid := (others => '0'); + if l_in.tlbie = '1' then + -- PID for tlbie comes from RS (bits 43 downto 32) + tv.pid := l_in.rs(43 downto 32); + elsif l_in.addr(63) = '0' then + -- we currently only implement quadrants 0 and 3 + tv.pid := r.pid; + end if; + tv.bad_ea := (or (l_in.addr(61 downto 52)) or (l_in.addr(63) xor l_in.addr(62))) + and not l_in.tlbie; + tv.hash_4k := addr_hash_4k(l_in.addr, tv.pid); + tv.wr_hash := tv.hash_4k; + tv.is_tlbie := l_in.tlbie; + if l_in.valid = '1' then + tv.hit := '0'; + tv.miss := '0'; + tv.tlbie_done := '0'; + tv.inval_all := '0'; + if l_in.tlbie = '1' then + -- decode what type of tlbie this is (isf = address bits 11 downto 10) + isf := l_in.addr(11 downto 10); + if l_in.slbia = '1' or l_in.ric(0) = '1' then + -- no effect on this TLB (flushes L1 TLBs below) + tv.tlbie_done := '1'; + elsif isf(1) = '1' then + -- invalidate all + tv.inval_all := '1'; + tv.wr_hash := (others => '0'); + tv.state := INVAL2; + elsif isf(0) = '1' 
then + -- invalidate PID: scan every set, starting by reading word 0 of set 0 + tv.hash_4k := (others => '0'); + tlb_doread <= '1'; + tv.state := INVAL1; + else + -- invalidate single page + tlb_doread <= '1'; + tv.state := SEARCH1; + end if; + else + tlb_doread <= '1'; + tv.state := SEARCH1; + end if; + end if; + when SEARCH1 => + -- next read word 1 of group + idx := "001"; + tlb_doread <= '1'; + tlb_rdren <= '1'; + if tr.bad_ea = '0' then + tv.state := SEARCH2; + else + tv.miss := '1'; + tv.tlbie_done := tr.is_tlbie; + tv.state := IDLE; + end if; + when SEARCH2 => + -- tlb_rdreg contains word 0, check for hits/misses + valids := "0000"; + tv.may_hit := "0000"; + for i in 0 to 3 loop + valids(i) := tlb_rdreg(i*16 + 15); + if tlb_rdreg(i*16 + 15) = '1' and + tlb_rdreg(i*16 + 11 downto i*16) = tr.pid and + tlb_rdreg(i*16 + 13 downto i*16 + 12) = tr.addr(7 downto 6) then + tv.may_hit(i) := '1'; + end if; + end loop; + -- work out which way to replace in case of a miss (the first invalid way if there is one, otherwise the PLRU victim) + if valids = "1111" then + tv.repl_way := tlb_plru_victim; + else + tv.repl_way := find_first_zero(valids); + end if; + -- next read word 2 of group + idx := "010"; + if tv.may_hit = "0000" then + tv.miss := '1'; + if tr.is_tlbie = '0' then + tv.state := WAITW; + else + tv.tlbie_done := '1'; + tv.state := IDLE; + end if; + else + tlb_doread <= '1'; + tlb_rdren <= '1'; + tv.state := SEARCH3; + end if; + when SEARCH3 => + -- tlb_rdreg contains word 1 + for i in 0 to 1 loop + if tr.may_hit(i) = '1' then + if tlb_rdreg(i*32 + 31 downto i*32) /= tr.addr(39 downto 8) then + tv.may_hit(i) := '0'; + end if; + end if; + end loop; + if tv.may_hit(0) = '1' then + tv.hit_way := "00"; + is_hit := '1'; + elsif tv.may_hit(1) = '1' then + tv.hit_way := "01"; + is_hit := '1'; + end if; + if tr.is_tlbie = '1' then + tlb_rdren <= '1'; + tv.state := SEARCH4; + elsif is_hit = '1' then + tv.state := RDPTE; + idx := '1' & tv.hit_way; + tlb_doread <= '1'; + elsif tv.may_hit = "0000" then + tv.miss := '1'; + tv.state := WAITW; + else + tlb_rdren <= '1'; + tv.state := SEARCH4; + 
end if; + when SEARCH4 => + -- tlb_rdreg contains word 2 + for i in 0 to 1 loop + if tr.may_hit(i+2) = '1' then + if tlb_rdreg(i*32 + 31 downto i*32) /= tr.addr(39 downto 8) then + tv.may_hit(i+2) := '0'; + end if; + end if; + end loop; + if tr.is_tlbie = '1' then + -- write zeroes to word 0 where hit(s) detected + tlb_wren <= tv.may_hit; + tv.tlbie_done := '1'; + tv.state := IDLE; + elsif tv.may_hit = "0000" then + tv.miss := '1'; + tv.state := WAITW; + else + tv.hit_way := '1' & not tv.may_hit(2); + idx := '1' & tv.hit_way; + tlb_doread <= '1'; + tv.state := RDPTE; + end if; + when RDPTE => + tv.repl_way := tr.hit_way; + tlb_rdren <= '1'; + tv.hit := '1'; + tv.update_plru := '1'; + tv.state := WAITW; + when WAITW => + wdat := "10" & tr.addr(7 downto 6) & tr.pid; + tlb_wrdata <= wdat & wdat & wdat & wdat; + if r.wr_tlbram = '1' then + -- write one 16b section of word 0 (wdat: valid bit, tr.addr(7 downto 6), PID) + tlb_wren(to_integer(unsigned(tr.repl_way))) <= '1'; + tv.hit_way := tv.repl_way; + tv.update_plru := '1'; + tv.state := WRPTE1; + elsif r.done = '1' or r.err = '1' then + tv.state := IDLE; + end if; + when WRPTE1 => + tlb_wrdata <= tr.addr(39 downto 8) & tr.addr(39 downto 8); + if tr.repl_way(0) = '1' then + tlb_wren <= "1100"; + else + tlb_wren <= "0011"; + end if; + idx := '0' & tr.repl_way(1) & not tr.repl_way(1); + tv.state := WRPTE2; + when WRPTE2 => + tlb_wrdata <= r.pde; + tlb_wren <= "1111"; + idx := '1' & tr.repl_way; + tv.state := IDLE; + when INVAL1 => + tv.hash_4k := 6x"01"; + tv.wr_hash := (others => '0'); + tlb_doread <= '1'; + tlb_rdren <= '1'; + tv.state := INVAL2; + when INVAL2 => + if tr.inval_all = '1' then + tlb_wren <= "1111"; + else + valids := "0000"; + for i in 0 to 3 loop + if tlb_rdreg(i*16 + 15) = '1' and + tlb_rdreg(i*16 + 11 downto i*16) = tr.pid then + valids(i) := '1'; + end if; + end loop; + tlb_wren <= valids; + tlb_doread <= '1'; + tlb_rdren <= '1'; + end if; + tv.wr_hash := std_ulogic_vector(unsigned(tr.wr_hash) + 1); + tv.hash_4k := 
std_ulogic_vector(unsigned(tv.hash_4k) + 1); + if tr.wr_hash = 6x"3f" then + tv.tlbie_done := '1'; + tv.state := IDLE; + end if; + end case; + tlb_rdaddr <= tv.hash_4k & idx; + tlb_wraddr <= tr.wr_hash & idx; + trin <= tv; + end process; + -- Multiplex internal SPR values back to loadstore1, selected -- by l_in.sprnf. - l_out.sprval <= r.ptcr when l_in.sprnf = '1' else x"00000000" & r.pid; + l_out.sprval <= r.ptcr when l_in.sprnf = '1' else x"0000000000000" & r.pid; mmu_0: process(clk) begin @@ -94,6 +494,7 @@ begin r.pt3_valid <= '0'; r.ptcr <= (others => '0'); r.pid <= (others => '0'); + r.wr_tlbram <= '0'; else if rin.valid = '1' then report "MMU got tlb miss for " & to_hstring(rin.addr); @@ -194,12 +595,12 @@ begin variable v : reg_stage_t; variable dcreq : std_ulogic; variable tlb_load : std_ulogic; - variable tlbie_req : std_ulogic; variable ptbl_rd : std_ulogic; variable prtbl_rd : std_ulogic; variable pt_valid : std_ulogic; - variable effpid : std_ulogic_vector(31 downto 0); + variable effpid : std_ulogic_vector(11 downto 0); variable prtable_addr : std_ulogic_vector(63 downto 0); + variable six : std_ulogic_vector(5 downto 0); variable rts : unsigned(5 downto 0); variable mbits : unsigned(5 downto 0); variable pgtable_addr : std_ulogic_vector(63 downto 0); @@ -223,7 +624,7 @@ begin v.perm_err := '0'; v.rc_error := '0'; tlb_load := '0'; - tlbie_req := '0'; + v.tlbie_req := '0'; v.inval_all := '0'; ptbl_rd := '0'; prtbl_rd := '0'; @@ -244,7 +645,8 @@ begin pt_valid := r.pt3_valid; end if; -- rts == radix tree size, # address bits being translated - rts := unsigned('0' & pgtbl(62 downto 61) & pgtbl(7 downto 5)); + six := '0' & pgtbl(62 downto 61) & pgtbl(7 downto 5); + rts := unsigned(six); -- mbits == # address bits to index top level of tree mbits := unsigned('0' & pgtbl(4 downto 0)); -- set v.shift to rts so that we can use finalmask for the segment check @@ -268,6 +670,7 @@ begin v.pt3_valid := '0'; v.ptb_valid := '0'; end if; + v.tlbie_req := '1'; 
v.state := DO_TLBIE; else v.valid := '1'; @@ -289,12 +692,13 @@ begin end if; end if; end if; + v.is_mtspr := l_in.mtspr; if l_in.mtspr = '1' then -- Move to PID needs to invalidate L1 TLBs and cached -- pgtbl0 value. Move to PTCR does that plus -- invalidating the cached pgtbl3 and prtbl values as well. if l_in.sprnt = '0' then - v.pid := l_in.rs(31 downto 0); + v.pid := l_in.rs(11 downto 0); else v.ptcr := l_in.rs; v.pt3_valid := '0'; @@ -302,12 +706,14 @@ begin end if; v.pt0_valid := '0'; v.inval_all := '1'; + v.tlbie_req := '1'; v.state := DO_TLBIE; end if; when DO_TLBIE => - tlbie_req := '1'; - v.state := RADIX_FINISH; + if r.is_mtspr = '1' or tr.tlbie_done = '1' then + v.state := RADIX_FINISH; + end if; when PART_TBL_READ => dcreq := '1'; @@ -340,7 +746,8 @@ begin v.pt0_valid := '1'; end if; -- rts == radix tree size, # address bits being translated - rts := unsigned('0' & data(62 downto 61) & data(7 downto 5)); + six := '0' & data(62 downto 61) & data(7 downto 5); + rts := unsigned(six); -- mbits == # address bits to index top level of tree mbits := unsigned('0' & data(4 downto 0)); -- set v.shift to rts so that we can use finalmask for the segment check @@ -369,7 +776,30 @@ begin elsif mbits < 5 or mbits > 16 or mbits > (r.shift + (31 - 12)) then v.state := RADIX_FINISH; v.badtree := '1'; + elsif tr.miss = '1' then + v.state := RADIX_LOOKUP; else + v.state := TLBWAIT; + end if; + + when TLBWAIT => + v.pde := tlb_rdreg; + if tr.hit = '1' then + -- PTE from the TLB entry is in tlb_rdreg + -- Check permissions; if the access is not permitted, + -- reread the PTE from memory to verify, because increasing + -- permission on a PTE doesn't require tlbie. + -- Note that R must be set in the PTE, otherwise it + -- wouldn't have been written to the TLB. 
+ perm_ok := check_perm(tlb_rdreg, r.priv, r.iside, r.store); + rc_ok := tlb_rdreg(7) or not r.store; + if perm_ok = '1' and rc_ok = '1' then + v.shift := to_unsigned(0, 6); + v.state := RADIX_LOAD_TLB; + else + v.state := RADIX_LOOKUP; + end if; + elsif tr.miss = '1' then v.state := RADIX_LOOKUP; end if; @@ -385,19 +815,15 @@ begin -- test leaf bit if data(62) = '1' then -- check permissions and RC bits - perm_ok := '0'; - if r.priv = '1' or data(3) = '0' then - if r.iside = '0' then - perm_ok := data(1) or (data(2) and not r.store); - else - -- no IAMR, so no KUEP support for now - -- deny execute permission if cache inhibited - perm_ok := data(0) and not data(5); - end if; - end if; + perm_ok := check_perm(data, r.priv, r.iside, r.store); rc_ok := data(8) and (data(7) or not r.store); if perm_ok = '1' and rc_ok = '1' then v.state := RADIX_LOAD_TLB; + -- only cache 4k PTEs in our TLB, and only if the + -- address is within the standard 52 bit EA space (the EA range is checked via tr.bad_ea in the TLB state machine) + if r.shift = 0 then + v.wr_tlbram := '1'; + end if; else v.state := RADIX_FINISH; v.perm_err := not perm_ok; @@ -432,6 +858,7 @@ begin v.state := RADIX_FINISH; when RADIX_FINISH => + v.wr_tlbram := '0'; v.state := IDLE; end case; @@ -442,13 +869,13 @@ begin end if; if r.addr(63) = '1' then - effpid := x"00000000"; + effpid := (others => '0'); else effpid := r.pid; end if; - prtable_addr := x"00" & r.prtbl(55 downto 36) & - ((r.prtbl(35 downto 12) and not finalmask(23 downto 0)) or - (effpid(31 downto 8) and finalmask(23 downto 0))) & + prtable_addr := x"00" & r.prtbl(55 downto 16) & + ((r.prtbl(15 downto 12) and not finalmask(3 downto 0)) or + (effpid(11 downto 8) and finalmask(3 downto 0))) & effpid(7 downto 0) & "0000"; pgtable_addr := x"00" & r.pgbase(55 downto 19) & @@ -462,7 +889,7 @@ begin rin <= v; -- drive outputs - if tlbie_req = '1' then + if r.tlbie_req = '1' then addr := r.addr; tlb_data := (others => '0'); elsif tlb_load = '1' then @@ -488,14 +915,14 @@ begin l_out.rc_error <= r.rc_error; d_out.valid 
<= dcreq; - d_out.tlbie <= tlbie_req; + d_out.tlbie <= r.tlbie_req; d_out.doall <= r.inval_all; d_out.tlbld <= tlb_load and not r.iside; d_out.addr <= addr; d_out.pte <= tlb_data; i_out.tlbld <= tlb_load and r.iside; - i_out.tlbie <= tlbie_req; + i_out.tlbie <= r.tlbie_req; i_out.doall <= r.inval_all; i_out.addr <= addr; i_out.pte <= tlb_data; diff --git a/tests/mmu/mmu.c b/tests/mmu/mmu.c index 749c121..6301068 100644 --- a/tests/mmu/mmu.c +++ b/tests/mmu/mmu.c @@ -17,7 +17,7 @@ extern int test_exec(int testno, unsigned long pc, unsigned long msr); static inline void do_tlbie(unsigned long rb, unsigned long rs) { - __asm__ volatile("tlbie %0,%1" : : "r" (rb), "r" (rs) : "memory"); + __asm__ volatile(".machine \"power10\"; tlbie %0,%1,0,1,1" : : "r" (rb), "r" (rs) : "memory"); } #define DSISR 18 @@ -174,7 +174,7 @@ void unmap(void *ea) return; ptep = read_pgd(i); ptep[j] = 0; - do_tlbie(((unsigned long)ea & ~0xfff), 0); + do_tlbie(((unsigned long)ea & ~0xfff), 1ul << 32); } void unmap_all(void) diff --git a/tests/test_mmu.bin b/tests/test_mmu.bin index 1c4f876..369ca8b 100755 Binary files a/tests/test_mmu.bin and b/tests/test_mmu.bin differ