From 4e6fc6811a17fbc524f0fe632d2d6e5adb268420 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 23 Apr 2020 15:28:22 +1000 Subject: [PATCH] MMU: Implement radix page table machinery This adds the necessary machinery to the MMU for it to do radix page table walks. The core elements are a shifter that can shift the address right by between 0 and 47 bits, a mask generator that can generate a mask of between 5 and 16 bits, a final mask generator, and new states in the state machine. (The final mask generator is used for transferring bits of the original address into the resulting TLB entry when the leaf PTE corresponds to a page size larger than 4kB.) The hardware does not implement a partition table or a process table. Software is expected to load the appropriate process table entry into a new SPR called PGTBL0, SPR 720. The contents should be formatted as described in Book III section 5.7.6.2 of the Power ISA v3.0B. PGTBL0 is set to 0 on hard reset. At present, the top two bits of the address (the quadrant) are ignored. There is currently no caching of any step in the translation process or of the final result, other than the entry created in the dTLB. That entry is a 4kB page entry even if the leaf PTE found in the walk corresponds to a larger page size. This implementation can handle almost any page table layout and any page size. The RTS field (in PGTBL0) can have any value between 0 and 31, corresponding to a total address space size between 2^31 and 2^62 bytes. The RPDS field of PGTBL0 can be any value between 5 and 16; additionally, a value of 0 is taken to disable radix page table walking (for use when one is using software loading of TLB entries). The NLS field of the page directory entries can have any value between 5 and 16. The minimum page size is 4kB, meaning that the sum of RPDS and the NLS values of the PDEs found on the path to a leaf PTE must be less than or equal to RTS + 31 - 12. 
The PGTBL0 SPR is in the mmu module; thus this adds a path for loadstore1 to read and write SPRs in mmu. This adds code in dcache to service doubleword read requests from the MMU, as well as requests to write dTLB entries. Signed-off-by: Paul Mackerras --- common.vhdl | 12 ++- dcache.vhdl | 120 +++++++++++++++++--------- decode1.vhdl | 2 +- loadstore1.vhdl | 34 +++++--- mmu.vhdl | 224 ++++++++++++++++++++++++++++++++++++++++++++---- 5 files changed, 322 insertions(+), 70 deletions(-) diff --git a/common.vhdl b/common.vhdl index 3ee19d7..d617fa4 100644 --- a/common.vhdl +++ b/common.vhdl @@ -39,6 +39,7 @@ package common is constant SPR_SPRG3U : spr_num_t := 259; constant SPR_HSPRG0 : spr_num_t := 304; constant SPR_HSPRG1 : spr_num_t := 305; + constant SPR_PGTBL0 : spr_num_t := 720; -- GPR indices in the register file (GPR only) subtype gpr_index_t is std_ulogic_vector(4 downto 0); @@ -269,18 +270,23 @@ package common is type Loadstore1ToMmuType is record valid : std_ulogic; tlbie : std_ulogic; + mtspr : std_ulogic; + sprn : std_ulogic_vector(3 downto 0); addr : std_ulogic_vector(63 downto 0); rs : std_ulogic_vector(63 downto 0); end record; type MmuToLoadstore1Type is record - done : std_ulogic; - error : std_ulogic; + done : std_ulogic; + invalid : std_ulogic; + badtree : std_ulogic; + sprval : std_ulogic_vector(63 downto 0); end record; type MmuToDcacheType is record valid : std_ulogic; tlbie : std_ulogic; + tlbld : std_ulogic; addr : std_ulogic_vector(63 downto 0); pte : std_ulogic_vector(63 downto 0); end record; @@ -288,6 +294,8 @@ package common is type DcacheToMmuType is record stall : std_ulogic; done : std_ulogic; + err : std_ulogic; + data : std_ulogic_vector(63 downto 0); end record; type Loadstore1ToWritebackType is record diff --git a/dcache.vhdl b/dcache.vhdl index 126df48..96563a5 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -209,6 +209,8 @@ architecture rtl of dcache is type reg_stage_0_t is record req : Loadstore1ToDcacheType; tlbie : std_ulogic; + 
tlbld : std_ulogic; + mmu_req : std_ulogic; -- indicates source of request end record; signal r0 : reg_stage_0_t; @@ -220,6 +222,7 @@ architecture rtl of dcache is type reg_stage_1_t is record -- Latch the complete request from ls1 req : Loadstore1ToDcacheType; + mmu_req : std_ulogic; -- Cache hit state hit_way : way_t; @@ -444,7 +447,7 @@ begin "request collision loadstore vs MMU"; if m_in.valid = '1' then r0.req.valid <= '1'; - r0.req.load <= '0'; + r0.req.load <= not (m_in.tlbie or m_in.tlbld); r0.req.dcbz <= '0'; r0.req.nc <= '0'; r0.req.reserve <= '0'; @@ -454,10 +457,13 @@ begin r0.req.data <= m_in.pte; r0.req.byte_sel <= (others => '1'); r0.tlbie <= m_in.tlbie; - assert m_in.tlbie = '1' report "unknown request from MMU"; + r0.tlbld <= m_in.tlbld; + r0.mmu_req <= '1'; else r0.req <= d_in; r0.tlbie <= '0'; + r0.tlbld <= '0'; + r0.mmu_req <= '0'; end if; end if; end if; @@ -549,7 +555,11 @@ begin end loop; tlb_hit <= hit and r0_valid; tlb_hit_way <= hitway; - pte <= read_tlb_pte(hitway, tlb_pte_way); + if tlb_hit = '1' then + pte <= read_tlb_pte(hitway, tlb_pte_way); + else + pte <= (others => '0'); + end if; valid_ra <= tlb_hit or not r0.req.virt_mode; if r0.req.virt_mode = '1' then ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & @@ -573,7 +583,7 @@ begin if rising_edge(clk) then tlbie := '0'; tlbia := '0'; - tlbwe := '0'; + tlbwe := r0_valid and r0.tlbld; if r0_valid = '1' and r0.tlbie = '1' then if r0.req.addr(11 downto 10) /= "00" then tlbia := '1'; @@ -607,7 +617,6 @@ begin dtlb_ptes(tlb_req_index) <= pteset; dtlb_valids(tlb_req_index)(repl_way) <= '1'; end if; - m_out.done <= r0_valid and r0.tlbie; end if; end process; @@ -669,7 +678,7 @@ begin req_tag <= get_tag(ra); -- Only do anything if not being stalled by stage 1 - go := r0_valid and not r0.tlbie; + go := r0_valid and not (r0.tlbie or r0.tlbld); -- Calculate address of beginning of cache line, will be -- used for cache miss processing if needed @@ -824,6 +833,11 @@ begin d_out.perm_error <= '0'; 
d_out.rc_error <= '0'; + -- Outputs to MMU + m_out.done <= r1.tlbie_done; + m_out.err <= '0'; + m_out.data <= cache_out(r1.hit_way); + -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store -- @@ -842,40 +856,65 @@ begin "unexpected hit_load_delayed collision with slow_valid" severity FAILURE; - -- Load hit case is the standard path - if r1.hit_load_valid = '1' then - report "completing load hit"; - d_out.valid <= '1'; - end if; + if r1.mmu_req = '0' then + -- Request came from loadstore1... + -- Load hit case is the standard path + if r1.hit_load_valid = '1' then + report "completing load hit"; + d_out.valid <= '1'; + end if; - -- error cases complete without stalling - if r1.error_done = '1' then - report "completing ld/st with error"; - d_out.error <= '1'; - d_out.tlb_miss <= r1.tlb_miss; - d_out.perm_error <= r1.perm_error; - d_out.rc_error <= r1.rc_error; - d_out.valid <= '1'; - end if; + -- error cases complete without stalling + if r1.error_done = '1' then + report "completing ld/st with error"; + d_out.error <= '1'; + d_out.tlb_miss <= r1.tlb_miss; + d_out.perm_error <= r1.perm_error; + d_out.rc_error <= r1.rc_error; + d_out.valid <= '1'; + end if; - -- Slow ops (load miss, NC, stores) - if r1.slow_valid = '1' then - -- If it's a load, enable register writeback and switch - -- mux accordingly - -- - if r1.req.load then - -- Read data comes from the slow data latch - d_out.data <= r1.slow_data; - end if; - d_out.store_done <= '1'; + -- Slow ops (load miss, NC, stores) + if r1.slow_valid = '1' then + -- If it's a load, enable register writeback and switch + -- mux accordingly + -- + if r1.req.load then + -- Read data comes from the slow data latch + d_out.data <= r1.slow_data; + end if; + d_out.store_done <= '1'; - report "completing store or load miss"; - d_out.valid <= '1'; - end if; + report "completing store or load miss"; + d_out.valid <= '1'; + end if; + + if r1.stcx_fail = '1' then + 
d_out.store_done <= '0'; + d_out.valid <= '1'; + end if; + + else + -- Request came from MMU + if r1.hit_load_valid = '1' then + report "completing load hit to MMU, data=" & to_hstring(m_out.data); + m_out.done <= '1'; + end if; - if r1.stcx_fail = '1' then - d_out.store_done <= '0'; - d_out.valid <= '1'; + -- error cases complete without stalling + if r1.error_done = '1' then + report "completing MMU ld with error"; + m_out.err <= '1'; + m_out.done <= '1'; + end if; + + -- Slow ops (i.e. load miss) + if r1.slow_valid = '1' then + -- Read data comes from the slow data latch + m_out.data <= r1.slow_data; + report "completing MMU load miss, data=" & to_hstring(m_out.data); + m_out.done <= '1'; + end if; end if; end process; @@ -978,6 +1017,7 @@ begin if req_op /= OP_NONE and stall_out = '0' then r1.req <= r0.req; + r1.mmu_req <= r0.mmu_req; report "op:" & op_t'image(req_op) & " addr:" & to_hstring(r0.req.addr) & " nc:" & std_ulogic'image(r0.req.nc) & @@ -995,8 +1035,8 @@ begin end if; if req_op = OP_BAD then - report "Signalling ld/st error valid_ra=" & " rc_ok=" & std_ulogic'image(rc_ok) & - " perm_ok=" & std_ulogic'image(perm_ok); + report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) & + " rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok); r1.error_done <= '1'; r1.tlb_miss <= not valid_ra; r1.perm_error <= valid_ra and not perm_ok; @@ -1005,8 +1045,8 @@ begin r1.error_done <= '0'; end if; - -- complete tlbies in the third cycle - r1.tlbie_done <= r0_valid and r0.tlbie; + -- complete tlbies and TLB loads in the third cycle + r1.tlbie_done <= r0_valid and (r0.tlbie or r0.tlbld); end if; end process; diff --git a/decode1.vhdl b/decode1.vhdl index fd799fe..b7212c2 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -438,7 +438,7 @@ begin v.decode.sgl_pipe := '1'; -- send MMU-related SPRs to loadstore1 case sprn is - when SPR_DAR | SPR_DSISR => + when SPR_DAR | SPR_DSISR | SPR_PGTBL0 => v.decode.unit := LDST; when others => end 
case; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index d5dd010..03aaa6f 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -153,6 +153,7 @@ begin variable next_addr : std_ulogic_vector(63 downto 0); variable mmureq : std_ulogic; variable dsisr : std_ulogic_vector(31 downto 0); + variable mmu_mtspr : std_ulogic; begin v := r; req := '0'; @@ -161,6 +162,8 @@ begin byte_sel := (others => '0'); addr := lsu_sum; mfspr := '0'; + mmu_mtspr := '0'; + sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10)); sprval := (others => '0'); -- avoid inferred latches exception := '0'; dsisr := (others => '0'); @@ -244,19 +247,27 @@ begin mfspr := '1'; -- partial decode on SPR number should be adequate given -- the restricted set that get sent down this path - sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10)); - if sprn(0) = '0' then - sprval := x"00000000" & r.dsisr; + if sprn(9) = '0' then + if sprn(0) = '0' then + sprval := x"00000000" & r.dsisr; + else + sprval := r.dar; + end if; else - sprval := r.dar; + -- reading one of the SPRs in the MMU + sprval := m_in.sprval; end if; when OP_MTSPR => done := '1'; - sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10)); - if sprn(0) = '0' then - v.dsisr := l_in.data(31 downto 0); + if sprn(9) = '0' then + if sprn(0) = '0' then + v.dsisr := l_in.data(31 downto 0); + else + v.dar := l_in.data; + end if; else - v.dar := l_in.data; + -- writing one of the SPRs in the MMU + mmu_mtspr := '1'; end if; when others => assert false report "unknown op sent to loadstore1"; @@ -361,7 +372,7 @@ begin byte_sel := r.first_bytes; end if; if m_in.done = '1' then - if m_in.error = '0' then + if m_in.invalid = '0' and m_in.badtree = '0' then -- retry the request now that the MMU has installed a TLB entry req := '1'; if r.state = MMU_LOOKUP_1ST then @@ -371,8 +382,9 @@ begin end if; else exception := '1'; - dsisr(63 - 33) := '1'; + dsisr(63 - 33) := m_in.invalid; dsisr(63 - 38) := not r.load; + dsisr(63 - 44) := m_in.badtree; v.state := 
IDLE; end if; end if; @@ -440,6 +452,8 @@ begin -- Update outputs to MMU m_out.valid <= mmureq; m_out.tlbie <= v.tlbie; + m_out.mtspr <= mmu_mtspr; + m_out.sprn <= sprn(3 downto 0); m_out.addr <= addr; m_out.rs <= l_in.data; diff --git a/mmu.vhdl b/mmu.vhdl index 2e6d0fd..fe6ad16 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -25,20 +25,37 @@ end mmu; architecture behave of mmu is type state_t is (IDLE, - TLBIE_WAIT, - RADIX_LOOKUP_0 + TLB_WAIT, + RADIX_LOOKUP, + RADIX_READ_WAIT, + RADIX_LOAD_TLB, + RADIX_NO_TRANS, + RADIX_BAD_TREE ); type reg_stage_t is record -- latched request from loadstore1 valid : std_ulogic; addr : std_ulogic_vector(63 downto 0); + -- internal state state : state_t; + pgtbl0 : std_ulogic_vector(63 downto 0); + shift : unsigned(5 downto 0); + mask_size : unsigned(4 downto 0); + pgbase : std_ulogic_vector(55 downto 0); + pde : std_ulogic_vector(63 downto 0); end record; signal r, rin : reg_stage_t; + signal addrsh : std_ulogic_vector(15 downto 0); + signal mask : std_ulogic_vector(15 downto 0); + signal finalmask : std_ulogic_vector(43 downto 0); + begin + -- Multiplex internal SPR values back to loadstore1, selected + -- by l_in.sprn. Easy when there's only one... 
+ l_out.sprval <= r.pgtbl0; mmu_0: process(clk) begin @@ -46,64 +63,237 @@ begin if rst = '1' then r.state <= IDLE; r.valid <= '0'; + r.pgtbl0 <= (others => '0'); else if rin.valid = '1' then report "MMU got tlb miss for " & to_hstring(rin.addr); end if; if l_out.done = '1' then - report "MMU completing miss with error=" & std_ulogic'image(l_out.error); + report "MMU completing op with invalid=" & std_ulogic'image(l_out.invalid) & + " badtree=" & std_ulogic'image(l_out.badtree); + end if; + if rin.state = RADIX_LOOKUP then + report "radix lookup shift=" & integer'image(to_integer(rin.shift)) & + " msize=" & integer'image(to_integer(rin.mask_size)); + end if; + if r.state = RADIX_LOOKUP then + report "send load addr=" & to_hstring(d_out.addr) & + " addrsh=" & to_hstring(addrsh) & " mask=" & to_hstring(mask); end if; r <= rin; end if; end if; end process; + -- Shift address bits 61--12 right by 0--47 bits and + -- supply the least significant 16 bits of the result. + addrshifter: process(all) + variable sh1 : std_ulogic_vector(30 downto 0); + variable sh2 : std_ulogic_vector(18 downto 0); + variable result : std_ulogic_vector(15 downto 0); + begin + case r.shift(5 downto 4) is + when "00" => + sh1 := r.addr(42 downto 12); + when "01" => + sh1 := r.addr(58 downto 28); + when others => + sh1 := "0000000000000" & r.addr(61 downto 44); + end case; + case r.shift(3 downto 2) is + when "00" => + sh2 := sh1(18 downto 0); + when "01" => + sh2 := sh1(22 downto 4); + when "10" => + sh2 := sh1(26 downto 8); + when others => + sh2 := sh1(30 downto 12); + end case; + case r.shift(1 downto 0) is + when "00" => + result := sh2(15 downto 0); + when "01" => + result := sh2(16 downto 1); + when "10" => + result := sh2(17 downto 2); + when others => + result := sh2(18 downto 3); + end case; + addrsh <= result; + end process; + + -- generate mask for extracting address fields for PTE address generation + addrmaskgen: process(all) + variable m : std_ulogic_vector(15 downto 0); + begin + 
-- mask_size has to be >= 5 + m := x"001f"; + for i in 5 to 15 loop + if i < to_integer(r.mask_size) then + m(i) := '1'; + end if; + end loop; + mask <= m; + end process; + + -- generate mask for extracting address bits to go in TLB entry + -- in order to support pages > 4kB + finalmaskgen: process(all) + variable m : std_ulogic_vector(43 downto 0); + begin + m := (others => '0'); + for i in 0 to 43 loop + if i < to_integer(r.shift) then + m(i) := '1'; + end if; + end loop; + finalmask <= m; + end process; + mmu_1: process(all) variable v : reg_stage_t; variable dcreq : std_ulogic; variable done : std_ulogic; - variable err : std_ulogic; + variable invalid : std_ulogic; + variable badtree : std_ulogic; + variable tlb_load : std_ulogic; + variable tlbie_req : std_ulogic; + variable rts : unsigned(5 downto 0); + variable mbits : unsigned(5 downto 0); + variable pgtable_addr : std_ulogic_vector(63 downto 0); + variable pte : std_ulogic_vector(63 downto 0); + variable data : std_ulogic_vector(63 downto 0); begin - v.valid := l_in.valid; - v.addr := l_in.addr; - v.state := r.state; + v := r; + v.valid := '0'; dcreq := '0'; done := '0'; - err := '0'; + invalid := '0'; + badtree := '0'; + tlb_load := '0'; + tlbie_req := '0'; + + -- Radix tree data structures in memory are big-endian, + -- so we need to byte-swap them + for i in 0 to 7 loop + data(i * 8 + 7 downto i * 8) := d_in.data((7 - i) * 8 + 7 downto (7 - i) * 8); + end loop; case r.state is when IDLE => + -- rts == radix tree size, # address bits being translated + rts := unsigned('0' & r.pgtbl0(62 downto 61) & r.pgtbl0(7 downto 5)) + (31 - 12); + -- mbits == # address bits to index top level of tree + mbits := unsigned('0' & r.pgtbl0(4 downto 0)); + v.shift := rts - mbits; + v.mask_size := mbits(4 downto 0); + v.pgbase := r.pgtbl0(55 downto 8) & x"00"; + if l_in.valid = '1' then + v.addr := l_in.addr; if l_in.tlbie = '1' then dcreq := '1'; - v.state := TLBIE_WAIT; + tlbie_req := '1'; + v.state := TLB_WAIT; else - 
v.state := RADIX_LOOKUP_0; + v.valid := '1'; + -- for now, take RPDS = 0 to disable radix translation + if mbits = 0 then + v.state := RADIX_NO_TRANS; + elsif mbits < 5 or mbits > 16 or mbits > rts then + v.state := RADIX_BAD_TREE; + else + v.state := RADIX_LOOKUP; + end if; end if; end if; + if l_in.mtspr = '1' then + v.pgtbl0 := l_in.rs; + end if; - when TLBIE_WAIT => + when TLB_WAIT => if d_in.done = '1' then done := '1'; v.state := IDLE; end if; - when RADIX_LOOKUP_0 => + when RADIX_LOOKUP => + dcreq := '1'; + v.state := RADIX_READ_WAIT; + + when RADIX_READ_WAIT => + if d_in.done = '1' then + if d_in.err = '0' then + v.pde := data; + -- test valid bit + if data(63) = '1' then + -- test leaf bit + if data(62) = '1' then + v.state := RADIX_LOAD_TLB; + else + mbits := unsigned('0' & data(4 downto 0)); + if mbits < 5 or mbits > 16 or mbits > r.shift then + v.state := RADIX_BAD_TREE; + else + v.shift := v.shift - mbits; + v.mask_size := mbits(4 downto 0); + v.pgbase := data(55 downto 8) & x"00"; + v.state := RADIX_LOOKUP; + end if; + end if; + else + -- non-present PTE, generate a DSI + v.state := RADIX_NO_TRANS; + end if; + else + v.state := RADIX_BAD_TREE; + end if; + end if; + + when RADIX_LOAD_TLB => + tlb_load := '1'; + dcreq := '1'; + v.state := TLB_WAIT; + + when RADIX_NO_TRANS => + done := '1'; + invalid := '1'; + v.state := IDLE; + + when RADIX_BAD_TREE => done := '1'; - err := '1'; + badtree := '1'; v.state := IDLE; end case; + pgtable_addr := x"00" & r.pgbase(55 downto 19) & + ((r.pgbase(18 downto 3) and not mask) or (addrsh and mask)) & + "000"; + pte := x"00" & + ((r.pde(55 downto 12) and not finalmask) or (r.addr(55 downto 12) and finalmask)) + & r.pde(11 downto 0); + -- update registers rin <= v; -- drive outputs l_out.done <= done; - l_out.error <= err; + l_out.invalid <= invalid; + l_out.badtree <= badtree; d_out.valid <= dcreq; - d_out.tlbie <= l_in.tlbie; - d_out.addr <= l_in.addr; - d_out.pte <= l_in.rs; + d_out.tlbie <= tlbie_req; + d_out.tlbld 
<= tlb_load; + if tlbie_req = '1' then + d_out.addr <= l_in.addr; + d_out.pte <= l_in.rs; + elsif tlb_load = '1' then + d_out.addr <= r.addr(63 downto 12) & x"000"; + d_out.pte <= pte; + else + d_out.addr <= pgtable_addr; + d_out.pte <= (others => '0'); + end if; end process; end;