diff --git a/common.vhdl b/common.vhdl index 3ee19d7..d617fa4 100644 --- a/common.vhdl +++ b/common.vhdl @@ -39,6 +39,7 @@ package common is constant SPR_SPRG3U : spr_num_t := 259; constant SPR_HSPRG0 : spr_num_t := 304; constant SPR_HSPRG1 : spr_num_t := 305; + constant SPR_PGTBL0 : spr_num_t := 720; -- GPR indices in the register file (GPR only) subtype gpr_index_t is std_ulogic_vector(4 downto 0); @@ -269,18 +270,23 @@ package common is type Loadstore1ToMmuType is record valid : std_ulogic; tlbie : std_ulogic; + mtspr : std_ulogic; + sprn : std_ulogic_vector(3 downto 0); addr : std_ulogic_vector(63 downto 0); rs : std_ulogic_vector(63 downto 0); end record; type MmuToLoadstore1Type is record - done : std_ulogic; - error : std_ulogic; + done : std_ulogic; + invalid : std_ulogic; + badtree : std_ulogic; + sprval : std_ulogic_vector(63 downto 0); end record; type MmuToDcacheType is record valid : std_ulogic; tlbie : std_ulogic; + tlbld : std_ulogic; addr : std_ulogic_vector(63 downto 0); pte : std_ulogic_vector(63 downto 0); end record; @@ -288,6 +294,8 @@ package common is type DcacheToMmuType is record stall : std_ulogic; done : std_ulogic; + err : std_ulogic; + data : std_ulogic_vector(63 downto 0); end record; type Loadstore1ToWritebackType is record diff --git a/dcache.vhdl b/dcache.vhdl index 126df48..96563a5 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -209,6 +209,8 @@ architecture rtl of dcache is type reg_stage_0_t is record req : Loadstore1ToDcacheType; tlbie : std_ulogic; + tlbld : std_ulogic; + mmu_req : std_ulogic; -- indicates source of request end record; signal r0 : reg_stage_0_t; @@ -220,6 +222,7 @@ architecture rtl of dcache is type reg_stage_1_t is record -- Latch the complete request from ls1 req : Loadstore1ToDcacheType; + mmu_req : std_ulogic; -- Cache hit state hit_way : way_t; @@ -444,7 +447,7 @@ begin "request collision loadstore vs MMU"; if m_in.valid = '1' then r0.req.valid <= '1'; - r0.req.load <= '0'; + r0.req.load <= not (m_in.tlbie or m_in.tlbld); r0.req.dcbz <= '0'; r0.req.nc <= '0'; r0.req.reserve <= '0'; @@ -454,10 +457,13 @@ begin r0.req.data <= m_in.pte; r0.req.byte_sel <= (others => '1'); r0.tlbie <= m_in.tlbie; - assert m_in.tlbie = '1' report "unknown request from MMU"; + r0.tlbld <= m_in.tlbld; + r0.mmu_req <= '1'; else r0.req <= d_in; r0.tlbie <= '0'; + r0.tlbld <= '0'; + r0.mmu_req <= '0'; end if; end if; end if; @@ -549,7 +555,11 @@ begin end loop; tlb_hit <= hit and r0_valid; tlb_hit_way <= hitway; - pte <= read_tlb_pte(hitway, tlb_pte_way); + if tlb_hit = '1' then + pte <= read_tlb_pte(hitway, tlb_pte_way); + else + pte <= (others => '0'); + end if; valid_ra <= tlb_hit or not r0.req.virt_mode; if r0.req.virt_mode = '1' then ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & @@ -573,7 +583,7 @@ begin if rising_edge(clk) then tlbie := '0'; tlbia := '0'; - tlbwe := '0'; + tlbwe := r0_valid and r0.tlbld; if r0_valid = '1' and r0.tlbie = '1' then if r0.req.addr(11 downto 10) /= "00" then tlbia := '1'; @@ -607,7 +617,6 @@ begin dtlb_ptes(tlb_req_index) <= pteset; dtlb_valids(tlb_req_index)(repl_way) <= '1'; end if; - m_out.done <= r0_valid and r0.tlbie; end if; end process; @@ -669,7 +678,7 @@ begin req_tag <= get_tag(ra); -- Only do anything if not being stalled by stage 1 - go := r0_valid and not r0.tlbie; + go := r0_valid and not (r0.tlbie or r0.tlbld); -- Calculate address of beginning of cache line, will be -- used for cache miss processing if needed @@ -824,6 +833,11 @@ begin d_out.perm_error <= '0'; d_out.rc_error <= '0'; + -- Outputs to MMU + m_out.done <= r1.tlbie_done; + m_out.err <= '0'; + m_out.data <= cache_out(r1.hit_way); + -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store -- @@ -842,40 +856,65 @@ begin "unexpected hit_load_delayed collision with slow_valid" severity FAILURE; - -- Load hit case is the standard path - if r1.hit_load_valid = '1' then - report "completing load hit"; - d_out.valid <= '1'; - end if; + if r1.mmu_req = '0' then + -- Request came from loadstore1... + -- Load hit case is the standard path + if r1.hit_load_valid = '1' then + report "completing load hit"; + d_out.valid <= '1'; + end if; - -- error cases complete without stalling - if r1.error_done = '1' then - report "completing ld/st with error"; - d_out.error <= '1'; - d_out.tlb_miss <= r1.tlb_miss; - d_out.perm_error <= r1.perm_error; - d_out.rc_error <= r1.rc_error; - d_out.valid <= '1'; - end if; + -- error cases complete without stalling + if r1.error_done = '1' then + report "completing ld/st with error"; + d_out.error <= '1'; + d_out.tlb_miss <= r1.tlb_miss; + d_out.perm_error <= r1.perm_error; + d_out.rc_error <= r1.rc_error; + d_out.valid <= '1'; + end if; - -- Slow ops (load miss, NC, stores) - if r1.slow_valid = '1' then - -- If it's a load, enable register writeback and switch - -- mux accordingly - -- - if r1.req.load then - -- Read data comes from the slow data latch - d_out.data <= r1.slow_data; - end if; - d_out.store_done <= '1'; + -- Slow ops (load miss, NC, stores) + if r1.slow_valid = '1' then + -- If it's a load, enable register writeback and switch + -- mux accordingly + -- + if r1.req.load then + -- Read data comes from the slow data latch + d_out.data <= r1.slow_data; + end if; + d_out.store_done <= '1'; - report "completing store or load miss"; - d_out.valid <= '1'; - end if; + report "completing store or load miss"; + d_out.valid <= '1'; + end if; + + if r1.stcx_fail = '1' then + d_out.store_done <= '0'; + d_out.valid <= '1'; + end if; + + else + -- Request came from MMU + if r1.hit_load_valid = '1' then + report "completing load hit to MMU, data=" & to_hstring(m_out.data); + m_out.done <= '1'; + end if; - if r1.stcx_fail = '1' then - d_out.store_done <= '0'; - d_out.valid <= '1'; + -- error cases complete without stalling + if r1.error_done = '1' then + report "completing MMU ld with error"; + m_out.err <= '1'; + m_out.done <= '1'; + end if; + + -- Slow ops (i.e. load miss) + if r1.slow_valid = '1' then + -- Read data comes from the slow data latch + m_out.data <= r1.slow_data; + report "completing MMU load miss, data=" & to_hstring(m_out.data); + m_out.done <= '1'; + end if; end if; end process; @@ -978,6 +1017,7 @@ begin if req_op /= OP_NONE and stall_out = '0' then r1.req <= r0.req; + r1.mmu_req <= r0.mmu_req; report "op:" & op_t'image(req_op) & " addr:" & to_hstring(r0.req.addr) & " nc:" & std_ulogic'image(r0.req.nc) & @@ -995,8 +1035,8 @@ begin end if; if req_op = OP_BAD then - report "Signalling ld/st error valid_ra=" & " rc_ok=" & std_ulogic'image(rc_ok) & - " perm_ok=" & std_ulogic'image(perm_ok); + report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) & + " rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok); r1.error_done <= '1'; r1.tlb_miss <= not valid_ra; r1.perm_error <= valid_ra and not perm_ok; @@ -1005,8 +1045,8 @@ begin r1.error_done <= '0'; end if; - -- complete tlbies in the third cycle - r1.tlbie_done <= r0_valid and r0.tlbie; + -- complete tlbies and TLB loads in the third cycle + r1.tlbie_done <= r0_valid and (r0.tlbie or r0.tlbld); end if; end process; diff --git a/decode1.vhdl b/decode1.vhdl index fd799fe..b7212c2 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -438,7 +438,7 @@ begin v.decode.sgl_pipe := '1'; -- send MMU-related SPRs to loadstore1 case sprn is - when SPR_DAR | SPR_DSISR => + when SPR_DAR | SPR_DSISR | SPR_PGTBL0 => v.decode.unit := LDST; when others => end case; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index d5dd010..03aaa6f 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -153,6 +153,7 @@ begin variable next_addr : std_ulogic_vector(63 downto 0); variable mmureq : std_ulogic; variable dsisr : std_ulogic_vector(31 downto 0); + variable mmu_mtspr : std_ulogic; begin v := r; req := '0'; @@ -161,6 +162,8 @@ begin byte_sel := (others => '0'); addr := lsu_sum; mfspr := '0'; + mmu_mtspr := '0'; + sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10)); sprval := (others => '0'); -- avoid inferred latches exception := '0'; dsisr := (others => '0'); @@ -244,19 +247,27 @@ begin mfspr := '1'; -- partial decode on SPR number should be adequate given -- the restricted set that get sent down this path - sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10)); - if sprn(0) = '0' then - sprval := x"00000000" & r.dsisr; + if sprn(9) = '0' then + if sprn(0) = '0' then + sprval := x"00000000" & r.dsisr; + else + sprval := r.dar; + end if; else - sprval := r.dar; + -- reading one of the SPRs in the MMU + sprval := m_in.sprval; end if; when OP_MTSPR => done := '1'; - sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10)); - if sprn(0) = '0' then - v.dsisr := l_in.data(31 downto 0); + if sprn(9) = '0' then + if sprn(0) = '0' then + v.dsisr := l_in.data(31 downto 0); + else + v.dar := l_in.data; + end if; else - v.dar := l_in.data; + -- writing one of the SPRs in the MMU + mmu_mtspr := '1'; end if; when others => assert false report "unknown op sent to loadstore1"; @@ -361,7 +372,7 @@ begin byte_sel := r.first_bytes; end if; if m_in.done = '1' then - if m_in.error = '0' then + if m_in.invalid = '0' and m_in.badtree = '0' then -- retry the request now that the MMU has installed a TLB entry req := '1'; if r.state = MMU_LOOKUP_1ST then @@ -371,8 +382,9 @@ begin end if; else exception := '1'; - dsisr(63 - 33) := '1'; + dsisr(63 - 33) := m_in.invalid; dsisr(63 - 38) := not r.load; + dsisr(63 - 44) := m_in.badtree; v.state := IDLE; end if; end if; @@ -440,6 +452,8 @@ begin -- Update outputs to MMU m_out.valid <= mmureq; m_out.tlbie <= v.tlbie; + m_out.mtspr <= mmu_mtspr; + m_out.sprn <= sprn(3 downto 0); m_out.addr <= addr; m_out.rs <= l_in.data; diff --git a/mmu.vhdl b/mmu.vhdl index 2e6d0fd..fe6ad16 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -25,20 +25,37 @@ end mmu; architecture behave of mmu is type state_t is (IDLE, - TLBIE_WAIT, - RADIX_LOOKUP_0 + TLB_WAIT, + RADIX_LOOKUP, + RADIX_READ_WAIT, + RADIX_LOAD_TLB, + RADIX_NO_TRANS, + RADIX_BAD_TREE ); type reg_stage_t is record -- latched request from loadstore1 valid : std_ulogic; addr : std_ulogic_vector(63 downto 0); + -- internal state state : state_t; + pgtbl0 : std_ulogic_vector(63 downto 0); + shift : unsigned(5 downto 0); + mask_size : unsigned(4 downto 0); + pgbase : std_ulogic_vector(55 downto 0); + pde : std_ulogic_vector(63 downto 0); end record; signal r, rin : reg_stage_t; + signal addrsh : std_ulogic_vector(15 downto 0); + signal mask : std_ulogic_vector(15 downto 0); + signal finalmask : std_ulogic_vector(43 downto 0); + begin + -- Multiplex internal SPR values back to loadstore1, selected + -- by l_in.sprn. Easy when there's only one... + l_out.sprval <= r.pgtbl0; mmu_0: process(clk) begin @@ -46,64 +63,237 @@ begin if rst = '1' then r.state <= IDLE; r.valid <= '0'; + r.pgtbl0 <= (others => '0'); else if rin.valid = '1' then report "MMU got tlb miss for " & to_hstring(rin.addr); end if; if l_out.done = '1' then - report "MMU completing miss with error=" & std_ulogic'image(l_out.error); + report "MMU completing op with invalid=" & std_ulogic'image(l_out.invalid) & + " badtree=" & std_ulogic'image(l_out.badtree); + end if; + if rin.state = RADIX_LOOKUP then + report "radix lookup shift=" & integer'image(to_integer(rin.shift)) & + " msize=" & integer'image(to_integer(rin.mask_size)); + end if; + if r.state = RADIX_LOOKUP then + report "send load addr=" & to_hstring(d_out.addr) & + " addrsh=" & to_hstring(addrsh) & " mask=" & to_hstring(mask); end if; r <= rin; end if; end if; end process; + -- Shift address bits 61--12 right by 0--47 bits and + -- supply the least significant 16 bits of the result. + addrshifter: process(all) + variable sh1 : std_ulogic_vector(30 downto 0); + variable sh2 : std_ulogic_vector(18 downto 0); + variable result : std_ulogic_vector(15 downto 0); + begin + case r.shift(5 downto 4) is + when "00" => + sh1 := r.addr(42 downto 12); + when "01" => + sh1 := r.addr(58 downto 28); + when others => + sh1 := "0000000000000" & r.addr(61 downto 44); + end case; + case r.shift(3 downto 2) is + when "00" => + sh2 := sh1(18 downto 0); + when "01" => + sh2 := sh1(22 downto 4); + when "10" => + sh2 := sh1(26 downto 8); + when others => + sh2 := sh1(30 downto 12); + end case; + case r.shift(1 downto 0) is + when "00" => + result := sh2(15 downto 0); + when "01" => + result := sh2(16 downto 1); + when "10" => + result := sh2(17 downto 2); + when others => + result := sh2(18 downto 3); + end case; + addrsh <= result; + end process; + + -- generate mask for extracting address fields for PTE address generation + addrmaskgen: process(all) + variable m : std_ulogic_vector(15 downto 0); + begin + -- mask_count has to be >= 5 + m := x"001f"; + for i in 5 to 15 loop + if i < to_integer(r.mask_size) then + m(i) := '1'; + end if; + end loop; + mask <= m; + end process; + + -- generate mask for extracting address bits to go in TLB entry + -- in order to support pages > 4kB + finalmaskgen: process(all) + variable m : std_ulogic_vector(43 downto 0); + begin + m := (others => '0'); + for i in 0 to 43 loop + if i < to_integer(r.shift) then + m(i) := '1'; + end if; + end loop; + finalmask <= m; + end process; + mmu_1: process(all) variable v : reg_stage_t; variable dcreq : std_ulogic; variable done : std_ulogic; - variable err : std_ulogic; + variable invalid : std_ulogic; + variable badtree : std_ulogic; + variable tlb_load : std_ulogic; + variable tlbie_req : std_ulogic; + variable rts : unsigned(5 downto 0); + variable mbits : unsigned(5 downto 0); + variable pgtable_addr : std_ulogic_vector(63 downto 0); + variable pte : std_ulogic_vector(63 downto 0); + variable data : std_ulogic_vector(63 downto 0); begin - v.valid := l_in.valid; - v.addr := l_in.addr; - v.state := r.state; + v := r; + v.valid := '0'; dcreq := '0'; done := '0'; - err := '0'; + invalid := '0'; + badtree := '0'; + tlb_load := '0'; + tlbie_req := '0'; + + -- Radix tree data structures in memory are big-endian, + -- so we need to byte-swap them + for i in 0 to 7 loop + data(i * 8 + 7 downto i * 8) := d_in.data((7 - i) * 8 + 7 downto (7 - i) * 8); + end loop; case r.state is when IDLE => + -- rts == radix tree size, # address bits being translated + rts := unsigned('0' & r.pgtbl0(62 downto 61) & r.pgtbl0(7 downto 5)) + (31 - 12); + -- mbits == # address bits to index top level of tree + mbits := unsigned('0' & r.pgtbl0(4 downto 0)); + v.shift := rts - mbits; + v.mask_size := mbits(4 downto 0); + v.pgbase := r.pgtbl0(55 downto 8) & x"00"; + if l_in.valid = '1' then + v.addr := l_in.addr; if l_in.tlbie = '1' then dcreq := '1'; - v.state := TLBIE_WAIT; + tlbie_req := '1'; + v.state := TLB_WAIT; else - v.state := RADIX_LOOKUP_0; + v.valid := '1'; + -- for now, take RPDS = 0 to disable radix translation + if mbits = 0 then + v.state := RADIX_NO_TRANS; + elsif mbits < 5 or mbits > 16 or mbits > rts then + v.state := RADIX_BAD_TREE; + else + v.state := RADIX_LOOKUP; + end if; end if; end if; + if l_in.mtspr = '1' then + v.pgtbl0 := l_in.rs; + end if; - when TLBIE_WAIT => + when TLB_WAIT => if d_in.done = '1' then done := '1'; v.state := IDLE; end if; - when RADIX_LOOKUP_0 => + when RADIX_LOOKUP => + dcreq := '1'; + v.state := RADIX_READ_WAIT; + + when RADIX_READ_WAIT => + if d_in.done = '1' then + if d_in.err = '0' then + v.pde := data; + -- test valid bit + if data(63) = '1' then + -- test leaf bit + if data(62) = '1' then + v.state := RADIX_LOAD_TLB; + else + mbits := unsigned('0' & data(4 downto 0)); + if mbits < 5 or mbits > 16 or mbits > r.shift then + v.state := RADIX_BAD_TREE; + else + v.shift := v.shift - mbits; + v.mask_size := mbits(4 downto 0); + v.pgbase := data(55 downto 8) & x"00"; + v.state := RADIX_LOOKUP; + end if; + end if; + else + -- non-present PTE, generate a DSI + v.state := RADIX_NO_TRANS; + end if; + else + v.state := RADIX_BAD_TREE; + end if; + end if; + + when RADIX_LOAD_TLB => + tlb_load := '1'; + dcreq := '1'; + v.state := TLB_WAIT; + + when RADIX_NO_TRANS => + done := '1'; + invalid := '1'; + v.state := IDLE; + + when RADIX_BAD_TREE => done := '1'; - err := '1'; + badtree := '1'; v.state := IDLE; end case; + pgtable_addr := x"00" & r.pgbase(55 downto 19) & + ((r.pgbase(18 downto 3) and not mask) or (addrsh and mask)) & + "000"; + pte := x"00" & + ((r.pde(55 downto 12) and not finalmask) or (r.addr(55 downto 12) and finalmask)) + & r.pde(11 downto 0); + -- update registers rin <= v; -- drive outputs l_out.done <= done; - l_out.error <= err; + l_out.invalid <= invalid; + l_out.badtree <= badtree; d_out.valid <= dcreq; - d_out.tlbie <= l_in.tlbie; - d_out.addr <= l_in.addr; - d_out.pte <= l_in.rs; + d_out.tlbie <= tlbie_req; + d_out.tlbld <= tlb_load; + if tlbie_req = '1' then + d_out.addr <= l_in.addr; + d_out.pte <= l_in.rs; + elsif tlb_load = '1' then + d_out.addr <= r.addr(63 downto 12) & x"000"; + d_out.pte <= pte; + else + d_out.addr <= pgtable_addr; + d_out.pte <= (others => '0'); + end if; end process; end;