From bdd4d041629f92484806812e54744ed5d8413c55 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 11 Jun 2022 19:20:57 +1000 Subject: [PATCH 01/30] Simplify flow control in the dcache and loadstore units Simplify the flow control by stalling the whole upstream pipeline when a stage can't proceed, instead of trying to let each stage progress independently when it can. Signed-off-by: Paul Mackerras --- dcache.vhdl | 6 +- loadstore1.vhdl | 334 ++++++++++++++++++++++++------------------------ 2 files changed, 173 insertions(+), 167 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 8f7af52..2d5ebe3 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -588,7 +588,7 @@ begin end if; if rst = '1' then r0_full <= '0'; - elsif (r1.full = '0' and d_in.hold = '0') or r0_full = '0' then + elsif r1.full = '0' and d_in.hold = '0' then r0 <= r; r0_full <= r.req.valid; elsif r0.d_valid = '0' then @@ -605,9 +605,9 @@ begin m_out.stall <= '0'; -- Hold off the request in r0 when r1 has an uncompleted request - r0_stall <= r0_full and (r1.full or d_in.hold); + r0_stall <= r1.full or d_in.hold; r0_valid <= r0_full and not r1.full and not d_in.hold; - stall_out <= r0_stall; + stall_out <= r1.full; events <= ev; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 6c4b0db..ea7baec 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -43,9 +43,7 @@ architecture behave of loadstore1 is -- State machine for unaligned loads/stores type state_t is (IDLE, -- ready for instruction - MMU_LOOKUP, -- waiting for MMU to look up translation - TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie - FINISH_LFS -- write back converted SP data for lfs* + MMU_WAIT -- waiting for MMU to finish doing something ); type byte_index_t is array(0 to 7) of unsigned(2 downto 0); @@ -63,9 +61,7 @@ architecture behave of loadstore1 is write_spr : std_ulogic; mmu_op : std_ulogic; instr_fault : std_ulogic; - load_zero : std_ulogic; do_update : std_ulogic; - noop : std_ulogic; mode_32bit : std_ulogic; addr : 
std_ulogic_vector(63 downto 0); byte_sel : std_ulogic_vector(7 downto 0); @@ -93,11 +89,12 @@ architecture behave of loadstore1 is align_intr : std_ulogic; dword_index : std_ulogic; two_dwords : std_ulogic; + incomplete : std_ulogic; nia : std_ulogic_vector(63 downto 0); end record; constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0', tlbie => '0', dcbz => '0', read_spr => '0', write_spr => '0', mmu_op => '0', - instr_fault => '0', load_zero => '0', do_update => '0', noop => '0', + instr_fault => '0', do_update => '0', mode_32bit => '0', addr => (others => '0'), byte_sel => x"00", second_bytes => x"00", store_data => (others => '0'), instr_tag => instr_tag_init, @@ -108,11 +105,12 @@ architecture behave of loadstore1 is atomic => '0', atomic_last => '0', rc => '0', nc => '0', virt_mode => '0', priv_mode => '0', load_sp => '0', sprn => 10x"0", is_slbia => '0', align_intr => '0', - dword_index => '0', two_dwords => '0', + dword_index => '0', two_dwords => '0', incomplete => '0', nia => (others => '0')); type reg_stage1_t is record req : request_t; + busy : std_ulogic; issued : std_ulogic; addr0 : std_ulogic_vector(63 downto 0); end record; @@ -121,6 +119,7 @@ architecture behave of loadstore1 is req : request_t; byte_index : byte_index_t; use_second : std_ulogic_vector(7 downto 0); + busy : std_ulogic; wait_dc : std_ulogic; wait_mmu : std_ulogic; one_cycle : std_ulogic; @@ -130,6 +129,7 @@ architecture behave of loadstore1 is type reg_stage3_t is record state : state_t; + complete : std_ulogic; instr_tag : instr_tag_t; write_enable : std_ulogic; write_reg : gspr_index_t; @@ -137,7 +137,6 @@ architecture behave of loadstore1 is rc : std_ulogic; xerc : xer_common_t; store_done : std_ulogic; - convert_lfs : std_ulogic; load_data : std_ulogic_vector(63 downto 0); dar : std_ulogic_vector(63 downto 0); dsisr : std_ulogic_vector(31 downto 0); @@ -157,6 +156,7 @@ architecture behave of loadstore1 is signal r2, r2in : reg_stage2_t; 
signal r3, r3in : reg_stage3_t; + signal flush : std_ulogic; signal busy : std_ulogic; signal complete : std_ulogic; signal in_progress : std_ulogic; @@ -166,12 +166,9 @@ architecture behave of loadstore1 is signal load_dp_data : std_ulogic_vector(63 downto 0); signal store_data : std_ulogic_vector(63 downto 0); - signal stage1_issue_enable : std_ulogic; signal stage1_req : request_t; signal stage1_dcreq : std_ulogic; signal stage1_dreq : std_ulogic; - signal stage2_busy_next : std_ulogic; - signal stage3_busy_next : std_ulogic; -- Generate byte enables from sizes function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is @@ -274,7 +271,11 @@ begin begin if rising_edge(clk) then if rst = '1' then + r1.busy <= '0'; + r1.issued <= '0'; r1.req.valid <= '0'; + r1.req.dc_req <= '0'; + r1.req.incomplete <= '0'; r1.req.tlbie <= '0'; r1.req.is_slbia <= '0'; r1.req.instr_fault <= '0'; @@ -284,6 +285,7 @@ begin r1.req.xerc <= xerc_init; r2.req.valid <= '0'; + r2.busy <= '0'; r2.req.tlbie <= '0'; r2.req.is_slbia <= '0'; r2.req.instr_fault <= '0'; @@ -301,8 +303,8 @@ begin r3.state <= IDLE; r3.write_enable <= '0'; r3.interrupt <= '0'; + r3.complete <= '0'; r3.stage1_en <= '1'; - r3.convert_lfs <= '0'; r3.events.load_complete <= '0'; r3.events.store_complete <= '0'; flushing <= '0'; @@ -311,7 +313,7 @@ begin r2 <= r2in; r3 <= r3in; flushing <= (flushing or (r1in.req.valid and r1in.req.align_intr)) and - not r3in.interrupt; + not flush; end if; stage1_dreq <= stage1_dcreq; if d_in.valid = '1' then @@ -321,7 +323,7 @@ begin assert r2.req.valid = '1' and r2.req.dc_req = '1' and r3.state = IDLE severity failure; end if; if m_in.done = '1' or m_in.err = '1' then - assert r2.req.valid = '1' and (r3.state = MMU_LOOKUP or r3.state = TLBIE_WAIT) severity failure; + assert r2.req.valid = '1' and r3.state = MMU_WAIT severity failure; end if; end if; end process; @@ -507,6 +509,7 @@ begin when others => end case; v.dc_req := l_in.valid and (v.load or 
v.store or v.dcbz) and not v.align_intr; + v.incomplete := v.dc_req and v.two_dwords; -- Work out controls for load and store formatting brev_lenm1 := "000"; @@ -518,17 +521,10 @@ begin req_in <= v; end process; - busy <= r1.req.valid and ((r1.req.dc_req and not r1.issued) or - (r1.issued and d_in.error) or - stage2_busy_next or - (r1.req.dc_req and r1.req.two_dwords and not r1.req.dword_index)); - complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or - (r2.wait_mmu and m_in.done) or r3.convert_lfs; + busy <= dc_stall or d_in.error or r1.busy or r2.busy; + complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or r3.complete; in_progress <= r1.req.valid or (r2.req.valid and not complete); - stage1_issue_enable <= r3.stage1_en and not (r1.req.valid and r1.req.mmu_op) and - not (r2.req.valid and r2.req.mmu_op); - -- Processing done in the first cycle of a load/store instruction loadstore1_1: process(all) variable v : reg_stage1_t; @@ -538,10 +534,11 @@ begin begin v := r1; issue := '0'; + dcreq := '0'; - if busy = '0' then + if r1.busy = '0' then req := req_in; - v.issued := '0'; + req.valid := l_in.valid; if flushing = '1' then -- Make this a no-op request rather than simply invalid. 
-- It will never get to stage 3 since there is a request ahead of @@ -554,37 +551,49 @@ begin end if; else req := r1.req; - end if; - - if r1.req.valid = '1' then if r1.req.dc_req = '1' and r1.issued = '0' then issue := '1'; - elsif r1.issued = '1' and d_in.error = '1' then - v.issued := '0'; - elsif stage2_busy_next = '0' then - -- we can change what's in r1 next cycle because the current thing - -- in r1 will go into r2 - if r1.req.dc_req = '1' and r1.req.two_dwords = '1' and r1.req.dword_index = '0' then - -- construct the second request for a misaligned access - req.dword_index := '1'; - req.addr := std_ulogic_vector(unsigned(r1.req.addr(63 downto 3)) + 1) & "000"; - if r1.req.mode_32bit = '1' then - req.addr(32) := '0'; - end if; - req.byte_sel := r1.req.second_bytes; - issue := '1'; + elsif r1.req.incomplete = '1' then + -- construct the second request for a misaligned access + req.dword_index := '1'; + req.incomplete := '0'; + req.addr := std_ulogic_vector(unsigned(r1.req.addr(63 downto 3)) + 1) & "000"; + if r1.req.mode_32bit = '1' then + req.addr(32) := '0'; end if; + req.byte_sel := r1.req.second_bytes; + issue := '1'; + else + -- For the lfs conversion cycle, leave the request valid + -- for another cycle but with req.dc_req = 0. + -- For an MMU request last cycle, we have nothing + -- to do in this cycle, so make it invalid. 
+ if r1.req.load_sp = '0' then + req.valid := '0'; + end if; + req.dc_req := '0'; end if; end if; - if r3in.interrupt = '1' then - req.valid := '0'; - issue := '0'; - end if; - v.req := req; - dcreq := issue and stage1_issue_enable and not d_in.error and not dc_stall; - if issue = '1' then - v.issued := dcreq; + if flush = '1' then + v.req.valid := '0'; + v.req.dc_req := '0'; + v.req.incomplete := '0'; + v.issued := '0'; + v.busy := '0'; + elsif (dc_stall or d_in.error or r2.busy) = '0' then + -- we can change what's in r1 next cycle because the current thing + -- in r1 will go into r2 + v.req := req; + dcreq := issue; + v.issued := issue; + v.busy := (issue and (req.incomplete or req.load_sp)) or (req.valid and req.mmu_op); + else + -- pipeline is stalled + if r1.issued = '1' and d_in.error = '1' then + v.issued := '0'; + v.busy := '1'; + end if; end if; stage1_req <= req; @@ -602,6 +611,7 @@ begin variable kk : unsigned(3 downto 0); variable idx : unsigned(2 downto 0); variable byte_offset : unsigned(2 downto 0); + variable interrupt : std_ulogic; begin v := r2; @@ -614,44 +624,61 @@ begin store_data(i * 8 + 7 downto i * 8) <= r1.req.store_data(j + 7 downto j); end loop; - if stage3_busy_next = '0' and - (r1.req.valid = '0' or r1.issued = '1' or r1.req.dc_req = '0') then - v.req := r1.req; - v.addr0 := r1.addr0; - v.req.store_data := store_data; - v.wait_dc := r1.req.valid and r1.req.dc_req and not r1.req.load_sp and - not (r1.req.two_dwords and not r1.req.dword_index); - v.wait_mmu := r1.req.valid and r1.req.mmu_op; - v.one_cycle := r1.req.valid and (r1.req.noop or r1.req.read_spr or - (r1.req.write_spr and not r1.req.mmu_op) or - r1.req.load_zero or r1.req.do_update); - if r1.req.read_spr = '1' then - v.wr_sel := "00"; - elsif r1.req.do_update = '1' or r1.req.store = '1' then - v.wr_sel := "01"; - elsif r1.req.load_sp = '1' then - v.wr_sel := "10"; + if (dc_stall or d_in.error or r2.busy) = '0' then + if r1.req.valid = '0' or r1.issued = '1' or r1.req.dc_req = 
'0' then + v.req := r1.req; + v.addr0 := r1.addr0; + v.req.store_data := store_data; + v.wait_dc := r1.req.valid and r1.req.dc_req and not r1.req.load_sp and + not r1.req.incomplete; + v.wait_mmu := r1.req.valid and r1.req.mmu_op; + v.busy := r1.req.valid and r1.req.mmu_op; + v.one_cycle := r1.req.valid and not (r1.req.dc_req or r1.req.mmu_op); + if r1.req.read_spr = '1' then + v.wr_sel := "00"; + elsif r1.req.do_update = '1' or r1.req.store = '1' then + v.wr_sel := "01"; + elsif r1.req.load_sp = '1' then + v.wr_sel := "10"; + else + v.wr_sel := "11"; + end if; + + -- Work out load formatter controls for next cycle + for i in 0 to 7 loop + idx := to_unsigned(i, 3) xor r1.req.brev_mask; + kk := ('0' & idx) + ('0' & byte_offset); + v.use_second(i) := kk(3); + v.byte_index(i) := kk(2 downto 0); + end loop; else - v.wr_sel := "11"; + v.req.valid := '0'; + v.wait_dc := '0'; + v.wait_mmu := '0'; + v.one_cycle := '0'; + end if; + end if; + if r2.wait_mmu = '1' and m_in.done = '1' then + if r2.req.mmu_op = '1' then + v.req.valid := '0'; + v.busy := '0'; end if; - - -- Work out load formatter controls for next cycle - for i in 0 to 7 loop - idx := to_unsigned(i, 3) xor r1.req.brev_mask; - kk := ('0' & idx) + ('0' & byte_offset); - v.use_second(i) := kk(3); - v.byte_index(i) := kk(2 downto 0); - end loop; - elsif stage3_busy_next = '0' then - v.req.valid := '0'; - v.wait_dc := '0'; v.wait_mmu := '0'; end if; + if r2.busy = '1' and r2.wait_mmu = '0' then + v.busy := '0'; + end if; - stage2_busy_next <= r1.req.valid and stage3_busy_next; - - if r3in.interrupt = '1' then + interrupt := (r2.req.valid and r2.req.align_intr) or + (d_in.error and d_in.cache_paradox) or m_in.err; + if interrupt = '1' then v.req.valid := '0'; + v.busy := '0'; + v.wait_dc := '0'; + v.wait_mmu := '0'; + elsif d_in.error = '1' then + v.wait_mmu := '1'; + v.busy := '1'; end if; r2in <= v; @@ -671,7 +698,6 @@ begin variable write_data : std_ulogic_vector(63 downto 0); variable do_update : std_ulogic; 
variable done : std_ulogic; - variable part_done : std_ulogic; variable exception : std_ulogic; variable data_permuted : std_ulogic_vector(63 downto 0); variable data_trimmed : std_ulogic_vector(63 downto 0); @@ -687,13 +713,12 @@ begin mmureq := '0'; mmu_mtspr := '0'; done := '0'; - part_done := '0'; exception := '0'; dsisr := (others => '0'); write_enable := '0'; sprval := (others => '0'); do_update := '0'; - v.convert_lfs := '0'; + v.complete := '0'; v.srr1 := (others => '0'); v.events := (others => '0'); @@ -775,94 +800,74 @@ begin -- generate alignment interrupt exception := '1'; end if; - if r2.req.load_zero = '1' then - write_enable := '1'; - end if; if r2.req.do_update = '1' then do_update := '1'; end if; - end if; - - case r3.state is - when IDLE => - if d_in.valid = '1' then - if r2.req.two_dwords = '0' or r2.req.dword_index = '1' then - write_enable := r2.req.load and not r2.req.load_sp; - if HAS_FPU and r2.req.load_sp = '1' then - -- SP to DP conversion takes a cycle - v.state := FINISH_LFS; - v.convert_lfs := '1'; - else - -- stores write back rA update - do_update := r2.req.update and r2.req.store; - end if; - else - part_done := '1'; - end if; + if r2.req.load_sp = '1' and r2.req.dc_req = '0' then + write_enable := '1'; end if; - if d_in.error = '1' then - if d_in.cache_paradox = '1' then - -- signal an interrupt straight away - exception := '1'; - dsisr(63 - 38) := not r2.req.load; - -- XXX there is no architected bit for this - -- (probably should be a machine check in fact) - dsisr(63 - 35) := d_in.cache_paradox; + if r2.req.write_spr = '1' and r2.req.mmu_op = '0' then + if r2.req.sprn(0) = '0' then + v.dsisr := r2.req.store_data(31 downto 0); else - -- Look up the translation for TLB miss - -- and also for permission error and RC error - -- in case the PTE has been updated. 
- mmureq := '1'; - v.state := MMU_LOOKUP; - v.stage1_en := '0'; + v.dar := r2.req.store_data; end if; end if; - if r2.req.valid = '1' then - if r2.req.mmu_op = '1' then - -- send request (tlbie, mtspr, itlb miss) to MMU - mmureq := not r2.req.write_spr; - mmu_mtspr := r2.req.write_spr; - if r2.req.instr_fault = '1' then - v.state := MMU_LOOKUP; - v.events.itlb_miss := '1'; - else - v.state := TLBIE_WAIT; - end if; - elsif r2.req.write_spr = '1' then - if r2.req.sprn(0) = '0' then - v.dsisr := r2.req.store_data(31 downto 0); - else - v.dar := r2.req.store_data; - end if; - end if; + end if; + + if r3.state = IDLE and r2.req.valid = '1' and r2.req.mmu_op = '1' then + -- send request (tlbie, mtspr, itlb miss) to MMU + mmureq := not r2.req.write_spr; + mmu_mtspr := r2.req.write_spr; + if r2.req.instr_fault = '1' then + v.events.itlb_miss := '1'; end if; + v.state := MMU_WAIT; + end if; - when MMU_LOOKUP => - if m_in.done = '1' then - if r2.req.instr_fault = '0' then - -- retry the request now that the MMU has installed a TLB entry - req := '1'; - v.stage1_en := '1'; - v.state := IDLE; - end if; + if d_in.valid = '1' then + if r2.req.incomplete = '0' then + write_enable := r2.req.load and not r2.req.load_sp; + -- stores write back rA update + do_update := r2.req.update and r2.req.store; end if; - if m_in.err = '1' then + end if; + if d_in.error = '1' then + if d_in.cache_paradox = '1' then + -- signal an interrupt straight away exception := '1'; - dsisr(63 - 33) := m_in.invalid; - dsisr(63 - 36) := m_in.perm_error; - dsisr(63 - 38) := r2.req.store or r2.req.dcbz; - dsisr(63 - 44) := m_in.badtree; - dsisr(63 - 45) := m_in.rc_error; + dsisr(63 - 38) := not r2.req.load; + -- XXX there is no architected bit for this + -- (probably should be a machine check in fact) + dsisr(63 - 35) := d_in.cache_paradox; + else + -- Look up the translation for TLB miss + -- and also for permission error and RC error + -- in case the PTE has been updated. 
+ mmureq := '1'; + v.state := MMU_WAIT; + v.stage1_en := '0'; end if; + end if; - when TLBIE_WAIT => - - when FINISH_LFS => - write_enable := '1'; - - end case; + if m_in.done = '1' then + if r2.req.dc_req = '1' then + -- retry the request now that the MMU has installed a TLB entry + req := '1'; + else + v.complete := '1'; + end if; + end if; + if m_in.err = '1' then + exception := '1'; + dsisr(63 - 33) := m_in.invalid; + dsisr(63 - 36) := m_in.perm_error; + dsisr(63 - 38) := r2.req.store or r2.req.dcbz; + dsisr(63 - 44) := m_in.badtree; + dsisr(63 - 45) := m_in.rc_error; + end if; - if complete = '1' or exception = '1' then + if (m_in.done or m_in.err) = '1' then v.stage1_en := '1'; v.state := IDLE; end if; @@ -915,7 +920,7 @@ begin end case; -- Update outputs to dcache - if stage1_issue_enable = '1' then + if r3.stage1_en = '1' then d_out.valid <= stage1_dcreq; d_out.load <= stage1_req.load; d_out.dcbz <= stage1_req.dcbz; @@ -945,7 +950,7 @@ begin else d_out.data <= r2.req.store_data; end if; - d_out.hold <= r2.req.valid and r2.req.load_sp and d_in.valid; + d_out.hold <= '0'; -- Update outputs to MMU m_out.valid <= mmureq; @@ -980,8 +985,7 @@ begin events <= r3.events; - -- Busy calculation. 
- stage3_busy_next <= r2.req.valid and not (complete or part_done or exception); + flush <= exception; -- Update registers r3in <= v; @@ -1001,7 +1005,9 @@ begin d_out.valid & m_in.done & r2.req.dword_index & - std_ulogic_vector(to_unsigned(state_t'pos(r3.state), 3)); + r2.req.valid & + r2.wait_dc & + std_ulogic_vector(to_unsigned(state_t'pos(r3.state), 1)); end if; end process; log_out <= log_data; From 204fedc63f7831e35cea09688b6e5249de8938da Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 29 Jun 2022 20:02:36 +1000 Subject: [PATCH 02/30] Move XER low bits out of register file Besides the overflow and status carry bits, XER has 18 bits which need to retain the value written by mtxer (in case software wants to emulate the move-assist instructions (lswi, lswx, stswi, stswx)). Until now these bits (and others) have been stored in the GPR file as a "fast" SPR, but this causes complications because XER is not really a fast SPR. Instead, we now store these 18 bits in the 'ctrl' signal, which exists in execute1. This will enable us to simplify the data path in future, and has the added bonus that with a little bit of plumbing, we can get the full XER value printed when dumping registers at the end of a simulation. Therefore this changes scripts/run_test.sh to remove the greps which exclude XER from the comparison of actual and expected register results. Signed-off-by: Paul Mackerras --- common.vhdl | 7 ++++--- core.vhdl | 7 ++++--- cr_file.vhdl | 10 ++++++++++ execute1.vhdl | 42 +++++++++++++++++++----------------------- register_file.vhdl | 1 - scripts/run_test.sh | 4 ++-- 6 files changed, 39 insertions(+), 32 deletions(-) diff --git a/common.vhdl b/common.vhdl index 14a8801..bab5aed 100644 --- a/common.vhdl +++ b/common.vhdl @@ -114,7 +114,7 @@ package common is -- The XER is split: the common bits (CA, OV, SO, OV32 and CA32) are -- in the CR file as a kind of CR extension (with a separate write - -- control). 
The rest is stored in ctrl_t (effectively in execute1). type xer_common_t is record ca : std_ulogic; ca32 : std_ulogic; @@ -192,7 +192,10 @@ package common is dec: std_ulogic_vector(63 downto 0); msr: std_ulogic_vector(63 downto 0); cfar: std_ulogic_vector(63 downto 0); + xer_low: std_ulogic_vector(17 downto 0); end record; + constant ctrl_t_init : ctrl_t := + (xer_low => 18x"0", others => (others => '0')); type Fetch1ToIcacheType is record req: std_ulogic; @@ -739,8 +742,6 @@ package body common is n := 10; when SPR_HSPRG1 => n := 11; - when SPR_XER => - n := 12; when SPR_TAR => n := 13; when others => diff --git a/core.vhdl b/core.vhdl index b18f09a..070a1f1 100644 --- a/core.vhdl +++ b/core.vhdl @@ -145,7 +145,7 @@ architecture behave of core is signal dbg_gpr_addr : gspr_index_t; signal dbg_gpr_data : std_ulogic_vector(63 downto 0); - signal msr : std_ulogic_vector(63 downto 0); + signal ctrl_debug : ctrl_t; -- PMU event bus signal icache_events : IcacheEventType; @@ -333,6 +333,7 @@ begin d_out => cr_file_to_decode2, w_in => writeback_to_cr_file, sim_dump => sim_cr_dump, + ctrl => ctrl_debug, log_out => log_data(183 downto 171) ); @@ -359,7 +360,7 @@ begin bypass_data => execute1_bypass, bypass_cr_data => execute1_cr_bypass, icache_inval => ex1_icache_inval, - dbg_msr_out => msr, + dbg_ctrl_out => ctrl_debug, wb_events => writeback_events, ls_events => loadstore_events, dc_events => dcache_events, @@ -482,7 +483,7 @@ begin terminate => terminate, core_stopped => dbg_core_is_stopped, nia => fetch1_to_icache.nia, - msr => msr, + msr => ctrl_debug.msr, dbg_gpr_req => dbg_gpr_req, dbg_gpr_ack => dbg_gpr_ack, dbg_gpr_addr => dbg_gpr_addr, diff --git a/cr_file.vhdl b/cr_file.vhdl index e9788cb..d1aedba 100644 --- a/cr_file.vhdl +++ b/cr_file.vhdl @@ -18,6 +18,7 @@ entity cr_file is d_out : out CrFileToDecode2Type; w_in : in WritebackToCrFileType; + ctrl : in ctrl_t; -- debug sim_dump : in std_ulogic; @@ -84,9 +85,18 @@ begin sim_dump_test: if SIM generate dump_cr: 
process(all) + variable xer : std_ulogic_vector(31 downto 0); begin if sim_dump = '1' then report "CR 00000000" & to_hstring(crs); + xer := (others => '0'); + xer(31) := xerc.so; + xer(30) := xerc.ov; + xer(29) := xerc.ca; + xer(19) := xerc.ov32; + xer(18) := xerc.ca32; + xer(17 downto 0) := ctrl.xer_low; + report "XER 00000000" & to_hstring(xer); assert false report "end of test" severity failure; end if; end process; diff --git a/execute1.vhdl b/execute1.vhdl index 955a1da..b955b75 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -41,7 +41,7 @@ entity execute1 is bypass_data : out bypass_data_t; bypass_cr_data : out cr_bypass_data_t; - dbg_msr_out : out std_ulogic_vector(63 downto 0); + dbg_ctrl_out : out ctrl_t; icache_inval : out std_ulogic; terminate_out : out std_ulogic; @@ -99,8 +99,8 @@ architecture behaviour of execute1 is signal mshort_p : std_ulogic_vector(31 downto 0) := (others => '0'); signal valid_in : std_ulogic; - signal ctrl: ctrl_t; - signal ctrl_tmp: ctrl_t; + signal ctrl: ctrl_t := ctrl_t_init; + signal ctrl_tmp: ctrl_t := ctrl_t_init; signal right_shift, rot_clear_left, rot_clear_right: std_ulogic; signal rot_sign_ext: std_ulogic; signal rotator_result: std_ulogic_vector(63 downto 0); @@ -249,6 +249,13 @@ architecture behaviour of execute1 is return x(n - 1) = '1'; end; + function assemble_xer(xerc: xer_common_t; xer_low: std_ulogic_vector) + return std_ulogic_vector is + begin + return 32x"0" & xerc.so & xerc.ov & xerc.ca & "000000000" & + xerc.ov32 & xerc.ca32 & xer_low(17 downto 0); + end; + -- Tell vivado to keep the hierarchy for the random module so that the -- net names in the xdc file match. 
attribute keep_hierarchy : string; @@ -336,7 +343,7 @@ begin ); end generate; - dbg_msr_out <= ctrl.msr; + dbg_ctrl_out <= ctrl; log_rd_addr <= r.log_addr_spr; a_in <= e_in.read_data1; @@ -402,9 +409,7 @@ begin if rising_edge(clk) then if rst = '1' then r <= reg_type_init; - ctrl.tb <= (others => '0'); - ctrl.dec <= (others => '0'); - ctrl.cfar <= (others => '0'); + ctrl <= ctrl_t_init; ctrl.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0'); else r <= rin; @@ -1043,19 +1048,11 @@ begin "=" & to_hstring(a_in); if is_fast_spr(e_in.read_reg1) = '1' then spr_val := a_in; - if decode_spr_num(e_in.insn) = SPR_XER then - -- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer - spr_val(63 downto 32) := (others => '0'); - spr_val(63-32) := xerc_in.so; - spr_val(63-33) := xerc_in.ov; - spr_val(63-34) := xerc_in.ca; - spr_val(63-35 downto 63-43) := "000000000"; - spr_val(63-44) := xerc_in.ov32; - spr_val(63-45) := xerc_in.ca32; - end if; else spr_val := c_in; case decode_spr_num(e_in.insn) is + when SPR_XER => + spr_val := assemble_xer(xerc_in, ctrl.xer_low); when SPR_TB => spr_val := ctrl.tb; when SPR_TBU => @@ -1118,17 +1115,16 @@ begin when OP_MTSPR => report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & "=" & to_hstring(c_in); - if is_fast_spr(e_in.write_reg) then - if decode_spr_num(e_in.insn) = SPR_XER then + if is_fast_spr(e_in.write_reg) = '0' then + -- slow spr + case decode_spr_num(e_in.insn) is + when SPR_XER => v.e.xerc.so := c_in(63-32); v.e.xerc.ov := c_in(63-33); v.e.xerc.ca := c_in(63-34); v.e.xerc.ov32 := c_in(63-44); v.e.xerc.ca32 := c_in(63-45); - end if; - else - -- slow spr - case decode_spr_num(e_in.insn) is + ctrl_tmp.xer_low <= c_in(17 downto 0); when SPR_DEC => ctrl_tmp.dec <= c_in; when 724 => -- LOG_ADDR SPR diff --git a/register_file.vhdl b/register_file.vhdl index b5e7246..ab35855 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -143,7 +143,6 @@ begin report "LR " & 
to_hstring(registers(to_integer(unsigned(fast_spr_num(SPR_LR))))); report "CTR " & to_hstring(registers(to_integer(unsigned(fast_spr_num(SPR_CTR))))); - report "XER " & to_hstring(registers(to_integer(unsigned(fast_spr_num(SPR_XER))))); sim_dump_done <= '1'; else sim_dump_done <= '0'; diff --git a/scripts/run_test.sh b/scripts/run_test.sh index 9fcb7ce..185c3a6 100755 --- a/scripts/run_test.sh +++ b/scripts/run_test.sh @@ -21,9 +21,9 @@ cd $TMPDIR cp ${MICROWATT_DIR}/tests/${TEST}.bin main_ram.bin -${MICROWATT_DIR}/core_tb | sed 's/.*: //' | egrep '^(GPR[0-9]|LR |CTR |XER |CR [0-9])' | sort | grep -v GPR31 | grep -v XER > test.out || true +${MICROWATT_DIR}/core_tb | sed 's/.*: //' | egrep '^(GPR[0-9]|LR |CTR |XER |CR [0-9])' | sort | grep -v GPR31 > test.out || true -grep -v "^$" ${MICROWATT_DIR}/tests/${TEST}.out | sort | grep -v GPR31 | grep -v XER > exp.out +grep -v "^$" ${MICROWATT_DIR}/tests/${TEST}.out | sort | grep -v GPR31 > exp.out cp test.out /tmp cp exp.out /tmp From 813e2317bf1f1c10d988f660c0a4282da316a3b9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 18 Jun 2022 16:24:30 +1000 Subject: [PATCH 03/30] execute1: Restructure to separate out execution of side effects We now have a record that represents the actions taken in executing an instruction, and a process that computes that for the incoming instruction. We no longer have 'current' or 'r.cur_instr', instead things like the destination register are put into r.e in the first cycle of an instruction and not reinitialized in subsequent busy cycles. For mfspr and mtspr, we now decode "slow" SPR numbers (those SPRs that are not stored in the register file) to a new "spr_selector" record in decode1 (excluding those in the loadstore unit). With this, the result for mfspr is determined in the data path. 
Signed-off-by: Paul Mackerras --- common.vhdl | 25 +- cr_file.vhdl | 6 +- decode1.vhdl | 38 ++- decode2.vhdl | 20 +- execute1.vhdl | 903 ++++++++++++++++++++++++++++---------------------- 5 files changed, 579 insertions(+), 413 deletions(-) diff --git a/common.vhdl b/common.vhdl index bab5aed..7ecf4e2 100644 --- a/common.vhdl +++ b/common.vhdl @@ -124,6 +124,23 @@ package common is end record; constant xerc_init : xer_common_t := (others => '0'); + subtype spr_selector is std_ulogic_vector(2 downto 0); + type spr_id is record + sel : spr_selector; + valid : std_ulogic; + ispmu : std_ulogic; + end record; + constant spr_id_init : spr_id := (sel => "000", others => '0'); + + constant SPRSEL_TB : spr_selector := 3x"0"; + constant SPRSEL_TBU : spr_selector := 3x"1"; + constant SPRSEL_DEC : spr_selector := 3x"2"; + constant SPRSEL_PVR : spr_selector := 3x"3"; + constant SPRSEL_LOGA : spr_selector := 3x"4"; + constant SPRSEL_LOGD : spr_selector := 3x"5"; + constant SPRSEL_CFAR : spr_selector := 3x"6"; + constant SPRSEL_XER : spr_selector := 3x"7"; + -- FPSCR bit numbers constant FPSCR_FX : integer := 63 - 32; constant FPSCR_FEX : integer := 63 - 33; @@ -235,11 +252,13 @@ package common is decode: decode_rom_t; br_pred: std_ulogic; -- Branch was predicted to be taken big_endian: std_ulogic; + spr_info : spr_id; end record; constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), ispr1 => (others => '0'), ispr2 => (others => '0'), ispro => (others => '0'), - decode => decode_rom_init, br_pred => '0', big_endian => '0'); + decode => decode_rom_init, br_pred => '0', big_endian => '0', + spr_info => spr_id_init); type Decode1ToFetch1Type is record redirect : std_ulogic; @@ -299,6 +318,7 @@ package common is sub_select : std_ulogic_vector(2 downto 0); -- sub-result selection repeat : std_ulogic; -- set if instruction is cracked into two ops second : std_ulogic; -- set if this is the second op + 
spr_select : spr_id; end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init, @@ -311,7 +331,8 @@ package common is read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), result_sel => "000", sub_select => "000", - repeat => '0', second => '0', others => (others => '0')); + repeat => '0', second => '0', spr_select => spr_id_init, + others => (others => '0')); type MultiplyInputType is record valid: std_ulogic; diff --git a/cr_file.vhdl b/cr_file.vhdl index d1aedba..940b95b 100644 --- a/cr_file.vhdl +++ b/cr_file.vhdl @@ -66,7 +66,11 @@ begin crs <= crs_updated; end if; if w_in.write_xerc_enable = '1' then - report "Writing XERC"; + report "Writing XERC SO=" & std_ulogic'image(xerc_updated.so) & + " OV=" & std_ulogic'image(xerc_updated.ov) & + " CA=" & std_ulogic'image(xerc_updated.ca) & + " OV32=" & std_ulogic'image(xerc_updated.ov32) & + " CA32=" & std_ulogic'image(xerc_updated.ca32); xerc <= xerc_updated; end if; end if; diff --git a/decode1.vhdl b/decode1.vhdl index baf4347..fb92b9e 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -519,6 +519,40 @@ architecture behaviour of decode1 is constant nop_instr : decode_rom_t := (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE); constant fetch_fail_inst: decode_rom_t := (LDST, NONE, OP_FETCH_FAILED, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE); + function map_spr(sprn : spr_num_t) return spr_id is + variable i : spr_id; + begin + i.sel := "000"; + i.valid := '1'; + i.ispmu := '0'; + case sprn is + when SPR_TB => + i.sel := SPRSEL_TB; + when SPR_TBU => + i.sel := SPRSEL_TBU; + when SPR_DEC => + i.sel := SPRSEL_DEC; + when SPR_PVR => + i.sel := 
SPRSEL_PVR; + when 724 => -- LOG_ADDR SPR + i.sel := SPRSEL_LOGA; + when 725 => -- LOG_DATA SPR + i.sel := SPRSEL_LOGD; + when SPR_UPMC1 | SPR_UPMC2 | SPR_UPMC3 | SPR_UPMC4 | SPR_UPMC5 | SPR_UPMC6 | + SPR_UMMCR0 | SPR_UMMCR1 | SPR_UMMCR2 | SPR_UMMCRA | SPR_USIER | SPR_USIAR | SPR_USDAR | + SPR_PMC1 | SPR_PMC2 | SPR_PMC3 | SPR_PMC4 | SPR_PMC5 | SPR_PMC6 | + SPR_MMCR0 | SPR_MMCR1 | SPR_MMCR2 | SPR_MMCRA | SPR_SIER | SPR_SIAR | SPR_SDAR => + i.ispmu := '1'; + when SPR_CFAR => + i.sel := SPRSEL_CFAR; + when SPR_XER => + i.sel := SPRSEL_XER; + when others => + i.valid := '0'; + end case; + return i; + end; + begin decode1_0: process(clk) begin @@ -586,6 +620,9 @@ begin majorop := unsigned(f_in.insn(31 downto 26)); v.decode := major_decode_rom_array(to_integer(majorop)); + sprn := decode_spr_num(f_in.insn); + v.spr_info := map_spr(sprn); + case to_integer(unsigned(majorop)) is when 4 => -- major opcode 4, mostly VMX/VSX stuff but also some integer ops (madd*) @@ -598,7 +635,6 @@ begin v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1)))); -- Work out ispr1/ispro independent of v.decode since they seem to be critical path - sprn := decode_spr_num(f_in.insn); v.ispr1 := fast_spr_num(sprn); v.ispro := fast_spr_num(sprn); diff --git a/decode2.vhdl b/decode2.vhdl index 5aa1a6f..8998f2b 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -228,13 +228,6 @@ architecture behaviour of decode2 is OP_SHR => "010", OP_EXTSWSLI => "010", OP_MUL_L64 => "011", -- muldiv_result - OP_MUL_H64 => "011", - OP_MUL_H32 => "011", - OP_DIV => "011", - OP_DIVE => "011", - OP_MOD => "011", - OP_CNTZ => "100", -- countbits_result - OP_POPCNT => "100", OP_MFSPR => "101", -- spr_result OP_B => "110", -- next_nia OP_BC => "110", @@ -440,6 +433,8 @@ begin decoded_reg_o.reg(0) := not r.repeat; end if; + v.e.spr_select := d_in.spr_info; + r_out.read1_enable <= decoded_reg_a.reg_valid and d_in.valid; r_out.read1_reg <= decoded_reg_a.reg; r_out.read2_enable <= decoded_reg_b.reg_valid and 
d_in.valid; @@ -496,6 +491,17 @@ begin v.e.result_sel := "000"; -- select adder output end if; end if; + if op = OP_MFSPR then + if is_fast_spr(d_in.ispr1) = '1' then + v.e.result_sel := "000"; -- adder_result, effectively a_in + elsif d_in.spr_info.valid = '0' then + -- Privileged mfspr to invalid/unimplemented SPR numbers + -- writes the contents of RT back to RT (i.e. it's a no-op) + v.e.result_sel := "001"; -- logical_result + elsif d_in.spr_info.ispmu = '1' then + v.e.result_sel := "100"; -- pmuspr_result + end if; + end if; -- See if any of the operands can get their value via the bypass path. case gpr_a_bypass is diff --git a/execute1.vhdl b/execute1.vhdl index b955b75..21f6f8f 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -62,7 +62,6 @@ end entity execute1; architecture behaviour of execute1 is type reg_type is record e : Execute1ToWritebackType; - cur_instr : Decode2ToExecute1Type; busy: std_ulogic; terminate: std_ulogic; intr_pending : std_ulogic; @@ -70,6 +69,8 @@ architecture behaviour of execute1 is trace_next : std_ulogic; prev_op : insn_type_t; br_taken : std_ulogic; + oe : std_ulogic; + mul_select : std_ulogic_vector(1 downto 0); mul_in_progress : std_ulogic; mul_finish : std_ulogic; div_in_progress : std_ulogic; @@ -83,15 +84,42 @@ architecture behaviour of execute1 is end record; constant reg_type_init : reg_type := (e => Execute1ToWritebackInit, - cur_instr => Decode2ToExecute1Init, busy => '0', terminate => '0', intr_pending => '0', fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, br_taken => '0', + oe => '0', mul_select => "00", mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0', no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0', taken_branch_event => '0', br_mispredict => '0', others => (others => '0')); + type actions_type is record + e : Execute1ToWritebackType; + complete : std_ulogic; + exception : std_ulogic; + trap : std_ulogic; + terminate : std_ulogic; + 
write_msr : std_ulogic; + new_msr : std_ulogic_vector(63 downto 0); + write_xerlow : std_ulogic; + write_pmuspr : std_ulogic; + write_dec : std_ulogic; + write_loga : std_ulogic; + inc_loga : std_ulogic; + write_cfar : std_ulogic; + take_branch : std_ulogic; + direct_branch : std_ulogic; + start_mul : std_ulogic; + start_div : std_ulogic; + start_cntz : std_ulogic; + do_trace : std_ulogic; + fp_intr : std_ulogic; + icache_inval : std_ulogic; + end record; + constant actions_type_init : actions_type := + (e => Execute1ToWritebackInit, new_msr => (others => '0'), others => '0'); + signal r, rin : reg_type; + signal actions : actions_type; signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0); signal cr_in : std_ulogic_vector(31 downto 0); @@ -112,9 +140,9 @@ architecture behaviour of execute1 is signal adder_result: std_ulogic_vector(63 downto 0); signal misc_result: std_ulogic_vector(63 downto 0); signal muldiv_result: std_ulogic_vector(63 downto 0); + signal shortmul_result: std_ulogic_vector(63 downto 0); signal spr_result: std_ulogic_vector(63 downto 0); signal next_nia : std_ulogic_vector(63 downto 0); - signal current: Decode2ToExecute1Type; signal carry_32 : std_ulogic; signal carry_64 : std_ulogic; @@ -369,7 +397,7 @@ begin br_taken_complete => r.taken_branch_event, br_mispredict => r.br_mispredict, others => '0'); - x_to_pmu.nia <= current.nia; + x_to_pmu.nia <= e_in.nia; x_to_pmu.addr <= (others => '0'); x_to_pmu.addr_v <= '0'; x_to_pmu.spr_num <= e_in.insn(20 downto 16); @@ -381,7 +409,7 @@ begin -- (SO, OV[32] and CA[32]) are only modified by instructions that are -- handled here, we can just forward the result being sent to -- writeback. 
- xerc_in <= r.e.xerc when r.e.write_xerc_enable = '1' or r.busy = '1' else e_in.xerc; + xerc_in <= r.e.xerc when (r.e.write_xerc_enable and r.e.valid) = '1' else e_in.xerc; with e_in.unit select busy_out <= l_in.busy or r.busy or fp_in.busy when LDST, @@ -391,15 +419,24 @@ begin terminate_out <= r.terminate; - current <= e_in when r.busy = '0' else r.cur_instr; + -- Slow SPR read mux + with e_in.spr_select.sel select spr_result <= + ctrl.tb when SPRSEL_TB, + 32x"0" & ctrl.tb(63 downto 32) when SPRSEL_TBU, + ctrl.dec when SPRSEL_DEC, + 32x"0" & PVR_MICROWATT when SPRSEL_PVR, + log_wr_addr & r.log_addr_spr when SPRSEL_LOGA, + log_rd_data when SPRSEL_LOGD, + ctrl.cfar when SPRSEL_CFAR, + assemble_xer(xerc_in, ctrl.xer_low) when others; -- Result mux - with current.result_sel select alu_result <= + with e_in.result_sel select alu_result <= adder_result when "000", logical_result when "001", rotator_result when "010", - muldiv_result when "011", - countbits_result when "100", + shortmul_result when "011", + pmu_to_x.spr_val when "100", spr_result when "101", next_nia when "110", misc_result when others; @@ -545,13 +582,10 @@ begin x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0)); end if; - case current.sub_select(1 downto 0) is + shortmul_result <= std_ulogic_vector(resize(signed(mshort_p), 64)); + case r.mul_select is when "00" => - if HAS_SHORT_MULT and r.mul_in_progress = '0' then - muldiv_result <= std_ulogic_vector(resize(signed(mshort_p), 64)); - else - muldiv_result <= multiply_to_x.result(63 downto 0); - end if; + muldiv_result <= multiply_to_x.result(63 downto 0); when "01" => muldiv_result <= multiply_to_x.result(127 downto 64); when "10" => @@ -562,7 +596,7 @@ begin end case; -- Compute misc_result - case current.sub_select is + case e_in.sub_select is when "000" => misc_result <= (others => '0'); when "001" => @@ -684,7 +718,7 @@ begin bf := insn_bf(e_in.insn); crnum := to_integer(unsigned(bf)); newcrf := (others => '0'); - case 
current.sub_select is + case e_in.sub_select is when "000" => -- CMP and CMPL instructions if e_in.is_signed = '1' then @@ -697,7 +731,7 @@ begin when "010" => newcrf := ppc_cmpeqb(a_in, b_in); when "011" => - if current.insn(1) = '1' then + if e_in.insn(1) = '1' then -- CR logical instructions j := (7 - crnum) * 4; newcrf := cr_in(j + 3 downto j); @@ -728,7 +762,7 @@ begin newcrf := xerc_in.ov & xerc_in.ov32 & xerc_in.ca & xerc_in.ca32; when others => end case; - if current.insn_type = OP_MTCRF then + if e_in.insn_type = OP_MTCRF then if e_in.insn(20) = '0' then -- mtcrf write_cr_mask <= insn_fxm(e_in.insn); @@ -737,201 +771,86 @@ begin crnum := fxm_to_num(insn_fxm(e_in.insn)); write_cr_mask <= num_to_fxm(crnum); end if; - write_cr_data <= c_in(31 downto 0); else write_cr_mask <= num_to_fxm(crnum); - write_cr_data <= newcrf & newcrf & newcrf & newcrf & - newcrf & newcrf & newcrf & newcrf; end if; + for i in 0 to 7 loop + if write_cr_mask(i) = '0' then + write_cr_data(i*4 + 3 downto i*4) <= cr_in(i*4 + 3 downto i*4); + elsif e_in.insn_type = OP_MTCRF then + write_cr_data(i*4 + 3 downto i*4) <= c_in(i*4 + 3 downto i*4); + else + write_cr_data(i*4 + 3 downto i*4) <= newcrf; + end if; + end loop; end process; - execute1_1: process(all) - variable v : reg_type; + execute1_actions: process(all) + variable v: actions_type; variable bo, bi : std_ulogic_vector(4 downto 0); - variable overflow : std_ulogic; - variable lv : Execute1ToLoadstore1Type; - variable irq_valid : std_ulogic; - variable exception : std_ulogic; variable illegal : std_ulogic; - variable is_branch : std_ulogic; - variable is_direct_branch : std_ulogic; - variable taken_branch : std_ulogic; - variable abs_branch : std_ulogic; - variable spr_val : std_ulogic_vector(63 downto 0); - variable do_trace : std_ulogic; - variable hold_wr_data : std_ulogic; - variable fv : Execute1ToFPUType; + variable privileged : std_ulogic; + variable slow_op : std_ulogic; begin - is_branch := '0'; - is_direct_branch := '0'; - 
taken_branch := '0'; - abs_branch := '0'; - hold_wr_data := '0'; - - v := r; - v.e := Execute1ToWritebackInit; + v := actions_type_init; + v.e.write_data := alu_result; + v.e.write_reg := e_in.write_reg; + v.e.write_enable := e_in.write_reg_enable; + v.e.rc := e_in.rc; + v.e.write_cr_data := write_cr_data; + v.e.write_cr_mask := write_cr_mask; + v.e.write_cr_enable := e_in.output_cr; + v.e.write_xerc_enable := e_in.output_xer; + v.e.xerc := xerc_in; + v.new_msr := ctrl.msr; + v.e.write_xerc_enable := e_in.output_xer; v.e.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) & not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF); - v.e.xerc := xerc_in; - - lv := Execute1ToLoadstore1Init; - fv := Execute1ToFPUInit; - - x_to_multiply.valid <= '0'; - x_to_divider.valid <= '0'; - v.mul_in_progress := '0'; - v.div_in_progress := '0'; - v.cntz_in_progress := '0'; - v.mul_finish := '0'; - v.ext_interrupt := '0'; - v.taken_branch_event := '0'; - v.br_mispredict := '0'; - - x_to_pmu.mfspr <= '0'; - x_to_pmu.mtspr <= '0'; - x_to_pmu.tbbits(3) <= ctrl.tb(63 - 47); - x_to_pmu.tbbits(2) <= ctrl.tb(63 - 51); - x_to_pmu.tbbits(1) <= ctrl.tb(63 - 55); - x_to_pmu.tbbits(0) <= ctrl.tb(63 - 63); - x_to_pmu.pmm_msr <= ctrl.msr(MSR_PMM); - x_to_pmu.pr_msr <= ctrl.msr(MSR_PR); - - spr_result <= (others => '0'); - spr_val := (others => '0'); - - ctrl_tmp <= ctrl; - -- FIXME: run at 512MHz not core freq - ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1); - ctrl_tmp.dec <= std_ulogic_vector(unsigned(ctrl.dec) - 1); - - irq_valid := ctrl.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in); - - v.terminate := '0'; - icache_inval <= '0'; - v.busy := '0'; - - -- Next insn adder used in a couple of places - next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4); - - -- rotator control signals - right_shift <= '1' when e_in.insn_type = OP_SHR else '0'; - rot_clear_left <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0'; - rot_clear_right <= '1' when e_in.insn_type = 
OP_RLC or e_in.insn_type = OP_RLCR else '0'; - rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0'; - - do_popcnt <= '1' when e_in.insn_type = OP_POPCNT else '0'; + v.e.intr_vec := 16#700#; + v.e.mode_32bit := not ctrl.msr(MSR_SF); + v.e.instr_tag := e_in.instr_tag; + v.e.last_nia := e_in.nia; + v.e.br_offset := 64x"4"; + + -- Note the difference between v.exception and v.trap: + -- v.exception signals a condition that prevents execution of the + -- instruction, and hence shouldn't depend on operand data, so as to + -- avoid timing chains through both data and control paths. + -- v.trap also means we want to generate an interrupt, but doesn't + -- cancel instruction execution (hence we need to avoid setting any + -- side-effect flags or write enables when generating a trap). + -- With v.trap = 1 we will assert both r.e.valid and r.e.interrupt + -- to writeback, and it will complete the instruction and take + -- an interrupt. It is OK for v.trap to depend on operand data. illegal := '0'; - if r.intr_pending = '1' then - v.e.srr1 := r.e.srr1; - v.e.intr_vec := r.e.intr_vec; - end if; - if valid_in = '1' then - v.e.last_nia := e_in.nia; - else - v.e.last_nia := r.e.last_nia; - end if; - - v.e.mode_32bit := not ctrl.msr(MSR_SF); - v.e.instr_tag := current.instr_tag; + privileged := '0'; + slow_op := '0'; - do_trace := valid_in and ctrl.msr(MSR_SE); - if valid_in = '1' then - v.cur_instr := e_in; - v.prev_op := e_in.insn_type; + if ctrl.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then + privileged := '1'; end if; - -- Determine if there is any interrupt to be taken - -- before/instead of executing this instruction - exception := r.intr_pending; - if valid_in = '1' and e_in.second = '0' and r.intr_pending = '0' then - if HAS_FPU and r.fp_exception_next = '1' then - -- This is used for FP-type program interrupts that - -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero. 
- exception := '1'; - v.e.intr_vec := 16#700#; - v.e.srr1(47 - 43) := '1'; - v.e.srr1(47 - 47) := '1'; - elsif r.trace_next = '1' then - -- Generate a trace interrupt rather than executing the next instruction - -- or taking any asynchronous interrupt - exception := '1'; - v.e.intr_vec := 16#d00#; - v.e.srr1(47 - 33) := '1'; - if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or - r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then - v.e.srr1(47 - 35) := '1'; - elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then - v.e.srr1(47 - 36) := '1'; - end if; - - elsif irq_valid = '1' then - -- Don't deliver the interrupt until we have a valid instruction - -- coming in, so we have a valid NIA to put in SRR0. - if pmu_to_x.intr = '1' then - v.e.intr_vec := 16#f00#; - report "IRQ valid: PMU"; - elsif ctrl.dec(63) = '1' then - v.e.intr_vec := 16#900#; - report "IRQ valid: DEC"; - elsif ext_irq_in = '1' then - v.e.intr_vec := 16#500#; - report "IRQ valid: External"; - v.ext_interrupt := '1'; - end if; - exception := '1'; - - elsif ctrl.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then - -- generate a program interrupt - exception := '1'; - v.e.intr_vec := 16#700#; - -- set bit 45 to indicate privileged instruction type interrupt - v.e.srr1(47 - 45) := '1'; - report "privileged instruction"; - - elsif not HAS_FPU and e_in.fac = FPU then - -- make lfd/stfd/lfs/stfs etc. illegal in no-FPU implementations - illegal := '1'; - - elsif HAS_FPU and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then - -- generate a floating-point unavailable interrupt - exception := '1'; - v.e.intr_vec := 16#800#; - report "FP unavailable interrupt"; - end if; + if (not HAS_FPU and e_in.fac = FPU) or e_in.unit = NONE then + -- make lfd/stfd/lfs/stfs etc. 
illegal in no-FPU implementations + illegal := '1'; end if; - if exception = '1' and l_in.in_progress = '1' then - -- We can't send this interrupt to writeback yet because there are - -- still instructions in loadstore1 that haven't completed. - v.intr_pending := '1'; - v.busy := '1'; - end if; - if l_in.interrupt = '1' then - v.intr_pending := '0'; - end if; - - v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or r.busy or fp_in.busy); - v.instr_dispatch := valid_in and not exception and not illegal; - - if valid_in = '1' and exception = '0' and illegal = '0' and e_in.unit = ALU then - v.e.valid := '1'; - - case_0: case e_in.insn_type is + v.do_trace := ctrl.msr(MSR_SE); + case_0: case e_in.insn_type is when OP_ILLEGAL => - -- we need two cycles to write srr0 and 1 - -- will need more when we have to write HEIR illegal := '1'; when OP_SC => -- check bit 1 of the instruction is 1 so we know this is sc; -- 0 would mean scv, so generate an illegal instruction interrupt - -- we need two cycles to write srr0 and 1 if e_in.insn(1) = '1' then - exception := '1'; + v.trap := '1'; v.e.intr_vec := 16#C00#; v.e.last_nia := next_nia; - report "sc"; + if e_in.valid = '1' then + report "sc"; + end if; else illegal := '1'; end if; @@ -940,12 +859,14 @@ begin -- if not then it is illegal if e_in.insn(10 downto 1) = "0100000000" then v.terminate := '1'; - report "ATTN"; + if e_in.valid = '1' then + report "ATTN"; + end if; else illegal := '1'; end if; when OP_NOP | OP_DCBF | OP_DCBST | OP_DCBT | OP_DCBTST | OP_ICBT => - -- Do nothing + -- Do nothing when OP_ADD => if e_in.output_carry = '1' then if e_in.input_carry /= OV then @@ -966,27 +887,34 @@ begin v.e.srr1(47 - 46) := '1'; if or (trapval and insn_to(e_in.insn)) = '1' then -- generate trap-type program interrupt - exception := '1'; - report "trap"; + v.trap := '1'; + if e_in.valid = '1' then + report "trap"; + end if; end if; when OP_ADDG6S => when OP_CMPRB => when OP_CMPEQB => when OP_AND | OP_OR | OP_XOR 
| OP_PRTY | OP_CMPB | OP_EXTS | - OP_BPERM | OP_BCD => + OP_BPERM | OP_BCD => when OP_B => - is_branch := '1'; - taken_branch := '1'; - is_direct_branch := '1'; - abs_branch := e_in.br_abs; + v.take_branch := '1'; + v.direct_branch := '1'; + v.e.br_last := '1'; + v.e.br_taken := '1'; + v.e.br_offset := b_in; + v.e.abs_br := insn_aa(e_in.insn); + if e_in.br_pred = '0' then + -- should never happen + v.e.redirect := '1'; + end if; if ctrl.msr(MSR_BE) = '1' then - do_trace := '1'; + v.do_trace := '1'; end if; - v.taken_branch_event := '1'; - when OP_BC | OP_BCREG => + v.write_cfar := '1'; + when OP_BC => -- read_data1 is CTR - -- for OP_BCREG, read_data2 is target register (CTR, LR or TAR) -- If this instruction updates both CTR and LR, then it is -- doubled; the first instruction decrements CTR and determines -- whether the branch is taken, and the second does the @@ -994,21 +922,52 @@ begin bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); if e_in.second = '0' then - taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); + v.take_branch := ppc_bc_taken(bo, bi, cr_in, a_in); else - taken_branch := r.br_taken; + v.take_branch := r.br_taken; + end if; + if v.take_branch = '1' then + v.e.br_offset := b_in; + v.e.abs_br := insn_aa(e_in.insn); end if; - v.br_taken := taken_branch; - v.taken_branch_event := taken_branch; - abs_branch := e_in.br_abs; if e_in.repeat = '0' or e_in.second = '1' then - is_branch := '1'; - if e_in.insn_type = OP_BC then - is_direct_branch := '1'; + -- Mispredicted branches cause a redirect + if v.take_branch /= e_in.br_pred then + v.e.redirect := '1'; end if; + v.direct_branch := '1'; + v.e.br_last := '1'; + v.e.br_taken := v.take_branch; if ctrl.msr(MSR_BE) = '1' then - do_trace := '1'; + v.do_trace := '1'; end if; + v.write_cfar := v.take_branch; + end if; + when OP_BCREG => + -- read_data1 is CTR, read_data2 is target register (CTR, LR or TAR) + -- If this instruction updates both CTR and LR, then it is + -- doubled; the first instruction 
decrements CTR and determines + -- whether the branch is taken, and the second does the + -- redirect and the LR update. + bo := insn_bo(e_in.insn); + bi := insn_bi(e_in.insn); + if e_in.second = '0' then + v.take_branch := ppc_bc_taken(bo, bi, cr_in, a_in); + else + v.take_branch := r.br_taken; + end if; + if v.take_branch = '1' then + v.e.br_offset := b_in; + v.e.abs_br := '1'; + end if; + if e_in.repeat = '0' or e_in.second = '1' then + -- Indirect branches are never predicted taken + v.e.redirect := v.take_branch; + v.e.br_taken := v.take_branch; + if ctrl.msr(MSR_BE) = '1' then + v.do_trace := '1'; + end if; + v.write_cfar := v.take_branch; end if; when OP_RFID => @@ -1016,131 +975,115 @@ begin not a_in(MSR_LE) & not a_in(MSR_SF); -- Can't use msr_copy here because the partial function MSR -- bits should be left unchanged, not zeroed. - ctrl_tmp.msr(63 downto 31) <= a_in(63 downto 31); - ctrl_tmp.msr(26 downto 22) <= a_in(26 downto 22); - ctrl_tmp.msr(15 downto 0) <= a_in(15 downto 0); + v.new_msr(63 downto 31) := a_in(63 downto 31); + v.new_msr(26 downto 22) := a_in(26 downto 22); + v.new_msr(15 downto 0) := a_in(15 downto 0); if a_in(MSR_PR) = '1' then - ctrl_tmp.msr(MSR_EE) <= '1'; - ctrl_tmp.msr(MSR_IR) <= '1'; - ctrl_tmp.msr(MSR_DR) <= '1'; + v.new_msr(MSR_EE) := '1'; + v.new_msr(MSR_IR) := '1'; + v.new_msr(MSR_DR) := '1'; end if; - -- mark this as a branch so CFAR gets updated - is_branch := '1'; - taken_branch := '1'; - abs_branch := '1'; + v.write_msr := '1'; + v.e.br_offset := b_in; + v.e.abs_br := '1'; + v.e.redirect := '1'; + v.write_cfar := '1'; if HAS_FPU then - v.fp_exception_next := fp_in.exception and - (a_in(MSR_FE0) or a_in(MSR_FE1)); + v.fp_intr := fp_in.exception and + (a_in(MSR_FE0) or a_in(MSR_FE1)); end if; - do_trace := '0'; + v.do_trace := '0'; when OP_CNTZ | OP_POPCNT => - v.e.valid := '0'; - v.cntz_in_progress := '1'; - v.busy := '1'; + slow_op := '1'; + v.start_cntz := '1'; when OP_ISEL => when OP_CROP => when OP_MCRXRX => when 
OP_DARN => when OP_MFMSR => when OP_MFSPR => - report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & - "=" & to_hstring(a_in); if is_fast_spr(e_in.read_reg1) = '1' then - spr_val := a_in; - else - spr_val := c_in; - case decode_spr_num(e_in.insn) is - when SPR_XER => - spr_val := assemble_xer(xerc_in, ctrl.xer_low); - when SPR_TB => - spr_val := ctrl.tb; - when SPR_TBU => - spr_val(63 downto 32) := (others => '0'); - spr_val(31 downto 0) := ctrl.tb(63 downto 32); - when SPR_DEC => - spr_val := ctrl.dec; - when SPR_CFAR => - spr_val := ctrl.cfar; - when SPR_PVR => - spr_val(63 downto 32) := (others => '0'); - spr_val(31 downto 0) := PVR_MICROWATT; - when 724 => -- LOG_ADDR SPR - spr_val := log_wr_addr & r.log_addr_spr; - when 725 => -- LOG_DATA SPR - spr_val := log_rd_data; - v.log_addr_spr := std_ulogic_vector(unsigned(r.log_addr_spr) + 1); - when SPR_UPMC1 | SPR_UPMC2 | SPR_UPMC3 | SPR_UPMC4 | SPR_UPMC5 | SPR_UPMC6 | - SPR_UMMCR0 | SPR_UMMCR1 | SPR_UMMCR2 | SPR_UMMCRA | SPR_USIER | SPR_USIAR | SPR_USDAR | - SPR_PMC1 | SPR_PMC2 | SPR_PMC3 | SPR_PMC4 | SPR_PMC5 | SPR_PMC6 | - SPR_MMCR0 | SPR_MMCR1 | SPR_MMCR2 | SPR_MMCRA | SPR_SIER | SPR_SIAR | SPR_SDAR => - x_to_pmu.mfspr <= '1'; - spr_val := pmu_to_x.spr_val; - when others => - -- mfspr from unimplemented SPRs should be a nop in - -- supervisor mode and a program interrupt for user mode - if is_fast_spr(e_in.read_reg1) = '0' and ctrl.msr(MSR_PR) = '1' then - illegal := '1'; - end if; + if e_in.valid = '1' then + report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & + "=" & to_hstring(a_in); + end if; + elsif e_in.spr_select.valid = '1' then + if e_in.valid = '1' then + report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & + "=" & to_hstring(spr_result); + end if; + case e_in.spr_select.sel is + when SPRSEL_LOGD => + v.inc_loga := '1'; + when others => end case; + else + -- mfspr from unimplemented SPRs should be a nop in + -- supervisor mode and a program interrupt for 
user mode + if e_in.valid = '1' then + report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & + " invalid"; + end if; + if ctrl.msr(MSR_PR) = '1' then + illegal := '1'; + end if; end if; - spr_result <= spr_val; when OP_MFCR => when OP_MTCRF => when OP_MTMSRD => + v.write_msr := '1'; if e_in.insn(16) = '1' then -- just update EE and RI - ctrl_tmp.msr(MSR_EE) <= c_in(MSR_EE); - ctrl_tmp.msr(MSR_RI) <= c_in(MSR_RI); + v.new_msr(MSR_EE) := c_in(MSR_EE); + v.new_msr(MSR_RI) := c_in(MSR_RI); else -- Architecture says to leave out bits 3 (HV), 51 (ME) -- and 63 (LE) (IBM bit numbering) if e_in.is_32bit = '0' then - ctrl_tmp.msr(63 downto 61) <= c_in(63 downto 61); - ctrl_tmp.msr(59 downto 32) <= c_in(59 downto 32); + v.new_msr(63 downto 61) := c_in(63 downto 61); + v.new_msr(59 downto 32) := c_in(59 downto 32); end if; - ctrl_tmp.msr(31 downto 13) <= c_in(31 downto 13); - ctrl_tmp.msr(11 downto 1) <= c_in(11 downto 1); + v.new_msr(31 downto 13) := c_in(31 downto 13); + v.new_msr(11 downto 1) := c_in(11 downto 1); if c_in(MSR_PR) = '1' then - ctrl_tmp.msr(MSR_EE) <= '1'; - ctrl_tmp.msr(MSR_IR) <= '1'; - ctrl_tmp.msr(MSR_DR) <= '1'; + v.new_msr(MSR_EE) := '1'; + v.new_msr(MSR_IR) := '1'; + v.new_msr(MSR_DR) := '1'; end if; if HAS_FPU then - v.fp_exception_next := fp_in.exception and - (c_in(MSR_FE0) or c_in(MSR_FE1)); + v.fp_intr := fp_in.exception and + (c_in(MSR_FE0) or c_in(MSR_FE1)); end if; end if; when OP_MTSPR => - report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & - "=" & to_hstring(c_in); - if is_fast_spr(e_in.write_reg) = '0' then - -- slow spr - case decode_spr_num(e_in.insn) is - when SPR_XER => - v.e.xerc.so := c_in(63-32); - v.e.xerc.ov := c_in(63-33); - v.e.xerc.ca := c_in(63-34); - v.e.xerc.ov32 := c_in(63-44); - v.e.xerc.ca32 := c_in(63-45); - ctrl_tmp.xer_low <= c_in(17 downto 0); - when SPR_DEC => - ctrl_tmp.dec <= c_in; - when 724 => -- LOG_ADDR SPR - v.log_addr_spr := c_in(31 downto 0); - when SPR_UPMC1 | SPR_UPMC2 | 
SPR_UPMC3 | SPR_UPMC4 | SPR_UPMC5 | SPR_UPMC6 | - SPR_UMMCR0 | SPR_UMMCR2 | SPR_UMMCRA | - SPR_PMC1 | SPR_PMC2 | SPR_PMC3 | SPR_PMC4 | SPR_PMC5 | SPR_PMC6 | - SPR_MMCR0 | SPR_MMCR1 | SPR_MMCR2 | SPR_MMCRA | SPR_SIER | SPR_SIAR | SPR_SDAR => - x_to_pmu.mtspr <= '1'; - when others => - -- mtspr to unimplemented SPRs should be a nop in - -- supervisor mode and a program interrupt for user mode - if ctrl.msr(MSR_PR) = '1' then - illegal := '1'; - end if; - end case; + if e_in.valid = '1' then + report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & + "=" & to_hstring(c_in); + end if; + v.write_pmuspr := e_in.spr_select.ispmu; + if e_in.spr_select.valid = '1' and e_in.spr_select.ispmu = '0' then + case e_in.spr_select.sel is + when SPRSEL_XER => + v.e.xerc.so := c_in(63-32); + v.e.xerc.ov := c_in(63-33); + v.e.xerc.ca := c_in(63-34); + v.e.xerc.ov32 := c_in(63-44); + v.e.xerc.ca32 := c_in(63-45); + v.write_xerlow := '1'; + when SPRSEL_DEC => + v.write_dec := '1'; + when SPRSEL_LOGA => + v.write_loga := '1'; + when others => + end case; + elsif is_fast_spr(e_in.write_reg) = '0' then + -- mtspr to unimplemented SPRs should be a nop in + -- supervisor mode and a program interrupt for user mode + if ctrl.msr(MSR_PR) = '1' then + illegal := '1'; + end if; end if; when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR | OP_EXTSWSLI => if e_in.output_carry = '1' then @@ -1150,13 +1093,12 @@ begin when OP_ISYNC => v.e.redirect := '1'; - v.e.br_offset := std_ulogic_vector(to_unsigned(4, 64)); when OP_ICBI => - icache_inval <= '1'; + v.icache_inval := '1'; - when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 => - if HAS_SHORT_MULT and e_in.insn_type = OP_MUL_L64 and e_in.insn(26) = '1' and + when OP_MUL_L64 => + if HAS_SHORT_MULT and e_in.insn(26) = '1' and fits_in_n_bits(a_in, 16) and fits_in_n_bits(b_in, 16) then -- Operands fit into 16 bits, so use short multiplier if e_in.oe = '1' then @@ -1165,54 +1107,230 @@ begin end if; else -- Use standard multiplier - v.e.valid := '0'; - 
v.mul_in_progress := '1'; - v.busy := '1'; - x_to_multiply.valid <= '1'; + v.start_mul := '1'; + slow_op := '1'; end if; + when OP_MUL_H64 | OP_MUL_H32 => + v.start_mul := '1'; + slow_op := '1'; + when OP_DIV | OP_DIVE | OP_MOD => - v.e.valid := '0'; - v.div_in_progress := '1'; - v.busy := '1'; - x_to_divider.valid <= '1'; + v.start_div := '1'; + slow_op := '1'; + + when OP_FETCH_FAILED => + -- Handling an ITLB miss doesn't count as having executed an instruction + v.do_trace := '0'; when others => - v.terminate := '1'; - report "illegal"; - end case; - - -- Mispredicted branches cause a redirect - if is_branch = '1' then - if taken_branch = '1' then - ctrl_tmp.cfar <= e_in.nia; + if e_in.valid = '1' and e_in.unit = ALU then + report "unhandled insn_type " & insn_type_t'image(e_in.insn_type); end if; - if taken_branch = '1' then - v.e.br_offset := b_in; - v.e.abs_br := abs_branch; - else - v.e.br_offset := std_ulogic_vector(to_unsigned(4, 64)); + end case; + + if privileged = '1' then + -- generate a program interrupt + v.exception := '1'; + -- set bit 45 to indicate privileged instruction type interrupt + v.e.srr1(47 - 45) := '1'; + if e_in.valid = '1' then + report "privileged instruction"; + end if; + + elsif illegal = '1' then + v.exception := '1'; + -- Since we aren't doing Hypervisor emulation assist (0xe40) we + -- set bit 44 to indicate we have an illegal + v.e.srr1(47 - 44) := '1'; + if e_in.valid = '1' then + report "illegal instruction"; + end if; + + elsif HAS_FPU and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then + -- generate a floating-point unavailable interrupt + v.exception := '1'; + v.e.intr_vec := 16#800#; + if e_in.valid = '1' then + report "FP unavailable interrupt"; + end if; + end if; + + if e_in.unit = ALU then + v.complete := e_in.valid and not v.exception and not slow_op; + end if; + + actions <= v; + end process; + + execute1_1: process(all) + variable v : reg_type; + variable overflow : std_ulogic; + variable lv : 
Execute1ToLoadstore1Type; + variable irq_valid : std_ulogic; + variable exception : std_ulogic; + variable fv : Execute1ToFPUType; + variable go : std_ulogic; + begin + v := r; + if r.busy = '0' then + v.e := actions.e; + v.oe := e_in.oe; + v.mul_select := e_in.sub_select(1 downto 0); + end if; + + lv := Execute1ToLoadstore1Init; + fv := Execute1ToFPUInit; + + x_to_multiply.valid <= '0'; + x_to_divider.valid <= '0'; + v.mul_in_progress := '0'; + v.div_in_progress := '0'; + v.cntz_in_progress := '0'; + v.mul_finish := '0'; + v.ext_interrupt := '0'; + v.taken_branch_event := '0'; + v.br_mispredict := '0'; + + x_to_pmu.mfspr <= '0'; + x_to_pmu.mtspr <= '0'; + x_to_pmu.tbbits(3) <= ctrl.tb(63 - 47); + x_to_pmu.tbbits(2) <= ctrl.tb(63 - 51); + x_to_pmu.tbbits(1) <= ctrl.tb(63 - 55); + x_to_pmu.tbbits(0) <= ctrl.tb(63 - 63); + x_to_pmu.pmm_msr <= ctrl.msr(MSR_PMM); + x_to_pmu.pr_msr <= ctrl.msr(MSR_PR); + + ctrl_tmp <= ctrl; + -- FIXME: run at 512MHz not core freq + ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1); + ctrl_tmp.dec <= std_ulogic_vector(unsigned(ctrl.dec) - 1); + + irq_valid := ctrl.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in); + + v.terminate := '0'; + icache_inval <= '0'; + v.busy := '0'; + + -- Next insn adder used in a couple of places + next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4); + + -- rotator control signals + right_shift <= '1' when e_in.insn_type = OP_SHR else '0'; + rot_clear_left <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0'; + rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0'; + rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0'; + + do_popcnt <= '1' when e_in.insn_type = OP_POPCNT else '0'; + + if r.intr_pending = '1' then + v.e.srr1 := r.e.srr1; + v.e.intr_vec := r.e.intr_vec; + end if; + + if valid_in = '1' then + v.prev_op := e_in.insn_type; + end if; + + -- Determine if there is any interrupt to be taken + -- before/instead of 
executing this instruction + exception := r.intr_pending or (valid_in and actions.exception); + if valid_in = '1' and e_in.second = '0' and r.intr_pending = '0' then + if HAS_FPU and r.fp_exception_next = '1' then + -- This is used for FP-type program interrupts that + -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero. + exception := '1'; + v.e.intr_vec := 16#700#; + v.e.srr1 := (others => '0'); + v.e.srr1(47 - 43) := '1'; + v.e.srr1(47 - 47) := '1'; + elsif r.trace_next = '1' then + -- Generate a trace interrupt rather than executing the next instruction + -- or taking any asynchronous interrupt + exception := '1'; + v.e.intr_vec := 16#d00#; + v.e.srr1 := (others => '0'); + v.e.srr1(47 - 33) := '1'; + if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or + r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then + v.e.srr1(47 - 35) := '1'; + elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then + v.e.srr1(47 - 36) := '1'; end if; - if taken_branch /= e_in.br_pred then - v.e.redirect := '1'; - v.br_mispredict := is_direct_branch; + + elsif irq_valid = '1' then + -- Don't deliver the interrupt until we have a valid instruction + -- coming in, so we have a valid NIA to put in SRR0. + if pmu_to_x.intr = '1' then + v.e.intr_vec := 16#f00#; + report "IRQ valid: PMU"; + elsif ctrl.dec(63) = '1' then + v.e.intr_vec := 16#900#; + report "IRQ valid: DEC"; + elsif ext_irq_in = '1' then + v.e.intr_vec := 16#500#; + report "IRQ valid: External"; + v.ext_interrupt := '1'; end if; - v.e.br_last := is_direct_branch; - v.e.br_taken := taken_branch; + v.e.srr1 := (others => '0'); + exception := '1'; + end if; + end if; + if exception = '1' and l_in.in_progress = '1' then + -- We can't send this interrupt to writeback yet because there are + -- still instructions in loadstore1 that haven't completed. 
+ v.intr_pending := '1'; + v.busy := '1'; + end if; + + v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or r.busy or fp_in.busy); + + go := valid_in and not exception; + v.instr_dispatch := go; + + if go = '1' then + v.e.valid := actions.complete; + v.taken_branch_event := actions.take_branch; + v.br_taken := actions.take_branch; + v.trace_next := actions.do_trace; + v.fp_exception_next := actions.fp_intr; + v.cntz_in_progress := actions.start_cntz; + + if actions.write_msr = '1' then + ctrl_tmp.msr <= actions.new_msr; + end if; + if actions.write_xerlow = '1' then + ctrl_tmp.xer_low <= c_in(17 downto 0); + end if; + if actions.write_dec = '1' then + ctrl_tmp.dec <= c_in; + end if; + if actions.write_cfar = '1' then + ctrl_tmp.cfar <= e_in.nia; + end if; + if actions.write_loga = '1' then + v.log_addr_spr := c_in(31 downto 0); + elsif actions.inc_loga = '1' then + v.log_addr_spr := std_ulogic_vector(unsigned(r.log_addr_spr) + 1); + end if; + x_to_pmu.mtspr <= actions.write_pmuspr; + icache_inval <= actions.icache_inval; + x_to_multiply.valid <= actions.start_mul; + v.mul_in_progress := actions.start_mul; + x_to_divider.valid <= actions.start_div; + v.div_in_progress := actions.start_div; + v.terminate := actions.terminate; + v.br_mispredict := v.e.redirect and actions.direct_branch; + v.busy := actions.start_cntz or actions.start_mul or actions.start_div; + exception := actions.trap; - elsif valid_in = '1' and exception = '0' and illegal = '0' then -- instruction for other units, i.e. 
LDST if e_in.unit = LDST then lv.valid := '1'; - elsif e_in.unit = NONE then - illegal := '1'; - elsif HAS_FPU and e_in.unit = FPU then - fv.valid := '1'; end if; - -- Handling an ITLB miss doesn't count as having executed an instruction - if e_in.insn_type = OP_FETCH_FAILED then - do_trace := '0'; + if HAS_FPU and e_in.unit = FPU then + fv.valid := '1'; end if; end if; @@ -1222,38 +1340,44 @@ begin if r.cntz_in_progress = '1' then -- cnt[lt]z and popcnt* always take two cycles v.e.valid := '1'; - elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then - if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or - (r.div_in_progress = '1' and divider_to_x.valid = '1') then - if r.mul_in_progress = '1' then - overflow := '0'; - else - overflow := divider_to_x.overflow; - end if; - if r.mul_in_progress = '1' and current.oe = '1' then + v.e.write_data := countbits_result; + end if; + if r.div_in_progress = '1' then + if divider_to_x.valid = '1' then + v.e.write_data := muldiv_result; + overflow := divider_to_x.overflow; + -- We must test oe because the RC update code in writeback + -- will use the xerc value to set CR0:SO so we must not clobber + -- xerc if OE wasn't set. + if r.oe = '1' then + v.e.xerc.ov := overflow; + v.e.xerc.ov32 := overflow; + if overflow = '1' then + v.e.xerc.so := '1'; + end if; + end if; + v.e.valid := '1'; + else + v.busy := '1'; + v.div_in_progress := '1'; + end if; + end if; + if r.mul_in_progress = '1' then + if multiply_to_x.valid = '1' then + v.e.write_data := muldiv_result; + if r.oe = '1' then -- have to wait until next cycle for overflow indication v.mul_finish := '1'; v.busy := '1'; else - -- We must test oe because the RC update code in writeback - -- will use the xerc value to set CR0:SO so we must not clobber - -- xerc if OE wasn't set. 
- if current.oe = '1' then - v.e.xerc.ov := overflow; - v.e.xerc.ov32 := overflow; - if overflow = '1' then - v.e.xerc.so := '1'; - end if; - end if; v.e.valid := '1'; end if; else v.busy := '1'; - v.mul_in_progress := r.mul_in_progress; - v.div_in_progress := r.div_in_progress; + v.mul_in_progress := '1'; end if; - elsif r.mul_finish = '1' then - hold_wr_data := '1'; + end if; + if r.mul_finish = '1' then v.e.xerc.ov := multiply_to_x.overflow; v.e.xerc.ov32 := multiply_to_x.overflow; if multiply_to_x.overflow = '1' then @@ -1262,24 +1386,11 @@ begin v.e.valid := '1'; end if; - if illegal = '1' then - exception := '1'; - v.e.intr_vec := 16#700#; - -- Since we aren't doing Hypervisor emulation assist (0xe40) we - -- set bit 44 to indicate we have an illegal - v.e.srr1(47 - 44) := '1'; - report "illegal"; - end if; - v.e.interrupt := exception and not (l_in.in_progress or l_in.interrupt); if v.e.interrupt = '1' then v.intr_pending := '0'; end if; - if do_trace = '1' then - v.trace_next := '1'; - end if; - if interrupt_in = '1' then ctrl_tmp.msr(MSR_SF) <= '1'; ctrl_tmp.msr(MSR_EE) <= '0'; @@ -1298,32 +1409,13 @@ begin v.intr_pending := '0'; end if; - if hold_wr_data = '0' then - v.e.write_data := alu_result; - else - v.e.write_data := r.e.write_data; - end if; - v.e.write_reg := current.write_reg; - v.e.write_enable := current.write_reg_enable and v.e.valid and not exception; - v.e.rc := current.rc and v.e.valid and not exception; - v.e.write_cr_data := write_cr_data; - v.e.write_cr_mask := write_cr_mask; - v.e.write_cr_enable := current.output_cr and v.e.valid and not exception; - v.e.write_xerc_enable := current.output_xer and v.e.valid and not exception; - - bypass_data.tag.valid <= current.instr_tag.valid and current.write_reg_enable and v.e.valid; - bypass_data.tag.tag <= current.instr_tag.tag; + bypass_data.tag.valid <= v.e.write_enable and v.e.valid; + bypass_data.tag.tag <= v.e.instr_tag.tag; bypass_data.data <= v.e.write_data; - bypass_cr_data.tag.valid <= 
current.instr_tag.valid and current.output_cr and v.e.valid; - bypass_cr_data.tag.tag <= current.instr_tag.tag; - for i in 0 to 7 loop - if v.e.write_cr_mask(i) = '1' then - bypass_cr_data.data(i*4 + 3 downto i*4) <= v.e.write_cr_data(i*4 + 3 downto i*4); - else - bypass_cr_data.data(i*4 + 3 downto i*4) <= cr_in(i*4 + 3 downto i*4); - end if; - end loop; + bypass_cr_data.tag.valid <= v.e.write_cr_enable and v.e.valid; + bypass_cr_data.tag.tag <= v.e.instr_tag.tag; + bypass_cr_data.data <= v.e.write_cr_data; -- Outputs to loadstore1 (async) lv.op := e_in.insn_type; @@ -1373,6 +1465,13 @@ begin -- update outputs l_out <= lv; e_out <= r.e; + if r.e.valid = '0' then + e_out.write_enable <= '0'; + e_out.write_cr_enable <= '0'; + e_out.write_xerc_enable <= '0'; + e_out.redirect <= '0'; + e_out.br_last <= '0'; + end if; e_out.msr <= msr_copy(ctrl.msr); fp_out <= fv; @@ -1394,7 +1493,7 @@ begin "000" & r.e.write_enable & r.e.valid & - (r.e.redirect or r.e.interrupt) & + ((r.e.redirect and r.e.valid) or r.e.interrupt) & r.busy & flush_in; end if; From 521a5403a9b04c49a4f724f67e67b93ae7f6fb44 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 18 Jun 2022 17:29:43 +1000 Subject: [PATCH 04/30] execute1: Rename 'r' to 'ex1' Maybe this will give us slightly better names in critical path reports and the like. 
Signed-off-by: Paul Mackerras --- execute1.vhdl | 97 ++++++++++++++++++++++++++------------------------- 1 file changed, 49 insertions(+), 48 deletions(-) diff --git a/execute1.vhdl b/execute1.vhdl index 21f6f8f..7bd0913 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -118,7 +118,7 @@ architecture behaviour of execute1 is constant actions_type_init : actions_type := (e => Execute1ToWritebackInit, new_msr => (others => '0'), others => '0'); - signal r, rin : reg_type; + signal ex1, ex1in : reg_type; signal actions : actions_type; signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0); @@ -372,7 +372,7 @@ begin end generate; dbg_ctrl_out <= ctrl; - log_rd_addr <= r.log_addr_spr; + log_rd_addr <= ex1.log_addr_spr; a_in <= e_in.read_data1; b_in <= e_in.read_data2; @@ -391,11 +391,11 @@ begin dtlb_miss_resolved => dc_events.dtlb_miss_resolved, icache_miss => ic_events.icache_miss, itlb_miss_resolved => ic_events.itlb_miss_resolved, - no_instr_avail => r.no_instr_avail, - dispatch => r.instr_dispatch, - ext_interrupt => r.ext_interrupt, - br_taken_complete => r.taken_branch_event, - br_mispredict => r.br_mispredict, + no_instr_avail => ex1.no_instr_avail, + dispatch => ex1.instr_dispatch, + ext_interrupt => ex1.ext_interrupt, + br_taken_complete => ex1.taken_branch_event, + br_mispredict => ex1.br_mispredict, others => '0'); x_to_pmu.nia <= e_in.nia; x_to_pmu.addr <= (others => '0'); @@ -409,15 +409,15 @@ begin -- (SO, OV[32] and CA[32]) are only modified by instructions that are -- handled here, we can just forward the result being sent to -- writeback. 
- xerc_in <= r.e.xerc when (r.e.write_xerc_enable and r.e.valid) = '1' else e_in.xerc; + xerc_in <= ex1.e.xerc when (ex1.e.write_xerc_enable and ex1.e.valid) = '1' else e_in.xerc; with e_in.unit select busy_out <= - l_in.busy or r.busy or fp_in.busy when LDST, - l_in.busy or l_in.in_progress or r.busy or fp_in.busy when others; + l_in.busy or ex1.busy or fp_in.busy when LDST, + l_in.busy or l_in.in_progress or ex1.busy or fp_in.busy when others; valid_in <= e_in.valid and not busy_out and not flush_in; - terminate_out <= r.terminate; + terminate_out <= ex1.terminate; -- Slow SPR read mux with e_in.spr_select.sel select spr_result <= @@ -425,7 +425,7 @@ begin 32x"0" & ctrl.tb(63 downto 32) when SPRSEL_TBU, ctrl.dec when SPRSEL_DEC, 32x"0" & PVR_MICROWATT when SPRSEL_PVR, - log_wr_addr & r.log_addr_spr when SPRSEL_LOGA, + log_wr_addr & ex1.log_addr_spr when SPRSEL_LOGA, log_rd_data when SPRSEL_LOGD, ctrl.cfar when SPRSEL_CFAR, assemble_xer(xerc_in, ctrl.xer_low) when others; @@ -445,16 +445,16 @@ begin begin if rising_edge(clk) then if rst = '1' then - r <= reg_type_init; + ex1 <= reg_type_init; ctrl <= ctrl_t_init; ctrl.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0'); else - r <= rin; + ex1 <= ex1in; ctrl <= ctrl_tmp; if valid_in = '1' then report "execute " & to_hstring(e_in.nia) & " op=" & insn_type_t'image(e_in.insn_type) & - " wr=" & to_hstring(rin.e.write_reg) & " we=" & std_ulogic'image(rin.e.write_enable) & - " tag=" & integer'image(rin.e.instr_tag.tag) & std_ulogic'image(rin.e.instr_tag.valid); + " wr=" & to_hstring(ex1in.e.write_reg) & " we=" & std_ulogic'image(ex1in.e.write_enable) & + " tag=" & integer'image(ex1in.e.instr_tag.tag) & std_ulogic'image(ex1in.e.instr_tag.valid); end if; end if; end if; @@ -583,7 +583,7 @@ begin end if; shortmul_result <= std_ulogic_vector(resize(signed(mshort_p), 64)); - case r.mul_select is + case ex1.mul_select is when "00" => muldiv_result <= multiply_to_x.result(63 downto 0); when "01" => @@ -820,7 +820,7 @@ begin -- 
v.trap also means we want to generate an interrupt, but doesn't -- cancel instruction execution (hence we need to avoid setting any -- side-effect flags or write enables when generating a trap). - -- With v.trap = 1 we will assert both r.e.valid and r.e.interrupt + -- With v.trap = 1 we will assert both ex1.e.valid and ex1.e.interrupt -- to writeback, and it will complete the instruction and take -- and interrupt. It is OK for v.trap to depend on operand data. @@ -924,7 +924,7 @@ begin if e_in.second = '0' then v.take_branch := ppc_bc_taken(bo, bi, cr_in, a_in); else - v.take_branch := r.br_taken; + v.take_branch := ex1.br_taken; end if; if v.take_branch = '1' then v.e.br_offset := b_in; @@ -954,7 +954,7 @@ begin if e_in.second = '0' then v.take_branch := ppc_bc_taken(bo, bi, cr_in, a_in); else - v.take_branch := r.br_taken; + v.take_branch := ex1.br_taken; end if; if v.take_branch = '1' then v.e.br_offset := b_in; @@ -1172,8 +1172,8 @@ begin variable fv : Execute1ToFPUType; variable go : std_ulogic; begin - v := r; - if r.busy = '0' then + v := ex1; + if ex1.busy = '0' then v.e := actions.e; v.oe := e_in.oe; v.mul_select := e_in.sub_select(1 downto 0); @@ -1223,9 +1223,9 @@ begin do_popcnt <= '1' when e_in.insn_type = OP_POPCNT else '0'; - if r.intr_pending = '1' then - v.e.srr1 := r.e.srr1; - v.e.intr_vec := r.e.intr_vec; + if ex1.intr_pending = '1' then + v.e.srr1 := ex1.e.srr1; + v.e.intr_vec := ex1.e.intr_vec; end if; if valid_in = '1' then @@ -1234,9 +1234,9 @@ begin -- Determine if there is any interrupt to be taken -- before/instead of executing this instruction - exception := r.intr_pending or (valid_in and actions.exception); - if valid_in = '1' and e_in.second = '0' and r.intr_pending = '0' then - if HAS_FPU and r.fp_exception_next = '1' then + exception := ex1.intr_pending or (valid_in and actions.exception); + if valid_in = '1' and e_in.second = '0' and ex1.intr_pending = '0' then + if HAS_FPU and ex1.fp_exception_next = '1' then -- This is used for 
FP-type program interrupts that -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero. exception := '1'; @@ -1244,17 +1244,18 @@ begin v.e.srr1 := (others => '0'); v.e.srr1(47 - 43) := '1'; v.e.srr1(47 - 47) := '1'; - elsif r.trace_next = '1' then + elsif ex1.trace_next = '1' then -- Generate a trace interrupt rather than executing the next instruction -- or taking any asynchronous interrupt exception := '1'; v.e.intr_vec := 16#d00#; v.e.srr1 := (others => '0'); v.e.srr1(47 - 33) := '1'; - if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or - r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then + if ex1.prev_op = OP_LOAD or ex1.prev_op = OP_ICBI or ex1.prev_op = OP_ICBT or + ex1.prev_op = OP_DCBT or ex1.prev_op = OP_DCBST or ex1.prev_op = OP_DCBF then v.e.srr1(47 - 35) := '1'; - elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then + elsif ex1.prev_op = OP_STORE or ex1.prev_op = OP_DCBZ or + ex1.prev_op = OP_DCBTST then v.e.srr1(47 - 36) := '1'; end if; @@ -1284,7 +1285,7 @@ begin v.busy := '1'; end if; - v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or r.busy or fp_in.busy); + v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or ex1.busy or fp_in.busy); go := valid_in and not exception; v.instr_dispatch := go; @@ -1312,7 +1313,7 @@ begin if actions.write_loga = '1' then v.log_addr_spr := c_in(31 downto 0); elsif actions.inc_loga = '1' then - v.log_addr_spr := std_ulogic_vector(unsigned(r.log_addr_spr) + 1); + v.log_addr_spr := std_ulogic_vector(unsigned(ex1.log_addr_spr) + 1); end if; x_to_pmu.mtspr <= actions.write_pmuspr; icache_inval <= actions.icache_inval; @@ -1334,22 +1335,22 @@ begin end if; end if; - -- The following cases all occur when r.busy = 1 and therefore + -- The following cases all occur when ex1.busy = 1 and therefore -- valid_in = 0. 
Hence they don't happen in the same cycle as any of -- the cases above which depend on valid_in = 1. - if r.cntz_in_progress = '1' then + if ex1.cntz_in_progress = '1' then -- cnt[lt]z and popcnt* always take two cycles v.e.valid := '1'; v.e.write_data := countbits_result; end if; - if r.div_in_progress = '1' then + if ex1.div_in_progress = '1' then if divider_to_x.valid = '1' then v.e.write_data := muldiv_result; overflow := divider_to_x.overflow; -- We must test oe because the RC update code in writeback -- will use the xerc value to set CR0:SO so we must not clobber -- xerc if OE wasn't set. - if r.oe = '1' then + if ex1.oe = '1' then v.e.xerc.ov := overflow; v.e.xerc.ov32 := overflow; if overflow = '1' then @@ -1362,10 +1363,10 @@ begin v.div_in_progress := '1'; end if; end if; - if r.mul_in_progress = '1' then + if ex1.mul_in_progress = '1' then if multiply_to_x.valid = '1' then v.e.write_data := muldiv_result; - if r.oe = '1' then + if ex1.oe = '1' then -- have to wait until next cycle for overflow indication v.mul_finish := '1'; v.busy := '1'; @@ -1377,7 +1378,7 @@ begin v.mul_in_progress := '1'; end if; end if; - if r.mul_finish = '1' then + if ex1.mul_finish = '1' then v.e.xerc.ov := multiply_to_x.overflow; v.e.xerc.ov32 := multiply_to_x.overflow; if multiply_to_x.overflow = '1' then @@ -1460,12 +1461,12 @@ begin fv.out_cr := e_in.output_cr; -- Update registers - rin <= v; + ex1in <= v; -- update outputs l_out <= lv; - e_out <= r.e; - if r.e.valid = '0' then + e_out <= ex1.e; + if ex1.e.valid = '0' then e_out.write_enable <= '0'; e_out.write_cr_enable <= '0'; e_out.write_xerc_enable <= '0'; @@ -1491,10 +1492,10 @@ begin irq_valid_log & interrupt_in & "000" & - r.e.write_enable & - r.e.valid & - ((r.e.redirect and r.e.valid) or r.e.interrupt) & - r.busy & + ex1.e.write_enable & + ex1.e.valid & + ((ex1.e.redirect and ex1.e.valid) or ex1.e.interrupt) & + ex1.busy & flush_in; end if; end process; From 3510071d9a8dde12056f90dacb15c34eb6601971 Mon Sep 17 
00:00:00 2001 From: Paul Mackerras Date: Thu, 30 Jun 2022 20:33:33 +1000 Subject: [PATCH 05/30] Add a second execute stage to the pipeline This adds a second execute stage to the pipeline, in order to match up the length of the pipeline through loadstore and dcache with the length through execute1. This will ultimately enable us to get rid of the 1-cycle bubble that we currently have when issuing ALU instructions after one or more LSU instructions. Most ALU instructions execute in the first stage, except for count-zeroes and popcount instructions (which take two cycles and do some of their work in the second stage) and mfspr/mtspr to "slow" SPRs (TB, DEC, PVR, LOGA/LOGD, CFAR). Multiply and divide/mod instructions take several cycles but the instruction stays in the first stage (ex1) and ex1.busy is asserted until the operation is complete. There is currently a bypass from the first stage but not the second stage. Performance is down somewhat because of that and because this doesn't yet eliminate the bubble between LSU and ALU instructions. The forwarding of XER common bits has been changed somewhat because now there is another pipeline stage between ex1 and the committed state in cr_file. The simplest thing for now is to record the last value written and use that, unless there has been a flush, in which case the committed state (obtained via e_in.xerc) is used. Note that this fixes what was previously a benign bug in control.vhdl, where it was possible for control to forget an instruction's dependency on a value from a previous instruction (a GPR or the CR) if this instruction writes the value and the instruction gets to the point where it could issue but is blocked by the busy signal from execute1. In that situation, control may incorrectly not indicate that a bypass should be used.
That didn't matter previously because, for ALU and FPU instructions, there was only one previous instruction in flight and once the current instruction could issue, the previous instruction was completing and the correct value would be obtained from register_file or cr_file. For loadstore instructions there could be two being executed, but because there are no bypass paths, failing to indicate use of a bypass path is fine. Signed-off-by: Paul Mackerras --- common.vhdl | 6 +- control.vhdl | 5 +- countbits_tb.vhdl | 1 + decode2.vhdl | 1 - divider.vhdl | 2 +- execute1.vhdl | 570 +++++++++++++++++++++++++++------------------- 6 files changed, 339 insertions(+), 246 deletions(-) diff --git a/common.vhdl b/common.vhdl index 7ecf4e2..6cbf181 100644 --- a/common.vhdl +++ b/common.vhdl @@ -356,6 +356,7 @@ package common is type Execute1ToDividerType is record valid: std_ulogic; + flush: std_ulogic; dividend: std_ulogic_vector(63 downto 0); divisor: std_ulogic_vector(63 downto 0); is_signed: std_ulogic; @@ -364,9 +365,8 @@ package common is is_modulus: std_ulogic; neg_result: std_ulogic; end record; - constant Execute1ToDividerInit: Execute1ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0', - is_extended => '0', is_modulus => '0', - neg_result => '0', others => (others => '0')); + constant Execute1ToDividerInit: Execute1ToDividerType := ( + dividend => 64x"0", divisor => 64x"0", others => '0'); type PMUEventType is record no_instr_avail : std_ulogic; diff --git a/control.vhdl b/control.vhdl index 1d55517..0bbe9ad 100644 --- a/control.vhdl +++ b/control.vhdl @@ -104,7 +104,8 @@ begin tag_regs(i).wr_cr <= '0'; report "tag " & integer'image(i) & " not valid"; end if; - if gpr_write_valid = '1' and tag_regs(i).reg = gpr_write_in then + if instr_tag.valid = '1' and gpr_write_valid = '1' and + tag_regs(i).reg = gpr_write_in then tag_regs(i).recent <= '0'; if tag_regs(i).recent = '1' and tag_regs(i).wr_gpr = '1' then report "tag " & integer'image(i) & " not recent"; 
@@ -126,7 +127,7 @@ begin curr_cr_tag <= 0; else curr_tag <= next_tag; - if cr_write_valid = '1' then + if instr_tag.valid = '1' and cr_write_valid = '1' then curr_cr_tag <= instr_tag.tag; end if; end if; diff --git a/countbits_tb.vhdl b/countbits_tb.vhdl index c00a6b6..c945c57 100644 --- a/countbits_tb.vhdl +++ b/countbits_tb.vhdl @@ -26,6 +26,7 @@ begin bitcounter_0: entity work.bit_counter port map ( clk => clk, + stall => '0', rs => rs, result => res, count_right => count_right, diff --git a/decode2.vhdl b/decode2.vhdl index 8998f2b..af0c27d 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -228,7 +228,6 @@ architecture behaviour of decode2 is OP_SHR => "010", OP_EXTSWSLI => "010", OP_MUL_L64 => "011", -- muldiv_result - OP_MFSPR => "101", -- spr_result OP_B => "110", -- next_nia OP_BC => "110", OP_BCREG => "110", diff --git a/divider.vhdl b/divider.vhdl index 3f9b312..55e3c5d 100644 --- a/divider.vhdl +++ b/divider.vhdl @@ -36,7 +36,7 @@ begin divider_0: process(clk) begin if rising_edge(clk) then - if rst = '1' then + if rst = '1' or d_in.flush = '1' then dend <= (others => '0'); div <= (others => '0'); quot <= (others => '0'); diff --git a/execute1.vhdl b/execute1.vhdl index 7bd0913..ebcdfeb 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -60,65 +60,90 @@ entity execute1 is end entity execute1; architecture behaviour of execute1 is - type reg_type is record + type side_effect_type is record + terminate : std_ulogic; + icache_inval : std_ulogic; + write_msr : std_ulogic; + write_xerlow : std_ulogic; + write_dec : std_ulogic; + write_cfar : std_ulogic; + write_loga : std_ulogic; + inc_loga : std_ulogic; + write_pmuspr : std_ulogic; + end record; + constant side_effect_init : side_effect_type := (others => '0'); + + type actions_type is record + e : Execute1ToWritebackType; + se : side_effect_type; + complete : std_ulogic; + exception : std_ulogic; + trap : std_ulogic; + new_msr : std_ulogic_vector(63 downto 0); + take_branch : std_ulogic; + direct_branch : 
std_ulogic; + start_mul : std_ulogic; + start_div : std_ulogic; + do_trace : std_ulogic; + fp_intr : std_ulogic; + res2_sel : std_ulogic_vector(1 downto 0); + bypass_valid : std_ulogic; + end record; + constant actions_type_init : actions_type := + (e => Execute1ToWritebackInit, se => side_effect_init, + new_msr => (others => '0'), res2_sel => "00", others => '0'); + + type reg_stage1_type is record e : Execute1ToWritebackType; + se : side_effect_type; busy: std_ulogic; - terminate: std_ulogic; - intr_pending : std_ulogic; fp_exception_next : std_ulogic; trace_next : std_ulogic; prev_op : insn_type_t; br_taken : std_ulogic; oe : std_ulogic; mul_select : std_ulogic_vector(1 downto 0); + res2_sel : std_ulogic_vector(1 downto 0); + spr_select : spr_id; + pmu_spr_num : std_ulogic_vector(4 downto 0); mul_in_progress : std_ulogic; mul_finish : std_ulogic; div_in_progress : std_ulogic; - cntz_in_progress : std_ulogic; no_instr_avail : std_ulogic; instr_dispatch : std_ulogic; ext_interrupt : std_ulogic; taken_branch_event : std_ulogic; br_mispredict : std_ulogic; - log_addr_spr : std_ulogic_vector(31 downto 0); + msr : std_ulogic_vector(63 downto 0); + xerc : xer_common_t; + xerc_valid : std_ulogic; end record; - constant reg_type_init : reg_type := - (e => Execute1ToWritebackInit, - busy => '0', terminate => '0', intr_pending => '0', + constant reg_stage1_type_init : reg_stage1_type := + (e => Execute1ToWritebackInit, se => side_effect_init, + busy => '0', fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, br_taken => '0', - oe => '0', mul_select => "00", - mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0', + oe => '0', mul_select => "00", res2_sel => "00", + spr_select => spr_id_init, pmu_spr_num => 5x"0", + mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0', taken_branch_event => '0', br_mispredict => '0', - others => (others => 
'0')); + msr => 64x"0", + xerc => xerc_init, xerc_valid => '0'); - type actions_type is record + type reg_stage2_type is record e : Execute1ToWritebackType; - complete : std_ulogic; - exception : std_ulogic; - trap : std_ulogic; - terminate : std_ulogic; - write_msr : std_ulogic; - new_msr : std_ulogic_vector(63 downto 0); - write_xerlow : std_ulogic; - write_pmuspr : std_ulogic; - write_dec : std_ulogic; - write_loga : std_ulogic; - inc_loga : std_ulogic; - write_cfar : std_ulogic; - take_branch : std_ulogic; - direct_branch : std_ulogic; - start_mul : std_ulogic; - start_div : std_ulogic; - start_cntz : std_ulogic; - do_trace : std_ulogic; - fp_intr : std_ulogic; - icache_inval : std_ulogic; + se : side_effect_type; + ext_interrupt : std_ulogic; + taken_branch_event : std_ulogic; + br_mispredict : std_ulogic; + log_addr_spr : std_ulogic_vector(31 downto 0); end record; - constant actions_type_init : actions_type := - (e => Execute1ToWritebackInit, new_msr => (others => '0'), others => '0'); + constant reg_stage2_type_init : reg_stage2_type := + (e => Execute1ToWritebackInit, se => side_effect_init, + log_addr_spr => 32x"0", others => '0'); - signal ex1, ex1in : reg_type; + signal ex1, ex1in : reg_stage1_type; + signal ex2, ex2in : reg_stage2_type; signal actions : actions_type; signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0); @@ -142,7 +167,9 @@ architecture behaviour of execute1 is signal muldiv_result: std_ulogic_vector(63 downto 0); signal shortmul_result: std_ulogic_vector(63 downto 0); signal spr_result: std_ulogic_vector(63 downto 0); + signal ex_result: std_ulogic_vector(63 downto 0); signal next_nia : std_ulogic_vector(63 downto 0); + signal s1_sel : std_ulogic_vector(2 downto 0); signal carry_32 : std_ulogic; signal carry_64 : std_ulogic; @@ -372,7 +399,7 @@ begin end generate; dbg_ctrl_out <= ctrl; - log_rd_addr <= ex1.log_addr_spr; + log_rd_addr <= ex2.log_addr_spr; a_in <= e_in.read_data1; b_in <= e_in.read_data2; @@ -393,15 +420,15 @@ begin 
itlb_miss_resolved => ic_events.itlb_miss_resolved, no_instr_avail => ex1.no_instr_avail, dispatch => ex1.instr_dispatch, - ext_interrupt => ex1.ext_interrupt, - br_taken_complete => ex1.taken_branch_event, - br_mispredict => ex1.br_mispredict, + ext_interrupt => ex2.ext_interrupt, + br_taken_complete => ex2.taken_branch_event, + br_mispredict => ex2.br_mispredict, others => '0'); x_to_pmu.nia <= e_in.nia; x_to_pmu.addr <= (others => '0'); x_to_pmu.addr_v <= '0'; - x_to_pmu.spr_num <= e_in.insn(20 downto 16); - x_to_pmu.spr_val <= c_in; + x_to_pmu.spr_num <= ex1.pmu_spr_num; + x_to_pmu.spr_val <= ex1.e.write_data; x_to_pmu.run <= '1'; -- XER forwarding. To avoid having to track XER hazards, we use @@ -409,35 +436,23 @@ begin -- (SO, OV[32] and CA[32]) are only modified by instructions that are -- handled here, we can just forward the result being sent to -- writeback. - xerc_in <= ex1.e.xerc when (ex1.e.write_xerc_enable and ex1.e.valid) = '1' else e_in.xerc; + xerc_in <= ex1.xerc when ex1.xerc_valid = '1' else e_in.xerc; with e_in.unit select busy_out <= - l_in.busy or ex1.busy or fp_in.busy when LDST, + l_in.busy or ex1.e.valid or ex1.busy or fp_in.busy when LDST, + l_in.busy or l_in.in_progress or ex1.e.valid or ex1.busy or fp_in.busy when FPU, l_in.busy or l_in.in_progress or ex1.busy or fp_in.busy when others; - valid_in <= e_in.valid and not busy_out and not flush_in; + valid_in <= e_in.valid and not (busy_out or flush_in or ex1.e.redirect or ex1.e.interrupt); - terminate_out <= ex1.terminate; - - -- Slow SPR read mux - with e_in.spr_select.sel select spr_result <= - ctrl.tb when SPRSEL_TB, - 32x"0" & ctrl.tb(63 downto 32) when SPRSEL_TBU, - ctrl.dec when SPRSEL_DEC, - 32x"0" & PVR_MICROWATT when SPRSEL_PVR, - log_wr_addr & ex1.log_addr_spr when SPRSEL_LOGA, - log_rd_data when SPRSEL_LOGD, - ctrl.cfar when SPRSEL_CFAR, - assemble_xer(xerc_in, ctrl.xer_low) when others; - - -- Result mux - with e_in.result_sel select alu_result <= + -- First stage result mux + 
s1_sel <= e_in.result_sel when ex1.busy = '0' else "100"; + with s1_sel select alu_result <= adder_result when "000", logical_result when "001", rotator_result when "010", shortmul_result when "011", - pmu_to_x.spr_val when "100", - spr_result when "101", + muldiv_result when "100", next_nia when "110", misc_result when others; @@ -445,22 +460,31 @@ begin begin if rising_edge(clk) then if rst = '1' then - ex1 <= reg_type_init; + ex1 <= reg_stage1_type_init; + ex2 <= reg_stage2_type_init; ctrl <= ctrl_t_init; ctrl.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0'); + ex1.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0'); else ex1 <= ex1in; + ex2 <= ex2in; ctrl <= ctrl_tmp; if valid_in = '1' then report "execute " & to_hstring(e_in.nia) & " op=" & insn_type_t'image(e_in.insn_type) & " wr=" & to_hstring(ex1in.e.write_reg) & " we=" & std_ulogic'image(ex1in.e.write_enable) & " tag=" & integer'image(ex1in.e.instr_tag.tag) & std_ulogic'image(ex1in.e.instr_tag.valid); end if; + -- We mustn't get stalled on a cycle where execute2 is + -- completing an instruction or generating an interrupt + if ex2.e.valid = '1' or ex2.e.interrupt = '1' then + assert (l_in.busy or fp_in.busy) = '0' + severity failure; + end if; end if; end if; end process; - -- Data path for integer instructions + -- Data path for integer instructions (first execute stage) execute1_dp: process(all) variable a_inv : std_ulogic_vector(63 downto 0); variable b_or_m1 : std_ulogic_vector(63 downto 0); @@ -543,6 +567,7 @@ begin if e_in.insn_type = OP_MOD then x_to_divider.is_modulus <= '1'; end if; + x_to_divider.flush <= flush_in; addend := (others => '0'); if e_in.insn(26) = '0' then @@ -638,7 +663,7 @@ begin misc_result <= darn; when "100" => -- mfmsr - misc_result <= ctrl.msr; + misc_result <= ex1.msr; when "101" => if e_in.insn(20) = '0' then -- mfcr @@ -792,6 +817,7 @@ begin variable illegal : std_ulogic; variable privileged : std_ulogic; variable slow_op : std_ulogic; + variable owait : std_ulogic; 
begin v := actions_type_init; v.e.write_data := alu_result; @@ -803,12 +829,11 @@ begin v.e.write_cr_enable := e_in.output_cr; v.e.write_xerc_enable := e_in.output_xer; v.e.xerc := xerc_in; - v.new_msr := ctrl.msr; - v.e.write_xerc_enable := e_in.output_xer; - v.e.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) & - not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF); + v.new_msr := ex1.msr; + v.e.redir_mode := ex1.msr(MSR_IR) & not ex1.msr(MSR_PR) & + not ex1.msr(MSR_LE) & not ex1.msr(MSR_SF); v.e.intr_vec := 16#700#; - v.e.mode_32bit := not ctrl.msr(MSR_SF); + v.e.mode_32bit := not ex1.msr(MSR_SF); v.e.instr_tag := e_in.instr_tag; v.e.last_nia := e_in.nia; v.e.br_offset := 64x"4"; @@ -827,8 +852,9 @@ begin illegal := '0'; privileged := '0'; slow_op := '0'; + owait := '0'; - if ctrl.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then + if ex1.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then privileged := '1'; end if; @@ -837,7 +863,7 @@ begin illegal := '1'; end if; - v.do_trace := ctrl.msr(MSR_SE); + v.do_trace := ex1.msr(MSR_SE); case_0: case e_in.insn_type is when OP_ILLEGAL => illegal := '1'; @@ -858,7 +884,7 @@ begin -- check bits 1-10 of the instruction to make sure it's attn -- if not then it is illegal if e_in.insn(10 downto 1) = "0100000000" then - v.terminate := '1'; + v.se.terminate := '1'; if e_in.valid = '1' then report "ATTN"; end if; @@ -909,10 +935,10 @@ begin -- should never happen v.e.redirect := '1'; end if; - if ctrl.msr(MSR_BE) = '1' then + if ex1.msr(MSR_BE) = '1' then v.do_trace := '1'; end if; - v.write_cfar := '1'; + v.se.write_cfar := '1'; when OP_BC => -- read_data1 is CTR -- If this instruction updates both CTR and LR, then it is @@ -938,10 +964,10 @@ begin v.direct_branch := '1'; v.e.br_last := '1'; v.e.br_taken := v.take_branch; - if ctrl.msr(MSR_BE) = '1' then + if ex1.msr(MSR_BE) = '1' then v.do_trace := '1'; end if; - v.write_cfar := v.take_branch; + v.se.write_cfar := v.take_branch; end 
if; when OP_BCREG => -- read_data1 is CTR, read_data2 is target register (CTR, LR or TAR) @@ -964,10 +990,10 @@ begin -- Indirect branches are never predicted taken v.e.redirect := v.take_branch; v.e.br_taken := v.take_branch; - if ctrl.msr(MSR_BE) = '1' then + if ex1.msr(MSR_BE) = '1' then v.do_trace := '1'; end if; - v.write_cfar := v.take_branch; + v.se.write_cfar := v.take_branch; end if; when OP_RFID => @@ -983,11 +1009,11 @@ begin v.new_msr(MSR_IR) := '1'; v.new_msr(MSR_DR) := '1'; end if; - v.write_msr := '1'; + v.se.write_msr := '1'; v.e.br_offset := b_in; v.e.abs_br := '1'; v.e.redirect := '1'; - v.write_cfar := '1'; + v.se.write_cfar := '1'; if HAS_FPU then v.fp_intr := fp_in.exception and (a_in(MSR_FE0) or a_in(MSR_FE1)); @@ -995,8 +1021,8 @@ begin v.do_trace := '0'; when OP_CNTZ | OP_POPCNT => + v.res2_sel := "01"; slow_op := '1'; - v.start_cntz := '1'; when OP_ISEL => when OP_CROP => when OP_MCRXRX => @@ -1010,14 +1036,19 @@ begin end if; elsif e_in.spr_select.valid = '1' then if e_in.valid = '1' then - report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & - "=" & to_hstring(spr_result); + report "MFSPR to slow SPR " & integer'image(decode_spr_num(e_in.insn)); + end if; + slow_op := '1'; + if e_in.spr_select.ispmu = '0' then + case e_in.spr_select.sel is + when SPRSEL_LOGD => + v.se.inc_loga := '1'; + when others => + end case; + v.res2_sel := "10"; + else + v.res2_sel := "11"; end if; - case e_in.spr_select.sel is - when SPRSEL_LOGD => - v.inc_loga := '1'; - when others => - end case; else -- mfspr from unimplemented SPRs should be a nop in -- supervisor mode and a program interrupt for user mode @@ -1025,7 +1056,7 @@ begin report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & " invalid"; end if; - if ctrl.msr(MSR_PR) = '1' then + if ex1.msr(MSR_PR) = '1' then illegal := '1'; end if; end if; @@ -1033,7 +1064,7 @@ begin when OP_MFCR => when OP_MTCRF => when OP_MTMSRD => - v.write_msr := '1'; + v.se.write_msr := '1'; if 
e_in.insn(16) = '1' then -- just update EE and RI v.new_msr(MSR_EE) := c_in(MSR_EE); @@ -1062,7 +1093,7 @@ begin report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & "=" & to_hstring(c_in); end if; - v.write_pmuspr := e_in.spr_select.ispmu; + v.se.write_pmuspr := e_in.spr_select.ispmu; if e_in.spr_select.valid = '1' and e_in.spr_select.ispmu = '0' then case e_in.spr_select.sel is when SPRSEL_XER => @@ -1071,17 +1102,17 @@ begin v.e.xerc.ca := c_in(63-34); v.e.xerc.ov32 := c_in(63-44); v.e.xerc.ca32 := c_in(63-45); - v.write_xerlow := '1'; + v.se.write_xerlow := '1'; when SPRSEL_DEC => - v.write_dec := '1'; + v.se.write_dec := '1'; when SPRSEL_LOGA => - v.write_loga := '1'; + v.se.write_loga := '1'; when others => end case; elsif is_fast_spr(e_in.write_reg) = '0' then -- mtspr to unimplemented SPRs should be a nop in -- supervisor mode and a program interrupt for user mode - if ctrl.msr(MSR_PR) = '1' then + if ex1.msr(MSR_PR) = '1' then illegal := '1'; end if; end if; @@ -1095,7 +1126,7 @@ begin v.e.redirect := '1'; when OP_ICBI => - v.icache_inval := '1'; + v.se.icache_inval := '1'; when OP_MUL_L64 => if HAS_SHORT_MULT and e_in.insn(26) = '1' and @@ -1109,15 +1140,18 @@ begin -- Use standard multiplier v.start_mul := '1'; slow_op := '1'; + owait := '1'; end if; when OP_MUL_H64 | OP_MUL_H32 => v.start_mul := '1'; slow_op := '1'; + owait := '1'; when OP_DIV | OP_DIVE | OP_MOD => v.start_div := '1'; slow_op := '1'; + owait := '1'; when OP_FETCH_FAILED => -- Handling an ITLB miss doesn't count as having executed an instruction @@ -1147,7 +1181,7 @@ begin report "illegal instruction"; end if; - elsif HAS_FPU and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then + elsif HAS_FPU and ex1.msr(MSR_FP) = '0' and e_in.fac = FPU then -- generate a floating-point unavailable interrupt v.exception := '1'; v.e.intr_vec := 16#800#; @@ -1157,26 +1191,33 @@ begin end if; if e_in.unit = ALU then - v.complete := e_in.valid and not v.exception and not slow_op; + v.complete 
:= e_in.valid and not v.exception and not owait; + v.bypass_valid := e_in.valid and not v.exception and not slow_op; end if; actions <= v; end process; + -- First execute stage execute1_1: process(all) - variable v : reg_type; + variable v : reg_stage1_type; variable overflow : std_ulogic; variable lv : Execute1ToLoadstore1Type; variable irq_valid : std_ulogic; variable exception : std_ulogic; variable fv : Execute1ToFPUType; variable go : std_ulogic; + variable bypass_valid : std_ulogic; begin v := ex1; - if ex1.busy = '0' then + if (ex1.busy or l_in.busy or fp_in.busy) = '0' then v.e := actions.e; + v.e.valid := '0'; v.oe := e_in.oe; + v.spr_select := e_in.spr_select; + v.pmu_spr_num := e_in.insn(20 downto 16); v.mul_select := e_in.sub_select(1 downto 0); + v.se := side_effect_init; end if; lv := Execute1ToLoadstore1Init; @@ -1184,33 +1225,13 @@ begin x_to_multiply.valid <= '0'; x_to_divider.valid <= '0'; - v.mul_in_progress := '0'; - v.div_in_progress := '0'; - v.cntz_in_progress := '0'; - v.mul_finish := '0'; v.ext_interrupt := '0'; v.taken_branch_event := '0'; v.br_mispredict := '0'; + v.busy := '0'; + bypass_valid := '0'; - x_to_pmu.mfspr <= '0'; - x_to_pmu.mtspr <= '0'; - x_to_pmu.tbbits(3) <= ctrl.tb(63 - 47); - x_to_pmu.tbbits(2) <= ctrl.tb(63 - 51); - x_to_pmu.tbbits(1) <= ctrl.tb(63 - 55); - x_to_pmu.tbbits(0) <= ctrl.tb(63 - 63); - x_to_pmu.pmm_msr <= ctrl.msr(MSR_PMM); - x_to_pmu.pr_msr <= ctrl.msr(MSR_PR); - - ctrl_tmp <= ctrl; - -- FIXME: run at 512MHz not core freq - ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1); - ctrl_tmp.dec <= std_ulogic_vector(unsigned(ctrl.dec) - 1); - - irq_valid := ctrl.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in); - - v.terminate := '0'; - icache_inval <= '0'; - v.busy := '0'; + irq_valid := ex1.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in); -- Next insn adder used in a couple of places next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4); @@ -1223,19 +1244,14 @@ begin do_popcnt 
<= '1' when e_in.insn_type = OP_POPCNT else '0'; - if ex1.intr_pending = '1' then - v.e.srr1 := ex1.e.srr1; - v.e.intr_vec := ex1.e.intr_vec; - end if; - if valid_in = '1' then v.prev_op := e_in.insn_type; end if; -- Determine if there is any interrupt to be taken -- before/instead of executing this instruction - exception := ex1.intr_pending or (valid_in and actions.exception); - if valid_in = '1' and e_in.second = '0' and ex1.intr_pending = '0' then + exception := valid_in and actions.exception; + if valid_in = '1' and e_in.second = '0' then if HAS_FPU and ex1.fp_exception_next = '1' then -- This is used for FP-type program interrupts that -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero. @@ -1278,54 +1294,37 @@ begin end if; end if; - if exception = '1' and l_in.in_progress = '1' then - -- We can't send this interrupt to writeback yet because there are - -- still instructions in loadstore1 that haven't completed. - v.intr_pending := '1'; - v.busy := '1'; - end if; - v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or ex1.busy or fp_in.busy); + v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or + ex1.busy or fp_in.busy); go := valid_in and not exception; v.instr_dispatch := go; if go = '1' then + v.se := actions.se; v.e.valid := actions.complete; + bypass_valid := actions.bypass_valid; v.taken_branch_event := actions.take_branch; v.br_taken := actions.take_branch; v.trace_next := actions.do_trace; v.fp_exception_next := actions.fp_intr; - v.cntz_in_progress := actions.start_cntz; - - if actions.write_msr = '1' then - ctrl_tmp.msr <= actions.new_msr; - end if; - if actions.write_xerlow = '1' then - ctrl_tmp.xer_low <= c_in(17 downto 0); - end if; - if actions.write_dec = '1' then - ctrl_tmp.dec <= c_in; - end if; - if actions.write_cfar = '1' then - ctrl_tmp.cfar <= e_in.nia; - end if; - if actions.write_loga = '1' then - v.log_addr_spr := c_in(31 downto 0); - elsif actions.inc_loga = '1' then - 
v.log_addr_spr := std_ulogic_vector(unsigned(ex1.log_addr_spr) + 1); - end if; - x_to_pmu.mtspr <= actions.write_pmuspr; - icache_inval <= actions.icache_inval; + v.res2_sel := actions.res2_sel; + v.msr := actions.new_msr; x_to_multiply.valid <= actions.start_mul; v.mul_in_progress := actions.start_mul; x_to_divider.valid <= actions.start_div; v.div_in_progress := actions.start_div; - v.terminate := actions.terminate; v.br_mispredict := v.e.redirect and actions.direct_branch; - v.busy := actions.start_cntz or actions.start_mul or actions.start_div; exception := actions.trap; + -- Go busy while division is happening because the + -- divider is not pipelined. Also go busy while a + -- multiply is happening in order to stop following + -- instructions from using the wrong XER value + -- (and for simplicity in the OE=0 case). + v.busy := actions.start_div or actions.start_mul; + -- instruction for other units, i.e. LDST if e_in.unit = LDST then lv.valid := '1'; @@ -1335,86 +1334,74 @@ begin end if; end if; - -- The following cases all occur when ex1.busy = 1 and therefore - -- valid_in = 0. Hence they don't happen in the same cycle as any of - -- the cases above which depend on valid_in = 1. - if ex1.cntz_in_progress = '1' then - -- cnt[lt]z and popcnt* always take two cycles - v.e.valid := '1'; - v.e.write_data := countbits_result; - end if; - if ex1.div_in_progress = '1' then - if divider_to_x.valid = '1' then - v.e.write_data := muldiv_result; - overflow := divider_to_x.overflow; - -- We must test oe because the RC update code in writeback - -- will use the xerc value to set CR0:SO so we must not clobber - -- xerc if OE wasn't set. 
- if ex1.oe = '1' then - v.e.xerc.ov := overflow; - v.e.xerc.ov32 := overflow; - if overflow = '1' then - v.e.xerc.so := '1'; - end if; + if ex1.div_in_progress = '1' then + v.div_in_progress := not divider_to_x.valid; + v.busy := not divider_to_x.valid; + if divider_to_x.valid = '1' and ex1.oe = '1' then + v.e.xerc.ov := divider_to_x.overflow; + v.e.xerc.ov32 := divider_to_x.overflow; + if divider_to_x.overflow = '1' then + v.e.xerc.so := '1'; end if; - v.e.valid := '1'; - else - v.busy := '1'; - v.div_in_progress := '1'; - end if; + end if; + v.e.valid := divider_to_x.valid; + v.e.write_data := alu_result; + bypass_valid := v.e.valid; end if; - if ex1.mul_in_progress = '1' then - if multiply_to_x.valid = '1' then - v.e.write_data := muldiv_result; - if ex1.oe = '1' then - -- have to wait until next cycle for overflow indication - v.mul_finish := '1'; - v.busy := '1'; - else - v.e.valid := '1'; - end if; - else - v.busy := '1'; - v.mul_in_progress := '1'; - end if; + if ex1.mul_in_progress = '1' then + v.mul_in_progress := not multiply_to_x.valid; + v.mul_finish := multiply_to_x.valid and ex1.oe; + v.e.valid := multiply_to_x.valid and not ex1.oe; + v.busy := not v.e.valid; + v.e.write_data := alu_result; + bypass_valid := v.e.valid; end if; if ex1.mul_finish = '1' then + v.mul_finish := '0'; v.e.xerc.ov := multiply_to_x.overflow; v.e.xerc.ov32 := multiply_to_x.overflow; if multiply_to_x.overflow = '1' then v.e.xerc.so := '1'; end if; v.e.valid := '1'; - end if; + end if; - v.e.interrupt := exception and not (l_in.in_progress or l_in.interrupt); - if v.e.interrupt = '1' then - v.intr_pending := '0'; + if v.e.write_xerc_enable = '1' and v.e.valid = '1' then + v.xerc := v.e.xerc; + v.xerc_valid := '1'; end if; - if interrupt_in = '1' then - ctrl_tmp.msr(MSR_SF) <= '1'; - ctrl_tmp.msr(MSR_EE) <= '0'; - ctrl_tmp.msr(MSR_PR) <= '0'; - ctrl_tmp.msr(MSR_SE) <= '0'; - ctrl_tmp.msr(MSR_BE) <= '0'; - ctrl_tmp.msr(MSR_FP) <= '0'; - ctrl_tmp.msr(MSR_FE0) <= '0'; - 
ctrl_tmp.msr(MSR_FE1) <= '0'; - ctrl_tmp.msr(MSR_IR) <= '0'; - ctrl_tmp.msr(MSR_DR) <= '0'; - ctrl_tmp.msr(MSR_RI) <= '0'; - ctrl_tmp.msr(MSR_LE) <= '1'; + if (ex1.busy or l_in.busy or fp_in.busy) = '0' then + v.e.interrupt := exception; + end if; + if v.e.valid = '0' then + v.e.redirect := '0'; + v.e.br_last := '0'; + end if; + if flush_in = '1' then + v.e.valid := '0'; + v.e.interrupt := '0'; + v.e.redirect := '0'; + v.e.br_last := '0'; + v.busy := '0'; + v.div_in_progress := '0'; + v.mul_in_progress := '0'; + v.mul_finish := '0'; + v.xerc_valid := '0'; + end if; + if flush_in = '1' or interrupt_in = '1' then + v.msr := ctrl_tmp.msr; + end if; + if interrupt_in = '1' then v.trace_next := '0'; v.fp_exception_next := '0'; - v.intr_pending := '0'; end if; - bypass_data.tag.valid <= v.e.write_enable and v.e.valid; + bypass_data.tag.valid <= v.e.write_enable and bypass_valid; bypass_data.tag.tag <= v.e.instr_tag.tag; - bypass_data.data <= v.e.write_data; + bypass_data.data <= alu_result; - bypass_cr_data.tag.valid <= v.e.write_cr_enable and v.e.valid; + bypass_cr_data.tag.valid <= v.e.write_cr_enable and bypass_valid; bypass_cr_data.tag.tag <= v.e.instr_tag.tag; bypass_cr_data.data <= v.e.write_cr_data; @@ -1427,7 +1414,7 @@ begin lv.data := c_in; lv.write_reg := e_in.write_reg; lv.length := e_in.data_len; - lv.byte_reverse := e_in.byte_reverse xnor ctrl.msr(MSR_LE); + lv.byte_reverse := e_in.byte_reverse xnor ex1.msr(MSR_LE); lv.sign_extend := e_in.sign_extend; lv.update := e_in.update; lv.xerc := xerc_in; @@ -1439,9 +1426,9 @@ begin e_in.insn(5 downto 1) = "10101" then lv.ci := '1'; end if; - lv.virt_mode := ctrl.msr(MSR_DR); - lv.priv_mode := not ctrl.msr(MSR_PR); - lv.mode_32bit := not ctrl.msr(MSR_SF); + lv.virt_mode := ex1.msr(MSR_DR); + lv.priv_mode := not ex1.msr(MSR_PR); + lv.mode_32bit := not ex1.msr(MSR_SF); lv.is_32bit := e_in.is_32bit; lv.repeat := e_in.repeat; lv.second := e_in.second; @@ -1452,7 +1439,7 @@ begin fv.insn := e_in.insn; fv.itag := 
e_in.instr_tag; fv.single := e_in.is_32bit; - fv.fe_mode := ctrl.msr(MSR_FE0) & ctrl.msr(MSR_FE1); + fv.fe_mode := ex1.msr(MSR_FE0) & ex1.msr(MSR_FE1); fv.fra := a_in; fv.frb := b_in; fv.frc := c_in; @@ -1465,19 +1452,124 @@ begin -- update outputs l_out <= lv; - e_out <= ex1.e; - if ex1.e.valid = '0' then - e_out.write_enable <= '0'; - e_out.write_cr_enable <= '0'; - e_out.write_xerc_enable <= '0'; - e_out.redirect <= '0'; - e_out.br_last <= '0'; + fp_out <= fv; + irq_valid_log <= irq_valid; + end process; + + -- Slow SPR read mux + with ex1.spr_select.sel select spr_result <= + ctrl.tb when SPRSEL_TB, + 32x"0" & ctrl.tb(63 downto 32) when SPRSEL_TBU, + ctrl.dec when SPRSEL_DEC, + 32x"0" & PVR_MICROWATT when SPRSEL_PVR, + log_wr_addr & ex2.log_addr_spr when SPRSEL_LOGA, + log_rd_data when SPRSEL_LOGD, + ctrl.cfar when SPRSEL_CFAR, + assemble_xer(ex1.e.xerc, ctrl.xer_low) when others; + + -- Second stage result mux + with ex1.res2_sel select ex_result <= + countbits_result when "01", + spr_result when "10", + pmu_to_x.spr_val when "11", + ex1.e.write_data when others; + + -- Second execute stage control + execute2_1: process(all) + variable v : reg_stage2_type; + variable overflow : std_ulogic; + variable lv : Execute1ToLoadstore1Type; + variable fv : Execute1ToFPUType; + variable k : integer; + variable go : std_ulogic; + begin + v := ex2; + if (l_in.busy or fp_in.busy) = '0' then + v.e := ex1.e; + v.se := ex1.se; + v.e.write_data := ex_result; + v.ext_interrupt := ex1.ext_interrupt; + v.taken_branch_event := ex1.taken_branch_event; + v.br_mispredict := ex1.br_mispredict; + end if; + + ctrl_tmp <= ctrl; + -- FIXME: run at 512MHz not core freq + ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1); + ctrl_tmp.dec <= std_ulogic_vector(unsigned(ctrl.dec) - 1); + + x_to_pmu.mfspr <= '0'; + x_to_pmu.mtspr <= '0'; + x_to_pmu.tbbits(3) <= ctrl.tb(63 - 47); + x_to_pmu.tbbits(2) <= ctrl.tb(63 - 51); + x_to_pmu.tbbits(1) <= ctrl.tb(63 - 55); + x_to_pmu.tbbits(0) <= 
ctrl.tb(63 - 63); + x_to_pmu.pmm_msr <= ctrl.msr(MSR_PMM); + x_to_pmu.pr_msr <= ctrl.msr(MSR_PR); + + if v.e.valid = '0' or flush_in = '1' then + v.e.write_enable := '0'; + v.e.write_cr_enable := '0'; + v.e.write_xerc_enable := '0'; + v.e.redirect := '0'; + v.e.br_last := '0'; + v.se := side_effect_init; + v.taken_branch_event := '0'; + v.br_mispredict := '0'; + end if; + if flush_in = '1' then + v.e.valid := '0'; + v.e.interrupt := '0'; + v.ext_interrupt := '0'; + end if; + + if (l_in.busy or fp_in.busy) = '0' then + if ex1.se.write_msr = '1' then + ctrl_tmp.msr <= ex1.msr; + end if; + if ex1.se.write_xerlow = '1' then + ctrl_tmp.xer_low <= ex1.e.write_data(17 downto 0); + end if; + if ex1.se.write_dec = '1' then + ctrl_tmp.dec <= ex1.e.write_data; + end if; + if ex1.se.write_cfar = '1' then + ctrl_tmp.cfar <= ex1.e.last_nia; + end if; + if ex1.se.write_loga = '1' then + v.log_addr_spr := ex1.e.write_data(31 downto 0); + elsif ex1.se.inc_loga = '1' then + v.log_addr_spr := std_ulogic_vector(unsigned(ex2.log_addr_spr) + 1); + end if; + x_to_pmu.mtspr <= ex1.se.write_pmuspr; end if; + + if interrupt_in = '1' then + ctrl_tmp.msr(MSR_SF) <= '1'; + ctrl_tmp.msr(MSR_EE) <= '0'; + ctrl_tmp.msr(MSR_PR) <= '0'; + ctrl_tmp.msr(MSR_SE) <= '0'; + ctrl_tmp.msr(MSR_BE) <= '0'; + ctrl_tmp.msr(MSR_FP) <= '0'; + ctrl_tmp.msr(MSR_FE0) <= '0'; + ctrl_tmp.msr(MSR_FE1) <= '0'; + ctrl_tmp.msr(MSR_IR) <= '0'; + ctrl_tmp.msr(MSR_DR) <= '0'; + ctrl_tmp.msr(MSR_RI) <= '0'; + ctrl_tmp.msr(MSR_LE) <= '1'; + end if; + + -- Update registers + ex2in <= v; + + -- update outputs + e_out <= ex2.e; e_out.msr <= msr_copy(ctrl.msr); - fp_out <= fv; - exception_log <= exception; - irq_valid_log <= irq_valid; + terminate_out <= ex2.se.terminate; + icache_inval <= ex2.se.icache_inval; + + exception_log <= v.e.interrupt; end process; e1_log: if LOG_LENGTH > 0 generate @@ -1492,9 +1584,9 @@ begin irq_valid_log & interrupt_in & "000" & - ex1.e.write_enable & - ex1.e.valid & - ((ex1.e.redirect and 
ex1.e.valid) or ex1.e.interrupt) & + ex2.e.write_enable & + ex2.e.valid & + (ex2.e.redirect or ex2.e.interrupt) & ex1.busy & flush_in; end if; From 4b6148ada6a58adb48167733b492c73c505b6930 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 28 Jun 2022 08:40:42 +1000 Subject: [PATCH 06/30] Add a bypass path from the execute2 stage This enables some instructions to issue earlier and thus improves performance, at the cost of some extra multiplexers in decode2. Signed-off-by: Paul Mackerras --- control.vhdl | 50 ++++++++++++++++++++++++++++++-------------------- core.vhdl | 6 ++++++ decode2.vhdl | 34 +++++++++++++++++++++++----------- execute1.vhdl | 16 ++++++++++++++++ 4 files changed, 75 insertions(+), 31 deletions(-) diff --git a/control.vhdl b/control.vhdl index 0bbe9ad..17a288b 100644 --- a/control.vhdl +++ b/control.vhdl @@ -36,6 +36,8 @@ entity control is execute_next_tag : in instr_tag_t; execute_next_cr_tag : in instr_tag_t; + execute2_next_tag : in instr_tag_t; + execute2_next_cr_tag : in instr_tag_t; cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; @@ -44,10 +46,10 @@ entity control is stall_out : out std_ulogic; stopped_out : out std_ulogic; - gpr_bypass_a : out std_ulogic; - gpr_bypass_b : out std_ulogic; - gpr_bypass_c : out std_ulogic; - cr_bypass : out std_ulogic; + gpr_bypass_a : out std_ulogic_vector(1 downto 0); + gpr_bypass_b : out std_ulogic_vector(1 downto 0); + gpr_bypass_c : out std_ulogic_vector(1 downto 0); + cr_bypass : out std_ulogic_vector(1 downto 0); instr_tag_out : out instr_tag_t ); @@ -142,11 +144,11 @@ begin variable tag_s : instr_tag_t; variable tag_t : instr_tag_t; variable incr_tag : tag_number_t; - variable byp_a : std_ulogic; - variable byp_b : std_ulogic; - variable byp_c : std_ulogic; + variable byp_a : std_ulogic_vector(1 downto 0); + variable byp_b : std_ulogic_vector(1 downto 0); + variable byp_c : std_ulogic_vector(1 downto 0); variable tag_cr : instr_tag_t; - variable byp_cr : std_ulogic; + variable byp_cr : 
std_ulogic_vector(1 downto 0); begin tag_a := instr_tag_init; for i in tag_number_t loop @@ -179,26 +181,32 @@ begin tag_c.valid := '0'; end if; - byp_a := '0'; + byp_a := "00"; if EX1_BYPASS and tag_match(execute_next_tag, tag_a) then - byp_a := '1'; + byp_a := "10"; + elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_a) then + byp_a := "11"; end if; - byp_b := '0'; + byp_b := "00"; if EX1_BYPASS and tag_match(execute_next_tag, tag_b) then - byp_b := '1'; + byp_b := "10"; + elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_b) then + byp_b := "11"; end if; - byp_c := '0'; + byp_c := "00"; if EX1_BYPASS and tag_match(execute_next_tag, tag_c) then - byp_c := '1'; + byp_c := "10"; + elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_c) then + byp_c := "11"; end if; gpr_bypass_a <= byp_a; gpr_bypass_b <= byp_b; gpr_bypass_c <= byp_c; - gpr_tag_stall <= (tag_a.valid and not byp_a) or - (tag_b.valid and not byp_b) or - (tag_c.valid and not byp_c); + gpr_tag_stall <= (tag_a.valid and not byp_a(1)) or + (tag_b.valid and not byp_b(1)) or + (tag_c.valid and not byp_c(1)); incr_tag := curr_tag; instr_tag.tag <= curr_tag; @@ -215,13 +223,15 @@ begin if tag_match(tag_cr, complete_in) then tag_cr.valid := '0'; end if; - byp_cr := '0'; + byp_cr := "00"; if EX1_BYPASS and tag_match(execute_next_cr_tag, tag_cr) then - byp_cr := '1'; + byp_cr := "10"; + elsif EX1_BYPASS and tag_match(execute2_next_cr_tag, tag_cr) then + byp_cr := "11"; end if; cr_bypass <= byp_cr; - cr_tag_stall <= tag_cr.valid and not byp_cr; + cr_tag_stall <= tag_cr.valid and not byp_cr(1); end process; control1 : process(all) diff --git a/core.vhdl b/core.vhdl index 070a1f1..84604c6 100644 --- a/core.vhdl +++ b/core.vhdl @@ -79,6 +79,8 @@ architecture behave of core is signal execute1_to_writeback: Execute1ToWritebackType; signal execute1_bypass: bypass_data_t; signal execute1_cr_bypass: cr_bypass_data_t; + signal execute2_bypass: bypass_data_t; + signal execute2_cr_bypass: cr_bypass_data_t; -- load 
store signals signal execute1_to_loadstore1: Execute1ToLoadstore1Type; @@ -298,6 +300,8 @@ begin c_out => decode2_to_cr_file, execute_bypass => execute1_bypass, execute_cr_bypass => execute1_cr_bypass, + execute2_bypass => execute2_bypass, + execute2_cr_bypass => execute2_cr_bypass, log_out => log_data(119 downto 110) ); decode2_busy_in <= ex1_busy_out; @@ -359,6 +363,8 @@ begin e_out => execute1_to_writeback, bypass_data => execute1_bypass, bypass_cr_data => execute1_cr_bypass, + bypass2_data => execute2_bypass, + bypass2_cr_data => execute2_cr_bypass, icache_inval => ex1_icache_inval, dbg_ctrl_out => ctrl_debug, wb_events => writeback_events, diff --git a/decode2.vhdl b/decode2.vhdl index af0c27d..c290c98 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -39,6 +39,8 @@ entity decode2 is execute_bypass : in bypass_data_t; execute_cr_bypass : in cr_bypass_data_t; + execute2_bypass : in bypass_data_t; + execute2_cr_bypass : in cr_bypass_data_t; log_out : out std_ulogic_vector(9 downto 0) ); @@ -273,19 +275,19 @@ architecture behaviour of decode2 is signal gpr_a_read_valid : std_ulogic; signal gpr_a_read : gspr_index_t; - signal gpr_a_bypass : std_ulogic; + signal gpr_a_bypass : std_ulogic_vector(1 downto 0); signal gpr_b_read_valid : std_ulogic; signal gpr_b_read : gspr_index_t; - signal gpr_b_bypass : std_ulogic; + signal gpr_b_bypass : std_ulogic_vector(1 downto 0); signal gpr_c_read_valid : std_ulogic; signal gpr_c_read : gspr_index_t; - signal gpr_c_bypass : std_ulogic; + signal gpr_c_bypass : std_ulogic_vector(1 downto 0); signal cr_read_valid : std_ulogic; signal cr_write_valid : std_ulogic; - signal cr_bypass : std_ulogic; + signal cr_bypass : std_ulogic_vector(1 downto 0); signal instr_tag : instr_tag_t; @@ -321,6 +323,8 @@ begin execute_next_tag => execute_bypass.tag, execute_next_cr_tag => execute_cr_bypass.tag, + execute2_next_tag => execute2_bypass.tag, + execute2_next_cr_tag => execute2_cr_bypass.tag, cr_read_in => cr_read_valid, cr_write_in => 
cr_write_valid, @@ -504,27 +508,35 @@ begin -- See if any of the operands can get their value via the bypass path. case gpr_a_bypass is - when '1' => + when "10" => v.e.read_data1 := execute_bypass.data; + when "11" => + v.e.read_data1 := execute2_bypass.data; when others => v.e.read_data1 := decoded_reg_a.data; end case; case gpr_b_bypass is - when '1' => + when "10" => v.e.read_data2 := execute_bypass.data; + when "11" => + v.e.read_data2 := execute2_bypass.data; when others => v.e.read_data2 := decoded_reg_b.data; end case; case gpr_c_bypass is - when '1' => + when "10" => v.e.read_data3 := execute_bypass.data; + when "11" => + v.e.read_data3 := execute2_bypass.data; when others => v.e.read_data3 := decoded_reg_c.data; end case; v.e.cr := c_in.read_cr_data; - if cr_bypass = '1' then + if cr_bypass = "10" then v.e.cr := execute_cr_bypass.data; + elsif cr_bypass = "11" then + v.e.cr := execute2_cr_bypass.data; end if; -- issue control @@ -577,9 +589,9 @@ begin r.e.valid & stopped_out & stall_out & - gpr_a_bypass & - gpr_b_bypass & - gpr_c_bypass; + (gpr_a_bypass(1) or gpr_a_bypass(0)) & + (gpr_b_bypass(1) or gpr_b_bypass(0)) & + (gpr_c_bypass(1) or gpr_c_bypass(0)); end if; end process; log_out <= log_data; diff --git a/execute1.vhdl b/execute1.vhdl index ebcdfeb..ebc24c5 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -40,6 +40,8 @@ entity execute1 is e_out : out Execute1ToWritebackType; bypass_data : out bypass_data_t; bypass_cr_data : out cr_bypass_data_t; + bypass2_data : out bypass_data_t; + bypass2_cr_data : out cr_bypass_data_t; dbg_ctrl_out : out ctrl_t; @@ -1482,6 +1484,7 @@ begin variable fv : Execute1ToFPUType; variable k : integer; variable go : std_ulogic; + variable bypass_valid : std_ulogic; begin v := ex2; if (l_in.busy or fp_in.busy) = '0' then @@ -1559,6 +1562,19 @@ begin ctrl_tmp.msr(MSR_LE) <= '1'; end if; + bypass_valid := ex1.e.valid; + if (ex2.busy or l_in.busy or fp_in.busy) = '1' and ex1.res2_sel(1) = '1' then + bypass_valid := '0'; + 
end if; + + bypass2_data.tag.valid <= ex1.e.write_enable and bypass_valid; + bypass2_data.tag.tag <= ex1.e.instr_tag.tag; + bypass2_data.data <= ex_result; + + bypass2_cr_data.tag.valid <= ex1.e.write_cr_enable and bypass_valid; + bypass2_cr_data.tag.tag <= ex1.e.instr_tag.tag; + bypass2_cr_data.data <= ex1.e.write_cr_data; + -- Update registers ex2in <= v; From e030a500e85ad0e22e47dfb7af087e7fef9df20d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 27 Jun 2022 18:53:04 +1000 Subject: [PATCH 07/30] Allow integer instructions and load/store instructions to execute together Execute1 and loadstore1 now send each other stall signals that indicate that a valid instruction in stage 2 can't complete in this cycle, and hence any valid instruction in stage 1 in the other unit can't move to stage 2. With this in place, an ALU instruction can move into stage 1 while an LSU instruction is in stage 2. Since the FPU doesn't yet have a way to stall completion, we can't yet start FPU instructions while any LSU or ALU instruction is in progress.
Signed-off-by: Paul Mackerras --- common.vhdl | 5 +++-- countbits.vhdl | 5 +++-- execute1.vhdl | 22 +++++++++++++--------- loadstore1.vhdl | 6 +++--- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/common.vhdl b/common.vhdl index 6cbf181..ac733db 100644 --- a/common.vhdl +++ b/common.vhdl @@ -461,6 +461,7 @@ package common is is_32bit : std_ulogic; repeat : std_ulogic; second : std_ulogic; + e2stall : std_ulogic; msr : std_ulogic_vector(63 downto 0); end record; constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := @@ -473,13 +474,13 @@ package common is write_reg => (others => '0'), length => (others => '0'), mode_32bit => '0', is_32bit => '0', - repeat => '0', second => '0', + repeat => '0', second => '0', e2stall => '0', msr => (others => '0')); type Loadstore1ToExecute1Type is record busy : std_ulogic; + l2stall : std_ulogic; in_progress : std_ulogic; - interrupt : std_ulogic; end record; type Loadstore1ToDcacheType is record diff --git a/countbits.vhdl b/countbits.vhdl index b16baa0..87417a9 100644 --- a/countbits.vhdl +++ b/countbits.vhdl @@ -9,6 +9,7 @@ entity bit_counter is port ( clk : in std_logic; rs : in std_ulogic_vector(63 downto 0); + stall : in std_ulogic; count_right : in std_ulogic; do_popcnt : in std_ulogic; is_32bit : in std_ulogic; @@ -49,7 +50,7 @@ architecture behaviour of bit_counter is begin countzero_r: process(clk) begin - if rising_edge(clk) then + if rising_edge(clk) and stall = '0' then inp_r <= inp; sum_r <= sum; end if; @@ -88,7 +89,7 @@ begin popcnt_r: process(clk) begin - if rising_edge(clk) then + if rising_edge(clk) and stall = '0' then for i in 0 to 7 loop pc8_r(i) <= pc8(i); end loop; diff --git a/execute1.vhdl b/execute1.vhdl index ebc24c5..e4db56f 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -204,6 +204,8 @@ architecture behaviour of execute1 is signal exception_log : std_ulogic; signal irq_valid_log : std_ulogic; + signal stage2_stall : std_ulogic; + type privilege_level is (USER, SUPER); type 
op_privilege_array is array(insn_type_t) of privilege_level; constant op_privilege: op_privilege_array := ( @@ -351,6 +353,7 @@ begin port map ( clk => clk, rs => c_in, + stall => stage2_stall, count_right => e_in.insn(10), is_32bit => e_in.is_32bit, do_popcnt => do_popcnt, @@ -436,14 +439,13 @@ begin -- XER forwarding. To avoid having to track XER hazards, we use -- the previously latched value. Since the XER common bits -- (SO, OV[32] and CA[32]) are only modified by instructions that are - -- handled here, we can just forward the result being sent to - -- writeback. + -- handled here, we can just use the result most recently sent to + -- writeback, unless a pipeline flush has happened in the meantime. xerc_in <= ex1.xerc when ex1.xerc_valid = '1' else e_in.xerc; with e_in.unit select busy_out <= - l_in.busy or ex1.e.valid or ex1.busy or fp_in.busy when LDST, l_in.busy or l_in.in_progress or ex1.e.valid or ex1.busy or fp_in.busy when FPU, - l_in.busy or l_in.in_progress or ex1.busy or fp_in.busy when others; + l_in.busy or ex1.busy or fp_in.busy when others; valid_in <= e_in.valid and not (busy_out or flush_in or ex1.e.redirect or ex1.e.interrupt); @@ -479,8 +481,7 @@ begin -- We mustn't get stalled on a cycle where execute2 is -- completing an instruction or generating an interrupt if ex2.e.valid = '1' or ex2.e.interrupt = '1' then - assert (l_in.busy or fp_in.busy) = '0' - severity failure; + assert stage2_stall = '0' severity failure; end if; end if; end if; @@ -1434,6 +1435,7 @@ begin lv.is_32bit := e_in.is_32bit; lv.repeat := e_in.repeat; lv.second := e_in.second; + lv.e2stall := '0'; -- Outputs to FPU fv.op := e_in.insn_type; @@ -1476,6 +1478,8 @@ begin pmu_to_x.spr_val when "11", ex1.e.write_data when others; + stage2_stall <= l_in.l2stall or fp_in.busy; + -- Second execute stage control execute2_1: process(all) variable v : reg_stage2_type; @@ -1487,7 +1491,7 @@ begin variable bypass_valid : std_ulogic; begin v := ex2; - if (l_in.busy or fp_in.busy) = '0' 
then + if stage2_stall = '0' then v.e := ex1.e; v.se := ex1.se; v.e.write_data := ex_result; @@ -1526,7 +1530,7 @@ begin v.ext_interrupt := '0'; end if; - if (l_in.busy or fp_in.busy) = '0' then + if stage2_stall = '0' then if ex1.se.write_msr = '1' then ctrl_tmp.msr <= ex1.msr; end if; @@ -1563,7 +1567,7 @@ begin end if; bypass_valid := ex1.e.valid; - if (ex2.busy or l_in.busy or fp_in.busy) = '1' and ex1.res2_sel(1) = '1' then + if stage2_stall = '1' and ex1.res2_sel(1) = '1' then bypass_valid := '0'; end if; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index ea7baec..bd62f0b 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -624,7 +624,7 @@ begin store_data(i * 8 + 7 downto i * 8) <= r1.req.store_data(j + 7 downto j); end loop; - if (dc_stall or d_in.error or r2.busy) = '0' then + if (dc_stall or d_in.error or r2.busy or l_in.e2stall) = '0' then if r1.req.valid = '0' or r1.issued = '1' or r1.req.dc_req = '0' then v.req := r1.req; v.addr0 := r1.addr0; @@ -950,7 +950,7 @@ begin else d_out.data <= r2.req.store_data; end if; - d_out.hold <= '0'; + d_out.hold <= l_in.e2stall; -- Update outputs to MMU m_out.valid <= mmureq; @@ -980,8 +980,8 @@ begin -- update busy signal back to execute1 e_out.busy <= busy; + e_out.l2stall <= dc_stall or d_in.error or r2.busy; e_out.in_progress <= in_progress; - e_out.interrupt <= r3.interrupt; events <= r3.events; From ef122868d55d4681c4823ea9705179a60fc04da6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 28 Jun 2022 18:18:08 +1000 Subject: [PATCH 08/30] Do CR0 setting for Rc=1 instructions in execute2 instead of writeback This lets us forward the CR0 result to following instructions that use CR, meaning they get to issue one cycle earlier. 
Signed-off-by: Paul Mackerras --- execute1.vhdl | 67 +++++++++++++++++++++++++++++++++++++++++--------- writeback.vhdl | 23 +---------------- 2 files changed, 56 insertions(+), 34 deletions(-) diff --git a/execute1.vhdl b/execute1.vhdl index e4db56f..75e8275 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -169,7 +169,6 @@ architecture behaviour of execute1 is signal muldiv_result: std_ulogic_vector(63 downto 0); signal shortmul_result: std_ulogic_vector(63 downto 0); signal spr_result: std_ulogic_vector(63 downto 0); - signal ex_result: std_ulogic_vector(63 downto 0); signal next_nia : std_ulogic_vector(63 downto 0); signal s1_sel : std_ulogic_vector(2 downto 0); @@ -799,8 +798,10 @@ begin crnum := fxm_to_num(insn_fxm(e_in.insn)); write_cr_mask <= num_to_fxm(crnum); end if; - else + elsif e_in.output_cr = '1' then write_cr_mask <= num_to_fxm(crnum); + else + write_cr_mask <= (others => '0'); end if; for i in 0 to 7 loop if write_cr_mask(i) = '0' then @@ -1471,13 +1472,6 @@ begin ctrl.cfar when SPRSEL_CFAR, assemble_xer(ex1.e.xerc, ctrl.xer_low) when others; - -- Second stage result mux - with ex1.res2_sel select ex_result <= - countbits_result when "01", - spr_result when "10", - pmu_to_x.spr_val when "11", - ex1.e.write_data when others; - stage2_stall <= l_in.l2stall or fp_in.busy; -- Second execute stage control @@ -1489,12 +1483,18 @@ begin variable k : integer; variable go : std_ulogic; variable bypass_valid : std_ulogic; + variable rcresult : std_ulogic_vector(63 downto 0); + variable sprres : std_ulogic_vector(63 downto 0); + variable ex_result : std_ulogic_vector(63 downto 0); + variable cr_res : std_ulogic_vector(31 downto 0); + variable cr_mask : std_ulogic_vector(7 downto 0); + variable sign, zero : std_ulogic; + variable rcnz_hi, rcnz_lo : std_ulogic; begin v := ex2; if stage2_stall = '0' then v.e := ex1.e; v.se := ex1.se; - v.e.write_data := ex_result; v.ext_interrupt := ex1.ext_interrupt; v.taken_branch_event := ex1.taken_branch_event; 
v.br_mispredict := ex1.br_mispredict; @@ -1530,7 +1530,49 @@ begin v.ext_interrupt := '0'; end if; + -- This is split like this because mfspr doesn't have an Rc bit, + -- and we don't want the zero-detect logic to be after the + -- SPR mux for timing reasons. + if ex1.res2_sel(0) = '0' then + rcresult := ex1.e.write_data; + sprres := spr_result; + else + rcresult := countbits_result; + sprres := pmu_to_x.spr_val; + end if; + if ex1.res2_sel(1) = '0' then + ex_result := rcresult; + else + ex_result := sprres; + end if; + + cr_res := ex1.e.write_cr_data; + cr_mask := ex1.e.write_cr_mask; + if ex1.e.rc = '1' and ex1.e.write_enable = '1' then + rcnz_lo := or (rcresult(31 downto 0)); + if ex1.e.mode_32bit = '0' then + rcnz_hi := or (rcresult(63 downto 32)); + zero := not (rcnz_hi or rcnz_lo); + sign := ex_result(63); + else + zero := not rcnz_lo; + sign := ex_result(31); + end if; + cr_res(31) := sign; + cr_res(30) := not (sign or zero); + cr_res(29) := zero; + cr_res(28) := ex1.xerc.so; + cr_mask(7) := '1'; + end if; + if stage2_stall = '0' then + v.e.write_data := ex_result; + v.e.write_cr_data := cr_res; + v.e.write_cr_mask := cr_mask; + if ex1.e.rc = '1' and ex1.e.write_enable = '1' and v.e.valid = '1' then + v.e.write_cr_enable := '1'; + end if; + if ex1.se.write_msr = '1' then ctrl_tmp.msr <= ex1.msr; end if; @@ -1575,9 +1617,10 @@ begin bypass2_data.tag.tag <= ex1.e.instr_tag.tag; bypass2_data.data <= ex_result; - bypass2_cr_data.tag.valid <= ex1.e.write_cr_enable and bypass_valid; + bypass2_cr_data.tag.valid <= (ex1.e.write_cr_enable or (ex1.e.rc and ex1.e.write_enable)) + and bypass_valid; bypass2_cr_data.tag.tag <= ex1.e.instr_tag.tag; - bypass2_cr_data.data <= ex1.e.write_cr_data; + bypass2_cr_data.data <= cr_res; -- Update registers ex2in <= v; diff --git a/writeback.vhdl b/writeback.vhdl index a99d4d2..db30164 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -66,7 +66,7 @@ begin to_integer(unsigned(w))) <= 1 severity failure; w(0) := 
e_in.write_cr_enable; - x(0) := (e_in.write_enable and e_in.rc); + x(0) := l_in.rc; y(0) := fp_in.write_cr_enable; assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; @@ -80,9 +80,6 @@ begin writeback_1: process(all) variable v : reg_type; variable f : WritebackToFetch1Type; - variable cf: std_ulogic_vector(3 downto 0); - variable zero : std_ulogic; - variable sign : std_ulogic; variable scf : std_ulogic_vector(3 downto 0); variable vec : integer range 0 to 16#fff#; variable srr1 : std_ulogic_vector(15 downto 0); @@ -186,24 +183,6 @@ begin c_out.write_cr_data(31 downto 28) <= scf; end if; - -- Perform CR0 update for RC forms - -- Note that loads never have a form with an RC bit, therefore this can test e_in.write_data - if e_in.rc = '1' and e_in.write_enable = '1' then - zero := not (or e_in.write_data(31 downto 0)); - if e_in.mode_32bit = '0' then - sign := e_in.write_data(63); - zero := zero and not (or e_in.write_data(63 downto 32)); - else - sign := e_in.write_data(31); - end if; - c_out.write_cr_enable <= '1'; - c_out.write_cr_mask <= num_to_fxm(0); - cf(3) := sign; - cf(2) := not sign and not zero; - cf(1) := zero; - cf(0) := e_in.xerc.so; - c_out.write_cr_data(31 downto 28) <= cf; - end if; end if; -- Outputs to fetch1 From 9a8a8e50f8e886a90315091fe8d9e584c8429493 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 2 Jul 2022 14:17:18 +1000 Subject: [PATCH 09/30] FPU: Add stage-2 stall ability to FPU This makes the FPU able to stall other units at execute stage 2 and be stalled by other units (specifically the LSU). This means that the completion and writeback for an instruction can now end up being deferred until the second cycle of a following instruction, i.e. the cycle when the state machine has gone through IDLE state into one of the DO_* states, which means we need to latch the destination FPR number, CR mask, etc. 
from the previous instruction so that we present the correct information to writeback. The advantage of this is that we can get rid of the in_progress signal from the LSU. Signed-off-by: Paul Mackerras --- common.vhdl | 5 +- core.vhdl | 1 + execute1.vhdl | 14 ++-- fpu.vhdl | 169 ++++++++++++++++++++++++++++++------------------ loadstore1.vhdl | 3 - 5 files changed, 118 insertions(+), 74 deletions(-) diff --git a/common.vhdl b/common.vhdl index ac733db..ea6a8d8 100644 --- a/common.vhdl +++ b/common.vhdl @@ -480,7 +480,6 @@ package common is type Loadstore1ToExecute1Type is record busy : std_ulogic; l2stall : std_ulogic; - in_progress : std_ulogic; end record; type Loadstore1ToDcacheType is record @@ -640,16 +639,18 @@ package common is frt : gspr_index_t; rc : std_ulogic; out_cr : std_ulogic; + stall : std_ulogic; end record; constant Execute1ToFPUInit : Execute1ToFPUType := (valid => '0', op => OP_ILLEGAL, nia => (others => '0'), itag => instr_tag_init, insn => (others => '0'), fe_mode => "00", rc => '0', fra => (others => '0'), frb => (others => '0'), frc => (others => '0'), frt => (others => '0'), - single => '0', out_cr => '0'); + single => '0', out_cr => '0', stall => '0'); type FPUToExecute1Type is record busy : std_ulogic; + f2stall : std_ulogic; exception : std_ulogic; end record; constant FPUToExecute1Init : FPUToExecute1Type := (others => '0'); diff --git a/core.vhdl b/core.vhdl index 84604c6..23f7e82 100644 --- a/core.vhdl +++ b/core.vhdl @@ -384,6 +384,7 @@ begin port map ( clk => clk, rst => rst_fpu, + flush_in => flush, e_in => execute1_to_fpu, e_out => fpu_to_execute1, w_out => fpu_to_writeback diff --git a/execute1.vhdl b/execute1.vhdl index 75e8275..57f90b0 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -442,9 +442,9 @@ begin -- writeback, unless a pipeline flush has happened in the meantime. 
xerc_in <= ex1.xerc when ex1.xerc_valid = '1' else e_in.xerc; - with e_in.unit select busy_out <= - l_in.busy or l_in.in_progress or ex1.e.valid or ex1.busy or fp_in.busy when FPU, - l_in.busy or ex1.busy or fp_in.busy when others; + -- N.B. the busy signal from each source includes the + -- stage2 stall from that source in it. + busy_out <= l_in.busy or ex1.busy or fp_in.busy; valid_in <= e_in.valid and not (busy_out or flush_in or ex1.e.redirect or ex1.e.interrupt); @@ -1299,8 +1299,7 @@ begin end if; end if; - v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or - ex1.busy or fp_in.busy); + v.no_instr_avail := not (e_in.valid or l_in.busy or ex1.busy or fp_in.busy); go := valid_in and not exception; v.instr_dispatch := go; @@ -1436,7 +1435,7 @@ begin lv.is_32bit := e_in.is_32bit; lv.repeat := e_in.repeat; lv.second := e_in.second; - lv.e2stall := '0'; + lv.e2stall := fp_in.f2stall; -- Outputs to FPU fv.op := e_in.insn_type; @@ -1451,6 +1450,7 @@ begin fv.frt := e_in.write_reg; fv.rc := e_in.rc; fv.out_cr := e_in.output_cr; + fv.stall := l_in.l2stall; -- Update registers ex1in <= v; @@ -1472,7 +1472,7 @@ begin ctrl.cfar when SPRSEL_CFAR, assemble_xer(ex1.e.xerc, ctrl.xer_low) when others; - stage2_stall <= l_in.l2stall or fp_in.busy; + stage2_stall <= l_in.l2stall or fp_in.f2stall; -- Second execute stage control execute2_1: process(all) diff --git a/fpu.vhdl b/fpu.vhdl index fad09cc..a20a7a0 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -15,6 +15,7 @@ entity fpu is port ( clk : in std_ulogic; rst : in std_ulogic; + flush_in : in std_ulogic; e_in : in Execute1ToFPUType; e_out : out FPUToExecute1Type; @@ -35,7 +36,7 @@ architecture behaviour of fpu is mantissa : std_ulogic_vector(63 downto 0); -- 10.54 format end record; - type state_t is (IDLE, + type state_t is (IDLE, DO_ILLEGAL, DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF, DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT, DO_FCFID, DO_FCTI, @@ -71,7 +72,9 @@ architecture behaviour of fpu is type 
reg_type is record state : state_t; busy : std_ulogic; + f2stall : std_ulogic; instr_done : std_ulogic; + complete : std_ulogic; do_intr : std_ulogic; illegal : std_ulogic; op : insn_type_t; @@ -83,7 +86,9 @@ architecture behaviour of fpu is rc : std_ulogic; is_cmp : std_ulogic; single_prec : std_ulogic; + sp_result : std_ulogic; fpscr : std_ulogic_vector(31 downto 0); + comm_fpscr : std_ulogic_vector(31 downto 0); -- committed FPSCR value a : fpu_reg_type; b : fpu_reg_type; c : fpu_reg_type; @@ -96,13 +101,17 @@ architecture behaviour of fpu is result_class : fp_number_class; result_exp : signed(EXP_BITS-1 downto 0); shift : signed(EXP_BITS-1 downto 0); - writing_back : std_ulogic; + writing_fpr : std_ulogic; + write_reg : gspr_index_t; + complete_tag : instr_tag_t; + writing_cr : std_ulogic; int_result : std_ulogic; cr_result : std_ulogic_vector(3 downto 0); cr_mask : std_ulogic_vector(7 downto 0); old_exc : std_ulogic_vector(4 downto 0); update_fprf : std_ulogic; quieten_nan : std_ulogic; + nsnan_result : std_ulogic; tiny : std_ulogic; denorm : std_ulogic; round_mode : std_ulogic_vector(2 downto 0); @@ -542,17 +551,30 @@ begin fpu_0: process(clk) begin if rising_edge(clk) then - if rst = '1' then + if rst = '1' or flush_in = '1' then r.state <= IDLE; r.busy <= '0'; + r.f2stall <= '0'; r.instr_done <= '0'; + r.complete <= '0'; + r.illegal <= '0'; r.do_intr <= '0'; + r.writing_fpr <= '0'; + r.writing_cr <= '0'; r.fpscr <= (others => '0'); - r.writing_back <= '0'; - r.dest_fpr <= (others =>'0'); + r.write_reg <= (others =>'0'); + r.complete_tag.valid <= '0'; r.cr_mask <= (others =>'0'); r.cr_result <= (others =>'0'); r.instr_tag.valid <= '0'; + if rst = '1' then + r.fpscr <= (others => '0'); + r.comm_fpscr <= (others => '0'); + elsif r.do_intr = '0' then + -- flush_in = 1 and not due to us generating an interrupt, + -- roll back to committed fpscr + r.fpscr <= r.comm_fpscr; + end if; else assert not (r.state /= IDLE and e_in.valid = '1') severity failure; r <= rin; 
@@ -577,14 +599,19 @@ begin end process; e_out.busy <= r.busy; + e_out.f2stall <= r.f2stall; e_out.exception <= r.fpscr(FPSCR_FEX); - w_out.valid <= r.instr_done and not r.do_intr; - w_out.instr_tag <= r.instr_tag; - w_out.write_enable <= r.writing_back; - w_out.write_reg <= r.dest_fpr; + -- Note that the cycle where r.complete = 1 for an instruction can be as + -- late as the second cycle of the following instruction (i.e. in the state + -- following IDLE state). Hence it is important that none of the fields of + -- r that are used below are modified in IDLE state. + w_out.valid <= r.complete; + w_out.instr_tag <= r.complete_tag; + w_out.write_enable <= r.writing_fpr and r.complete; + w_out.write_reg <= r.write_reg; w_out.write_data <= fp_result; - w_out.write_cr_enable <= r.instr_done and (r.rc or r.is_cmp); + w_out.write_cr_enable <= r.writing_cr and r.complete; w_out.write_cr_mask <= r.cr_mask; w_out.write_cr_data <= r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result; @@ -599,7 +626,6 @@ begin variable bdec : fpu_reg_type; variable cdec : fpu_reg_type; variable fpscr_mask : std_ulogic_vector(31 downto 0); - variable illegal : std_ulogic; variable j, k : integer; variable flm : std_ulogic_vector(7 downto 0); variable int_input : std_ulogic; @@ -644,12 +670,22 @@ begin variable maddend : std_ulogic_vector(127 downto 0); variable sum : std_ulogic_vector(63 downto 0); variable round_inc : std_ulogic_vector(63 downto 0); + variable int_result : std_ulogic; + variable illegal : std_ulogic; begin v := r; - illegal := '0'; - v.busy := '0'; + v.complete := '0'; + v.do_intr := '0'; int_input := '0'; + if r.complete = '1' or r.do_intr = '1' then + v.instr_done := '0'; + v.writing_fpr := '0'; + v.writing_cr := '0'; + v.comm_fpscr := r.fpscr; + v.illegal := '0'; + end if; + -- capture incoming instruction if e_in.valid = '1' then v.insn := e_in.insn; @@ -660,14 +696,8 @@ begin v.dest_fpr := e_in.frt; v.single_prec 
:= e_in.single; v.longmask := e_in.single; - v.int_result := '0'; v.rc := e_in.rc; v.is_cmp := e_in.out_cr; - if e_in.out_cr = '0' then - v.cr_mask := num_to_fxm(1); - else - v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(e_in.insn)))); - end if; int_input := '0'; if e_in.op = OP_FPOP_I then int_input := '1'; @@ -741,8 +771,6 @@ begin pcmpb_lt := '1'; end if; - v.writing_back := '0'; - v.instr_done := '0'; v.update_fprf := '0'; v.shift := to_signed(0, EXP_BITS); v.first := '0'; @@ -777,6 +805,8 @@ begin pshift := '0'; renorm_sqrt := '0'; shiftin := '0'; + int_result := '0'; + illegal := '0'; case r.state is when IDLE => v.use_a := '0'; @@ -785,6 +815,7 @@ begin v.invalid := '0'; v.negate := '0'; if e_in.valid = '1' then + v.busy := '1'; case e_in.insn(5 downto 1) is when "00000" => if e_in.insn(8) = '1' then @@ -876,13 +907,17 @@ begin end if; v.state := DO_FMADD; when others => - illegal := '1'; + v.state := DO_ILLEGAL; end case; end if; v.x := '0'; v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX); set_s := '1'; + when DO_ILLEGAL => + illegal := '1'; + v.instr_done := '1'; + when DO_MCRFS => j := to_integer(unsigned(insn_bfa(r.insn))); for i in 0 to 7 loop @@ -894,11 +929,9 @@ begin end loop; v.fpscr := r.fpscr and (fpscr_mask or x"6007F8FF"); v.instr_done := '1'; - v.state := IDLE; when DO_FTDIV => v.instr_done := '1'; - v.state := IDLE; v.cr_result := "0000"; if r.a.class = INFINITY or r.b.class = ZERO or r.b.class = INFINITY or (r.b.class = FINITE and r.b.mantissa(53) = '0') then @@ -917,7 +950,6 @@ begin when DO_FTSQRT => v.instr_done := '1'; - v.state := IDLE; v.cr_result := "0000"; if r.b.class = ZERO or r.b.class = INFINITY or (r.b.class = FINITE and r.b.mantissa(53) = '0') then @@ -932,7 +964,6 @@ begin -- fcmp[uo] -- r.opsel_a = AIN_B v.instr_done := '1'; - v.state := IDLE; update_fx := '1'; v.result_exp := r.b.exponent; if (r.a.class = NAN and r.a.mantissa(53) = '0') or @@ -993,7 +1024,6 @@ begin end if; end loop; v.instr_done := '1'; - v.state := 
IDLE; when DO_MTFSFI => -- mtfsfi @@ -1007,20 +1037,17 @@ begin end loop; end if; v.instr_done := '1'; - v.state := IDLE; when DO_FMRG => -- fmrgew, fmrgow opsel_r <= RES_MISC; misc_sel <= "01" & r.insn(8) & '0'; - v.int_result := '1'; - v.writing_back := '1'; + int_result := '1'; + v.writing_fpr := '1'; v.instr_done := '1'; - v.state := IDLE; when DO_MFFS => - v.int_result := '1'; - v.writing_back := '1'; + v.writing_fpr := '1'; opsel_r <= RES_MISC; case r.insn(20 downto 16) is when "00000" => @@ -1044,10 +1071,11 @@ begin -- mffsl fpscr_mask := x"0007F0FF"; when others => - illegal := '1'; + v.illegal := '1'; + v.writing_fpr := '0'; end case; + int_result := '1'; v.instr_done := '1'; - v.state := IDLE; when DO_MTFSF => if r.insn(25) = '1' then @@ -1064,7 +1092,6 @@ begin end if; end loop; v.instr_done := '1'; - v.state := IDLE; when DO_FMR => -- r.opsel_a = AIN_B @@ -1082,9 +1109,8 @@ begin else v.result_sign := r.a.negative; -- fcpsgn end if; - v.writing_back := '1'; + v.writing_fpr := '1'; v.instr_done := '1'; - v.state := IDLE; when DO_FRI => -- fri[nzpm] -- r.opsel_a = AIN_B @@ -1153,7 +1179,7 @@ begin invalid := '1'; end if; - v.int_result := '1'; + int_result := '1'; case r.b.class is when ZERO => arith_done := '1'; @@ -1671,7 +1697,6 @@ begin end if; v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result; v.instr_done := '1'; - v.state := IDLE; when MULT_1 => f_to_multiply.valid <= r.first; @@ -1849,7 +1874,6 @@ begin v.cr_result(1) := exp_tiny or exp_huge; if exp_tiny = '1' or exp_huge = '1' or r.a.class = ZERO or r.first = '0' then v.instr_done := '1'; - v.state := IDLE; else v.shift := r.a.exponent; v.doing_ftdiv := "10"; @@ -2054,6 +2078,7 @@ begin when others => -- fctidu[z] need_check := r.r(63); end case; + int_result := '1'; if need_check = '1' then v.state := INT_CHECK; else @@ -2080,6 +2105,7 @@ begin v.fpscr(FPSCR_XX) := '1'; end if; end if; + int_result := '1'; arith_done := '1'; when INT_OFLOW => @@ -2090,6 +2116,7 @@ begin end if; 
v.fpscr(FPSCR_VXCVI) := '1'; invalid := '1'; + int_result := '1'; arith_done := '1'; when FRI_1 => @@ -2306,11 +2333,10 @@ begin -- Neither does enabled zero-divide exception if (v.invalid and r.fpscr(FPSCR_VE)) = '0' and (zero_divide and r.fpscr(FPSCR_ZE)) = '0' then - v.writing_back := '1'; + v.writing_fpr := '1'; v.update_fprf := '1'; end if; v.instr_done := '1'; - v.state := IDLE; update_fx := '1'; end if; @@ -2530,12 +2556,6 @@ begin v.shift := resize(signed('0' & clz) - 9, EXP_BITS); end if; - if r.int_result = '1' then - fp_result <= r.r; - else - fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r, - r.single_prec, r.quieten_nan); - end if; if r.update_fprf = '1' then v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.result_sign, r.result_class, r.r(54) and not r.denorm); @@ -2549,24 +2569,49 @@ begin (v.fpscr(FPSCR_VX downto FPSCR_XX) and not r.old_exc) /= "00000" then v.fpscr(FPSCR_FX) := '1'; end if; - if r.rc = '1' then - v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX); - end if; - v.illegal := illegal; - if illegal = '1' then - v.instr_done := '0'; - v.do_intr := '1'; - v.writing_back := '0'; - v.busy := '0'; - v.state := IDLE; + if v.instr_done = '1' then + if r.state /= IDLE then + v.state := IDLE; + v.busy := '0'; + v.f2stall := '0'; + if r.rc = '1' then + v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX); + end if; + v.sp_result := r.single_prec; + v.int_result := int_result; + v.illegal := illegal; + v.nsnan_result := v.quieten_nan; + if r.is_cmp = '0' then + v.cr_mask := num_to_fxm(1); + else + v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(r.insn)))); + end if; + v.writing_cr := r.is_cmp or r.rc; + v.write_reg := r.dest_fpr; + v.complete_tag := r.instr_tag; + end if; + if e_in.stall = '0' then + v.complete := not v.illegal; + v.do_intr := (v.fpscr(FPSCR_FEX) and r.fe_mode) or v.illegal; + end if; + -- N.B. 
We rely on execute1 to prevent any new instruction + -- coming in while e_in.stall = 1, without us needing to + -- have busy asserted. else - v.do_intr := v.instr_done and v.fpscr(FPSCR_FEX) and r.fe_mode; - if v.state /= IDLE or v.do_intr = '1' then - v.busy := '1'; + if r.state /= IDLE and e_in.stall = '0' then + v.f2stall := '1'; end if; end if; + -- This mustn't depend on any fields of r that are modified in IDLE state. + if r.int_result = '1' then + fp_result <= r.r; + else + fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r, + r.sp_result, r.nsnan_result); + end if; + rin <= v; end process; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index bd62f0b..ff2633b 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -159,7 +159,6 @@ architecture behave of loadstore1 is signal flush : std_ulogic; signal busy : std_ulogic; signal complete : std_ulogic; - signal in_progress : std_ulogic; signal flushing : std_ulogic; signal store_sp_data : std_ulogic_vector(31 downto 0); @@ -523,7 +522,6 @@ begin busy <= dc_stall or d_in.error or r1.busy or r2.busy; complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or r3.complete; - in_progress <= r1.req.valid or (r2.req.valid and not complete); -- Processing done in the first cycle of a load/store instruction loadstore1_1: process(all) @@ -981,7 +979,6 @@ begin -- update busy signal back to execute1 e_out.busy <= busy; e_out.l2stall <= dc_stall or d_in.error or r2.busy; - e_out.in_progress <= in_progress; events <= r3.events; From ebe1caab85c35497e733c566fc9750813f505e5d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 2 Jul 2022 22:23:35 +1000 Subject: [PATCH 10/30] decode1: Reduce number of single-issue instructions This reduces the set of instructions marked as single-issue to just attn and mtspr to "slow" SPRs (those that are not stored in the register file). 
The instructions that were previously single-issue are: isync, dcbf, dcbst, dcbt, dcbtst, eieio, icbi, mfmsr, mtmsr, mtmsrd, mfspr to slow SPRs, sync, tlbsync and wait. The synchronization instructions are mostly no-ops anyway due to the in-order nature of the core, and the cache-management instructions are unimplemented (except for icbi). The MSR ops don't need to be single-issue due to the in-order core and the fact that MSR updates are effective on the following instruction. Signed-off-by: Paul Mackerras --- decode1.vhdl | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index fb92b9e..3f3109f 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -170,7 +170,7 @@ architecture behaviour of decode1 is -- bclr, bcctr, bctar 2#100# => (ALU, NONE, OP_BCREG, SPR, SPR, NONE, SPR, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- isync - 2#111# => (ALU, NONE, OP_ISYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), + 2#111# => (ALU, NONE, OP_ISYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- rfid 2#101# => (ALU, NONE, OP_RFID, SPR, SPR, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), others => illegal_inst @@ -223,10 +223,10 @@ architecture behaviour of decode1 is 2#1000111010# => (ALU, NONE, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- cnttzd 2#1000011010# => (ALU, NONE, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- cnttzw 2#1011110011# => (ALU, NONE, OP_DARN, NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- darn - 2#0001010110# => (ALU, NONE, OP_DCBF, NONE, 
NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- dcbf - 2#0000110110# => (ALU, NONE, OP_DCBST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- dcbst - 2#0100010110# => (ALU, NONE, OP_DCBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- dcbt - 2#0011110110# => (ALU, NONE, OP_DCBTST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- dcbtst + 2#0001010110# => (ALU, NONE, OP_DCBF, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbf + 2#0000110110# => (ALU, NONE, OP_DCBST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbst + 2#0100010110# => (ALU, NONE, OP_DCBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbt + 2#0011110110# => (ALU, NONE, OP_DCBTST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbtst 2#1111110110# => (LDST, NONE, OP_DCBZ, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbz 2#0110001001# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divdeu 2#1110001001# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divdeuo @@ -247,7 +247,7 @@ architecture behaviour of decode1 is 2#1100110110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dss 2#0101010110# => (ALU, NONE, OP_NOP, NONE, NONE, 
NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dst 2#0101110110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dstst - 2#1101010110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- eieio + 2#1101010110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- eieio 2#0100011100# => (ALU, NONE, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- eqv 2#1110111010# => (ALU, NONE, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- extsb 2#1110011010# => (ALU, NONE, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- extsh @@ -327,8 +327,8 @@ architecture behaviour of decode1 is 2#1100001001# => (ALU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- modsd 2#1100001011# => (ALU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', NONE, '0', '0', NONE), -- modsw 2#0010010000# => (ALU, NONE, OP_MTCRF, NONE, NONE, RS, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtcrf/mtocrf - 2#0010010010# => (ALU, NONE, OP_MTMSRD, NONE, NONE, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1', NONE), -- mtmsr - 2#0010110010# => (ALU, NONE, OP_MTMSRD, NONE, NONE, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- mtmsrd # ignore top bits and d + 2#0010010010# => (ALU, NONE, OP_MTMSRD, NONE, NONE, RS, NONE, 
'0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- mtmsr + 2#0010110010# => (ALU, NONE, OP_MTMSRD, NONE, NONE, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtmsrd # ignore top bits and d 2#0111010011# => (ALU, NONE, OP_MTSPR, NONE, NONE, RS, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtspr 2#0001001001# => (ALU, NONE, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- mulhd 2#0000001001# => (ALU, NONE, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- mulhdu @@ -409,13 +409,13 @@ architecture behaviour of decode1 is 2#1011101000# => (ALU, NONE, OP_ADD, RA, CONST_M1, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- subfmeo 2#0011001000# => (ALU, NONE, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- subfze 2#1011001000# => (ALU, NONE, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- subfzeo - 2#1001010110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- sync + 2#1001010110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sync 2#0001000100# => (ALU, NONE, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- td 2#0000000100# => (ALU, NONE, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- tw 2#0100110010# => (LDST, NONE, OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, 
'0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- tlbie 2#0100010010# => (LDST, NONE, OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- tlbiel - 2#1000110110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- tlbsync - 2#0000011110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- wait + 2#1000110110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- tlbsync + 2#0000011110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- wait 2#0100111100# => (ALU, NONE, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- xor others => illegal_inst ); @@ -640,9 +640,9 @@ begin if std_match(f_in.insn(10 downto 1), "01-1010011") then -- mfspr or mtspr - -- Make slow SPRs single issue if is_fast_spr(v.ispr1) = '0' then - vi.force_single := '1'; + -- Make mtspr to slow SPRs single issue + vi.force_single := f_in.insn(8); -- send MMU-related SPRs to loadstore1 case sprn is when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PTCR => From 0bd1e24024879ae6a30f29ae8a6a47e169551096 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 4 Jul 2022 18:23:03 +1000 Subject: [PATCH 11/30] decode2: Rename 'r' to 'dc2' Also get rid of a couple of unused variables. 
Signed-off-by: Paul Mackerras --- decode2.vhdl | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/decode2.vhdl b/decode2.vhdl index c290c98..1d4ce57 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -52,7 +52,7 @@ architecture behaviour of decode2 is repeat : std_ulogic; end record; - signal r, rin : reg_type; + signal dc2, dc2in : reg_type; signal deferred : std_ulogic; @@ -302,7 +302,7 @@ begin complete_in => complete_in, valid_in => control_valid_in, - repeated => r.repeat, + repeated => dc2.repeat, busy_in => busy_in, deferred => deferred, flush_in => flush_in, @@ -341,16 +341,16 @@ begin instr_tag_out => instr_tag ); - deferred <= r.e.valid and busy_in; + deferred <= dc2.e.valid and busy_in; decode2_0: process(clk) begin if rising_edge(clk) then if rst = '1' or flush_in = '1' or deferred = '0' then - if rin.e.valid = '1' then - report "execute " & to_hstring(rin.e.nia); + if dc2in.e.valid = '1' then + report "execute " & to_hstring(dc2in.e.nia); end if; - r <= rin; + dc2 <= dc2in; end if; end if; end process; @@ -359,8 +359,6 @@ begin decode2_1: process(all) variable v : reg_type; - variable mul_a : std_ulogic_vector(63 downto 0); - variable mul_b : std_ulogic_vector(63 downto 0); variable decoded_reg_a : decode_input_reg_t; variable decoded_reg_b : decode_input_reg_t; variable decoded_reg_c : decode_input_reg_t; @@ -368,13 +366,10 @@ begin variable length : std_ulogic_vector(3 downto 0); variable op : insn_type_t; begin - v := r; + v := dc2; v.e := Decode2ToExecute1Init; - mul_a := (others => '0'); - mul_b := (others => '0'); - --v.e.input_cr := d_in.decode.input_cr; v.e.output_cr := d_in.decode.output_cr; @@ -409,21 +404,21 @@ begin if d_in.decode.repeat /= NONE then v.e.repeat := '1'; - v.e.second := r.repeat; + v.e.second := dc2.repeat; case d_in.decode.repeat is when DRSE => -- do RS|1,RS for LE; RS,RS|1 for BE - if r.repeat = d_in.big_endian then + if dc2.repeat = d_in.big_endian then 
decoded_reg_c.reg(0) := '1'; end if; when DRTE => -- do RT|1,RT for LE; RT,RT|1 for BE - if r.repeat = d_in.big_endian then + if dc2.repeat = d_in.big_endian then decoded_reg_o.reg(0) := '1'; end if; when DUPD => -- update-form loads, 2nd instruction writes RA - if r.repeat = '1' then + if dc2.repeat = '1' then decoded_reg_o.reg := decoded_reg_a.reg; end if; when others => @@ -431,9 +426,9 @@ begin elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then -- bcl/bclrl/bctarl that needs to write both CTR and LR has to be doubled v.e.repeat := '1'; - v.e.second := r.repeat; + v.e.second := dc2.repeat; -- first one does CTR, second does LR - decoded_reg_o.reg(0) := not r.repeat; + decoded_reg_o.reg(0) := not dc2.repeat; end if; v.e.spr_select := d_in.spr_info; @@ -487,7 +482,7 @@ begin v.e.result_sel := result_select(op); v.e.sub_select := subresult_select(op); if op = OP_BC or op = OP_BCREG then - if d_in.insn(23) = '0' and r.repeat = '0' and + if d_in.insn(23) = '0' and dc2.repeat = '0' and not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then -- decrement CTR if BO(2) = 0 and not bcctr v.e.addm1 := '1'; @@ -562,7 +557,7 @@ begin v.e.valid := control_valid_out; if control_valid_out = '1' then - v.repeat := v.e.repeat and not r.repeat; + v.repeat := v.e.repeat and not dc2.repeat; end if; stall_out <= control_stall_out or v.repeat; @@ -573,10 +568,10 @@ begin end if; -- Update registers - rin <= v; + dc2in <= v; -- Update outputs - e_out <= r.e; + e_out <= dc2.e; end process; d2_log: if LOG_LENGTH > 0 generate @@ -585,8 +580,8 @@ begin dec2_log : process(clk) begin if rising_edge(clk) then - log_data <= r.e.nia(5 downto 2) & - r.e.valid & + log_data <= dc2.e.nia(5 downto 2) & + dc2.e.valid & stopped_out & stall_out & (gpr_a_bypass(1) or gpr_a_bypass(0)) & From c9e838b6560fb7981062fef2762762e9cf4e748f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 4 Jun 2022 17:37:48 +1000 Subject: [PATCH 12/30] Remove support for lq, stq, lqarx and stqcx. 
They are optional in SFFS (scalar fixed-point and floating-point subset), are not needed for running Linux, and add complexity, so remove them. Signed-off-by: Paul Mackerras --- decode1.vhdl | 17 --- decode2.vhdl | 10 -- decode_types.vhdl | 2 - loadstore1.vhdl | 11 -- tests/modes/head.S | 60 ---------- tests/modes/modes.c | 171 ----------------------------- tests/reservation/head.S | 28 ----- tests/reservation/reservation.c | 62 ----------- tests/test_modes.bin | Bin 20520 -> 20520 bytes tests/test_modes.console_out | 2 - tests/test_reservation.bin | Bin 11604 -> 10888 bytes tests/test_reservation.console_out | 1 - 12 files changed, 364 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 3f3109f..b807054 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -93,7 +93,6 @@ architecture behaviour of decode1 is 43 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhau 40 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhz 41 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhzu - 56 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_DQ, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRTE), -- lq 32 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwz 33 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lwzu 7 => (ALU, NONE, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- mulli @@ -310,7 +309,6 @@ architecture behaviour of decode1 is 2#1100110101# => 
(LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhzcix 2#0100110111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhzux 2#0100010111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhzx - 2#0100010100# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', DRTE), -- lqarx 2#0000010100# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), -- lwarx 2#0101110101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lwaux 2#0101010101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwax @@ -393,7 +391,6 @@ architecture behaviour of decode1 is 2#1011010110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', NONE), -- sthcx 2#0110110111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- sthux 2#0110010111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sthx - 2#0010110110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', DRSE), -- stqcx 2#1010010110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', 
NONE), -- stwbrx 2#1110010101# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stwcix 2#0010010110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', NONE), -- stwcx @@ -452,7 +449,6 @@ architecture behaviour of decode1 is -- op in out A out in out len ext pipe 0 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_DS, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- std 1 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_DS, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stdu - 2 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_DS, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRSE), -- stq others => decode_rom_init ); @@ -652,13 +648,6 @@ begin end case; end if; end if; - if std_match(f_in.insn(10 downto 1), "0100010100") then - -- lqarx, illegal if RA = RT or RB = RT - if f_in.insn(25 downto 21) = f_in.insn(20 downto 16) or - f_in.insn(25 downto 21) = f_in.insn(15 downto 11) then - vi.override := '1'; - end if; - end if; when 16 => -- CTR may be needed as input to bc @@ -722,12 +711,6 @@ begin when 30 => v.decode := decode_op_30_array(to_integer(unsigned(f_in.insn(4 downto 1)))); - when 56 => - -- lq, illegal if RA = RT - if f_in.insn(25 downto 21) = f_in.insn(20 downto 16) then - vi.override := '1'; - end if; - when 58 => v.decode := decode_op_58_array(to_integer(unsigned(f_in.insn(1 downto 0)))); diff --git a/decode2.vhdl b/decode2.vhdl index 1d4ce57..371c48c 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -406,16 +406,6 @@ begin v.e.repeat := '1'; v.e.second := dc2.repeat; case d_in.decode.repeat is - when DRSE => - -- do RS|1,RS for LE; RS,RS|1 for BE - if dc2.repeat = d_in.big_endian then - decoded_reg_c.reg(0) := '1'; - end if; - when DRTE => - 
-- do RT|1,RT for LE; RT,RT|1 for BE - if dc2.repeat = d_in.big_endian then - decoded_reg_o.reg(0) := '1'; - end if; when DUPD => -- update-form loads, 2nd instruction writes RA if dc2.repeat = '1' then diff --git a/decode_types.vhdl b/decode_types.vhdl index 885cc91..514bc08 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -53,8 +53,6 @@ package decode_types is type length_t is (NONE, is1B, is2B, is4B, is8B); type repeat_t is (NONE, -- instruction is not repeated - DRSE, -- double RS, endian twist - DRTE, -- double RT, endian twist DUPD); -- update-form load type decode_rom_t is record diff --git a/loadstore1.vhdl b/loadstore1.vhdl index ff2633b..7fad454 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -458,17 +458,6 @@ begin -- check alignment for larx/stcx misaligned := or (addr_mask and addr(2 downto 0)); v.align_intr := l_in.reserve and misaligned; - if l_in.repeat = '1' and l_in.second = '0' and l_in.update = '0' and addr(3) = '1' then - -- length is really 16 not 8 - -- Make misaligned lq cause an alignment interrupt in LE mode, - -- in order to avoid the case with RA = RT + 1 where the second half - -- faults but the first doesn't (and updates RT+1, destroying RA). - -- The equivalent BE case doesn't occur because RA = RT is illegal. 
- misaligned := '1'; - if l_in.reserve = '1' or (l_in.op = OP_LOAD and l_in.byte_reverse = '0') then - v.align_intr := '1'; - end if; - end if; v.atomic := not misaligned; v.atomic_last := not misaligned and (l_in.second or not l_in.repeat); diff --git a/tests/modes/head.S b/tests/modes/head.S index 8b00bdd..d9e69dc 100644 --- a/tests/modes/head.S +++ b/tests/modes/head.S @@ -230,63 +230,3 @@ restore: ld %r0,16(%r1) mtlr %r0 blr - - .global do_lq -do_lq: - lq %r6,0(%r3) - std %r6,0(%r4) - std %r7,8(%r4) - li %r3,0 - blr - - .global do_lq_np /* "non-preferred" form of lq */ -do_lq_np: - mr %r7,%r3 - lq %r6,0(%r7) - std %r6,0(%r4) - std %r7,8(%r4) - li %r3,0 - blr - - .global do_lq_bad /* illegal form of lq */ -do_lq_bad: - mr %r6,%r3 - .long 0xe0c60000 /* lq %r6,0(%r6) */ - std %r6,0(%r4) - std %r7,8(%r4) - li %r3,0 - blr - - .global do_stq -do_stq: - ld %r8,0(%r4) - ld %r9,8(%r4) - stq %r8,0(%r3) - li %r3,0 - blr - - /* big-endian versions of the above */ - .global do_lq_be -do_lq_be: - .long 0x0000c3e0 - .long 0x0000c4f8 - .long 0x0800e4f8 - .long 0x00006038 - .long 0x2000804e - - .global do_lq_np_be /* "non-preferred" form of lq */ -do_lq_np_be: - .long 0x781b677c - .long 0x0000c7e0 - .long 0x0000c4f8 - .long 0x0800e4f8 - .long 0x00006038 - .long 0x2000804e - - .global do_stq_be -do_stq_be: - .long 0x000004e9 - .long 0x080024e9 - .long 0x020003f9 - .long 0x00006038 - .long 0x2000804e diff --git a/tests/modes/modes.c b/tests/modes/modes.c index b94bb47..fa4872c 100644 --- a/tests/modes/modes.c +++ b/tests/modes/modes.c @@ -12,14 +12,6 @@ extern unsigned long callit(unsigned long arg1, unsigned long arg2, unsigned long fn, unsigned long msr); -extern void do_lq(void *src, unsigned long *regs); -extern void do_lq_np(void *src, unsigned long *regs); -extern void do_lq_bad(void *src, unsigned long *regs); -extern void do_stq(void *dst, unsigned long *regs); -extern void do_lq_be(void *src, unsigned long *regs); -extern void do_lq_np_be(void *src, unsigned long *regs); 
-extern void do_stq_be(void *dst, unsigned long *regs); - static inline void do_tlbie(unsigned long rb, unsigned long rs) { __asm__ volatile("tlbie %0,%1" : : "r" (rb), "r" (rs) : "memory"); @@ -302,167 +294,6 @@ int mode_test_6(void) return 0; } -int mode_test_7(void) -{ - unsigned long quad[4] __attribute__((__aligned__(16))); - unsigned long regs[2]; - unsigned long ret, msr; - - /* - * Test lq/stq in LE mode - */ - msr = MSR_SF | MSR_LE; - quad[0] = 0x123456789abcdef0ul; - quad[1] = 0xfafa5959bcbc3434ul; - ret = callit((unsigned long)quad, (unsigned long)regs, - (unsigned long)&do_lq, msr); - if (ret) - return ret | 1; - if (regs[0] != quad[1] || regs[1] != quad[0]) - return 2; - /* unaligned may give alignment interrupt */ - quad[2] = 0x0011223344556677ul; - ret = callit((unsigned long)&quad[1], (unsigned long)regs, - (unsigned long)&do_lq, msr); - if (ret == 0) { - if (regs[0] != quad[2] || regs[1] != quad[1]) - return 3; - } else if (ret == 0x600) { - if (mfspr(SPRG0) != (unsigned long) &do_lq || - mfspr(DAR) != (unsigned long) &quad[1]) - return ret | 4; - } else - return ret | 5; - - /* try stq */ - regs[0] = 0x5238523852385238ul; - regs[1] = 0x5239523952395239ul; - ret = callit((unsigned long)quad, (unsigned long)regs, - (unsigned long)&do_stq, msr); - if (ret) - return ret | 5; - if (quad[0] != regs[1] || quad[1] != regs[0]) - return 6; - regs[0] = 0x0172686966746564ul; - regs[1] = 0xfe8d0badd00dabcdul; - ret = callit((unsigned long)quad + 1, (unsigned long)regs, - (unsigned long)&do_stq, msr); - if (ret) - return ret | 7; - if (((quad[0] >> 8) | (quad[1] << 56)) != regs[1] || - ((quad[1] >> 8) | (quad[2] << 56)) != regs[0]) - return 8; - - /* try lq non-preferred form */ - quad[0] = 0x56789abcdef01234ul; - quad[1] = 0x5959bcbc3434fafaul; - ret = callit((unsigned long)quad, (unsigned long)regs, - (unsigned long)&do_lq_np, msr); - if (ret) - return ret | 9; - if (regs[0] != quad[1] || regs[1] != quad[0]) - return 10; - /* unaligned should give alignment 
interrupt in uW implementation */ - quad[2] = 0x6677001122334455ul; - ret = callit((unsigned long)&quad[1], (unsigned long)regs, - (unsigned long)&do_lq_np, msr); - if (ret == 0x600) { - if (mfspr(SPRG0) != (unsigned long) &do_lq_np + 4 || - mfspr(DAR) != (unsigned long) &quad[1]) - return ret | 11; - } else - return 12; - - /* make sure lq with rt = ra causes an illegal instruction interrupt */ - ret = callit((unsigned long)quad, (unsigned long)regs, - (unsigned long)&do_lq_bad, msr); - if (ret != 0x700) - return 13; - if (mfspr(SPRG0) != (unsigned long)&do_lq_bad + 4 || - !(mfspr(SPRG3) & 0x80000)) - return 14; - return 0; -} - -int mode_test_8(void) -{ - unsigned long quad[4] __attribute__((__aligned__(16))); - unsigned long regs[2]; - unsigned long ret, msr; - - /* - * Test lq/stq in BE mode - */ - msr = MSR_SF; - quad[0] = 0x123456789abcdef0ul; - quad[1] = 0xfafa5959bcbc3434ul; - ret = callit((unsigned long)quad, (unsigned long)regs, - (unsigned long)&do_lq_be, msr); - if (ret) - return ret | 1; - if (regs[0] != quad[0] || regs[1] != quad[1]) { - print_hex(regs[0], 16); - print_string(" "); - print_hex(regs[1], 16); - print_string(" "); - return 2; - } - /* don't expect alignment interrupt */ - quad[2] = 0x0011223344556677ul; - ret = callit((unsigned long)&quad[1], (unsigned long)regs, - (unsigned long)&do_lq_be, msr); - if (ret == 0) { - if (regs[0] != quad[1] || regs[1] != quad[2]) - return 3; - } else - return ret | 5; - - /* try stq */ - regs[0] = 0x5238523852385238ul; - regs[1] = 0x5239523952395239ul; - ret = callit((unsigned long)quad, (unsigned long)regs, - (unsigned long)&do_stq_be, msr); - if (ret) - return ret | 5; - if (quad[0] != regs[0] || quad[1] != regs[1]) - return 6; - regs[0] = 0x0172686966746564ul; - regs[1] = 0xfe8d0badd00dabcdul; - ret = callit((unsigned long)quad + 1, (unsigned long)regs, - (unsigned long)&do_stq_be, msr); - if (ret) - return ret | 7; - if (((quad[0] >> 8) | (quad[1] << 56)) != regs[0] || - ((quad[1] >> 8) | (quad[2] << 
56)) != regs[1]) { - print_hex(quad[0], 16); - print_string(" "); - print_hex(quad[1], 16); - print_string(" "); - print_hex(quad[2], 16); - print_string(" "); - return 8; - } - - /* try lq non-preferred form */ - quad[0] = 0x56789abcdef01234ul; - quad[1] = 0x5959bcbc3434fafaul; - ret = callit((unsigned long)quad, (unsigned long)regs, - (unsigned long)&do_lq_np_be, msr); - if (ret) - return ret | 9; - if (regs[0] != quad[0] || regs[1] != quad[1]) - return 10; - /* unaligned should not give alignment interrupt in uW implementation */ - quad[2] = 0x6677001122334455ul; - ret = callit((unsigned long)&quad[1], (unsigned long)regs, - (unsigned long)&do_lq_np_be, msr); - if (ret) - return ret | 11; - if (regs[0] != quad[1] || regs[1] != quad[2]) - return 12; - return 0; -} - int fail = 0; void do_test(int num, int (*test)(void)) @@ -507,8 +338,6 @@ int main(void) do_test(4, mode_test_4); do_test(5, mode_test_5); do_test(6, mode_test_6); - do_test(7, mode_test_7); - do_test(8, mode_test_8); return fail; } diff --git a/tests/reservation/head.S b/tests/reservation/head.S index 4ff85ce..ce258b5 100644 --- a/tests/reservation/head.S +++ b/tests/reservation/head.S @@ -155,31 +155,3 @@ call_ret: ld %r31,248(%r1) addi %r1,%r1,256 blr - - .global do_lqarx -do_lqarx: - /* r3 = src, r4 = regs */ - lqarx %r10,0,%r3 - std %r10,0(%r4) - std %r11,8(%r4) - li %r3,0 - blr - - .global do_lqarx_bad -do_lqarx_bad: - /* r3 = src, r4 = regs */ - .long 0x7d405228 /* lqarx %r10,0,%r10 */ - std %r10,0(%r4) - std %r11,8(%r4) - li %r3,0 - blr - - .global do_stqcx -do_stqcx: - /* r3 = dest, r4 = regs, return CR */ - ld %r10,0(%r4) - ld %r11,8(%r4) - stqcx. 
%r10,0,%r3 - mfcr %r3 - oris %r3,%r3,1 /* to distinguish from trap number */ - blr diff --git a/tests/reservation/reservation.c b/tests/reservation/reservation.c index a3d5a7a..79bbc1f 100644 --- a/tests/reservation/reservation.c +++ b/tests/reservation/reservation.c @@ -7,10 +7,6 @@ extern unsigned long callit(unsigned long arg1, unsigned long arg2, unsigned long (*fn)(unsigned long, unsigned long)); -extern unsigned long do_lqarx(unsigned long src, unsigned long regs); -extern unsigned long do_lqarx_bad(unsigned long src, unsigned long regs); -extern unsigned long do_stqcx(unsigned long dst, unsigned long regs); - #define DSISR 18 #define DAR 19 #define SRR0 26 @@ -184,63 +180,6 @@ int resv_test_2(void) return 0; } -/* test lqarx/stqcx */ -int resv_test_3(void) -{ - unsigned long x[4] __attribute__((__aligned__(16))); - unsigned long y[2], regs[2]; - unsigned long ret, offset; - int count; - - x[0] = 0x7766554433221100ul; - x[1] = 0xffeeddccbbaa9988ul; - y[0] = 0x0badcafef00dd00dul; - y[1] = 0xdeadbeef07070707ul; - for (count = 0; count < 1000; ++count) { - ret = callit((unsigned long)x, (unsigned long)regs, do_lqarx); - if (ret) - return ret | 1; - ret = callit((unsigned long)x, (unsigned long)y, do_stqcx); - if (ret < 0x10000) - return ret | 2; - if (ret & 0x20000000) - break; - } - if (count == 1000) - return 3; - if (x[0] != y[1] || x[1] != y[0]) - return 4; - if (regs[1] != 0x7766554433221100ul || regs[0] != 0xffeeddccbbaa9988ul) - return 5; - ret = callit((unsigned long)x, (unsigned long)regs, do_stqcx); - if (ret < 0x10000 || (ret & 0x20000000)) - return ret | 12; - /* test alignment interrupts */ - for (offset = 0; offset < 16; ++offset) { - ret = callit((unsigned long)x + offset, (unsigned long)regs, do_lqarx); - if (ret == 0 && (offset & 15) != 0) - return 6; - if (ret == 0x600) { - if ((offset & 15) == 0) - return ret + 7; - } else if (ret) - return ret; - ret = callit((unsigned long)x + offset, (unsigned long)y, do_stqcx); - if (ret >= 0x10000 && 
(offset & 15) != 0) - return 8; - if (ret == 0x600) { - if ((offset & 15) == 0) - return ret + 9; - } else if (ret < 0x10000) - return ret; - } - /* test illegal interrupt for bad lqarx case */ - ret = callit((unsigned long)x, (unsigned long)regs, do_lqarx_bad); - if (ret != 0x700 || !(mfspr(SRR1) & 0x80000)) - return ret + 10; - return 0; -} - int fail = 0; void do_test(int num, int (*test)(void)) @@ -265,7 +204,6 @@ int main(void) do_test(1, resv_test_1); do_test(2, resv_test_2); - do_test(3, resv_test_3); return fail; } diff --git a/tests/test_modes.bin b/tests/test_modes.bin index 7e6b8f5d03126023a7ce4e75f157bd289d1e387c..24e39813fa353af0d03f534c01389d4c3f97d9f5 100755 GIT binary patch delta 1111 zcmYjOZD?Cn7=G_rcALg5i}*p)&h+*+Ka!c1AWOK}a`mQZK1}yfaaHhF+RWNXk?x0( zwl~YQL{NXVdHtbmXgj53qew?9i1;IH{uujJWgAQs<|pk$8MfQ9we0www6MMKa-Z{@ z=RD^*@2$>A)fs83z-ros@t>8&y!*VX68eMnIPtI0PiVYkLo@7So`3OdxQjjg1pXL) zfqnfC%8_^EQ3quDn{YgGgta*EO2o_ll<`(%FP@3mv3kIHxH^*8+#uC|;2rhnIPbl_ z@vszvlI8&3zSNXA-if-xdV{X-rU9^=+H(MrGxE~bO<|QoljkB zg3NEvj2C9fxX@NKeeG{7z8-6q*i9J+^$L4s74PbE?A!`|_Tu%@0>v=Mf}*U{`LBP^ z?Xa829`IfXl&H9#^TF6Rr1`*7r*+~B8^4#_U}+a;`NX?Im*W%rVbTz~8tFX5&2^8O zLeo!J@Rx~~8vMxyFMI^QJd$Tx8Q+Vvq1$jvaqKYCEI5R38{Y8JKv}!+Q0OLOJL$Qb zLuGAtdo9ztkM^f&Ei#G`d}Ix-Ea5MPxA{XsO0TkL!nB=5=)$(!?z=3adcX|w6f$RYF*t`wQTdx^Ks3|(g zS^T1VU&&Svj6n_bOS|c-*hyy}BR2}e?(YH61y@OAi)EOM3tr6l6HEP6oN>~g#g(Wy zBfQV2mEx48t~T&dqk-AQ?3fJ#(Z{(RU__l;9KM?9@{%p**=<`zZ&3miBd`_ssDY;w zN|*-J9NfyKuKF5TI)i>!xnL(OK{@y64$D-)Is)JeR(gCG>RC{$|8Y)Z0XXbZo{XRB zbwNTFkMUD$)V4Kb$-QiT4ZD)hI()>Gsc*00cv6+;34XbbX9-><`29LwNvaJINC56T z*qTy3euBsEaBh)8l<Y%w0T{ipVnr58cCWQ>{B|1UK`bzQbMV_SP2qufz8_zbtlpn&a@flyHUU9YdH(tARRgiMXQN)M84*4<1Dk wk2I*S;&l4?efAB`1tUtRE$ThQ1?f6bg8P_y|0d_div#Y{yQde9Rr>t@1HPzfVE_OC delta 3286 zcmZ`+e{56N6+V7(V&|6^3R?+b@RGQzUlv;DN9i+(6Fbo2B^9Klv{3DjxTztjI+RYS zYPZjQCd#BTwNkbr_r7z!bM86cbB-Qy96jRbi%ZVXiC4am_0!`-XJ*|rHA~c{%9Pp`FWIXR zeS8Oe5Af0d!-ksZJYU0iN+Vc(u>G3+b>FIWh$MUpO=Fn+OO{3twd#ts{GEWQ%bD1R*Cws 
zKkyE#-Q$>mG`P$dUF@7?H{WYV~PTq9+sqCIjR3 zd8}8E9Ar{8Wo-DPx(vCUTP$)f@3F`+nfSj?>wY+oEpy&WJt6kvhs`Y8k0x}>5j$r` zFTj!dgd7y&z52Hve_v&X>Jv(ov)~~m!C7Dj9BC^J%!c43fb#*I;r$5K-(IyM#{M)% zts?4dL|q_&=al=hsS_Dck_2CadQ)qXS{pTIjvSa-0wC?cx(ulkDE3#F*%FF(q0GvP|^NB{}$qd_Jcn3dfY#x!)^Mof(I1 z#)*E(z$_2H1w;V(PnJiI6vH5wckbZOJW+4@TWG+&A z>V{~^)J~qArDHd4t)0VIzr4rF(h3B|SD7JEQ>I%iP(&t)OyN}7&I;IQPI=`n7#YM> zIZ4#zp7qvQ)SDmp zv=Oc<`JX6o8yrI$Si&%DHn)>3L0~bslMrxQ+)WG)P2buYTx^ygv};ci>p*lGHK3;{ zQm(B+!*cmUMX;{W>T#HJTtq3=*DdNMP@VCGzw7<@8^Mc=pR=(f{z{NtR+;k!6k+iY zVpwJI-~hM)_>NdSe7GYP4;GB;|6qbTnQ|336~iKs1!GL&IGUD8;jl^S2-X?FxPVNt z6BZ11!h(>g?7NA@FHDEjmJtiC>}(ySJ%b2X{?ND(0LytToLRkSIrn1C0>DmL0G3C_ z0>F*W0stF#Y<}+jY?}Zp6+3sqx87|!k4|&yxxb{5=i{E?iUc?JT4aSaR2osx^gn^* zg(01a_SkLL^9XvAuje|d7b!Ncr^?1Afcb6ghcYq%E(yKGQjhoQhwYTIdMuBQ z)gz<9EFdVFy9&pBU*B4E8S&^d#UqT5OldmL&hUBkWQ6paMRx`Os|Tl;)nhpc6q8vz zEFN4uAAV&y%A_(MsjRe}a3%|ECtaqKn>EyiHGkiC;VkiEd{$kuAzt#DCd9M#c{E<< z?`~JQ*FM8Xw|_yje7>-R@0+gXkytu1J~0_%3Z5bIS>pgM)< zsn157p?$sTfW786X^r-(EC{^s!g>hn{q~xx%e}W(eGzLW&$;e=ZFMk27T(2S=q z)r7yw0pm#E)!Pv7ro(l;5=HW8oew1xDugxYQ44=E0{AEkfIXu7-{$ivAqUw0@1B;EeW|jvIp0p^oIl@A z==^dli$Z|{QjAM=)oYjzZGBq^cVM;5eWh4bx0CT;z5EE0E zo7CcwSA<{N?4hkW_zIE_qZgwC`yXUsvNbnM?>gR*T0C7i*P7>{rb}Pug!8fG#W;>} z?O(=duq@{SrblOte_~1wqO&u`4;soIkbzcWx$0XX-e7~*$iYY zpd!>qhv8@}DfVOb>3@Xi;*)q}j4i-?#zT$k*K}ZZepU$ljs$E5yxHSNxciKKjb%#j zoDi?`7M>yu+zibE<2>=hISXNQ7=La2@rJ|mLNLM)pH5*M2gXU@hp}H_oLvwij(-#G Jbsuc?{TF)MbVmRH diff --git a/tests/test_modes.console_out b/tests/test_modes.console_out index 25e791c..a49bb9b 100644 --- a/tests/test_modes.console_out +++ b/tests/test_modes.console_out @@ -4,5 +4,3 @@ test 03:PASS test 04:PASS test 05:PASS test 06:PASS -test 07:PASS -test 08:PASS diff --git a/tests/test_reservation.bin b/tests/test_reservation.bin index 1e305f43af8cbe4213e597cdfc50f1eab71abb5d..1cb62505581b238325722ae999b1b70778e0ffa1 100755 GIT binary patch delta 986 
zcmYjPO-vI}5T30qv;^ytVggah>;D!O0})MYsa;e;ZPju>V++M*-Fc9)rtC)koD5WT`2dB&U(#tadF3Hrie*Oo761ZMklrSCO>4sl<#v*Ue(WW^u`qHKUpfFD%pjuZNx@pq~P8%QRExHu54mSvPUG<0oTF zc;Y6ab2$+o{8>(*aZW++L{7!Xtp&Xk{b`=wtUqXRBOFD~(zSY*X%^8mb;v#wzIoWB zCuLWOC;)V<(`&N5vKit0I%C~<32jzV(K-;Pld{|Y70C=I(f$n5wZWKPU;reH3i?aB dD%T5})X`ug;wED<3UcE$dj*2+x--2yU50Okks)eYaMJb#qPSn9?DM`C{*Qq(1J2AFl}4tP;%SB2 zw=?(C-(6FspJ*e>!(~MjeN$G?OX&_vsfLNN;+yI!^XfKJqrB!{$myK%hd8?-af{&=hn7 z9QWsO_F;S4K|4g~r1u za^berLX(#){_t77W8i?E@B#m5xfWXd&caJ(^^TGYdZLB#%UbBS&n?{H&^sWBHsHw{ zC%8!_#*^@WcGKz~6q%MC2vMvNr4()_4&X0>g;6OvrMTK$y-9>A>(~A}!2kroV5jZNFX`{%D4gS=k zr_Y0fvCOmpt4ruZ>~mu>_O$fpl9MAp@kzOBP}HYz#BD z_It3;U0JG7$ydlDaB){OvqzzLcxbUdQjBpJoUpZ&ter?9qmfQ;@}E?l>3MnWPbsg9 zAKaDpaXUqiid*hxrBj4Fk10=zZJw}ts#hHHR5kplP!wmxTV~C41GVQ~&AB|b0hOG| z+YNY0V>XJ4Zcl}9KxC11g-BCwwm4lK?Z7uf}JWwmr<1u@6jlMcLa0d%MpO)xA{-0Dp7Lf>uDbH z#}%tHL`CAYT2Cc~G;$^&sSj)byT+t(Bg9FtxU#?aNsECSOChO+ zRfIi!zx$#yW1C-{ns_Ld%I5weT7dnEu}4&QfcaUe1^E2om$ZyC>z>Kk&6Km%FP$kH z>DGQ8Doi1t3g7>8+~ zLlG}hQgIwVV`nbCSmHrABg21J!v0k>OH9{!h0AwjCtrbky@vrTAH;wR2z~YA>&CFd2rp)5As2wv<9J?-aUIit HJt*~Wb@w*g diff --git a/tests/test_reservation.console_out b/tests/test_reservation.console_out index 623335d..0c39ae3 100644 --- a/tests/test_reservation.console_out +++ b/tests/test_reservation.console_out @@ -1,3 +1,2 @@ test 01:PASS test 02:PASS -test 03:PASS From 2f45e545ed86795c0f282204a27f97887329051f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 8 Jul 2022 14:07:28 +1000 Subject: [PATCH 13/30] decode2: Rework to make the stall_out signal come from a register At present the busy/stall signal going to decode1 depends on whether control thinks it can issue the current instruction, and that depends on completion and bypass signals coming from execute1 and writeback. To improve the timing of stall_out, this rearranges decode2 so that stall_out is asserted when we have a valid instruction that couldn't be issued in the previous cycle. 
This means that decode1 could give us a new instruction when we haven't issued the previous instruction. This in turn means that we can only use d_in in the first cycle of processing an instruction. After the first cycle, we get register addresses etc. from dc2 rather than d_in. Then, to avoid the need to read register operands from register_file in each cycle until the instruction issues, we bring the bypass path for data being written to the register file into decode2 explicitly rather than having it in register_file. A new process called decode2_addrs calls decode_input_reg_* and decode_output_reg and sets up the register file addresses. This was split out (and decode_input_reg_* reworked) to try to reduce the number of passes through the decode2_1 process that need to be done in simulation. Signed-off-by: Paul Mackerras --- common.vhdl | 1 + control.vhdl | 31 ++-- core.vhdl | 5 + decode2.vhdl | 430 +++++++++++++++++++++++++-------------------- register_file.vhdl | 14 +- writeback.vhdl | 7 + 6 files changed, 269 insertions(+), 219 deletions(-) diff --git a/common.vhdl b/common.vhdl index ea6a8d8..54a87d2 100644 --- a/common.vhdl +++ b/common.vhdl @@ -288,6 +288,7 @@ package common is write_reg_enable: std_ulogic; read_reg1: gspr_index_t; read_reg2: gspr_index_t; + read_reg3: gspr_index_t; read_data1: std_ulogic_vector(63 downto 0); read_data2: std_ulogic_vector(63 downto 0); read_data3: std_ulogic_vector(63 downto 0); diff --git a/control.vhdl b/control.vhdl index 17a288b..e6855c2 100644 --- a/control.vhdl +++ b/control.vhdl @@ -15,9 +15,7 @@ entity control is complete_in : in instr_tag_t; valid_in : in std_ulogic; - repeated : in std_ulogic; flush_in : in std_ulogic; - busy_in : in std_ulogic; deferred : in std_ulogic; sgl_pipe_in : in std_ulogic; stop_mark_in : in std_ulogic; @@ -43,7 +41,6 @@ entity control is cr_write_in : in std_ulogic; valid_out : out std_ulogic; - stall_out : out std_ulogic; stopped_out : out std_ulogic; gpr_bypass_a
: out std_ulogic_vector(1 downto 0); @@ -157,9 +154,6 @@ begin tag_a.tag := i; end if; end loop; - if tag_match(tag_a, complete_in) then - tag_a.valid := '0'; - end if; tag_b := instr_tag_init; for i in tag_number_t loop if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_b_read_in then @@ -167,9 +161,6 @@ begin tag_b.tag := i; end if; end loop; - if tag_match(tag_b, complete_in) then - tag_b.valid := '0'; - end if; tag_c := instr_tag_init; for i in tag_number_t loop if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_c_read_in then @@ -177,26 +168,29 @@ begin tag_c.tag := i; end if; end loop; - if tag_match(tag_c, complete_in) then - tag_c.valid := '0'; - end if; byp_a := "00"; if EX1_BYPASS and tag_match(execute_next_tag, tag_a) then - byp_a := "10"; + byp_a := "01"; elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_a) then + byp_a := "10"; + elsif tag_match(complete_in, tag_a) then byp_a := "11"; end if; byp_b := "00"; if EX1_BYPASS and tag_match(execute_next_tag, tag_b) then - byp_b := "10"; + byp_b := "01"; elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_b) then + byp_b := "10"; + elsif tag_match(complete_in, tag_b) then byp_b := "11"; end if; byp_c := "00"; if EX1_BYPASS and tag_match(execute_next_tag, tag_c) then - byp_c := "10"; + byp_c := "01"; elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_c) then + byp_c := "10"; + elsif tag_match(complete_in, tag_c) then byp_c := "11"; end if; @@ -204,9 +198,9 @@ begin gpr_bypass_b <= byp_b; gpr_bypass_c <= byp_c; - gpr_tag_stall <= (tag_a.valid and not byp_a(1)) or - (tag_b.valid and not byp_b(1)) or - (tag_c.valid and not byp_c(1)); + gpr_tag_stall <= (tag_a.valid and not (or (byp_a))) or + (tag_b.valid and not (or (byp_b))) or + (tag_c.valid and not (or (byp_c))); incr_tag := curr_tag; instr_tag.tag <= curr_tag; @@ -331,7 +325,6 @@ begin -- update outputs valid_out <= valid_tmp; - stall_out <= stall_tmp or deferred; -- update registers 
rin_int <= v_int; diff --git a/core.vhdl b/core.vhdl index 23f7e82..ba8f0cc 100644 --- a/core.vhdl +++ b/core.vhdl @@ -100,6 +100,9 @@ architecture behave of core is signal fpu_to_execute1: FPUToExecute1Type; signal fpu_to_writeback: FPUToWritebackType; + -- Writeback signals + signal writeback_bypass: bypass_data_t; + -- local signals signal fetch1_stall_in : std_ulogic; signal icache_stall_out : std_ulogic; @@ -302,6 +305,7 @@ begin execute_cr_bypass => execute1_cr_bypass, execute2_bypass => execute2_bypass, execute2_cr_bypass => execute2_cr_bypass, + writeback_bypass => writeback_bypass, log_out => log_data(119 downto 110) ); decode2_busy_in <= ex1_busy_out; @@ -463,6 +467,7 @@ begin w_out => writeback_to_register_file, c_out => writeback_to_cr_file, f_out => writeback_to_fetch1, + wb_bypass => writeback_bypass, events => writeback_events, interrupt_out => do_interrupt, complete_out => complete diff --git a/decode2.vhdl b/decode2.vhdl index 371c48c..41f3e09 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -41,6 +41,7 @@ entity decode2 is execute_cr_bypass : in cr_bypass_data_t; execute2_bypass : in bypass_data_t; execute2_cr_bypass : in cr_bypass_data_t; + writeback_bypass : in bypass_data_t; log_out : out std_ulogic_vector(9 downto 0) ); @@ -49,8 +50,16 @@ end entity decode2; architecture behaviour of decode2 is type reg_type is record e : Decode2ToExecute1Type; - repeat : std_ulogic; + repeat : repeat_t; + busy : std_ulogic; + sgl_pipe : std_ulogic; + reg_a_valid : std_ulogic; + reg_b_valid : std_ulogic; + reg_c_valid : std_ulogic; + reg_o_valid : std_ulogic; end record; + constant reg_type_init : reg_type := + (e => Decode2ToExecute1Init, repeat => NONE, others => '0'); signal dc2, dc2in : reg_type; @@ -61,20 +70,21 @@ architecture behaviour of decode2 is reg : gspr_index_t; data : std_ulogic_vector(63 downto 0); end record; + constant decode_input_reg_init : decode_input_reg_t := ('0', (others => '0'), (others => '0')); type decode_output_reg_t is record 
reg_valid : std_ulogic; reg : gspr_index_t; end record; + constant decode_output_reg_init : decode_output_reg_t := ('0', (others => '0')); function decode_input_reg_a (t : input_reg_a_t; insn_in : std_ulogic_vector(31 downto 0); - reg_data : std_ulogic_vector(63 downto 0); ispr : gspr_index_t; instr_addr : std_ulogic_vector(63 downto 0)) return decode_input_reg_t is begin if t = RA or (t = RA_OR_ZERO and insn_ra(insn_in) /= "00000") then - return ('1', gpr_to_gspr(insn_ra(insn_in)), reg_data); + return ('1', gpr_to_gspr(insn_ra(insn_in)), (others => '0')); elsif t = SPR then -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. -- If it's all 0, we don't treat it as a dependency as slow SPRs @@ -83,27 +93,26 @@ architecture behaviour of decode2 is assert is_fast_spr(ispr) = '1' or ispr = "0000000" report "Decode A says SPR but ISPR is invalid:" & to_hstring(ispr) severity failure; - return (is_fast_spr(ispr), ispr, reg_data); + return (is_fast_spr(ispr), ispr, (others => '0')); elsif t = CIA then return ('0', (others => '0'), instr_addr); elsif HAS_FPU and t = FRA then - return ('1', fpr_to_gspr(insn_fra(insn_in)), reg_data); + return ('1', fpr_to_gspr(insn_fra(insn_in)), (others => '0')); else return ('0', (others => '0'), (others => '0')); end if; end; function decode_input_reg_b (t : input_reg_b_t; insn_in : std_ulogic_vector(31 downto 0); - reg_data : std_ulogic_vector(63 downto 0); ispr : gspr_index_t) return decode_input_reg_t is variable ret : decode_input_reg_t; begin case t is when RB => - ret := ('1', gpr_to_gspr(insn_rb(insn_in)), reg_data); + ret := ('1', gpr_to_gspr(insn_rb(insn_in)), (others => '0')); when FRB => if HAS_FPU then - ret := ('1', fpr_to_gspr(insn_frb(insn_in)), reg_data); + ret := ('1', fpr_to_gspr(insn_frb(insn_in)), (others => '0')); else ret := ('0', (others => '0'), (others => '0')); end if; @@ -138,7 +147,7 @@ architecture behaviour of decode2 is assert is_fast_spr(ispr) = '1' or ispr = "0000000" report "Decode B 
says SPR but ISPR is invalid:" & to_hstring(ispr) severity failure; - ret := (is_fast_spr(ispr), ispr, reg_data); + ret := (is_fast_spr(ispr), ispr, (others => '0')); when NONE => ret := ('0', (others => '0'), (others => '0')); end case; @@ -146,23 +155,23 @@ architecture behaviour of decode2 is return ret; end; - function decode_input_reg_c (t : input_reg_c_t; insn_in : std_ulogic_vector(31 downto 0); - reg_data : std_ulogic_vector(63 downto 0)) return decode_input_reg_t is + function decode_input_reg_c (t : input_reg_c_t; insn_in : std_ulogic_vector(31 downto 0)) + return decode_input_reg_t is begin case t is when RS => - return ('1', gpr_to_gspr(insn_rs(insn_in)), reg_data); + return ('1', gpr_to_gspr(insn_rs(insn_in)), (others => '0')); when RCR => - return ('1', gpr_to_gspr(insn_rcreg(insn_in)), reg_data); + return ('1', gpr_to_gspr(insn_rcreg(insn_in)), (others => '0')); when FRS => if HAS_FPU then - return ('1', fpr_to_gspr(insn_frt(insn_in)), reg_data); + return ('1', fpr_to_gspr(insn_frt(insn_in)), (others => '0')); else return ('0', (others => '0'), (others => '0')); end if; when FRC => if HAS_FPU then - return ('1', fpr_to_gspr(insn_frc(insn_in)), reg_data); + return ('1', fpr_to_gspr(insn_frc(insn_in)), (others => '0')); else return ('0', (others => '0'), (others => '0')); end if; @@ -264,10 +273,14 @@ architecture behaviour of decode2 is others => "000" ); + signal decoded_reg_a : decode_input_reg_t; + signal decoded_reg_b : decode_input_reg_t; + signal decoded_reg_c : decode_input_reg_t; + signal decoded_reg_o : decode_output_reg_t; + -- issue control signals signal control_valid_in : std_ulogic; signal control_valid_out : std_ulogic; - signal control_stall_out : std_ulogic; signal control_sgl_pipe : std_logic; signal gpr_write_valid : std_ulogic; @@ -302,8 +315,6 @@ begin complete_in => complete_in, valid_in => control_valid_in, - repeated => dc2.repeat, - busy_in => busy_in, deferred => deferred, flush_in => flush_in, sgl_pipe_in => 
control_sgl_pipe, @@ -331,7 +342,6 @@ begin cr_bypass => cr_bypass, valid_out => control_valid_out, - stall_out => control_stall_out, stopped_out => stopped_out, gpr_bypass_a => gpr_a_bypass, @@ -346,9 +356,12 @@ begin decode2_0: process(clk) begin if rising_edge(clk) then - if rst = '1' or flush_in = '1' or deferred = '0' then + if rst = '1' or flush_in = '1' then + dc2 <= reg_type_init; + elsif deferred = '0' then if dc2in.e.valid = '1' then - report "execute " & to_hstring(dc2in.e.nia); + report "execute " & to_hstring(dc2in.e.nia) & + " tag=" & integer'image(dc2in.e.instr_tag.tag) & std_ulogic'image(dc2in.e.instr_tag.valid); end if; dc2 <= dc2in; end if; @@ -357,205 +370,246 @@ begin c_out.read <= d_in.decode.input_cr; + decode2_addrs: process(all) + begin + decoded_reg_a <= decode_input_reg_init; + decoded_reg_b <= decode_input_reg_init; + decoded_reg_c <= decode_input_reg_init; + decoded_reg_o <= decode_output_reg_init; + if d_in.valid = '1' then + decoded_reg_a <= decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, d_in.ispr1, d_in.nia); + decoded_reg_b <= decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, d_in.ispr2); + decoded_reg_c <= decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn); + decoded_reg_o <= decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispro); + end if; + + r_out.read1_enable <= decoded_reg_a.reg_valid; + r_out.read1_reg <= decoded_reg_a.reg; + r_out.read2_enable <= decoded_reg_b.reg_valid; + r_out.read2_reg <= decoded_reg_b.reg; + r_out.read3_enable <= decoded_reg_c.reg_valid; + r_out.read3_reg <= decoded_reg_c.reg; + + end process; + decode2_1: process(all) variable v : reg_type; - variable decoded_reg_a : decode_input_reg_t; - variable decoded_reg_b : decode_input_reg_t; - variable decoded_reg_c : decode_input_reg_t; - variable decoded_reg_o : decode_output_reg_t; variable length : std_ulogic_vector(3 downto 0); variable op : insn_type_t; + variable valid_in : std_ulogic; begin v := dc2; - v.e := 
Decode2ToExecute1Init; - - --v.e.input_cr := d_in.decode.input_cr; - v.e.output_cr := d_in.decode.output_cr; + valid_in := d_in.valid or dc2.busy; - -- Work out whether XER common bits are set - v.e.output_xer := d_in.decode.output_carry; - case d_in.decode.insn_type is - when OP_ADD | OP_MUL_L64 | OP_DIV | OP_DIVE => - -- OE field is valid in OP_ADD/OP_MUL_L64 with major opcode 31 only - if d_in.insn(31 downto 26) = "011111" and insn_oe(d_in.insn) = '1' then - v.e.oe := '1'; - v.e.output_xer := '1'; - end if; - when OP_MTSPR => - if decode_spr_num(d_in.insn) = SPR_XER then - v.e.output_xer := '1'; - end if; - when others => - end case; + if dc2.busy = '0' then + v.e := Decode2ToExecute1Init; - decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1, - d_in.nia); - decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data, d_in.ispr2); - decoded_reg_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn, r_in.read3_data); - decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispro); + v.sgl_pipe := d_in.decode.sgl_pipe; - if d_in.decode.lr = '1' then - v.e.lr := insn_lk(d_in.insn); - -- b and bc have even major opcodes; bcreg is considered absolute - v.e.br_abs := insn_aa(d_in.insn) or d_in.insn(26); - end if; - op := d_in.decode.insn_type; + v.e.input_cr := d_in.decode.input_cr; + v.e.output_cr := d_in.decode.output_cr; - if d_in.decode.repeat /= NONE then - v.e.repeat := '1'; - v.e.second := dc2.repeat; - case d_in.decode.repeat is - when DUPD => - -- update-form loads, 2nd instruction writes RA - if dc2.repeat = '1' then - decoded_reg_o.reg := decoded_reg_a.reg; + -- Work out whether XER common bits are set + v.e.output_xer := d_in.decode.output_carry; + case d_in.decode.insn_type is + when OP_ADD | OP_MUL_L64 | OP_DIV | OP_DIVE => + -- OE field is valid in OP_ADD/OP_MUL_L64 with major opcode 31 only + if d_in.insn(31 downto 26) = "011111" and 
insn_oe(d_in.insn) = '1' then + v.e.oe := '1'; + v.e.output_xer := '1'; + end if; + when OP_MTSPR => + if decode_spr_num(d_in.insn) = SPR_XER then + v.e.output_xer := '1'; end if; when others => end case; - elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then - -- bcl/bclrl/bctarl that needs to write both CTR and LR has to be doubled - v.e.repeat := '1'; - v.e.second := dc2.repeat; - -- first one does CTR, second does LR - decoded_reg_o.reg(0) := not dc2.repeat; - end if; - v.e.spr_select := d_in.spr_info; + v.reg_a_valid := decoded_reg_a.reg_valid; + v.reg_b_valid := decoded_reg_b.reg_valid; + v.reg_c_valid := decoded_reg_c.reg_valid; + v.reg_o_valid := decoded_reg_o.reg_valid; - r_out.read1_enable <= decoded_reg_a.reg_valid and d_in.valid; - r_out.read1_reg <= decoded_reg_a.reg; - r_out.read2_enable <= decoded_reg_b.reg_valid and d_in.valid; - r_out.read2_reg <= decoded_reg_b.reg; - r_out.read3_enable <= decoded_reg_c.reg_valid and d_in.valid; - r_out.read3_reg <= decoded_reg_c.reg; + if d_in.decode.lr = '1' then + v.e.lr := insn_lk(d_in.insn); + -- b and bc have even major opcodes; bcreg is considered absolute + v.e.br_abs := insn_aa(d_in.insn) or d_in.insn(26); + end if; + op := d_in.decode.insn_type; + + v.repeat := d_in.decode.repeat; + if d_in.decode.repeat /= NONE then + v.e.repeat := '1'; + elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then + -- bcl/bclrl/bctarl that needs to write both CTR and LR has to be doubled + v.e.repeat := '1'; + end if; - case d_in.decode.length is - when is1B => - length := "0001"; - when is2B => - length := "0010"; - when is4B => - length := "0100"; - when is8B => - length := "1000"; - when NONE => - length := "0000"; - end case; + v.e.spr_select := d_in.spr_info; + + case d_in.decode.length is + when is1B => + length := "0001"; + when is2B => + length := "0010"; + when is4B => + length := "0100"; + when is8B => + length := "1000"; + when NONE => + length := "0000"; + end case; - -- execute unit - v.e.nia := 
d_in.nia; - v.e.unit := d_in.decode.unit; - v.e.fac := d_in.decode.facility; - v.e.instr_tag := instr_tag; - v.e.read_reg1 := decoded_reg_a.reg; - v.e.read_reg2 := decoded_reg_b.reg; - v.e.write_reg := decoded_reg_o.reg; - v.e.write_reg_enable := decoded_reg_o.reg_valid; - v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); - v.e.xerc := c_in.read_xerc_data; - v.e.invert_a := d_in.decode.invert_a; - v.e.addm1 := '0'; - v.e.insn_type := op; - v.e.invert_out := d_in.decode.invert_out; - v.e.input_carry := d_in.decode.input_carry; - v.e.output_carry := d_in.decode.output_carry; - v.e.is_32bit := d_in.decode.is_32bit; - v.e.is_signed := d_in.decode.is_signed; - v.e.insn := d_in.insn; - v.e.data_len := length; - v.e.byte_reverse := d_in.decode.byte_reverse; - v.e.sign_extend := d_in.decode.sign_extend; - v.e.update := d_in.decode.update; - v.e.reserve := d_in.decode.reserve; - v.e.br_pred := d_in.br_pred; - v.e.result_sel := result_select(op); - v.e.sub_select := subresult_select(op); - if op = OP_BC or op = OP_BCREG then - if d_in.insn(23) = '0' and dc2.repeat = '0' and - not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then - -- decrement CTR if BO(2) = 0 and not bcctr - v.e.addm1 := '1'; - v.e.result_sel := "000"; -- select adder output + -- execute unit + v.e.nia := d_in.nia; + v.e.unit := d_in.decode.unit; + v.e.fac := d_in.decode.facility; + v.e.read_reg1 := decoded_reg_a.reg; + v.e.read_reg2 := decoded_reg_b.reg; + v.e.read_reg3 := decoded_reg_c.reg; + v.e.write_reg := decoded_reg_o.reg; + v.e.write_reg_enable := decoded_reg_o.reg_valid; + v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); + v.e.xerc := c_in.read_xerc_data; + v.e.invert_a := d_in.decode.invert_a; + v.e.addm1 := '0'; + v.e.insn_type := op; + v.e.invert_out := d_in.decode.invert_out; + v.e.input_carry := d_in.decode.input_carry; + v.e.output_carry := d_in.decode.output_carry; + v.e.is_32bit := d_in.decode.is_32bit; + v.e.is_signed := d_in.decode.is_signed; + v.e.insn := d_in.insn; + 
v.e.data_len := length; + v.e.byte_reverse := d_in.decode.byte_reverse; + v.e.sign_extend := d_in.decode.sign_extend; + v.e.update := d_in.decode.update; + v.e.reserve := d_in.decode.reserve; + v.e.br_pred := d_in.br_pred; + v.e.result_sel := result_select(op); + v.e.sub_select := subresult_select(op); + if op = OP_BC or op = OP_BCREG then + if d_in.insn(23) = '0' and + not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then + -- decrement CTR if BO(2) = 0 and not bcctr + v.e.addm1 := '1'; + v.e.result_sel := "000"; -- select adder output + end if; end if; - end if; - if op = OP_MFSPR then - if is_fast_spr(d_in.ispr1) = '1' then - v.e.result_sel := "000"; -- adder_result, effectively a_in - elsif d_in.spr_info.valid = '0' then - -- Privileged mfspr to invalid/unimplemented SPR numbers - -- writes the contents of RT back to RT (i.e. it's a no-op) - v.e.result_sel := "001"; -- logical_result - elsif d_in.spr_info.ispmu = '1' then - v.e.result_sel := "100"; -- pmuspr_result + if op = OP_MFSPR then + if is_fast_spr(d_in.ispr1) = '1' then + v.e.result_sel := "000"; -- adder_result, effectively a_in + elsif d_in.spr_info.valid = '0' then + -- Privileged mfspr to invalid/unimplemented SPR numbers + -- writes the contents of RT back to RT (i.e. it's a no-op) + v.e.result_sel := "001"; -- logical_result + elsif d_in.spr_info.ispmu = '1' then + v.e.result_sel := "100"; -- pmuspr_result + end if; end if; - end if; - -- See if any of the operands can get their value via the bypass path. 
- case gpr_a_bypass is - when "10" => - v.e.read_data1 := execute_bypass.data; - when "11" => - v.e.read_data1 := execute2_bypass.data; - when others => - v.e.read_data1 := decoded_reg_a.data; - end case; - case gpr_b_bypass is - when "10" => - v.e.read_data2 := execute_bypass.data; - when "11" => - v.e.read_data2 := execute2_bypass.data; - when others => - v.e.read_data2 := decoded_reg_b.data; - end case; - case gpr_c_bypass is - when "10" => - v.e.read_data3 := execute_bypass.data; - when "11" => - v.e.read_data3 := execute2_bypass.data; - when others => - v.e.read_data3 := decoded_reg_c.data; - end case; - - v.e.cr := c_in.read_cr_data; - if cr_bypass = "10" then - v.e.cr := execute_cr_bypass.data; - elsif cr_bypass = "11" then - v.e.cr := execute2_cr_bypass.data; + elsif dc2.e.valid = '1' then + -- dc2.busy = 1 and dc2.e.valid = 1, thus this must be a repeated instruction. + -- Set up for the second iteration (if deferred = 1 this will all be ignored) + v.e.second := '1'; + case dc2.repeat is + when DUPD => + -- update-form loads, 2nd instruction writes RA + v.e.write_reg := dc2.e.read_reg1; + when NONE => + -- bcl/bclrl/bctarl that needs to write both CTR and LR + v.e.write_reg(0) := '0'; -- point to LR + v.e.result_sel := "110"; -- select NIA (to go to LR) + when others => + end case; end if; -- issue control - control_valid_in <= d_in.valid; - control_sgl_pipe <= d_in.decode.sgl_pipe; + control_valid_in <= valid_in; + control_sgl_pipe <= v.sgl_pipe; - gpr_write_valid <= v.e.write_reg_enable; - gpr_write <= decoded_reg_o.reg; + gpr_write_valid <= v.reg_o_valid; + gpr_write <= v.e.write_reg; - gpr_a_read_valid <= decoded_reg_a.reg_valid; - gpr_a_read <= decoded_reg_a.reg; + gpr_a_read_valid <= v.reg_a_valid; + gpr_a_read <= v.e.read_reg1; - gpr_b_read_valid <= decoded_reg_b.reg_valid; - gpr_b_read <= decoded_reg_b.reg; + gpr_b_read_valid <= v.reg_b_valid; + gpr_b_read <= v.e.read_reg2; - gpr_c_read_valid <= decoded_reg_c.reg_valid; - gpr_c_read <= 
decoded_reg_c.reg; + gpr_c_read_valid <= v.reg_c_valid; + gpr_c_read <= v.e.read_reg3; - cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); + cr_write_valid <= v.e.output_cr or v.e.rc; -- Since ops that write CR only write some of the fields, -- any op that writes CR effectively also reads it. - cr_read_valid <= cr_write_valid or d_in.decode.input_cr; + cr_read_valid <= cr_write_valid or v.e.input_cr; - v.e.valid := control_valid_out; - if control_valid_out = '1' then - v.repeat := v.e.repeat and not dc2.repeat; + -- See if any of the operands can get their value via the bypass path. + if dc2.busy = '0' or gpr_a_bypass /= "00" then + case gpr_a_bypass is + when "01" => + v.e.read_data1 := execute_bypass.data; + when "10" => + v.e.read_data1 := execute2_bypass.data; + when "11" => + v.e.read_data1 := writeback_bypass.data; + when others => + if decoded_reg_a.reg_valid = '1' then + v.e.read_data1 := r_in.read1_data; + else + v.e.read_data1 := decoded_reg_a.data; + end if; + end case; + end if; + if dc2.busy = '0' or gpr_b_bypass /= "00" then + case gpr_b_bypass is + when "01" => + v.e.read_data2 := execute_bypass.data; + when "10" => + v.e.read_data2 := execute2_bypass.data; + when "11" => + v.e.read_data2 := writeback_bypass.data; + when others => + if decoded_reg_b.reg_valid = '1' then + v.e.read_data2 := r_in.read2_data; + else + v.e.read_data2 := decoded_reg_b.data; + end if; + end case; + end if; + if dc2.busy = '0' or gpr_c_bypass /= "00" then + case gpr_c_bypass is + when "01" => + v.e.read_data3 := execute_bypass.data; + when "10" => + v.e.read_data3 := execute2_bypass.data; + when "11" => + v.e.read_data3 := writeback_bypass.data; + when others => + if decoded_reg_c.reg_valid = '1' then + v.e.read_data3 := r_in.read3_data; + else + v.e.read_data3 := decoded_reg_c.data; + end if; + end case; end if; - stall_out <= control_stall_out or v.repeat; + case cr_bypass is + when "10" => + v.e.cr := execute_cr_bypass.data; + when "11" => 
+ v.e.cr := execute2_cr_bypass.data; + when others => + v.e.cr := c_in.read_cr_data; + end case; - if rst = '1' or flush_in = '1' then - v.e := Decode2ToExecute1Init; - v.repeat := '0'; - end if; + v.e.valid := control_valid_out; + v.e.instr_tag := instr_tag; + v.busy := valid_in and (not control_valid_out or (v.e.repeat and not v.e.second)); + + stall_out <= dc2.busy or deferred; -- Update registers dc2in <= v; @@ -574,9 +628,9 @@ begin dc2.e.valid & stopped_out & stall_out & - (gpr_a_bypass(1) or gpr_a_bypass(0)) & - (gpr_b_bypass(1) or gpr_b_bypass(0)) & - (gpr_c_bypass(1) or gpr_c_bypass(0)); + (gpr_a_bypass(1) xor gpr_a_bypass(0)) & + (gpr_b_bypass(1) xor gpr_b_bypass(0)) & + (gpr_c_bypass(1) xor gpr_c_bypass(0)); end if; end process; log_out <= log_data; diff --git a/register_file.vhdl b/register_file.vhdl index ab35855..0235dfc 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -100,18 +100,8 @@ begin d_out.read2_data <= rd_port_b; d_out.read3_data <= registers(to_integer(unsigned(c_addr))); - -- Forward any written data - if w_in.write_enable = '1' then - if a_addr = w_addr then - d_out.read1_data <= w_in.write_data; - end if; - if b_addr = w_addr then - d_out.read2_data <= w_in.write_data; - end if; - if c_addr = w_addr then - d_out.read3_data <= w_in.write_data; - end if; - end if; + -- Forwarding of written data is now done explicitly with a bypass path + -- from writeback to decode2. 
end process register_read_0; -- Latch read data and ack if dbg read requested and B port not busy diff --git a/writeback.vhdl b/writeback.vhdl index db30164..0d6f41d 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -19,6 +19,8 @@ entity writeback is c_out : out WritebackToCrFileType; f_out : out WritebackToFetch1Type; + wb_bypass : out bypass_data_t; + -- PMU event bus events : out WritebackEventType; @@ -215,6 +217,11 @@ begin f_out <= f; flush_out <= f_out.redirect; + -- Register write data bypass to decode2 + wb_bypass.tag.tag <= complete_out.tag; + wb_bypass.tag.valid <= complete_out.valid and w_out.write_enable; + wb_bypass.data <= w_out.write_data; + rin <= v; end process; end; From 2da08bcf2e64e5f77ce8b4098ae27101dceef6cc Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 8 Jul 2022 16:37:12 +1000 Subject: [PATCH 14/30] decode1: Remove stash buffer Now that the timing of the busy signal from decode2 doesn't depend on register numbers or downstream instruction completion, we no longer need the stash buffer on the output of decode1. 
Signed-off-by: Paul Mackerras --- decode1.vhdl | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index b807054..5bc023b 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -30,7 +30,6 @@ end entity decode1; architecture behaviour of decode1 is signal r, rin : Decode1ToDecode2Type; - signal s : Decode1ToDecode2Type; signal f, fin : Decode1ToFetch1Type; constant illegal_inst : decode_rom_t := @@ -46,7 +45,6 @@ architecture behaviour of decode1 is (override => '0', override_decode => illegal_inst, override_unit => '0', force_single => '0'); signal ri, ri_in : reg_internal_t; - signal si : reg_internal_t; type br_predictor_t is record br_nia : std_ulogic_vector(61 downto 0); @@ -555,26 +553,12 @@ begin if rising_edge(clk) then if rst = '1' then r <= Decode1ToDecode2Init; - s <= Decode1ToDecode2Init; ri <= reg_internal_t_init; - si <= reg_internal_t_init; elsif flush_in = '1' then r.valid <= '0'; - s.valid <= '0'; - elsif s.valid = '1' then - if stall_in = '0' then - r <= s; - ri <= si; - s.valid <= '0'; - end if; - else - s <= rin; - si <= ri_in; - s.valid <= rin.valid and r.valid and stall_in; - if r.valid = '0' or stall_in = '0' then - r <= rin; - ri <= ri_in; - end if; + elsif stall_in = '0' then + r <= rin; + ri <= ri_in; end if; if rst = '1' then br.br_nia <= (others => '0'); @@ -585,7 +569,7 @@ begin end if; end if; end process; - busy_out <= s.valid; + busy_out <= stall_in; decode1_1: process(all) variable v : Decode1ToDecode2Type; From e598c2aef8067f2fdbcb0f2eab3d945e3eca1335 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 9 Jul 2022 11:55:13 +1000 Subject: [PATCH 15/30] control: Reimplement serialization using tags This lets us get rid of r_int and its 'outstanding' counter. We now test more directly for excess completions by checking that we don't get duplicate completions for the same tag. 
Signed-off-by: Paul Mackerras --- control.vhdl | 116 +++++++++++---------------------------------------- decode2.vhdl | 12 ++++-- 2 files changed, 32 insertions(+), 96 deletions(-) diff --git a/control.vhdl b/control.vhdl index e6855c2..e5ad1c7 100644 --- a/control.vhdl +++ b/control.vhdl @@ -17,7 +17,7 @@ entity control is valid_in : in std_ulogic; flush_in : in std_ulogic; deferred : in std_ulogic; - sgl_pipe_in : in std_ulogic; + serialize : in std_ulogic; stop_mark_in : in std_ulogic; gpr_write_valid_in : in std_ulogic; @@ -53,16 +53,6 @@ entity control is end entity control; architecture rtl of control is - type state_type is (IDLE, WAIT_FOR_PREV_TO_COMPLETE, WAIT_FOR_CURR_TO_COMPLETE); - - type reg_internal_type is record - state : state_type; - outstanding : integer range -1 to PIPELINE_DEPTH+2; - end record; - constant reg_internal_init : reg_internal_type := (state => IDLE, outstanding => 0); - - signal r_int, rin_int : reg_internal_type := reg_internal_init; - signal gpr_write_valid : std_ulogic; signal cr_write_valid : std_ulogic; @@ -71,6 +61,7 @@ architecture rtl of control is reg : gspr_index_t; recent : std_ulogic; wr_cr : std_ulogic; + valid : std_ulogic; end record; type tag_regs_array is array(tag_number_t) of tag_register; @@ -80,27 +71,29 @@ architecture rtl of control is signal gpr_tag_stall : std_ulogic; signal cr_tag_stall : std_ulogic; + signal serial_stall : std_ulogic; signal curr_tag : tag_number_t; signal next_tag : tag_number_t; signal curr_cr_tag : tag_number_t; + signal prev_tag : tag_number_t; begin control0: process(clk) begin if rising_edge(clk) then - assert rin_int.outstanding >= 0 and rin_int.outstanding <= (PIPELINE_DEPTH+1) - report "Outstanding bad " & integer'image(rin_int.outstanding) severity failure; - r_int <= rin_int; for i in tag_number_t loop if rst = '1' or flush_in = '1' then tag_regs(i).wr_gpr <= '0'; tag_regs(i).wr_cr <= '0'; + tag_regs(i).valid <= '0'; else if complete_in.valid = '1' and i = complete_in.tag then 
+ assert tag_regs(i).valid = '1' report "spurious completion" severity failure; tag_regs(i).wr_gpr <= '0'; tag_regs(i).wr_cr <= '0'; + tag_regs(i).valid <= '0'; report "tag " & integer'image(i) & " not valid"; end if; if instr_tag.valid = '1' and gpr_write_valid = '1' and @@ -115,6 +108,7 @@ begin tag_regs(i).reg <= gpr_write_in; tag_regs(i).recent <= gpr_write_valid; tag_regs(i).wr_cr <= cr_write_valid; + tag_regs(i).valid <= '1'; if gpr_write_valid = '1' then report "tag " & integer'image(i) & " valid for gpr " & to_hstring(gpr_write_in); end if; @@ -124,11 +118,15 @@ begin if rst = '1' then curr_tag <= 0; curr_cr_tag <= 0; + prev_tag <= 0; else curr_tag <= next_tag; if instr_tag.valid = '1' and cr_write_valid = '1' then curr_cr_tag <= instr_tag.tag; end if; + if valid_out = '1' then + prev_tag <= instr_tag.tag; + end if; end if; end if; end process; @@ -146,6 +144,7 @@ begin variable byp_c : std_ulogic_vector(1 downto 0); variable tag_cr : instr_tag_t; variable byp_cr : std_ulogic_vector(1 downto 0); + variable tag_prev : instr_tag_t; begin tag_a := instr_tag_init; for i in tag_number_t loop @@ -226,107 +225,40 @@ begin cr_bypass <= byp_cr; cr_tag_stall <= tag_cr.valid and not byp_cr(1); + + tag_prev.tag := prev_tag; + tag_prev.valid := tag_regs(prev_tag).valid; + if tag_match(tag_prev, complete_in) then + tag_prev.valid := '0'; + end if; + serial_stall <= tag_prev.valid; end process; control1 : process(all) - variable v_int : reg_internal_type; variable valid_tmp : std_ulogic; - variable stall_tmp : std_ulogic; begin - v_int := r_int; - -- asynchronous valid_tmp := valid_in and not flush_in; - stall_tmp := '0'; - - if flush_in = '1' then - v_int.outstanding := 0; - elsif complete_in.valid = '1' then - v_int.outstanding := r_int.outstanding - 1; - end if; - if r_int.outstanding >= PIPELINE_DEPTH + 1 then - valid_tmp := '0'; - stall_tmp := '1'; - end if; if rst = '1' then gpr_write_valid <= '0'; cr_write_valid <= '0'; - v_int := reg_internal_init; valid_tmp := 
'0'; end if; -- Handle debugger stop - stopped_out <= '0'; - if stop_mark_in = '1' and v_int.outstanding = 0 then - stopped_out <= '1'; - end if; - - -- state machine to handle instructions that must be single - -- through the pipeline. - case r_int.state is - when IDLE => - if valid_tmp = '1' then - if (sgl_pipe_in = '1') then - if v_int.outstanding /= 0 then - v_int.state := WAIT_FOR_PREV_TO_COMPLETE; - stall_tmp := '1'; - else - -- send insn out and wait on it to complete - v_int.state := WAIT_FOR_CURR_TO_COMPLETE; - end if; - else - -- let it go out if there are no GPR or CR hazards - stall_tmp := gpr_tag_stall or cr_tag_stall; - end if; - end if; - - when WAIT_FOR_PREV_TO_COMPLETE => - if v_int.outstanding = 0 then - -- send insn out and wait on it to complete - v_int.state := WAIT_FOR_CURR_TO_COMPLETE; - else - stall_tmp := '1'; - end if; - - when WAIT_FOR_CURR_TO_COMPLETE => - if v_int.outstanding = 0 then - v_int.state := IDLE; - -- XXX Don't replicate this - if valid_tmp = '1' then - if (sgl_pipe_in = '1') then - if v_int.outstanding /= 0 then - v_int.state := WAIT_FOR_PREV_TO_COMPLETE; - stall_tmp := '1'; - else - -- send insn out and wait on it to complete - v_int.state := WAIT_FOR_CURR_TO_COMPLETE; - end if; - else - -- let it go out if there are no GPR or CR hazards - stall_tmp := gpr_tag_stall or cr_tag_stall; - end if; - end if; - else - stall_tmp := '1'; - end if; - end case; + stopped_out <= stop_mark_in and not serial_stall; - if stall_tmp = '1' then + -- Don't let it go out if there are GPR or CR hazards + -- or we are waiting for the previous instruction to complete + if (gpr_tag_stall or cr_tag_stall or (serialize and serial_stall)) = '1' then valid_tmp := '0'; end if; gpr_write_valid <= gpr_write_valid_in and valid_tmp; cr_write_valid <= cr_write_in and valid_tmp; - if valid_tmp = '1' and deferred = '0' then - v_int.outstanding := v_int.outstanding + 1; - end if; - -- update outputs valid_out <= valid_tmp; - - -- update registers - rin_int <= 
v_int; end process; end; diff --git a/decode2.vhdl b/decode2.vhdl index 41f3e09..500e4f5 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -53,6 +53,7 @@ architecture behaviour of decode2 is repeat : repeat_t; busy : std_ulogic; sgl_pipe : std_ulogic; + prev_sgl : std_ulogic; reg_a_valid : std_ulogic; reg_b_valid : std_ulogic; reg_c_valid : std_ulogic; @@ -281,7 +282,7 @@ architecture behaviour of decode2 is -- issue control signals signal control_valid_in : std_ulogic; signal control_valid_out : std_ulogic; - signal control_sgl_pipe : std_logic; + signal control_serialize : std_logic; signal gpr_write_valid : std_ulogic; signal gpr_write : gspr_index_t; @@ -317,7 +318,7 @@ begin valid_in => control_valid_in, deferred => deferred, flush_in => flush_in, - sgl_pipe_in => control_sgl_pipe, + serialize => control_serialize, stop_mark_in => d_in.stop_mark, gpr_write_valid_in => gpr_write_valid, @@ -405,7 +406,10 @@ begin if dc2.busy = '0' then v.e := Decode2ToExecute1Init; - v.sgl_pipe := d_in.decode.sgl_pipe; + if d_in.valid = '1' then + v.prev_sgl := dc2.sgl_pipe; + v.sgl_pipe := d_in.decode.sgl_pipe; + end if; v.e.input_cr := d_in.decode.input_cr; v.e.output_cr := d_in.decode.output_cr; @@ -527,7 +531,7 @@ begin -- issue control control_valid_in <= valid_in; - control_sgl_pipe <= v.sgl_pipe; + control_serialize <= v.sgl_pipe or v.prev_sgl; gpr_write_valid <= v.reg_o_valid; gpr_write <= v.e.write_reg; From 7c240a664bb68bc1d9c35254fe06e07436eb1318 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 9 Jul 2022 13:17:18 +1000 Subject: [PATCH 16/30] fetch1: Fix debug stop again This fixes a bug which prevents the core from stopping properly. The same bug was previously fixed in commit e41cb01bca99 ("fetch1: Fix debug stop", 2020-12-19) and reintroduced by commit 0fb207be6069 ("fetch1: Implement a simple branch target cache", 2020-12-19). 
Signed-off-by: Paul Mackerras --- fetch1.vhdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fetch1.vhdl b/fetch1.vhdl index 4c4a6a8..af1dd6b 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -93,7 +93,7 @@ begin end if; -- always send the up-to-date stop mark and req r.stop_mark <= stop_in; - r.req <= not rst; + r.req <= not rst and not stop_in; end if; end process; log_out <= log_nia; From d1850fea29a88bcb4f7789da1e4e50550c2eb9ec Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 9 Jul 2022 18:29:48 +1000 Subject: [PATCH 17/30] Track hazards explicitly for XER overflow bits This provides a mechanism for tracking updates to the XER overflow bits (SO, OV, OV32) and stalling instructions which need current values of those bits (mfxer, integer compare instructions, integer Rc=1 instructions, addex) or which write carry bits (since all the XER common bits are written together, if we are writing CA/CA32 we need up-to-date values of SO/OV/OV32). This will enable updates to SO/OV/OV32 to be done at other places besides the ex1 stage.
Signed-off-by: Paul Mackerras --- control.vhdl | 26 +++++++++++++++++++++++++- decode2.vhdl | 36 +++++++++++++++++++++++++++++++++--- execute1.vhdl | 20 +++++++++++++------- 3 files changed, 71 insertions(+), 11 deletions(-) diff --git a/control.vhdl b/control.vhdl index e5ad1c7..e8c8068 100644 --- a/control.vhdl +++ b/control.vhdl @@ -39,6 +39,8 @@ entity control is cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; + ov_read_in : in std_ulogic; + ov_write_in : in std_ulogic; valid_out : out std_ulogic; stopped_out : out std_ulogic; @@ -55,12 +57,14 @@ end entity control; architecture rtl of control is signal gpr_write_valid : std_ulogic; signal cr_write_valid : std_ulogic; + signal ov_write_valid : std_ulogic; type tag_register is record wr_gpr : std_ulogic; reg : gspr_index_t; recent : std_ulogic; wr_cr : std_ulogic; + wr_ov : std_ulogic; valid : std_ulogic; end record; @@ -71,12 +75,14 @@ architecture rtl of control is signal gpr_tag_stall : std_ulogic; signal cr_tag_stall : std_ulogic; + signal ov_tag_stall : std_ulogic; signal serial_stall : std_ulogic; signal curr_tag : tag_number_t; signal next_tag : tag_number_t; signal curr_cr_tag : tag_number_t; + signal curr_ov_tag : tag_number_t; signal prev_tag : tag_number_t; begin @@ -87,12 +93,14 @@ begin if rst = '1' or flush_in = '1' then tag_regs(i).wr_gpr <= '0'; tag_regs(i).wr_cr <= '0'; + tag_regs(i).wr_ov <= '0'; tag_regs(i).valid <= '0'; else if complete_in.valid = '1' and i = complete_in.tag then assert tag_regs(i).valid = '1' report "spurious completion" severity failure; tag_regs(i).wr_gpr <= '0'; tag_regs(i).wr_cr <= '0'; + tag_regs(i).wr_ov <= '0'; tag_regs(i).valid <= '0'; report "tag " & integer'image(i) & " not valid"; end if; @@ -108,6 +116,7 @@ begin tag_regs(i).reg <= gpr_write_in; tag_regs(i).recent <= gpr_write_valid; tag_regs(i).wr_cr <= cr_write_valid; + tag_regs(i).wr_ov <= ov_write_valid; tag_regs(i).valid <= '1'; if gpr_write_valid = '1' then report "tag " & integer'image(i) & " 
valid for gpr " & to_hstring(gpr_write_in); @@ -118,12 +127,16 @@ begin if rst = '1' then curr_tag <= 0; curr_cr_tag <= 0; + curr_ov_tag <= 0; prev_tag <= 0; else curr_tag <= next_tag; if instr_tag.valid = '1' and cr_write_valid = '1' then curr_cr_tag <= instr_tag.tag; end if; + if instr_tag.valid = '1' and ov_write_valid = '1' then + curr_ov_tag <= instr_tag.tag; + end if; if valid_out = '1' then prev_tag <= instr_tag.tag; end if; @@ -144,6 +157,7 @@ begin variable byp_c : std_ulogic_vector(1 downto 0); variable tag_cr : instr_tag_t; variable byp_cr : std_ulogic_vector(1 downto 0); + variable tag_ov : instr_tag_t; variable tag_prev : instr_tag_t; begin tag_a := instr_tag_init; @@ -226,6 +240,14 @@ begin cr_bypass <= byp_cr; cr_tag_stall <= tag_cr.valid and not byp_cr(1); + -- OV hazards + tag_ov.tag := curr_ov_tag; + tag_ov.valid := ov_read_in and tag_regs(curr_ov_tag).wr_ov; + if tag_match(tag_ov, complete_in) then + tag_ov.valid := '0'; + end if; + ov_tag_stall <= tag_ov.valid; + tag_prev.tag := prev_tag; tag_prev.valid := tag_regs(prev_tag).valid; if tag_match(tag_prev, complete_in) then @@ -251,12 +273,14 @@ begin -- Don't let it go out if there are GPR or CR hazards -- or we are waiting for the previous instruction to complete - if (gpr_tag_stall or cr_tag_stall or (serialize and serial_stall)) = '1' then + if (gpr_tag_stall or cr_tag_stall or ov_tag_stall or + (serialize and serial_stall)) = '1' then valid_tmp := '0'; end if; gpr_write_valid <= gpr_write_valid_in and valid_tmp; cr_write_valid <= cr_write_in and valid_tmp; + ov_write_valid <= ov_write_in and valid_tmp; -- update outputs valid_out <= valid_tmp; diff --git a/decode2.vhdl b/decode2.vhdl index 500e4f5..a043ef9 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -58,6 +58,8 @@ architecture behaviour of decode2 is reg_b_valid : std_ulogic; reg_c_valid : std_ulogic; reg_o_valid : std_ulogic; + input_ov : std_ulogic; + output_ov : std_ulogic; end record; constant reg_type_init : reg_type := (e => 
Decode2ToExecute1Init, repeat => NONE, others => '0'); @@ -303,6 +305,9 @@ architecture behaviour of decode2 is signal cr_write_valid : std_ulogic; signal cr_bypass : std_ulogic_vector(1 downto 0); + signal ov_read_valid : std_ulogic; + signal ov_write_valid : std_ulogic; + signal instr_tag : instr_tag_t; begin @@ -342,6 +347,9 @@ begin cr_write_in => cr_write_valid, cr_bypass => cr_bypass, + ov_read_in => ov_read_valid, + ov_write_in => ov_write_valid, + valid_out => control_valid_out, stopped_out => stopped_out, @@ -414,19 +422,39 @@ begin v.e.input_cr := d_in.decode.input_cr; v.e.output_cr := d_in.decode.output_cr; - -- Work out whether XER common bits are set + -- Work out whether XER SO/OV/OV32 bits are set + -- or used by this instruction + v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); v.e.output_xer := d_in.decode.output_carry; + v.input_ov := d_in.decode.output_carry; + v.output_ov := '0'; + if d_in.decode.input_carry = OV then + v.input_ov := '1'; + v.output_ov := '1'; + end if; + if v.e.rc = '1' and d_in.decode.facility /= FPU then + v.input_ov := '1'; + end if; case d_in.decode.insn_type is when OP_ADD | OP_MUL_L64 | OP_DIV | OP_DIVE => -- OE field is valid in OP_ADD/OP_MUL_L64 with major opcode 31 only if d_in.insn(31 downto 26) = "011111" and insn_oe(d_in.insn) = '1' then v.e.oe := '1'; v.e.output_xer := '1'; + v.output_ov := '1'; + v.input_ov := '1'; -- need SO state if setting OV to 0 + end if; + when OP_MFSPR => + if decode_spr_num(d_in.insn) = SPR_XER then + v.input_ov := '1'; end if; when OP_MTSPR => if decode_spr_num(d_in.insn) = SPR_XER then v.e.output_xer := '1'; + v.output_ov := '1'; end if; + when OP_CMP | OP_MCRXRX => + v.input_ov := '1'; when others => end case; @@ -474,8 +502,6 @@ begin v.e.read_reg3 := decoded_reg_c.reg; v.e.write_reg := decoded_reg_o.reg; v.e.write_reg_enable := decoded_reg_o.reg_valid; - v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); - v.e.xerc := c_in.read_xerc_data; v.e.invert_a := d_in.decode.invert_a; v.e.addm1 
:= '0'; v.e.insn_type := op; @@ -550,6 +576,9 @@ begin -- any op that writes CR effectively also reads it. cr_read_valid <= cr_write_valid or v.e.input_cr; + ov_read_valid <= v.input_ov; + ov_write_valid <= v.output_ov; + -- See if any of the operands can get their value via the bypass path. if dc2.busy = '0' or gpr_a_bypass /= "00" then case gpr_a_bypass is @@ -608,6 +637,7 @@ begin when others => v.e.cr := c_in.read_cr_data; end case; + v.e.xerc := c_in.read_xerc_data; v.e.valid := control_valid_out; v.e.instr_tag := instr_tag; diff --git a/execute1.vhdl b/execute1.vhdl index 57f90b0..6fadc8c 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -435,12 +435,18 @@ begin x_to_pmu.spr_val <= ex1.e.write_data; x_to_pmu.run <= '1'; - -- XER forwarding. To avoid having to track XER hazards, we use - -- the previously latched value. Since the XER common bits - -- (SO, OV[32] and CA[32]) are only modified by instructions that are - -- handled here, we can just use the result most recently sent to - -- writeback, unless a pipeline flush has happened in the meantime. - xerc_in <= ex1.xerc when ex1.xerc_valid = '1' else e_in.xerc; + -- XER forwarding. The CA and CA32 bits are only modified by instructions + -- that are handled here, so for them we can just use the result most + -- recently sent to writeback, unless a pipeline flush has happened in the + -- meantime. + -- Hazards for SO/OV/OV32 are handled by control.vhdl as there may be other + -- units writing to them. No forwarding is done because performance of + -- instructions that alter them is not considered significant. + xerc_in.so <= e_in.xerc.so; + xerc_in.ov <= e_in.xerc.ov; + xerc_in.ov32 <= e_in.xerc.ov32; + xerc_in.ca <= ex1.xerc.ca when ex1.xerc_valid = '1' else e_in.xerc.ca; + xerc_in.ca32 <= ex1.xerc.ca32 when ex1.xerc_valid = '1' else e_in.xerc.ca32; -- N.B. the busy signal from each source includes the -- stage2 stall from that source in it. 
@@ -1561,7 +1567,7 @@ begin cr_res(31) := sign; cr_res(30) := not (sign or zero); cr_res(29) := zero; - cr_res(28) := ex1.xerc.so; + cr_res(28) := ex1.e.xerc.so; cr_mask(7) := '1'; end if; From 23d5c4edc50bf64a7e675220c338671059ede0bf Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 2 May 2022 09:39:26 +1000 Subject: [PATCH 18/30] FPU: Convert internal R, A, B, and C registers to 8.56 format This changes the representation of the R, A, B and C registers in the FPU from 10.54 format (10 bits to the left of the binary point and 54 bits to the right) to 8.56 format, to match the representation used in the P and Y registers and the multiplier operands. This eliminates the need for shifting when R, A, B or C is an input to the multiplier and will make it easier to implement integer division in the FPU. Signed-off-by: Paul Mackerras --- fpu.vhdl | 220 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 123 insertions(+), 97 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index a20a7a0..27587f7 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -28,12 +28,20 @@ architecture behaviour of fpu is type fp_number_class is (ZERO, FINITE, INFINITY, NAN); constant EXP_BITS : natural := 13; + constant UNIT_BIT : natural := 56; + constant QNAN_BIT : natural := UNIT_BIT - 1; + constant SP_LSB : natural := UNIT_BIT - 23; + constant SP_GBIT : natural := SP_LSB - 1; + constant SP_RBIT : natural := SP_LSB - 2; + constant DP_LSB : natural := UNIT_BIT - 52; + constant DP_GBIT : natural := DP_LSB - 1; + constant DP_RBIT : natural := DP_LSB - 2; type fpu_reg_type is record class : fp_number_class; negative : std_ulogic; exponent : signed(EXP_BITS-1 downto 0); -- unbiased - mantissa : std_ulogic_vector(63 downto 0); -- 10.54 format + mantissa : std_ulogic_vector(63 downto 0); -- 8.56 format end record; type state_t is (IDLE, DO_ILLEGAL, @@ -92,7 +100,7 @@ architecture behaviour of fpu is a : fpu_reg_type; b : fpu_reg_type; c : fpu_reg_type; - r : std_ulogic_vector(63 downto 
0); -- 10.54 format + r : std_ulogic_vector(63 downto 0); -- 8.56 format s : std_ulogic_vector(55 downto 0); -- extended fraction x : std_ulogic; p : std_ulogic_vector(63 downto 0); -- 8.56 format @@ -170,7 +178,7 @@ architecture behaviour of fpu is constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00"; constant BIN_R : std_ulogic_vector(1 downto 0) := "01"; constant BIN_RND : std_ulogic_vector(1 downto 0) := "10"; - constant BIN_PS6 : std_ulogic_vector(1 downto 0) := "11"; + constant BIN_PS8 : std_ulogic_vector(1 downto 0) := "11"; constant RES_SUM : std_ulogic_vector(1 downto 0) := "00"; constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01"; @@ -432,7 +440,8 @@ architecture behaviour of fpu is if exp_nz = '0' then r.exponent := to_signed(-1022, EXP_BITS); end if; - r.mantissa := "000000000" & exp_nz & fpr(51 downto 0) & "00"; + r.mantissa := std_ulogic_vector(shift_left(resize(unsigned(exp_nz & fpr(51 downto 0)), 64), + UNIT_BIT - 52)); cls := exp_ao & exp_nz & frac_nz; case cls is when "000" => r.class := ZERO; @@ -465,22 +474,22 @@ architecture behaviour of fpu is case class is when ZERO => when FINITE => - if mantissa(54) = '1' then + if mantissa(UNIT_BIT) = '1' then -- normalized number result(62 downto 52) := std_ulogic_vector(resize(exp, 11) + 1023); end if; - result(51 downto 29) := mantissa(53 downto 31); + result(51 downto 29) := mantissa(UNIT_BIT - 1 downto SP_LSB); if single_prec = '0' then - result(28 downto 0) := mantissa(30 downto 2); + result(28 downto 0) := mantissa(SP_LSB - 1 downto DP_LSB); end if; when INFINITY => result(62 downto 52) := "11111111111"; when NAN => result(62 downto 52) := "11111111111"; - result(51) := quieten_nan or mantissa(53); - result(50 downto 29) := mantissa(52 downto 31); + result(51) := quieten_nan or mantissa(QNAN_BIT); + result(50 downto 29) := mantissa(QNAN_BIT - 1 downto SP_LSB); if single_prec = '0' then - result(28 downto 0) := mantissa(30 downto 2); + result(28 downto 0) := mantissa(SP_LSB - 1 downto 
DP_LSB); end if; end case; return result; @@ -488,8 +497,8 @@ architecture behaviour of fpu is -- Determine whether to increment when rounding -- Returns rounding_inc & inexact - -- Assumes x includes the bottom 29 bits of the mantissa already - -- if single_prec = 1 (usually arranged by setting set_x = 1 earlier). + -- If single_prec = 1, assumes x includes the bottom 31 (== SP_LSB - 2) + -- bits of the mantissa already (usually arranged by setting set_x = 1 earlier). function fp_rounding(mantissa: std_ulogic_vector(63 downto 0); x: std_ulogic; single_prec: std_ulogic; rn: std_ulogic_vector(2 downto 0); sign: std_ulogic) @@ -499,11 +508,11 @@ architecture behaviour of fpu is variable lsb : std_ulogic; begin if single_prec = '0' then - grx := mantissa(1 downto 0) & x; - lsb := mantissa(2); + grx := mantissa(DP_GBIT downto DP_RBIT) & (x or (or mantissa(DP_RBIT - 1 downto 0))); + lsb := mantissa(DP_LSB); else - grx := mantissa(30 downto 29) & x; - lsb := mantissa(31); + grx := mantissa(SP_GBIT downto SP_RBIT) & x; + lsb := mantissa(SP_LSB); end if; ret(1) := '0'; ret(0) := or (grx); @@ -589,11 +598,11 @@ begin begin if rising_edge(clk) then if r.is_sqrt = '1' then - addrhi := r.b.mantissa(55 downto 54); + addrhi := r.b.mantissa(UNIT_BIT + 1 downto UNIT_BIT); else addrhi := "00"; end if; - addr := addrhi & r.b.mantissa(53 downto 46); + addr := addrhi & r.b.mantissa(UNIT_BIT - 1 downto UNIT_BIT - 8); inverse_est <= '1' & inverse_table(to_integer(unsigned(addr))); end if; end process; @@ -670,6 +679,8 @@ begin variable maddend : std_ulogic_vector(127 downto 0); variable sum : std_ulogic_vector(63 downto 0); variable round_inc : std_ulogic_vector(63 downto 0); + variable rbit_inc : std_ulogic; + variable mult_mask : std_ulogic; variable int_result : std_ulogic; variable illegal : std_ulogic; begin @@ -729,8 +740,8 @@ begin end if; end if; - r_hi_nz <= or (r.r(55 downto 31)); - r_lo_nz <= or (r.r(30 downto 2)); + r_hi_nz <= or (r.r(UNIT_BIT + 1 downto SP_LSB)); + r_lo_nz 
<= or (r.r(SP_LSB - 1 downto DP_LSB)); s_nz <= or (r.s); if r.single_prec = '0' then @@ -761,13 +772,13 @@ begin end if; -- Compare P with zero and with B - px_nz := or (r.p(57 downto 4)); + px_nz := or (r.p(UNIT_BIT + 1 downto 4)); pcmpb_eq := '0'; - if r.p(59 downto 4) = r.b.mantissa(55 downto 0) then + if r.p(59 downto 4) = r.b.mantissa(UNIT_BIT + 1 downto DP_RBIT) then pcmpb_eq := '1'; end if; pcmpb_lt := '0'; - if unsigned(r.p(59 downto 4)) < unsigned(r.b.mantissa(55 downto 0)) then + if unsigned(r.p(59 downto 4)) < unsigned(r.b.mantissa(UNIT_BIT + 1 downto DP_RBIT)) then pcmpb_lt := '1'; end if; @@ -805,6 +816,8 @@ begin pshift := '0'; renorm_sqrt := '0'; shiftin := '0'; + rbit_inc := '0'; + mult_mask := '0'; int_result := '0'; illegal := '0'; case r.state is @@ -870,7 +883,7 @@ begin v.state := DO_FCTI; when "10010" => v.opsel_a := AIN_A; - if v.b.mantissa(54) = '0' and v.a.mantissa(54) = '1' then + if v.b.mantissa(UNIT_BIT) = '0' and v.a.mantissa(UNIT_BIT) = '1' then v.opsel_a := AIN_B; end if; v.state := DO_FDIV; @@ -889,7 +902,7 @@ begin when "11001" => v.is_multiply := '1'; v.opsel_a := AIN_A; - if v.c.mantissa(54) = '0' and v.a.mantissa(54) = '1' then + if v.c.mantissa(UNIT_BIT) = '0' and v.a.mantissa(UNIT_BIT) = '1' then v.opsel_a := AIN_C; end if; v.state := DO_FMUL; @@ -898,9 +911,9 @@ begin v.opsel_a := AIN_B; v.state := DO_FRSQRTE; when "11100" | "11101" | "11110" | "11111" => - if v.a.mantissa(54) = '0' then + if v.a.mantissa(UNIT_BIT) = '0' then v.opsel_a := AIN_A; - elsif v.c.mantissa(54) = '0' then + elsif v.c.mantissa(UNIT_BIT) = '0' then v.opsel_a := AIN_C; else v.opsel_a := AIN_B; @@ -934,7 +947,7 @@ begin v.instr_done := '1'; v.cr_result := "0000"; if r.a.class = INFINITY or r.b.class = ZERO or r.b.class = INFINITY or - (r.b.class = FINITE and r.b.mantissa(53) = '0') then + (r.b.class = FINITE and r.b.mantissa(UNIT_BIT) = '0') then v.cr_result(2) := '1'; end if; if r.a.class = NAN or r.a.class = INFINITY or @@ -952,7 +965,7 @@ begin 
v.instr_done := '1'; v.cr_result := "0000"; if r.b.class = ZERO or r.b.class = INFINITY or - (r.b.class = FINITE and r.b.mantissa(53) = '0') then + (r.b.class = FINITE and r.b.mantissa(UNIT_BIT) = '0') then v.cr_result(2) := '1'; end if; if r.b.class = NAN or r.b.class = INFINITY or r.b.class = ZERO @@ -966,8 +979,8 @@ begin v.instr_done := '1'; update_fx := '1'; v.result_exp := r.b.exponent; - if (r.a.class = NAN and r.a.mantissa(53) = '0') or - (r.b.class = NAN and r.b.mantissa(53) = '0') then + if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or + (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') then -- Signalling NAN v.fpscr(FPSCR_VXSNAN) := '1'; if r.insn(6) = '1' and r.fpscr(FPSCR_VE) = '0' then @@ -1119,7 +1132,7 @@ begin v.result_exp := r.b.exponent; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; - if r.b.class = NAN and r.b.mantissa(53) = '0' then + if r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0' then -- Signalling NAN v.fpscr(FPSCR_VXSNAN) := '1'; invalid := '1'; @@ -1190,7 +1203,7 @@ begin elsif r.b.exponent >= to_signed(52, EXP_BITS) then -- integer already, no rounding required, -- shift into final position - v.shift := r.b.exponent - to_signed(54, EXP_BITS); + v.shift := r.b.exponent - to_signed(UNIT_BIT, EXP_BITS); if r.insn(8) = '1' and r.b.negative = '1' then v.state := INT_OFLOW; else @@ -1214,7 +1227,7 @@ begin v.result_sign := '1'; end if; v.result_class := r.b.class; - v.result_exp := to_signed(54, EXP_BITS); + v.result_exp := to_signed(UNIT_BIT, EXP_BITS); v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; if r.b.class = ZERO then @@ -1286,9 +1299,9 @@ begin if r.a.class = FINITE and r.c.class = FINITE then v.result_exp := r.a.exponent + r.c.exponent; -- Renormalize denorm operands - if r.a.mantissa(54) = '0' then + if r.a.mantissa(UNIT_BIT) = '0' then v.state := RENORM_A; - elsif r.c.mantissa(54) = '0' then + elsif r.c.mantissa(UNIT_BIT) = '0' then v.state := RENORM_C; else f_to_multiply.valid <= '1'; @@ -1325,9 +1338,9 @@ 
begin v.count := "00"; if r.a.class = FINITE and r.b.class = FINITE then -- Renormalize denorm operands - if r.a.mantissa(54) = '0' then + if r.a.mantissa(UNIT_BIT) = '0' then v.state := RENORM_A; - elsif r.b.mantissa(54) = '0' then + elsif r.b.mantissa(UNIT_BIT) = '0' then v.state := RENORM_B; else v.first := '1'; @@ -1384,7 +1397,7 @@ begin if r.b.negative = '1' then v.fpscr(FPSCR_VXSQRT) := '1'; qnan_result := '1'; - elsif r.b.mantissa(54) = '0' then + elsif r.b.mantissa(UNIT_BIT) = '0' then v.state := RENORM_B; elsif r.b.exponent(0) = '0' then v.state := SQRT_1; @@ -1416,7 +1429,7 @@ begin case r.b.class is when FINITE => v.result_exp := - r.b.exponent; - if r.b.mantissa(54) = '0' then + if r.b.mantissa(UNIT_BIT) = '0' then v.state := RENORM_B; else v.state := FRE_1; @@ -1446,7 +1459,7 @@ begin if r.b.negative = '1' then v.fpscr(FPSCR_VXSQRT) := '1'; qnan_result := '1'; - elsif r.b.mantissa(54) = '0' then + elsif r.b.mantissa(UNIT_BIT) = '0' then v.state := RENORM_B; elsif r.b.exponent(0) = '0' then v.state := RSQRT_1; @@ -1488,9 +1501,9 @@ begin mulexp := r.a.exponent + r.c.exponent; v.result_exp := mulexp; -- Make sure A and C are normalized - if r.a.mantissa(54) = '0' then + if r.a.mantissa(UNIT_BIT) = '0' then v.state := RENORM_A; - elsif r.c.mantissa(54) = '0' then + elsif r.c.mantissa(UNIT_BIT) = '0' then v.state := RENORM_C; elsif r.b.class = ZERO then -- no addend, degenerates to multiply @@ -1559,7 +1572,7 @@ begin set_a := '1'; v.result_exp := new_exp; if r.insn(4) = '1' then - if r.c.mantissa(54) = '1' then + if r.c.mantissa(UNIT_BIT) = '1' then if r.insn(3) = '0' or r.b.class = ZERO then v.first := '1'; v.state := MULT_1; @@ -1575,7 +1588,7 @@ begin v.state := RENORM_C; end if; else - if r.b.mantissa(54) = '1' then + if r.b.mantissa(UNIT_BIT) = '1' then v.first := '1'; v.state := DIV_2; else @@ -1654,7 +1667,7 @@ begin opsel_ainv <= '1'; carry_in <= '1'; v.state := FINISH; - elsif r.r(55) = '1' then + elsif r.r(UNIT_BIT + 1) = '1' then -- sum 
overflowed, shift right opsel_r <= RES_SHIFT; set_x := '1'; @@ -1663,10 +1676,10 @@ begin else v.state := ROUNDING; end if; - elsif r.r(54) = '1' then + elsif r.r(UNIT_BIT) = '1' then set_x := '1'; v.state := ROUNDING; - elsif (r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then + elsif (r_hi_nz or r_lo_nz or (or (r.r(DP_LSB - 1 downto 0)))) = '0' then -- r.x must be zero at this point v.result_class := ZERO; if r.is_subtract = '1' then @@ -1753,12 +1766,12 @@ begin opsel_s <= S_NEG; set_s := '1'; end if; - v.shift := to_signed(56, EXP_BITS); + v.shift := to_signed(UNIT_BIT, EXP_BITS); v.state := FMADD_6; when FMADD_6 => - -- r.shift = 56 (or 0, but only if r is now nonzero) - if (r.r(56) or r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then + -- r.shift = UNIT_BIT (or 0, but only if r is now nonzero) + if (r.r(UNIT_BIT + 2) or r_hi_nz or r_lo_nz or (or (r.r(DP_LSB - 1 downto 0)))) = '0' then if s_nz = '0' then -- must be a subtraction, and r.x must be zero v.result_class := ZERO; @@ -1771,7 +1784,7 @@ begin set_s := '1'; -- stay in state FMADD_6 end if; - elsif r.r(56 downto 54) = "001" then + elsif r.r(UNIT_BIT + 2 downto UNIT_BIT) = "001" then v.state := FINISH; else renormalize := '1'; @@ -1835,6 +1848,7 @@ begin set_y := r.first; f_to_multiply.valid <= r.first; pshift := '1'; + mult_mask := '1'; if multiply_to_f.valid = '1' then opsel_r <= RES_MULT; v.first := '1'; @@ -1853,13 +1867,15 @@ begin end if; when DIV_6 => + -- r.opsel_a = AIN_R -- test if remainder is 0 or >= B if pcmpb_lt = '1' then -- quotient is correct, set X if remainder non-zero - v.x := r.p(58) or px_nz; + v.x := r.p(UNIT_BIT + 2) or px_nz; else - -- quotient needs to be incremented by 1 - carry_in <= '1'; + -- quotient needs to be incremented by 1 in R-bit position + rbit_inc := '1'; + opsel_b <= BIN_RND; v.x := not pcmpb_eq; end if; v.state := FINISH; @@ -1913,6 +1929,7 @@ begin msel_2 <= MUL2_R; set_y := r.first; pshift := '1'; + mult_mask := '1'; if multiply_to_f.valid = '1' then -- put 
result into R opsel_r <= RES_MULT; @@ -1957,6 +1974,7 @@ begin set_y := r.first; -- wait for second multiply (should be here already) pshift := '1'; + mult_mask := '1'; if multiply_to_f.valid = '1' then -- put result into R opsel_r <= RES_MULT; @@ -2001,11 +2019,8 @@ begin end if; when SQRT_10 => - -- Add the bottom 8 bits of P, sign-extended, - -- divided by 4, onto R. - -- The division by 4 is because R is 10.54 format - -- whereas P is 8.56 format. - opsel_b <= BIN_PS6; + -- Add the bottom 8 bits of P, sign-extended, onto R. + opsel_b <= BIN_PS8; sqrt_exp := r.b.exponent(EXP_BITS-1) & r.b.exponent(EXP_BITS-1 downto 1); v.result_exp := sqrt_exp; v.shift := to_signed(1, EXP_BITS); @@ -2030,7 +2045,7 @@ begin -- test if remainder is 0 or >= B = 2*R + 1 if pcmpb_lt = '1' then -- square root is correct, set X if remainder non-zero - v.x := r.p(58) or px_nz; + v.x := r.p(UNIT_BIT + 2) or px_nz; else -- square root needs to be incremented by 1 carry_in <= '1'; @@ -2043,10 +2058,10 @@ begin opsel_r <= RES_SHIFT; set_x := '1'; v.state := INT_ROUND; - v.shift := to_signed(-2, EXP_BITS); + v.shift := to_signed(52 - UNIT_BIT, EXP_BITS); when INT_ROUND => - -- r.shift = -2 + -- r.shift = -4 (== 52 - UNIT_BIT) opsel_r <= RES_SHIFT; round := fp_rounding(r.r, r.x, '0', r.round_mode, r.result_sign); v.fpscr(FPSCR_FR downto FPSCR_FI) := round; @@ -2059,7 +2074,7 @@ begin end if; when INT_ISHIFT => - -- r.shift = b.exponent - 54; + -- r.shift = b.exponent - UNIT_BIT; opsel_r <= RES_SHIFT; v.state := INT_FINAL; @@ -2129,7 +2144,7 @@ begin if r.is_multiply = '1' and px_nz = '1' then v.x := '1'; end if; - if r.r(63 downto 54) /= "0000000001" then + if r.r(63 downto UNIT_BIT) /= std_ulogic_vector(to_unsigned(1, 64 - UNIT_BIT)) then renormalize := '1'; v.state := NORMALIZE; else @@ -2172,7 +2187,7 @@ begin -- if denormalized, have to normalize before rounding v.fpscr(FPSCR_UX) := '1'; v.result_exp := r.result_exp + bias_exp; - if r.r(54) = '0' then + if r.r(UNIT_BIT) = '0' then 
renormalize := '1'; v.state := NORMALIZE; else @@ -2215,7 +2230,7 @@ begin v.shift := to_signed(-1, EXP_BITS); v.state := ROUNDING_2; else - if r.r(54) = '0' then + if r.r(UNIT_BIT) = '0' then -- result after masking could be zero, or could be a -- denormalized result that needs to be renormalized renormalize := '1'; @@ -2235,14 +2250,14 @@ begin -- Check for overflow during rounding -- r.shift = -1 v.x := '0'; - if r.r(55) = '1' then + if r.r(UNIT_BIT + 1) = '1' then opsel_r <= RES_SHIFT; if exp_huge = '1' then v.state := ROUND_OFLOW; else arith_done := '1'; end if; - elsif r.r(54) = '0' then + elsif r.r(UNIT_BIT) = '0' then -- Do CLZ so we can renormalize the result renormalize := '1'; v.state := ROUNDING_3; @@ -2278,9 +2293,9 @@ begin arith_done := '1'; when NAN_RESULT => - if (r.use_a = '1' and r.a.class = NAN and r.a.mantissa(53) = '0') or - (r.use_b = '1' and r.b.class = NAN and r.b.mantissa(53) = '0') or - (r.use_c = '1' and r.c.class = NAN and r.c.mantissa(53) = '0') then + if (r.use_a = '1' and r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or + (r.use_b = '1' and r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or + (r.use_c = '1' and r.c.class = NAN and r.c.mantissa(QNAN_BIT) = '0') then -- Signalling NAN v.fpscr(FPSCR_VXSNAN) := '1'; invalid := '1'; @@ -2343,39 +2358,41 @@ begin -- Multiplier and divide/square root data path case msel_1 is when MUL1_A => - f_to_multiply.data1 <= r.a.mantissa(61 downto 0) & "00"; + f_to_multiply.data1 <= r.a.mantissa; when MUL1_B => - f_to_multiply.data1 <= r.b.mantissa(61 downto 0) & "00"; + f_to_multiply.data1 <= r.b.mantissa; when MUL1_Y => f_to_multiply.data1 <= r.y; when others => - f_to_multiply.data1 <= r.r(61 downto 0) & "00"; + f_to_multiply.data1 <= r.r; end case; case msel_2 is when MUL2_C => - f_to_multiply.data2 <= r.c.mantissa(61 downto 0) & "00"; + f_to_multiply.data2 <= r.c.mantissa; when MUL2_LUT => - f_to_multiply.data2 <= x"00" & inverse_est & '0' & x"000000000"; + f_to_multiply.data2 <= 
std_ulogic_vector(shift_left(resize(unsigned(inverse_est), 64), + UNIT_BIT - 19)); when MUL2_P => f_to_multiply.data2 <= r.p; when others => - f_to_multiply.data2 <= r.r(61 downto 0) & "00"; + f_to_multiply.data2 <= r.r; end case; maddend := (others => '0'); case msel_add is when MULADD_CONST => -- addend is 2.0 or 1.5 in 16.112 format if r.is_sqrt = '0' then - maddend(113) := '1'; -- 2.0 + maddend(2*UNIT_BIT + 1) := '1'; -- 2.0 else - maddend(112 downto 111) := "11"; -- 1.5 + maddend(2*UNIT_BIT downto 2*UNIT_BIT - 1) := "11"; -- 1.5 end if; when MULADD_A => -- addend is A in 16.112 format - maddend(121 downto 58) := r.a.mantissa; + maddend(UNIT_BIT + 63 downto UNIT_BIT) := r.a.mantissa; when MULADD_RS => -- addend is concatenation of R and S in 16.112 format - maddend := "000000" & r.r & r.s & "00"; + maddend(UNIT_BIT + 63 downto UNIT_BIT) := r.r; + maddend(UNIT_BIT - 1 downto 0) := r.s; when others => end case; if msel_inv = '1' then @@ -2391,7 +2408,7 @@ begin if pshift = '0' then v.p := multiply_to_f.result(63 downto 0); else - v.p := multiply_to_f.result(119 downto 56); + v.p := multiply_to_f.result(UNIT_BIT + 63 downto UNIT_BIT); end if; end if; @@ -2433,11 +2450,15 @@ begin when BIN_R => in_b0 := r.r; when BIN_RND => - round_inc := (31 => r.single_prec, 2 => not r.single_prec, others => '0'); + if rbit_inc = '0' then + round_inc := (SP_LSB => r.single_prec, DP_LSB => not r.single_prec, others => '0'); + else + round_inc := (DP_RBIT => '1', others => '0'); + end if; in_b0 := round_inc; when others => - -- BIN_PS6, 6 LSBs of P/4 sign-extended to 64 - in_b0 := std_ulogic_vector(resize(signed(r.p(7 downto 2)), 64)); + -- BIN_PS8, 8 LSBs of P sign-extended to 64 + in_b0 := std_ulogic_vector(resize(signed(r.p(7 downto 0)), 64)); end case; if opsel_binv = '1' then in_b0 := not in_b0; @@ -2451,9 +2472,9 @@ begin end if; sum := std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in); if opsel_mask = '1' then - sum(1 downto 0) := "00"; + sum(DP_LSB - 1 downto 0) 
:= "0000"; if r.single_prec = '1' then - sum(30 downto 2) := (others => '0'); + sum(SP_LSB - 1 downto DP_LSB) := (others => '0'); end if; end if; case opsel_r is @@ -2462,20 +2483,25 @@ begin when RES_SHIFT => result <= shift_res; when RES_MULT => - result <= multiply_to_f.result(121 downto 58); + result <= multiply_to_f.result(UNIT_BIT + 63 downto UNIT_BIT); + if mult_mask = '1' then + -- trim to 54 fraction bits if mult_mask = 1, for quotient when dividing + result(UNIT_BIT - 55 downto 0) <= (others => '0'); + end if; when others => + misc := (others => '0'); case misc_sel is when "0000" => misc := x"00000000" & (r.fpscr and fpscr_mask); when "0001" => -- generated QNaN mantissa - misc := x"0020000000000000"; + misc(QNAN_BIT) := '1'; when "0010" => -- mantissa of max representable DP number - misc := x"007ffffffffffffc"; + misc(UNIT_BIT downto DP_LSB) := (others => '1'); when "0011" => -- mantissa of max representable SP number - misc := x"007fffff80000000"; + misc(UNIT_BIT downto SP_LSB) := (others => '1'); when "0100" => -- fmrgow result misc := r.a.mantissa(31 downto 0) & r.b.mantissa(31 downto 0); @@ -2483,7 +2509,8 @@ begin -- fmrgew result misc := r.a.mantissa(63 downto 32) & r.b.mantissa(63 downto 32); when "0111" => - misc := 10x"000" & inverse_est & 35x"000000000"; + misc := std_ulogic_vector(shift_left(resize(unsigned(inverse_est), 64), + UNIT_BIT - 19)); when "1000" => -- max positive result for fctiw[z] misc := x"000000007fffffff"; @@ -2509,7 +2536,6 @@ begin -- max negative result for fctidu[z] misc := x"0000000000000000"; when others => - misc := x"0000000000000000"; end case; result <= misc; end case; @@ -2519,7 +2545,7 @@ begin when S_NEG => v.s := std_ulogic_vector(unsigned(not r.s) + (not r.x)); when S_MULT => - v.s := multiply_to_f.result(57 downto 2); + v.s := multiply_to_f.result(55 downto 0); when S_SHIFT => v.s := shift_res(63 downto 8); if shift_res(7 downto 0) /= x"00" then @@ -2553,12 +2579,12 @@ begin -- make denormalized value end up 
with even exponent clz(0) := '1'; end if; - v.shift := resize(signed('0' & clz) - 9, EXP_BITS); + v.shift := resize(signed('0' & clz) - (63 - UNIT_BIT), EXP_BITS); end if; if r.update_fprf = '1' then v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.result_sign, r.result_class, - r.r(54) and not r.denorm); + r.r(UNIT_BIT) and not r.denorm); end if; v.fpscr(FPSCR_VX) := (or (v.fpscr(FPSCR_VXSNAN downto FPSCR_VXVC))) or From a95f8aab385b05ba6f5b7bedd6fbe1e97669ebdb Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 7 May 2022 18:28:33 +1000 Subject: [PATCH 19/30] FPU: Add integer division logic to FPU This adds logic to the FPU to accomplish 64-bit integer divisions. No instruction actually uses this yet. The algorithm used is to obtain an estimate of the reciprocal of the divisor using the lookup table and refine it by one to three iterations of the Newton-Raphson algorithm (the number of iterations depends on the number of significant bits in the dividend). Then the reciprocal is multiplied by the dividend to get the quotient estimate. The remainder is calculated as dividend - quotient * divisor. If the remainder is greater than or equal to the divisor, the quotient is incremented, or if a modulo operation is being done, the divisor is subtracted from the remainder. The inverse estimate after refinement is good enough that the quotient estimate is always equal to or one less than the true quotient. 
Signed-off-by: Paul Mackerras --- common.vhdl | 34 ++-- execute1.vhdl | 1 + fpu.vhdl | 525 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 541 insertions(+), 19 deletions(-) diff --git a/common.vhdl b/common.vhdl index 54a87d2..aa7b830 100644 --- a/common.vhdl +++ b/common.vhdl @@ -627,27 +627,29 @@ package common is srr1 => (others => '0'), msr => (others => '0')); type Execute1ToFPUType is record - valid : std_ulogic; - op : insn_type_t; - nia : std_ulogic_vector(63 downto 0); - itag : instr_tag_t; - insn : std_ulogic_vector(31 downto 0); - single : std_ulogic; - fe_mode : std_ulogic_vector(1 downto 0); - fra : std_ulogic_vector(63 downto 0); - frb : std_ulogic_vector(63 downto 0); - frc : std_ulogic_vector(63 downto 0); - frt : gspr_index_t; - rc : std_ulogic; - out_cr : std_ulogic; - stall : std_ulogic; + valid : std_ulogic; + op : insn_type_t; + nia : std_ulogic_vector(63 downto 0); + itag : instr_tag_t; + insn : std_ulogic_vector(31 downto 0); + single : std_ulogic; + is_signed : std_ulogic; + fe_mode : std_ulogic_vector(1 downto 0); + fra : std_ulogic_vector(63 downto 0); + frb : std_ulogic_vector(63 downto 0); + frc : std_ulogic_vector(63 downto 0); + frt : gspr_index_t; + rc : std_ulogic; + out_cr : std_ulogic; + stall : std_ulogic; end record; constant Execute1ToFPUInit : Execute1ToFPUType := (valid => '0', op => OP_ILLEGAL, nia => (others => '0'), itag => instr_tag_init, - insn => (others => '0'), fe_mode => "00", rc => '0', + insn => (others => '0'), fe_mode => "00", rc => '0', fra => (others => '0'), frb => (others => '0'), frc => (others => '0'), frt => (others => '0'), - single => '0', out_cr => '0', stall => '0'); + single => '0', is_signed => '0', out_cr => '0', + stall => '0'); type FPUToExecute1Type is record busy : std_ulogic; diff --git a/execute1.vhdl b/execute1.vhdl index 6fadc8c..2121963 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -1449,6 +1449,7 @@ begin fv.insn := e_in.insn; fv.itag := e_in.instr_tag; fv.single := 
e_in.is_32bit; + fv.is_signed := e_in.is_signed; fv.fe_mode := ex1.msr(MSR_FE0) & ex1.msr(MSR_FE1); fv.fra := a_in; fv.frb := b_in; diff --git a/fpu.vhdl b/fpu.vhdl index 27587f7..18d3a5a 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -75,7 +75,19 @@ architecture behaviour of fpu is RENORM_A, RENORM_A2, RENORM_B, RENORM_B2, RENORM_C, RENORM_C2, - NAN_RESULT, EXC_RESULT); + NAN_RESULT, EXC_RESULT, + DO_IDIVMOD, + IDIV_NORMB, IDIV_NORMB2, IDIV_NORMB3, + IDIV_CLZA, IDIV_CLZA2, IDIV_CLZA3, + IDIV_NR0, IDIV_NR1, IDIV_NR2, IDIV_USE0_5, + IDIV_DODIV, + IDIV_DIV, IDIV_DIV2, IDIV_DIV3, IDIV_DIV4, IDIV_DIV5, + IDIV_DIV6, IDIV_DIV7, IDIV_DIV8, IDIV_DIV9, + IDIV_EXT_TBH, IDIV_EXT_TBH2, IDIV_EXT_TBH3, + IDIV_EXT_TBH4, IDIV_EXT_TBH5, + IDIV_EXTDIV, IDIV_EXTDIV1, IDIV_EXTDIV2, IDIV_EXTDIV3, + IDIV_EXTDIV4, IDIV_EXTDIV5, IDIV_EXTDIV6, + IDIV_MODADJ, IDIV_MODSUB, IDIV_DIVADJ, IDIV_OVFCHK, IDIV_DONE, IDIV_ZERO); type reg_type is record state : state_t; @@ -139,6 +151,14 @@ architecture behaviour of fpu is invalid : std_ulogic; negate : std_ulogic; longmask : std_ulogic; + divext : std_ulogic; + divmod : std_ulogic; + is_signed : std_ulogic; + int_ovf : std_ulogic; + div_close : std_ulogic; + inc_quot : std_ulogic; + a_hi : std_ulogic_vector(7 downto 0); + a_lo : std_ulogic_vector(55 downto 0); end record; type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0); @@ -159,6 +179,7 @@ architecture behaviour of fpu is signal lost_bits : std_ulogic; signal r_hi_nz : std_ulogic; signal r_lo_nz : std_ulogic; + signal r_gt_1 : std_ulogic; signal s_nz : std_ulogic; signal misc_sel : std_ulogic_vector(3 downto 0); signal f_to_multiply : MultiplyInputType; @@ -663,7 +684,12 @@ begin variable msb : std_ulogic; variable is_add : std_ulogic; variable set_a : std_ulogic; + variable set_a_exp : std_ulogic; + variable set_a_mant : std_ulogic; + variable set_a_hi : std_ulogic; + variable set_a_lo : std_ulogic; variable set_b : std_ulogic; + variable set_b_mant : std_ulogic; variable set_c : 
std_ulogic; variable set_y : std_ulogic; variable set_s : std_ulogic; @@ -671,10 +697,13 @@ begin variable px_nz : std_ulogic; variable pcmpb_eq : std_ulogic; variable pcmpb_lt : std_ulogic; + variable pcmpc_eq : std_ulogic; + variable pcmpc_lt : std_ulogic; variable pshift : std_ulogic; variable renorm_sqrt : std_ulogic; variable sqrt_exp : signed(EXP_BITS-1 downto 0); variable shiftin : std_ulogic; + variable shiftin0 : std_ulogic; variable mulexp : signed(EXP_BITS-1 downto 0); variable maddend : std_ulogic_vector(127 downto 0); variable sum : std_ulogic_vector(63 downto 0); @@ -722,6 +751,11 @@ begin v.is_sqrt := '0'; v.add_bsmall := '0'; v.doing_ftdiv := "00"; + v.divext := e_in.insn(8) and not e_in.insn(7); + v.divmod := not e_in.insn(8); + v.is_signed := e_in.is_signed; + v.int_ovf := '0'; + v.div_close := '0'; adec := decode_dp(e_in.fra, int_input); bdec := decode_dp(e_in.frb, int_input); @@ -738,10 +772,14 @@ begin if (adec.exponent + cdec.exponent + 1) >= bdec.exponent then v.madd_cmp := '1'; end if; + + v.a_hi := 8x"0"; + v.a_lo := 56x"0"; end if; r_hi_nz <= or (r.r(UNIT_BIT + 1 downto SP_LSB)); r_lo_nz <= or (r.r(SP_LSB - 1 downto DP_LSB)); + r_gt_1 <= or (r.r(63 downto 1)); s_nz <= or (r.s); if r.single_prec = '0' then @@ -781,6 +819,14 @@ begin if unsigned(r.p(59 downto 4)) < unsigned(r.b.mantissa(UNIT_BIT + 1 downto DP_RBIT)) then pcmpb_lt := '1'; end if; + pcmpc_eq := '0'; + if r.p = r.c.mantissa then + pcmpc_eq := '1'; + end if; + pcmpc_lt := '0'; + if unsigned(r.p) < unsigned(r.c.mantissa) then + pcmpc_lt := '1'; + end if; v.update_fprf := '0'; v.shift := to_signed(0, EXP_BITS); @@ -803,7 +849,12 @@ begin set_x := '0'; qnan_result := '0'; set_a := '0'; + set_a_exp := '0'; + set_a_mant := '0'; + set_a_hi := '0'; + set_a_lo := '0'; set_b := '0'; + set_b_mant := '0'; set_c := '0'; set_s := '0'; f_to_multiply.is_32bit <= '0'; @@ -816,6 +867,7 @@ begin pshift := '0'; renorm_sqrt := '0'; shiftin := '0'; + shiftin0 := '0'; rbit_inc := '0'; mult_mask := 
'0'; int_result := '0'; @@ -866,6 +918,10 @@ begin else v.state := DO_FRI; end if; + when "01001" => + -- integer divides and mods, major opcode 31 + v.opsel_a := AIN_B; + v.state := DO_IDIVMOD; when "01100" => v.opsel_a := AIN_B; v.state := DO_FRSP; @@ -2327,6 +2383,451 @@ begin end case; arith_done := '1'; + when DO_IDIVMOD => + -- r.opsel_a = AIN_B + v.result_sign := r.is_signed and (r.a.negative xor (r.b.negative and not r.divmod)); + if r.b.class = ZERO then + -- B is zero, signal overflow + v.int_ovf := '1'; + v.state := IDIV_ZERO; + elsif r.a.class = ZERO then + -- A is zero, result is zero (both for div and for mod) + v.state := IDIV_ZERO; + else + -- take absolute value for signed division, and + -- normalize and round up B to 8.56 format, like fcfid[u] + if r.is_signed = '1' and r.b.negative = '1' then + opsel_ainv <= '1'; + carry_in <= '1'; + end if; + v.result_class := FINITE; + v.result_exp := to_signed(UNIT_BIT, EXP_BITS); + v.state := IDIV_NORMB; + end if; + when IDIV_NORMB => + -- do count-leading-zeroes on B (now in R) + renormalize := '1'; + -- save the original value of B or |B| in C + set_c := '1'; + v.state := IDIV_NORMB2; + when IDIV_NORMB2 => + -- get B into the range [1, 2) in 8.56 format + set_x := '1'; -- record if any 1 bits shifted out + opsel_r <= RES_SHIFT; + v.state := IDIV_NORMB3; + when IDIV_NORMB3 => + -- add the X bit onto R to round up B + carry_in <= r.x; + -- prepare to do count-leading-zeroes on A + v.opsel_a := AIN_A; + v.state := IDIV_CLZA; + when IDIV_CLZA => + set_b := '1'; -- put R back into B + -- r.opsel_a = AIN_A + if r.is_signed = '1' and r.a.negative = '1' then + opsel_ainv <= '1'; + carry_in <= '1'; + end if; + v.result_exp := to_signed(UNIT_BIT, EXP_BITS); + v.opsel_a := AIN_C; + v.state := IDIV_CLZA2; + when IDIV_CLZA2 => + -- r.opsel_a = AIN_C + renormalize := '1'; + -- write the dividend back into A in case we negated it + set_a_mant := '1'; + -- while doing the count-leading-zeroes on A, + -- also compute A - B 
to tell us whether A >= B + -- (using the original value of B, which is now in C) + opsel_b <= BIN_R; + opsel_ainv <= '1'; + carry_in <= '1'; + v.state := IDIV_CLZA3; + when IDIV_CLZA3 => + -- save the exponent of A (but don't overwrite the mantissa) + v.a.exponent := new_exp; + v.div_close := '0'; + if new_exp = r.b.exponent then + v.div_close := '1'; + end if; + v.state := IDIV_NR0; + if new_exp > r.b.exponent or (v.div_close = '1' and r.r(63) = '0') then + -- A >= B, overflow if extended division + if r.divext = '1' then + v.int_ovf := '1'; + -- return 0 in overflow cases + v.state := IDIV_ZERO; + end if; + else + -- A < B, result is zero for normal division + if r.divmod = '0' and r.divext = '0' then + v.state := IDIV_ZERO; + end if; + end if; + when IDIV_NR0 => + -- reduce number of Newton-Raphson iterations for small A + if r.divext = '1' or new_exp >= to_signed(32, EXP_BITS) then + v.count := "00"; + elsif new_exp >= to_signed(16, EXP_BITS) then + v.count := "01"; + else + v.count := "10"; + end if; + -- first NR iteration does Y = LUT; P = 2 - B * LUT + msel_1 <= MUL1_B; + msel_add <= MULADD_CONST; + msel_inv <= '1'; + msel_2 <= MUL2_LUT; + set_y := '1'; + if r.b.mantissa(UNIT_BIT + 1) = '1' then + -- rounding up of the mantissa caused overflow, meaning the + -- normalized B is 2.0. Since this is outside the range + -- of the LUT, just use 0.5 as the estimated inverse. 
+ v.state := IDIV_USE0_5; + else + -- start the first multiply now + f_to_multiply.valid <= '1'; + -- note we don't set v.first, thus the following IDIV_NR1 + -- state doesn't start a multiply (we already did that) + v.state := IDIV_NR1; + end if; + when IDIV_NR1 => + -- subsequent NR iterations do Y = P; P = 2 - B * P + msel_1 <= MUL1_B; + msel_add <= MULADD_CONST; + msel_inv <= '1'; + msel_2 <= MUL2_P; + set_y := r.first; + pshift := '1'; + f_to_multiply.valid <= r.first; + if multiply_to_f.valid = '1' then + v.first := '1'; + v.count := r.count + 1; + v.state := IDIV_NR2; + end if; + when IDIV_NR2 => + -- compute P = Y * P + msel_1 <= MUL1_Y; + msel_2 <= MUL2_P; + f_to_multiply.valid <= r.first; + pshift := '1'; + v.opsel_a := AIN_A; + v.shift := to_signed(64, EXP_BITS); + -- Get 0.5 into R in case the inverse estimate turns out to be + -- less than 0.5, in which case we want to use 0.5, to avoid + -- infinite loops in some cases. + opsel_r <= RES_MISC; + misc_sel <= "0001"; + if multiply_to_f.valid = '1' then + v.first := '1'; + if r.count = "11" then + v.state := IDIV_DODIV; + else + v.state := IDIV_NR1; + end if; + end if; + when IDIV_USE0_5 => + -- Get 0.5 into R; it turns out the generated + -- QNaN mantissa is actually what we want + opsel_r <= RES_MISC; + misc_sel <= "0001"; + v.opsel_a := AIN_A; + v.shift := to_signed(64, EXP_BITS); + v.state := IDIV_DODIV; + when IDIV_DODIV => + -- r.opsel_a = AIN_A + -- r.shift = 64 + -- inverse estimate is in P or in R; copy it to Y + if r.b.mantissa(UNIT_BIT + 1) = '1' or + (r.p(UNIT_BIT) = '0' and r.p(UNIT_BIT - 1) = '0') then + msel_2 <= MUL2_R; + else + msel_2 <= MUL2_P; + end if; + set_y := '1'; + -- shift_res is 0 because r.shift = 64; + -- put that into B, which now holds the quotient + set_b_mant := '1'; + if r.divext = '0' then + v.shift := to_signed(-UNIT_BIT, EXP_BITS); + v.first := '1'; + v.state := IDIV_DIV; + elsif r.div_close = '0' then + v.shift := to_signed(64 - UNIT_BIT, EXP_BITS); + v.state := 
IDIV_EXTDIV; + else + -- handle top bit of quotient specially + -- for this we need the divisor left-justified in B + v.opsel_a := AIN_C; + v.state := IDIV_EXT_TBH; + end if; + when IDIV_DIV => + -- Dividing A by C, r.shift = -56; A is in R + -- Put A into the bottom 64 bits of Ahi/A/Alo + set_a_mant := r.first; + set_a_lo := r.first; + -- compute R = R * Y (quotient estimate) + msel_1 <= MUL1_Y; + msel_2 <= MUL2_R; + f_to_multiply.valid <= r.first; + pshift := '1'; + opsel_r <= RES_MULT; + v.shift := - r.b.exponent; + if multiply_to_f.valid = '1' then + v.state := IDIV_DIV2; + end if; + when IDIV_DIV2 => + -- r.shift = - b.exponent + -- shift the quotient estimate right by b.exponent bits + opsel_r <= RES_SHIFT; + v.first := '1'; + v.state := IDIV_DIV3; + when IDIV_DIV3 => + -- quotient (so far) is in R; multiply by C and subtract from A + msel_1 <= MUL1_R; + msel_2 <= MUL2_C; + msel_add <= MULADD_A; + msel_inv <= '1'; + f_to_multiply.valid <= r.first; + -- store the current quotient estimate in B + set_b_mant := r.first; + opsel_r <= RES_MULT; + opsel_s <= S_MULT; + set_s := '1'; + if multiply_to_f.valid = '1' then + v.state := IDIV_DIV4; + end if; + when IDIV_DIV4 => + -- remainder is in R/S and P + msel_1 <= MUL1_Y; + msel_2 <= MUL2_P; + v.inc_quot := not pcmpc_lt and not r.divmod; + if r.divmod = '0' then + v.opsel_a := AIN_B; + end if; + v.shift := to_signed(UNIT_BIT, EXP_BITS); + if pcmpc_lt = '1' or pcmpc_eq = '1' then + if r.divmod = '0' then + v.state := IDIV_DIVADJ; + elsif pcmpc_eq = '1' then + v.state := IDIV_ZERO; + else + v.state := IDIV_MODADJ; + end if; + else + -- need to do another iteration, compute P * Y + f_to_multiply.valid <= '1'; + v.state := IDIV_DIV5; + end if; + when IDIV_DIV5 => + pshift := '1'; + opsel_r <= RES_MULT; + v.shift := - r.b.exponent; + if multiply_to_f.valid = '1' then + v.state := IDIV_DIV6; + end if; + when IDIV_DIV6 => + -- r.shift = - b.exponent + -- shift the quotient estimate right by b.exponent bits + opsel_r <= 
RES_SHIFT; + v.opsel_a := AIN_B; + v.first := '1'; + v.state := IDIV_DIV7; + when IDIV_DIV7 => + -- r.opsel_a = AIN_B + -- add shifted quotient delta onto the total quotient + opsel_b <= BIN_R; + v.first := '1'; + v.state := IDIV_DIV8; + when IDIV_DIV8 => + -- quotient (so far) is in R; multiply by C and subtract from A + msel_1 <= MUL1_R; + msel_2 <= MUL2_C; + msel_add <= MULADD_A; + msel_inv <= '1'; + f_to_multiply.valid <= r.first; + -- store the current quotient estimate in B + set_b_mant := r.first; + opsel_r <= RES_MULT; + opsel_s <= S_MULT; + set_s := '1'; + if multiply_to_f.valid = '1' then + v.state := IDIV_DIV9; + end if; + when IDIV_DIV9 => + -- remainder is in R/S and P + msel_1 <= MUL1_Y; + msel_2 <= MUL2_P; + v.inc_quot := not pcmpc_lt and not r.divmod; + if r.divmod = '0' then + v.opsel_a := AIN_B; + end if; + v.shift := to_signed(UNIT_BIT, EXP_BITS); + if r.divmod = '0' then + v.state := IDIV_DIVADJ; + elsif pcmpc_eq = '1' then + v.state := IDIV_ZERO; + else + v.state := IDIV_MODADJ; + end if; + when IDIV_EXT_TBH => + -- r.opsel_a = AIN_C; get divisor into R and prepare to shift left + v.shift := to_signed(63, EXP_BITS) - r.b.exponent; + v.opsel_a := AIN_A; + v.state := IDIV_EXT_TBH2; + when IDIV_EXT_TBH2 => + -- r.opsel_a = AIN_A; divisor is in R + -- r.shift = 63 - b.exponent; shift and put into B + set_b_mant := '1'; + v.shift := to_signed(64 - UNIT_BIT, EXP_BITS); + v.state := IDIV_EXT_TBH3; + when IDIV_EXT_TBH3 => + -- Dividing (A << 64) by C + -- r.shift = 8 + -- Put A in the top 64 bits of Ahi/A/Alo + set_a_hi := '1'; + set_a_mant := '1'; + v.shift := to_signed(64, EXP_BITS) - r.b.exponent; + v.state := IDIV_EXT_TBH4; + when IDIV_EXT_TBH4 => + -- dividend (A) is in R + -- r.shift = 64 - B.exponent, so is at least 1 + opsel_r <= RES_SHIFT; + -- top bit of A gets lost in the shift, so handle it specially + v.opsel_a := AIN_B; + v.shift := to_signed(63, EXP_BITS); + v.state := IDIV_EXT_TBH5; + when IDIV_EXT_TBH5 => + -- r.opsel_a = AIN_B, 
r.shift = 63 + -- shifted dividend is in R, subtract left-justified divisor + opsel_b <= BIN_R; + opsel_ainv <= '1'; + carry_in <= '1'; + -- and put 1<<63 into B as the divisor (S is still 0) + shiftin0 := '1'; + set_b_mant := '1'; + v.first := '1'; + v.state := IDIV_EXTDIV2; + when IDIV_EXTDIV => + -- Dividing (A << 64) by C + -- r.shift = 8 + -- Put A in the top 64 bits of Ahi/A/Alo + set_a_hi := '1'; + set_a_mant := '1'; + v.shift := to_signed(64, EXP_BITS) - r.b.exponent; + v.state := IDIV_EXTDIV1; + when IDIV_EXTDIV1 => + -- dividend is in R + -- r.shift = 64 - B.exponent + opsel_r <= RES_SHIFT; + v.first := '1'; + v.state := IDIV_EXTDIV2; + when IDIV_EXTDIV2 => + -- shifted remainder is in R; compute R = R * Y (quotient estimate) + msel_1 <= MUL1_Y; + msel_2 <= MUL2_R; + f_to_multiply.valid <= r.first; + pshift := '1'; + v.opsel_a := AIN_B; + opsel_r <= RES_MULT; + if multiply_to_f.valid = '1' then + v.first := '1'; + v.state := IDIV_EXTDIV3; + end if; + when IDIV_EXTDIV3 => + -- r.opsel_a = AIN_B + -- delta quotient is in R; add it to B + opsel_b <= BIN_R; + v.first := '1'; + v.state := IDIV_EXTDIV4; + when IDIV_EXTDIV4 => + -- quotient is in R; put it in B and compute remainder + set_b_mant := r.first; + msel_1 <= MUL1_R; + msel_2 <= MUL2_C; + msel_add <= MULADD_A; + msel_inv <= '1'; + f_to_multiply.valid <= r.first; + opsel_r <= RES_MULT; + opsel_s <= S_MULT; + set_s := '1'; + v.shift := to_signed(UNIT_BIT, EXP_BITS) - r.b.exponent; + if multiply_to_f.valid = '1' then + v.state := IDIV_EXTDIV5; + end if; + when IDIV_EXTDIV5 => + -- r.shift = 56 - r.b.exponent + -- remainder is in R/S; shift it right r.b.exponent bits + opsel_r <= RES_SHIFT; + -- test LS 64b of remainder in P against divisor in C + v.inc_quot := not pcmpc_lt; + v.opsel_a := AIN_B; + v.state := IDIV_EXTDIV6; + when IDIV_EXTDIV6 => + -- r.opsel_a = AIN_B + -- shifted remainder is in R, see if it is > 1 + -- and compute R = R * Y if so + msel_1 <= MUL1_Y; + msel_2 <= MUL2_R; + pshift := '1'; +
if r_gt_1 = '1' then + f_to_multiply.valid <= '1'; + v.state := IDIV_EXTDIV2; + else + v.state := IDIV_DIVADJ; + end if; + when IDIV_MODADJ => + -- r.shift = 56 + -- result is in R/S + opsel_r <= RES_SHIFT; + if pcmpc_lt = '0' then + v.opsel_a := AIN_C; + v.state := IDIV_MODSUB; + elsif r.result_sign = '0' then + v.state := IDIV_DONE; + else + v.state := IDIV_DIVADJ; + end if; + when IDIV_MODSUB => + -- r.opsel_a = AIN_C + -- Subtract divisor from remainder + opsel_ainv <= '1'; + carry_in <= '1'; + opsel_b <= BIN_R; + if r.result_sign = '0' then + v.state := IDIV_DONE; + else + v.state := IDIV_DIVADJ; + end if; + when IDIV_DIVADJ => + -- result (so far) is on the A input of the adder + -- set carry to increment quotient if needed + -- and also negate R if the answer is negative + opsel_ainv <= r.result_sign; + carry_in <= r.inc_quot xor r.result_sign; + if r.is_signed = '0' then + v.state := IDIV_DONE; + else + v.state := IDIV_OVFCHK; + end if; + when IDIV_OVFCHK => + v.int_ovf := r.r(63) xor r.result_sign; + if v.int_ovf = '1' then + v.state := IDIV_ZERO; + else + v.state := IDIV_DONE; + end if; + when IDIV_DONE => + int_result := '1'; + v.writing_fpr := '1'; + v.instr_done := '1'; + when IDIV_ZERO => + opsel_r <= RES_MISC; + misc_sel <= "0101"; + int_result := '1'; + v.writing_fpr := '1'; + v.instr_done := '1'; + end case; if zero_divide = '1' then @@ -2388,7 +2889,9 @@ begin end if; when MULADD_A => -- addend is A in 16.112 format + maddend(127 downto UNIT_BIT + 64) := r.a_hi; maddend(UNIT_BIT + 63 downto UNIT_BIT) := r.a.mantissa; + maddend(UNIT_BIT - 1 downto 0) := r.a_lo; when MULADD_RS => -- addend is concatenation of R and S in 16.112 format maddend(UNIT_BIT + 63 downto UNIT_BIT) := r.r; @@ -2465,7 +2968,8 @@ begin end if; in_b <= in_b0; if r.shift >= to_signed(-64, EXP_BITS) and r.shift <= to_signed(63, EXP_BITS) then - shift_res := shifter_64(r.r & (shiftin or r.s(55)) & r.s(54 downto 0), + shift_res := shifter_64(r.r(63 downto 1) & (shiftin0 or r.r(0)) & 
+ (shiftin or r.s(55)) & r.s(54 downto 0), std_ulogic_vector(r.shift(6 downto 0))); else shift_res := (others => '0'); @@ -2556,12 +3060,27 @@ begin end case; end if; - if set_a = '1' then + if set_a = '1' or set_a_exp = '1' then v.a.exponent := new_exp; + end if; + if set_a = '1' or set_a_mant = '1' then v.a.mantissa := shift_res; end if; + if e_in.valid = '1' then + v.a_hi := (others => '0'); + v.a_lo := (others => '0'); + else + if set_a_hi = '1' then + v.a_hi := r.r(63 downto 56); + end if; + if set_a_lo = '1' then + v.a_lo := r.r(55 downto 0); + end if; + end if; if set_b = '1' then v.b.exponent := new_exp; + end if; + if set_b = '1' or set_b_mant = '1' then v.b.mantissa := shift_res; end if; if set_c = '1' then From 34330552e8f1d78c1dac1e7a154dcee6a991c74a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 7 May 2022 22:34:23 +1000 Subject: [PATCH 20/30] FPU: Add logic for 32-bit integer division Signed-off-by: Paul Mackerras --- fpu.vhdl | 77 +++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 14 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 18d3a5a..b8cea39 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -80,7 +80,7 @@ architecture behaviour of fpu is IDIV_NORMB, IDIV_NORMB2, IDIV_NORMB3, IDIV_CLZA, IDIV_CLZA2, IDIV_CLZA3, IDIV_NR0, IDIV_NR1, IDIV_NR2, IDIV_USE0_5, - IDIV_DODIV, + IDIV_DODIV, IDIV_SH32, IDIV_DIV, IDIV_DIV2, IDIV_DIV3, IDIV_DIV4, IDIV_DIV5, IDIV_DIV6, IDIV_DIV7, IDIV_DIV8, IDIV_DIV9, IDIV_EXT_TBH, IDIV_EXT_TBH2, IDIV_EXT_TBH3, @@ -445,17 +445,20 @@ architecture behaviour of fpu is -- Split a DP floating-point number into components and work out its class. 
-- If is_int = 1, the input is considered an integer - function decode_dp(fpr: std_ulogic_vector(63 downto 0); is_int: std_ulogic) return fpu_reg_type is + function decode_dp(fpr: std_ulogic_vector(63 downto 0); is_int: std_ulogic; + is_32bint: std_ulogic; is_signed: std_ulogic) return fpu_reg_type is variable r : fpu_reg_type; variable exp_nz : std_ulogic; variable exp_ao : std_ulogic; variable frac_nz : std_ulogic; + variable low_nz : std_ulogic; variable cls : std_ulogic_vector(2 downto 0); begin r.negative := fpr(63); exp_nz := or (fpr(62 downto 52)); exp_ao := and (fpr(62 downto 52)); frac_nz := or (fpr(51 downto 0)); + low_nz := or (fpr(31 downto 0)); if is_int = '0' then r.exponent := signed(resize(unsigned(fpr(62 downto 52)), EXP_BITS)) - to_signed(1023, EXP_BITS); if exp_nz = '0' then @@ -472,6 +475,16 @@ architecture behaviour of fpu is when "110" => r.class := INFINITY; when others => r.class := NAN; end case; + elsif is_32bint = '1' then + r.negative := fpr(31); + r.mantissa(31 downto 0) := fpr(31 downto 0); + r.mantissa(63 downto 32) := (others => (is_signed and fpr(31))); + r.exponent := (others => '0'); + if low_nz = '1' then + r.class := FINITE; + else + r.class := ZERO; + end if; else r.mantissa := fpr; r.exponent := (others => '0'); @@ -659,6 +672,7 @@ begin variable j, k : integer; variable flm : std_ulogic_vector(7 downto 0); variable int_input : std_ulogic; + variable is_32bint : std_ulogic; variable mask : std_ulogic_vector(63 downto 0); variable in_a0 : std_ulogic_vector(63 downto 0); variable in_b0 : std_ulogic_vector(63 downto 0); @@ -710,6 +724,8 @@ begin variable round_inc : std_ulogic_vector(63 downto 0); variable rbit_inc : std_ulogic; variable mult_mask : std_ulogic; + variable sign_bit : std_ulogic; + variable rnd_b32 : std_ulogic; variable int_result : std_ulogic; variable illegal : std_ulogic; begin @@ -717,6 +733,7 @@ begin v.complete := '0'; v.do_intr := '0'; int_input := '0'; + is_32bint := '0'; if r.complete = '1' or r.do_intr = 
'1' then v.instr_done := '0'; @@ -735,12 +752,25 @@ begin v.fe_mode := or (e_in.fe_mode); v.dest_fpr := e_in.frt; v.single_prec := e_in.single; - v.longmask := e_in.single; + v.is_signed := e_in.is_signed; v.rc := e_in.rc; v.is_cmp := e_in.out_cr; - int_input := '0'; - if e_in.op = OP_FPOP_I then + v.longmask := '0'; + v.divext := '0'; + v.divmod := '0'; + if e_in.op = OP_FPOP or e_in.op = OP_FPOP_I then + v.longmask := e_in.single; + if e_in.op = OP_FPOP_I then + int_input := '1'; + end if; + else -- OP_DIV, OP_DIVE, OP_MOD int_input := '1'; + is_32bint := e_in.single; + if e_in.op = OP_DIVE then + v.divext := '1'; + elsif e_in.op = OP_MOD then + v.divmod := '1'; + end if; end if; v.quieten_nan := '1'; v.tiny := '0'; @@ -751,15 +781,12 @@ begin v.is_sqrt := '0'; v.add_bsmall := '0'; v.doing_ftdiv := "00"; - v.divext := e_in.insn(8) and not e_in.insn(7); - v.divmod := not e_in.insn(8); - v.is_signed := e_in.is_signed; v.int_ovf := '0'; v.div_close := '0'; - adec := decode_dp(e_in.fra, int_input); - bdec := decode_dp(e_in.frb, int_input); - cdec := decode_dp(e_in.frc, int_input); + adec := decode_dp(e_in.fra, int_input, is_32bint, e_in.is_signed); + bdec := decode_dp(e_in.frb, int_input, is_32bint, e_in.is_signed); + cdec := decode_dp(e_in.frc, int_input, '0', '0'); v.a := adec; v.b := bdec; v.c := cdec; @@ -870,6 +897,7 @@ begin shiftin0 := '0'; rbit_inc := '0'; mult_mask := '0'; + rnd_b32 := '0'; int_result := '0'; illegal := '0'; case r.state is @@ -918,7 +946,7 @@ begin else v.state := DO_FRI; end if; - when "01001" => + when "01001" | "01011" => -- integer divides and mods, major opcode 31 v.opsel_a := AIN_B; v.state := DO_IDIVMOD; @@ -2552,6 +2580,10 @@ begin v.shift := to_signed(-UNIT_BIT, EXP_BITS); v.first := '1'; v.state := IDIV_DIV; + elsif r.single_prec = '1' then + -- divwe[u][o], shift A left 32 bits + v.shift := to_signed(32, EXP_BITS); + v.state := IDIV_SH32; elsif r.div_close = '0' then v.shift := to_signed(64 - UNIT_BIT, EXP_BITS); v.state := 
IDIV_EXTDIV; @@ -2561,6 +2593,12 @@ begin v.opsel_a := AIN_C; v.state := IDIV_EXT_TBH; end if; + when IDIV_SH32 => + -- r.shift = 32, R contains the dividend + opsel_r <= RES_SHIFT; + v.shift := to_signed(-UNIT_BIT, EXP_BITS); + v.first := '1'; + v.state := IDIV_DIV; when IDIV_DIV => -- Dividing A by C, r.shift = -56; A is in R -- Put A into the bottom 64 bits of Ahi/A/Alo @@ -2805,13 +2843,22 @@ begin -- and also negate R if the answer is negative opsel_ainv <= r.result_sign; carry_in <= r.inc_quot xor r.result_sign; + rnd_b32 := '1'; + if r.divmod = '0' then + opsel_b <= BIN_RND; + end if; if r.is_signed = '0' then v.state := IDIV_DONE; else v.state := IDIV_OVFCHK; end if; when IDIV_OVFCHK => - v.int_ovf := r.r(63) xor r.result_sign; + if r.single_prec = '0' then + sign_bit := r.r(63); + else + sign_bit := r.r(31); + end if; + v.int_ovf := sign_bit xor r.result_sign; if v.int_ovf = '1' then v.state := IDIV_ZERO; else @@ -2953,7 +3000,9 @@ begin when BIN_R => in_b0 := r.r; when BIN_RND => - if rbit_inc = '0' then + if rnd_b32 = '1' then + round_inc := (32 => r.result_sign and r.single_prec, others => '0'); + elsif rbit_inc = '0' then round_inc := (SP_LSB => r.single_prec, DP_LSB => not r.single_prec, others => '0'); else round_inc := (DP_RBIT => '1', others => '0'); From 73cc5167ec1ea591d9da43f2e392b5202f045f32 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 9 May 2022 19:18:42 +1000 Subject: [PATCH 21/30] Use FPU for division instructions if we have an FPU - Arrange for XER to be written for OE=1 forms - Arrange for condition codes to be set for RC=1 forms (including correct handling for 32-bit mode) - Don't instantiate the divider if we have an FPU. 
Signed-off-by: Paul Mackerras --- common.vhdl | 7 ++++ decode1.vhdl | 52 +++++++++++++++--------- execute1.vhdl | 29 ++++++++----- fpu.vhdl | 53 +++++++++++++++++++++++- tests/fpu/fpu.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++ writeback.vhdl | 7 ++++ 6 files changed, 221 insertions(+), 33 deletions(-) diff --git a/common.vhdl b/common.vhdl index aa7b830..f846fb4 100644 --- a/common.vhdl +++ b/common.vhdl @@ -640,7 +640,10 @@ package common is frc : std_ulogic_vector(63 downto 0); frt : gspr_index_t; rc : std_ulogic; + m32b : std_ulogic; out_cr : std_ulogic; + oe : std_ulogic; + xerc : xer_common_t; stall : std_ulogic; end record; constant Execute1ToFPUInit : Execute1ToFPUType := (valid => '0', op => OP_ILLEGAL, nia => (others => '0'), @@ -649,6 +652,7 @@ package common is fra => (others => '0'), frb => (others => '0'), frc => (others => '0'), frt => (others => '0'), single => '0', is_signed => '0', out_cr => '0', + m32b => '0', oe => '0', xerc => xerc_init, stall => '0'); type FPUToExecute1Type is record @@ -668,6 +672,8 @@ package common is write_cr_enable : std_ulogic; write_cr_mask : std_ulogic_vector(7 downto 0); write_cr_data : std_ulogic_vector(31 downto 0); + write_xerc : std_ulogic; + xerc : xer_common_t; intr_vec : intr_vector_t; srr0 : std_ulogic_vector(63 downto 0); srr1 : std_ulogic_vector(15 downto 0); @@ -677,6 +683,7 @@ package common is write_enable => '0', write_reg => (others => '0'), write_cr_enable => '0', write_cr_mask => (others => '0'), write_cr_data => (others => '0'), + write_xerc => '0', xerc => xerc_init, intr_vec => 0, srr1 => (others => '0'), others => (others => '0')); diff --git a/decode1.vhdl b/decode1.vhdl index 5bc023b..2e2a8e3 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -35,6 +35,18 @@ architecture behaviour of decode1 is constant illegal_inst : decode_rom_t := (NONE, NONE, OP_ILLEGAL, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE); + -- If we have 
an FPU, then it is used for integer divisions, + -- otherwise a dedicated divider in the ALU is used. + function divider_unit(hf : boolean) return unit_t is + begin + if hf then + return FPU; + else + return ALU; + end if; + end; + constant DVU : unit_t := divider_unit(HAS_FPU); + type reg_internal_t is record override : std_ulogic; override_decode: decode_rom_t; @@ -225,22 +237,22 @@ architecture behaviour of decode1 is 2#0100010110# => (ALU, NONE, OP_DCBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbt 2#0011110110# => (ALU, NONE, OP_DCBTST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbtst 2#1111110110# => (LDST, NONE, OP_DCBZ, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbz - 2#0110001001# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divdeu - 2#1110001001# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divdeuo - 2#0110001011# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- divweu - 2#1110001011# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- divweuo - 2#0110101001# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- divde - 2#1110101001# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- divdeo - 2#0110101011# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), -- 
divwe - 2#1110101011# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), -- divweo - 2#0111001001# => (ALU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divdu - 2#1111001001# => (ALU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divduo - 2#0111001011# => (ALU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- divwu - 2#1111001011# => (ALU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- divwuo - 2#0111101001# => (ALU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- divd - 2#1111101001# => (ALU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- divdo - 2#0111101011# => (ALU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), -- divw - 2#1111101011# => (ALU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), -- divwo + 2#0110001001# => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divdeu + 2#1110001001# => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divdeuo + 2#0110001011# => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- divweu + 2#1110001011# => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', 
'0', NONE), -- divweuo + 2#0110101001# => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- divde + 2#1110101001# => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- divdeo + 2#0110101011# => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), -- divwe + 2#1110101011# => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), -- divweo + 2#0111001001# => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divdu + 2#1111001001# => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divduo + 2#0111001011# => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- divwu + 2#1111001011# => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- divwuo + 2#0111101001# => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- divd + 2#1111101001# => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- divdo + 2#0111101011# => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), -- divw + 2#1111101011# => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), -- divwo 2#1100110110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', 
'0', NONE, '0', '0', NONE), -- dss 2#0101010110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dst 2#0101110110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dstst @@ -318,10 +330,10 @@ architecture behaviour of decode1 is 2#0000010011# => (ALU, NONE, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mfcr/mfocrf 2#0001010011# => (ALU, NONE, OP_MFMSR, NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- mfmsr 2#0101010011# => (ALU, NONE, OP_MFSPR, SPR, NONE, RS, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mfspr - 2#0100001001# => (ALU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- modud - 2#0100001011# => (ALU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- moduw - 2#1100001001# => (ALU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- modsd - 2#1100001011# => (ALU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', NONE, '0', '0', NONE), -- modsw + 2#0100001001# => (DVU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- modud + 2#0100001011# => (DVU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- moduw + 2#1100001001# => (DVU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- modsd + 
2#1100001011# => (DVU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', NONE, '0', '0', NONE), -- modsw 2#0010010000# => (ALU, NONE, OP_MTCRF, NONE, NONE, RS, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtcrf/mtocrf 2#0010010010# => (ALU, NONE, OP_MTMSRD, NONE, NONE, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- mtmsr 2#0010110010# => (ALU, NONE, OP_MTMSRD, NONE, NONE, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtmsrd # ignore top bits and d diff --git a/execute1.vhdl b/execute1.vhdl index 2121963..2efe439 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -188,7 +188,7 @@ architecture behaviour of execute1 is -- divider signals signal x_to_divider: Execute1ToDividerType; - signal divider_to_x: DividerToExecute1Type; + signal divider_to_x: DividerToExecute1Type := DividerToExecute1Init; -- random number generator signals signal random_raw : std_ulogic_vector(63 downto 0); @@ -367,13 +367,15 @@ begin m_out => multiply_to_x ); - divider_0: entity work.divider - port map ( - clk => clk, - rst => rst, - d_in => x_to_divider, - d_out => divider_to_x - ); + divider_0: if not HAS_FPU generate + div_0: entity work.divider + port map ( + clk => clk, + rst => rst, + d_in => x_to_divider, + d_out => divider_to_x + ); + end generate; random_0: entity work.random port map ( @@ -1159,9 +1161,11 @@ begin owait := '1'; when OP_DIV | OP_DIVE | OP_MOD => - v.start_div := '1'; - slow_op := '1'; - owait := '1'; + if not HAS_FPU then + v.start_div := '1'; + slow_op := '1'; + owait := '1'; + end if; when OP_FETCH_FAILED => -- Handling an ITLB miss doesn't count as having executed an instruction @@ -1457,6 +1461,9 @@ begin fv.frt := e_in.write_reg; fv.rc := e_in.rc; fv.out_cr := e_in.output_cr; + fv.m32b := not ex1.msr(MSR_SF); + fv.oe := e_in.oe; + fv.xerc := xerc_in; 
fv.stall := l_in.l2stall; -- Update registers diff --git a/fpu.vhdl b/fpu.vhdl index b8cea39..90e04b3 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -125,6 +125,7 @@ architecture behaviour of fpu is write_reg : gspr_index_t; complete_tag : instr_tag_t; writing_cr : std_ulogic; + writing_xer : std_ulogic; int_result : std_ulogic; cr_result : std_ulogic_vector(3 downto 0); cr_mask : std_ulogic_vector(7 downto 0); @@ -151,6 +152,7 @@ architecture behaviour of fpu is invalid : std_ulogic; negate : std_ulogic; longmask : std_ulogic; + integer_op : std_ulogic; divext : std_ulogic; divmod : std_ulogic; is_signed : std_ulogic; @@ -159,6 +161,10 @@ architecture behaviour of fpu is inc_quot : std_ulogic; a_hi : std_ulogic_vector(7 downto 0); a_lo : std_ulogic_vector(55 downto 0); + m32b : std_ulogic; + oe : std_ulogic; + xerc : xer_common_t; + xerc_result : xer_common_t; end record; type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0); @@ -604,6 +610,7 @@ begin r.do_intr <= '0'; r.writing_fpr <= '0'; r.writing_cr <= '0'; + r.writing_xer <= '0'; r.fpscr <= (others => '0'); r.write_reg <= (others =>'0'); r.complete_tag.valid <= '0'; @@ -658,6 +665,8 @@ begin w_out.write_cr_mask <= r.cr_mask; w_out.write_cr_data <= r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result; + w_out.write_xerc <= r.writing_xer and r.complete; + w_out.xerc <= r.xerc_result; w_out.interrupt <= r.do_intr; w_out.intr_vec <= 16#700#; w_out.srr0 <= r.nia; @@ -739,6 +748,7 @@ begin v.instr_done := '0'; v.writing_fpr := '0'; v.writing_cr := '0'; + v.writing_xer := '0'; v.comm_fpscr := r.fpscr; v.illegal := '0'; end if; @@ -755,7 +765,11 @@ begin v.is_signed := e_in.is_signed; v.rc := e_in.rc; v.is_cmp := e_in.out_cr; + v.oe := e_in.oe; + v.m32b := e_in.m32b; + v.xerc := e_in.xerc; v.longmask := '0'; + v.integer_op := '0'; v.divext := '0'; v.divmod := '0'; if e_in.op = OP_FPOP or e_in.op = OP_FPOP_I then @@ -764,6 +778,7 @@ begin int_input := 
'1'; end if; else -- OP_DIV, OP_DIVE, OP_MOD + v.integer_op := '1'; int_input := '1'; is_32bint := e_in.single; if e_in.op = OP_DIVE then @@ -2865,12 +2880,44 @@ begin v.state := IDIV_DONE; end if; when IDIV_DONE => + v.xerc_result := v.xerc; + if r.oe = '1' then + v.xerc_result.ov := '0'; + v.xerc_result.ov32 := '0'; + v.writing_xer := '1'; + end if; + if r.m32b = '0' then + v.cr_result(3) := r.r(63); + v.cr_result(2 downto 1) := "00"; + if r.r = 64x"0" then + v.cr_result(1) := '1'; + else + v.cr_result(2) := not r.r(63); + end if; + else + v.cr_result(3) := r.r(31); + v.cr_result(2 downto 1) := "00"; + if r.r(31 downto 0) = 32x"0" then + v.cr_result(1) := '1'; + else + v.cr_result(2) := not r.r(31); + end if; + end if; + v.cr_result(0) := v.xerc.so; int_result := '1'; v.writing_fpr := '1'; v.instr_done := '1'; when IDIV_ZERO => opsel_r <= RES_MISC; misc_sel <= "0101"; + v.xerc_result := v.xerc; + if r.oe = '1' then + v.xerc_result.ov := r.int_ovf; + v.xerc_result.ov32 := r.int_ovf; + v.xerc_result.so := r.xerc.so or r.int_ovf; + v.writing_xer := '1'; + end if; + v.cr_result := "001" & v.xerc_result.so; int_result := '1'; v.writing_fpr := '1'; v.instr_done := '1'; @@ -3169,14 +3216,16 @@ begin v.state := IDLE; v.busy := '0'; v.f2stall := '0'; - if r.rc = '1' then + if r.rc = '1' and (r.op = OP_FPOP or r.op = OP_FPOP_I) then v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX); end if; v.sp_result := r.single_prec; v.int_result := int_result; v.illegal := illegal; v.nsnan_result := v.quieten_nan; - if r.is_cmp = '0' then + if r.integer_op = '1' then + v.cr_mask := num_to_fxm(0); + elsif r.is_cmp = '0' then v.cr_mask := num_to_fxm(1); else v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(r.insn)))); diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 500e92d..773c05d 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -1410,6 +1410,110 @@ int fpu_test_23(void) return trapit(0, test23); } +struct idiv_tests { + unsigned long denom; + unsigned long divisor; + 
unsigned long divd; + unsigned long divdu; + unsigned long divde; + unsigned long divdeu; + unsigned long modsd; + unsigned long modud; +} idiv_tests[] = { + { 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0x56789a, 0x1234, 0x4c0, 0x4c0, 0, 0, 0x19a, 0x19a }, + { 2, 3, 0, 0, 0, 0xaaaaaaaaaaaaaaaa, 2, 2 }, + { 31, 157, 0, 0, 0x328c3ab35cf15328, 0x328c3ab35cf15328, 31, 31 }, + { -4329874, 43879, -98, 0x17e5a119b9170, 0, 0, -29732, 39518 }, + { -4329874, -43879, 98, 0, 0, 0xffffffffffbe99d4, -29732, -4329874 }, + { 0x8000000000000000ul, -1, 0, 0, 0, 0x8000000000000000ul, 0, 0x8000000000000000ul }, +}; + +int fpu_test_24(void) +{ + long i; + unsigned long a, b, results[6]; + + for (i = 0; i < sizeof(idiv_tests) / sizeof(idiv_tests[0]); ++i) { + a = idiv_tests[i].denom; + b = idiv_tests[i].divisor; + asm("divd %0,%1,%2" : "=r" (results[0]) : "r" (a), "r" (b)); + asm("divdu %0,%1,%2" : "=r" (results[1]) : "r" (a), "r" (b)); + asm("divde %0,%1,%2" : "=r" (results[2]) : "r" (a), "r" (b)); + asm("divdeu %0,%1,%2" : "=r" (results[3]) : "r" (a), "r" (b)); + asm("modsd %0,%1,%2" : "=r" (results[4]) : "r" (a), "r" (b)); + asm("modud %0,%1,%2" : "=r" (results[5]) : "r" (a), "r" (b)); + if (results[0] != idiv_tests[i].divd || + results[1] != idiv_tests[i].divdu || + results[2] != idiv_tests[i].divde || + results[3] != idiv_tests[i].divdeu || + results[4] != idiv_tests[i].modsd || + results[5] != idiv_tests[i].modud) { + print_hex(i, 2, " "); + print_hex(results[0], 16, " "); + print_hex(results[1], 16, " "); + print_hex(results[2], 16, " "); + print_hex(results[3], 16, " "); + print_hex(results[4], 16, " "); + print_hex(results[5], 16, "\r\n"); + return i + 1; + } + } + return 0; +} + +struct wdiv_tests { + unsigned int denom; + unsigned int divisor; + unsigned int divw; + unsigned int divwu; + unsigned int divwe; + unsigned int divweu; + unsigned int modsw; + unsigned int moduw; +} wdiv_tests[] = { + { 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0x56789a, 0x1234, 0x4c0, 0x4c0, 0, 0, 0x19a, 0x19a }, + { 2, 3, 
0, 0, 0, 0xaaaaaaaa, 2, 2 }, + { 31, 157, 0, 0, 0x328c3ab3, 0x328c3ab3, 31, 31 }, + { -4329874, 43879, -98, 0x17df7, 0, 0, -29732, 17165 }, + { -4329874, -43879, 98, 0, 0, 0xffbe99a9, -29732, -4329874 }, + { 0x80000000u, -1, 0, 0, 0, 0x80000000u, 0, 0x80000000u }, +}; + +int fpu_test_25(void) +{ + long i; + unsigned int a, b, results[6]; + + for (i = 0; i < sizeof(wdiv_tests) / sizeof(wdiv_tests[0]); ++i) { + a = wdiv_tests[i].denom; + b = wdiv_tests[i].divisor; + asm("divw %0,%1,%2" : "=r" (results[0]) : "r" (a), "r" (b)); + asm("divwu %0,%1,%2" : "=r" (results[1]) : "r" (a), "r" (b)); + asm("divwe %0,%1,%2" : "=r" (results[2]) : "r" (a), "r" (b)); + asm("divweu %0,%1,%2" : "=r" (results[3]) : "r" (a), "r" (b)); + asm("modsw %0,%1,%2" : "=r" (results[4]) : "r" (a), "r" (b)); + asm("moduw %0,%1,%2" : "=r" (results[5]) : "r" (a), "r" (b)); + if (results[0] != wdiv_tests[i].divw || + results[1] != wdiv_tests[i].divwu || + results[2] != wdiv_tests[i].divwe || + results[3] != wdiv_tests[i].divweu || + results[4] != wdiv_tests[i].modsw || + results[5] != wdiv_tests[i].moduw) { + print_hex(i, 2, " "); + print_hex(results[0], 8, " "); + print_hex(results[1], 8, " "); + print_hex(results[2], 8, " "); + print_hex(results[3], 8, " "); + print_hex(results[4], 8, " "); + print_hex(results[5], 8, "\r\n"); + return i + 1; + } + } + return 0; +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -1458,6 +1562,8 @@ int main(void) do_test(21, fpu_test_21); do_test(22, fpu_test_22); do_test(23, fpu_test_23); + do_test(24, fpu_test_24); + do_test(25, fpu_test_25); return fail; } diff --git a/writeback.vhdl b/writeback.vhdl index 0d6f41d..5b384c6 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -73,6 +73,8 @@ begin assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; + assert (e_in.write_xerc_enable and fp_in.write_xerc) /= '1' severity failure; + assert not (e_in.valid = '1' and e_in.instr_tag.valid = '0') severity 
failure; assert not (l_in.valid = '1' and l_in.instr_tag.valid = '0') severity failure; assert not (fp_in.valid = '1' and fp_in.instr_tag.valid = '0') severity failure; @@ -168,6 +170,11 @@ begin c_out.write_cr_data <= fp_in.write_cr_data; end if; + if fp_in.write_xerc = '1' then + c_out.write_xerc_enable <= '1'; + c_out.write_xerc_data <= fp_in.xerc; + end if; + if l_in.write_enable = '1' then w_out.write_reg <= l_in.write_reg; w_out.write_data <= l_in.write_data; From bc4d02cb0dcc5b502a45651953ac7bd34521f0b9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 12 Jul 2022 08:52:05 +1000 Subject: [PATCH 22/30] Start removing SPRs from register file This starts the process of removing SPRs from the register file by moving SRR0/1, SPRG0-3, HSRR0/1 and HSPRG0/1 out of the register file and putting them into execute1. They are stored in a pair of small RAM arrays, referred to as "even" and "odd". The reason for having two arrays is so that two values can be read and written in each cycle. For example, SRR0 and SRR1 can be written in parallel by an interrupt and read in parallel by the rfid instruction. The addresses in the RAM which will be accessed are determined in the decode2 stage. We have one write address for both sides, but two read addresses, since in future we will want to be able to read CTR at the same time as either LR or TAR. We now have a connection from writeback to execute1 which carries the partial SRR1 value for an interrupt. SRR0 comes from the execute pipeline; we no longer need to carry instruction addresses along the LSU and FPU pipelines. Since SRR0 and SRR1 can be written in the same cycle now, we don't need the little state machine in writeback any more. 
Signed-off-by: Paul Mackerras --- common.vhdl | 64 +++++++++++++-------- core.vhdl | 6 +- decode1.vhdl | 61 +++++++++++++++----- decode2.vhdl | 19 +++++++ execute1.vhdl | 146 ++++++++++++++++++++++++++++++++++++++++++------ fpu.vhdl | 3 - loadstore1.vhdl | 8 +-- writeback.vhdl | 43 ++------------ 8 files changed, 242 insertions(+), 108 deletions(-) diff --git a/common.vhdl b/common.vhdl index f846fb4..74341d1 100644 --- a/common.vhdl +++ b/common.vhdl @@ -124,6 +124,28 @@ package common is end record; constant xerc_init : xer_common_t := (others => '0'); + -- Some SPRs are stored in a pair of small RAMs in execute1 + -- Even half: + subtype ramspr_index is natural range 0 to 7; + constant RAMSPR_SRR0 : ramspr_index := 0; + constant RAMSPR_HSRR0 : ramspr_index := 1; + constant RAMSPR_SPRG0 : ramspr_index := 2; + constant RAMSPR_SPRG2 : ramspr_index := 3; + constant RAMSPR_HSPRG0 : ramspr_index := 4; + -- Odd half: + constant RAMSPR_SRR1 : ramspr_index := 0; + constant RAMSPR_HSRR1 : ramspr_index := 1; + constant RAMSPR_SPRG1 : ramspr_index := 2; + constant RAMSPR_SPRG3 : ramspr_index := 3; + constant RAMSPR_HSPRG1 : ramspr_index := 4; + + type ram_spr_info is record + index : ramspr_index; + isodd : std_ulogic; + valid : std_ulogic; + end record; + constant ram_spr_info_init: ram_spr_info := (index => 0, others => '0'); + subtype spr_selector is std_ulogic_vector(2 downto 0); type spr_id is record sel : spr_selector; @@ -253,12 +275,13 @@ package common is br_pred: std_ulogic; -- Branch was predicted to be taken big_endian: std_ulogic; spr_info : spr_id; + ram_spr : ram_spr_info; end record; constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), ispr1 => (others => '0'), ispr2 => (others => '0'), ispro => (others => '0'), decode => decode_rom_init, br_pred => '0', big_endian => '0', - spr_info => spr_id_init); + spr_info => spr_id_init, ram_spr => ram_spr_info_init); type 
Decode1ToFetch1Type is record redirect : std_ulogic; @@ -320,6 +343,13 @@ package common is repeat : std_ulogic; -- set if instruction is cracked into two ops second : std_ulogic; -- set if this is the second op spr_select : spr_id; + spr_is_ram : std_ulogic; + ramspr_even_rdaddr : ramspr_index; + ramspr_odd_rdaddr : ramspr_index; + ramspr_rd_odd : std_ulogic; + ramspr_wraddr : ramspr_index; + ramspr_write_even : std_ulogic; + ramspr_write_odd : std_ulogic; end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init, @@ -333,6 +363,9 @@ package common is cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), result_sel => "000", sub_select => "000", repeat => '0', second => '0', spr_select => spr_id_init, + spr_is_ram => '0', + ramspr_even_rdaddr => 0, ramspr_odd_rdaddr => 0, ramspr_rd_odd => '0', + ramspr_wraddr => 0, ramspr_write_even => '0', ramspr_write_odd => '0', others => (others => '0')); type MultiplyInputType is record @@ -574,7 +607,6 @@ package common is store_done : std_ulogic; interrupt : std_ulogic; intr_vec : intr_vector_t; - srr0: std_ulogic_vector(63 downto 0); srr1: std_ulogic_vector(15 downto 0); end record; constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := @@ -582,7 +614,7 @@ package common is write_reg => (others => '0'), write_data => (others => '0'), xerc => xerc_init, rc => '0', store_done => '0', interrupt => '0', intr_vec => 0, - srr0 => (others => '0'), srr1 => (others => '0')); + srr1 => (others => '0')); type Loadstore1EventType is record load_complete : std_ulogic; @@ -675,7 +707,6 @@ package common is write_xerc : std_ulogic; xerc : xer_common_t; intr_vec : intr_vector_t; - srr0 : std_ulogic_vector(63 downto 0); srr1 : std_ulogic_vector(15 downto 0); end record; constant FPUToWritebackInit : FPUToWritebackType := @@ -731,6 +762,11 @@ package common is write_cr_mask => (others => '0'), 
write_cr_data => (others => '0')); + type WritebackToExecute1Type is record + intr : std_ulogic; + srr1 : std_ulogic_vector(15 downto 0); + end record; + type WritebackEventType is record instr_complete : std_ulogic; fp_complete : std_ulogic; @@ -755,26 +791,6 @@ package body common is n := 0; -- N.B. decode2 relies on this specific value when SPR_CTR => n := 1; -- N.B. decode2 relies on this specific value - when SPR_SRR0 => - n := 2; - when SPR_SRR1 => - n := 3; - when SPR_HSRR0 => - n := 4; - when SPR_HSRR1 => - n := 5; - when SPR_SPRG0 => - n := 6; - when SPR_SPRG1 => - n := 7; - when SPR_SPRG2 => - n := 8; - when SPR_SPRG3 | SPR_SPRG3U => - n := 9; - when SPR_HSPRG0 => - n := 10; - when SPR_HSPRG1 => - n := 11; when SPR_TAR => n := 13; when others => diff --git a/core.vhdl b/core.vhdl index ba8f0cc..b2f2704 100644 --- a/core.vhdl +++ b/core.vhdl @@ -102,6 +102,7 @@ architecture behave of core is -- Writeback signals signal writeback_bypass: bypass_data_t; + signal wb_interrupt: WritebackToExecute1Type; -- local signals signal fetch1_stall_in : std_ulogic; @@ -122,7 +123,6 @@ architecture behave of core is signal complete: instr_tag_t; signal terminate: std_ulogic; signal core_rst: std_ulogic; - signal do_interrupt: std_ulogic; -- Delayed/Latched resets and alt_reset signal rst_fetch1 : std_ulogic; @@ -361,7 +361,7 @@ begin l_in => loadstore1_to_execute1, fp_in => fpu_to_execute1, ext_irq_in => ext_irq, - interrupt_in => do_interrupt, + interrupt_in => wb_interrupt, l_out => execute1_to_loadstore1, fp_out => execute1_to_fpu, e_out => execute1_to_writeback, @@ -469,7 +469,7 @@ begin f_out => writeback_to_fetch1, wb_bypass => writeback_bypass, events => writeback_events, - interrupt_out => do_interrupt, + interrupt_out => wb_interrupt, complete_out => complete ); diff --git a/decode1.vhdl b/decode1.vhdl index 2e2a8e3..fd01d61 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -181,7 +181,7 @@ architecture behaviour of decode1 is -- isync 2#111# => (ALU, NONE, 
OP_ISYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- rfid - 2#101# => (ALU, NONE, OP_RFID, SPR, SPR, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + 2#101# => (ALU, NONE, OP_RFID, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), others => illegal_inst ); @@ -525,6 +525,42 @@ architecture behaviour of decode1 is constant nop_instr : decode_rom_t := (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE); constant fetch_fail_inst: decode_rom_t := (LDST, NONE, OP_FETCH_FAILED, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE); + function decode_ram_spr(sprn : spr_num_t) return ram_spr_info is + variable ret : ram_spr_info; + begin + ret := (index => 0, isodd => '0', valid => '1'); + case sprn is + when SPR_SRR0 => + ret.index := RAMSPR_SRR0; + when SPR_SRR1 => + ret.index := RAMSPR_SRR1; + ret.isodd := '1'; + when SPR_HSRR0 => + ret.index := RAMSPR_HSRR0; + when SPR_HSRR1 => + ret.index := RAMSPR_HSRR1; + ret.isodd := '1'; + when SPR_SPRG0 => + ret.index := RAMSPR_SPRG0; + when SPR_SPRG1 => + ret.index := RAMSPR_SPRG1; + ret.isodd := '1'; + when SPR_SPRG2 => + ret.index := RAMSPR_SPRG2; + when SPR_SPRG3 | SPR_SPRG3U => + ret.index := RAMSPR_SPRG3; + ret.isodd := '1'; + when SPR_HSPRG0 => + ret.index := RAMSPR_HSPRG0; + when SPR_HSPRG1 => + ret.index := RAMSPR_HSPRG1; + ret.isodd := '1'; + when others => + ret.valid := '0'; + end case; + return ret; + end; + function map_spr(sprn : spr_num_t) return spr_id is variable i : spr_id; begin @@ -614,6 +650,7 @@ begin sprn := decode_spr_num(f_in.insn); v.spr_info := map_spr(sprn); + v.ram_spr := decode_ram_spr(sprn); case to_integer(unsigned(majorop)) is when 4 => @@ -632,17 +669,17 @@ begin if 
std_match(f_in.insn(10 downto 1), "01-1010011") then -- mfspr or mtspr - if is_fast_spr(v.ispr1) = '0' then - -- Make mtspr to slow SPRs single issue + -- Make mtspr to slow SPRs single issue + if v.spr_info.valid = '1' then vi.force_single := f_in.insn(8); - -- send MMU-related SPRs to loadstore1 - case sprn is - when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PTCR => - vi.override_decode.unit := LDST; - vi.override_unit := '1'; - when others => - end case; end if; + -- send MMU-related SPRs to loadstore1 + case sprn is + when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PTCR => + vi.override_decode.unit := LDST; + vi.override_unit := '1'; + when others => + end case; end if; when 16 => @@ -690,10 +727,6 @@ begin else v.ispr2 := fast_spr_num(SPR_TAR); end if; - else - -- Could be OP_RFID - v.ispr1 := fast_spr_num(SPR_SRR1); - v.ispr2 := fast_spr_num(SPR_SRR0); end if; when 24 => diff --git a/decode2.vhdl b/decode2.vhdl index a043ef9..c76b7f5 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -480,6 +480,23 @@ begin v.e.spr_select := d_in.spr_info; + case op is + when OP_MFSPR => + v.e.ramspr_even_rdaddr := d_in.ram_spr.index; + v.e.ramspr_odd_rdaddr := d_in.ram_spr.index; + v.e.ramspr_rd_odd := d_in.ram_spr.isodd; + v.e.spr_is_ram := d_in.ram_spr.valid; + when OP_MTSPR => + v.e.ramspr_wraddr := d_in.ram_spr.index; + v.e.ramspr_write_even := d_in.ram_spr.valid and not d_in.ram_spr.isodd; + v.e.ramspr_write_odd := d_in.ram_spr.valid and d_in.ram_spr.isodd; + v.e.spr_is_ram := d_in.ram_spr.valid; + when OP_RFID => + v.e.ramspr_even_rdaddr := RAMSPR_SRR0; + v.e.ramspr_odd_rdaddr := RAMSPR_SRR1; + when others => + end case; + case d_in.decode.length is when is1B => length := "0001"; @@ -530,6 +547,8 @@ begin if op = OP_MFSPR then if is_fast_spr(d_in.ispr1) = '1' then v.e.result_sel := "000"; -- adder_result, effectively a_in + elsif d_in.ram_spr.valid = '1' then + v.e.result_sel := "101"; -- ramspr_result elsif d_in.spr_info.valid = '0' then -- Privileged mfspr to invalid/unimplemented 
SPR numbers -- writes the contents of RT back to RT (i.e. it's a no-op) diff --git a/execute1.vhdl b/execute1.vhdl index 2efe439..b0b2f98 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -31,7 +31,7 @@ entity execute1 is fp_in : in FPUToExecute1Type; ext_irq_in : std_ulogic; - interrupt_in : std_ulogic; + interrupt_in : WritebackToExecute1Type; -- asynchronous l_out : out Execute1ToLoadstore1Type; @@ -72,6 +72,8 @@ architecture behaviour of execute1 is write_loga : std_ulogic; inc_loga : std_ulogic; write_pmuspr : std_ulogic; + ramspr_write_even : std_ulogic; + ramspr_write_odd : std_ulogic; end record; constant side_effect_init : side_effect_type := (others => '0'); @@ -119,6 +121,7 @@ architecture behaviour of execute1 is msr : std_ulogic_vector(63 downto 0); xerc : xer_common_t; xerc_valid : std_ulogic; + ramspr_wraddr : ramspr_index; end record; constant reg_stage1_type_init : reg_stage1_type := (e => Execute1ToWritebackInit, se => side_effect_init, @@ -130,7 +133,8 @@ architecture behaviour of execute1 is no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0', taken_branch_event => '0', br_mispredict => '0', msr => 64x"0", - xerc => xerc_init, xerc_valid => '0'); + xerc => xerc_init, xerc_valid => '0', + ramspr_wraddr => 0); type reg_stage2_type is record e : Execute1ToWritebackType; @@ -203,6 +207,20 @@ architecture behaviour of execute1 is signal exception_log : std_ulogic; signal irq_valid_log : std_ulogic; + -- SPR-related signals + type ramspr_half_t is array(ramspr_index) of std_ulogic_vector(63 downto 0); + signal even_sprs : ramspr_half_t := (others => (others => '0')); + signal odd_sprs : ramspr_half_t := (others => (others => '0')); + signal ramspr_even : std_ulogic_vector(63 downto 0); + signal ramspr_odd : std_ulogic_vector(63 downto 0); + signal ramspr_result : std_ulogic_vector(63 downto 0); + signal ramspr_rd_odd : std_ulogic; + signal ramspr_wr_addr : ramspr_index; + signal ramspr_even_wr_data : std_ulogic_vector(63 downto 0); + 
signal ramspr_even_wr_enab : std_ulogic; + signal ramspr_odd_wr_data : std_ulogic_vector(63 downto 0); + signal ramspr_odd_wr_enab : std_ulogic; + signal stage2_stall : std_ulogic; type privilege_level is (USER, SUPER); @@ -289,6 +307,18 @@ architecture behaviour of execute1 is return msr_out; end; + function intr_srr1(msr: std_ulogic_vector; flags: std_ulogic_vector) + return std_ulogic_vector is + variable srr1: std_ulogic_vector(63 downto 0); + begin + srr1(63 downto 31) := msr(63 downto 31); + srr1(30 downto 27) := flags(14 downto 11); + srr1(26 downto 22) := msr(26 downto 22); + srr1(21 downto 16) := flags(5 downto 0); + srr1(15 downto 0) := msr(15 downto 0); + return srr1; + end; + -- Work out whether a signed value fits into n bits, -- that is, see if it is in the range -2^(n-1) .. 2^(n-1) - 1 function fits_in_n_bits(val: std_ulogic_vector; n: integer) return boolean is @@ -456,6 +486,78 @@ begin valid_in <= e_in.valid and not (busy_out or flush_in or ex1.e.redirect or ex1.e.interrupt); + -- SPRs stored in two small RAM arrays (two so that we can read and write + -- two SPRs in each cycle). 
+ + ramspr_read: process(all) + variable even_rd_data, odd_rd_data : std_ulogic_vector(63 downto 0); + variable wr_addr : ramspr_index; + variable even_wr_enab, odd_wr_enab : std_ulogic; + variable even_wr_data, odd_wr_data : std_ulogic_vector(63 downto 0); + variable doit : std_ulogic; + begin + -- Read address mux and async RAM reading + even_rd_data := even_sprs(e_in.ramspr_even_rdaddr); + odd_rd_data := odd_sprs(e_in.ramspr_odd_rdaddr); + + -- Write address and data muxes + doit := ex1.e.valid and not stage2_stall and not flush_in; + even_wr_enab := (ex1.se.ramspr_write_even and doit) or interrupt_in.intr; + odd_wr_enab := (ex1.se.ramspr_write_odd and doit) or interrupt_in.intr; + if interrupt_in.intr = '1' then + wr_addr := RAMSPR_SRR0; + else + wr_addr := ex1.ramspr_wraddr; + end if; + if interrupt_in.intr = '1' then + even_wr_data := ex2.e.last_nia; + odd_wr_data := intr_srr1(ctrl.msr, interrupt_in.srr1); + else + even_wr_data := ex1.e.write_data; + odd_wr_data := ex1.e.write_data; + end if; + ramspr_wr_addr <= wr_addr; + ramspr_even_wr_data <= even_wr_data; + ramspr_even_wr_enab <= even_wr_enab; + ramspr_odd_wr_data <= odd_wr_data; + ramspr_odd_wr_enab <= odd_wr_enab; + + -- SPR RAM read with write data bypass + -- We assume no instruction executes in the cycle immediately following + -- an interrupt, so we don't need to bypass interrupt data + if ex1.se.ramspr_write_even = '1' and e_in.ramspr_even_rdaddr = ex1.ramspr_wraddr then + ramspr_even <= ex1.e.write_data; + else + ramspr_even <= even_rd_data; + end if; + if ex1.se.ramspr_write_odd = '1' and e_in.ramspr_odd_rdaddr = ex1.ramspr_wraddr then + ramspr_odd <= ex1.e.write_data; + else + ramspr_odd <= odd_rd_data; + end if; + if e_in.ramspr_rd_odd = '0' then + ramspr_result <= ramspr_even; + else + ramspr_result <= ramspr_odd; + end if; + end process; + + ramspr_write: process(clk) + begin + if rising_edge(clk) then + if ramspr_even_wr_enab = '1' then + even_sprs(ramspr_wr_addr) <= ramspr_even_wr_data; + 
report "writing even spr " & integer'image(ramspr_wr_addr) & " data=" & + to_hstring(ramspr_even_wr_data); + end if; + if ramspr_odd_wr_enab = '1' then + odd_sprs(ramspr_wr_addr) <= ramspr_odd_wr_data; + report "writing odd spr " & integer'image(ramspr_wr_addr) & " data=" & + to_hstring(ramspr_odd_wr_data); + end if; + end if; + end process; + -- First stage result mux s1_sel <= e_in.result_sel when ex1.busy = '0' else "100"; with s1_sel select alu_result <= @@ -464,6 +566,7 @@ begin rotator_result when "010", shortmul_result when "011", muldiv_result when "100", + ramspr_result when "101", next_nia when "110", misc_result when others; @@ -830,6 +933,7 @@ begin variable privileged : std_ulogic; variable slow_op : std_ulogic; variable owait : std_ulogic; + variable srr1 : std_ulogic_vector(63 downto 0); begin v := actions_type_init; v.e.write_data := alu_result; @@ -850,6 +954,9 @@ begin v.e.last_nia := e_in.nia; v.e.br_offset := 64x"4"; + v.se.ramspr_write_even := e_in.ramspr_write_even; + v.se.ramspr_write_odd := e_in.ramspr_write_odd; + -- Note the difference between v.exception and v.trap: -- v.exception signals a condition that prevents execution of the -- instruction, and hence shouldn't depend on operand data, so as to @@ -1009,26 +1116,27 @@ begin end if; when OP_RFID => - v.e.redir_mode := (a_in(MSR_IR) or a_in(MSR_PR)) & not a_in(MSR_PR) & - not a_in(MSR_LE) & not a_in(MSR_SF); + srr1 := ramspr_odd; + v.e.redir_mode := (srr1(MSR_IR) or srr1(MSR_PR)) & not srr1(MSR_PR) & + not srr1(MSR_LE) & not srr1(MSR_SF); -- Can't use msr_copy here because the partial function MSR -- bits should be left unchanged, not zeroed. 
- v.new_msr(63 downto 31) := a_in(63 downto 31); - v.new_msr(26 downto 22) := a_in(26 downto 22); - v.new_msr(15 downto 0) := a_in(15 downto 0); - if a_in(MSR_PR) = '1' then + v.new_msr(63 downto 31) := srr1(63 downto 31); + v.new_msr(26 downto 22) := srr1(26 downto 22); + v.new_msr(15 downto 0) := srr1(15 downto 0); + if srr1(MSR_PR) = '1' then v.new_msr(MSR_EE) := '1'; v.new_msr(MSR_IR) := '1'; v.new_msr(MSR_DR) := '1'; end if; v.se.write_msr := '1'; - v.e.br_offset := b_in; + v.e.br_offset := ramspr_even; v.e.abs_br := '1'; v.e.redirect := '1'; v.se.write_cfar := '1'; if HAS_FPU then v.fp_intr := fp_in.exception and - (a_in(MSR_FE0) or a_in(MSR_FE1)); + (srr1(MSR_FE0) or srr1(MSR_FE1)); end if; v.do_trace := '0'; @@ -1041,10 +1149,10 @@ begin when OP_DARN => when OP_MFMSR => when OP_MFSPR => - if is_fast_spr(e_in.read_reg1) = '1' then + if is_fast_spr(e_in.read_reg1) = '1' or e_in.spr_is_ram = '1' then if e_in.valid = '1' then report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & - "=" & to_hstring(a_in); + "=" & to_hstring(alu_result); end if; elsif e_in.spr_select.valid = '1' then if e_in.valid = '1' then @@ -1121,7 +1229,9 @@ begin v.se.write_loga := '1'; when others => end case; - elsif is_fast_spr(e_in.write_reg) = '0' then + end if; + if e_in.spr_select.valid = '0' and is_fast_spr(e_in.write_reg) = '0' and + e_in.spr_is_ram = '0' then -- mtspr to unimplemented SPRs should be a nop in -- supervisor mode and a program interrupt for user mode if ex1.msr(MSR_PR) = '1' then @@ -1232,6 +1342,7 @@ begin v.pmu_spr_num := e_in.insn(20 downto 16); v.mul_select := e_in.sub_select(1 downto 0); v.se := side_effect_init; + v.ramspr_wraddr := e_in.ramspr_wraddr; end if; lv := Execute1ToLoadstore1Init; @@ -1402,10 +1513,10 @@ begin v.mul_finish := '0'; v.xerc_valid := '0'; end if; - if flush_in = '1' or interrupt_in = '1' then + if flush_in = '1' or interrupt_in.intr = '1' then v.msr := ctrl_tmp.msr; end if; - if interrupt_in = '1' then + if 
interrupt_in.intr = '1' then v.trace_next := '0'; v.fp_exception_next := '0'; end if; @@ -1449,7 +1560,6 @@ begin -- Outputs to FPU fv.op := e_in.insn_type; - fv.nia := e_in.nia; fv.insn := e_in.insn; fv.itag := e_in.instr_tag; fv.single := e_in.is_32bit; @@ -1607,7 +1717,7 @@ begin x_to_pmu.mtspr <= ex1.se.write_pmuspr; end if; - if interrupt_in = '1' then + if interrupt_in.intr = '1' then ctrl_tmp.msr(MSR_SF) <= '1'; ctrl_tmp.msr(MSR_EE) <= '0'; ctrl_tmp.msr(MSR_PR) <= '0'; @@ -1659,7 +1769,7 @@ begin ctrl.msr(MSR_IR) & ctrl.msr(MSR_DR) & exception_log & irq_valid_log & - interrupt_in & + interrupt_in.intr & "000" & ex2.e.write_enable & ex2.e.valid & diff --git a/fpu.vhdl b/fpu.vhdl index 90e04b3..2dd221e 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -99,7 +99,6 @@ architecture behaviour of fpu is illegal : std_ulogic; op : insn_type_t; insn : std_ulogic_vector(31 downto 0); - nia : std_ulogic_vector(63 downto 0); instr_tag : instr_tag_t; dest_fpr : gspr_index_t; fe_mode : std_ulogic; @@ -669,7 +668,6 @@ begin w_out.xerc <= r.xerc_result; w_out.interrupt <= r.do_intr; w_out.intr_vec <= 16#700#; - w_out.srr0 <= r.nia; w_out.srr1 <= (47-44 => r.illegal, 47-43 => not r.illegal, others => '0'); fpu_1: process(all) @@ -756,7 +754,6 @@ begin -- capture incoming instruction if e_in.valid = '1' then v.insn := e_in.insn; - v.nia := e_in.nia; v.op := e_in.op; v.instr_tag := e_in.itag; v.fe_mode := or (e_in.fe_mode); diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 7fad454..b556211 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -90,7 +90,6 @@ architecture behave of loadstore1 is dword_index : std_ulogic; two_dwords : std_ulogic; incomplete : std_ulogic; - nia : std_ulogic_vector(63 downto 0); end record; constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0', tlbie => '0', dcbz => '0', read_spr => '0', write_spr => '0', mmu_op => '0', @@ -105,8 +104,7 @@ architecture behave of loadstore1 is atomic => '0', atomic_last => '0', rc => 
'0', nc => '0', virt_mode => '0', priv_mode => '0', load_sp => '0', sprn => 10x"0", is_slbia => '0', align_intr => '0', - dword_index => '0', two_dwords => '0', incomplete => '0', - nia => (others => '0')); + dword_index => '0', two_dwords => '0', incomplete => '0'); type reg_stage1_t is record req : request_t; @@ -146,7 +144,6 @@ architecture behave of loadstore1 is stage1_en : std_ulogic; interrupt : std_ulogic; intr_vec : integer range 0 to 16#fff#; - nia : std_ulogic_vector(63 downto 0); srr1 : std_ulogic_vector(15 downto 0); events : Loadstore1EventType; end record; @@ -412,7 +409,6 @@ begin v.virt_mode := l_in.virt_mode; v.priv_mode := l_in.priv_mode; v.sprn := sprn; - v.nia := l_in.nia; lsu_sum := std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)); @@ -866,7 +862,6 @@ begin -- or ISI or ISegI for instruction fetch exceptions v.interrupt := exception; if exception = '1' then - v.nia := r2.req.nia; if r2.req.align_intr = '1' then v.intr_vec := 16#600#; v.dar := r2.req.addr; @@ -962,7 +957,6 @@ begin l_out.store_done <= d_in.store_done; l_out.interrupt <= r3.interrupt; l_out.intr_vec <= r3.intr_vec; - l_out.srr0 <= r3.nia; l_out.srr1 <= r3.srr1; -- update busy signal back to execute1 diff --git a/writeback.vhdl b/writeback.vhdl index 5b384c6..2f6af2c 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -25,20 +25,12 @@ entity writeback is events : out WritebackEventType; flush_out : out std_ulogic; - interrupt_out: out std_ulogic; + interrupt_out: out WritebackToExecute1Type; complete_out : out instr_tag_t ); end entity writeback; architecture behaviour of writeback is - type irq_state_t is (WRITE_SRR0, WRITE_SRR1); - - type reg_type is record - state : irq_state_t; - srr1 : std_ulogic_vector(63 downto 0); - end record; - - signal r, rin : reg_type; begin writeback_0: process(clk) @@ -47,13 +39,6 @@ begin variable w : std_ulogic_vector(0 downto 0); begin if rising_edge(clk) then - if rst = '1' then - r.state <= WRITE_SRR0; - r.srr1 <= (others => 
'0'); - else - r <= rin; - end if; - -- Do consistency checks only on the clock edge x(0) := e_in.valid; y(0) := l_in.valid; @@ -82,7 +67,6 @@ begin end process; writeback_1: process(all) - variable v : reg_type; variable f : WritebackToFetch1Type; variable scf : std_ulogic_vector(3 downto 0); variable vec : integer range 0 to 16#fff#; @@ -92,9 +76,7 @@ begin w_out <= WritebackToRegisterFileInit; c_out <= WritebackToCrFileInit; f := WritebackToFetch1Init; - interrupt_out <= '0'; vec := 0; - v := r; complete_out <= instr_tag_init; if e_in.valid = '1' then @@ -108,37 +90,21 @@ begin events.fp_complete <= fp_in.valid; intr := e_in.interrupt or l_in.interrupt or fp_in.interrupt; + interrupt_out.intr <= intr; - if r.state = WRITE_SRR1 then - w_out.write_reg <= fast_spr_num(SPR_SRR1); - w_out.write_data <= r.srr1; - w_out.write_enable <= '1'; - interrupt_out <= '1'; - v.state := WRITE_SRR0; - - elsif intr = '1' then - w_out.write_reg <= fast_spr_num(SPR_SRR0); - w_out.write_enable <= '1'; - v.state := WRITE_SRR1; + if intr = '1' then srr1 := (others => '0'); if e_in.interrupt = '1' then vec := e_in.intr_vec; - w_out.write_data <= e_in.last_nia; srr1 := e_in.srr1; elsif l_in.interrupt = '1' then vec := l_in.intr_vec; - w_out.write_data <= l_in.srr0; srr1 := l_in.srr1; elsif fp_in.interrupt = '1' then vec := fp_in.intr_vec; - w_out.write_data <= fp_in.srr0; srr1 := fp_in.srr1; end if; - v.srr1(63 downto 31) := e_in.msr(63 downto 31); - v.srr1(30 downto 27) := srr1(14 downto 11); - v.srr1(26 downto 22) := e_in.msr(26 downto 22); - v.srr1(21 downto 16) := srr1(5 downto 0); - v.srr1(15 downto 0) := e_in.msr(15 downto 0); + interrupt_out.srr1 <= srr1; else if e_in.write_enable = '1' then @@ -229,6 +195,5 @@ begin wb_bypass.tag.valid <= complete_out.valid and w_out.write_enable; wb_bypass.data <= w_out.write_data; - rin <= v; end process; end; From 337b1042501a84b3f28b11e94e650800177a63ce Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 12 Jul 2022 11:20:17 +1000 
Subject: [PATCH 23/30] Move LR, CTR and TAR out of the register file By putting CTR on the odd side and LR and TAR on the even side, we can read and write CTR for bdnz-style instructions in parallel with reading LR or TAR for indirect branches and writing LR for branches with LK=1. Thus we don't need to double up any of these instructions, giving a simplification in decode2. We now have logic for printing LR and CTR at the end of a simulation in execute1, in addition to the similar logic in register_file and cr_file. Signed-off-by: Paul Mackerras --- common.vhdl | 27 +++-------- core.vhdl | 6 ++- decode1.vhdl | 45 ++++------------- decode2.vhdl | 57 +++++++++++++--------- execute1.vhdl | 117 +++++++++++++++++++++++---------------------- register_file.vhdl | 3 -- 6 files changed, 118 insertions(+), 137 deletions(-) diff --git a/common.vhdl b/common.vhdl index 74341d1..7df451b 100644 --- a/common.vhdl +++ b/common.vhdl @@ -132,12 +132,15 @@ package common is constant RAMSPR_SPRG0 : ramspr_index := 2; constant RAMSPR_SPRG2 : ramspr_index := 3; constant RAMSPR_HSPRG0 : ramspr_index := 4; + constant RAMSPR_LR : ramspr_index := 5; -- must equal RAMSPR_CTR + constant RAMSPR_TAR : ramspr_index := 6; -- Odd half: constant RAMSPR_SRR1 : ramspr_index := 0; constant RAMSPR_HSRR1 : ramspr_index := 1; constant RAMSPR_SPRG1 : ramspr_index := 2; constant RAMSPR_SPRG3 : ramspr_index := 3; constant RAMSPR_HSPRG1 : ramspr_index := 4; + constant RAMSPR_CTR : ramspr_index := 5; -- must equal RAMSPR_LR type ram_spr_info is record index : ramspr_index; @@ -322,7 +325,6 @@ package common is rc: std_ulogic; oe: std_ulogic; invert_a: std_ulogic; - addm1 : std_ulogic; invert_out: std_ulogic; input_carry: carry_in_t; output_carry: std_ulogic; @@ -350,11 +352,12 @@ package common is ramspr_wraddr : ramspr_index; ramspr_write_even : std_ulogic; ramspr_write_odd : std_ulogic; + dec_ctr : std_ulogic; end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => 
NONE, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init, write_reg_enable => '0', - lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', + lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', output_xer => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', @@ -366,6 +369,7 @@ package common is spr_is_ram => '0', ramspr_even_rdaddr => 0, ramspr_odd_rdaddr => 0, ramspr_rd_odd => '0', ramspr_wraddr => 0, ramspr_write_even => '0', ramspr_write_odd => '0', + dec_ctr => '0', others => (others => '0')); type MultiplyInputType is record @@ -780,25 +784,8 @@ package body common is return to_integer(unsigned(insn(15 downto 11) & insn(20 downto 16))); end; function fast_spr_num(spr: spr_num_t) return gspr_index_t is - variable n : integer range 0 to 31; - -- tmp variable introduced as workaround for VCS compilation - -- simulation was failing with subtype constraint mismatch error - -- see GitHub PR #173 - variable tmp : std_ulogic_vector(4 downto 0); begin - case spr is - when SPR_LR => - n := 0; -- N.B. decode2 relies on this specific value - when SPR_CTR => - n := 1; -- N.B. 
decode2 relies on this specific value - when SPR_TAR => - n := 13; - when others => - n := 0; - return "0000000"; - end case; - tmp := std_ulogic_vector(to_unsigned(n, 5)); - return "01" & tmp; + return "0000000"; end; function gspr_to_gpr(i: gspr_index_t) return gpr_index_t is diff --git a/core.vhdl b/core.vhdl index b2f2704..82c66b4 100644 --- a/core.vhdl +++ b/core.vhdl @@ -138,6 +138,7 @@ architecture behave of core is signal rst_dbg : std_ulogic; signal alt_reset_d : std_ulogic; + signal sim_ex_dump: std_ulogic; signal sim_cr_dump: std_ulogic; -- Debug actions @@ -326,7 +327,7 @@ begin dbg_gpr_addr => dbg_gpr_addr, dbg_gpr_data => dbg_gpr_data, sim_dump => terminate, - sim_dump_done => sim_cr_dump, + sim_dump_done => sim_ex_dump, log_out => log_data(255 downto 184) ); @@ -347,6 +348,7 @@ begin execute1_0: entity work.execute1 generic map ( + SIM => SIM, EX1_BYPASS => EX1_BYPASS, HAS_FPU => HAS_FPU, HAS_SHORT_MULT => HAS_SHORT_MULT, @@ -376,6 +378,8 @@ begin dc_events => dcache_events, ic_events => icache_events, terminate_out => terminate, + sim_dump => sim_ex_dump, + sim_dump_done => sim_cr_dump, log_out => log_data(134 downto 120), log_rd_addr => log_rd_addr, log_rd_data => log_rd_data, diff --git a/decode1.vhdl b/decode1.vhdl index fd01d61..b6cea31 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -89,8 +89,8 @@ architecture behaviour of decode1 is 28 => (ALU, NONE, OP_AND, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), -- andi. 29 => (ALU, NONE, OP_AND, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), -- andis. 
0 => (ALU, NONE, OP_ATTN, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- attn - 18 => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- b - 16 => (ALU, NONE, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- bc + 18 => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- b + 16 => (ALU, NONE, OP_BC, NONE, CONST_BD, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- bc 11 => (ALU, NONE, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- cmpi 10 => (ALU, NONE, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpli 34 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lbz @@ -177,7 +177,7 @@ architecture behaviour of decode1 is -- addpcis 2#001# => (ALU, NONE, OP_ADD, CIA, CONST_DXHI4, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- bclr, bcctr, bctar - 2#100# => (ALU, NONE, OP_BCREG, SPR, SPR, NONE, SPR, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), + 2#100# => (ALU, NONE, OP_BCREG, NONE, NONE, NONE, SPR, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- isync 2#111# => (ALU, NONE, OP_ISYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- rfid @@ -530,6 +530,13 @@ architecture behaviour of decode1 is begin ret := (index => 0, isodd => '0', valid 
=> '1'); case sprn is + when SPR_LR => + ret.index := RAMSPR_LR; + when SPR_CTR => + ret.index := RAMSPR_CTR; + ret.isodd := '1'; + when SPR_TAR => + ret.index := RAMSPR_TAR; when SPR_SRR0 => ret.index := RAMSPR_SRR0; when SPR_SRR1 => @@ -683,13 +690,6 @@ begin end if; when 16 => - -- CTR may be needed as input to bc - if f_in.insn(23) = '0' then - v.ispr1 := fast_spr_num(SPR_CTR); - v.ispro := fast_spr_num(SPR_CTR); - elsif f_in.insn(0) = '1' then - v.ispro := fast_spr_num(SPR_LR); - end if; -- Predict backward branches as taken, forward as untaken v.br_pred := f_in.insn(15); br_offset := resize(signed(f_in.insn(15 downto 2)), 24); @@ -698,37 +698,12 @@ begin -- Unconditional branches are always taken v.br_pred := '1'; br_offset := signed(f_in.insn(25 downto 2)); - if f_in.insn(0) = '1' then - v.ispro := fast_spr_num(SPR_LR); - end if; when 19 => vi.override := not decode_op_19_valid(to_integer(unsigned(f_in.insn(5 downto 1) & f_in.insn(10 downto 6)))); op_19_bits := f_in.insn(5) & f_in.insn(3) & f_in.insn(2); v.decode := decode_op_19_array(to_integer(unsigned(op_19_bits))); - -- Work out ispr1/ispr2 independent of v.decode since they seem to be critical path - if f_in.insn(2) = '0' then - -- Could be OP_BCREG: bclr, bcctr, bctar - -- Branch uses CTR as condition when BO(2) is 0. This is - -- also used to indicate that CTR is modified (they go - -- together). 
- -- bcctr doesn't update CTR or use it in the branch condition - if f_in.insn(23) = '0' and (f_in.insn(10) = '0' or f_in.insn(6) = '1') then - v.ispr1 := fast_spr_num(SPR_CTR); - v.ispro := fast_spr_num(SPR_CTR); - elsif f_in.insn(0) = '1' then - v.ispro := fast_spr_num(SPR_LR); - end if; - if f_in.insn(10) = '0' then - v.ispr2 := fast_spr_num(SPR_LR); - elsif f_in.insn(6) = '0' then - v.ispr2 := fast_spr_num(SPR_CTR); - else - v.ispr2 := fast_spr_num(SPR_TAR); - end if; - end if; - when 24 => -- ori, special-case the standard NOP if std_match(f_in.insn, "01100000000000000000000000000000") then diff --git a/decode2.vhdl b/decode2.vhdl index c76b7f5..928ec94 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -406,6 +406,7 @@ begin variable length : std_ulogic_vector(3 downto 0); variable op : insn_type_t; variable valid_in : std_ulogic; + variable decctr : std_ulogic; begin v := dc2; @@ -470,17 +471,45 @@ begin end if; op := d_in.decode.insn_type; + -- Does this instruction decrement CTR? + -- bc, bclr, bctar with BO(2) = 0 do, but not bcctr. 
+ decctr := '0'; + if d_in.insn(23) = '0' and + (op = OP_BC or + (op = OP_BCREG and not (d_in.insn(10) = '1' and d_in.insn(6) = '0'))) then + decctr := '1'; + end if; + v.e.dec_ctr := decctr; + v.repeat := d_in.decode.repeat; if d_in.decode.repeat /= NONE then v.e.repeat := '1'; - elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then - -- bcl/bclrl/bctarl that needs to write both CTR and LR has to be doubled - v.e.repeat := '1'; end if; v.e.spr_select := d_in.spr_info; + if decctr = '1' then + -- read and write CTR + v.e.ramspr_odd_rdaddr := RAMSPR_CTR; + v.e.ramspr_wraddr := RAMSPR_CTR; + v.e.ramspr_write_odd := '1'; + end if; + if v.e.lr = '1' then + -- write LR + v.e.ramspr_wraddr := RAMSPR_LR; + v.e.ramspr_write_even := '1'; + end if; + case op is + when OP_BCREG => + if d_in.insn(10) = '0' then + v.e.ramspr_even_rdaddr := RAMSPR_LR; + elsif d_in.insn(6) = '0' then + v.e.ramspr_odd_rdaddr := RAMSPR_CTR; + v.e.ramspr_rd_odd := '1'; + else + v.e.ramspr_even_rdaddr := RAMSPR_TAR; + end if; when OP_MFSPR => v.e.ramspr_even_rdaddr := d_in.ram_spr.index; v.e.ramspr_odd_rdaddr := d_in.ram_spr.index; @@ -520,7 +549,6 @@ begin v.e.write_reg := decoded_reg_o.reg; v.e.write_reg_enable := decoded_reg_o.reg_valid; v.e.invert_a := d_in.decode.invert_a; - v.e.addm1 := '0'; v.e.insn_type := op; v.e.invert_out := d_in.decode.invert_out; v.e.input_carry := d_in.decode.input_carry; @@ -536,14 +564,6 @@ begin v.e.br_pred := d_in.br_pred; v.e.result_sel := result_select(op); v.e.sub_select := subresult_select(op); - if op = OP_BC or op = OP_BCREG then - if d_in.insn(23) = '0' and - not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then - -- decrement CTR if BO(2) = 0 and not bcctr - v.e.addm1 := '1'; - v.e.result_sel := "000"; -- select adder output - end if; - end if; if op = OP_MFSPR then if is_fast_spr(d_in.ispr1) = '1' then v.e.result_sel := "000"; -- adder_result, effectively a_in @@ -562,16 +582,9 @@ begin -- dc2.busy = 1 and dc2.e.valid = 1, thus this must 
be a repeated instruction. -- Set up for the second iteration (if deferred = 1 this will all be ignored) v.e.second := '1'; - case dc2.repeat is - when DUPD => - -- update-form loads, 2nd instruction writes RA - v.e.write_reg := dc2.e.read_reg1; - when NONE => - -- bcl/bclrl/bctarl that needs to write both CTR and LR - v.e.write_reg(0) := '0'; -- point to LR - v.e.result_sel := "110"; -- select NIA (to go to LR) - when others => - end case; + -- DUPD is the only possibility here: + -- update-form loads, 2nd instruction writes RA + v.e.write_reg := dc2.e.read_reg1; end if; -- issue control diff --git a/execute1.vhdl b/execute1.vhdl index b0b2f98..5ee830b 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -12,6 +12,7 @@ use work.ppc_fx_insns.all; entity execute1 is generic ( + SIM : boolean := false; EX1_BYPASS : boolean := true; HAS_FPU : boolean := true; HAS_SHORT_MULT : boolean := false; @@ -54,6 +55,10 @@ entity execute1 is dc_events : in DcacheEventType; ic_events : in IcacheEventType; + -- debug + sim_dump : in std_ulogic; + sim_dump_done : out std_ulogic; + log_out : out std_ulogic_vector(14 downto 0); log_rd_addr : out std_ulogic_vector(31 downto 0); log_rd_data : in std_ulogic_vector(63 downto 0); @@ -92,10 +97,12 @@ architecture behaviour of execute1 is fp_intr : std_ulogic; res2_sel : std_ulogic_vector(1 downto 0); bypass_valid : std_ulogic; + ramspr_odd_data : std_ulogic_vector(63 downto 0); end record; constant actions_type_init : actions_type := (e => Execute1ToWritebackInit, se => side_effect_init, - new_msr => (others => '0'), res2_sel => "00", others => '0'); + new_msr => (others => '0'), res2_sel => "00", + ramspr_odd_data => 64x"0", others => '0'); type reg_stage1_type is record e : Execute1ToWritebackType; @@ -104,7 +111,6 @@ architecture behaviour of execute1 is fp_exception_next : std_ulogic; trace_next : std_ulogic; prev_op : insn_type_t; - br_taken : std_ulogic; oe : std_ulogic; mul_select : std_ulogic_vector(1 downto 0); res2_sel : 
std_ulogic_vector(1 downto 0); @@ -122,11 +128,12 @@ architecture behaviour of execute1 is xerc : xer_common_t; xerc_valid : std_ulogic; ramspr_wraddr : ramspr_index; + ramspr_odd_data : std_ulogic_vector(63 downto 0); end record; constant reg_stage1_type_init : reg_stage1_type := (e => Execute1ToWritebackInit, se => side_effect_init, busy => '0', - fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, br_taken => '0', + fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, oe => '0', mul_select => "00", res2_sel => "00", spr_select => spr_id_init, pmu_spr_num => 5x"0", mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', @@ -134,7 +141,7 @@ architecture behaviour of execute1 is taken_branch_event => '0', br_mispredict => '0', msr => 64x"0", xerc => xerc_init, xerc_valid => '0', - ramspr_wraddr => 0); + ramspr_wraddr => 0, ramspr_odd_data => 64x"0"); type reg_stage2_type is record e : Execute1ToWritebackType; @@ -514,7 +521,7 @@ begin odd_wr_data := intr_srr1(ctrl.msr, interrupt_in.srr1); else even_wr_data := ex1.e.write_data; - odd_wr_data := ex1.e.write_data; + odd_wr_data := ex1.ramspr_odd_data; end if; ramspr_wr_addr <= wr_addr; ramspr_even_wr_data <= even_wr_data; @@ -531,7 +538,7 @@ begin ramspr_even <= even_rd_data; end if; if ex1.se.ramspr_write_odd = '1' and e_in.ramspr_odd_rdaddr = ex1.ramspr_wraddr then - ramspr_odd <= ex1.e.write_data; + ramspr_odd <= ex1.ramspr_odd_data; else ramspr_odd <= odd_rd_data; end if; @@ -600,7 +607,6 @@ begin -- Data path for integer instructions (first execute stage) execute1_dp: process(all) variable a_inv : std_ulogic_vector(63 downto 0); - variable b_or_m1 : std_ulogic_vector(63 downto 0); variable sum_with_carry : std_ulogic_vector(64 downto 0); variable sign1, sign2 : std_ulogic; variable abs1, abs2 : signed(63 downto 0); @@ -635,12 +641,7 @@ begin else a_inv := not a_in; end if; - if e_in.addm1 = '0' then - b_or_m1 := b_in; - else - b_or_m1 := (others => '1'); - end if; - 
sum_with_carry := ppc_adde(a_inv, b_or_m1, + sum_with_carry := ppc_adde(a_inv, b_in, decode_input_carry(e_in.input_carry, xerc_in)); adder_result <= sum_with_carry(63 downto 0); carry_32 <= sum_with_carry(32) xor a_inv(32) xor b_in(32); @@ -956,6 +957,10 @@ begin v.se.ramspr_write_even := e_in.ramspr_write_even; v.se.ramspr_write_odd := e_in.ramspr_write_odd; + v.ramspr_odd_data := c_in; + if e_in.dec_ctr = '1' then + v.ramspr_odd_data := std_ulogic_vector(unsigned(ramspr_odd) - 1); + end if; -- Note the difference between v.exception and v.trap: -- v.exception signals a condition that prevents execution of the @@ -1059,61 +1064,42 @@ begin end if; v.se.write_cfar := '1'; when OP_BC => - -- read_data1 is CTR - -- If this instruction updates both CTR and LR, then it is - -- doubled; the first instruction decrements CTR and determines - -- whether the branch is taken, and the second does the - -- redirect and the LR update. + -- If CTR is being decremented, it is in ramspr_odd. bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); - if e_in.second = '0' then - v.take_branch := ppc_bc_taken(bo, bi, cr_in, a_in); - else - v.take_branch := ex1.br_taken; - end if; + v.take_branch := ppc_bc_taken(bo, bi, cr_in, ramspr_odd); if v.take_branch = '1' then v.e.br_offset := b_in; v.e.abs_br := insn_aa(e_in.insn); end if; - if e_in.repeat = '0' or e_in.second = '1' then - -- Mispredicted branches cause a redirect - if v.take_branch /= e_in.br_pred then - v.e.redirect := '1'; - end if; - v.direct_branch := '1'; - v.e.br_last := '1'; - v.e.br_taken := v.take_branch; - if ex1.msr(MSR_BE) = '1' then - v.do_trace := '1'; - end if; - v.se.write_cfar := v.take_branch; + -- Mispredicted branches cause a redirect + if v.take_branch /= e_in.br_pred then + v.e.redirect := '1'; + end if; + v.direct_branch := '1'; + v.e.br_last := '1'; + v.e.br_taken := v.take_branch; + if ex1.msr(MSR_BE) = '1' then + v.do_trace := '1'; end if; + v.se.write_cfar := v.take_branch; when OP_BCREG => - -- 
read_data1 is CTR, read_data2 is target register (CTR, LR or TAR) - -- If this instruction updates both CTR and LR, then it is - -- doubled; the first instruction decrements CTR and determines - -- whether the branch is taken, and the second does the - -- redirect and the LR update. + -- If CTR is being decremented, it is in ramspr_odd. + -- The target address is in ramspr_result (LR, CTR or TAR). bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); - if e_in.second = '0' then - v.take_branch := ppc_bc_taken(bo, bi, cr_in, a_in); - else - v.take_branch := ex1.br_taken; - end if; + v.take_branch := ppc_bc_taken(bo, bi, cr_in, ramspr_odd); if v.take_branch = '1' then - v.e.br_offset := b_in; + v.e.br_offset := ramspr_result; v.e.abs_br := '1'; end if; - if e_in.repeat = '0' or e_in.second = '1' then - -- Indirect branches are never predicted taken - v.e.redirect := v.take_branch; - v.e.br_taken := v.take_branch; - if ex1.msr(MSR_BE) = '1' then - v.do_trace := '1'; - end if; - v.se.write_cfar := v.take_branch; + -- Indirect branches are never predicted taken + v.e.redirect := v.take_branch; + v.e.br_taken := v.take_branch; + if ex1.msr(MSR_BE) = '1' then + v.do_trace := '1'; end if; + v.se.write_cfar := v.take_branch; when OP_RFID => srr1 := ramspr_odd; @@ -1130,7 +1116,7 @@ begin v.new_msr(MSR_DR) := '1'; end if; v.se.write_msr := '1'; - v.e.br_offset := ramspr_even; + v.e.br_offset := ramspr_result; v.e.abs_br := '1'; v.e.redirect := '1'; v.se.write_cfar := '1'; @@ -1343,6 +1329,7 @@ begin v.mul_select := e_in.sub_select(1 downto 0); v.se := side_effect_init; v.ramspr_wraddr := e_in.ramspr_wraddr; + v.ramspr_odd_data := actions.ramspr_odd_data; end if; lv := Execute1ToLoadstore1Init; @@ -1430,7 +1417,6 @@ begin v.e.valid := actions.complete; bypass_valid := actions.bypass_valid; v.taken_branch_event := actions.take_branch; - v.br_taken := actions.take_branch; v.trace_next := actions.do_trace; v.fp_exception_next := actions.fp_intr; v.res2_sel := actions.res2_sel; @@ 
-1759,6 +1745,25 @@ begin exception_log <= v.e.interrupt; end process; + sim_dump_test: if SIM generate + dump_exregs: process(all) + variable xer : std_ulogic_vector(63 downto 0); + begin + if sim_dump = '1' then + report "LR " & to_hstring(even_sprs(RAMSPR_LR)); + report "CTR " & to_hstring(odd_sprs(RAMSPR_CTR)); + sim_dump_done <= '1'; + else + sim_dump_done <= '0'; + end if; + end process; + end generate; + + -- Keep GHDL synthesis happy + sim_dump_test_synth: if not SIM generate + sim_dump_done <= '0'; + end generate; + e1_log: if LOG_LENGTH > 0 generate signal log_data : std_ulogic_vector(14 downto 0); begin diff --git a/register_file.vhdl b/register_file.vhdl index 0235dfc..ed856cb 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -130,9 +130,6 @@ begin loop_0: for i in 0 to 31 loop report "GPR" & integer'image(i) & " " & to_hstring(registers(i)); end loop loop_0; - - report "LR " & to_hstring(registers(to_integer(unsigned(fast_spr_num(SPR_LR))))); - report "CTR " & to_hstring(registers(to_integer(unsigned(fast_spr_num(SPR_CTR))))); sim_dump_done <= '1'; else sim_dump_done <= '0'; From fdb3ef6874fb34e67e8d6f136440378c706069e9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 19 Feb 2022 19:03:49 +1100 Subject: [PATCH 24/30] Finish off taking SPRs out of register file With this, the register file now contains 64 entries, for 32 GPRs and 32 FPRs, rather than the 128 it had previously. Several things get simplified - decode1 no longer has to work out the ispr{1,2,o} values, decode_input_reg_{a,b,c} no longer have the t = SPR case, etc. 
Signed-off-by: Paul Mackerras --- common.vhdl | 45 ++++++------------------------------------- decode1.vhdl | 10 +++------- decode2.vhdl | 48 ++++++++++------------------------------------ decode_types.vhdl | 6 +++--- execute1.vhdl | 5 ++--- loadstore1.vhdl | 2 +- logical.vhdl | 2 +- register_file.vhdl | 26 +++++++++++-------------- 8 files changed, 37 insertions(+), 107 deletions(-) diff --git a/common.vhdl b/common.vhdl index 7df451b..06b62e0 100644 --- a/common.vhdl +++ b/common.vhdl @@ -86,30 +86,19 @@ package common is -- GPR indices in the register file (GPR only) subtype gpr_index_t is std_ulogic_vector(4 downto 0); - -- Extended GPR index (can hold an SPR or a FPR) - subtype gspr_index_t is std_ulogic_vector(6 downto 0); + -- Extended GPR index (can hold a GPR or a FPR) + subtype gspr_index_t is std_ulogic_vector(5 downto 0); -- FPR indices subtype fpr_index_t is std_ulogic_vector(4 downto 0); - -- Some SPRs are stored in the register file, they use the magic - -- GPR numbers above 31. + -- FPRs are stored in the register file, using GSPR + -- numbers from 32 to 63. -- - -- The function fast_spr_num() returns the corresponding fast - -- pseudo-GPR number for a given SPR number. The result MSB - -- indicates if this is indeed a fast SPR. If clear, then - -- the SPR is not stored in the GPR file. - -- - -- FPRs are also stored in the register file, using GSPR - -- numbers from 64 to 95. 
- -- - function fast_spr_num(spr: spr_num_t) return gspr_index_t; -- Indices conversion functions function gspr_to_gpr(i: gspr_index_t) return gpr_index_t; function gpr_to_gspr(i: gpr_index_t) return gspr_index_t; - function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t; - function is_fast_spr(s: gspr_index_t) return std_ulogic; function fpr_to_gspr(f: fpr_index_t) return gspr_index_t; -- The XER is split: the common bits (CA, OV, SO, OV32 and CA32) are @@ -271,9 +260,6 @@ package common is stop_mark : std_ulogic; nia: std_ulogic_vector(63 downto 0); insn: std_ulogic_vector(31 downto 0); - ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr - ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR) - ispro: gspr_index_t; -- (G)SPR written with LR or CTR decode: decode_rom_t; br_pred: std_ulogic; -- Branch was predicted to be taken big_endian: std_ulogic; @@ -282,7 +268,6 @@ package common is end record; constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), - ispr1 => (others => '0'), ispr2 => (others => '0'), ispro => (others => '0'), decode => decode_rom_init, br_pred => '0', big_endian => '0', spr_info => spr_id_init, ram_spr => ram_spr_info_init); @@ -783,10 +768,6 @@ package body common is begin return to_integer(unsigned(insn(15 downto 11) & insn(20 downto 16))); end; - function fast_spr_num(spr: spr_num_t) return gspr_index_t is - begin - return "0000000"; - end; function gspr_to_gpr(i: gspr_index_t) return gpr_index_t is begin @@ -795,26 +776,12 @@ package body common is function gpr_to_gspr(i: gpr_index_t) return gspr_index_t is begin - return "00" & i; - end; - - function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t is - begin - if s(5) = '1' then - return s; - else - return gpr_to_gspr(g); - end if; - end; - - function is_fast_spr(s: gspr_index_t) return std_ulogic is - begin - return 
s(5); + return "0" & i; end; function fpr_to_gspr(f: fpr_index_t) return gspr_index_t is begin - return "10" & f; + return "1" & f; end; function tag_match(tag1 : instr_tag_t; tag2 : instr_tag_t) return boolean is diff --git a/decode1.vhdl b/decode1.vhdl index b6cea31..af8cd6c 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -177,7 +177,7 @@ architecture behaviour of decode1 is -- addpcis 2#001# => (ALU, NONE, OP_ADD, CIA, CONST_DXHI4, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- bclr, bcctr, bctar - 2#100# => (ALU, NONE, OP_BCREG, NONE, NONE, NONE, SPR, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), + 2#100# => (ALU, NONE, OP_BCREG, NONE, NONE, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- isync 2#111# => (ALU, NONE, OP_ISYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- rfid @@ -329,7 +329,7 @@ architecture behaviour of decode1 is 2#1001000000# => (ALU, NONE, OP_MCRXRX, NONE, NONE, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mcrxrx 2#0000010011# => (ALU, NONE, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mfcr/mfocrf 2#0001010011# => (ALU, NONE, OP_MFMSR, NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- mfmsr - 2#0101010011# => (ALU, NONE, OP_MFSPR, SPR, NONE, RS, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mfspr + 2#0101010011# => (ALU, NONE, OP_MFSPR, NONE, NONE, RS, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mfspr 2#0100001001# => (DVU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', 
'0', '0', NONE, '0', '0', NONE), -- modud 2#0100001011# => (DVU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- moduw 2#1100001001# => (DVU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- modsd @@ -337,7 +337,7 @@ architecture behaviour of decode1 is 2#0010010000# => (ALU, NONE, OP_MTCRF, NONE, NONE, RS, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtcrf/mtocrf 2#0010010010# => (ALU, NONE, OP_MTMSRD, NONE, NONE, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- mtmsr 2#0010110010# => (ALU, NONE, OP_MTMSRD, NONE, NONE, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtmsrd # ignore top bits and d - 2#0111010011# => (ALU, NONE, OP_MTSPR, NONE, NONE, RS, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtspr + 2#0111010011# => (ALU, NONE, OP_MTSPR, NONE, NONE, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtspr 2#0001001001# => (ALU, NONE, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- mulhd 2#0000001001# => (ALU, NONE, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- mulhdu 2#0001001011# => (ALU, NONE, OP_MUL_H32, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), -- mulhw @@ -670,10 +670,6 @@ begin -- major opcode 31, lots of things v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1)))); - -- Work out ispr1/ispro independent of v.decode since they seem to be critical path - v.ispr1 := fast_spr_num(sprn); - v.ispro := 
fast_spr_num(sprn); - if std_match(f_in.insn(10 downto 1), "01-1010011") then -- mfspr or mtspr -- Make mtspr to slow SPRs single issue diff --git a/decode2.vhdl b/decode2.vhdl index 928ec94..5a8c2b7 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -82,21 +82,11 @@ architecture behaviour of decode2 is constant decode_output_reg_init : decode_output_reg_t := ('0', (others => '0')); function decode_input_reg_a (t : input_reg_a_t; insn_in : std_ulogic_vector(31 downto 0); - ispr : gspr_index_t; instr_addr : std_ulogic_vector(63 downto 0)) return decode_input_reg_t is begin if t = RA or (t = RA_OR_ZERO and insn_ra(insn_in) /= "00000") then return ('1', gpr_to_gspr(insn_ra(insn_in)), (others => '0')); - elsif t = SPR then - -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. - -- If it's all 0, we don't treat it as a dependency as slow SPRs - -- operations are single issue. - -- - assert is_fast_spr(ispr) = '1' or ispr = "0000000" - report "Decode A says SPR but ISPR is invalid:" & - to_hstring(ispr) severity failure; - return (is_fast_spr(ispr), ispr, (others => '0')); elsif t = CIA then return ('0', (others => '0'), instr_addr); elsif HAS_FPU and t = FRA then @@ -106,8 +96,8 @@ architecture behaviour of decode2 is end if; end; - function decode_input_reg_b (t : input_reg_b_t; insn_in : std_ulogic_vector(31 downto 0); - ispr : gspr_index_t) return decode_input_reg_t is + function decode_input_reg_b (t : input_reg_b_t; insn_in : std_ulogic_vector(31 downto 0)) + return decode_input_reg_t is variable ret : decode_input_reg_t; begin case t is @@ -143,14 +133,6 @@ architecture behaviour of decode2 is ret := ('0', (others => '0'), x"00000000000000" & "00" & insn_in(1) & insn_in(15 downto 11)); when CONST_SH32 => ret := ('0', (others => '0'), x"00000000000000" & "000" & insn_in(15 downto 11)); - when SPR => - -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. 
- -- If it's all 0, we don't treat it as a dependency as slow SPRs - -- operations are single issue. - assert is_fast_spr(ispr) = '1' or ispr = "0000000" - report "Decode B says SPR but ISPR is invalid:" & - to_hstring(ispr) severity failure; - ret := (is_fast_spr(ispr), ispr, (others => '0')); when NONE => ret := ('0', (others => '0'), (others => '0')); end case; @@ -183,8 +165,8 @@ architecture behaviour of decode2 is end case; end; - function decode_output_reg (t : output_reg_a_t; insn_in : std_ulogic_vector(31 downto 0); - ispr : gspr_index_t) return decode_output_reg_t is + function decode_output_reg (t : output_reg_a_t; insn_in : std_ulogic_vector(31 downto 0)) + return decode_output_reg_t is begin case t is when RT => @@ -195,18 +177,10 @@ architecture behaviour of decode2 is if HAS_FPU then return ('1', fpr_to_gspr(insn_frt(insn_in))); else - return ('0', "0000000"); + return ('0', "000000"); end if; - when SPR => - -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. - -- If it's all 0, we don't treat it as a dependency as slow SPRs - -- operations are single issue. 
- assert is_fast_spr(ispr) = '1' or ispr = "0000000" - report "Decode B says SPR but ISPR is invalid:" & - to_hstring(ispr) severity failure; - return (is_fast_spr(ispr), ispr); when NONE => - return ('0', "0000000"); + return ('0', "000000"); end case; end; @@ -386,10 +360,10 @@ begin decoded_reg_c <= decode_input_reg_init; decoded_reg_o <= decode_output_reg_init; if d_in.valid = '1' then - decoded_reg_a <= decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, d_in.ispr1, d_in.nia); - decoded_reg_b <= decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, d_in.ispr2); + decoded_reg_a <= decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, d_in.nia); + decoded_reg_b <= decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn); decoded_reg_c <= decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn); - decoded_reg_o <= decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispro); + decoded_reg_o <= decode_output_reg (d_in.decode.output_reg_a, d_in.insn); end if; r_out.read1_enable <= decoded_reg_a.reg_valid; @@ -565,9 +539,7 @@ begin v.e.result_sel := result_select(op); v.e.sub_select := subresult_select(op); if op = OP_MFSPR then - if is_fast_spr(d_in.ispr1) = '1' then - v.e.result_sel := "000"; -- adder_result, effectively a_in - elsif d_in.ram_spr.valid = '1' then + if d_in.ram_spr.valid = '1' then v.e.result_sel := "101"; -- ramspr_result elsif d_in.spr_info.valid = '0' then -- Privileged mfspr to invalid/unimplemented SPR numbers diff --git a/decode_types.vhdl b/decode_types.vhdl index 514bc08..9ee329d 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -22,11 +22,11 @@ package decode_types is OP_BCD, OP_ADDG6S, OP_FETCH_FAILED ); - type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA, FRA); + type input_reg_a_t is (NONE, RA, RA_OR_ZERO, CIA, FRA); type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, - CONST_DXHI4, CONST_DS, CONST_DQ, CONST_M1, CONST_SH, CONST_SH32, SPR, FRB); + CONST_DXHI4, 
CONST_DS, CONST_DQ, CONST_M1, CONST_SH, CONST_SH32, FRB); type input_reg_c_t is (NONE, RS, RCR, FRC, FRS); - type output_reg_a_t is (NONE, RT, RA, SPR, FRT); + type output_reg_a_t is (NONE, RT, RA, FRT); type rc_t is (NONE, ONE, RC); type carry_in_t is (ZERO, CA, OV, ONE); diff --git a/execute1.vhdl b/execute1.vhdl index 5ee830b..dc68806 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -1135,7 +1135,7 @@ begin when OP_DARN => when OP_MFMSR => when OP_MFSPR => - if is_fast_spr(e_in.read_reg1) = '1' or e_in.spr_is_ram = '1' then + if e_in.spr_is_ram = '1' then if e_in.valid = '1' then report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & "=" & to_hstring(alu_result); @@ -1216,8 +1216,7 @@ begin when others => end case; end if; - if e_in.spr_select.valid = '0' and is_fast_spr(e_in.write_reg) = '0' and - e_in.spr_is_ram = '0' then + if e_in.spr_select.valid = '0' and e_in.spr_is_ram = '0' then -- mtspr to unimplemented SPRs should be a nop in -- supervisor mode and a program interrupt for user mode if ex1.msr(MSR_PR) = '1' then diff --git a/loadstore1.vhdl b/loadstore1.vhdl index b556211..9dab15b 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -97,7 +97,7 @@ architecture behave of loadstore1 is mode_32bit => '0', addr => (others => '0'), byte_sel => x"00", second_bytes => x"00", store_data => (others => '0'), instr_tag => instr_tag_init, - write_reg => 7x"00", length => x"0", + write_reg => 6x"00", length => x"0", elt_length => x"0", byte_reverse => '0', brev_mask => "000", sign_extend => '0', update => '0', xerc => xerc_init, reserve => '0', diff --git a/logical.vhdl b/logical.vhdl index 60309ac..77ef29c 100644 --- a/logical.vhdl +++ b/logical.vhdl @@ -167,7 +167,7 @@ begin end if; tmp(7 downto 0) := rs(7 downto 0); when others => - -- e.g. OP_MTSPR + -- e.g. 
OP_MFSPR tmp := rs; end case; diff --git a/register_file.vhdl b/register_file.vhdl index ed856cb..dcce0a4 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -34,7 +34,7 @@ entity register_file is end entity register_file; architecture behaviour of register_file is - type regfile is array(0 to 127) of std_ulogic_vector(63 downto 0); + type regfile is array(0 to 63) of std_ulogic_vector(63 downto 0); signal registers : regfile := (others => (others => '0')); signal rd_port_b : std_ulogic_vector(63 downto 0); signal dbg_data : std_ulogic_vector(63 downto 0); @@ -47,15 +47,11 @@ begin if rising_edge(clk) then if w_in.write_enable = '1' then w_addr := w_in.write_reg; - if HAS_FPU and w_addr(6) = '1' then + if HAS_FPU and w_addr(5) = '1' then report "Writing FPR " & to_hstring(w_addr(4 downto 0)) & " " & to_hstring(w_in.write_data); else - w_addr(6) := '0'; - if w_addr(5) = '0' then - report "Writing GPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data); - else - report "Writing GSPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data); - end if; + w_addr(5) := '0'; + report "Writing GPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data); end if; assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure; registers(to_integer(unsigned(w_addr))) <= w_in.write_data; @@ -73,11 +69,11 @@ begin c_addr := d_in.read3_reg; w_addr := w_in.write_reg; if not HAS_FPU then - -- Make it obvious that we only want 64 GSPRs for a no-FPU implementation - a_addr(6) := '0'; - b_addr(6) := '0'; - c_addr(6) := '0'; - w_addr(6) := '0'; + -- Make it obvious that we only want 32 GSPRs for a no-FPU implementation + a_addr(5) := '0'; + b_addr(5) := '0'; + c_addr(5) := '0'; + w_addr(5) := '0'; end if; if d_in.read1_enable = '1' then report "Reading GPR " & to_hstring(a_addr) & " " & to_hstring(registers(to_integer(unsigned(a_addr)))); @@ -93,7 +89,7 @@ begin if d_in.read2_enable = '0' and dbg_gpr_req = '1' and dbg_ack = '0' then b_addr := 
dbg_gpr_addr; if not HAS_FPU then - b_addr(6) := '0'; + b_addr(5) := '0'; end if; end if; rd_port_b <= registers(to_integer(unsigned(b_addr))); @@ -150,7 +146,7 @@ begin if rising_edge(clk) then log_data <= w_in.write_data & w_in.write_enable & - w_in.write_reg; + '0' & w_in.write_reg; end if; end process; log_out <= log_data; From d0f319290fd22724a06b6db628aa7ee3458ca1bc Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 25 Feb 2022 16:46:34 +1100 Subject: [PATCH 25/30] Restore debug access to SPRs This provides access to the SPRs via the JTAG DMI interface. For now they are still accessed as if they were GPR/FPRs using the same numbering as before (GPRs at 0 - 0x1f, SPRs at 0x20 - 0x2d, FPRs at 0x40 - 0x5f). For XER, debug reads now report the full value, not just the bits that were previously stored in the register file. The "slow" SPR mux is not used for debug reads. Decode2 determines on each cycle whether a debug SPR access will happen next cycle, based on whether there is a request and whether the current instruction accesses the SPR RAM. 
Signed-off-by: Paul Mackerras --- common.vhdl | 2 + core.vhdl | 14 +++++++ core_debug.vhdl | 84 +++++++++++++++++++++++++++++++++---- decode2.vhdl | 29 ++++++++++++- execute1.vhdl | 24 +++++++++++ scripts/mw_debug/mw_debug.c | 2 +- 6 files changed, 144 insertions(+), 11 deletions(-) diff --git a/common.vhdl b/common.vhdl index 06b62e0..d743c2d 100644 --- a/common.vhdl +++ b/common.vhdl @@ -337,6 +337,7 @@ package common is ramspr_wraddr : ramspr_index; ramspr_write_even : std_ulogic; ramspr_write_odd : std_ulogic; + dbg_spr_access : std_ulogic; dec_ctr : std_ulogic; end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := @@ -354,6 +355,7 @@ package common is spr_is_ram => '0', ramspr_even_rdaddr => 0, ramspr_odd_rdaddr => 0, ramspr_rd_odd => '0', ramspr_wraddr => 0, ramspr_write_even => '0', ramspr_write_odd => '0', + dbg_spr_access => '0', dec_ctr => '0', others => (others => '0')); diff --git a/core.vhdl b/core.vhdl index 82c66b4..a91b729 100644 --- a/core.vhdl +++ b/core.vhdl @@ -150,6 +150,10 @@ architecture behave of core is signal dbg_gpr_ack : std_ulogic; signal dbg_gpr_addr : gspr_index_t; signal dbg_gpr_data : std_ulogic_vector(63 downto 0); + signal dbg_spr_req : std_ulogic; + signal dbg_spr_ack : std_ulogic; + signal dbg_spr_addr : std_ulogic_vector(7 downto 0); + signal dbg_spr_data : std_ulogic_vector(63 downto 0); signal ctrl_debug : ctrl_t; @@ -307,6 +311,8 @@ begin execute2_bypass => execute2_bypass, execute2_cr_bypass => execute2_cr_bypass, writeback_bypass => writeback_bypass, + dbg_spr_req => dbg_spr_req, + dbg_spr_addr => dbg_spr_addr, log_out => log_data(119 downto 110) ); decode2_busy_in <= ex1_busy_out; @@ -378,6 +384,10 @@ begin dc_events => dcache_events, ic_events => icache_events, terminate_out => terminate, + dbg_spr_req => dbg_spr_req, + dbg_spr_ack => dbg_spr_ack, + dbg_spr_addr => dbg_spr_addr, + dbg_spr_data => dbg_spr_data, sim_dump => sim_ex_dump, sim_dump_done => sim_cr_dump, log_out => log_data(134 downto 120), @@ 
-504,6 +514,10 @@ begin dbg_gpr_ack => dbg_gpr_ack, dbg_gpr_addr => dbg_gpr_addr, dbg_gpr_data => dbg_gpr_data, + dbg_spr_req => dbg_spr_req, + dbg_spr_ack => dbg_spr_ack, + dbg_spr_addr => dbg_spr_addr, + dbg_spr_data => dbg_spr_data, log_data => log_data, log_read_addr => log_rd_addr, log_read_data => log_rd_data, diff --git a/core_debug.vhdl b/core_debug.vhdl index ff99df4..a1d4a94 100644 --- a/core_debug.vhdl +++ b/core_debug.vhdl @@ -33,12 +33,18 @@ entity core_debug is nia : in std_ulogic_vector(63 downto 0); msr : in std_ulogic_vector(63 downto 0); - -- GSPR register read port + -- GPR/FPR register read port dbg_gpr_req : out std_ulogic; dbg_gpr_ack : in std_ulogic; dbg_gpr_addr : out gspr_index_t; dbg_gpr_data : in std_ulogic_vector(63 downto 0); + -- SPR register read port + dbg_spr_req : out std_ulogic; + dbg_spr_ack : in std_ulogic; + dbg_spr_addr : out std_ulogic_vector(7 downto 0); + dbg_spr_data : in std_ulogic_vector(63 downto 0); + -- Core logging data log_data : in std_ulogic_vector(255 downto 0); log_read_addr : in std_ulogic_vector(31 downto 0); @@ -105,7 +111,10 @@ architecture behave of core_debug is signal do_icreset : std_ulogic; signal terminated : std_ulogic; signal do_gspr_rd : std_ulogic; - signal gspr_index : gspr_index_t; + signal gspr_index : std_ulogic_vector(7 downto 0); + signal gspr_data : std_ulogic_vector(63 downto 0); + + signal spr_index_valid : std_ulogic; signal log_dmi_addr : std_ulogic_vector(31 downto 0) := (others => '0'); signal log_dmi_data : std_ulogic_vector(63 downto 0) := (others => '0'); @@ -119,9 +128,7 @@ architecture behave of core_debug is begin -- Single cycle register accesses on DMI except for GSPR data dmi_ack <= dmi_req when dmi_addr /= DBG_CORE_GSPR_DATA - else dbg_gpr_ack; - dbg_gpr_req <= dmi_req when dmi_addr = DBG_CORE_GSPR_DATA - else '0'; + else dbg_gpr_ack or dbg_spr_ack; -- Status register read composition stat_reg <= (2 => terminated, @@ -129,12 +136,16 @@ begin 0 => stopping, others => '0'); + 
gspr_data <= dbg_gpr_data when gspr_index(5) = '0' else + dbg_spr_data when spr_index_valid = '1' else + (others => '0'); + -- DMI read data mux with dmi_addr select dmi_dout <= stat_reg when DBG_CORE_STAT, nia when DBG_CORE_NIA, msr when DBG_CORE_MSR, - dbg_gpr_data when DBG_CORE_GSPR_DATA, + gspr_data when DBG_CORE_GSPR_DATA, log_write_addr & log_dmi_addr when DBG_CORE_LOG_ADDR, log_dmi_data when DBG_CORE_LOG_DATA, log_dmi_trigger when DBG_CORE_LOG_TRIGGER, @@ -191,7 +202,7 @@ begin terminated <= '0'; end if; elsif dmi_addr = DBG_CORE_GSPR_INDEX then - gspr_index <= dmi_din(gspr_index_t'left downto 0); + gspr_index <= dmi_din(7 downto 0); elsif dmi_addr = DBG_CORE_LOG_ADDR then log_dmi_addr <= dmi_din(31 downto 0); do_dmi_log_rd <= '1'; @@ -226,7 +237,64 @@ begin end if; end process; - dbg_gpr_addr <= gspr_index; + gspr_access: process(clk) + variable valid : std_ulogic; + variable sel : spr_selector; + variable isram : std_ulogic; + variable raddr : ramspr_index; + variable odd : std_ulogic; + begin + if rising_edge(clk) then + if rst = '1' or dmi_req = '0' or dmi_addr /= DBG_CORE_GSPR_DATA then + dbg_gpr_req <= '0'; + dbg_spr_req <= '0'; + else + dbg_gpr_req <= not gspr_index(5); + dbg_spr_req <= gspr_index(5); + end if; + + -- Map 0 - 0x1f to GPRs, 0x20 - 0x3f to SPRs, and 0x40 - 0x5f to FPRs + dbg_gpr_addr <= gspr_index(6) & gspr_index(4 downto 0); + + -- For SPRs, use the same mapping as when the fast SPRs were in the GPR file + valid := '1'; + sel := "000"; + isram := '1'; + raddr := 0; + odd := '0'; + case gspr_index(4 downto 0) is + when 5x"00" => + raddr := RAMSPR_LR; + when 5x"01" => + odd := '1'; + raddr := RAMSPR_CTR; + when 5x"02" | 5x"03" => + odd := gspr_index(0); + raddr := RAMSPR_SRR0; + when 5x"04" | 5x"05" => + odd := gspr_index(0); + raddr := RAMSPR_HSRR0; + when 5x"06" | 5x"07" => + odd := gspr_index(0); + raddr := RAMSPR_SPRG0; + when 5x"08" | 5x"09" => + odd := gspr_index(0); + raddr := RAMSPR_SPRG2; + when 5x"0a" | 5x"0b" => + odd := 
gspr_index(0); + raddr := RAMSPR_HSPRG0; + when 5x"0c" => + isram := '0'; + sel := SPRSEL_XER; + when 5x"0d" => + raddr := RAMSPR_TAR; + when others => + valid := '0'; + end case; + dbg_spr_addr <= isram & sel & std_ulogic_vector(to_unsigned(raddr, 3)) & odd; + spr_index_valid <= valid; + end if; + end process; -- Core control signals generated by the debug module core_stop <= stopping and not do_step; diff --git a/decode2.vhdl b/decode2.vhdl index 5a8c2b7..d91bec5 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -43,6 +43,10 @@ entity decode2 is execute2_cr_bypass : in cr_bypass_data_t; writeback_bypass : in bypass_data_t; + -- Access to SPRs from core_debug module + dbg_spr_req : in std_ulogic; + dbg_spr_addr : in std_ulogic_vector(7 downto 0); + log_out : out std_ulogic_vector(9 downto 0) ); end entity decode2; @@ -60,6 +64,7 @@ architecture behaviour of decode2 is reg_o_valid : std_ulogic; input_ov : std_ulogic; output_ov : std_ulogic; + read_rspr : std_ulogic; end record; constant reg_type_init : reg_type := (e => Decode2ToExecute1Init, repeat => NONE, others => '0'); @@ -347,6 +352,13 @@ begin " tag=" & integer'image(dc2in.e.instr_tag.tag) & std_ulogic'image(dc2in.e.instr_tag.valid); end if; dc2 <= dc2in; + elsif dc2.read_rspr = '0' then + -- Update debug SPR access signals even when stalled + -- if the instruction in dc2.e doesn't read any SPRs. 
+ dc2.e.dbg_spr_access <= dc2in.e.dbg_spr_access; + dc2.e.ramspr_even_rdaddr <= dc2in.e.ramspr_even_rdaddr; + dc2.e.ramspr_odd_rdaddr <= dc2in.e.ramspr_odd_rdaddr; + dc2.e.ramspr_rd_odd <= dc2in.e.ramspr_rd_odd; end if; end if; end process; @@ -381,6 +393,7 @@ begin variable op : insn_type_t; variable valid_in : std_ulogic; variable decctr : std_ulogic; + variable sprs_busy : std_ulogic; begin v := dc2; @@ -389,6 +402,8 @@ begin if dc2.busy = '0' then v.e := Decode2ToExecute1Init; + sprs_busy := '0'; + if d_in.valid = '1' then v.prev_sgl := dc2.sgl_pipe; v.sgl_pipe := d_in.decode.sgl_pipe; @@ -467,6 +482,7 @@ begin v.e.ramspr_odd_rdaddr := RAMSPR_CTR; v.e.ramspr_wraddr := RAMSPR_CTR; v.e.ramspr_write_odd := '1'; + sprs_busy := '1'; end if; if v.e.lr = '1' then -- write LR @@ -484,11 +500,13 @@ begin else v.e.ramspr_even_rdaddr := RAMSPR_TAR; end if; + sprs_busy := '1'; when OP_MFSPR => v.e.ramspr_even_rdaddr := d_in.ram_spr.index; v.e.ramspr_odd_rdaddr := d_in.ram_spr.index; v.e.ramspr_rd_odd := d_in.ram_spr.isodd; v.e.spr_is_ram := d_in.ram_spr.valid; + sprs_busy := d_in.ram_spr.valid; when OP_MTSPR => v.e.ramspr_wraddr := d_in.ram_spr.index; v.e.ramspr_write_even := d_in.ram_spr.valid and not d_in.ram_spr.isodd; @@ -497,8 +515,10 @@ begin when OP_RFID => v.e.ramspr_even_rdaddr := RAMSPR_SRR0; v.e.ramspr_odd_rdaddr := RAMSPR_SRR1; + sprs_busy := '1'; when others => end case; + v.read_rspr := sprs_busy and d_in.valid; case d_in.decode.length is when is1B => @@ -545,8 +565,6 @@ begin -- Privileged mfspr to invalid/unimplemented SPR numbers -- writes the contents of RT back to RT (i.e. 
it's a no-op) v.e.result_sel := "001"; -- logical_result - elsif d_in.spr_info.ispmu = '1' then - v.e.result_sel := "100"; -- pmuspr_result end if; end if; @@ -649,6 +667,13 @@ begin stall_out <= dc2.busy or deferred; + v.e.dbg_spr_access := dbg_spr_req and not v.read_rspr; + if v.e.dbg_spr_access = '1' then + v.e.ramspr_even_rdaddr := to_integer(unsigned(dbg_spr_addr(3 downto 1))); + v.e.ramspr_odd_rdaddr := to_integer(unsigned(dbg_spr_addr(3 downto 1))); + v.e.ramspr_rd_odd := dbg_spr_addr(0); + end if; + -- Update registers dc2in <= v; diff --git a/execute1.vhdl b/execute1.vhdl index dc68806..20efef6 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -55,6 +55,12 @@ entity execute1 is dc_events : in DcacheEventType; ic_events : in IcacheEventType; + -- Access to SPRs from core_debug module + dbg_spr_req : in std_ulogic; + dbg_spr_ack : out std_ulogic; + dbg_spr_addr : in std_ulogic_vector(7 downto 0); + dbg_spr_data : out std_ulogic_vector(63 downto 0); + -- debug sim_dump : in std_ulogic; sim_dump_done : out std_ulogic; @@ -604,6 +610,24 @@ begin end if; end process; + ex_dbg_spr: process(clk) + begin + if rising_edge(clk) then + if rst = '0' and dbg_spr_req = '1' then + if e_in.dbg_spr_access = '1' and dbg_spr_ack = '0' then + if dbg_spr_addr(7) = '1' then + dbg_spr_data <= ramspr_result; + else + dbg_spr_data <= assemble_xer(xerc_in, ctrl.xer_low); + end if; + dbg_spr_ack <= '1'; + end if; + else + dbg_spr_ack <= '0'; + end if; + end if; + end process; + -- Data path for integer instructions (first execute stage) execute1_dp: process(all) variable a_inv : std_ulogic_vector(63 downto 0); diff --git a/scripts/mw_debug/mw_debug.c b/scripts/mw_debug/mw_debug.c index 6271760..ef5b1ec 100644 --- a/scripts/mw_debug/mw_debug.c +++ b/scripts/mw_debug/mw_debug.c @@ -548,7 +548,7 @@ static const char *fast_spr_names[] = { "lr", "ctr", "srr0", "srr1", "hsrr0", "hsrr1", "sprg0", "sprg1", "sprg2", "sprg3", - "hsprg0", "hsprg1", "xer" + "hsprg0", "hsprg1", "xer", "tar", }; 
static void gpr_read(uint64_t reg, uint64_t count) From af814a0d5eedf433c52fc9674b1aa1241069f9be Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 24 Feb 2022 11:37:17 +1100 Subject: [PATCH 26/30] Provide debug access to SPRs in loadstore1 and mmu They are accessible as GSPR 0x3c - PID, 0x3d - PTCR, 0x3e - DSISR and 0x3f - DAR. Signed-off-by: Paul Mackerras --- common.vhdl | 4 ++- core.vhdl | 12 ++++++++ core_debug.vhdl | 29 +++++++++++++----- loadstore1.vhdl | 61 +++++++++++++++++++++++++++++-------- mmu.vhdl | 11 +++---- scripts/mw_debug/mw_debug.c | 8 ++++- 6 files changed, 96 insertions(+), 29 deletions(-) diff --git a/common.vhdl b/common.vhdl index d743c2d..39ebfb1 100644 --- a/common.vhdl +++ b/common.vhdl @@ -547,7 +547,9 @@ package common is iside : std_ulogic; load : std_ulogic; priv : std_ulogic; - sprn : std_ulogic_vector(9 downto 0); + ric : std_ulogic_vector(1 downto 0); + sprnf : std_ulogic; + sprnt : std_ulogic; addr : std_ulogic_vector(63 downto 0); rs : std_ulogic_vector(63 downto 0); end record; diff --git a/core.vhdl b/core.vhdl index a91b729..641c12d 100644 --- a/core.vhdl +++ b/core.vhdl @@ -154,6 +154,10 @@ architecture behave of core is signal dbg_spr_ack : std_ulogic; signal dbg_spr_addr : std_ulogic_vector(7 downto 0); signal dbg_spr_data : std_ulogic_vector(63 downto 0); + signal dbg_ls_spr_req : std_ulogic; + signal dbg_ls_spr_ack : std_ulogic; + signal dbg_ls_spr_addr : std_ulogic_vector(1 downto 0); + signal dbg_ls_spr_data : std_ulogic_vector(63 downto 0); signal ctrl_debug : ctrl_t; @@ -432,6 +436,10 @@ begin m_in => mmu_to_loadstore1, dc_stall => dcache_stall_out, events => loadstore_events, + dbg_spr_req => dbg_ls_spr_req, + dbg_spr_ack => dbg_ls_spr_ack, + dbg_spr_addr => dbg_ls_spr_addr, + dbg_spr_data => dbg_ls_spr_data, log_out => log_data(149 downto 140) ); @@ -518,6 +526,10 @@ begin dbg_spr_ack => dbg_spr_ack, dbg_spr_addr => dbg_spr_addr, dbg_spr_data => dbg_spr_data, + dbg_ls_spr_req => dbg_ls_spr_req, + 
dbg_ls_spr_ack => dbg_ls_spr_ack, + dbg_ls_spr_addr => dbg_ls_spr_addr, + dbg_ls_spr_data => dbg_ls_spr_data, log_data => log_data, log_read_addr => log_rd_addr, log_read_data => log_rd_data, diff --git a/core_debug.vhdl b/core_debug.vhdl index a1d4a94..c060f74 100644 --- a/core_debug.vhdl +++ b/core_debug.vhdl @@ -39,12 +39,18 @@ entity core_debug is dbg_gpr_addr : out gspr_index_t; dbg_gpr_data : in std_ulogic_vector(63 downto 0); - -- SPR register read port + -- SPR register read port for SPRs in execute1 dbg_spr_req : out std_ulogic; dbg_spr_ack : in std_ulogic; dbg_spr_addr : out std_ulogic_vector(7 downto 0); dbg_spr_data : in std_ulogic_vector(63 downto 0); + -- SPR register read port for SPRs in loadstore1 and mmu + dbg_ls_spr_req : out std_ulogic; + dbg_ls_spr_ack : in std_ulogic; + dbg_ls_spr_addr : out std_ulogic_vector(1 downto 0); + dbg_ls_spr_data : in std_ulogic_vector(63 downto 0); + -- Core logging data log_data : in std_ulogic_vector(255 downto 0); log_read_addr : in std_ulogic_vector(31 downto 0); @@ -128,7 +134,7 @@ architecture behave of core_debug is begin -- Single cycle register accesses on DMI except for GSPR data dmi_ack <= dmi_req when dmi_addr /= DBG_CORE_GSPR_DATA - else dbg_gpr_ack or dbg_spr_ack; + else dbg_gpr_ack or dbg_spr_ack or dbg_ls_spr_ack; -- Status register read composition stat_reg <= (2 => terminated, @@ -137,6 +143,7 @@ begin others => '0'); gspr_data <= dbg_gpr_data when gspr_index(5) = '0' else + dbg_ls_spr_data when dbg_ls_spr_req = '1' else dbg_spr_data when spr_index_valid = '1' else (others => '0'); @@ -245,16 +252,22 @@ begin variable odd : std_ulogic; begin if rising_edge(clk) then - if rst = '1' or dmi_req = '0' or dmi_addr /= DBG_CORE_GSPR_DATA then - dbg_gpr_req <= '0'; - dbg_spr_req <= '0'; - else - dbg_gpr_req <= not gspr_index(5); - dbg_spr_req <= gspr_index(5); + dbg_gpr_req <= '0'; + dbg_spr_req <= '0'; + dbg_ls_spr_req <= '0'; + if rst = '0' and dmi_req = '1' and dmi_addr = DBG_CORE_GSPR_DATA then + if 
gspr_index(5) = '0' then + dbg_gpr_req <= '1'; + elsif gspr_index(4 downto 2) = "111" then + dbg_ls_spr_req <= '1'; + else + dbg_spr_req <= '1'; + end if; end if; -- Map 0 - 0x1f to GPRs, 0x20 - 0x3f to SPRs, and 0x40 - 0x5f to FPRs dbg_gpr_addr <= gspr_index(6) & gspr_index(4 downto 0); + dbg_ls_spr_addr <= gspr_index(1 downto 0); -- For SPRs, use the same mapping as when the fast SPRs were in the GPR file valid := '1'; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 9dab15b..92ebeec 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -35,6 +35,12 @@ entity loadstore1 is events : out Loadstore1EventType; + -- Access to SPRs from core_debug module + dbg_spr_req : in std_ulogic; + dbg_spr_ack : out std_ulogic; + dbg_spr_addr : in std_ulogic_vector(1 downto 0); + dbg_spr_data : out std_ulogic_vector(63 downto 0); + log_out : out std_ulogic_vector(9 downto 0) ); end loadstore1; @@ -123,6 +129,8 @@ architecture behave of loadstore1 is one_cycle : std_ulogic; wr_sel : std_ulogic_vector(1 downto 0); addr0 : std_ulogic_vector(63 downto 0); + sprsel : std_ulogic_vector(1 downto 0); + dbg_spr_rd : std_ulogic; end record; type reg_stage3_t is record @@ -146,6 +154,8 @@ architecture behave of loadstore1 is intr_vec : integer range 0 to 16#fff#; srr1 : std_ulogic_vector(15 downto 0); events : Loadstore1EventType; + dbg_spr : std_ulogic_vector(63 downto 0); + dbg_spr_ack : std_ulogic; end record; signal req_in : request_t; @@ -664,6 +674,20 @@ begin v.busy := '1'; end if; + v.dbg_spr_rd := dbg_spr_req and not (v.req.valid and v.req.read_spr); + if v.dbg_spr_rd = '0' then + v.sprsel(1) := v.req.sprn(1); + if v.req.sprn(1) = '1' then + -- DSISR and DAR + v.sprsel(0) := v.req.sprn(0); + else + -- PID and PTCR + v.sprsel(0) := v.req.sprn(8); + end if; + else + v.sprsel := dbg_spr_addr; + end if; + r2in <= v; end process; @@ -763,21 +787,26 @@ begin v.load_data := data_permuted; end if; + -- SPR mux + if r2.sprsel(1) = '1' then + if r2.sprsel(0) = '0' then + sprval := 
x"00000000" & r3.dsisr; + else + sprval := r3.dar; + end if; + else + sprval := m_in.sprval; + end if; + if dbg_spr_req = '0' then + v.dbg_spr_ack := '0'; + elsif r2.dbg_spr_rd = '1' and r3.dbg_spr_ack = '0' then + v.dbg_spr := sprval; + v.dbg_spr_ack := '1'; + end if; + if r2.req.valid = '1' then if r2.req.read_spr = '1' then write_enable := '1'; - -- partial decode on SPR number should be adequate given - -- the restricted set that get sent down this path - if r2.req.sprn(8) = '0' and r2.req.sprn(5) = '0' then - if r2.req.sprn(0) = '0' then - sprval := x"00000000" & r3.dsisr; - else - sprval := r3.dar; - end if; - else - -- reading one of the SPRs in the MMU - sprval := m_in.sprval; - end if; end if; if r2.req.align_intr = '1' then -- generate alignment interrupt @@ -940,8 +969,10 @@ begin m_out.load <= r2.req.load; m_out.priv <= r2.req.priv_mode; m_out.tlbie <= r2.req.tlbie; + m_out.ric <= r2.req.sprn(3 downto 2); m_out.mtspr <= mmu_mtspr; - m_out.sprn <= r2.req.sprn; + m_out.sprnf <= r2.sprsel(0); + m_out.sprnt <= r2.req.sprn(8); m_out.addr <= r2.req.addr; m_out.slbia <= r2.req.is_slbia; m_out.rs <= r2.req.store_data; @@ -967,6 +998,10 @@ begin flush <= exception; + -- SPR values for core_debug + dbg_spr_data <= r3.dbg_spr; + dbg_spr_ack <= r3.dbg_spr_ack; + -- Update registers r3in <= v; diff --git a/mmu.vhdl b/mmu.vhdl index d80caf4..d95cd3c 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -81,8 +81,8 @@ architecture behave of mmu is begin -- Multiplex internal SPR values back to loadstore1, selected - -- by l_in.sprn. - l_out.sprval <= r.ptcr when l_in.sprn(8) = '1' else x"00000000" & r.pid; + -- by l_in.sprnf. 
+ l_out.sprval <= r.ptcr when l_in.sprnf = '1' else x"00000000" & r.pid; mmu_0: process(clk) begin @@ -259,9 +259,8 @@ begin -- RB[IS] != 0 or RB[AP] != 0, or for slbia v.inval_all := l_in.slbia or l_in.addr(11) or l_in.addr(10) or l_in.addr(7) or l_in.addr(6) or l_in.addr(5); - -- The RIC field of the tlbie instruction comes across on the - -- sprn bus as bits 2--3. RIC=2 flushes process table caches. - if l_in.sprn(3) = '1' then + -- RIC=2 or 3 flushes process table caches. + if l_in.ric(1) = '1' then v.pt0_valid := '0'; v.pt3_valid := '0'; v.ptb_valid := '0'; @@ -291,7 +290,7 @@ begin -- Move to PID needs to invalidate L1 TLBs and cached -- pgtbl0 value. Move to PTCR does that plus -- invalidating the cached pgtbl3 and prtbl values as well. - if l_in.sprn(8) = '0' then + if l_in.sprnt = '0' then v.pid := l_in.rs(31 downto 0); else v.ptcr := l_in.rs; diff --git a/scripts/mw_debug/mw_debug.c b/scripts/mw_debug/mw_debug.c index ef5b1ec..81e8094 100644 --- a/scripts/mw_debug/mw_debug.c +++ b/scripts/mw_debug/mw_debug.c @@ -551,6 +551,10 @@ static const char *fast_spr_names[] = "hsprg0", "hsprg1", "xer", "tar", }; +static const char *ldst_spr_names[] = { + "pidr", "ptcr", "dsisr", "dar" +}; + static void gpr_read(uint64_t reg, uint64_t count) { uint64_t data; @@ -566,8 +570,10 @@ static void gpr_read(uint64_t reg, uint64_t count) printf("r%"PRId64, reg); else if ((reg - 32) < sizeof(fast_spr_names) / sizeof(fast_spr_names[0])) printf("%s", fast_spr_names[reg - 32]); - else if (reg < 64) + else if (reg < 60) printf("gspr%"PRId64, reg); + else if (reg < 64) + printf("%s", ldst_spr_names[reg - 60]); else printf("FPR%"PRId64, reg - 64); printf(":\t%016"PRIx64"\n", data); From 047be5c0c3b2f12c9321412518e17b7267fe14ea Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 23 Mar 2022 18:02:28 +0000 Subject: [PATCH 27/30] loadstore1: Do SPR reading in stage 2 rather than stage 3 This eliminates one leg of the output value multiplexer, and seems to improve timing slightly 
on the A7-100. Since SPR values are written in stage 3 and read in stage 2, an mfspr immediately following an mtspr to the same SPR won't give the correct value. To avoid this, we make mtspr to the load/store SPRs single issue in decode1. Signed-off-by: Paul Mackerras --- decode1.vhdl | 4 ++ loadstore1.vhdl | 114 ++++++++++++++++++++++++------------------------ 2 files changed, 62 insertions(+), 56 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index af8cd6c..5ee7b57 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -681,6 +681,10 @@ begin when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PTCR => vi.override_decode.unit := LDST; vi.override_unit := '1'; + -- make mtspr to loadstore SPRs single-issue + if f_in.insn(8) = '1' then + vi.force_single := '1'; + end if; when others => end case; end if; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 92ebeec..0a2f088 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -90,7 +90,8 @@ architecture behave of loadstore1 is virt_mode : std_ulogic; priv_mode : std_ulogic; load_sp : std_ulogic; - sprn : std_ulogic_vector(9 downto 0); + sprsel : std_ulogic_vector(1 downto 0); + ric : std_ulogic_vector(1 downto 0); is_slbia : std_ulogic; align_intr : std_ulogic; dword_index : std_ulogic; @@ -109,7 +110,7 @@ architecture behave of loadstore1 is xerc => xerc_init, reserve => '0', atomic => '0', atomic_last => '0', rc => '0', nc => '0', virt_mode => '0', priv_mode => '0', load_sp => '0', - sprn => 10x"0", is_slbia => '0', align_intr => '0', + sprsel => "00", ric => "00", is_slbia => '0', align_intr => '0', dword_index => '0', two_dwords => '0', incomplete => '0'); type reg_stage1_t is record @@ -130,7 +131,8 @@ architecture behave of loadstore1 is wr_sel : std_ulogic_vector(1 downto 0); addr0 : std_ulogic_vector(63 downto 0); sprsel : std_ulogic_vector(1 downto 0); - dbg_spr_rd : std_ulogic; + dbg_spr : std_ulogic_vector(63 downto 0); + dbg_spr_ack: std_ulogic; end record; type reg_stage3_t is record @@ -154,8 +156,6 @@ 
architecture behave of loadstore1 is intr_vec : integer range 0 to 16#fff#; srr1 : std_ulogic_vector(15 downto 0); events : Loadstore1EventType; - dbg_spr : std_ulogic_vector(63 downto 0); - dbg_spr_ack : std_ulogic; end record; signal req_in : request_t; @@ -287,7 +287,8 @@ begin r1.req.instr_fault <= '0'; r1.req.load <= '0'; r1.req.priv_mode <= '0'; - r1.req.sprn <= (others => '0'); + r1.req.sprsel <= "00"; + r1.req.ric <= "00"; r1.req.xerc <= xerc_init; r2.req.valid <= '0'; @@ -297,7 +298,8 @@ begin r2.req.instr_fault <= '0'; r2.req.load <= '0'; r2.req.priv_mode <= '0'; - r2.req.sprn <= (others => '0'); + r2.req.sprsel <= "00"; + r2.req.ric <= "00"; r2.req.xerc <= xerc_init; r2.wait_dc <= '0'; @@ -418,7 +420,14 @@ begin v.nc := l_in.ci; v.virt_mode := l_in.virt_mode; v.priv_mode := l_in.priv_mode; - v.sprn := sprn; + v.ric := l_in.insn(19 downto 18); + if sprn(1) = '1' then + -- DSISR and DAR + v.sprsel := '1' & sprn(0); + else + -- PID and PTCR + v.sprsel := '0' & sprn(8); + end if; lsu_sum := std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)); @@ -494,7 +503,7 @@ begin v.read_spr := '1'; when OP_MTSPR => v.write_spr := '1'; - v.mmu_op := sprn(8) or sprn(5); + v.mmu_op := not sprn(1); when OP_FETCH_FAILED => -- send it to the MMU to do the radix walk v.instr_fault := '1'; @@ -605,6 +614,9 @@ begin variable idx : unsigned(2 downto 0); variable byte_offset : unsigned(2 downto 0); variable interrupt : std_ulogic; + variable dbg_spr_rd : std_ulogic; + variable sprsel : std_ulogic_vector(1 downto 0); + variable sprval : std_ulogic_vector(63 downto 0); begin v := r2; @@ -617,6 +629,28 @@ begin store_data(i * 8 + 7 downto i * 8) <= r1.req.store_data(j + 7 downto j); end loop; + dbg_spr_rd := dbg_spr_req and not (r1.req.valid and r1.req.read_spr); + if dbg_spr_rd = '0' then + sprsel := r1.req.sprsel; + else + sprsel := dbg_spr_addr; + end if; + if sprsel(1) = '1' then + if sprsel(0) = '0' then + sprval := x"00000000" & r3.dsisr; + else + sprval := r3.dar; + 
end if; + else + sprval := m_in.sprval; + end if; + if dbg_spr_req = '0' then + v.dbg_spr_ack := '0'; + elsif dbg_spr_rd = '1' and r2.dbg_spr_ack = '0' then + v.dbg_spr := sprval; + v.dbg_spr_ack := '1'; + end if; + if (dc_stall or d_in.error or r2.busy or l_in.e2stall) = '0' then if r1.req.valid = '0' or r1.issued = '1' or r1.req.dc_req = '0' then v.req := r1.req; @@ -627,14 +661,15 @@ begin v.wait_mmu := r1.req.valid and r1.req.mmu_op; v.busy := r1.req.valid and r1.req.mmu_op; v.one_cycle := r1.req.valid and not (r1.req.dc_req or r1.req.mmu_op); - if r1.req.read_spr = '1' then + if r1.req.do_update = '1' or r1.req.store = '1' or r1.req.read_spr = '1' then v.wr_sel := "00"; - elsif r1.req.do_update = '1' or r1.req.store = '1' then - v.wr_sel := "01"; elsif r1.req.load_sp = '1' then - v.wr_sel := "10"; + v.wr_sel := "01"; else - v.wr_sel := "11"; + v.wr_sel := "10"; + end if; + if r1.req.read_spr = '1' then + v.addr0 := sprval; end if; -- Work out load formatter controls for next cycle @@ -674,21 +709,11 @@ begin v.busy := '1'; end if; - v.dbg_spr_rd := dbg_spr_req and not (v.req.valid and v.req.read_spr); - if v.dbg_spr_rd = '0' then - v.sprsel(1) := v.req.sprn(1); - if v.req.sprn(1) = '1' then - -- DSISR and DAR - v.sprsel(0) := v.req.sprn(0); - else - -- PID and PTCR - v.sprsel(0) := v.req.sprn(8); - end if; - else - v.sprsel := dbg_spr_addr; - end if; - r2in <= v; + + -- SPR values for core_debug + dbg_spr_data <= r2.dbg_spr; + dbg_spr_ack <= r2.dbg_spr_ack; end process; -- Processing done in the third cycle of a load/store instruction. 
@@ -787,22 +812,6 @@ begin v.load_data := data_permuted; end if; - -- SPR mux - if r2.sprsel(1) = '1' then - if r2.sprsel(0) = '0' then - sprval := x"00000000" & r3.dsisr; - else - sprval := r3.dar; - end if; - else - sprval := m_in.sprval; - end if; - if dbg_spr_req = '0' then - v.dbg_spr_ack := '0'; - elsif r2.dbg_spr_rd = '1' and r3.dbg_spr_ack = '0' then - v.dbg_spr := sprval; - v.dbg_spr_ack := '1'; - end if; if r2.req.valid = '1' then if r2.req.read_spr = '1' then @@ -819,7 +828,7 @@ begin write_enable := '1'; end if; if r2.req.write_spr = '1' and r2.req.mmu_op = '0' then - if r2.req.sprn(0) = '0' then + if r2.req.sprsel(0) = '0' then v.dsisr := r2.req.store_data(31 downto 0); else v.dar := r2.req.store_data; @@ -917,12 +926,9 @@ begin case r2.wr_sel is when "00" => - -- mfspr result - write_data := sprval; - when "01" => -- update reg write_data := r2.addr0; - when "10" => + when "01" => -- lfs result write_data := load_dp_data; when others => @@ -969,10 +975,10 @@ begin m_out.load <= r2.req.load; m_out.priv <= r2.req.priv_mode; m_out.tlbie <= r2.req.tlbie; - m_out.ric <= r2.req.sprn(3 downto 2); + m_out.ric <= r2.req.ric; m_out.mtspr <= mmu_mtspr; - m_out.sprnf <= r2.sprsel(0); - m_out.sprnt <= r2.req.sprn(8); + m_out.sprnf <= r1.req.sprsel(0); + m_out.sprnt <= r2.req.sprsel(0); m_out.addr <= r2.req.addr; m_out.slbia <= r2.req.is_slbia; m_out.rs <= r2.req.store_data; @@ -998,10 +1004,6 @@ begin flush <= exception; - -- SPR values for core_debug - dbg_spr_data <= r3.dbg_spr; - dbg_spr_ack <= r3.dbg_spr_ack; - -- Update registers r3in <= v; From 06c13d4988fee4ec1f5bf089ad71f2acc2883818 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 21 Feb 2022 19:29:09 +1100 Subject: [PATCH 28/30] decode1: Work out register addresses in decode1 This adds some relatively simple logic to decode1 to compute the GPR/FPR addresses that an instruction will access. It always computes three addresses regardless of whether the instruction will actually use all of them. 
The main things it computes are whether the instruction uses the RS field or the RC field for the 3rd operand, and whether the operands are FPRs or GPRs (it is possible for RS to be an FPR but RA and RB to be GPRs, as for example with stfdx). At the moment all we do with these computed register addresses is to assert that they are identical to the ones coming from decode2 one cycle later. Signed-off-by: Paul Mackerras --- common.vhdl | 6 ++++++ core.vhdl | 4 ++++ decode1.vhdl | 37 +++++++++++++++++++++++++++++++++++++ register_file.vhdl | 19 +++++++++++++++++++ 4 files changed, 66 insertions(+) diff --git a/common.vhdl b/common.vhdl index 39ebfb1..0349a6e 100644 --- a/common.vhdl +++ b/common.vhdl @@ -276,6 +276,12 @@ package common is redirect_nia : std_ulogic_vector(63 downto 0); end record; + type Decode1ToRegisterFileType is record + reg_1_addr : gspr_index_t; + reg_2_addr : gspr_index_t; + reg_3_addr : gspr_index_t; + end record; + type bypass_data_t is record tag : instr_tag_t; data : std_ulogic_vector(63 downto 0); diff --git a/core.vhdl b/core.vhdl index 641c12d..764141a 100644 --- a/core.vhdl +++ b/core.vhdl @@ -63,6 +63,7 @@ architecture behave of core is -- decode signals signal decode1_to_decode2: Decode1ToDecode2Type; signal decode1_to_fetch1: Decode1ToFetch1Type; + signal decode1_to_register_file: Decode1ToRegisterFileType; signal decode2_to_execute1: Decode2ToExecute1Type; -- register file signals @@ -285,6 +286,7 @@ begin f_in => icache_to_decode1, d_out => decode1_to_decode2, f_out => decode1_to_fetch1, + r_out => decode1_to_register_file, log_out => log_data(109 downto 97) ); @@ -329,6 +331,8 @@ begin ) port map ( clk => clk, + stall => decode2_stall_out, + d1_in => decode1_to_register_file, d_in => decode2_to_register_file, d_out => register_file_to_decode2, w_in => writeback_to_register_file, diff --git a/decode1.vhdl b/decode1.vhdl index 5ee7b57..36d511b 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -5,6 +5,7 @@ use ieee.numeric_std.all; 
library work; use work.common.all; use work.decode_types.all; +use work.insn_helpers.all; entity decode1 is generic ( @@ -24,6 +25,7 @@ entity decode1 is f_in : in IcacheToDecode1Type; f_out : out Decode1ToFetch1Type; d_out : out Decode1ToDecode2Type; + r_out : out Decode1ToRegisterFileType; log_out : out std_ulogic_vector(12 downto 0) ); end entity decode1; @@ -628,6 +630,7 @@ begin decode1_1: process(all) variable v : Decode1ToDecode2Type; + variable vr : Decode1ToRegisterFileType; variable vi : reg_internal_t; variable majorop : major_opcode_t; variable minor4op : std_ulogic_vector(10 downto 0); @@ -636,6 +639,8 @@ begin variable br_target : std_ulogic_vector(61 downto 0); variable br_offset : signed(23 downto 0); variable bv : br_predictor_t; + variable fprs, fprabc : std_ulogic; + variable in3rc : std_ulogic; begin v := Decode1ToDecode2Init; vi := reg_internal_t_init; @@ -646,6 +651,10 @@ begin v.stop_mark := f_in.stop_mark; v.big_endian := f_in.big_endian; + fprs := '0'; + fprabc := '0'; + in3rc := '0'; + if f_in.valid = '1' then report "Decode insn " & to_hstring(f_in.insn) & " at " & to_hstring(f_in.nia); end if; @@ -665,6 +674,7 @@ begin minor4op := f_in.insn(5 downto 0) & f_in.insn(10 downto 6); vi.override := not decode_op_4_valid(to_integer(unsigned(minor4op))); v.decode := decode_op_4_array(to_integer(unsigned(f_in.insn(5 downto 0)))); + in3rc := '1'; when 31 => -- major opcode 31, lots of things @@ -688,6 +698,10 @@ begin when others => end case; end if; + if HAS_FPU and std_match(f_in.insn(10 downto 1), "1----10111") then + -- lower half of column 23 has FP loads and stores + fprs := '1'; + end if; when 16 => -- Predict backward branches as taken, forward as untaken @@ -715,6 +729,12 @@ begin when 30 => v.decode := decode_op_30_array(to_integer(unsigned(f_in.insn(4 downto 1)))); + when 52 | 53 | 54 | 55 => + -- stfd[u] and stfs[u] + if HAS_FPU then + fprs := '1'; + end if; + when 58 => v.decode := decode_op_58_array(to_integer(unsigned(f_in.insn(1 
downto 0)))); @@ -725,6 +745,9 @@ begin if f_in.insn(5) = '0' and not std_match(f_in.insn(10 downto 1), "11-1001110") then vi.override := '1'; end if; + in3rc := '1'; + fprabc := '1'; + fprs := '1'; end if; when 62 => @@ -738,11 +761,23 @@ begin else v.decode := decode_op_63h_array(to_integer(unsigned(f_in.insn(4 downto 1)))); end if; + in3rc := '1'; + fprabc := '1'; + fprs := '1'; end if; when others => end case; + -- Work out GPR/FPR read addresses + vr.reg_1_addr := fprabc & insn_ra(f_in.insn); + vr.reg_2_addr := fprabc & insn_rb(f_in.insn); + if in3rc = '1' then + vr.reg_3_addr := fprabc & insn_rcreg(f_in.insn); + else + vr.reg_3_addr := fprs & insn_rs(f_in.insn); + end if; + if f_in.fetch_failed = '1' then v.valid := '1'; vi.override := '1'; @@ -788,6 +823,8 @@ begin f_out.redirect <= br.predict; f_out.redirect_nia <= br_target & "00"; flush_out <= bv.predict or br.predict; + + r_out <= vr; end process; d1_log: if LOG_LENGTH > 0 generate diff --git a/register_file.vhdl b/register_file.vhdl index dcce0a4..bc40c3f 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -14,7 +14,9 @@ entity register_file is ); port( clk : in std_logic; + stall : in std_ulogic; + d1_in : in Decode1ToRegisterFileType; d_in : in Decode2ToRegisterFileType; d_out : out RegisterFileToDecode2Type; @@ -39,9 +41,13 @@ architecture behaviour of register_file is signal rd_port_b : std_ulogic_vector(63 downto 0); signal dbg_data : std_ulogic_vector(63 downto 0); signal dbg_ack : std_ulogic; + signal addr_1_reg : gspr_index_t; + signal addr_2_reg : gspr_index_t; + signal addr_3_reg : gspr_index_t; begin -- synchronous writes register_write_0: process(clk) + variable a_addr, b_addr, c_addr : gspr_index_t; variable w_addr : gspr_index_t; begin if rising_edge(clk) then @@ -56,6 +62,19 @@ begin assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure; registers(to_integer(unsigned(w_addr))) <= w_in.write_data; end if; + + a_addr := d1_in.reg_1_addr; + b_addr := 
d1_in.reg_2_addr; + c_addr := d1_in.reg_3_addr; + + if stall = '0' then + addr_1_reg <= a_addr; + addr_2_reg <= b_addr; + addr_3_reg <= c_addr; + end if; + assert (d_in.read1_enable = '0') or (d_in.read1_reg = addr_1_reg) severity failure; + assert (d_in.read2_enable = '0') or (d_in.read2_reg = addr_2_reg) severity failure; + assert (d_in.read3_enable = '0') or (d_in.read3_reg = addr_3_reg) severity failure; end if; end process register_write_0; From 1d7de2f1dae295364848940f31c991c8b095f4aa Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 22 Feb 2022 09:30:05 +1100 Subject: [PATCH 29/30] register_file: Make read access to register file synchronous With this, the register RAM is read synchronously using the addresses supplied by decode1. That means the register RAM can now be block RAM rather than LUT RAM. Debug accesses are done via the B port on cycles when decode1 indicates that there is no valid instruction or the instruction doesn't use a [F]RB operand. We latch the addresses being read in each cycle and use the same address next cycle if stalled. Data that is being written is latched and a multiplexer on each read port then supplies the latched write data if the read address for that port equals the write address. 
Signed-off-by: Paul Mackerras --- common.vhdl | 3 ++ decode1.vhdl | 14 ++++++ register_file.vhdl | 117 +++++++++++++++++++++++++++++++-------------- 3 files changed, 99 insertions(+), 35 deletions(-) diff --git a/common.vhdl b/common.vhdl index 0349a6e..4d6cb91 100644 --- a/common.vhdl +++ b/common.vhdl @@ -280,6 +280,9 @@ package common is reg_1_addr : gspr_index_t; reg_2_addr : gspr_index_t; reg_3_addr : gspr_index_t; + read_1_enable : std_ulogic; + read_2_enable : std_ulogic; + read_3_enable : std_ulogic; end record; type bypass_data_t is record diff --git a/decode1.vhdl b/decode1.vhdl index 36d511b..cc93dfc 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -641,6 +641,7 @@ begin variable bv : br_predictor_t; variable fprs, fprabc : std_ulogic; variable in3rc : std_ulogic; + variable may_read_rb : std_ulogic; begin v := Decode1ToDecode2Init; vi := reg_internal_t_init; @@ -654,6 +655,7 @@ begin fprs := '0'; fprabc := '0'; in3rc := '0'; + may_read_rb := '0'; if f_in.valid = '1' then report "Decode insn " & to_hstring(f_in.insn) & " at " & to_hstring(f_in.nia); @@ -675,10 +677,16 @@ begin vi.override := not decode_op_4_valid(to_integer(unsigned(minor4op))); v.decode := decode_op_4_array(to_integer(unsigned(f_in.insn(5 downto 0)))); in3rc := '1'; + may_read_rb := '1'; + + when 23 => + -- rlwnm[.] 
+ may_read_rb := '1'; when 31 => -- major opcode 31, lots of things v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1)))); + may_read_rb := '1'; if std_match(f_in.insn(10 downto 1), "01-1010011") then -- mfspr or mtspr @@ -728,6 +736,7 @@ begin when 30 => v.decode := decode_op_30_array(to_integer(unsigned(f_in.insn(4 downto 1)))); + may_read_rb := f_in.insn(4); when 52 | 53 | 54 | 55 => -- stfd[u] and stfs[u] @@ -748,6 +757,7 @@ begin in3rc := '1'; fprabc := '1'; fprs := '1'; + may_read_rb := '1'; end if; when 62 => @@ -764,6 +774,7 @@ begin in3rc := '1'; fprabc := '1'; fprs := '1'; + may_read_rb := '1'; end if; when others => @@ -777,6 +788,9 @@ begin else vr.reg_3_addr := fprs & insn_rs(f_in.insn); end if; + vr.read_1_enable := f_in.valid and not f_in.fetch_failed; + vr.read_2_enable := f_in.valid and not f_in.fetch_failed and may_read_rb; + vr.read_3_enable := f_in.valid and not f_in.fetch_failed; if f_in.fetch_failed = '1' then v.valid := '1'; diff --git a/register_file.vhdl b/register_file.vhdl index bc40c3f..a8ddee2 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -38,17 +38,27 @@ end entity register_file; architecture behaviour of register_file is type regfile is array(0 to 63) of std_ulogic_vector(63 downto 0); signal registers : regfile := (others => (others => '0')); - signal rd_port_b : std_ulogic_vector(63 downto 0); signal dbg_data : std_ulogic_vector(63 downto 0); signal dbg_ack : std_ulogic; + signal dbg_gpr_done : std_ulogic; signal addr_1_reg : gspr_index_t; signal addr_2_reg : gspr_index_t; signal addr_3_reg : gspr_index_t; + signal rd_2 : std_ulogic; + signal fwd_1 : std_ulogic; + signal fwd_2 : std_ulogic; + signal fwd_3 : std_ulogic; + signal data_1 : std_ulogic_vector(63 downto 0); + signal data_2 : std_ulogic_vector(63 downto 0); + signal data_3 : std_ulogic_vector(63 downto 0); + signal prev_write_data : std_ulogic_vector(63 downto 0); + begin - -- synchronous writes + -- synchronous reads and writes 
register_write_0: process(clk) variable a_addr, b_addr, c_addr : gspr_index_t; variable w_addr : gspr_index_t; + variable b_enable : std_ulogic; begin if rising_edge(clk) then if w_in.write_enable = '1' then @@ -66,57 +76,94 @@ begin a_addr := d1_in.reg_1_addr; b_addr := d1_in.reg_2_addr; c_addr := d1_in.reg_3_addr; - - if stall = '0' then + b_enable := d1_in.read_2_enable; + if stall = '1' then + a_addr := addr_1_reg; + b_addr := addr_2_reg; + c_addr := addr_3_reg; + b_enable := rd_2; + else addr_1_reg <= a_addr; addr_2_reg <= b_addr; addr_3_reg <= c_addr; + rd_2 <= b_enable; end if; + + fwd_1 <= '0'; + fwd_2 <= '0'; + fwd_3 <= '0'; + if w_in.write_enable = '1' then + if w_addr = a_addr then + fwd_1 <= '1'; + end if; + if w_addr = b_addr then + fwd_2 <= '1'; + end if; + if w_addr = c_addr then + fwd_3 <= '1'; + end if; + end if; + + -- Do debug reads to GPRs and FPRs using the B port when it is not in use + if dbg_gpr_req = '1' then + if b_enable = '0' then + b_addr := dbg_gpr_addr(5 downto 0); + dbg_gpr_done <= '1'; + end if; + else + dbg_gpr_done <= '0'; + end if; + + if not HAS_FPU then + -- Make it obvious that we only want 32 GSPRs for a no-FPU implementation + a_addr(5) := '0'; + b_addr(5) := '0'; + c_addr(5) := '0'; + end if; + data_1 <= registers(to_integer(unsigned(a_addr))); + data_2 <= registers(to_integer(unsigned(b_addr))); + data_3 <= registers(to_integer(unsigned(c_addr))); + + prev_write_data <= w_in.write_data; + assert (d_in.read1_enable = '0') or (d_in.read1_reg = addr_1_reg) severity failure; assert (d_in.read2_enable = '0') or (d_in.read2_reg = addr_2_reg) severity failure; assert (d_in.read3_enable = '0') or (d_in.read3_reg = addr_3_reg) severity failure; end if; end process register_write_0; - -- asynchronous reads + -- asynchronous forwarding of write data register_read_0: process(all) - variable a_addr, b_addr, c_addr : gspr_index_t; - variable w_addr : gspr_index_t; + variable out_data_1 : std_ulogic_vector(63 downto 0); + variable 
out_data_2 : std_ulogic_vector(63 downto 0); + variable out_data_3 : std_ulogic_vector(63 downto 0); begin - a_addr := d_in.read1_reg; - b_addr := d_in.read2_reg; - c_addr := d_in.read3_reg; - w_addr := w_in.write_reg; - if not HAS_FPU then - -- Make it obvious that we only want 32 GSPRs for a no-FPU implementation - a_addr(5) := '0'; - b_addr(5) := '0'; - c_addr(5) := '0'; - w_addr(5) := '0'; + out_data_1 := data_1; + out_data_2 := data_2; + out_data_3 := data_3; + if fwd_1 = '1' then + out_data_1 := prev_write_data; end if; + if fwd_2 = '1' then + out_data_2 := prev_write_data; + end if; + if fwd_3 = '1' then + out_data_3 := prev_write_data; + end if; + if d_in.read1_enable = '1' then - report "Reading GPR " & to_hstring(a_addr) & " " & to_hstring(registers(to_integer(unsigned(a_addr)))); + report "Reading GPR " & to_hstring(addr_1_reg) & " " & to_hstring(out_data_1); end if; if d_in.read2_enable = '1' then - report "Reading GPR " & to_hstring(b_addr) & " " & to_hstring(registers(to_integer(unsigned(b_addr)))); + report "Reading GPR " & to_hstring(addr_2_reg) & " " & to_hstring(out_data_2); end if; if d_in.read3_enable = '1' then - report "Reading GPR " & to_hstring(c_addr) & " " & to_hstring(registers(to_integer(unsigned(c_addr)))); - end if; - d_out.read1_data <= registers(to_integer(unsigned(a_addr))); - -- B read port is multiplexed with reads from the debug circuitry - if d_in.read2_enable = '0' and dbg_gpr_req = '1' and dbg_ack = '0' then - b_addr := dbg_gpr_addr; - if not HAS_FPU then - b_addr(5) := '0'; - end if; + report "Reading GPR " & to_hstring(addr_3_reg) & " " & to_hstring(out_data_3); end if; - rd_port_b <= registers(to_integer(unsigned(b_addr))); - d_out.read2_data <= rd_port_b; - d_out.read3_data <= registers(to_integer(unsigned(c_addr))); - -- Forwarding of written data is now done explicitly with a bypass path - -- from writeback to decode2. 
+ d_out.read1_data <= out_data_1; + d_out.read2_data <= out_data_2; + d_out.read3_data <= out_data_3; end process register_read_0; -- Latch read data and ack if dbg read requested and B port not busy @@ -124,8 +171,8 @@ begin begin if rising_edge(clk) then if dbg_gpr_req = '1' then - if d_in.read2_enable = '0' and dbg_ack = '0' then - dbg_data <= rd_port_b; + if dbg_ack = '0' and dbg_gpr_done = '1' then + dbg_data <= data_2; dbg_ack <= '1'; end if; else From d6121cd636bd5321e57f8fc76ec35b8621241117 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 14 Jul 2022 15:47:21 +1000 Subject: [PATCH 30/30] Use register addresses from decode1 for dependency tracking This improves timing a little because the register addresses now come directly from a latch instead of being calculated by decode_input_reg_*. The asserts that check that the two are the same are now in decode2 rather than register_file. Signed-off-by: Paul Mackerras --- common.vhdl | 9 +++++---- decode1.vhdl | 4 ++++ decode2.vhdl | 14 ++++++++------ register_file.vhdl | 4 ---- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/common.vhdl b/common.vhdl index 4d6cb91..cc49e8f 100644 --- a/common.vhdl +++ b/common.vhdl @@ -265,11 +265,15 @@ package common is big_endian: std_ulogic; spr_info : spr_id; ram_spr : ram_spr_info; + reg_a : gspr_index_t; + reg_b : gspr_index_t; + reg_c : gspr_index_t; end record; constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), decode => decode_rom_init, br_pred => '0', big_endian => '0', - spr_info => spr_id_init, ram_spr => ram_spr_info_init); + spr_info => spr_id_init, ram_spr => ram_spr_info_init, + reg_a => (others => '0'), reg_b => (others => '0'), reg_c => (others => '0')); type Decode1ToFetch1Type is record redirect : std_ulogic; @@ -449,11 +453,8 @@ package common is type Decode2ToRegisterFileType is record read1_enable : std_ulogic; - read1_reg : gspr_index_t; 
read2_enable : std_ulogic; - read2_reg : gspr_index_t; read3_enable : std_ulogic; - read3_reg : gspr_index_t; end record; type RegisterFileToDecode2Type is record diff --git a/decode1.vhdl b/decode1.vhdl index cc93dfc..de9b836 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -792,6 +792,10 @@ begin vr.read_2_enable := f_in.valid and not f_in.fetch_failed and may_read_rb; vr.read_3_enable := f_in.valid and not f_in.fetch_failed; + v.reg_a := vr.reg_1_addr; + v.reg_b := vr.reg_2_addr; + v.reg_c := vr.reg_3_addr; + if f_in.fetch_failed = '1' then v.valid := '1'; vi.override := '1'; diff --git a/decode2.vhdl b/decode2.vhdl index d91bec5..e24ebb5 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -360,6 +360,11 @@ begin dc2.e.ramspr_odd_rdaddr <= dc2in.e.ramspr_odd_rdaddr; dc2.e.ramspr_rd_odd <= dc2in.e.ramspr_rd_odd; end if; + if d_in.valid = '1' then + assert decoded_reg_a.reg_valid = '0' or decoded_reg_a.reg = d_in.reg_a severity failure; + assert decoded_reg_b.reg_valid = '0' or decoded_reg_b.reg = d_in.reg_b severity failure; + assert decoded_reg_c.reg_valid = '0' or decoded_reg_c.reg = d_in.reg_c severity failure; + end if; end if; end process; @@ -379,11 +384,8 @@ begin end if; r_out.read1_enable <= decoded_reg_a.reg_valid; - r_out.read1_reg <= decoded_reg_a.reg; r_out.read2_enable <= decoded_reg_b.reg_valid; - r_out.read2_reg <= decoded_reg_b.reg; r_out.read3_enable <= decoded_reg_c.reg_valid; - r_out.read3_reg <= decoded_reg_c.reg; end process; @@ -537,9 +539,9 @@ begin v.e.nia := d_in.nia; v.e.unit := d_in.decode.unit; v.e.fac := d_in.decode.facility; - v.e.read_reg1 := decoded_reg_a.reg; - v.e.read_reg2 := decoded_reg_b.reg; - v.e.read_reg3 := decoded_reg_c.reg; + v.e.read_reg1 := d_in.reg_a; + v.e.read_reg2 := d_in.reg_b; + v.e.read_reg3 := d_in.reg_c; v.e.write_reg := decoded_reg_o.reg; v.e.write_reg_enable := decoded_reg_o.reg_valid; v.e.invert_a := d_in.decode.invert_a; diff --git a/register_file.vhdl b/register_file.vhdl index a8ddee2..753ce80 100644 --- 
a/register_file.vhdl +++ b/register_file.vhdl @@ -125,10 +125,6 @@ begin data_3 <= registers(to_integer(unsigned(c_addr))); prev_write_data <= w_in.write_data; - - assert (d_in.read1_enable = '0') or (d_in.read1_reg = addr_1_reg) severity failure; - assert (d_in.read2_enable = '0') or (d_in.read2_reg = addr_2_reg) severity failure; - assert (d_in.read3_enable = '0') or (d_in.read3_reg = addr_3_reg) severity failure; end if; end process register_write_0;