diff --git a/common.vhdl b/common.vhdl index ffddb0b..1ed0606 100644 --- a/common.vhdl +++ b/common.vhdl @@ -130,12 +130,13 @@ package common is byte_reverse : std_ulogic; sign_extend : std_ulogic; -- do we need to sign extend? update : std_ulogic; -- is this an update instruction? + reserve : std_ulogic; -- set for larx/stcx end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', - is_32bit => '0', is_signed => '0', xerc => xerc_init, + is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', byte_reverse => '0', sign_extend => '0', update => '0', others => (others => '0')); type Execute1ToMultiplyType is record @@ -201,46 +202,47 @@ package common is data : std_ulogic_vector(63 downto 0); -- data to write, unused for read write_reg : gpr_index_t; length : std_ulogic_vector(3 downto 0); + ci : std_ulogic; -- cache-inhibited load/store byte_reverse : std_ulogic; sign_extend : std_ulogic; -- do we need to sign extend? update : std_ulogic; -- is this an update instruction? update_reg : gpr_index_t; -- if so, the register to update xerc : xer_common_t; + reserve : std_ulogic; -- set for larx/stcx. + rc : std_ulogic; -- set for stcx. end record; - constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0', + constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', load => '0', ci => '0', byte_reverse => '0', sign_extend => '0', update => '0', xerc => xerc_init, - others => (others => '0')); + reserve => '0', rc => '0', others => (others => '0')); type Loadstore1ToDcacheType is record valid : std_ulogic; load : std_ulogic; nc : std_ulogic; + reserve : std_ulogic; addr : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); - write_reg : gpr_index_t; - length : std_ulogic_vector(3 downto 0); - byte_reverse : std_ulogic; - sign_extend : std_ulogic; - update : std_ulogic; - update_reg : gpr_index_t; - xerc : xer_common_t; + byte_sel : std_ulogic_vector(7 downto 0); + end record; + + type DcacheToLoadstore1Type is record + valid : std_ulogic; + data : std_ulogic_vector(63 downto 0); + store_done : std_ulogic; + error : std_ulogic; end record; - type DcacheToWritebackType is record + type Loadstore1ToWritebackType is record valid : std_ulogic; write_enable: std_ulogic; write_reg : gpr_index_t; write_data : std_ulogic_vector(63 downto 0); - write_len : std_ulogic_vector(3 downto 0); - write_shift : std_ulogic_vector(2 downto 0); - sign_extend : std_ulogic; - byte_reverse : std_ulogic; - second_word : std_ulogic; xerc : xer_common_t; + rc : std_ulogic; + store_done : std_ulogic; end record; - constant DcacheToWritebackInit : DcacheToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', - byte_reverse => '0', second_word => '0', xerc => xerc_init, - others => (others => '0')); + constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := (valid => '0', write_enable => '0', xerc => xerc_init, + rc => '0', store_done => '0', others => (others => '0')); type Execute1ToWritebackType is record valid: std_ulogic; diff --git a/core.vhdl b/core.vhdl index bc0b16f..d535f7a 100644 --- a/core.vhdl +++ b/core.vhdl @@ -61,8 +61,11 @@ architecture behave of core is -- load store signals signal execute1_to_loadstore1: Execute1ToLoadstore1Type; + signal loadstore1_to_writeback: Loadstore1ToWritebackType; + + -- dcache signals signal loadstore1_to_dcache: Loadstore1ToDcacheType; - signal dcache_to_writeback: DcacheToWritebackType; + signal dcache_to_loadstore1: DcacheToLoadstore1Type; -- local signals signal fetch1_stall_in : std_ulogic; @@ -73,6 +76,8 @@ architecture behave of core is signal decode2_stall_out : std_ulogic; signal ex1_icache_inval: std_ulogic; signal ex1_stall_out: std_ulogic; + signal ls1_stall_out: std_ulogic; + signal dcache_stall_out: std_ulogic; signal flush: std_ulogic; @@ -195,7 +200,7 @@ begin c_in => cr_file_to_decode2, c_out => decode2_to_cr_file ); - decode2_stall_in <= ex1_stall_out; + decode2_stall_in <= ex1_stall_out or ls1_stall_out; register_file_0: entity work.register_file generic map ( @@ -242,8 +247,13 @@ begin loadstore1_0: entity work.loadstore1 port map ( clk => clk, + rst => core_rst, l_in => execute1_to_loadstore1, - l_out => loadstore1_to_dcache + l_out => loadstore1_to_writeback, + d_out => loadstore1_to_dcache, + d_in => dcache_to_loadstore1, + dc_stall => dcache_stall_out, + stall_out => ls1_stall_out ); dcache_0: entity work.dcache @@ -256,7 +266,8 @@ begin clk => clk, rst => core_rst, d_in => loadstore1_to_dcache, - d_out => dcache_to_writeback, + d_out => dcache_to_loadstore1, + stall_out => dcache_stall_out, wishbone_in => wishbone_data_in, wishbone_out => wishbone_data_out ); @@ -265,7 +276,7 @@ begin port map ( clk => clk, e_in => execute1_to_writeback, - l_in => dcache_to_writeback, + l_in => loadstore1_to_writeback, w_out => writeback_to_register_file, c_out => writeback_to_cr_file, complete_out => complete diff --git a/dcache.vhdl b/dcache.vhdl index df54c95..7e553bf 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -7,9 +7,6 @@ -- * Complete load misses on the cycle when WB data comes instead of -- at the end of line (this requires dealing with requests coming in -- while not idle...) --- * Load with update could use one less non-pipelined cycle by moving --- the register update to the pipeline bubble that exists when going --- back to the IDLE state. -- library ieee; use ieee.std_logic_1164.all; @@ -35,7 +32,7 @@ entity dcache is rst : in std_ulogic; d_in : in Loadstore1ToDcacheType; - d_out : out DcacheToWritebackType; + d_out : out DcacheToLoadstore1Type; stall_out : out std_ulogic; @@ -113,6 +110,8 @@ architecture rtl of dcache is attribute ram_style : string; attribute ram_style of cache_tags : signal is "distributed"; + signal r0 : Loadstore1ToDcacheType; + -- Type of operation on a "valid" input type op_t is (OP_NONE, OP_LOAD_HIT, -- Cache hit on load @@ -124,9 +123,8 @@ architecture rtl of dcache is -- Cache state machine type state_t is (IDLE, -- Normal load hit processing - LOAD_UPDATE, -- Load with update extra cycle - LOAD_UPDATE2, -- Load with update extra cycle RELOAD_WAIT_ACK, -- Cache reload wait ack + FINISH_LD_MISS, -- Extra cycle after load miss STORE_WAIT_ACK, -- Store wait ack NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack @@ -157,13 +155,13 @@ architecture rtl of dcache is hit_way : way_t; hit_load_valid : std_ulogic; - -- Register update (load/store with update) - update_valid : std_ulogic; - -- Data buffer for "slow" read ops (load miss and NC loads). slow_data : std_ulogic_vector(63 downto 0); slow_valid : std_ulogic; + -- Signal to complete a failed stcx. + stcx_fail : std_ulogic; + -- Cache miss state (reload state machine) state : state_t; wb : wishbone_master_out; @@ -174,21 +172,14 @@ architecture rtl of dcache is signal r1 : reg_stage_1_t; - -- Second stage register, only used for load hits + -- Reservation information -- - type reg_stage_2_t is record - hit_way : way_t; - hit_load_valid : std_ulogic; - load_is_update : std_ulogic; - load_reg : std_ulogic_vector(4 downto 0); - data_shift : std_ulogic_vector(2 downto 0); - length : std_ulogic_vector(3 downto 0); - sign_extend : std_ulogic; - byte_reverse : std_ulogic; - xerc : xer_common_t; + type reservation_t is record + valid : std_ulogic; + addr : std_ulogic_vector(63 downto LINE_OFF_BITS); end record; - signal r2 : reg_stage_2_t; + signal reservation : reservation_t; -- Async signals on incoming request signal req_index : index_t; @@ -196,8 +187,15 @@ architecture rtl of dcache is signal req_hit_way : way_t; signal req_tag : cache_tag_t; signal req_op : op_t; + signal req_data : std_ulogic_vector(63 downto 0); signal req_laddr : std_ulogic_vector(63 downto 0); + signal early_req_row : row_t; + + signal cancel_store : std_ulogic; + signal set_rsrv : std_ulogic; + signal clear_rsrv : std_ulogic; + -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; signal cache_out : cache_ram_out_t; @@ -208,9 +206,8 @@ architecture rtl of dcache is signal replace_way : way_t; -- Wishbone read/write/cache write formatting signals - signal bus_sel : wishbone_sel_type; - signal store_data : wishbone_data_type; - + signal bus_sel : std_ulogic_vector(7 downto 0); + -- -- Helper functions to decode incoming requests -- @@ -290,37 +287,6 @@ architecture rtl of dcache is tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; end; - -- Generate byte enables from sizes - function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is - begin - case length is - when "0001" => - return "00000001"; - when "0010" => - return "00000011"; - when "0100" => - return "00001111"; - when "1000" => - return "11111111"; - when others => - return "00000000"; - end case; - end function length_to_sel; - - -- Calculate shift and byte enables for wishbone - function wishbone_data_shift(address : in std_ulogic_vector(63 downto 0)) return natural is - begin - return to_integer(unsigned(address(2 downto 0))) * 8; - end function wishbone_data_shift; - - function wishbone_data_sel(size : in std_logic_vector(3 downto 0); - address : in std_logic_vector(63 downto 0)) - return std_ulogic_vector is - begin - return std_ulogic_vector(shift_left(unsigned(length_to_sel(size)), - to_integer(unsigned(address(2 downto 0))))); - end function wishbone_data_sel; - begin assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; @@ -375,6 +341,18 @@ begin end generate; end generate; + -- Latch the request in r0 as long as we're not stalling + stage_0 : process(clk) + begin + if rising_edge(clk) then + if rst = '1' then + r0.valid <= '0'; + elsif stall_out = '0' then + r0 <= d_in; + end if; + end if; + end process; + -- Cache request parsing and hit detection dcache_request : process(all) variable is_hit : std_ulogic; @@ -383,23 +361,27 @@ begin variable tmp : std_ulogic_vector(63 downto 0); variable data : std_ulogic_vector(63 downto 0); variable opsel : std_ulogic_vector(3 downto 0); + variable go : std_ulogic; begin -- Extract line, row and tag from request - req_index <= get_index(d_in.addr); - req_row <= get_row(d_in.addr); - req_tag <= get_tag(d_in.addr); + req_index <= get_index(r0.addr); + req_row <= get_row(r0.addr); + req_tag <= get_tag(r0.addr); + + -- Only do anything if not being stalled by stage 1 + go := r0.valid and not stall_out; - -- Calculate address of beginning of cache line, will be - -- used for cache miss processing if needed - -- - req_laddr <= d_in.addr(63 downto LINE_OFF_BITS) & - (LINE_OFF_BITS-1 downto 0 => '0'); + -- Calculate address of beginning of cache line, will be + -- used for cache miss processing if needed + -- + req_laddr <= r0.addr(63 downto LINE_OFF_BITS) & + (LINE_OFF_BITS-1 downto 0 => '0'); -- Test if pending request is a hit on any way hit_way := 0; is_hit := '0'; for i in way_t loop - if d_in.valid = '1' and cache_valids(req_index)(i) = '1' then + if go = '1' and cache_valids(req_index)(i) = '1' then if read_tag(i, cache_tags(req_index)) = req_tag then hit_way := i; is_hit := '1'; @@ -416,7 +398,7 @@ begin -- Combine the request and cache his status to decide what -- operation needs to be done -- - opsel := d_in.valid & d_in.load & d_in.nc & is_hit; + opsel := go & r0.load & r0.nc & is_hit; case opsel is when "1101" => op := OP_LOAD_HIT; when "1100" => op := OP_LOAD_MISS; @@ -431,46 +413,70 @@ begin req_op <= op; + -- Version of the row number that is valid one cycle earlier + -- in the cases where we need to read the cache data BRAM. + -- If we're stalling then we need to keep reading the last + -- row requested. + if stall_out = '0' then + early_req_row <= get_row(d_in.addr); + else + early_req_row <= req_row; + end if; end process; - -- - -- Misc signal assignments - -- - -- Wire up wishbone request latch out of stage 1 wishbone_out <= r1.wb; - -- Wishbone & BRAM write data formatting for stores (most of it already - -- happens in loadstore1, this is the remaining data shifting) - -- - store_data <= std_logic_vector(shift_left(unsigned(d_in.data), - wishbone_data_shift(d_in.addr))); - - -- Wishbone read and write and BRAM write sel bits generation - bus_sel <= wishbone_data_sel(d_in.length, d_in.addr); - -- TODO: Generate errors -- err_nc_collision <= '1' when req_op = OP_BAD else '0'; -- Generate stalls from stage 1 state machine stall_out <= '1' when r1.state /= IDLE else '0'; - -- Writeback (loads and reg updates) & completion control logic + -- Handle load-with-reservation and store-conditional instructions + reservation_comb: process(all) + begin + cancel_store <= '0'; + set_rsrv <= '0'; + clear_rsrv <= '0'; + if stall_out = '0' and r0.valid = '1' and r0.reserve = '1' then + -- XXX generate alignment interrupt if address is not aligned + -- XXX or if r0.nc = '1' + if r0.load = '1' then + -- load with reservation + set_rsrv <= '1'; + else + -- store conditional + clear_rsrv <= '1'; + if reservation.valid = '0' or + r0.addr(63 downto LINE_OFF_BITS) /= reservation.addr then + cancel_store <= '1'; + end if; + end if; + end if; + end process; + + reservation_reg: process(clk) + begin + if rising_edge(clk) then + if rst = '1' or clear_rsrv = '1' then + reservation.valid <= '0'; + elsif set_rsrv = '1' then + reservation.valid <= '1'; + reservation.addr <= r0.addr(63 downto LINE_OFF_BITS); + end if; + end if; + end process; + + -- Return data for loads & completion control logic -- writeback_control: process(all) begin - -- The mux on d_out.write reg defaults to the normal load hit case. - d_out.write_enable <= '0'; + -- The mux on d_out.data defaults to the normal load hit case. d_out.valid <= '0'; - d_out.write_reg <= r2.load_reg; - d_out.write_data <= cache_out(r2.hit_way); - d_out.write_len <= r2.length; - d_out.write_shift <= r2.data_shift; - d_out.sign_extend <= r2.sign_extend; - d_out.byte_reverse <= r2.byte_reverse; - d_out.second_word <= '0'; - d_out.xerc <= r2.xerc; + d_out.data <= cache_out(r1.hit_way); + d_out.store_done <= '0'; -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store @@ -483,24 +489,17 @@ begin -- -- Sanity: Only one of these must be set in any given cycle - assert (r1.update_valid and r2.hit_load_valid) /= '1' report - "unexpected hit_load_delayed collision with update_valid" + assert (r1.slow_valid and r1.stcx_fail) /= '1' report + "unexpected slow_valid collision with stcx_fail" severity FAILURE; - assert (r1.slow_valid and r2.hit_load_valid) /= '1' report + assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid) /= '1' report "unexpected hit_load_delayed collision with slow_valid" severity FAILURE; - assert (r1.slow_valid and r1.update_valid) /= '1' report - "unexpected update_valid collision with slow_valid" - severity FAILURE; - - -- Delayed load hit case is the standard path - if r2.hit_load_valid = '1' then - d_out.write_enable <= '1'; - -- If it's not a load with update, complete it now - if r2.load_is_update = '0' then - d_out.valid <= '1'; - end if; + -- Load hit case is the standard path + if r1.hit_load_valid = '1' then + report "completing load hit"; + d_out.valid <= '1'; end if; -- Slow ops (load miss, NC, stores) @@ -508,49 +507,20 @@ begin -- If it's a load, enable register writeback and switch -- mux accordingly -- - if r1.req.load then - d_out.write_reg <= r1.req.write_reg; - d_out.write_enable <= '1'; - - -- Read data comes from the slow data latch, formatter - -- from the latched request. - -- - d_out.write_data <= r1.slow_data; - d_out.write_shift <= r1.req.addr(2 downto 0); - d_out.sign_extend <= r1.req.sign_extend; - d_out.byte_reverse <= r1.req.byte_reverse; - d_out.write_len <= r1.req.length; - d_out.xerc <= r1.req.xerc; + if r1.req.load then + -- Read data comes from the slow data latch + d_out.data <= r1.slow_data; end if; + d_out.store_done <= '1'; - -- If it's a store or a non-update load form, complete now - if r1.req.load = '0' or r1.req.update = '0' then - d_out.valid <= '1'; - end if; + report "completing store or load miss"; + d_out.valid <= '1'; end if; - -- We have a register update to do. - if r1.update_valid = '1' then - d_out.write_enable <= '1'; - d_out.write_reg <= r1.req.update_reg; - - -- Change the read data mux to the address that's going into - -- the register and the formatter does nothing. - -- - d_out.write_data <= r1.req.addr; - d_out.write_shift <= "000"; - d_out.write_len <= "1000"; - d_out.sign_extend <= '0'; - d_out.byte_reverse <= '0'; - d_out.xerc <= r1.req.xerc; - - -- If it was a load, this completes the operation (load with - -- update case). - -- - if r1.req.load = '1' then - d_out.valid <= '1'; - end if; - end if; + if r1.stcx_fail = '1' then + d_out.store_done <= '0'; + d_out.valid <= '1'; + end if; end process; @@ -595,7 +565,7 @@ begin begin -- Cache hit reads do_read <= '1'; - rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); + rd_addr <= std_ulogic_vector(to_unsigned(early_req_row, ROW_BITS)); cache_out(i) <= dout; -- Write mux: @@ -606,10 +576,10 @@ begin -- other than the current state. Only the do_write signal is. -- if r1.state = IDLE then - -- When IDLE, the only write path is the store-hit update case + -- In IDLE state, the only write path is the store-hit update case wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); - wr_data <= store_data; - wr_sel <= bus_sel; + wr_data <= r0.data; + wr_sel <= r0.byte_sel; else -- Otherwise, we might be doing a reload wr_data <= wishbone_in.dat; @@ -623,7 +593,7 @@ begin if reloading and wishbone_in.ack = '1' and r1.store_way = i then do_write <= '1'; end if; - if req_op = OP_STORE_HIT and req_hit_way = i then + if req_op = OP_STORE_HIT and req_hit_way = i and cancel_store = '0' then assert not reloading report "Store hit while in state:" & state_t'image(r1.state) severity FAILURE; @@ -633,36 +603,22 @@ begin end generate; -- - -- Cache hit synchronous machine for the easy case. This handles - -- non-update form load hits and stage 1 to stage 2 transfers + -- Cache hit synchronous machine for the easy case. This handles load hits. -- dcache_fast_hit : process(clk) begin if rising_edge(clk) then - -- stage 1 -> stage 2 - r2.hit_load_valid <= r1.hit_load_valid; - r2.hit_way <= r1.hit_way; - r2.load_is_update <= r1.req.update; - r2.load_reg <= r1.req.write_reg; - r2.data_shift <= r1.req.addr(2 downto 0); - r2.length <= r1.req.length; - r2.sign_extend <= r1.req.sign_extend; - r2.byte_reverse <= r1.req.byte_reverse; - - -- If we have a request incoming, we have to latch it as d_in.valid + -- If we have a request incoming, we have to latch it as r0.valid -- is only set for a single cycle. It's up to the control logic to -- ensure we don't override an uncompleted request (for now we are -- single issue on load/stores so we are fine, later, we can generate -- a stall output if necessary). - if req_op /= OP_NONE then - r1.req <= d_in; - + if req_op /= OP_NONE and stall_out = '0' then + r1.req <= r0; report "op:" & op_t'image(req_op) & - " addr:" & to_hstring(d_in.addr) & - " upd:" & std_ulogic'image(d_in.update) & - " nc:" & std_ulogic'image(d_in.nc) & - " reg:" & to_hstring(d_in.write_reg) & + " addr:" & to_hstring(r0.addr) & + " nc:" & std_ulogic'image(r0.nc) & " idx:" & integer'image(req_index) & " tag:" & to_hstring(req_tag) & " way: " & integer'image(req_hit_way); @@ -679,10 +635,9 @@ begin end process; -- - -- Every other case is handled by this stage machine: + -- Every other case is handled by this state machine: -- -- * Cache load miss/reload (in conjunction with "rams") - -- * Load hits for update forms -- * Load hits for non-cachable forms -- * Stores (the collision case is handled in "rams") -- @@ -701,7 +656,6 @@ begin end loop; r1.state <= IDLE; r1.slow_valid <= '0'; - r1.update_valid <= '0'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; @@ -710,27 +664,19 @@ begin else -- One cycle pulses reset r1.slow_valid <= '0'; - r1.update_valid <= '0'; - - -- We cannot currently process a new request when not idle - assert req_op = OP_NONE or r1.state = IDLE report "request " & - op_t'image(req_op) & " while in state " & state_t'image(r1.state) - severity FAILURE; + r1.stcx_fail <= '0'; -- Main state machine case r1.state is - when IDLE => + when IDLE => case req_op is - when OP_LOAD_HIT => - -- We have a load with update hit, we need the delayed update cycle - if d_in.update = '1' then - r1.state <= LOAD_UPDATE; - end if; + when OP_LOAD_HIT => + -- stay in IDLE state - when OP_LOAD_MISS => + when OP_LOAD_MISS => -- Normal load cache miss, start the reload machine -- - report "cache miss addr:" & to_hstring(d_in.addr) & + report "cache miss addr:" & to_hstring(r0.addr) & " idx:" & integer'image(req_index) & " way:" & integer'image(replace_way) & " tag:" & to_hstring(req_tag); @@ -765,25 +711,26 @@ begin r1.state <= RELOAD_WAIT_ACK; when OP_LOAD_NC => - r1.wb.sel <= bus_sel; - r1.wb.adr <= d_in.addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.sel <= r0.byte_sel; + r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; r1.state <= NC_LOAD_WAIT_ACK; when OP_STORE_HIT | OP_STORE_MISS => - -- For store-with-update do the register update - if d_in.update = '1' then - r1.update_valid <= '1'; - end if; - r1.wb.sel <= bus_sel; - r1.wb.adr <= d_in.addr(r1.wb.adr'left downto 3) & "000"; - r1.wb.dat <= store_data; - r1.wb.cyc <= '1'; - r1.wb.stb <= '1'; - r1.wb.we <= '1'; - r1.state <= STORE_WAIT_ACK; + r1.wb.sel <= r0.byte_sel; + r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.dat <= r0.data; + if cancel_store = '0' then + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; + r1.wb.we <= '1'; + r1.state <= STORE_WAIT_ACK; + else + r1.stcx_fail <= '1'; + r1.state <= IDLE; + end if; -- OP_NONE and OP_BAD do nothing when OP_NONE => @@ -831,32 +778,23 @@ begin -- Cache line is now valid cache_valids(r1.store_index)(r1.store_way) <= '1'; - -- Complete the load that missed. For load with update - -- we also need to do the deferred update cycle. - -- - r1.slow_valid <= '1'; - if r1.req.load = '1' and r1.req.update = '1' then - r1.state <= LOAD_UPDATE; - report "completing miss with load-update !"; - else - r1.state <= IDLE; - report "completing miss !"; - end if; + -- Don't complete and go idle until next cycle, in + -- case the next request is for the last dword of + -- the cache line we just loaded. + r1.state <= FINISH_LD_MISS; end if; -- Increment store row counter r1.store_row <= next_row(r1.store_row); end if; - when LOAD_UPDATE => - -- We need the extra cycle to complete a load with update - r1.state <= LOAD_UPDATE2; - when LOAD_UPDATE2 => - -- We need the extra cycle to complete a load with update - r1.update_valid <= '1'; - r1.state <= IDLE; + when FINISH_LD_MISS => + -- Write back the load data that we got + r1.slow_valid <= '1'; + r1.state <= IDLE; + report "completing miss !"; - when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK => + when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK => -- Clear stb when slave accepted request if wishbone_in.stall = '0' then r1.wb.stb <= '0'; @@ -867,12 +805,12 @@ begin if r1.state = NC_LOAD_WAIT_ACK then r1.slow_data <= wishbone_in.dat; end if; + r1.state <= IDLE; r1.slow_valid <= '1'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; - r1.state <= IDLE; end if; - end case; + end case; end if; end if; end process; diff --git a/dcache_tb.vhdl b/dcache_tb.vhdl index 437fd7d..bd8341a 100644 --- a/dcache_tb.vhdl +++ b/dcache_tb.vhdl @@ -13,7 +13,7 @@ architecture behave of dcache_tb is signal rst : std_ulogic; signal d_in : Loadstore1ToDcacheType; - signal d_out : DcacheToWritebackType; + signal d_out : DcacheToLoadstore1Type; signal wb_bram_in : wishbone_master_out; signal wb_bram_out : wishbone_slave_out; @@ -71,12 +71,6 @@ begin d_in.nc <= '0'; d_in.addr <= (others => '0'); d_in.data <= (others => '0'); - d_in.write_reg <= (others => '0'); - d_in.length <= (others => '0'); - d_in.byte_reverse <= '0'; - d_in.sign_extend <= '0'; - d_in.update <= '0'; - d_in.update_reg <= (others => '0'); wait for 4*clk_period; wait until rising_edge(clk); @@ -89,11 +83,10 @@ begin wait until rising_edge(clk); d_in.valid <= '0'; - wait until rising_edge(clk) and d_out.write_enable = '1'; - assert d_out.valid = '1'; - assert d_out.write_data = x"0000000100000000" + wait until rising_edge(clk) and d_out.valid = '1'; + assert d_out.data = x"0000000100000000" report "data @" & to_hstring(d_in.addr) & - "=" & to_hstring(d_out.write_data) & + "=" & to_hstring(d_out.data) & " expected 0000000100000000" severity failure; -- wait for clk_period; @@ -106,11 +99,10 @@ begin wait until rising_edge(clk); d_in.valid <= '0'; - wait until rising_edge(clk) and d_out.write_enable = '1'; - assert d_out.valid = '1'; - assert d_out.write_data = x"0000000D0000000C" + wait until rising_edge(clk) and d_out.valid = '1'; + assert d_out.data = x"0000000D0000000C" report "data @" & to_hstring(d_in.addr) & - "=" & to_hstring(d_out.write_data) & + "=" & to_hstring(d_out.data) & " expected 0000000D0000000C" severity failure; @@ -121,11 +113,10 @@ begin d_in.valid <= '1'; wait until rising_edge(clk); d_in.valid <= '0'; - wait until rising_edge(clk) and d_out.write_enable = '1'; - assert d_out.valid = '1'; - assert d_out.write_data = x"0000004100000040" + wait until rising_edge(clk) and d_out.valid = '1'; + assert d_out.data = x"0000004100000040" report "data @" & to_hstring(d_in.addr) & - "=" & to_hstring(d_out.write_data) & + "=" & to_hstring(d_out.data) & " expected 0000004100000040" severity failure; diff --git a/decode1.vhdl b/decode1.vhdl index f1b5ad4..e9dae1e 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -46,26 +46,26 @@ architecture behaviour of decode1 is 16 => (ALU, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc 11 => (ALU, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmpi 10 => (ALU, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli - 34 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lbz - 35 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lbzu - 42 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lha - 43 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '1'), -- lhau - 40 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lhz - 41 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lhzu - 32 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lwz - 33 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lwzu + 34 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbz + 35 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzu + 42 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lha + 43 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhau + 40 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhz + 41 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lhzu + 32 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwz + 33 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lwzu 7 => (ALU, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- mulli 24 => (ALU, OP_OR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ori 25 => (ALU, OP_OR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- oris 20 => (ALU, OP_RLC, RA, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwimi 21 => (ALU, OP_RLC, NONE, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwinm 23 => (ALU, OP_RLC, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwnm - 38 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- stb - 39 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', RC, '0', '1'), -- stbu - 44 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sth - 45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- sthu - 36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stw - 37 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stwu + 38 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stb + 39 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbu + 44 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sth + 45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu + 36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw + 37 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stwu 8 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- subfic 2 => (ALU, OP_TDI, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tdi --PPC_TWI 3 @@ -215,25 +215,29 @@ architecture behaviour of decode1 is 2#1110101111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel 2#1111001111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel 2#1111101111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel - 2#0000110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- lbarx - 2#0001110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lbzux - 2#0001010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lbzx - 2#0001010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- ldarx - 2#1000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- ldbrx - 2#0000110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- ldux - 2#0000010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- ldx - 2#0001110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- lharx - 2#0101110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '1'), -- lhaux - 2#0101010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lhax - 2#1100010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lhbrx - 2#0100110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lhzux - 2#0100010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lhzx - 2#0000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- lwarx - 2#0101110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '1'), -- lwaux - 2#0101010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lwax - 2#1000010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lwbrx - 2#0000110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lwzux - 2#0000010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lwzx + 2#0000110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lbarx + 2#1101010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbzcix + 2#0001110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzux + 2#0001010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbzx + 2#0001010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- ldarx + 2#1000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldbrx + 2#1101110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldcix + 2#0000110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- ldux + 2#0000010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldx + 2#0001110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lharx + 2#0101110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhaux + 2#0101010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lhax + 2#1100010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhbrx + 2#1100110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhzcix + 2#0100110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lhzux + 2#0100010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhzx + 2#0000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lwarx + 2#0101110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lwaux + 2#0101010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lwax + 2#1000010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwbrx + 2#1100010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwzcix + 2#0000110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lwzux + 2#0000010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwzx -- 2#1000000000# mcrxr -- 2#1001000000# mcrxrx 2#0000010011# => (ALU, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf @@ -278,21 +282,25 @@ architecture behaviour of decode1 is 2#1100111000# => (ALU, OP_SHR, NONE, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- srawi 2#1000011011# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- srd 2#1000011000# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- srw - 2#1010110110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', RC, '0', '1'), -- stbcx - 2#0011110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', RC, '0', '1'), -- stbux - 2#0011010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- stbx - 2#1010010100# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stdbrx - 2#0011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- stdcx - 2#0010110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stdux - 2#0010010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stdx - 2#1110010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sthbrx - 2#1011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- sthcx - 2#0110110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- sthux - 2#0110010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sthx - 2#1010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stwbrx - 2#0010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- stwcx - 2#0010110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stwux - 2#0010010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stwx + 2#1111010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stbcix + 2#1010110110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stbcx + 2#0011110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbux + 2#0011010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stbx + 2#1010010100# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdbrx + 2#1111110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdcix + 2#0011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stdcx + 2#0010110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stdux + 2#0010010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdx + 2#1110010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx + 2#1110110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthcix + 2#1011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- sthcx + 2#0110110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthux + 2#0110010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthx + 2#1010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwbrx + 2#1110010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwcix + 2#0010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stwcx + 2#0010110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stwux + 2#0010010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwx 2#0000101000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subf 2#1000101000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfo 2#0000001000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfc @@ -329,7 +337,7 @@ architecture behaviour of decode1 is -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl -- op in out A out in out len ext pipe - constant attn_instr : decode_rom_t := (ALU, OP_ILLEGAL, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'); + constant attn_instr : decode_rom_t := (ALU, OP_ILLEGAL, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'); constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); constant sim_cfg_instr : decode_rom_t := (ALU, OP_SIM_CONFIG,NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'); diff --git a/decode2.vhdl b/decode2.vhdl index 3d6b7d8..ff773aa 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -334,6 +334,7 @@ begin v.e.byte_reverse := d_in.decode.byte_reverse; v.e.sign_extend := d_in.decode.sign_extend; v.e.update := d_in.decode.update; + v.e.reserve := d_in.decode.reserve; -- issue control control_valid_in <= d_in.valid; diff --git a/execute1.vhdl b/execute1.vhdl index ae13c72..4703049 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -200,6 +200,7 @@ begin variable bo, bi : std_ulogic_vector(4 downto 0); variable bf, bfa : std_ulogic_vector(2 downto 0); variable cr_op : std_ulogic_vector(9 downto 0); + variable cr_operands : std_ulogic_vector(1 downto 0); variable bt, ba, bb : std_ulogic_vector(4 downto 0); variable btnum, banum, bbnum : integer range 0 to 31; variable crresult : std_ulogic; @@ -532,27 +533,10 @@ begin btnum := 31 - to_integer(unsigned(bt)); banum := 31 - to_integer(unsigned(ba)); bbnum := 31 - to_integer(unsigned(bb)); - case cr_op(8 downto 5) is - when "1001" => -- CREQV - crresult := not(e_in.cr(banum) xor e_in.cr(bbnum)); - when "0111" => -- CRNAND - crresult := not(e_in.cr(banum) and e_in.cr(bbnum)); - when "0100" => -- CRANDC - crresult := (e_in.cr(banum) and not e_in.cr(bbnum)); - when "1000" => -- CRAND - crresult := (e_in.cr(banum) and e_in.cr(bbnum)); - when "0001" => -- CRNOR - crresult := not(e_in.cr(banum) or e_in.cr(bbnum)); - when "1101" => -- CRORC - crresult := (e_in.cr(banum) or not e_in.cr(bbnum)); - when "0110" => -- CRXOR - crresult := (e_in.cr(banum) xor e_in.cr(bbnum)); - when "1110" => -- CROR - crresult := (e_in.cr(banum) or e_in.cr(bbnum)); - when others => - crresult := '0'; - report "BAD CR?"; - end case; + -- Bits 5-8 of cr_op give the truth table of the requested + -- logical operation + cr_operands := e_in.cr(banum) & e_in.cr(bbnum); + crresult := cr_op(5 + to_integer(unsigned(cr_operands))); v.e.write_cr_mask := num_to_fxm((31-btnum) / 4); for i in 0 to 31 loop if i = btnum then @@ -767,6 +751,13 @@ begin lv.update := e_in.update; lv.update_reg := gspr_to_gpr(e_in.read_reg1); lv.xerc := v.e.xerc; + lv.reserve := e_in.reserve; + lv.rc := e_in.rc; + -- decode l*cix and st*cix instructions here + if e_in.insn(31 downto 26) = "011111" and e_in.insn(10 downto 9) = "11" and + e_in.insn(5 downto 1) = "10101" then + lv.ci := '1'; + end if; -- Update registers rin <= v; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 5b61d4c..518feee 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -12,16 +12,90 @@ use work.helpers.all; entity loadstore1 is port ( clk : in std_ulogic; + rst : in std_ulogic; l_in : in Execute1ToLoadstore1Type; + l_out : out Loadstore1ToWritebackType; - l_out : out Loadstore1ToDcacheType + d_out : out Loadstore1ToDcacheType; + d_in : in DcacheToLoadstore1Type; + + dc_stall : in std_ulogic; + stall_out : out std_ulogic ); end loadstore1; +-- Note, we don't currently use the stall output from the dcache because +-- we know it can take two requests without stalling when idle, we are +-- its only user, and we know it never stalls when idle. + architecture behave of loadstore1 is - signal r, rin : Loadstore1ToDcacheType; + + -- State machine for unaligned loads/stores + type state_t is (IDLE, -- ready for instruction + SECOND_REQ, -- send 2nd request of unaligned xfer + FIRST_ACK_WAIT, -- waiting for 1st ack from dcache + LAST_ACK_WAIT, -- waiting for last ack from dcache + LD_UPDATE -- writing rA with computed addr on load + ); + + type reg_stage_t is record + -- latch most of the input request + load : std_ulogic; + addr : std_ulogic_vector(63 downto 0); + store_data : std_ulogic_vector(63 downto 0); + load_data : std_ulogic_vector(63 downto 0); + write_reg : gpr_index_t; + length : std_ulogic_vector(3 downto 0); + byte_reverse : std_ulogic; + sign_extend : std_ulogic; + update : std_ulogic; + update_reg : gpr_index_t; + xerc : xer_common_t; + reserve : std_ulogic; + rc : std_ulogic; + nc : std_ulogic; -- non-cacheable access + state : state_t; + second_bytes : std_ulogic_vector(7 downto 0); + end record; + + type byte_sel_t is array(0 to 7) of std_ulogic; + subtype byte_trim_t is std_ulogic_vector(1 downto 0); + type trim_ctl_t is array(0 to 7) of byte_trim_t; + + signal r, rin : reg_stage_t; signal lsu_sum : std_ulogic_vector(63 downto 0); + + -- Generate byte enables from sizes + function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is + begin + case length is + when "0001" => + return "00000001"; + when "0010" => + return "00000011"; + when "0100" => + return "00001111"; + when "1000" => + return "11111111"; + when others => + return "00000000"; + end case; + end function length_to_sel; + + -- Calculate byte enables + -- This returns 16 bits, giving the select signals for two transfers, + -- to account for unaligned loads or stores + function xfer_data_sel(size : in std_logic_vector(3 downto 0); + address : in std_logic_vector(2 downto 0)) + return std_ulogic_vector is + variable longsel : std_ulogic_vector(15 downto 0); + begin + longsel := "00000000" & length_to_sel(size); + return std_ulogic_vector(shift_left(unsigned(longsel), + to_integer(unsigned(address)))); + end function xfer_data_sel; + begin -- Calculate the address in the first cycle lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0'); @@ -29,51 +103,224 @@ begin loadstore1_0: process(clk) begin if rising_edge(clk) then - r <= rin; + if rst = '1' then + r.state <= IDLE; + else + r <= rin; + end if; end if; end process; loadstore1_1: process(all) - variable v : Loadstore1ToDcacheType; + variable v : reg_stage_t; + variable brev_lenm1 : unsigned(2 downto 0); + variable byte_offset : unsigned(2 downto 0); + variable j : integer; + variable k : unsigned(2 downto 0); + variable kk : unsigned(3 downto 0); + variable long_sel : std_ulogic_vector(15 downto 0); + variable byte_sel : std_ulogic_vector(7 downto 0); + variable req : std_ulogic; + variable stall : std_ulogic; + variable addr : std_ulogic_vector(63 downto 0); + variable wdata : std_ulogic_vector(63 downto 0); + variable write_enable : std_ulogic; + variable do_update : std_ulogic; + variable two_dwords : std_ulogic; + variable done : std_ulogic; + variable data_permuted : std_ulogic_vector(63 downto 0); + variable data_trimmed : std_ulogic_vector(63 downto 0); + variable use_second : byte_sel_t; + variable trim_ctl : trim_ctl_t; + variable negative : std_ulogic; begin v := r; + req := '0'; + stall := '0'; + done := '0'; + byte_sel := (others => '0'); + addr := lsu_sum; + + write_enable := '0'; + do_update := '0'; + two_dwords := or (r.second_bytes); + + -- load data formatting + if r.load = '1' then + byte_offset := unsigned(r.addr(2 downto 0)); + brev_lenm1 := "000"; + if r.byte_reverse = '1' then + brev_lenm1 := unsigned(r.length(2 downto 0)) - 1; + end if; + + -- shift and byte-reverse data bytes + for i in 0 to 7 loop + kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset); + use_second(i) := kk(3); + j := to_integer(kk(2 downto 0)) * 8; + data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j); + end loop; + + -- Work out the sign bit for sign extension. + -- Assumes we are not doing both sign extension and byte reversal, + -- in that for unaligned loads crossing two dwords we end up + -- using a bit from the second dword, whereas for a byte-reversed + -- (i.e. big-endian) load the sign bit would be in the first dword. + negative := (r.length(3) and data_permuted(63)) or + (r.length(2) and data_permuted(31)) or + (r.length(1) and data_permuted(15)) or + (r.length(0) and data_permuted(7)); - v.valid := l_in.valid; - v.load := l_in.load; - v.data := l_in.data; - v.write_reg := l_in.write_reg; - v.length := l_in.length; - v.byte_reverse := l_in.byte_reverse; - v.sign_extend := l_in.sign_extend; - v.update := l_in.update; - v.update_reg := l_in.update_reg; - v.xerc := l_in.xerc; - - -- XXX Temporary hack. Mark the op as non-cachable if the address - -- is the form 0xc------- - -- - -- This will have to be replaced by a combination of implementing the - -- proper HV CI load/store instructions and having an MMU to get the I - -- bit otherwise. - if lsu_sum(31 downto 28) = "1100" then - v.nc := '1'; - else - v.nc := '0'; - end if; - - -- XXX Do length_to_sel here ? - - -- byte reverse stores in the first cycle - if v.load = '0' and l_in.byte_reverse = '1' then - v.data := byte_reverse(l_in.data, to_integer(unsigned(l_in.length))); + -- trim and sign-extend + for i in 0 to 7 loop + if i < to_integer(unsigned(r.length)) then + if two_dwords = '1' then + trim_ctl(i) := '1' & not use_second(i); + else + trim_ctl(i) := not use_second(i) & '0'; + end if; + else + trim_ctl(i) := '0' & (negative and r.sign_extend); + end if; + case trim_ctl(i) is + when "11" => + data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8); + when "10" => + data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8); + when "01" => + data_trimmed(i * 8 + 7 downto i * 8) := x"FF"; + when others => + data_trimmed(i * 8 + 7 downto i * 8) := x"00"; + end case; + end loop; end if; - v.addr := lsu_sum; + case r.state is + when IDLE => + if l_in.valid = '1' then + v.load := l_in.load; + v.addr := lsu_sum; + v.write_reg := l_in.write_reg; + v.length := l_in.length; + v.byte_reverse := l_in.byte_reverse; + v.sign_extend := l_in.sign_extend; + v.update := l_in.update; + v.update_reg := l_in.update_reg; + v.xerc := l_in.xerc; + v.reserve := l_in.reserve; + v.rc := l_in.rc; + v.nc := l_in.ci; + + -- XXX Temporary hack. Mark the op as non-cachable if the address + -- is the form 0xc------- + -- + -- This will have to be replaced by a combination of implementing the + -- proper HV CI load/store instructions and having an MMU to get the I + -- bit otherwise. + if lsu_sum(31 downto 28) = "1100" then + v.nc := '1'; + end if; + + -- Do length_to_sel and work out if we are doing 2 dwords + long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0)); + byte_sel := long_sel(7 downto 0); + v.second_bytes := long_sel(15 downto 8); + + v.addr := lsu_sum; + + -- Do byte reversing and rotating for stores in the first cycle + if v.load = '0' then + byte_offset := unsigned(lsu_sum(2 downto 0)); + brev_lenm1 := "000"; + if l_in.byte_reverse = '1' then + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + end if; + for i in 0 to 7 loop + k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; + j := to_integer(k) * 8; + v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); + end loop; + end if; + + req := '1'; + stall := '1'; + if long_sel(15 downto 8) = "00000000" then + v.state := LAST_ACK_WAIT; + else + v.state := SECOND_REQ; + end if; + end if; + + when SECOND_REQ => + -- compute (addr + 8) & ~7 for the second doubleword when unaligned + addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; + byte_sel := r.second_bytes; + req := '1'; + stall := '1'; + v.state := FIRST_ACK_WAIT; + + when FIRST_ACK_WAIT => + stall := '1'; + if d_in.valid = '1' then + v.state := LAST_ACK_WAIT; + if r.load = '1' then + v.load_data := data_permuted; + end if; + end if; + + when LAST_ACK_WAIT => + stall := '1'; + if d_in.valid = '1' then + write_enable := r.load; + if r.load = '1' and r.update = '1' then + -- loads with rA update need an extra cycle + v.state := LD_UPDATE; + else + -- stores write back rA update in this cycle + do_update := r.update; + stall := '0'; + done := '1'; + v.state := IDLE; + end if; + end if; + + when LD_UPDATE => + do_update := '1'; + v.state := IDLE; + done := '1'; + end case; + + -- Update outputs to dcache + d_out.valid <= req; + d_out.load <= v.load; + d_out.nc <= v.nc; + d_out.reserve <= v.reserve; + d_out.addr <= addr; + d_out.data <= v.store_data; + d_out.byte_sel <= byte_sel; + + -- Update outputs to writeback + -- Multiplex either cache data to the destination GPR or + -- the address for the rA update. + l_out.valid <= done; + if do_update = '1' then + l_out.write_enable <= '1'; + l_out.write_reg <= r.update_reg; + l_out.write_data <= r.addr; + else + l_out.write_enable <= write_enable; + l_out.write_reg <= r.write_reg; + l_out.write_data <= data_trimmed; + end if; + l_out.xerc <= r.xerc; + l_out.rc <= r.rc and done; + l_out.store_done <= d_in.store_done; + + stall_out <= stall; -- Update registers rin <= v; - -- Update outputs - l_out <= r; end process; + end; diff --git a/writeback.vhdl b/writeback.vhdl index a730266..d1a7faf 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -11,7 +11,7 @@ entity writeback is clk : in std_ulogic; e_in : in Execute1ToWritebackType; - l_in : in DcacheToWritebackType; + l_in : in Loadstore1ToWritebackType; w_out : out WritebackToRegisterFileType; c_out : out WritebackToCrFileType; @@ -21,48 +21,15 @@ entity writeback is end entity writeback; architecture behaviour of writeback is - subtype byte_index_t is unsigned(2 downto 0); - type permutation_t is array(0 to 7) of byte_index_t; - subtype byte_trim_t is std_ulogic_vector(1 downto 0); - type trim_ctl_t is array(0 to 7) of byte_trim_t; - type byte_sel_t is array(0 to 7) of std_ulogic; - - signal data_len : unsigned(3 downto 0); - signal data_in : std_ulogic_vector(63 downto 0); - signal data_permuted : std_ulogic_vector(63 downto 0); - signal data_trimmed : std_ulogic_vector(63 downto 0); - signal data_latched : std_ulogic_vector(63 downto 0); - signal perm : permutation_t; - signal use_second : byte_sel_t; - signal byte_offset : unsigned(2 downto 0); - signal brev_lenm1 : unsigned(2 downto 0); - signal trim_ctl : trim_ctl_t; - signal rc : std_ulogic; - signal partial_write : std_ulogic; - signal sign_extend : std_ulogic; - signal negative : std_ulogic; - signal second_word : std_ulogic; begin - writeback_0: process(clk) - begin - if rising_edge(clk) then - if partial_write = '1' then - data_latched <= data_permuted; - end if; - end if; - end process; - writeback_1: process(all) variable x : std_ulogic_vector(0 downto 0); variable y : std_ulogic_vector(0 downto 0); - variable z : std_ulogic_vector(0 downto 0); variable w : std_ulogic_vector(0 downto 0); - variable j : integer; - variable k : unsigned(3 downto 0); variable cf: std_ulogic_vector(3 downto 0); - variable xe: xer_common_t; variable zero : std_ulogic; variable sign : std_ulogic; + variable scf : std_ulogic_vector(3 downto 0); begin x(0) := e_in.valid; y(0) := l_in.valid; @@ -84,17 +51,10 @@ begin complete_out <= '1'; end if; - rc <= '0'; - brev_lenm1 <= "000"; - partial_write <= '0'; - second_word <= '0'; - xe := e_in.xerc; - data_in <= (others => '0'); - if e_in.write_enable = '1' then w_out.write_reg <= e_in.write_reg; + w_out.write_data <= e_in.write_data; w_out.write_enable <= '1'; - rc <= e_in.rc; end if; if e_in.write_cr_enable = '1' then @@ -108,75 +68,26 @@ begin c_out.write_xerc_data <= e_in.xerc; end if; - sign_extend <= l_in.sign_extend; - data_len <= unsigned(l_in.write_len); - byte_offset <= unsigned(l_in.write_shift); if l_in.write_enable = '1' then w_out.write_reg <= gpr_to_gspr(l_in.write_reg); - if l_in.byte_reverse = '1' then - brev_lenm1 <= unsigned(l_in.write_len(2 downto 0)) - 1; - end if; + w_out.write_data <= l_in.write_data; w_out.write_enable <= '1'; - second_word <= l_in.second_word; - if l_in.valid = '0' and (data_len + byte_offset > 8) then - partial_write <= '1'; - end if; - xe := l_in.xerc; end if; - -- shift and byte-reverse data bytes - for i in 0 to 7 loop - k := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset); - perm(i) <= k(2 downto 0); - use_second(i) <= k(3); - end loop; - for i in 0 to 7 loop - j := to_integer(perm(i)) * 8; - data_permuted(i * 8 + 7 downto i * 8) <= l_in.write_data(j + 7 downto j); - end loop; - - -- If the data can arrive split over two cycles, this will be correct - -- provided we don't have both sign extension and byte reversal. - negative <= (data_len(3) and data_permuted(63)) or - (data_len(2) and data_permuted(31)) or - (data_len(1) and data_permuted(15)) or - (data_len(0) and data_permuted(7)); - - -- trim and sign-extend - for i in 0 to 7 loop - if i < to_integer(data_len) then - if second_word = '1' then - trim_ctl(i) <= '1' & not use_second(i); - else - trim_ctl(i) <= not use_second(i) & '0'; - end if; - else - trim_ctl(i) <= '0' & (negative and sign_extend); - end if; - end loop; - for i in 0 to 7 loop - case trim_ctl(i) is - when "11" => - data_trimmed(i * 8 + 7 downto i * 8) <= data_latched(i * 8 + 7 downto i * 8); - when "10" => - data_trimmed(i * 8 + 7 downto i * 8) <= data_permuted(i * 8 + 7 downto i * 8); - when "01" => - data_trimmed(i * 8 + 7 downto i * 8) <= x"FF"; - when others => - data_trimmed(i * 8 + 7 downto i * 8) <= x"00"; - end case; - end loop; - - -- deliver to regfile - if l_in.write_enable = '1' then - w_out.write_data <= data_trimmed; - else - w_out.write_data <= e_in.write_data; + if l_in.rc = '1' then + -- st*cx. instructions + scf(3) := '0'; + scf(2) := '0'; + scf(1) := l_in.store_done; + scf(0) := l_in.xerc.so; + c_out.write_cr_enable <= '1'; + c_out.write_cr_mask <= num_to_fxm(0); + c_out.write_cr_data(31 downto 28) <= scf; end if; -- Perform CR0 update for RC forms -- Note that loads never have a form with an RC bit, therefore this can test e_in.write_data - if rc = '1' then + if e_in.rc = '1' and e_in.write_enable = '1' then sign := e_in.write_data(63); zero := not (or e_in.write_data); c_out.write_cr_enable <= '1'; @@ -184,7 +95,7 @@ begin cf(3) := sign; cf(2) := not sign and not zero; cf(1) := zero; - cf(0) := xe.so; + cf(0) := e_in.xerc.so; c_out.write_cr_data(31 downto 28) <= cf; end if; end process;