# Summary: move load data formatting (byte permutation / byte reversal,
# trimming and sign extension) out of writeback and into loadstore1.
#   * common.vhdl:     drops write_len, write_shift, sign_extend, byte_reverse
#                      and second_word from Loadstore1ToWritebackType.
#   * loadstore1.vhdl: splits r.data into r.store_data / r.load_data and
#                      drives l_out.write_data with the formatted data_trimmed.
#   * writeback.vhdl:  removes the permute/trim logic and the data_latched
#                      register; w_out.write_data is passed straight through.
# NOTE(review): in loadstore1, data_trimmed is only assigned inside
# "if r.load = '1'", yet it is read when driving l_out.write_data. As a
# process variable it would keep its previous value on non-load cycles --
# confirm this is acceptable for synthesis (possible latch inference).
# NOTE(review): leading whitespace inside the hunks was reconstructed; if the
# patch does not apply byte-exactly, try whitespace-insensitive matching.
diff --git a/common.vhdl b/common.vhdl
index f581ccb..e4d810e 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -236,17 +236,11 @@ package common is
         write_enable: std_ulogic;
         write_reg : gpr_index_t;
         write_data : std_ulogic_vector(63 downto 0);
-        write_len : std_ulogic_vector(3 downto 0);
-        write_shift : std_ulogic_vector(2 downto 0);
-        sign_extend : std_ulogic;
-        byte_reverse : std_ulogic;
-        second_word : std_ulogic;
         xerc : xer_common_t;
         rc : std_ulogic;
         store_done : std_ulogic;
     end record;
-    constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0',
-                                                                       byte_reverse => '0', second_word => '0', xerc => xerc_init,
+    constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := (valid => '0', write_enable => '0', xerc => xerc_init,
                                                                        rc => '0', store_done => '0', others => (others => '0'));
 
     type Execute1ToWritebackType is record
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index 2ab71ad..8c16886 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -43,7 +43,8 @@ architecture behave of loadstore1 is
         -- latch most of the input request
         load : std_ulogic;
         addr : std_ulogic_vector(63 downto 0);
-        data : std_ulogic_vector(63 downto 0);
+        store_data : std_ulogic_vector(63 downto 0);
+        load_data : std_ulogic_vector(63 downto 0);
         write_reg : gpr_index_t;
         length : std_ulogic_vector(3 downto 0);
         byte_reverse : std_ulogic;
@@ -58,6 +59,10 @@ architecture behave of loadstore1 is
         second_bytes : std_ulogic_vector(7 downto 0);
     end record;
 
+    type byte_sel_t is array(0 to 7) of std_ulogic;
+    subtype byte_trim_t is std_ulogic_vector(1 downto 0);
+    type trim_ctl_t is array(0 to 7) of byte_trim_t;
+
     signal r, rin : reg_stage_t;
     signal lsu_sum : std_ulogic_vector(63 downto 0);
 
@@ -112,6 +117,7 @@ begin
         variable byte_offset : unsigned(2 downto 0);
         variable j : integer;
         variable k : unsigned(2 downto 0);
+        variable kk : unsigned(3 downto 0);
         variable long_sel : std_ulogic_vector(15 downto 0);
         variable byte_sel : std_ulogic_vector(7 downto 0);
         variable req : std_ulogic;
@@ -120,8 +126,13 @@ begin
         variable wdata : std_ulogic_vector(63 downto 0);
         variable write_enable : std_ulogic;
         variable do_update : std_ulogic;
-        variable second_dword : std_ulogic;
+        variable two_dwords : std_ulogic;
         variable done : std_ulogic;
+        variable data_permuted : std_ulogic_vector(63 downto 0);
+        variable data_trimmed : std_ulogic_vector(63 downto 0);
+        variable use_second : byte_sel_t;
+        variable trim_ctl : trim_ctl_t;
+        variable negative : std_ulogic;
     begin
         v := r;
         req := '0';
@@ -132,14 +143,63 @@ begin
 
         write_enable := '0';
         do_update := '0';
-        second_dword := '0';
+        two_dwords := or (r.second_bytes);
+
+        -- load data formatting
+        if r.load = '1' then
+            byte_offset := unsigned(r.addr(2 downto 0));
+            brev_lenm1 := "000";
+            if r.byte_reverse = '1' then
+                brev_lenm1 := unsigned(r.length(2 downto 0)) - 1;
+            end if;
+
+            -- shift and byte-reverse data bytes
+            for i in 0 to 7 loop
+                kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
+                use_second(i) := kk(3);
+                j := to_integer(kk(2 downto 0)) * 8;
+                data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j);
+            end loop;
+
+            -- Work out the sign bit for sign extension.
+            -- Assumes we are not doing both sign extension and byte reversal,
+            -- in that for unaligned loads crossing two dwords we end up
+            -- using a bit from the second dword, whereas for a byte-reversed
+            -- (i.e. big-endian) load the sign bit would be in the first dword.
+            negative := (r.length(3) and data_permuted(63)) or
+                        (r.length(2) and data_permuted(31)) or
+                        (r.length(1) and data_permuted(15)) or
+                        (r.length(0) and data_permuted(7));
+
+            -- trim and sign-extend
+            for i in 0 to 7 loop
+                if i < to_integer(unsigned(r.length)) then
+                    if two_dwords = '1' then
+                        trim_ctl(i) := '1' & not use_second(i);
+                    else
+                        trim_ctl(i) := not use_second(i) & '0';
+                    end if;
+                else
+                    trim_ctl(i) := '0' & (negative and r.sign_extend);
+                end if;
+                case trim_ctl(i) is
+                    when "11" =>
+                        data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8);
+                    when "10" =>
+                        data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8);
+                    when "01" =>
+                        data_trimmed(i * 8 + 7 downto i * 8) := x"FF";
+                    when others =>
+                        data_trimmed(i * 8 + 7 downto i * 8) := x"00";
+                end case;
+            end loop;
+        end if;
 
         case r.state is
         when IDLE =>
             if l_in.valid = '1' then
                 v.load := l_in.load;
                 v.addr := lsu_sum;
-                v.data := l_in.data;
                 v.write_reg := l_in.write_reg;
                 v.length := l_in.length;
                 v.byte_reverse := l_in.byte_reverse;
@@ -179,7 +239,7 @@ begin
                     for i in 0 to 7 loop
                         k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
                         j := to_integer(k) * 8;
-                        v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
+                        v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
                     end loop;
                 end if;
 
@@ -203,13 +263,14 @@ begin
         when FIRST_ACK_WAIT =>
             stall := '1';
             if d_in.valid = '1' then
-                write_enable := r.load;
                 v.state := LAST_ACK_WAIT;
+                if r.load = '1' then
+                    v.load_data := data_permuted;
+                end if;
             end if;
 
         when LAST_ACK_WAIT =>
             stall := '1';
-            second_dword := or (r.second_bytes);
             if d_in.valid = '1' then
                 write_enable := r.load;
                 if r.load = '1' and r.update = '1' then
@@ -230,16 +291,13 @@ begin
             done := '1';
         end case;
 
-        -- Update registers
-        rin <= v;
-
         -- Update outputs to dcache
         d_out.valid <= req;
         d_out.load <= v.load;
         d_out.nc <= v.nc;
         d_out.reserve <= v.reserve;
         d_out.addr <= addr;
-        d_out.data <= v.data;
+        d_out.data <= v.store_data;
         d_out.byte_sel <= byte_sel;
 
         -- Update outputs to writeback
@@ -250,28 +308,20 @@ begin
             l_out.write_enable <= '1';
             l_out.write_reg <= r.update_reg;
             l_out.write_data <= r.addr;
-            l_out.write_len <= x"8";
-            l_out.write_shift <= "000";
-            l_out.sign_extend <= '0';
-            l_out.byte_reverse <= '0';
-            l_out.second_word <= '0';
-            l_out.rc <= '0';
-            l_out.store_done <= '0';
         else
             l_out.write_enable <= write_enable;
             l_out.write_reg <= r.write_reg;
-            l_out.write_data <= d_in.data;
-            l_out.write_len <= r.length;
-            l_out.write_shift <= r.addr(2 downto 0);
-            l_out.sign_extend <= r.sign_extend;
-            l_out.byte_reverse <= r.byte_reverse;
-            l_out.second_word <= second_dword;
-            l_out.rc <= r.rc and done;
-            l_out.store_done <= d_in.store_done;
+            l_out.write_data <= data_trimmed;
         end if;
         l_out.xerc <= r.xerc;
+        l_out.rc <= r.rc and done;
+        l_out.store_done <= d_in.store_done;
 
         stall_out <= stall;
 
+        -- Update registers
+        rin <= v;
+
     end process;
+
 end;
diff --git a/writeback.vhdl b/writeback.vhdl
index d52bb54..d1a7faf 100644
--- a/writeback.vhdl
+++ b/writeback.vhdl
@@ -21,46 +21,12 @@ entity writeback is
 end entity writeback;
 
 architecture behaviour of writeback is
-    subtype byte_index_t is unsigned(2 downto 0);
-    type permutation_t is array(0 to 7) of byte_index_t;
-    subtype byte_trim_t is std_ulogic_vector(1 downto 0);
-    type trim_ctl_t is array(0 to 7) of byte_trim_t;
-    type byte_sel_t is array(0 to 7) of std_ulogic;
-
-    signal data_len : unsigned(3 downto 0);
-    signal data_in : std_ulogic_vector(63 downto 0);
-    signal data_permuted : std_ulogic_vector(63 downto 0);
-    signal data_trimmed : std_ulogic_vector(63 downto 0);
-    signal data_latched : std_ulogic_vector(63 downto 0);
-    signal perm : permutation_t;
-    signal use_second : byte_sel_t;
-    signal byte_offset : unsigned(2 downto 0);
-    signal brev_lenm1 : unsigned(2 downto 0);
-    signal trim_ctl : trim_ctl_t;
-    signal rc : std_ulogic;
-    signal partial_write : std_ulogic;
-    signal sign_extend : std_ulogic;
-    signal negative : std_ulogic;
-    signal second_word : std_ulogic;
 begin
-    writeback_0: process(clk)
-    begin
-        if rising_edge(clk) then
-            if partial_write = '1' then
-                data_latched <= data_permuted;
-            end if;
-        end if;
-    end process;
-
     writeback_1: process(all)
         variable x : std_ulogic_vector(0 downto 0);
         variable y : std_ulogic_vector(0 downto 0);
-        variable z : std_ulogic_vector(0 downto 0);
         variable w : std_ulogic_vector(0 downto 0);
-        variable j : integer;
-        variable k : unsigned(3 downto 0);
         variable cf: std_ulogic_vector(3 downto 0);
-        variable xe: xer_common_t;
         variable zero : std_ulogic;
         variable sign : std_ulogic;
         variable scf : std_ulogic_vector(3 downto 0);
@@ -85,17 +51,10 @@ begin
             complete_out <= '1';
         end if;
 
-        rc <= '0';
-        brev_lenm1 <= "000";
-        partial_write <= '0';
-        second_word <= '0';
-        xe := e_in.xerc;
-        data_in <= (others => '0');
-
         if e_in.write_enable = '1' then
             w_out.write_reg <= e_in.write_reg;
+            w_out.write_data <= e_in.write_data;
             w_out.write_enable <= '1';
-            rc <= e_in.rc;
         end if;
 
         if e_in.write_cr_enable = '1' then
@@ -109,20 +68,10 @@ begin
             c_out.write_xerc_data <= e_in.xerc;
         end if;
 
-        sign_extend <= l_in.sign_extend;
-        data_len <= unsigned(l_in.write_len);
-        byte_offset <= unsigned(l_in.write_shift);
         if l_in.write_enable = '1' then
             w_out.write_reg <= gpr_to_gspr(l_in.write_reg);
-            if l_in.byte_reverse = '1' then
-                brev_lenm1 <= unsigned(l_in.write_len(2 downto 0)) - 1;
-            end if;
-            second_word <= l_in.second_word;
-            if l_in.valid = '0' and (data_len + byte_offset > 8) then
-                partial_write <= '1';
-            end if;
-            xe := l_in.xerc;
-            w_out.write_enable <= not partial_write or second_word;
+            w_out.write_data <= l_in.write_data;
+            w_out.write_enable <= '1';
         end if;
 
         if l_in.rc = '1' then
@@ -130,65 +79,15 @@ begin
             scf(3) := '0';
             scf(2) := '0';
             scf(1) := l_in.store_done;
-            scf(0) := xe.so;
+            scf(0) := l_in.xerc.so;
             c_out.write_cr_enable <= '1';
             c_out.write_cr_mask <= num_to_fxm(0);
             c_out.write_cr_data(31 downto 28) <= scf;
         end if;
 
-        -- shift and byte-reverse data bytes
-        for i in 0 to 7 loop
-            k := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
-            perm(i) <= k(2 downto 0);
-            use_second(i) <= k(3);
-        end loop;
-        for i in 0 to 7 loop
-            j := to_integer(perm(i)) * 8;
-            data_permuted(i * 8 + 7 downto i * 8) <= l_in.write_data(j + 7 downto j);
-        end loop;
-
-        -- If the data can arrive split over two cycles, this will be correct
-        -- provided we don't have both sign extension and byte reversal.
-        negative <= (data_len(3) and data_permuted(63)) or
-                    (data_len(2) and data_permuted(31)) or
-                    (data_len(1) and data_permuted(15)) or
-                    (data_len(0) and data_permuted(7));
-
-        -- trim and sign-extend
-        for i in 0 to 7 loop
-            if i < to_integer(data_len) then
-                if second_word = '1' then
-                    trim_ctl(i) <= '1' & not use_second(i);
-                else
-                    trim_ctl(i) <= not use_second(i) & '0';
-                end if;
-            else
-                trim_ctl(i) <= '0' & (negative and sign_extend);
-            end if;
-        end loop;
-        for i in 0 to 7 loop
-            case trim_ctl(i) is
-                when "11" =>
-                    data_trimmed(i * 8 + 7 downto i * 8) <= data_latched(i * 8 + 7 downto i * 8);
-                when "10" =>
-                    data_trimmed(i * 8 + 7 downto i * 8) <= data_permuted(i * 8 + 7 downto i * 8);
-                when "01" =>
-                    data_trimmed(i * 8 + 7 downto i * 8) <= x"FF";
-                when others =>
-                    data_trimmed(i * 8 + 7 downto i * 8) <= x"00";
-            end case;
-        end loop;
-
-        -- deliver to regfile
-        if l_in.write_enable = '1' then
-            w_out.write_data <= data_trimmed;
-        else
-            w_out.write_data <= e_in.write_data;
-        end if;
-
         -- Perform CR0 update for RC forms
         -- Note that loads never have a form with an RC bit, therefore this can test e_in.write_data
-        if rc = '1' then
+        if e_in.rc = '1' and e_in.write_enable = '1' then
             sign := e_in.write_data(63);
             zero := not (or e_in.write_data);
             c_out.write_cr_enable <= '1';
@@ -196,7 +95,7 @@ begin
             cf(3) := sign;
             cf(2) := not sign and not zero;
             cf(1) := zero;
-            cf(0) := xe.so;
+            cf(0) := e_in.xerc.so;
             c_out.write_cr_data(31 downto 28) <= cf;
         end if;
     end process;