dcache: Add support for unaligned loads and stores

For an unaligned load or store, we do the first doubleword (dword) of
the transfer as normal, but then go to a new NEXT_DWORD state of the
state machine to do the cache tag lookup for the second dword of the
transfer.  From the NEXT_DWORD state we have much the same transitions
to other states as from the IDLE state (the transitions for OP_LOAD_HIT
are a bit different but almost identical for the other op values).

We now do the preparation of the data to be written in loadstore1,
that is, byte reversal if necessary and rotation by a number of
bytes based on the low 3 bits of the address.  We do rotation not
shifting so we have the bytes that need to go into the second
doubleword in the right place in the low bytes of the data sent to
dcache.  The rotation and byte reversal are done in a single step
with one multiplexer per byte by setting the select inputs for each
byte appropriately.

This also fixes writeback to not write the register value until it
has received both pieces of an unaligned load value.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/153/head
Paul Mackerras 4 years ago
parent 1587d9e6eb
commit 94dd8bc480

@ -124,6 +124,7 @@ architecture rtl of dcache is
-- Cache state machine
type state_t is (IDLE, -- Normal load hit processing
NEXT_DWORD, -- Starting the 2nd xfer of misaligned
LOAD_UPDATE, -- Load with update extra cycle
LOAD_UPDATE2, -- Load with update extra cycle
RELOAD_WAIT_ACK, -- Cache reload wait ack
@ -157,6 +158,12 @@ architecture rtl of dcache is
hit_way : way_t;
hit_load_valid : std_ulogic;

-- Info for doing the second transfer of a misaligned load/store
two_dwords : std_ulogic;
second_dword : std_ulogic;
next_addr : std_ulogic_vector(63 downto 0);
next_sel : std_ulogic_vector(7 downto 0);

-- Register update (load/store with update)
update_valid : std_ulogic;

@ -186,6 +193,8 @@ architecture rtl of dcache is
sign_extend : std_ulogic;
byte_reverse : std_ulogic;
xerc : xer_common_t;
last_dword : std_ulogic;
second_dword : std_ulogic;
end record;

signal r2 : reg_stage_2_t;
@ -196,7 +205,10 @@ architecture rtl of dcache is
signal req_hit_way : way_t;
signal req_tag : cache_tag_t;
signal req_op : op_t;
signal req_data : std_ulogic_vector(63 downto 0);
signal req_addr : std_ulogic_vector(63 downto 0);
signal req_laddr : std_ulogic_vector(63 downto 0);
signal req_sel : std_ulogic_vector(7 downto 0);

-- Cache RAM interface
type cache_ram_out_t is array(way_t) of cache_row_t;
@ -208,8 +220,9 @@ architecture rtl of dcache is
signal replace_way : way_t;

-- Wishbone read/write/cache write formatting signals
signal bus_sel : wishbone_sel_type;
signal store_data : wishbone_data_type;
signal bus_sel : std_ulogic_vector(15 downto 0);

signal two_dwords : std_ulogic;
--
-- Helper functions to decode incoming requests
@ -307,17 +320,17 @@ architecture rtl of dcache is
end case;
end function length_to_sel;

-- Calculate shift and byte enables for wishbone
function wishbone_data_shift(address : in std_ulogic_vector(63 downto 0)) return natural is
begin
return to_integer(unsigned(address(2 downto 0))) * 8;
end function wishbone_data_shift;

-- Calculate byte enables for wishbone
-- This returns 16 bits, giving the select signals for two transfers,
-- to account for unaligned loads or stores
function wishbone_data_sel(size : in std_logic_vector(3 downto 0);
address : in std_logic_vector(63 downto 0))
return std_ulogic_vector is
variable longsel : std_ulogic_vector(15 downto 0);
begin
return std_ulogic_vector(shift_left(unsigned(length_to_sel(size)),
longsel := (others => '0');
longsel(7 downto 0) := length_to_sel(size);
return std_ulogic_vector(shift_left(unsigned(longsel),
to_integer(unsigned(address(2 downto 0)))));
end function wishbone_data_sel;

@ -383,23 +396,43 @@ begin
variable tmp : std_ulogic_vector(63 downto 0);
variable data : std_ulogic_vector(63 downto 0);
variable opsel : std_ulogic_vector(3 downto 0);
variable go : std_ulogic;
variable is_load : std_ulogic;
variable is_nc : std_ulogic;
begin
-- Extract line, row and tag from request
req_index <= get_index(d_in.addr);
req_row <= get_row(d_in.addr);
req_tag <= get_tag(d_in.addr);

-- Calculate address of beginning of cache line, will be
-- used for cache miss processing if needed
--
req_laddr <= d_in.addr(63 downto LINE_OFF_BITS) &
(LINE_OFF_BITS-1 downto 0 => '0');
if r1.state /= NEXT_DWORD then
req_addr <= d_in.addr;
req_data <= d_in.data;
req_sel <= bus_sel(7 downto 0);
go := d_in.valid;
is_load := d_in.load;
is_nc := d_in.nc;

else
req_addr <= r1.next_addr;
req_data <= r1.req.data;
req_sel <= r1.next_sel;
go := '1';
is_load := r1.req.load;
is_nc := r1.req.nc;
end if;

req_index <= get_index(req_addr);
req_row <= get_row(req_addr);
req_tag <= get_tag(req_addr);

-- Calculate address of beginning of cache line, will be
-- used for cache miss processing if needed
--
req_laddr <= req_addr(63 downto LINE_OFF_BITS) &
(LINE_OFF_BITS-1 downto 0 => '0');

-- Test if pending request is a hit on any way
hit_way := 0;
is_hit := '0';
for i in way_t loop
if d_in.valid = '1' and cache_valids(req_index)(i) = '1' then
if go = '1' and cache_valids(req_index)(i) = '1' then
if read_tag(i, cache_tags(req_index)) = req_tag then
hit_way := i;
is_hit := '1';
@ -416,7 +449,7 @@ begin
-- Combine the request and cache his status to decide what
-- operation needs to be done
--
opsel := d_in.valid & d_in.load & d_in.nc & is_hit;
opsel := go & is_load & is_nc & is_hit;
case opsel is
when "1101" => op := OP_LOAD_HIT;
when "1100" => op := OP_LOAD_MISS;
@ -433,22 +466,15 @@ begin

end process;

--
-- Misc signal assignments
--

-- Wire up wishbone request latch out of stage 1
wishbone_out <= r1.wb;

-- Wishbone & BRAM write data formatting for stores (most of it already
-- happens in loadstore1, this is the remaining data shifting)
--
store_data <= std_logic_vector(shift_left(unsigned(d_in.data),
wishbone_data_shift(d_in.addr)));

-- Wishbone read and write and BRAM write sel bits generation
bus_sel <= wishbone_data_sel(d_in.length, d_in.addr);

-- See if the operation crosses two doublewords
two_dwords <= or (bus_sel(15 downto 8));

-- TODO: Generate errors
-- err_nc_collision <= '1' when req_op = OP_BAD else '0';

@ -469,7 +495,7 @@ begin
d_out.write_shift <= r2.data_shift;
d_out.sign_extend <= r2.sign_extend;
d_out.byte_reverse <= r2.byte_reverse;
d_out.second_word <= '0';
d_out.second_word <= r2.second_dword;
d_out.xerc <= r2.xerc;

-- We have a valid load or store hit or we just completed a slow
@ -497,8 +523,10 @@ begin
if r2.hit_load_valid = '1' then
d_out.write_enable <= '1';

-- If it's not a load with update, complete it now
if r2.load_is_update = '0' then
-- If there isn't another dword to go and
-- it's not a load with update, complete it now
if r2.last_dword = '1' and r2.load_is_update = '0' then
report "completing load hit";
d_out.valid <= '1';
end if;
end if;
@ -521,10 +549,14 @@ begin
d_out.byte_reverse <= r1.req.byte_reverse;
d_out.write_len <= r1.req.length;
d_out.xerc <= r1.req.xerc;
d_out.second_word <= r1.second_dword;
end if;

-- If it's a store or a non-update load form, complete now
if r1.req.load = '0' or r1.req.update = '0' then
-- unless we need to do another dword transfer
if (r1.req.load = '0' or r1.req.update = '0') and
(r1.two_dwords = '0' or r1.second_dword = '1') then
report "completing store or load miss";
d_out.valid <= '1';
end if;
end if;
@ -543,11 +575,13 @@ begin
d_out.sign_extend <= '0';
d_out.byte_reverse <= '0';
d_out.xerc <= r1.req.xerc;
d_out.second_word <= '0';

-- If it was a load, this completes the operation (load with
-- update case).
--
if r1.req.load = '1' then
report "completing after load update";
d_out.valid <= '1';
end if;
end if;
@ -605,11 +639,11 @@ begin
-- For timing, the mux on wr_data/sel/addr is not dependent on anything
-- other than the current state. Only the do_write signal is.
--
if r1.state = IDLE then
-- When IDLE, the only write path is the store-hit update case
if r1.state = IDLE or r1.state = NEXT_DWORD then
-- In these states, the only write path is the store-hit update case
wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
wr_data <= store_data;
wr_sel <= bus_sel;
wr_data <= req_data;
wr_sel <= req_sel;
else
-- Otherwise, we might be doing a reload
wr_data <= wishbone_in.dat;
@ -648,6 +682,8 @@ begin
r2.length <= r1.req.length;
r2.sign_extend <= r1.req.sign_extend;
r2.byte_reverse <= r1.req.byte_reverse;
r2.second_dword <= r1.second_dword;
r2.last_dword <= r1.second_dword or not r1.two_dwords;

-- If we have a request incoming, we have to latch it as d_in.valid
-- is only set for a single cycle. It's up to the control logic to
@ -655,8 +691,12 @@ begin
-- single issue on load/stores so we are fine, later, we can generate
-- a stall output if necessary).

if req_op /= OP_NONE then
if req_op /= OP_NONE and d_in.valid = '1' then
r1.req <= d_in;
r1.second_dword <= '0';
r1.two_dwords <= two_dwords;
r1.next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000";
r1.next_sel <= bus_sel(15 downto 8);

report "op:" & op_t'image(req_op) &
" addr:" & to_hstring(d_in.addr) &
@ -666,6 +706,8 @@ begin
" idx:" & integer'image(req_index) &
" tag:" & to_hstring(req_tag) &
" way: " & integer'image(req_hit_way);
elsif r1.state = NEXT_DWORD then
r1.second_dword <= '1';
end if;

-- Fast path for load/store hits. Set signals for the writeback controls.
@ -713,24 +755,36 @@ begin
r1.update_valid <= '0';

-- We cannot currently process a new request when not idle
assert req_op = OP_NONE or r1.state = IDLE report "request " &
assert d_in.valid = '0' or r1.state = IDLE report "request " &
op_t'image(req_op) & " while in state " & state_t'image(r1.state)
severity FAILURE;

-- Main state machine
case r1.state is
when IDLE =>
when IDLE | NEXT_DWORD =>
case req_op is
when OP_LOAD_HIT =>
-- We have a load with update hit, we need the delayed update cycle
if d_in.update = '1' then
r1.state <= LOAD_UPDATE;
end if;
when OP_LOAD_HIT =>
if r1.state = IDLE then
-- If the load is misaligned then we will need to start
-- the state machine
if two_dwords = '1' then
r1.state <= NEXT_DWORD;
elsif d_in.update = '1' then
-- We have a load with update hit, we need the delayed update cycle
r1.state <= LOAD_UPDATE;
end if;
else
if r1.req.update = '1' then
r1.state <= LOAD_UPDATE;
else
r1.state <= IDLE;
end if;
end if;

when OP_LOAD_MISS =>
-- Normal load cache miss, start the reload machine
--
report "cache miss addr:" & to_hstring(d_in.addr) &
report "cache miss addr:" & to_hstring(req_addr) &
" idx:" & integer'image(req_index) &
" way:" & integer'image(replace_way) &
" tag:" & to_hstring(req_tag);
@ -765,8 +819,8 @@ begin
r1.state <= RELOAD_WAIT_ACK;

when OP_LOAD_NC =>
r1.wb.sel <= bus_sel;
r1.wb.adr <= d_in.addr(r1.wb.adr'left downto 3) & "000";
r1.wb.sel <= req_sel;
r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000";
r1.wb.cyc <= '1';
r1.wb.stb <= '1';
r1.wb.we <= '0';
@ -774,12 +828,10 @@ begin

when OP_STORE_HIT | OP_STORE_MISS =>
-- For store-with-update do the register update
if d_in.update = '1' then
r1.update_valid <= '1';
end if;
r1.wb.sel <= bus_sel;
r1.wb.adr <= d_in.addr(r1.wb.adr'left downto 3) & "000";
r1.wb.dat <= store_data;
r1.update_valid <= d_in.valid and d_in.update;
r1.wb.sel <= req_sel;
r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000";
r1.wb.dat <= req_data;
r1.wb.cyc <= '1';
r1.wb.stb <= '1';
r1.wb.we <= '1';
@ -831,11 +883,13 @@ begin
-- Cache line is now valid
cache_valids(r1.store_index)(r1.store_way) <= '1';

-- Complete the load that missed. For load with update
-- Write back the load data that we got, and start
-- the second dword if necessary. Otherwise, see if
-- we also need to do the deferred update cycle.
--
r1.slow_valid <= '1';
if r1.req.update = '1' then
if r1.two_dwords and not r1.second_dword then
r1.state <= NEXT_DWORD;
elsif r1.req.update = '1' then
r1.state <= LOAD_UPDATE2;
report "completing miss with load-update !";
else
@ -864,12 +918,15 @@ begin

-- Got ack ? complete.
if wishbone_in.ack = '1' then
r1.state <= IDLE;
if r1.two_dwords and not r1.second_dword then
r1.state <= NEXT_DWORD;
elsif r1.state = NC_LOAD_WAIT_ACK and r1.req.update = '1' then
r1.state <= LOAD_UPDATE2;
else
r1.state <= IDLE;
end if;
if r1.state = NC_LOAD_WAIT_ACK then
r1.slow_data <= wishbone_in.dat;
if r1.req.update = '1' then
r1.state <= LOAD_UPDATE2;
end if;
end if;
r1.slow_valid <= '1';
r1.wb.cyc <= '0';

@ -35,12 +35,15 @@ begin

loadstore1_1: process(all)
variable v : Loadstore1ToDcacheType;
variable brev_lenm1 : unsigned(2 downto 0);
variable byte_offset : unsigned(2 downto 0);
variable j : integer;
variable k : unsigned(2 downto 0);
begin
v := r;

v.valid := l_in.valid;
v.load := l_in.load;
v.data := l_in.data;
v.write_reg := l_in.write_reg;
v.length := l_in.length;
v.byte_reverse := l_in.byte_reverse;
@ -63,9 +66,18 @@ begin

-- XXX Do length_to_sel here ?

-- byte reverse stores in the first cycle
if v.load = '0' and l_in.byte_reverse = '1' then
v.data := byte_reverse(l_in.data, to_integer(unsigned(l_in.length)));
-- Do byte reversing and rotating for stores in the first cycle
if v.load = '0' then
byte_offset := unsigned(lsu_sum(2 downto 0));
brev_lenm1 := "000";
if l_in.byte_reverse = '1' then
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
end if;
for i in 0 to 7 loop
k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
j := to_integer(k) * 8;
v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
end loop;
end if;

v.addr := lsu_sum;

@ -116,12 +116,12 @@ begin
if l_in.byte_reverse = '1' then
brev_lenm1 <= unsigned(l_in.write_len(2 downto 0)) - 1;
end if;
w_out.write_enable <= '1';
second_word <= l_in.second_word;
if l_in.valid = '0' and (data_len + byte_offset > 8) then
partial_write <= '1';
end if;
xe := l_in.xerc;
w_out.write_enable <= not partial_write or second_word;
end if;

-- shift and byte-reverse data bytes

Loading…
Cancel
Save