You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
microwatt/dcache.vhdl

997 lines
34 KiB
VHDL

--
-- Set associative dcache write-through
--
-- TODO (in no specific order):
--
-- * See list in icache.vhdl
-- * Complete load misses on the cycle when WB data comes instead of
-- at the end of line (this requires dealing with requests coming in
-- while not idle...)
-- * Load with update could use one less non-pipelined cycle by moving
-- the register update to the pipeline bubble that exists when going
-- back to the IDLE state.
--
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
library work;
use work.utils.all;
use work.common.all;
use work.helpers.all;
use work.wishbone_types.all;
entity dcache is
generic (
-- Line size in bytes
LINE_SIZE : positive := 64;
-- Number of lines in a set
NUM_LINES : positive := 32;
-- Number of ways
NUM_WAYS : positive := 4
);
port (
clk : in std_ulogic;
rst : in std_ulogic;
d_in : in Loadstore1ToDcacheType;
d_out : out DcacheToWritebackType;
stall_out : out std_ulogic;
wishbone_out : out wishbone_master_out;
wishbone_in : in wishbone_slave_out
);
end entity dcache;
architecture rtl of dcache is
-- BRAM organisation: We never access more than wishbone_data_bits at
-- a time so to save resources we make the array only that wide, and
-- use consecutive indices for to make a cache "line"
--
-- ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
constant ROW_SIZE : natural := wishbone_data_bits / 8;
-- ROW_PER_LINE is the number of row (wishbone transactions) in a line
constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
-- BRAM_ROWS is the number of rows in BRAM needed to represent the full
-- dcache
constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
-- Bit fields counts in the address
-- ROW_BITS is the number of bits to select a row
constant ROW_BITS : natural := log2(BRAM_ROWS);
-- ROW_LINEBITS is the number of bits to select a row within a line
constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
-- LINE_OFF_BITS is the number of bits for the offset in a cache line
constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
-- ROW_OFF_BITS is the number of bits for the offset in a row
constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
-- INDEX_BITS is the number if bits to select a cache line
constant INDEX_BITS : natural := log2(NUM_LINES);
-- TAG_BITS is the number of bits of the tag part of the address
constant TAG_BITS : natural := 64 - LINE_OFF_BITS - INDEX_BITS;
-- WAY_BITS is the number of bits to select a way
constant WAY_BITS : natural := log2(NUM_WAYS);
-- Example of layout for 32 lines of 64 bytes:
--
-- .. tag |index| line |
-- .. | row | |
-- .. | |---| | ROW_LINEBITS (3)
-- .. | |--- - --| LINE_OFF_BITS (6)
-- .. | |- --| ROW_OFF_BITS (3)
-- .. |----- ---| | ROW_BITS (8)
-- .. |-----| | INDEX_BITS (5)
-- .. --------| | TAG_BITS (53)
subtype row_t is integer range 0 to BRAM_ROWS-1;
subtype index_t is integer range 0 to NUM_LINES-1;
subtype way_t is integer range 0 to NUM_WAYS-1;
-- The cache data BRAM organized as described above for each way
subtype cache_row_t is std_ulogic_vector(wishbone_data_bits-1 downto 0);
-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
-- not handle a clean (commented) definition of the cache tags as a 3d
-- memory. For now, work around it by putting all the tags
subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
-- type cache_tags_set_t is array(way_t) of cache_tag_t;
-- type cache_tags_array_t is array(index_t) of cache_tags_set_t;
constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
type cache_tags_array_t is array(index_t) of cache_tags_set_t;
-- The cache valid bits
subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
type cache_valids_t is array(index_t) of cache_way_valids_t;
-- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
signal cache_tags : cache_tags_array_t;
signal cache_valids : cache_valids_t;
attribute ram_style : string;
attribute ram_style of cache_tags : signal is "distributed";
-- Type of operation on a "valid" input
type op_t is (OP_NONE,
OP_LOAD_HIT, -- Cache hit on load
OP_LOAD_MISS, -- Load missing cache
OP_LOAD_NC, -- Non-cachable load
OP_BAD, -- BAD: Cache hit on NC load/store
OP_STORE_HIT, -- Store hitting cache
OP_STORE_MISS); -- Store missing cache
-- Cache state machine
type state_t is (IDLE, -- Normal load hit processing
dcache: Trim one cycle from the load hit path Currently we don't get the result from a load that hits in the dcache until the fourth cycle after the instruction was presented to loadstore1. This trims this back to 3 cycles by taking the low order bits of the address generated in loadstore1 into dcache directly (not via the output register of loadstore1) and using them to address the read port of the dcache data RAM. We use the lower 12 address bits here in the expectation that any reasonable data cache design will have a set size of 4kB or less in order to avoid the aliasing problems that can arise with a virtually-indexed physically-tagged cache if the set size is greater than the smallest page size provided by the MMU. With this we can get rid of r2 and drive the signals going to writeback from r1, since the load hit data is now available one cycle earlier. We need a multiplexer on the read address of the data cache RAM in order to handle the second doubleword of an unaligned access. One small complication is that we now need an extra cycle in the case of an unaligned load which misses in the data cache and which reads the 2nd-last and last doublewords of a cache line. This is the reason for the PRE_NEXT_DWORD state; if we just go straight to NEXT_DWORD then we end up having the write of the last doubleword of the cache line and the read of that same doubleword occurring in the same cycle, which means we read stale data rather than the just-fetched data. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
PRE_NEXT_DWORD, -- Extra state before NEXT_DWORD
NEXT_DWORD, -- Starting the 2nd xfer of misaligned
LOAD_UPDATE, -- Load with update extra cycle
RELOAD_WAIT_ACK, -- Cache reload wait ack
STORE_WAIT_ACK, -- Store wait ack
NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack
--
-- Dcache operations:
--
-- In order to make timing, we use the BRAMs with an output buffer,
-- which means that the BRAM output is delayed by an extra cycle.
--
-- Thus, the dcache has a 2-stage internal pipeline for cache hits
-- with no stalls.
--
-- All other operations are handled via stalling in the first stage.
--
-- The second stage can thus complete a hit at the same time as the
-- first stage emits a stall for a complex op.
--
-- First stage register, contains state for stage 1 of load hits
-- and for the state machine used by all other operations
--
type reg_stage_1_t is record
-- Latch the complete request from ls1
req : Loadstore1ToDcacheType;
-- Cache hit state
hit_way : way_t;
hit_load_valid : std_ulogic;
-- Info for doing the second transfer of a misaligned load/store
two_dwords : std_ulogic;
second_dword : std_ulogic;
next_addr : std_ulogic_vector(63 downto 0);
next_sel : std_ulogic_vector(7 downto 0);
-- Register update (load/store with update)
update_valid : std_ulogic;
-- Data buffer for "slow" read ops (load miss and NC loads).
slow_data : std_ulogic_vector(63 downto 0);
slow_valid : std_ulogic;
-- Signal to complete a failed stcx.
stcx_fail : std_ulogic;
-- Cache miss state (reload state machine)
state : state_t;
wb : wishbone_master_out;
store_way : way_t;
store_row : row_t;
store_index : index_t;
end record;
signal r1 : reg_stage_1_t;
-- Reservation information
--
type reservation_t is record
valid : std_ulogic;
addr : std_ulogic_vector(63 downto LINE_OFF_BITS);
end record;
signal reservation : reservation_t;
-- Async signals on incoming request
signal req_index : index_t;
signal req_row : row_t;
signal req_hit_way : way_t;
signal req_tag : cache_tag_t;
signal req_op : op_t;
signal req_data : std_ulogic_vector(63 downto 0);
signal req_addr : std_ulogic_vector(63 downto 0);
signal req_laddr : std_ulogic_vector(63 downto 0);
signal req_sel : std_ulogic_vector(7 downto 0);
dcache: Trim one cycle from the load hit path Currently we don't get the result from a load that hits in the dcache until the fourth cycle after the instruction was presented to loadstore1. This trims this back to 3 cycles by taking the low order bits of the address generated in loadstore1 into dcache directly (not via the output register of loadstore1) and using them to address the read port of the dcache data RAM. We use the lower 12 address bits here in the expectation that any reasonable data cache design will have a set size of 4kB or less in order to avoid the aliasing problems that can arise with a virtually-indexed physically-tagged cache if the set size is greater than the smallest page size provided by the MMU. With this we can get rid of r2 and drive the signals going to writeback from r1, since the load hit data is now available one cycle earlier. We need a multiplexer on the read address of the data cache RAM in order to handle the second doubleword of an unaligned access. One small complication is that we now need an extra cycle in the case of an unaligned load which misses in the data cache and which reads the 2nd-last and last doublewords of a cache line. This is the reason for the PRE_NEXT_DWORD state; if we just go straight to NEXT_DWORD then we end up having the write of the last doubleword of the cache line and the read of that same doubleword occurring in the same cycle, which means we read stale data rather than the just-fetched data. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
signal next_addr : std_ulogic_vector(63 downto 0);
signal early_req_addr : std_ulogic_vector(11 downto 0);
signal early_req_row : row_t;
signal cancel_store : std_ulogic;
signal set_rsrv : std_ulogic;
signal clear_rsrv : std_ulogic;
-- Cache RAM interface
type cache_ram_out_t is array(way_t) of cache_row_t;
signal cache_out : cache_ram_out_t;
-- PLRU output interface
type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_victim : plru_out_t;
signal replace_way : way_t;
-- Wishbone read/write/cache write formatting signals
signal bus_sel : std_ulogic_vector(15 downto 0);
signal two_dwords : std_ulogic;
--
-- Helper functions to decode incoming requests
--
-- Return the cache line index (tag index) for an address
function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is
begin
return to_integer(unsigned(addr(63-TAG_BITS downto LINE_OFF_BITS)));
end;
-- Return the cache row index (data memory) for an address
function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is
begin
return to_integer(unsigned(addr(63-TAG_BITS downto ROW_OFF_BITS)));
end;
-- Returns whether this is the last row of a line
function is_last_row_addr(addr: wishbone_addr_type) return boolean is
constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1');
begin
return addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = ones;
end;
-- Returns whether this is the last row of a line
function is_last_row(row: row_t) return boolean is
variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1');
begin
row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
return row_v(ROW_LINEBITS-1 downto 0) = ones;
end;
-- Return the address of the next row in the current cache line
function next_row_addr(addr: wishbone_addr_type) return std_ulogic_vector is
variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
variable result : wishbone_addr_type;
begin
-- Is there no simpler way in VHDL to generate that 3 bits adder ?
row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
result := addr;
result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
return result;
end;
-- Return the next row in the current cache line. We use a dedicated
-- function in order to limit the size of the generated adder to be
-- only the bits within a cache line (3 bits with default settings)
--
function next_row(row: row_t) return row_t is
variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
begin
row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
row_idx := row_v(ROW_LINEBITS-1 downto 0);
row_v(ROW_LINEBITS-1 downto 0) := std_ulogic_vector(unsigned(row_idx) + 1);
return to_integer(unsigned(row_v));
end;
-- Get the tag value from the address
function get_tag(addr: std_ulogic_vector(63 downto 0)) return cache_tag_t is
begin
return addr(63 downto 64-TAG_BITS);
end;
-- Read a tag from a tag memory row
function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is
begin
return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
end;
-- Write a tag to tag memory row
procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t;
tag: cache_tag_t) is
begin
tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
end;
-- Generate byte enables from sizes
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
begin
case length is
when "0001" =>
return "00000001";
when "0010" =>
return "00000011";
when "0100" =>
return "00001111";
when "1000" =>
return "11111111";
when others =>
return "00000000";
end case;
end function length_to_sel;
-- Calculate byte enables for wishbone
-- This returns 16 bits, giving the select signals for two transfers,
-- to account for unaligned loads or stores
function wishbone_data_sel(size : in std_logic_vector(3 downto 0);
address : in std_logic_vector(63 downto 0))
return std_ulogic_vector is
variable longsel : std_ulogic_vector(15 downto 0);
begin
longsel := (others => '0');
longsel(7 downto 0) := length_to_sel(size);
return std_ulogic_vector(shift_left(unsigned(longsel),
to_integer(unsigned(address(2 downto 0)))));
end function wishbone_data_sel;
begin
assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE;
assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE;
assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2" severity FAILURE;
assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
report "geometry bits don't add up" severity FAILURE;
assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
report "geometry bits don't add up" severity FAILURE;
assert (64 = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
report "geometry bits don't add up" severity FAILURE;
assert (64 = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
report "geometry bits don't add up" severity FAILURE;
assert (64 = wishbone_data_bits)
report "Can't yet handle a wishbone width that isn't 64-bits" severity FAILURE;
-- Generate PLRUs
maybe_plrus: if NUM_WAYS > 1 generate
begin
plrus: for i in 0 to NUM_LINES-1 generate
-- PLRU interface
signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_acc_en : std_ulogic;
signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
begin
plru : entity work.plru
generic map (
BITS => WAY_BITS
)
port map (
clk => clk,
rst => rst,
acc => plru_acc,
acc_en => plru_acc_en,
lru => plru_out
);
process(req_index, req_op, req_hit_way, plru_out)
begin
-- PLRU interface
if (req_op = OP_LOAD_HIT or
req_op = OP_STORE_HIT) and req_index = i then
plru_acc_en <= '1';
else
plru_acc_en <= '0';
end if;
plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS));
plru_victim(i) <= plru_out;
end process;
end generate;
end generate;
dcache: Trim one cycle from the load hit path Currently we don't get the result from a load that hits in the dcache until the fourth cycle after the instruction was presented to loadstore1. This trims this back to 3 cycles by taking the low order bits of the address generated in loadstore1 into dcache directly (not via the output register of loadstore1) and using them to address the read port of the dcache data RAM. We use the lower 12 address bits here in the expectation that any reasonable data cache design will have a set size of 4kB or less in order to avoid the aliasing problems that can arise with a virtually-indexed physically-tagged cache if the set size is greater than the smallest page size provided by the MMU. With this we can get rid of r2 and drive the signals going to writeback from r1, since the load hit data is now available one cycle earlier. We need a multiplexer on the read address of the data cache RAM in order to handle the second doubleword of an unaligned access. One small complication is that we now need an extra cycle in the case of an unaligned load which misses in the data cache and which reads the 2nd-last and last doublewords of a cache line. This is the reason for the PRE_NEXT_DWORD state; if we just go straight to NEXT_DWORD then we end up having the write of the last doubleword of the cache line and the read of that same doubleword occurring in the same cycle, which means we read stale data rather than the just-fetched data. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
-- Wishbone read and write and BRAM write sel bits generation
bus_sel <= wishbone_data_sel(d_in.length, d_in.addr);
-- See if the operation crosses two doublewords
two_dwords <= or (bus_sel(15 downto 8));
-- Cache request parsing and hit detection
dcache_request : process(all)
variable is_hit : std_ulogic;
variable hit_way : way_t;
variable op : op_t;
variable tmp : std_ulogic_vector(63 downto 0);
variable data : std_ulogic_vector(63 downto 0);
variable opsel : std_ulogic_vector(3 downto 0);
variable go : std_ulogic;
variable is_load : std_ulogic;
variable is_nc : std_ulogic;
begin
-- Extract line, row and tag from request
if r1.state /= NEXT_DWORD then
req_addr <= d_in.addr;
req_data <= d_in.data;
req_sel <= bus_sel(7 downto 0);
go := d_in.valid;
is_load := d_in.load;
is_nc := d_in.nc;
else
req_addr <= r1.next_addr;
req_data <= r1.req.data;
req_sel <= r1.next_sel;
go := '1';
is_load := r1.req.load;
is_nc := r1.req.nc;
end if;
req_index <= get_index(req_addr);
req_row <= get_row(req_addr);
req_tag <= get_tag(req_addr);
-- Calculate address of beginning of cache line, will be
-- used for cache miss processing if needed
--
req_laddr <= req_addr(63 downto LINE_OFF_BITS) &
(LINE_OFF_BITS-1 downto 0 => '0');
dcache: Trim one cycle from the load hit path Currently we don't get the result from a load that hits in the dcache until the fourth cycle after the instruction was presented to loadstore1. This trims this back to 3 cycles by taking the low order bits of the address generated in loadstore1 into dcache directly (not via the output register of loadstore1) and using them to address the read port of the dcache data RAM. We use the lower 12 address bits here in the expectation that any reasonable data cache design will have a set size of 4kB or less in order to avoid the aliasing problems that can arise with a virtually-indexed physically-tagged cache if the set size is greater than the smallest page size provided by the MMU. With this we can get rid of r2 and drive the signals going to writeback from r1, since the load hit data is now available one cycle earlier. We need a multiplexer on the read address of the data cache RAM in order to handle the second doubleword of an unaligned access. One small complication is that we now need an extra cycle in the case of an unaligned load which misses in the data cache and which reads the 2nd-last and last doublewords of a cache line. This is the reason for the PRE_NEXT_DWORD state; if we just go straight to NEXT_DWORD then we end up having the write of the last doubleword of the cache line and the read of that same doubleword occurring in the same cycle, which means we read stale data rather than the just-fetched data. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
-- Address of next doubleword, used for unaligned accesses
next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000";
-- Test if pending request is a hit on any way
hit_way := 0;
is_hit := '0';
for i in way_t loop
if go = '1' and cache_valids(req_index)(i) = '1' then
if read_tag(i, cache_tags(req_index)) = req_tag then
hit_way := i;
is_hit := '1';
end if;
end if;
end loop;
-- The way that matched on a hit
req_hit_way <= hit_way;
-- The way to replace on a miss
replace_way <= to_integer(unsigned(plru_victim(req_index)));
-- Combine the request and cache his status to decide what
-- operation needs to be done
--
opsel := go & is_load & is_nc & is_hit;
case opsel is
when "1101" => op := OP_LOAD_HIT;
when "1100" => op := OP_LOAD_MISS;
when "1110" => op := OP_LOAD_NC;
when "1001" => op := OP_STORE_HIT;
when "1000" => op := OP_STORE_MISS;
when "1010" => op := OP_STORE_MISS;
when "1011" => op := OP_BAD;
when "1111" => op := OP_BAD;
when others => op := OP_NONE;
end case;
req_op <= op;
dcache: Trim one cycle from the load hit path Currently we don't get the result from a load that hits in the dcache until the fourth cycle after the instruction was presented to loadstore1. This trims this back to 3 cycles by taking the low order bits of the address generated in loadstore1 into dcache directly (not via the output register of loadstore1) and using them to address the read port of the dcache data RAM. We use the lower 12 address bits here in the expectation that any reasonable data cache design will have a set size of 4kB or less in order to avoid the aliasing problems that can arise with a virtually-indexed physically-tagged cache if the set size is greater than the smallest page size provided by the MMU. With this we can get rid of r2 and drive the signals going to writeback from r1, since the load hit data is now available one cycle earlier. We need a multiplexer on the read address of the data cache RAM in order to handle the second doubleword of an unaligned access. One small complication is that we now need an extra cycle in the case of an unaligned load which misses in the data cache and which reads the 2nd-last and last doublewords of a cache line. This is the reason for the PRE_NEXT_DWORD state; if we just go straight to NEXT_DWORD then we end up having the write of the last doubleword of the cache line and the read of that same doubleword occurring in the same cycle, which means we read stale data rather than the just-fetched data. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
-- Versions of the address and row number that are valid one cycle earlier
-- in the cases where we need to read the cache data BRAM.
if r1.state = IDLE and op = OP_LOAD_HIT and two_dwords = '1' then
early_req_addr <= next_addr(11 downto 0);
elsif r1.state /= IDLE and r1.two_dwords = '1' and r1.second_dword = '0' then
early_req_addr <= r1.next_addr(11 downto 0);
else
early_req_addr <= d_in.early_low_addr;
end if;
early_req_row <= get_row(x"0000000000000" & early_req_addr);
end process;
-- Wire up wishbone request latch out of stage 1
wishbone_out <= r1.wb;
-- TODO: Generate errors
-- err_nc_collision <= '1' when req_op = OP_BAD else '0';
-- Generate stalls from stage 1 state machine
stall_out <= '1' when r1.state /= IDLE else '0';
-- Handle load-with-reservation and store-conditional instructions
reservation_comb: process(all)
begin
cancel_store <= '0';
set_rsrv <= '0';
clear_rsrv <= '0';
if d_in.valid = '1' and d_in.reserve = '1' then
-- XXX generate alignment interrupt if address is not aligned
-- XXX or if d_in.nc = '1'
if d_in.load = '1' then
-- load with reservation
set_rsrv <= '1';
else
-- store conditional
clear_rsrv <= '1';
if reservation.valid = '0' or
d_in.addr(63 downto LINE_OFF_BITS) /= reservation.addr then
cancel_store <= '1';
end if;
end if;
end if;
end process;
reservation_reg: process(clk)
begin
if rising_edge(clk) then
if rst = '1' or clear_rsrv = '1' then
reservation.valid <= '0';
elsif set_rsrv = '1' then
reservation.valid <= '1';
reservation.addr <= d_in.addr(63 downto LINE_OFF_BITS);
end if;
end if;
end process;
-- Writeback (loads and reg updates) & completion control logic
--
writeback_control: process(all)
begin
-- The mux on d_out.write reg defaults to the normal load hit case.
d_out.write_enable <= '0';
d_out.valid <= '0';
dcache: Trim one cycle from the load hit path Currently we don't get the result from a load that hits in the dcache until the fourth cycle after the instruction was presented to loadstore1. This trims this back to 3 cycles by taking the low order bits of the address generated in loadstore1 into dcache directly (not via the output register of loadstore1) and using them to address the read port of the dcache data RAM. We use the lower 12 address bits here in the expectation that any reasonable data cache design will have a set size of 4kB or less in order to avoid the aliasing problems that can arise with a virtually-indexed physically-tagged cache if the set size is greater than the smallest page size provided by the MMU. With this we can get rid of r2 and drive the signals going to writeback from r1, since the load hit data is now available one cycle earlier. We need a multiplexer on the read address of the data cache RAM in order to handle the second doubleword of an unaligned access. One small complication is that we now need an extra cycle in the case of an unaligned load which misses in the data cache and which reads the 2nd-last and last doublewords of a cache line. This is the reason for the PRE_NEXT_DWORD state; if we just go straight to NEXT_DWORD then we end up having the write of the last doubleword of the cache line and the read of that same doubleword occurring in the same cycle, which means we read stale data rather than the just-fetched data. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
d_out.write_reg <= r1.req.write_reg;
d_out.write_data <= cache_out(r1.hit_way);
d_out.write_len <= r1.req.length;
d_out.write_shift <= r1.req.addr(2 downto 0);
d_out.sign_extend <= r1.req.sign_extend;
d_out.byte_reverse <= r1.req.byte_reverse;
d_out.second_word <= r1.second_dword;
d_out.xerc <= r1.req.xerc;
d_out.rc <= '0'; -- loads never have rc=1
d_out.store_done <= '0';
-- We have a valid load or store hit or we just completed a slow
-- op such as a load miss, a NC load or a store
--
-- Note: the load hit is delayed by one cycle. However it can still
-- not collide with r.slow_valid (well unless I miscalculated) because
-- slow_valid can only be set on a subsequent request and not on its
-- first cycle (the state machine must have advanced), which makes
-- slow_valid at least 2 cycles from the previous hit_load_valid.
--
-- Sanity: Only one of these must be set in any given cycle
dcache: Trim one cycle from the load hit path Currently we don't get the result from a load that hits in the dcache until the fourth cycle after the instruction was presented to loadstore1. This trims this back to 3 cycles by taking the low order bits of the address generated in loadstore1 into dcache directly (not via the output register of loadstore1) and using them to address the read port of the dcache data RAM. We use the lower 12 address bits here in the expectation that any reasonable data cache design will have a set size of 4kB or less in order to avoid the aliasing problems that can arise with a virtually-indexed physically-tagged cache if the set size is greater than the smallest page size provided by the MMU. With this we can get rid of r2 and drive the signals going to writeback from r1, since the load hit data is now available one cycle earlier. We need a multiplexer on the read address of the data cache RAM in order to handle the second doubleword of an unaligned access. One small complication is that we now need an extra cycle in the case of an unaligned load which misses in the data cache and which reads the 2nd-last and last doublewords of a cache line. This is the reason for the PRE_NEXT_DWORD state; if we just go straight to NEXT_DWORD then we end up having the write of the last doubleword of the cache line and the read of that same doubleword occurring in the same cycle, which means we read stale data rather than the just-fetched data. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
assert (r1.update_valid and r1.hit_load_valid) /= '1' report
"unexpected hit_load_delayed collision with update_valid"
severity FAILURE;
assert (r1.slow_valid and r1.stcx_fail) /= '1' report
"unexpected slow_valid collision with stcx_fail"
severity FAILURE;
dcache: Trim one cycle from the load hit path Currently we don't get the result from a load that hits in the dcache until the fourth cycle after the instruction was presented to loadstore1. This trims this back to 3 cycles by taking the low order bits of the address generated in loadstore1 into dcache directly (not via the output register of loadstore1) and using them to address the read port of the dcache data RAM. We use the lower 12 address bits here in the expectation that any reasonable data cache design will have a set size of 4kB or less in order to avoid the aliasing problems that can arise with a virtually-indexed physically-tagged cache if the set size is greater than the smallest page size provided by the MMU. With this we can get rid of r2 and drive the signals going to writeback from r1, since the load hit data is now available one cycle earlier. We need a multiplexer on the read address of the data cache RAM in order to handle the second doubleword of an unaligned access. One small complication is that we now need an extra cycle in the case of an unaligned load which misses in the data cache and which reads the 2nd-last and last doublewords of a cache line. This is the reason for the PRE_NEXT_DWORD state; if we just go straight to NEXT_DWORD then we end up having the write of the last doubleword of the cache line and the read of that same doubleword occurring in the same cycle, which means we read stale data rather than the just-fetched data. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid) /= '1' report
"unexpected hit_load_delayed collision with slow_valid"
severity FAILURE;
assert ((r1.slow_valid or r1.stcx_fail) and r1.update_valid) /= '1' report
"unexpected update_valid collision with slow_valid or stcx_fail"
severity FAILURE;
dcache: Trim one cycle from the load hit path Currently we don't get the result from a load that hits in the dcache until the fourth cycle after the instruction was presented to loadstore1. This trims this back to 3 cycles by taking the low order bits of the address generated in loadstore1 into dcache directly (not via the output register of loadstore1) and using them to address the read port of the dcache data RAM. We use the lower 12 address bits here in the expectation that any reasonable data cache design will have a set size of 4kB or less in order to avoid the aliasing problems that can arise with a virtually-indexed physically-tagged cache if the set size is greater than the smallest page size provided by the MMU. With this we can get rid of r2 and drive the signals going to writeback from r1, since the load hit data is now available one cycle earlier. We need a multiplexer on the read address of the data cache RAM in order to handle the second doubleword of an unaligned access. One small complication is that we now need an extra cycle in the case of an unaligned load which misses in the data cache and which reads the 2nd-last and last doublewords of a cache line. This is the reason for the PRE_NEXT_DWORD state; if we just go straight to NEXT_DWORD then we end up having the write of the last doubleword of the cache line and the read of that same doubleword occurring in the same cycle, which means we read stale data rather than the just-fetched data. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
-- Load hit case is the standard path
if r1.hit_load_valid = '1' then
d_out.write_enable <= '1';
-- If there isn't another dword to go and
-- it's not a load with update, complete it now
dcache: Trim one cycle from the load hit path Currently we don't get the result from a load that hits in the dcache until the fourth cycle after the instruction was presented to loadstore1. This trims this back to 3 cycles by taking the low order bits of the address generated in loadstore1 into dcache directly (not via the output register of loadstore1) and using them to address the read port of the dcache data RAM. We use the lower 12 address bits here in the expectation that any reasonable data cache design will have a set size of 4kB or less in order to avoid the aliasing problems that can arise with a virtually-indexed physically-tagged cache if the set size is greater than the smallest page size provided by the MMU. With this we can get rid of r2 and drive the signals going to writeback from r1, since the load hit data is now available one cycle earlier. We need a multiplexer on the read address of the data cache RAM in order to handle the second doubleword of an unaligned access. One small complication is that we now need an extra cycle in the case of an unaligned load which misses in the data cache and which reads the 2nd-last and last doublewords of a cache line. This is the reason for the PRE_NEXT_DWORD state; if we just go straight to NEXT_DWORD then we end up having the write of the last doubleword of the cache line and the read of that same doubleword occurring in the same cycle, which means we read stale data rather than the just-fetched data. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
if (r1.second_dword or not r1.two_dwords) = '1' and
r1.req.update = '0' then
report "completing load hit";
d_out.valid <= '1';
end if;
end if;
-- Slow ops (load miss, NC, stores)
if r1.slow_valid = '1' then
-- If it's a load, enable register writeback and switch
-- mux accordingly
--
if r1.req.load then
d_out.write_reg <= r1.req.write_reg;
d_out.write_enable <= '1';
-- Read data comes from the slow data latch, formatter
-- from the latched request.
--
d_out.write_data <= r1.slow_data;
d_out.write_shift <= r1.req.addr(2 downto 0);
d_out.sign_extend <= r1.req.sign_extend;
d_out.byte_reverse <= r1.req.byte_reverse;
d_out.write_len <= r1.req.length;
Add basic XER support The carry is currently internal to execute1. We don't handle any of the other XER fields. This creates type called "xer_common_t" that contains the commonly used XER bits (CA, CA32, SO, OV, OV32). The value is stored in the CR file (though it could be a separate module). The rest of the bits will be implemented as a separate SPR and the two parts reconciled in mfspr/mtspr in latter commits. We always read XER in decode2 (there is little point not to) and send it down all pipeline branches as it will be needed in writeback for all type of instructions when CR0:SO needs to be updated (such forms exist for all pipeline branches even if we don't yet implement them). To avoid having to track XER hazards, we forward it back in EX1. This assumes that other pipeline branches that can modify it (mult and div) are running single issue for now. One additional hazard to beware of is an XER:SO modifying instruction in EX1 followed immediately by a store conditional. Due to our writeback latency, the store will go down the LSU with the previous XER value, thus the stcx. will set CR0:SO using an obsolete SO value. I doubt there exist any code relying on this behaviour being correct but we should account for it regardless, possibly by ensuring that stcx. remain single issue initially, or later by adding some minimal tracking or moving the LSU into the same pipeline as execute. Missing some obscure XER affecting instructions like addex or mcrxrx. [paulus@ozlabs.org - fix CA32 and OV32 for OP_ADD, fix order of arguments to set_ov] Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
d_out.xerc <= r1.req.xerc;
d_out.second_word <= r1.second_dword;
end if;
d_out.rc <= r1.req.rc;
d_out.store_done <= '1';
-- If it's a store or a non-update load form, complete now
-- unless we need to do another dword transfer
if (r1.req.load = '0' or r1.req.update = '0') and
(r1.two_dwords = '0' or r1.second_dword = '1') then
report "completing store or load miss";
d_out.valid <= '1';
end if;
end if;
if r1.stcx_fail = '1' then
d_out.rc <= r1.req.rc;
d_out.store_done <= '0';
d_out.valid <= '1';
end if;
-- We have a register update to do.
if r1.update_valid = '1' then
d_out.write_enable <= '1';
d_out.write_reg <= r1.req.update_reg;
-- Change the read data mux to the address that's going into
-- the register and the formatter does nothing.
--
d_out.write_data <= r1.req.addr;
d_out.write_shift <= "000";
d_out.write_len <= "1000";
d_out.sign_extend <= '0';
d_out.byte_reverse <= '0';
Add basic XER support The carry is currently internal to execute1. We don't handle any of the other XER fields. This creates type called "xer_common_t" that contains the commonly used XER bits (CA, CA32, SO, OV, OV32). The value is stored in the CR file (though it could be a separate module). The rest of the bits will be implemented as a separate SPR and the two parts reconciled in mfspr/mtspr in latter commits. We always read XER in decode2 (there is little point not to) and send it down all pipeline branches as it will be needed in writeback for all type of instructions when CR0:SO needs to be updated (such forms exist for all pipeline branches even if we don't yet implement them). To avoid having to track XER hazards, we forward it back in EX1. This assumes that other pipeline branches that can modify it (mult and div) are running single issue for now. One additional hazard to beware of is an XER:SO modifying instruction in EX1 followed immediately by a store conditional. Due to our writeback latency, the store will go down the LSU with the previous XER value, thus the stcx. will set CR0:SO using an obsolete SO value. I doubt there exist any code relying on this behaviour being correct but we should account for it regardless, possibly by ensuring that stcx. remain single issue initially, or later by adding some minimal tracking or moving the LSU into the same pipeline as execute. Missing some obscure XER affecting instructions like addex or mcrxrx. [paulus@ozlabs.org - fix CA32 and OV32 for OP_ADD, fix order of arguments to set_ov] Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
d_out.xerc <= r1.req.xerc;
d_out.second_word <= '0';
-- If it was a load, this completes the operation (load with
-- update case).
--
if r1.req.load = '1' then
report "completing after load update";
d_out.valid <= '1';
end if;
end if;
end process;
--
-- Generate a cache RAM for each way. This handles the normal
-- reads, writes from reloads and the special store-hit update
-- path as well.
--
-- Note: the BRAMs have an extra read buffer, meaning the output
-- is pipelined an extra cycle. This differs from the
-- icache. The writeback logic needs to take that into
-- account by using 1-cycle delayed signals for load hits.
--
rams: for i in 0 to NUM_WAYS-1 generate
signal do_read : std_ulogic;
signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
signal do_write : std_ulogic;
signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
signal wr_data : std_ulogic_vector(wishbone_data_bits-1 downto 0);
signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
signal dout : cache_row_t;
begin
way: entity work.cache_ram
generic map (
ROW_BITS => ROW_BITS,
WIDTH => wishbone_data_bits,
ADD_BUF => true
)
port map (
clk => clk,
rd_en => do_read,
rd_addr => rd_addr,
rd_data => dout,
wr_en => do_write,
wr_sel => wr_sel,
wr_addr => wr_addr,
wr_data => wr_data
);
process(all)
variable tmp_adr : std_ulogic_vector(63 downto 0);
variable reloading : boolean;
begin
-- Cache hit reads
do_read <= '1';
dcache: Trim one cycle from the load hit path Currently we don't get the result from a load that hits in the dcache until the fourth cycle after the instruction was presented to loadstore1. This trims this back to 3 cycles by taking the low order bits of the address generated in loadstore1 into dcache directly (not via the output register of loadstore1) and using them to address the read port of the dcache data RAM. We use the lower 12 address bits here in the expectation that any reasonable data cache design will have a set size of 4kB or less in order to avoid the aliasing problems that can arise with a virtually-indexed physically-tagged cache if the set size is greater than the smallest page size provided by the MMU. With this we can get rid of r2 and drive the signals going to writeback from r1, since the load hit data is now available one cycle earlier. We need a multiplexer on the read address of the data cache RAM in order to handle the second doubleword of an unaligned access. One small complication is that we now need an extra cycle in the case of an unaligned load which misses in the data cache and which reads the 2nd-last and last doublewords of a cache line. This is the reason for the PRE_NEXT_DWORD state; if we just go straight to NEXT_DWORD then we end up having the write of the last doubleword of the cache line and the read of that same doubleword occurring in the same cycle, which means we read stale data rather than the just-fetched data. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
rd_addr <= std_ulogic_vector(to_unsigned(early_req_row, ROW_BITS));
cache_out(i) <= dout;
-- Write mux:
--
-- Defaults to wishbone read responses (cache refill),
--
-- For timing, the mux on wr_data/sel/addr is not dependent on anything
-- other than the current state. Only the do_write signal is.
--
if r1.state = IDLE or r1.state = NEXT_DWORD then
-- In these states, the only write path is the store-hit update case
wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
wr_data <= req_data;
wr_sel <= req_sel;
else
-- Otherwise, we might be doing a reload
wr_data <= wishbone_in.dat;
wr_sel <= (others => '1');
wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS));
end if;
-- The two actual write cases here
do_write <= '0';
reloading := r1.state = RELOAD_WAIT_ACK;
if reloading and wishbone_in.ack = '1' and r1.store_way = i then
do_write <= '1';
end if;
if req_op = OP_STORE_HIT and req_hit_way = i and cancel_store = '0' then
assert not reloading report "Store hit while in state:" &
state_t'image(r1.state)
severity FAILURE;
do_write <= '1';
end if;
end process;
end generate;
--
-- Cache hit synchronous machine for the easy case. This handles
dcache: Trim one cycle from the load hit path Currently we don't get the result from a load that hits in the dcache until the fourth cycle after the instruction was presented to loadstore1. This trims this back to 3 cycles by taking the low order bits of the address generated in loadstore1 into dcache directly (not via the output register of loadstore1) and using them to address the read port of the dcache data RAM. We use the lower 12 address bits here in the expectation that any reasonable data cache design will have a set size of 4kB or less in order to avoid the aliasing problems that can arise with a virtually-indexed physically-tagged cache if the set size is greater than the smallest page size provided by the MMU. With this we can get rid of r2 and drive the signals going to writeback from r1, since the load hit data is now available one cycle earlier. We need a multiplexer on the read address of the data cache RAM in order to handle the second doubleword of an unaligned access. One small complication is that we now need an extra cycle in the case of an unaligned load which misses in the data cache and which reads the 2nd-last and last doublewords of a cache line. This is the reason for the PRE_NEXT_DWORD state; if we just go straight to NEXT_DWORD then we end up having the write of the last doubleword of the cache line and the read of that same doubleword occurring in the same cycle, which means we read stale data rather than the just-fetched data. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
-- non-update form load hits
--
dcache_fast_hit : process(clk)
begin
if rising_edge(clk) then