@@ -1,12 +1,6 @@
 --
 -- Set associative dcache write-through
 --
--- TODO (in no specific order):
---
--- * See list in icache.vhdl
--- * Complete load misses on the cycle when WB data comes instead of
---   at the end of line (this requires dealing with requests coming in
---   while not idle...)
 --
 library ieee;
 use ieee.std_logic_1164.all;
@@ -57,7 +51,7 @@ end entity dcache;
 architecture rtl of dcache is
     -- BRAM organisation: We never access more than wishbone_data_bits at
     -- a time so to save resources we make the array only that wide, and
-    -- use consecutive indices for to make a cache "line"
+    -- use consecutive indices to make a cache "line"
     --
     -- ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
     constant ROW_SIZE : natural := wishbone_data_bits / 8;
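For readers new to this BRAM layout, a minimal self-contained sketch follows. It is not code from dcache.vhdl: WB_DATA_BITS, LINE_SIZE, ROW_PER_LINE and row_of_line are names invented here, and the 64-byte line size is an assumption, chosen only to show how consecutive row indices compose a cache "line".

-- Illustrative sketch only, not dcache.vhdl; constants and the
-- function name are hypothetical.
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

package row_sketch is
    constant WB_DATA_BITS : natural := 64;                    -- assumed wishbone width
    constant ROW_SIZE     : natural := WB_DATA_BITS / 8;      -- bytes per BRAM row (8)
    constant LINE_SIZE    : natural := 64;                    -- hypothetical line size in bytes
    constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;  -- consecutive rows forming a "line" (8)

    -- Which of the ROW_PER_LINE consecutive BRAM rows of its line
    -- does byte address ADDR fall in?
    function row_of_line(addr : unsigned(11 downto 0)) return natural;
end package row_sketch;

package body row_sketch is
    function row_of_line(addr : unsigned(11 downto 0)) return natural is
    begin
        -- skip the 3 byte-offset bits, take the next 3 bits
        return to_integer(addr(5 downto 3));
    end function row_of_line;
end package body row_sketch;

With these numbers, row_of_line just picks address bits 5..3: bits 2..0 select the byte within a row, and the bits above 5 select the line.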
@@ -206,20 +200,80 @@ architecture rtl of dcache is
     -- which means that the BRAM output is delayed by an extra cycle.
     --
     -- Thus, the dcache has a 2-stage internal pipeline for cache hits
-    -- with no stalls.
+    -- with no stalls. Stores also complete in 2 cycles in most
+    -- circumstances.
+    --
+    -- A request proceeds through the pipeline as follows.
+    --
+    -- Cycle 0: Request is received from loadstore or mmu if either
+    -- d_in.valid or m_in.valid is 1 (not both). In this cycle portions
+    -- of the address are presented to the TLB tag RAM and data RAM
+    -- and the cache tag RAM and data RAM.
+    --
+    -- Clock edge between cycle 0 and cycle 1:
+    -- Request is stored in r0 (assuming r0_full was 0). TLB tag and
+    -- data RAMs are read, and the cache tag RAM is read. (Cache data
+    -- comes out a cycle later due to its output register, giving the
+    -- whole of cycle 1 to read the cache data RAM.)
+    --
+    -- Cycle 1: TLB and cache tag matching is done, the real address
+    -- (RA) for the access is calculated, and the type of operation is
+    -- determined (the OP_* values above). This gives the TLB way for
+    -- a TLB hit, and the cache way for a hit or the way to replace
+    -- for a load miss.
+    --
+    -- Clock edge between cycle 1 and cycle 2:
+    -- Request is stored in r1 (assuming r1.full was 0)
+    -- The state machine transitions out of IDLE state for a load miss,
+    -- a store, a dcbz, or a non-cacheable load. r1.full is set to 1
+    -- for a load miss, dcbz or non-cacheable load but not a store.
+    --
+    -- Cycle 2: Completion signals are asserted for a load hit,
+    -- a store (excluding dcbz), a TLB operation, a conditional
+    -- store which failed due to no matching reservation, or an error
+    -- (cache hit on non-cacheable operation, TLB miss, or protection
+    -- fault).
+    --
+    -- For a load miss, store, or dcbz, the state machine initiates
+    -- a wishbone cycle, which takes at least 2 cycles. For a store,
+    -- if another store comes in with the same cache tag (therefore
+    -- in the same 4k page), it can be added on to the existing cycle,
+    -- subject to some constraints.
+    --
+    -- While r1.full = 1, no new requests can go from r0 to r1, but
+    -- requests can come in to r0 and be satisfied if they are
+    -- cacheable load hits or stores with the same cache tag.
     --
-    -- All other operations are handled via stalling in the first stage.
+    -- Writing to the cache data RAM is done at the clock edge
+    -- at the end of cycle 2 for a store hit (excluding dcbz).
+    -- Stores that miss are not written to the cache data RAM
+    -- but just stored through to memory.
+    --
+    -- Dcbz is done like a cache miss, but the wishbone cycle
+    -- is a write rather than a read, and zeroes are written to
+    -- the cache data RAM. Thus dcbz will allocate the line in
+    -- the cache as well as zeroing memory.
     --
-    -- The second stage can thus complete a hit at the same time as the
-    -- first stage emits a stall for a complex op.
+    -- Since stores are written to the cache data RAM at the end of
+    -- cycle 2, and loads can come in and hit on the data just stored,
+    -- there is a two-stage bypass from store data to load data to
+    -- make sure that loads always see previously-stored data even
+    -- if it has not yet made it to the cache data RAM.
     --
+    -- Load misses read the requested dword of the cache line first in
+    -- the memory read request and then cycle around through the other
+    -- dwords. The load is completed on the cycle after the requested
+    -- dword comes back from memory (using a forwarding path, rather
+    -- than going via the cache data RAM). We maintain an array of
+    -- valid bits per dword for the line being refilled so that
+    -- subsequent load requests to the same line can be completed as
+    -- soon as the necessary data comes in from memory, without
+    -- waiting for the whole line to be read.

     -- Stage 0 register, basically contains just the latched request
     type reg_stage_0_t is record
         req     : Loadstore1ToDcacheType;
-        tlbie   : std_ulogic;
-        doall   : std_ulogic;
-        tlbld   : std_ulogic;
+        tlbie   : std_ulogic;   -- indicates a tlbie request (from MMU)
+        doall   : std_ulogic;   -- with tlbie, indicates flush whole TLB
+        tlbld   : std_ulogic;   -- indicates a TLB load request (from MMU)
         mmu_req : std_ulogic;   -- indicates source of request
     end record;
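The cycle-by-cycle walkthrough added by this hunk boils down to a pair of full/empty flags. The sketch below is illustrative only: pipe2_sketch, req_valid, r1_done and stall_out are invented names, and it deliberately ignores the cases where r1.full stays 0 (e.g. plain stores); it is not the dcache's actual control logic.

-- Illustrative 2-stage pipeline latch, not dcache.vhdl: a request
-- advances r0 -> r1 at each clock edge unless the downstream stage
-- is still busy with a slow operation.
library ieee;
use ieee.std_logic_1164.all;

entity pipe2_sketch is
    port (
        clk       : in  std_ulogic;
        req_valid : in  std_ulogic;  -- a new request is presented in cycle 0
        r1_done   : in  std_ulogic;  -- the slow operation in r1 completes this cycle
        stall_out : out std_ulogic   -- ask the requester to hold its request
    );
end entity pipe2_sketch;

architecture sketch of pipe2_sketch is
    signal r0_full, r1_full : std_ulogic := '0';
begin
    -- Upstream must hold its request while both stages are occupied.
    stall_out <= r0_full and r1_full and not r1_done;

    process(clk)
    begin
        if rising_edge(clk) then
            if r1_full = '0' or r1_done = '1' then
                -- edge between cycles 1 and 2: r0 drains into r1,
                -- and r0 accepts whatever was presented in cycle 0
                r1_full <= r0_full;
                r0_full <= req_valid;
            elsif r0_full = '0' then
                -- r1 is still busy, but r0 is free to buffer a request
                r0_full <= req_valid;
            end if;
        end if;
    end process;
end architecture sketch;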
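For the two-stage store-to-load bypass described above, here is a simplified sketch assuming whole-row stores with no byte enables (fwd_sketch and its port names are invented): a load compares its row index against the last two stores and takes the newer data in preference to the BRAM output.

-- Illustrative store-to-load forwarding for one BRAM row, not
-- dcache.vhdl: the BRAM write lands at the clock edge, but its
-- registered read port will not reflect it for two cycles, so keep
-- the last two stores on the side and forward from them on a match.
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

entity fwd_sketch is
    port (
        clk        : in  std_ulogic;
        -- store side (cycle 2 of a store hit)
        st_valid   : in  std_ulogic;
        st_row     : in  unsigned(5 downto 0);
        st_data    : in  std_ulogic_vector(63 downto 0);
        -- load side
        ld_row     : in  unsigned(5 downto 0);
        bram_rdata : in  std_ulogic_vector(63 downto 0);  -- delayed BRAM output
        ld_data    : out std_ulogic_vector(63 downto 0)
    );
end entity fwd_sketch;

architecture sketch of fwd_sketch is
    type fwd_t is record
        valid : std_ulogic;
        row   : unsigned(5 downto 0);
        data  : std_ulogic_vector(63 downto 0);
    end record;
    signal fwd1, fwd2 : fwd_t := ('0', (others => '0'), (others => '0'));
begin
    process(clk)
    begin
        if rising_edge(clk) then
            -- remember the two most recent stores
            fwd2 <= fwd1;
            fwd1 <= (valid => st_valid, row => st_row, data => st_data);
        end if;
    end process;

    -- newest matching store wins; otherwise fall back to the BRAM
    ld_data <= fwd1.data when fwd1.valid = '1' and fwd1.row = ld_row else
               fwd2.data when fwd2.valid = '1' and fwd2.row = ld_row else
               bram_rdata;
end architecture sketch;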
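And for the per-dword valid bits kept during a line refill, a sketch under the same caveats (refill_sketch and its ports are hypothetical): each wishbone ack marks one row of the line valid, so a load whose dword has already arrived need not wait for the full line.

-- Illustrative refill tracking, not dcache.vhdl: one valid bit per
-- dword (BRAM row) of the line being refilled, set as each ack
-- delivers data from memory.
library ieee;
use ieee.std_logic_1164.all;

entity refill_sketch is
    generic (ROW_PER_LINE : natural := 8);
    port (
        clk       : in  std_ulogic;
        refill_go : in  std_ulogic;  -- starting a new line refill
        wb_ack    : in  std_ulogic;  -- one dword arrived from memory
        wb_row    : in  natural range 0 to ROW_PER_LINE - 1;
        ld_row    : in  natural range 0 to ROW_PER_LINE - 1;
        ld_can_go : out std_ulogic   -- requested dword is already valid
    );
end entity refill_sketch;

architecture sketch of refill_sketch is
    signal rows_valid : std_ulogic_vector(ROW_PER_LINE - 1 downto 0) := (others => '0');
begin
    process(clk)
    begin
        if rising_edge(clk) then
            if refill_go = '1' then
                rows_valid <= (others => '0');  -- new line: nothing valid yet
            elsif wb_ack = '1' then
                rows_valid(wb_row) <= '1';      -- this dword is now usable
            end if;
        end if;
    end process;

    ld_can_go <= rows_valid(ld_row);
end architecture sketch;

Because the miss fetches the requested dword first and wraps around the line, the common case is that ld_can_go asserts on the very first ack, matching the "complete on the cycle after the requested dword comes back" behaviour described in the comment block.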