From 83dea9479364674a8b289373d13d5a0c3233c4e4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 25 Sep 2021 13:26:09 +1000 Subject: [PATCH 1/3] decode1: Conditional trap instructions don't need to be single-issue They can generate interrupts, but that doesn't mean they have to single-issue. Signed-off-by: Paul Mackerras --- decode1.vhdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 0774250..baf4347 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -114,8 +114,8 @@ architecture behaviour of decode1 is 36 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stw 37 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stwu 8 => (ALU, NONE, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- subfic - 2 => (ALU, NONE, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- tdi - 3 => (ALU, NONE, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1', NONE), -- twi + 2 => (ALU, NONE, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- tdi + 3 => (ALU, NONE, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- twi 26 => (ALU, NONE, OP_XOR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- xori 27 => (ALU, NONE, OP_XOR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- xoris others => illegal_inst @@ -410,8 +410,8 @@ architecture behaviour of decode1 is 2#0011001000# => (ALU, NONE, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- subfze 2#1011001000# => (ALU, NONE, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- subfzeo 2#1001010110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- sync - 2#0001000100# => (ALU, NONE, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- td - 2#0000000100# => (ALU, NONE, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1', NONE), -- tw + 2#0001000100# => (ALU, NONE, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- td + 2#0000000100# => (ALU, NONE, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- tw 2#0100110010# => (LDST, NONE, OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- tlbie 2#0100010010# => (LDST, NONE, OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- tlbiel 2#1000110110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- tlbsync From 9b3b57710a0b0898f293dde630799bbeb0e6bda2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 25 Sep 2021 11:34:38 +1000 Subject: [PATCH 2/3] icache: Fix icache invalidation This fixes two bugs in the flash invalidation of the icache. The first is that an instruction could get executed twice. The i-cache RAM is 2 instructions (64 bits) wide, so one read can supply results for 2 cycles. The fetch1 stage tells icache when the address is equal to the address of the previous cycle plus 4, and in cases where that is true, bit 2 of the address is 1, and the previous cycle was a cache hit, we just use the second word of the doubleword read from the cache RAM. However, the cache hit/miss logic also continues to operate, so in the case where the first word hits but the second word misses (because of an icache invalidation or a snoop occurring in the first cycle), we supply the instruction from the data previously read from the icache RAM but also stall fetch1 and start a cache reload sequence, and subsequently supply the second instruction again. This fixes the issue by inhibiting req_is_miss and stall_out when use_previous is true. The second bug is that if an icache invalidation occurs while reloading a line, we continue to reload the line, and make it valid when the reload finishes, even though some of the data may have been read before the invalidation occurred. This adds a new state STOP_RELOAD which we go to if an invalidation happens while we are in CLR_TAG or WAIT_ACK state. In STOP_RELOAD state we don't request any more reads from memory and wait for the reads we have previously requested to be acked, and then go to IDLE state. Data returned is still written to the icache RAM, but that doesn't matter because the line is invalid and is never made valid. Note that we don't have to worry about invalidations due to snooped writes while reloading a line, because the wishbone arbiter won't switch to another master once it has started sending our reload requests to memory. Thus a store to memory will either happen before any of our reads have got to memory, or after we have finished the reload (in which case we will no longer be in WAIT_ACK state). Signed-off-by: Paul Mackerras --- icache.vhdl | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/icache.vhdl b/icache.vhdl index 7937ff6..298ee47 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -171,7 +171,7 @@ architecture rtl of icache is signal eaa_priv : std_ulogic; -- Cache reload state machine - type state_t is (IDLE, CLR_TAG, WAIT_ACK); + type state_t is (IDLE, STOP_RELOAD, CLR_TAG, WAIT_ACK); type reg_internal_t is record -- Cache hit state (Latches for 1 cycle BRAM access) @@ -546,7 +546,7 @@ begin end loop; -- Generate the "hit" and "miss" signals for the synchronous blocks - if i_in.req = '1' and access_ok = '1' and flush_in = '0' and rst = '0' then + if i_in.req = '1' and access_ok = '1' and flush_in = '0' and rst = '0' and use_previous = '0' then req_is_hit <= is_hit; req_is_miss <= not is_hit; else @@ -580,7 +580,7 @@ begin i_out.next_pred_ntaken <= i_in.pred_ntaken; -- Stall fetch1 if we have a miss on cache or TLB or a protection fault - stall_out <= not (is_hit and access_ok); + stall_out <= not (is_hit and access_ok) and not use_previous; -- Wishbone requests output (from the cache miss reload machine) wishbone_out <= r.wb; @@ -757,9 +757,15 @@ begin r.wb.adr <= next_row_addr(r.wb.adr); end if; + -- Abort reload if we get an invalidation + if inval_in = '1' then + r.wb.stb <= '0'; + r.state <= STOP_RELOAD; + end if; + -- Incoming acks processing if wishbone_in.ack = '1' then - r.rows_valid(r.store_row mod ROW_PER_LINE) <= '1'; + r.rows_valid(r.store_row mod ROW_PER_LINE) <= not inval_in; -- Check for completion if is_last_row(r.store_row, r.end_row_ix) then -- Complete wishbone cycle @@ -775,6 +781,18 @@ begin -- Increment store row counter r.store_row <= next_row(r.store_row); end if; + + when STOP_RELOAD => + -- Wait for all outstanding requests to be satisfied, then + -- go to IDLE state. + if get_row_of_line(r.store_row) = get_row_of_line(get_row(r.wb.adr)) then + r.wb.cyc <= '0'; + r.state <= IDLE; + end if; + if wishbone_in.ack = '1' then + -- Increment store row counter + r.store_row <= next_row(r.store_row); + end if; end case; end if; From 70270c066a0fbcce74ffe866199973fe937d7aea Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 25 Sep 2021 13:18:59 +1000 Subject: [PATCH 3/3] dcache: Fix bug with dcbz closely following stores with the same tag This fixes a bug where a dcbz can get incorrectly handled as an ordinary 8-byte store if it arrives while the dcache state machine is handling other stores with the same tag value (i.e. within the same set-sized area of memory). The logic that says whether to include a new store in the current wishbone cycle didn't take into account whether the new store was a dcbz. This adds a "req.dcbz = '0'" factor so that it does. This is necessary because dcbz is handled more like a cache line refill (but writing to memory rather than reading) than an ordinary store. Signed-off-by: Paul Mackerras --- dcache.vhdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dcache.vhdl b/dcache.vhdl index 705393a..34dbda2 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -1537,7 +1537,7 @@ begin r1.wb.dat <= req.data; r1.wb.sel <= req.byte_sel; end if; - if acks < 7 and req.same_tag = '1' and + if acks < 7 and req.same_tag = '1' and req.dcbz = '0' and (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then r1.wb.stb <= '1'; stbs_done := false;