From 441160d8655762d115b2c6eab2c58bb146b96860 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 16 Jan 2020 13:18:56 +1100 Subject: [PATCH 01/10] execute1: Use truth table embedded in instruction for CR logical ops It turns out that CR logical instructions have the truth table of the operation embedded in the instruction word. This means that we can collect the two input operand bits into a 2-bit value and use that as the index to select the appropriate bit from the instruction word. Signed-off-by: Paul Mackerras --- execute1.vhdl | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/execute1.vhdl b/execute1.vhdl index ae13c72..e41a743 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -200,6 +200,7 @@ begin variable bo, bi : std_ulogic_vector(4 downto 0); variable bf, bfa : std_ulogic_vector(2 downto 0); variable cr_op : std_ulogic_vector(9 downto 0); + variable cr_operands : std_ulogic_vector(1 downto 0); variable bt, ba, bb : std_ulogic_vector(4 downto 0); variable btnum, banum, bbnum : integer range 0 to 31; variable crresult : std_ulogic; @@ -532,27 +533,10 @@ begin btnum := 31 - to_integer(unsigned(bt)); banum := 31 - to_integer(unsigned(ba)); bbnum := 31 - to_integer(unsigned(bb)); - case cr_op(8 downto 5) is - when "1001" => -- CREQV - crresult := not(e_in.cr(banum) xor e_in.cr(bbnum)); - when "0111" => -- CRNAND - crresult := not(e_in.cr(banum) and e_in.cr(bbnum)); - when "0100" => -- CRANDC - crresult := (e_in.cr(banum) and not e_in.cr(bbnum)); - when "1000" => -- CRAND - crresult := (e_in.cr(banum) and e_in.cr(bbnum)); - when "0001" => -- CRNOR - crresult := not(e_in.cr(banum) or e_in.cr(bbnum)); - when "1101" => -- CRORC - crresult := (e_in.cr(banum) or not e_in.cr(bbnum)); - when "0110" => -- CRXOR - crresult := (e_in.cr(banum) xor e_in.cr(bbnum)); - when "1110" => -- CROR - crresult := (e_in.cr(banum) or e_in.cr(bbnum)); - when others => - crresult := '0'; - report "BAD CR?"; - end case; + -- Bits 5-8 of cr_op give the truth table of the requested + -- logical operation + cr_operands := e_in.cr(banum) & e_in.cr(bbnum); + crresult := cr_op(5 + to_integer(unsigned(cr_operands))); v.e.write_cr_mask := num_to_fxm((31-btnum) / 4); for i in 0 to 31 loop if i = btnum then From 1a244d34707af829e5967e24c70d04099ece41f7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 19 Feb 2020 15:39:46 +1100 Subject: [PATCH 02/10] Remove single-issue constraint for most loads and stores This removes the constraint that loads and stores are single-issue, at the expense of a stall of at least 2 cycles for every load and store. To do this, we plumb the existing stall signal that was generated in dcache to core, where it gets ORed with the stall signal from execute1. Execute1 generates a stall signal for the first two cycles of each load and store, and dcache generates the stall signal in the 3rd and subsequent cycles if it needs to. Signed-off-by: Paul Mackerras --- core.vhdl | 4 ++- decode1.vhdl | 96 +++++++++++++++++++++++++-------------------------- execute1.vhdl | 8 +++++ 3 files changed, 59 insertions(+), 49 deletions(-) diff --git a/core.vhdl b/core.vhdl index bc0b16f..87e73a4 100644 --- a/core.vhdl +++ b/core.vhdl @@ -73,6 +73,7 @@ architecture behave of core is signal decode2_stall_out : std_ulogic; signal ex1_icache_inval: std_ulogic; signal ex1_stall_out: std_ulogic; + signal dcache_stall_out: std_ulogic; signal flush: std_ulogic; @@ -195,7 +196,7 @@ begin c_in => cr_file_to_decode2, c_out => decode2_to_cr_file ); - decode2_stall_in <= ex1_stall_out; + decode2_stall_in <= ex1_stall_out or dcache_stall_out; register_file_0: entity work.register_file generic map ( @@ -257,6 +258,7 @@ begin rst => core_rst, d_in => loadstore1_to_dcache, d_out => dcache_to_writeback, + stall_out => dcache_stall_out, wishbone_in => wishbone_data_in, wishbone_out => wishbone_data_out ); diff --git a/decode1.vhdl b/decode1.vhdl index f1b5ad4..bca7c2a 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -46,26 +46,26 @@ architecture behaviour of decode1 is 16 => (ALU, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc 11 => (ALU, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmpi 10 => (ALU, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli - 34 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lbz - 35 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lbzu - 42 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lha - 43 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '1'), -- lhau - 40 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lhz - 41 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lhzu - 32 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lwz - 33 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lwzu + 34 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbz + 35 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzu + 42 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lha + 43 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhau + 40 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhz + 41 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lhzu + 32 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwz + 33 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lwzu 7 => (ALU, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- mulli 24 => (ALU, OP_OR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ori 25 => (ALU, OP_OR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- oris 20 => (ALU, OP_RLC, RA, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwimi 21 => (ALU, OP_RLC, NONE, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwinm 23 => (ALU, OP_RLC, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwnm - 38 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- stb - 39 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', RC, '0', '1'), -- stbu - 44 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sth - 45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- sthu - 36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stw - 37 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stwu + 38 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- stb + 39 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', RC, '0', '0'), -- stbu + 44 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sth + 45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu + 36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw + 37 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stwu 8 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- subfic 2 => (ALU, OP_TDI, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tdi --PPC_TWI 3 @@ -215,25 +215,25 @@ architecture behaviour of decode1 is 2#1110101111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel 2#1111001111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel 2#1111101111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel - 2#0000110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- lbarx - 2#0001110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lbzux - 2#0001010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lbzx - 2#0001010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- ldarx - 2#1000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- ldbrx - 2#0000110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- ldux - 2#0000010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- ldx - 2#0001110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- lharx - 2#0101110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '1'), -- lhaux - 2#0101010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lhax - 2#1100010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lhbrx - 2#0100110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lhzux - 2#0100010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lhzx - 2#0000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- lwarx - 2#0101110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '1'), -- lwaux - 2#0101010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lwax - 2#1000010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lwbrx - 2#0000110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lwzux - 2#0000010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lwzx + 2#0000110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lbarx + 2#0001110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzux + 2#0001010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbzx + 2#0001010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- ldarx + 2#1000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldbrx + 2#0000110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- ldux + 2#0000010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldx + 2#0001110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lharx + 2#0101110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhaux + 2#0101010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lhax + 2#1100010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhbrx + 2#0100110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lhzux + 2#0100010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhzx + 2#0000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lwarx + 2#0101110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lwaux + 2#0101010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lwax + 2#1000010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwbrx + 2#0000110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lwzux + 2#0000010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwzx -- 2#1000000000# mcrxr -- 2#1001000000# mcrxrx 2#0000010011# => (ALU, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf @@ -278,21 +278,21 @@ architecture behaviour of decode1 is 2#1100111000# => (ALU, OP_SHR, NONE, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- srawi 2#1000011011# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- srd 2#1000011000# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- srw - 2#1010110110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', RC, '0', '1'), -- stbcx - 2#0011110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', RC, '0', '1'), -- stbux - 2#0011010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- stbx - 2#1010010100# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stdbrx - 2#0011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- stdcx - 2#0010110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stdux - 2#0010010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stdx - 2#1110010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sthbrx - 2#1011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- sthcx - 2#0110110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- sthux - 2#0110010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sthx - 2#1010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stwbrx - 2#0010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- stwcx - 2#0010110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stwux - 2#0010010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stwx + 2#1010110110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', RC, '0', '0'), -- stbcx + 2#0011110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', RC, '0', '0'), -- stbux + 2#0011010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- stbx + 2#1010010100# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdbrx + 2#0011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- stdcx + 2#0010110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stdux + 2#0010010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdx + 2#1110010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx + 2#1011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- sthcx + 2#0110110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthux + 2#0110010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthx + 2#1010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwbrx + 2#0010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- stwcx + 2#0010110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stwux + 2#0010010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwx 2#0000101000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subf 2#1000101000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfo 2#0000001000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfc diff --git a/execute1.vhdl b/execute1.vhdl index e41a743..c536a27 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -42,6 +42,7 @@ architecture behaviour of execute1 is next_lr : std_ulogic_vector(63 downto 0); mul_in_progress : std_ulogic; div_in_progress : std_ulogic; + ldst_in_progress : std_ulogic; cntz_in_progress : std_ulogic; slow_op_dest : gpr_index_t; slow_op_rc : std_ulogic; @@ -263,6 +264,7 @@ begin v.mul_in_progress := '0'; v.div_in_progress := '0'; v.cntz_in_progress := '0'; + v.ldst_in_progress := '0'; -- signals to multiply unit x_to_multiply <= Execute1ToMultiplyInit; @@ -660,6 +662,8 @@ begin when OP_LOAD | OP_STORE => -- loadstore/dcache has its own port to writeback v.e.valid := '0'; + stall_out <= '1'; + v.ldst_in_progress := '1'; when others => terminate_out <= '1'; @@ -699,6 +703,10 @@ begin v.e.rc := v.slow_op_rc; v.e.xerc := v.slow_op_xerc; v.e.valid := '1'; + elsif r.ldst_in_progress = '1' then + -- assert stall for 2 cycles on load/store, then + -- the stall output from dcache takes over + stall_out <= '1'; elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or (r.div_in_progress = '1' and divider_to_x.valid = '1') then From 1587d9e6eb284a4fa8ead86abef3be719ba8cf6f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 21 Feb 2020 12:34:23 +1100 Subject: [PATCH 03/10] dcache: Fix obscure bug and minor cleanups The obscure bug is that a non-cacheable load with update would never do the update and would never complete the instruction. This is fixed by making state NC_LOAD_WAIT_ACK go to LOAD_UPDATE2 if r1.req.update is set. The slow load forms with update can go to LOAD_UPDATE2 at the end rather than LOAD_UPDATE, thus saving a cycle. Loads with a cache hit need the LOAD_UPDATE state in the third cycle since they are not writing back until the 4th cycle, when the state is LOAD_UPDATE2. Slow loads (cacheable loads that miss and non-cacheable loads) currently go to LOAD_UPDATE in the cycle after they see r1.wb.ack = 1 for the last time, but that cycle is the cycle where they write back, and the following cycle does nothing. Going to LOAD_UPDATE2 in those cases saves a cycle and makes them consistent with the load hit case. The logic in the RELOAD_WAIT_ACK case doesn't need to check r1.req.load = '1' since we only ever use RELOAD_WAIT_ACK for loads. There are also some whitespace fixes and a typo fix. Signed-off-by: Paul Mackerras --- dcache.vhdl | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index df54c95..ddc4769 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -500,7 +500,7 @@ begin -- If it's not a load with update, complete it now if r2.load_is_update = '0' then d_out.valid <= '1'; - end if; + end if; end if; -- Slow ops (load miss, NC, stores) @@ -508,7 +508,7 @@ begin -- If it's a load, enable register writeback and switch -- mux accordingly -- - if r1.req.load then + if r1.req.load then d_out.write_reg <= r1.req.write_reg; d_out.write_enable <= '1'; @@ -679,7 +679,7 @@ begin end process; -- - -- Every other case is handled by this stage machine: + -- Every other case is handled by this state machine: -- -- * Cache load miss/reload (in conjunction with "rams") -- * Load hits for update forms @@ -835,8 +835,8 @@ begin -- we also need to do the deferred update cycle. -- r1.slow_valid <= '1'; - if r1.req.load = '1' and r1.req.update = '1' then - r1.state <= LOAD_UPDATE; + if r1.req.update = '1' then + r1.state <= LOAD_UPDATE2; report "completing miss with load-update !"; else r1.state <= IDLE; @@ -864,13 +864,16 @@ begin -- Got ack ? complete. if wishbone_in.ack = '1' then + r1.state <= IDLE; if r1.state = NC_LOAD_WAIT_ACK then r1.slow_data <= wishbone_in.dat; + if r1.req.update = '1' then + r1.state <= LOAD_UPDATE2; + end if; end if; r1.slow_valid <= '1'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; - r1.state <= IDLE; end if; end case; end if; From 94dd8bc48066e8c4505843a11250209f3bf29226 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 26 Feb 2020 11:55:36 +1100 Subject: [PATCH 04/10] dcache: Add support for unaligned loads and stores For an unaligned load or store, we do the first doubleword (dword) of the transfer as normal, but then go to a new NEXT_DWORD state of the state machine to do the cache tag lookup for the second dword of the transfer. From the NEXT_DWORD state we have much the same transitions to other states as from the IDLE state (the transitions for OP_LOAD_HIT are a bit different but almost identical for the other op values). We now do the preparation of the data to be written in loadstore1, that is, byte reversal if necessary and rotation by a number of bytes based on the low 3 bits of the address. We do rotation not shifting so we have the bytes that need to go into the second doubleword in the right place in the low bytes of the data sent to dcache. The rotation and byte reversal are done in a single step with one multiplexer per byte by setting the select inputs for each byte appropriately. This also fixes writeback to not write the register value until it has received both pieces of an unaligned load value. Signed-off-by: Paul Mackerras --- dcache.vhdl | 181 +++++++++++++++++++++++++++++++----------------- loadstore1.vhdl | 20 ++++-- writeback.vhdl | 2 +- 3 files changed, 136 insertions(+), 67 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index ddc4769..5bf477b 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -124,6 +124,7 @@ architecture rtl of dcache is -- Cache state machine type state_t is (IDLE, -- Normal load hit processing + NEXT_DWORD, -- Starting the 2nd xfer of misaligned LOAD_UPDATE, -- Load with update extra cycle LOAD_UPDATE2, -- Load with update extra cycle RELOAD_WAIT_ACK, -- Cache reload wait ack @@ -157,6 +158,12 @@ architecture rtl of dcache is hit_way : way_t; hit_load_valid : std_ulogic; + -- Info for doing the second transfer of a misaligned load/store + two_dwords : std_ulogic; + second_dword : std_ulogic; + next_addr : std_ulogic_vector(63 downto 0); + next_sel : std_ulogic_vector(7 downto 0); + -- Register update (load/store with update) update_valid : std_ulogic; @@ -186,6 +193,8 @@ architecture rtl of dcache is sign_extend : std_ulogic; byte_reverse : std_ulogic; xerc : xer_common_t; + last_dword : std_ulogic; + second_dword : std_ulogic; end record; signal r2 : reg_stage_2_t; @@ -196,7 +205,10 @@ architecture rtl of dcache is signal req_hit_way : way_t; signal req_tag : cache_tag_t; signal req_op : op_t; + signal req_data : std_ulogic_vector(63 downto 0); + signal req_addr : std_ulogic_vector(63 downto 0); signal req_laddr : std_ulogic_vector(63 downto 0); + signal req_sel : std_ulogic_vector(7 downto 0); -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; @@ -208,8 +220,9 @@ architecture rtl of dcache is signal replace_way : way_t; -- Wishbone read/write/cache write formatting signals - signal bus_sel : wishbone_sel_type; - signal store_data : wishbone_data_type; + signal bus_sel : std_ulogic_vector(15 downto 0); + + signal two_dwords : std_ulogic; -- -- Helper functions to decode incoming requests @@ -307,17 +320,17 @@ architecture rtl of dcache is end case; end function length_to_sel; - -- Calculate shift and byte enables for wishbone - function wishbone_data_shift(address : in std_ulogic_vector(63 downto 0)) return natural is - begin - return to_integer(unsigned(address(2 downto 0))) * 8; - end function wishbone_data_shift; - + -- Calculate byte enables for wishbone + -- This returns 16 bits, giving the select signals for two transfers, + -- to account for unaligned loads or stores function wishbone_data_sel(size : in std_logic_vector(3 downto 0); address : in std_logic_vector(63 downto 0)) return std_ulogic_vector is + variable longsel : std_ulogic_vector(15 downto 0); begin - return std_ulogic_vector(shift_left(unsigned(length_to_sel(size)), + longsel := (others => '0'); + longsel(7 downto 0) := length_to_sel(size); + return std_ulogic_vector(shift_left(unsigned(longsel), to_integer(unsigned(address(2 downto 0))))); end function wishbone_data_sel; @@ -383,23 +396,43 @@ begin variable tmp : std_ulogic_vector(63 downto 0); variable data : std_ulogic_vector(63 downto 0); variable opsel : std_ulogic_vector(3 downto 0); + variable go : std_ulogic; + variable is_load : std_ulogic; + variable is_nc : std_ulogic; begin -- Extract line, row and tag from request - req_index <= get_index(d_in.addr); - req_row <= get_row(d_in.addr); - req_tag <= get_tag(d_in.addr); - - -- Calculate address of beginning of cache line, will be - -- used for cache miss processing if needed - -- - req_laddr <= d_in.addr(63 downto LINE_OFF_BITS) & - (LINE_OFF_BITS-1 downto 0 => '0'); + if r1.state /= NEXT_DWORD then + req_addr <= d_in.addr; + req_data <= d_in.data; + req_sel <= bus_sel(7 downto 0); + go := d_in.valid; + is_load := d_in.load; + is_nc := d_in.nc; + + else + req_addr <= r1.next_addr; + req_data <= r1.req.data; + req_sel <= r1.next_sel; + go := '1'; + is_load := r1.req.load; + is_nc := r1.req.nc; + end if; + + req_index <= get_index(req_addr); + req_row <= get_row(req_addr); + req_tag <= get_tag(req_addr); + + -- Calculate address of beginning of cache line, will be + -- used for cache miss processing if needed + -- + req_laddr <= req_addr(63 downto LINE_OFF_BITS) & + (LINE_OFF_BITS-1 downto 0 => '0'); -- Test if pending request is a hit on any way hit_way := 0; is_hit := '0'; for i in way_t loop - if d_in.valid = '1' and cache_valids(req_index)(i) = '1' then + if go = '1' and cache_valids(req_index)(i) = '1' then if read_tag(i, cache_tags(req_index)) = req_tag then hit_way := i; is_hit := '1'; @@ -416,7 +449,7 @@ begin -- Combine the request and cache his status to decide what -- operation needs to be done -- - opsel := d_in.valid & d_in.load & d_in.nc & is_hit; + opsel := go & is_load & is_nc & is_hit; case opsel is when "1101" => op := OP_LOAD_HIT; when "1100" => op := OP_LOAD_MISS; @@ -433,22 +466,15 @@ begin end process; - -- - -- Misc signal assignments - -- - -- Wire up wishbone request latch out of stage 1 wishbone_out <= r1.wb; - -- Wishbone & BRAM write data formatting for stores (most of it already - -- happens in loadstore1, this is the remaining data shifting) - -- - store_data <= std_logic_vector(shift_left(unsigned(d_in.data), - wishbone_data_shift(d_in.addr))); - -- Wishbone read and write and BRAM write sel bits generation bus_sel <= wishbone_data_sel(d_in.length, d_in.addr); + -- See if the operation crosses two doublewords + two_dwords <= or (bus_sel(15 downto 8)); + -- TODO: Generate errors -- err_nc_collision <= '1' when req_op = OP_BAD else '0'; @@ -469,7 +495,7 @@ begin d_out.write_shift <= r2.data_shift; d_out.sign_extend <= r2.sign_extend; d_out.byte_reverse <= r2.byte_reverse; - d_out.second_word <= '0'; + d_out.second_word <= r2.second_dword; d_out.xerc <= r2.xerc; -- We have a valid load or store hit or we just completed a slow @@ -497,8 +523,10 @@ begin if r2.hit_load_valid = '1' then d_out.write_enable <= '1'; - -- If it's not a load with update, complete it now - if r2.load_is_update = '0' then + -- If there isn't another dword to go and + -- it's not a load with update, complete it now + if r2.last_dword = '1' and r2.load_is_update = '0' then + report "completing load hit"; d_out.valid <= '1'; end if; end if; @@ -521,10 +549,14 @@ begin d_out.byte_reverse <= r1.req.byte_reverse; d_out.write_len <= r1.req.length; d_out.xerc <= r1.req.xerc; + d_out.second_word <= r1.second_dword; end if; -- If it's a store or a non-update load form, complete now - if r1.req.load = '0' or r1.req.update = '0' then + -- unless we need to do another dword transfer + if (r1.req.load = '0' or r1.req.update = '0') and + (r1.two_dwords = '0' or r1.second_dword = '1') then + report "completing store or load miss"; d_out.valid <= '1'; end if; end if; @@ -543,11 +575,13 @@ begin d_out.sign_extend <= '0'; d_out.byte_reverse <= '0'; d_out.xerc <= r1.req.xerc; + d_out.second_word <= '0'; -- If it was a load, this completes the operation (load with -- update case). -- if r1.req.load = '1' then + report "completing after load update"; d_out.valid <= '1'; end if; end if; @@ -605,11 +639,11 @@ begin -- For timing, the mux on wr_data/sel/addr is not dependent on anything -- other than the current state. Only the do_write signal is. -- - if r1.state = IDLE then - -- When IDLE, the only write path is the store-hit update case + if r1.state = IDLE or r1.state = NEXT_DWORD then + -- In these states, the only write path is the store-hit update case wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); - wr_data <= store_data; - wr_sel <= bus_sel; + wr_data <= req_data; + wr_sel <= req_sel; else -- Otherwise, we might be doing a reload wr_data <= wishbone_in.dat; @@ -648,6 +682,8 @@ begin r2.length <= r1.req.length; r2.sign_extend <= r1.req.sign_extend; r2.byte_reverse <= r1.req.byte_reverse; + r2.second_dword <= r1.second_dword; + r2.last_dword <= r1.second_dword or not r1.two_dwords; -- If we have a request incoming, we have to latch it as d_in.valid -- is only set for a single cycle. It's up to the control logic to @@ -655,8 +691,12 @@ begin -- single issue on load/stores so we are fine, later, we can generate -- a stall output if necessary). - if req_op /= OP_NONE then + if req_op /= OP_NONE and d_in.valid = '1' then r1.req <= d_in; + r1.second_dword <= '0'; + r1.two_dwords <= two_dwords; + r1.next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000"; + r1.next_sel <= bus_sel(15 downto 8); report "op:" & op_t'image(req_op) & " addr:" & to_hstring(d_in.addr) & @@ -666,6 +706,8 @@ begin " idx:" & integer'image(req_index) & " tag:" & to_hstring(req_tag) & " way: " & integer'image(req_hit_way); + elsif r1.state = NEXT_DWORD then + r1.second_dword <= '1'; end if; -- Fast path for load/store hits. Set signals for the writeback controls. @@ -713,24 +755,36 @@ begin r1.update_valid <= '0'; -- We cannot currently process a new request when not idle - assert req_op = OP_NONE or r1.state = IDLE report "request " & + assert d_in.valid = '0' or r1.state = IDLE report "request " & op_t'image(req_op) & " while in state " & state_t'image(r1.state) severity FAILURE; -- Main state machine case r1.state is - when IDLE => + when IDLE | NEXT_DWORD => case req_op is - when OP_LOAD_HIT => - -- We have a load with update hit, we need the delayed update cycle - if d_in.update = '1' then - r1.state <= LOAD_UPDATE; - end if; + when OP_LOAD_HIT => + if r1.state = IDLE then + -- If the load is misaligned then we will need to start + -- the state machine + if two_dwords = '1' then + r1.state <= NEXT_DWORD; + elsif d_in.update = '1' then + -- We have a load with update hit, we need the delayed update cycle + r1.state <= LOAD_UPDATE; + end if; + else + if r1.req.update = '1' then + r1.state <= LOAD_UPDATE; + else + r1.state <= IDLE; + end if; + end if; when OP_LOAD_MISS => -- Normal load cache miss, start the reload machine -- - report "cache miss addr:" & to_hstring(d_in.addr) & + report "cache miss addr:" & to_hstring(req_addr) & " idx:" & integer'image(req_index) & " way:" & integer'image(replace_way) & " tag:" & to_hstring(req_tag); @@ -765,8 +819,8 @@ begin r1.state <= RELOAD_WAIT_ACK; when OP_LOAD_NC => - r1.wb.sel <= bus_sel; - r1.wb.adr <= d_in.addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.sel <= req_sel; + r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000"; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; @@ -774,12 +828,10 @@ begin when OP_STORE_HIT | OP_STORE_MISS => -- For store-with-update do the register update - if d_in.update = '1' then - r1.update_valid <= '1'; - end if; - r1.wb.sel <= bus_sel; - r1.wb.adr <= d_in.addr(r1.wb.adr'left downto 3) & "000"; - r1.wb.dat <= store_data; + r1.update_valid <= d_in.valid and d_in.update; + r1.wb.sel <= req_sel; + r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.dat <= req_data; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '1'; @@ -831,11 +883,13 @@ begin -- Cache line is now valid cache_valids(r1.store_index)(r1.store_way) <= '1'; - -- Complete the load that missed. For load with update + -- Write back the load data that we got, and start + -- the second dword if necessary. Otherwise, see if -- we also need to do the deferred update cycle. - -- r1.slow_valid <= '1'; - if r1.req.update = '1' then + if r1.two_dwords and not r1.second_dword then + r1.state <= NEXT_DWORD; + elsif r1.req.update = '1' then r1.state <= LOAD_UPDATE2; report "completing miss with load-update !"; else @@ -864,12 +918,15 @@ begin -- Got ack ? complete. if wishbone_in.ack = '1' then - r1.state <= IDLE; + if r1.two_dwords and not r1.second_dword then + r1.state <= NEXT_DWORD; + elsif r1.state = NC_LOAD_WAIT_ACK and r1.req.update = '1' then + r1.state <= LOAD_UPDATE2; + else + r1.state <= IDLE; + end if; if r1.state = NC_LOAD_WAIT_ACK then r1.slow_data <= wishbone_in.dat; - if r1.req.update = '1' then - r1.state <= LOAD_UPDATE2; - end if; end if; r1.slow_valid <= '1'; r1.wb.cyc <= '0'; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 5b61d4c..9e038e1 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -35,12 +35,15 @@ begin loadstore1_1: process(all) variable v : Loadstore1ToDcacheType; + variable brev_lenm1 : unsigned(2 downto 0); + variable byte_offset : unsigned(2 downto 0); + variable j : integer; + variable k : unsigned(2 downto 0); begin v := r; v.valid := l_in.valid; v.load := l_in.load; - v.data := l_in.data; v.write_reg := l_in.write_reg; v.length := l_in.length; v.byte_reverse := l_in.byte_reverse; @@ -63,9 +66,18 @@ begin -- XXX Do length_to_sel here ? - -- byte reverse stores in the first cycle - if v.load = '0' and l_in.byte_reverse = '1' then - v.data := byte_reverse(l_in.data, to_integer(unsigned(l_in.length))); + -- Do byte reversing and rotating for stores in the first cycle + if v.load = '0' then + byte_offset := unsigned(lsu_sum(2 downto 0)); + brev_lenm1 := "000"; + if l_in.byte_reverse = '1' then + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + end if; + for i in 0 to 7 loop + k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; + j := to_integer(k) * 8; + v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); + end loop; end if; v.addr := lsu_sum; diff --git a/writeback.vhdl b/writeback.vhdl index a730266..b924ee0 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -116,12 +116,12 @@ begin if l_in.byte_reverse = '1' then brev_lenm1 <= unsigned(l_in.write_len(2 downto 0)) - 1; end if; - w_out.write_enable <= '1'; second_word <= l_in.second_word; if l_in.valid = '0' and (data_len + byte_offset > 8) then partial_write <= '1'; end if; xe := l_in.xerc; + w_out.write_enable <= not partial_write or second_word; end if; -- shift and byte-reverse data bytes From 5d85ede97dfe13b6762c47a894edb49ceeb7f26c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 28 Feb 2020 08:09:08 +1100 Subject: [PATCH 05/10] dcache: Implement load-reserve and store-conditional instructions This involves plumbing the (existing) 'reserve' and 'rc' bits in the decode tables down to dcache, and 'rc' and 'store_done' bits from dcache to writeback. It turns out that we had 'RC' set in the 'rc' column for several ordinary stores and for the attn instruction. This corrects them to 'NONE', and sets the 'rc' column to 'ONE' for the conditional stores. In writeback we now have logic to set CR0 when the input from dcache has rc = 1. In dcache we have the reservation itself, which has a valid bit and the address down to cache line granularity. We don't currently store the reservation length. For a store conditional which fails, we set a 'cancel_store' signal which inhibits the write to the cache and prevents the state machine from starting a bus cycle or going to the STORE_WAIT_ACK state. Instead we set r1.stcx_fail which causes the instruction to complete in the next cycle with rc=1 and store_done=0. Signed-off-by: Paul Mackerras --- common.vhdl | 13 ++++++-- dcache.vhdl | 88 ++++++++++++++++++++++++++++++++++++++++++++----- decode1.vhdl | 18 +++++----- decode2.vhdl | 1 + execute1.vhdl | 2 ++ loadstore1.vhdl | 2 ++ writeback.vhdl | 12 +++++++ 7 files changed, 115 insertions(+), 21 deletions(-) diff --git a/common.vhdl b/common.vhdl index ffddb0b..84bbc47 100644 --- a/common.vhdl +++ b/common.vhdl @@ -130,12 +130,13 @@ package common is byte_reverse : std_ulogic; sign_extend : std_ulogic; -- do we need to sign extend? update : std_ulogic; -- is this an update instruction? + reserve : std_ulogic; -- set for larx/stcx end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', - is_32bit => '0', is_signed => '0', xerc => xerc_init, + is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', byte_reverse => '0', sign_extend => '0', update => '0', others => (others => '0')); type Execute1ToMultiplyType is record @@ -206,10 +207,12 @@ package common is update : std_ulogic; -- is this an update instruction? update_reg : gpr_index_t; -- if so, the register to update xerc : xer_common_t; + reserve : std_ulogic; -- set for larx/stcx. + rc : std_ulogic; -- set for stcx. end record; constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0', sign_extend => '0', update => '0', xerc => xerc_init, - others => (others => '0')); + reserve => '0', rc => '0', others => (others => '0')); type Loadstore1ToDcacheType is record valid : std_ulogic; @@ -224,6 +227,8 @@ package common is update : std_ulogic; update_reg : gpr_index_t; xerc : xer_common_t; + reserve : std_ulogic; + rc : std_ulogic; end record; type DcacheToWritebackType is record @@ -237,10 +242,12 @@ package common is byte_reverse : std_ulogic; second_word : std_ulogic; xerc : xer_common_t; + rc : std_ulogic; + store_done : std_ulogic; end record; constant DcacheToWritebackInit : DcacheToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', byte_reverse => '0', second_word => '0', xerc => xerc_init, - others => (others => '0')); + rc => '0', store_done => '0', others => (others => '0')); type Execute1ToWritebackType is record valid: std_ulogic; diff --git a/dcache.vhdl b/dcache.vhdl index 5bf477b..75b10c7 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -171,6 +171,9 @@ architecture rtl of dcache is slow_data : std_ulogic_vector(63 downto 0); slow_valid : std_ulogic; + -- Signal to complete a failed stcx. + stcx_fail : std_ulogic; + -- Cache miss state (reload state machine) state : state_t; wb : wishbone_master_out; @@ -199,6 +202,15 @@ architecture rtl of dcache is signal r2 : reg_stage_2_t; + -- Reservation information + -- + type reservation_t is record + valid : std_ulogic; + addr : std_ulogic_vector(63 downto LINE_OFF_BITS); + end record; + + signal reservation : reservation_t; + -- Async signals on incoming request signal req_index : index_t; signal req_row : row_t; @@ -210,6 +222,10 @@ architecture rtl of dcache is signal req_laddr : std_ulogic_vector(63 downto 0); signal req_sel : std_ulogic_vector(7 downto 0); + signal cancel_store : std_ulogic; + signal set_rsrv : std_ulogic; + signal clear_rsrv : std_ulogic; + -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; signal cache_out : cache_ram_out_t; @@ -481,6 +497,41 @@ begin -- Generate stalls from stage 1 state machine stall_out <= '1' when r1.state /= IDLE else '0'; + -- Handle load-with-reservation and store-conditional instructions + reservation_comb: process(all) + begin + cancel_store <= '0'; + set_rsrv <= '0'; + clear_rsrv <= '0'; + if d_in.valid = '1' and d_in.reserve = '1' then + -- XXX generate alignment interrupt if address is not aligned + -- XXX or if d_in.nc = '1' + if d_in.load = '1' then + -- load with reservation + set_rsrv <= '1'; + else + -- store conditional + clear_rsrv <= '1'; + if reservation.valid = '0' or + d_in.addr(63 downto LINE_OFF_BITS) /= reservation.addr then + cancel_store <= '1'; + end if; + end if; + end if; + end process; + + reservation_reg: process(clk) + begin + if rising_edge(clk) then + if rst = '1' or clear_rsrv = '1' then + reservation.valid <= '0'; + elsif set_rsrv = '1' then + reservation.valid <= '1'; + reservation.addr <= d_in.addr(63 downto LINE_OFF_BITS); + end if; + end if; + end process; + -- Writeback (loads and reg updates) & completion control logic -- writeback_control: process(all) @@ -497,6 +548,8 @@ begin d_out.byte_reverse <= r2.byte_reverse; d_out.second_word <= r2.second_dword; d_out.xerc <= r2.xerc; + d_out.rc <= '0'; -- loads never have rc=1 + d_out.store_done <= '0'; -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store @@ -512,11 +565,14 @@ begin assert (r1.update_valid and r2.hit_load_valid) /= '1' report "unexpected hit_load_delayed collision with update_valid" severity FAILURE; - assert (r1.slow_valid and r2.hit_load_valid) /= '1' report + assert (r1.slow_valid and r1.stcx_fail) /= '1' report + "unexpected slow_valid collision with stcx_fail" + severity FAILURE; + assert ((r1.slow_valid or r1.stcx_fail) and r2.hit_load_valid) /= '1' report "unexpected hit_load_delayed collision with slow_valid" severity FAILURE; - assert (r1.slow_valid and r1.update_valid) /= '1' report - "unexpected update_valid collision with slow_valid" + assert ((r1.slow_valid or r1.stcx_fail) and r1.update_valid) /= '1' report + "unexpected update_valid collision with slow_valid or stcx_fail" severity FAILURE; -- Delayed load hit case is the standard path @@ -551,6 +607,8 @@ begin d_out.xerc <= r1.req.xerc; d_out.second_word <= r1.second_dword; end if; + d_out.rc <= r1.req.rc; + d_out.store_done <= '1'; -- If it's a store or a non-update load form, complete now -- unless we need to do another dword transfer @@ -561,6 +619,12 @@ begin end if; end if; + if r1.stcx_fail = '1' then + d_out.rc <= r1.req.rc; + d_out.store_done <= '0'; + d_out.valid <= '1'; + end if; + -- We have a register update to do. if r1.update_valid = '1' then d_out.write_enable <= '1'; @@ -657,7 +721,7 @@ begin if reloading and wishbone_in.ack = '1' and r1.store_way = i then do_write <= '1'; end if; - if req_op = OP_STORE_HIT and req_hit_way = i then + if req_op = OP_STORE_HIT and req_hit_way = i and cancel_store = '0' then assert not reloading report "Store hit while in state:" & state_t'image(r1.state) severity FAILURE; @@ -753,6 +817,7 @@ begin -- One cycle pulses reset r1.slow_valid <= '0'; r1.update_valid <= '0'; + r1.stcx_fail <= '0'; -- We cannot currently process a new request when not idle assert d_in.valid = '0' or r1.state = IDLE report "request " & @@ -832,10 +897,15 @@ begin r1.wb.sel <= req_sel; r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000"; r1.wb.dat <= req_data; - r1.wb.cyc <= '1'; - r1.wb.stb <= '1'; - r1.wb.we <= '1'; - r1.state <= STORE_WAIT_ACK; + if cancel_store = '0' then + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; + r1.wb.we <= '1'; + r1.state <= STORE_WAIT_ACK; + else + r1.stcx_fail <= '1'; + r1.state <= IDLE; + end if; -- OP_NONE and OP_BAD do nothing when OP_NONE => @@ -932,7 +1002,7 @@ begin r1.wb.cyc <= '0'; r1.wb.stb <= '0'; end if; - end case; + end case; end if; end if; end process; diff --git a/decode1.vhdl b/decode1.vhdl index bca7c2a..349aa7e 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -60,8 +60,8 @@ architecture behaviour of decode1 is 20 => (ALU, OP_RLC, RA, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwimi 21 => (ALU, OP_RLC, NONE, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwinm 23 => (ALU, OP_RLC, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwnm - 38 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- stb - 39 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', RC, '0', '0'), -- stbu + 38 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stb + 39 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbu 44 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sth 45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu 36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw @@ -278,19 +278,19 @@ architecture behaviour of decode1 is 2#1100111000# => (ALU, OP_SHR, NONE, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- srawi 2#1000011011# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- srd 2#1000011000# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- srw - 2#1010110110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', RC, '0', '0'), -- stbcx - 2#0011110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', RC, '0', '0'), -- stbux - 2#0011010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- stbx + 2#1010110110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stbcx + 2#0011110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbux + 2#0011010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stbx 2#1010010100# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdbrx - 2#0011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- stdcx + 2#0011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stdcx 2#0010110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stdux 2#0010010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdx 2#1110010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx - 2#1011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- sthcx + 2#1011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- sthcx 2#0110110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthux 2#0110010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthx 2#1010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwbrx - 2#0010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- stwcx + 2#0010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stwcx 2#0010110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stwux 2#0010010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwx 2#0000101000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subf @@ -329,7 +329,7 @@ architecture behaviour of decode1 is -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl -- op in out A out in out len ext pipe - constant attn_instr : decode_rom_t := (ALU, OP_ILLEGAL, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'); + constant attn_instr : decode_rom_t := (ALU, OP_ILLEGAL, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'); constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); constant sim_cfg_instr : decode_rom_t := (ALU, OP_SIM_CONFIG,NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'); diff --git a/decode2.vhdl b/decode2.vhdl index 3d6b7d8..ff773aa 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -334,6 +334,7 @@ begin v.e.byte_reverse := d_in.decode.byte_reverse; v.e.sign_extend := d_in.decode.sign_extend; v.e.update := d_in.decode.update; + v.e.reserve := d_in.decode.reserve; -- issue control control_valid_in <= d_in.valid; diff --git a/execute1.vhdl b/execute1.vhdl index c536a27..b1662b7 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -759,6 +759,8 @@ begin lv.update := e_in.update; lv.update_reg := gspr_to_gpr(e_in.read_reg1); lv.xerc := v.e.xerc; + lv.reserve := e_in.reserve; + lv.rc := e_in.rc; -- Update registers rin <= v; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 9e038e1..a0c0beb 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -51,6 +51,8 @@ begin v.update := l_in.update; v.update_reg := l_in.update_reg; v.xerc := l_in.xerc; + v.reserve := l_in.reserve; + v.rc := l_in.rc; -- XXX Temporary hack. Mark the op as non-cachable if the address -- is the form 0xc------- diff --git a/writeback.vhdl b/writeback.vhdl index b924ee0..0151561 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -63,6 +63,7 @@ begin variable xe: xer_common_t; variable zero : std_ulogic; variable sign : std_ulogic; + variable scf : std_ulogic_vector(3 downto 0); begin x(0) := e_in.valid; y(0) := l_in.valid; @@ -124,6 +125,17 @@ begin w_out.write_enable <= not partial_write or second_word; end if; + if l_in.rc = '1' then + -- st*cx. instructions + scf(3) := '0'; + scf(2) := '0'; + scf(1) := l_in.store_done; + scf(0) := xe.so; + c_out.write_cr_enable <= '1'; + c_out.write_cr_mask <= num_to_fxm(0); + c_out.write_cr_data(31 downto 28) <= scf; + end if; + -- shift and byte-reverse data bytes for i in 0 to 7 loop k := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset); From 81d777be02a1052433bad993063c0f50c13d8131 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 4 Mar 2020 16:20:05 +1100 Subject: [PATCH 06/10] dcache: Trim one cycle from the load hit path Currently we don't get the result from a load that hits in the dcache until the fourth cycle after the instruction was presented to loadstore1. This trims this back to 3 cycles by taking the low order bits of the address generated in loadstore1 into dcache directly (not via the output register of loadstore1) and using them to address the read port of the dcache data RAM. We use the lower 12 address bits here in the expectation that any reasonable data cache design will have a set size of 4kB or less in order to avoid the aliasing problems that can arise with a virtually-indexed physically-tagged cache if the set size is greater than the smallest page size provided by the MMU. With this we can get rid of r2 and drive the signals going to writeback from r1, since the load hit data is now available one cycle earlier. We need a multiplexer on the read address of the data cache RAM in order to handle the second doubleword of an unaligned access. One small complication is that we now need an extra cycle in the case of an unaligned load which misses in the data cache and which reads the 2nd-last and last doublewords of a cache line. This is the reason for the PRE_NEXT_DWORD state; if we just go straight to NEXT_DWORD then we end up having the write of the last doubleword of the cache line and the read of that same doubleword occurring in the same cycle, which means we read stale data rather than the just-fetched data. Signed-off-by: Paul Mackerras --- common.vhdl | 2 + dcache.vhdl | 98 +++++++++++++++++++++++-------------------------- loadstore1.vhdl | 4 ++ 3 files changed, 51 insertions(+), 53 deletions(-) diff --git a/common.vhdl b/common.vhdl index 84bbc47..4b879a1 100644 --- a/common.vhdl +++ b/common.vhdl @@ -229,6 +229,8 @@ package common is xerc : xer_common_t; reserve : std_ulogic; rc : std_ulogic; + early_low_addr : std_ulogic_vector(11 downto 0); + early_valid : std_ulogic; end record; type DcacheToWritebackType is record diff --git a/dcache.vhdl b/dcache.vhdl index 75b10c7..265022b 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -124,6 +124,7 @@ architecture rtl of dcache is -- Cache state machine type state_t is (IDLE, -- Normal load hit processing + PRE_NEXT_DWORD, -- Extra state before NEXT_DWORD NEXT_DWORD, -- Starting the 2nd xfer of misaligned LOAD_UPDATE, -- Load with update extra cycle LOAD_UPDATE2, -- Load with update extra cycle @@ -184,24 +185,6 @@ architecture rtl of dcache is signal r1 : reg_stage_1_t; - -- Second stage register, only used for load hits - -- - type reg_stage_2_t is record - hit_way : way_t; - hit_load_valid : std_ulogic; - load_is_update : std_ulogic; - load_reg : std_ulogic_vector(4 downto 0); - data_shift : std_ulogic_vector(2 downto 0); - length : std_ulogic_vector(3 downto 0); - sign_extend : std_ulogic; - byte_reverse : std_ulogic; - xerc : xer_common_t; - last_dword : std_ulogic; - second_dword : std_ulogic; - end record; - - signal r2 : reg_stage_2_t; - -- Reservation information -- type reservation_t is record @@ -221,6 +204,10 @@ architecture rtl of dcache is signal req_addr : std_ulogic_vector(63 downto 0); signal req_laddr : std_ulogic_vector(63 downto 0); signal req_sel : std_ulogic_vector(7 downto 0); + signal next_addr : std_ulogic_vector(63 downto 0); + + signal early_req_addr : std_ulogic_vector(11 downto 0); + signal early_req_row : row_t; signal cancel_store : std_ulogic; signal set_rsrv : std_ulogic; @@ -404,6 +391,12 @@ begin end generate; end generate; + -- Wishbone read and write and BRAM write sel bits generation + bus_sel <= wishbone_data_sel(d_in.length, d_in.addr); + + -- See if the operation crosses two doublewords + two_dwords <= or (bus_sel(15 downto 8)); + -- Cache request parsing and hit detection dcache_request : process(all) variable is_hit : std_ulogic; @@ -444,6 +437,9 @@ begin req_laddr <= req_addr(63 downto LINE_OFF_BITS) & (LINE_OFF_BITS-1 downto 0 => '0'); + -- Address of next doubleword, used for unaligned accesses + next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000"; + -- Test if pending request is a hit on any way hit_way := 0; is_hit := '0'; @@ -480,17 +476,21 @@ begin req_op <= op; + -- Versions of the address and row number that are valid one cycle earlier + -- in the cases where we need to read the cache data BRAM. + if r1.state = IDLE and op = OP_LOAD_HIT and two_dwords = '1' then + early_req_addr <= next_addr(11 downto 0); + elsif r1.state /= IDLE and r1.two_dwords = '1' and r1.second_dword = '0' then + early_req_addr <= r1.next_addr(11 downto 0); + else + early_req_addr <= d_in.early_low_addr; + end if; + early_req_row <= get_row(x"0000000000000" & early_req_addr); end process; -- Wire up wishbone request latch out of stage 1 wishbone_out <= r1.wb; - -- Wishbone read and write and BRAM write sel bits generation - bus_sel <= wishbone_data_sel(d_in.length, d_in.addr); - - -- See if the operation crosses two doublewords - two_dwords <= or (bus_sel(15 downto 8)); - -- TODO: Generate errors -- err_nc_collision <= '1' when req_op = OP_BAD else '0'; @@ -540,14 +540,14 @@ begin -- The mux on d_out.write reg defaults to the normal load hit case. d_out.write_enable <= '0'; d_out.valid <= '0'; - d_out.write_reg <= r2.load_reg; - d_out.write_data <= cache_out(r2.hit_way); - d_out.write_len <= r2.length; - d_out.write_shift <= r2.data_shift; - d_out.sign_extend <= r2.sign_extend; - d_out.byte_reverse <= r2.byte_reverse; - d_out.second_word <= r2.second_dword; - d_out.xerc <= r2.xerc; + d_out.write_reg <= r1.req.write_reg; + d_out.write_data <= cache_out(r1.hit_way); + d_out.write_len <= r1.req.length; + d_out.write_shift <= r1.req.addr(2 downto 0); + d_out.sign_extend <= r1.req.sign_extend; + d_out.byte_reverse <= r1.req.byte_reverse; + d_out.second_word <= r1.second_dword; + d_out.xerc <= r1.req.xerc; d_out.rc <= '0'; -- loads never have rc=1 d_out.store_done <= '0'; @@ -562,26 +562,27 @@ begin -- -- Sanity: Only one of these must be set in any given cycle - assert (r1.update_valid and r2.hit_load_valid) /= '1' report + assert (r1.update_valid and r1.hit_load_valid) /= '1' report "unexpected hit_load_delayed collision with update_valid" severity FAILURE; assert (r1.slow_valid and r1.stcx_fail) /= '1' report "unexpected slow_valid collision with stcx_fail" severity FAILURE; - assert ((r1.slow_valid or r1.stcx_fail) and r2.hit_load_valid) /= '1' report + assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid) /= '1' report "unexpected hit_load_delayed collision with slow_valid" severity FAILURE; assert ((r1.slow_valid or r1.stcx_fail) and r1.update_valid) /= '1' report "unexpected update_valid collision with slow_valid or stcx_fail" severity FAILURE; - -- Delayed load hit case is the standard path - if r2.hit_load_valid = '1' then + -- Load hit case is the standard path + if r1.hit_load_valid = '1' then d_out.write_enable <= '1'; -- If there isn't another dword to go and -- it's not a load with update, complete it now - if r2.last_dword = '1' and r2.load_is_update = '0' then + if (r1.second_dword or not r1.two_dwords) = '1' and + r1.req.update = '0' then report "completing load hit"; d_out.valid <= '1'; end if; @@ -693,7 +694,7 @@ begin begin -- Cache hit reads do_read <= '1'; - rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); + rd_addr <= std_ulogic_vector(to_unsigned(early_req_row, ROW_BITS)); cache_out(i) <= dout; -- Write mux: @@ -732,23 +733,11 @@ begin -- -- Cache hit synchronous machine for the easy case. This handles - -- non-update form load hits and stage 1 to stage 2 transfers + -- non-update form load hits -- dcache_fast_hit : process(clk) begin if rising_edge(clk) then - -- stage 1 -> stage 2 - r2.hit_load_valid <= r1.hit_load_valid; - r2.hit_way <= r1.hit_way; - r2.load_is_update <= r1.req.update; - r2.load_reg <= r1.req.write_reg; - r2.data_shift <= r1.req.addr(2 downto 0); - r2.length <= r1.req.length; - r2.sign_extend <= r1.req.sign_extend; - r2.byte_reverse <= r1.req.byte_reverse; - r2.second_dword <= r1.second_dword; - r2.last_dword <= r1.second_dword or not r1.two_dwords; - -- If we have a request incoming, we have to latch it as d_in.valid -- is only set for a single cycle. It's up to the control logic to -- ensure we don't override an uncompleted request (for now we are @@ -759,7 +748,7 @@ begin r1.req <= d_in; r1.second_dword <= '0'; r1.two_dwords <= two_dwords; - r1.next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000"; + r1.next_addr <= next_addr; r1.next_sel <= bus_sel(15 downto 8); report "op:" & op_t'image(req_op) & @@ -912,6 +901,9 @@ begin when OP_BAD => end case; + when PRE_NEXT_DWORD => + r1.state <= NEXT_DWORD; + when RELOAD_WAIT_ACK => -- Requests are all sent if stb is 0 stbs_done := r1.wb.stb = '0'; @@ -958,7 +950,7 @@ begin -- we also need to do the deferred update cycle. r1.slow_valid <= '1'; if r1.two_dwords and not r1.second_dword then - r1.state <= NEXT_DWORD; + r1.state <= PRE_NEXT_DWORD; elsif r1.req.update = '1' then r1.state <= LOAD_UPDATE2; report "completing miss with load-update !"; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index a0c0beb..a25e617 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -89,5 +89,9 @@ begin -- Update outputs l_out <= r; + + -- Asynchronous output of the low-order address bits (latched in dcache) + l_out.early_low_addr <= lsu_sum(11 downto 0); + l_out.early_valid <= l_in.valid; end process; end; From ef9c1efd72e39c86a63d85c440f13a431d90bae8 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 5 Mar 2020 15:02:10 +1100 Subject: [PATCH 07/10] dcache: Remove LOAD_UPDATE2 state Since we removed one cycle from the load hit case, we actually no longer need the extra cycle provided by having the LOAD_UPDATE state. Therefore this makes the load hit case in the IDLE and NEXT_DWORD states go to LOAD_UPDATE2 rather than LOAD_UPDATE. Then we remove LOAD_UPDATE and then rename LOAD_UPDATE2 to LOAD_UPDATE. Signed-off-by: Paul Mackerras --- dcache.vhdl | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 265022b..bcc7590 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -127,7 +127,6 @@ architecture rtl of dcache is PRE_NEXT_DWORD, -- Extra state before NEXT_DWORD NEXT_DWORD, -- Starting the 2nd xfer of misaligned LOAD_UPDATE, -- Load with update extra cycle - LOAD_UPDATE2, -- Load with update extra cycle RELOAD_WAIT_ACK, -- Cache reload wait ack STORE_WAIT_ACK, -- Store wait ack NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack @@ -824,7 +823,6 @@ begin if two_dwords = '1' then r1.state <= NEXT_DWORD; elsif d_in.update = '1' then - -- We have a load with update hit, we need the delayed update cycle r1.state <= LOAD_UPDATE; end if; else @@ -952,7 +950,7 @@ begin if r1.two_dwords and not r1.second_dword then r1.state <= PRE_NEXT_DWORD; elsif r1.req.update = '1' then - r1.state <= LOAD_UPDATE2; + r1.state <= LOAD_UPDATE; report "completing miss with load-update !"; else r1.state <= IDLE; @@ -965,9 +963,6 @@ begin end if; when LOAD_UPDATE => - -- We need the extra cycle to complete a load with update - r1.state <= LOAD_UPDATE2; - when LOAD_UPDATE2 => -- We need the extra cycle to complete a load with update r1.update_valid <= '1'; r1.state <= IDLE; @@ -983,7 +978,7 @@ begin if r1.two_dwords and not r1.second_dword then r1.state <= NEXT_DWORD; elsif r1.state = NC_LOAD_WAIT_ACK and r1.req.update = '1' then - r1.state <= LOAD_UPDATE2; + r1.state <= LOAD_UPDATE; else r1.state <= IDLE; end if; From b349cc891a52c0453e7c721b98b96025995a4588 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 28 Mar 2020 20:24:13 +1100 Subject: [PATCH 08/10] loadstore1: Move logic from dcache to loadstore1 So that the dcache could in future be used by an MMU, this moves logic to do with data formatting, rA updates for update-form instructions, and handling of unaligned loads and stores out of dcache and into loadstore1. For now, dcache connects only to loadstore1, and loadstore1 now has the connection to writeback. Dcache generates a stall signal to loadstore1 which indicates that the request presented in the current cycle was not accepted and should be presented again. However, loadstore1 doesn't currently use it because we know that we can never hit the circumstances where it might be set. For unaligned transfers, loadstore1 generates two requests to dcache back-to-back, and then waits to see two acks back from dcache (cycles where d_in.valid is true). Loadstore1 now has a FSM for tracking how many acks we are expecting from dcache and for doing the rA update cycles when necessary. Handling for reservations and conditional stores is still in dcache. Loadstore1 now generates its own stall signal back to decode2, so we no longer need the logic in execute1 that generated the stall for the first two cycles. Signed-off-by: Paul Mackerras --- common.vhdl | 28 ++--- core.vhdl | 19 ++- dcache.vhdl | 325 +++++++++++------------------------------------- dcache_tb.vhdl | 29 ++--- execute1.vhdl | 8 -- loadstore1.vhdl | 276 +++++++++++++++++++++++++++++++++------- writeback.vhdl | 2 +- 7 files changed, 339 insertions(+), 348 deletions(-) diff --git a/common.vhdl b/common.vhdl index 4b879a1..f581ccb 100644 --- a/common.vhdl +++ b/common.vhdl @@ -218,22 +218,20 @@ package common is valid : std_ulogic; load : std_ulogic; nc : std_ulogic; + reserve : std_ulogic; addr : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); - write_reg : gpr_index_t; - length : std_ulogic_vector(3 downto 0); - byte_reverse : std_ulogic; - sign_extend : std_ulogic; - update : std_ulogic; - update_reg : gpr_index_t; - xerc : xer_common_t; - reserve : std_ulogic; - rc : std_ulogic; - early_low_addr : std_ulogic_vector(11 downto 0); - early_valid : std_ulogic; + byte_sel : std_ulogic_vector(7 downto 0); + end record; + + type DcacheToLoadstore1Type is record + valid : std_ulogic; + data : std_ulogic_vector(63 downto 0); + store_done : std_ulogic; + error : std_ulogic; end record; - type DcacheToWritebackType is record + type Loadstore1ToWritebackType is record valid : std_ulogic; write_enable: std_ulogic; write_reg : gpr_index_t; @@ -247,9 +245,9 @@ package common is rc : std_ulogic; store_done : std_ulogic; end record; - constant DcacheToWritebackInit : DcacheToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', - byte_reverse => '0', second_word => '0', xerc => xerc_init, - rc => '0', store_done => '0', others => (others => '0')); + constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', + byte_reverse => '0', second_word => '0', xerc => xerc_init, + rc => '0', store_done => '0', others => (others => '0')); type Execute1ToWritebackType is record valid: std_ulogic; diff --git a/core.vhdl b/core.vhdl index 87e73a4..d535f7a 100644 --- a/core.vhdl +++ b/core.vhdl @@ -61,8 +61,11 @@ architecture behave of core is -- load store signals signal execute1_to_loadstore1: Execute1ToLoadstore1Type; + signal loadstore1_to_writeback: Loadstore1ToWritebackType; + + -- dcache signals signal loadstore1_to_dcache: Loadstore1ToDcacheType; - signal dcache_to_writeback: DcacheToWritebackType; + signal dcache_to_loadstore1: DcacheToLoadstore1Type; -- local signals signal fetch1_stall_in : std_ulogic; @@ -73,6 +76,7 @@ architecture behave of core is signal decode2_stall_out : std_ulogic; signal ex1_icache_inval: std_ulogic; signal ex1_stall_out: std_ulogic; + signal ls1_stall_out: std_ulogic; signal dcache_stall_out: std_ulogic; signal flush: std_ulogic; @@ -196,7 +200,7 @@ begin c_in => cr_file_to_decode2, c_out => decode2_to_cr_file ); - decode2_stall_in <= ex1_stall_out or dcache_stall_out; + decode2_stall_in <= ex1_stall_out or ls1_stall_out; register_file_0: entity work.register_file generic map ( @@ -243,8 +247,13 @@ begin loadstore1_0: entity work.loadstore1 port map ( clk => clk, + rst => core_rst, l_in => execute1_to_loadstore1, - l_out => loadstore1_to_dcache + l_out => loadstore1_to_writeback, + d_out => loadstore1_to_dcache, + d_in => dcache_to_loadstore1, + dc_stall => dcache_stall_out, + stall_out => ls1_stall_out ); dcache_0: entity work.dcache @@ -257,7 +266,7 @@ begin clk => clk, rst => core_rst, d_in => loadstore1_to_dcache, - d_out => dcache_to_writeback, + d_out => dcache_to_loadstore1, stall_out => dcache_stall_out, wishbone_in => wishbone_data_in, wishbone_out => wishbone_data_out @@ -267,7 +276,7 @@ begin port map ( clk => clk, e_in => execute1_to_writeback, - l_in => dcache_to_writeback, + l_in => loadstore1_to_writeback, w_out => writeback_to_register_file, c_out => writeback_to_cr_file, complete_out => complete diff --git a/dcache.vhdl b/dcache.vhdl index bcc7590..7e553bf 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -7,9 +7,6 @@ -- * Complete load misses on the cycle when WB data comes instead of -- at the end of line (this requires dealing with requests coming in -- while not idle...) --- * Load with update could use one less non-pipelined cycle by moving --- the register update to the pipeline bubble that exists when going --- back to the IDLE state. -- library ieee; use ieee.std_logic_1164.all; @@ -35,7 +32,7 @@ entity dcache is rst : in std_ulogic; d_in : in Loadstore1ToDcacheType; - d_out : out DcacheToWritebackType; + d_out : out DcacheToLoadstore1Type; stall_out : out std_ulogic; @@ -113,6 +110,8 @@ architecture rtl of dcache is attribute ram_style : string; attribute ram_style of cache_tags : signal is "distributed"; + signal r0 : Loadstore1ToDcacheType; + -- Type of operation on a "valid" input type op_t is (OP_NONE, OP_LOAD_HIT, -- Cache hit on load @@ -124,10 +123,8 @@ architecture rtl of dcache is -- Cache state machine type state_t is (IDLE, -- Normal load hit processing - PRE_NEXT_DWORD, -- Extra state before NEXT_DWORD - NEXT_DWORD, -- Starting the 2nd xfer of misaligned - LOAD_UPDATE, -- Load with update extra cycle RELOAD_WAIT_ACK, -- Cache reload wait ack + FINISH_LD_MISS, -- Extra cycle after load miss STORE_WAIT_ACK, -- Store wait ack NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack @@ -158,15 +155,6 @@ architecture rtl of dcache is hit_way : way_t; hit_load_valid : std_ulogic; - -- Info for doing the second transfer of a misaligned load/store - two_dwords : std_ulogic; - second_dword : std_ulogic; - next_addr : std_ulogic_vector(63 downto 0); - next_sel : std_ulogic_vector(7 downto 0); - - -- Register update (load/store with update) - update_valid : std_ulogic; - -- Data buffer for "slow" read ops (load miss and NC loads). slow_data : std_ulogic_vector(63 downto 0); slow_valid : std_ulogic; @@ -200,12 +188,8 @@ architecture rtl of dcache is signal req_tag : cache_tag_t; signal req_op : op_t; signal req_data : std_ulogic_vector(63 downto 0); - signal req_addr : std_ulogic_vector(63 downto 0); signal req_laddr : std_ulogic_vector(63 downto 0); - signal req_sel : std_ulogic_vector(7 downto 0); - signal next_addr : std_ulogic_vector(63 downto 0); - signal early_req_addr : std_ulogic_vector(11 downto 0); signal early_req_row : row_t; signal cancel_store : std_ulogic; @@ -222,10 +206,8 @@ architecture rtl of dcache is signal replace_way : way_t; -- Wishbone read/write/cache write formatting signals - signal bus_sel : std_ulogic_vector(15 downto 0); + signal bus_sel : std_ulogic_vector(7 downto 0); - signal two_dwords : std_ulogic; - -- -- Helper functions to decode incoming requests -- @@ -305,37 +287,6 @@ architecture rtl of dcache is tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; end; - -- Generate byte enables from sizes - function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is - begin - case length is - when "0001" => - return "00000001"; - when "0010" => - return "00000011"; - when "0100" => - return "00001111"; - when "1000" => - return "11111111"; - when others => - return "00000000"; - end case; - end function length_to_sel; - - -- Calculate byte enables for wishbone - -- This returns 16 bits, giving the select signals for two transfers, - -- to account for unaligned loads or stores - function wishbone_data_sel(size : in std_logic_vector(3 downto 0); - address : in std_logic_vector(63 downto 0)) - return std_ulogic_vector is - variable longsel : std_ulogic_vector(15 downto 0); - begin - longsel := (others => '0'); - longsel(7 downto 0) := length_to_sel(size); - return std_ulogic_vector(shift_left(unsigned(longsel), - to_integer(unsigned(address(2 downto 0))))); - end function wishbone_data_sel; - begin assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; @@ -390,11 +341,17 @@ begin end generate; end generate; - -- Wishbone read and write and BRAM write sel bits generation - bus_sel <= wishbone_data_sel(d_in.length, d_in.addr); - - -- See if the operation crosses two doublewords - two_dwords <= or (bus_sel(15 downto 8)); + -- Latch the request in r0 as long as we're not stalling + stage_0 : process(clk) + begin + if rising_edge(clk) then + if rst = '1' then + r0.valid <= '0'; + elsif stall_out = '0' then + r0 <= d_in; + end if; + end if; + end process; -- Cache request parsing and hit detection dcache_request : process(all) @@ -405,40 +362,21 @@ begin variable data : std_ulogic_vector(63 downto 0); variable opsel : std_ulogic_vector(3 downto 0); variable go : std_ulogic; - variable is_load : std_ulogic; - variable is_nc : std_ulogic; begin -- Extract line, row and tag from request - if r1.state /= NEXT_DWORD then - req_addr <= d_in.addr; - req_data <= d_in.data; - req_sel <= bus_sel(7 downto 0); - go := d_in.valid; - is_load := d_in.load; - is_nc := d_in.nc; - - else - req_addr <= r1.next_addr; - req_data <= r1.req.data; - req_sel <= r1.next_sel; - go := '1'; - is_load := r1.req.load; - is_nc := r1.req.nc; - end if; + req_index <= get_index(r0.addr); + req_row <= get_row(r0.addr); + req_tag <= get_tag(r0.addr); - req_index <= get_index(req_addr); - req_row <= get_row(req_addr); - req_tag <= get_tag(req_addr); + -- Only do anything if not being stalled by stage 1 + go := r0.valid and not stall_out; -- Calculate address of beginning of cache line, will be -- used for cache miss processing if needed -- - req_laddr <= req_addr(63 downto LINE_OFF_BITS) & + req_laddr <= r0.addr(63 downto LINE_OFF_BITS) & (LINE_OFF_BITS-1 downto 0 => '0'); - -- Address of next doubleword, used for unaligned accesses - next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000"; - -- Test if pending request is a hit on any way hit_way := 0; is_hit := '0'; @@ -460,7 +398,7 @@ begin -- Combine the request and cache his status to decide what -- operation needs to be done -- - opsel := go & is_load & is_nc & is_hit; + opsel := go & r0.load & r0.nc & is_hit; case opsel is when "1101" => op := OP_LOAD_HIT; when "1100" => op := OP_LOAD_MISS; @@ -475,16 +413,15 @@ begin req_op <= op; - -- Versions of the address and row number that are valid one cycle earlier + -- Version of the row number that is valid one cycle earlier -- in the cases where we need to read the cache data BRAM. - if r1.state = IDLE and op = OP_LOAD_HIT and two_dwords = '1' then - early_req_addr <= next_addr(11 downto 0); - elsif r1.state /= IDLE and r1.two_dwords = '1' and r1.second_dword = '0' then - early_req_addr <= r1.next_addr(11 downto 0); + -- If we're stalling then we need to keep reading the last + -- row requested. + if stall_out = '0' then + early_req_row <= get_row(d_in.addr); else - early_req_addr <= d_in.early_low_addr; + early_req_row <= req_row; end if; - early_req_row <= get_row(x"0000000000000" & early_req_addr); end process; -- Wire up wishbone request latch out of stage 1 @@ -502,17 +439,17 @@ begin cancel_store <= '0'; set_rsrv <= '0'; clear_rsrv <= '0'; - if d_in.valid = '1' and d_in.reserve = '1' then + if stall_out = '0' and r0.valid = '1' and r0.reserve = '1' then -- XXX generate alignment interrupt if address is not aligned - -- XXX or if d_in.nc = '1' - if d_in.load = '1' then + -- XXX or if r0.nc = '1' + if r0.load = '1' then -- load with reservation set_rsrv <= '1'; else -- store conditional clear_rsrv <= '1'; if reservation.valid = '0' or - d_in.addr(63 downto LINE_OFF_BITS) /= reservation.addr then + r0.addr(63 downto LINE_OFF_BITS) /= reservation.addr then cancel_store <= '1'; end if; end if; @@ -526,28 +463,19 @@ begin reservation.valid <= '0'; elsif set_rsrv = '1' then reservation.valid <= '1'; - reservation.addr <= d_in.addr(63 downto LINE_OFF_BITS); + reservation.addr <= r0.addr(63 downto LINE_OFF_BITS); end if; end if; end process; - -- Writeback (loads and reg updates) & completion control logic + -- Return data for loads & completion control logic -- writeback_control: process(all) begin - -- The mux on d_out.write reg defaults to the normal load hit case. - d_out.write_enable <= '0'; + -- The mux on d_out.data defaults to the normal load hit case. d_out.valid <= '0'; - d_out.write_reg <= r1.req.write_reg; - d_out.write_data <= cache_out(r1.hit_way); - d_out.write_len <= r1.req.length; - d_out.write_shift <= r1.req.addr(2 downto 0); - d_out.sign_extend <= r1.req.sign_extend; - d_out.byte_reverse <= r1.req.byte_reverse; - d_out.second_word <= r1.second_dword; - d_out.xerc <= r1.req.xerc; - d_out.rc <= '0'; -- loads never have rc=1 + d_out.data <= cache_out(r1.hit_way); d_out.store_done <= '0'; -- We have a valid load or store hit or we just completed a slow @@ -561,30 +489,17 @@ begin -- -- Sanity: Only one of these must be set in any given cycle - assert (r1.update_valid and r1.hit_load_valid) /= '1' report - "unexpected hit_load_delayed collision with update_valid" - severity FAILURE; assert (r1.slow_valid and r1.stcx_fail) /= '1' report "unexpected slow_valid collision with stcx_fail" severity FAILURE; assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid) /= '1' report "unexpected hit_load_delayed collision with slow_valid" severity FAILURE; - assert ((r1.slow_valid or r1.stcx_fail) and r1.update_valid) /= '1' report - "unexpected update_valid collision with slow_valid or stcx_fail" - severity FAILURE; -- Load hit case is the standard path if r1.hit_load_valid = '1' then - d_out.write_enable <= '1'; - - -- If there isn't another dword to go and - -- it's not a load with update, complete it now - if (r1.second_dword or not r1.two_dwords) = '1' and - r1.req.update = '0' then - report "completing load hit"; - d_out.valid <= '1'; - end if; + report "completing load hit"; + d_out.valid <= '1'; end if; -- Slow ops (load miss, NC, stores) @@ -593,63 +508,20 @@ begin -- mux accordingly -- if r1.req.load then - d_out.write_reg <= r1.req.write_reg; - d_out.write_enable <= '1'; - - -- Read data comes from the slow data latch, formatter - -- from the latched request. - -- - d_out.write_data <= r1.slow_data; - d_out.write_shift <= r1.req.addr(2 downto 0); - d_out.sign_extend <= r1.req.sign_extend; - d_out.byte_reverse <= r1.req.byte_reverse; - d_out.write_len <= r1.req.length; - d_out.xerc <= r1.req.xerc; - d_out.second_word <= r1.second_dword; + -- Read data comes from the slow data latch + d_out.data <= r1.slow_data; end if; - d_out.rc <= r1.req.rc; d_out.store_done <= '1'; - -- If it's a store or a non-update load form, complete now - -- unless we need to do another dword transfer - if (r1.req.load = '0' or r1.req.update = '0') and - (r1.two_dwords = '0' or r1.second_dword = '1') then - report "completing store or load miss"; - d_out.valid <= '1'; - end if; + report "completing store or load miss"; + d_out.valid <= '1'; end if; if r1.stcx_fail = '1' then - d_out.rc <= r1.req.rc; d_out.store_done <= '0'; d_out.valid <= '1'; end if; - -- We have a register update to do. - if r1.update_valid = '1' then - d_out.write_enable <= '1'; - d_out.write_reg <= r1.req.update_reg; - - -- Change the read data mux to the address that's going into - -- the register and the formatter does nothing. - -- - d_out.write_data <= r1.req.addr; - d_out.write_shift <= "000"; - d_out.write_len <= "1000"; - d_out.sign_extend <= '0'; - d_out.byte_reverse <= '0'; - d_out.xerc <= r1.req.xerc; - d_out.second_word <= '0'; - - -- If it was a load, this completes the operation (load with - -- update case). - -- - if r1.req.load = '1' then - report "completing after load update"; - d_out.valid <= '1'; - end if; - end if; - end process; -- @@ -703,11 +575,11 @@ begin -- For timing, the mux on wr_data/sel/addr is not dependent on anything -- other than the current state. Only the do_write signal is. -- - if r1.state = IDLE or r1.state = NEXT_DWORD then - -- In these states, the only write path is the store-hit update case + if r1.state = IDLE then + -- In IDLE state, the only write path is the store-hit update case wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); - wr_data <= req_data; - wr_sel <= req_sel; + wr_data <= r0.data; + wr_sel <= r0.byte_sel; else -- Otherwise, we might be doing a reload wr_data <= wishbone_in.dat; @@ -731,35 +603,25 @@ begin end generate; -- - -- Cache hit synchronous machine for the easy case. This handles - -- non-update form load hits + -- Cache hit synchronous machine for the easy case. This handles load hits. -- dcache_fast_hit : process(clk) begin if rising_edge(clk) then - -- If we have a request incoming, we have to latch it as d_in.valid + -- If we have a request incoming, we have to latch it as r0.valid -- is only set for a single cycle. It's up to the control logic to -- ensure we don't override an uncompleted request (for now we are -- single issue on load/stores so we are fine, later, we can generate -- a stall output if necessary). - if req_op /= OP_NONE and d_in.valid = '1' then - r1.req <= d_in; - r1.second_dword <= '0'; - r1.two_dwords <= two_dwords; - r1.next_addr <= next_addr; - r1.next_sel <= bus_sel(15 downto 8); - + if req_op /= OP_NONE and stall_out = '0' then + r1.req <= r0; report "op:" & op_t'image(req_op) & - " addr:" & to_hstring(d_in.addr) & - " upd:" & std_ulogic'image(d_in.update) & - " nc:" & std_ulogic'image(d_in.nc) & - " reg:" & to_hstring(d_in.write_reg) & + " addr:" & to_hstring(r0.addr) & + " nc:" & std_ulogic'image(r0.nc) & " idx:" & integer'image(req_index) & " tag:" & to_hstring(req_tag) & " way: " & integer'image(req_hit_way); - elsif r1.state = NEXT_DWORD then - r1.second_dword <= '1'; end if; -- Fast path for load/store hits. Set signals for the writeback controls. @@ -776,7 +638,6 @@ begin -- Every other case is handled by this state machine: -- -- * Cache load miss/reload (in conjunction with "rams") - -- * Load hits for update forms -- * Load hits for non-cachable forms -- * Stores (the collision case is handled in "rams") -- @@ -795,7 +656,6 @@ begin end loop; r1.state <= IDLE; r1.slow_valid <= '0'; - r1.update_valid <= '0'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; @@ -804,39 +664,19 @@ begin else -- One cycle pulses reset r1.slow_valid <= '0'; - r1.update_valid <= '0'; r1.stcx_fail <= '0'; - -- We cannot currently process a new request when not idle - assert d_in.valid = '0' or r1.state = IDLE report "request " & - op_t'image(req_op) & " while in state " & state_t'image(r1.state) - severity FAILURE; - -- Main state machine case r1.state is - when IDLE | NEXT_DWORD => + when IDLE => case req_op is when OP_LOAD_HIT => - if r1.state = IDLE then - -- If the load is misaligned then we will need to start - -- the state machine - if two_dwords = '1' then - r1.state <= NEXT_DWORD; - elsif d_in.update = '1' then - r1.state <= LOAD_UPDATE; - end if; - else - if r1.req.update = '1' then - r1.state <= LOAD_UPDATE; - else - r1.state <= IDLE; - end if; - end if; + -- stay in IDLE state - when OP_LOAD_MISS => + when OP_LOAD_MISS => -- Normal load cache miss, start the reload machine -- - report "cache miss addr:" & to_hstring(req_addr) & + report "cache miss addr:" & to_hstring(r0.addr) & " idx:" & integer'image(req_index) & " way:" & integer'image(replace_way) & " tag:" & to_hstring(req_tag); @@ -871,19 +711,17 @@ begin r1.state <= RELOAD_WAIT_ACK; when OP_LOAD_NC => - r1.wb.sel <= req_sel; - r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.sel <= r0.byte_sel; + r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; r1.state <= NC_LOAD_WAIT_ACK; when OP_STORE_HIT | OP_STORE_MISS => - -- For store-with-update do the register update - r1.update_valid <= d_in.valid and d_in.update; - r1.wb.sel <= req_sel; - r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000"; - r1.wb.dat <= req_data; + r1.wb.sel <= r0.byte_sel; + r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.dat <= r0.data; if cancel_store = '0' then r1.wb.cyc <= '1'; r1.wb.stb <= '1'; @@ -899,9 +737,6 @@ begin when OP_BAD => end case; - when PRE_NEXT_DWORD => - r1.state <= NEXT_DWORD; - when RELOAD_WAIT_ACK => -- Requests are all sent if stb is 0 stbs_done := r1.wb.stb = '0'; @@ -943,31 +778,23 @@ begin -- Cache line is now valid cache_valids(r1.store_index)(r1.store_way) <= '1'; - -- Write back the load data that we got, and start - -- the second dword if necessary. Otherwise, see if - -- we also need to do the deferred update cycle. - r1.slow_valid <= '1'; - if r1.two_dwords and not r1.second_dword then - r1.state <= PRE_NEXT_DWORD; - elsif r1.req.update = '1' then - r1.state <= LOAD_UPDATE; - report "completing miss with load-update !"; - else - r1.state <= IDLE; - report "completing miss !"; - end if; + -- Don't complete and go idle until next cycle, in + -- case the next request is for the last dword of + -- the cache line we just loaded. + r1.state <= FINISH_LD_MISS; end if; -- Increment store row counter r1.store_row <= next_row(r1.store_row); end if; - when LOAD_UPDATE => - -- We need the extra cycle to complete a load with update - r1.update_valid <= '1'; - r1.state <= IDLE; + when FINISH_LD_MISS => + -- Write back the load data that we got + r1.slow_valid <= '1'; + r1.state <= IDLE; + report "completing miss !"; - when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK => + when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK => -- Clear stb when slave accepted request if wishbone_in.stall = '0' then r1.wb.stb <= '0'; @@ -975,16 +802,10 @@ begin -- Got ack ? complete. if wishbone_in.ack = '1' then - if r1.two_dwords and not r1.second_dword then - r1.state <= NEXT_DWORD; - elsif r1.state = NC_LOAD_WAIT_ACK and r1.req.update = '1' then - r1.state <= LOAD_UPDATE; - else - r1.state <= IDLE; - end if; if r1.state = NC_LOAD_WAIT_ACK then r1.slow_data <= wishbone_in.dat; end if; + r1.state <= IDLE; r1.slow_valid <= '1'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; diff --git a/dcache_tb.vhdl b/dcache_tb.vhdl index 437fd7d..bd8341a 100644 --- a/dcache_tb.vhdl +++ b/dcache_tb.vhdl @@ -13,7 +13,7 @@ architecture behave of dcache_tb is signal rst : std_ulogic; signal d_in : Loadstore1ToDcacheType; - signal d_out : DcacheToWritebackType; + signal d_out : DcacheToLoadstore1Type; signal wb_bram_in : wishbone_master_out; signal wb_bram_out : wishbone_slave_out; @@ -71,12 +71,6 @@ begin d_in.nc <= '0'; d_in.addr <= (others => '0'); d_in.data <= (others => '0'); - d_in.write_reg <= (others => '0'); - d_in.length <= (others => '0'); - d_in.byte_reverse <= '0'; - d_in.sign_extend <= '0'; - d_in.update <= '0'; - d_in.update_reg <= (others => '0'); wait for 4*clk_period; wait until rising_edge(clk); @@ -89,11 +83,10 @@ begin wait until rising_edge(clk); d_in.valid <= '0'; - wait until rising_edge(clk) and d_out.write_enable = '1'; - assert d_out.valid = '1'; - assert d_out.write_data = x"0000000100000000" + wait until rising_edge(clk) and d_out.valid = '1'; + assert d_out.data = x"0000000100000000" report "data @" & to_hstring(d_in.addr) & - "=" & to_hstring(d_out.write_data) & + "=" & to_hstring(d_out.data) & " expected 0000000100000000" severity failure; -- wait for clk_period; @@ -106,11 +99,10 @@ begin wait until rising_edge(clk); d_in.valid <= '0'; - wait until rising_edge(clk) and d_out.write_enable = '1'; - assert d_out.valid = '1'; - assert d_out.write_data = x"0000000D0000000C" + wait until rising_edge(clk) and d_out.valid = '1'; + assert d_out.data = x"0000000D0000000C" report "data @" & to_hstring(d_in.addr) & - "=" & to_hstring(d_out.write_data) & + "=" & to_hstring(d_out.data) & " expected 0000000D0000000C" severity failure; @@ -121,11 +113,10 @@ begin d_in.valid <= '1'; wait until rising_edge(clk); d_in.valid <= '0'; - wait until rising_edge(clk) and d_out.write_enable = '1'; - assert d_out.valid = '1'; - assert d_out.write_data = x"0000004100000040" + wait until rising_edge(clk) and d_out.valid = '1'; + assert d_out.data = x"0000004100000040" report "data @" & to_hstring(d_in.addr) & - "=" & to_hstring(d_out.write_data) & + "=" & to_hstring(d_out.data) & " expected 0000004100000040" severity failure; diff --git a/execute1.vhdl b/execute1.vhdl index b1662b7..b05fd4d 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -42,7 +42,6 @@ architecture behaviour of execute1 is next_lr : std_ulogic_vector(63 downto 0); mul_in_progress : std_ulogic; div_in_progress : std_ulogic; - ldst_in_progress : std_ulogic; cntz_in_progress : std_ulogic; slow_op_dest : gpr_index_t; slow_op_rc : std_ulogic; @@ -264,7 +263,6 @@ begin v.mul_in_progress := '0'; v.div_in_progress := '0'; v.cntz_in_progress := '0'; - v.ldst_in_progress := '0'; -- signals to multiply unit x_to_multiply <= Execute1ToMultiplyInit; @@ -662,8 +660,6 @@ begin when OP_LOAD | OP_STORE => -- loadstore/dcache has its own port to writeback v.e.valid := '0'; - stall_out <= '1'; - v.ldst_in_progress := '1'; when others => terminate_out <= '1'; @@ -703,10 +699,6 @@ begin v.e.rc := v.slow_op_rc; v.e.xerc := v.slow_op_xerc; v.e.valid := '1'; - elsif r.ldst_in_progress = '1' then - -- assert stall for 2 cycles on load/store, then - -- the stall output from dcache takes over - stall_out <= '1'; elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or (r.div_in_progress = '1' and divider_to_x.valid = '1') then diff --git a/loadstore1.vhdl b/loadstore1.vhdl index a25e617..2ab71ad 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -12,16 +12,85 @@ use work.helpers.all; entity loadstore1 is port ( clk : in std_ulogic; + rst : in std_ulogic; l_in : in Execute1ToLoadstore1Type; + l_out : out Loadstore1ToWritebackType; - l_out : out Loadstore1ToDcacheType + d_out : out Loadstore1ToDcacheType; + d_in : in DcacheToLoadstore1Type; + + dc_stall : in std_ulogic; + stall_out : out std_ulogic ); end loadstore1; +-- Note, we don't currently use the stall output from the dcache because +-- we know it can take two requests without stalling when idle, we are +-- its only user, and we know it never stalls when idle. + architecture behave of loadstore1 is - signal r, rin : Loadstore1ToDcacheType; + + -- State machine for unaligned loads/stores + type state_t is (IDLE, -- ready for instruction + SECOND_REQ, -- send 2nd request of unaligned xfer + FIRST_ACK_WAIT, -- waiting for 1st ack from dcache + LAST_ACK_WAIT, -- waiting for last ack from dcache + LD_UPDATE -- writing rA with computed addr on load + ); + + type reg_stage_t is record + -- latch most of the input request + load : std_ulogic; + addr : std_ulogic_vector(63 downto 0); + data : std_ulogic_vector(63 downto 0); + write_reg : gpr_index_t; + length : std_ulogic_vector(3 downto 0); + byte_reverse : std_ulogic; + sign_extend : std_ulogic; + update : std_ulogic; + update_reg : gpr_index_t; + xerc : xer_common_t; + reserve : std_ulogic; + rc : std_ulogic; + nc : std_ulogic; -- non-cacheable access + state : state_t; + second_bytes : std_ulogic_vector(7 downto 0); + end record; + + signal r, rin : reg_stage_t; signal lsu_sum : std_ulogic_vector(63 downto 0); + + -- Generate byte enables from sizes + function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is + begin + case length is + when "0001" => + return "00000001"; + when "0010" => + return "00000011"; + when "0100" => + return "00001111"; + when "1000" => + return "11111111"; + when others => + return "00000000"; + end case; + end function length_to_sel; + + -- Calculate byte enables + -- This returns 16 bits, giving the select signals for two transfers, + -- to account for unaligned loads or stores + function xfer_data_sel(size : in std_logic_vector(3 downto 0); + address : in std_logic_vector(2 downto 0)) + return std_ulogic_vector is + variable longsel : std_ulogic_vector(15 downto 0); + begin + longsel := "00000000" & length_to_sel(size); + return std_ulogic_vector(shift_left(unsigned(longsel), + to_integer(unsigned(address)))); + end function xfer_data_sel; + begin -- Calculate the address in the first cycle lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0'); @@ -29,69 +98,180 @@ begin loadstore1_0: process(clk) begin if rising_edge(clk) then - r <= rin; + if rst = '1' then + r.state <= IDLE; + else + r <= rin; + end if; end if; end process; loadstore1_1: process(all) - variable v : Loadstore1ToDcacheType; + variable v : reg_stage_t; variable brev_lenm1 : unsigned(2 downto 0); variable byte_offset : unsigned(2 downto 0); variable j : integer; variable k : unsigned(2 downto 0); + variable long_sel : std_ulogic_vector(15 downto 0); + variable byte_sel : std_ulogic_vector(7 downto 0); + variable req : std_ulogic; + variable stall : std_ulogic; + variable addr : std_ulogic_vector(63 downto 0); + variable wdata : std_ulogic_vector(63 downto 0); + variable write_enable : std_ulogic; + variable do_update : std_ulogic; + variable second_dword : std_ulogic; + variable done : std_ulogic; begin v := r; + req := '0'; + stall := '0'; + done := '0'; + byte_sel := (others => '0'); + addr := lsu_sum; + + write_enable := '0'; + do_update := '0'; + second_dword := '0'; + + case r.state is + when IDLE => + if l_in.valid = '1' then + v.load := l_in.load; + v.addr := lsu_sum; + v.data := l_in.data; + v.write_reg := l_in.write_reg; + v.length := l_in.length; + v.byte_reverse := l_in.byte_reverse; + v.sign_extend := l_in.sign_extend; + v.update := l_in.update; + v.update_reg := l_in.update_reg; + v.xerc := l_in.xerc; + v.reserve := l_in.reserve; + v.rc := l_in.rc; - v.valid := l_in.valid; - v.load := l_in.load; - v.write_reg := l_in.write_reg; - v.length := l_in.length; - v.byte_reverse := l_in.byte_reverse; - v.sign_extend := l_in.sign_extend; - v.update := l_in.update; - v.update_reg := l_in.update_reg; - v.xerc := l_in.xerc; - v.reserve := l_in.reserve; - v.rc := l_in.rc; - - -- XXX Temporary hack. Mark the op as non-cachable if the address - -- is the form 0xc------- - -- - -- This will have to be replaced by a combination of implementing the - -- proper HV CI load/store instructions and having an MMU to get the I - -- bit otherwise. - if lsu_sum(31 downto 28) = "1100" then - v.nc := '1'; - else - v.nc := '0'; - end if; - - -- XXX Do length_to_sel here ? - - -- Do byte reversing and rotating for stores in the first cycle - if v.load = '0' then - byte_offset := unsigned(lsu_sum(2 downto 0)); - brev_lenm1 := "000"; - if l_in.byte_reverse = '1' then - brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + -- XXX Temporary hack. Mark the op as non-cachable if the address + -- is the form 0xc------- + -- + -- This will have to be replaced by a combination of implementing the + -- proper HV CI load/store instructions and having an MMU to get the I + -- bit otherwise. + if lsu_sum(31 downto 28) = "1100" then + v.nc := '1'; + else + v.nc := '0'; + end if; + + -- Do length_to_sel and work out if we are doing 2 dwords + long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0)); + byte_sel := long_sel(7 downto 0); + v.second_bytes := long_sel(15 downto 8); + + v.addr := lsu_sum; + + -- Do byte reversing and rotating for stores in the first cycle + if v.load = '0' then + byte_offset := unsigned(lsu_sum(2 downto 0)); + brev_lenm1 := "000"; + if l_in.byte_reverse = '1' then + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + end if; + for i in 0 to 7 loop + k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; + j := to_integer(k) * 8; + v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); + end loop; + end if; + + req := '1'; + stall := '1'; + if long_sel(15 downto 8) = "00000000" then + v.state := LAST_ACK_WAIT; + else + v.state := SECOND_REQ; + end if; end if; - for i in 0 to 7 loop - k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; - j := to_integer(k) * 8; - v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); - end loop; - end if; - v.addr := lsu_sum; + when SECOND_REQ => + -- compute (addr + 8) & ~7 for the second doubleword when unaligned + addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; + byte_sel := r.second_bytes; + req := '1'; + stall := '1'; + v.state := FIRST_ACK_WAIT; + + when FIRST_ACK_WAIT => + stall := '1'; + if d_in.valid = '1' then + write_enable := r.load; + v.state := LAST_ACK_WAIT; + end if; + + when LAST_ACK_WAIT => + stall := '1'; + second_dword := or (r.second_bytes); + if d_in.valid = '1' then + write_enable := r.load; + if r.load = '1' and r.update = '1' then + -- loads with rA update need an extra cycle + v.state := LD_UPDATE; + else + -- stores write back rA update in this cycle + do_update := r.update; + stall := '0'; + done := '1'; + v.state := IDLE; + end if; + end if; + + when LD_UPDATE => + do_update := '1'; + v.state := IDLE; + done := '1'; + end case; -- Update registers rin <= v; - -- Update outputs - l_out <= r; + -- Update outputs to dcache + d_out.valid <= req; + d_out.load <= v.load; + d_out.nc <= v.nc; + d_out.reserve <= v.reserve; + d_out.addr <= addr; + d_out.data <= v.data; + d_out.byte_sel <= byte_sel; + + -- Update outputs to writeback + -- Multiplex either cache data to the destination GPR or + -- the address for the rA update. + l_out.valid <= done; + if do_update = '1' then + l_out.write_enable <= '1'; + l_out.write_reg <= r.update_reg; + l_out.write_data <= r.addr; + l_out.write_len <= x"8"; + l_out.write_shift <= "000"; + l_out.sign_extend <= '0'; + l_out.byte_reverse <= '0'; + l_out.second_word <= '0'; + l_out.rc <= '0'; + l_out.store_done <= '0'; + else + l_out.write_enable <= write_enable; + l_out.write_reg <= r.write_reg; + l_out.write_data <= d_in.data; + l_out.write_len <= r.length; + l_out.write_shift <= r.addr(2 downto 0); + l_out.sign_extend <= r.sign_extend; + l_out.byte_reverse <= r.byte_reverse; + l_out.second_word <= second_dword; + l_out.rc <= r.rc and done; + l_out.store_done <= d_in.store_done; + end if; + l_out.xerc <= r.xerc; + + stall_out <= stall; - -- Asynchronous output of the low-order address bits (latched in dcache) - l_out.early_low_addr <= lsu_sum(11 downto 0); - l_out.early_valid <= l_in.valid; end process; end; diff --git a/writeback.vhdl b/writeback.vhdl index 0151561..d52bb54 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -11,7 +11,7 @@ entity writeback is clk : in std_ulogic; e_in : in Execute1ToWritebackType; - l_in : in DcacheToWritebackType; + l_in : in Loadstore1ToWritebackType; w_out : out WritebackToRegisterFileType; c_out : out WritebackToCrFileType; From 4e38c2cc2155d8efa3edf2599606a011d7487320 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Mar 2020 09:16:58 +1100 Subject: [PATCH 09/10] loadstore1: Move load data formatting from writeback to loadstore1 This puts all the data formatting (byte rotation based on lowest three bits of the address, byte reversal, sign extension, zero extension) in loadstore1. Writeback now simply sends the data provided to the register files. Signed-off-by: Paul Mackerras --- common.vhdl | 8 +--- loadstore1.vhdl | 102 ++++++++++++++++++++++++++++++++----------- writeback.vhdl | 113 +++--------------------------------------------- 3 files changed, 83 insertions(+), 140 deletions(-) diff --git a/common.vhdl b/common.vhdl index f581ccb..e4d810e 100644 --- a/common.vhdl +++ b/common.vhdl @@ -236,17 +236,11 @@ package common is write_enable: std_ulogic; write_reg : gpr_index_t; write_data : std_ulogic_vector(63 downto 0); - write_len : std_ulogic_vector(3 downto 0); - write_shift : std_ulogic_vector(2 downto 0); - sign_extend : std_ulogic; - byte_reverse : std_ulogic; - second_word : std_ulogic; xerc : xer_common_t; rc : std_ulogic; store_done : std_ulogic; end record; - constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', - byte_reverse => '0', second_word => '0', xerc => xerc_init, + constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := (valid => '0', write_enable => '0', xerc => xerc_init, rc => '0', store_done => '0', others => (others => '0')); type Execute1ToWritebackType is record diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 2ab71ad..8c16886 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -43,7 +43,8 @@ architecture behave of loadstore1 is -- latch most of the input request load : std_ulogic; addr : std_ulogic_vector(63 downto 0); - data : std_ulogic_vector(63 downto 0); + store_data : std_ulogic_vector(63 downto 0); + load_data : std_ulogic_vector(63 downto 0); write_reg : gpr_index_t; length : std_ulogic_vector(3 downto 0); byte_reverse : std_ulogic; @@ -58,6 +59,10 @@ architecture behave of loadstore1 is second_bytes : std_ulogic_vector(7 downto 0); end record; + type byte_sel_t is array(0 to 7) of std_ulogic; + subtype byte_trim_t is std_ulogic_vector(1 downto 0); + type trim_ctl_t is array(0 to 7) of byte_trim_t; + signal r, rin : reg_stage_t; signal lsu_sum : std_ulogic_vector(63 downto 0); @@ -112,6 +117,7 @@ begin variable byte_offset : unsigned(2 downto 0); variable j : integer; variable k : unsigned(2 downto 0); + variable kk : unsigned(3 downto 0); variable long_sel : std_ulogic_vector(15 downto 0); variable byte_sel : std_ulogic_vector(7 downto 0); variable req : std_ulogic; @@ -120,8 +126,13 @@ begin variable wdata : std_ulogic_vector(63 downto 0); variable write_enable : std_ulogic; variable do_update : std_ulogic; - variable second_dword : std_ulogic; + variable two_dwords : std_ulogic; variable done : std_ulogic; + variable data_permuted : std_ulogic_vector(63 downto 0); + variable data_trimmed : std_ulogic_vector(63 downto 0); + variable use_second : byte_sel_t; + variable trim_ctl : trim_ctl_t; + variable negative : std_ulogic; begin v := r; req := '0'; @@ -132,14 +143,63 @@ begin write_enable := '0'; do_update := '0'; - second_dword := '0'; + two_dwords := or (r.second_bytes); + + -- load data formatting + if r.load = '1' then + byte_offset := unsigned(r.addr(2 downto 0)); + brev_lenm1 := "000"; + if r.byte_reverse = '1' then + brev_lenm1 := unsigned(r.length(2 downto 0)) - 1; + end if; + + -- shift and byte-reverse data bytes + for i in 0 to 7 loop + kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset); + use_second(i) := kk(3); + j := to_integer(kk(2 downto 0)) * 8; + data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j); + end loop; + + -- Work out the sign bit for sign extension. + -- Assumes we are not doing both sign extension and byte reversal, + -- in that for unaligned loads crossing two dwords we end up + -- using a bit from the second dword, whereas for a byte-reversed + -- (i.e. big-endian) load the sign bit would be in the first dword. + negative := (r.length(3) and data_permuted(63)) or + (r.length(2) and data_permuted(31)) or + (r.length(1) and data_permuted(15)) or + (r.length(0) and data_permuted(7)); + + -- trim and sign-extend + for i in 0 to 7 loop + if i < to_integer(unsigned(r.length)) then + if two_dwords = '1' then + trim_ctl(i) := '1' & not use_second(i); + else + trim_ctl(i) := not use_second(i) & '0'; + end if; + else + trim_ctl(i) := '0' & (negative and r.sign_extend); + end if; + case trim_ctl(i) is + when "11" => + data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8); + when "10" => + data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8); + when "01" => + data_trimmed(i * 8 + 7 downto i * 8) := x"FF"; + when others => + data_trimmed(i * 8 + 7 downto i * 8) := x"00"; + end case; + end loop; + end if; case r.state is when IDLE => if l_in.valid = '1' then v.load := l_in.load; v.addr := lsu_sum; - v.data := l_in.data; v.write_reg := l_in.write_reg; v.length := l_in.length; v.byte_reverse := l_in.byte_reverse; @@ -179,7 +239,7 @@ begin for i in 0 to 7 loop k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; j := to_integer(k) * 8; - v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); + v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); end loop; end if; @@ -203,13 +263,14 @@ begin when FIRST_ACK_WAIT => stall := '1'; if d_in.valid = '1' then - write_enable := r.load; v.state := LAST_ACK_WAIT; + if r.load = '1' then + v.load_data := data_permuted; + end if; end if; when LAST_ACK_WAIT => stall := '1'; - second_dword := or (r.second_bytes); if d_in.valid = '1' then write_enable := r.load; if r.load = '1' and r.update = '1' then @@ -230,16 +291,13 @@ begin done := '1'; end case; - -- Update registers - rin <= v; - -- Update outputs to dcache d_out.valid <= req; d_out.load <= v.load; d_out.nc <= v.nc; d_out.reserve <= v.reserve; d_out.addr <= addr; - d_out.data <= v.data; + d_out.data <= v.store_data; d_out.byte_sel <= byte_sel; -- Update outputs to writeback @@ -250,28 +308,20 @@ begin l_out.write_enable <= '1'; l_out.write_reg <= r.update_reg; l_out.write_data <= r.addr; - l_out.write_len <= x"8"; - l_out.write_shift <= "000"; - l_out.sign_extend <= '0'; - l_out.byte_reverse <= '0'; - l_out.second_word <= '0'; - l_out.rc <= '0'; - l_out.store_done <= '0'; else l_out.write_enable <= write_enable; l_out.write_reg <= r.write_reg; - l_out.write_data <= d_in.data; - l_out.write_len <= r.length; - l_out.write_shift <= r.addr(2 downto 0); - l_out.sign_extend <= r.sign_extend; - l_out.byte_reverse <= r.byte_reverse; - l_out.second_word <= second_dword; - l_out.rc <= r.rc and done; - l_out.store_done <= d_in.store_done; + l_out.write_data <= data_trimmed; end if; l_out.xerc <= r.xerc; + l_out.rc <= r.rc and done; + l_out.store_done <= d_in.store_done; stall_out <= stall; + -- Update registers + rin <= v; + end process; + end; diff --git a/writeback.vhdl b/writeback.vhdl index d52bb54..d1a7faf 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -21,46 +21,12 @@ entity writeback is end entity writeback; architecture behaviour of writeback is - subtype byte_index_t is unsigned(2 downto 0); - type permutation_t is array(0 to 7) of byte_index_t; - subtype byte_trim_t is std_ulogic_vector(1 downto 0); - type trim_ctl_t is array(0 to 7) of byte_trim_t; - type byte_sel_t is array(0 to 7) of std_ulogic; - - signal data_len : unsigned(3 downto 0); - signal data_in : std_ulogic_vector(63 downto 0); - signal data_permuted : std_ulogic_vector(63 downto 0); - signal data_trimmed : std_ulogic_vector(63 downto 0); - signal data_latched : std_ulogic_vector(63 downto 0); - signal perm : permutation_t; - signal use_second : byte_sel_t; - signal byte_offset : unsigned(2 downto 0); - signal brev_lenm1 : unsigned(2 downto 0); - signal trim_ctl : trim_ctl_t; - signal rc : std_ulogic; - signal partial_write : std_ulogic; - signal sign_extend : std_ulogic; - signal negative : std_ulogic; - signal second_word : std_ulogic; begin - writeback_0: process(clk) - begin - if rising_edge(clk) then - if partial_write = '1' then - data_latched <= data_permuted; - end if; - end if; - end process; - writeback_1: process(all) variable x : std_ulogic_vector(0 downto 0); variable y : std_ulogic_vector(0 downto 0); - variable z : std_ulogic_vector(0 downto 0); variable w : std_ulogic_vector(0 downto 0); - variable j : integer; - variable k : unsigned(3 downto 0); variable cf: std_ulogic_vector(3 downto 0); - variable xe: xer_common_t; variable zero : std_ulogic; variable sign : std_ulogic; variable scf : std_ulogic_vector(3 downto 0); @@ -85,17 +51,10 @@ begin complete_out <= '1'; end if; - rc <= '0'; - brev_lenm1 <= "000"; - partial_write <= '0'; - second_word <= '0'; - xe := e_in.xerc; - data_in <= (others => '0'); - if e_in.write_enable = '1' then w_out.write_reg <= e_in.write_reg; + w_out.write_data <= e_in.write_data; w_out.write_enable <= '1'; - rc <= e_in.rc; end if; if e_in.write_cr_enable = '1' then @@ -109,20 +68,10 @@ begin c_out.write_xerc_data <= e_in.xerc; end if; - sign_extend <= l_in.sign_extend; - data_len <= unsigned(l_in.write_len); - byte_offset <= unsigned(l_in.write_shift); if l_in.write_enable = '1' then w_out.write_reg <= gpr_to_gspr(l_in.write_reg); - if l_in.byte_reverse = '1' then - brev_lenm1 <= unsigned(l_in.write_len(2 downto 0)) - 1; - end if; - second_word <= l_in.second_word; - if l_in.valid = '0' and (data_len + byte_offset > 8) then - partial_write <= '1'; - end if; - xe := l_in.xerc; - w_out.write_enable <= not partial_write or second_word; + w_out.write_data <= l_in.write_data; + w_out.write_enable <= '1'; end if; if l_in.rc = '1' then @@ -130,65 +79,15 @@ begin scf(3) := '0'; scf(2) := '0'; scf(1) := l_in.store_done; - scf(0) := xe.so; + scf(0) := l_in.xerc.so; c_out.write_cr_enable <= '1'; c_out.write_cr_mask <= num_to_fxm(0); c_out.write_cr_data(31 downto 28) <= scf; end if; - -- shift and byte-reverse data bytes - for i in 0 to 7 loop - k := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset); - perm(i) <= k(2 downto 0); - use_second(i) <= k(3); - end loop; - for i in 0 to 7 loop - j := to_integer(perm(i)) * 8; - data_permuted(i * 8 + 7 downto i * 8) <= l_in.write_data(j + 7 downto j); - end loop; - - -- If the data can arrive split over two cycles, this will be correct - -- provided we don't have both sign extension and byte reversal. - negative <= (data_len(3) and data_permuted(63)) or - (data_len(2) and data_permuted(31)) or - (data_len(1) and data_permuted(15)) or - (data_len(0) and data_permuted(7)); - - -- trim and sign-extend - for i in 0 to 7 loop - if i < to_integer(data_len) then - if second_word = '1' then - trim_ctl(i) <= '1' & not use_second(i); - else - trim_ctl(i) <= not use_second(i) & '0'; - end if; - else - trim_ctl(i) <= '0' & (negative and sign_extend); - end if; - end loop; - for i in 0 to 7 loop - case trim_ctl(i) is - when "11" => - data_trimmed(i * 8 + 7 downto i * 8) <= data_latched(i * 8 + 7 downto i * 8); - when "10" => - data_trimmed(i * 8 + 7 downto i * 8) <= data_permuted(i * 8 + 7 downto i * 8); - when "01" => - data_trimmed(i * 8 + 7 downto i * 8) <= x"FF"; - when others => - data_trimmed(i * 8 + 7 downto i * 8) <= x"00"; - end case; - end loop; - - -- deliver to regfile - if l_in.write_enable = '1' then - w_out.write_data <= data_trimmed; - else - w_out.write_data <= e_in.write_data; - end if; - -- Perform CR0 update for RC forms -- Note that loads never have a form with an RC bit, therefore this can test e_in.write_data - if rc = '1' then + if e_in.rc = '1' and e_in.write_enable = '1' then sign := e_in.write_data(63); zero := not (or e_in.write_data); c_out.write_cr_enable <= '1'; @@ -196,7 +95,7 @@ begin cf(3) := sign; cf(2) := not sign and not zero; cf(1) := zero; - cf(0) := xe.so; + cf(0) := e_in.xerc.so; c_out.write_cr_data(31 downto 28) <= cf; end if; end process; From 81369187c036d93a67b0abb6ca487151b84dc82b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Mar 2020 12:39:08 +1100 Subject: [PATCH 10/10] loadstore1: Add support for cache-inhibited load and store instructions This adds support for lbzcix, lhzcix, lwzcix, ldcix, stbcix, sthcix, stwcix and stdcix. The temporary hack where accesses to addresses of the form 0xc??????? are made non-cacheable is left in for now to avoid making existing programs non-functional. Signed-off-by: Paul Mackerras --- common.vhdl | 3 ++- decode1.vhdl | 8 ++++++++ execute1.vhdl | 5 +++++ loadstore1.vhdl | 3 +-- 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/common.vhdl b/common.vhdl index e4d810e..1ed0606 100644 --- a/common.vhdl +++ b/common.vhdl @@ -202,6 +202,7 @@ package common is data : std_ulogic_vector(63 downto 0); -- data to write, unused for read write_reg : gpr_index_t; length : std_ulogic_vector(3 downto 0); + ci : std_ulogic; -- cache-inhibited load/store byte_reverse : std_ulogic; sign_extend : std_ulogic; -- do we need to sign extend? update : std_ulogic; -- is this an update instruction? @@ -210,7 +211,7 @@ package common is reserve : std_ulogic; -- set for larx/stcx. rc : std_ulogic; -- set for stcx. end record; - constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0', + constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', load => '0', ci => '0', byte_reverse => '0', sign_extend => '0', update => '0', xerc => xerc_init, reserve => '0', rc => '0', others => (others => '0')); diff --git a/decode1.vhdl b/decode1.vhdl index 349aa7e..e9dae1e 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -216,22 +216,26 @@ architecture behaviour of decode1 is 2#1111001111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel 2#1111101111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel 2#0000110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lbarx + 2#1101010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbzcix 2#0001110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzux 2#0001010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbzx 2#0001010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- ldarx 2#1000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldbrx + 2#1101110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldcix 2#0000110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- ldux 2#0000010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldx 2#0001110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lharx 2#0101110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhaux 2#0101010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lhax 2#1100010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhbrx + 2#1100110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhzcix 2#0100110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lhzux 2#0100010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhzx 2#0000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lwarx 2#0101110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lwaux 2#0101010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lwax 2#1000010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwbrx + 2#1100010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwzcix 2#0000110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lwzux 2#0000010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwzx -- 2#1000000000# mcrxr @@ -278,18 +282,22 @@ architecture behaviour of decode1 is 2#1100111000# => (ALU, OP_SHR, NONE, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- srawi 2#1000011011# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- srd 2#1000011000# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- srw + 2#1111010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stbcix 2#1010110110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stbcx 2#0011110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbux 2#0011010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stbx 2#1010010100# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdbrx + 2#1111110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdcix 2#0011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stdcx 2#0010110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stdux 2#0010010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdx 2#1110010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx + 2#1110110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthcix 2#1011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- sthcx 2#0110110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthux 2#0110010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthx 2#1010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwbrx + 2#1110010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwcix 2#0010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stwcx 2#0010110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stwux 2#0010010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwx diff --git a/execute1.vhdl b/execute1.vhdl index b05fd4d..4703049 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -753,6 +753,11 @@ begin lv.xerc := v.e.xerc; lv.reserve := e_in.reserve; lv.rc := e_in.rc; + -- decode l*cix and st*cix instructions here + if e_in.insn(31 downto 26) = "011111" and e_in.insn(10 downto 9) = "11" and + e_in.insn(5 downto 1) = "10101" then + lv.ci := '1'; + end if; -- Update registers rin <= v; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 8c16886..518feee 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -209,6 +209,7 @@ begin v.xerc := l_in.xerc; v.reserve := l_in.reserve; v.rc := l_in.rc; + v.nc := l_in.ci; -- XXX Temporary hack. Mark the op as non-cachable if the address -- is the form 0xc------- @@ -218,8 +219,6 @@ begin -- bit otherwise. if lsu_sum(31 downto 28) = "1100" then v.nc := '1'; - else - v.nc := '0'; end if; -- Do length_to_sel and work out if we are doing 2 dwords