From 5a00029519a8887f87763cbef8424d1f90f2e1b1 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 13 Jun 2020 17:22:56 +1000 Subject: [PATCH 01/22] register_file: Report value being written before asserting it's not X If a bug causes an indeterminate value to be written to a GPR, an assert causes simulation to abort. Move the assert after the report of the GPR index and value so that we get to know what the bad value is before the simulation terminates. Signed-off-by: Paul Mackerras --- register_file.vhdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/register_file.vhdl b/register_file.vhdl index 2cffeea..4df032c 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -40,12 +40,12 @@ begin begin if rising_edge(clk) then if w_in.write_enable = '1' then - assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure; if w_in.write_reg(5) = '0' then report "Writing GPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data); else report "Writing GSPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data); end if; + assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure; registers(to_integer(unsigned(w_in.write_reg))) <= w_in.write_data; end if; end if; From 03f9d7a97e8bbec46b2cbd3386774295e48a8387 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 Jun 2020 14:57:30 +1000 Subject: [PATCH 02/22] tests/xics: Fix assumption that interrupts happen immediately Currently the test writes to the XICS and then checks that the expected interrupt has happened. This turns into a stbcix instruction followed immediately by a load from the variable that indicates whether an interrupt has happened. It is possible for it to take a few cycles for the store to reach the XICS and the interrupt request signal to come back to the core, particularly with improvements to the load/store unit and dcache. 
This therefore adds a delay between storing to the XICS and checking for the occurrence of an interrupt, so as to give the signals time to propagate. The delay loop does an arbitrary 10 iterations, and each iteration does two loads and one store to (cacheable) memory. Signed-off-by: Paul Mackerras --- tests/test_xics.bin | Bin 12384 -> 12384 bytes tests/xics/xics.c | 15 +++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/tests/test_xics.bin b/tests/test_xics.bin index 6dd993ce0af3fd582dcb3d1914ca85dcf8b48a1b..327f98f64e2bb62ea1ca48f98456b1dae59799f6 100755 GIT binary patch literal 12384 zcmeHMeP~9}x?1N^Bop^0P9~4sBwE9SP_d8El z_Os$ND{N?x_u%Q?d+xdCoZs)n)4Qz-uwh4VU1x-yG;m~qBLf^6;K%?+1~@XnkpYejaAbia z3mjSC$O1rNF!tn3n?cQea++#bPs_36+eU(8)q43!N->)0Pk&s9G?MObp!KasKKNbYK{^nEt@br^K+|q)!1HQ^Iwes!yffG0x{j#AxQ?Nj1)xhm3#vQ3B(Jd|Ce8I+<_SsKMAZu)*WY{ciy) zQ7}#9X@3fsgm}91&`89)R$}-%>E1s`8^=+fhd#?rOBw5*hyHa-dkXEpmHqO!E%?^p zo8!0o-XVP>FRfPJN99=dvrX1dpdOaKAGP>m-@}$R`~IeEo8Q&oJ1Oq;ebMAQn08k9 z{*oNazW-cX;_h|SuS1_@zm>8w?)NNh_B}7#W#62yO57~#K+cge+`;^fq9ylAW$sm( zdtMJTpw(2Bx7D{i7|;7w^`a8vro1;W(Ch8 zC6hsywh$o-9OX5cpPjakk_q+!+(99_aBmC}*4INLDz7M!`u z{}rH~qNEydX`V2z1!k;|ajdFOH`kQF(b11ZUo#2+;PO7F7{R3HNehtL8~)jDPdsXLf$v_Q z5FT?4&0uf68)s}AHq)ZCE|>JN=H5%6d4&J}SiiZfi&*jeKAXasq2&8%yvIbmJO?s& zIggdkHsx5Yw(6y=&9%b=jN7#H33z#q@HvFfzCyg?FP}jay82V1&F#&YujPHM<)-Ls zq=#DheDqn)Kk~)78bf&jH{#(-F)-hw>%NCGmhSycOB?nwKCd0vp#ay7Wj#|lIvm;m z6+ES_9yPRkch9bYJ>Ljz+y2C^9(SY43Iluhk8*{jjVSw}#pUq+AK({$Fn-S{l=gH~ zw7*)>{u(u#5(N0%E^uFLS2>qz1 ziMne;knt4k&U*BH=VKwhze1;Pg^q^4U*&{2xI*XgB^@u!52DXi5MqdRx;eh1buZPfh_Sm# zcWiHLqAoMae_W~!eWt|Aag6D}EPN0{`C=a5kg~tBz_e$CmbUtI?Y0+qof{SKBm0t=r(!Y zQo?CMG{>KW%Dd3zarR|1U^8GdU^8GdU^8GdU^8GdU^8GdU^8GdU^8GdU^DQ)V?dW5 gjPR+_yw1qBENHS^xl0J6pDT6jlOLj~IekM&zO2#spjdc^ku7gA-g@pl< z)7Z7c`hiUOkV(Lxg^;vc1ZbPW2eejNI;YhT-G)@uQo$H0fqw9xO$wu|c1(BoJJ0vR z<0W+KG|7|jlV0w<=ltHe=l8qky?38O6l)?H44hdo*ehson-GJ6G3UVu4c)cFzs(sN z40LL-2$9}My#t)G@kNd!p+ZV!>6i9HMj}f&M 
zdhs9h`Qrk$DNvgNwJA`W0<|enn*y~drgMvNU+5Iu2wWPtG;nF)(!izRO8}c5bGpL7 zr-4s1miz4EG5gr2k*}Gme|v0Q15s^KqnZHPEc>nNFdw$$-(dOP;asP2kDPkQ z+|&EjTspbyIUk;r^Ks0LrNuabxZXrYO+crb-jiSH3a>2-cGz>f2eWLz@1 z5kse7=(A$f&J#oNXHG;$L)SmWhEw46vgfKdRP`ckF%SJg1Q(LuQ%*-#9?ZQNT|&b? zqLZr@(Xe4;hNG2q#8*d=dgf3r^+4Q)53&={gO^2AqsE4XG_bOL=1NqjsXm<=dvy|h z<6_uohhC3jPRU7&l@eKV;@Y5i+{^Vr##9i$2{mwx62#{4L92|sr%RAFOU$9viCa4D*6zY> zPuGI07xjE%rJcjWwph&Jl+?h=;i$B|0uJT#aEM4hb9+PeEgZH;o8oZ5hG|lM$ssQ0 z=ir=&L#r(obNE~SyV-B$a7o&ZfkVrubJyVw>1PhNRNunku(T--=WUo?<(C||?&hw8 z-cX17Md{)6W_RZ-b&Tt*bx8tOLas}1-_>he*6o1wvtGYfeT!aQ(x&tpwP9M7AG~*#$!8+!m+w8+ZIP{h z=Ue-4*kUr58#0!a%N1$+4Y;&Ib9E0YE)&wv_N?k#xEz%>#pQwx6H)%pvG$?t|9qUaP>N|nd#IhqH3WOwZnRze7g0%b)nffv6jCmom&?_mA3bw<)`4Fw2aHT*eU&N z|CQ=nwEUK|DJ=(VnAOV9S`O*pQ`75=)qszF-{)A1Ee7YFESa181!?;Lx%=kH{j~IR z?l)B5lKTN^Q@NkBVfHBh=ct38Pwrm_THfdP$6ob}n)~af0ipr{w_YqH~yU%l|`+FYR(coF@ zxVTw8$6mtTA$0x6(dVplVywxr`M$jzjNWkMdG~?i7U-w!m!v)PUUPJ*H3r5L%J;_= z(X8q-Z;VIfnEzMgB5yy%2ciaY;k+vGO*ZDNiG(&hZvHnipvBc4(%&y%(E$ChIcgE^`7 zFpnQwmh3aIkF9Ozx!+v%y194R>wOpcvW82o95J4dv2?oLSMv|C0qfGpx?SuJ){jcx znDZU%+kBbO%|Au?d(PxaudP}m_eC0Vx{7B2-#e$}HEM6PMUFM_#=;~1Ti%4k4%o1( z<@kNz3}_ZgzZR|g*rF9SHcA(BzYD&k*90of_eD)_Pi%Si5onRZv+jO+3a(9D*ArWp z)Nx%3d!+kedF}plEXRBv-?y0mJ1ZjihS-@Adkq(2q&5cU-Lv1}BJ)|y`8-t{amj0W z{?s>r@R_xt_6bXtza0%#Ma^#w-sc)Kzl-vooA(^9o2+4UAFsgyX#0`1{`h%?d49dV z8PC|CnEUk5h83nJ;Lwn`i)A2n?fQ|~hPr~gw(j1xYxmYV+jzBY{B?8O0jQ4N6Z{9q zQ@++V>_MsB_w2LWa=fo$$Nog#P1Q9zJT03#wlebDn_AotFVx|KM&_Ztu!%}SKCNnz zX4+5#wC3+ZJTCo=<9?gP_9(^%&Q93c&{9i<{srHQEfv&UhL038jvwt1{P+#PJUGs7 zoFlO5rX`QthyE{ZxFfLr)`DYY#&Mp7Z9iIivfM|$S4xkR9CZ#mj%k?NSJD3c3i5q8 zb>%OY%^Y~%k8oDs6yhwe4PP_tSYJ_R;vFMM_QR<%pAHwO)7e`>+(t`n6?KTmVIAET zOdWh9p9taNSQbo=#5BU(0nAHTAu=4NnR7hkI=tZJvU@UZuJAj07FN(YGuJ;~x>)9W z2yoVCKdx)c39$ryK3MnT(pGsV<~^*9u#Ci{IR+o|t?A<1Le%HoCXQpTfsI=}_*w&J zo6Xh%TPJMHj}OLe$2n+;SB-HP=QwQbXsN~2`k3~lsbw?RVrA~~JPyeFmU;M+;!#wf zs6bJHq5?$)iV74JC@N4?pr}AmfuaIM1&Rv%_X=#6KML^A7JS6zST?j@b@J{xf2+Tk PH_gZ&CI0%Qdm;Y<^ycm; diff --git 
a/tests/xics/xics.c b/tests/xics/xics.c index 2ff4c54..a2db3a5 100644 --- a/tests/xics/xics.c +++ b/tests/xics/xics.c @@ -9,6 +9,14 @@ #undef DEBUG //#define DEBUG 1 +void delay(void) +{ + static volatile int i; + + for (i = 0; i < 10; ++i) + ; +} + void print_number(unsigned int i) // only for i = 0-999 { unsigned int j, k, m; @@ -148,14 +156,17 @@ int xics_test_0(void) xics_write8(XICS_MFRR, 0x05); // cause 0x500 interrupt // still masked, so shouldn't happen yet + delay(); assert(isrs_run == 0); // unmask IPI only xics_write8(XICS_XIRR, 0x40); + delay(); assert(isrs_run == ISR_IPI); // unmask UART xics_write8(XICS_XIRR, 0xc0); + delay(); assert(isrs_run == (ISR_IPI | ISR_UART)); // cleanup @@ -174,12 +185,14 @@ int xics_test_1(void) xics_write8(XICS_XIRR, 0xff); // allow all interrupts // should be none pending + delay(); assert(isrs_run == 0); // trigger both potato_uart_irq_en(); // cause 0x500 interrupt xics_write8(XICS_MFRR, 0x05); // cause 0x500 interrupt + delay(); assert(isrs_run == (ISR_IPI | ISR_UART)); // cleanup @@ -208,9 +221,11 @@ int xics_test_2(void) // trigger an IPI xics_write8(XICS_MFRR, 0x05); // cause 0x500 interrupt + delay(); assert(isrs_run == 0); mtmsrd(0x9000000000008003); // EE on + delay(); assert(isrs_run == ISR_IPI); // cleanup From aab84acda8676a82439b3ee0c905cc2db5c6ffce Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 13 Jun 2020 19:59:17 +1000 Subject: [PATCH 03/22] scripts/mw_debug: Make progress counts display on one line This outputs a carriage return rather than a newline after the display of the progress count during the load and save operations. This makes the output more compact and better looking. 
Signed-off-by: Paul Mackerras --- scripts/mw_debug/mw_debug.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/mw_debug/mw_debug.c b/scripts/mw_debug/mw_debug.c index c58073b..7f77558 100644 --- a/scripts/mw_debug/mw_debug.c +++ b/scripts/mw_debug/mw_debug.c @@ -507,8 +507,10 @@ static void load(const char *filename, uint64_t addr) // if (rc < 8) XXX fixup endian ? check(dmi_write(DBG_WB_DATA, data), "writing WB_DATA"); count += 8; - if (!(count % 1024)) - printf("%x...\n", count); + if (!(count % 1024)) { + printf("%x...\r", count); + fflush(stdout); + } } close(fd); printf("%x done.\n", count); @@ -535,8 +537,10 @@ static void save(const char *filename, uint64_t addr, uint64_t size) break; } count += 8; - if (!(count % 1024)) - printf("%x...\n", count); + if (!(count % 1024)) { + printf("%x...\r", count); + fflush(stdout); + } if (count >= size) break; } From 49a4d9f67a21438a4af703027baa72211409857a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 14 May 2020 13:25:48 +1000 Subject: [PATCH 04/22] Add core logging This logs 256 bits of data per cycle to a ring buffer in BRAM. The data collected can be read out through 2 new SPRs or through the debug interface. The new SPRs are LOG_ADDR (724) and LOG_DATA (725). LOG_ADDR contains the buffer write pointer in the upper 32 bits (in units of entries, i.e. 32 bytes) and the read pointer in the lower 32 bits (in units of doublewords, i.e. 8 bytes). Reading LOG_DATA gives the doubleword from the buffer at the read pointer and increments the read pointer. Setting bit 31 of LOG_ADDR inhibits the trace log system from writing to the log buffer, so the contents are stable and can be read. There are two new debug addresses which function similarly to the LOG_ADDR and LOG_DATA SPRs. The log is frozen while either or both of the LOG_ADDR SPR bit 31 or the debug LOG_ADDR register bit 31 are set. The buffer defaults to 2048 entries, i.e. 64kB. 
The size is set by the LOG_LENGTH generic on the core_debug module. Software can determine the length of the buffer because the length is ORed into the buffer write pointer in the upper 32 bits of LOG_ADDR. Hence the length of the buffer can be calculated as 1 << (31 - clz(LOG_ADDR)). There is a program to format the log entries in a somewhat readable fashion in scripts/fmt_log/fmt_log.c. The log_entry struct in that file describes the layout of the bits in the log entries. Signed-off-by: Paul Mackerras --- core.vhdl | 43 +++++-- core_debug.vhdl | 111 +++++++++++++++++ cr_file.vhdl | 15 ++- dcache.vhdl | 24 +++- decode1.vhdl | 18 ++- decode2.vhdl | 21 +++- execute1.vhdl | 42 ++++++- fetch1.vhdl | 8 +- icache.vhdl | 39 +++++- loadstore1.vhdl | 20 ++- register_file.vhdl | 14 ++- scripts/fmt_log/Makefile | 12 ++ scripts/fmt_log/fmt_log.c | 235 ++++++++++++++++++++++++++++++++++++ scripts/mw_debug/mw_debug.c | 87 +++++++++++++ 14 files changed, 671 insertions(+), 18 deletions(-) create mode 100644 scripts/fmt_log/Makefile create mode 100644 scripts/fmt_log/fmt_log.c diff --git a/core.vhdl b/core.vhdl index 8ba5b70..da9853f 100644 --- a/core.vhdl +++ b/core.vhdl @@ -128,6 +128,12 @@ architecture behave of core is -- Debug status signal dbg_core_is_stopped: std_ulogic; + -- Logging signals + signal log_data : std_ulogic_vector(255 downto 0); + signal log_rd_addr : std_ulogic_vector(31 downto 0); + signal log_wr_addr : std_ulogic_vector(31 downto 0); + signal log_rd_data : std_ulogic_vector(63 downto 0); + function keep_h(disable : boolean) return string is begin if disable then @@ -183,7 +189,8 @@ begin flush_in => flush, stop_in => dbg_core_stop, e_in => execute1_to_fetch1, - i_out => fetch1_to_icache + i_out => fetch1_to_icache, + log_out => log_data(42 downto 0) ); fetch1_stall_in <= icache_stall_out or decode2_stall_out; @@ -205,7 +212,8 @@ begin inval_in => dbg_icache_rst or ex1_icache_inval, stall_out => icache_stall_out, wishbone_out => wishbone_insn_out, - 
wishbone_in => wishbone_insn_in + wishbone_in => wishbone_insn_in, + log_out => log_data(96 downto 43) ); fetch2_0: entity work.fetch2 @@ -227,7 +235,8 @@ begin stall_in => decode1_stall_in, flush_in => flush, f_in => fetch2_to_decode1, - d_out => decode1_to_decode2 + d_out => decode1_to_decode2, + log_out => log_data(109 downto 97) ); decode1_stall_in <= decode2_stall_out; @@ -249,7 +258,8 @@ begin r_in => register_file_to_decode2, r_out => decode2_to_register_file, c_in => cr_file_to_decode2, - c_out => decode2_to_cr_file + c_out => decode2_to_cr_file, + log_out => log_data(119 downto 110) ); decode2_stall_in <= ex1_stall_out or ls1_stall_out; @@ -267,7 +277,8 @@ begin dbg_gpr_addr => dbg_gpr_addr, dbg_gpr_data => dbg_gpr_data, sim_dump => terminate, - sim_dump_done => sim_cr_dump + sim_dump_done => sim_cr_dump, + log_out => log_data(255 downto 185) ); cr_file_0: entity work.cr_file @@ -279,7 +290,8 @@ begin d_in => decode2_to_cr_file, d_out => cr_file_to_decode2, w_in => writeback_to_cr_file, - sim_dump => sim_cr_dump + sim_dump => sim_cr_dump, + log_out => log_data(184 downto 172) ); execute1_0: entity work.execute1 @@ -299,7 +311,11 @@ begin e_out => execute1_to_writeback, icache_inval => ex1_icache_inval, dbg_msr_out => msr, - terminate_out => terminate + terminate_out => terminate, + log_out => log_data(134 downto 120), + log_rd_addr => log_rd_addr, + log_rd_data => log_rd_data, + log_wr_addr => log_wr_addr ); loadstore1_0: entity work.loadstore1 @@ -314,7 +330,8 @@ begin m_out => loadstore1_to_mmu, m_in => mmu_to_loadstore1, dc_stall => dcache_stall_out, - stall_out => ls1_stall_out + stall_out => ls1_stall_out, + log_out => log_data(149 downto 140) ); mmu_0: entity work.mmu @@ -343,7 +360,8 @@ begin m_out => dcache_to_mmu, stall_out => dcache_stall_out, wishbone_in => wishbone_data_in, - wishbone_out => wishbone_data_out + wishbone_out => wishbone_data_out, + log_out => log_data(171 downto 152) ); writeback_0: entity work.writeback @@ -356,6 +374,9 @@ 
begin complete_out => complete ); + log_data(151 downto 150) <= "00"; + log_data(139 downto 135) <= "00000"; + debug_0: entity work.core_debug port map ( clk => clk, @@ -377,6 +398,10 @@ begin dbg_gpr_ack => dbg_gpr_ack, dbg_gpr_addr => dbg_gpr_addr, dbg_gpr_data => dbg_gpr_data, + log_data => log_data, + log_read_addr => log_rd_addr, + log_read_data => log_rd_data, + log_write_addr => log_wr_addr, terminated_out => terminated_out ); diff --git a/core_debug.vhdl b/core_debug.vhdl index c97213b..31e4ab8 100644 --- a/core_debug.vhdl +++ b/core_debug.vhdl @@ -3,9 +3,14 @@ use ieee.std_logic_1164.all; use ieee.numeric_std.all; library work; +use work.utils.all; use work.common.all; entity core_debug is + generic ( + -- Length of log buffer + LOG_LENGTH : positive := 2048 + ); port ( clk : in std_logic; rst : in std_logic; @@ -34,6 +39,12 @@ entity core_debug is dbg_gpr_addr : out gspr_index_t; dbg_gpr_data : in std_ulogic_vector(63 downto 0); + -- Core logging data + log_data : in std_ulogic_vector(255 downto 0); + log_read_addr : in std_ulogic_vector(31 downto 0); + log_read_data : out std_ulogic_vector(63 downto 0); + log_write_addr : out std_ulogic_vector(31 downto 0); + -- Misc terminated_out : out std_ulogic ); @@ -77,6 +88,10 @@ architecture behave of core_debug is -- GSPR register data constant DBG_CORE_GSPR_DATA : std_ulogic_vector(3 downto 0) := "0101"; + -- Log buffer address and data registers + constant DBG_CORE_LOG_ADDR : std_ulogic_vector(3 downto 0) := "0110"; + constant DBG_CORE_LOG_DATA : std_ulogic_vector(3 downto 0) := "0111"; + -- Some internal wires signal stat_reg : std_ulogic_vector(63 downto 0); @@ -89,6 +104,38 @@ architecture behave of core_debug is signal do_gspr_rd : std_ulogic; signal gspr_index : gspr_index_t; + -- Logging RAM + constant LOG_INDEX_BITS : natural := log2(LOG_LENGTH); + subtype log_ptr_t is unsigned(LOG_INDEX_BITS - 1 downto 0); + type log_array_t is array(0 to LOG_LENGTH - 1) of std_ulogic_vector(255 downto 0); + signal 
log_array : log_array_t; + signal log_rd_ptr : log_ptr_t; + signal log_wr_ptr : log_ptr_t; + signal log_toggle : std_ulogic; + signal log_wr_enable : std_ulogic; + signal log_rd_ptr_latched : log_ptr_t; + signal log_rd : std_ulogic_vector(255 downto 0); + signal log_dmi_addr : std_ulogic_vector(31 downto 0); + signal log_dmi_data : std_ulogic_vector(63 downto 0); + signal do_dmi_log_rd : std_ulogic; + signal log_dmi_reading : std_ulogic; + signal log_dmi_read_done : std_ulogic; + signal dmi_read_log_data : std_ulogic; + signal dmi_read_log_data_1 : std_ulogic; + + function select_dword(data : std_ulogic_vector(255 downto 0); + addr : std_ulogic_vector(31 downto 0)) return std_ulogic_vector is + variable firstbit : integer; + begin + firstbit := to_integer(unsigned(addr(1 downto 0))) * 64; + return data(firstbit + 63 downto firstbit); + end; + + attribute ram_style : string; + attribute ram_style of log_array : signal is "block"; + attribute ram_decomp : string; + attribute ram_decomp of log_array : signal is "power"; + begin -- Single cycle register accesses on DMI except for GSPR data dmi_ack <= dmi_req when dmi_addr /= DBG_CORE_GSPR_DATA @@ -108,6 +155,8 @@ begin nia when DBG_CORE_NIA, msr when DBG_CORE_MSR, dbg_gpr_data when DBG_CORE_GSPR_DATA, + log_write_addr & log_dmi_addr when DBG_CORE_LOG_ADDR, + log_dmi_data when DBG_CORE_LOG_DATA, (others => '0') when others; -- DMI writes @@ -118,6 +167,7 @@ begin do_step <= '0'; do_reset <= '0'; do_icreset <= '0'; + do_dmi_log_rd <= '0'; if (rst) then stopping <= '0'; @@ -151,11 +201,26 @@ begin end if; elsif dmi_addr = DBG_CORE_GSPR_INDEX then gspr_index <= dmi_din(gspr_index_t'left downto 0); + elsif dmi_addr = DBG_CORE_LOG_ADDR then + log_dmi_addr <= dmi_din(31 downto 0); + do_dmi_log_rd <= '1'; end if; else report("DMI read from " & to_string(dmi_addr)); end if; + + elsif dmi_read_log_data = '0' and dmi_read_log_data_1 = '1' then + -- Increment log_dmi_addr after the end of a read from DBG_CORE_LOG_DATA + 
log_dmi_addr(LOG_INDEX_BITS + 1 downto 0) <= + std_ulogic_vector(unsigned(log_dmi_addr(LOG_INDEX_BITS+1 downto 0)) + 1); + do_dmi_log_rd <= '1'; end if; + dmi_read_log_data_1 <= dmi_read_log_data; + if dmi_req = '1' and dmi_addr = DBG_CORE_LOG_DATA then + dmi_read_log_data <= '1'; + else + dmi_read_log_data <= '0'; + end if; -- Set core stop on terminate. We'll be stopping some time *after* -- the offending instruction, at least until we can do back flushes @@ -175,5 +240,51 @@ begin core_rst <= do_reset; icache_rst <= do_icreset; terminated_out <= terminated; + + -- Use MSB of read addresses to stop the logging + log_wr_enable <= not (log_read_addr(31) or log_dmi_addr(31)); + + log_ram: process(clk) + begin + if rising_edge(clk) then + if log_wr_enable = '1' then + log_array(to_integer(log_wr_ptr)) <= log_data; + end if; + log_rd <= log_array(to_integer(log_rd_ptr_latched)); + end if; + end process; + + + log_buffer: process(clk) + variable b : integer; + variable data : std_ulogic_vector(255 downto 0); + begin + if rising_edge(clk) then + if rst = '1' then + log_wr_ptr <= (others => '0'); + log_toggle <= '0'; + elsif log_wr_enable = '1' then + if log_wr_ptr = to_unsigned(LOG_LENGTH - 1, LOG_INDEX_BITS) then + log_toggle <= not log_toggle; + end if; + log_wr_ptr <= log_wr_ptr + 1; + end if; + if do_dmi_log_rd = '1' then + log_rd_ptr_latched <= unsigned(log_dmi_addr(LOG_INDEX_BITS + 1 downto 2)); + else + log_rd_ptr_latched <= unsigned(log_read_addr(LOG_INDEX_BITS + 1 downto 2)); + end if; + if log_dmi_read_done = '1' then + log_dmi_data <= select_dword(log_rd, log_dmi_addr); + else + log_read_data <= select_dword(log_rd, log_read_addr); + end if; + log_dmi_read_done <= log_dmi_reading; + log_dmi_reading <= do_dmi_log_rd; + end if; + end process; + log_write_addr(LOG_INDEX_BITS - 1 downto 0) <= std_ulogic_vector(log_wr_ptr); + log_write_addr(LOG_INDEX_BITS) <= '1'; + log_write_addr(31 downto LOG_INDEX_BITS + 1) <= (others => '0'); end behave; diff --git 
a/cr_file.vhdl b/cr_file.vhdl index a6dd585..37fa76b 100644 --- a/cr_file.vhdl +++ b/cr_file.vhdl @@ -18,7 +18,9 @@ entity cr_file is w_in : in WritebackToCrFileType; -- debug - sim_dump : in std_ulogic + sim_dump : in std_ulogic; + + log_out : out std_ulogic_vector(12 downto 0) ); end entity cr_file; @@ -27,6 +29,7 @@ architecture behaviour of cr_file is signal crs_updated : std_ulogic_vector(31 downto 0); signal xerc : xer_common_t := xerc_init; signal xerc_updated : xer_common_t; + signal log_data : std_ulogic_vector(12 downto 0); begin cr_create_0: process(all) variable hi, lo : integer := 0; @@ -88,4 +91,14 @@ begin end process; end generate; + cr_log: process(clk) + begin + if rising_edge(clk) then + log_data <= w_in.write_cr_enable & + w_in.write_cr_data(31 downto 28) & + w_in.write_cr_mask; + end if; + end process; + log_out <= log_data; + end architecture behaviour; diff --git a/dcache.vhdl b/dcache.vhdl index 9df5562..7a8c0ba 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -46,7 +46,9 @@ entity dcache is stall_out : out std_ulogic; wishbone_out : out wishbone_master_out; - wishbone_in : in wishbone_slave_out + wishbone_in : in wishbone_slave_out; + + log_out : out std_ulogic_vector(19 downto 0) ); end entity dcache; @@ -419,6 +421,8 @@ architecture rtl of dcache is ptes(j + TLB_PTE_BITS - 1 downto j) := newpte; end; + signal log_data : std_ulogic_vector(19 downto 0); + begin assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; @@ -1265,4 +1269,22 @@ begin end if; end if; end process; + + dcache_log: process(clk) + begin + if rising_edge(clk) then + log_data <= r1.wb.adr(5 downto 3) & + wishbone_in.stall & + wishbone_in.ack & + r1.wb.stb & r1.wb.cyc & + d_out.error & + d_out.valid & + std_ulogic_vector(to_unsigned(op_t'pos(req_op), 3)) & + stall_out & + std_ulogic_vector(to_unsigned(tlb_hit_way, 3)) & + valid_ra & + std_ulogic_vector(to_unsigned(state_t'pos(r1.state), 3)); + end if; + end process; + log_out <= 
log_data; end; diff --git a/decode1.vhdl b/decode1.vhdl index b6da5d7..3e3b41a 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -15,7 +15,9 @@ entity decode1 is flush_in : in std_ulogic; f_in : in Fetch2ToDecode1Type; - d_out : out Decode1ToDecode2Type + d_out : out Decode1ToDecode2Type; + + log_out : out std_ulogic_vector(12 downto 0) ); end entity decode1; @@ -352,6 +354,8 @@ architecture behaviour of decode1 is constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); constant fetch_fail_inst: decode_rom_t := (LDST, OP_FETCH_FAILED, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); + signal log_data : std_ulogic_vector(12 downto 0); + begin decode1_0: process(clk) begin @@ -474,4 +478,16 @@ begin -- Update outputs d_out <= r; end process; + + dec1_log : process(clk) + begin + if rising_edge(clk) then + log_data <= std_ulogic_vector(to_unsigned(insn_type_t'pos(r.decode.insn_type), 6)) & + r.nia(5 downto 2) & + std_ulogic_vector(to_unsigned(unit_t'pos(r.decode.unit), 2)) & + r.valid; + end if; + end process; + log_out <= log_data; + end architecture behaviour; diff --git a/decode2.vhdl b/decode2.vhdl index f889a23..2c02a75 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -32,7 +32,9 @@ entity decode2 is r_out : out Decode2ToRegisterFileType; c_in : in CrFileToDecode2Type; - c_out : out Decode2ToCrFileType + c_out : out Decode2ToCrFileType; + + log_out : out std_ulogic_vector(9 downto 0) ); end entity decode2; @@ -43,6 +45,8 @@ architecture behaviour of decode2 is signal r, rin : reg_type; + signal log_data : std_ulogic_vector(9 downto 0); + type decode_input_reg_t is record reg_valid : std_ulogic; reg : gspr_index_t; @@ -381,4 +385,19 @@ begin -- Update outputs e_out <= r.e; end process; + + dec2_log : process(clk) + begin + if rising_edge(clk) then + log_data <= r.e.nia(5 downto 2) & + r.e.valid & + 
stopped_out & + stall_out & + r.e.bypass_data3 & + r.e.bypass_data2 & + r.e.bypass_data1; + end if; + end process; + log_out <= log_data; + end architecture behaviour; diff --git a/execute1.vhdl b/execute1.vhdl index cac8e8a..9066aa0 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -36,7 +36,12 @@ entity execute1 is dbg_msr_out : out std_ulogic_vector(63 downto 0); icache_inval : out std_ulogic; - terminate_out : out std_ulogic + terminate_out : out std_ulogic; + + log_out : out std_ulogic_vector(14 downto 0); + log_rd_addr : out std_ulogic_vector(31 downto 0); + log_rd_data : in std_ulogic_vector(63 downto 0); + log_wr_addr : in std_ulogic_vector(31 downto 0) ); end entity execute1; @@ -53,6 +58,7 @@ architecture behaviour of execute1 is slow_op_oe : std_ulogic; slow_op_xerc : xer_common_t; ldst_nia : std_ulogic_vector(63 downto 0); + log_addr_spr : std_ulogic_vector(31 downto 0); end record; constant reg_type_init : reg_type := (e => Execute1ToWritebackInit, lr_update => '0', @@ -83,6 +89,11 @@ architecture behaviour of execute1 is signal x_to_divider: Execute1ToDividerType; signal divider_to_x: DividerToExecute1Type; + -- signals for logging + signal exception_log : std_ulogic; + signal irq_valid_log : std_ulogic; + signal log_data : std_ulogic_vector(14 downto 0); + type privilege_level is (USER, SUPER); type op_privilege_array is array(insn_type_t) of privilege_level; constant op_privilege: op_privilege_array := ( @@ -223,6 +234,7 @@ begin ); dbg_msr_out <= ctrl.msr; + log_rd_addr <= r.log_addr_spr; a_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data1 = '1' else e_in.read_data1; b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2; @@ -767,6 +779,11 @@ begin result := ctrl.tb; when SPR_DEC => result := ctrl.dec; + when 724 => -- LOG_ADDR SPR + result := log_wr_addr & r.log_addr_spr; + when 725 => -- LOG_DATA SPR + result := log_rd_data; + v.log_addr_spr := std_ulogic_vector(unsigned(r.log_addr_spr) + 1); when others 
=> -- mfspr from unimplemented SPRs should be a nop in -- supervisor mode and a program interrupt for user mode @@ -840,6 +857,8 @@ begin case decode_spr_num(e_in.insn) is when SPR_DEC => ctrl_tmp.dec <= c_in; + when 724 => -- LOG_ADDR SPR + v.log_addr_spr := c_in(31 downto 0); when others => -- mtspr to unimplemented SPRs should be a nop in -- supervisor mode and a program interrupt for user mode @@ -1040,5 +1059,26 @@ begin l_out <= lv; e_out <= r.e; flush_out <= f_out.redirect; + + exception_log <= exception; + irq_valid_log <= irq_valid; + end process; + + ex1_log : process(clk) + begin + if rising_edge(clk) then + log_data <= ctrl.msr(MSR_EE) & ctrl.msr(MSR_PR) & + ctrl.msr(MSR_IR) & ctrl.msr(MSR_DR) & + exception_log & + irq_valid_log & + std_ulogic_vector(to_unsigned(irq_state_t'pos(ctrl.irq_state), 1)) & + "000" & + r.e.write_enable & + r.e.valid & + f_out.redirect & + stall_out & + flush_out; + end if; end process; + log_out <= log_data; end architecture behaviour; diff --git a/fetch1.vhdl b/fetch1.vhdl index cb1d1df..758db24 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -24,7 +24,10 @@ entity fetch1 is e_in : in Execute1ToFetch1Type; -- Request to icache - i_out : out Fetch1ToIcacheType + i_out : out Fetch1ToIcacheType; + + -- outputs to logger + log_out : out std_ulogic_vector(42 downto 0) ); end entity fetch1; @@ -35,11 +38,13 @@ architecture behaviour of fetch1 is end record; signal r, r_next : Fetch1ToIcacheType; signal r_int, r_next_int : reg_internal_t; + signal log_nia : std_ulogic_vector(42 downto 0); begin regs : process(clk) begin if rising_edge(clk) then + log_nia <= r.nia(63) & r.nia(43 downto 2); if r /= r_next then report "fetch1 rst:" & std_ulogic'image(rst) & " IR:" & std_ulogic'image(e_in.virt_mode) & @@ -54,6 +59,7 @@ begin r_int <= r_next_int; end if; end process; + log_out <= log_nia; comb : process(all) variable v : Fetch1ToIcacheType; diff --git a/icache.vhdl b/icache.vhdl index 27f8c6a..2107d5a 100644 --- a/icache.vhdl +++ 
b/icache.vhdl @@ -57,7 +57,9 @@ entity icache is inval_in : in std_ulogic; wishbone_out : out wishbone_master_out; - wishbone_in : in wishbone_slave_out + wishbone_in : in wishbone_slave_out; + + log_out : out std_ulogic_vector(53 downto 0) ); end entity icache; @@ -198,6 +200,9 @@ architecture rtl of icache is signal priv_fault : std_ulogic; signal access_ok : std_ulogic; + -- Output data to logger + signal log_data : std_ulogic_vector(53 downto 0); + -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; signal cache_out : cache_ram_out_t; @@ -674,4 +679,36 @@ begin end if; end if; end process; + + data_log: process(clk) + variable lway: way_t; + variable wstate: std_ulogic; + begin + if rising_edge(clk) then + if req_is_hit then + lway := req_hit_way; + else + lway := replace_way; + end if; + wstate := '0'; + if r.state /= IDLE then + wstate := '1'; + end if; + log_data <= i_out.valid & + i_out.insn & + wishbone_in.ack & + r.wb.adr(5 downto 3) & + r.wb.stb & r.wb.cyc & + wishbone_in.stall & + stall_out & + r.fetch_failed & + r.hit_nia(5 downto 2) & + wstate & + std_ulogic_vector(to_unsigned(lway, 3)) & + req_is_hit & req_is_miss & + access_ok & + ra_valid; + end if; + end process; + log_out <= log_data; end; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index e71ad74..6e71df9 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -25,7 +25,9 @@ entity loadstore1 is m_in : in MmuToLoadstore1Type; dc_stall : in std_ulogic; - stall_out : out std_ulogic + stall_out : out std_ulogic; + + log_out : out std_ulogic_vector(9 downto 0) ); end loadstore1; @@ -80,6 +82,8 @@ architecture behave of loadstore1 is signal r, rin : reg_stage_t; signal lsu_sum : std_ulogic_vector(63 downto 0); + signal log_data : std_ulogic_vector(9 downto 0); + -- Generate byte enables from sizes function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is begin @@ -516,4 +520,18 @@ begin end process; + ls1_log: process(clk) + begin + if 
rising_edge(clk) then + log_data <= stall_out & + e_out.exception & + l_out.valid & + m_out.valid & + d_out.valid & + m_in.done & + r.dwords_done & + std_ulogic_vector(to_unsigned(state_t'pos(r.state), 3)); + end if; + end process; + log_out <= log_data; end; diff --git a/register_file.vhdl b/register_file.vhdl index 4df032c..260255e 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -24,7 +24,9 @@ entity register_file is -- debug sim_dump : in std_ulogic; - sim_dump_done : out std_ulogic + sim_dump_done : out std_ulogic; + + log_out : out std_ulogic_vector(70 downto 0) ); end entity register_file; @@ -34,6 +36,7 @@ architecture behaviour of register_file is signal rd_port_b : std_ulogic_vector(63 downto 0); signal dbg_data : std_ulogic_vector(63 downto 0); signal dbg_ack : std_ulogic; + signal log_data : std_ulogic_vector(70 downto 0); begin -- synchronous writes register_write_0: process(clk) @@ -131,4 +134,13 @@ begin sim_dump_done <= '0'; end generate; + reg_log: process(clk) + begin + if rising_edge(clk) then + log_data <= w_in.write_data & + w_in.write_enable & + w_in.write_reg; + end if; + end process; + log_out <= log_data; end architecture behaviour; diff --git a/scripts/fmt_log/Makefile b/scripts/fmt_log/Makefile new file mode 100644 index 0000000..04d1e9a --- /dev/null +++ b/scripts/fmt_log/Makefile @@ -0,0 +1,12 @@ +CFLAGS = -O2 -g -Wall -std=c99 + +all: fmt_log + +fmt_log: fmt_log.c + $(CC) -o $@ $^ $(CFLAGS) + +clean: + rm -f fmt_log +distclean: + rm -f *~ + diff --git a/scripts/fmt_log/fmt_log.c b/scripts/fmt_log/fmt_log.c new file mode 100644 index 0000000..c8fb501 --- /dev/null +++ b/scripts/fmt_log/fmt_log.c @@ -0,0 +1,235 @@ +#include +#include +#include + +typedef unsigned long long u64; + +struct log_entry { + u64 nia_lo: 42; + u64 nia_hi: 1; + u64 ic_ra_valid: 1; + u64 ic_access_ok: 1; + u64 ic_is_miss: 1; + u64 ic_is_hit: 1; + u64 ic_way: 3; + u64 ic_state: 1; + u64 ic_part_nia: 4; + u64 ic_fetch_failed: 1; + u64 ic_stall_out: 1; + 
u64 ic_wb_stall: 1; + u64 ic_wb_cyc: 1; + u64 ic_wb_stb: 1; + u64 ic_wb_adr: 3; + u64 ic_wb_ack: 1; + + u64 ic_insn: 32; + u64 ic_valid: 1; + u64 d1_valid: 1; + u64 d1_unit: 2; + u64 d1_part_nia: 4; + u64 d1_insn_type: 6; + u64 d2_bypass_a: 1; + u64 d2_bypass_b: 1; + u64 d2_bypass_c: 1; + u64 d2_stall_out: 1; + u64 d2_stopped_out: 1; + u64 d2_valid: 1; + u64 d2_part_nia: 4; + u64 e1_flush_out: 1; + u64 e1_stall_out: 1; + u64 e1_redirect: 1; + u64 e1_valid: 1; + u64 e1_write_enable: 1; + u64 e1_unused: 3; + + u64 e1_irq_state: 1; + u64 e1_irq: 1; + u64 e1_exception: 1; + u64 e1_msr_dr: 1; + u64 e1_msr_ir: 1; + u64 e1_msr_pr: 1; + u64 e1_msr_ee: 1; + u64 pad1: 5; + u64 ls_state: 3; + u64 ls_dw_done: 1; + u64 ls_min_done: 1; + u64 ls_do_valid: 1; + u64 ls_mo_valid: 1; + u64 ls_lo_valid: 1; + u64 ls_eo_except: 1; + u64 ls_stall_out: 1; + u64 pad2: 2; + u64 dc_state: 3; + u64 dc_ra_valid: 1; + u64 dc_tlb_way: 3; + u64 dc_stall_out: 1; + u64 dc_op: 3; + u64 dc_do_valid: 1; + u64 dc_do_error: 1; + u64 dc_wb_cyc: 1; + u64 dc_wb_stb: 1; + u64 dc_wb_ack: 1; + u64 dc_wb_stall: 1; + u64 dc_wb_adr: 3; + u64 cr_wr_mask: 8; + u64 cr_wr_data: 4; + u64 cr_wr_enable: 1; + u64 reg_wr_reg: 6; + u64 reg_wr_enable: 1; + + u64 reg_wr_data; +}; + +#define FLAG(i, y) (log.i? y: ' ') +#define FLGA(i, y, z) (log.i? 
y: z) +#define PNIA(f) (full_nia[log.f] & 0xff) + +const char *units[4] = { "--", "al", "ls", "?3" }; +const char *ops[64] = +{ + "illegal", "nop ", "add ", "and ", "attn ", "b ", "bc ", "bcreg ", + "bperm ", "cmp ", "cmpb ", "cmpeqb ", "cmprb ", "cntz ", "crop ", "darn ", + "dcbf ", "dcbst ", "dcbt ", "dcbtst ", "dcbz ", "div ", "dive ", "exts ", + "extswsl", "icbi ", "icbt ", "isel ", "isync ", "ld ", "st ", "maddhd ", + "maddhdu", "maddld ", "mcrxr ", "mcrxrx ", "mfcr ", "mfmsr ", "mfspr ", "mod ", + "mtcrf ", "mtmsr ", "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", "or ", "popcnt ", + "prty ", "rfid ", "rlc ", "rlcl ", "rlcr ", "sc ", "setb ", "shl ", + "shr ", "sync ", "tlbie ", "trap ", "xor ", "ffail ", "?62 ", "?63 " +}; + +const char *spr_names[13] = +{ + "lr ", "ctr", "sr0", "sr1", "hr0", "hr1", "sg0", "sg1", + "sg2", "sg3", "hg0", "hg1", "xer" +}; + +int main(int ac, char **av) +{ + struct log_entry log; + u64 full_nia[16]; + long int lineno = 1; + FILE *f; + const char *filename; + int i; + long int ncompl = 0; + + if (ac != 1 && ac != 2) { + fprintf(stderr, "Usage: %s [filename]\n", av[0]); + exit(1); + } + f = stdin; + if (ac == 2) { + filename = av[1]; + f = fopen(filename, "rb"); + if (f == NULL) { + perror(filename); + exit(1); + } + } + + for (i = 0; i < 16; ++i) /* seed every slot: the part_nia fields used by PNIA() are 4 bits wide */ + full_nia[i] = i << 2; + + while (fread(&log, sizeof(log), 1, f) == 1) { + full_nia[log.nia_lo & 0xf] = (log.nia_hi? 0xc000000000000000: 0) | + (log.nia_lo << 2); + if (lineno % 20 == 1) { + printf(" fetch1 NIA icache decode1 decode2 execute1 loadstore dcache CR GSPR\n"); + printf(" ---------------- TAHW S -WB-- pN --insn-- pN un op pN byp FR IIE MSR WC SD MM CE SRTO DE -WB-- c ms reg val\n"); + printf(" LdMy t csnSa IA IA it IA abc le srx EPID em tw rd mx tAwp vr csnSa 0 k\n"); + } + printf("%4ld %c0000%.11llx %c ", lineno, + (log.nia_hi?
'c': '0'), + (unsigned long long)log.nia_lo << 2, + FLAG(ic_stall_out, '|')); + printf("%c%c%c%d %c %c%c%d%c%c %.2llx ", + FLGA(ic_ra_valid, ' ', 'T'), + FLGA(ic_access_ok, ' ', 'X'), + FLGA(ic_is_hit, 'H', FLGA(ic_is_miss, 'M', ' ')), + log.ic_way, + FLAG(ic_state, 'W'), + FLAG(ic_wb_cyc, 'c'), + FLAG(ic_wb_stb, 's'), + log.ic_wb_adr, + FLAG(ic_wb_stall, 'S'), + FLAG(ic_wb_ack, 'a'), + PNIA(ic_part_nia)); + if (log.ic_valid) + printf("%.8x", log.ic_insn); + else if (log.ic_fetch_failed) + printf("!!!!!!!!"); + else + printf("--------"); + printf(" %c%c %.2llx ", + FLAG(ic_valid, '>'), + FLAG(d2_stall_out, '|'), + PNIA(d1_part_nia)); + if (log.d1_valid) + printf("%s %s", + units[log.d1_unit], + ops[log.d1_insn_type]); + else + printf("-- -------"); + printf(" %c%c ", + FLAG(d1_valid, '>'), + FLAG(d2_stall_out, '|')); + printf("%.2llx %c%c%c %c%c ", + PNIA(d2_part_nia), + FLAG(d2_bypass_a, 'a'), + FLAG(d2_bypass_b, 'b'), + FLAG(d2_bypass_c, 'c'), + FLAG(d2_valid, '>'), + FLAG(e1_stall_out, '|')); + printf("%c%c %c%c%c %c%c%c%c %c%c ", + FLAG(e1_flush_out, 'F'), + FLAG(e1_redirect, 'R'), + FLAG(e1_irq_state, 'w'), + FLAG(e1_irq, 'I'), + FLAG(e1_exception, 'X'), + FLAG(e1_msr_ee, 'E'), + FLGA(e1_msr_pr, 'u', 's'), + FLAG(e1_msr_ir, 'I'), + FLAG(e1_msr_dr, 'D'), + FLAG(e1_write_enable, 'W'), + FLAG(e1_valid, 'C')); + printf("%c %d%d %c%c %c%c %c ", + FLAG(ls_stall_out, '|'), + log.ls_state, + log.ls_dw_done, + FLAG(ls_mo_valid, 'M'), + FLAG(ls_min_done, 'm'), + FLAG(ls_lo_valid, 'C'), + FLAG(ls_eo_except, 'X'), + FLAG(ls_do_valid, '>')); + printf("%d%c%d%d %c%c %c%c%d%c%c ", + log.dc_state, + FLAG(dc_ra_valid, 'R'), + log.dc_tlb_way, + log.dc_op, + FLAG(dc_do_valid, 'V'), + FLAG(dc_do_error, 'E'), + FLAG(dc_wb_cyc, 'c'), + FLAG(dc_wb_stb, 's'), + log.dc_wb_adr, + FLAG(dc_wb_stall, 'S'), + FLAG(dc_wb_ack, 'a')); + if (log.cr_wr_enable) + printf("%x>%.2x ", log.cr_wr_data, log.cr_wr_mask); + else + printf(" "); + if (log.reg_wr_enable) { + if (log.reg_wr_reg < 32 || 
log.reg_wr_reg > 44) + printf("r%02d", log.reg_wr_reg); + else + printf("%s", spr_names[log.reg_wr_reg - 32]); + printf("=%.16llx", log.reg_wr_data); + } + printf("\n"); + ++lineno; + if (log.ls_lo_valid || log.e1_valid) + ++ncompl; + } + printf("%ld instructions completed, %.2f CPI\n", ncompl, + (double)(lineno - 1) / ncompl); + exit(0); +} diff --git a/scripts/mw_debug/mw_debug.c b/scripts/mw_debug/mw_debug.c index 7f77558..28e43b4 100644 --- a/scripts/mw_debug/mw_debug.c +++ b/scripts/mw_debug/mw_debug.c @@ -42,6 +42,9 @@ #define DBG_CORE_GSPR_INDEX 0x14 #define DBG_CORE_GSPR_DATA 0x15 +#define DBG_LOG_ADDR 0x16 +#define DBG_LOG_DATA 0x17 + static bool debug; struct backend { @@ -548,6 +551,73 @@ static void save(const char *filename, uint64_t addr, uint64_t size) printf("%x done.\n", count); } +#define LOG_STOP 0x80000000ull + +static void log_start(void) +{ + check(dmi_write(DBG_LOG_ADDR, 0), "writing LOG_ADDR"); +} + +static void log_stop(void) +{ + uint64_t lsize, laddr, waddr; + + check(dmi_write(DBG_LOG_ADDR, LOG_STOP), "writing LOG_ADDR"); + check(dmi_read(DBG_LOG_ADDR, &laddr), "reading LOG_ADDR"); + waddr = laddr >> 32; + for (lsize = 1; lsize; lsize <<= 1) + if ((waddr >> 1) < lsize) + break; + waddr &= ~lsize; + printf("Log size = %" PRIu64 " entries, ", lsize); + printf("write ptr = %" PRIx64 "\n", waddr); +} + +static void log_dump(const char *filename) +{ + FILE *f; + uint64_t lsize, laddr, waddr; + uint64_t orig_laddr; + uint64_t i, ldata; + + f = fopen(filename, "wb"); /* binary mode: the dump is raw 64-bit words, which fmt_log reads back with "rb" */ + if (f == NULL) { + fprintf(stderr, "Failed to create '%s': %s\n", filename, + strerror(errno)); + exit(1); + } + + check(dmi_read(DBG_LOG_ADDR, &orig_laddr), "reading LOG_ADDR"); + if (!(orig_laddr & LOG_STOP)) + check(dmi_write(DBG_LOG_ADDR, LOG_STOP), "writing LOG_ADDR"); + + waddr = orig_laddr >> 32; + for (lsize = 1; lsize; lsize <<= 1) + if ((waddr >> 1) < lsize) + break; + waddr &= ~lsize; + printf("Log size = %" PRIu64 " entries\n", lsize); + + laddr = LOG_STOP | (waddr <<
2); + check(dmi_write(DBG_LOG_ADDR, laddr), "writing LOG_ADDR"); + + for (i = 0; i < lsize * 4; ++i) { + check(dmi_read(DBG_LOG_DATA, &ldata), "reading LOG_DATA"); + if (fwrite(&ldata, sizeof(ldata), 1, f) != 1) { + fprintf(stderr, "Write error on %s\n", filename); + exit(1); + } + if (!(i % 128)) { + printf("%" PRIu64 "...\r", i * 8); + fflush(stdout); + } + } + fclose(f); + printf("%" PRIu64 " done\n", lsize * 32); + + check(dmi_write(DBG_LOG_ADDR, orig_laddr), "writing LOG_ADDR"); +} + static void usage(const char *cmd) { fprintf(stderr, "Usage: %s -b \n", cmd); @@ -572,6 +642,12 @@ static void usage(const char *cmd) fprintf(stderr, " gpr [count]\n"); fprintf(stderr, " status\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " Core logging:\n"); + fprintf(stderr, " lstart start logging\n"); + fprintf(stderr, " lstop stop logging\n"); + fprintf(stderr, " ldump dump log to file\n"); + fprintf(stderr, "\n"); fprintf(stderr, " JTAG:\n"); fprintf(stderr, " dmiread \n"); @@ -710,6 +786,17 @@ int main(int argc, char *argv[]) if (((i+1) < argc) && isdigit(argv[i+1][0])) count = strtoul(argv[++i], NULL, 10); gpr_read(reg, count); + } else if (strcmp(argv[i], "lstart") == 0) { + log_start(); + } else if (strcmp(argv[i], "lstop") == 0) { + log_stop(); + } else if (strcmp(argv[i], "ldump") == 0) { + const char *filename; + + if ((i+1) >= argc) + usage(argv[0]); + filename = argv[++i]; + log_dump(filename); } else { fprintf(stderr, "Unknown command %s\n", argv[i]); exit(1); From b5a7dbb78dff640ee18b6662ea007a946a4ebb09 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sun, 10 May 2020 18:18:03 +1000 Subject: [PATCH 05/22] core: Remove fetch2 pipeline stage The fetch2 stage existed primarily to provide a stash buffer for the output of icache when a stall occurred. 
However, we can get the same effect -- of having the input to decode1 stay unchanged on a stall cycle -- by using the read enable of the BRAMs in icache, and by adding logic to keep the outputs unchanged on a clock cycle when stall_in = 1. This reduces branch and interrupt latency by one cycle. Signed-off-by: Paul Mackerras --- Makefile | 2 +- common.vhdl | 12 +---- core.vhdl | 25 +++------- decode1.vhdl | 3 +- fetch2.vhdl | 123 ------------------------------------------------- icache.vhdl | 52 ++++++++++++--------- icache_tb.vhdl | 3 +- microwatt.core | 1 - 8 files changed, 41 insertions(+), 180 deletions(-) delete mode 100644 fetch2.vhdl diff --git a/Makefile b/Makefile index 692704e..1e4b558 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ all = core_tb icache_tb dcache_tb multiply_tb dmi_dtm_tb divider_tb \ all: $(all) core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \ - fetch2.vhdl utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \ + utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \ decode1.vhdl helpers.vhdl insn_helpers.vhdl gpr_hazard.vhdl \ cr_hazard.vhdl control.vhdl decode2.vhdl register_file.vhdl \ cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \ diff --git a/common.vhdl b/common.vhdl index a6b3f95..7236a56 100644 --- a/common.vhdl +++ b/common.vhdl @@ -96,7 +96,7 @@ package common is nia: std_ulogic_vector(63 downto 0); end record; - type IcacheToFetch2Type is record + type IcacheToDecode1Type is record valid: std_ulogic; stop_mark: std_ulogic; fetch_failed: std_ulogic; @@ -104,16 +104,6 @@ package common is insn: std_ulogic_vector(31 downto 0); end record; - type Fetch2ToDecode1Type is record - valid: std_ulogic; - stop_mark : std_ulogic; - fetch_failed: std_ulogic; - nia: std_ulogic_vector(63 downto 0); - insn: std_ulogic_vector(31 downto 0); - end record; - constant Fetch2ToDecode1Init : Fetch2ToDecode1Type := (valid => '0', stop_mark => '0', fetch_failed => '0', - nia => (others => '0'), insn => (others => 
'0')); - type Decode1ToDecode2Type is record valid: std_ulogic; stop_mark : std_ulogic; diff --git a/core.vhdl b/core.vhdl index da9853f..5517959 100644 --- a/core.vhdl +++ b/core.vhdl @@ -41,12 +41,9 @@ entity core is end core; architecture behave of core is - -- fetch signals - signal fetch2_to_decode1: Fetch2ToDecode1Type; - -- icache signals signal fetch1_to_icache : Fetch1ToIcacheType; - signal icache_to_fetch2 : IcacheToFetch2Type; + signal icache_to_decode1 : IcacheToDecode1Type; signal mmu_to_icache : MmuToIcacheType; -- decode signals @@ -83,7 +80,7 @@ architecture behave of core is -- local signals signal fetch1_stall_in : std_ulogic; signal icache_stall_out : std_ulogic; - signal fetch2_stall_in : std_ulogic; + signal icache_stall_in : std_ulogic; signal decode1_stall_in : std_ulogic; signal decode2_stall_in : std_ulogic; signal decode2_stall_out : std_ulogic; @@ -145,7 +142,6 @@ architecture behave of core is attribute keep_hierarchy : string; attribute keep_hierarchy of fetch1_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of icache_0 : label is keep_h(DISABLE_FLATTEN); - attribute keep_hierarchy of fetch2_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of decode1_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of decode2_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN); @@ -206,27 +202,18 @@ begin clk => clk, rst => rst_icache, i_in => fetch1_to_icache, - i_out => icache_to_fetch2, + i_out => icache_to_decode1, m_in => mmu_to_icache, flush_in => flush, inval_in => dbg_icache_rst or ex1_icache_inval, + stall_in => icache_stall_in, stall_out => icache_stall_out, wishbone_out => wishbone_insn_out, wishbone_in => wishbone_insn_in, log_out => log_data(96 downto 43) ); - fetch2_0: entity work.fetch2 - port map ( - clk => clk, - rst => rst_fetch2, - stall_in => fetch2_stall_in, - flush_in => flush, - i_in => icache_to_fetch2, - f_out => 
fetch2_to_decode1 - ); - - fetch2_stall_in <= decode2_stall_out; + icache_stall_in <= decode2_stall_out; decode1_0: entity work.decode1 port map ( @@ -234,7 +221,7 @@ begin rst => rst_dec1, stall_in => decode1_stall_in, flush_in => flush, - f_in => fetch2_to_decode1, + f_in => icache_to_decode1, d_out => decode1_to_decode2, log_out => log_data(109 downto 97) ); diff --git a/decode1.vhdl b/decode1.vhdl index 3e3b41a..214285e 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -14,9 +14,8 @@ entity decode1 is stall_in : in std_ulogic; flush_in : in std_ulogic; - f_in : in Fetch2ToDecode1Type; + f_in : in IcacheToDecode1Type; d_out : out Decode1ToDecode2Type; - log_out : out std_ulogic_vector(12 downto 0) ); end entity decode1; diff --git a/fetch2.vhdl b/fetch2.vhdl deleted file mode 100644 index 13ff56e..0000000 --- a/fetch2.vhdl +++ /dev/null @@ -1,123 +0,0 @@ -library ieee; -use ieee.std_logic_1164.all; -use ieee.numeric_std.all; - -library work; -use work.common.all; -use work.wishbone_types.all; - -entity fetch2 is - port( - clk : in std_ulogic; - rst : in std_ulogic; - - stall_in : in std_ulogic; - flush_in : in std_ulogic; - - -- Results from icache - i_in : in IcacheToFetch2Type; - - -- Output to decode - f_out : out Fetch2ToDecode1Type - ); -end entity fetch2; - -architecture behaviour of fetch2 is - - -- The icache cannot stall, so we need to stash a cycle - -- of output from it when we stall. 
- type reg_internal_type is record - stash : IcacheToFetch2Type; - stash_valid : std_ulogic; - stopped : std_ulogic; - end record; - - signal r_int, rin_int : reg_internal_type; - signal r, rin : Fetch2ToDecode1Type; - -begin - regs : process(clk) - begin - if rising_edge(clk) then - - if (r /= rin) then - report "fetch2 rst:" & std_ulogic'image(rst) & - " S:" & std_ulogic'image(stall_in) & - " F:" & std_ulogic'image(flush_in) & - " T:" & std_ulogic'image(rin.stop_mark) & - " V:" & std_ulogic'image(rin.valid) & - " FF:" & std_ulogic'image(rin.fetch_failed) & - " nia:" & to_hstring(rin.nia); - end if; - - -- Output state remains unchanged on stall, unless we are flushing - if rst = '1' or flush_in = '1' or stall_in = '0' then - r <= rin; - end if; - - -- Internal state is updated on every clock - r_int <= rin_int; - end if; - end process; - - comb : process(all) - variable v : Fetch2ToDecode1Type; - variable v_int : reg_internal_type; - variable v_i_in : IcacheToFetch2Type; - begin - v := r; - v_int := r_int; - - -- If stalling, stash away the current input from the icache - if stall_in = '1' and v_int.stash_valid = '0' then - v_int.stash := i_in; - v_int.stash_valid := '1'; - end if; - - -- If unstalling, source input from the stash and invalidate it, - -- otherwise source normally from the icache. - -- - v_i_in := i_in; - if v_int.stash_valid = '1' and stall_in = '0' then - v_i_in := v_int.stash; - v_int.stash_valid := '0'; - end if; - - v.valid := v_i_in.valid; - v.stop_mark := v_i_in.stop_mark; - v.fetch_failed := v_i_in.fetch_failed; - v.nia := v_i_in.nia; - v.insn := v_i_in.insn; - - -- Clear stash internal valid bit on flush. We still mark - -- the stash itself as valid since we still want to override - -- whatever comes form icache when unstalling, but we'll - -- override it with something invalid. 
- -- - if flush_in = '1' then - v_int.stash.valid := '0'; - v_int.stash.fetch_failed := '0'; - end if; - - -- If we are flushing or the instruction comes with a stop mark - -- we tag it as invalid so it doesn't get decoded and executed - if flush_in = '1' or v.stop_mark = '1' then - v.valid := '0'; - v.fetch_failed := '0'; - end if; - - -- Clear stash on reset - if rst = '1' then - v_int.stash_valid := '0'; - v.valid := '0'; - end if; - - -- Update registers - rin <= v; - rin_int <= v_int; - - -- Update outputs - f_out <= r; - end process; - -end architecture behaviour; diff --git a/icache.vhdl b/icache.vhdl index 2107d5a..e4f8448 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -48,10 +48,11 @@ entity icache is rst : in std_ulogic; i_in : in Fetch1ToIcacheType; - i_out : out IcacheToFetch2Type; + i_out : out IcacheToDecode1Type; m_in : in MmuToIcacheType; + stall_in : in std_ulogic; stall_out : out std_ulogic; flush_in : in std_ulogic; inval_in : in std_ulogic; @@ -366,7 +367,7 @@ begin ); process(all) begin - do_read <= '1'; + do_read <= not stall_in; do_write <= '0'; if wishbone_in.ack = '1' and r.store_way = i then do_write <= '1'; @@ -533,25 +534,32 @@ begin icache_hit : process(clk) begin if rising_edge(clk) then - -- On a hit, latch the request for the next cycle, when the BRAM data - -- will be available on the cache_out output of the corresponding way - -- - r.hit_valid <= req_is_hit; - -- Send stop marks and NIA down regardless of validity - r.hit_smark <= i_in.stop_mark; - r.hit_nia <= i_in.nia; - if req_is_hit = '1' then - r.hit_way <= req_hit_way; - r.hit_smark <= i_in.stop_mark; - - report "cache hit nia:" & to_hstring(i_in.nia) & - " IR:" & std_ulogic'image(i_in.virt_mode) & - " SM:" & std_ulogic'image(i_in.stop_mark) & - " idx:" & integer'image(req_index) & - " tag:" & to_hstring(req_tag) & - " way:" & integer'image(req_hit_way) & - " RA:" & to_hstring(real_addr); - end if; + -- keep outputs to fetch2 unchanged on a stall + -- except that flush or 
reset sets valid to 0 + if stall_in = '1' then + if rst = '1' or flush_in = '1' then + r.hit_valid <= '0'; + end if; + else + -- On a hit, latch the request for the next cycle, when the BRAM data + -- will be available on the cache_out output of the corresponding way + -- + r.hit_valid <= req_is_hit; + -- Send stop marks and NIA down regardless of validity + r.hit_smark <= i_in.stop_mark; + r.hit_nia <= i_in.nia; + if req_is_hit = '1' then + r.hit_way <= req_hit_way; + + report "cache hit nia:" & to_hstring(i_in.nia) & + " IR:" & std_ulogic'image(i_in.virt_mode) & + " SM:" & std_ulogic'image(i_in.stop_mark) & + " idx:" & integer'image(req_index) & + " tag:" & to_hstring(req_tag) & + " way:" & integer'image(req_hit_way) & + " RA:" & to_hstring(real_addr); + end if; + end if; end if; end process; @@ -674,7 +682,7 @@ begin -- TLB miss and protection fault processing if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then r.fetch_failed <= '0'; - elsif i_in.req = '1' and access_ok = '0' then + elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then r.fetch_failed <= '1'; end if; end if; diff --git a/icache_tb.vhdl b/icache_tb.vhdl index 39e28d5..1d179d6 100644 --- a/icache_tb.vhdl +++ b/icache_tb.vhdl @@ -13,7 +13,7 @@ architecture behave of icache_tb is signal rst : std_ulogic; signal i_out : Fetch1ToIcacheType; - signal i_in : IcacheToFetch2Type; + signal i_in : IcacheToDecode1Type; signal m_out : MmuToIcacheType; @@ -33,6 +33,7 @@ begin i_in => i_out, i_out => i_in, m_in => m_out, + stall_in => '0', flush_in => '0', inval_in => '0', wishbone_out => wb_bram_in, diff --git a/microwatt.core b/microwatt.core index 87ef39d..876f762 100644 --- a/microwatt.core +++ b/microwatt.core @@ -9,7 +9,6 @@ filesets: - wishbone_types.vhdl - common.vhdl - fetch1.vhdl - - fetch2.vhdl - decode1.vhdl - helpers.vhdl - decode2.vhdl From f80da65799c366edb88e20ec4f95f60c62ac3d94 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 3 Jun 2020 11:26:33 +1000 Subject: [PATCH 
06/22] core: Double the dcache and icache sizes This makes the dcache and icache both be 8kB. This still only uses one BRAM per way per cache on the Artix-7, since the BRAMs were only half-used previously. Signed-off-by: Paul Mackerras --- core.vhdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core.vhdl b/core.vhdl index 5517959..4d84b4a 100644 --- a/core.vhdl +++ b/core.vhdl @@ -195,7 +195,7 @@ begin generic map( SIM => SIM, LINE_SIZE => 64, - NUM_LINES => 32, + NUM_LINES => 64, NUM_WAYS => 2 ) port map( @@ -335,7 +335,7 @@ begin dcache_0: entity work.dcache generic map( LINE_SIZE => 64, - NUM_LINES => 32, + NUM_LINES => 64, NUM_WAYS => 2 ) port map ( From 9880fc743551a15aa2dd834b2472180e5280796e Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 21 May 2020 13:42:46 +1000 Subject: [PATCH 07/22] multiply: Move selection of result bits into execute1 This puts the logic that selects which bits of the multiplier result get written into the destination GPR into execute1, moved out from multiply. The multiplier is now expected to do an unsigned multiplication of 64-bit operands, optionally negate the result, detect 32-bit or 64-bit signed overflow of the result, and return a full 128-bit result. 
Signed-off-by: Paul Mackerras --- common.vhdl | 12 +++--- execute1.vhdl | 54 ++++++++++++-------------- multiply.vhdl | 59 ++++++++++------------------- multiply_tb.vhdl | 99 +++++++++++++++++++++++++++--------------------- 4 files changed, 107 insertions(+), 117 deletions(-) diff --git a/common.vhdl b/common.vhdl index 7236a56..82b3242 100644 --- a/common.vhdl +++ b/common.vhdl @@ -158,13 +158,13 @@ package common is type Execute1ToMultiplyType is record valid: std_ulogic; - insn_type: insn_type_t; - data1: std_ulogic_vector(64 downto 0); - data2: std_ulogic_vector(64 downto 0); + data1: std_ulogic_vector(63 downto 0); + data2: std_ulogic_vector(63 downto 0); is_32bit: std_ulogic; + neg_result: std_ulogic; end record; - constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, - is_32bit => '0', + constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0', + is_32bit => '0', neg_result => '0', others => (others => '0')); type Execute1ToDividerType is record @@ -356,7 +356,7 @@ package common is type MultiplyToExecute1Type is record valid: std_ulogic; - write_reg_data: std_ulogic_vector(63 downto 0); + result: std_ulogic_vector(127 downto 0); overflow : std_ulogic; end record; constant MultiplyToExecute1Init : MultiplyToExecute1Type := (valid => '0', overflow => '0', diff --git a/execute1.vhdl b/execute1.vhdl index 9066aa0..0009699 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -53,6 +53,7 @@ architecture behaviour of execute1 is mul_in_progress : std_ulogic; div_in_progress : std_ulogic; cntz_in_progress : std_ulogic; + slow_op_insn : insn_type_t; slow_op_dest : gpr_index_t; slow_op_rc : std_ulogic; slow_op_oe : std_ulogic; @@ -63,7 +64,7 @@ architecture behaviour of execute1 is constant reg_type_init : reg_type := (e => Execute1ToWritebackInit, lr_update => '0', mul_in_progress => '0', div_in_progress => '0', cntz_in_progress => '0', - slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init, + 
slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init, next_lr => (others => '0'), ldst_nia => (others => '0'), others => (others => '0')); signal r, rin : reg_type; @@ -346,32 +347,7 @@ begin v.div_in_progress := '0'; v.cntz_in_progress := '0'; - -- signals to multiply unit - x_to_multiply <= Execute1ToMultiplyInit; - x_to_multiply.insn_type <= e_in.insn_type; - x_to_multiply.is_32bit <= e_in.is_32bit; - - if e_in.is_32bit = '1' then - if e_in.is_signed = '1' then - x_to_multiply.data1 <= (others => a_in(31)); - x_to_multiply.data1(31 downto 0) <= a_in(31 downto 0); - x_to_multiply.data2 <= (others => b_in(31)); - x_to_multiply.data2(31 downto 0) <= b_in(31 downto 0); - else - x_to_multiply.data1 <= '0' & x"00000000" & a_in(31 downto 0); - x_to_multiply.data2 <= '0' & x"00000000" & b_in(31 downto 0); - end if; - else - if e_in.is_signed = '1' then - x_to_multiply.data1 <= a_in(63) & a_in; - x_to_multiply.data2 <= b_in(63) & b_in; - else - x_to_multiply.data1 <= '0' & a_in; - x_to_multiply.data2 <= '0' & b_in; - end if; - end if; - - -- signals to divide unit + -- signals to multiply and divide units sign1 := '0'; sign2 := '0'; if e_in.is_signed = '1' then @@ -395,15 +371,22 @@ begin abs2 := - signed(b_in); end if; + x_to_multiply <= Execute1ToMultiplyInit; + x_to_multiply.is_32bit <= e_in.is_32bit; + x_to_divider <= Execute1ToDividerInit; x_to_divider.is_signed <= e_in.is_signed; x_to_divider.is_32bit <= e_in.is_32bit; if e_in.insn_type = OP_MOD then x_to_divider.is_modulus <= '1'; end if; + + x_to_multiply.neg_result <= sign1 xor sign2; x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus); if e_in.is_32bit = '0' then -- 64-bit forms + x_to_multiply.data1 <= std_ulogic_vector(abs1); + x_to_multiply.data2 <= std_ulogic_vector(abs2); if e_in.insn_type = OP_DIVE then x_to_divider.is_extended <= '1'; end if; @@ -411,6 +394,8 @@ begin x_to_divider.divisor <= std_ulogic_vector(abs2); else -- 32-bit forms + 
x_to_multiply.data1 <= x"00000000" & std_ulogic_vector(abs1(31 downto 0)); + x_to_multiply.data2 <= x"00000000" & std_ulogic_vector(abs2(31 downto 0)); x_to_divider.is_extended <= '0'; if e_in.insn_type = OP_DIVE then -- extended forms x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000"; @@ -505,6 +490,7 @@ begin v.e.valid := '1'; v.e.write_reg := e_in.write_reg; + v.slow_op_insn := e_in.insn_type; v.slow_op_dest := gspr_to_gpr(e_in.write_reg); v.slow_op_rc := e_in.rc; v.slow_op_oe := e_in.oe; @@ -950,8 +936,18 @@ begin if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or (r.div_in_progress = '1' and divider_to_x.valid = '1') then if r.mul_in_progress = '1' then - result := multiply_to_x.write_reg_data; - overflow := multiply_to_x.overflow; + overflow := '0'; + case r.slow_op_insn is + when OP_MUL_H32 => + result := multiply_to_x.result(63 downto 32) & + multiply_to_x.result(63 downto 32); + when OP_MUL_H64 => + result := multiply_to_x.result(127 downto 64); + when others => + -- i.e. 
OP_MUL_L64 + result := multiply_to_x.result(63 downto 0); + overflow := multiply_to_x.overflow; + end case; else result := divider_to_x.write_reg_data; overflow := divider_to_x.overflow; diff --git a/multiply.vhdl b/multiply.vhdl index 959c114..7a4c81b 100644 --- a/multiply.vhdl +++ b/multiply.vhdl @@ -4,11 +4,10 @@ use ieee.numeric_std.all; library work; use work.common.all; -use work.decode_types.all; entity multiply is generic ( - PIPELINE_DEPTH : natural := 16 + PIPELINE_DEPTH : natural := 4 ); port ( clk : in std_logic; @@ -19,17 +18,16 @@ entity multiply is end entity multiply; architecture behaviour of multiply is - signal m: Execute1ToMultiplyType; + signal m: Execute1ToMultiplyType := Execute1ToMultiplyInit; type multiply_pipeline_stage is record valid : std_ulogic; - insn_type : insn_type_t; - data : signed(129 downto 0); + data : unsigned(127 downto 0); is_32bit : std_ulogic; + neg_res : std_ulogic; end record; constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0', - insn_type => OP_ILLEGAL, - is_32bit => '0', + is_32bit => '0', neg_res => '0', data => (others => '0')); type multiply_pipeline_type is array(0 to PIPELINE_DEPTH-1) of multiply_pipeline_stage; @@ -51,50 +49,35 @@ begin multiply_1: process(all) variable v : reg_type; - variable d : std_ulogic_vector(129 downto 0); + variable d : std_ulogic_vector(127 downto 0); variable d2 : std_ulogic_vector(63 downto 0); variable ov : std_ulogic; begin - v := r; - - m_out <= MultiplyToExecute1Init; - v.multiply_pipeline(0).valid := m.valid; - v.multiply_pipeline(0).insn_type := m.insn_type; - v.multiply_pipeline(0).data := signed(m.data1) * signed(m.data2); + v.multiply_pipeline(0).data := unsigned(m.data1) * unsigned(m.data2); v.multiply_pipeline(0).is_32bit := m.is_32bit; + v.multiply_pipeline(0).neg_res := m.neg_result; loop_0: for i in 1 to PIPELINE_DEPTH-1 loop v.multiply_pipeline(i) := r.multiply_pipeline(i-1); end loop; - d := 
std_ulogic_vector(v.multiply_pipeline(PIPELINE_DEPTH-1).data); - ov := '0'; + if v.multiply_pipeline(PIPELINE_DEPTH-1).neg_res = '0' then + d := std_ulogic_vector(v.multiply_pipeline(PIPELINE_DEPTH-1).data); + else + d := std_ulogic_vector(- signed(v.multiply_pipeline(PIPELINE_DEPTH-1).data)); + end if; - -- TODO: Handle overflows - case_0: case v.multiply_pipeline(PIPELINE_DEPTH-1).insn_type is - when OP_MUL_L64 => - d2 := d(63 downto 0); - if v.multiply_pipeline(PIPELINE_DEPTH-1).is_32bit = '1' then - ov := (or d(63 downto 31)) and not (and d(63 downto 31)); - else - ov := (or d(127 downto 63)) and not (and d(127 downto 63)); - end if; - when OP_MUL_H32 => - d2 := d(63 downto 32) & d(63 downto 32); - when OP_MUL_H64 => - d2 := d(127 downto 64); - when others => - --report "Illegal insn type in multiplier"; - d2 := (others => '0'); - end case; + ov := '0'; + if v.multiply_pipeline(PIPELINE_DEPTH-1).is_32bit = '1' then + ov := (or d(63 downto 31)) and not (and d(63 downto 31)); + else + ov := (or d(127 downto 63)) and not (and d(127 downto 63)); + end if; - m_out.write_reg_data <= d2; + m_out.result <= d; m_out.overflow <= ov; - - if v.multiply_pipeline(PIPELINE_DEPTH-1).valid = '1' then - m_out.valid <= '1'; - end if; + m_out.valid <= v.multiply_pipeline(PIPELINE_DEPTH-1).valid; rin <= v; end process; diff --git a/multiply_tb.vhdl b/multiply_tb.vhdl index ee80de0..87f029d 100644 --- a/multiply_tb.vhdl +++ b/multiply_tb.vhdl @@ -17,8 +17,18 @@ architecture behave of multiply_tb is constant pipeline_depth : integer := 4; - signal m1 : Execute1ToMultiplyType; + signal m1 : Execute1ToMultiplyType := Execute1ToMultiplyInit; signal m2 : MultiplyToExecute1Type; + + function absval(x: std_ulogic_vector) return std_ulogic_vector is + begin + if x(x'left) = '1' then + return std_ulogic_vector(- signed(x)); + else + return x; + end if; + end; + begin multiply_0: entity work.multiply generic map (PIPELINE_DEPTH => pipeline_depth) @@ -39,9 +49,8 @@ begin wait for clk_period; 
m1.valid <= '1'; - m1.insn_type <= OP_MUL_L64; - m1.data1 <= '0' & x"0000000000001000"; - m1.data2 <= '0' & x"0000000000001111"; + m1.data1 <= x"0000000000001000"; + m1.data2 <= x"0000000000001111"; wait for clk_period; assert m2.valid = '0'; @@ -56,7 +65,7 @@ begin wait for clk_period; assert m2.valid = '1'; - assert m2.write_reg_data = x"0000000001111000"; + assert m2.result = x"00000000000000000000000001111000"; wait for clk_period; assert m2.valid = '0'; @@ -70,7 +79,7 @@ begin wait for clk_period * (pipeline_depth-1); assert m2.valid = '1'; - assert m2.write_reg_data = x"0000000001111000"; + assert m2.result = x"00000000000000000000000001111000"; -- test mulld mulld_loop : for i in 0 to 1000 loop @@ -79,10 +88,10 @@ begin behave_rt := ppc_mulld(ra, rb); - m1.data1 <= '0' & ra; - m1.data2 <= '0' & rb; + m1.data1 <= absval(ra); + m1.data2 <= absval(rb); + m1.neg_result <= ra(63) xor rb(63); m1.valid <= '1'; - m1.insn_type <= OP_MUL_L64; wait for clk_period; @@ -92,8 +101,8 @@ begin assert m2.valid = '1'; - assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data) - report "bad mulld expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data); + assert to_hstring(behave_rt) = to_hstring(m2.result(63 downto 0)) + report "bad mulld expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.result(63 downto 0)); end loop; -- test mulhdu @@ -103,10 +112,10 @@ begin behave_rt := ppc_mulhdu(ra, rb); - m1.data1 <= '0' & ra; - m1.data2 <= '0' & rb; + m1.data1 <= ra; + m1.data2 <= rb; + m1.neg_result <= '0'; m1.valid <= '1'; - m1.insn_type <= OP_MUL_H64; wait for clk_period; @@ -116,8 +125,8 @@ begin assert m2.valid = '1'; - assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data) - report "bad mulhdu expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data); + assert to_hstring(behave_rt) = to_hstring(m2.result(127 downto 64)) + report "bad mulhdu expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.result(127 downto 
64)); end loop; -- test mulhd @@ -127,10 +136,10 @@ begin behave_rt := ppc_mulhd(ra, rb); - m1.data1 <= ra(63) & ra; - m1.data2 <= rb(63) & rb; + m1.data1 <= absval(ra); + m1.data2 <= absval(rb); + m1.neg_result <= ra(63) xor rb(63); m1.valid <= '1'; - m1.insn_type <= OP_MUL_H64; wait for clk_period; @@ -140,8 +149,8 @@ begin assert m2.valid = '1'; - assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data) - report "bad mulhd expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data); + assert to_hstring(behave_rt) = to_hstring(m2.result(127 downto 64)) + report "bad mulhd expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.result(127 downto 64)); end loop; -- test mullw @@ -151,12 +160,12 @@ begin behave_rt := ppc_mullw(ra, rb); - m1.data1 <= (others => ra(31)); - m1.data1(31 downto 0) <= ra(31 downto 0); - m1.data2 <= (others => rb(31)); - m1.data2(31 downto 0) <= rb(31 downto 0); + m1.data1 <= (others => '0'); + m1.data1(31 downto 0) <= absval(ra(31 downto 0)); + m1.data2 <= (others => '0'); + m1.data2(31 downto 0) <= absval(rb(31 downto 0)); + m1.neg_result <= ra(31) xor rb(31); m1.valid <= '1'; - m1.insn_type <= OP_MUL_L64; wait for clk_period; @@ -166,8 +175,8 @@ begin assert m2.valid = '1'; - assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data) - report "bad mullw expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data); + assert to_hstring(behave_rt) = to_hstring(m2.result(63 downto 0)) + report "bad mullw expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.result(63 downto 0)); end loop; -- test mulhw @@ -177,12 +186,12 @@ begin behave_rt := ppc_mulhw(ra, rb); - m1.data1 <= (others => ra(31)); - m1.data1(31 downto 0) <= ra(31 downto 0); - m1.data2 <= (others => rb(31)); - m1.data2(31 downto 0) <= rb(31 downto 0); + m1.data1 <= (others => '0'); + m1.data1(31 downto 0) <= absval(ra(31 downto 0)); + m1.data2 <= (others => '0'); + m1.data2(31 downto 0) <= absval(rb(31 downto 0)); + 
m1.neg_result <= ra(31) xor rb(31); m1.valid <= '1'; - m1.insn_type <= OP_MUL_H32; wait for clk_period; @@ -192,8 +201,9 @@ begin assert m2.valid = '1'; - assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data) - report "bad mulhw expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data); + assert to_hstring(behave_rt) = to_hstring(m2.result(63 downto 32) & m2.result(63 downto 32)) + report "bad mulhw expected " & to_hstring(behave_rt) & " got " & + to_hstring(m2.result(63 downto 32) & m2.result(63 downto 32)); end loop; -- test mulhwu @@ -207,8 +217,8 @@ begin m1.data1(31 downto 0) <= ra(31 downto 0); m1.data2 <= (others => '0'); m1.data2(31 downto 0) <= rb(31 downto 0); + m1.neg_result <= '0'; m1.valid <= '1'; - m1.insn_type <= OP_MUL_H32; wait for clk_period; @@ -218,8 +228,9 @@ begin assert m2.valid = '1'; - assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data) - report "bad mulhwu expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data); + assert to_hstring(behave_rt) = to_hstring(m2.result(63 downto 32) & m2.result(63 downto 32)) + report "bad mulhwu expected " & to_hstring(behave_rt) & " got " & + to_hstring(m2.result(63 downto 32) & m2.result(63 downto 32)); end loop; -- test mulli @@ -229,11 +240,11 @@ begin behave_rt := ppc_mulli(ra, si); - m1.data1 <= ra(63) & ra; - m1.data2 <= (others => si(15)); - m1.data2(15 downto 0) <= si; + m1.data1 <= absval(ra); + m1.data2 <= (others => '0'); + m1.data2(15 downto 0) <= absval(si); + m1.neg_result <= ra(63) xor si(15); m1.valid <= '1'; - m1.insn_type <= OP_MUL_L64; wait for clk_period; @@ -243,8 +254,8 @@ begin assert m2.valid = '1'; - assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data) - report "bad mulli expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data); + assert to_hstring(behave_rt) = to_hstring(m2.result(63 downto 0)) + report "bad mulli expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.result(63 downto 0)); end 
loop; std.env.finish; From 0809bc898b0dacd1923d513331ac3008947bee31 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 21 May 2020 17:50:54 +1000 Subject: [PATCH 08/22] multiply: Use DSP48 slices for multiplication on Xilinx FPGAs This adds a custom implementation of the multiplier which uses 16 DSP48E1 slices to do a 64x64 bit multiplication in 2 cycles. Signed-off-by: Paul Mackerras --- microwatt.core | 5 +- xilinx-mult.vhdl | 985 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 989 insertions(+), 1 deletion(-) create mode 100644 xilinx-mult.vhdl diff --git a/microwatt.core b/microwatt.core index 876f762..7d86cc2 100644 --- a/microwatt.core +++ b/microwatt.core @@ -26,7 +26,6 @@ filesets: - loadstore1.vhdl - mmu.vhdl - dcache.vhdl - - multiply.vhdl - divider.vhdl - rotator.vhdl - writeback.vhdl @@ -75,24 +74,28 @@ filesets: - fpga/nexys_a7.xdc : {file_type : xdc} - fpga/clk_gen_plle2.vhd : {file_type : vhdlSource-2008} - fpga/top-generic.vhdl : {file_type : vhdlSource-2008} + - xilinx-mult.vhdl : {file_type : vhdlSource-2008} nexys_video: files: - fpga/nexys-video.xdc : {file_type : xdc} - fpga/clk_gen_plle2.vhd : {file_type : vhdlSource-2008} - fpga/top-nexys-video.vhdl : {file_type : vhdlSource-2008} + - xilinx-mult.vhdl : {file_type : vhdlSource-2008} arty_a7: files: - fpga/arty_a7.xdc : {file_type : xdc} - fpga/clk_gen_plle2.vhd : {file_type : vhdlSource-2008} - fpga/top-arty.vhdl : {file_type : vhdlSource-2008} + - xilinx-mult.vhdl : {file_type : vhdlSource-2008} cmod_a7-35: files: - fpga/cmod_a7-35.xdc : {file_type : xdc} - fpga/clk_gen_mcmm.vhd : {file_type : vhdlSource-2008} - fpga/top-generic.vhdl : {file_type : vhdlSource-2008} + - xilinx-mult.vhdl : {file_type : vhdlSource-2008} litedram: depend : [":microwatt:litedram"] diff --git a/xilinx-mult.vhdl b/xilinx-mult.vhdl new file mode 100644 index 0000000..46366d6 --- /dev/null +++ b/xilinx-mult.vhdl @@ -0,0 +1,985 @@ +library ieee; +use ieee.std_logic_1164.all; +use 
ieee.numeric_std.all; + +library work; +use work.common.all; + +library unisim; +use unisim.vcomponents.all; + +entity multiply is + port ( + clk : in std_logic; + + m_in : in Execute1ToMultiplyType; + m_out : out MultiplyToExecute1Type + ); +end entity multiply; + +architecture behaviour of multiply is + signal m00_p, m01_p, m02_p, m03_p : std_ulogic_vector(47 downto 0); + signal m00_pc : std_ulogic_vector(47 downto 0); + signal m10_p, m11_p, m12_p, m13_p : std_ulogic_vector(47 downto 0); + signal m11_pc, m12_pc, m13_pc : std_ulogic_vector(47 downto 0); + signal m20_p, m21_p, m22_p, m23_p : std_ulogic_vector(47 downto 0); + signal s0_pc, s1_pc : std_ulogic_vector(47 downto 0); + signal product_lo : std_ulogic_vector(31 downto 0); + signal product : std_ulogic_vector(127 downto 0); + signal addend : std_ulogic_vector(127 downto 0); + signal s0_carry, p0_carry : std_ulogic_vector(3 downto 0); + signal p0_mask : std_ulogic_vector(47 downto 0); + signal p0_pat, p0_patb : std_ulogic; + signal p1_pat, p1_patb : std_ulogic; + + signal req_32bit, r32_1 : std_ulogic; + signal req_neg, rneg_1 : std_ulogic; + signal valid_1 : std_ulogic; + +begin + addend <= (others => m_in.neg_result); + + m00: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "0000000" & m_in.data1(22 downto 0), + ACIN => (others => '0'), + ALUMODE => "0000", + B => '0' & m_in.data2(16 downto 0), + BCIN => (others => '0'), + C => "00000000000000" & addend(33 downto 0), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '1', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0110101", + P => 
m00_p, + PCIN => (others => '0'), + PCOUT => m00_pc, + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m01: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "0000000" & m_in.data1(22 downto 0), + ACIN => (others => '0'), + ALUMODE => "0000", + B => '0' & m_in.data2(33 downto 17), + BCIN => (others => '0'), + C => (others => '0'), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '1', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "1010101", + P => m01_p, + PCIN => m00_pc, + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m02: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "0000000" & m_in.data1(22 downto 0), + ACIN => (others => '0'), + ALUMODE => "0000", + B => '0' & m_in.data2(50 downto 34), + BCIN => (others => '0'), + C => x"0000000" & "000" & addend(50 downto 34), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '1', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => 
"0110101", + P => m02_p, + PCIN => (others => '0'), + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m03: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "0000000" & m_in.data1(22 downto 0), + ACIN => (others => '0'), + ALUMODE => "0000", + B => "00000" & m_in.data2(63 downto 51), + BCIN => (others => '0'), + C => x"000000" & '0' & addend(73 downto 51), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '1', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0110101", + P => m03_p, + PCIN => (others => '0'), + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m10: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + CREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "0000000000000" & m_in.data1(39 downto 23), + ACIN => (others => '0'), + ALUMODE => "0000", + B => '0' & m_in.data2(16 downto 0), + BCIN => (others => '0'), + C => x"000" & "00" & m01_p(39 downto 6), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '0', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + 
INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0110101", + P => m10_p, + PCIN => (others => '0'), + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m11: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + CREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "0000000000000" & m_in.data1(39 downto 23), + ACIN => (others => '0'), + ALUMODE => "0000", + B => '0' & m_in.data2(33 downto 17), + BCIN => (others => '0'), + C => x"000" & "00" & m02_p(39 downto 6), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '0', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0110101", + P => m11_p, + PCIN => (others => '0'), + PCOUT => m11_pc, + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m12: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + CREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "0000000000000" & m_in.data1(39 downto 23), + ACIN => (others => '0'), + ALUMODE => "0000", + B => '0' & m_in.data2(50 downto 34), + BCIN => (others => '0'), + C => x"0000" & '0' & m03_p(36 downto 6), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '0', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + 
CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0110101", + P => m12_p, + PCIN => (others => '0'), + PCOUT => m12_pc, + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m13: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "0000000000000" & m_in.data1(39 downto 23), + ACIN => (others => '0'), + ALUMODE => "0000", + B => "00000" & m_in.data2(63 downto 51), + BCIN => (others => '0'), + C => x"0000000" & "000" & addend(90 downto 74), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '1', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0110101", + P => m13_p, + PCIN => (others => '0'), + PCOUT => m13_pc, + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m20: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "000000" & m_in.data1(63 downto 40), + ACIN => (others => '0'), + ALUMODE => "0000", + B => '0' & m_in.data2(16 downto 0), + BCIN => (others => '0'), + C => (others => '0'), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => 
'1', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0010101", + P => m20_p, + PCIN => m11_pc, + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m21: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "000000" & m_in.data1(63 downto 40), + ACIN => (others => '0'), + ALUMODE => "0000", + B => '0' & m_in.data2(33 downto 17), + BCIN => (others => '0'), + C => (others => '0'), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '1', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0010101", + P => m21_p, + PCIN => m12_pc, + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m22: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "000000" & m_in.data1(63 downto 40), + ACIN => (others => '0'), + ALUMODE => "0000", + B => '0' & m_in.data2(50 downto 34), + BCIN => (others => '0'), + C => (others => '0'), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '1', + CECARRYIN => '0', + CECTRL 
=> '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0010101", + P => m22_p, + PCIN => m13_pc, + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m23: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "000000" & m_in.data1(63 downto 40), + ACIN => (others => '0'), + ALUMODE => "0000", + B => "00000" & m_in.data2(63 downto 51), + BCIN => (others => '0'), + C => x"00" & "000" & addend(127 downto 91), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '1', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0110101", + P => m23_p, + PCIN => (others => '0'), + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + s0: DSP48E1 + generic map ( + ACASCREG => 1, + ALUMODEREG => 0, + AREG => 1, + BCASCREG => 1, + BREG => 1, + CARRYINREG => 0, + CARRYINSELREG => 0, + CREG => 1, + INMODEREG => 0, + MREG => 0, + OPMODEREG => 0, + PREG => 0, + USE_MULT => "none" + ) + port map ( + A => m22_p(5 downto 0) & x"0000" & m10_p(34 downto 27), + ACIN => (others => '0'), + ALUMODE => "0000", + B => m10_p(26 downto 9), + BCIN => (others => '0'), + C => m20_p(39 downto 0) & m02_p(5 downto 0) & "00", + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CARRYOUT => s0_carry, + CEA1 => '0', + CEA2 => '1', + 
CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '1', + CEC => '1', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '0', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0001111", + PCIN => (others => '0'), + PCOUT => s0_pc, + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + s1: DSP48E1 + generic map ( + ACASCREG => 1, + ALUMODEREG => 0, + AREG => 1, + BCASCREG => 1, + BREG => 1, + CARRYINREG => 0, + CARRYINSELREG => 0, + CREG => 1, + INMODEREG => 0, + MREG => 0, + OPMODEREG => 0, + PREG => 0, + USE_MULT => "none" + ) + port map ( + A => x"000" & m22_p(41 downto 24), + ACIN => (others => '0'), + ALUMODE => "0000", + B => m22_p(23 downto 6), + BCIN => (others => '0'), + C => m23_p(36 downto 0) & x"00" & "0" & m20_p(41 downto 40), + CARRYCASCIN => '0', + CARRYIN => s0_carry(3), + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '1', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '1', + CEC => '1', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '0', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0001111", + PCIN => (others => '0'), + PCOUT => s1_pc, + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + -- mask is 0 for 32-bit ops, 0x00007fffffff for 64-bit + p0_mask(47 downto 31) <= (others => '0'); + p0_mask(30 downto 0) <= (others => not r32_1); + + p0: DSP48E1 + generic map ( + ACASCREG => 1, + ALUMODEREG => 1, + AREG => 1, + BCASCREG => 1, + BREG => 1, + CARRYINREG => 0, + CARRYINSELREG => 0, + CREG => 1, + INMODEREG => 0, + MREG => 0, + OPMODEREG => 0, + PREG => 0, + SEL_MASK => "C", + USE_MULT => "none", +
USE_PATTERN_DETECT => "PATDET" + ) + port map ( + A => m21_p(22 downto 0) & m03_p(5 downto 0) & '0', + ACIN => (others => '0'), + ALUMODE => "00" & rneg_1 & '0', + B => (others => '0'), + BCIN => (others => '0'), + C => p0_mask, + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CARRYOUT => p0_carry, + CEA1 => '0', + CEA2 => '1', + CEAD => '0', + CEALUMODE => '1', + CEB1 => '0', + CEB2 => '1', + CEC => '1', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '0', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0010011", + P => product(79 downto 32), + PATTERNDETECT => p0_pat, + PATTERNBDETECT => p0_patb, + PCIN => s0_pc, + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + p1: DSP48E1 + generic map ( + ACASCREG => 1, + ALUMODEREG => 1, + AREG => 1, + BCASCREG => 1, + BREG => 1, + CARRYINREG => 0, + CARRYINSELREG => 0, + CREG => 0, + INMODEREG => 0, + MASK => x"000000000000", + MREG => 0, + OPMODEREG => 0, + PREG => 0, + USE_MULT => "none", + USE_PATTERN_DETECT => "PATDET" + ) + port map ( + A => x"0000000" & '0' & m21_p(41), + ACIN => (others => '0'), + ALUMODE => "00" & rneg_1 & '0', + B => m21_p(40 downto 23), + BCIN => (others => '0'), + C => (others => '0'), + CARRYCASCIN => '0', + CARRYIN => p0_carry(3), + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '1', + CEAD => '0', + CEALUMODE => '1', + CEB1 => '0', + CEB2 => '1', + CEC => '0', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '0', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0010011", + P => product(127 downto 80), + PATTERNDETECT => p1_pat, + PATTERNBDETECT => p1_patb, + PCIN => s1_pc, + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => 
'0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + product(31 downto 0) <= product_lo xor (31 downto 0 => req_neg); + + mult_out: process(all) + variable ov : std_ulogic; + begin + -- set overflow if the high bits are neither all zeroes nor all ones + if req_32bit = '0' then + ov := not ((p1_pat and p0_pat) or (p1_patb and p0_patb)); + else + ov := not ((p1_pat and p0_pat and not product(31)) or + (p1_patb and p0_patb and product(31))); + end if; + + m_out.result <= product; + m_out.overflow <= ov; + end process; + + process(clk) + begin + if rising_edge(clk) then + product_lo <= m10_p(8 downto 0) & m01_p(5 downto 0) & m00_p(16 downto 0); + m_out.valid <= valid_1; + valid_1 <= m_in.valid; + req_32bit <= r32_1; + r32_1 <= m_in.is_32bit; + req_neg <= rneg_1; + rneg_1 <= m_in.neg_result; + end if; + end process; + +end architecture behaviour; From 62b24a8dae3aa3597f863de1cd8f0a9f0ec2cb6b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 29 May 2020 09:38:05 +1000 Subject: [PATCH 09/22] icache: Improve latencies when reloading cache lines The icache can now detect a hit on a line being refilled from memory, as we have an array of individual valid bits per row for the line that is currently being loaded. This enables the request that initiated the refill to be satisfied earlier, and also enables following requests to the same cache line to be satisfied before the line is completely refilled. Furthermore, the refill now starts at the row that is needed. This should reduce the latency for an icache miss. We now get a 'sequential' indication from fetch1, and use that to know when we can deliver an instruction word using the other half of the 64-bit doubleword that was read last cycle. This doesn't make much difference at the moment, but it frees up cycles where we could test whether the next line is present in the cache so that we could prefetch it if not. 
Signed-off-by: Paul Mackerras --- common.vhdl | 1 + fetch1.vhdl | 2 ++ icache.vhdl | 72 +++++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 56 insertions(+), 19 deletions(-) diff --git a/common.vhdl b/common.vhdl index 82b3242..f08ecd1 100644 --- a/common.vhdl +++ b/common.vhdl @@ -93,6 +93,7 @@ package common is virt_mode : std_ulogic; priv_mode : std_ulogic; stop_mark: std_ulogic; + sequential: std_ulogic; nia: std_ulogic_vector(63 downto 0); end record; diff --git a/fetch1.vhdl b/fetch1.vhdl index 758db24..93a2293 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -68,6 +68,7 @@ begin begin v := r; v_int := r_int; + v.sequential := '0'; if rst = '1' then if alt_reset_in = '1' then @@ -128,6 +129,7 @@ begin if increment then v.nia := std_logic_vector(unsigned(v.nia) + 4); + v.sequential := '1'; end if; end if; diff --git a/icache.vhdl b/icache.vhdl index e4f8448..739e047 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -115,6 +115,7 @@ architecture rtl of icache is subtype row_t is integer range 0 to BRAM_ROWS-1; subtype index_t is integer range 0 to NUM_LINES-1; subtype way_t is integer range 0 to NUM_WAYS-1; + subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0); -- The cache data BRAM organized as described above for each way subtype cache_row_t is std_ulogic_vector(wishbone_data_bits-1 downto 0); @@ -132,6 +133,7 @@ architecture rtl of icache is -- The cache valid bits subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0); type cache_valids_t is array(index_t) of cache_way_valids_t; + type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic; -- Storage. 
Hopefully "cache_rows" is a BRAM, the rest is LUTs signal cache_tags : cache_tags_array_t; @@ -179,6 +181,8 @@ architecture rtl of icache is store_row : row_t; store_tag : cache_tag_t; store_valid : std_ulogic; + end_row_ix : row_in_line_t; + rows_valid : row_per_line_valid_t; -- TLB miss state fetch_failed : std_ulogic; @@ -200,6 +204,7 @@ architecture rtl of icache is signal ra_valid : std_ulogic; signal priv_fault : std_ulogic; signal access_ok : std_ulogic; + signal use_previous : std_ulogic; -- Output data to logger signal log_data : std_ulogic_vector(53 downto 0); @@ -225,20 +230,24 @@ architecture rtl of icache is return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS))); end; + -- Return the index of a row within a line + function get_row_of_line(row: row_t) return row_in_line_t is + variable row_v : unsigned(ROW_BITS-1 downto 0); + begin + row_v := to_unsigned(row, ROW_BITS); + return row_v(ROW_LINEBITS-1 downto 0); + end; + -- Returns whether this is the last row of a line - function is_last_row_addr(addr: wishbone_addr_type) return boolean is - constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); + function is_last_row_addr(addr: wishbone_addr_type; last: row_in_line_t) return boolean is begin - return addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = ones; + return unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last; end; -- Returns whether this is the last row of a line - function is_last_row(row: row_t) return boolean is - variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0); - constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); + function is_last_row(row: row_t; last: row_in_line_t) return boolean is begin - row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS)); - return row_v(ROW_LINEBITS-1 downto 0) = ones; + return get_row_of_line(row) = last; end; -- Return the address of the next row in the current cache line @@ -367,7 +376,7 @@ begin ); process(all) begin - do_read <= not 
stall_in; + do_read <= not (stall_in or use_previous); do_write <= '0'; if wishbone_in.ack = '1' and r.store_way = i then do_write <= '1'; @@ -472,23 +481,38 @@ begin variable is_hit : std_ulogic; variable hit_way : way_t; begin + -- i_in.sequential means that i_in.nia this cycle is 4 more than + -- last cycle. If we read more than 32 bits at a time, had a cache hit + -- last cycle, and we don't want the first 32-bit chunk, then we can + -- keep the data we read last cycle and just use that. + if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then + use_previous <= i_in.sequential and r.hit_valid; + else + use_previous <= '0'; + end if; + -- Extract line, row and tag from request req_index <= get_index(i_in.nia); req_row <= get_row(i_in.nia); req_tag <= get_tag(real_addr); - -- Calculate address of beginning of cache line, will be + -- Calculate address of beginning of cache row, will be -- used for cache miss processing if needed -- req_laddr <= (63 downto REAL_ADDR_BITS => '0') & - real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) & - (LINE_OFF_BITS-1 downto 0 => '0'); + real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) & + (ROW_OFF_BITS-1 downto 0 => '0'); -- Test if pending request is a hit on any way hit_way := 0; is_hit := '0'; for i in way_t loop - if i_in.req = '1' and cache_valids(req_index)(i) = '1' then + if i_in.req = '1' and + (cache_valids(req_index)(i) = '1' or + (r.state = WAIT_ACK and + req_index = r.store_index and + i = r.store_way and + r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then if read_tag(i, cache_tags(req_index)) = req_tag then hit_way := i; is_hit := '1'; @@ -536,7 +560,8 @@ begin if rising_edge(clk) then -- keep outputs to fetch2 unchanged on a stall -- except that flush or reset sets valid to 0 - if stall_in = '1' then + -- If use_previous, keep the same data as last cycle and use the second half + if stall_in = '1' or use_previous = '1' then if rst = '1' or flush_in = '1' then r.hit_valid <= '0'; end if; @@ -545,9 +570,6 @@ 
begin -- will be available on the cache_out output of the corresponding way -- r.hit_valid <= req_is_hit; - -- Send stop marks and NIA down regardless of validity - r.hit_smark <= i_in.stop_mark; - r.hit_nia <= i_in.nia; if req_is_hit = '1' then r.hit_way <= req_hit_way; @@ -559,6 +581,11 @@ begin " way:" & integer'image(req_hit_way) & " RA:" & to_hstring(real_addr); end if; + end if; + if stall_in = '0' then + -- Send stop marks and NIA down regardless of validity + r.hit_smark <= i_in.stop_mark; + r.hit_nia <= i_in.nia; end if; end if; end process; @@ -597,6 +624,11 @@ begin -- Main state machine case r.state is when IDLE => + -- Reset per-row valid flags, only used in WAIT_ACK + for i in 0 to ROW_PER_LINE - 1 loop + r.rows_valid(i) <= '0'; + end loop; + -- We need to read a cache line if req_is_miss = '1' then report "cache miss nia:" & to_hstring(i_in.nia) & @@ -613,6 +645,7 @@ begin r.store_row <= get_row(req_laddr); r.store_tag <= req_tag; r.store_valid <= '1'; + r.end_row_ix <= get_row_of_line(get_row(req_laddr)) - 1; -- Prep for first wishbone read. We calculate the address of -- the start of the cache line and start the WB cycle. @@ -650,7 +683,7 @@ begin -- stb and set stbs_done so we can handle an eventual last -- ack on the same cycle. 
-- - if is_last_row_addr(r.wb.adr) then + if is_last_row_addr(r.wb.adr, r.end_row_ix) then r.wb.stb <= '0'; stbs_done := true; end if; @@ -661,8 +694,9 @@ begin -- Incoming acks processing if wishbone_in.ack = '1' then + r.rows_valid(r.store_row mod ROW_PER_LINE) <= '1'; -- Check for completion - if stbs_done and is_last_row(r.store_row) then + if stbs_done and is_last_row(r.store_row, r.end_row_ix) then -- Complete wishbone cycle r.wb.cyc <= '0'; From 6701e7346b598f97100f609956a4b9df282af6f7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 4 Jun 2020 20:58:32 +1000 Subject: [PATCH 10/22] core: Use a busy signal rather than a stall This changes the instruction dependency tracking so that we can generate a "busy" signal from execute1 and loadstore1 which comes along one cycle later than the current "stall" signal. This will enable us to signal busy cycles only when we need to from loadstore1. The "busy" signal from execute1/loadstore1 indicates "I didn't take the thing you gave me on this cycle", as distinct from the previous stall signal which meant "I took that but don't give me anything next cycle". That means that decode2 proactively gives execute1 a new instruction as soon as it has taken the previous one (assuming there is a valid instruction available from decode1), and that then sits in decode2's output until execute1 can take it. So instructions are issued by decode2 somewhat earlier than they used to be. Decode2 now only signals a stall upstream when its output buffer is full, meaning that we can fill up bubbles in the upstream pipe while a long instruction is executing. This gives a small boost in performance. This also adds dependency tracking for rA updates by update-form load/store instructions. The GPR and CR hazard detection machinery now has one extra stage, which may not be strictly necessary. Some of the code now really only applies to PIPELINE_DEPTH=1. 
Signed-off-by: Paul Mackerras --- common.vhdl | 1 + control.vhdl | 61 +++++++++++++++++++++++++++++++---------- core.vhdl | 12 ++++----- cr_hazard.vhdl | 45 ++++++++++++++++--------------- decode2.vhdl | 27 ++++++++++++++----- execute1.vhdl | 50 +++++++++++++++++++--------------- gpr_hazard.vhdl | 72 ++++++++++++++++++++++++++----------------------- loadstore1.vhdl | 8 +++--- writeback.vhdl | 32 +++++++++++++--------- 9 files changed, 190 insertions(+), 118 deletions(-) diff --git a/common.vhdl b/common.vhdl index f08ecd1..31bd920 100644 --- a/common.vhdl +++ b/common.vhdl @@ -244,6 +244,7 @@ package common is others => (others => '0')); type Loadstore1ToExecute1Type is record + busy : std_ulogic; exception : std_ulogic; invalid : std_ulogic; perm_error : std_ulogic; diff --git a/control.vhdl b/control.vhdl index 55f5649..5e557c4 100644 --- a/control.vhdl +++ b/control.vhdl @@ -15,7 +15,8 @@ entity control is complete_in : in std_ulogic; valid_in : in std_ulogic; flush_in : in std_ulogic; - stall_in : in std_ulogic; + busy_in : in std_ulogic; + deferred : in std_ulogic; sgl_pipe_in : in std_ulogic; stop_mark_in : in std_ulogic; @@ -23,6 +24,9 @@ entity control is gpr_write_in : in gspr_index_t; gpr_bypassable : in std_ulogic; + update_gpr_write_valid : in std_ulogic; + update_gpr_write_reg : in gspr_index_t; + gpr_a_read_valid_in : in std_ulogic; gpr_a_read_in : in gspr_index_t; @@ -72,7 +76,11 @@ begin ) port map ( clk => clk, - stall_in => stall_in, + busy_in => busy_in, + deferred => deferred, + complete_in => complete_in, + flush_in => flush_in, + issuing => valid_out, gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, @@ -80,6 +88,9 @@ begin gpr_read_valid_in => gpr_a_read_valid_in, gpr_read_in => gpr_a_read_in, + ugpr_write_valid => update_gpr_write_valid, + ugpr_write_reg => update_gpr_write_reg, + stall_out => stall_a_out, use_bypass => gpr_bypass_a ); @@ -90,7 +101,11 @@ begin ) port map ( clk => clk, - stall_in => stall_in, + busy_in => 
busy_in, + deferred => deferred, + complete_in => complete_in, + flush_in => flush_in, + issuing => valid_out, gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, @@ -98,6 +113,9 @@ begin gpr_read_valid_in => gpr_b_read_valid_in, gpr_read_in => gpr_b_read_in, + ugpr_write_valid => update_gpr_write_valid, + ugpr_write_reg => update_gpr_write_reg, + stall_out => stall_b_out, use_bypass => gpr_bypass_b ); @@ -110,7 +128,11 @@ begin ) port map ( clk => clk, - stall_in => stall_in, + busy_in => busy_in, + deferred => deferred, + complete_in => complete_in, + flush_in => flush_in, + issuing => valid_out, gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, @@ -118,6 +140,9 @@ begin gpr_read_valid_in => gpr_c_read_valid_in, gpr_read_in => gpr_c_read_in_fmt, + ugpr_write_valid => update_gpr_write_valid, + ugpr_write_reg => update_gpr_write_reg, + stall_out => stall_c_out, use_bypass => gpr_bypass_c ); @@ -128,7 +153,11 @@ begin ) port map ( clk => clk, - stall_in => stall_in, + busy_in => busy_in, + deferred => deferred, + complete_in => complete_in, + flush_in => flush_in, + issuing => valid_out, cr_read_in => cr_read_in, cr_write_in => cr_write_valid, @@ -139,7 +168,8 @@ begin control0: process(clk) begin if rising_edge(clk) then - assert r_int.outstanding >= 0 and r_int.outstanding <= (PIPELINE_DEPTH+1) report "Outstanding bad " & integer'image(r_int.outstanding) severity failure; + assert rin_int.outstanding >= 0 and rin_int.outstanding <= (PIPELINE_DEPTH+1) + report "Outstanding bad " & integer'image(rin_int.outstanding) severity failure; r_int <= rin_int; end if; end process; @@ -152,17 +182,18 @@ begin v_int := r_int; -- asynchronous - valid_tmp := valid_in and not flush_in and not stall_in; - stall_tmp := stall_in; + valid_tmp := valid_in and not flush_in; + stall_tmp := '0'; - if complete_in = '1' then + if flush_in = '1' then + -- expect to see complete_in next cycle + v_int.outstanding := 1; + elsif complete_in = '1' then 
v_int.outstanding := r_int.outstanding - 1; end if; if rst = '1' then - v_int.state := IDLE; - v_int.outstanding := 0; - stall_tmp := '0'; + v_int := reg_internal_init; valid_tmp := '0'; end if; @@ -227,7 +258,9 @@ begin end if; if valid_tmp = '1' then - v_int.outstanding := v_int.outstanding + 1; + if deferred = '0' then + v_int.outstanding := v_int.outstanding + 1; + end if; gpr_write_valid <= gpr_write_valid_in; cr_write_valid <= cr_write_in; else @@ -237,7 +270,7 @@ begin -- update outputs valid_out <= valid_tmp; - stall_out <= stall_tmp; + stall_out <= stall_tmp or deferred; -- update registers rin_int <= v_int; diff --git a/core.vhdl b/core.vhdl index 4d84b4a..13f3ce7 100644 --- a/core.vhdl +++ b/core.vhdl @@ -82,11 +82,10 @@ architecture behave of core is signal icache_stall_out : std_ulogic; signal icache_stall_in : std_ulogic; signal decode1_stall_in : std_ulogic; - signal decode2_stall_in : std_ulogic; + signal decode2_busy_in : std_ulogic; signal decode2_stall_out : std_ulogic; signal ex1_icache_inval: std_ulogic; - signal ex1_stall_out: std_ulogic; - signal ls1_stall_out: std_ulogic; + signal ex1_busy_out: std_ulogic; signal dcache_stall_out: std_ulogic; signal flush: std_ulogic; @@ -235,7 +234,7 @@ begin port map ( clk => clk, rst => rst_dec2, - stall_in => decode2_stall_in, + busy_in => decode2_busy_in, stall_out => decode2_stall_out, flush_in => flush, complete_in => complete, @@ -248,7 +247,7 @@ begin c_out => decode2_to_cr_file, log_out => log_data(119 downto 110) ); - decode2_stall_in <= ex1_stall_out or ls1_stall_out; + decode2_busy_in <= ex1_busy_out; register_file_0: entity work.register_file generic map ( @@ -289,7 +288,7 @@ begin clk => clk, rst => rst_ex1, flush_out => flush, - stall_out => ex1_stall_out, + busy_out => ex1_busy_out, e_in => decode2_to_execute1, l_in => loadstore1_to_execute1, ext_irq_in => ext_irq, @@ -317,7 +316,6 @@ begin m_out => loadstore1_to_mmu, m_in => mmu_to_loadstore1, dc_stall => dcache_stall_out, - stall_out => 
ls1_stall_out, log_out => log_data(149 downto 140) ); diff --git a/cr_hazard.vhdl b/cr_hazard.vhdl index f6c5f3f..4b79020 100644 --- a/cr_hazard.vhdl +++ b/cr_hazard.vhdl @@ -4,11 +4,15 @@ use ieee.numeric_std.all; entity cr_hazard is generic ( - PIPELINE_DEPTH : natural := 2 + PIPELINE_DEPTH : natural := 1 ); port( clk : in std_ulogic; - stall_in : in std_ulogic; + busy_in : in std_ulogic; + deferred : in std_ulogic; + complete_in : in std_ulogic; + flush_in : in std_ulogic; + issuing : in std_ulogic; cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; @@ -22,7 +26,7 @@ architecture behaviour of cr_hazard is end record; constant pipeline_entry_init : pipeline_entry_type := (valid => '0'); - type pipeline_t is array(0 to PIPELINE_DEPTH-1) of pipeline_entry_type; + type pipeline_t is array(0 to PIPELINE_DEPTH) of pipeline_entry_type; constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init); signal r, rin : pipeline_t := pipeline_t_init; @@ -30,9 +34,7 @@ begin cr_hazard0: process(clk) begin if rising_edge(clk) then - if stall_in = '0' then - r <= rin; - end if; + r <= rin; end if; end process; @@ -41,22 +43,23 @@ begin begin v := r; - stall_out <= '0'; - loop_0: for i in 0 to PIPELINE_DEPTH-1 loop - if (r(i).valid = cr_read_in) then - stall_out <= '1'; - end if; - end loop; - - v(0).valid := cr_write_in; - loop_1: for i in 0 to PIPELINE_DEPTH-2 loop - -- propagate to next slot - v(i+1) := r(i); - end loop; + -- XXX assumes PIPELINE_DEPTH = 1 + if complete_in = '1' then + v(1).valid := '0'; + end if; + stall_out <= cr_read_in and (v(0).valid or v(1).valid); - -- asynchronous output - if cr_read_in = '0' then - stall_out <= '0'; + -- XXX assumes PIPELINE_DEPTH = 1 + if busy_in = '0' then + v(1) := r(0); + v(0).valid := '0'; + end if; + if deferred = '0' and issuing = '1' then + v(0).valid := cr_write_in; + end if; + if flush_in = '1' then + v(0).valid := '0'; + v(1).valid := '0'; end if; -- update registers diff --git a/decode2.vhdl b/decode2.vhdl 
index 2c02a75..5b8cbc1 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -17,7 +17,7 @@ entity decode2 is rst : in std_ulogic; complete_in : in std_ulogic; - stall_in : in std_ulogic; + busy_in : in std_ulogic; stall_out : out std_ulogic; stopped_out : out std_ulogic; @@ -45,6 +45,8 @@ architecture behaviour of decode2 is signal r, rin : reg_type; + signal deferred : std_ulogic; + signal log_data : std_ulogic_vector(9 downto 0); type decode_input_reg_t is record @@ -200,6 +202,9 @@ architecture behaviour of decode2 is signal gpr_write : gspr_index_t; signal gpr_bypassable : std_ulogic; + signal update_gpr_write_valid : std_ulogic; + signal update_gpr_write_reg : gspr_index_t; + signal gpr_a_read_valid : std_ulogic; signal gpr_a_read :gspr_index_t; signal gpr_a_bypass : std_ulogic; @@ -224,7 +229,8 @@ begin complete_in => complete_in, valid_in => control_valid_in, - stall_in => stall_in, + busy_in => busy_in, + deferred => deferred, flush_in => flush_in, sgl_pipe_in => control_sgl_pipe, stop_mark_in => d_in.stop_mark, @@ -233,6 +239,9 @@ begin gpr_write_in => gpr_write, gpr_bypassable => gpr_bypassable, + update_gpr_write_valid => update_gpr_write_valid, + update_gpr_write_reg => update_gpr_write_reg, + gpr_a_read_valid_in => gpr_a_read_valid, gpr_a_read_in => gpr_a_read, @@ -254,13 +263,17 @@ begin gpr_bypass_c => gpr_c_bypass ); + deferred <= r.e.valid and busy_in; + decode2_0: process(clk) begin if rising_edge(clk) then - if rin.e.valid = '1' then - report "execute " & to_hstring(rin.e.nia); + if rst = '1' or flush_in = '1' or deferred = '0' then + if rin.e.valid = '1' then + report "execute " & to_hstring(rin.e.nia); + end if; + r <= rin; end if; - r <= rin; end if; end process; @@ -358,6 +371,8 @@ begin if EX1_BYPASS and d_in.decode.unit = ALU then gpr_bypassable <= '1'; end if; + update_gpr_write_valid <= d_in.decode.update; + update_gpr_write_reg <= decoded_reg_a.reg; gpr_a_read_valid <= decoded_reg_a.reg_valid; gpr_a_read <= decoded_reg_a.reg; @@ -375,7 +390,7 
@@ begin v.e.insn_type := OP_ILLEGAL; end if; - if rst = '1' then + if rst = '1' or flush_in = '1' then v.e := Decode2ToExecute1Init; end if; diff --git a/execute1.vhdl b/execute1.vhdl index 0009699..c234725 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -20,7 +20,7 @@ entity execute1 is -- asynchronous flush_out : out std_ulogic; - stall_out : out std_ulogic; + busy_out : out std_ulogic; e_in : in Decode2ToExecute1Type; l_in : in Loadstore1ToExecute1Type; @@ -48,6 +48,8 @@ end entity execute1; architecture behaviour of execute1 is type reg_type is record e : Execute1ToWritebackType; + busy: std_ulogic; + terminate: std_ulogic; lr_update : std_ulogic; next_lr : std_ulogic_vector(63 downto 0); mul_in_progress : std_ulogic; @@ -62,7 +64,7 @@ architecture behaviour of execute1 is log_addr_spr : std_ulogic_vector(31 downto 0); end record; constant reg_type_init : reg_type := - (e => Execute1ToWritebackInit, lr_update => '0', + (e => Execute1ToWritebackInit, busy => '0', lr_update => '0', terminate => '0', mul_in_progress => '0', div_in_progress => '0', cntz_in_progress => '0', slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init, next_lr => (others => '0'), ldst_nia => (others => '0'), others => (others => '0')); @@ -71,6 +73,7 @@ architecture behaviour of execute1 is signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0); + signal valid_in : std_ulogic; signal ctrl: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0')); signal ctrl_tmp: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0')); signal right_shift, rot_clear_left, rot_clear_right: std_ulogic; @@ -241,6 +244,11 @@ begin b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2; c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3; + busy_out <= l_in.busy or r.busy; + valid_in <= e_in.valid and not busy_out; + + terminate_out <= r.terminate; + execute1_0: process(clk) begin if 
rising_edge(clk) then @@ -251,7 +259,7 @@ begin else r <= rin; ctrl <= ctrl_tmp; - assert not (r.lr_update = '1' and e_in.valid = '1') + assert not (r.lr_update = '1' and valid_in = '1') report "LR update collision with valid in EX1" severity failure; if r.lr_update = '1' then @@ -423,9 +431,9 @@ begin end if; end if; - terminate_out <= '0'; + v.terminate := '0'; icache_inval <= '0'; - stall_out <= '0'; + v.busy := '0'; f_out <= Execute1ToFetch1TypeInit; -- send MSR[IR] and ~MSR[PR] up to fetch1 f_out.virt_mode <= ctrl.msr(MSR_IR); @@ -463,10 +471,10 @@ begin f_out.virt_mode <= '0'; f_out.priv_mode <= '1'; f_out.redirect_nia <= ctrl.irq_nia; - v.e.valid := e_in.valid; + v.e.valid := '1'; report "Writing SRR1: " & to_hstring(ctrl.srr1); - elsif irq_valid = '1' and e_in.valid = '1' then + elsif irq_valid = '1' and valid_in = '1' then -- we need two cycles to write srr0 and 1 -- will need more when we have to write HEIR -- Don't deliver the interrupt until we have a valid instruction @@ -474,7 +482,7 @@ begin exception := '1'; ctrl_tmp.srr1 <= msr_copy(ctrl.msr); - elsif e_in.valid = '1' and ctrl.msr(MSR_PR) = '1' and + elsif valid_in = '1' and ctrl.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then -- generate a program interrupt exception := '1'; @@ -484,7 +492,7 @@ begin ctrl_tmp.srr1(63 - 45) <= '1'; report "privileged instruction"; - elsif e_in.valid = '1' and e_in.unit = ALU then + elsif valid_in = '1' and e_in.unit = ALU then report "execute nia " & to_hstring(e_in.nia); @@ -519,7 +527,7 @@ begin -- check bits 1-10 of the instruction to make sure it's attn -- if not then it is illegal if e_in.insn(10 downto 1) = "0100000000" then - terminate_out <= '1'; + v.terminate := '1'; report "ATTN"; else illegal := '1'; @@ -674,7 +682,7 @@ begin when OP_CNTZ => v.e.valid := '0'; v.cntz_in_progress := '1'; - stall_out <= '1'; + v.busy := '1'; when OP_EXTS => -- note data_len is a 1-hot encoding negative := (e_in.data_len(0) and c_in(7)) or @@ 
-876,21 +884,21 @@ begin when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 => v.e.valid := '0'; v.mul_in_progress := '1'; - stall_out <= '1'; + v.busy := '1'; x_to_multiply.valid <= '1'; when OP_DIV | OP_DIVE | OP_MOD => v.e.valid := '0'; v.div_in_progress := '1'; - stall_out <= '1'; + v.busy := '1'; x_to_divider.valid <= '1'; when others => - terminate_out <= '1'; + v.terminate := '1'; report "illegal"; end case; - v.e.rc := e_in.rc and e_in.valid; + v.e.rc := e_in.rc and valid_in; -- Update LR on the next cycle after a branch link -- @@ -908,10 +916,10 @@ begin v.next_lr := next_nia; v.e.valid := '0'; report "Delayed LR update to " & to_hstring(next_nia); - stall_out <= '1'; + v.busy := '1'; end if; - elsif e_in.valid = '1' then + elsif valid_in = '1' then -- instruction for other units, i.e. LDST v.ldst_nia := e_in.nia; v.e.valid := '0'; @@ -967,7 +975,7 @@ begin end if; v.e.valid := '1'; else - stall_out <= '1'; + v.busy := '1'; v.mul_in_progress := r.mul_in_progress; v.div_in_progress := r.div_in_progress; end if; @@ -988,7 +996,8 @@ begin v.e.exc_write_data := next_nia; end if; ctrl_tmp.irq_state <= WRITE_SRR1; - v.e.valid := '1'; + v.busy := '1'; + v.e.valid := '0'; end if; v.e.write_data := result; @@ -1020,7 +1029,6 @@ begin v.e.exc_write_data := r.ldst_nia; report "ldst exception writing srr0=" & to_hstring(r.ldst_nia); ctrl_tmp.irq_state <= WRITE_SRR1; - v.e.valid := '1'; -- complete the original load or store end if; -- Outputs to loadstore1 (async) @@ -1072,7 +1080,7 @@ begin r.e.write_enable & r.e.valid & f_out.redirect & - stall_out & + r.busy & flush_out; end if; end process; diff --git a/gpr_hazard.vhdl b/gpr_hazard.vhdl index de4f7d2..0fa66c5 100644 --- a/gpr_hazard.vhdl +++ b/gpr_hazard.vhdl @@ -4,11 +4,15 @@ use ieee.numeric_std.all; entity gpr_hazard is generic ( - PIPELINE_DEPTH : natural := 2 + PIPELINE_DEPTH : natural := 1 ); port( clk : in std_ulogic; - stall_in : in std_ulogic; + busy_in : in std_ulogic; + deferred : in std_ulogic; + complete_in : 
in std_ulogic; + flush_in : in std_ulogic; + issuing : in std_ulogic; gpr_write_valid_in : in std_ulogic; gpr_write_in : in std_ulogic_vector(5 downto 0); @@ -16,6 +20,9 @@ entity gpr_hazard is gpr_read_valid_in : in std_ulogic; gpr_read_in : in std_ulogic_vector(5 downto 0); + ugpr_write_valid : in std_ulogic; + ugpr_write_reg : in std_ulogic_vector(5 downto 0); + stall_out : out std_ulogic; use_bypass : out std_ulogic ); @@ -25,10 +32,13 @@ architecture behaviour of gpr_hazard is valid : std_ulogic; bypass : std_ulogic; gpr : std_ulogic_vector(5 downto 0); + ugpr_valid : std_ulogic; + ugpr : std_ulogic_vector(5 downto 0); end record; - constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0')); + constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0'), + ugpr_valid => '0', ugpr => (others => '0')); - type pipeline_t is array(0 to PIPELINE_DEPTH-1) of pipeline_entry_type; + type pipeline_t is array(0 to PIPELINE_DEPTH) of pipeline_entry_type; constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init); signal r, rin : pipeline_t := pipeline_t_init; @@ -45,50 +55,46 @@ begin begin v := r; + if complete_in = '1' then + v(PIPELINE_DEPTH).valid := '0'; + v(PIPELINE_DEPTH).ugpr_valid := '0'; + end if; + stall_out <= '0'; use_bypass <= '0'; if gpr_read_valid_in = '1' then - if r(0).valid = '1' and r(0).gpr = gpr_read_in then - if r(0).bypass = '1' and stall_in = '0' then - use_bypass <= '1'; - else - stall_out <= '1'; - end if; - end if; - loop_0: for i in 1 to PIPELINE_DEPTH-1 loop - if r(i).valid = '1' and r(i).gpr = gpr_read_in then + loop_0: for i in 0 to PIPELINE_DEPTH loop + if v(i).valid = '1' and r(i).gpr = gpr_read_in then if r(i).bypass = '1' then use_bypass <= '1'; else stall_out <= '1'; end if; end if; + if v(i).ugpr_valid = '1' and r(i).ugpr = gpr_read_in then + stall_out <= '1'; + end if; end loop; end if; - if stall_in = '0' then + -- XXX 
assumes PIPELINE_DEPTH = 1 + if busy_in = '0' then + v(1) := v(0); + v(0).valid := '0'; + v(0).ugpr_valid := '0'; + end if; + if deferred = '0' and issuing = '1' then v(0).valid := gpr_write_valid_in; v(0).bypass := bypass_avail; v(0).gpr := gpr_write_in; - loop_1: for i in 1 to PIPELINE_DEPTH-1 loop - -- propagate to next slot - v(i).valid := r(i-1).valid; - v(i).bypass := r(i-1).bypass; - v(i).gpr := r(i-1).gpr; - end loop; - - else - -- stage 0 stalled, so stage 1 becomes empty - loop_1b: for i in 1 to PIPELINE_DEPTH-1 loop - -- propagate to next slot - if i = 1 then - v(i).valid := '0'; - else - v(i).valid := r(i-1).valid; - v(i).bypass := r(i-1).bypass; - v(i).gpr := r(i-1).gpr; - end if; - end loop; + v(0).ugpr_valid := ugpr_write_valid; + v(0).ugpr := ugpr_write_reg; + end if; + if flush_in = '1' then + v(0).valid := '0'; + v(0).ugpr_valid := '0'; + v(1).valid := '0'; + v(1).ugpr_valid := '0'; end if; -- update registers diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 6e71df9..4140244 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -25,7 +25,6 @@ entity loadstore1 is m_in : in MmuToLoadstore1Type; dc_stall : in std_ulogic; - stall_out : out std_ulogic; log_out : out std_ulogic_vector(9 downto 0) ); @@ -47,6 +46,7 @@ architecture behave of loadstore1 is ); type reg_stage_t is record + busy : std_ulogic; -- latch most of the input request load : std_ulogic; tlbie : std_ulogic; @@ -123,6 +123,7 @@ begin if rising_edge(clk) then if rst = '1' then r.state <= IDLE; + r.busy <= '0'; else r <= rin; end if; @@ -499,6 +500,7 @@ begin l_out.store_done <= d_in.store_done; -- update exception info back to execute1 + e_out.busy <= r.busy; e_out.exception <= exception; e_out.instr_fault <= r.instr_fault; e_out.invalid <= m_in.invalid; @@ -513,7 +515,7 @@ begin end if; end if; - stall_out <= stall; + v.busy := stall; -- Update registers rin <= v; @@ -523,7 +525,7 @@ begin ls1_log: process(clk) begin if rising_edge(clk) then - log_data <= stall_out & + log_data <= 
r.busy & e_out.exception & l_out.valid & m_out.valid & diff --git a/writeback.vhdl b/writeback.vhdl index 60afebb..d02a0b1 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -22,27 +22,33 @@ end entity writeback; architecture behaviour of writeback is begin - writeback_1: process(all) + writeback_0: process(clk) variable x : std_ulogic_vector(0 downto 0); variable y : std_ulogic_vector(0 downto 0); variable w : std_ulogic_vector(0 downto 0); + begin + if rising_edge(clk) then + -- Do consistency checks only on the clock edge + x(0) := e_in.valid; + y(0) := l_in.valid; + assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; + + x(0) := e_in.write_enable or e_in.exc_write_enable; + y(0) := l_in.write_enable; + assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; + + w(0) := e_in.write_cr_enable; + x(0) := (e_in.write_enable and e_in.rc); + assert (to_integer(unsigned(w)) + to_integer(unsigned(x))) <= 1 severity failure; + end if; + end process; + + writeback_1: process(all) variable cf: std_ulogic_vector(3 downto 0); variable zero : std_ulogic; variable sign : std_ulogic; variable scf : std_ulogic_vector(3 downto 0); begin - x(0) := e_in.valid; - y(0) := l_in.valid; - assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; - - x(0) := e_in.write_enable or e_in.exc_write_enable; - y(0) := l_in.write_enable; - assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; - - w(0) := e_in.write_cr_enable; - x(0) := (e_in.write_enable and e_in.rc); - assert (to_integer(unsigned(w)) + to_integer(unsigned(x))) <= 1 severity failure; - w_out <= WritebackToRegisterFileInit; c_out <= WritebackToCrFileInit; From 1d09daae030835c673e2fe35c07f88f9ca736b85 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 5 Jun 2020 13:29:34 +1000 Subject: [PATCH 11/22] loadstore1: Complete mfspr/mtspr a cycle later This makes mfspr and mtspr complete (and mfspr write back) on the cycle 
after the instruction is received from execute1, rather than on the same cycle. This makes them match all other instructions that execute in one cycle. Because these instructions are marked as single-issue, there wasn't the possibility of having two instructions complete on the same cycle (which we can't cope with), but it is better to fix this. Signed-off-by: Paul Mackerras --- loadstore1.vhdl | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 4140244..e2f3248 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -42,7 +42,8 @@ architecture behave of loadstore1 is ACK_WAIT, -- waiting for ack from dcache LD_UPDATE, -- writing rA with computed addr on load MMU_LOOKUP, -- waiting for MMU to look up translation - TLBIE_WAIT -- waiting for MMU to finish doing a tlbie + TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie + SPR_CMPLT -- complete a mf/tspr operation ); type reg_stage_t is record @@ -51,6 +52,7 @@ architecture behave of loadstore1 is load : std_ulogic; tlbie : std_ulogic; dcbz : std_ulogic; + mfspr : std_ulogic; addr : std_ulogic_vector(63 downto 0); store_data : std_ulogic_vector(63 downto 0); load_data : std_ulogic_vector(63 downto 0); @@ -73,6 +75,7 @@ architecture behave of loadstore1 is dar : std_ulogic_vector(63 downto 0); dsisr : std_ulogic_vector(31 downto 0); instr_fault : std_ulogic; + sprval : std_ulogic_vector(63 downto 0); end record; type byte_sel_t is array(0 to 7) of std_ulogic; @@ -152,9 +155,7 @@ begin variable use_second : byte_sel_t; variable trim_ctl : trim_ctl_t; variable negative : std_ulogic; - variable mfspr : std_ulogic; variable sprn : std_ulogic_vector(9 downto 0); - variable sprval : std_ulogic_vector(63 downto 0); variable exception : std_ulogic; variable next_addr : std_ulogic_vector(63 downto 0); variable mmureq : std_ulogic; @@ -168,11 +169,10 @@ begin done := '0'; byte_sel := (others => '0'); addr := lsu_sum; - mfspr := '0'; + 
v.mfspr := '0'; mmu_mtspr := '0'; itlb_fault := '0'; sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10)); - sprval := (others => '0'); -- avoid inferred latches exception := '0'; dsisr := (others => '0'); mmureq := '0'; @@ -256,20 +256,20 @@ begin v.tlbie := '1'; v.state := TLBIE_WAIT; when OP_MFSPR => - done := '1'; - mfspr := '1'; + v.mfspr := '1'; -- partial decode on SPR number should be adequate given -- the restricted set that get sent down this path if sprn(9) = '0' and sprn(5) = '0' then if sprn(0) = '0' then - sprval := x"00000000" & r.dsisr; + v.sprval := x"00000000" & r.dsisr; else - sprval := r.dar; + v.sprval := r.dar; end if; else -- reading one of the SPRs in the MMU - sprval := m_in.sprval; + v.sprval := m_in.sprval; end if; + v.state := SPR_CMPLT; when OP_MTSPR => if sprn(9) = '0' and sprn(5) = '0' then if sprn(0) = '0' then @@ -277,7 +277,7 @@ begin else v.dar := l_in.data; end if; - done := '1'; + v.state := SPR_CMPLT; else -- writing one of the SPRs in the MMU mmu_mtspr := '1'; @@ -452,6 +452,10 @@ begin v.state := IDLE; done := '1'; + when SPR_CMPLT => + done := '1'; + v.state := IDLE; + end case; -- Update outputs to dcache @@ -482,10 +486,10 @@ begin -- Multiplex either cache data to the destination GPR or -- the address for the rA update. l_out.valid <= done; - if mfspr = '1' then + if r.mfspr = '1' then l_out.write_enable <= '1'; - l_out.write_reg <= l_in.write_reg; - l_out.write_data <= sprval; + l_out.write_reg <= r.write_reg; + l_out.write_data <= r.sprval; elsif do_update = '1' then l_out.write_enable <= '1'; l_out.write_reg <= r.update_reg; From 209aa9ce3f1be930226ae1fabf2eed8b6d7ba302 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 5 Jun 2020 14:22:02 +1000 Subject: [PATCH 12/22] loadstore1: Reduce busy cycles This reduces the number of cycles where loadstore1 asserts its busy output, leading to increased throughput of loads and stores. 
Loads that hit in the cache can now be executed at the rate of one every two cycles. Stores take 4 cycles assuming the wishbone slave responds with an ack the cycle after we assert strobe. To achieve this, the state machine code is split into two parts, one for when we have an existing instruction in progress, and one for starting a new instruction. We can now combinatorially clear busy and start a new instruction in the same cycle that we get a done signal from the dcache; in other words we are completing one instruction and potentially writing back results in the same cycle that we start a new instruction and send its address and data to the dcache. Signed-off-by: Paul Mackerras --- execute1.vhdl | 13 +-- loadstore1.vhdl | 245 +++++++++++++++++++++++------------------------- 2 files changed, 123 insertions(+), 135 deletions(-) diff --git a/execute1.vhdl b/execute1.vhdl index c234725..edbeaaa 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -60,14 +60,14 @@ architecture behaviour of execute1 is slow_op_rc : std_ulogic; slow_op_oe : std_ulogic; slow_op_xerc : xer_common_t; - ldst_nia : std_ulogic_vector(63 downto 0); + last_nia : std_ulogic_vector(63 downto 0); log_addr_spr : std_ulogic_vector(31 downto 0); end record; constant reg_type_init : reg_type := (e => Execute1ToWritebackInit, busy => '0', lr_update => '0', terminate => '0', mul_in_progress => '0', div_in_progress => '0', cntz_in_progress => '0', slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init, - next_lr => (others => '0'), ldst_nia => (others => '0'), others => (others => '0')); + next_lr => (others => '0'), last_nia => (others => '0'), others => (others => '0')); signal r, rin : reg_type; @@ -455,6 +455,9 @@ begin v.e.exc_write_enable := '0'; v.e.exc_write_reg := fast_spr_num(SPR_SRR0); v.e.exc_write_data := e_in.nia; + if valid_in = '1' then + v.last_nia := e_in.nia; + end if; if ctrl.irq_state = WRITE_SRR1 then v.e.exc_write_reg := fast_spr_num(SPR_SRR1); @@ 
-921,8 +924,6 @@ begin elsif valid_in = '1' then -- instruction for other units, i.e. LDST - v.ldst_nia := e_in.nia; - v.e.valid := '0'; if e_in.unit = LDST then lv.valid := '1'; end if; @@ -1026,8 +1027,8 @@ begin end if; v.e.exc_write_enable := '1'; v.e.exc_write_reg := fast_spr_num(SPR_SRR0); - v.e.exc_write_data := r.ldst_nia; - report "ldst exception writing srr0=" & to_hstring(r.ldst_nia); + v.e.exc_write_data := r.last_nia; + report "ldst exception writing srr0=" & to_hstring(r.last_nia); ctrl_tmp.irq_state <= WRITE_SRR1; end if; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index e2f3248..cf00987 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -47,7 +47,6 @@ architecture behave of loadstore1 is ); type reg_stage_t is record - busy : std_ulogic; -- latch most of the input request load : std_ulogic; tlbie : std_ulogic; @@ -126,7 +125,6 @@ begin if rising_edge(clk) then if rst = '1' then r.state <= IDLE; - r.busy <= '0'; else r <= rin; end if; @@ -143,7 +141,7 @@ begin variable long_sel : std_ulogic_vector(15 downto 0); variable byte_sel : std_ulogic_vector(7 downto 0); variable req : std_ulogic; - variable stall : std_ulogic; + variable busy : std_ulogic; variable addr : std_ulogic_vector(63 downto 0); variable wdata : std_ulogic_vector(63 downto 0); variable write_enable : std_ulogic; @@ -165,15 +163,12 @@ begin begin v := r; req := '0'; - stall := '0'; - done := '0'; byte_sel := (others => '0'); addr := lsu_sum; v.mfspr := '0'; mmu_mtspr := '0'; itlb_fault := '0'; sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10)); - exception := '0'; dsisr := (others => '0'); mmureq := '0'; @@ -232,130 +227,18 @@ begin -- compute (addr + 8) & ~7 for the second doubleword when unaligned next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; + done := '0'; + exception := '0'; case r.state is when IDLE => - if l_in.valid = '1' then - v.addr := lsu_sum; - v.load := '0'; - v.dcbz := '0'; - v.tlbie := '0'; - v.instr_fault := '0'; - 
v.dwords_done := '0'; - case l_in.op is - when OP_STORE => - req := '1'; - when OP_LOAD => - req := '1'; - v.load := '1'; - when OP_DCBZ => - req := '1'; - v.dcbz := '1'; - when OP_TLBIE => - mmureq := '1'; - stall := '1'; - v.tlbie := '1'; - v.state := TLBIE_WAIT; - when OP_MFSPR => - v.mfspr := '1'; - -- partial decode on SPR number should be adequate given - -- the restricted set that get sent down this path - if sprn(9) = '0' and sprn(5) = '0' then - if sprn(0) = '0' then - v.sprval := x"00000000" & r.dsisr; - else - v.sprval := r.dar; - end if; - else - -- reading one of the SPRs in the MMU - v.sprval := m_in.sprval; - end if; - v.state := SPR_CMPLT; - when OP_MTSPR => - if sprn(9) = '0' and sprn(5) = '0' then - if sprn(0) = '0' then - v.dsisr := l_in.data(31 downto 0); - else - v.dar := l_in.data; - end if; - v.state := SPR_CMPLT; - else - -- writing one of the SPRs in the MMU - mmu_mtspr := '1'; - stall := '1'; - v.state := TLBIE_WAIT; - end if; - when OP_FETCH_FAILED => - -- send it to the MMU to do the radix walk - addr := l_in.nia; - v.addr := l_in.nia; - v.instr_fault := '1'; - mmureq := '1'; - stall := '1'; - v.state := MMU_LOOKUP; - when others => - assert false report "unknown op sent to loadstore1"; - end case; - - v.write_reg := l_in.write_reg; - v.length := l_in.length; - v.byte_reverse := l_in.byte_reverse; - v.sign_extend := l_in.sign_extend; - v.update := l_in.update; - v.update_reg := l_in.update_reg; - v.xerc := l_in.xerc; - v.reserve := l_in.reserve; - v.rc := l_in.rc; - v.nc := l_in.ci; - v.virt_mode := l_in.virt_mode; - v.priv_mode := l_in.priv_mode; - - -- XXX Temporary hack. Mark the op as non-cachable if the address - -- is the form 0xc------- for a real-mode access. - -- - -- This will have to be replaced by a combination of implementing the - -- proper HV CI load/store instructions and having an MMU to get the I - -- bit otherwise. 
- if lsu_sum(31 downto 28) = "1100" and l_in.virt_mode = '0' then - v.nc := '1'; - end if; - - -- Do length_to_sel and work out if we are doing 2 dwords - long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0)); - byte_sel := long_sel(7 downto 0); - v.first_bytes := byte_sel; - v.second_bytes := long_sel(15 downto 8); - - -- Do byte reversing and rotating for stores in the first cycle - byte_offset := unsigned(lsu_sum(2 downto 0)); - brev_lenm1 := "000"; - if l_in.byte_reverse = '1' then - brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; - end if; - for i in 0 to 7 loop - k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; - j := to_integer(k) * 8; - v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); - end loop; - - if req = '1' then - stall := '1'; - if long_sel(15 downto 8) = "00000000" then - v.state := ACK_WAIT; - else - v.state := SECOND_REQ; - end if; - end if; - end if; when SECOND_REQ => addr := next_addr; byte_sel := r.second_bytes; req := '1'; - stall := '1'; v.state := ACK_WAIT; when ACK_WAIT => - stall := '1'; if d_in.valid = '1' then if d_in.error = '1' then -- dcache will discard the second request if it @@ -393,7 +276,6 @@ begin else -- stores write back rA update in this cycle do_update := r.update; - stall := '0'; done := '1'; v.state := IDLE; end if; @@ -402,7 +284,6 @@ begin end if; when MMU_LOOKUP => - stall := '1'; if r.dwords_done = '1' then addr := next_addr; byte_sel := r.second_bytes; @@ -423,7 +304,6 @@ begin end if; else -- nothing to do, the icache retries automatically - stall := '0'; done := '1'; v.state := IDLE; end if; @@ -439,10 +319,8 @@ begin end if; when TLBIE_WAIT => - stall := '1'; if m_in.done = '1' then -- tlbie is finished - stall := '0'; done := '1'; v.state := IDLE; end if; @@ -458,6 +336,117 @@ begin end case; + busy := '1'; + if r.state = IDLE or done = '1' then + busy := '0'; + end if; + + -- Note that l_in.valid is gated with busy inside execute1 + if l_in.valid = '1' then + v.addr := lsu_sum; 
+ v.load := '0'; + v.dcbz := '0'; + v.tlbie := '0'; + v.instr_fault := '0'; + v.dwords_done := '0'; + v.write_reg := l_in.write_reg; + v.length := l_in.length; + v.byte_reverse := l_in.byte_reverse; + v.sign_extend := l_in.sign_extend; + v.update := l_in.update; + v.update_reg := l_in.update_reg; + v.xerc := l_in.xerc; + v.reserve := l_in.reserve; + v.rc := l_in.rc; + v.nc := l_in.ci; + v.virt_mode := l_in.virt_mode; + v.priv_mode := l_in.priv_mode; + + -- XXX Temporary hack. Mark the op as non-cachable if the address + -- is the form 0xc------- for a real-mode access. + if lsu_sum(31 downto 28) = "1100" and l_in.virt_mode = '0' then + v.nc := '1'; + end if; + + -- Do length_to_sel and work out if we are doing 2 dwords + long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0)); + byte_sel := long_sel(7 downto 0); + v.first_bytes := byte_sel; + v.second_bytes := long_sel(15 downto 8); + + -- Do byte reversing and rotating for stores in the first cycle + byte_offset := unsigned(lsu_sum(2 downto 0)); + brev_lenm1 := "000"; + if l_in.byte_reverse = '1' then + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + end if; + for i in 0 to 7 loop + k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; + j := to_integer(k) * 8; + v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); + end loop; + + case l_in.op is + when OP_STORE => + req := '1'; + when OP_LOAD => + req := '1'; + v.load := '1'; + when OP_DCBZ => + req := '1'; + v.dcbz := '1'; + when OP_TLBIE => + mmureq := '1'; + v.tlbie := '1'; + v.state := TLBIE_WAIT; + when OP_MFSPR => + v.mfspr := '1'; + -- partial decode on SPR number should be adequate given + -- the restricted set that get sent down this path + if sprn(9) = '0' and sprn(5) = '0' then + if sprn(0) = '0' then + v.sprval := x"00000000" & r.dsisr; + else + v.sprval := r.dar; + end if; + else + -- reading one of the SPRs in the MMU + v.sprval := m_in.sprval; + end if; + v.state := SPR_CMPLT; + when OP_MTSPR => + if sprn(9) = '0' and 
sprn(5) = '0' then + if sprn(0) = '0' then + v.dsisr := l_in.data(31 downto 0); + else + v.dar := l_in.data; + end if; + v.state := SPR_CMPLT; + else + -- writing one of the SPRs in the MMU + mmu_mtspr := '1'; + v.state := TLBIE_WAIT; + end if; + when OP_FETCH_FAILED => + -- send it to the MMU to do the radix walk + addr := l_in.nia; + v.addr := l_in.nia; + v.instr_fault := '1'; + mmureq := '1'; + v.state := MMU_LOOKUP; + when others => + assert false report "unknown op sent to loadstore1"; + end case; + + if req = '1' then + if long_sel(15 downto 8) = "00000000" then + v.state := ACK_WAIT; + else + v.state := SECOND_REQ; + end if; + end if; + end if; + -- Update outputs to dcache d_out.valid <= req; d_out.load <= v.load; @@ -504,7 +493,7 @@ begin l_out.store_done <= d_in.store_done; -- update exception info back to execute1 - e_out.busy <= r.busy; + e_out.busy <= busy; e_out.exception <= exception; e_out.instr_fault <= r.instr_fault; e_out.invalid <= m_in.invalid; @@ -519,8 +508,6 @@ begin end if; end if; - v.busy := stall; - -- Update registers rin <= v; @@ -529,7 +516,7 @@ begin ls1_log: process(clk) begin if rising_edge(clk) then - log_data <= r.busy & + log_data <= e_out.busy & e_out.exception & l_out.valid & m_out.valid & From 65a36cc0fc65d4a7dbe83b2422b6b819c26d4162 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 12 May 2020 16:28:42 +1000 Subject: [PATCH 13/22] decode: Work out ispr1/ispr2 in parallel with decode ROM lookup This makes the logic that calculates which SPRs are being accessed work in parallel with the instruction decode ROM lookup instead of being dependent on the opcode found in the decode ROM. The reason for doing that is that the path from icache through the decode ROM to the ispr1/ispr2 fields has become a critical path. Thus we are now using only a very partial decode of the instruction word in the logic for ispr1/ispr2, and we therefore can no longer rely on them being zero in all cases where no SPR is being accessed.
Instead, decode2 now ignores ispr1/ispr2 in all cases except when the relevant decode.input_reg_a/b or decode.output_reg_a is set to SPR. Signed-off-by: Paul Mackerras --- decode1.vhdl | 59 ++++++++++++++++++++++++++++++---------------------- decode2.vhdl | 10 ++++----- 2 files changed, 38 insertions(+), 31 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 214285e..ae3e970 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -398,6 +398,17 @@ begin -- major opcode 31, lots of things v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1)))); + -- Work out ispr1/ispr2 independent of v.decode since they seem to be critical path + sprn := decode_spr_num(f_in.insn); + v.ispr1 := fast_spr_num(sprn); + + elsif majorop = "010000" then + -- CTR may be needed as input to bc + v.decode := major_decode_rom_array(to_integer(majorop)); + if f_in.insn(23) = '0' then + v.ispr1 := fast_spr_num(SPR_CTR); + end if; + elsif majorop = "010011" then if decode_op_19_valid(to_integer(unsigned(f_in.insn(10 downto 1)))) = '0' then report "op 19 illegal subcode"; @@ -408,6 +419,27 @@ begin report "op 19 sub " & to_hstring(op_19_bits); end if; + -- Work out ispr1/ispr2 independent of v.decode since they seem to be critical path + if f_in.insn(2) = '0' then + -- Could be OP_BCREG: bclr, bcctr, bctar + -- Branch uses CTR as condition when BO(2) is 0. This is + -- also used to indicate that CTR is modified (they go + -- together). 
+ if f_in.insn(23) = '0' then + v.ispr1 := fast_spr_num(SPR_CTR); + end if; + -- TODO: Add TAR + if f_in.insn(10) = '0' then + v.ispr2 := fast_spr_num(SPR_LR); + else + v.ispr2 := fast_spr_num(SPR_CTR); + end if; + else + -- Could be OP_RFID + v.ispr1 := fast_spr_num(SPR_SRR0); + v.ispr2 := fast_spr_num(SPR_SRR1); + end if; + elsif majorop = "011110" then v.decode := decode_op_30_array(to_integer(unsigned(f_in.insn(4 downto 1)))); @@ -423,30 +455,11 @@ begin else v.decode := major_decode_rom_array(to_integer(majorop)); - end if; - -- Set ISPR1/ISPR2 when needed - if v.decode.insn_type = OP_BC or v.decode.insn_type = OP_BCREG then - -- Branch uses CTR as condition when BO(2) is 0. This is - -- also used to indicate that CTR is modified (they go - -- together). - -- - if f_in.insn(23) = '0' then - v.ispr1 := fast_spr_num(SPR_CTR); - end if; + end if; - -- Branch source register is an SPR - if v.decode.insn_type = OP_BCREG then - -- TODO: Add TAR - if f_in.insn(10) = '0' then - v.ispr2 := fast_spr_num(SPR_LR); - else - v.ispr2 := fast_spr_num(SPR_CTR); - end if; - end if; - elsif v.decode.insn_type = OP_MFSPR or v.decode.insn_type = OP_MTSPR then + if v.decode.insn_type = OP_MFSPR or v.decode.insn_type = OP_MTSPR then sprn := decode_spr_num(f_in.insn); - v.ispr1 := fast_spr_num(sprn); -- Make slow SPRs single issue if is_fast_spr(v.ispr1) = '0' then v.decode.sgl_pipe := '1'; @@ -457,10 +470,6 @@ begin when others => end case; end if; - elsif v.decode.insn_type = OP_RFID then - report "PPC RFID"; - v.ispr1 := fast_spr_num(SPR_SRR0); - v.ispr2 := fast_spr_num(SPR_SRR1); end if; if flush_in = '1' then diff --git a/decode2.vhdl b/decode2.vhdl index 5b8cbc1..6acbca7 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -67,8 +67,6 @@ architecture behaviour of decode2 is return decode_input_reg_t is begin if t = RA or (t = RA_OR_ZERO and insn_ra(insn_in) /= "00000") then - assert is_fast_spr(ispr) = '0' report "Decode A says GPR but ISPR says SPR:" & - to_hstring(ispr) severity 
failure; return ('1', gpr_to_gspr(insn_ra(insn_in)), reg_data); elsif t = SPR then -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. @@ -93,8 +91,6 @@ architecture behaviour of decode2 is begin case t is when RB => - assert is_fast_spr(ispr) = '0' report "Decode B says GPR but ISPR says SPR:" & - to_hstring(ispr) severity failure; ret := ('1', gpr_to_gspr(insn_rb(insn_in)), reg_data); when CONST_UI => ret := ('0', (others => '0'), std_ulogic_vector(resize(unsigned(insn_ui(insn_in)), 64))); @@ -277,8 +273,10 @@ begin end if; end process; - r_out.read1_reg <= gpr_or_spr_to_gspr(insn_ra(d_in.insn), d_in.ispr1); - r_out.read2_reg <= gpr_or_spr_to_gspr(insn_rb(d_in.insn), d_in.ispr2); + r_out.read1_reg <= d_in.ispr1 when d_in.decode.input_reg_a = SPR + else gpr_to_gspr(insn_ra(d_in.insn)); + r_out.read2_reg <= d_in.ispr2 when d_in.decode.input_reg_b = SPR + else gpr_to_gspr(insn_rb(d_in.insn)); r_out.read3_reg <= insn_rs(d_in.insn); c_out.read <= d_in.decode.input_cr; From b5959632332cefbcb12537580710ba706fc79cf5 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 Jun 2020 14:23:50 +1000 Subject: [PATCH 14/22] dcache: Reduce latencies and improve timing This implements various improvements to the dcache with the aim of making it go faster. - We can now execute operations that don't need to access main memory (cacheable loads that hit in the cache and TLB operations) as soon as any previous operation has completed, without waiting for the state machine to become idle. - Cache line refills start with the doubleword that is needed to satisfy the load that initiated them. - Cacheable loads that miss return their data and complete as soon as the requested doubleword comes back from memory; they don't wait for the refill to finish. 
- We now have per-doubleword valid bits for the cache line being refilled, meaning that if a load comes in for a line that is in the process of being refilled, we can return the data and complete it within a couple of cycles of the doubleword coming in from memory. - There is now a bypass path for data being written to the cache RAM so that we can do a store hit followed immediately by a load hit to the same doubleword. This also makes the data from a refill available to load hits one cycle earlier than it would be otherwise. - Stores complete in the cycle where their wishbone operation is initiated, without waiting for the wishbone cycle to complete. - During the wishbone cycle for a store, if another store comes in that is to the same page, and we don't have a stall from the wishbone, we can send out the write for the second store in the same wishbone cycle and without going through the IDLE state first. We limit it to 7 outstanding writes that have not yet been acknowledged. - The cache tag RAM is now read on a clock edge rather than being combinatorial for reading. Its width is rounded up to a multiple of 8 bits per way so that byte enables can be used for writing individual tags. - The cache tag RAM is now written a cycle later than previously, in order to ease timing. - Data for a store hit is now written one cycle later than previously. This eases timing since we don't have to get through the tag matching and on to the write enable within a single cycle. The 2-stage bypass path means we can still handle a load hit on either of the two cycles after the store and return the correct data. (A load hit 3 or more cycles later will get the correct data from the BRAM.) - Operations can sit in r0 while there is an uncompleted operation in r1. Once the operation in r1 is completed, the operation in r0 spends one cycle in r0 for TLB/cache tag lookup and then gets put into r1.req. This can happen before r1 gets to the IDLE state. 
Some operations can then be completed before r1 gets to the IDLE state - a load miss to the cache line being refilled, or a store to the same page as a previous store. Signed-off-by: Paul Mackerras --- dcache.vhdl | 705 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 435 insertions(+), 270 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 7a8c0ba..bc351b0 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -83,6 +83,8 @@ architecture rtl of dcache is constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS; -- TAG_BITS is the number of bits of the tag part of the address constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS; + -- TAG_WIDTH is the width in bits of each way of the tag RAM + constant TAG_WIDTH : natural := TAG_BITS + 7 - ((TAG_BITS + 7) mod 8); -- WAY_BITS is the number of bits to select a way constant WAY_BITS : natural := log2(NUM_WAYS); @@ -100,6 +102,7 @@ architecture rtl of dcache is subtype row_t is integer range 0 to BRAM_ROWS-1; subtype index_t is integer range 0 to NUM_LINES-1; subtype way_t is integer range 0 to NUM_WAYS-1; + subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0); -- The cache data BRAM organized as described above for each way subtype cache_row_t is std_ulogic_vector(wishbone_data_bits-1 downto 0); @@ -110,17 +113,19 @@ architecture rtl of dcache is subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0); -- type cache_tags_set_t is array(way_t) of cache_tag_t; -- type cache_tags_array_t is array(index_t) of cache_tags_set_t; - constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS; + constant TAG_RAM_WIDTH : natural := TAG_WIDTH * NUM_WAYS; subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0); type cache_tags_array_t is array(index_t) of cache_tags_set_t; -- The cache valid bits subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0); type cache_valids_t is array(index_t) of cache_way_valids_t; + type row_per_line_valid_t is array(0 to 
ROW_PER_LINE - 1) of std_ulogic; -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs - signal cache_tags : cache_tags_array_t; - signal cache_valids : cache_valids_t; + signal cache_tags : cache_tags_array_t; + signal cache_tag_set : cache_tags_set_t; + signal cache_valids : cache_valids_t; attribute ram_style : string; attribute ram_style of cache_tags : signal is "distributed"; @@ -177,18 +182,17 @@ architecture rtl of dcache is -- Type of operation on a "valid" input type op_t is (OP_NONE, + OP_BAD, -- NC cache hit, TLB miss, prot/RC failure + OP_STCX_FAIL, -- conditional store w/o reservation OP_LOAD_HIT, -- Cache hit on load OP_LOAD_MISS, -- Load missing cache OP_LOAD_NC, -- Non-cachable load - OP_BAD, -- BAD: Cache hit on NC load/store - OP_TLB_ERR, -- TLB miss or protection/RC failure OP_STORE_HIT, -- Store hitting cache OP_STORE_MISS); -- Store missing cache -- Cache state machine type state_t is (IDLE, -- Normal load hit processing RELOAD_WAIT_ACK, -- Cache reload wait ack - FINISH_LD_MISS, -- Extra cycle after load miss STORE_WAIT_ACK, -- Store wait ack NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack @@ -218,38 +222,63 @@ architecture rtl of dcache is end record; signal r0 : reg_stage_0_t; - signal r0_valid : std_ulogic; - + signal r0_full : std_ulogic; + + type mem_access_request_t is record + op : op_t; + dcbz : std_ulogic; + real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); + data : std_ulogic_vector(63 downto 0); + byte_sel : std_ulogic_vector(7 downto 0); + hit_way : way_t; + repl_way : way_t; + end record; + -- First stage register, contains state for stage 1 of load hits -- and for the state machine used by all other operations -- type reg_stage_1_t is record - -- Latch the complete request from ls1 - req : Loadstore1ToDcacheType; - mmu_req : std_ulogic; + -- Info about the request + full : std_ulogic; -- have uncompleted request + mmu_req : std_ulogic; -- request is from MMU + req : mem_access_request_t; -- Cache hit state 
hit_way : way_t; hit_load_valid : std_ulogic; - -- Data buffer for "slow" read ops (load miss and NC loads). - slow_data : std_ulogic_vector(63 downto 0); - slow_valid : std_ulogic; - - -- Signal to complete a failed stcx. - stcx_fail : std_ulogic; + -- 2-stage data buffer for data forwarded from writes to reads + forward_data1 : std_ulogic_vector(63 downto 0); + forward_data2 : std_ulogic_vector(63 downto 0); + forward_sel1 : std_ulogic_vector(7 downto 0); + forward_valid1 : std_ulogic; + forward_way1 : way_t; + forward_row1 : row_t; + use_forward1 : std_ulogic; + forward_sel : std_ulogic_vector(7 downto 0); -- Cache miss state (reload state machine) state : state_t; + dcbz : std_ulogic; + write_bram : std_ulogic; + write_tag : std_ulogic; + slow_valid : std_ulogic; wb : wishbone_master_out; + reload_tag : cache_tag_t; store_way : way_t; store_row : row_t; store_index : index_t; + end_row_ix : row_in_line_t; + rows_valid : row_per_line_valid_t; + acks_pending : unsigned(2 downto 0); -- Signals to complete with error error_done : std_ulogic; cache_paradox : std_ulogic; + -- Signal to complete a failed stcx. 
+ stcx_fail : std_ulogic; + -- completion signal for tlbie tlbie_done : std_ulogic; end record; @@ -272,7 +301,6 @@ architecture rtl of dcache is signal req_tag : cache_tag_t; signal req_op : op_t; signal req_data : std_ulogic_vector(63 downto 0); - signal req_laddr : std_ulogic_vector(63 downto 0); signal early_req_row : row_t; @@ -280,6 +308,12 @@ architecture rtl of dcache is signal set_rsrv : std_ulogic; signal clear_rsrv : std_ulogic; + signal r0_valid : std_ulogic; + signal r0_stall : std_ulogic; + + signal use_forward1_next : std_ulogic; + signal use_forward2_next : std_ulogic; + -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; signal cache_out : cache_ram_out_t; @@ -305,6 +339,7 @@ architecture rtl of dcache is signal perm_attr : perm_attr_t; signal rc_ok : std_ulogic; signal perm_ok : std_ulogic; + signal access_ok : std_ulogic; -- TLB PLRU output interface type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0); @@ -315,31 +350,35 @@ architecture rtl of dcache is -- -- Return the cache line index (tag index) for an address - function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is + function get_index(addr: std_ulogic_vector) return index_t is begin return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS))); end; -- Return the cache row index (data memory) for an address - function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is + function get_row(addr: std_ulogic_vector) return row_t is begin return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS))); end; + -- Return the index of a row within a line + function get_row_of_line(row: row_t) return row_in_line_t is + variable row_v : unsigned(ROW_BITS-1 downto 0); + begin + row_v := to_unsigned(row, ROW_BITS); + return row_v(ROW_LINEBITS-1 downto 0); + end; + -- Returns whether this is the last row of a line - function is_last_row_addr(addr: wishbone_addr_type) return boolean is - constant 
ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); + function is_last_row_addr(addr: wishbone_addr_type; last: row_in_line_t) return boolean is begin - return addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = ones; + return unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last; end; -- Returns whether this is the last row of a line - function is_last_row(row: row_t) return boolean is - variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0); - constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); + function is_last_row(row: row_t; last: row_in_line_t) return boolean is begin - row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS)); - return row_v(ROW_LINEBITS-1 downto 0) = ones; + return get_row_of_line(row) = last; end; -- Return the address of the next row in the current cache line @@ -371,7 +410,7 @@ architecture rtl of dcache is end; -- Get the tag value from the address - function get_tag(addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)) return cache_tag_t is + function get_tag(addr: std_ulogic_vector) return cache_tag_t is begin return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS); end; @@ -379,14 +418,7 @@ architecture rtl of dcache is -- Read a tag from a tag memory row function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is begin - return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS); - end; - - -- Write a tag to tag memory row - procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t; - tag: cache_tag_t) is - begin - tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; + return tagset(way * TAG_WIDTH + TAG_BITS - 1 downto way * TAG_WIDTH); end; -- Read a TLB tag from a TLB tag memory row @@ -439,38 +471,42 @@ begin report "geometry bits don't add up" severity FAILURE; assert (64 = wishbone_data_bits) report "Can't yet handle a wishbone width that isn't 64-bits" severity FAILURE; + assert SET_SIZE_BITS <= TLB_LG_PGSZ report "Set indexed by virtual address" 
severity FAILURE; -- Latch the request in r0.req as long as we're not stalling stage_0 : process(clk) + variable r : reg_stage_0_t; begin if rising_edge(clk) then + assert (d_in.valid and m_in.valid) = '0' report + "request collision loadstore vs MMU"; + if m_in.valid = '1' then + r.req.valid := '1'; + r.req.load := not (m_in.tlbie or m_in.tlbld); + r.req.dcbz := '0'; + r.req.nc := '0'; + r.req.reserve := '0'; + r.req.virt_mode := '0'; + r.req.priv_mode := '1'; + r.req.addr := m_in.addr; + r.req.data := m_in.pte; + r.req.byte_sel := (others => '1'); + r.tlbie := m_in.tlbie; + r.doall := m_in.doall; + r.tlbld := m_in.tlbld; + r.mmu_req := '1'; + else + r.req := d_in; + r.tlbie := '0'; + r.doall := '0'; + r.tlbld := '0'; + r.mmu_req := '0'; + end if; if rst = '1' then - r0.req.valid <= '0'; - elsif stall_out = '0' then - assert (d_in.valid and m_in.valid) = '0' report - "request collision loadstore vs MMU"; - if m_in.valid = '1' then - r0.req.valid <= '1'; - r0.req.load <= not (m_in.tlbie or m_in.tlbld); - r0.req.dcbz <= '0'; - r0.req.nc <= '0'; - r0.req.reserve <= '0'; - r0.req.virt_mode <= '0'; - r0.req.priv_mode <= '1'; - r0.req.addr <= m_in.addr; - r0.req.data <= m_in.pte; - r0.req.byte_sel <= (others => '1'); - r0.tlbie <= m_in.tlbie; - r0.doall <= m_in.doall; - r0.tlbld <= m_in.tlbld; - r0.mmu_req <= '1'; - else - r0.req <= d_in; - r0.tlbie <= '0'; - r0.doall <= '0'; - r0.tlbld <= '0'; - r0.mmu_req <= '0'; - end if; + r0_full <= '0'; + elsif r1.full = '0' or r0_full = '0' then + r0 <= r; + r0_full <= r.req.valid; end if; end if; end process; @@ -478,9 +514,10 @@ begin -- we don't yet handle collisions between loadstore1 requests and MMU requests m_out.stall <= '0'; - -- Hold off the request in r0 when stalling, - -- and cancel it if we get an error in a previous request. 
- r0_valid <= r0.req.valid and not stall_out and not r1.error_done; + -- Hold off the request in r0 when r1 has an uncompleted request + r0_stall <= r0_full and r1.full; + r0_valid <= r0_full and not r1.full; + stall_out <= r0_stall; -- TLB -- Operates in the second cycle on the request latched in r0.req. @@ -490,20 +527,19 @@ begin variable addrbits : std_ulogic_vector(TLB_SET_BITS - 1 downto 0); begin if rising_edge(clk) then - if stall_out = '1' then - -- keep reading the same thing while stalled - index := tlb_req_index; + if m_in.valid = '1' then + addrbits := m_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); else - if m_in.valid = '1' then - addrbits := m_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); - else - addrbits := d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); - end if; - index := to_integer(unsigned(addrbits)); + addrbits := d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); + end if; + index := to_integer(unsigned(addrbits)); + -- If we have any op and the previous op isn't finished, + -- then keep the same output for next cycle. 
+ if r0_stall = '0' then + tlb_valid_way <= dtlb_valids(index); + tlb_tag_way <= dtlb_tags(index); + tlb_pte_way <= dtlb_ptes(index); end if; - tlb_valid_way <= dtlb_valids(index); - tlb_tag_way <= dtlb_tags(index); - tlb_pte_way <= dtlb_ptes(index); end if; end process; @@ -569,10 +605,12 @@ begin valid_ra <= tlb_hit or not r0.req.virt_mode; if r0.req.virt_mode = '1' then ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & - r0.req.addr(TLB_LG_PGSZ - 1 downto 0); + r0.req.addr(TLB_LG_PGSZ - 1 downto ROW_OFF_BITS) & + (ROW_OFF_BITS-1 downto 0 => '0'); perm_attr <= extract_perm_attr(pte); else - ra <= r0.req.addr(REAL_ADDR_BITS - 1 downto 0); + ra <= r0.req.addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) & + (ROW_OFF_BITS-1 downto 0 => '0'); perm_attr <= real_mode_perm_attr; end if; end process; @@ -652,35 +690,45 @@ begin end generate; end generate; + -- Cache tag RAM read port + cache_tag_read : process(clk) + variable index : index_t; + begin + if rising_edge(clk) then + if r0_stall = '1' then + index := req_index; + elsif m_in.valid = '1' then + index := get_index(m_in.addr); + else + index := get_index(d_in.addr); + end if; + cache_tag_set <= cache_tags(index); + end if; + end process; + -- Cache request parsing and hit detection dcache_request : process(all) - variable is_hit : std_ulogic; - variable hit_way : way_t; - variable op : op_t; - variable opsel : std_ulogic_vector(2 downto 0); - variable go : std_ulogic; - variable nc : std_ulogic; - variable s_hit : std_ulogic; - variable s_tag : cache_tag_t; - variable s_pte : tlb_pte_t; - variable s_ra : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); - variable hit_set : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); + variable is_hit : std_ulogic; + variable hit_way : way_t; + variable op : op_t; + variable opsel : std_ulogic_vector(2 downto 0); + variable go : std_ulogic; + variable nc : std_ulogic; + variable s_hit : std_ulogic; + variable s_tag : cache_tag_t; + variable s_pte : tlb_pte_t; + variable s_ra : 
std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); + variable hit_set : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); variable hit_way_set : hit_way_set_t; + variable rel_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); + variable rel_match : std_ulogic; begin -- Extract line, row and tag from request req_index <= get_index(r0.req.addr); req_row <= get_row(r0.req.addr); req_tag <= get_tag(ra); - -- Only do anything if not being stalled by stage 1 - go := r0_valid and not (r0.tlbie or r0.tlbld); - - -- Calculate address of beginning of cache line, will be - -- used for cache miss processing if needed - -- - req_laddr <= (63 downto REAL_ADDR_BITS => '0') & - ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) & - (LINE_OFF_BITS-1 downto 0 => '0'); + go := r0_valid and not (r0.tlbie or r0.tlbld) and not r1.error_done; -- Test if pending request is a hit on any way -- In order to make timing in virtual mode, when we are using the TLB, @@ -688,7 +736,9 @@ begin -- the TLB, and then decide later which match to use. 
hit_way := 0; is_hit := '0'; + rel_match := '0'; if r0.req.virt_mode = '1' then + rel_matches := (others => '0'); for j in tlb_way_t loop hit_way_set(j) := 0; s_hit := '0'; @@ -698,27 +748,61 @@ begin s_tag := get_tag(s_ra); for i in way_t loop if go = '1' and cache_valids(req_index)(i) = '1' and - read_tag(i, cache_tags(req_index)) = s_tag and + read_tag(i, cache_tag_set) = s_tag and tlb_valid_way(j) = '1' then hit_way_set(j) := i; s_hit := '1'; end if; end loop; hit_set(j) := s_hit; + if s_tag = r1.reload_tag then + rel_matches(j) := '1'; + end if; end loop; if tlb_hit = '1' then is_hit := hit_set(tlb_hit_way); hit_way := hit_way_set(tlb_hit_way); + rel_match := rel_matches(tlb_hit_way); end if; else - s_tag := get_tag(r0.req.addr(REAL_ADDR_BITS - 1 downto 0)); + s_tag := get_tag(r0.req.addr); for i in way_t loop if go = '1' and cache_valids(req_index)(i) = '1' and - read_tag(i, cache_tags(req_index)) = s_tag then + read_tag(i, cache_tag_set) = s_tag then hit_way := i; is_hit := '1'; end if; end loop; + if s_tag = r1.reload_tag then + rel_match := '1'; + end if; + end if; + + -- See if the request matches the line currently being reloaded + if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index and + rel_match = '1' then + -- For a store, consider this a hit even if the row isn't valid + -- since it will be by the time we perform the store. + -- For a load, check the appropriate row valid bit. + is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE); + hit_way := r1.store_way; + end if; + + -- Whether to use forwarded data for a load or not + use_forward1_next <= '0'; + if get_row(r1.req.real_addr) = req_row and r1.req.hit_way = hit_way then + -- Only need to consider r1.write_bram here, since if we are + -- writing refill data here, then we don't have a cache hit this + -- cycle on the line being refilled. 
(There is the possibility + -- that the load following the load miss that started the refill + -- could be to the old contents of the victim line, since it is a + -- couple of cycles after the refill starts before we see the + -- updated cache tag. In that case we don't use the bypass.) + use_forward1_next <= r1.write_bram; + end if; + use_forward2_next <= '0'; + if r1.forward_row1 = req_row and r1.forward_way1 = hit_way then + use_forward2_next <= r1.forward_valid1; end if; -- The way that matched on a hit @@ -732,6 +816,7 @@ begin rc_ok <= perm_attr.reference and (r0.req.load or perm_attr.changed); perm_ok <= (r0.req.priv_mode or not perm_attr.priv) and (perm_attr.wr_perm or (r0.req.load and perm_attr.rd_perm)); + access_ok <= valid_ra and perm_ok and rc_ok; -- Combine the request and cache hit status to decide what -- operation needs to be done @@ -739,7 +824,11 @@ begin nc := r0.req.nc or perm_attr.nocache; op := OP_NONE; if go = '1' then - if valid_ra = '1' and rc_ok = '1' and perm_ok = '1' then + if access_ok = '0' then + op := OP_BAD; + elsif cancel_store = '1' then + op := OP_STCX_FAIL; + else opsel := r0.req.load & nc & is_hit; case opsel is when "101" => op := OP_LOAD_HIT; @@ -752,8 +841,6 @@ begin when "111" => op := OP_BAD; when others => op := OP_NONE; end case; - else - op := OP_TLB_ERR; end if; end if; req_op <= op; @@ -762,7 +849,7 @@ begin -- in the cases where we need to read the cache data BRAM. -- If we're stalling then we need to keep reading the last -- row requested. 
- if stall_out = '0' then + if r0_stall = '0' then if m_in.valid = '1' then early_req_row <= get_row(m_in.addr); else @@ -776,9 +863,6 @@ begin -- Wire up wishbone request latch out of stage 1 wishbone_out <= r1.wb; - -- Generate stalls from stage 1 state machine - stall_out <= '1' when r1.state /= IDLE else '0'; - -- Handle load-with-reservation and store-conditional instructions reservation_comb: process(all) begin @@ -805,11 +889,15 @@ begin reservation_reg: process(clk) begin if rising_edge(clk) then - if rst = '1' or clear_rsrv = '1' then + if rst = '1' then reservation.valid <= '0'; - elsif set_rsrv = '1' then - reservation.valid <= '1'; - reservation.addr <= r0.req.addr(63 downto LINE_OFF_BITS); + elsif r0_valid = '1' and access_ok = '1' then + if clear_rsrv = '1' then + reservation.valid <= '0'; + elsif set_rsrv = '1' then + reservation.valid <= '1'; + reservation.addr <= r0.req.addr(63 downto LINE_OFF_BITS); + end if; end if; end if; end process; @@ -817,11 +905,28 @@ begin -- Return data for loads & completion control logic -- writeback_control: process(all) + variable data_out : std_ulogic_vector(63 downto 0); + variable data_fwd : std_ulogic_vector(63 downto 0); + variable j : integer; begin + -- Use the bypass if are reading the row that was written 1 or 2 cycles + -- ago, including for the slow_valid = 1 case (i.e. completing a load + -- miss or a non-cacheable load). + if r1.use_forward1 = '1' then + data_fwd := r1.forward_data1; + else + data_fwd := r1.forward_data2; + end if; + data_out := cache_out(r1.hit_way); + for i in 0 to 7 loop + j := i * 8; + if r1.forward_sel(i) = '1' then + data_out(j + 7 downto j) := data_fwd(j + 7 downto j); + end if; + end loop; - -- The mux on d_out.data defaults to the normal load hit case. 
d_out.valid <= '0'; - d_out.data <= cache_out(r1.hit_way); + d_out.data <= data_out; d_out.store_done <= '0'; d_out.error <= '0'; d_out.cache_paradox <= '0'; @@ -829,7 +934,7 @@ begin -- Outputs to MMU m_out.done <= r1.tlbie_done; m_out.err <= '0'; - m_out.data <= cache_out(r1.hit_way); + m_out.data <= data_out; -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store @@ -853,7 +958,7 @@ begin -- Request came from loadstore1... -- Load hit case is the standard path if r1.hit_load_valid = '1' then - report "completing load hit"; + report "completing load hit data=" & to_hstring(data_out); d_out.valid <= '1'; end if; @@ -867,16 +972,8 @@ begin -- Slow ops (load miss, NC, stores) if r1.slow_valid = '1' then - -- If it's a load, enable register writeback and switch - -- mux accordingly - -- - if r1.req.load then - -- Read data comes from the slow data latch - d_out.data <= r1.slow_data; - end if; d_out.store_done <= '1'; - - report "completing store or load miss"; + report "completing store or load miss data=" & to_hstring(data_out); d_out.valid <= '1'; end if; @@ -901,8 +998,6 @@ begin -- Slow ops (i.e. load miss) if r1.slow_valid = '1' then - -- Read data comes from the slow data latch - m_out.data <= r1.slow_data; report "completing MMU load miss, data=" & to_hstring(m_out.data); m_out.done <= '1'; end if; @@ -946,8 +1041,6 @@ begin wr_data => wr_data ); process(all) - variable tmp_adr : std_ulogic_vector(63 downto 0); - variable reloading : boolean; begin -- Cache hit reads do_read <= '1'; @@ -959,43 +1052,40 @@ begin -- Defaults to wishbone read responses (cache refill), -- -- For timing, the mux on wr_data/sel/addr is not dependent on anything - -- other than the current state. Only the do_write signal is. + -- other than the current state. 
-- - if r1.state = IDLE then - -- In IDLE state, the only write path is the store-hit update case - wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); - wr_data <= r0.req.data; - wr_sel <= r0.req.byte_sel; + wr_sel_m <= (others => '0'); + + do_write <= '0'; + if r1.write_bram = '1' then + -- Write store data to BRAM. This happens one cycle after the + -- store is in r0. + wr_data <= r1.req.data; + wr_sel <= r1.req.byte_sel; + wr_addr <= std_ulogic_vector(to_unsigned(get_row(r1.req.real_addr), ROW_BITS)); + if i = r1.req.hit_way then + do_write <= '1'; + end if; else -- Otherwise, we might be doing a reload or a DCBZ - if r1.req.dcbz = '1' then + if r1.dcbz = '1' then wr_data <= (others => '0'); else wr_data <= wishbone_in.dat; end if; - wr_sel <= (others => '1'); - wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); - end if; + wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); + wr_sel <= (others => '1'); - -- The two actual write cases here - do_write <= '0'; - reloading := r1.state = RELOAD_WAIT_ACK; - if reloading and wishbone_in.ack = '1' and r1.store_way = i then - do_write <= '1'; - end if; - if req_op = OP_STORE_HIT and req_hit_way = i and cancel_store = '0' and - r0.req.dcbz = '0' then - assert not reloading report "Store hit while in state:" & - state_t'image(r1.state) - severity FAILURE; - do_write <= '1'; + if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r1.store_way = i then + do_write <= '1'; + end if; end if; -- Mask write selects with do_write since BRAM doesn't -- have a global write-enable - for i in 0 to ROW_SIZE-1 loop - wr_sel_m(i) <= wr_sel(i) and do_write; - end loop; + if do_write = '1' then + wr_sel_m <= wr_sel; + end if; end process; end generate; @@ -1007,15 +1097,7 @@ begin dcache_fast_hit : process(clk) begin if rising_edge(clk) then - -- If we have a request incoming, we have to latch it as r0.req.valid - -- is only set for a single cycle. 
It's up to the control logic to - -- ensure we don't override an uncompleted request (for now we are - -- single issue on load/stores so we are fine, later, we can generate - -- a stall output if necessary). - - if req_op /= OP_NONE and stall_out = '0' then - r1.req <= r0.req; - r1.mmu_req <= r0.mmu_req; + if req_op /= OP_NONE then report "op:" & op_t'image(req_op) & " addr:" & to_hstring(r0.req.addr) & " nc:" & std_ulogic'image(r0.req.nc) & @@ -1023,8 +1105,11 @@ begin " tag:" & to_hstring(req_tag) & " way: " & integer'image(req_hit_way); end if; + if r0_valid = '1' then + r1.mmu_req <= r0.mmu_req; + end if; - -- Fast path for load/store hits. Set signals for the writeback controls. + -- Fast path for load/store hits. Set signals for the writeback controls. if req_op = OP_LOAD_HIT then r1.hit_way <= req_hit_way; r1.hit_load_valid <= '1'; @@ -1032,27 +1117,29 @@ begin r1.hit_load_valid <= '0'; end if; - if req_op = OP_TLB_ERR then + if req_op = OP_BAD then report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) & " rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok); r1.error_done <= '1'; - r1.cache_paradox <= '0'; - elsif req_op = OP_BAD then - report "Signalling cache paradox"; - r1.error_done <= '1'; - r1.cache_paradox <= '1'; + r1.cache_paradox <= access_ok; else r1.error_done <= '0'; r1.cache_paradox <= '0'; end if; + if req_op = OP_STCX_FAIL then + r1.stcx_fail <= '1'; + else + r1.stcx_fail <= '0'; + end if; + -- complete tlbies and TLB loads in the third cycle r1.tlbie_done <= r0_valid and (r0.tlbie or r0.tlbld); end if; end process; -- - -- Every other case is handled by this state machine: + -- Memory accesses are handled by this state machine: -- -- * Cache load miss/reload (in conjunction with "rams") -- * Load hits for non-cachable forms @@ -1062,16 +1149,45 @@ begin -- operates at stage 1. 
-- dcache_slow : process(clk) - variable tagset : cache_tags_set_t; variable stbs_done : boolean; + variable req : mem_access_request_t; + variable acks : unsigned(2 downto 0); begin if rising_edge(clk) then + r1.use_forward1 <= use_forward1_next; + r1.forward_sel <= (others => '0'); + if use_forward1_next = '1' then + r1.forward_sel <= r1.req.byte_sel; + elsif use_forward2_next = '1' then + r1.forward_sel <= r1.forward_sel1; + end if; + + r1.forward_data2 <= r1.forward_data1; + if r1.write_bram = '1' then + r1.forward_data1 <= r1.req.data; + r1.forward_sel1 <= r1.req.byte_sel; + r1.forward_way1 <= r1.req.hit_way; + r1.forward_row1 <= get_row(r1.req.real_addr); + r1.forward_valid1 <= '1'; + else + if r1.dcbz = '1' then + r1.forward_data1 <= (others => '0'); + else + r1.forward_data1 <= wishbone_in.dat; + end if; + r1.forward_sel1 <= (others => '1'); + r1.forward_way1 <= r1.store_way; + r1.forward_row1 <= r1.store_row; + r1.forward_valid1 <= '0'; + end if; + -- On reset, clear all valid bits to force misses if rst = '1' then for i in index_t loop cache_valids(i) <= (others => '0'); end loop; r1.state <= IDLE; + r1.full <= '0'; r1.slow_valid <= '0'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; @@ -1081,44 +1197,77 @@ begin else -- One cycle pulses reset r1.slow_valid <= '0'; - r1.stcx_fail <= '0'; + r1.write_bram <= '0'; + + if r1.write_tag = '1' then + -- Store new tag in selected way + for i in 0 to NUM_WAYS-1 loop + if i = r1.store_way then + cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <= + (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag; + end if; + end loop; + r1.write_tag <= '0'; + end if; + + -- Take request from r1.req if there is one there, + -- else from req_op, ra, etc. 
+ if r1.full = '1' then + req := r1.req; + else + req.op := req_op; + req.dcbz := r0.req.dcbz; + req.real_addr := ra; + req.data := r0.req.data; + req.byte_sel := r0.req.byte_sel; + req.hit_way := req_hit_way; + req.repl_way := replace_way; + + -- Store the incoming request from r0, if it is a slow request + -- Note that r1.full = 1 implies req_op = OP_NONE + if req_op = OP_LOAD_MISS or req_op = OP_LOAD_NC or + req_op = OP_STORE_MISS or req_op = OP_STORE_HIT then + r1.req <= req; + r1.full <= '1'; + end if; + end if; -- Main state machine case r1.state is when IDLE => - case req_op is + r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0); + r1.dcbz <= '0'; + + -- Keep track of our index and way for subsequent stores. + r1.store_index <= get_index(req.real_addr); + r1.store_row <= get_row(req.real_addr); + r1.end_row_ix <= get_row_of_line(get_row(req.real_addr)) - 1; + r1.reload_tag <= get_tag(req.real_addr); + + if req.op = OP_STORE_HIT then + r1.store_way <= req.hit_way; + else + r1.store_way <= req.repl_way; + end if; + + -- Reset per-row valid bits, ready for handling OP_LOAD_MISS + for i in 0 to ROW_PER_LINE - 1 loop + r1.rows_valid(i) <= '0'; + end loop; + + case req.op is when OP_LOAD_HIT => -- stay in IDLE state when OP_LOAD_MISS => -- Normal load cache miss, start the reload machine -- - report "cache miss addr:" & to_hstring(r0.req.addr) & - " idx:" & integer'image(req_index) & - " way:" & integer'image(replace_way) & - " tag:" & to_hstring(req_tag); - - -- Force misses on that way while reloading that line - cache_valids(req_index)(replace_way) <= '0'; - - -- Store new tag in selected way - for i in 0 to NUM_WAYS-1 loop - if i = replace_way then - tagset := cache_tags(req_index); - write_tag(i, tagset, req_tag); - cache_tags(req_index) <= tagset; - end if; - end loop; - - -- Keep track of our index and way for subsequent stores. 
- r1.store_index <= req_index; - r1.store_way <= replace_way; - r1.store_row <= get_row(req_laddr); - - -- Prep for first wishbone read. We calculate the address of - -- the start of the cache line and start the WB cycle - -- - r1.wb.adr <= req_laddr(r1.wb.adr'left downto 0); + report "cache miss real addr:" & to_hstring(req.real_addr) & + " idx:" & integer'image(get_index(req.real_addr)) & + " way:" & integer'image(req.repl_way) & + " tag:" & to_hstring(get_tag(req.real_addr)); + + -- Start the wishbone cycle r1.wb.sel <= (others => '1'); r1.wb.we <= '0'; r1.wb.cyc <= '1'; @@ -1126,74 +1275,52 @@ begin -- Track that we had one request sent r1.state <= RELOAD_WAIT_ACK; + r1.write_tag <= '1'; when OP_LOAD_NC => - r1.wb.sel <= r0.req.byte_sel; - r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; + r1.wb.sel <= req.byte_sel; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; r1.state <= NC_LOAD_WAIT_ACK; when OP_STORE_HIT | OP_STORE_MISS => - if r0.req.dcbz = '0' then - r1.wb.sel <= r0.req.byte_sel; - r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; - r1.wb.dat <= r0.req.data; - if cancel_store = '0' then - r1.wb.cyc <= '1'; - r1.wb.stb <= '1'; - r1.wb.we <= '1'; - r1.state <= STORE_WAIT_ACK; - else - r1.stcx_fail <= '1'; - r1.state <= IDLE; + if req.dcbz = '0' then + r1.wb.sel <= req.byte_sel; + r1.wb.dat <= req.data; + r1.state <= STORE_WAIT_ACK; + r1.acks_pending <= to_unsigned(1, 3); + r1.full <= '0'; + r1.slow_valid <= '1'; + if req.op = OP_STORE_HIT then + r1.write_bram <= '1'; end if; else -- dcbz is handled much like a load miss except -- that we are writing to memory instead of reading - r1.store_index <= req_index; - r1.store_row <= get_row(req_laddr); - - if req_op = OP_STORE_HIT then - r1.store_way <= req_hit_way; - else - r1.store_way <= replace_way; - - -- Force misses on the victim way while zeroing - cache_valids(req_index)(replace_way) <= '0'; - - -- Store new tag in selected way - for i in 0 to NUM_WAYS-1 loop - if i = replace_way then - tagset := 
cache_tags(req_index); - write_tag(i, tagset, req_tag); - cache_tags(req_index) <= tagset; - end if; - end loop; - end if; - -- Set up for wishbone writes - r1.wb.adr <= req_laddr(r1.wb.adr'left downto 0); + -- Start the wishbone writes r1.wb.sel <= (others => '1'); - r1.wb.we <= '1'; r1.wb.dat <= (others => '0'); - r1.wb.cyc <= '1'; - r1.wb.stb <= '1'; -- Handle the rest like a load miss r1.state <= RELOAD_WAIT_ACK; + r1.write_tag <= '1'; + r1.dcbz <= '1'; end if; + r1.wb.we <= '1'; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; -- OP_NONE and OP_BAD do nothing - -- OP_BAD was handled above already + -- OP_BAD & OP_STCX_FAIL were handled above already when OP_NONE => when OP_BAD => - when OP_TLB_ERR => + when OP_STCX_FAIL => end case; - when RELOAD_WAIT_ACK => - -- Requests are all sent if stb is 0 + when RELOAD_WAIT_ACK => + -- Requests are all sent if stb is 0 stbs_done := r1.wb.stb = '0'; -- If we are still sending requests, was one accepted ? @@ -1202,7 +1329,7 @@ begin -- stb and set stbs_done so we can handle an eventual last -- ack on the same cycle. -- - if is_last_row_addr(r1.wb.adr) then + if is_last_row_addr(r1.wb.adr, r1.end_row_ix) then r1.wb.stb <= '0'; stbs_done := true; end if; @@ -1212,44 +1339,82 @@ begin end if; -- Incoming acks processing + r1.forward_valid1 <= wishbone_in.ack; if wishbone_in.ack = '1' then - -- Is this the data we were looking for ? Latch it so - -- we can respond later. We don't currently complete the - -- pending miss request immediately, we wait for the - -- whole line to be loaded. The reason is that if we - -- did, we would potentially get new requests in while - -- not idle, which we don't currently know how to deal - -- with. - -- - if r1.store_row = get_row(r1.req.addr) and r1.req.dcbz = '0' then - r1.slow_data <= wishbone_in.dat; + r1.rows_valid(r1.store_row mod ROW_PER_LINE) <= '1'; + -- If this is the data we were looking for, we can + -- complete the request next cycle. 
+ -- Compare the whole address in case the request in + -- r1.req is not the one that started this refill. + if r1.full = '1' and + ((r1.dcbz = '1' and r1.req.dcbz = '1') or + (r1.dcbz = '0' and r1.req.op = OP_LOAD_MISS)) and + r1.store_row = get_row(r1.req.real_addr) and + r1.reload_tag = get_tag(r1.req.real_addr) then + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.forward_sel <= (others => '1'); + r1.use_forward1 <= '1'; end if; -- Check for completion - if stbs_done and is_last_row(r1.store_row) then + if stbs_done and is_last_row(r1.store_row, r1.end_row_ix) then -- Complete wishbone cycle r1.wb.cyc <= '0'; -- Cache line is now valid cache_valids(r1.store_index)(r1.store_way) <= '1'; - -- Don't complete and go idle until next cycle, in - -- case the next request is for the last dword of - -- the cache line we just loaded. - r1.state <= FINISH_LD_MISS; + r1.state <= IDLE; end if; -- Increment store row counter r1.store_row <= next_row(r1.store_row); end if; - when FINISH_LD_MISS => - -- Write back the load data that we got - r1.slow_valid <= '1'; - r1.state <= IDLE; - report "completing miss !"; + when STORE_WAIT_ACK => + stbs_done := r1.wb.stb = '0'; + acks := r1.acks_pending; + -- Clear stb when slave accepted request + if wishbone_in.stall = '0' then + -- See if there is another store waiting to be done + -- which is in the same real page. + -- Using r1.req rather than req here limits us to one + -- store every two cycles, but helps timing in that we + -- don't depend on req_op or ra. 
+ if r1.full = '1' and acks < 7 and + (r1.req.op = OP_STORE_MISS or r1.req.op = OP_STORE_HIT) and + (r1.req.real_addr(r1.wb.adr'left downto TLB_LG_PGSZ) = + r1.wb.adr(r1.wb.adr'left downto TLB_LG_PGSZ)) then + r1.wb.adr <= r1.req.real_addr(r1.wb.adr'left downto 0); + r1.wb.dat <= r1.req.data; + r1.wb.sel <= r1.req.byte_sel; + r1.wb.stb <= '1'; + stbs_done := false; + if r1.req.op = OP_STORE_HIT then + r1.write_bram <= '1'; + end if; + r1.full <= '0'; + r1.slow_valid <= '1'; + acks := acks + 1; + else + r1.wb.stb <= '0'; + stbs_done := true; + end if; + end if; + + -- Got ack ? See if complete. + if wishbone_in.ack = '1' then + if stbs_done and acks = 1 then + r1.state <= IDLE; + r1.wb.cyc <= '0'; + r1.wb.stb <= '0'; + end if; + acks := acks - 1; + end if; + r1.acks_pending <= acks; - when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK => + when NC_LOAD_WAIT_ACK => -- Clear stb when slave accepted request if wishbone_in.stall = '0' then r1.wb.stb <= '0'; @@ -1257,11 +1422,11 @@ begin -- Got ack ? complete. if wishbone_in.ack = '1' then - if r1.state = NC_LOAD_WAIT_ACK then - r1.slow_data <= wishbone_in.dat; - end if; r1.state <= IDLE; + r1.full <= '0'; r1.slow_valid <= '1'; + r1.forward_sel <= (others => '1'); + r1.use_forward1 <= '1'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; end if; From aebd915f8f465f66f33703164f9fdec419a53aa6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 13 Jun 2020 20:27:50 +1000 Subject: [PATCH 15/22] mmu: Take an extra cycle to do TLB invalidations This makes the TLB invalidations that occur as a result of a tlbie, slbia or mtspr instruction take one more cycle. This breaks some long combinatorial chains from decode2 to dcache and icache and thus eases timing. 
Signed-off-by: Paul Mackerras --- mmu.vhdl | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/mmu.vhdl b/mmu.vhdl index 0eefbab..fc2dd7a 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -27,6 +27,7 @@ end mmu; architecture behave of mmu is type state_t is (IDLE, + DO_TLBIE, TLB_WAIT, PROC_TBL_READ, PROC_TBL_WAIT, @@ -44,6 +45,7 @@ architecture behave of mmu is store : std_ulogic; priv : std_ulogic; addr : std_ulogic_vector(63 downto 0); + inval_all : std_ulogic; -- config SPRs prtbl : std_ulogic_vector(63 downto 0); pid : std_ulogic_vector(31 downto 0); @@ -178,7 +180,6 @@ begin variable tlb_load : std_ulogic; variable itlb_load : std_ulogic; variable tlbie_req : std_ulogic; - variable inval_all : std_ulogic; variable prtbl_rd : std_ulogic; variable pt_valid : std_ulogic; variable effpid : std_ulogic_vector(31 downto 0); @@ -207,7 +208,7 @@ begin tlb_load := '0'; itlb_load := '0'; tlbie_req := '0'; - inval_all := '0'; + v.inval_all := '0'; prtbl_rd := '0'; -- Radix tree data structures in memory are big-endian, @@ -240,19 +241,17 @@ begin v.store := not (l_in.load or l_in.iside); v.priv := l_in.priv; if l_in.tlbie = '1' then - dcreq := '1'; - tlbie_req := '1'; -- Invalidate all iTLB/dTLB entries for tlbie with -- RB[IS] != 0 or RB[AP] != 0, or for slbia - inval_all := l_in.slbia or l_in.addr(11) or l_in.addr(10) or - l_in.addr(7) or l_in.addr(6) or l_in.addr(5); + v.inval_all := l_in.slbia or l_in.addr(11) or l_in.addr(10) or + l_in.addr(7) or l_in.addr(6) or l_in.addr(5); -- The RIC field of the tlbie instruction comes across on the -- sprn bus as bits 2--3. RIC=2 flushes process table caches. 
if l_in.sprn(3) = '1' then v.pt0_valid := '0'; v.pt3_valid := '0'; end if; - v.state := TLB_WAIT; + v.state := DO_TLBIE; else v.valid := '1'; if pt_valid = '0' then @@ -281,12 +280,15 @@ begin v.pt3_valid := '0'; end if; v.pt0_valid := '0'; - dcreq := '1'; - tlbie_req := '1'; - inval_all := '1'; - v.state := TLB_WAIT; + v.inval_all := '1'; + v.state := DO_TLBIE; end if; + when DO_TLBIE => + dcreq := '1'; + tlbie_req := '1'; + v.state := TLB_WAIT; + when TLB_WAIT => if d_in.done = '1' then done := '1'; @@ -436,8 +438,8 @@ begin -- drive outputs if tlbie_req = '1' then - addr := l_in.addr; - tlb_data := l_in.rs; + addr := r.addr; + tlb_data := (others => '0'); elsif tlb_load = '1' then addr := r.addr(63 downto 12) & x"000"; tlb_data := pte; @@ -458,14 +460,14 @@ begin d_out.valid <= dcreq; d_out.tlbie <= tlbie_req; - d_out.doall <= inval_all; + d_out.doall <= r.inval_all; d_out.tlbld <= tlb_load; d_out.addr <= addr; d_out.pte <= tlb_data; i_out.tlbld <= itlb_load; i_out.tlbie <= tlbie_req; - i_out.doall <= inval_all; + i_out.doall <= r.inval_all; i_out.addr <= addr; i_out.pte <= tlb_data; From a4500c63a281a57752edbfdd4d9033974a98c8c8 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 13 Jun 2020 23:00:13 +1000 Subject: [PATCH 16/22] dcache: Reduce back-to-back store latency from 3 cycles to 2 This uses the machinery we already had for comparing the real address of a new request with the tag of a previous request (r1.reload_tag) to get better timing on comparing the address of a second store with the one in progress. The comparison is now on the set size rather than the page size, but since set size can't be larger than the page size (and usually will equal the page size), that is OK. The same comparison can also be used to tell when we can satisfy a load miss during a cache line refill. 
Signed-off-by: Paul Mackerras --- dcache.vhdl | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index bc351b0..9ecb6a9 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -232,6 +232,7 @@ architecture rtl of dcache is byte_sel : std_ulogic_vector(7 downto 0); hit_way : way_t; repl_way : way_t; + same_tag : std_ulogic; end record; -- First stage register, contains state for stage 1 of load hits @@ -301,6 +302,7 @@ architecture rtl of dcache is signal req_tag : cache_tag_t; signal req_op : op_t; signal req_data : std_ulogic_vector(63 downto 0); + signal req_same_tag : std_ulogic; signal early_req_row : row_t; @@ -777,6 +779,7 @@ begin rel_match := '1'; end if; end if; + req_same_tag <= rel_match; -- See if the request matches the line currently being reloaded if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index and @@ -1222,6 +1225,7 @@ begin req.byte_sel := r0.req.byte_sel; req.hit_way := req_hit_way; req.repl_way := replace_way; + req.same_tag := req_same_tag; -- Store the incoming request from r0, if it is a slow request -- Note that r1.full = 1 implies req_op = OP_NONE @@ -1243,6 +1247,7 @@ begin r1.store_row <= get_row(req.real_addr); r1.end_row_ix <= get_row_of_line(get_row(req.real_addr)) - 1; r1.reload_tag <= get_tag(req.real_addr); + r1.req.same_tag <= '1'; if req.op = OP_STORE_HIT then r1.store_way <= req.hit_way; @@ -1346,11 +1351,10 @@ begin -- complete the request next cycle. -- Compare the whole address in case the request in -- r1.req is not the one that started this refill. 
- if r1.full = '1' and + if r1.full = '1' and r1.req.same_tag = '1' and ((r1.dcbz = '1' and r1.req.dcbz = '1') or (r1.dcbz = '0' and r1.req.op = OP_LOAD_MISS)) and - r1.store_row = get_row(r1.req.real_addr) and - r1.reload_tag = get_tag(r1.req.real_addr) then + r1.store_row = get_row(r1.req.real_addr) then r1.full <= '0'; r1.slow_valid <= '1'; r1.forward_sel <= (others => '1'); @@ -1379,19 +1383,14 @@ begin if wishbone_in.stall = '0' then -- See if there is another store waiting to be done -- which is in the same real page. - -- Using r1.req rather than req here limits us to one - -- store every two cycles, but helps timing in that we - -- don't depend on req_op or ra. - if r1.full = '1' and acks < 7 and - (r1.req.op = OP_STORE_MISS or r1.req.op = OP_STORE_HIT) and - (r1.req.real_addr(r1.wb.adr'left downto TLB_LG_PGSZ) = - r1.wb.adr(r1.wb.adr'left downto TLB_LG_PGSZ)) then - r1.wb.adr <= r1.req.real_addr(r1.wb.adr'left downto 0); - r1.wb.dat <= r1.req.data; - r1.wb.sel <= r1.req.byte_sel; + if acks < 7 and req.same_tag = '1' and + (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then + r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0); + r1.wb.dat <= req.data; + r1.wb.sel <= req.byte_sel; r1.wb.stb <= '1'; stbs_done := false; - if r1.req.op = OP_STORE_HIT then + if req.op = OP_STORE_HIT then r1.write_bram <= '1'; end if; r1.full <= '0'; From b3799c432ba51b4c2aceeefbe9de209e8e935362 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Jun 2020 09:28:03 +1000 Subject: [PATCH 17/22] decode1: Add a stash buffer to the output This means that the busy signal from execute1 (which can be driven combinatorially from mmu or dcache) now stops at decode1 and doesn't go on to icache or fetch1. This helps with timing. 
Signed-off-by: Paul Mackerras --- core.vhdl | 6 ++++-- decode1.vhdl | 31 ++++++++++++++++++++----------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/core.vhdl b/core.vhdl index 13f3ce7..019660c 100644 --- a/core.vhdl +++ b/core.vhdl @@ -82,6 +82,7 @@ architecture behave of core is signal icache_stall_out : std_ulogic; signal icache_stall_in : std_ulogic; signal decode1_stall_in : std_ulogic; + signal decode1_busy : std_ulogic; signal decode2_busy_in : std_ulogic; signal decode2_stall_out : std_ulogic; signal ex1_icache_inval: std_ulogic; @@ -188,7 +189,7 @@ begin log_out => log_data(42 downto 0) ); - fetch1_stall_in <= icache_stall_out or decode2_stall_out; + fetch1_stall_in <= icache_stall_out or decode1_busy; icache_0: entity work.icache generic map( @@ -212,7 +213,7 @@ begin log_out => log_data(96 downto 43) ); - icache_stall_in <= decode2_stall_out; + icache_stall_in <= decode1_busy; decode1_0: entity work.decode1 port map ( @@ -220,6 +221,7 @@ begin rst => rst_dec1, stall_in => decode1_stall_in, flush_in => flush, + busy_out => decode1_busy, f_in => icache_to_decode1, d_out => decode1_to_decode2, log_out => log_data(109 downto 97) diff --git a/decode1.vhdl b/decode1.vhdl index ae3e970..21596f6 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -13,6 +13,7 @@ entity decode1 is stall_in : in std_ulogic; flush_in : in std_ulogic; + busy_out : out std_ulogic; f_in : in IcacheToDecode1Type; d_out : out Decode1ToDecode2Type; @@ -22,6 +23,7 @@ end entity decode1; architecture behaviour of decode1 is signal r, rin : Decode1ToDecode2Type; + signal s : Decode1ToDecode2Type; subtype major_opcode_t is unsigned(5 downto 0); type major_rom_array_t is array(0 to 63) of decode_rom_t; @@ -359,12 +361,27 @@ begin decode1_0: process(clk) begin if rising_edge(clk) then - -- Output state remains unchanged on stall, unless we are flushing - if rst = '1' or flush_in = '1' or stall_in = '0' then - r <= rin; + if rst = '1' then + r <= Decode1ToDecode2Init; + s <= 
Decode1ToDecode2Init; + elsif flush_in = '1' then + r.valid <= '0'; + s.valid <= '0'; + elsif s.valid = '1' then + if stall_in = '0' then + r <= s; + s.valid <= '0'; + end if; + else + s <= rin; + s.valid <= rin.valid and r.valid and stall_in; + if r.valid = '0' or stall_in = '0' then + r <= rin; + end if; end if; end if; end process; + busy_out <= s.valid; decode1_1: process(all) variable v : Decode1ToDecode2Type; @@ -472,14 +489,6 @@ begin end if; end if; - if flush_in = '1' then - v.valid := '0'; - end if; - - if rst = '1' then - v := Decode1ToDecode2Init; - end if; - -- Update registers rin <= v; From 09ae2ce58d71f0901e22f8f1f82607b77f38443f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Jun 2020 10:02:14 +1000 Subject: [PATCH 18/22] decode1: Improve timing for slow SPR decode path This makes the logic that works out decode.unit and decode.sgl_pipe for mtspr/mfspr to/from slow SPRs detect the fact that the instruction is mtspr/mfspr based on a match with the instruction word rather than looking at v.decode.insn_type. This improves timing substantially, as the ROM lookup to get v.decode is relatively slow. 
Signed-off-by: Paul Mackerras --- decode1.vhdl | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 21596f6..f72d310 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -419,6 +419,20 @@ begin sprn := decode_spr_num(f_in.insn); v.ispr1 := fast_spr_num(sprn); + if std_match(f_in.insn(10 downto 1), "01-1010011") then + -- mfspr or mtspr + -- Make slow SPRs single issue + if is_fast_spr(v.ispr1) = '0' then + v.decode.sgl_pipe := '1'; + -- send MMU-related SPRs to loadstore1 + case sprn is + when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PRTBL => + v.decode.unit := LDST; + when others => + end case; + end if; + end if; + elsif majorop = "010000" then -- CTR may be needed as input to bc v.decode := major_decode_rom_array(to_integer(majorop)); @@ -475,20 +489,6 @@ begin end if; - if v.decode.insn_type = OP_MFSPR or v.decode.insn_type = OP_MTSPR then - sprn := decode_spr_num(f_in.insn); - -- Make slow SPRs single issue - if is_fast_spr(v.ispr1) = '0' then - v.decode.sgl_pipe := '1'; - -- send MMU-related SPRs to loadstore1 - case sprn is - when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PRTBL => - v.decode.unit := LDST; - when others => - end case; - end if; - end if; - -- Update registers rin <= v; From 6687aae4d659e79c429c60ebbc07bfac7686365a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Jun 2020 15:43:05 +1000 Subject: [PATCH 19/22] core: Implement a simple branch predictor This implements a simple branch predictor in the decode1 stage. If it sees that the instruction is b or bc and the branch is predicted to be taken, it sends a flush and redirect upstream (to icache and fetch1) to redirect fetching to the branch target. The prediction is sent downstream with the branch instruction, and execute1 now only sends a flush/redirect upstream if the prediction was wrong. 
Unconditional branches are always predicted to be taken, and conditional branches are predicted to be taken if and only if the offset is negative. Branches that take the branch address from a register (bclr, bcctr) are predicted not taken, as we don't have any way to predict the branch address. Since we can now have a mflr being executed immediately after a bl or bcl, we now track the update to LR in the hazard tracker, using the second write register field that is used to track RA updates for update-form loads and stores. For those branches that update LR but don't write any other result (i.e. that don't decrement CTR), we now write back LR in the same cycle as the instruction rather than taking a second cycle for the LR writeback. Signed-off-by: Paul Mackerras --- common.vhdl | 13 +++++++-- core.vhdl | 11 ++++++-- decode1.vhdl | 50 +++++++++++++++++++++++++-------- decode2.vhdl | 6 ++++ execute1.vhdl | 70 +++++++++++++++++++++++++++-------------------- fetch1.vhdl | 7 ++++- ppc_fx_insns.vhdl | 12 ++------ 7 files changed, 114 insertions(+), 55 deletions(-) diff --git a/common.vhdl b/common.vhdl index 31bd920..52222c3 100644 --- a/common.vhdl +++ b/common.vhdl @@ -113,8 +113,16 @@ package common is ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR) decode: decode_rom_t; + br_pred: std_ulogic; -- Branch was predicted to be taken + end record; + constant Decode1ToDecode2Init : Decode1ToDecode2Type := + (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), + ispr1 => (others => '0'), ispr2 => (others => '0'), decode => decode_rom_init, br_pred => '0'); + + type Decode1ToFetch1Type is record + redirect : std_ulogic; + redirect_nia : std_ulogic_vector(63 downto 0); end record; - constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), ispr1 => (others => '0'), ispr2 =>
(others => '0'), decode => decode_rom_init); type Decode2ToExecute1Type is record valid: std_ulogic; @@ -149,12 +157,13 @@ package common is sign_extend : std_ulogic; -- do we need to sign extend? update : std_ulogic; -- is this an update instruction? reserve : std_ulogic; -- set for larx/stcx + br_pred : std_ulogic; end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => NONE, insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', - is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', + is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), others => (others => '0')); type Execute1ToMultiplyType is record diff --git a/core.vhdl b/core.vhdl index 019660c..092df6d 100644 --- a/core.vhdl +++ b/core.vhdl @@ -48,6 +48,7 @@ architecture behave of core is -- decode signals signal decode1_to_decode2: Decode1ToDecode2Type; + signal decode1_to_fetch1: Decode1ToFetch1Type; signal decode2_to_execute1: Decode2ToExecute1Type; -- register file signals @@ -90,6 +91,8 @@ architecture behave of core is signal dcache_stall_out: std_ulogic; signal flush: std_ulogic; + signal decode1_flush: std_ulogic; + signal fetch1_flush: std_ulogic; signal complete: std_ulogic; signal terminate: std_ulogic; @@ -182,14 +185,16 @@ begin rst => rst_fetch1, alt_reset_in => alt_reset_d, stall_in => fetch1_stall_in, - flush_in => flush, + flush_in => fetch1_flush, stop_in => dbg_core_stop, + d_in => decode1_to_fetch1, e_in => execute1_to_fetch1, i_out => fetch1_to_icache, log_out => log_data(42 
downto 0) ); fetch1_stall_in <= icache_stall_out or decode1_busy; + fetch1_flush <= flush or decode1_flush; icache_0: entity work.icache generic map( @@ -204,7 +209,7 @@ begin i_in => fetch1_to_icache, i_out => icache_to_decode1, m_in => mmu_to_icache, - flush_in => flush, + flush_in => fetch1_flush, inval_in => dbg_icache_rst or ex1_icache_inval, stall_in => icache_stall_in, stall_out => icache_stall_out, @@ -221,9 +226,11 @@ begin rst => rst_dec1, stall_in => decode1_stall_in, flush_in => flush, + flush_out => decode1_flush, busy_out => decode1_busy, f_in => icache_to_decode1, d_out => decode1_to_decode2, + f_out => decode1_to_fetch1, log_out => log_data(109 downto 97) ); diff --git a/decode1.vhdl b/decode1.vhdl index f72d310..2060e64 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -8,16 +8,18 @@ use work.decode_types.all; entity decode1 is port ( - clk : in std_ulogic; - rst : in std_ulogic; - - stall_in : in std_ulogic; - flush_in : in std_ulogic; - busy_out : out std_ulogic; - - f_in : in IcacheToDecode1Type; - d_out : out Decode1ToDecode2Type; - log_out : out std_ulogic_vector(12 downto 0) + clk : in std_ulogic; + rst : in std_ulogic; + + stall_in : in std_ulogic; + flush_in : in std_ulogic; + busy_out : out std_ulogic; + flush_out : out std_ulogic; + + f_in : in IcacheToDecode1Type; + f_out : out Decode1ToFetch1Type; + d_out : out Decode1ToDecode2Type; + log_out : out std_ulogic_vector(12 downto 0) ); end entity decode1; @@ -385,11 +387,15 @@ begin decode1_1: process(all) variable v : Decode1ToDecode2Type; + variable f : Decode1ToFetch1Type; variable majorop : major_opcode_t; variable op_19_bits: std_ulogic_vector(2 downto 0); variable sprn : spr_num_t; + variable br_nia : std_ulogic_vector(61 downto 0); + variable br_target : std_ulogic_vector(61 downto 0); + variable br_offset : signed(23 downto 0); begin - v := r; + v := Decode1ToDecode2Init; v.valid := f_in.valid; v.nia := f_in.nia; @@ -486,14 +492,36 @@ begin else v.decode := 
major_decode_rom_array(to_integer(majorop)); + end if; + -- Branch predictor + -- Note bclr, bcctr and bctar are predicted not taken as we have no + -- count cache or link stack. + br_offset := (others => '0'); + if majorop = 18 then + -- Unconditional branches are always taken + v.br_pred := '1'; + br_offset := signed(f_in.insn(25 downto 2)); + elsif majorop = 16 then + -- Predict backward branches as taken, forward as untaken + v.br_pred := f_in.insn(15); + br_offset := resize(signed(f_in.insn(15 downto 2)), 24); + end if; + br_nia := f_in.nia(63 downto 2); + if f_in.insn(1) = '1' then + br_nia := (others => '0'); end if; + br_target := std_ulogic_vector(signed(br_nia) + br_offset); + f.redirect := v.br_pred and f_in.valid and not flush_in and not s.valid; + f.redirect_nia := br_target & "00"; -- Update registers rin <= v; -- Update outputs d_out <= r; + f_out <= f; + flush_out <= f.redirect; end process; dec1_log : process(clk) diff --git a/decode2.vhdl b/decode2.vhdl index 6acbca7..80687a0 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -358,6 +358,7 @@ begin v.e.sign_extend := d_in.decode.sign_extend; v.e.update := d_in.decode.update; v.e.reserve := d_in.decode.reserve; + v.e.br_pred := d_in.br_pred; -- issue control control_valid_in <= d_in.valid; @@ -371,6 +372,11 @@ begin end if; update_gpr_write_valid <= d_in.decode.update; update_gpr_write_reg <= decoded_reg_a.reg; + if v.e.lr = '1' then + -- there are no instructions that have both update=1 and lr=1 + update_gpr_write_valid <= '1'; + update_gpr_write_reg <= fast_spr_num(SPR_LR); + end if; gpr_a_read_valid <= decoded_reg_a.reg_valid; gpr_a_read <= decoded_reg_a.reg; diff --git a/execute1.vhdl b/execute1.vhdl index edbeaaa..12d3df1 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -305,11 +305,17 @@ begin variable exception_nextpc : std_ulogic; variable trapval : std_ulogic_vector(4 downto 0); variable illegal : std_ulogic; + variable is_branch : std_ulogic; + variable taken_branch : std_ulogic; + variable 
abs_branch : std_ulogic; begin result := (others => '0'); result_with_carry := (others => '0'); result_en := '0'; newcrf := (others => '0'); + is_branch := '0'; + taken_branch := '0'; + abs_branch := '0'; v := r; v.e := Execute1ToWritebackInit; @@ -625,12 +631,9 @@ begin result := logical_result; result_en := '1'; when OP_B => - f_out.redirect <= '1'; - if (insn_aa(e_in.insn)) then - f_out.redirect_nia <= b_in; - else - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in)); - end if; + is_branch := '1'; + taken_branch := '1'; + abs_branch := insn_aa(e_in.insn); when OP_BC => -- read_data1 is CTR bo := insn_bo(e_in.insn); @@ -640,14 +643,9 @@ begin result_en := '1'; v.e.write_reg := fast_spr_num(SPR_CTR); end if; - if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then - f_out.redirect <= '1'; - if (insn_aa(e_in.insn)) then - f_out.redirect_nia <= b_in; - else - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in)); - end if; - end if; + is_branch := '1'; + taken_branch := ppc_bc_taken(bo, bi, e_in.cr, a_in); + abs_branch := insn_aa(e_in.insn); when OP_BCREG => -- read_data1 is CTR -- read_data2 is target register (CTR, LR or TAR) @@ -658,7 +656,7 @@ begin result_en := '1'; v.e.write_reg := fast_spr_num(SPR_CTR); end if; - if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then + if ppc_bc_taken(bo, bi, e_in.cr, a_in) = '1' then f_out.redirect <= '1'; f_out.redirect_nia <= b_in(63 downto 2) & "00"; end if; @@ -903,23 +901,35 @@ begin v.e.rc := e_in.rc and valid_in; + -- Mispredicted branches cause a redirect + if is_branch = '1' and taken_branch /= e_in.br_pred then + f_out.redirect <= '1'; + if taken_branch = '1' then + if abs_branch = '1' then + f_out.redirect_nia <= b_in; + else + f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in)); + end if; + else + f_out.redirect_nia <= next_nia; + end if; + end if; + -- Update LR on the next cycle after a branch link - -- - -- WARNING: The LR update isn't tracked by our hazard 
tracker. This - -- will work (well I hope) because it only happens on branches - -- which will flush all decoded instructions. By the time - -- fetch catches up, we'll have the new LR. This will - -- *not* work properly however if we have a branch predictor, - -- in which case the solution would probably be to keep a - -- local cache of the updated LR in execute1 (flushed on - -- exceptions) that is used instead of the value from - -- decode when its content is valid. + -- If we're not writing back anything else, we can write back LR + -- this cycle, otherwise we take an extra cycle. if e_in.lr = '1' then - v.lr_update := '1'; - v.next_lr := next_nia; - v.e.valid := '0'; - report "Delayed LR update to " & to_hstring(next_nia); - v.busy := '1'; + if result_en = '0' then + result_en := '1'; + result := next_nia; + v.e.write_reg := fast_spr_num(SPR_LR); + else + v.lr_update := '1'; + v.next_lr := next_nia; + v.e.valid := '0'; + report "Delayed LR update to " & to_hstring(next_nia); + v.busy := '1'; + end if; end if; elsif valid_in = '1' then diff --git a/fetch1.vhdl b/fetch1.vhdl index 93a2293..0d9c6f7 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -23,6 +23,9 @@ entity fetch1 is -- redirect from execution unit e_in : in Execute1ToFetch1Type; + -- redirect from decode1 + d_in : in Decode1ToFetch1Type; + -- Request to icache i_out : out Fetch1ToIcacheType; @@ -49,7 +52,7 @@ begin report "fetch1 rst:" & std_ulogic'image(rst) & " IR:" & std_ulogic'image(e_in.virt_mode) & " P:" & std_ulogic'image(e_in.priv_mode) & - " R:" & std_ulogic'image(e_in.redirect) & + " R:" & std_ulogic'image(e_in.redirect) & std_ulogic'image(d_in.redirect) & " S:" & std_ulogic'image(stall_in) & " T:" & std_ulogic'image(stop_in) & " nia:" & to_hstring(r_next.nia) & @@ -83,6 +86,8 @@ begin v.nia := e_in.redirect_nia; v.virt_mode := e_in.virt_mode; v.priv_mode := e_in.priv_mode; + elsif d_in.redirect = '1' then + v.nia := d_in.redirect_nia; elsif stall_in = '0' then -- For debug stop/step to work 
properly we need a little bit of diff --git a/ppc_fx_insns.vhdl b/ppc_fx_insns.vhdl index 0bf011d..5fdf1c7 100644 --- a/ppc_fx_insns.vhdl +++ b/ppc_fx_insns.vhdl @@ -93,7 +93,7 @@ package ppc_fx_insns is function ppc_divd (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; function ppc_divwu (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; - function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return integer; + function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return std_ulogic; end package ppc_fx_insns; package body ppc_fx_insns is @@ -785,13 +785,12 @@ package body ppc_fx_insns is return std_ulogic_vector(resize(tmp, ra'length)); end; - function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return integer is + function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return std_ulogic is variable crfield: integer; variable crbit_match: std_ulogic; variable ctr_not_zero: std_ulogic; variable ctr_ok: std_ulogic; variable cond_ok: std_ulogic; - variable ret: integer; begin crfield := to_integer(unsigned(bi)); -- BE bit numbering @@ -800,12 +799,7 @@ package body ppc_fx_insns is ctr_not_zero := '1' when ctr /= x"0000000000000001" else '0'; ctr_ok := bo(4-2) or (ctr_not_zero xor bo(4-3)); cond_ok := bo(4-0) or crbit_match; - if ctr_ok = '1' and cond_ok = '1' then - ret := 1; - else - ret := 0; - end if; - return ret; + return ctr_ok and cond_ok; end; end package body ppc_fx_insns; From ec2fa61792ca73265159f711157ae3dfa6c623e0 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Jun 2020 16:59:08 +1000 Subject: [PATCH 20/22] execute1: Reduce width of the result mux to help timing This reduces the number of different 
things that are assigned to the result variable. - The computations for the popcnt, prty, cmpb and exts instruction families are moved into the logical unit. - The result of mfspr from the slow SPRs is computed in 'spr_val' before being assigned to 'result'. - Writes to LR as a result of a blr or bclr instruction are done through the exc_write path to writeback. This eases timing considerably. Signed-off-by: Paul Mackerras --- execute1.vhdl | 60 ++++++++++++++-------------------------------- logical.vhdl | 66 ++++++++++++++++++++++++++++++++++----------------- 2 files changed, 62 insertions(+), 64 deletions(-) diff --git a/execute1.vhdl b/execute1.vhdl index 12d3df1..902af70 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -82,8 +82,6 @@ architecture behaviour of execute1 is signal rotator_carry: std_ulogic; signal logical_result: std_ulogic_vector(63 downto 0); signal countzero_result: std_ulogic_vector(63 downto 0); - signal popcnt_result: std_ulogic_vector(63 downto 0); - signal parity_result: std_ulogic_vector(63 downto 0); -- multiply signals signal x_to_multiply: Execute1ToMultiplyType; @@ -208,9 +206,7 @@ begin invert_in => e_in.invert_a, invert_out => e_in.invert_out, result => logical_result, - datalen => e_in.data_len, - popcnt => popcnt_result, - parity => parity_result + datalen => e_in.data_len ); countzero_0: entity work.zero_counter @@ -295,7 +291,6 @@ begin variable sign1, sign2 : std_ulogic; variable abs1, abs2 : signed(63 downto 0); variable overflow : std_ulogic; - variable negative : std_ulogic; variable zerohi, zerolo : std_ulogic; variable msb_a, msb_b : std_ulogic; variable a_lt : std_ulogic; @@ -308,6 +303,7 @@ begin variable is_branch : std_ulogic; variable taken_branch : std_ulogic; variable abs_branch : std_ulogic; + variable spr_val : std_ulogic_vector(63 downto 0); begin result := (others => '0'); result_with_carry := (others => '0'); @@ -627,7 +623,7 @@ begin end if; end if; end if; - when OP_AND | OP_OR | OP_XOR => + when OP_AND | 
OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS => result := logical_result; result_en := '1'; when OP_B => @@ -677,27 +673,10 @@ begin ctrl_tmp.msr(MSR_DR) <= '1'; end if; - when OP_CMPB => - result := ppc_cmpb(c_in, b_in); - result_en := '1'; when OP_CNTZ => v.e.valid := '0'; v.cntz_in_progress := '1'; v.busy := '1'; - when OP_EXTS => - -- note data_len is a 1-hot encoding - negative := (e_in.data_len(0) and c_in(7)) or - (e_in.data_len(1) and c_in(15)) or - (e_in.data_len(2) and c_in(31)); - result := (others => negative); - if e_in.data_len(2) = '1' then - result(31 downto 16) := c_in(31 downto 16); - end if; - if e_in.data_len(2) = '1' or e_in.data_len(1) = '1' then - result(15 downto 8) := c_in(15 downto 8); - end if; - result(7 downto 0) := c_in(7 downto 0); - result_en := '1'; when OP_ISEL => crbit := to_integer(unsigned(insn_bc(e_in.insn))); if e_in.cr(31-crbit) = '1' then @@ -769,24 +748,25 @@ begin result(63-45) := v.e.xerc.ca32; end if; else + spr_val := c_in; case decode_spr_num(e_in.insn) is when SPR_TB => - result := ctrl.tb; + spr_val := ctrl.tb; when SPR_DEC => - result := ctrl.dec; + spr_val := ctrl.dec; when 724 => -- LOG_ADDR SPR - result := log_wr_addr & r.log_addr_spr; + spr_val := log_wr_addr & r.log_addr_spr; when 725 => -- LOG_DATA SPR - result := log_rd_data; + spr_val := log_rd_data; v.log_addr_spr := std_ulogic_vector(unsigned(r.log_addr_spr) + 1); when others => -- mfspr from unimplemented SPRs should be a nop in -- supervisor mode and a program interrupt for user mode - result := c_in; if ctrl.msr(MSR_PR) = '1' then illegal := '1'; end if; end case; + result := spr_val; end if; when OP_MFCR => if e_in.insn(20) = '0' then @@ -862,12 +842,6 @@ begin end if; end case; end if; - when OP_POPCNT => - result := popcnt_result; - result_en := '1'; - when OP_PRTY => - result := parity_result; - result_en := '1'; when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR | OP_EXTSWSLI => result := rotator_result; if e_in.output_carry = '1' then 
@@ -917,12 +891,14 @@ begin -- Update LR on the next cycle after a branch link -- If we're not writing back anything else, we can write back LR - -- this cycle, otherwise we take an extra cycle. + -- this cycle, otherwise we take an extra cycle. We use the + -- exc_write path since next_nia is written through that path + -- in other places. if e_in.lr = '1' then if result_en = '0' then - result_en := '1'; - result := next_nia; - v.e.write_reg := fast_spr_num(SPR_LR); + v.e.exc_write_enable := '1'; + v.e.exc_write_data := next_nia; + v.e.exc_write_reg := fast_spr_num(SPR_LR); else v.lr_update := '1'; v.next_lr := next_nia; @@ -939,9 +915,9 @@ begin end if; elsif r.lr_update = '1' then - result_en := '1'; - result := r.next_lr; - v.e.write_reg := fast_spr_num(SPR_LR); + v.e.exc_write_enable := '1'; + v.e.exc_write_data := r.next_lr; + v.e.exc_write_reg := fast_spr_num(SPR_LR); v.e.valid := '1'; elsif r.cntz_in_progress = '1' then -- cnt[lt]z always takes two cycles diff --git a/logical.vhdl b/logical.vhdl index 4dfc13d..5e6abfa 100644 --- a/logical.vhdl +++ b/logical.vhdl @@ -4,6 +4,7 @@ use ieee.numeric_std.all; library work; use work.decode_types.all; +use work.ppc_fx_insns.all; entity logical is port ( @@ -13,9 +14,7 @@ entity logical is invert_in : in std_ulogic; invert_out : in std_ulogic; result : out std_ulogic_vector(63 downto 0); - datalen : in std_logic_vector(3 downto 0); - popcnt : out std_ulogic_vector(63 downto 0); - parity : out std_ulogic_vector(63 downto 0) + datalen : in std_logic_vector(3 downto 0) ); end entity logical; @@ -34,30 +33,14 @@ architecture behaviour of logical is type sixbit2 is array(0 to 1) of sixbit; signal pc32 : sixbit2; signal par0, par1 : std_ulogic; + signal popcnt : std_ulogic_vector(63 downto 0); + signal parity : std_ulogic_vector(63 downto 0); begin logical_0: process(all) variable rb_adj, tmp : std_ulogic_vector(63 downto 0); + variable negative : std_ulogic; begin - rb_adj := rb; - if invert_in = '1' then - rb_adj := not 
rb; - end if; - - case op is - when OP_AND => - tmp := rs and rb_adj; - when OP_OR => - tmp := rs or rb_adj; - when others => - tmp := rs xor rb_adj; - end case; - - result <= tmp; - if invert_out = '1' then - result <= not tmp; - end if; - -- population counts for i in 0 to 31 loop pc2(i) <= unsigned("0" & rs(i * 2 downto i * 2)) + unsigned("0" & rs(i * 2 + 1 downto i * 2 + 1)); @@ -98,5 +81,44 @@ begin parity(32) <= par1; end if; + rb_adj := rb; + if invert_in = '1' then + rb_adj := not rb; + end if; + + case op is + when OP_AND => + tmp := rs and rb_adj; + when OP_OR => + tmp := rs or rb_adj; + when OP_XOR => + tmp := rs xor rb_adj; + when OP_POPCNT => + tmp := popcnt; + when OP_PRTY => + tmp := parity; + when OP_CMPB => + tmp := ppc_cmpb(rs, rb); + when others => + -- EXTS + -- note datalen is a 1-hot encoding + negative := (datalen(0) and rs(7)) or + (datalen(1) and rs(15)) or + (datalen(2) and rs(31)); + tmp := (others => negative); + if datalen(2) = '1' then + tmp(31 downto 16) := rs(31 downto 16); + end if; + if datalen(2) = '1' or datalen(1) = '1' then + tmp(15 downto 8) := rs(15 downto 8); + end if; + tmp(7 downto 0) := rs(7 downto 0); + end case; + + if invert_out = '1' then + tmp := not tmp; + end if; + result <= tmp; + end process; end behaviour; From 78de4fef72b900ab977275c40fd21ca080671e31 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 16 Jun 2020 11:37:25 +1000 Subject: [PATCH 21/22] Make LOG_LENGTH configurable per FPGA variant This plumbs the LOG_LENGTH parameter (which controls how many entries the core log RAM has) up to the top level so that it can be set on the fusesoc command line and have different default values on different FPGAs. It now defaults to 512 entries generally and on the Artix-7 35 parts, and 2048 on the larger Artix-7 FPGAs. It can be set to 0 if desired. 
Signed-off-by: Paul Mackerras --- core.vhdl | 6 +- core_debug.vhdl | 150 ++++++++++++++++++++++++--------------------- fpga/top-arty.vhdl | 6 +- microwatt.core | 13 ++++ soc.vhdl | 6 +- 5 files changed, 107 insertions(+), 74 deletions(-) diff --git a/core.vhdl b/core.vhdl index 092df6d..4a83d69 100644 --- a/core.vhdl +++ b/core.vhdl @@ -11,7 +11,8 @@ entity core is SIM : boolean := false; DISABLE_FLATTEN : boolean := false; EX1_BYPASS : boolean := true; - ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0') + ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0'); + LOG_LENGTH : natural := 512 ); port ( clk : in std_ulogic; @@ -372,6 +373,9 @@ begin log_data(139 downto 135) <= "00000"; debug_0: entity work.core_debug + generic map ( + LOG_LENGTH => LOG_LENGTH + ) port map ( clk => clk, rst => rst_dbg, diff --git a/core_debug.vhdl b/core_debug.vhdl index 31e4ab8..9efaa7c 100644 --- a/core_debug.vhdl +++ b/core_debug.vhdl @@ -9,7 +9,7 @@ use work.common.all; entity core_debug is generic ( -- Length of log buffer - LOG_LENGTH : positive := 2048 + LOG_LENGTH : natural := 512 ); port ( clk : in std_logic; @@ -92,6 +92,8 @@ architecture behave of core_debug is constant DBG_CORE_LOG_ADDR : std_ulogic_vector(3 downto 0) := "0110"; constant DBG_CORE_LOG_DATA : std_ulogic_vector(3 downto 0) := "0111"; + constant LOG_INDEX_BITS : natural := log2(LOG_LENGTH); + -- Some internal wires signal stat_reg : std_ulogic_vector(63 downto 0); @@ -104,38 +106,12 @@ architecture behave of core_debug is signal do_gspr_rd : std_ulogic; signal gspr_index : gspr_index_t; - -- Logging RAM - constant LOG_INDEX_BITS : natural := log2(LOG_LENGTH); - subtype log_ptr_t is unsigned(LOG_INDEX_BITS - 1 downto 0); - type log_array_t is array(0 to LOG_LENGTH - 1) of std_ulogic_vector(255 downto 0); - signal log_array : log_array_t; - signal log_rd_ptr : log_ptr_t; - signal log_wr_ptr : log_ptr_t; - signal log_toggle : std_ulogic; - signal log_wr_enable : std_ulogic; - 
signal log_rd_ptr_latched : log_ptr_t; - signal log_rd : std_ulogic_vector(255 downto 0); - signal log_dmi_addr : std_ulogic_vector(31 downto 0); - signal log_dmi_data : std_ulogic_vector(63 downto 0); + signal log_dmi_addr : std_ulogic_vector(31 downto 0) := (others => '0'); + signal log_dmi_data : std_ulogic_vector(63 downto 0) := (others => '0'); signal do_dmi_log_rd : std_ulogic; - signal log_dmi_reading : std_ulogic; - signal log_dmi_read_done : std_ulogic; signal dmi_read_log_data : std_ulogic; signal dmi_read_log_data_1 : std_ulogic; - function select_dword(data : std_ulogic_vector(255 downto 0); - addr : std_ulogic_vector(31 downto 0)) return std_ulogic_vector is - variable firstbit : integer; - begin - firstbit := to_integer(unsigned(addr(1 downto 0))) * 64; - return data(firstbit + 63 downto firstbit); - end; - - attribute ram_style : string; - attribute ram_style of log_array : signal is "block"; - attribute ram_decomp : string; - attribute ram_decomp of log_array : signal is "power"; - begin -- Single cycle register accesses on DMI except for GSPR data dmi_ack <= dmi_req when dmi_addr /= DBG_CORE_GSPR_DATA @@ -241,50 +217,86 @@ begin icache_rst <= do_icreset; terminated_out <= terminated; - -- Use MSB of read addresses to stop the logging - log_wr_enable <= not (log_read_addr(31) or log_dmi_addr(31)); - - log_ram: process(clk) - begin - if rising_edge(clk) then - if log_wr_enable = '1' then - log_array(to_integer(log_wr_ptr)) <= log_data; - end if; - log_rd <= log_array(to_integer(log_rd_ptr_latched)); - end if; - end process; - + -- Logging RAM + maybe_log: if LOG_LENGTH > 0 generate + subtype log_ptr_t is unsigned(LOG_INDEX_BITS - 1 downto 0); + type log_array_t is array(0 to LOG_LENGTH - 1) of std_ulogic_vector(255 downto 0); + signal log_array : log_array_t; + signal log_rd_ptr : log_ptr_t; + signal log_wr_ptr : log_ptr_t; + signal log_toggle : std_ulogic; + signal log_wr_enable : std_ulogic; + signal log_rd_ptr_latched : log_ptr_t; + signal log_rd 
: std_ulogic_vector(255 downto 0); + signal log_dmi_reading : std_ulogic; + signal log_dmi_read_done : std_ulogic; + + function select_dword(data : std_ulogic_vector(255 downto 0); + addr : std_ulogic_vector(31 downto 0)) return std_ulogic_vector is + variable firstbit : integer; + begin + firstbit := to_integer(unsigned(addr(1 downto 0))) * 64; + return data(firstbit + 63 downto firstbit); + end; + + attribute ram_style : string; + attribute ram_style of log_array : signal is "block"; + attribute ram_decomp : string; + attribute ram_decomp of log_array : signal is "power"; - log_buffer: process(clk) - variable b : integer; - variable data : std_ulogic_vector(255 downto 0); begin - if rising_edge(clk) then - if rst = '1' then - log_wr_ptr <= (others => '0'); - log_toggle <= '0'; - elsif log_wr_enable = '1' then - if log_wr_ptr = to_unsigned(LOG_LENGTH - 1, LOG_INDEX_BITS) then - log_toggle <= not log_toggle; + -- Use MSB of read addresses to stop the logging + log_wr_enable <= not (log_read_addr(31) or log_dmi_addr(31)); + + log_ram: process(clk) + begin + if rising_edge(clk) then + if log_wr_enable = '1' then + log_array(to_integer(log_wr_ptr)) <= log_data; end if; - log_wr_ptr <= log_wr_ptr + 1; - end if; - if do_dmi_log_rd = '1' then - log_rd_ptr_latched <= unsigned(log_dmi_addr(LOG_INDEX_BITS + 1 downto 2)); - else - log_rd_ptr_latched <= unsigned(log_read_addr(LOG_INDEX_BITS + 1 downto 2)); + log_rd <= log_array(to_integer(log_rd_ptr_latched)); end if; - if log_dmi_read_done = '1' then - log_dmi_data <= select_dword(log_rd, log_dmi_addr); - else - log_read_data <= select_dword(log_rd, log_read_addr); + end process; + + + log_buffer: process(clk) + variable b : integer; + variable data : std_ulogic_vector(255 downto 0); + begin + if rising_edge(clk) then + if rst = '1' then + log_wr_ptr <= (others => '0'); + log_toggle <= '0'; + elsif log_wr_enable = '1' then + if log_wr_ptr = to_unsigned(LOG_LENGTH - 1, LOG_INDEX_BITS) then + log_toggle <= not log_toggle; + 
end if; + log_wr_ptr <= log_wr_ptr + 1; + end if; + if do_dmi_log_rd = '1' then + log_rd_ptr_latched <= unsigned(log_dmi_addr(LOG_INDEX_BITS + 1 downto 2)); + else + log_rd_ptr_latched <= unsigned(log_read_addr(LOG_INDEX_BITS + 1 downto 2)); + end if; + if log_dmi_read_done = '1' then + log_dmi_data <= select_dword(log_rd, log_dmi_addr); + else + log_read_data <= select_dword(log_rd, log_read_addr); + end if; + log_dmi_read_done <= log_dmi_reading; + log_dmi_reading <= do_dmi_log_rd; end if; - log_dmi_read_done <= log_dmi_reading; - log_dmi_reading <= do_dmi_log_rd; - end if; - end process; - log_write_addr(LOG_INDEX_BITS - 1 downto 0) <= std_ulogic_vector(log_wr_ptr); - log_write_addr(LOG_INDEX_BITS) <= '1'; - log_write_addr(31 downto LOG_INDEX_BITS + 1) <= (others => '0'); + end process; + log_write_addr(LOG_INDEX_BITS - 1 downto 0) <= std_ulogic_vector(log_wr_ptr); + log_write_addr(LOG_INDEX_BITS) <= '1'; + log_write_addr(31 downto LOG_INDEX_BITS + 1) <= (others => '0'); + end generate; + + no_log: if LOG_LENGTH = 0 generate + begin + log_read_data <= (others => '0'); + log_write_addr <= x"00000001"; + end generate; + end behave; diff --git a/fpga/top-arty.vhdl b/fpga/top-arty.vhdl index b13ed34..44b59c3 100644 --- a/fpga/top-arty.vhdl +++ b/fpga/top-arty.vhdl @@ -20,7 +20,8 @@ entity toplevel is SCLK_STARTUPE2 : boolean := false; SPI_FLASH_OFFSET : integer := 4194304; SPI_FLASH_DEF_CKDV : natural := 1; - SPI_FLASH_DEF_QUAD : boolean := true + SPI_FLASH_DEF_QUAD : boolean := true; + LOG_LENGTH : natural := 512 ); port( ext_clk : in std_ulogic; @@ -139,7 +140,8 @@ begin SPI_FLASH_DLINES => 4, SPI_FLASH_OFFSET => SPI_FLASH_OFFSET, SPI_FLASH_DEF_CKDV => SPI_FLASH_DEF_CKDV, - SPI_FLASH_DEF_QUAD => SPI_FLASH_DEF_QUAD + SPI_FLASH_DEF_QUAD => SPI_FLASH_DEF_QUAD, + LOG_LENGTH => LOG_LENGTH ) port map ( -- System signals diff --git a/microwatt.core b/microwatt.core index 7d86cc2..85710be 100644 --- a/microwatt.core +++ b/microwatt.core @@ -110,6 +110,7 @@ targets: - 
clk_input - clk_frequency - disable_flatten_core + - log_length=2048 tools: vivado: {part : xc7a100tcsg324-1} toplevel : toplevel @@ -124,6 +125,7 @@ targets: - clk_frequency - disable_flatten_core - spi_flash_offset=10485760 + - log_length=2048 tools: vivado: {part : xc7a200tsbg484-1} toplevel : toplevel @@ -138,6 +140,7 @@ targets: - disable_flatten_core - no_bram - spi_flash_offset=10485760 + - log_length=2048 generate: [dram_nexys_video] tools: vivado: {part : xc7a200tsbg484-1} @@ -153,6 +156,7 @@ targets: - clk_frequency - disable_flatten_core - spi_flash_offset=3145728 + - log_length=512 tools: vivado: {part : xc7a35ticsg324-1L} toplevel : toplevel @@ -167,6 +171,7 @@ targets: - disable_flatten_core - no_bram - spi_flash_offset=3145728 + - log_length=512 generate: [dram_arty] tools: vivado: {part : xc7a35ticsg324-1L} @@ -182,6 +187,7 @@ targets: - clk_frequency - disable_flatten_core - spi_flash_offset=4194304 + - log_length=2048 tools: vivado: {part : xc7a100ticsg324-1L} toplevel : toplevel @@ -196,6 +202,7 @@ targets: - disable_flatten_core - no_bram - spi_flash_offset=4194304 + - log_length=2048 generate: [dram_arty] tools: vivado: {part : xc7a100ticsg324-1L} @@ -211,6 +218,7 @@ targets: - clk_input=12000000 - clk_frequency - disable_flatten_core + - log_length=512 tools: vivado: {part : xc7a35tcpg236-1} toplevel : toplevel @@ -281,3 +289,8 @@ parameters: datatype : int description : Offset (in bytes) in the SPI flash of the code payload to run paramtype : generic + + log_length: + datatype : int + description : Length of the core log buffer in entries (32 bytes each) + paramtype : generic diff --git a/soc.vhdl b/soc.vhdl index 7c8e825..8c2fbfb 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -41,7 +41,8 @@ entity soc is SPI_FLASH_DLINES : positive := 1; SPI_FLASH_OFFSET : integer := 0; SPI_FLASH_DEF_CKDV : natural := 2; - SPI_FLASH_DEF_QUAD : boolean := false + SPI_FLASH_DEF_QUAD : boolean := false; + LOG_LENGTH : natural := 512 ); port( rst : in std_ulogic; @@ 
-186,7 +187,8 @@ begin generic map( SIM => SIM, DISABLE_FLATTEN => DISABLE_FLATTEN_CORE, - ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1') + ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1'), + LOG_LENGTH => LOG_LENGTH ) port map( clk => system_clk, From 64efd494e512d7a962896ebdfc38817880d0d281 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 16 Jun 2020 16:59:54 +1000 Subject: [PATCH 22/22] fpga: Add a xilinx_specific fileset to microwatt.core At present this just has the Xilinx-specific multiplier code, but might in future have other things. This also adds the xilinx_specific fileset to the synth target. Without that it was failing because there was no multiplier. Signed-off-by: Paul Mackerras --- microwatt.core | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/microwatt.core b/microwatt.core index 85710be..83d7762 100644 --- a/microwatt.core +++ b/microwatt.core @@ -61,6 +61,10 @@ filesets: - fpga/firmware.hex : {copyto : firmware.hex, file_type : user} file_type : vhdlSource-2008 + xilinx_specific: + files: + - xilinx-mult.vhdl : {file_type : vhdlSource-2008} + debug_xilinx: files: - dmi_dtm_xilinx.vhdl : {file_type : vhdlSource-2008} @@ -74,28 +78,24 @@ filesets: - fpga/nexys_a7.xdc : {file_type : xdc} - fpga/clk_gen_plle2.vhd : {file_type : vhdlSource-2008} - fpga/top-generic.vhdl : {file_type : vhdlSource-2008} - - xilinx-mult.vhdl : {file_type : vhdlSource-2008} nexys_video: files: - fpga/nexys-video.xdc : {file_type : xdc} - fpga/clk_gen_plle2.vhd : {file_type : vhdlSource-2008} - fpga/top-nexys-video.vhdl : {file_type : vhdlSource-2008} - - xilinx-mult.vhdl : {file_type : vhdlSource-2008} arty_a7: files: - fpga/arty_a7.xdc : {file_type : xdc} - fpga/clk_gen_plle2.vhd : {file_type : vhdlSource-2008} - fpga/top-arty.vhdl : {file_type : vhdlSource-2008} - - xilinx-mult.vhdl : {file_type : vhdlSource-2008} cmod_a7-35: files: - fpga/cmod_a7-35.xdc : {file_type : xdc} - fpga/clk_gen_mcmm.vhd : 
{file_type : vhdlSource-2008} - fpga/top-generic.vhdl : {file_type : vhdlSource-2008} - - xilinx-mult.vhdl : {file_type : vhdlSource-2008} litedram: depend : [":microwatt:litedram"] @@ -103,7 +103,7 @@ filesets: targets: nexys_a7: default_tool: vivado - filesets: [core, nexys_a7, soc, fpga, debug_xilinx] + filesets: [core, nexys_a7, soc, fpga, debug_xilinx, xilinx_specific] parameters : - memory_size - ram_init_file @@ -117,7 +117,7 @@ targets: nexys_video-nodram: default_tool: vivado - filesets: [core, nexys_video, soc, fpga, debug_xilinx] + filesets: [core, nexys_video, soc, fpga, debug_xilinx, xilinx_specific] parameters : - memory_size - ram_init_file @@ -132,7 +132,7 @@ targets: nexys_video: default_tool: vivado - filesets: [core, nexys_video, soc, fpga, debug_xilinx, litedram] + filesets: [core, nexys_video, soc, fpga, debug_xilinx, litedram, xilinx_specific] parameters: - memory_size - ram_init_file @@ -148,7 +148,7 @@ targets: arty_a7-35-nodram: default_tool: vivado - filesets: [core, arty_a7, soc, fpga, debug_xilinx] + filesets: [core, arty_a7, soc, fpga, debug_xilinx, xilinx_specific] parameters : - memory_size - ram_init_file @@ -163,7 +163,7 @@ targets: arty_a7-35: default_tool: vivado - filesets: [core, arty_a7, soc, fpga, debug_xilinx, litedram] + filesets: [core, arty_a7, soc, fpga, debug_xilinx, litedram, xilinx_specific] parameters : - memory_size - ram_init_file @@ -179,7 +179,7 @@ targets: arty_a7-100-nodram: default_tool: vivado - filesets: [core, arty_a7, soc, fpga, debug_xilinx] + filesets: [core, arty_a7, soc, fpga, debug_xilinx, xilinx_specific] parameters : - memory_size - ram_init_file @@ -194,7 +194,7 @@ targets: arty_a7-100: default_tool: vivado - filesets: [core, arty_a7, soc, fpga, debug_xilinx, litedram] + filesets: [core, arty_a7, soc, fpga, debug_xilinx, litedram, xilinx_specific] parameters: - memory_size - ram_init_file @@ -210,7 +210,7 @@ targets: cmod_a7-35: default_tool: vivado - filesets: [core, cmod_a7-35, soc, fpga, 
debug_xilinx] + filesets: [core, cmod_a7-35, soc, fpga, debug_xilinx, xilinx_specific] parameters : - memory_size - ram_init_file @@ -224,7 +224,7 @@ targets: toplevel : toplevel synth: - filesets: [core, soc] + filesets: [core, soc, xilinx_specific] tools: vivado: {pnr : none} toplevel: core