From ecaa5e2fb28e08650ea51936f3018cf5cec32c58 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 25 May 2020 16:48:47 +1000 Subject: [PATCH] dcache: Rework RAM wrapper to synthesize better on Xilinx The global wr_en signal is causing Vivado to generate two TDP (True Dual Port) block RAMs instead of one SDP (Simple Dual Port) for each cache way. Remove it and instead apply an AND to the individual byte write enables. Signed-off-by: Benjamin Herrenschmidt --- cache_ram.vhdl | 35 +++++++++++++++++------------------ dcache.vhdl | 13 ++++++++++--- icache.vhdl | 7 +++++-- 3 files changed, 32 insertions(+), 23 deletions(-) diff --git a/cache_ram.vhdl b/cache_ram.vhdl index 7a10a1c..4db8e0a 100644 --- a/cache_ram.vhdl +++ b/cache_ram.vhdl @@ -16,7 +16,6 @@ entity cache_ram is rd_en : in std_logic; rd_addr : in std_logic_vector(ROW_BITS - 1 downto 0); rd_data : out std_logic_vector(WIDTH - 1 downto 0); - wr_en : in std_logic; wr_sel : in std_logic_vector(WIDTH/8 - 1 downto 0); wr_addr : in std_logic_vector(ROW_BITS - 1 downto 0); wr_data : in std_logic_vector(WIDTH - 1 downto 0) @@ -31,8 +30,6 @@ architecture rtl of cache_ram is signal ram : ram_type; attribute ram_style : string; attribute ram_style of ram : signal is "block"; - attribute ram_decomp : string; - attribute ram_decomp of ram : signal is "power"; signal rd_data0 : std_logic_vector(WIDTH - 1 downto 0); @@ -41,23 +38,25 @@ begin variable lbit : integer range 0 to WIDTH - 1; variable mbit : integer range 0 to WIDTH - 1; variable widx : integer range 0 to SIZE - 1; + constant sel0 : std_logic_vector(WIDTH/8 - 1 downto 0) + := (others => '0'); begin if rising_edge(clk) then - if wr_en = '1' then - if TRACE then - report "write a:" & to_hstring(wr_addr) & - " sel:" & to_hstring(wr_sel) & - " dat:" & to_hstring(wr_data); - end if; - for i in 0 to WIDTH/8-1 loop - lbit := i * 8; - mbit := lbit + 7; - widx := to_integer(unsigned(wr_addr)); - if wr_sel(i) = '1' then - ram(widx)(mbit downto lbit) <= 
wr_data(mbit downto lbit); - end if; - end loop; - end if; + if TRACE then + if wr_sel /= sel0 then + report "write a:" & to_hstring(wr_addr) & + " sel:" & to_hstring(wr_sel) & + " dat:" & to_hstring(wr_data); + end if; + end if; + for i in 0 to WIDTH/8-1 loop + lbit := i * 8; + mbit := lbit + 7; + widx := to_integer(unsigned(wr_addr)); + if wr_sel(i) = '1' then + ram(widx)(mbit downto lbit) <= wr_data(mbit downto lbit); + end if; + end loop; if rd_en = '1' then rd_data0 <= ram(to_integer(unsigned(rd_addr))); if TRACE then diff --git a/dcache.vhdl b/dcache.vhdl index 1d9cbda..9df5562 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -923,6 +923,7 @@ begin signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0); signal wr_data : std_ulogic_vector(wishbone_data_bits-1 downto 0); signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0); + signal wr_sel_m : std_ulogic_vector(ROW_SIZE-1 downto 0); signal dout : cache_row_t; begin way: entity work.cache_ram @@ -936,8 +937,7 @@ begin rd_en => do_read, rd_addr => rd_addr, rd_data => dout, - wr_en => do_write, - wr_sel => wr_sel, + wr_sel => wr_sel_m, wr_addr => wr_addr, wr_data => wr_data ); @@ -986,7 +986,14 @@ begin severity FAILURE; do_write <= '1'; end if; - end process; + + -- Mask write selects with do_write since BRAM doesn't + -- have a global write-enable + for i in 0 to ROW_SIZE-1 loop + wr_sel_m(i) <= wr_sel(i) and do_write; + end loop; + + end process; end generate; -- diff --git a/icache.vhdl b/icache.vhdl index 35d64a8..553b885 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -340,6 +340,7 @@ begin signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0); signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0); signal dout : cache_row_t; + signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0); begin way: entity work.cache_ram generic map ( @@ -351,8 +352,7 @@ begin rd_en => do_read, rd_addr => rd_addr, rd_data => dout, - wr_en => do_write, - wr_sel => (others => '1'), + wr_sel => wr_sel, wr_addr => wr_addr, wr_data => 
wishbone_in.dat ); @@ -366,6 +366,9 @@ begin cache_out(i) <= dout; rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); wr_addr <= std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS)); + for i in 0 to ROW_SIZE-1 loop + wr_sel(i) <= do_write; + end loop; end process; end generate;