dcache: Rework RAM wrapper to synthesize better on Xilinx

The global wr_en signal is causing Vivado to generate two TDP (True Dual Port) block RAMs instead of one SDP (Simple Dual Port) for each cache way. Remove it and instead apply an AND to the individual byte write enables.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
6 years ago · ecaa5e2fb2
parent a9178ed0c1
commit ecaa5e2fb2
3 changed files with 32 additions and 23 deletions
--- a/cache_ram.vhdl
+++ b/cache_ram.vhdl
@@ -16,7 +16,6 @@ entity cache_ram is
 	rd_en   : in  std_logic;
 	rd_addr : in  std_logic_vector(ROW_BITS - 1 downto 0);
 	rd_data : out std_logic_vector(WIDTH - 1 downto 0);
-	wr_en   : in  std_logic;
 	wr_sel  : in  std_logic_vector(WIDTH/8 - 1 downto 0);
 	wr_addr : in  std_logic_vector(ROW_BITS - 1 downto 0);
 	wr_data : in  std_logic_vector(WIDTH - 1 downto 0)
@@ -31,8 +30,6 @@ architecture rtl of cache_ram is
    signal ram : ram_type;
    attribute ram_style : string;
    attribute ram_style of ram : signal is "block";
-    attribute ram_decomp : string;
-    attribute ram_decomp of ram : signal is "power";

    signal rd_data0 : std_logic_vector(WIDTH - 1 downto 0);

@@ -41,14 +38,17 @@ begin
 	variable lbit : integer range 0 to WIDTH - 1;
 	variable mbit : integer range 0 to WIDTH - 1;
 	variable widx : integer range 0 to SIZE - 1;
+	constant sel0 : std_logic_vector(WIDTH/8 - 1 downto 0)
+            := (others => '0');
    begin
 	if rising_edge(clk) then
-	    if wr_en = '1' then
            if TRACE then
+                if wr_sel /= sel0 then
                    report "write a:" & to_hstring(wr_addr) &
                        " sel:" & to_hstring(wr_sel) &
                        " dat:" & to_hstring(wr_data);
                end if;
+            end if;
            for i in 0 to WIDTH/8-1 loop
                lbit := i * 8;
                mbit := lbit + 7;
@@ -57,7 +57,6 @@ begin
                    ram(widx)(mbit downto lbit) <= wr_data(mbit downto lbit);
                end if;
            end loop;
-	    end if;
 	    if rd_en = '1' then
 		rd_data0 <= ram(to_integer(unsigned(rd_addr)));
 		if TRACE then
--- a/dcache.vhdl
+++ b/dcache.vhdl
@@ -923,6 +923,7 @@ begin
 	signal wr_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
 	signal wr_data  : std_ulogic_vector(wishbone_data_bits-1 downto 0);
 	signal wr_sel   : std_ulogic_vector(ROW_SIZE-1 downto 0);
+	signal wr_sel_m : std_ulogic_vector(ROW_SIZE-1 downto 0);
 	signal dout     : cache_row_t;
    begin
 	way: entity work.cache_ram
@@ -936,8 +937,7 @@ begin
 		rd_en   => do_read,
 		rd_addr => rd_addr,
 		rd_data => dout,
-		wr_en   => do_write,
-		wr_sel  => wr_sel,
+		wr_sel  => wr_sel_m,
 		wr_addr => wr_addr,
 		wr_data => wr_data
 		);
@@ -986,6 +986,13 @@ begin
 		    severity FAILURE;
 		do_write <= '1';
 	    end if;
+
+            -- Mask write selects with do_write since BRAM doesn't
+            -- have a global write-enable
+            for i in 0 to ROW_SIZE-1 loop
+                wr_sel_m(i) <= wr_sel(i) and do_write;
+            end loop;
+
        end process;
    end generate;

--- a/icache.vhdl
+++ b/icache.vhdl
@@ -340,6 +340,7 @@ begin
 	signal rd_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
 	signal wr_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
 	signal dout     : cache_row_t;
+	signal wr_sel   : std_ulogic_vector(ROW_SIZE-1 downto 0);
    begin
 	way: entity work.cache_ram
 	    generic map (
@@ -351,8 +352,7 @@ begin
 		rd_en   => do_read,
 		rd_addr => rd_addr,
 		rd_data => dout,
-		wr_en   => do_write,
-		wr_sel  => (others => '1'),
+		wr_sel  => wr_sel,
 		wr_addr => wr_addr,
 		wr_data => wishbone_in.dat
 		);
@@ -366,6 +366,9 @@ begin
 	    cache_out(i) <= dout;
 	    rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
 	    wr_addr <= std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
+            for i in 0 to ROW_SIZE-1 loop
+                wr_sel(i) <= do_write;
+            end loop;
 	end process;
    end generate;