Add arrays for ASIC flow

Add VHDL wrappers and verilog behaviourals for the cache_ram, register_file and main_bram arrays. Signed-off-by: Anton Blanchard <anton@linux.ibm.com>
5 years ago · 8ecb30da05
parent 747c96b100
commit 8ecb30da05
8 changed files with 523 additions and 0 deletions
--- a/asic/behavioural/Microwatt_FP_DFFRFile.v
+++ b/asic/behavioural/Microwatt_FP_DFFRFile.v
@ -0,0 +1,24 @@
+// Behavioural (simulation) model of the register-file array: 96 entries of
+// 64 bits, three combinational read ports and one clocked write port.
+module Microwatt_FP_DFFRFile (
+`ifdef USE_POWER_PINS
+    inout         VPWR,
+    inout         VGND,
+`endif
+    input  [6:0]  R1,   // read address, port 1
+    input  [6:0]  R2,   // read address, port 2
+    input  [6:0]  R3,   // read address, port 3
+    input  [6:0]  RW,   // write address
+    input  [63:0] DW,   // write data
+    output [63:0] D1,   // read data, port 1
+    output [63:0] D2,   // read data, port 2
+    output [63:0] D3,   // read data, port 3
+    input         CLK,
+    input         WE    // write enable, sampled on the rising edge of CLK
+);
+
+    // Storage. Addresses are 7 bits wide but only entries 0..95 exist.
+    reg [63:0] registers[0:95];
+
+    // Single write port: the entry selected by RW is updated on the rising
+    // clock edge when WE is high.
+    always @(posedge CLK)
+        if (WE) registers[RW] <= DW;
+
+    // Read ports are not clocked: each output follows the addressed entry.
+    assign D1 = registers[R1],
+           D2 = registers[R2],
+           D3 = registers[R3];
+
+endmodule
--- a/asic/behavioural/RAM32_1RW1R.v
+++ b/asic/behavioural/RAM32_1RW1R.v
@ -0,0 +1,40 @@
+// Behavioural model of a 64-bit wide RAM with 2**BITS rows (32 by default).
+// Port 0 performs byte-granular writes; port 1 is a registered read port.
+// Note that Do0 is declared but never driven by this model.
+module RAM32_1RW1R #(
+    parameter BITS=5
+) (
+`ifdef USE_POWER_PINS
+    inout VPWR,
+    inout VGND,
+`endif
+    input CLK,
+
+    input EN0,
+    input [BITS-1:0] A0,
+    input [7:0] WE0,
+    input [63:0] Di0,
+    output reg [63:0] Do0,
+
+    input EN1,
+    input [BITS-1:0] A1,
+    output reg [63:0] Do1
+);
+
+    reg [63:0] RAM[2**BITS-1:0];
+
+    integer lane;
+
+    // Port 0: when enabled, each set bit of WE0 stores the matching byte of
+    // Di0 into the row addressed by A0; unselected bytes keep their value.
+    always @(posedge CLK) begin
+        if (EN0) begin
+            for (lane = 0; lane < 8; lane = lane + 1) begin
+                if (WE0[lane])
+                    RAM[A0][lane*8 +: 8] <= Di0[lane*8 +: 8];
+            end
+        end
+    end
+
+    // Port 1: registered read. Do1 keeps its previous value while EN1 is low.
+    always @(posedge CLK) begin
+        if (EN1)
+            Do1 <= RAM[A1];
+    end
+
+endmodule
--- a/asic/behavioural/RAM512.v
+++ b/asic/behavioural/RAM512.v
@ -0,0 +1,42 @@
+// Behavioural model of a single-port, 64-bit wide RAM with 2**BITS rows
+// (512 by default), byte-granular writes and a registered read output.
+// The array is preloaded from the hex file named by FILENAME.
+module RAM512 #(
+    parameter BITS=9,
+    parameter FILENAME="firmware.hex"
+) (
+`ifdef USE_POWER_PINS
+    inout VPWR,
+    inout VGND,
+`endif
+    input CLK,
+    input [7:0] WE0,
+    input EN0,
+    input [63:0] Di0,
+    output reg [63:0] Do0,
+    input [BITS-1:0] A0
+);
+
+    reg [63:0] RAM[2**BITS-1:0];
+
+    integer lane;
+
+    // Load the initial memory image at the start of simulation.
+    initial $readmemh(FILENAME, RAM);
+
+    // Write path: when enabled, each set bit of WE0 stores the matching byte
+    // of Di0 into the row addressed by A0.
+    always @(posedge CLK) begin
+        if (EN0) begin
+            for (lane = 0; lane < 8; lane = lane + 1) begin
+                if (WE0[lane])
+                    RAM[A0][lane*8 +: 8] <= Di0[lane*8 +: 8];
+            end
+        end
+    end
+
+    // Read path: Do0 is registered and reads as zero unless EN0 was high at
+    // the clock edge. The later nonblocking assignment overrides the default.
+    always @(posedge CLK) begin
+        Do0 <= 64'b0;
+        if (EN0)
+            Do0 <= RAM[A0];
+    end
+
+endmodule
--- a/asic/behavioural/multiply_add_64x64.v
+++ b/asic/behavioural/multiply_add_64x64.v
@ -0,0 +1,24 @@
+// Behavioural model of a pipelined multiply-add: o = (a * b) + c.
+// Operands are treated as unsigned and the result is 2*BITS wide. The result
+// for operands sampled on one rising edge of clk appears on o after the
+// third rising edge (three register stages).
+module multiply_add_64x64
+#(
+    parameter BITS=64
+) (
+`ifdef USE_POWER_PINS
+    inout VPWR,
+    inout VGND,
+`endif
+    input clk,
+    input [BITS-1:0] a,
+    input [BITS-1:0] b,
+    input [BITS*2-1:0] c,
+    output [BITS*2-1:0] o
+);
+    // Three-deep result pipeline; index 0 is the newest entry.
+    reg [BITS*2-1:0] o_tmp[2:0];
+
+    // Nonblocking assignments make this a true shift register regardless of
+    // statement order, and defer the update of o so that other logic clocked
+    // on the same edge samples the pre-edge value without a simulation race.
+    always @(posedge clk) begin
+        o_tmp[0] <= (a * b) + c;
+        o_tmp[1] <= o_tmp[0];
+        o_tmp[2] <= o_tmp[1];
+    end
+
+    assign o = o_tmp[2];
+endmodule
--- a/asic/cache_ram.vhdl
+++ b/asic/cache_ram.vhdl
@ -0,0 +1,99 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+use ieee.math_real.all;
+
+-- ASIC variant of cache_ram: one read port and one byte-selectable write
+-- port, implemented on top of the RAM32_1RW1R array.
+entity cache_ram is
+    generic(
+        -- Address width in bits; the architecture asserts ROW_BITS = 5.
+        ROW_BITS : integer := 5;
+        -- Data width in bits; the architecture asserts WIDTH = 64.
+        WIDTH    : integer := 64;
+        -- Tracing is not supported by this implementation; must be false.
+        TRACE    : boolean := false;
+        -- When true, rd_data passes through one additional clocked register.
+        ADD_BUF  : boolean := false
+        );
+
+    port(
+        clk     : in std_logic;
+
+        -- Read port: rd_en requests a read of the row at rd_addr.
+        rd_en   : in std_logic;
+        rd_addr : in std_logic_vector(ROW_BITS - 1 downto 0);
+        rd_data : out std_logic_vector(WIDTH - 1 downto 0);
+
+        -- Write port: one select bit per byte of wr_data; a write happens
+        -- whenever at least one bit of wr_sel is set.
+        wr_sel  : in std_logic_vector(WIDTH/8 - 1 downto 0);
+        wr_addr : in std_logic_vector(ROW_BITS - 1 downto 0);
+        wr_data : in std_logic_vector(WIDTH - 1 downto 0)
+        );
+
+end cache_ram;
+
+-- Maps the cache_ram interface onto a single RAM32_1RW1R instance: port 0
+-- of the array is used for writes only, port 1 for reads. Extra logic holds
+-- the most recent read value while no read is in progress.
+architecture rtl of cache_ram is
+    -- 32 x 64-bit array with one read/write port (0) and one read port (1).
+    component RAM32_1RW1R port(
+        CLK     : in std_logic;
+
+        EN0     : in std_logic;
+        A0      : in std_logic_vector(4 downto 0);
+        WE0     : in std_logic_vector(7 downto 0);
+        Di0     : in std_logic_vector(63 downto 0);
+        Do0     : out std_logic_vector(63 downto 0);
+
+        EN1     : in std_logic;
+        A1      : in std_logic_vector(4 downto 0);
+        Do1     : out std_logic_vector(63 downto 0)
+        );
+    end component;
+
+    -- High when any byte select is set; enables port 0 of the array.
+    signal wr_enable: std_logic;
+    -- Raw read data from port 1 of the array.
+    signal rd_data0_tmp : std_logic_vector(WIDTH - 1 downto 0);
+    -- Copy of the last read data, captured the cycle after it was produced.
+    signal rd_data0_saved : std_logic_vector(WIDTH - 1 downto 0);
+    -- Read data after the hold-last-value selection.
+    signal rd_data0 : std_logic_vector(WIDTH - 1 downto 0);
+    -- rd_en delayed by one clock; marks the cycle in which rd_data0_tmp
+    -- carries the result of a read.
+    signal rd_en_prev: std_ulogic;
+begin
+    -- The array has a fixed geometry, so reject any other configuration.
+    assert (ROW_BITS = 5)  report "ROW_BITS must be 5" severity FAILURE;
+    assert (WIDTH = 64)    report "Must be 64 bit" severity FAILURE;
+    assert (TRACE = false) report "Trace not supported" severity FAILURE;
+
+    wr_enable <= or(wr_sel);
+
+    cache_ram_0 : RAM32_1RW1R
+        port map (
+            CLK     => clk,
+
+            -- Port 0 is write-only here; its read output is left unconnected.
+            EN0     => wr_enable,
+            A0      => wr_addr,
+            WE0     => wr_sel,
+            Di0     => wr_data,
+            Do0     => open,
+
+            EN1     => rd_en,
+            A1      => rd_addr,
+            Do1     => rd_data0_tmp
+            );
+
+    -- The caches rely on cache_ram latching the last read. Handle it here
+    -- for now.
+    -- In the cycle after a read (rd_en_prev = '1') the array output is passed
+    -- straight through and also captured into rd_data0_saved at the next
+    -- clock edge; in all other cycles the captured copy is presented.
+    process(clk)
+    begin
+        if rising_edge(clk) then
+            rd_en_prev <= rd_en;
+            if rd_en_prev = '1' then
+                rd_data0_saved <= rd_data0_tmp;
+            end if;
+        end if;
+    end process;
+    rd_data0 <= rd_data0_tmp when rd_en_prev = '1' else rd_data0_saved;
+
+    -- Optional output register, adding one clock of read latency.
+    buf: if ADD_BUF generate
+    begin
+        process(clk)
+        begin
+            if rising_edge(clk) then
+                rd_data <= rd_data0;
+            end if;
+        end process;
+    end generate;
+
+    -- Without the buffer the selected read data drives the output directly.
+    nobuf: if not ADD_BUF generate
+    begin
+        rd_data <= rd_data0;
+    end generate;
+
+end architecture rtl;
--- a/asic/main_bram.vhdl
+++ b/asic/main_bram.vhdl
@ -0,0 +1,63 @@
+library ieee;
+use ieee.std_logic_1164.all;
+
+library work;
+
+-- ASIC variant of main_bram: a single-port, byte-writable memory built on
+-- the RAM512 array.
+entity main_bram is
+    generic(
+        -- Data width in bits; the architecture asserts WIDTH = 64.
+        WIDTH        : natural := 64;
+        -- Address width in bits; the architecture asserts HEIGHT_BITS = 9.
+        HEIGHT_BITS  : natural;
+        -- The architecture asserts MEMORY_SIZE = 4096.
+        MEMORY_SIZE  : natural;
+        -- Not referenced by the architecture in this implementation.
+        RAM_INIT_FILE : string
+        );
+    port(
+        clk  : in std_logic;
+        addr : in std_logic_vector(HEIGHT_BITS - 1 downto 0) ;
+        din  : in std_logic_vector(WIDTH-1 downto 0);
+        dout : out std_logic_vector(WIDTH-1 downto 0);
+        -- One select bit per byte of din; only honoured while we = '1'.
+        sel  : in std_logic_vector((WIDTH/8)-1 downto 0);
+        re   : in std_ulogic;
+        we   : in std_ulogic
+        );
+end entity main_bram;
+
+-- Wraps one RAM512 instance and adds an output register. RAM_INIT_FILE is
+-- not forwarded to the array; the component declaration below exposes no
+-- generics, so the array uses its own default image file.
+architecture behaviour of main_bram is
+    -- Single-port 512 x 64-bit array with per-byte write enables.
+    component RAM512 port (
+        CLK : in std_ulogic;
+        WE0 : in std_ulogic_vector(7 downto 0);
+        EN0 : in std_ulogic;
+        Di0 : in std_ulogic_vector(63 downto 0);
+        Do0 : out std_ulogic_vector(63 downto 0);
+        A0  : in std_ulogic_vector(8 downto 0)
+    );
+    end component;
+
+    -- Byte selects gated by the write enable.
+    signal sel_qual: std_ulogic_vector((WIDTH/8)-1 downto 0);
+
+    -- Registered read data coming out of the array.
+    signal obuf : std_logic_vector(WIDTH-1 downto 0);
+begin
+    assert (WIDTH = 64)         report "Must be 64 bit" severity FAILURE;
+    -- RAM512 has a 9-bit address: 512 rows of 8 bytes each, i.e. 4096 bytes
+    -- (assuming MEMORY_SIZE is expressed in bytes), so 9 is the exact value.
+    assert (HEIGHT_BITS = 9)    report "HEIGHT_BITS must be 9" severity FAILURE;
+    assert (MEMORY_SIZE = 4096) report "MEMORY_SIZE must be 4096" severity FAILURE;
+
+    -- Suppress all byte writes unless this is a write access.
+    sel_qual <= sel when we = '1' else (others => '0');
+
+    memory_0 : RAM512
+        port map (
+            CLK  => clk,
+            WE0  => sel_qual(7 downto 0),
+            -- The array is enabled for both reads and writes.
+            EN0  => re or we,
+            Di0  => din(63 downto 0),
+            Do0  => obuf(63 downto 0),
+            A0   => addr(8 downto 0)
+            );
+
+    -- The wishbone BRAM wrapper assumes a 1 cycle delay
+    memory_read_buffer: process(clk)
+    begin
+        if rising_edge(clk) then
+            dout <= obuf;
+        end if;
+    end process;
+end architecture behaviour;
--- a/asic/multiply.vhdl
+++ b/asic/multiply.vhdl
@ -0,0 +1,128 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+
+-- ASIC variant of the multiplier: wraps the multiply_add_64x64 macro and
+-- tracks the valid / is_32bit / not_result flags alongside it.
+-- XXX We should be able to make timing with a 2 cycle multiplier
+entity multiply is
+    generic (
+        -- Number of stages in the flag pipeline that accompanies the macro.
+        PIPELINE_DEPTH : natural := 4
+        );
+    port (
+        clk   : in std_logic;
+
+        -- Operands, addend and control flags for one multiply-add request.
+        m_in  : in MultiplyInputType;
+        -- Result, overflow indication and valid flag.
+        m_out : out MultiplyOutputType
+        );
+end entity multiply;
+
+-- Registers the request, feeds the operands to the multiply_add_64x64 macro
+-- and carries the request's control flags through a shift register so they
+-- can be applied to the macro's result when it emerges.
+architecture behaviour of multiply is
+    -- Registered copy of the incoming request.
+    signal m: MultiplyInputType := MultiplyInputInit;
+
+    -- Control flags that travel alongside each multiplication.
+    type multiply_pipeline_stage is record
+        valid     : std_ulogic;
+        is_32bit  : std_ulogic;
+        not_res   : std_ulogic;
+    end record;
+    constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0',
+                                                                     is_32bit => '0',
+                                                                     not_res => '0');
+
+    type multiply_pipeline_type is array(0 to PIPELINE_DEPTH-1) of multiply_pipeline_stage;
+    constant MultiplyPipelineInit : multiply_pipeline_type := (others => MultiplyPipelineStageInit);
+
+    type reg_type is record
+        multiply_pipeline : multiply_pipeline_type;
+    end record;
+
+    signal r, rin : reg_type := (multiply_pipeline => MultiplyPipelineInit);
+    -- Registered overflow flag and its combinational next value.
+    signal overflow : std_ulogic;
+    signal ovf_in   : std_ulogic;
+
+    -- Raw 128-bit result from the macro.
+    signal mult_out : std_logic_vector(127 downto 0);
+
+    component multiply_add_64x64 port(
+        clk : in std_logic;
+        a   : in std_logic_vector(63 downto 0);
+        b   : in std_logic_vector(63 downto 0);
+        c   : in std_logic_vector(127 downto 0);
+        o   : out std_logic_vector(127 downto 0)
+        );
+    end component;
+begin
+    -- The flag pipeline must line up with the macro's latency. The
+    -- behavioural multiply_add_64x64 model has three register stages after
+    -- the input register in this architecture, so stage PIPELINE_DEPTH-1
+    -- only matches mult_out when PIPELINE_DEPTH is 4.
+    assert (PIPELINE_DEPTH = 4) report "PIPELINE_DEPTH must be 4" severity FAILURE;
+
+    -- Clocked state: input register, flag pipeline and overflow flag.
+    multiply_0: process(clk)
+    begin
+        if rising_edge(clk) then
+            m <= m_in;
+            r <= rin;
+            overflow <= ovf_in;
+        end if;
+    end process;
+
+    multiplier : multiply_add_64x64
+        port map (
+            clk => clk,
+            a => m.data1,
+            b => m.data2,
+            c => m.addend,
+            o => mult_out
+    );
+
+    -- Combinational next-state and output logic.
+    multiply_1: process(all)
+        variable v : reg_type;
+        variable d : std_ulogic_vector(127 downto 0);
+        variable d2 : std_ulogic_vector(63 downto 0);
+        variable ov : std_ulogic;
+    begin
+        -- Stage 0 takes the flags of the registered request; every later
+        -- stage takes the registered contents of the stage before it.
+        v := r;
+        v.multiply_pipeline(0).valid := m.valid;
+        v.multiply_pipeline(0).is_32bit := m.is_32bit;
+        v.multiply_pipeline(0).not_res := m.not_result;
+
+        loop_0: for i in 1 to PIPELINE_DEPTH-1 loop
+            v.multiply_pipeline(i) := r.multiply_pipeline(i-1);
+        end loop;
+
+        -- Optionally invert the result, as requested by not_result.
+        if v.multiply_pipeline(PIPELINE_DEPTH-1).not_res = '1' then
+            d := not mult_out;
+        else
+            d := mult_out;
+        end if;
+
+        -- Overflow when the bits above the result width, together with the
+        -- top result bit, are neither all zeros nor all ones.
+        ov := '0';
+        if v.multiply_pipeline(PIPELINE_DEPTH-1).is_32bit = '1' then
+            ov := (or d(63 downto 31)) and not (and d(63 downto 31));
+        else
+            ov := (or d(127 downto 63)) and not (and d(127 downto 63));
+        end if;
+        ovf_in <= ov;
+
+        -- The result and valid flag are combinational from the last stage;
+        -- the overflow flag is registered and so appears one cycle later.
+        m_out.result <= d;
+        m_out.overflow <= overflow;
+        m_out.valid <= v.multiply_pipeline(PIPELINE_DEPTH-1).valid;
+
+        rin <= v;
+    end process;
+end architecture behaviour;
+
+
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+-- 16 x 16 -> 32 bit signed multiplier. The result is purely combinational;
+-- clk is part of the interface but is not used by this architecture.
+entity short_multiply is
+    port (
+        clk   : in std_ulogic;
+
+        a_in  : in std_ulogic_vector(15 downto 0);
+        b_in  : in std_ulogic_vector(15 downto 0);
+        m_out : out std_ulogic_vector(31 downto 0)
+        );
+end entity short_multiply;
+
+architecture behaviour of short_multiply is
+begin
+    -- Both operands are interpreted as two's complement values.
+    signed_product: process(a_in, b_in)
+        variable product : signed(31 downto 0);
+    begin
+        product := signed(a_in) * signed(b_in);
+        m_out <= std_ulogic_vector(product);
+    end process signed_product;
+end architecture behaviour;
--- a/asic/register_file.vhdl
+++ b/asic/register_file.vhdl
@ -0,0 +1,103 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+
+-- ASIC variant of the register file, built on the Microwatt_FP_DFFRFile
+-- array with write-to-read forwarding.
+entity register_file is
+    generic (
+        -- None of these generics are referenced by the architecture in this
+        -- implementation.
+        SIM        : boolean := false;
+        HAS_FPU    : boolean := true;
+        LOG_LENGTH : natural := 0
+        );
+    port(
+        clk           : in std_logic;
+
+        -- Read register selections in, read data out.
+        d_in          : in Decode2ToRegisterFileType;
+        d_out         : out RegisterFileToDecode2Type;
+
+        -- Write enable, register index and data.
+        w_in          : in WritebackToRegisterFileType;
+
+        -- Debug register access interface.
+        dbg_gpr_req   : in std_ulogic;
+        dbg_gpr_ack   : out std_ulogic;
+        dbg_gpr_addr  : in gspr_index_t;
+        dbg_gpr_data  : out std_ulogic_vector(63 downto 0);
+
+        -- Simulation register dump handshake.
+        sim_dump      : in std_ulogic;
+        sim_dump_done : out std_ulogic;
+
+        log_out       : out std_ulogic_vector(71 downto 0)
+        );
+end entity register_file;
+
+-- Connects the three read ports and the write port to the register array
+-- and forwards data being written in the current cycle to any read port
+-- addressing the same register. The debug, simulation dump and logging
+-- outputs are not implemented here and are tied to inactive values.
+architecture behaviour of register_file is
+    -- Register array with three read ports and one write port.
+    component Microwatt_FP_DFFRFile port (
+        CLK : in std_ulogic;
+
+        R1  : in std_ulogic_vector(6 downto 0);
+        R2  : in std_ulogic_vector(6 downto 0);
+        R3  : in std_ulogic_vector(6 downto 0);
+
+        D1  : out std_ulogic_vector(63 downto 0);
+        D2  : out std_ulogic_vector(63 downto 0);
+        D3  : out std_ulogic_vector(63 downto 0);
+
+        WE  : in std_ulogic;
+        RW  : in std_ulogic_vector(6 downto 0);
+        DW  : in std_ulogic_vector(63 downto 0)
+    );
+    end component;
+
+    -- Raw read data from the array, before forwarding.
+    signal d1: std_ulogic_vector(63 downto 0);
+    signal d2: std_ulogic_vector(63 downto 0);
+    signal d3: std_ulogic_vector(63 downto 0);
+begin
+
+    register_file_0 : Microwatt_FP_DFFRFile
+        port map (
+            CLK => clk,
+
+            R1  => d_in.read1_reg,
+            R2  => d_in.read2_reg,
+            R3  => d_in.read3_reg,
+
+            D1  => d1,
+            D2  => d2,
+            D3  => d3,
+
+            WE  => w_in.write_enable,
+            RW  => w_in.write_reg,
+            DW  => w_in.write_data
+            );
+
+    -- Drive the otherwise unconnected outputs so they are never left at 'U'
+    -- in simulation or floating after synthesis. Debug requests are never
+    -- acknowledged and a simulation dump never completes.
+    dbg_gpr_ack   <= '0';
+    dbg_gpr_data  <= (others => '0');
+    sim_dump_done <= '0';
+    log_out       <= (others => '0');
+
+    -- Simulation check: a write must never carry unknown data or an unknown
+    -- register index.
+    x_state_check: process(clk)
+    begin
+        if rising_edge(clk) then
+            if w_in.write_enable = '1' then
+                assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure;
+            end if;
+        end if;
+    end process x_state_check;
+
+    -- Forward any written data
+    register_read_0: process(all)
+    begin
+        -- Default: present what the array returns.
+        d_out.read1_data <= d1;
+        d_out.read2_data <= d2;
+        d_out.read3_data <= d3;
+
+        -- The array is written on the clock edge, so a read of the register
+        -- being written this cycle would still see the old value; substitute
+        -- the incoming write data instead.
+        if w_in.write_enable = '1' then
+            if d_in.read1_reg = w_in.write_reg then
+                d_out.read1_data <= w_in.write_data;
+            end if;
+            if d_in.read2_reg = w_in.write_reg then
+                d_out.read2_data <= w_in.write_data;
+            end if;
+            if d_in.read3_reg = w_in.write_reg then
+                d_out.read3_data <= w_in.write_data;
+            end if;
+        end if;
+    end process register_read_0;
+
+end architecture behaviour;