Add arrays for ASIC flow

Add VHDL wrappers and Verilog behavioural models for the cache_ram, register_file and main_bram arrays, plus a wrapper and behavioural model for the 64x64 multiply-add unit. Signed-off-by: Anton Blanchard <anton@linux.ibm.com>
5 years ago · 52f2462232
parent d8ba6a78d2
commit 52f2462232
8 changed files with 523 additions and 0 deletions
--- a/asic/behavioural/Microwatt_FP_DFFRFile.v
+++ b/asic/behavioural/Microwatt_FP_DFFRFile.v
@ -0,0 +1,24 @@
 // Behavioural model of the register file array: 96 entries of 64 bits with
 // three combinational read ports (R1/D1, R2/D2, R3/D3) and one clocked
 // write port (RW/DW, qualified by WE).
 module Microwatt_FP_DFFRFile (
 `ifdef USE_POWER_PINS
    inout         VPWR,
    inout         VGND,
 `endif
    input  [6:0]  R1,
    input  [6:0]  R2,
    input  [6:0]  R3,
    input  [6:0]  RW,
    input  [63:0] DW,
    output [63:0] D1,
    output [63:0] D2,
    output [63:0] D3,
    input         CLK,
    input         WE
 );
    // Storage for entries 0..95. The 7-bit addresses can encode values up
    // to 127; indices above 95 fall outside the declared array.
    reg [63:0] registers [0:95];

    // Write port: the addressed entry is updated on the rising edge of CLK
    // whenever WE is asserted.
    always @(posedge CLK)
        if (WE) registers[RW] <= DW;

    // Read ports: continuous, unclocked views of the selected entries.
    assign D3 = registers[R3];
    assign D2 = registers[R2];
    assign D1 = registers[R1];
 endmodule
--- a/asic/behavioural/RAM32_1RW1R.v
+++ b/asic/behavioural/RAM32_1RW1R.v
@ -0,0 +1,40 @@
 // Behavioural model of a two-port RAM with 2**BITS words of 64 bits
 // (32 words with the default BITS=5).
 //   Port 0: read/write. EN0 enables the port, WE0 holds one write enable
 //           per byte lane of Di0, and Do0 is the registered read data.
 //   Port 1: read only. EN1 enables the port and Do1 is the registered
 //           read data.
 module RAM32_1RW1R #(
    parameter BITS=5
 ) (
 `ifdef USE_POWER_PINS
    inout VPWR,
    inout VGND,
 `endif
    input CLK,
    input EN0,
    input [BITS-1:0] A0,
    input [7:0] WE0,
    input [63:0] Di0,
    output reg [63:0] Do0,
    input EN1,
    input [BITS-1:0] A1,
    output reg [63:0] Do1
 );
    reg [63:0] RAM[2**BITS-1:0];
    // Port 0 read. Do0 was previously declared but never driven, leaving it
    // permanently X. It now captures the addressed word whenever the port is
    // enabled. Because of the non-blocking assignments, a simultaneous write
    // to the same address returns the old contents.
    // NOTE(review): read-before-write ordering on port 0 is an assumption
    // about the hard macro -- confirm against its datasheet.
    always @(posedge CLK) begin
        if (EN0)
            Do0 <= RAM[A0];
    end
    // Port 1 read: Do1 holds its previous value while EN1 is low.
    always @(posedge CLK) begin
        if (EN1)
            Do1 <= RAM[A1];
    end
    // Port 0 write: one clocked process per byte lane, so that each bit of
    // WE0 independently gates its own 8-bit slice of the word.
    generate
        genvar i;
        for (i=0; i<8; i=i+1) begin: BYTE
            always @(posedge CLK) begin
                if (EN0) begin
                    if (WE0[i])
                        RAM[A0][i*8+7:i*8] <= Di0[i*8+7:i*8];
                end
            end
        end
    endgenerate
 endmodule
--- a/asic/behavioural/RAM512.v
+++ b/asic/behavioural/RAM512.v
@ -0,0 +1,42 @@
 // Behavioural model of a single-port RAM with 2**BITS words of 64 bits
 // (512 words with the default BITS=9). The array is preloaded from the hex
 // file named by FILENAME at the start of simulation.
 //   EN0  enables the port for the current clock edge.
 //   WE0  holds one write enable per byte lane of Di0.
 //   Do0  is registered: the addressed word when EN0 is high, zero otherwise.
 module RAM512 #(
    parameter BITS=9,
    parameter FILENAME="firmware.hex"
 ) (
 `ifdef USE_POWER_PINS
    inout VPWR,
    inout VGND,
 `endif
    input CLK,
    input [7:0] WE0,
    input EN0,
    input [63:0] Di0,
    output reg [63:0] Do0,
    input [BITS-1:0] A0
 );
    reg [63:0] RAM[2**BITS-1:0];
    integer lane;

    // Load the initial memory image.
    initial $readmemh(FILENAME, RAM);

    // Single clocked process handling both the read and the byte-lane
    // writes. All assignments are non-blocking, so a read that coincides
    // with a write to the same address returns the previous contents.
    always @(posedge CLK) begin
        if (EN0) begin
            Do0 <= RAM[A0];
            for (lane = 0; lane < 8; lane = lane + 1) begin
                if (WE0[lane])
                    RAM[A0][lane*8 +: 8] <= Di0[lane*8 +: 8];
            end
        end else begin
            Do0 <= 64'b0;
        end
    end
 endmodule
--- a/asic/behavioural/multiply_add_64x64.v
+++ b/asic/behavioural/multiply_add_64x64.v
@ -0,0 +1,24 @@
 // Behavioural model of a pipelined multiply-add unit.
 //   o = (a * b) + c, evaluated at 2*BITS width, with the result appearing
 //   on `o` three rising clock edges after the operands are presented.
 module multiply_add_64x64
 #(
    parameter BITS=64
 ) (
 `ifdef USE_POWER_PINS
    inout VPWR,
    inout VGND,
 `endif
    input clk,
    input [BITS-1:0] a,
    input [BITS-1:0] b,
    input [BITS*2-1:0] c,
    output [BITS*2-1:0] o
 );
    // Three pipeline registers; index 0 is the newest result.
    reg [BITS*2-1:0] o_tmp[2:0];
    // Non-blocking assignments make every stage sample its predecessor's
    // pre-edge value regardless of statement order. They also stop `o` from
    // changing in the active region of the clock edge, where it could race
    // with other clocked logic sampling it on the same edge.
    always @(posedge clk) begin
        o_tmp[0] <= (a * b) + c;
        o_tmp[1] <= o_tmp[0];
        o_tmp[2] <= o_tmp[1];
    end
    assign o = o_tmp[2];
 endmodule
--- a/asic/cache_ram.vhdl
+++ b/asic/cache_ram.vhdl
@ -0,0 +1,99 @@
 library ieee;
 use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;
 use ieee.math_real.all;
 -- ASIC variant of cache_ram: wraps one RAM32_1RW1R macro, using its port 0
 -- for writes and its port 1 for reads.
 --   ROW_BITS / WIDTH  must be 5 and 64 (checked by assertion below).
 --   TRACE             must be false (tracing is not implemented here).
 --   ADD_BUF           when true, adds one extra register on rd_data.
 entity cache_ram is
    generic(
        ROW_BITS : integer := 5;
        WIDTH    : integer := 64;
        TRACE    : boolean := false;
        ADD_BUF  : boolean := false
        );
    port(
        clk     : in std_logic;
        rd_en   : in std_logic;
        rd_addr : in std_logic_vector(ROW_BITS - 1 downto 0);
        rd_data : out std_logic_vector(WIDTH - 1 downto 0);
        wr_sel  : in std_logic_vector(WIDTH/8 - 1 downto 0);
        wr_addr : in std_logic_vector(ROW_BITS - 1 downto 0);
        wr_data : in std_logic_vector(WIDTH - 1 downto 0)
        );
 end cache_ram;
 architecture rtl of cache_ram is
    component RAM32_1RW1R port(
        CLK     : in std_logic;
        EN0     : in std_logic;
        A0      : in std_logic_vector(4 downto 0);
        WE0     : in std_logic_vector(7 downto 0);
        Di0     : in std_logic_vector(63 downto 0);
        Do0     : out std_logic_vector(63 downto 0);
        EN1     : in std_logic;
        A1      : in std_logic_vector(4 downto 0);
        Do1     : out std_logic_vector(63 downto 0)
        );
    end component;
    -- High when at least one byte lane is being written.
    signal wr_active : std_logic;
    -- Read data straight from the macro's port 1.
    signal ram_q     : std_logic_vector(WIDTH - 1 downto 0);
    -- Copy of the most recent read data, held while no read is in flight.
    signal held_q    : std_logic_vector(WIDTH - 1 downto 0);
    -- Read data after selecting between the live and held values.
    signal rd_mux    : std_logic_vector(WIDTH - 1 downto 0);
    -- rd_en delayed by one clock.
    signal rd_en_q   : std_ulogic;
 begin
    -- The macro has a fixed geometry, so reject any other configuration.
    assert (ROW_BITS = 5)  report "ROW_BITS must be 5" severity FAILURE;
    assert (WIDTH = 64)    report "Must be 64 bit" severity FAILURE;
    assert (TRACE = false) report "Trace not supported" severity FAILURE;

    -- Port 0 is only enabled when some byte lane is selected for writing.
    wr_active <= or wr_sel;

    cache_ram_0 : RAM32_1RW1R
        port map (
            CLK     => clk,
            EN0     => wr_active,
            A0      => wr_addr,
            WE0     => wr_sel,
            Di0     => wr_data,
            Do0     => open,
            EN1     => rd_en,
            A1      => rd_addr,
            Do1     => ram_q
            );

    -- The caches rely on cache_ram holding the last value read. In the
    -- cycle after a read was enabled the macro output is passed through
    -- and also captured; in all other cycles the captured copy is
    -- presented instead.
    hold_last_read: process(clk)
    begin
        if rising_edge(clk) then
            if rd_en_q = '1' then
                held_q <= ram_q;
            end if;
            rd_en_q <= rd_en;
        end if;
    end process hold_last_read;

    with rd_en_q select
        rd_mux <= ram_q  when '1',
                  held_q when others;

    -- Output stage: direct connection, or one extra register when ADD_BUF
    -- is set.
    nobuf: if not ADD_BUF generate
    begin
        rd_data <= rd_mux;
    end generate;
    buf: if ADD_BUF generate
    begin
        process(clk)
        begin
            if rising_edge(clk) then
                rd_data <= rd_mux;
            end if;
        end process;
    end generate;
 end architecture rtl;
--- a/asic/main_bram.vhdl
+++ b/asic/main_bram.vhdl
@ -0,0 +1,63 @@
 library ieee;
 use ieee.std_logic_1164.all;
 library work;
 -- ASIC variant of main_bram: wraps a single RAM512 macro (512 words of
 -- 64 bits, i.e. 4096 bytes) and adds one output register.
 --   WIDTH        must be 64.
 --   HEIGHT_BITS  must be 9 (the macro has a 9-bit address).
 --   MEMORY_SIZE  must be 4096.
 -- NOTE(review): RAM_INIT_FILE is not forwarded to the macro in this
 -- wrapper -- confirm that is intended for the ASIC flow.
 entity main_bram is
    generic(
        WIDTH        : natural := 64;
        HEIGHT_BITS  : natural;
        MEMORY_SIZE  : natural;
        RAM_INIT_FILE : string
        );
    port(
        clk  : in std_logic;
        addr : in std_logic_vector(HEIGHT_BITS - 1 downto 0) ;
        din  : in std_logic_vector(WIDTH-1 downto 0);
        dout : out std_logic_vector(WIDTH-1 downto 0);
        sel  : in std_logic_vector((WIDTH/8)-1 downto 0);
        re   : in std_ulogic;
        we   : in std_ulogic
        );
 end entity main_bram;
 architecture behaviour of main_bram is
    component RAM512 port (
        CLK : in std_ulogic;
        WE0 : in std_ulogic_vector(7 downto 0);
        EN0 : in std_ulogic;
        Di0 : in std_ulogic_vector(63 downto 0);
        Do0 : out std_ulogic_vector(63 downto 0);
        A0  : in std_ulogic_vector(8 downto 0)
    );
    end component;
    -- Byte selects qualified by the write strobe.
    signal sel_qual: std_ulogic_vector((WIDTH/8)-1 downto 0);
    -- Read data from the macro, before the output register.
    signal obuf : std_logic_vector(WIDTH-1 downto 0);
 begin
    assert (WIDTH = 64)         report "Must be 64 bit" severity FAILURE;
    -- The macro is addressed with addr(8 downto 0): 512 words of 8 bytes,
    -- which matches the 4096-byte MEMORY_SIZE checked below. The message
    -- previously said 10 although the condition checks for 9.
    assert (HEIGHT_BITS = 9)    report "HEIGHT_BITS must be 9" severity FAILURE;
    assert (MEMORY_SIZE = 4096) report "MEMORY_SIZE must be 4096" severity FAILURE;
    -- Only pass byte write enables through during a write; on a read every
    -- lane is masked so the array is left untouched.
    sel_qual <= sel when we = '1' else (others => '0');
    memory_0 : RAM512
        port map (
            CLK  => clk,
            WE0  => sel_qual(7 downto 0),
            EN0  => re or we,
            Di0  => din(63 downto 0),
            Do0  => obuf(63 downto 0),
            A0   => addr(8 downto 0)
            );
    -- The wishbone BRAM wrapper assumes a 1 cycle delay
    memory_read_buffer: process(clk)
    begin
        if rising_edge(clk) then
            dout <= obuf;
        end if;
    end process;
 end architecture behaviour;
--- a/asic/multiply.vhdl
+++ b/asic/multiply.vhdl
@ -0,0 +1,128 @@
 library ieee;
 use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;
 library work;
 use work.common.all;
 -- XXX We should be able to make timing with a 2 cycle multiplier
 -- ASIC variant of the multiplier: registers the request, hands the
 -- operands to a multiply_add_64x64 macro, and runs a side pipeline carrying
 -- the valid / is_32bit / not_result flags so they line up with the macro's
 -- output.
 --   PIPELINE_DEPTH  must be 4. The flag pipeline delays the registered
 --                   request by PIPELINE_DEPTH-1 clocks, and that has to
 --                   equal the three register stages of the behavioural
 --                   multiply_add_64x64 model.
 entity multiply is
    generic (
        PIPELINE_DEPTH : natural := 4
        );
    port (
        clk   : in std_logic;
        m_in  : in MultiplyInputType;
        m_out : out MultiplyOutputType
        );
 end entity multiply;
 architecture behaviour of multiply is
    -- Registered copy of the incoming request.
    signal m: MultiplyInputType := MultiplyInputInit;
    -- Per-stage bookkeeping that travels alongside the data in the macro.
    type multiply_pipeline_stage is record
        valid     : std_ulogic;
        is_32bit  : std_ulogic;
        not_res   : std_ulogic;
    end record;
    constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0',
                                                                     is_32bit => '0',
                                                                     not_res => '0');
    type multiply_pipeline_type is array(0 to PIPELINE_DEPTH-1) of multiply_pipeline_stage;
    constant MultiplyPipelineInit : multiply_pipeline_type := (others => MultiplyPipelineStageInit);
    type reg_type is record
        multiply_pipeline : multiply_pipeline_type;
    end record;
    signal r, rin : reg_type := (multiply_pipeline => MultiplyPipelineInit);
    -- Overflow flag: ovf_in is the combinational value, overflow is its
    -- registered copy that drives m_out.overflow.
    signal overflow : std_ulogic;
    signal ovf_in   : std_ulogic;
    signal mult_out : std_logic_vector(127 downto 0);
    component multiply_add_64x64 port(
        clk : in std_logic;
        a   : in std_logic_vector(63 downto 0);
        b   : in std_logic_vector(63 downto 0);
        c   : in std_logic_vector(127 downto 0);
        o   : out std_logic_vector(127 downto 0)
        );
    end component;
 begin
    -- Any other depth would present m_out.valid on a different cycle from
    -- the matching product, silently returning wrong results.
    assert (PIPELINE_DEPTH = 4) report "PIPELINE_DEPTH must be 4" severity FAILURE;
    -- State registers.
    multiply_0: process(clk)
    begin
        if rising_edge(clk) then
            m <= m_in;
            r <= rin;
            overflow <= ovf_in;
        end if;
    end process;
    multiplier : multiply_add_64x64
        port map (
            clk => clk,
            a => m.data1,
            b => m.data2,
            c => m.addend,
            o => mult_out
    );
    -- Combinational next-state and output logic.
    multiply_1: process(all)
        variable v : reg_type;
        variable d : std_ulogic_vector(127 downto 0);
        variable ov : std_ulogic;
    begin
        v := r;
        -- Stage 0 takes the flags of the registered request; every later
        -- stage takes the previous stage's registered flags.
        v.multiply_pipeline(0).valid := m.valid;
        v.multiply_pipeline(0).is_32bit := m.is_32bit;
        v.multiply_pipeline(0).not_res := m.not_result;
        loop_0: for i in 1 to PIPELINE_DEPTH-1 loop
            v.multiply_pipeline(i) := r.multiply_pipeline(i-1);
        end loop;
        -- Optionally invert the macro output, as requested by not_result.
        if v.multiply_pipeline(PIPELINE_DEPTH-1).not_res = '1' then
            d := not mult_out;
        else
            d := mult_out;
        end if;
        -- Overflow is flagged unless the examined upper bits are all zeros
        -- or all ones: bits 63..31 for a 32-bit operation, bits 127..63
        -- otherwise.
        if v.multiply_pipeline(PIPELINE_DEPTH-1).is_32bit = '1' then
            ov := (or d(63 downto 31)) and not (and d(63 downto 31));
        else
            ov := (or d(127 downto 63)) and not (and d(127 downto 63));
        end if;
        ovf_in <= ov;
        m_out.result <= d;
        -- m_out.overflow comes from the registered flag, so it reflects the
        -- value computed on the previous clock.
        m_out.overflow <= overflow;
        m_out.valid <= v.multiply_pipeline(PIPELINE_DEPTH-1).valid;
        rin <= v;
    end process;
 end architecture behaviour;
 library ieee;
 use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;
 -- Combinational 16 x 16 signed multiplier producing a 32-bit product.
 -- The clk port is part of the interface but is not used by this
 -- architecture.
 entity short_multiply is
    port (
        clk   : in std_ulogic;
        a_in  : in std_ulogic_vector(15 downto 0);
        b_in  : in std_ulogic_vector(15 downto 0);
        m_out : out std_ulogic_vector(31 downto 0)
        );
 end entity short_multiply;
 architecture behaviour of short_multiply is
 begin
    -- Both operands are interpreted as two's-complement values.
    product: process(all)
        variable prod : signed(31 downto 0);
    begin
        prod := signed(a_in) * signed(b_in);
        m_out <= std_ulogic_vector(prod);
    end process product;
 end architecture behaviour;
--- a/asic/register_file.vhdl
+++ b/asic/register_file.vhdl
@ -0,0 +1,103 @@
 library ieee;
 use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;
 library work;
 use work.common.all;
 -- ASIC variant of the register file: wraps the Microwatt_FP_DFFRFile macro
 -- (three read ports, one write port) and forwards data being written in
 -- the current cycle to any read port addressing the same register.
 -- The SIM, HAS_FPU and LOG_LENGTH generics are accepted but not used by
 -- this architecture.
 entity register_file is
    generic (
        SIM        : boolean := false;
        HAS_FPU    : boolean := true;
        LOG_LENGTH : natural := 0
        );
    port(
        clk           : in std_logic;
        d_in          : in Decode2ToRegisterFileType;
        d_out         : out RegisterFileToDecode2Type;
        w_in          : in WritebackToRegisterFileType;
        dbg_gpr_req   : in std_ulogic;
        dbg_gpr_ack   : out std_ulogic;
        dbg_gpr_addr  : in gspr_index_t;
        dbg_gpr_data  : out std_ulogic_vector(63 downto 0);
        sim_dump      : in std_ulogic;
        sim_dump_done : out std_ulogic;
        log_out       : out std_ulogic_vector(71 downto 0)
        );
 end entity register_file;
 architecture behaviour of register_file is
    component Microwatt_FP_DFFRFile port (
        CLK : in std_ulogic;
        R1  : in std_ulogic_vector(6 downto 0);
        R2  : in std_ulogic_vector(6 downto 0);
        R3  : in std_ulogic_vector(6 downto 0);
        D1  : out std_ulogic_vector(63 downto 0);
        D2  : out std_ulogic_vector(63 downto 0);
        D3  : out std_ulogic_vector(63 downto 0);
        WE  : in std_ulogic;
        RW  : in std_ulogic_vector(6 downto 0);
        DW  : in std_ulogic_vector(63 downto 0)
    );
    end component;
    -- Raw read data from the macro, before write forwarding.
    signal d1: std_ulogic_vector(63 downto 0);
    signal d2: std_ulogic_vector(63 downto 0);
    signal d3: std_ulogic_vector(63 downto 0);
 begin
    register_file_0 : Microwatt_FP_DFFRFile
        port map (
            CLK => clk,
            R1  => d_in.read1_reg,
            R2  => d_in.read2_reg,
            R3  => d_in.read3_reg,
            D1  => d1,
            D2  => d2,
            D3  => d3,
            WE  => w_in.write_enable,
            RW  => w_in.write_reg,
            DW  => w_in.write_data
            );
    -- The debug read port, the simulation dump handshake and the log bus
    -- have no implementation in this wrapper. They were previously left
    -- undriven; tie them off so the outputs have defined values.
    -- NOTE(review): a debug request is acknowledged immediately and returns
    -- zero data, so that a requester waiting on the ack does not wait
    -- forever -- confirm this is the desired behaviour for the ASIC flow.
    dbg_gpr_ack   <= dbg_gpr_req;
    dbg_gpr_data  <= (others => '0');
    sim_dump_done <= '0';
    log_out       <= (others => '0');
    -- Simulation check: fail if a write is attempted with metavalues in
    -- either the data or the destination register index.
    x_state_check: process(clk)
    begin
        if rising_edge(clk) then
            if w_in.write_enable = '1' then
                assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure;
            end if;
        end if;
    end process x_state_check;
    -- Forward any written data
    register_read_0: process(all)
    begin
        -- Default: data as read from the array.
        d_out.read1_data <= d1;
        d_out.read2_data <= d2;
        d_out.read3_data <= d3;
        -- Override any port whose address matches the register being
        -- written this cycle, so the reader sees the new value.
        if w_in.write_enable = '1' then
            if d_in.read1_reg = w_in.write_reg then
                d_out.read1_data <= w_in.write_data;
            end if;
            if d_in.read2_reg = w_in.write_reg then
                d_out.read2_data <= w_in.write_data;
            end if;
            if d_in.read3_reg = w_in.write_reg then
                d_out.read3_data <= w_in.write_data;
            end if;
        end if;
    end process register_read_0;
 end architecture behaviour;