library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library work;
use work.wishbone_types.all;

-- Debug bus (DMI) to Wishbone master bridge.
--
-- Presents three 64-bit registers on the DMI side (address, data and
-- control; the fourth offset is reserved). An access to the data register
-- is turned into a single Wishbone cycle on the master port; the address
-- and control registers are read and written directly.
entity wishbone_debug_master is
    -- clk: every register in the design updates on its rising edge.
    -- rst: synchronous reset, only sampled on a rising edge of clk.
    port(clk	: in std_ulogic;
	 rst	: in std_ulogic;

	 -- Debug bus interface
	 --   dmi_addr : register select
	 --   dmi_din  : write data (also forwarded as the Wishbone write data)
	 --   dmi_dout : read data for the register selected by dmi_addr
	 --   dmi_req  : request
	 --   dmi_wr   : write enable, qualifies dmi_req
	 --   dmi_ack  : dmi_req reflected back once the access is complete,
	 --              '0' otherwise
	 dmi_addr	: in std_ulogic_vector(1 downto 0);
	 dmi_din	: in std_ulogic_vector(63 downto 0);
	 dmi_dout	: out std_ulogic_vector(63 downto 0);
	 dmi_req	: in std_ulogic;
	 dmi_wr		: in std_ulogic;
	 dmi_ack	: out std_ulogic;

	 -- Wishbone master interface (record types from work.wishbone_types)
	 wb_out  : out wishbone_master_out;
	 wb_in   : in wishbone_slave_out
	 );
end entity wishbone_debug_master;

architecture behaviour of wishbone_debug_master is

    -- ** Register offsets definitions. All registers are 64-bit
    --
    --    ADDR : address driven on the Wishbone bus
    --    DATA : accessing it triggers a Wishbone cycle; reads return the
    --           data captured from that cycle
    --    CTRL : byte enables and auto-increment control (see below)
    --    RSVD : reserved; reads return 0, writes are acked and ignored
    constant DBG_WB_ADDR	: std_ulogic_vector(1 downto 0) := "00";
    constant DBG_WB_DATA	: std_ulogic_vector(1 downto 0) := "01";
    constant DBG_WB_CTRL	: std_ulogic_vector(1 downto 0) := "10";
    constant DBG_WB_RSVD	: std_ulogic_vector(1 downto 0) := "11";

    -- CTRL register:
    --
    -- bit  0..7 : SEL bits (byte enables)
    -- bit     8 : address auto-increment (applied after each acked WB cycle)
    -- bit 10..9 : auto-increment value:
    --                00 - +1
    --                01 - +2
    --                10 - +4
    --                11 - +8
    --
    -- Bits 63..11 are not implemented and read back as 0.

    -- ** Address and control registers and read data
    --
    --    reg_addr     : ADDR register
    --    reg_ctrl_out : reg_ctrl zero-extended to 64 bits for DMI readback
    --    reg_ctrl     : implemented bits of the CTRL register
    --    data_latch   : read data captured when a WB read cycle is acked
    signal reg_addr		: std_ulogic_vector(63 downto 0);
    signal reg_ctrl_out		: std_ulogic_vector(63 downto 0);
    signal reg_ctrl		: std_ulogic_vector(10 downto 0);
    signal data_latch		: std_ulogic_vector(63 downto 0);
    
    -- Command state machine:
    --    IDLE     : waiting for a DMI request on the DATA register
    --    WB_CYCLE : Wishbone cycle in progress, waiting for the ack
    --    DMI_WAIT : cycle done, waiting for dmi_req to be released
    type state_t is (IDLE, WB_CYCLE, DMI_WAIT);
    signal state : state_t;
    -- One-clock pulse raised when an acked cycle must bump reg_addr
    signal do_inc : std_ulogic;

begin

    -- Readback image of CTRL: the implemented bits of reg_ctrl,
    -- zero-extended to the full width of the DMI data bus.
    reg_ctrl_out <= std_ulogic_vector(resize(unsigned(reg_ctrl),
                                             reg_ctrl_out'length));

    -- DMI read data selection. Any offset that is not ADDR, DATA or CTRL
    -- (i.e. the reserved one) reads back as all zeroes.
    dmi_dout <= reg_addr     when dmi_addr = DBG_WB_ADDR else
                data_latch   when dmi_addr = DBG_WB_DATA else
                reg_ctrl_out when dmi_addr = DBG_WB_CTRL else
                (others => '0');

    -- ADDR and CTRL register update.
    --
    -- Both registers are cleared by the synchronous reset. Otherwise ADDR
    -- is either bumped by the programmed step (when the state machine
    -- pulses do_inc) or loaded from DMI, and CTRL is loaded from DMI.
    reg_write : process(clk)
        subtype autoinc_inc_t is integer range 1 to 8;

        -- Translate the CTRL auto-increment field into the address step.
        function decode_autoinc(step_sel : std_ulogic_vector(1 downto 0))
            return autoinc_inc_t is
        begin
            case step_sel is
                when "00"   => return 1;
                when "01"   => return 2;
                when "10"   => return 4;
                -- "11", and any other (metavalue) encoding
                when others => return 8;
            end case;
        end function decode_autoinc;
    begin
        if rising_edge(clk) then
            if rst then
                reg_addr <= (others => '0');
                reg_ctrl <= (others => '0');
            elsif do_inc = '1' then
                -- Auto-increment has priority over a DMI register write
                reg_addr <= std_ulogic_vector(unsigned(reg_addr) +
                                              decode_autoinc(reg_ctrl(10 downto 9)));
            elsif dmi_req and dmi_wr then
                -- Plain DMI register write; DATA and the reserved offset
                -- have no storage here.
                case dmi_addr is
                    when DBG_WB_ADDR => reg_addr <= dmi_din;
                    when DBG_WB_CTRL => reg_ctrl <= dmi_din(10 downto 0);
                    when others      => null;
                end case;
            end if;
        end if;
    end process;

    -- DMI acknowledge.
    --
    -- Accesses to any register other than DATA complete immediately, so
    -- dmi_req is simply reflected back. Accesses to DATA are commands: they
    -- are held off until the state machine has received the WB ack and
    -- moved to DMI_WAIT.
    --
    -- Note: the ack is never driven to a constant 1, it is always dmi_req
    --       passed back. This fulfills two purposes:
    --
    --        * The ack stays low whenever this slave is not being
    --          requested, so a decoder can simply OR the acks of all DMI
    --          slaves together instead of muxing them.
    --
    --        * The ack drops in the same cycle as req, which saves a clock
    --          cycle. This is safe because the state machine will have
    --          left DMI_WAIT by the following cycle, so the signal cannot
    --          bounce back up.
    --
    dmi_ack <= '0' when (dmi_addr = DBG_WB_DATA and state /= DMI_WAIT) else dmi_req;

	-- Some WB signals are direct wires from registers or DMI:
    --   adr : low-order bits of the ADDR register, as many as wb_out.adr holds
    --   dat : DMI write data, passed straight through
    --   sel : byte enables from CTRL bits 7..0
    --   we  : DMI write strobe, passed straight through
    wb_out.adr <= reg_addr(wb_out.adr'left downto 0);
    wb_out.dat <= dmi_din;
    wb_out.sel <= reg_ctrl(7 downto 0);
    wb_out.we  <= dmi_wr;

    -- cyc is asserted for the whole WB_CYCLE state. stb is a register
    -- driven by the wb_trigger state machine: it rises together with cyc,
    -- but is dropped as soon as stall is seen low, so it can fall before
    -- cyc does. Only one request is issued per cycle (no pipelining yet...)
    wb_out.cyc <= '1' when state = WB_CYCLE else '0';

    -- Read data capture.
    --
    -- The Wishbone slave only presents read data while it acks, whereas
    -- DMI needs it to stay valid until req is dropped, so the data is
    -- registered here on the ack of a read cycle.
    -- (Q: Should this register live in dmi_dtm itself ?)
    --
    latch_reads : process(clk)
    begin
        if rising_edge(clk) then
            -- Writes (dmi_wr = '1') leave the previously captured data alone
            if state = WB_CYCLE and dmi_wr = '0' then
                if wb_in.ack = '1' then
                    data_latch <= wb_in.dat;
                end if;
            end if;
        end if;
    end process;

    -- Command state machine.
    --
    -- Sequences one Wishbone cycle per DMI access to the DATA register:
    -- it owns "state" (from which wb_out.cyc is derived), the registered
    -- wb_out.stb, and the do_inc pulse that triggers address
    -- auto-increment. Reset is synchronous.
    wb_trigger : process(clk)
    begin
	if rising_edge(clk) then
	    if (rst) then
		state <= IDLE;
		wb_out.stb <= '0';
                do_inc <= '0';
	    else
		case state is
		when IDLE =>
		    -- Any request on DATA, read or write, starts a cycle
		    if dmi_req = '1' and dmi_addr = DBG_WB_DATA then
			state <= WB_CYCLE;
			wb_out.stb <= '1';
		    end if;
		when WB_CYCLE =>
		    -- Drop stb as soon as the slave is not stalling, so the
		    -- request is only presented once.
		    if wb_in.stall = '0' then
			wb_out.stb <= '0';
		    end if;
		    if wb_in.ack then
			-- We shouldn't get the ack if we hadn't already cleared
			-- stb above but if this happens, don't leave it dangling.
			--
			wb_out.stb <= '0';
			state <= DMI_WAIT;
			-- Request an address increment if CTRL bit 8 is set
                        do_inc <= reg_ctrl(8);
		    end if;
		when DMI_WAIT =>
		    -- Hold here until the DMI side releases the request
		    if dmi_req = '0' then
			state <= IDLE;
		    end if;
		    -- Cleared on the first clock in this state, which makes
		    -- do_inc a single-cycle pulse.
                    do_inc <= '0';
		end case;
	    end if;
	end if;
    end process;
end architecture behaviour;