/*
 * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/**
 * @file
 * The GPUComputeDriver implements an HSADriver for an HSA AMD GPU
 * agent. Other GPU devices, or other HSA agents, should not derive
 * from this class. Instead device-specific implementations of an
 * HSADriver should be provided for each unique device.
 */

#ifndef __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
#define __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__

#include <cassert>
#include <cstdint>
#include <set>
#include <unordered_map>

#include "base/addr_range_map.hh"
#include "base/types.hh"
#include "enums/GfxVersion.hh"
#include "mem/request.hh"
#include "sim/emul_driver.hh"

namespace gem5
{

struct GPUComputeDriverParams;
class GPUCommandProcessor;
class PortProxy;
class ThreadContext;

/**
 * Emulated driver for the GPU compute device. It overrides the open, mmap
 * and ioctl entry points of EmulatedDriver, keeps a table of the events on
 * which thread contexts may wait, and records the GPUVM VMAs that are used
 * to choose the MTYPE of each memory request.
 */
class GPUComputeDriver final : public EmulatedDriver
{
  public:
    using Params = GPUComputeDriverParams;
    GPUComputeDriver(const Params &p);

    /**
     * Handle an ioctl issued on this driver.
     *
     * @param tc thread context that issued the call.
     * @param req ioctl request code.
     * @param ioc_buf address of the ioctl argument buffer.
     */
    int ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) override;

    /** Handle an open call on this driver. */
    int open(ThreadContext *tc, int mode, int flags) override;

    /** Handle an mmap call whose target file descriptor is this driver. */
    Addr mmap(ThreadContext *tc, Addr start, uint64_t length,
              int prot, int tgt_flags, int tgt_fd, off_t offset) override;

    /**
     * Signal that the event identified by event_id has triggered. As noted
     * on EventTableEntry, the dispatcher (GPU) calls this directly instead
     * of going through a mailbox.
     */
    virtual void signalWakeupEvent(uint32_t event_id);

    /**
     * Put the given thread context to sleep.
     * NOTE(review): milliSecTimeout is presumably the longest time, in
     * milliseconds, the thread may sleep before the timer event wakes it;
     * confirm against the implementation.
     */
    void sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout);

    /**
     * Called by the compute units right before a request is issued to ruby.
     * This uses our VMAs to correctly set the MTYPE on a per-request basis.
     * In real hardware, this is actually done through PTE bits in GPUVM.
     * Since we are running a single VM (x86 PT) system, the MTYPE bits aren't
     * available.  Adding GPUVM specific bits to x86 page tables probably
     * isn't the best way to proceed.  For now we just have the driver set
     * these until we implement a proper dual PT system.
     */
    void setMtype(RequestPtr req);

    /**
     * Size of a doorbell for the configured GFX version (presumably in
     * bytes -- TODO confirm against the callers).
     *
     * @return 4 for gfx801, gfx803 and gfx902; 8 for gfx900. Any other
     *         version is reported through fatal().
     */
    int
    doorbellSize() const
    {
        switch (gfxVersion) {
          case GfxVersion::gfx801:
          case GfxVersion::gfx803:
          case GfxVersion::gfx902:
            return 4;
          case GfxVersion::gfx900:
            // gfx900 supports large BAR, so it has a larger doorbell
            return 8;
          default:
            fatal("Invalid GPU type\n");
        }
        // Only reachable if fatal() returns; kept so that every control
        // path of this function returns a value.
        return 4;
    }

    /**
     * Event bound to one driver and one thread context. EventList uses an
     * instance of it as the timer wakeup event of a sleeping thread.
     */
    class DriverWakeupEvent : public Event
    {
      public:
        DriverWakeupEvent(GPUComputeDriver *gpu_driver,
                          ThreadContext *thrd_cntxt)
          : driver(gpu_driver), tc(thrd_cntxt) {}
        void process() override;
        const char *description() const override;
        void scheduleWakeup(Tick wakeup_delay);
      private:
        GPUComputeDriver *driver;
        ThreadContext *tc;
    };

    /**
     * State tracked for a single event in the event table (ETable).
     */
    class EventTableEntry
    {
      public:
        EventTableEntry() :
            mailBoxPtr(0), tc(nullptr), threadWaiting(false), setEvent(false)
        {}
        // Mail box pointer for this address. Current implementation does not
        // use this mailBoxPtr to notify events but directly calls
        // signalWakeupEvent from dispatcher (GPU) to notify events. So,
        // currently this mailBoxPtr is not used. But a future implementation
        // may communicate to the driver using mailBoxPtr.
        Addr mailBoxPtr;
        // Thread context waiting on this event. We do not support multiple
        // threads waiting on an event currently.
        ThreadContext *tc;
        // threadWaiting = true, if some thread context is waiting on this
        // event. A thread context waiting on this event is put to sleep.
        bool threadWaiting;
        // setEvent = true, if this event is triggered but when this event
        // triggered, no thread context was waiting on it. In the future, some
        // thread context will try to wait on this event but since event has
        // already happened, we will not allow that thread context to go to
        // sleep. The above mentioned scenario can happen when the waiting
        // thread and wakeup thread race on this event and the wakeup thread
        // beat the waiting thread at the driver.
        bool setEvent;
    };
    using ETEntry = EventTableEntry;

    /** GFX version this driver was configured with. */
    GfxVersion getGfxVersion() const { return gfxVersion; }

  private:
    /**
     * GPU that is controlled by this driver.
     */
    GPUCommandProcessor *device;
    uint32_t queueId;
    bool isdGPU;
    GfxVersion gfxVersion;
    int dGPUPoolID;
    Addr eventPage;
    // Event IDs held in an EventList are asserted to be below this value
    // (see EventList::clearEvents).
    uint32_t eventSlotIndex;
    // Event table that keeps track of events. It is indexed with event ID.
    std::unordered_map<uint32_t, ETEntry> ETable;

    /**
     * VMA structures for GPUVM memory.
     */
    AddrRangeMap<Request::CacheCoherenceFlags, 1> gpuVmas;

    /**
     * Mtype bits {Cached, Read Write, Shared} for caches
     */
    enum MtypeFlags
    {
        SHARED                  = 0,
        READ_WRITE              = 1,
        CACHED                  = 2,
        NUM_MTYPE_BITS
    };

    Request::CacheCoherenceFlags defaultMtype;

    // TCEvents map keeps track of the events that can wake up this thread.
    // When multiple events can wake up this thread, this data structure helps
    // to reset all events when one of those events wakes up this thread. The
    // signal events that can wake up this thread are stored in signalEvents
    // whereas the timer wakeup event is stored in timerEvent.
    class EventList
    {
      public:
        EventList() : driver(nullptr), timerEvent(nullptr, nullptr) {}
        EventList(GPUComputeDriver *gpu_driver, ThreadContext *thrd_cntxt)
            : driver(gpu_driver), timerEvent(gpu_driver, thrd_cntxt)
        { }

        /**
         * Detach the thread from every signal event it was waiting on,
         * forget those events, and cancel the timer wakeup if it is still
         * scheduled. Requires a non-null driver, so it must not be called
         * on a default-constructed EventList.
         */
        void
        clearEvents()
        {
            assert(driver);
            for (auto event : signalEvents) {
                assert(event < driver->eventSlotIndex);
                // operator[] default-inserts an entry if the ID is not in
                // the table yet.
                driver->ETable[event].tc = nullptr;
                driver->ETable[event].threadWaiting = false;
            }
            signalEvents.clear();
            if (timerEvent.scheduled()) {
                driver->deschedule(timerEvent);
            }
        }

        GPUComputeDriver *driver;
        DriverWakeupEvent timerEvent;
        // The set of events that can wake up the same thread.
        std::set<uint32_t> signalEvents;
    };
    // Per-thread-context list of the events that can wake that thread up.
    std::unordered_map<ThreadContext *, EventList> TCEvents;

    /**
     * Register a region of host memory as uncacheable from the perspective
     * of the dGPU.
     */
    void registerUncacheableMemory(Addr start, Addr length);

    /**
     * The aperture (APE) base/limit pairs are set
     * statically at startup by the real KFD. AMD
     * x86_64 CPUs only use the areas in the 64b
     * address space where VA[63:47] == 0x1ffff or
     * VA[63:47] = 0. These methods generate the APE
     * base/limit pairs in exactly the same way as
     * the real KFD does, which ensures these APEs do
     * not fall into the CPU's address space
     *
     * see the macros in the KFD driver in the ROCm
     * Linux kernel source:
     *
     * drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
     */
    Addr gpuVmApeBase(int gpuNum) const;
    Addr gpuVmApeLimit(Addr apeBase) const;
    Addr scratchApeBase(int gpuNum) const;
    Addr scratchApeBaseV9() const;
    Addr scratchApeLimit(Addr apeBase) const;
    Addr ldsApeBase(int gpuNum) const;
    Addr ldsApeBaseV9() const;
    Addr ldsApeLimit(Addr apeBase) const;

    /**
     * Allocate/deallocate GPUVM VMAs for tracking virtual address allocations
     * and properties on DGPUs.  For now, we use these to track MTYPE and to
     * be able to select which pages to unmap when the user provides us with
     * a handle during the free ioctl.
     */
    void allocateGpuVma(Request::CacheCoherenceFlags mtype, Addr start,
                        Addr length);
    Addr deallocateGpuVma(Addr start);

    /**
     * Service a queue-allocation request.
     * NOTE(review): presumably reads the ioctl arguments at ioc_buf_addr
     * through mem_proxy; confirm against the implementation.
     */
    void allocateQueue(PortProxy &mem_proxy, Addr ioc_buf_addr);

};

} // namespace gem5

#endif // __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__