/*
 * Copyright (c) 2014-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __FETCH_UNIT_HH__
#define __FETCH_UNIT_HH__

#include <cassert>
#include <cstdint>
#include <deque>
#include <map>
#include <utility>
#include <vector>

#include "arch/gpu_decoder.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/scheduler.hh"
#include "mem/packet.hh"
#include "sim/eventq.hh"

namespace gem5
{

// Forward declarations: this header only refers to these types through
// pointers and references, so their full definitions are not needed here.
class ComputeUnit;
class Wavefront;

class FetchUnit
{
  public:
    FetchUnit(const ComputeUnitParams &p, ComputeUnit &cu);
    ~FetchUnit();
    void init();
    void exec();
    void bindWaveList(std::vector<Wavefront*> *list);
    void initiateFetch(Wavefront *wavefront);
    void fetch(PacketPtr pkt, Wavefront *wavefront);
    void processFetchReturn(PacketPtr pkt);
    void flushBuf(int wfSlotId);
    static uint32_t globalFetchUnitID;

  private:
    /**
     * fetch buffer descriptor. holds buffered
     * instruction data in the fetch unit.
     */
    class FetchBufDesc
    {
      public:
        FetchBufDesc() : bufStart(nullptr), bufEnd(nullptr),
            readPtr(nullptr), fetchDepth(0), maxIbSize(0), maxFbSize(0),
            cacheLineSize(0), restartFromBranch(false), wavefront(nullptr),
            _decoder(nullptr)
        {
        }

        ~FetchBufDesc()
        {
            delete[] bufStart;
        }

        /**
         * allocate the fetch buffer space, and set the fetch depth
         * (number of lines that may be buffered), fetch size
         * (cache line size), and parent WF for this fetch buffer.
         */
        void allocateBuf(int fetch_depth, int cache_line_size, Wavefront *wf);

        int
        bufferedAndReservedLines() const
        {
            return bufferedLines() + reservedLines();
        }

        int bufferedLines() const { return bufferedPCs.size(); }
        int bufferedBytes() const { return bufferedLines() * cacheLineSize; }
        int reservedLines() const { return reservedPCs.size(); }
        bool hasFreeSpace() const { return !freeList.empty(); }
        void flushBuf();
        Addr nextFetchAddr();

        /**
         * reserve an entry in the fetch buffer for PC = vaddr,
         */
        void reserveBuf(Addr vaddr);

        /**
         * return a pointer to the raw fetch buffer data.
         * this allows the fetch pkt to use this data directly
         * to avoid unnecessary memcpy and malloc/new.
         */
        uint8_t*
        reservedBuf(Addr vaddr) const
        {
            auto reserved_pc = reservedPCs.find(vaddr);
            assert(reserved_pc != reservedPCs.end());
            assert(reserved_pc == reservedPCs.begin());

            return reserved_pc->second;
        }

        /**
         * returns true if there is an entry reserved for this address,
         * and false otherwise
         */
        bool
        isReserved(Addr vaddr) const
        {
            auto reserved_pc = reservedPCs.find(vaddr);
            bool is_reserved = (reserved_pc != reservedPCs.end());
            return is_reserved;
        }

        void fetchDone(Addr vaddr);

        /**
         * checks if the buffer contains valid data. this essentially
         * tells fetch when there is data remaining that needs to be
         * decoded into the WF's IB.
         */
        bool hasFetchDataToProcess() const;

        /**
         * each time the fetch stage is ticked, we check if there
         * are any data in the fetch buffer that may be decoded and
         * sent to the IB. because we are modeling the fetch buffer
         * as a circular buffer, it is possible that an instruction
         * can straddle the end/beginning of the fetch buffer, so
         * decodeSplitInsts() handles that case.
         */
        void decodeInsts();

        /**
         * checks if the wavefront can release any of its fetch
         * buffer entries. this will occur when the WF's PC goes
         * beyond any of the currently buffered cache lines.
         */
        void checkWaveReleaseBuf();

        void
        decoder(TheGpuISA::Decoder *dec)
        {
            _decoder = dec;
        }

        bool
        pcBuffered(Addr pc) const
        {
            bool buffered = bufferedPCs.find(pc) != bufferedPCs.end()
                            && reservedPCs.find(pc) != reservedPCs.end();

            return buffered;
        }

        /**
         * calculates the number of fetched bytes that have yet
         * to be decoded.
         */
        int fetchBytesRemaining() const;

      private:
        void decodeSplitInst();

        /**
         * check if the next instruction to be processed out of
         * the fetch buffer is split across the end/beginning of
         * the fetch buffer.
         */
        bool splitDecode() const;

        /**
         * the set of PCs (fetch addresses) that are currently
         * buffered. bufferedPCs are valid, reservedPCs are
         * waiting for their buffers to be filled with valid
         * fetch data.
         */
        std::map<Addr, uint8_t*> bufferedPCs;
        std::map<Addr, uint8_t*> reservedPCs;

        /**
         * represents the fetch buffer free list. holds buffer space
         * that is currently free. each pointer in this array must
         * have enough space to hold a cache line. in reality we
         * have one actual fetch buffer: 'bufStart', these pointers
         * point to addresses within bufStart that are aligned to the
         * cache line size.
         */
        std::deque<uint8_t*> freeList;

        /**
         * raw instruction buffer. holds cache line data associated with
         * the set of PCs (fetch addresses) that are buffered here.
         */
        uint8_t *bufStart;
        uint8_t *bufEnd;
        /**
         * pointer that points to the next chunk of inst data to be
         * decoded.
         */
        uint8_t *readPtr;
        // how many lines the fetch unit may buffer
        int fetchDepth;
        // maximum size (in number of insts) of the WF's IB
        int maxIbSize;
        // maximum size (in bytes) of this fetch buffer
        int maxFbSize;
        int cacheLineSize;
        int cacheLineBits;
        bool restartFromBranch;
        // wavefront whose IB is serviced by this fetch buffer
        Wavefront *wavefront;
        TheGpuISA::Decoder *_decoder;
    };

    class SystemHubEvent : public Event
    {
      FetchUnit *fetchUnit;
      PacketPtr reqPkt;

      public:
        SystemHubEvent(PacketPtr pkt, FetchUnit *fetch_unit)
            : fetchUnit(fetch_unit), reqPkt(pkt)
        {
            setFlags(Event::AutoDelete);
        }

        void process();
    };

    bool timingSim;
    ComputeUnit &computeUnit;
    TheGpuISA::Decoder decoder;

    // Fetch scheduler; Selects one wave from
    // the fetch queue for instruction fetching.
    // The selection is made according to
    // a scheduling policy
    Scheduler fetchScheduler;

    // Stores the list of waves that are
    // ready to be fetched this cycle
    std::vector<Wavefront*> fetchQueue;

    // Stores the fetch status of all waves dispatched to this SIMD.
    // TRUE implies the wave is ready to fetch and is already
    // moved to fetchQueue
    std::vector<std::pair<Wavefront*, bool>> fetchStatusQueue;

    // Pointer to list of waves dispatched on to this SIMD unit
    std::vector<Wavefront*> *waveList;
    // holds the fetch buffers. each wave has 1 entry.
    std::vector<FetchBufDesc> fetchBuf;
    /**
     * number of cache lines we can fetch and buffer.
     * this includes the currently fetched line (i.e., the
     * line that corresponds to the WF's current PC), as
     * well as any lines that may be prefetched.
     */
    int fetchDepth;
};

} // namespace gem5

#endif // __FETCH_UNIT_HH__