This fixes occasional readBlob fatals caused by the functional read of system memory, seen often with the KVM CPU. Change-Id: Ifccee666f62faa5b2fcf0a64a9d77c8cf95b3add
305 lines
12 KiB
C++
305 lines
12 KiB
C++
/*
|
|
* Copyright (c) 2018 Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
* contributors may be used to endorse or promote products derived from this
|
|
* software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/**
|
|
* @file
|
|
* The GPUCommandProcessor (CP) is responsible for accepting commands, in
|
|
* the form of HSA AQL packets, from the HSA packet processor (HSAPP). The CP
|
|
* works with several components, including the HSAPP and the dispatcher.
|
|
* When the HSAPP sends a ready task to the CP, it will perform the necessary
|
|
* operations to extract relevant data structures from memory, such as the
|
|
* AQL queue descriptor and AQL packet, and initializes register state for the
|
|
* task's wavefronts.
|
|
*/
|
|
|
|
#ifndef __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__
|
|
#define __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__
|
|
|
|
#include <cstdint>
|
|
#include <functional>
|
|
|
|
#include "arch/amdgpu/vega/gpu_registers.hh"
|
|
#include "base/logging.hh"
|
|
#include "base/trace.hh"
|
|
#include "base/types.hh"
|
|
#include "debug/GPUCommandProc.hh"
|
|
#include "dev/dma_virt_device.hh"
|
|
#include "dev/hsa/hsa_packet_processor.hh"
|
|
#include "dev/hsa/hsa_signal.hh"
|
|
#include "gpu-compute/dispatcher.hh"
|
|
#include "gpu-compute/gpu_compute_driver.hh"
|
|
#include "gpu-compute/hsa_queue_entry.hh"
|
|
#include "params/GPUCommandProcessor.hh"
|
|
#include "sim/full_system.hh"
|
|
|
|
namespace gem5
|
|
{
|
|
|
|
struct GPUCommandProcessorParams;
|
|
class GPUComputeDriver;
|
|
class GPUDispatcher;
|
|
class Shader;
|
|
|
|
class GPUCommandProcessor : public DmaVirtDevice
|
|
{
|
|
public:
|
|
typedef GPUCommandProcessorParams Params;
|
|
typedef std::function<void(const uint64_t &)> HsaSignalCallbackFunction;
|
|
|
|
GPUCommandProcessor() = delete;
|
|
GPUCommandProcessor(const Params &p);
|
|
|
|
HSAPacketProcessor& hsaPacketProc();
|
|
RequestorID vramRequestorId();
|
|
|
|
void setGPUDevice(AMDGPUDevice *gpu_device);
|
|
void setShader(Shader *shader);
|
|
Shader* shader();
|
|
GPUComputeDriver* driver();
|
|
|
|
enum AgentCmd
|
|
{
|
|
Nop = 0,
|
|
Steal = 1
|
|
};
|
|
|
|
void submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
|
|
Addr host_pkt_addr);
|
|
void submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
|
|
Addr host_pkt_addr);
|
|
void submitVendorPkt(void *raw_pkt, uint32_t queue_id,
|
|
Addr host_pkt_addr);
|
|
void attachDriver(GPUComputeDriver *driver);
|
|
|
|
void dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
|
|
uint32_t queue_id, Addr host_pkt_addr);
|
|
void dispatchPkt(HSAQueueEntry *task);
|
|
void signalWakeupEvent(uint32_t event_id);
|
|
|
|
Tick write(PacketPtr pkt) override { return 0; }
|
|
Tick read(PacketPtr pkt) override { return 0; }
|
|
AddrRangeList getAddrRanges() const override;
|
|
System *system();
|
|
|
|
void sendCompletionSignal(Addr signal_handle);
|
|
void updateHsaSignal(Addr signal_handle, uint64_t signal_value,
|
|
HsaSignalCallbackFunction function =
|
|
[] (const uint64_t &) { });
|
|
void updateHsaSignalAsync(Addr signal_handle, int64_t diff);
|
|
void updateHsaSignalData(Addr value_addr, int64_t diff,
|
|
uint64_t *prev_value);
|
|
void updateHsaSignalDone(uint64_t *signal_value);
|
|
void updateHsaMailboxData(Addr signal_handle, uint64_t *mailbox_value);
|
|
void updateHsaEventData(Addr signal_handle, uint64_t *event_value);
|
|
void updateHsaEventTs(Addr signal_handle, amd_event_t *event_value);
|
|
|
|
uint64_t functionalReadHsaSignal(Addr signal_handle);
|
|
|
|
Addr getHsaSignalValueAddr(Addr signal_handle)
|
|
{
|
|
return signal_handle + offsetof(amd_signal_t, value);
|
|
}
|
|
|
|
Addr getHsaSignalMailboxAddr(Addr signal_handle)
|
|
{
|
|
return signal_handle + offsetof(amd_signal_t, event_mailbox_ptr);
|
|
}
|
|
|
|
Addr getHsaSignalEventAddr(Addr signal_handle)
|
|
{
|
|
return signal_handle + offsetof(amd_signal_t, event_id);
|
|
}
|
|
|
|
private:
|
|
Shader *_shader;
|
|
GPUDispatcher &dispatcher;
|
|
GPUComputeDriver *_driver;
|
|
AMDGPUDevice *gpuDevice;
|
|
VegaISA::Walker *walker;
|
|
|
|
// Typedefing dmaRead and dmaWrite function pointer
|
|
typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
|
|
void initABI(HSAQueueEntry *task);
|
|
HSAPacketProcessor *hsaPP;
|
|
TranslationGenPtr translate(Addr vaddr, Addr size) override;
|
|
|
|
// Running counter of dispatched tasks
|
|
int dynamic_task_id = 0;
|
|
|
|
// Keep track of start times for task dispatches.
|
|
std::unordered_map<Addr, Tick> dispatchStartTime;
|
|
|
|
/**
|
|
* Perform a DMA read of the read_dispatch_id_field_base_byte_offset
|
|
* field, which follows directly after the read_dispatch_id (the read
|
|
* pointer) in the amd_hsa_queue_t struct (aka memory queue descriptor
|
|
* (MQD)), to find the base address of the MQD. The MQD is the runtime's
|
|
* soft representation of a HW queue descriptor (HQD).
|
|
*
|
|
* Any fields below the read dispatch ID in the amd_hsa_queue_t should
|
|
* not change according to the HSA standard, therefore we should be able
|
|
* to get them based on their known relative position to the read dispatch
|
|
* ID.
|
|
*/
|
|
void
|
|
ReadDispIdOffsetDmaEvent(HSAQueueEntry *task,
|
|
const uint32_t &readDispIdOffset)
|
|
{
|
|
/**
|
|
* Now that the read pointer's offset from the base of
|
|
* the MQD is known, we can use that to calculate the
|
|
* the address of the MQD itself, the dispatcher will
|
|
* DMA that into the HSAQueueEntry when a kernel is
|
|
* launched.
|
|
*/
|
|
task->hostAMDQueueAddr = hsaPP->getQueueDesc(
|
|
task->queueId())->hostReadIndexPtr - readDispIdOffset;
|
|
|
|
/**
|
|
* DMA a copy of the MQD into the task. some fields of
|
|
* the MQD will be used to initialize register state in VI
|
|
*/
|
|
auto *mqdDmaEvent = new DmaVirtCallback<int>(
|
|
[ = ] (const int &) { MQDDmaEvent(task); });
|
|
|
|
dmaReadVirt(task->hostAMDQueueAddr,
|
|
sizeof(_amd_queue_t), mqdDmaEvent, &task->amdQueue);
|
|
}
|
|
|
|
/**
|
|
* Perform a DMA read of the MQD that corresponds to a hardware
|
|
* queue descriptor (HQD). We store a copy of the MQD in the
|
|
* HSAQueueEntry object so we can send a copy of it along with
|
|
* a dispatch packet, which is needed to initialize register
|
|
* state.
|
|
*/
|
|
void
|
|
MQDDmaEvent(HSAQueueEntry *task)
|
|
{
|
|
/**
|
|
* dGPUs on any version of ROCm and APUs starting with ROCm 2.2
|
|
* can perform lazy allocation of private segment (scratch) memory,
|
|
* where the runtime will intentianally underallocate scratch
|
|
* resources to save framebuffer (or system on APU) memory.
|
|
* If we don't have enough scratch memory to launch this kernel,
|
|
* we need to raise a recoverable error code to the runtime by
|
|
* asserting queue_inactive_signal for the queue. The runtime will
|
|
* then try to allocate more scratch and reset this signal. When
|
|
* the signal is reset we should check that the runtime was
|
|
* successful and then proceed to launch the kernel.
|
|
*/
|
|
if ((task->privMemPerItem() * VegaISA::NumVecElemPerVecReg) >
|
|
task->amdQueue.compute_tmpring_size_wavesize * 1024) {
|
|
// TODO: Raising this signal will potentially nuke scratch
|
|
// space for in-flight kernels that were launched from this
|
|
// queue. We need to drain all kernels and deschedule the
|
|
// queue before raising this signal. For now, just assert if
|
|
// there are any in-flight kernels and tell the user that this
|
|
// feature still needs to be implemented.
|
|
fatal_if(hsaPP->inFlightPkts(task->queueId()) > 1,
|
|
"Needed more scratch, but kernels are in flight for "
|
|
"this queue and it is unsafe to reallocate scratch. "
|
|
"We need to implement additional intelligence in the "
|
|
"hardware scheduling logic to support CP-driven "
|
|
"queue draining and scheduling.");
|
|
DPRINTF(GPUCommandProc, "Not enough scratch space to launch "
|
|
"kernel (%x available, %x requested bytes per "
|
|
"workitem). Asking host runtime to allocate more "
|
|
"space.\n",
|
|
task->amdQueue.compute_tmpring_size_wavesize * 1024,
|
|
task->privMemPerItem());
|
|
|
|
updateHsaSignal(task->amdQueue.queue_inactive_signal.handle, 1,
|
|
[ = ] (const uint64_t &dma_buffer)
|
|
{ WaitScratchDmaEvent(task, dma_buffer); });
|
|
|
|
} else {
|
|
DPRINTF(GPUCommandProc, "Sufficient scratch space, launching "
|
|
"kernel (%x available, %x requested bytes per "
|
|
"workitem).\n",
|
|
task->amdQueue.compute_tmpring_size_wavesize * 1024,
|
|
task->privMemPerItem());
|
|
dispatchPkt(task);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Poll on queue_inactive signal until the runtime can get around to
|
|
* taking care of our lack of scratch space.
|
|
*/
|
|
void
|
|
WaitScratchDmaEvent(HSAQueueEntry *task, const uint64_t &dmaBuffer)
|
|
{
|
|
if (dmaBuffer == 0) {
|
|
DPRINTF(GPUCommandProc, "Host scratch allocation complete. "
|
|
"Attempting to re-read MQD\n");
|
|
/**
|
|
* Runtime will have updated the MQD to give us more scratch
|
|
* space. Read it out and continue to pester the runtime until
|
|
* we get all that we need to launch.
|
|
*
|
|
* TODO: Technically only need to update private segment fields
|
|
* since other MQD entries won't change since we last read them.
|
|
*/
|
|
auto cb = new DmaVirtCallback<int>(
|
|
[ = ] (const int &) { MQDDmaEvent(task); });
|
|
|
|
dmaReadVirt(task->hostAMDQueueAddr, sizeof(_amd_queue_t), cb,
|
|
&task->amdQueue);
|
|
} else {
|
|
/**
|
|
* Poll until runtime signals us that scratch space has been
|
|
* allocated.
|
|
*/
|
|
Addr value_addr = getHsaSignalValueAddr(
|
|
task->amdQueue.queue_inactive_signal.handle);
|
|
DPRINTF(GPUCommandProc, "Polling queue inactive signal at "
|
|
"%p.\n", value_addr);
|
|
auto cb = new DmaVirtCallback<uint64_t>(
|
|
[ = ] (const uint64_t &dma_buffer)
|
|
{ WaitScratchDmaEvent(task, dma_buffer); } );
|
|
|
|
/**
|
|
* Delay for a large amount of ticks to give the CPU time to
|
|
* setup the scratch space. The delay should be non-zero to since
|
|
* this method calls back itself and can cause an infinite loop
|
|
* in the event queue if the allocation is not completed by the
|
|
* first time this is called.
|
|
*/
|
|
dmaReadVirt(value_addr, sizeof(Addr), cb, &cb->dmaBuffer, 1e9);
|
|
}
|
|
}
|
|
};
|
|
|
|
} // namespace gem5
|
|
|
|
#endif // __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__
|