gpu-compute,dev-hsa: ROCm 5.5+ support (#498)
ROCm 5.5 support including: - Vendor packet completion signals - Queue remapping race condition fix - Backwards compatible GPR allocation - Fix transient readBlob fatal reading kernel descriptor
This commit is contained in:
@@ -466,7 +466,17 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
|
||||
panic("Write to unkown queue type!");
|
||||
}
|
||||
} else {
|
||||
warn("Unknown doorbell offset: %lx\n", offset);
|
||||
warn("Unknown doorbell offset: %lx. Saving to pending doorbells.\n",
|
||||
offset);
|
||||
|
||||
// We have to ACK the PCI packet immediately, so create a copy of the
|
||||
// packet here to send again.
|
||||
RequestPtr pending_req(pkt->req);
|
||||
PacketPtr pending_pkt = Packet::createWrite(pending_req);
|
||||
uint8_t *pending_data = new uint8_t[pkt->getSize()];
|
||||
pending_pkt->dataDynamic(pending_data);
|
||||
|
||||
pendingDoorbellPkts.emplace(offset, pending_pkt);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -589,6 +599,17 @@ AMDGPUDevice::write(PacketPtr pkt)
|
||||
return pioDelay;
|
||||
}
|
||||
|
||||
void
|
||||
AMDGPUDevice::processPendingDoorbells(uint32_t offset)
|
||||
{
|
||||
if (pendingDoorbellPkts.count(offset)) {
|
||||
DPRINTF(AMDGPUDevice, "Sending pending doorbell %x\n", offset);
|
||||
writeDoorbell(pendingDoorbellPkts[offset], offset);
|
||||
delete pendingDoorbellPkts[offset];
|
||||
pendingDoorbellPkts.erase(offset);
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
AMDGPUDevice::haveRegVal(uint32_t addr)
|
||||
{
|
||||
@@ -812,6 +833,14 @@ AMDGPUDevice::deallocateAllQueues()
|
||||
for (auto& it : sdmaEngs) {
|
||||
it.second->deallocateRLCQueues();
|
||||
}
|
||||
|
||||
// "All" queues implicitly refers to all user queues. User queues begin at
|
||||
// doorbell address 0x4000, so unmap any queue at or above that address.
|
||||
for (auto [offset, vmid] : doorbellVMIDMap) {
|
||||
if (offset >= 0x4000) {
|
||||
doorbells.erase(offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
||||
@@ -90,6 +90,7 @@ class AMDGPUDevice : public PciDevice
|
||||
using GPURegMap = std::unordered_map<uint32_t, uint64_t>;
|
||||
GPURegMap regs;
|
||||
std::unordered_map<uint32_t, QueueType> doorbells;
|
||||
std::unordered_map<uint32_t, PacketPtr> pendingDoorbellPkts;
|
||||
|
||||
/**
|
||||
* VGA ROM methods
|
||||
@@ -187,6 +188,7 @@ class AMDGPUDevice : public PciDevice
|
||||
* Set handles to GPU blocks.
|
||||
*/
|
||||
void setDoorbellType(uint32_t offset, QueueType qt);
|
||||
void processPendingDoorbells(uint32_t offset);
|
||||
void setSDMAEngine(Addr offset, SDMAEngine *eng);
|
||||
|
||||
/**
|
||||
|
||||
@@ -384,7 +384,10 @@ PM4PacketProcessor::mapQueues(PM4Queue *q, PM4MapQueues *pkt)
|
||||
"Mapping mqd from %p %p (vmid %d - last vmid %d).\n",
|
||||
addr, pkt->mqdAddr, pkt->vmid, gpuDevice->lastVMID());
|
||||
|
||||
gpuDevice->mapDoorbellToVMID(pkt->doorbellOffset,
|
||||
// The doorbellOffset is a dword address. We shift by two / multiply
|
||||
// by four to get the byte address to match doorbell addresses in
|
||||
// the GPU device.
|
||||
gpuDevice->mapDoorbellToVMID(pkt->doorbellOffset << 2,
|
||||
gpuDevice->lastVMID());
|
||||
|
||||
QueueDesc *mqd = new QueueDesc();
|
||||
@@ -444,6 +447,8 @@ PM4PacketProcessor::processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
|
||||
|
||||
DPRINTF(PM4PacketProcessor, "PM4 mqd read completed, base %p, mqd %p, "
|
||||
"hqdAQL %d.\n", mqd->base, mqd->mqdBase, mqd->aql);
|
||||
|
||||
gpuDevice->processPendingDoorbells(offset);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -472,6 +477,8 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
|
||||
// Register doorbell with GPU device
|
||||
gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng);
|
||||
gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC);
|
||||
|
||||
gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -576,6 +583,7 @@ PM4PacketProcessor::unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt)
|
||||
gpuDevice->deallocatePasid(pkt->pasid);
|
||||
break;
|
||||
case 2:
|
||||
panic("Unmapping queue selection 2 unimplemented\n");
|
||||
break;
|
||||
case 3: {
|
||||
auto &hsa_pp = gpuDevice->CP()->hsaPacketProc();
|
||||
|
||||
@@ -100,6 +100,14 @@ struct _hsa_barrier_or_packet_t
|
||||
uint64_t completion_signal;
|
||||
};
|
||||
|
||||
struct _hsa_generic_vendor_pkt
|
||||
{
|
||||
uint32_t padding[14];
|
||||
Addr completion_signal;
|
||||
};
|
||||
// All HSA AQL packets are 64 bytes. Confirm that here.
|
||||
static_assert(sizeof(_hsa_generic_vendor_pkt) == 64);
|
||||
|
||||
} // namespace gem5
|
||||
|
||||
#endif // __DEV_HSA_HSA_PACKET_HH__
|
||||
|
||||
@@ -116,28 +116,52 @@ void
|
||||
GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
|
||||
Addr host_pkt_addr)
|
||||
{
|
||||
static int dynamic_task_id = 0;
|
||||
_hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
|
||||
assert(!(disp_pkt->kernel_object & (system()->cacheLineSize() - 1)));
|
||||
|
||||
/**
|
||||
* we need to read a pointer in the application's address
|
||||
* space to pull out the kernel code descriptor.
|
||||
* Need to use a raw pointer for DmaVirtDevice API. This is deleted
|
||||
* in the dispatchKernelObject method.
|
||||
*/
|
||||
auto *tc = sys->threads[0];
|
||||
|
||||
TranslatingPortProxy fs_proxy(tc);
|
||||
SETranslatingPortProxy se_proxy(tc);
|
||||
PortProxy &virt_proxy = FullSystem ? fs_proxy : se_proxy;
|
||||
AMDKernelCode *akc = new AMDKernelCode;
|
||||
|
||||
/**
|
||||
* In full system mode, the page table entry may point to a system page
|
||||
* or a device page. System pages use the proxy as normal, but a device
|
||||
* page needs to be read from device memory. Check what type it is here.
|
||||
* The kernel_object is a pointer to the machine code, whose entry
|
||||
* point is an 'amd_kernel_code_t' type, which is included in the
|
||||
* kernel binary, and describes various aspects of the kernel. The
|
||||
* desired entry is the 'kernel_code_entry_byte_offset' field,
|
||||
* which provides the byte offset (positive or negative) from the
|
||||
* address of the amd_kernel_code_t to the start of the machine
|
||||
* instructions.
|
||||
*
|
||||
* For SE mode we can read from the port proxy. In FS mode, we may need
|
||||
* to wait for the guest OS to setup translations, especially when using
|
||||
* the KVM CPU, so it is preferred to read the code object using a timing
|
||||
* DMA request.
|
||||
*/
|
||||
bool is_system_page = true;
|
||||
Addr phys_addr = disp_pkt->kernel_object;
|
||||
if (FullSystem) {
|
||||
if (!FullSystem) {
|
||||
/**
|
||||
* we need to read a pointer in the application's address
|
||||
* space to pull out the kernel code descriptor.
|
||||
*/
|
||||
auto *tc = sys->threads[0];
|
||||
SETranslatingPortProxy virt_proxy(tc);
|
||||
|
||||
DPRINTF(GPUCommandProc, "reading kernel_object using proxy\n");
|
||||
virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)akc,
|
||||
sizeof(AMDKernelCode));
|
||||
|
||||
dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
|
||||
} else {
|
||||
/**
|
||||
* In full system mode, the page table entry may point to a system
|
||||
* page or a device page. System pages use the proxy as normal, but
|
||||
* a device page needs to be read from device memory. Check what type
|
||||
* it is here.
|
||||
*/
|
||||
bool is_system_page = true;
|
||||
Addr phys_addr = disp_pkt->kernel_object;
|
||||
|
||||
/**
|
||||
* Full system currently only supports running on single VMID (one
|
||||
* virtual memory space), i.e., one application running on GPU at a
|
||||
@@ -149,61 +173,68 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
|
||||
walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid),
|
||||
phys_addr, tmp_bytes, BaseMMU::Mode::Read,
|
||||
is_system_page);
|
||||
}
|
||||
|
||||
DPRINTF(GPUCommandProc, "kernobj vaddr %#lx paddr %#lx size %d s:%d\n",
|
||||
disp_pkt->kernel_object, phys_addr, sizeof(AMDKernelCode),
|
||||
is_system_page);
|
||||
DPRINTF(GPUCommandProc, "kernel_object vaddr %#lx paddr %#lx size %d"
|
||||
" s:%d\n", disp_pkt->kernel_object, phys_addr,
|
||||
sizeof(AMDKernelCode), is_system_page);
|
||||
|
||||
/**
|
||||
* The kernel_object is a pointer to the machine code, whose entry
|
||||
* point is an 'amd_kernel_code_t' type, which is included in the
|
||||
* kernel binary, and describes various aspects of the kernel. The
|
||||
* desired entry is the 'kernel_code_entry_byte_offset' field,
|
||||
* which provides the byte offset (positive or negative) from the
|
||||
* address of the amd_kernel_code_t to the start of the machine
|
||||
* instructions.
|
||||
*/
|
||||
AMDKernelCode akc;
|
||||
if (is_system_page) {
|
||||
DPRINTF(GPUCommandProc, "kernel_object in system, using proxy\n");
|
||||
virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc,
|
||||
sizeof(AMDKernelCode));
|
||||
} else {
|
||||
assert(FullSystem);
|
||||
DPRINTF(GPUCommandProc, "kernel_object in device, using device mem\n");
|
||||
/**
|
||||
* System objects use DMA device. Device objects need to use device
|
||||
* memory.
|
||||
*/
|
||||
if (is_system_page) {
|
||||
DPRINTF(GPUCommandProc,
|
||||
"sending system DMA read for kernel_object\n");
|
||||
|
||||
// Read from GPU memory manager one cache line at a time to prevent
|
||||
// rare cases where the AKC spans two memory pages.
|
||||
ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode),
|
||||
system()->cacheLineSize());
|
||||
for (; !gen.done(); gen.next()) {
|
||||
Addr chunk_addr = gen.addr();
|
||||
int vmid = 1;
|
||||
unsigned dummy;
|
||||
walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid),
|
||||
chunk_addr, dummy, BaseMMU::Mode::Read,
|
||||
is_system_page);
|
||||
auto dma_callback = new DmaVirtCallback<uint32_t>(
|
||||
[=](const uint32_t&) {
|
||||
dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
|
||||
});
|
||||
|
||||
Request::Flags flags = Request::PHYSICAL;
|
||||
RequestPtr request = std::make_shared<Request>(chunk_addr,
|
||||
system()->cacheLineSize(), flags, walker->getDevRequestor());
|
||||
Packet *readPkt = new Packet(request, MemCmd::ReadReq);
|
||||
readPkt->dataStatic((uint8_t *)&akc + gen.complete());
|
||||
system()->getDeviceMemory(readPkt)->access(readPkt);
|
||||
delete readPkt;
|
||||
dmaReadVirt(disp_pkt->kernel_object, sizeof(AMDKernelCode),
|
||||
dma_callback, (void *)akc);
|
||||
} else {
|
||||
DPRINTF(GPUCommandProc,
|
||||
"kernel_object in device, using device mem\n");
|
||||
|
||||
// Read from GPU memory manager one cache line at a time to prevent
|
||||
// rare cases where the AKC spans two memory pages.
|
||||
ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode),
|
||||
system()->cacheLineSize());
|
||||
for (; !gen.done(); gen.next()) {
|
||||
Addr chunk_addr = gen.addr();
|
||||
int vmid = 1;
|
||||
unsigned dummy;
|
||||
walker->startFunctional(
|
||||
gpuDevice->getVM().getPageTableBase(vmid), chunk_addr,
|
||||
dummy, BaseMMU::Mode::Read, is_system_page);
|
||||
|
||||
Request::Flags flags = Request::PHYSICAL;
|
||||
RequestPtr request = std::make_shared<Request>(chunk_addr,
|
||||
system()->cacheLineSize(), flags,
|
||||
walker->getDevRequestor());
|
||||
Packet *readPkt = new Packet(request, MemCmd::ReadReq);
|
||||
readPkt->dataStatic((uint8_t *)akc + gen.complete());
|
||||
system()->getDeviceMemory(readPkt)->access(readPkt);
|
||||
delete readPkt;
|
||||
}
|
||||
|
||||
dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
|
||||
uint32_t queue_id, Addr host_pkt_addr)
|
||||
{
|
||||
_hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
|
||||
|
||||
DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
|
||||
"kernel object\n", akc.kernel_code_entry_byte_offset);
|
||||
|
||||
DPRINTF(GPUCommandProc,"GPUCommandProc: Sending dispatch pkt to %lu\n",
|
||||
(uint64_t)tc->cpuId());
|
||||
|
||||
"kernel object\n", akc->kernel_code_entry_byte_offset);
|
||||
|
||||
Addr machine_code_addr = (Addr)disp_pkt->kernel_object
|
||||
+ akc.kernel_code_entry_byte_offset;
|
||||
+ akc->kernel_code_entry_byte_offset;
|
||||
|
||||
DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
|
||||
machine_code_addr);
|
||||
@@ -219,7 +250,7 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
|
||||
* APUs to implement asynchronous memcopy operations from 2 pointers in
|
||||
* host memory. I have no idea what BLIT stands for.
|
||||
* */
|
||||
if (akc.runtime_loader_kernel_symbol) {
|
||||
if (akc->runtime_loader_kernel_symbol) {
|
||||
kernel_name = "Some kernel";
|
||||
} else {
|
||||
kernel_name = "Blit kernel";
|
||||
@@ -230,7 +261,7 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
|
||||
GfxVersion gfxVersion = FullSystem ? gpuDevice->getGfxVersion()
|
||||
: driver()->getGfxVersion();
|
||||
HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
|
||||
dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr,
|
||||
dynamic_task_id, raw_pkt, akc, host_pkt_addr, machine_code_addr,
|
||||
gfxVersion);
|
||||
|
||||
DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
|
||||
@@ -252,6 +283,8 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
|
||||
// The driver expects the start time to be in ns
|
||||
Tick start_ts = curTick() / sim_clock::as_int::ns;
|
||||
dispatchStartTime.insert({disp_pkt->completion_signal, start_ts});
|
||||
|
||||
delete akc;
|
||||
}
|
||||
|
||||
void
|
||||
@@ -473,18 +506,27 @@ GPUCommandProcessor::driver()
|
||||
*/
|
||||
|
||||
/**
|
||||
* TODO: For now we simply tell the HSAPP to finish the packet,
|
||||
* however a future patch will update this method to provide
|
||||
* the proper handling of any required vendor-specific packets.
|
||||
* In the version of ROCm that is currently supported (1.6)
|
||||
* the runtime will send packets that direct the CP to
|
||||
* invalidate the GPUs caches. We do this automatically on
|
||||
* each kernel launch in the CU, so this is safe for now.
|
||||
* TODO: For now we simply tell the HSAPP to finish the packet and write a
|
||||
* completion signal, if any. However, in the future proper handling may be
|
||||
* required for vendor specific packets.
|
||||
*
|
||||
* In the version of ROCm that is currently supported the runtime will send
|
||||
* packets that direct the CP to invalidate the GPU caches. We do this
|
||||
* automatically on each kernel launch in the CU, so that situation is safe
|
||||
* for now.
|
||||
*/
|
||||
void
|
||||
GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
|
||||
Addr host_pkt_addr)
|
||||
{
|
||||
auto vendor_pkt = (_hsa_generic_vendor_pkt *)raw_pkt;
|
||||
|
||||
if (vendor_pkt->completion_signal) {
|
||||
sendCompletionSignal(vendor_pkt->completion_signal);
|
||||
}
|
||||
|
||||
warn("Ignoring vendor packet\n");
|
||||
|
||||
hsaPP->finishPkt(raw_pkt, queue_id);
|
||||
}
|
||||
|
||||
|
||||
@@ -99,6 +99,8 @@ class GPUCommandProcessor : public DmaVirtDevice
|
||||
Addr host_pkt_addr);
|
||||
void attachDriver(GPUComputeDriver *driver);
|
||||
|
||||
void dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
|
||||
uint32_t queue_id, Addr host_pkt_addr);
|
||||
void dispatchPkt(HSAQueueEntry *task);
|
||||
void signalWakeupEvent(uint32_t event_id);
|
||||
|
||||
@@ -149,6 +151,9 @@ class GPUCommandProcessor : public DmaVirtDevice
|
||||
HSAPacketProcessor *hsaPP;
|
||||
TranslationGenPtr translate(Addr vaddr, Addr size) override;
|
||||
|
||||
// Running counter of dispatched tasks
|
||||
int dynamic_task_id = 0;
|
||||
|
||||
// Keep track of start times for task dispatches.
|
||||
std::unordered_map<Addr, Tick> dispatchStartTime;
|
||||
|
||||
|
||||
@@ -70,8 +70,6 @@ class HSAQueueEntry
|
||||
_gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x,
|
||||
(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y,
|
||||
(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}},
|
||||
numVgprs(akc->workitem_vgpr_count),
|
||||
numSgprs(akc->wavefront_sgpr_count),
|
||||
_queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt),
|
||||
_hostDispPktAddr(host_pkt_addr),
|
||||
_completionSignal(((_hsa_dispatch_packet_t*)disp_pkt)
|
||||
@@ -88,40 +86,36 @@ class HSAQueueEntry
|
||||
_globalWgId(0), dispatchComplete(false)
|
||||
|
||||
{
|
||||
// Precompiled BLIT kernels actually violate the spec a bit
|
||||
// and don't set many of the required akc fields. For these kernels,
|
||||
// we need to rip register usage from the resource registers.
|
||||
//
|
||||
// We can't get an exact number of registers from the resource
|
||||
// registers because they round, but we can get an upper bound on it.
|
||||
// We determine the number of registers by solving for "vgprs_used"
|
||||
// in the LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html
|
||||
// Use the resource descriptors to determine number of GPRs. This will
|
||||
// round up in some cases, however the exact number field in the AMD
|
||||
// kernel code struct is not backwards compatible and that field is
|
||||
// not populated in newer compiles. The resource descriptor dword must
|
||||
// be backwards compatible, so use that always.
|
||||
// LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html
|
||||
// #code-object-v3-kernel-descriptor
|
||||
//
|
||||
// Currently, the only supported gfx version in gem5 that computes
|
||||
// this differently is gfx90a.
|
||||
if (!numVgprs) {
|
||||
if (gfx_version == GfxVersion::gfx90a) {
|
||||
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;
|
||||
} else {
|
||||
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
|
||||
}
|
||||
// VGPR count differently is gfx90a.
|
||||
if (gfx_version == GfxVersion::gfx90a) {
|
||||
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;
|
||||
} else {
|
||||
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
|
||||
}
|
||||
|
||||
if (!numSgprs || numSgprs ==
|
||||
std::numeric_limits<decltype(akc->wavefront_sgpr_count)>::max()) {
|
||||
// Supported major generation numbers: 0 (BLIT kernels), 8, and 9
|
||||
uint16_t version = akc->amd_machine_version_major;
|
||||
assert((version == 0) || (version == 8) || (version == 9));
|
||||
// SGPR allocation granularies:
|
||||
// - GFX8: 8
|
||||
// - GFX9: 16
|
||||
// Source: https://llvm.org/docs/AMDGPUUsage.html
|
||||
if ((version == 0) || (version == 8)) {
|
||||
// We assume that BLIT kernels use the same granularity as GFX8
|
||||
numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8;
|
||||
} else if (version == 9) {
|
||||
numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2;
|
||||
}
|
||||
// SGPR allocation granularities:
|
||||
// - GFX8: 8
|
||||
// - GFX9: 16
|
||||
// Source: https://llvm.org/docs/AMDGPUUsage.html
|
||||
if (gfx_version == GfxVersion::gfx801 ||
|
||||
gfx_version == GfxVersion::gfx803) {
|
||||
numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8;
|
||||
} else if (gfx_version == GfxVersion::gfx900 ||
|
||||
gfx_version == GfxVersion::gfx902 ||
|
||||
gfx_version == GfxVersion::gfx908 ||
|
||||
gfx_version == GfxVersion::gfx90a) {
|
||||
numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2;
|
||||
} else {
|
||||
panic("Saw unknown gfx version setting up GPR counts\n");
|
||||
}
|
||||
|
||||
initialVgprState.reset();
|
||||
|
||||
Reference in New Issue
Block a user