gpu-compute: Implement packed workitem ABI init

This initialization method is used by gfx90a (MI200). Rather than using
three VGPRs for the X, Y, and Z dimensions of the kernel, pack them into
one register with 10 bits for each dimension.

Change-Id: I8e5b681c8287779ff9f80451d6028e862322294a
This commit is contained in:
Matthew Poremba
2023-11-11 13:03:23 -06:00
parent 5e45233484
commit 8c016ebbbc

View File

@@ -474,8 +474,48 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
regInitIdx = 0;
// iterate over all the init fields and check which
// bits are enabled
// VGPRs are initialized to the work item IDs for a given thread. There
// are two ways to initialize the IDs based on number of dimensions. ISAs
// will either have packed work-item IDs or not. LLVM lists them here:
// https://llvm.org/docs/AMDGPUUsage.html#amdgpu-processor-table
// Default to false and set to true for gem5 supported ISAs.
bool packed_work_item_id = false;
if (task->gfxVersion() == GfxVersion::gfx90a) {
packed_work_item_id = true;
}
// For ISAs with packed work item IDs, only one VGPR is used and the
// (X,Y,Z) dimensions are packed into a single 32-bit VGPR with 10 bits
// for each dimension (X in bits 9:0, Y in bits 19:10, Z in bits 29:20).
if (packed_work_item_id) {
TheGpuISA::VecRegContainerU32 raw_vgpr;
TheGpuISA::VecElemU32 *packed_vgpr
= raw_vgpr.as<TheGpuISA::VecElemU32>();
uint32_t physVgprIdx = computeUnit->registerManager
->mapVgpr(this, regInitIdx);
for (int lane = 0; lane < workItemId[0].size(); ++lane) {
packed_vgpr[lane] = workItemId[0][lane] & 0x3ff;
}
if (task->vgprBitEnabled(1)) {
for (int lane = 0; lane < workItemId[1].size(); ++lane) {
packed_vgpr[lane] |= ((workItemId[1][lane] & 0x3ff) << 10);
}
}
if (task->vgprBitEnabled(2)) {
for (int lane = 0; lane < workItemId[2].size(); ++lane) {
packed_vgpr[lane] |= ((workItemId[2][lane] & 0x3ff) << 20);
}
}
computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
return;
}
// For ISAs with non-packed work item IDs, map and initialize one VGPR
// per dimension. Do this by iterating over all the init fields and
// checking which bits are enabled.
for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) {
if (task->vgprBitEnabled(en_bit)) {
uint32_t physVgprIdx = 0;