gpu-compute: Flat scratch implementation and bug fixes (#231)

Add commits that fix the private segment counters and the flat scratch
address calculation, and that implement the flat scratch instructions.

These commits were tested using a modified version of 'square':

// Test kernel: squares each element of A_d into C_d via a volatile
// local, so every element round-trips through a scratch (private
// memory) access — this exercises the flat-scratch path under test.
template <typename T>
__global__ void
scratch_square(T *C_d, T *A_d, size_t N)
{
    // Grid-stride loop: each thread starts at its global thread index
    // and advances by the total number of threads in the grid.
    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
    size_t stride = blockDim.x * gridDim.x ;

    // Volatile forces the compiler to keep foo in memory (scratch)
    // instead of a register, preventing the load/square/store from
    // being optimized into pure register arithmetic.
    // NOTE: foo is int, so non-integer T values are converted to int
    // before squaring.
    volatile int foo; // Volatile ensures scratch / unoptimized code

    for (size_t i=offset; i<N; i+=stride) {
        foo = A_d[i];
        C_d[i] = foo * foo;
    }
}
This commit is contained in:
Matthew Poremba
2023-08-27 07:40:24 -07:00
committed by GitHub
8 changed files with 59 additions and 25 deletions

View File

@@ -1546,6 +1546,8 @@ namespace VegaISA
// The SEG field specifies FLAT(0) SCRATCH(1) or GLOBAL(2)
if (iFmt->SEG == 0) {
setFlag(Flat);
} else if (iFmt->SEG == 1) {
setFlag(FlatScratch);
} else if (iFmt->SEG == 2) {
setFlag(FlatGlobal);
} else {
@@ -1573,12 +1575,12 @@ namespace VegaISA
Inst_FLAT::initOperandInfo()
{
// One of the flat subtypes should be specified via flags
assert(isFlat() ^ isFlatGlobal());
assert(isFlat() ^ isFlatGlobal() ^ isFlatScratch());
if (isFlat()) {
initFlatOperandInfo();
} else if (isFlatGlobal()) {
initGlobalOperandInfo();
} else if (isFlatGlobal() || isFlatScratch()) {
initGlobalScratchOperandInfo();
} else {
panic("Unknown flat subtype!\n");
}
@@ -1622,7 +1624,7 @@ namespace VegaISA
}
void
Inst_FLAT::initGlobalOperandInfo()
Inst_FLAT::initGlobalScratchOperandInfo()
{
//3 formats:
// 1 dst + 2 src (load)
@@ -1691,12 +1693,12 @@ namespace VegaISA
Inst_FLAT::generateDisassembly()
{
// One of the flat subtypes should be specified via flags
assert(isFlat() ^ isFlatGlobal());
assert(isFlat() ^ isFlatGlobal() ^ isFlatScratch());
if (isFlat()) {
generateFlatDisassembly();
} else if (isFlatGlobal()) {
generateGlobalDisassembly();
} else if (isFlatGlobal() || isFlatScratch()) {
generateGlobalScratchDisassembly();
} else {
panic("Unknown flat subtype!\n");
}
@@ -1720,11 +1722,16 @@ namespace VegaISA
}
void
Inst_FLAT::generateGlobalDisassembly()
Inst_FLAT::generateGlobalScratchDisassembly()
{
// Replace flat_ with global_ in assembly string
std::string global_opcode = _opcode;
global_opcode.replace(0, 4, "global");
if (isFlatGlobal()) {
global_opcode.replace(0, 4, "global");
} else {
assert(isFlatScratch());
global_opcode.replace(0, 4, "scratch");
}
std::stringstream dis_stream;
dis_stream << global_opcode << " ";

View File

@@ -926,7 +926,8 @@ namespace VegaISA
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
@@ -944,7 +945,8 @@ namespace VegaISA
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
@@ -966,7 +968,8 @@ namespace VegaISA
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
@@ -984,7 +987,8 @@ namespace VegaISA
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
@@ -1006,6 +1010,10 @@ namespace VegaISA
void
initAtomicAccess(GPUDynInstPtr gpuDynInst)
{
// Flat scratch requests may not be atomic according to ISA manual
// up to MI200. See MI200 manual Table 45.
assert(gpuDynInst->executedAs() != enums::SC_PRIVATE);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
@@ -1044,7 +1052,8 @@ namespace VegaISA
// If saddr = 0x7f there is no scalar reg to read and address will
// be a 64-bit address. Otherwise, saddr is the reg index for a
// scalar reg used as the base address for a 32-bit address.
if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) {
if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch()))
|| isFlat()) {
ConstVecOperandU64 vbase(gpuDynInst, vaddr);
vbase.read();
@@ -1063,9 +1072,13 @@ namespace VegaISA
if (isFlat()) {
gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
} else {
} else if (isFlatGlobal()) {
gpuDynInst->staticInstruction()->executed_as =
enums::SC_GLOBAL;
} else {
assert(isFlatScratch());
gpuDynInst->staticInstruction()->executed_as =
enums::SC_PRIVATE;
}
}
@@ -1081,7 +1094,9 @@ namespace VegaISA
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Unsupported scope for flat instruction.\n");
assert(gpuDynInst->executedAs() == enums::SC_PRIVATE);
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
}
}
@@ -1098,10 +1113,10 @@ namespace VegaISA
private:
void initFlatOperandInfo();
void initGlobalOperandInfo();
void initGlobalScratchOperandInfo();
void generateFlatDisassembly();
void generateGlobalDisassembly();
void generateGlobalScratchDisassembly();
void
calcAddrSgpr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &vaddr,

View File

@@ -54,6 +54,7 @@ class GPUStaticInstFlags(Enum):
"MemoryRef", # References memory (load, store, or atomic)
"Flat", # Flat memory op
"FlatGlobal", # Global memory op
"FlatScratch", # Scratch memory op
"Load", # Reads from memory
"Store", # Writes to memory
# Atomic ops

View File

@@ -1917,6 +1917,8 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
}
} else if (gpuDynInst->isFlatGlobal()) {
stats.flatVMemInsts++;
} else if (gpuDynInst->isFlatScratch()) {
stats.flatVMemInsts++;
} else if (gpuDynInst->isLocalMem()) {
stats.ldsNoFlatInsts++;
} else if (gpuDynInst->isLoad()) {

View File

@@ -432,6 +432,12 @@ GPUDynInst::isFlatGlobal() const
return _staticInst->isFlatGlobal();
}
bool
GPUDynInst::isFlatScratch() const
{
return _staticInst->isFlatScratch();
}
bool
GPUDynInst::isLoad() const
{
@@ -901,12 +907,12 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
uint32_t numSgprs = wavefront()->maxSgprs;
uint32_t physSgprIdx =
wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
numSgprs - 3);
numSgprs - 4);
uint32_t offset =
wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
physSgprIdx =
wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
numSgprs - 4);
numSgprs - 3);
uint32_t size =
wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
@@ -919,12 +925,12 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
wavefront()->execUnitId = wavefront()->flatLmUnitId;
wavefront()->decLGKMInstsIssued();
if (isLoad()) {
wavefront()->rdGmReqsInPipe--;
wavefront()->rdLmReqsInPipe--;
} else if (isStore()) {
wavefront()->wrGmReqsInPipe--;
wavefront()->wrLmReqsInPipe--;
} else if (isAtomic() || isMemSync()) {
wavefront()->rdGmReqsInPipe--;
wavefront()->wrGmReqsInPipe--;
wavefront()->wrLmReqsInPipe--;
wavefront()->rdLmReqsInPipe--;
} else {
panic("Invalid memory operation!\n");
}

View File

@@ -234,6 +234,7 @@ class GPUDynInst : public GPUExecContext
bool isMemRef() const;
bool isFlat() const;
bool isFlatGlobal() const;
bool isFlatScratch() const;
bool isLoad() const;
bool isStore() const;

View File

@@ -130,6 +130,7 @@ class GPUStaticInst : public GPUStaticInstFlags
bool isMemRef() const { return _flags[MemoryRef]; }
bool isFlat() const { return _flags[Flat]; }
bool isFlatGlobal() const { return _flags[FlatGlobal]; }
bool isFlatScratch() const { return _flags[FlatScratch]; }
bool isLoad() const { return _flags[Load]; }
bool isStore() const { return _flags[Store]; }

View File

@@ -154,7 +154,8 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
ii->isEndOfKernel() || ii->isMemSync() || ii->isFlat() ||
ii->isFlatGlobal() || ii->isSleep() || ii->isLocalMem())) {
ii->isFlatGlobal() || ii->isFlatScratch() || ii->isSleep() ||
ii->isLocalMem())) {
panic("next instruction: %s is of unknown type\n", ii->disassemble());
}