gpu-compute,arch-vega: Implement flat scratch insts

Flat scratch instructions (aka private) are the 3rd and final segment of
flat instructions in gfx9 (Vega) and beyond. These are used for things
like spills/fills and thread local storage. This commit enables two
forms of flat scratch instructions: (1) flat_load/flat_store
instructions where the memory address resolves to private memory and (2)
the new scratch_load/scratch_store instructions in Vega. The first are
similar to older generation ISAs where the aperture is unknown until
address translation. The second are instructions guaranteed to go to
private memory.

Since these are very similar to flat global instructions there are
minimal changes needed:

- Ensure a flat instruction is exactly one of regular flat, global, or
  scratch (the three segments are mutually exclusive)
- Rename the global op_encoding methods to GlobalScratch to indicate
  they are shared by both the global and scratch variants, and that this
  sharing is intentional.
- Flat instructions in segment 1 output scratch_ in the disassembly
- Flat instructions executed as private use the same memory helpers as
  global
- Flat scratch cannot be an atomic

This was tested using a modified version of the 'square' application:

template <typename T>
__global__ void
scratch_square(T *C_d, T *A_d, size_t N)
{
    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
    size_t stride = blockDim.x * gridDim.x ;

    volatile int foo; // Volatile ensures scratch / unoptimized code

    for (size_t i=offset; i<N; i+=stride) {
        foo = A_d[i];
        C_d[i] = foo * foo;
    }
}

Change-Id: Icc91a7f67836fa3e759fefe7c1c3f6851528ae7d
This commit is contained in:
Matthew Poremba
2023-08-26 13:14:13 -05:00
parent 4506188e00
commit 60f071d09a
8 changed files with 53 additions and 19 deletions

View File

@@ -1546,6 +1546,8 @@ namespace VegaISA
// The SEG field specifies FLAT(0) SCRATCH(1) or GLOBAL(2)
if (iFmt->SEG == 0) {
setFlag(Flat);
} else if (iFmt->SEG == 1) {
setFlag(FlatScratch);
} else if (iFmt->SEG == 2) {
setFlag(FlatGlobal);
} else {
@@ -1573,12 +1575,12 @@ namespace VegaISA
Inst_FLAT::initOperandInfo()
{
// One of the flat subtypes should be specified via flags
assert(isFlat() ^ isFlatGlobal());
assert(isFlat() ^ isFlatGlobal() ^ isFlatScratch());
if (isFlat()) {
initFlatOperandInfo();
} else if (isFlatGlobal()) {
initGlobalOperandInfo();
} else if (isFlatGlobal() || isFlatScratch()) {
initGlobalScratchOperandInfo();
} else {
panic("Unknown flat subtype!\n");
}
@@ -1622,7 +1624,7 @@ namespace VegaISA
}
void
Inst_FLAT::initGlobalOperandInfo()
Inst_FLAT::initGlobalScratchOperandInfo()
{
//3 formats:
// 1 dst + 2 src (load)
@@ -1691,12 +1693,12 @@ namespace VegaISA
Inst_FLAT::generateDisassembly()
{
// One of the flat subtypes should be specified via flags
assert(isFlat() ^ isFlatGlobal());
assert(isFlat() ^ isFlatGlobal() ^ isFlatScratch());
if (isFlat()) {
generateFlatDisassembly();
} else if (isFlatGlobal()) {
generateGlobalDisassembly();
} else if (isFlatGlobal() || isFlatScratch()) {
generateGlobalScratchDisassembly();
} else {
panic("Unknown flat subtype!\n");
}
@@ -1720,11 +1722,16 @@ namespace VegaISA
}
void
Inst_FLAT::generateGlobalDisassembly()
Inst_FLAT::generateGlobalScratchDisassembly()
{
// Replace flat_ with global_ in assembly string
std::string global_opcode = _opcode;
global_opcode.replace(0, 4, "global");
if (isFlatGlobal()) {
global_opcode.replace(0, 4, "global");
} else {
assert(isFlatScratch());
global_opcode.replace(0, 4, "scratch");
}
std::stringstream dis_stream;
dis_stream << global_opcode << " ";

View File

@@ -926,7 +926,8 @@ namespace VegaISA
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
@@ -944,7 +945,8 @@ namespace VegaISA
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
@@ -966,7 +968,8 @@ namespace VegaISA
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
@@ -984,7 +987,8 @@ namespace VegaISA
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
@@ -1006,6 +1010,10 @@ namespace VegaISA
void
initAtomicAccess(GPUDynInstPtr gpuDynInst)
{
// Flat scratch requests may not be atomic according to ISA manual
// up to MI200. See MI200 manual Table 45.
assert(gpuDynInst->executedAs() != enums::SC_PRIVATE);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
@@ -1044,7 +1052,8 @@ namespace VegaISA
// If saddr = 0x7f there is no scalar reg to read and address will
// be a 64-bit address. Otherwise, saddr is the reg index for a
// scalar reg used as the base address for a 32-bit address.
if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) {
if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch()))
|| isFlat()) {
ConstVecOperandU64 vbase(gpuDynInst, vaddr);
vbase.read();
@@ -1063,9 +1072,13 @@ namespace VegaISA
if (isFlat()) {
gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
} else {
} else if (isFlatGlobal()) {
gpuDynInst->staticInstruction()->executed_as =
enums::SC_GLOBAL;
} else {
assert(isFlatScratch());
gpuDynInst->staticInstruction()->executed_as =
enums::SC_PRIVATE;
}
}
@@ -1081,7 +1094,9 @@ namespace VegaISA
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Unsupported scope for flat instruction.\n");
assert(gpuDynInst->executedAs() == enums::SC_PRIVATE);
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
}
}
@@ -1098,10 +1113,10 @@ namespace VegaISA
private:
void initFlatOperandInfo();
void initGlobalOperandInfo();
void initGlobalScratchOperandInfo();
void generateFlatDisassembly();
void generateGlobalDisassembly();
void generateGlobalScratchDisassembly();
void
calcAddrSgpr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &vaddr,

View File

@@ -54,6 +54,7 @@ class GPUStaticInstFlags(Enum):
"MemoryRef", # References memory (load, store, or atomic)
"Flat", # Flat memory op
"FlatGlobal", # Global memory op
"FlatScratch", # Scratch memory op
"Load", # Reads from memory
"Store", # Writes to memory
# Atomic ops

View File

@@ -1917,6 +1917,8 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
}
} else if (gpuDynInst->isFlatGlobal()) {
stats.flatVMemInsts++;
} else if (gpuDynInst->isFlatScratch()) {
stats.flatVMemInsts++;
} else if (gpuDynInst->isLocalMem()) {
stats.ldsNoFlatInsts++;
} else if (gpuDynInst->isLoad()) {

View File

@@ -432,6 +432,12 @@ GPUDynInst::isFlatGlobal() const
return _staticInst->isFlatGlobal();
}
bool
GPUDynInst::isFlatScratch() const
{
return _staticInst->isFlatScratch();
}
bool
GPUDynInst::isLoad() const
{

View File

@@ -234,6 +234,7 @@ class GPUDynInst : public GPUExecContext
bool isMemRef() const;
bool isFlat() const;
bool isFlatGlobal() const;
bool isFlatScratch() const;
bool isLoad() const;
bool isStore() const;

View File

@@ -130,6 +130,7 @@ class GPUStaticInst : public GPUStaticInstFlags
bool isMemRef() const { return _flags[MemoryRef]; }
bool isFlat() const { return _flags[Flat]; }
bool isFlatGlobal() const { return _flags[FlatGlobal]; }
bool isFlatScratch() const { return _flags[FlatScratch]; }
bool isLoad() const { return _flags[Load]; }
bool isStore() const { return _flags[Store]; }

View File

@@ -154,7 +154,8 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
ii->isEndOfKernel() || ii->isMemSync() || ii->isFlat() ||
ii->isFlatGlobal() || ii->isSleep() || ii->isLocalMem())) {
ii->isFlatGlobal() || ii->isFlatScratch() || ii->isSleep() ||
ii->isLocalMem())) {
panic("next instruction: %s is of unknown type\n", ii->disassemble());
}