gpu-compute: Flat scratch implementation and bug fixes (#231)
Add commits fixing private segment counters, flat scratch address
calculation, and implementation of flat scratch instructions.
These commits were tested using a modified version of 'square':
template <typename T>
__global__ void
scratch_square(T *C_d, T *A_d, size_t N)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
volatile int foo; // Volatile ensures scratch / unoptimized code
for (size_t i=offset; i<N; i+=stride) {
foo = A_d[i];
C_d[i] = foo * foo;
}
}
This commit is contained in:
@@ -1546,6 +1546,8 @@ namespace VegaISA
|
||||
// The SEG field specifies FLAT(0) SCRATCH(1) or GLOBAL(2)
|
||||
if (iFmt->SEG == 0) {
|
||||
setFlag(Flat);
|
||||
} else if (iFmt->SEG == 1) {
|
||||
setFlag(FlatScratch);
|
||||
} else if (iFmt->SEG == 2) {
|
||||
setFlag(FlatGlobal);
|
||||
} else {
|
||||
@@ -1573,12 +1575,12 @@ namespace VegaISA
|
||||
Inst_FLAT::initOperandInfo()
|
||||
{
|
||||
// One of the flat subtypes should be specified via flags
|
||||
assert(isFlat() ^ isFlatGlobal());
|
||||
assert(isFlat() ^ isFlatGlobal() ^ isFlatScratch());
|
||||
|
||||
if (isFlat()) {
|
||||
initFlatOperandInfo();
|
||||
} else if (isFlatGlobal()) {
|
||||
initGlobalOperandInfo();
|
||||
} else if (isFlatGlobal() || isFlatScratch()) {
|
||||
initGlobalScratchOperandInfo();
|
||||
} else {
|
||||
panic("Unknown flat subtype!\n");
|
||||
}
|
||||
@@ -1622,7 +1624,7 @@ namespace VegaISA
|
||||
}
|
||||
|
||||
void
|
||||
Inst_FLAT::initGlobalOperandInfo()
|
||||
Inst_FLAT::initGlobalScratchOperandInfo()
|
||||
{
|
||||
//3 formats:
|
||||
// 1 dst + 2 src (load)
|
||||
@@ -1691,12 +1693,12 @@ namespace VegaISA
|
||||
Inst_FLAT::generateDisassembly()
|
||||
{
|
||||
// One of the flat subtypes should be specified via flags
|
||||
assert(isFlat() ^ isFlatGlobal());
|
||||
assert(isFlat() ^ isFlatGlobal() ^ isFlatScratch());
|
||||
|
||||
if (isFlat()) {
|
||||
generateFlatDisassembly();
|
||||
} else if (isFlatGlobal()) {
|
||||
generateGlobalDisassembly();
|
||||
} else if (isFlatGlobal() || isFlatScratch()) {
|
||||
generateGlobalScratchDisassembly();
|
||||
} else {
|
||||
panic("Unknown flat subtype!\n");
|
||||
}
|
||||
@@ -1720,11 +1722,16 @@ namespace VegaISA
|
||||
}
|
||||
|
||||
void
|
||||
Inst_FLAT::generateGlobalDisassembly()
|
||||
Inst_FLAT::generateGlobalScratchDisassembly()
|
||||
{
|
||||
// Replace flat_ with global_ in assembly string
|
||||
std::string global_opcode = _opcode;
|
||||
global_opcode.replace(0, 4, "global");
|
||||
if (isFlatGlobal()) {
|
||||
global_opcode.replace(0, 4, "global");
|
||||
} else {
|
||||
assert(isFlatScratch());
|
||||
global_opcode.replace(0, 4, "scratch");
|
||||
}
|
||||
|
||||
std::stringstream dis_stream;
|
||||
dis_stream << global_opcode << " ";
|
||||
|
||||
@@ -926,7 +926,8 @@ namespace VegaISA
|
||||
void
|
||||
initMemRead(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
|
||||
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
|
||||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
|
||||
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
|
||||
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
@@ -944,7 +945,8 @@ namespace VegaISA
|
||||
void
|
||||
initMemRead(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
|
||||
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
|
||||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
|
||||
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
|
||||
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
@@ -966,7 +968,8 @@ namespace VegaISA
|
||||
void
|
||||
initMemWrite(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
|
||||
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
|
||||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
|
||||
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
|
||||
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
@@ -984,7 +987,8 @@ namespace VegaISA
|
||||
void
|
||||
initMemWrite(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
|
||||
if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
|
||||
gpuDynInst->executedAs() == enums::SC_PRIVATE) {
|
||||
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
|
||||
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
@@ -1006,6 +1010,10 @@ namespace VegaISA
|
||||
void
|
||||
initAtomicAccess(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
// Flat scratch requests may not be atomic according to ISA manual
|
||||
// up to MI200. See MI200 manual Table 45.
|
||||
assert(gpuDynInst->executedAs() != enums::SC_PRIVATE);
|
||||
|
||||
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
|
||||
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
|
||||
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
|
||||
@@ -1044,7 +1052,8 @@ namespace VegaISA
|
||||
// If saddr = 0x7f there is no scalar reg to read and address will
|
||||
// be a 64-bit address. Otherwise, saddr is the reg index for a
|
||||
// scalar reg used as the base address for a 32-bit address.
|
||||
if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) {
|
||||
if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch()))
|
||||
|| isFlat()) {
|
||||
ConstVecOperandU64 vbase(gpuDynInst, vaddr);
|
||||
vbase.read();
|
||||
|
||||
@@ -1063,9 +1072,13 @@ namespace VegaISA
|
||||
|
||||
if (isFlat()) {
|
||||
gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
|
||||
} else {
|
||||
} else if (isFlatGlobal()) {
|
||||
gpuDynInst->staticInstruction()->executed_as =
|
||||
enums::SC_GLOBAL;
|
||||
} else {
|
||||
assert(isFlatScratch());
|
||||
gpuDynInst->staticInstruction()->executed_as =
|
||||
enums::SC_PRIVATE;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1081,7 +1094,9 @@ namespace VegaISA
|
||||
gpuDynInst->computeUnit()->localMemoryPipe
|
||||
.issueRequest(gpuDynInst);
|
||||
} else {
|
||||
fatal("Unsupported scope for flat instruction.\n");
|
||||
assert(gpuDynInst->executedAs() == enums::SC_PRIVATE);
|
||||
gpuDynInst->computeUnit()->globalMemoryPipe
|
||||
.issueRequest(gpuDynInst);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1098,10 +1113,10 @@ namespace VegaISA
|
||||
|
||||
private:
|
||||
void initFlatOperandInfo();
|
||||
void initGlobalOperandInfo();
|
||||
void initGlobalScratchOperandInfo();
|
||||
|
||||
void generateFlatDisassembly();
|
||||
void generateGlobalDisassembly();
|
||||
void generateGlobalScratchDisassembly();
|
||||
|
||||
void
|
||||
calcAddrSgpr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &vaddr,
|
||||
|
||||
@@ -54,6 +54,7 @@ class GPUStaticInstFlags(Enum):
|
||||
"MemoryRef", # References memory (load, store, or atomic)
|
||||
"Flat", # Flat memory op
|
||||
"FlatGlobal", # Global memory op
|
||||
"FlatScratch", # Scratch memory op
|
||||
"Load", # Reads from memory
|
||||
"Store", # Writes to memory
|
||||
# Atomic ops
|
||||
|
||||
@@ -1917,6 +1917,8 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
|
||||
}
|
||||
} else if (gpuDynInst->isFlatGlobal()) {
|
||||
stats.flatVMemInsts++;
|
||||
} else if (gpuDynInst->isFlatScratch()) {
|
||||
stats.flatVMemInsts++;
|
||||
} else if (gpuDynInst->isLocalMem()) {
|
||||
stats.ldsNoFlatInsts++;
|
||||
} else if (gpuDynInst->isLoad()) {
|
||||
|
||||
@@ -432,6 +432,12 @@ GPUDynInst::isFlatGlobal() const
|
||||
return _staticInst->isFlatGlobal();
|
||||
}
|
||||
|
||||
bool
|
||||
GPUDynInst::isFlatScratch() const
|
||||
{
|
||||
return _staticInst->isFlatScratch();
|
||||
}
|
||||
|
||||
bool
|
||||
GPUDynInst::isLoad() const
|
||||
{
|
||||
@@ -901,12 +907,12 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
|
||||
uint32_t numSgprs = wavefront()->maxSgprs;
|
||||
uint32_t physSgprIdx =
|
||||
wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
|
||||
numSgprs - 3);
|
||||
numSgprs - 4);
|
||||
uint32_t offset =
|
||||
wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
|
||||
physSgprIdx =
|
||||
wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
|
||||
numSgprs - 4);
|
||||
numSgprs - 3);
|
||||
uint32_t size =
|
||||
wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
|
||||
for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
|
||||
@@ -919,12 +925,12 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
|
||||
wavefront()->execUnitId = wavefront()->flatLmUnitId;
|
||||
wavefront()->decLGKMInstsIssued();
|
||||
if (isLoad()) {
|
||||
wavefront()->rdGmReqsInPipe--;
|
||||
wavefront()->rdLmReqsInPipe--;
|
||||
} else if (isStore()) {
|
||||
wavefront()->wrGmReqsInPipe--;
|
||||
wavefront()->wrLmReqsInPipe--;
|
||||
} else if (isAtomic() || isMemSync()) {
|
||||
wavefront()->rdGmReqsInPipe--;
|
||||
wavefront()->wrGmReqsInPipe--;
|
||||
wavefront()->wrLmReqsInPipe--;
|
||||
wavefront()->rdLmReqsInPipe--;
|
||||
} else {
|
||||
panic("Invalid memory operation!\n");
|
||||
}
|
||||
|
||||
@@ -234,6 +234,7 @@ class GPUDynInst : public GPUExecContext
|
||||
bool isMemRef() const;
|
||||
bool isFlat() const;
|
||||
bool isFlatGlobal() const;
|
||||
bool isFlatScratch() const;
|
||||
bool isLoad() const;
|
||||
bool isStore() const;
|
||||
|
||||
|
||||
@@ -130,6 +130,7 @@ class GPUStaticInst : public GPUStaticInstFlags
|
||||
bool isMemRef() const { return _flags[MemoryRef]; }
|
||||
bool isFlat() const { return _flags[Flat]; }
|
||||
bool isFlatGlobal() const { return _flags[FlatGlobal]; }
|
||||
bool isFlatScratch() const { return _flags[FlatScratch]; }
|
||||
bool isLoad() const { return _flags[Load]; }
|
||||
bool isStore() const { return _flags[Store]; }
|
||||
|
||||
|
||||
@@ -154,7 +154,8 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
|
||||
if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
|
||||
ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
|
||||
ii->isEndOfKernel() || ii->isMemSync() || ii->isFlat() ||
|
||||
ii->isFlatGlobal() || ii->isSleep() || ii->isLocalMem())) {
|
||||
ii->isFlatGlobal() || ii->isFlatScratch() || ii->isSleep() ||
|
||||
ii->isLocalMem())) {
|
||||
panic("next instruction: %s is of unknown type\n", ii->disassemble());
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user