diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc
index e07a392ced..43c33e44cc 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.cc
+++ b/src/arch/amdgpu/vega/gpu_decoder.cc
@@ -886,7 +886,7 @@
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
+        &Decoder::decode_OPU_VOP3__V_FMAC_F32,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
@@ -6172,6 +6172,12 @@
         return new Inst_VOP3__V_SUBREV_U32(&iFmt->iFmt_VOP3A);
     } // decode_OPU_VOP3__V_SUBREV_U32
 
+    GPUStaticInst*
+    Decoder::decode_OPU_VOP3__V_FMAC_F32(MachInst iFmt)
+    {
+        return new Inst_VOP3__V_FMAC_F32(&iFmt->iFmt_VOP3A);
+    } // decode_OPU_VOP3__V_FMAC_F32
+
     GPUStaticInst*
     Decoder::decode_OPU_VOP3__V_NOP(MachInst iFmt)
     {
diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh
index 2523734ce5..285377ad3d 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.hh
+++ b/src/arch/amdgpu/vega/gpu_decoder.hh
@@ -325,6 +325,7 @@ namespace VegaISA
         GPUStaticInst* decode_OPU_VOP3__V_ADD_U32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_SUB_U32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_SUBREV_U32(MachInst);
+        GPUStaticInst* decode_OPU_VOP3__V_FMAC_F32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_NOP(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_MOV_B32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_CVT_I32_F64(MachInst);
@@ -1713,7 +1714,7 @@ namespace VegaISA
 
     struct InFmt_FLAT {
         unsigned int OFFSET : 13;
-        unsigned int LDS : 1;
+        unsigned int SVE : 1;
         unsigned int SEG : 2;
         unsigned int GLC : 1;
         unsigned int SLC : 1;
diff --git a/src/arch/amdgpu/vega/insts/ds.cc b/src/arch/amdgpu/vega/insts/ds.cc
index 57d58638c5..c377daa487 100644
--- a/src/arch/amdgpu/vega/insts/ds.cc
+++ b/src/arch/amdgpu/vega/insts/ds.cc
@@ -1997,6 +1997,7 @@ namespace VegaISA
          * fits in better with the LDS pipeline logic.
          */
         setFlag(Load);
+        setFlag(ALU);
     } // Inst_DS__DS_SWIZZLE_B32
 
     Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32()
diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh
index 21984a9bbd..a979c1e492 100644
--- a/src/arch/amdgpu/vega/insts/instructions.hh
+++ b/src/arch/amdgpu/vega/insts/instructions.hh
@@ -25950,6 +25950,40 @@ namespace VegaISA
         void execute(GPUDynInstPtr) override;
     }; // Inst_VOP3__V_SUBREV_U32
 
+    class Inst_VOP3__V_FMAC_F32 : public Inst_VOP3A
+    {
+      public:
+        Inst_VOP3__V_FMAC_F32(InFmt_VOP3A*);
+        ~Inst_VOP3__V_FMAC_F32();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 2; }
+
+        int
+        getOperandSize(int opIdx) override
+        {
+            switch (opIdx) {
+              case 0: //src_0
+                return 4;
+              case 1: //src_1
+                return 4;
+              case 2: //vdst
+                return 4;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP3__V_FMAC_F32
+
     class Inst_VOP3__V_NOP : public Inst_VOP3A
     {
       public:
diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh
index 3c5804526a..504946534f 100644
--- a/src/arch/amdgpu/vega/insts/op_encodings.hh
+++ b/src/arch/amdgpu/vega/insts/op_encodings.hh
@@ -1306,6 +1306,11 @@
             ConstScalarOperandU32 soffset(gpuDynInst, saddr);
             soffset.read();
 
+            ConstVecOperandU32 voffset(gpuDynInst, vaddr);
+            if (instData.SVE) {
+                voffset.read();
+            }
+
             Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
 
             int elemSize;
@@ -1320,6 +1325,10 @@
             unsigned swizzleOffset = soffset.rawData() + offset;
             for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                 if (gpuDynInst->exec_mask[lane]) {
+                    // Per-lane VGPR offset: do not accumulate into the
+                    // shared base across lanes.
+                    unsigned laneOffset = swizzleOffset
+                        + (instData.SVE ? voffset[lane] : 0);
                     gpuDynInst->addr.at(lane) = flat_scratch_addr
-                        + swizzle(swizzleOffset, lane, elemSize);
+                        + swizzle(laneOffset, lane, elemSize);
                 }
@@ -1328,7 +1337,9 @@
             assert(isFlatScratch());
 
             ConstVecOperandU32 voffset(gpuDynInst, vaddr);
-            voffset.read();
+            if (instData.SVE) {
+                voffset.read();
+            }
 
             Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
 
@@ -1343,8 +1354,11 @@
 
             for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                 if (gpuDynInst->exec_mask[lane]) {
+                    VecElemU32 vgpr_offset =
+                        instData.SVE ? voffset[lane] : 0;
+
                     gpuDynInst->addr.at(lane) = flat_scratch_addr
-                        + swizzle(voffset[lane] + offset, lane, elemSize);
+                        + swizzle(vgpr_offset + offset, lane, elemSize);
                 }
             }
         }
diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc
index 47665ad353..b9fee17353 100644
--- a/src/arch/amdgpu/vega/insts/vop3.cc
+++ b/src/arch/amdgpu/vega/insts/vop3.cc
@@ -2404,6 +2404,76 @@ namespace VegaISA
         vdst.write();
     } // execute
 
+    // --- Inst_VOP3__V_FMAC_F32 class methods ---
+
+    Inst_VOP3__V_FMAC_F32::Inst_VOP3__V_FMAC_F32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_fmac_f32", false)
+    {
+        setFlag(ALU);
+        setFlag(F32);
+        setFlag(FMA);
+    } // Inst_VOP3__V_FMAC_F32
+
+    Inst_VOP3__V_FMAC_F32::~Inst_VOP3__V_FMAC_F32()
+    {
+    } // ~Inst_VOP3__V_FMAC_F32
+
+    // --- description from .arch file ---
+    // D.f = S0.f * S1.f + D.f.
+    void
+    Inst_VOP3__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+        vdst.read();
+
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+        panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode);
+
+        if (instData.ABS & 0x1) {
+            src0.absModifier();
+        }
+
+        if (instData.ABS & 0x2) {
+            src1.absModifier();
+        }
+
+        if (instData.ABS & 0x4) {
+            vdst.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src0.negModifier();
+        }
+
+        if (extData.NEG & 0x2) {
+            src1.negModifier();
+        }
+
+        if (extData.NEG & 0x4) {
+            vdst.negModifier();
+        }
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                float out = std::fma(src0[lane], src1[lane], vdst[lane]);
+                out = omodModifier(out, extData.OMOD);
+                // Clamp the result only when the CLMP bit is set; do not
+                // clobber the FMA/OMOD result with the old accumulator.
+                if (instData.CLMP) {
+                    out = std::clamp(out, 0.0f, 1.0f);
+                }
+                vdst[lane] = out;
+            }
+        }
+
+        vdst.write();
+    } // execute
+
     // --- Inst_VOP3__V_NOP class methods ---
 
     Inst_VOP3__V_NOP::Inst_VOP3__V_NOP(InFmt_VOP3A *iFmt)
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index d4a6a8f447..c4a8e9085a 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -925,20 +925,14 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
     ComputeUnit *cu = wavefront()->computeUnit;
 
     if (wavefront()->gfxVersion == GfxVersion::gfx942) {
-        // Architected flat scratch base address in FLAT_SCRATCH registers
-        uint32_t fs_lo = cu->srf[simdId]->read(
-            VegaISA::REG_FLAT_SCRATCH_LO);
-        uint32_t fs_hi = cu->srf[simdId]->read(
-            VegaISA::REG_FLAT_SCRATCH_HI);
-
-        Addr arch_flat_scratch = ((Addr)(fs_hi) << 32) | fs_lo;
-
+        // Architected flat scratch base address is in a dedicated hardware
+        // register.
         for (int lane = 0; lane < cu->wfSize(); ++lane) {
             if (mask[lane]) {
                 // The scratch base is added for other gfx versions,
                 // otherwise this would simply add the register base.
                 addr[lane] = addr[lane] - cu->shader->getScratchBase()
-                    + arch_flat_scratch;
+                    + wavefront()->archFlatScratchAddr;
             }
         }
     } else {
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index 1b94b13b6e..d14f8aee3c 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -384,14 +384,13 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
                 // the FLAT_SCRATCH register pair to the scratch backing
                 // memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch
                 if (task->gfxVersion() == GfxVersion::gfx942) {
-                    Addr arch_flat_scratch =
+                    archFlatScratchAddr =
                         task->amdQueue.scratch_backing_memory_location;
-                    computeUnit->srf[simdId]->write(
-                        VegaISA::REG_FLAT_SCRATCH_HI,
-                        bits(arch_flat_scratch, 63, 32));
-                    computeUnit->srf[simdId]->write(
-                        VegaISA::REG_FLAT_SCRATCH_LO,
-                        bits(arch_flat_scratch, 31, 0));
+
+                    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+                            "Setting architected flat scratch = %x\n",
+                            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
+                            archFlatScratchAddr);
                     break;
                 }
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
index b7dff4617b..476393603b 100644
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -205,6 +205,9 @@ class Wavefront : public SimObject
     // will live while the WF is executed
     uint32_t startSgprIndex;
 
+    // Architected flat scratch address for MI300+
+    Addr archFlatScratchAddr = 0;
+
     // Old value of destination gpr (for trace)
     std::vector<uint32_t> oldVgpr;
     // Id of destination gpr (for trace)