diff --git a/src/arch/amdgpu/vega/insts/sop2.cc b/src/arch/amdgpu/vega/insts/sop2.cc index 93618b2124..a2965763f7 100644 --- a/src/arch/amdgpu/vega/insts/sop2.cc +++ b/src/arch/amdgpu/vega/insts/sop2.cc @@ -1224,7 +1224,8 @@ namespace VegaISA src0.read(); src1.read(); - sdst = src0.rawData() * src1.rawData(); + ScalarRegI64 tmp = src0.rawData() * src1.rawData(); + sdst = tmp & mask(32); sdst.write(); } // execute diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc index f78f64bc91..59d72ac9ed 100644 --- a/src/arch/amdgpu/vega/insts/vop3.cc +++ b/src/arch/amdgpu/vega/insts/vop3.cc @@ -8583,7 +8583,7 @@ namespace VegaISA for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { - threadMask = ((1LL << lane) - 1LL); + threadMask = ((1ULL << lane) - 1ULL); vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) + src1[lane]; } @@ -8633,7 +8633,7 @@ namespace VegaISA for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { - threadMask = ((1LL << lane) - 1LL); + threadMask = ((1ULL << lane) - 1ULL); vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) + src1[lane]; } diff --git a/src/arch/amdgpu/vega/operand.hh b/src/arch/amdgpu/vega/operand.hh index 698161d918..d4a7436c75 100644 --- a/src/arch/amdgpu/vega/operand.hh +++ b/src/arch/amdgpu/vega/operand.hh @@ -490,7 +490,7 @@ namespace VegaISA typename std::enable_if::type setBit(int bit, int bit_val) { - DataType &sgpr = *((DataType*)srfData.data()); + GEM5_ALIGNED(8) DataType &sgpr = *((DataType*)srfData.data()); replaceBits(sgpr, bit, bit_val); } @@ -739,7 +739,7 @@ namespace VegaISA * of a register is 1 dword. this class will take care to do the * proper packing/unpacking of sub-dword operands. */ - std::array srfData; + GEM5_ALIGNED(8) std::array srfData; }; // typedefs for the various sizes/types of scalar operands diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc index 063e87eee1..e2dd9f54f2 100644 --- a/src/gpu-compute/gpu_static_inst.cc +++ b/src/gpu-compute/gpu_static_inst.cc @@ -54,55 +54,63 @@ GPUStaticInst::disassemble() return disassembly; } + +void +GPUStaticInst::generateVirtToPhysMap(Wavefront *wf, ComputeUnit *cu, + OperandInfo& op, + std::vector& opVec, + OpType opType) +{ + std::vector virt_idxs; + std::vector phys_idxs; + + int num_dwords = op.sizeInDWords(); + int virt_idx = op.registerIndex(wf->reservedScalarRegs); + + int phys_idx = -1; + for (int i = 0; i < num_dwords; i++) { + if (opType == OpType::SRC_VEC || opType == OpType::DST_VEC) { + phys_idx = cu->registerManager->mapVgpr(wf, virt_idx + i); + } else { + assert(opType == OpType::SRC_SCALAR || + opType == OpType::DST_SCALAR); + phys_idx = cu->registerManager->mapSgpr(wf, virt_idx + i); + } + virt_idxs.push_back(virt_idx + i); + phys_idxs.push_back(phys_idx); + } + DPRINTF(GPUInst, "%s adding %s %s (%d->%d) operand that uses " + "%d registers.\n", disassemble(), + (opType == OpType::SRC_VEC || opType == OpType::DST_VEC) ? + "vector" : "scalar", + (opType == OpType::SRC_VEC || opType == OpType::SRC_SCALAR) ? + "src" : "dst", virt_idxs[0], phys_idxs[0], num_dwords); + + op.setVirtToPhysMapping(virt_idxs, phys_idxs); + + opVec.emplace_back(op); +} + void GPUStaticInst::initDynOperandInfo(Wavefront *wf, ComputeUnit *cu) { - // Lambda function, as this is only ever used here - auto generateVirtToPhysMap = [&](OperandInfo& op, - std::vector& opVec, - MapRegFn mapFn, OpType opType) - { - std::vector virt_idxs; - std::vector phys_idxs; - - int num_dwords = op.sizeInDWords(); - int virt_idx = op.registerIndex(wf->reservedScalarRegs); - - int phys_idx = -1; - for (int i = 0; i < num_dwords; i++){ - phys_idx = (cu->registerManager->*mapFn)(wf, virt_idx + i); - virt_idxs.push_back(virt_idx + i); - phys_idxs.push_back(phys_idx); - } - DPRINTF(GPUInst, "%s adding %s %s (%d->%d) operand that uses " - "%d registers.\n", disassemble(), - (opType == OpType::SRC_VEC || opType == OpType::DST_VEC) ? - "vector" : "scalar", - (opType == OpType::SRC_VEC || opType == OpType::SRC_SCALAR) ? - "src" : "dst", virt_idxs[0], phys_idxs[0], num_dwords); - - op.setVirtToPhysMapping(virt_idxs, phys_idxs); - - opVec.emplace_back(op); - }; - for (auto& srcOp : srcOps) { if (srcOp.isVectorReg()) { - generateVirtToPhysMap(srcOp, srcVecRegOps, - &RegisterManager::mapVgpr, OpType::SRC_VEC); + generateVirtToPhysMap(wf, cu, srcOp, srcVecRegOps, + OpType::SRC_VEC); } else if (srcOp.isScalarReg()) { - generateVirtToPhysMap(srcOp, srcScalarRegOps, - &RegisterManager::mapSgpr, OpType::SRC_SCALAR); + generateVirtToPhysMap(wf, cu, srcOp, srcScalarRegOps, + OpType::SRC_SCALAR); } } for (auto& dstOp : dstOps) { if (dstOp.isVectorReg()) { - generateVirtToPhysMap(dstOp, dstVecRegOps, - &RegisterManager::mapVgpr, OpType::DST_VEC); + generateVirtToPhysMap(wf, cu, dstOp, dstVecRegOps, + OpType::DST_VEC); } else if (dstOp.isScalarReg()) { - generateVirtToPhysMap(dstOp, dstScalarRegOps, - &RegisterManager::mapSgpr, OpType::DST_SCALAR); + generateVirtToPhysMap(wf, cu, dstOp, dstScalarRegOps, + OpType::DST_SCALAR); } } } diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh index 156f0e529d..6132ab2d29 100644 --- a/src/gpu-compute/gpu_static_inst.hh +++ b/src/gpu-compute/gpu_static_inst.hh @@ -321,6 +321,9 @@ class GPUStaticInst : public GPUStaticInstFlags int _ipdInstNum; std::bitset _flags; + + void generateVirtToPhysMap(Wavefront *wf, ComputeUnit *cu, OperandInfo& op, + std::vector& opVec, OpType opType); }; class KernelLaunchStaticInst : public GPUStaticInst