diff --git a/src/arch/amdgpu/vega/insts/sop2.cc b/src/arch/amdgpu/vega/insts/sop2.cc
index 93618b2124..a2965763f7 100644
--- a/src/arch/amdgpu/vega/insts/sop2.cc
+++ b/src/arch/amdgpu/vega/insts/sop2.cc
@@ -1224,7 +1224,8 @@ namespace VegaISA
         src0.read();
         src1.read();
 
-        sdst = src0.rawData() * src1.rawData();
+        ScalarRegI64 tmp = src0.rawData() * src1.rawData();
+        sdst = tmp & mask(32);
 
         sdst.write();
     } // execute
diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc
index f78f64bc91..59d72ac9ed 100644
--- a/src/arch/amdgpu/vega/insts/vop3.cc
+++ b/src/arch/amdgpu/vega/insts/vop3.cc
@@ -8583,7 +8583,7 @@ namespace VegaISA
 
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
-                threadMask = ((1LL << lane) - 1LL);
+                threadMask = ((1ULL << lane) - 1ULL);
                 vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) +
                              src1[lane];
             }
@@ -8633,7 +8633,7 @@ namespace VegaISA
 
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
-                threadMask = ((1LL << lane) - 1LL);
+                threadMask = ((1ULL << lane) - 1ULL);
                 vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) +
                              src1[lane];
             }
diff --git a/src/arch/amdgpu/vega/operand.hh b/src/arch/amdgpu/vega/operand.hh
index 698161d918..d4a7436c75 100644
--- a/src/arch/amdgpu/vega/operand.hh
+++ b/src/arch/amdgpu/vega/operand.hh
@@ -490,7 +490,7 @@ namespace VegaISA
         typename std::enable_if<Condition, void>::type
         setBit(int bit, int bit_val)
         {
-            DataType &sgpr = *((DataType*)srfData.data());
+            GEM5_ALIGNED(8) DataType &sgpr = *((DataType*)srfData.data());
             replaceBits(sgpr, bit, bit_val);
         }
 
@@ -739,7 +739,7 @@ namespace VegaISA
          * of a register is 1 dword. this class will take care to do the
          * proper packing/unpacking of sub-dword operands.
          */
-        std::array<ScalarRegU32, NumDwords> srfData;
+        GEM5_ALIGNED(8) std::array<ScalarRegU32, NumDwords> srfData;
     };
 
     // typedefs for the various sizes/types of scalar operands
diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc
index 063e87eee1..e2dd9f54f2 100644
--- a/src/gpu-compute/gpu_static_inst.cc
+++ b/src/gpu-compute/gpu_static_inst.cc
@@ -54,55 +54,63 @@ GPUStaticInst::disassemble()
     return disassembly;
 }
 
+// Map every dword of op to a physical register and append op to opVec.
+void
+GPUStaticInst::generateVirtToPhysMap(Wavefront *wf, ComputeUnit *cu,
+                                     OperandInfo& op,
+                                     std::vector<OperandInfo>& opVec,
+                                     OpType opType)
+{
+    std::vector<int> virt_idxs;
+    std::vector<int> phys_idxs;
+    // The operand supplies its dword count and first virtual register index.
+    int num_dwords = op.sizeInDWords();
+    int virt_idx = op.registerIndex(wf->reservedScalarRegs);
+    // Vector op types map through mapVgpr; scalar ones through mapSgpr.
+    int phys_idx = -1;
+    for (int i = 0; i < num_dwords; i++) {
+        if (opType == OpType::SRC_VEC || opType == OpType::DST_VEC) {
+            phys_idx = cu->registerManager->mapVgpr(wf, virt_idx + i);
+        } else {
+            assert(opType == OpType::SRC_SCALAR ||
+                   opType == OpType::DST_SCALAR);
+            phys_idx = cu->registerManager->mapSgpr(wf, virt_idx + i);
+        }
+        virt_idxs.push_back(virt_idx + i);
+        phys_idxs.push_back(phys_idx);
+    }
+    DPRINTF(GPUInst, "%s adding %s %s (%d->%d) operand that uses "
+            "%d registers.\n", disassemble(),
+            (opType == OpType::SRC_VEC || opType == OpType::DST_VEC) ?
+            "vector" : "scalar",
+            (opType == OpType::SRC_VEC || opType == OpType::SRC_SCALAR) ?
+            "src" : "dst", virt_idxs[0], phys_idxs[0], num_dwords);
+    // NOTE(review): trace above reads element 0; assumes num_dwords >= 1.
+    op.setVirtToPhysMapping(virt_idxs, phys_idxs);
+    // Append op, now carrying its mapping, to the caller-supplied list.
+    opVec.emplace_back(op);
+}
+
 void
 GPUStaticInst::initDynOperandInfo(Wavefront *wf, ComputeUnit *cu)
 {
-    // Lambda function, as this is only ever used here
-    auto generateVirtToPhysMap = [&](OperandInfo& op,
-                                     std::vector<OperandInfo>& opVec,
-                                     MapRegFn mapFn, OpType opType)
-    {
-        std::vector<int> virt_idxs;
-        std::vector<int> phys_idxs;
-
-        int num_dwords = op.sizeInDWords();
-        int virt_idx = op.registerIndex(wf->reservedScalarRegs);
-
-        int phys_idx = -1;
-        for (int i = 0; i < num_dwords; i++){
-            phys_idx = (cu->registerManager->*mapFn)(wf, virt_idx + i);
-            virt_idxs.push_back(virt_idx + i);
-            phys_idxs.push_back(phys_idx);
-        }
-        DPRINTF(GPUInst, "%s adding %s %s (%d->%d) operand that uses "
-                "%d registers.\n", disassemble(),
-                (opType == OpType::SRC_VEC || opType == OpType::DST_VEC) ?
-                "vector" : "scalar",
-                (opType == OpType::SRC_VEC || opType == OpType::SRC_SCALAR) ?
-                "src" : "dst", virt_idxs[0], phys_idxs[0], num_dwords);
-
-        op.setVirtToPhysMapping(virt_idxs, phys_idxs);
-
-        opVec.emplace_back(op);
-    };
-
     for (auto& srcOp : srcOps) {
         if (srcOp.isVectorReg()) {
-            generateVirtToPhysMap(srcOp, srcVecRegOps,
-                            &RegisterManager::mapVgpr, OpType::SRC_VEC);
+            generateVirtToPhysMap(wf, cu, srcOp, srcVecRegOps,
+                                  OpType::SRC_VEC);  // selects mapVgpr
         } else if (srcOp.isScalarReg()) {
-            generateVirtToPhysMap(srcOp, srcScalarRegOps,
-                            &RegisterManager::mapSgpr, OpType::SRC_SCALAR);
+            generateVirtToPhysMap(wf, cu, srcOp, srcScalarRegOps,
+                                  OpType::SRC_SCALAR);  // selects mapSgpr
         }
     }
 
     for (auto& dstOp : dstOps) {
         if (dstOp.isVectorReg()) {
-            generateVirtToPhysMap(dstOp, dstVecRegOps,
-                            &RegisterManager::mapVgpr, OpType::DST_VEC);
+            generateVirtToPhysMap(wf, cu, dstOp, dstVecRegOps,
+                                  OpType::DST_VEC);  // selects mapVgpr
         } else if (dstOp.isScalarReg()) {
-            generateVirtToPhysMap(dstOp, dstScalarRegOps,
-                            &RegisterManager::mapSgpr, OpType::DST_SCALAR);
+            generateVirtToPhysMap(wf, cu, dstOp, dstScalarRegOps,
+                                  OpType::DST_SCALAR);  // selects mapSgpr
         }
     }
 }
diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh
index 156f0e529d..6132ab2d29 100644
--- a/src/gpu-compute/gpu_static_inst.hh
+++ b/src/gpu-compute/gpu_static_inst.hh
@@ -321,6 +321,9 @@ class GPUStaticInst : public GPUStaticInstFlags
     int _ipdInstNum;
 
     std::bitset<Num_Flags> _flags;
+
+    void generateVirtToPhysMap(Wavefront *wf, ComputeUnit *cu, OperandInfo& op,
+                               std::vector<OperandInfo>& opVec, OpType opType);
 };
 
 class KernelLaunchStaticInst : public GPUStaticInst