diff --git a/src/arch/amdgpu/vega/insts/vop3p.cc b/src/arch/amdgpu/vega/insts/vop3p.cc
index 85f0af2a51..224c525e0f 100644
--- a/src/arch/amdgpu/vega/insts/vop3p.cc
+++ b/src/arch/amdgpu/vega/insts/vop3p.cc
@@ -596,10 +596,10 @@ void Inst_VOP3P__V_DOT8_U32_U4::execute(GPUDynInstPtr gpuDynInst)
 
 void Inst_VOP3P__V_ACCVGPR_READ::execute(GPUDynInstPtr gpuDynInst)
 {
-    // The Acc register file is not supported in gem5 and has been removed
-    // in MI200. Therefore this instruction becomes a mov.
     Wavefront *wf = gpuDynInst->wavefront();
-    ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
+    unsigned accum_offset = wf->accumOffset;
+
+    ConstVecOperandU32 src(gpuDynInst, extData.SRC0+accum_offset);
     VecOperandU32 vdst(gpuDynInst, instData.VDST);
 
     src.readSrc();
@@ -615,11 +615,11 @@ void Inst_VOP3P__V_ACCVGPR_READ::execute(GPUDynInstPtr gpuDynInst)
 
 void Inst_VOP3P__V_ACCVGPR_WRITE::execute(GPUDynInstPtr gpuDynInst)
 {
-    // The Acc register file is not supported in gem5 and has been removed
-    // in MI200. Therefore this instruction becomes a mov.
     Wavefront *wf = gpuDynInst->wavefront();
+    unsigned accum_offset = wf->accumOffset;
+
     ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
-    VecOperandU32 vdst(gpuDynInst, instData.VDST);
+    VecOperandU32 vdst(gpuDynInst, instData.VDST+accum_offset);
 
     src.readSrc();
 
diff --git a/src/arch/amdgpu/vega/insts/vop3p_mai.cc b/src/arch/amdgpu/vega/insts/vop3p_mai.cc
index 943aa72cfd..6136a94da9 100644
--- a/src/arch/amdgpu/vega/insts/vop3p_mai.cc
+++ b/src/arch/amdgpu/vega/insts/vop3p_mai.cc
@@ -56,9 +56,19 @@ namespace VegaISA
     void
     Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8::execute(GPUDynInstPtr gpuDynInst)
     {
-        int acc_offset = 0;
+        // Accumulation register offsets for A, B, and C/D matrix.
+        int a_offset = 0;
+        int b_offset = 0;
+        int cd_offset = 0;
         if (instData.ACC_CD) {
-            warn("ACC_CD not yet implemented\n");
+            cd_offset = gpuDynInst->wavefront()->accumOffset;
+        }
+        // ACC is a bit mask: bit 0 -> A in AccVGPRs, bit 1 -> B (both may be set).
+        if (extData.ACC & 0x1) {
+            a_offset = gpuDynInst->wavefront()->accumOffset;
+        }
+        if (extData.ACC & 0x2) {
+            b_offset = gpuDynInst->wavefront()->accumOffset;
         }
 
         // int8 size allows for 4 elements per lane. At 16x16 this means 4
@@ -71,24 +81,27 @@ namespace VegaISA
 
         // VecOperandI8 will read 8 bits and sign extend, so used U32 to read
         // as "untyped" 32-bit values.
-        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
-        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
-        ConstVecOperandI32 src2a(gpuDynInst, extData.SRC2+acc_offset);
-        ConstVecOperandI32 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta);
-        ConstVecOperandI32 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta);
-        ConstVecOperandI32 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta);
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0+a_offset);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1+b_offset);
+        ConstVecOperandI32 src2[4] = {
+            ConstVecOperandI32(gpuDynInst, extData.SRC2+cd_offset),
+            ConstVecOperandI32(gpuDynInst, extData.SRC2+cd_offset+1*delta),
+            ConstVecOperandI32(gpuDynInst, extData.SRC2+cd_offset+2*delta),
+            ConstVecOperandI32(gpuDynInst, extData.SRC2+cd_offset+3*delta),
+        };
 
-        VecOperandI32 vdsta(gpuDynInst, instData.VDST+acc_offset);
-        VecOperandI32 vdstb(gpuDynInst, instData.VDST+acc_offset+1);
-        VecOperandI32 vdstc(gpuDynInst, instData.VDST+acc_offset+2);
-        VecOperandI32 vdstd(gpuDynInst, instData.VDST+acc_offset+3);
+        VecOperandI32 vdst[4] = {
+            VecOperandI32(gpuDynInst, instData.VDST+cd_offset),
+            VecOperandI32(gpuDynInst, instData.VDST+cd_offset+1),
+            VecOperandI32(gpuDynInst, instData.VDST+cd_offset+2),
+            VecOperandI32(gpuDynInst, instData.VDST+cd_offset+3),
+        };
 
         src0.readSrc();
         src1.readSrc();
-        src2a.readSrc();
-        src2b.readSrc();
-        src2c.readSrc();
-        src2d.readSrc();
+        for (int i = 0; i < 4; ++i) {
+            src2[i].readSrc();
+        }
 
         int32_t A[16][16];
         for (int i = 0; i < 64; ++i) {
@@ -124,14 +137,14 @@ namespace VegaISA
 
         // Load accumulation matrix C into result
         for (int i = 0; i < 64; ++i) {
-            // src2a contains rows 0, 4, 8, 12
-            result[(i/16)*4][(i%16)] = src2a[i];
-            // src2b contains rows 1, 5, 9, 13
-            result[(i/16)*4+1][(i%16)] = src2b[i];
-            // src2c contains rows 2, 6, 10, 14
-            result[(i/16)*4+2][(i%16)] = src2c[i];
-            // src2d contains rows 3, 7, 11, 15
-            result[(i/16)*4+3][(i%16)] = src2d[i];
+            // src2[0] contains rows 0, 4, 8, 12
+            result[(i/16)*4][(i%16)] = src2[0][i];
+            // src2[1] contains rows 1, 5, 9, 13
+            result[(i/16)*4+1][(i%16)] = src2[1][i];
+            // src2[2] contains rows 2, 6, 10, 14
+            result[(i/16)*4+2][(i%16)] = src2[2][i];
+            // src2[3] contains rows 3, 7, 11, 15
+            result[(i/16)*4+3][(i%16)] = src2[3][i];
         }
 
         // Compute new result - This is (obviously) not optimized
@@ -145,20 +158,19 @@ namespace VegaISA
 
         // Put result in dest VGPRs
         for (int i = 0; i < 64; ++i) {
-            // vdsta contains rows 0, 4, 8, 12
-            vdsta[i] = result[(i/16)*4][(i%16)];
-            // vdstb contains rows 1, 5, 9, 13
-            vdstb[i] = result[(i/16)*4+1][(i%16)];
-            // vdstc contains rows 2, 6, 10, 14
-            vdstc[i] = result[(i/16)*4+2][(i%16)];
-            // vdstd contains rows 3, 7, 11, 15
-            vdstd[i] = result[(i/16)*4+3][(i%16)];
+            // vdst[0] contains rows 0, 4, 8, 12
+            vdst[0][i] = result[(i/16)*4][(i%16)];
+            // vdst[1] contains rows 1, 5, 9, 13
+            vdst[1][i] = result[(i/16)*4+1][(i%16)];
+            // vdst[2] contains rows 2, 6, 10, 14
+            vdst[2][i] = result[(i/16)*4+2][(i%16)];
+            // vdst[3] contains rows 3, 7, 11, 15
+            vdst[3][i] = result[(i/16)*4+3][(i%16)];
         }
 
-        vdsta.write();
-        vdstb.write();
-        vdstc.write();
-        vdstd.write();
+        for (int i = 0; i < 4; ++i) {
+            vdst[i].write();
+        }
     } // execute
     // --- Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 class methods ---
 
@@ -179,9 +191,19 @@ namespace VegaISA
     void
     Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::execute(GPUDynInstPtr gpuDynInst)
     {
-        int acc_offset = 0;
+        // Accumulation register offsets for A, B, and C/D matrix.
+        int a_offset = 0;
+        int b_offset = 0;
+        int cd_offset = 0;
         if (instData.ACC_CD) {
-            warn("ACC_CD not yet implemented\n");
+            cd_offset = gpuDynInst->wavefront()->accumOffset;
+        }
+        // ACC is a bit mask: bit 0 -> A in AccVGPRs, bit 1 -> B (both may be set).
+        if (extData.ACC & 0x1) {
+            a_offset = gpuDynInst->wavefront()->accumOffset;
+        }
+        if (extData.ACC & 0x2) {
+            b_offset = gpuDynInst->wavefront()->accumOffset;
         }
 
         // Handling of src2 is a bit tricky. The operator[] overload cannot
@@ -191,37 +213,41 @@ namespace VegaISA
         // a delta for each of the pairs of src2 GPRs.
         int delta = isVectorReg(extData.SRC2) ? 2 : 0;
 
-        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
-        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
-        ConstVecOperandF64 src2a(gpuDynInst, extData.SRC2+acc_offset);
-        ConstVecOperandF64 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta);
-        ConstVecOperandF64 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta);
-        ConstVecOperandF64 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta);
+        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0+a_offset);
+        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1+b_offset);
+        ConstVecOperandF64 src2[4] = {
+            ConstVecOperandF64(gpuDynInst, extData.SRC2+cd_offset),
+            ConstVecOperandF64(gpuDynInst, extData.SRC2+cd_offset+1*delta),
+            ConstVecOperandF64(gpuDynInst, extData.SRC2+cd_offset+2*delta),
+            ConstVecOperandF64(gpuDynInst, extData.SRC2+cd_offset+3*delta),
+        };
 
-        VecOperandF64 vdsta(gpuDynInst, instData.VDST+acc_offset);
-        VecOperandF64 vdstb(gpuDynInst, instData.VDST+acc_offset+2);
-        VecOperandF64 vdstc(gpuDynInst, instData.VDST+acc_offset+4);
-        VecOperandF64 vdstd(gpuDynInst, instData.VDST+acc_offset+6);
+        VecOperandF64 vdst[4] = {
+            VecOperandF64(gpuDynInst, instData.VDST+cd_offset),
+            VecOperandF64(gpuDynInst, instData.VDST+cd_offset+2),
+            VecOperandF64(gpuDynInst, instData.VDST+cd_offset+4),
+            VecOperandF64(gpuDynInst, instData.VDST+cd_offset+6),
+        };
 
         src0.readSrc();
         src1.readSrc();
-        src2a.readSrc();
-        src2b.readSrc();
-        src2c.readSrc();
-        src2d.readSrc();
+
+        for (int i = 0; i < 4; ++i) {
+            src2[i].readSrc();
+        }
 
         double result[16][16];
 
         // Load src2 into result. src2 is row major
         for (int i = 0; i < 64; ++i) {
-            // src2a contains rows 0 - 3
-            result[(i/16)][(i%16)] = src2a[i];
-            // src2b contains rows 4 - 7
-            result[(i/16)+4][(i%16)] = src2b[i];
-            // src2c contains rows 8 - 11
-            result[(i/16)+8][(i%16)] = src2c[i];
-            // src2d contains rows 12 - 15
-            result[(i/16)+12][(i%16)] = src2d[i];
+            // src2[0] contains rows 0 - 3
+            result[(i/16)][(i%16)] = src2[0][i];
+            // src2[1] contains rows 4 - 7
+            result[(i/16)+4][(i%16)] = src2[1][i];
+            // src2[2] contains rows 8 - 11
+            result[(i/16)+8][(i%16)] = src2[2][i];
+            // src2[3] contains rows 12 - 15
+            result[(i/16)+12][(i%16)] = src2[3][i];
         }
 
         // Compute new result
@@ -238,20 +264,19 @@ namespace VegaISA
 
         // Put result in dest VGPRs
         for (int i = 0; i < 64; ++i) {
-            // vdsta contains rows 0 - 3
-            vdsta[i] = result[(i/16)][(i%16)];
-            // src2b contains rows 4 - 7
-            vdstb[i] = result[(i/16)+4][(i%16)];
-            // src2c contains rows 8 - 11
-            vdstc[i] = result[(i/16)+8][(i%16)];
-            // src2d contains rows 12 - 15
-            vdstd[i] = result[(i/16)+12][(i%16)];
+            // vdst[0] contains rows 0 - 3
+            vdst[0][i] = result[(i/16)][(i%16)];
+            // vdst[1] contains rows 4 - 7
+            vdst[1][i] = result[(i/16)+4][(i%16)];
+            // vdst[2] contains rows 8 - 11
+            vdst[2][i] = result[(i/16)+8][(i%16)];
+            // vdst[3] contains rows 12 - 15
+            vdst[3][i] = result[(i/16)+12][(i%16)];
         }
 
-        vdsta.write();
-        vdstb.write();
-        vdstc.write();
-        vdstd.write();
+        for (int i = 0; i < 4; ++i) {
+            vdst[i].write();
+        }
     } // execute
 } // namespace VegaISA
 } // namespace gem5
diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc
index dbb909f624..02b1bb174a 100644
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -723,7 +723,6 @@ GPUCommandProcessor::sanityCheckAKC(AMDKernelCode *akc)
     warn_if(akc->kernarg_preload_spec_length ||
             akc->kernarg_preload_spec_offset,
             "Kernarg preload not implemented\n");
-    warn_if(akc->accum_offset, "ACC offset not implemented\n");
     warn_if(akc->tg_split, "TG split not implemented\n");
 }
 
diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh
index a464e4882d..f015b091fc 100644
--- a/src/gpu-compute/hsa_queue_entry.hh
+++ b/src/gpu-compute/hsa_queue_entry.hh
@@ -122,6 +122,11 @@ class HSAQueueEntry
         }
 
         parseKernelCode(akc);
+
+        // Offset of the first AccVGPR in the unified register file.
+        // The field has a granularity of 4 and holds values 0-63:
+        // 0 -> accum-offset 4, 1 -> accum-offset 8, ..., 63 -> 256.
+        _accumOffset = (akc->accum_offset + 1) * 4;
     }
 
     const GfxVersion&
@@ -394,6 +399,12 @@ class HSAQueueEntry
         assert(_outstandingWbs >= 0);
     }
 
+    unsigned
+    accumOffset() const
+    {
+        return _accumOffset;
+    }
+
   private:
     void
     parseKernelCode(AMDKernelCode *akc)
@@ -489,6 +500,8 @@ class HSAQueueEntry
 
     std::bitset<NumVectorInitFields> initialVgprState;
     std::bitset<NumScalarInitFields> initialSgprState;
+
+    unsigned _accumOffset;
 };
 
 } // namespace gem5
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index cb8b6220e7..98d882b20e 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -430,6 +430,9 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
         }
     }
 
+    // Save the offset to the first accumulation VGPR number from HSA task.
+    accumOffset = task->accumOffset();
+
     regInitIdx = 0;
 
     // VGPRs are initialized to the work item IDs for a given thread. There
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
index 43ac3e9ffc..82035f7d47 100644
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -131,6 +131,8 @@ class Wavefront : public SimObject
     uint32_t maxVgprs;
     // number of SGPRs required by WF
     uint32_t maxSgprs;
+    // first accumulation vgpr number
+    uint32_t accumOffset;
     void freeResources();
     GPUDynInstPtr nextInstr();
     void setStatus(status_e newStatus);