From 8722aef2e21620341c028e94bc1075d88ca9b989 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Tue, 20 Feb 2024 13:34:51 -0600 Subject: [PATCH 1/3] gpu-compute: Store accum_offset from code object in WF The accumulation offset is needed for some instructions. In order to access this value we need to place it somewhere instruction definitions can access. The most logical place is in the wavefront. This commit simply copies the value from the HSA task to the wavefront object. Change-Id: I44ef62ef32d2421953f096c431dd758e882245b4 --- src/gpu-compute/gpu_command_processor.cc | 1 - src/gpu-compute/hsa_queue_entry.hh | 13 +++++++++++++ src/gpu-compute/wavefront.cc | 3 +++ src/gpu-compute/wavefront.hh | 2 ++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc index dbb909f624..02b1bb174a 100644 --- a/src/gpu-compute/gpu_command_processor.cc +++ b/src/gpu-compute/gpu_command_processor.cc @@ -723,7 +723,6 @@ GPUCommandProcessor::sanityCheckAKC(AMDKernelCode *akc) warn_if(akc->kernarg_preload_spec_length || akc->kernarg_preload_spec_offset, "Kernarg preload not implemented\n"); - warn_if(akc->accum_offset, "ACC offset not implemented\n"); warn_if(akc->tg_split, "TG split not implemented\n"); } diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh index a464e4882d..f015b091fc 100644 --- a/src/gpu-compute/hsa_queue_entry.hh +++ b/src/gpu-compute/hsa_queue_entry.hh @@ -122,6 +122,11 @@ class HSAQueueEntry } parseKernelCode(akc); + + // Offset of a first AccVGPR in the unified register file. + // Granularity 4. Value 0-63. 0 - accum-offset = 4, + // 1 - accum-offset = 8, ..., 63 - accum-offset = 256. 
+ _accumOffset = (akc->accum_offset + 1) * 4; } const GfxVersion& @@ -394,6 +399,12 @@ class HSAQueueEntry assert(_outstandingWbs >= 0); } + unsigned + accumOffset() const + { + return _accumOffset; + } + private: void parseKernelCode(AMDKernelCode *akc) @@ -489,6 +500,8 @@ class HSAQueueEntry std::bitset initialVgprState; std::bitset initialSgprState; + + unsigned _accumOffset; }; } // namespace gem5 diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index cb8b6220e7..98d882b20e 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -430,6 +430,9 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems) } } + // Save the offset to the first accumulation VGPR number from HSA task. + accumOffset = task->accumOffset(); + regInitIdx = 0; // VGPRs are initialized to the work item IDs for a given thread. There diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh index 43ac3e9ffc..82035f7d47 100644 --- a/src/gpu-compute/wavefront.hh +++ b/src/gpu-compute/wavefront.hh @@ -131,6 +131,8 @@ class Wavefront : public SimObject uint32_t maxVgprs; // number of SGPRs required by WF uint32_t maxSgprs; + // first accumulation vgpr number + uint32_t accumOffset; void freeResources(); GPUDynInstPtr nextInstr(); void setStatus(status_e newStatus); From e0e65221b47dfa03fb5e3676cb8340b2278fb854 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Mon, 26 Feb 2024 12:48:48 -0600 Subject: [PATCH 2/3] arch-vega: Use accum offset for v_accvgpr_read/write The accum offset is used as an index into the unified VGPR register file in MI200 and is not the same as a move if accum_offset in the dispatch packet is non-zero. Change these instructions to use the stored accum_offset value. 
Change-Id: Ib661804f8f5b8392e4c586082c423645f539e641 --- src/arch/amdgpu/vega/insts/vop3p.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/arch/amdgpu/vega/insts/vop3p.cc b/src/arch/amdgpu/vega/insts/vop3p.cc index 85f0af2a51..224c525e0f 100644 --- a/src/arch/amdgpu/vega/insts/vop3p.cc +++ b/src/arch/amdgpu/vega/insts/vop3p.cc @@ -596,10 +596,10 @@ void Inst_VOP3P__V_DOT8_U32_U4::execute(GPUDynInstPtr gpuDynInst) void Inst_VOP3P__V_ACCVGPR_READ::execute(GPUDynInstPtr gpuDynInst) { - // The Acc register file is not supported in gem5 and has been removed - // in MI200. Therefore this instruction becomes a mov. Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + unsigned accum_offset = wf->accumOffset; + + ConstVecOperandU32 src(gpuDynInst, extData.SRC0+accum_offset); VecOperandU32 vdst(gpuDynInst, instData.VDST); src.readSrc(); @@ -615,11 +615,11 @@ void Inst_VOP3P__V_ACCVGPR_READ::execute(GPUDynInstPtr gpuDynInst) void Inst_VOP3P__V_ACCVGPR_WRITE::execute(GPUDynInstPtr gpuDynInst) { - // The Acc register file is not supported in gem5 and has been removed - // in MI200. Therefore this instruction becomes a mov. Wavefront *wf = gpuDynInst->wavefront(); + unsigned accum_offset = wf->accumOffset; + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); + VecOperandU32 vdst(gpuDynInst, instData.VDST+accum_offset); src.readSrc(); From 2ca7f48828782368bed8d02a1a6f8c537b5af15a Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Mon, 26 Feb 2024 13:18:43 -0600 Subject: [PATCH 3/3] arch-vega: Accumulation offset for existing MFMA insts This commit updates the two existing MFMA instructions to support the accumulation offset for A, B, and C/D matrix. Additionally, this uses array indexed C/D matrix registers to reduce duplicate code. Future MFMA instructions have up to 16 registers for C/D and this reduces the amount of code being written. 
Change-Id: Ibdc3b6255234a3bab99f115c79e8a0248c800400 --- src/arch/amdgpu/vega/insts/vop3p_mai.cc | 169 ++++++++++++++---------- 1 file changed, 97 insertions(+), 72 deletions(-) diff --git a/src/arch/amdgpu/vega/insts/vop3p_mai.cc b/src/arch/amdgpu/vega/insts/vop3p_mai.cc index 943aa72cfd..6136a94da9 100644 --- a/src/arch/amdgpu/vega/insts/vop3p_mai.cc +++ b/src/arch/amdgpu/vega/insts/vop3p_mai.cc @@ -56,9 +56,19 @@ namespace VegaISA void Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8::execute(GPUDynInstPtr gpuDynInst) { - int acc_offset = 0; + // Accumulation register offsets for A, B, and C/D matrix. + int a_offset = 0; + int b_offset = 0; + int cd_offset = 0; if (instData.ACC_CD) { - warn("ACC_CD not yet implemented\n"); + cd_offset = gpuDynInst->wavefront()->accumOffset; + } + if (extData.ACC) { + if (extData.ACC & 0x1) { + a_offset = gpuDynInst->wavefront()->accumOffset; + } else if (extData.ACC & 0x2) { + b_offset = gpuDynInst->wavefront()->accumOffset; + } } // int8 size allows for 4 elements per lane. At 16x16 this means 4 @@ -71,24 +81,27 @@ namespace VegaISA // VecOperandI8 will read 8 bits and sign extend, so used U32 to read // as "untyped" 32-bit values. 
- ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI32 src2a(gpuDynInst, extData.SRC2+acc_offset); - ConstVecOperandI32 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta); - ConstVecOperandI32 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta); - ConstVecOperandI32 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0+a_offset); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1+b_offset); + ConstVecOperandI32 src2[4] = { + ConstVecOperandI32(gpuDynInst, extData.SRC2+cd_offset), + ConstVecOperandI32(gpuDynInst, extData.SRC2+cd_offset+1*delta), + ConstVecOperandI32(gpuDynInst, extData.SRC2+cd_offset+2*delta), + ConstVecOperandI32(gpuDynInst, extData.SRC2+cd_offset+3*delta), + }; - VecOperandI32 vdsta(gpuDynInst, instData.VDST+acc_offset); - VecOperandI32 vdstb(gpuDynInst, instData.VDST+acc_offset+1); - VecOperandI32 vdstc(gpuDynInst, instData.VDST+acc_offset+2); - VecOperandI32 vdstd(gpuDynInst, instData.VDST+acc_offset+3); + VecOperandI32 vdst[4] = { + VecOperandI32(gpuDynInst, instData.VDST+cd_offset), + VecOperandI32(gpuDynInst, instData.VDST+cd_offset+1), + VecOperandI32(gpuDynInst, instData.VDST+cd_offset+2), + VecOperandI32(gpuDynInst, instData.VDST+cd_offset+3), + }; src0.readSrc(); src1.readSrc(); - src2a.readSrc(); - src2b.readSrc(); - src2c.readSrc(); - src2d.readSrc(); + for (int i = 0; i < 4; ++i) { + src2[i].readSrc(); + } int32_t A[16][16]; for (int i = 0; i < 64; ++i) { @@ -124,14 +137,14 @@ namespace VegaISA // Load accumulation matrix C into result for (int i = 0; i < 64; ++i) { - // src2a contains rows 0, 4, 8, 12 - result[(i/16)*4][(i%16)] = src2a[i]; - // src2b contains rows 1, 5, 9, 13 - result[(i/16)*4+1][(i%16)] = src2b[i]; - // src2c contains rows 2, 6, 10, 14 - result[(i/16)*4+2][(i%16)] = src2c[i]; - // src2d contains rows 3, 7, 11, 15 - result[(i/16)*4+3][(i%16)] = src2d[i]; + // src2[0] contains rows 0, 4, 8, 12 + 
result[(i/16)*4][(i%16)] = src2[0][i]; + // src2[1] contains rows 1, 5, 9, 13 + result[(i/16)*4+1][(i%16)] = src2[1][i]; + // src2[2] contains rows 2, 6, 10, 14 + result[(i/16)*4+2][(i%16)] = src2[2][i]; + // src2[3] contains rows 3, 7, 11, 15 + result[(i/16)*4+3][(i%16)] = src2[3][i]; } // Compute new result - This is (obviously) not optimized @@ -145,20 +158,19 @@ namespace VegaISA // Put result in dest VGPRs for (int i = 0; i < 64; ++i) { - // vdsta contains rows 0, 4, 8, 12 - vdsta[i] = result[(i/16)*4][(i%16)]; - // vdstb contains rows 1, 5, 9, 13 - vdstb[i] = result[(i/16)*4+1][(i%16)]; - // vdstc contains rows 2, 6, 10, 14 - vdstc[i] = result[(i/16)*4+2][(i%16)]; - // vdstd contains rows 3, 7, 11, 15 - vdstd[i] = result[(i/16)*4+3][(i%16)]; + // vdst[0] contains rows 0, 4, 8, 12 + vdst[0][i] = result[(i/16)*4][(i%16)]; + // vdst[1] contains rows 1, 5, 9, 13 + vdst[1][i] = result[(i/16)*4+1][(i%16)]; + // vdst[2] contains rows 2, 6, 10, 14 + vdst[2][i] = result[(i/16)*4+2][(i%16)]; + // vdst[3] contains rows 3, 7, 11, 15 + vdst[3][i] = result[(i/16)*4+3][(i%16)]; } - vdsta.write(); - vdstb.write(); - vdstc.write(); - vdstd.write(); + for (int i = 0; i < 4; ++i) { + vdst[i].write(); + } } // execute // --- Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 class methods --- @@ -179,9 +191,19 @@ namespace VegaISA void Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::execute(GPUDynInstPtr gpuDynInst) { - int acc_offset = 0; + // Accumulation register offsets for A, B, and C/D matrix. + int a_offset = 0; + int b_offset = 0; + int cd_offset = 0; if (instData.ACC_CD) { - warn("ACC_CD not yet implemented\n"); + cd_offset = gpuDynInst->wavefront()->accumOffset; + } + if (extData.ACC) { + if (extData.ACC & 0x1) { + a_offset = gpuDynInst->wavefront()->accumOffset; + } else if (extData.ACC & 0x2) { + b_offset = gpuDynInst->wavefront()->accumOffset; + } } // Handling of src2 is a bit tricky. 
The operator[] overload cannot @@ -191,37 +213,41 @@ namespace VegaISA // a delta for each of the pairs of src2 GPRs. int delta = isVectorReg(extData.SRC2) ? 2 : 0; - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF64 src2a(gpuDynInst, extData.SRC2+acc_offset); - ConstVecOperandF64 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta); - ConstVecOperandF64 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta); - ConstVecOperandF64 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0+a_offset); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1+b_offset); + ConstVecOperandF64 src2[4] = { + ConstVecOperandF64(gpuDynInst, extData.SRC2+cd_offset), + ConstVecOperandF64(gpuDynInst, extData.SRC2+cd_offset+1*delta), + ConstVecOperandF64(gpuDynInst, extData.SRC2+cd_offset+2*delta), + ConstVecOperandF64(gpuDynInst, extData.SRC2+cd_offset+3*delta), + }; - VecOperandF64 vdsta(gpuDynInst, instData.VDST+acc_offset); - VecOperandF64 vdstb(gpuDynInst, instData.VDST+acc_offset+2); - VecOperandF64 vdstc(gpuDynInst, instData.VDST+acc_offset+4); - VecOperandF64 vdstd(gpuDynInst, instData.VDST+acc_offset+6); + VecOperandF64 vdst[4] = { + VecOperandF64(gpuDynInst, instData.VDST+cd_offset), + VecOperandF64(gpuDynInst, instData.VDST+cd_offset+2), + VecOperandF64(gpuDynInst, instData.VDST+cd_offset+4), + VecOperandF64(gpuDynInst, instData.VDST+cd_offset+6), + }; src0.readSrc(); src1.readSrc(); - src2a.readSrc(); - src2b.readSrc(); - src2c.readSrc(); - src2d.readSrc(); + + for (int i = 0; i < 4; ++i) { + src2[i].readSrc(); + } double result[16][16]; // Load src2 into result. 
src2 is row major for (int i = 0; i < 64; ++i) { - // src2a contains rows 0 - 3 - result[(i/16)][(i%16)] = src2a[i]; - // src2b contains rows 4 - 7 - result[(i/16)+4][(i%16)] = src2b[i]; - // src2c contains rows 8 - 11 - result[(i/16)+8][(i%16)] = src2c[i]; - // src2d contains rows 12 - 15 - result[(i/16)+12][(i%16)] = src2d[i]; + // src2[0] contains rows 0 - 3 + result[(i/16)][(i%16)] = src2[0][i]; + // src2[1] contains rows 4 - 7 + result[(i/16)+4][(i%16)] = src2[1][i]; + // src2[2] contains rows 8 - 11 + result[(i/16)+8][(i%16)] = src2[2][i]; + // src2[3] contains rows 12 - 15 + result[(i/16)+12][(i%16)] = src2[3][i]; } // Compute new result @@ -238,20 +264,19 @@ namespace VegaISA // Put result in dest VGPRs for (int i = 0; i < 64; ++i) { - // vdsta contains rows 0 - 3 - vdsta[i] = result[(i/16)][(i%16)]; - // src2b contains rows 4 - 7 - vdstb[i] = result[(i/16)+4][(i%16)]; - // src2c contains rows 8 - 11 - vdstc[i] = result[(i/16)+8][(i%16)]; - // src2d contains rows 12 - 15 - vdstd[i] = result[(i/16)+12][(i%16)]; + // vdst[0] contains rows 0 - 3 + vdst[0][i] = result[(i/16)][(i%16)]; + // vdst[1] contains rows 4 - 7 + vdst[1][i] = result[(i/16)+4][(i%16)]; + // vdst[2] contains rows 8 - 11 + vdst[2][i] = result[(i/16)+8][(i%16)]; + // vdst[3] contains rows 12 - 15 + vdst[3][i] = result[(i/16)+12][(i%16)]; } - vdsta.write(); - vdstb.write(); - vdstc.write(); - vdstd.write(); + for (int i = 0; i < 4; ++i) { + vdst[i].write(); + } } // execute } // namespace VegaISA } // namespace gem5