From 7cdb69bf21b84ac5182ef47bf11990393427fc0a Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Fri, 31 May 2024 09:27:10 -0700
Subject: [PATCH 1/3] arch-vega: Fill in scratch insts to match flat/global

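Flat, scratch, and global memory instructions share the same
instruction implementation and differ essentially only in how the
address is calculated. The instruction classes needed by the remaining
scratch opcodes were already implemented, but those opcodes were not
hooked up in the decoder and hit a fatal() instead. This commit makes
the decoder return the shared FLAT implementation for the scratch
byte/short loads and stores. It also removes an unreachable
"return nullptr" from the GLOBAL_STORE_DWORD decode function.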

Change-Id: I8f2e9ceb221294dce1b81c45745b642f0592d985
---
 src/arch/amdgpu/vega/gpu_decoder.cc | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc
index 0f4b1e9872..8c12013ae8 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.cc
+++ b/src/arch/amdgpu/vega/gpu_decoder.cc
@@ -8622,7 +8622,6 @@ namespace VegaISA
     Decoder::decode_OP_GLOBAL__GLOBAL_STORE_DWORD(MachInst iFmt)
     {
         return new Inst_FLAT__FLAT_STORE_DWORD(&iFmt->iFmt_FLAT);
-        return nullptr;
     }
 
     GPUStaticInst*
@@ -9898,29 +9897,25 @@ namespace VegaISA
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_UBYTE(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_LOAD_UBYTE(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_SBYTE(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_LOAD_SBYTE(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_USHORT(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_LOAD_USHORT(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_SSHORT(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_LOAD_SSHORT(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
@@ -9950,8 +9945,7 @@ namespace VegaISA
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_STORE_BYTE(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_STORE_BYTE(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
@@ -9964,8 +9958,7 @@ namespace VegaISA
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_STORE_SHORT(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_STORE_SHORT(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*

From 6c8caf83c6f0c0e49f6eb601b204e60f1d334149 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 15 May 2024 12:05:54 -0700
Subject: [PATCH 2/3] arch-vega: Implement V_ACCVGPR_MOV_B32 instruction

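This instruction is a simple move from one accumulation register to
another. It is essentially a 32-bit vector move in which the
wavefront's accumulation offset is added to both the source and the
destination register index. Only lanes enabled in the exec mask are
written.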

Change-Id: Ic93ae72599b75c91213f56ebafe5bbd7b2867089
---
 src/arch/amdgpu/vega/gpu_decoder.cc        |  8 +++++-
 src/arch/amdgpu/vega/gpu_decoder.hh        |  1 +
 src/arch/amdgpu/vega/insts/instructions.hh | 32 ++++++++++++++++++++++
 src/arch/amdgpu/vega/insts/vop1.cc         | 32 ++++++++++++++++++++++
 4 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc
index 8c12013ae8..e07a392ced 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.cc
+++ b/src/arch/amdgpu/vega/gpu_decoder.cc
@@ -3144,7 +3144,7 @@ namespace VegaISA
         &Decoder::decode_OP_VOP1__V_SAT_PK_U8_I16,
         &Decoder::decode_invalid,
         &Decoder::decode_OP_VOP1__V_SWAP_B32,
-        &Decoder::decode_invalid,
+        &Decoder::decode_OP_VOP1__V_ACCVGPR_MOV_B32,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
@@ -11777,6 +11777,12 @@ namespace VegaISA
         return nullptr;
     }
 
+    GPUStaticInst*
+    Decoder::decode_OP_VOP1__V_ACCVGPR_MOV_B32(MachInst iFmt)
+    {
+        return new Inst_VOP1__V_ACCVGPR_MOV_B32(&iFmt->iFmt_VOP1);
+    }
+
     GPUStaticInst*
     Decoder::decode_OP_VOPC__V_CMP_CLASS_F32(MachInst iFmt)
     {
diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh
index 8094233bd8..2523734ce5 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.hh
+++ b/src/arch/amdgpu/vega/gpu_decoder.hh
@@ -1314,6 +1314,7 @@ namespace VegaISA
         GPUStaticInst* decode_OP_VOP1__V_CVT_NORM_U16_F16(MachInst);
         GPUStaticInst* decode_OP_VOP1__V_SAT_PK_U8_I16(MachInst);
         GPUStaticInst* decode_OP_VOP1__V_SWAP_B32(MachInst);
+        GPUStaticInst* decode_OP_VOP1__V_ACCVGPR_MOV_B32(MachInst);
         GPUStaticInst* decode_OP_VOP2__V_CNDMASK_B32(MachInst);
         GPUStaticInst* decode_OP_VOP2__V_ADD_F32(MachInst);
         GPUStaticInst* decode_OP_VOP2__V_SUB_F32(MachInst);
diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh
index 9d91526f3f..4e71f13ad4 100644
--- a/src/arch/amdgpu/vega/insts/instructions.hh
+++ b/src/arch/amdgpu/vega/insts/instructions.hh
@@ -10562,6 +10562,38 @@ namespace VegaISA
         void execute(GPUDynInstPtr) override;
     }; // Inst_VOP1__V_LOG_LEGACY_F32
 
+    class Inst_VOP1__V_ACCVGPR_MOV_B32 : public Inst_VOP1
+    {
+      public:
+        Inst_VOP1__V_ACCVGPR_MOV_B32(InFmt_VOP1*);
+        ~Inst_VOP1__V_ACCVGPR_MOV_B32();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 1; }
+
+        int
+        getOperandSize(int opIdx) override
+        {
+            switch (opIdx) {
+              case 0: //src
+                return 4;
+              case 1: //vdst
+                return 4;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP1__V_ACCVGPR_MOV_B32
+
     class Inst_VOPC__V_CMP_CLASS_F32 : public Inst_VOPC
     {
       public:
diff --git a/src/arch/amdgpu/vega/insts/vop1.cc b/src/arch/amdgpu/vega/insts/vop1.cc
index 3bbf1e0085..f970923951 100644
--- a/src/arch/amdgpu/vega/insts/vop1.cc
+++ b/src/arch/amdgpu/vega/insts/vop1.cc
@@ -2397,6 +2397,38 @@ namespace VegaISA
             }
         }
 
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_ACCVGPR_MOV_B32 class methods ---
+
+    Inst_VOP1__V_ACCVGPR_MOV_B32::
+        Inst_VOP1__V_ACCVGPR_MOV_B32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_accvgpr_mov_b32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP1__V_ACCVGPR_MOV_B32
+
+    Inst_VOP1__V_ACCVGPR_MOV_B32::~Inst_VOP1__V_ACCVGPR_MOV_B32()
+    {
+    } // ~Inst_VOP1__V_ACCVGPR_MOV_B32
+
+    void
+    Inst_VOP1__V_ACCVGPR_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        unsigned accum_offset = wf->accumOffset;
+
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0+accum_offset);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST+accum_offset);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = src[lane];
+            }
+        }
+
         vdst.write();
     } // execute
 } // namespace VegaISA

From 00dcd5b0bc9a817f7bf64bbac48f97c3a7a19c00 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Fri, 31 May 2024 13:40:20 -0700
Subject: [PATCH 3/3] arch-vega: Implement literals for 64b dest operands

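Using a 32-bit literal constant with a 64-bit instruction has been
available since Vega10 but was never implemented. Per the Vega
specification, the literal is expanded to 64 bits by padding the LSBs
with zeros for floats, padding the MSBs with zeros for unsigned ints,
and sign-extending signed ints. MI300 adds a few new instructions that
make use of this more often (e.g., v_mov_b64).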

Change-Id: Ieeb7834462b76d77c0030f49622d0de09f90c9e4
---
 src/arch/amdgpu/vega/operand.hh | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/arch/amdgpu/vega/operand.hh b/src/arch/amdgpu/vega/operand.hh
index 593f0e34fd..1bb9b43d1f 100644
--- a/src/arch/amdgpu/vega/operand.hh
+++ b/src/arch/amdgpu/vega/operand.hh
@@ -579,8 +579,30 @@ namespace VegaISA
               case REG_SRC_SWDA:
               case REG_SRC_DPP:
               case REG_SRC_LITERAL:
-                assert(NumDwords == 1);
+                /**
+                 * From the Vega specification:
+                 * When a literal constant is used with a 64 bit instruction,
+                 * the literal is expanded to 64 bits by: padding the LSBs
+                 * with zeros for floats, padding the MSBs with zeros for
+                 * unsigned ints, and by sign-extending signed ints.
+                 */
                 srfData[0] = _gpuDynInst->srcLiteral();
+                if constexpr (NumDwords == 2) {
+                    if constexpr (std::is_integral_v<DataType>) {
+                        if constexpr (std::is_signed_v<DataType>) {
+                            if (bits(srfData[0], 31, 31) == 1) {
+                                srfData[1] = 0xffffffff;
+                            } else {
+                                srfData[1] = 0;
+                            }
+                        } else {
+                            srfData[1] = 0;
+                        }
+                    } else {
+                        srfData[1] = _gpuDynInst->srcLiteral();
+                        srfData[0] = 0;
+                    }
+                }
                 break;
               case REG_SHARED_BASE:
                 {