From b91c9be10283e450b3c806c4d8360318f0baca71 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 15 May 2024 16:28:36 -0700 Subject: [PATCH] arch-vega: Load/stores commonly used with 16b MFMA This implements some missing loads and store that are commonly used in applications with MFMA instructions to load 16-bit data types into specific register locations: DS_READ_U16_D16, DS_READ_U16_D16_HI, BUFFER_LOAD_SHORT_D16, BUFFER_LOAD_SHORT_D16_HI. Change-Id: Ie22d81ef010328f4541553a9a674764dc16a9f4d --- src/arch/amdgpu/vega/gpu_decoder.cc | 12 +- src/arch/amdgpu/vega/insts/ds.cc | 128 +++++++++++++ src/arch/amdgpu/vega/insts/instructions.hh | 144 ++++++++++++++ src/arch/amdgpu/vega/insts/mubuf.cc | 207 +++++++++++++++++++++ 4 files changed, 483 insertions(+), 8 deletions(-) diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc index 6924ea6922..0f4b1e9872 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.cc +++ b/src/arch/amdgpu/vega/gpu_decoder.cc @@ -7808,15 +7808,13 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_DS__DS_READ_U16_D16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_DS__DS_READ_U16_D16(&iFmt->iFmt_DS); } GPUStaticInst* Decoder::decode_OP_DS__DS_READ_U16_D16_HI(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_DS__DS_READ_U16_D16_HI(&iFmt->iFmt_DS); } GPUStaticInst* @@ -10207,14 +10205,12 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_MUBUF__BUFFER_LOAD_SHORT_D16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_MUBUF__BUFFER_LOAD_SHORT_D16(&iFmt->iFmt_MUBUF); } GPUStaticInst* Decoder::decode_OP_MUBUF__BUFFER_LOAD_SHORT_D16_HI(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(&iFmt->iFmt_MUBUF); } GPUStaticInst* 
Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_HI_X(MachInst iFmt) diff --git a/src/arch/amdgpu/vega/insts/ds.cc b/src/arch/amdgpu/vega/insts/ds.cc index 17acdaa287..57d58638c5 100644 --- a/src/arch/amdgpu/vega/insts/ds.cc +++ b/src/arch/amdgpu/vega/insts/ds.cc @@ -1858,6 +1858,134 @@ namespace VegaISA vdst.write(); } // completeAcc + // --- Inst_DS__DS_READ_U16_D16 class methods --- + + Inst_DS__DS_READ_U16_D16:: + Inst_DS__DS_READ_U16_D16(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_u16_d16") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_U16_D16 + + Inst_DS__DS_READ_U16_D16::~Inst_DS__DS_READ_U16_D16() + { + } // ~Inst_DS__DS_READ_U16_D16 + + // --- description from .arch file --- + // RETURN_DATA[15 : 0].u16 = MEM[ADDR].u16; + // // RETURN_DATA[31:16] is preserved. + void + Inst_DS__DS_READ_U16_D16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + void + Inst_DS__DS_READ_U16_D16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead<VecElemU16>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_U16_D16::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + VecElemU16 ds_val = reinterpret_cast<VecElemU16*>( + gpuDynInst->d_data)[lane]; + replaceBits(vdst[lane], 15, 0, ds_val); + } + } + + vdst.write(); + } // 
completeAcc + // --- Inst_DS__DS_READ_U16_D16_HI class methods --- + + Inst_DS__DS_READ_U16_D16_HI:: + Inst_DS__DS_READ_U16_D16_HI(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_u16_d16_hi") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_U16_D16_HI + + Inst_DS__DS_READ_U16_D16_HI::~Inst_DS__DS_READ_U16_D16_HI() + { + } // ~Inst_DS__DS_READ_U16_D16_HI + + // --- description from .arch file --- + // RETURN_DATA[31 : 16].u16 = MEM[ADDR].u16; + // // RETURN_DATA[15:0] is preserved. + void + Inst_DS__DS_READ_U16_D16_HI::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + void + Inst_DS__DS_READ_U16_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead<VecElemU16>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_U16_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + VecElemU16 ds_val = reinterpret_cast<VecElemU16*>( + gpuDynInst->d_data)[lane]; + replaceBits(vdst[lane], 31, 16, ds_val); + } + } + + vdst.write(); + } // completeAcc // --- Inst_DS__DS_SWIZZLE_B32 class methods --- Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt) diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh index 8195e52341..587058cb89 100644 --- 
a/src/arch/amdgpu/vega/insts/instructions.hh +++ b/src/arch/amdgpu/vega/insts/instructions.hh @@ -33126,6 +33126,74 @@ namespace VegaISA void completeAcc(GPUDynInstPtr) override; }; // Inst_DS__DS_READ_U16 + class Inst_DS__DS_READ_U16_D16 : public Inst_DS + { + public: + Inst_DS__DS_READ_U16_D16(InFmt_DS*); + ~Inst_DS__DS_READ_U16_D16(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 1; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_a + return 4; + case 1: //vgpr_rtn + return 2; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_DS__DS_READ_U16_D16 + + class Inst_DS__DS_READ_U16_D16_HI : public Inst_DS + { + public: + Inst_DS__DS_READ_U16_D16_HI(InFmt_DS*); + ~Inst_DS__DS_READ_U16_D16_HI(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 1; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_a + return 4; + case 1: //vgpr_rtn + return 2; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_DS__DS_READ_U16_D16_HI + class Inst_DS__DS_SWIZZLE_B32 : public Inst_DS { public: @@ -36810,6 +36878,82 @@ namespace VegaISA void completeAcc(GPUDynInstPtr) override; }; // Inst_MUBUF__BUFFER_LOAD_SSHORT + class Inst_MUBUF__BUFFER_LOAD_SHORT_D16 : public Inst_MUBUF + { + public: + 
Inst_MUBUF__BUFFER_LOAD_SHORT_D16(InFmt_MUBUF*); + ~Inst_MUBUF__BUFFER_LOAD_SHORT_D16(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_a + return 8; + case 1: //sgpr_r + return 16; + case 2: //sgpr_o + return 4; + case 3: //vgpr_d + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_MUBUF__BUFFER_LOAD_SHORT_D16 + + class Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI : public Inst_MUBUF + { + public: + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(InFmt_MUBUF*); + ~Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_a + return 8; + case 1: //sgpr_r + return 16; + case 2: //sgpr_o + return 4; + case 3: //vgpr_d + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI + class Inst_MUBUF__BUFFER_LOAD_DWORD : public Inst_MUBUF { public: diff --git a/src/arch/amdgpu/vega/insts/mubuf.cc b/src/arch/amdgpu/vega/insts/mubuf.cc index ff8bae2475..ffc68e8c2b 100644 --- a/src/arch/amdgpu/vega/insts/mubuf.cc +++ b/src/arch/amdgpu/vega/insts/mubuf.cc @@ -823,6 +823,209 @@ namespace VegaISA Inst_MUBUF__BUFFER_LOAD_SSHORT::completeAcc(GPUDynInstPtr 
gpuDynInst) { } // execute + // --- Inst_MUBUF__BUFFER_LOAD_SHORT_D16 class methods --- + + Inst_MUBUF__BUFFER_LOAD_SHORT_D16 + ::Inst_MUBUF__BUFFER_LOAD_SHORT_D16(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_short_d16") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + warn("BUFFER.LDS not implemented!"); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_SHORT_D16 + + Inst_MUBUF__BUFFER_LOAD_SHORT_D16::~Inst_MUBUF__BUFFER_LOAD_SHORT_D16() + { + } // ~Inst_MUBUF__BUFFER_LOAD_SHORT_D16 + + // --- description from .arch file --- + // RETURN_DATA[15 : 0].u16 = MEM[ADDR].u16; + // // RETURN_DATA[31:16] is preserved. + void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + // For explanation of buffer addressing, see section 9.1.5 in: + // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/ + // instruction-set-architectures/ + // amd-instinct-mi300-cdna3-instruction-set-architecture.pdf + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr<ConstVecOperandU32, ConstVecOperandU32, + ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr<ConstVecOperandU32, ConstVecOperandU32, + ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr<ConstVecOperandU32, ConstVecOperandU32, + ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, + addr1, addr0, rsrcDesc, 
offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr<ConstVecOperandU32, ConstVecOperandU32, + ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<VecElemU16>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + VecElemU16 buf_val = (reinterpret_cast<VecElemU16*>( + gpuDynInst->d_data))[lane]; + replaceBits(vdst[lane], 15, 0, buf_val); + } else { + vdst[lane] = 0; + } + } + } + + vdst.write(); + } // completeAcc + // --- Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI class methods --- + + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI + ::Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_short_d16_hi") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + warn("BUFFER.LDS not implemented!"); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI + + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI:: + ~Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI() + { + } // ~Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI + + // --- description from .arch file --- + // VDATA[31 : 16].b16 = MEM[ADDR].b16; + // // VDATA[15:0] is preserved. 
+ void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + // For explanation of buffer addressing, see section 9.1.5 in: + // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/ + // instruction-set-architectures/ + // amd-instinct-mi300-cdna3-instruction-set-architecture.pdf + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr<ConstVecOperandU32, ConstVecOperandU32, + ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr<ConstVecOperandU32, ConstVecOperandU32, + ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr<ConstVecOperandU32, ConstVecOperandU32, + ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr<ConstVecOperandU32, ConstVecOperandU32, + ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<VecElemU16>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + VecElemU16 buf_val = (reinterpret_cast<VecElemU16*>( + 
gpuDynInst->d_data))[lane]; + replaceBits(vdst[lane], 31, 16, buf_val); + } else { + vdst[lane] = 0; + } + } + } + + vdst.write(); + } // completeAcc // --- Inst_MUBUF__BUFFER_LOAD_DWORD class methods --- Inst_MUBUF__BUFFER_LOAD_DWORD @@ -868,6 +1071,10 @@ namespace VegaISA int inst_offset = instData.OFFSET; + // For explanation of buffer addressing, see section 9.1.5 in: + // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/ + // instruction-set-architectures/ + // amd-instinct-mi300-cdna3-instruction-set-architecture.pdf if (!instData.IDXEN && !instData.OFFEN) { calcAddr(gpuDynInst,