From cc75281802c40b0fce8f82e1f7dba8547c0f099e Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Sat, 25 Nov 2023 12:09:41 -0600
Subject: [PATCH] gpu-compute: Update code object to latest LLVM

The AMDKernelCode struct is very outdated. Most of the fields are no
longer used and have been replaced with new fields that are used.
Therefore in order to support the new fields the code object needs to be
updated. The new structure is based on the table located at
https://llvm.org/docs/AMDGPUUsage.html#code-object-v3-kernel-descriptor

Most notably this adds the new compute_pgm_rsrc3 and kernarg preload
fields which are new features in gfx90a (MI200). The accum_offset in
compute_pgm_rsrc3 and kernarg preload values are necessary to run
applications which enable those features and therefore a way to check
their values is needed.

Also notable is the removal of enable_sgpr_grid_workgroup_count_{X,Y,Z}.
These seem to be unused in all versions of ROCm that gem5 supports and
therefore these fields can be removed. They are replaced with a reserved
field in the new code object.

Change-Id: I5542442e1e5961b05e17affad0adb5186d6d9d1a
---
 src/gpu-compute/gpu_command_processor.cc | 113 ++++++++++++++++++++++-
 src/gpu-compute/gpu_command_processor.hh |   1 +
 src/gpu-compute/hsa_queue_entry.hh       |   8 +-
 src/gpu-compute/kernel_code.hh           | 101 ++++++++------------
 src/gpu-compute/wavefront.cc             |  43 ---------
 5 files changed, 151 insertions(+), 115 deletions(-)

diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc
index 05c9a95eed..dbb909f624 100644
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -36,6 +36,7 @@
 #include "arch/amdgpu/vega/pagetable_walker.hh"
 #include "base/chunk_generator.hh"
 #include "debug/GPUCommandProc.hh"
+#include "debug/GPUInitAbi.hh"
 #include "debug/GPUKernelInfo.hh"
 #include "dev/amdgpu/amdgpu_device.hh"
 #include "gpu-compute/dispatcher.hh"
@@ -230,6 +231,8 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
 {
     _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
 
+    sanityCheckAKC(akc);
+
     DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
         "kernel object\n", akc->kernel_code_entry_byte_offset);
 
@@ -250,7 +253,7 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
      * APUs to implement asynchronous memcopy operations from 2 pointers in
      * host memory.  I have no idea what BLIT stands for.
      * */
-    if (akc->runtime_loader_kernel_symbol) {
+    if (!disp_pkt->completion_signal) {
         kernel_name = "Some kernel";
     } else {
         kernel_name = "Blit kernel";
@@ -616,6 +619,114 @@ GPUCommandProcessor::initABI(HSAQueueEntry *task)
         sizeof(uint32_t), cb, &cb->dmaBuffer);
 }
 
+// Print every AMDKernelCode field under the GPUInitAbi debug flag, then
+// fatal on set bits gem5 does not support and warn on bits it ignores.
+void
+GPUCommandProcessor::sanityCheckAKC(AMDKernelCode *akc)
+{
+    DPRINTF(GPUInitAbi, "group_segment_fixed_size: %d\n",
+            akc->group_segment_fixed_size);
+    DPRINTF(GPUInitAbi, "private_segment_fixed_size: %d\n",
+            akc->private_segment_fixed_size);
+    DPRINTF(GPUInitAbi, "kernarg_size: %d\n", akc->kernarg_size);
+    DPRINTF(GPUInitAbi, "kernel_code_entry_byte_offset: %d\n",
+            akc->kernel_code_entry_byte_offset);
+    DPRINTF(GPUInitAbi, "accum_offset: %d\n", akc->accum_offset);
+    DPRINTF(GPUInitAbi, "tg_split: %d\n", akc->tg_split);
+    DPRINTF(GPUInitAbi, "granulated_workitem_vgpr_count: %d\n",
+            akc->granulated_workitem_vgpr_count);
+    DPRINTF(GPUInitAbi, "granulated_wavefront_sgpr_count: %d\n",
+            akc->granulated_wavefront_sgpr_count);
+    DPRINTF(GPUInitAbi, "priority: %d\n", akc->priority);
+    DPRINTF(GPUInitAbi, "float_mode_round_32: %d\n", akc->float_mode_round_32);
+    DPRINTF(GPUInitAbi, "float_mode_round_16_64: %d\n",
+            akc->float_mode_round_16_64);
+    DPRINTF(GPUInitAbi, "float_mode_denorm_32: %d\n",
+            akc->float_mode_denorm_32);
+    DPRINTF(GPUInitAbi, "float_mode_denorm_16_64: %d\n",
+            akc->float_mode_denorm_16_64);
+    DPRINTF(GPUInitAbi, "priv: %d\n", akc->priv);
+    DPRINTF(GPUInitAbi, "enable_dx10_clamp: %d\n", akc->enable_dx10_clamp);
+    DPRINTF(GPUInitAbi, "debug_mode: %d\n", akc->debug_mode);
+    DPRINTF(GPUInitAbi, "enable_ieee_mode: %d\n", akc->enable_ieee_mode);
+    DPRINTF(GPUInitAbi, "bulky: %d\n", akc->bulky);
+    DPRINTF(GPUInitAbi, "cdbg_user: %d\n", akc->cdbg_user);
+    DPRINTF(GPUInitAbi, "fp16_ovfl: %d\n", akc->fp16_ovfl);
+    DPRINTF(GPUInitAbi, "wgp_mode: %d\n", akc->wgp_mode);
+    DPRINTF(GPUInitAbi, "mem_ordered: %d\n", akc->mem_ordered);
+    DPRINTF(GPUInitAbi, "fwd_progress: %d\n", akc->fwd_progress);
+    DPRINTF(GPUInitAbi, "enable_private_segment: %d\n",
+            akc->enable_private_segment);
+    DPRINTF(GPUInitAbi, "user_sgpr_count: %d\n", akc->user_sgpr_count);
+    DPRINTF(GPUInitAbi, "enable_trap_handler: %d\n", akc->enable_trap_handler);
+    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_x: %d\n",
+            akc->enable_sgpr_workgroup_id_x);
+    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_y: %d\n",
+            akc->enable_sgpr_workgroup_id_y);
+    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_z: %d\n",
+            akc->enable_sgpr_workgroup_id_z);
+    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_info: %d\n",
+            akc->enable_sgpr_workgroup_info);
+    DPRINTF(GPUInitAbi, "enable_vgpr_workitem_id: %d\n",
+            akc->enable_vgpr_workitem_id);
+    DPRINTF(GPUInitAbi, "enable_exception_address_watch: %d\n",
+            akc->enable_exception_address_watch);
+    DPRINTF(GPUInitAbi, "enable_exception_memory: %d\n",
+            akc->enable_exception_memory);
+    DPRINTF(GPUInitAbi, "granulated_lds_size: %d\n", akc->granulated_lds_size);
+    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_invalid_operation: %d\n",
+            akc->enable_exception_ieee_754_fp_invalid_operation);
+    DPRINTF(GPUInitAbi, "enable_exception_fp_denormal_source: %d\n",
+            akc->enable_exception_fp_denormal_source);
+    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_division_by_zero: %d\n",
+            akc->enable_exception_ieee_754_fp_division_by_zero);
+    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_overflow: %d\n",
+            akc->enable_exception_ieee_754_fp_overflow);
+    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_underflow: %d\n",
+            akc->enable_exception_ieee_754_fp_underflow);
+    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_inexact: %d\n",
+            akc->enable_exception_ieee_754_fp_inexact);
+    DPRINTF(GPUInitAbi, "enable_exception_int_divide_by_zero: %d\n",
+            akc->enable_exception_int_divide_by_zero);
+    DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_buffer: %d\n",
+            akc->enable_sgpr_private_segment_buffer);
+    DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_ptr: %d\n",
+            akc->enable_sgpr_dispatch_ptr);
+    DPRINTF(GPUInitAbi, "enable_sgpr_queue_ptr: %d\n",
+            akc->enable_sgpr_queue_ptr);
+    DPRINTF(GPUInitAbi, "enable_sgpr_kernarg_segment_ptr: %d\n",
+            akc->enable_sgpr_kernarg_segment_ptr);
+    DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_id: %d\n",
+            akc->enable_sgpr_dispatch_id);
+    DPRINTF(GPUInitAbi, "enable_sgpr_flat_scratch_init: %d\n",
+            akc->enable_sgpr_flat_scratch_init);
+    DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_size: %d\n",
+            akc->enable_sgpr_private_segment_size);
+    DPRINTF(GPUInitAbi, "enable_wavefront_size32: %d\n",
+            akc->enable_wavefront_size32);
+    DPRINTF(GPUInitAbi, "use_dynamic_stack: %d\n", akc->use_dynamic_stack);
+    DPRINTF(GPUInitAbi, "kernarg_preload_spec_length: %d\n",
+            akc->kernarg_preload_spec_length);
+    DPRINTF(GPUInitAbi, "kernarg_preload_spec_offset: %d\n",
+            akc->kernarg_preload_spec_offset);
+
+    // Check for features not implemented in gem5
+    fatal_if(akc->wgp_mode, "WGP mode not supported\n");
+    fatal_if(akc->mem_ordered, "Memory ordering control not supported\n");
+    fatal_if(akc->fwd_progress, "Fwd_progress mode not supported\n");
+
+    // Warn on features that gem5 will ignore
+    warn_if(akc->fp16_ovfl, "FP16 clamp control bit ignored\n");
+    warn_if(akc->bulky, "Bulky code object bit ignored\n");
+    // TODO: All the IEEE bits
+
+    warn_if(akc->kernarg_preload_spec_length ||
+            akc->kernarg_preload_spec_offset,
+            "Kernarg preload not implemented\n");
+    warn_if(akc->accum_offset, "ACC offset not implemented\n");
+    warn_if(akc->tg_split, "TG split not implemented\n");
+}
+
 System*
 GPUCommandProcessor::system()
 {
diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh
index 85b2a44494..ac73c179d7 100644
--- a/src/gpu-compute/gpu_command_processor.hh
+++ b/src/gpu-compute/gpu_command_processor.hh
@@ -148,6 +148,7 @@ class GPUCommandProcessor : public DmaVirtDevice
     // Typedefing dmaRead and dmaWrite function pointer
     typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
     void initABI(HSAQueueEntry *task);
+    void sanityCheckAKC(AMDKernelCode *akc);
     HSAPacketProcessor *hsaPP;
     TranslationGenPtr translate(Addr vaddr, Addr size) override;
 
diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh
index 7e6744704a..d81b879594 100644
--- a/src/gpu-compute/hsa_queue_entry.hh
+++ b/src/gpu-compute/hsa_queue_entry.hh
@@ -418,12 +418,6 @@ class HSAQueueEntry
             akc->enable_sgpr_flat_scratch_init);
         initialSgprState.set(PrivateSegSize,
             akc->enable_sgpr_private_segment_size);
-        initialSgprState.set(GridWorkgroupCountX,
-            akc->enable_sgpr_grid_workgroup_count_x);
-        initialSgprState.set(GridWorkgroupCountY,
-            akc->enable_sgpr_grid_workgroup_count_y);
-        initialSgprState.set(GridWorkgroupCountZ,
-            akc->enable_sgpr_grid_workgroup_count_z);
         initialSgprState.set(WorkgroupIdX,
             akc->enable_sgpr_workgroup_id_x);
         initialSgprState.set(WorkgroupIdY,
@@ -433,7 +427,7 @@ class HSAQueueEntry
         initialSgprState.set(WorkgroupInfo,
             akc->enable_sgpr_workgroup_info);
         initialSgprState.set(PrivSegWaveByteOffset,
-            akc->enable_sgpr_private_segment_wave_byte_offset);
+            akc->enable_private_segment);
 
         /**
          * set the enable bits for the initial VGPR state. the
diff --git a/src/gpu-compute/kernel_code.hh b/src/gpu-compute/kernel_code.hh
index 1879dee672..c230af0fad 100644
--- a/src/gpu-compute/kernel_code.hh
+++ b/src/gpu-compute/kernel_code.hh
@@ -60,15 +60,12 @@ enum ScalarRegInitFields : int
     DispatchId = 4,
     FlatScratchInit = 5,
     PrivateSegSize = 6,
-    GridWorkgroupCountX = 7,
-    GridWorkgroupCountY = 8,
-    GridWorkgroupCountZ = 9,
-    WorkgroupIdX = 10,
-    WorkgroupIdY = 11,
-    WorkgroupIdZ = 12,
-    WorkgroupInfo = 13,
-    PrivSegWaveByteOffset = 14,
-    NumScalarInitFields = 15
+    WorkgroupIdX = 7,
+    WorkgroupIdY = 8,
+    WorkgroupIdZ = 9,
+    WorkgroupInfo = 10,
+    PrivSegWaveByteOffset = 11,
+    NumScalarInitFields = 12
 };
 
 enum VectorRegInitFields : int
@@ -79,28 +76,24 @@ enum VectorRegInitFields : int
     NumVectorInitFields = 3
 };
 
-struct AMDKernelCode
+// Kernel code object based on the table on LLVM's website:
+// https://llvm.org/docs/AMDGPUUsage.html#code-object-v3-kernel-descriptor
+typedef struct GEM5_PACKED
 {
-    uint32_t amd_kernel_code_version_major;
-    uint32_t amd_kernel_code_version_minor;
-    uint16_t amd_machine_kind;
-    uint16_t amd_machine_version_major;
-    uint16_t amd_machine_version_minor;
-    uint16_t amd_machine_version_stepping;
+    uint32_t group_segment_fixed_size;
+    uint32_t private_segment_fixed_size;
+    uint32_t kernarg_size;
+    uint8_t reserved0[4];
     int64_t kernel_code_entry_byte_offset;
-    int64_t kernel_code_prefetch_byte_offset;
-    uint64_t kernel_code_prefetch_byte_size;
-    uint64_t max_scratch_backing_memory_byte_size;
+    uint8_t reserved1[20];
 
-    /**
-     * The fields below are used to set program settings for
-     * compute shaders. Here they are primarily used to setup
-     * initial register state. See the following for full details
-     * about kernel launch, state initialization, and the AMD kernel
-     * code object: https://github.com/RadeonOpenCompute/ROCm_Documentation/
-     *              blob/master/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst
-     *              #initial-kernel-register-state
-     */
+    // the 32b below here represent the fields of
+    // the COMPUTE_PGM_RSRC3 register for GFX90A, GFX940
+    uint32_t accum_offset : 6;
+    uint32_t compute_pgm_rsrc3_reserved1 : 10;
+    uint32_t tg_split : 1;
+    uint32_t compute_pgm_rsrc3_reserved2 : 15;
+    // end COMPUTE_PGM_RSRC3 register
 
     // the 32b below here represent the fields of
     // the COMPUTE_PGM_RSRC1 register
@@ -117,12 +110,16 @@ struct AMDKernelCode
     uint32_t enable_ieee_mode : 1;
     uint32_t bulky : 1;
     uint32_t cdbg_user : 1;
-    uint32_t compute_pgm_rsrc1_reserved : 6;
+    uint32_t fp16_ovfl : 1;
+    uint32_t compute_pgm_rsrc1_reserved : 2;
+    uint32_t wgp_mode : 1;
+    uint32_t mem_ordered : 1;
+    uint32_t fwd_progress : 1;
     // end COMPUTE_PGM_RSRC1 register
 
     // the 32b below here represent the fields of
     // the COMPUTE_PGM_RSRC2 register
-    uint32_t enable_sgpr_private_segment_wave_byte_offset : 1;
+    uint32_t enable_private_segment : 1;
     uint32_t user_sgpr_count : 5;
     uint32_t enable_trap_handler : 1;
     uint32_t enable_sgpr_workgroup_id_x : 1;
@@ -131,7 +128,7 @@ struct AMDKernelCode
     uint32_t enable_sgpr_workgroup_info : 1;
     uint32_t enable_vgpr_workitem_id : 2;
     uint32_t enable_exception_address_watch : 1;
-    uint32_t enable_exception_memory_violation : 1;
+    uint32_t enable_exception_memory : 1;
     uint32_t granulated_lds_size : 9;
     uint32_t enable_exception_ieee_754_fp_invalid_operation : 1;
     uint32_t enable_exception_fp_denormal_source : 1;
@@ -152,41 +149,17 @@ struct AMDKernelCode
     uint32_t enable_sgpr_dispatch_id : 1;
     uint32_t enable_sgpr_flat_scratch_init : 1;
     uint32_t enable_sgpr_private_segment_size : 1;
-    uint32_t enable_sgpr_grid_workgroup_count_x : 1;
-    uint32_t enable_sgpr_grid_workgroup_count_y : 1;
-    uint32_t enable_sgpr_grid_workgroup_count_z : 1;
-    uint32_t kernel_code_properties_reserved1 : 6;
-    uint32_t enable_ordered_append_gds : 1;
-    uint32_t private_element_size : 2;
-    uint32_t is_ptr64 : 1;
-    uint32_t is_dynamic_callstack : 1;
-    uint32_t is_debug_enabled : 1;
-    uint32_t is_xnack_enabled : 1;
-    uint32_t kernel_code_properties_reserved2 : 9;
+    uint32_t kernel_code_properties_reserved1 : 3;
+    uint32_t enable_wavefront_size32 : 1;
+    uint32_t use_dynamic_stack : 1;
+    uint32_t kernel_code_properties_reserved2 : 4;
     // end KERNEL_CODE_PROPERTIES
 
-    uint32_t workitem_private_segment_byte_size;
-    uint32_t workgroup_group_segment_byte_size;
-    uint32_t gds_segment_byte_size;
-    uint64_t kernarg_segment_byte_size;
-    uint32_t workgroup_fbarrier_count;
-    uint16_t wavefront_sgpr_count;
-    uint16_t workitem_vgpr_count;
-    uint16_t reserved_vgpr_first;
-    uint16_t reserved_vgpr_count;
-    uint16_t reserved_sgpr_first;
-    uint16_t reserved_sgpr_count;
-    uint16_t debug_wavefront_private_segment_offset_sgpr;
-    uint16_t debug_private_segment_buffer_sgpr;
-    uint8_t kernarg_segment_alignment;
-    uint8_t group_segment_alignment;
-    uint8_t private_segment_alignment;
-    uint8_t wavefront_size;
-    int32_t call_convention;
-    uint8_t reserved[12];
-    uint64_t runtime_loader_kernel_symbol;
-    uint64_t control_directives[16];
-};
+    uint32_t kernarg_preload_spec_length : 7;
+    uint32_t kernarg_preload_spec_offset : 9;
+    uint8_t reserved2[4];
+} AMDKernelCode;
+static_assert(sizeof(AMDKernelCode) == 64);
 
 } // namespace gem5
 
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index 1d17a69d54..af8a47a84b 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -126,7 +126,6 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
 
         if (task->sgprBitEnabled(en_bit)) {
             int physSgprIdx = 0;
-            uint32_t wiCount = 0;
             uint32_t firstWave = 0;
             int orderedAppendTerm = 0;
             int numWfsInWg = 0;
@@ -341,48 +340,6 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
                         wfSlotId, wfDynId, physSgprIdx,
                         task->privMemPerItem());
                 break;
-              case GridWorkgroupCountX:
-                physSgprIdx =
-                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
-                wiCount = ((task->gridSize(0) +
-                           task->wgSize(0) - 1) /
-                           task->wgSize(0));
-                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
-
-                ++regInitIdx;
-                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
-                        "Setting num WG X: s[%d] = %x\n",
-                        computeUnit->cu_id, simdId,
-                        wfSlotId, wfDynId, physSgprIdx, wiCount);
-                break;
-              case GridWorkgroupCountY:
-                physSgprIdx =
-                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
-                wiCount = ((task->gridSize(1) +
-                           task->wgSize(1) - 1) /
-                           task->wgSize(1));
-                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
-
-                ++regInitIdx;
-                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
-                        "Setting num WG Y: s[%d] = %x\n",
-                        computeUnit->cu_id, simdId,
-                        wfSlotId, wfDynId, physSgprIdx, wiCount);
-                break;
-              case GridWorkgroupCountZ:
-                physSgprIdx =
-                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
-                wiCount = ((task->gridSize(2) +
-                           task->wgSize(2) - 1) /
-                           task->wgSize(2));
-                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
-
-                ++regInitIdx;
-                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
-                        "Setting num WG Z: s[%d] = %x\n",
-                        computeUnit->cu_id, simdId,
-                        wfSlotId, wfDynId, physSgprIdx, wiCount);
-                break;
               case WorkgroupIdX:
                 physSgprIdx =
                     computeUnit->registerManager->mapSgpr(this, regInitIdx);