diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc
index 05c9a95eed..dbb909f624 100644
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -36,6 +36,7 @@
 #include "arch/amdgpu/vega/pagetable_walker.hh"
 #include "base/chunk_generator.hh"
 #include "debug/GPUCommandProc.hh"
+#include "debug/GPUInitAbi.hh"
 #include "debug/GPUKernelInfo.hh"
 #include "dev/amdgpu/amdgpu_device.hh"
 #include "gpu-compute/dispatcher.hh"
@@ -230,6 +231,8 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
 {
     _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
 
+    sanityCheckAKC(akc);
+
     DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
         "kernel object\n", akc->kernel_code_entry_byte_offset);
 
@@ -250,7 +253,7 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
      * APUs to implement asynchronous memcopy operations from 2 pointers in
      * host memory.  I have no idea what BLIT stands for.
      * */
-    if (akc->runtime_loader_kernel_symbol) {
+    if (!disp_pkt->completion_signal) {
         kernel_name = "Some kernel";
     } else {
         kernel_name = "Blit kernel";
@@ -616,6 +619,114 @@ GPUCommandProcessor::initABI(HSAQueueEntry *task)
         sizeof(uint32_t), cb, &cb->dmaBuffer);
 }
 
+void
+GPUCommandProcessor::sanityCheckAKC(AMDKernelCode *akc)
+{   // Trace each AMDKernelCode field, then check unsupported features.
+    DPRINTF(GPUInitAbi, "group_segment_fixed_size: %d\n",
+            akc->group_segment_fixed_size);
+    DPRINTF(GPUInitAbi, "private_segment_fixed_size: %d\n",
+            akc->private_segment_fixed_size);
+    DPRINTF(GPUInitAbi, "kernarg_size: %d\n", akc->kernarg_size);
+    DPRINTF(GPUInitAbi, "kernel_code_entry_byte_offset: %d\n",
+            akc->kernel_code_entry_byte_offset);
+    DPRINTF(GPUInitAbi, "accum_offset: %d\n", akc->accum_offset);
+    DPRINTF(GPUInitAbi, "tg_split: %d\n", akc->tg_split);
+    DPRINTF(GPUInitAbi, "granulated_workitem_vgpr_count: %d\n",
+            akc->granulated_workitem_vgpr_count);
+    DPRINTF(GPUInitAbi, "granulated_wavefront_sgpr_count: %d\n",
+            akc->granulated_wavefront_sgpr_count);
+    DPRINTF(GPUInitAbi, "priority: %d\n", akc->priority);
+    DPRINTF(GPUInitAbi, "float_mode_round_32: %d\n", akc->float_mode_round_32);
+    DPRINTF(GPUInitAbi, "float_mode_round_16_64: %d\n",
+            akc->float_mode_round_16_64);
+    DPRINTF(GPUInitAbi, "float_mode_denorm_32: %d\n",
+            akc->float_mode_denorm_32);
+    DPRINTF(GPUInitAbi, "float_mode_denorm_16_64: %d\n",
+            akc->float_mode_denorm_16_64);
+    DPRINTF(GPUInitAbi, "priv: %d\n", akc->priv);
+    DPRINTF(GPUInitAbi, "enable_dx10_clamp: %d\n", akc->enable_dx10_clamp);
+    DPRINTF(GPUInitAbi, "debug_mode: %d\n", akc->debug_mode);
+    DPRINTF(GPUInitAbi, "enable_ieee_mode: %d\n", akc->enable_ieee_mode);
+    DPRINTF(GPUInitAbi, "bulky: %d\n", akc->bulky);
+    DPRINTF(GPUInitAbi, "cdbg_user: %d\n", akc->cdbg_user);
+    DPRINTF(GPUInitAbi, "fp16_ovfl: %d\n", akc->fp16_ovfl);
+    DPRINTF(GPUInitAbi, "wgp_mode: %d\n", akc->wgp_mode);
+    DPRINTF(GPUInitAbi, "mem_ordered: %d\n", akc->mem_ordered);
+    DPRINTF(GPUInitAbi, "fwd_progress: %d\n", akc->fwd_progress);
+    DPRINTF(GPUInitAbi, "enable_private_segment: %d\n",
+            akc->enable_private_segment);
+    DPRINTF(GPUInitAbi, "user_sgpr_count: %d\n", akc->user_sgpr_count);
+    DPRINTF(GPUInitAbi, "enable_trap_handler: %d\n", akc->enable_trap_handler);
+    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_x: %d\n",
+            akc->enable_sgpr_workgroup_id_x);
+    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_y: %d\n",
+            akc->enable_sgpr_workgroup_id_y);
+    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_z: %d\n",
+            akc->enable_sgpr_workgroup_id_z);
+    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_info: %d\n",
+            akc->enable_sgpr_workgroup_info);
+    DPRINTF(GPUInitAbi, "enable_vgpr_workitem_id: %d\n",
+            akc->enable_vgpr_workitem_id);
+    DPRINTF(GPUInitAbi, "enable_exception_address_watch: %d\n",
+            akc->enable_exception_address_watch);
+    DPRINTF(GPUInitAbi, "enable_exception_memory: %d\n",
+            akc->enable_exception_memory);
+    DPRINTF(GPUInitAbi, "granulated_lds_size: %d\n", akc->granulated_lds_size);
+    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_invalid_operation: %d\n",
+            akc->enable_exception_ieee_754_fp_invalid_operation);
+    DPRINTF(GPUInitAbi, "enable_exception_fp_denormal_source: %d\n",
+            akc->enable_exception_fp_denormal_source);
+    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_division_by_zero: %d\n",
+            akc->enable_exception_ieee_754_fp_division_by_zero);
+    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_overflow: %d\n",
+            akc->enable_exception_ieee_754_fp_overflow);
+    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_underflow: %d\n",
+            akc->enable_exception_ieee_754_fp_underflow);
+    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_inexact: %d\n",
+            akc->enable_exception_ieee_754_fp_inexact);
+    DPRINTF(GPUInitAbi, "enable_exception_int_divide_by_zero: %d\n",
+            akc->enable_exception_int_divide_by_zero);
+    DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_buffer: %d\n",
+            akc->enable_sgpr_private_segment_buffer);
+    DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_ptr: %d\n",
+            akc->enable_sgpr_dispatch_ptr);
+    DPRINTF(GPUInitAbi, "enable_sgpr_queue_ptr: %d\n",
+            akc->enable_sgpr_queue_ptr);
+    DPRINTF(GPUInitAbi, "enable_sgpr_kernarg_segment_ptr: %d\n",
+            akc->enable_sgpr_kernarg_segment_ptr);
+    DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_id: %d\n",
+            akc->enable_sgpr_dispatch_id);
+    DPRINTF(GPUInitAbi, "enable_sgpr_flat_scratch_init: %d\n",
+            akc->enable_sgpr_flat_scratch_init);
+    DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_size: %d\n",
+            akc->enable_sgpr_private_segment_size);
+    DPRINTF(GPUInitAbi, "enable_wavefront_size32: %d\n",
+            akc->enable_wavefront_size32);
+    DPRINTF(GPUInitAbi, "use_dynamic_stack: %d\n", akc->use_dynamic_stack);
+    DPRINTF(GPUInitAbi, "kernarg_preload_spec_length: %d\n",
+            akc->kernarg_preload_spec_length);
+    DPRINTF(GPUInitAbi, "kernarg_preload_spec_offset: %d\n",
+            akc->kernarg_preload_spec_offset);
+
+    // Check for features not implemented in gem5: a kernel descriptor
+    // with any of these bits set is rejected through fatal_if.
+    fatal_if(akc->wgp_mode, "WGP mode not supported\n");
+    fatal_if(akc->mem_ordered, "Memory ordering control not supported\n");
+    fatal_if(akc->fwd_progress, "Fwd_progress mode not supported\n");
+
+    // Warn on features that gem5 will ignore: these bits only produce a
+    // warn_if message; none of the checks below are fatal.
+    warn_if(akc->fp16_ovfl, "FP16 clamp control bit ignored\n");
+    warn_if(akc->bulky, "Bulky code object bit ignored\n");
+    // TODO: All the IEEE bits
+
+    warn_if(akc->kernarg_preload_spec_length ||
+            akc->kernarg_preload_spec_offset,
+            "Kernarg preload not implemented\n");
+    warn_if(akc->accum_offset, "ACC offset not implemented\n");
+    warn_if(akc->tg_split, "TG split not implemented\n");
+}
+
 System*
 GPUCommandProcessor::system()
 {
diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh
index 85b2a44494..ac73c179d7 100644
--- a/src/gpu-compute/gpu_command_processor.hh
+++ b/src/gpu-compute/gpu_command_processor.hh
@@ -148,6 +148,7 @@ class GPUCommandProcessor : public DmaVirtDevice
     // Typedefing dmaRead and dmaWrite function pointer
     typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
     void initABI(HSAQueueEntry *task);
+    void sanityCheckAKC(AMDKernelCode *akc);
     HSAPacketProcessor *hsaPP;
     TranslationGenPtr translate(Addr vaddr, Addr size) override;
 
diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh
index 7e6744704a..d81b879594 100644
--- a/src/gpu-compute/hsa_queue_entry.hh
+++ b/src/gpu-compute/hsa_queue_entry.hh
@@ -418,12 +418,6 @@ class HSAQueueEntry
             akc->enable_sgpr_flat_scratch_init);
         initialSgprState.set(PrivateSegSize,
             akc->enable_sgpr_private_segment_size);
-        initialSgprState.set(GridWorkgroupCountX,
-            akc->enable_sgpr_grid_workgroup_count_x);
-        initialSgprState.set(GridWorkgroupCountY,
-            akc->enable_sgpr_grid_workgroup_count_y);
-        initialSgprState.set(GridWorkgroupCountZ,
-            akc->enable_sgpr_grid_workgroup_count_z);
         initialSgprState.set(WorkgroupIdX,
             akc->enable_sgpr_workgroup_id_x);
         initialSgprState.set(WorkgroupIdY,
@@ -433,7 +427,7 @@ class HSAQueueEntry
         initialSgprState.set(WorkgroupInfo,
             akc->enable_sgpr_workgroup_info);
         initialSgprState.set(PrivSegWaveByteOffset,
-            akc->enable_sgpr_private_segment_wave_byte_offset);
+            akc->enable_private_segment);
 
         /**
          * set the enable bits for the initial VGPR state. the
diff --git a/src/gpu-compute/kernel_code.hh b/src/gpu-compute/kernel_code.hh
index 1879dee672..c230af0fad 100644
--- a/src/gpu-compute/kernel_code.hh
+++ b/src/gpu-compute/kernel_code.hh
@@ -60,15 +60,12 @@ enum ScalarRegInitFields : int
     DispatchId = 4,
     FlatScratchInit = 5,
     PrivateSegSize = 6,
-    GridWorkgroupCountX = 7,
-    GridWorkgroupCountY = 8,
-    GridWorkgroupCountZ = 9,
-    WorkgroupIdX = 10,
-    WorkgroupIdY = 11,
-    WorkgroupIdZ = 12,
-    WorkgroupInfo = 13,
-    PrivSegWaveByteOffset = 14,
-    NumScalarInitFields = 15
+    WorkgroupIdX = 7,
+    WorkgroupIdY = 8,
+    WorkgroupIdZ = 9,
+    WorkgroupInfo = 10,
+    PrivSegWaveByteOffset = 11,
+    NumScalarInitFields = 12
 };
 
 enum VectorRegInitFields : int
@@ -79,28 +76,24 @@ enum VectorRegInitFields : int
     NumVectorInitFields = 3
 };
 
-struct AMDKernelCode
+// Kernel code object based on the table on LLVM's website:
+// https://llvm.org/docs/AMDGPUUsage.html#code-object-v3-kernel-descriptor
+typedef struct GEM5_PACKED
 {
-    uint32_t amd_kernel_code_version_major;
-    uint32_t amd_kernel_code_version_minor;
-    uint16_t amd_machine_kind;
-    uint16_t amd_machine_version_major;
-    uint16_t amd_machine_version_minor;
-    uint16_t amd_machine_version_stepping;
+    uint32_t group_segment_fixed_size;
+    uint32_t private_segment_fixed_size;
+    uint32_t kernarg_size;
+    uint8_t reserved0[4];
     int64_t kernel_code_entry_byte_offset;
-    int64_t kernel_code_prefetch_byte_offset;
-    uint64_t kernel_code_prefetch_byte_size;
-    uint64_t max_scratch_backing_memory_byte_size;
+    uint8_t reserved1[20];
 
-    /**
-     * The fields below are used to set program settings for
-     * compute shaders. Here they are primarily used to setup
-     * initial register state. See the following for full details
-     * about kernel launch, state initialization, and the AMD kernel
-     * code object: https://github.com/RadeonOpenCompute/ROCm_Documentation/
-     *              blob/master/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst
-     *              #initial-kernel-register-state
-     */
+    // the 32b below here represent the fields of
+    // the COMPUTE_PGM_RSRC3 register for GFX90A, GFX940
+    uint32_t accum_offset : 6;
+    uint32_t compute_pgm_rsrc3_reserved1 : 10;
+    uint32_t tg_split : 1;
+    uint32_t compute_pgm_rsrc3_reserved2 : 15;
+    // end COMPUTE_PGM_RSRC3 register
 
     // the 32b below here represent the fields of
     // the COMPUTE_PGM_RSRC1 register
@@ -117,12 +110,16 @@ struct AMDKernelCode
     uint32_t enable_ieee_mode : 1;
     uint32_t bulky : 1;
     uint32_t cdbg_user : 1;
-    uint32_t compute_pgm_rsrc1_reserved : 6;
+    uint32_t fp16_ovfl : 1;
+    uint32_t compute_pgm_rsrc1_reserved : 2;
+    uint32_t wgp_mode : 1;
+    uint32_t mem_ordered : 1;
+    uint32_t fwd_progress : 1;
     // end COMPUTE_PGM_RSRC1 register
 
     // the 32b below here represent the fields of
     // the COMPUTE_PGM_RSRC2 register
-    uint32_t enable_sgpr_private_segment_wave_byte_offset : 1;
+    uint32_t enable_private_segment : 1;
     uint32_t user_sgpr_count : 5;
     uint32_t enable_trap_handler : 1;
     uint32_t enable_sgpr_workgroup_id_x : 1;
@@ -131,7 +128,7 @@ struct AMDKernelCode
     uint32_t enable_sgpr_workgroup_info : 1;
     uint32_t enable_vgpr_workitem_id : 2;
     uint32_t enable_exception_address_watch : 1;
-    uint32_t enable_exception_memory_violation : 1;
+    uint32_t enable_exception_memory : 1;
     uint32_t granulated_lds_size : 9;
     uint32_t enable_exception_ieee_754_fp_invalid_operation : 1;
     uint32_t enable_exception_fp_denormal_source : 1;
@@ -152,41 +149,17 @@ struct AMDKernelCode
     uint32_t enable_sgpr_dispatch_id : 1;
     uint32_t enable_sgpr_flat_scratch_init : 1;
     uint32_t enable_sgpr_private_segment_size : 1;
-    uint32_t enable_sgpr_grid_workgroup_count_x : 1;
-    uint32_t enable_sgpr_grid_workgroup_count_y : 1;
-    uint32_t enable_sgpr_grid_workgroup_count_z : 1;
-    uint32_t kernel_code_properties_reserved1 : 6;
-    uint32_t enable_ordered_append_gds : 1;
-    uint32_t private_element_size : 2;
-    uint32_t is_ptr64 : 1;
-    uint32_t is_dynamic_callstack : 1;
-    uint32_t is_debug_enabled : 1;
-    uint32_t is_xnack_enabled : 1;
-    uint32_t kernel_code_properties_reserved2 : 9;
+    uint32_t kernel_code_properties_reserved1 : 3;
+    uint32_t enable_wavefront_size32 : 1;
+    uint32_t use_dynamic_stack : 1;
+    uint32_t kernel_code_properties_reserved2 : 4;
     // end KERNEL_CODE_PROPERTIES
 
-    uint32_t workitem_private_segment_byte_size;
-    uint32_t workgroup_group_segment_byte_size;
-    uint32_t gds_segment_byte_size;
-    uint64_t kernarg_segment_byte_size;
-    uint32_t workgroup_fbarrier_count;
-    uint16_t wavefront_sgpr_count;
-    uint16_t workitem_vgpr_count;
-    uint16_t reserved_vgpr_first;
-    uint16_t reserved_vgpr_count;
-    uint16_t reserved_sgpr_first;
-    uint16_t reserved_sgpr_count;
-    uint16_t debug_wavefront_private_segment_offset_sgpr;
-    uint16_t debug_private_segment_buffer_sgpr;
-    uint8_t kernarg_segment_alignment;
-    uint8_t group_segment_alignment;
-    uint8_t private_segment_alignment;
-    uint8_t wavefront_size;
-    int32_t call_convention;
-    uint8_t reserved[12];
-    uint64_t runtime_loader_kernel_symbol;
-    uint64_t control_directives[16];
-};
+    uint32_t kernarg_preload_spec_length : 7;
+    uint32_t kernarg_preload_spec_offset : 9;
+    uint8_t reserved2[4];
+} AMDKernelCode;
+static_assert(sizeof(AMDKernelCode) == 64);
 
 } // namespace gem5
 
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index 1d17a69d54..af8a47a84b 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -126,7 +126,6 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
 
         if (task->sgprBitEnabled(en_bit)) {
             int physSgprIdx = 0;
-            uint32_t wiCount = 0;
             uint32_t firstWave = 0;
             int orderedAppendTerm = 0;
             int numWfsInWg = 0;
@@ -341,48 +340,6 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
                         wfSlotId, wfDynId, physSgprIdx,
                         task->privMemPerItem());
                 break;
-              case GridWorkgroupCountX:
-                physSgprIdx =
-                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
-                wiCount = ((task->gridSize(0) +
-                           task->wgSize(0) - 1) /
-                           task->wgSize(0));
-                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
-
-                ++regInitIdx;
-                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
-                        "Setting num WG X: s[%d] = %x\n",
-                        computeUnit->cu_id, simdId,
-                        wfSlotId, wfDynId, physSgprIdx, wiCount);
-                break;
-              case GridWorkgroupCountY:
-                physSgprIdx =
-                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
-                wiCount = ((task->gridSize(1) +
-                           task->wgSize(1) - 1) /
-                           task->wgSize(1));
-                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
-
-                ++regInitIdx;
-                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
-                        "Setting num WG Y: s[%d] = %x\n",
-                        computeUnit->cu_id, simdId,
-                        wfSlotId, wfDynId, physSgprIdx, wiCount);
-                break;
-              case GridWorkgroupCountZ:
-                physSgprIdx =
-                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
-                wiCount = ((task->gridSize(2) +
-                           task->wgSize(2) - 1) /
-                           task->wgSize(2));
-                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
-
-                ++regInitIdx;
-                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
-                        "Setting num WG Z: s[%d] = %x\n",
-                        computeUnit->cu_id, simdId,
-                        wfSlotId, wfDynId, physSgprIdx, wiCount);
-                break;
               case WorkgroupIdX:
                 physSgprIdx =
                     computeUnit->registerManager->mapSgpr(this, regInitIdx);