gpu-compute: Update code object to latest LLVM

The AMDKernelCode struct is very outdated. Most of the fields are no longer used and have been replaced with new fields that are used. Therefore in order to support the new fields the code object needs to be updated. The new structure is based on the table located at https://llvm.org/docs/AMDGPUUsage.html#code-object-v3-kernel-descriptor Most notably this adds the new compute_pgm_rsrc3 and kernarg preload fields which are new features in gfx90a (MI200). The accum_offset in compute_pgm_rsrc3 and kergarg preload values are necessary to run application which enable those features and therefore a way to check their values is needed. Also noteable is the removal of enable_sgpr_workgroup_id_{X,Y,Z}. These seem to be unused in all versions of ROCm that gem5 supports and therefore these fields can be removed. They are replaced with a reserved field in the new code object. Change-Id: I5542442e1e5961b05e17affad0adb5186d6d9d1a
2023-11-25 12:09:41 -06:00
parent 7e1b27969f
commit cc75281802
5 changed files with 151 additions and 115 deletions
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -36,6 +36,7 @@
 #include "arch/amdgpu/vega/pagetable_walker.hh"
 #include "base/chunk_generator.hh"
 #include "debug/GPUCommandProc.hh"
+#include "debug/GPUInitAbi.hh"
 #include "debug/GPUKernelInfo.hh"
 #include "dev/amdgpu/amdgpu_device.hh"
 #include "gpu-compute/dispatcher.hh"
@@ -230,6 +231,8 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
 {
    _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;

+    sanityCheckAKC(akc);
+
    DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
        "kernel object\n", akc->kernel_code_entry_byte_offset);

@@ -250,7 +253,7 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
     * APUs to implement asynchronous memcopy operations from 2 pointers in
     * host memory.  I have no idea what BLIT stands for.
     * */
-    if (akc->runtime_loader_kernel_symbol) {
+    if (!disp_pkt->completion_signal) {
        kernel_name = "Some kernel";
    } else {
        kernel_name = "Blit kernel";
@@ -616,6 +619,114 @@ GPUCommandProcessor::initABI(HSAQueueEntry *task)
        sizeof(uint32_t), cb, &cb->dmaBuffer);
 }

+void
+GPUCommandProcessor::sanityCheckAKC(AMDKernelCode *akc)
+{
+    DPRINTF(GPUInitAbi, "group_segment_fixed_size: %d\n",
+            akc->group_segment_fixed_size);
+    DPRINTF(GPUInitAbi, "private_segment_fixed_size: %d\n",
+            akc->private_segment_fixed_size);
+    DPRINTF(GPUInitAbi, "kernarg_size: %d\n", akc->kernarg_size);
+    DPRINTF(GPUInitAbi, "kernel_code_entry_byte_offset: %d\n",
+            akc->kernel_code_entry_byte_offset);
+    DPRINTF(GPUInitAbi, "accum_offset: %d\n", akc->accum_offset);
+    DPRINTF(GPUInitAbi, "tg_split: %d\n", akc->tg_split);
+    DPRINTF(GPUInitAbi, "granulated_workitem_vgpr_count: %d\n",
+            akc->granulated_workitem_vgpr_count);
+    DPRINTF(GPUInitAbi, "granulated_wavefront_sgpr_count: %d\n",
+            akc->granulated_wavefront_sgpr_count);
+    DPRINTF(GPUInitAbi, "priority: %d\n", akc->priority);
+    DPRINTF(GPUInitAbi, "float_mode_round_32: %d\n", akc->float_mode_round_32);
+    DPRINTF(GPUInitAbi, "float_mode_round_16_64: %d\n",
+            akc->float_mode_round_16_64);
+    DPRINTF(GPUInitAbi, "float_mode_denorm_32: %d\n",
+            akc->float_mode_denorm_32);
+    DPRINTF(GPUInitAbi, "float_mode_denorm_16_64: %d\n",
+            akc->float_mode_denorm_16_64);
+    DPRINTF(GPUInitAbi, "priv: %d\n", akc->priv);
+    DPRINTF(GPUInitAbi, "enable_dx10_clamp: %d\n", akc->enable_dx10_clamp);
+    DPRINTF(GPUInitAbi, "debug_mode: %d\n", akc->debug_mode);
+    DPRINTF(GPUInitAbi, "enable_ieee_mode: %d\n", akc->enable_ieee_mode);
+    DPRINTF(GPUInitAbi, "bulky: %d\n", akc->bulky);
+    DPRINTF(GPUInitAbi, "cdbg_user: %d\n", akc->cdbg_user);
+    DPRINTF(GPUInitAbi, "fp16_ovfl: %d\n", akc->fp16_ovfl);
+    DPRINTF(GPUInitAbi, "wgp_mode: %d\n", akc->wgp_mode);
+    DPRINTF(GPUInitAbi, "mem_ordered: %d\n", akc->mem_ordered);
+    DPRINTF(GPUInitAbi, "fwd_progress: %d\n", akc->fwd_progress);
+    DPRINTF(GPUInitAbi, "enable_private_segment: %d\n",
+            akc->enable_private_segment);
+    DPRINTF(GPUInitAbi, "user_sgpr_count: %d\n", akc->user_sgpr_count);
+    DPRINTF(GPUInitAbi, "enable_trap_handler: %d\n", akc->enable_trap_handler);
+    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_x: %d\n",
+            akc->enable_sgpr_workgroup_id_x);
+    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_y: %d\n",
+            akc->enable_sgpr_workgroup_id_y);
+    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_z: %d\n",
+            akc->enable_sgpr_workgroup_id_z);
+    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_info: %d\n",
+            akc->enable_sgpr_workgroup_info);
+    DPRINTF(GPUInitAbi, "enable_vgpr_workitem_id: %d\n",
+            akc->enable_vgpr_workitem_id);
+    DPRINTF(GPUInitAbi, "enable_exception_address_watch: %d\n",
+            akc->enable_exception_address_watch);
+    DPRINTF(GPUInitAbi, "enable_exception_memory: %d\n",
+            akc->enable_exception_memory);
+    DPRINTF(GPUInitAbi, "granulated_lds_size: %d\n", akc->granulated_lds_size);
+    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_invalid_operation: %d\n",
+            akc->enable_exception_ieee_754_fp_invalid_operation);
+    DPRINTF(GPUInitAbi, "enable_exception_fp_denormal_source: %d\n",
+            akc->enable_exception_fp_denormal_source);
+    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_division_by_zero: %d\n",
+            akc->enable_exception_ieee_754_fp_division_by_zero);
+    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_overflow: %d\n",
+            akc->enable_exception_ieee_754_fp_overflow);
+    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_underflow: %d\n",
+            akc->enable_exception_ieee_754_fp_underflow);
+    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_inexact: %d\n",
+            akc->enable_exception_ieee_754_fp_inexact);
+    DPRINTF(GPUInitAbi, "enable_exception_int_divide_by_zero: %d\n",
+            akc->enable_exception_int_divide_by_zero);
+    DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_buffer: %d\n",
+            akc->enable_sgpr_private_segment_buffer);
+    DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_ptr: %d\n",
+            akc->enable_sgpr_dispatch_ptr);
+    DPRINTF(GPUInitAbi, "enable_sgpr_queue_ptr: %d\n",
+            akc->enable_sgpr_queue_ptr);
+    DPRINTF(GPUInitAbi, "enable_sgpr_kernarg_segment_ptr: %d\n",
+            akc->enable_sgpr_kernarg_segment_ptr);
+    DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_id: %d\n",
+            akc->enable_sgpr_dispatch_id);
+    DPRINTF(GPUInitAbi, "enable_sgpr_flat_scratch_init: %d\n",
+            akc->enable_sgpr_flat_scratch_init);
+    DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_size: %d\n",
+            akc->enable_sgpr_private_segment_size);
+    DPRINTF(GPUInitAbi, "enable_wavefront_size32: %d\n",
+            akc->enable_wavefront_size32);
+    DPRINTF(GPUInitAbi, "use_dynamic_stack: %d\n", akc->use_dynamic_stack);
+    DPRINTF(GPUInitAbi, "kernarg_preload_spec_length: %d\n",
+            akc->kernarg_preload_spec_length);
+    DPRINTF(GPUInitAbi, "kernarg_preload_spec_offset: %d\n",
+            akc->kernarg_preload_spec_offset);
+
+
+    // Check for features not implemented in gem5
+    fatal_if(akc->wgp_mode, "WGP mode not supported\n");
+    fatal_if(akc->mem_ordered, "Memory ordering control not supported\n");
+    fatal_if(akc->fwd_progress, "Fwd_progress mode not supported\n");
+
+
+    // Warn on features that gem5 will ignore
+    warn_if(akc->fp16_ovfl, "FP16 clamp control bit ignored\n");
+    warn_if(akc->bulky, "Bulky code object bit ignored\n");
+    // TODO: All the IEEE bits
+
+    warn_if(akc->kernarg_preload_spec_length ||
+            akc->kernarg_preload_spec_offset,
+            "Kernarg preload not implemented\n");
+    warn_if(akc->accum_offset, "ACC offset not implemented\n");
+    warn_if(akc->tg_split, "TG split not implemented\n");
+}
+
 System*
 GPUCommandProcessor::system()
 {