diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc index 05c9a95eed..dbb909f624 100644 --- a/src/gpu-compute/gpu_command_processor.cc +++ b/src/gpu-compute/gpu_command_processor.cc @@ -36,6 +36,7 @@ #include "arch/amdgpu/vega/pagetable_walker.hh" #include "base/chunk_generator.hh" #include "debug/GPUCommandProc.hh" +#include "debug/GPUInitAbi.hh" #include "debug/GPUKernelInfo.hh" #include "dev/amdgpu/amdgpu_device.hh" #include "gpu-compute/dispatcher.hh" @@ -230,6 +231,8 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt, { _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt; + sanityCheckAKC(akc); + DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the " "kernel object\n", akc->kernel_code_entry_byte_offset); @@ -250,7 +253,7 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt, * APUs to implement asynchronous memcopy operations from 2 pointers in * host memory. I have no idea what BLIT stands for. * */ - if (akc->runtime_loader_kernel_symbol) { + if (!disp_pkt->completion_signal) { kernel_name = "Some kernel"; } else { kernel_name = "Blit kernel"; @@ -616,6 +619,114 @@ GPUCommandProcessor::initABI(HSAQueueEntry *task) sizeof(uint32_t), cb, &cb->dmaBuffer); } +void +GPUCommandProcessor::sanityCheckAKC(AMDKernelCode *akc) +{ + DPRINTF(GPUInitAbi, "group_segment_fixed_size: %d\n", + akc->group_segment_fixed_size); + DPRINTF(GPUInitAbi, "private_segment_fixed_size: %d\n", + akc->private_segment_fixed_size); + DPRINTF(GPUInitAbi, "kernarg_size: %d\n", akc->kernarg_size); + DPRINTF(GPUInitAbi, "kernel_code_entry_byte_offset: %d\n", + akc->kernel_code_entry_byte_offset); + DPRINTF(GPUInitAbi, "accum_offset: %d\n", akc->accum_offset); + DPRINTF(GPUInitAbi, "tg_split: %d\n", akc->tg_split); + DPRINTF(GPUInitAbi, "granulated_workitem_vgpr_count: %d\n", + akc->granulated_workitem_vgpr_count); + DPRINTF(GPUInitAbi, "granulated_wavefront_sgpr_count: %d\n", + akc->granulated_wavefront_sgpr_count); + DPRINTF(GPUInitAbi, "priority: %d\n", akc->priority); + DPRINTF(GPUInitAbi, "float_mode_round_32: %d\n", akc->float_mode_round_32); + DPRINTF(GPUInitAbi, "float_mode_round_16_64: %d\n", + akc->float_mode_round_16_64); + DPRINTF(GPUInitAbi, "float_mode_denorm_32: %d\n", + akc->float_mode_denorm_32); + DPRINTF(GPUInitAbi, "float_mode_denorm_16_64: %d\n", + akc->float_mode_denorm_16_64); + DPRINTF(GPUInitAbi, "priv: %d\n", akc->priv); + DPRINTF(GPUInitAbi, "enable_dx10_clamp: %d\n", akc->enable_dx10_clamp); + DPRINTF(GPUInitAbi, "debug_mode: %d\n", akc->debug_mode); + DPRINTF(GPUInitAbi, "enable_ieee_mode: %d\n", akc->enable_ieee_mode); + DPRINTF(GPUInitAbi, "bulky: %d\n", akc->bulky); + DPRINTF(GPUInitAbi, "cdbg_user: %d\n", akc->cdbg_user); + DPRINTF(GPUInitAbi, "fp16_ovfl: %d\n", akc->fp16_ovfl); + DPRINTF(GPUInitAbi, "wgp_mode: %d\n", akc->wgp_mode); + DPRINTF(GPUInitAbi, "mem_ordered: %d\n", akc->mem_ordered); + DPRINTF(GPUInitAbi, "fwd_progress: %d\n", akc->fwd_progress); + DPRINTF(GPUInitAbi, "enable_private_segment: %d\n", + akc->enable_private_segment); + DPRINTF(GPUInitAbi, "user_sgpr_count: %d\n", akc->user_sgpr_count); + DPRINTF(GPUInitAbi, "enable_trap_handler: %d\n", akc->enable_trap_handler); + DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_x: %d\n", + akc->enable_sgpr_workgroup_id_x); + DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_y: %d\n", + akc->enable_sgpr_workgroup_id_y); + DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_z: %d\n", + akc->enable_sgpr_workgroup_id_z); + DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_info: %d\n", + akc->enable_sgpr_workgroup_info); + DPRINTF(GPUInitAbi, "enable_vgpr_workitem_id: %d\n", + akc->enable_vgpr_workitem_id); + DPRINTF(GPUInitAbi, "enable_exception_address_watch: %d\n", + akc->enable_exception_address_watch); + DPRINTF(GPUInitAbi, "enable_exception_memory: %d\n", + akc->enable_exception_memory); + DPRINTF(GPUInitAbi, "granulated_lds_size: %d\n", akc->granulated_lds_size); + DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_invalid_operation: %d\n", + akc->enable_exception_ieee_754_fp_invalid_operation); + DPRINTF(GPUInitAbi, "enable_exception_fp_denormal_source: %d\n", + akc->enable_exception_fp_denormal_source); + DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_division_by_zero: %d\n", + akc->enable_exception_ieee_754_fp_division_by_zero); + DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_overflow: %d\n", + akc->enable_exception_ieee_754_fp_overflow); + DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_underflow: %d\n", + akc->enable_exception_ieee_754_fp_underflow); + DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_inexact: %d\n", + akc->enable_exception_ieee_754_fp_inexact); + DPRINTF(GPUInitAbi, "enable_exception_int_divide_by_zero: %d\n", + akc->enable_exception_int_divide_by_zero); + DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_buffer: %d\n", + akc->enable_sgpr_private_segment_buffer); + DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_ptr: %d\n", + akc->enable_sgpr_dispatch_ptr); + DPRINTF(GPUInitAbi, "enable_sgpr_queue_ptr: %d\n", + akc->enable_sgpr_queue_ptr); + DPRINTF(GPUInitAbi, "enable_sgpr_kernarg_segment_ptr: %d\n", + akc->enable_sgpr_kernarg_segment_ptr); + DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_id: %d\n", + akc->enable_sgpr_dispatch_id); + DPRINTF(GPUInitAbi, "enable_sgpr_flat_scratch_init: %d\n", + akc->enable_sgpr_flat_scratch_init); + DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_size: %d\n", + akc->enable_sgpr_private_segment_size); + DPRINTF(GPUInitAbi, "enable_wavefront_size32: %d\n", + akc->enable_wavefront_size32); + DPRINTF(GPUInitAbi, "use_dynamic_stack: %d\n", akc->use_dynamic_stack); + DPRINTF(GPUInitAbi, "kernarg_preload_spec_length: %d\n", + akc->kernarg_preload_spec_length); + DPRINTF(GPUInitAbi, "kernarg_preload_spec_offset: %d\n", + akc->kernarg_preload_spec_offset); + + + // Check for features not implemented in gem5 + fatal_if(akc->wgp_mode, "WGP mode not supported\n"); + fatal_if(akc->mem_ordered, "Memory ordering control not supported\n"); + fatal_if(akc->fwd_progress, "Fwd_progress mode not supported\n"); + + + // Warn on features that gem5 will ignore + warn_if(akc->fp16_ovfl, "FP16 clamp control bit ignored\n"); + warn_if(akc->bulky, "Bulky code object bit ignored\n"); + // TODO: All the IEEE bits + + warn_if(akc->kernarg_preload_spec_length || + akc->kernarg_preload_spec_offset, + "Kernarg preload not implemented\n"); + warn_if(akc->accum_offset, "ACC offset not implemented\n"); + warn_if(akc->tg_split, "TG split not implemented\n"); +} + System* GPUCommandProcessor::system() { diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh index 85b2a44494..ac73c179d7 100644 --- a/src/gpu-compute/gpu_command_processor.hh +++ b/src/gpu-compute/gpu_command_processor.hh @@ -148,6 +148,7 @@ class GPUCommandProcessor : public DmaVirtDevice // Typedefing dmaRead and dmaWrite function pointer typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick); void initABI(HSAQueueEntry *task); + void sanityCheckAKC(AMDKernelCode *akc); HSAPacketProcessor *hsaPP; TranslationGenPtr translate(Addr vaddr, Addr size) override; diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh index 7e6744704a..d81b879594 100644 --- a/src/gpu-compute/hsa_queue_entry.hh +++ b/src/gpu-compute/hsa_queue_entry.hh @@ -418,12 +418,6 @@ class HSAQueueEntry akc->enable_sgpr_flat_scratch_init); initialSgprState.set(PrivateSegSize, akc->enable_sgpr_private_segment_size); - initialSgprState.set(GridWorkgroupCountX, - akc->enable_sgpr_grid_workgroup_count_x); - initialSgprState.set(GridWorkgroupCountY, - akc->enable_sgpr_grid_workgroup_count_y); - initialSgprState.set(GridWorkgroupCountZ, - akc->enable_sgpr_grid_workgroup_count_z); initialSgprState.set(WorkgroupIdX, akc->enable_sgpr_workgroup_id_x); initialSgprState.set(WorkgroupIdY, @@ -433,7 +427,7 @@ class HSAQueueEntry initialSgprState.set(WorkgroupInfo, akc->enable_sgpr_workgroup_info); initialSgprState.set(PrivSegWaveByteOffset, - akc->enable_sgpr_private_segment_wave_byte_offset); + akc->enable_private_segment); /** * set the enable bits for the initial VGPR state. the diff --git a/src/gpu-compute/kernel_code.hh b/src/gpu-compute/kernel_code.hh index 1879dee672..c230af0fad 100644 --- a/src/gpu-compute/kernel_code.hh +++ b/src/gpu-compute/kernel_code.hh @@ -60,15 +60,12 @@ enum ScalarRegInitFields : int DispatchId = 4, FlatScratchInit = 5, PrivateSegSize = 6, - GridWorkgroupCountX = 7, - GridWorkgroupCountY = 8, - GridWorkgroupCountZ = 9, - WorkgroupIdX = 10, - WorkgroupIdY = 11, - WorkgroupIdZ = 12, - WorkgroupInfo = 13, - PrivSegWaveByteOffset = 14, - NumScalarInitFields = 15 + WorkgroupIdX = 7, + WorkgroupIdY = 8, + WorkgroupIdZ = 9, + WorkgroupInfo = 10, + PrivSegWaveByteOffset = 11, + NumScalarInitFields = 12 }; enum VectorRegInitFields : int @@ -79,28 +76,24 @@ enum VectorRegInitFields : int NumVectorInitFields = 3 }; -struct AMDKernelCode +// Kernel code object based on the table on LLVM's website: +// https://llvm.org/docs/AMDGPUUsage.html#code-object-v3-kernel-descriptor +typedef struct GEM5_PACKED { - uint32_t amd_kernel_code_version_major; - uint32_t amd_kernel_code_version_minor; - uint16_t amd_machine_kind; - uint16_t amd_machine_version_major; - uint16_t amd_machine_version_minor; - uint16_t amd_machine_version_stepping; + uint32_t group_segment_fixed_size; + uint32_t private_segment_fixed_size; + uint32_t kernarg_size; + uint8_t reserved0[4]; int64_t kernel_code_entry_byte_offset; - int64_t kernel_code_prefetch_byte_offset; - uint64_t kernel_code_prefetch_byte_size; - uint64_t max_scratch_backing_memory_byte_size; + uint8_t reserved1[20]; - /** - * The fields below are used to set program settings for - * compute shaders. Here they are primarily used to setup - * initial register state. See the following for full details - * about kernel launch, state initialization, and the AMD kernel - * code object: https://github.com/RadeonOpenCompute/ROCm_Documentation/ - * blob/master/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst - * #initial-kernel-register-state - */ + // the 32b below here represent the fields of + // the COMPUTE_PGM_RSRC3 register for GFX90A, GFX940 + uint32_t accum_offset : 6; + uint32_t compute_pgm_rsrc3_reserved1 : 10; + uint32_t tg_split : 1; + uint32_t compute_pgm_rsrc3_reserved2 : 15; + // end COMPUTE_PGM_RSRC3 register // the 32b below here represent the fields of // the COMPUTE_PGM_RSRC1 register @@ -117,12 +110,16 @@ struct AMDKernelCode uint32_t enable_ieee_mode : 1; uint32_t bulky : 1; uint32_t cdbg_user : 1; - uint32_t compute_pgm_rsrc1_reserved : 6; + uint32_t fp16_ovfl : 1; + uint32_t compute_pgm_rsrc1_reserved : 2; + uint32_t wgp_mode : 1; + uint32_t mem_ordered : 1; + uint32_t fwd_progress : 1; // end COMPUTE_PGM_RSRC1 register // the 32b below here represent the fields of // the COMPUTE_PGM_RSRC2 register - uint32_t enable_sgpr_private_segment_wave_byte_offset : 1; + uint32_t enable_private_segment : 1; uint32_t user_sgpr_count : 5; uint32_t enable_trap_handler : 1; uint32_t enable_sgpr_workgroup_id_x : 1; @@ -131,7 +128,7 @@ struct AMDKernelCode uint32_t enable_sgpr_workgroup_info : 1; uint32_t enable_vgpr_workitem_id : 2; uint32_t enable_exception_address_watch : 1; - uint32_t enable_exception_memory_violation : 1; + uint32_t enable_exception_memory : 1; uint32_t granulated_lds_size : 9; uint32_t enable_exception_ieee_754_fp_invalid_operation : 1; uint32_t enable_exception_fp_denormal_source : 1; @@ -152,41 +149,17 @@ struct AMDKernelCode uint32_t enable_sgpr_dispatch_id : 1; uint32_t enable_sgpr_flat_scratch_init : 1; uint32_t enable_sgpr_private_segment_size : 1; - uint32_t enable_sgpr_grid_workgroup_count_x : 1; - uint32_t enable_sgpr_grid_workgroup_count_y : 1; - uint32_t enable_sgpr_grid_workgroup_count_z : 1; - uint32_t kernel_code_properties_reserved1 : 6; - uint32_t enable_ordered_append_gds : 1; - uint32_t private_element_size : 2; - uint32_t is_ptr64 : 1; - uint32_t is_dynamic_callstack : 1; - uint32_t is_debug_enabled : 1; - uint32_t is_xnack_enabled : 1; - uint32_t kernel_code_properties_reserved2 : 9; + uint32_t kernel_code_properties_reserved1 : 3; + uint32_t enable_wavefront_size32 : 1; + uint32_t use_dynamic_stack : 1; + uint32_t kernel_code_properties_reserved2 : 4; // end KERNEL_CODE_PROPERTIES - uint32_t workitem_private_segment_byte_size; - uint32_t workgroup_group_segment_byte_size; - uint32_t gds_segment_byte_size; - uint64_t kernarg_segment_byte_size; - uint32_t workgroup_fbarrier_count; - uint16_t wavefront_sgpr_count; - uint16_t workitem_vgpr_count; - uint16_t reserved_vgpr_first; - uint16_t reserved_vgpr_count; - uint16_t reserved_sgpr_first; - uint16_t reserved_sgpr_count; - uint16_t debug_wavefront_private_segment_offset_sgpr; - uint16_t debug_private_segment_buffer_sgpr; - uint8_t kernarg_segment_alignment; - uint8_t group_segment_alignment; - uint8_t private_segment_alignment; - uint8_t wavefront_size; - int32_t call_convention; - uint8_t reserved[12]; - uint64_t runtime_loader_kernel_symbol; - uint64_t control_directives[16]; -}; + uint32_t kernarg_preload_spec_length : 7; + uint32_t kernarg_preload_spec_offset : 9; + uint8_t reserved2[4]; +} AMDKernelCode; +static_assert(sizeof(AMDKernelCode) == 64); } // namespace gem5 diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index 1d17a69d54..af8a47a84b 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -126,7 +126,6 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems) if (task->sgprBitEnabled(en_bit)) { int physSgprIdx = 0; - uint32_t wiCount = 0; uint32_t firstWave = 0; int orderedAppendTerm = 0; int numWfsInWg = 0; @@ -341,48 +340,6 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems) wfSlotId, wfDynId, physSgprIdx, task->privMemPerItem()); break; - case GridWorkgroupCountX: - physSgprIdx = - computeUnit->registerManager->mapSgpr(this, regInitIdx); - wiCount = ((task->gridSize(0) + - task->wgSize(0) - 1) / - task->wgSize(0)); - computeUnit->srf[simdId]->write(physSgprIdx, wiCount); - - ++regInitIdx; - DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " - "Setting num WG X: s[%d] = %x\n", - computeUnit->cu_id, simdId, - wfSlotId, wfDynId, physSgprIdx, wiCount); - break; - case GridWorkgroupCountY: - physSgprIdx = - computeUnit->registerManager->mapSgpr(this, regInitIdx); - wiCount = ((task->gridSize(1) + - task->wgSize(1) - 1) / - task->wgSize(1)); - computeUnit->srf[simdId]->write(physSgprIdx, wiCount); - - ++regInitIdx; - DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " - "Setting num WG Y: s[%d] = %x\n", - computeUnit->cu_id, simdId, - wfSlotId, wfDynId, physSgprIdx, wiCount); - break; - case GridWorkgroupCountZ: - physSgprIdx = - computeUnit->registerManager->mapSgpr(this, regInitIdx); - wiCount = ((task->gridSize(2) + - task->wgSize(2) - 1) / - task->wgSize(2)); - computeUnit->srf[simdId]->write(physSgprIdx, wiCount); - - ++regInitIdx; - DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " - "Setting num WG Z: s[%d] = %x\n", - computeUnit->cu_id, simdId, - wfSlotId, wfDynId, physSgprIdx, wiCount); - break; case WorkgroupIdX: physSgprIdx = computeUnit->registerManager->mapSgpr(this, regInitIdx);