gpu-compute: Update code object to latest LLVM
The AMDKernelCode struct is very outdated. Most of its fields are no longer used and have been replaced with new fields that are. Therefore, in order to support the new fields, the code object needs to be updated. The new structure is based on the table located at https://llvm.org/docs/AMDGPUUsage.html#code-object-v3-kernel-descriptor. Most notably, this adds the new compute_pgm_rsrc3 and kernarg preload fields, which are new features in gfx90a (MI200). The accum_offset in compute_pgm_rsrc3 and the kernarg preload values are necessary to run applications that enable those features, and therefore a way to check their values is needed. Also notable is the removal of enable_sgpr_grid_workgroup_count_{X,Y,Z}. These seem to be unused in all versions of ROCm that gem5 supports and therefore these fields can be removed. They are replaced with a reserved field in the new code object. Change-Id: I5542442e1e5961b05e17affad0adb5186d6d9d1a
This commit is contained in:
@@ -36,6 +36,7 @@
|
|||||||
#include "arch/amdgpu/vega/pagetable_walker.hh"
|
#include "arch/amdgpu/vega/pagetable_walker.hh"
|
||||||
#include "base/chunk_generator.hh"
|
#include "base/chunk_generator.hh"
|
||||||
#include "debug/GPUCommandProc.hh"
|
#include "debug/GPUCommandProc.hh"
|
||||||
|
#include "debug/GPUInitAbi.hh"
|
||||||
#include "debug/GPUKernelInfo.hh"
|
#include "debug/GPUKernelInfo.hh"
|
||||||
#include "dev/amdgpu/amdgpu_device.hh"
|
#include "dev/amdgpu/amdgpu_device.hh"
|
||||||
#include "gpu-compute/dispatcher.hh"
|
#include "gpu-compute/dispatcher.hh"
|
||||||
@@ -230,6 +231,8 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
|
|||||||
{
|
{
|
||||||
_hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
|
_hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
|
||||||
|
|
||||||
|
sanityCheckAKC(akc);
|
||||||
|
|
||||||
DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
|
DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
|
||||||
"kernel object\n", akc->kernel_code_entry_byte_offset);
|
"kernel object\n", akc->kernel_code_entry_byte_offset);
|
||||||
|
|
||||||
@@ -250,7 +253,7 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
|
|||||||
* APUs to implement asynchronous memcopy operations from 2 pointers in
|
* APUs to implement asynchronous memcopy operations from 2 pointers in
|
||||||
* host memory. I have no idea what BLIT stands for.
|
* host memory. I have no idea what BLIT stands for.
|
||||||
* */
|
* */
|
||||||
if (akc->runtime_loader_kernel_symbol) {
|
if (!disp_pkt->completion_signal) {
|
||||||
kernel_name = "Some kernel";
|
kernel_name = "Some kernel";
|
||||||
} else {
|
} else {
|
||||||
kernel_name = "Blit kernel";
|
kernel_name = "Blit kernel";
|
||||||
@@ -616,6 +619,114 @@ GPUCommandProcessor::initABI(HSAQueueEntry *task)
|
|||||||
sizeof(uint32_t), cb, &cb->dmaBuffer);
|
sizeof(uint32_t), cb, &cb->dmaBuffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GPUCommandProcessor::sanityCheckAKC(AMDKernelCode *akc)
|
||||||
|
{
|
||||||
|
DPRINTF(GPUInitAbi, "group_segment_fixed_size: %d\n",
|
||||||
|
akc->group_segment_fixed_size);
|
||||||
|
DPRINTF(GPUInitAbi, "private_segment_fixed_size: %d\n",
|
||||||
|
akc->private_segment_fixed_size);
|
||||||
|
DPRINTF(GPUInitAbi, "kernarg_size: %d\n", akc->kernarg_size);
|
||||||
|
DPRINTF(GPUInitAbi, "kernel_code_entry_byte_offset: %d\n",
|
||||||
|
akc->kernel_code_entry_byte_offset);
|
||||||
|
DPRINTF(GPUInitAbi, "accum_offset: %d\n", akc->accum_offset);
|
||||||
|
DPRINTF(GPUInitAbi, "tg_split: %d\n", akc->tg_split);
|
||||||
|
DPRINTF(GPUInitAbi, "granulated_workitem_vgpr_count: %d\n",
|
||||||
|
akc->granulated_workitem_vgpr_count);
|
||||||
|
DPRINTF(GPUInitAbi, "granulated_wavefront_sgpr_count: %d\n",
|
||||||
|
akc->granulated_wavefront_sgpr_count);
|
||||||
|
DPRINTF(GPUInitAbi, "priority: %d\n", akc->priority);
|
||||||
|
DPRINTF(GPUInitAbi, "float_mode_round_32: %d\n", akc->float_mode_round_32);
|
||||||
|
DPRINTF(GPUInitAbi, "float_mode_round_16_64: %d\n",
|
||||||
|
akc->float_mode_round_16_64);
|
||||||
|
DPRINTF(GPUInitAbi, "float_mode_denorm_32: %d\n",
|
||||||
|
akc->float_mode_denorm_32);
|
||||||
|
DPRINTF(GPUInitAbi, "float_mode_denorm_16_64: %d\n",
|
||||||
|
akc->float_mode_denorm_16_64);
|
||||||
|
DPRINTF(GPUInitAbi, "priv: %d\n", akc->priv);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_dx10_clamp: %d\n", akc->enable_dx10_clamp);
|
||||||
|
DPRINTF(GPUInitAbi, "debug_mode: %d\n", akc->debug_mode);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_ieee_mode: %d\n", akc->enable_ieee_mode);
|
||||||
|
DPRINTF(GPUInitAbi, "bulky: %d\n", akc->bulky);
|
||||||
|
DPRINTF(GPUInitAbi, "cdbg_user: %d\n", akc->cdbg_user);
|
||||||
|
DPRINTF(GPUInitAbi, "fp16_ovfl: %d\n", akc->fp16_ovfl);
|
||||||
|
DPRINTF(GPUInitAbi, "wgp_mode: %d\n", akc->wgp_mode);
|
||||||
|
DPRINTF(GPUInitAbi, "mem_ordered: %d\n", akc->mem_ordered);
|
||||||
|
DPRINTF(GPUInitAbi, "fwd_progress: %d\n", akc->fwd_progress);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_private_segment: %d\n",
|
||||||
|
akc->enable_private_segment);
|
||||||
|
DPRINTF(GPUInitAbi, "user_sgpr_count: %d\n", akc->user_sgpr_count);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_trap_handler: %d\n", akc->enable_trap_handler);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_x: %d\n",
|
||||||
|
akc->enable_sgpr_workgroup_id_x);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_y: %d\n",
|
||||||
|
akc->enable_sgpr_workgroup_id_y);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_z: %d\n",
|
||||||
|
akc->enable_sgpr_workgroup_id_z);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_info: %d\n",
|
||||||
|
akc->enable_sgpr_workgroup_info);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_vgpr_workitem_id: %d\n",
|
||||||
|
akc->enable_vgpr_workitem_id);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_exception_address_watch: %d\n",
|
||||||
|
akc->enable_exception_address_watch);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_exception_memory: %d\n",
|
||||||
|
akc->enable_exception_memory);
|
||||||
|
DPRINTF(GPUInitAbi, "granulated_lds_size: %d\n", akc->granulated_lds_size);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_invalid_operation: %d\n",
|
||||||
|
akc->enable_exception_ieee_754_fp_invalid_operation);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_exception_fp_denormal_source: %d\n",
|
||||||
|
akc->enable_exception_fp_denormal_source);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_division_by_zero: %d\n",
|
||||||
|
akc->enable_exception_ieee_754_fp_division_by_zero);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_overflow: %d\n",
|
||||||
|
akc->enable_exception_ieee_754_fp_overflow);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_underflow: %d\n",
|
||||||
|
akc->enable_exception_ieee_754_fp_underflow);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_inexact: %d\n",
|
||||||
|
akc->enable_exception_ieee_754_fp_inexact);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_exception_int_divide_by_zero: %d\n",
|
||||||
|
akc->enable_exception_int_divide_by_zero);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_buffer: %d\n",
|
||||||
|
akc->enable_sgpr_private_segment_buffer);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_ptr: %d\n",
|
||||||
|
akc->enable_sgpr_dispatch_ptr);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_sgpr_queue_ptr: %d\n",
|
||||||
|
akc->enable_sgpr_queue_ptr);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_sgpr_kernarg_segment_ptr: %d\n",
|
||||||
|
akc->enable_sgpr_kernarg_segment_ptr);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_id: %d\n",
|
||||||
|
akc->enable_sgpr_dispatch_id);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_sgpr_flat_scratch_init: %d\n",
|
||||||
|
akc->enable_sgpr_flat_scratch_init);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_size: %d\n",
|
||||||
|
akc->enable_sgpr_private_segment_size);
|
||||||
|
DPRINTF(GPUInitAbi, "enable_wavefront_size32: %d\n",
|
||||||
|
akc->enable_wavefront_size32);
|
||||||
|
DPRINTF(GPUInitAbi, "use_dynamic_stack: %d\n", akc->use_dynamic_stack);
|
||||||
|
DPRINTF(GPUInitAbi, "kernarg_preload_spec_length: %d\n",
|
||||||
|
akc->kernarg_preload_spec_length);
|
||||||
|
DPRINTF(GPUInitAbi, "kernarg_preload_spec_offset: %d\n",
|
||||||
|
akc->kernarg_preload_spec_offset);
|
||||||
|
|
||||||
|
|
||||||
|
// Check for features not implemented in gem5
|
||||||
|
fatal_if(akc->wgp_mode, "WGP mode not supported\n");
|
||||||
|
fatal_if(akc->mem_ordered, "Memory ordering control not supported\n");
|
||||||
|
fatal_if(akc->fwd_progress, "Fwd_progress mode not supported\n");
|
||||||
|
|
||||||
|
|
||||||
|
// Warn on features that gem5 will ignore
|
||||||
|
warn_if(akc->fp16_ovfl, "FP16 clamp control bit ignored\n");
|
||||||
|
warn_if(akc->bulky, "Bulky code object bit ignored\n");
|
||||||
|
// TODO: All the IEEE bits
|
||||||
|
|
||||||
|
warn_if(akc->kernarg_preload_spec_length ||
|
||||||
|
akc->kernarg_preload_spec_offset,
|
||||||
|
"Kernarg preload not implemented\n");
|
||||||
|
warn_if(akc->accum_offset, "ACC offset not implemented\n");
|
||||||
|
warn_if(akc->tg_split, "TG split not implemented\n");
|
||||||
|
}
|
||||||
|
|
||||||
System*
|
System*
|
||||||
GPUCommandProcessor::system()
|
GPUCommandProcessor::system()
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -148,6 +148,7 @@ class GPUCommandProcessor : public DmaVirtDevice
|
|||||||
// Typedefing dmaRead and dmaWrite function pointer
|
// Typedefing dmaRead and dmaWrite function pointer
|
||||||
typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
|
typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
|
||||||
void initABI(HSAQueueEntry *task);
|
void initABI(HSAQueueEntry *task);
|
||||||
|
void sanityCheckAKC(AMDKernelCode *akc);
|
||||||
HSAPacketProcessor *hsaPP;
|
HSAPacketProcessor *hsaPP;
|
||||||
TranslationGenPtr translate(Addr vaddr, Addr size) override;
|
TranslationGenPtr translate(Addr vaddr, Addr size) override;
|
||||||
|
|
||||||
|
|||||||
@@ -418,12 +418,6 @@ class HSAQueueEntry
|
|||||||
akc->enable_sgpr_flat_scratch_init);
|
akc->enable_sgpr_flat_scratch_init);
|
||||||
initialSgprState.set(PrivateSegSize,
|
initialSgprState.set(PrivateSegSize,
|
||||||
akc->enable_sgpr_private_segment_size);
|
akc->enable_sgpr_private_segment_size);
|
||||||
initialSgprState.set(GridWorkgroupCountX,
|
|
||||||
akc->enable_sgpr_grid_workgroup_count_x);
|
|
||||||
initialSgprState.set(GridWorkgroupCountY,
|
|
||||||
akc->enable_sgpr_grid_workgroup_count_y);
|
|
||||||
initialSgprState.set(GridWorkgroupCountZ,
|
|
||||||
akc->enable_sgpr_grid_workgroup_count_z);
|
|
||||||
initialSgprState.set(WorkgroupIdX,
|
initialSgprState.set(WorkgroupIdX,
|
||||||
akc->enable_sgpr_workgroup_id_x);
|
akc->enable_sgpr_workgroup_id_x);
|
||||||
initialSgprState.set(WorkgroupIdY,
|
initialSgprState.set(WorkgroupIdY,
|
||||||
@@ -433,7 +427,7 @@ class HSAQueueEntry
|
|||||||
initialSgprState.set(WorkgroupInfo,
|
initialSgprState.set(WorkgroupInfo,
|
||||||
akc->enable_sgpr_workgroup_info);
|
akc->enable_sgpr_workgroup_info);
|
||||||
initialSgprState.set(PrivSegWaveByteOffset,
|
initialSgprState.set(PrivSegWaveByteOffset,
|
||||||
akc->enable_sgpr_private_segment_wave_byte_offset);
|
akc->enable_private_segment);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* set the enable bits for the initial VGPR state. the
|
* set the enable bits for the initial VGPR state. the
|
||||||
|
|||||||
@@ -60,15 +60,12 @@ enum ScalarRegInitFields : int
|
|||||||
DispatchId = 4,
|
DispatchId = 4,
|
||||||
FlatScratchInit = 5,
|
FlatScratchInit = 5,
|
||||||
PrivateSegSize = 6,
|
PrivateSegSize = 6,
|
||||||
GridWorkgroupCountX = 7,
|
WorkgroupIdX = 7,
|
||||||
GridWorkgroupCountY = 8,
|
WorkgroupIdY = 8,
|
||||||
GridWorkgroupCountZ = 9,
|
WorkgroupIdZ = 9,
|
||||||
WorkgroupIdX = 10,
|
WorkgroupInfo = 10,
|
||||||
WorkgroupIdY = 11,
|
PrivSegWaveByteOffset = 11,
|
||||||
WorkgroupIdZ = 12,
|
NumScalarInitFields = 12
|
||||||
WorkgroupInfo = 13,
|
|
||||||
PrivSegWaveByteOffset = 14,
|
|
||||||
NumScalarInitFields = 15
|
|
||||||
};
|
};
|
||||||
|
|
||||||
enum VectorRegInitFields : int
|
enum VectorRegInitFields : int
|
||||||
@@ -79,28 +76,24 @@ enum VectorRegInitFields : int
|
|||||||
NumVectorInitFields = 3
|
NumVectorInitFields = 3
|
||||||
};
|
};
|
||||||
|
|
||||||
struct AMDKernelCode
|
// Kernel code object based on the table on LLVM's website:
|
||||||
|
// https://llvm.org/docs/AMDGPUUsage.html#code-object-v3-kernel-descriptor
|
||||||
|
typedef struct GEM5_PACKED
|
||||||
{
|
{
|
||||||
uint32_t amd_kernel_code_version_major;
|
uint32_t group_segment_fixed_size;
|
||||||
uint32_t amd_kernel_code_version_minor;
|
uint32_t private_segment_fixed_size;
|
||||||
uint16_t amd_machine_kind;
|
uint32_t kernarg_size;
|
||||||
uint16_t amd_machine_version_major;
|
uint8_t reserved0[4];
|
||||||
uint16_t amd_machine_version_minor;
|
|
||||||
uint16_t amd_machine_version_stepping;
|
|
||||||
int64_t kernel_code_entry_byte_offset;
|
int64_t kernel_code_entry_byte_offset;
|
||||||
int64_t kernel_code_prefetch_byte_offset;
|
uint8_t reserved1[20];
|
||||||
uint64_t kernel_code_prefetch_byte_size;
|
|
||||||
uint64_t max_scratch_backing_memory_byte_size;
|
|
||||||
|
|
||||||
/**
|
// the 32b below here represent the fields of
|
||||||
* The fields below are used to set program settings for
|
// the COMPUTE_PGM_RSRC3 register for GFX90A, GFX940
|
||||||
* compute shaders. Here they are primarily used to setup
|
uint32_t accum_offset : 6;
|
||||||
* initial register state. See the following for full details
|
uint32_t compute_pgm_rsrc3_reserved1 : 10;
|
||||||
* about kernel launch, state initialization, and the AMD kernel
|
uint32_t tg_split : 1;
|
||||||
* code object: https://github.com/RadeonOpenCompute/ROCm_Documentation/
|
uint32_t compute_pgm_rsrc3_reserved2 : 15;
|
||||||
* blob/master/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst
|
// end COMPUTE_PGM_RSRC3 register
|
||||||
* #initial-kernel-register-state
|
|
||||||
*/
|
|
||||||
|
|
||||||
// the 32b below here represent the fields of
|
// the 32b below here represent the fields of
|
||||||
// the COMPUTE_PGM_RSRC1 register
|
// the COMPUTE_PGM_RSRC1 register
|
||||||
@@ -117,12 +110,16 @@ struct AMDKernelCode
|
|||||||
uint32_t enable_ieee_mode : 1;
|
uint32_t enable_ieee_mode : 1;
|
||||||
uint32_t bulky : 1;
|
uint32_t bulky : 1;
|
||||||
uint32_t cdbg_user : 1;
|
uint32_t cdbg_user : 1;
|
||||||
uint32_t compute_pgm_rsrc1_reserved : 6;
|
uint32_t fp16_ovfl : 1;
|
||||||
|
uint32_t compute_pgm_rsrc1_reserved : 2;
|
||||||
|
uint32_t wgp_mode : 1;
|
||||||
|
uint32_t mem_ordered : 1;
|
||||||
|
uint32_t fwd_progress : 1;
|
||||||
// end COMPUTE_PGM_RSRC1 register
|
// end COMPUTE_PGM_RSRC1 register
|
||||||
|
|
||||||
// the 32b below here represent the fields of
|
// the 32b below here represent the fields of
|
||||||
// the COMPUTE_PGM_RSRC2 register
|
// the COMPUTE_PGM_RSRC2 register
|
||||||
uint32_t enable_sgpr_private_segment_wave_byte_offset : 1;
|
uint32_t enable_private_segment : 1;
|
||||||
uint32_t user_sgpr_count : 5;
|
uint32_t user_sgpr_count : 5;
|
||||||
uint32_t enable_trap_handler : 1;
|
uint32_t enable_trap_handler : 1;
|
||||||
uint32_t enable_sgpr_workgroup_id_x : 1;
|
uint32_t enable_sgpr_workgroup_id_x : 1;
|
||||||
@@ -131,7 +128,7 @@ struct AMDKernelCode
|
|||||||
uint32_t enable_sgpr_workgroup_info : 1;
|
uint32_t enable_sgpr_workgroup_info : 1;
|
||||||
uint32_t enable_vgpr_workitem_id : 2;
|
uint32_t enable_vgpr_workitem_id : 2;
|
||||||
uint32_t enable_exception_address_watch : 1;
|
uint32_t enable_exception_address_watch : 1;
|
||||||
uint32_t enable_exception_memory_violation : 1;
|
uint32_t enable_exception_memory : 1;
|
||||||
uint32_t granulated_lds_size : 9;
|
uint32_t granulated_lds_size : 9;
|
||||||
uint32_t enable_exception_ieee_754_fp_invalid_operation : 1;
|
uint32_t enable_exception_ieee_754_fp_invalid_operation : 1;
|
||||||
uint32_t enable_exception_fp_denormal_source : 1;
|
uint32_t enable_exception_fp_denormal_source : 1;
|
||||||
@@ -152,41 +149,17 @@ struct AMDKernelCode
|
|||||||
uint32_t enable_sgpr_dispatch_id : 1;
|
uint32_t enable_sgpr_dispatch_id : 1;
|
||||||
uint32_t enable_sgpr_flat_scratch_init : 1;
|
uint32_t enable_sgpr_flat_scratch_init : 1;
|
||||||
uint32_t enable_sgpr_private_segment_size : 1;
|
uint32_t enable_sgpr_private_segment_size : 1;
|
||||||
uint32_t enable_sgpr_grid_workgroup_count_x : 1;
|
uint32_t kernel_code_properties_reserved1 : 3;
|
||||||
uint32_t enable_sgpr_grid_workgroup_count_y : 1;
|
uint32_t enable_wavefront_size32 : 1;
|
||||||
uint32_t enable_sgpr_grid_workgroup_count_z : 1;
|
uint32_t use_dynamic_stack : 1;
|
||||||
uint32_t kernel_code_properties_reserved1 : 6;
|
uint32_t kernel_code_properties_reserved2 : 4;
|
||||||
uint32_t enable_ordered_append_gds : 1;
|
|
||||||
uint32_t private_element_size : 2;
|
|
||||||
uint32_t is_ptr64 : 1;
|
|
||||||
uint32_t is_dynamic_callstack : 1;
|
|
||||||
uint32_t is_debug_enabled : 1;
|
|
||||||
uint32_t is_xnack_enabled : 1;
|
|
||||||
uint32_t kernel_code_properties_reserved2 : 9;
|
|
||||||
// end KERNEL_CODE_PROPERTIES
|
// end KERNEL_CODE_PROPERTIES
|
||||||
|
|
||||||
uint32_t workitem_private_segment_byte_size;
|
uint32_t kernarg_preload_spec_length : 7;
|
||||||
uint32_t workgroup_group_segment_byte_size;
|
uint32_t kernarg_preload_spec_offset : 9;
|
||||||
uint32_t gds_segment_byte_size;
|
uint8_t reserved2[4];
|
||||||
uint64_t kernarg_segment_byte_size;
|
} AMDKernelCode;
|
||||||
uint32_t workgroup_fbarrier_count;
|
static_assert(sizeof(AMDKernelCode) == 64);
|
||||||
uint16_t wavefront_sgpr_count;
|
|
||||||
uint16_t workitem_vgpr_count;
|
|
||||||
uint16_t reserved_vgpr_first;
|
|
||||||
uint16_t reserved_vgpr_count;
|
|
||||||
uint16_t reserved_sgpr_first;
|
|
||||||
uint16_t reserved_sgpr_count;
|
|
||||||
uint16_t debug_wavefront_private_segment_offset_sgpr;
|
|
||||||
uint16_t debug_private_segment_buffer_sgpr;
|
|
||||||
uint8_t kernarg_segment_alignment;
|
|
||||||
uint8_t group_segment_alignment;
|
|
||||||
uint8_t private_segment_alignment;
|
|
||||||
uint8_t wavefront_size;
|
|
||||||
int32_t call_convention;
|
|
||||||
uint8_t reserved[12];
|
|
||||||
uint64_t runtime_loader_kernel_symbol;
|
|
||||||
uint64_t control_directives[16];
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace gem5
|
} // namespace gem5
|
||||||
|
|
||||||
|
|||||||
@@ -126,7 +126,6 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
|
|||||||
|
|
||||||
if (task->sgprBitEnabled(en_bit)) {
|
if (task->sgprBitEnabled(en_bit)) {
|
||||||
int physSgprIdx = 0;
|
int physSgprIdx = 0;
|
||||||
uint32_t wiCount = 0;
|
|
||||||
uint32_t firstWave = 0;
|
uint32_t firstWave = 0;
|
||||||
int orderedAppendTerm = 0;
|
int orderedAppendTerm = 0;
|
||||||
int numWfsInWg = 0;
|
int numWfsInWg = 0;
|
||||||
@@ -341,48 +340,6 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
|
|||||||
wfSlotId, wfDynId, physSgprIdx,
|
wfSlotId, wfDynId, physSgprIdx,
|
||||||
task->privMemPerItem());
|
task->privMemPerItem());
|
||||||
break;
|
break;
|
||||||
case GridWorkgroupCountX:
|
|
||||||
physSgprIdx =
|
|
||||||
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
||||||
wiCount = ((task->gridSize(0) +
|
|
||||||
task->wgSize(0) - 1) /
|
|
||||||
task->wgSize(0));
|
|
||||||
computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
|
|
||||||
|
|
||||||
++regInitIdx;
|
|
||||||
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
||||||
"Setting num WG X: s[%d] = %x\n",
|
|
||||||
computeUnit->cu_id, simdId,
|
|
||||||
wfSlotId, wfDynId, physSgprIdx, wiCount);
|
|
||||||
break;
|
|
||||||
case GridWorkgroupCountY:
|
|
||||||
physSgprIdx =
|
|
||||||
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
||||||
wiCount = ((task->gridSize(1) +
|
|
||||||
task->wgSize(1) - 1) /
|
|
||||||
task->wgSize(1));
|
|
||||||
computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
|
|
||||||
|
|
||||||
++regInitIdx;
|
|
||||||
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
||||||
"Setting num WG Y: s[%d] = %x\n",
|
|
||||||
computeUnit->cu_id, simdId,
|
|
||||||
wfSlotId, wfDynId, physSgprIdx, wiCount);
|
|
||||||
break;
|
|
||||||
case GridWorkgroupCountZ:
|
|
||||||
physSgprIdx =
|
|
||||||
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
||||||
wiCount = ((task->gridSize(2) +
|
|
||||||
task->wgSize(2) - 1) /
|
|
||||||
task->wgSize(2));
|
|
||||||
computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
|
|
||||||
|
|
||||||
++regInitIdx;
|
|
||||||
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
||||||
"Setting num WG Z: s[%d] = %x\n",
|
|
||||||
computeUnit->cu_id, simdId,
|
|
||||||
wfSlotId, wfDynId, physSgprIdx, wiCount);
|
|
||||||
break;
|
|
||||||
case WorkgroupIdX:
|
case WorkgroupIdX:
|
||||||
physSgprIdx =
|
physSgprIdx =
|
||||||
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
||||||
|
|||||||
Reference in New Issue
Block a user