From 01f2df4b8abf462746440fde7e4a4bcf3e12aa5d Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Sat, 13 Apr 2024 15:49:53 -0700 Subject: [PATCH 1/2] gpu-compute: Fix stat bucket sizes Change-Id: If30505515867a866c631cb117d3d22e19814a2f2 --- src/gpu-compute/compute_unit.cc | 6 +++--- src/gpu-compute/exec_stage.cc | 2 +- src/gpu-compute/shader.cc | 16 ++++++++-------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index e485aa6161..3e0b8070fd 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -2436,15 +2436,15 @@ ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent, instCyclesLdsPerSimd.init(cu->numVectorALUs); hitsPerTLBLevel.init(4); - execRateDist.init(0, 10, 2); - ldsBankConflictDist.init(0, cu->wfSize(), 2); + execRateDist.init(0, 10-1, 2); + ldsBankConflictDist.init(0, cu->wfSize()-1, 2); pageDivergenceDist.init(1, cu->wfSize(), 4); controlFlowDivergenceDist.init(1, cu->wfSize(), 4); activeLanesPerGMemInstrDist.init(1, cu->wfSize(), 4); activeLanesPerLMemInstrDist.init(1, cu->wfSize(), 4); - headTailLatency.init(0, 1000000, 10000).flags(statistics::pdf | + headTailLatency.init(0, 1000000-1, 10000).flags(statistics::pdf | statistics::oneline); waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1); instInterleave.init(cu->numVectorALUs, 0, 20, 1); diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc index bcba938cd8..f2b847c8a7 100644 --- a/src/gpu-compute/exec_stage.cc +++ b/src/gpu-compute/exec_stage.cc @@ -216,7 +216,7 @@ ExecStage::ExecStageStats::ExecStageStats(statistics::Group *parent) ComputeUnit *compute_unit = static_cast<ComputeUnit*>(parent); spc.init(0, compute_unit->numExeUnits(), 1); - idleDur.init(0, 75, 5); + idleDur.init(0, 75-1, 5); numCyclesWithInstrTypeIssued.init(compute_unit->numExeUnits()); numCyclesWithNoInstrTypeIssued.init(compute_unit->numExeUnits()); diff --git 
a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index 6e3d556026..792ecb6678 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -584,31 +584,31 @@ Shader::ShaderStats::ShaderStats(statistics::Group *parent, int wf_size) "vector instruction destination operand distribution") { allLatencyDist - .init(0, 1600000, 10000) + .init(0, 1600000-1, 10000) .flags(statistics::pdf | statistics::oneline); loadLatencyDist - .init(0, 1600000, 10000) + .init(0, 1600000-1, 10000) .flags(statistics::pdf | statistics::oneline); storeLatencyDist - .init(0, 1600000, 10000) + .init(0, 1600000-1, 10000) .flags(statistics::pdf | statistics::oneline); initToCoalesceLatency - .init(0, 1600000, 10000) + .init(0, 1600000-1, 10000) .flags(statistics::pdf | statistics::oneline); rubyNetworkLatency - .init(0, 1600000, 10000) + .init(0, 1600000-1, 10000) .flags(statistics::pdf | statistics::oneline); gmEnqueueLatency - .init(0, 1600000, 10000) + .init(0, 1600000-1, 10000) .flags(statistics::pdf | statistics::oneline); gmToCompleteLatency - .init(0, 1600000, 10000) + .init(0, 1600000-1, 10000) .flags(statistics::pdf | statistics::oneline); coalsrLineAddresses @@ -624,7 +624,7 @@ Shader::ShaderStats::ShaderStats(statistics::Group *parent, int wf_size) ccprintf(namestr, "%s.cacheBlockRoundTrip%d", static_cast<Shader*>(parent)->name(), idx); cacheBlockRoundTrip[idx] - .init(0, 1600000, 10000) + .init(0, 1600000-1, 10000) .name(namestr.str()) .desc("Coalsr-to-coalsr time for the Nth cache block in an inst") .flags(statistics::pdf | statistics::oneline); From 3db6e86fea8d48c862d90798b5e69130c1b52823 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Sun, 14 Apr 2024 12:22:57 -0700 Subject: [PATCH 2/2] arch-vega: Fix string check warnings on fast build gem5.fast does not currently build if the GPU model is built. This fixes the array-bounds warnings allowing gem5.fast to build again. 
Change-Id: I463c2847c3ecfd2257a70418fa247090b0493f9b --- src/arch/amdgpu/vega/operand.hh | 81 ++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 27 deletions(-) diff --git a/src/arch/amdgpu/vega/operand.hh b/src/arch/amdgpu/vega/operand.hh index 43ff3400e6..698161d918 100644 --- a/src/arch/amdgpu/vega/operand.hh +++ b/src/arch/amdgpu/vega/operand.hh @@ -517,12 +517,23 @@ namespace VegaISA switch(_opIdx) { case REG_EXEC_LO: { - ScalarRegU64 exec_mask = _gpuDynInst->wavefront()-> - execMask().to_ullong(); - std::memcpy((void*)srfData.data(), (void*)&exec_mask, - sizeof(exec_mask)); - DPRINTF(GPUSRF, "Read EXEC\n"); - DPRINTF(GPUSRF, "EXEC = %#x\n", exec_mask); + if constexpr (NumDwords == 2) { + ScalarRegU64 exec_mask = _gpuDynInst->wavefront()-> + execMask().to_ullong(); + std::memcpy((void*)srfData.data(), (void*)&exec_mask, + sizeof(exec_mask)); + DPRINTF(GPUSRF, "Read EXEC\n"); + DPRINTF(GPUSRF, "EXEC = %#x\n", exec_mask); + } else { + ScalarRegU64 exec_mask = _gpuDynInst->wavefront()-> + execMask().to_ullong(); + + ScalarRegU32 exec_mask_lo = bits(exec_mask, 31, 0); + std::memcpy((void*)srfData.data(), + (void*)&exec_mask_lo, sizeof(exec_mask_lo)); + DPRINTF(GPUSRF, "Read EXEC_LO\n"); + DPRINTF(GPUSRF, "EXEC_LO = %#x\n", exec_mask_lo); + } } break; case REG_EXEC_HI: @@ -550,39 +561,55 @@ namespace VegaISA break; case REG_SHARED_BASE: { - ComputeUnit *cu = _gpuDynInst->computeUnit(); - ScalarRegU64 shared_base = cu->shader->ldsApe().base; - std::memcpy((void*)srfData.data(), (void*)&shared_base, - sizeof(shared_base)); - DPRINTF(GPUSRF, "Read SHARED_BASE = %#x\n", shared_base); + assert(NumDwords == 2); + if constexpr (NumDwords == 2) { + ComputeUnit *cu = _gpuDynInst->computeUnit(); + ScalarRegU64 shared_base = cu->shader->ldsApe().base; + std::memcpy((void*)srfData.data(), (void*)&shared_base, + sizeof(srfData)); + DPRINTF(GPUSRF, "Read SHARED_BASE = %#x\n", + shared_base); + } } break; case REG_SHARED_LIMIT: { - ComputeUnit *cu = 
_gpuDynInst->computeUnit(); - ScalarRegU64 shared_limit = cu->shader->ldsApe().limit; - std::memcpy((void*)srfData.data(), (void*)&shared_limit, - sizeof(shared_limit)); - DPRINTF(GPUSRF, "Read SHARED_LIMIT = %#x\n", shared_limit); + assert(NumDwords == 2); + if constexpr (NumDwords == 2) { + ComputeUnit *cu = _gpuDynInst->computeUnit(); + ScalarRegU64 shared_limit = cu->shader->ldsApe().limit; + std::memcpy((void*)srfData.data(), + (void*)&shared_limit, sizeof(srfData)); + DPRINTF(GPUSRF, "Read SHARED_LIMIT = %#x\n", + shared_limit); + } } break; case REG_PRIVATE_BASE: { - ComputeUnit *cu = _gpuDynInst->computeUnit(); - ScalarRegU64 priv_base = cu->shader->scratchApe().base; - std::memcpy((void*)srfData.data(), (void*)&priv_base, - sizeof(priv_base)); - DPRINTF(GPUSRF, "Read PRIVATE_BASE = %#x\n", priv_base); + assert(NumDwords == 2); + if constexpr (NumDwords == 2) { + ComputeUnit *cu = _gpuDynInst->computeUnit(); + ScalarRegU64 priv_base = cu->shader->scratchApe().base; + std::memcpy((void*)srfData.data(), (void*)&priv_base, + sizeof(srfData)); + DPRINTF(GPUSRF, "Read PRIVATE_BASE = %#x\n", + priv_base); + } } break; case REG_PRIVATE_LIMIT: { - ComputeUnit *cu = _gpuDynInst->computeUnit(); - ScalarRegU64 priv_limit = cu->shader->scratchApe().limit; - std::memcpy((void*)srfData.data(), (void*)&priv_limit, - sizeof(priv_limit)); - DPRINTF(GPUSRF, "Read PRIVATE_LIMIT = %#x\n", - priv_limit); + assert(NumDwords == 2); + if constexpr (NumDwords == 2) { + ComputeUnit *cu = _gpuDynInst->computeUnit(); + ScalarRegU64 priv_limit = + cu->shader->scratchApe().limit; + std::memcpy((void*)srfData.data(), (void*)&priv_limit, + sizeof(srfData)); + DPRINTF(GPUSRF, "Read PRIVATE_LIMIT = %#x\n", + priv_limit); + } } break; case REG_POS_HALF: