diff --git a/src/arch/amdgpu/vega/operand.hh b/src/arch/amdgpu/vega/operand.hh index 43ff3400e6..698161d918 100644 --- a/src/arch/amdgpu/vega/operand.hh +++ b/src/arch/amdgpu/vega/operand.hh @@ -517,12 +517,23 @@ namespace VegaISA switch(_opIdx) { case REG_EXEC_LO: { - ScalarRegU64 exec_mask = _gpuDynInst->wavefront()-> - execMask().to_ullong(); - std::memcpy((void*)srfData.data(), (void*)&exec_mask, - sizeof(exec_mask)); - DPRINTF(GPUSRF, "Read EXEC\n"); - DPRINTF(GPUSRF, "EXEC = %#x\n", exec_mask); + if constexpr (NumDwords == 2) { + ScalarRegU64 exec_mask = _gpuDynInst->wavefront()-> + execMask().to_ullong(); + std::memcpy((void*)srfData.data(), (void*)&exec_mask, + sizeof(exec_mask)); + DPRINTF(GPUSRF, "Read EXEC\n"); + DPRINTF(GPUSRF, "EXEC = %#x\n", exec_mask); + } else { + ScalarRegU64 exec_mask = _gpuDynInst->wavefront()-> + execMask().to_ullong(); + + ScalarRegU32 exec_mask_lo = bits(exec_mask, 31, 0); + std::memcpy((void*)srfData.data(), + (void*)&exec_mask_lo, sizeof(exec_mask_lo)); + DPRINTF(GPUSRF, "Read EXEC_LO\n"); + DPRINTF(GPUSRF, "EXEC_LO = %#x\n", exec_mask_lo); + } } break; case REG_EXEC_HI: @@ -550,39 +561,55 @@ namespace VegaISA break; case REG_SHARED_BASE: { - ComputeUnit *cu = _gpuDynInst->computeUnit(); - ScalarRegU64 shared_base = cu->shader->ldsApe().base; - std::memcpy((void*)srfData.data(), (void*)&shared_base, - sizeof(shared_base)); - DPRINTF(GPUSRF, "Read SHARED_BASE = %#x\n", shared_base); + assert(NumDwords == 2); + if constexpr (NumDwords == 2) { + ComputeUnit *cu = _gpuDynInst->computeUnit(); + ScalarRegU64 shared_base = cu->shader->ldsApe().base; + std::memcpy((void*)srfData.data(), (void*)&shared_base, + sizeof(srfData)); + DPRINTF(GPUSRF, "Read SHARED_BASE = %#x\n", + shared_base); + } } break; case REG_SHARED_LIMIT: { - ComputeUnit *cu = _gpuDynInst->computeUnit(); - ScalarRegU64 shared_limit = cu->shader->ldsApe().limit; - std::memcpy((void*)srfData.data(), (void*)&shared_limit, - sizeof(shared_limit)); - DPRINTF(GPUSRF, "Read SHARED_LIMIT = %#x\n", shared_limit); + assert(NumDwords == 2); + if constexpr (NumDwords == 2) { + ComputeUnit *cu = _gpuDynInst->computeUnit(); + ScalarRegU64 shared_limit = cu->shader->ldsApe().limit; + std::memcpy((void*)srfData.data(), + (void*)&shared_limit, sizeof(srfData)); + DPRINTF(GPUSRF, "Read SHARED_LIMIT = %#x\n", + shared_limit); + } } break; case REG_PRIVATE_BASE: { - ComputeUnit *cu = _gpuDynInst->computeUnit(); - ScalarRegU64 priv_base = cu->shader->scratchApe().base; - std::memcpy((void*)srfData.data(), (void*)&priv_base, - sizeof(priv_base)); - DPRINTF(GPUSRF, "Read PRIVATE_BASE = %#x\n", priv_base); + assert(NumDwords == 2); + if constexpr (NumDwords == 2) { + ComputeUnit *cu = _gpuDynInst->computeUnit(); + ScalarRegU64 priv_base = cu->shader->scratchApe().base; + std::memcpy((void*)srfData.data(), (void*)&priv_base, + sizeof(srfData)); + DPRINTF(GPUSRF, "Read PRIVATE_BASE = %#x\n", + priv_base); + } } break; case REG_PRIVATE_LIMIT: { - ComputeUnit *cu = _gpuDynInst->computeUnit(); - ScalarRegU64 priv_limit = cu->shader->scratchApe().limit; - std::memcpy((void*)srfData.data(), (void*)&priv_limit, - sizeof(priv_limit)); - DPRINTF(GPUSRF, "Read PRIVATE_LIMIT = %#x\n", - priv_limit); + assert(NumDwords == 2); + if constexpr (NumDwords == 2) { + ComputeUnit *cu = _gpuDynInst->computeUnit(); + ScalarRegU64 priv_limit = + cu->shader->scratchApe().limit; + std::memcpy((void*)srfData.data(), (void*)&priv_limit, + sizeof(srfData)); + DPRINTF(GPUSRF, "Read PRIVATE_LIMIT = %#x\n", + priv_limit); + } } break; case REG_POS_HALF: diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 5daa82e576..daad5e9b40 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -2476,15 +2476,15 @@ ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent, instCyclesLdsPerSimd.init(cu->numVectorALUs); hitsPerTLBLevel.init(4); - execRateDist.init(0, 10, 2); - ldsBankConflictDist.init(0, cu->wfSize(), 2); + execRateDist.init(0, 10-1, 2); + ldsBankConflictDist.init(0, cu->wfSize()-1, 2); pageDivergenceDist.init(1, cu->wfSize(), 4); controlFlowDivergenceDist.init(1, cu->wfSize(), 4); activeLanesPerGMemInstrDist.init(1, cu->wfSize(), 4); activeLanesPerLMemInstrDist.init(1, cu->wfSize(), 4); - headTailLatency.init(0, 1000000, 10000).flags(statistics::pdf | + headTailLatency.init(0, 1000000-1, 10000).flags(statistics::pdf | statistics::oneline); waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1); instInterleave.init(cu->numVectorALUs, 0, 20, 1); diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc index bcba938cd8..f2b847c8a7 100644 --- a/src/gpu-compute/exec_stage.cc +++ b/src/gpu-compute/exec_stage.cc @@ -216,7 +216,7 @@ ExecStage::ExecStageStats::ExecStageStats(statistics::Group *parent) ComputeUnit *compute_unit = static_cast(parent); spc.init(0, compute_unit->numExeUnits(), 1); - idleDur.init(0, 75, 5); + idleDur.init(0, 75-1, 5); numCyclesWithInstrTypeIssued.init(compute_unit->numExeUnits()); numCyclesWithNoInstrTypeIssued.init(compute_unit->numExeUnits()); diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index 437d590b70..13b03b0a34 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -607,31 +607,31 @@ Shader::ShaderStats::ShaderStats(statistics::Group *parent, int wf_size) "vector instruction destination operand distribution") { allLatencyDist - .init(0, 1600000, 10000) + .init(0, 1600000-1, 10000) .flags(statistics::pdf | statistics::oneline); loadLatencyDist - .init(0, 1600000, 10000) + .init(0, 1600000-1, 10000) .flags(statistics::pdf | statistics::oneline); storeLatencyDist - .init(0, 1600000, 10000) + .init(0, 1600000-1, 10000) .flags(statistics::pdf | statistics::oneline); initToCoalesceLatency - .init(0, 1600000, 10000) + .init(0, 1600000-1, 10000) .flags(statistics::pdf | statistics::oneline); rubyNetworkLatency - .init(0, 1600000, 10000) + .init(0, 1600000-1, 10000) .flags(statistics::pdf | statistics::oneline); gmEnqueueLatency - .init(0, 1600000, 10000) + .init(0, 1600000-1, 10000) .flags(statistics::pdf | statistics::oneline); gmToCompleteLatency - .init(0, 1600000, 10000) + .init(0, 1600000-1, 10000) .flags(statistics::pdf | statistics::oneline); coalsrLineAddresses @@ -647,7 +647,7 @@ Shader::ShaderStats::ShaderStats(statistics::Group *parent, int wf_size) ccprintf(namestr, "%s.cacheBlockRoundTrip%d", static_cast(parent)->name(), idx); cacheBlockRoundTrip[idx] - .init(0, 1600000, 10000) + .init(0, 1600000-1, 10000) .name(namestr.str()) .desc("Coalsr-to-coalsr time for the Nth cache block in an inst") .flags(statistics::pdf | statistics::oneline);