From ae7476bcdca9c44fc45a057b1204b5be0ccfea4a Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 12 May 2023 18:28:00 -0500 Subject: [PATCH] arch-gcn3,arch-vega: Fix ds_read2st64_b32 This instruction has two issues. The first is that it should write two consecutive registers, starting with vdst because it is writing two dwords. The second is that the data assignment to the lanes from the dynamic instruction should cast to a U32 type otherwise the array index goes out of bounds and returns the wrong data. The first issue was fixed in GCN3 a few years ago in this review: https://gem5-review.googlesource.com/c/public/gem5/+/32236. This changeset makes the same change for Vega and applies the U32 cast in both ISAs. Tested with rocPRIM unit test. The test was failing before this changeset and now passes. Change-Id: Ifb110fc9a36ad198da7eaf86b1e3e37eccd3bb10 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70577 Maintainer: Matt Sinclair Reviewed-by: Matt Sinclair Tested-by: kokoro --- src/arch/amdgpu/gcn3/insts/instructions.cc | 4 ++-- src/arch/amdgpu/vega/insts/instructions.cc | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc b/src/arch/amdgpu/gcn3/insts/instructions.cc index 8c51af5187..478b1d38d0 100644 --- a/src/arch/amdgpu/gcn3/insts/instructions.cc +++ b/src/arch/amdgpu/gcn3/insts/instructions.cc @@ -32123,9 +32123,9 @@ namespace Gcn3ISA for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( + vdst0[lane] = (reinterpret_cast( gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( + vdst1[lane] = (reinterpret_cast( gpuDynInst->d_data))[lane * 2 + 1]; } } diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc index 45c84910f2..6c014bc107 100644 --- a/src/arch/amdgpu/vega/insts/instructions.cc +++ b/src/arch/amdgpu/vega/insts/instructions.cc @@ -35665,13 +35665,13 @@ namespace VegaISA Inst_DS__DS_READ2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst) { VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 2); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( + vdst0[lane] = (reinterpret_cast( gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( + vdst1[lane] = (reinterpret_cast( gpuDynInst->d_data))[lane * 2 + 1]; } }