From 4754e32219a70532ea5eb04d60fb7a42f0c53c21 Mon Sep 17 00:00:00 2001 From: Javier Garcia Hernandez Date: Mon, 12 Jul 2021 19:43:36 +0200 Subject: [PATCH 1/9] arch-arm: Fixes an error related to HTM error code handling Arguments of the function bits(), called in restore method, are the other way around. This leads to wrong retry handling. Jira Issue: https://gem5.atlassian.net/browse/GEM5-1041 Change-Id: I0748b1cad57bea5527ca585852d183bd75b4c9ef Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/47939 Reviewed-by: Jason Lowe-Power Maintainer: Jason Lowe-Power Tested-by: kokoro --- src/arch/arm/htm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arch/arm/htm.cc b/src/arch/arm/htm.cc index 0062b56df7..e94e4379ce 100644 --- a/src/arch/arm/htm.cc +++ b/src/arch/arm/htm.cc @@ -129,7 +129,7 @@ ArmISA::HTMCheckpoint::restore(ThreadContext *tc, HtmFailureFaultCause cause) case HtmFailureFaultCause::EXPLICIT: replaceBits(error_code, 14, 0, tcreason); replaceBits(error_code, 16, 1); - retry = bits(15, tcreason); + retry = bits(tcreason, 15); break; case HtmFailureFaultCause::MEMORY: replaceBits(error_code, 17, 1); From 46e62e5eb3ba0fc6aca68d3bfa35490fd69a1f40 Mon Sep 17 00:00:00 2001 From: Kyle Roarty Date: Mon, 12 Jul 2021 20:49:55 -0500 Subject: [PATCH 2/9] arch-gcn3: Free dest registers in non-memory Load DS insts Certain DS insts are classfied as Loads, but don't actually go through the memory pipeline. However, any instruction classified as a load marks its destination registers as free in the memory pipeline. Because these instructions didn't use the memory pipeline, they never freed their destination registers, which led to a deadlock. This patch explicitly calls the function used to free the destination registers in the execute() method of those Load instructions that don't use the memory pipeline. Change-Id: Ic2ac2e232c8fbad63d0c62c1862f2bdaeaba4edf Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48019 Reviewed-by: Matt Sinclair Reviewed-by: Bobby R. Bruce Maintainer: Matt Sinclair Tested-by: kokoro --- src/arch/amdgpu/gcn3/insts/instructions.cc | 27 ++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc b/src/arch/amdgpu/gcn3/insts/instructions.cc index a421454824..21ab58d177 100644 --- a/src/arch/amdgpu/gcn3/insts/instructions.cc +++ b/src/arch/amdgpu/gcn3/insts/instructions.cc @@ -32397,6 +32397,15 @@ namespace Gcn3ISA } vdst.write(); + + /** + * This is needed because we treat this instruction as a load + * but it's not an actual memory request. + * Without this, the destination register never gets marked as + * free, leading to a possible deadlock + */ + wf->computeUnit->vrf[wf->simdId]-> + scheduleWriteOperandsFromLoad(wf, gpuDynInst); } // execute // --- Inst_DS__DS_PERMUTE_B32 class methods --- @@ -32468,6 +32477,15 @@ namespace Gcn3ISA wf->decLGKMInstsIssued(); wf->rdLmReqsInPipe--; wf->validateRequestCounters(); + + /** + * This is needed because we treat this instruction as a load + * but it's not an actual memory request. + * Without this, the destination register never gets marked as + * free, leading to a possible deadlock + */ + wf->computeUnit->vrf[wf->simdId]-> + scheduleWriteOperandsFromLoad(wf, gpuDynInst); } // execute // --- Inst_DS__DS_BPERMUTE_B32 class methods --- @@ -32539,6 +32557,15 @@ namespace Gcn3ISA wf->decLGKMInstsIssued(); wf->rdLmReqsInPipe--; wf->validateRequestCounters(); + + /** + * This is needed because we treat this instruction as a load + * but it's not an actual memory request. + * Without this, the destination register never gets marked as + * free, leading to a possible deadlock + */ + wf->computeUnit->vrf[wf->simdId]-> + scheduleWriteOperandsFromLoad(wf, gpuDynInst); } // execute // --- Inst_DS__DS_ADD_U64 class methods --- From fb4a7e1e24676751af81d58cdc41381296ce20fb Mon Sep 17 00:00:00 2001 From: Kyle Roarty Date: Mon, 12 Jul 2021 22:48:02 -0500 Subject: [PATCH 3/9] gpu-compute: Fix off-by-one when creating an AddrRange The end value of an AddrRange is already not included in the range, so subtracting one from the end creates an off-by-one error. This patch removes the extra -1 that was used when determining the end of an AddrRange in allocateGpuVma Change-Id: I75659e9a7fabd991bb37be9aa40f8e409eb21154 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48020 Reviewed-by: Matt Sinclair Reviewed-by: Bobby R. Bruce Maintainer: Matt Sinclair Tested-by: kokoro --- src/gpu-compute/gpu_compute_driver.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpu-compute/gpu_compute_driver.cc b/src/gpu-compute/gpu_compute_driver.cc index 92ac6413d6..f794b43528 100644 --- a/src/gpu-compute/gpu_compute_driver.cc +++ b/src/gpu-compute/gpu_compute_driver.cc @@ -985,7 +985,7 @@ void GPUComputeDriver::allocateGpuVma(Request::CacheCoherenceFlags mtype, Addr start, Addr length) { - AddrRange range = AddrRange(start, start + length - 1); + AddrRange range = AddrRange(start, start + length); DPRINTF(GPUDriver, "Registering [%p - %p] with MTYPE %d\n", range.start(), range.end(), mtype); fatal_if(gpuVmas.insert(range, mtype) == gpuVmas.end(), From ccd03cf7046745a11af37d172f7d2c6fbbc7bbb0 Mon Sep 17 00:00:00 2001 From: Hoa Nguyen Date: Wed, 14 Jul 2021 12:42:31 -0700 Subject: [PATCH 4/9] cpu: remove O3 dependency of CheckerCPU Currently, compiling CheckerCPU uses the dyn_inst.hh header from O3CPU. However, including this header is not required and it causes gem5 failed to build when O3CPU is not part of CPU_MODELS. This change also involves moving the the dependency on src/cpu/o3/dyn_inst.hh to src/cpu/o3/cpu.cc and src/cpu/lsq_unit.cc, which previously includes src/cpu/o3/dyn_inst.hh implicitly through src/cpu/checker/cpu.hh. JIRA: https://gem5.atlassian.net/browse/GEM5-1025 Change-Id: I7664cd4b9591bf0a4635338fff576cb5f5cbfa10 Signed-off-by: Hoa Nguyen Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48079 Reviewed-by: Jason Lowe-Power Maintainer: Jason Lowe-Power Tested-by: kokoro --- src/cpu/checker/cpu.hh | 1 - src/cpu/o3/cpu.cc | 1 + src/cpu/o3/lsq_unit.cc | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cpu/checker/cpu.hh b/src/cpu/checker/cpu.hh index aebf522624..a191ae4828 100644 --- a/src/cpu/checker/cpu.hh +++ b/src/cpu/checker/cpu.hh @@ -51,7 +51,6 @@ #include "cpu/base.hh" #include "cpu/exec_context.hh" #include "cpu/inst_res.hh" -#include "cpu/o3/dyn_inst.hh" #include "cpu/pc_event.hh" #include "cpu/simple_thread.hh" #include "cpu/static_inst.hh" diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index fbdfcbd281..e35223608b 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -46,6 +46,7 @@ #include "cpu/activity.hh" #include "cpu/checker/cpu.hh" #include "cpu/checker/thread_context.hh" +#include "cpu/o3/dyn_inst.hh" #include "cpu/o3/limits.hh" #include "cpu/o3/thread_context.hh" #include "cpu/simple_thread.hh" diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 5394e4fdf8..039184d444 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -46,6 +46,7 @@ #include "base/str.hh" #include "config/the_isa.hh" #include "cpu/checker/cpu.hh" +#include "cpu/o3/dyn_inst.hh" #include "cpu/o3/limits.hh" #include "cpu/o3/lsq.hh" #include "debug/Activity.hh" From a02161874526737fdc01e7cf087c4ee533b371c4 Mon Sep 17 00:00:00 2001 From: Hoa Nguyen Date: Thu, 15 Jul 2021 00:40:02 +0000 Subject: [PATCH 5/9] arch-riscv: Revert change-45522 This reverts change: https://gem5-review.googlesource.com/c/public/gem5/+/45522. This reverts commit 1cf41d4c54c988ef4808d8efc1f6212e54a4c120. Reason for revert: The above commit caused booting Linux using RISCV either to hang or to take significantly time more than to finish. For the v21-1 release, the above commit will be reverted. JIRA: https://gem5.atlassian.net/browse/GEM5-1043 Change-Id: I58fbe96d7ea50031eba40ff49dabdef971faf6ff Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48099 Reviewed-by: Bobby R. Bruce Maintainer: Bobby R. Bruce Tested-by: kokoro --- src/arch/riscv/isa/formats/standard.isa | 140 ++++++++---------------- 1 file changed, 46 insertions(+), 94 deletions(-) diff --git a/src/arch/riscv/isa/formats/standard.isa b/src/arch/riscv/isa/formats/standard.isa index edb22683a6..dad2c2baf0 100644 --- a/src/arch/riscv/isa/formats/standard.isa +++ b/src/arch/riscv/isa/formats/standard.isa @@ -274,7 +274,7 @@ def template JumpExecute {{ } }}; -def template CSRExecuteRo {{ +def template CSRExecute {{ Fault %(class_name)s::execute(ExecContext *xc, Trace::InstRecord *traceData) const @@ -287,6 +287,8 @@ def template CSRExecuteRo {{ %(op_decl)s; %(op_rd)s; + RegVal data, olddata; + switch (csr) { case CSR_SATP: { auto pm = (PrivilegeMode)xc->readMiscReg(MISCREG_PRV); @@ -311,91 +313,55 @@ def template CSRExecuteRo {{ break; } - RegVal data; if (csr == CSR_FCSR) { - data = xc->readMiscReg(MISCREG_FFLAGS) | - (xc->readMiscReg(MISCREG_FRM) << FRM_OFFSET); - } else { - data = xc->readMiscReg(midx); - } - - DPRINTF(RiscvMisc, "Reading CSR %s: %#x\n", csrName, data); - - %(code)s; - %(op_wb)s; - - return NoFault; - } -}}; - -def template CSRExecuteRw {{ - Fault - %(class_name)s::execute(ExecContext *xc, - Trace::InstRecord *traceData) const - { - if (!valid) { - return std::make_shared( - csprintf("Illegal CSR index %#x\n", csr), machInst); - } - if (bits(csr, 11, 10) == 0x3) { - return std::make_shared( - csprintf("CSR %s is read-only\n", csrName), machInst); - } - - %(op_decl)s; - %(op_rd)s; - - switch (csr) { - case CSR_SATP: { - auto pm = (PrivilegeMode)xc->readMiscReg(MISCREG_PRV); - STATUS status = xc->readMiscReg(MISCREG_STATUS); - if (pm == PRV_U || (pm == PRV_S && status.tvm == 1)) { - return std::make_shared( - "SATP access in user mode or with TVM enabled\n", - machInst); - } - break; - } - case CSR_MSTATUS: { - auto pm = (PrivilegeMode)xc->readMiscReg(MISCREG_PRV); - if (pm != PrivilegeMode::PRV_M) { - return std::make_shared( - "MSTATUS is only accessibly in machine mode\n", - machInst); - } - break; - } - default: - break; - } - - RegVal data; - if (csr == CSR_FCSR) { - data = xc->readMiscReg(MISCREG_FFLAGS) | + olddata = xc->readMiscReg(MISCREG_FFLAGS) | (xc->readMiscReg(MISCREG_FRM) << FRM_OFFSET); } else { - data = xc->readMiscReg(midx); + olddata = xc->readMiscReg(midx); } + auto olddata_all = olddata; - RegVal original = data; - - DPRINTF(RiscvMisc, "Reading CSR %s: %#x\n", csrName, data & maskVal); + olddata &= maskVal; + DPRINTF(RiscvMisc, "Reading CSR %s: %#x\n", csrName, olddata); + data = olddata; %(code)s; - // We must keep those original bits not in the mask. Hidden bits should - // keep their original value. - data = (original & ~maskVal) | (data & maskVal); - - DPRINTF(RiscvMisc, "Writing %#x to CSR %s.\n", data, csrName); - - if (csr == CSR_FCSR) { - xc->setMiscReg(MISCREG_FFLAGS, bits(data, 4, 0)); - xc->setMiscReg(MISCREG_FRM, bits(data, 7, 5)); - } else { - xc->setMiscReg(midx, data); + data &= maskVal; + if (data != olddata) { + if (bits(csr, 11, 10) == 0x3) { + return std::make_shared( + csprintf("CSR %s is read-only\n", csrName), machInst); + } + auto newdata_all = data; + // We must keep those original bits not in mask. + // olddata and data only contain the bits visable + // in current privilige level. + newdata_all = (olddata_all & ~maskVal) | data; + DPRINTF(RiscvMisc, "Writing %#x to CSR %s.\n", + newdata_all, csrName); + switch (csr) { + case CSR_FCSR: + xc->setMiscReg(MISCREG_FFLAGS, bits(data, 4, 0)); + xc->setMiscReg(MISCREG_FRM, bits(data, 7, 5)); + break; + case CSR_MIP: case CSR_MIE: + case CSR_SIP: case CSR_SIE: + case CSR_UIP: case CSR_UIE: + case CSR_MSTATUS: case CSR_SSTATUS: case CSR_USTATUS: + if (newdata_all != olddata_all) { + xc->setMiscReg(midx, newdata_all); + } else { + return std::make_shared( + "Only bits in mask are allowed to be set\n", + machInst); + } + break; + default: + xc->setMiscReg(midx, data); + break; + } } - %(op_wb)s; return NoFault; } @@ -499,24 +465,10 @@ def format SystemOp(code, *opt_flags) {{ exec_output = BasicExecute.subst(iop) }}; -def template CSRDecode {{ - if (RS1) - return new %(class_name)sRw(machInst); - else - return new %(class_name)sRo(machInst); -}}; - def format CSROp(code, *opt_flags) {{ - iop = InstObjParams(name, Name + "Ro", 'CSROp', code, opt_flags) + iop = InstObjParams(name, Name, 'CSROp', code, opt_flags) header_output = BasicDeclare.subst(iop) decoder_output = BasicConstructor.subst(iop) - exec_output = CSRExecuteRo.subst(iop) - - iop = InstObjParams(name, Name + "Rw", 'CSROp', code, opt_flags) - header_output += BasicDeclare.subst(iop) - decoder_output += BasicConstructor.subst(iop) - exec_output += CSRExecuteRw.subst(iop) - - iop = InstObjParams(name, Name, 'CSROp', "", opt_flags) - decode_block = CSRDecode.subst(iop) + decode_block = BasicDecode.subst(iop) + exec_output = CSRExecute.subst(iop) }}; From a10106e94a0b895a25df56f0e3bf95f072c3f91b Mon Sep 17 00:00:00 2001 From: Giacomo Travaglini Date: Thu, 15 Jul 2021 09:46:43 +0100 Subject: [PATCH 6/9] arch-arm: Stage1&2 TableWalkers sharing same port This patch reverts part of the changes made by the removal of the Stage2MMU class [1]: Prior to that patch the stage1 and stage2 walkers were sharing the same port (which was instantiated in the Stage2MMU). By removing the Stage2MMU we provided every table walker a unique port. With this patch we are reintroducing port sharing to temporarily fix existing platforms using walker caches. (The long term design goal will be to have a unique page table walker) Those complain if we try to connect a single ported cache to 2 table walker ports (stage1 and stage2) [1]: https://gem5-review.googlesource.com/c/public/gem5/+/45780 Change-Id: Ib68ef97f1e9772a698771269c9a4ec4514f5d4d7 Signed-off-by: Giacomo Travaglini Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48200 Reviewed-by: Jason Lowe-Power Maintainer: Jason Lowe-Power Tested-by: kokoro --- src/arch/arm/ArmMMU.py | 8 +++----- src/arch/arm/mmu.cc | 9 ++++++++- src/arch/arm/mmu.hh | 4 ++++ src/arch/arm/table_walker.cc | 2 +- src/arch/arm/table_walker.hh | 1 + 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/arch/arm/ArmMMU.py b/src/arch/arm/ArmMMU.py index 00a0b3116a..1880f6f6c7 100644 --- a/src/arch/arm/ArmMMU.py +++ b/src/arch/arm/ArmMMU.py @@ -65,6 +65,8 @@ class ArmMMU(BaseMMU): itb = ArmITB() dtb = ArmDTB() + sys = Param.System(Parent.any, "system object parameter") + stage2_itb = Param.ArmTLB(ArmStage2TLB(), "Stage 2 Instruction TLB") stage2_dtb = Param.ArmTLB(ArmStage2TLB(), "Stage 2 Data TLB") @@ -80,12 +82,8 @@ class ArmMMU(BaseMMU): @classmethod def walkerPorts(cls): - return ["mmu.itb_walker.port", "mmu.dtb_walker.port", - "mmu.stage2_itb_walker.port", "mmu.stage2_dtb_walker.port"] + return ["mmu.itb_walker.port", "mmu.dtb_walker.port"] def connectWalkerPorts(self, iport, dport): self.itb_walker.port = iport self.dtb_walker.port = dport - - self.stage2_itb_walker.port = iport - self.stage2_dtb_walker.port = dport diff --git a/src/arch/arm/mmu.cc b/src/arch/arm/mmu.cc index 7392947cc4..30164b6303 100644 --- a/src/arch/arm/mmu.cc +++ b/src/arch/arm/mmu.cc @@ -47,10 +47,17 @@ using namespace ArmISA; MMU::MMU(const ArmMMUParams &p) : BaseMMU(p), itbStage2(p.stage2_itb), dtbStage2(p.stage2_dtb), + iport(p.itb_walker, p.sys->getRequestorId(p.itb_walker)), + dport(p.dtb_walker, p.sys->getRequestorId(p.dtb_walker)), itbWalker(p.itb_walker), dtbWalker(p.dtb_walker), itbStage2Walker(p.stage2_itb_walker), dtbStage2Walker(p.stage2_dtb_walker) -{} +{ + itbWalker->setPort(&iport); + dtbWalker->setPort(&dport); + itbStage2Walker->setPort(&iport); + dtbStage2Walker->setPort(&dport); +} void MMU::init() diff --git a/src/arch/arm/mmu.hh b/src/arch/arm/mmu.hh index f9ebeb3679..a129831b79 100644 --- a/src/arch/arm/mmu.hh +++ b/src/arch/arm/mmu.hh @@ -38,6 +38,7 @@ #ifndef __ARCH_ARM_MMU_HH__ #define __ARCH_ARM_MMU_HH__ +#include "arch/arm/table_walker.hh" #include "arch/arm/tlb.hh" #include "arch/generic/mmu.hh" @@ -69,6 +70,9 @@ class MMU : public BaseMMU TLB *itbStage2; TLB *dtbStage2; + TableWalker::Port iport; + TableWalker::Port dport; + TableWalker *itbWalker; TableWalker *dtbWalker; TableWalker *itbStage2Walker; diff --git a/src/arch/arm/table_walker.cc b/src/arch/arm/table_walker.cc index 5632be1037..6dc00eb265 100644 --- a/src/arch/arm/table_walker.cc +++ b/src/arch/arm/table_walker.cc @@ -61,7 +61,7 @@ using namespace ArmISA; TableWalker::TableWalker(const Params &p) : ClockedObject(p), requestorId(p.sys->getRequestorId(this)), - port(new Port(this, requestorId)), + port(nullptr), isStage2(p.is_stage2), tlb(NULL), currState(NULL), pending(false), numSquashable(p.num_squash_per_cycle), diff --git a/src/arch/arm/table_walker.hh b/src/arch/arm/table_walker.hh index 992e22466d..165a922950 100644 --- a/src/arch/arm/table_walker.hh +++ b/src/arch/arm/table_walker.hh @@ -1037,6 +1037,7 @@ class TableWalker : public ClockedObject void setMmu(MMU *_mmu) { mmu = _mmu; } void setTlb(TLB *_tlb) { tlb = _tlb; } + void setPort(Port *_port) { port = _port; } TLB* getTlb() { return tlb; } void memAttrs(ThreadContext *tc, TlbEntry &te, SCTLR sctlr, uint8_t texcb, bool s); From f8578e4b050198303d67f3cda6fe3d9cf5093b2e Mon Sep 17 00:00:00 2001 From: Kyle Roarty Date: Wed, 14 Jul 2021 15:50:48 -0500 Subject: [PATCH 7/9] gpu-compute: Fix TLB coalescer starvation Currently, we are storing coalesced accesses in an std::unordered_map indexed by a tick index, i.e. issue tick / coalescing window. If there are multiple coalesced requests, at different tick indexes, to the same virtual address, then the TLB coalescer will issue just the first one. However, std::unordered_map is not a sorted container and we issue coalesced requests by iterating through such container. This means that the coalesced request sent in TLBCoalescer::processProbeTLBEvent is not necessarly the oldest one. Because of this, in cases of high contention the oldest coalesced request will have a huge TLB access latency. To fix this issue, we will use an std::map which is a sorted container and therefore guarantees the oldest coalesced request will be sent first. Change-Id: I9c7ab32c038d5e60f6b55236266a27b0cae8bfb0 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48340 Reviewed-by: Matt Sinclair Reviewed-by: Matthew Poremba Maintainer: Matt Sinclair Tested-by: kokoro --- src/gpu-compute/tlb_coalescer.hh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpu-compute/tlb_coalescer.hh b/src/gpu-compute/tlb_coalescer.hh index b97801b034..fce87406b2 100644 --- a/src/gpu-compute/tlb_coalescer.hh +++ b/src/gpu-compute/tlb_coalescer.hh @@ -100,7 +100,7 @@ class TLBCoalescer : public ClockedObject * option is to change it to curTick(), so we coalesce based * on the receive time. */ - typedef std::unordered_map> + typedef std::map> CoalescingFIFO; CoalescingFIFO coalescerFIFO; From 1415308d103f73085239791f6578885fce315d37 Mon Sep 17 00:00:00 2001 From: Kyle Roarty Date: Wed, 14 Jul 2021 15:53:57 -0500 Subject: [PATCH 8/9] mem-ruby: Account for misaligned accesses in GPUCoalescer Previously, we assumed that the maximum number of requests that would be issued by an instruction was equal to the number of threads that were active for that instruction. However, if a thread has an access that crosses a cache line, that thread has a misaligned access, and needs to request both cache lines. This patch takes that into account by checking the status vector for each thread in that instruction to determine the number of requests. Change-Id: I1994962c46d504b48654dbd22bcd786c9f382fd9 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48341 Tested-by: kokoro Reviewed-by: Matt Sinclair Reviewed-by: Matthew Poremba Maintainer: Matt Sinclair --- src/mem/ruby/system/GPUCoalescer.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index c00e7c0986..2390ba6c47 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -645,7 +645,10 @@ GPUCoalescer::makeRequest(PacketPtr pkt) // of the exec_mask. int num_packets = 1; if (!m_usingRubyTester) { - num_packets = getDynInst(pkt)->exec_mask.count(); + num_packets = 0; + for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) { + num_packets += getDynInst(pkt)->getLaneStatus(i); + } } // the pkt is temporarily stored in the uncoalesced table until From 523a92f7f09178a62ee144ecf51697aa2de2228a Mon Sep 17 00:00:00 2001 From: Kyle Roarty Date: Tue, 20 Jul 2021 14:25:51 -0500 Subject: [PATCH 9/9] arch-gcn3: Implement large ds_read/write instructions This implements the 96 and 128b ds_read/write instructions in a similar fashion to the 3 and 4 dword flat_load/store instructions. These instructions are treated as reads/writes of 3 or 4 dwords, instead of as a single 96b/128b memory transaction, due to the limitations of the VecOperand class used in the amdgpu code. In order to handle treating the memory transaction as multiple dwords, the patch also adds in new initMemRead/initMemWrite functions for ds instructions. These are similar to the functions used in flat instructions for the same purpose. Change-Id: I0f2ba3cb7cf040abb876e6eae55a6d38149ee960 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48342 Tested-by: kokoro Reviewed-by: Alex Dutu Reviewed-by: Matt Sinclair Maintainer: Matt Sinclair --- src/arch/amdgpu/gcn3/insts/instructions.cc | 190 ++++++++++++++++++++- src/arch/amdgpu/gcn3/insts/instructions.hh | 8 + src/arch/amdgpu/gcn3/insts/op_encodings.hh | 38 +++++ 3 files changed, 232 insertions(+), 4 deletions(-) diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc b/src/arch/amdgpu/gcn3/insts/instructions.cc index 21ab58d177..79af7ac156 100644 --- a/src/arch/amdgpu/gcn3/insts/instructions.cc +++ b/src/arch/amdgpu/gcn3/insts/instructions.cc @@ -34335,9 +34335,52 @@ namespace Gcn3ISA void Inst_DS__DS_WRITE_B96::execute(GPUDynInstPtr gpuDynInst) { - panicUnimplemented(); + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2); + + addr.read(); + data0.read(); + data1.read(); + data2.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); } + void + Inst_DS__DS_WRITE_B96::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite<3>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B96::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + Inst_DS__DS_WRITE_B128::Inst_DS__DS_WRITE_B128(InFmt_DS *iFmt) : Inst_DS(iFmt, "ds_write_b128") { @@ -34354,9 +34397,56 @@ namespace Gcn3ISA void Inst_DS__DS_WRITE_B128::execute(GPUDynInstPtr gpuDynInst) { - panicUnimplemented(); + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2); + ConstVecOperandU32 data3(gpuDynInst, extData.DATA0 + 3); + + addr.read(); + data0.read(); + data1.read(); + data2.read(); + data3.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3] = data3[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); } + void + Inst_DS__DS_WRITE_B128::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite<4>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B128::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + Inst_DS__DS_READ_B96::Inst_DS__DS_READ_B96(InFmt_DS *iFmt) : Inst_DS(iFmt, "ds_read_b96") { @@ -34372,7 +34462,51 @@ namespace Gcn3ISA void Inst_DS__DS_READ_B96::execute(GPUDynInstPtr gpuDynInst) { - panicUnimplemented(); + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } + + void + Inst_DS__DS_READ_B96::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead<3>(gpuDynInst, offset); + } + + void + Inst_DS__DS_READ_B96::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2]; + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); } Inst_DS__DS_READ_B128::Inst_DS__DS_READ_B128(InFmt_DS *iFmt) @@ -34390,9 +34524,57 @@ namespace Gcn3ISA void Inst_DS__DS_READ_B128::execute(GPUDynInstPtr gpuDynInst) { - panicUnimplemented(); + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); } + void + Inst_DS__DS_READ_B128::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead<4>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_B128::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); + VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2]; + vdst3[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3]; + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + vdst3.write(); + } // completeAcc + Inst_MUBUF__BUFFER_LOAD_FORMAT_X ::Inst_MUBUF__BUFFER_LOAD_FORMAT_X(InFmt_MUBUF *iFmt) : Inst_MUBUF(iFmt, "buffer_load_format_x") diff --git a/src/arch/amdgpu/gcn3/insts/instructions.hh b/src/arch/amdgpu/gcn3/insts/instructions.hh index 1ee8220762..f49182ec8a 100644 --- a/src/arch/amdgpu/gcn3/insts/instructions.hh +++ b/src/arch/amdgpu/gcn3/insts/instructions.hh @@ -35226,6 +35226,8 @@ namespace Gcn3ISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_DS__DS_WRITE_B96 class Inst_DS__DS_WRITE_B128 : public Inst_DS @@ -35258,6 +35260,8 @@ namespace Gcn3ISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_DS__DS_WRITE_B128 class Inst_DS__DS_READ_B96 : public Inst_DS @@ -35290,6 +35294,8 @@ namespace Gcn3ISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_DS__DS_READ_B96 class Inst_DS__DS_READ_B128 : public Inst_DS @@ -35322,6 +35328,8 @@ namespace Gcn3ISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_DS__DS_READ_B128 class Inst_MUBUF__BUFFER_LOAD_FORMAT_X : public Inst_MUBUF diff --git a/src/arch/amdgpu/gcn3/insts/op_encodings.hh b/src/arch/amdgpu/gcn3/insts/op_encodings.hh index c4e107c903..a0612858db 100644 --- a/src/arch/amdgpu/gcn3/insts/op_encodings.hh +++ b/src/arch/amdgpu/gcn3/insts/op_encodings.hh @@ -416,6 +416,25 @@ namespace Gcn3ISA } } + template + void + initMemRead(GPUDynInstPtr gpuDynInst, Addr offset) + { + Wavefront *wf = gpuDynInst->wavefront(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane] + offset; + for (int i = 0; i < N; ++i) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * N + i] + = wf->ldsChunk->read( + vaddr + i*sizeof(VecElemU32)); + } + } + } + } + template void initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1) @@ -450,6 +469,25 @@ namespace Gcn3ISA } } + template + void + initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset) + { + Wavefront *wf = gpuDynInst->wavefront(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane] + offset; + for (int i = 0; i < N; ++i) { + wf->ldsChunk->write( + vaddr + i*sizeof(VecElemU32), + (reinterpret_cast( + gpuDynInst->d_data))[lane * N + i]); + } + } + } + } + template void initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)