Restored from a whitespace-collapsed copy of this patch. Line breaks and
indentation were rebuilt from the hunk headers using conventional four-space
indentation, and the template arguments that had been stripped from
std::make_shared and safe_cast were put back (inferred from the declared
types GPUDynInstPtr and SenderState *). Context-line whitespace could not be
checked against the target tree, so run `git apply --check` (or `patch -l`)
before applying.

Summary of the change:
  * compute_unit.cc / compute_unit.hh: add ComputeUnit::doSQCInvalidate(),
    which builds a kernel-launch GPUDynInst, records the kernel id on it,
    sets the request's context from the instruction's wfDynId, flags the
    static instruction as Scalar and passes instruction and request to
    scalarMemoryPipe.injectScalarMemFence().
  * compute_unit.cc: ComputeUnit::SQCPort::recvTimingResp() now calls
    handleSQCReturn() only when the packet's sender state has a non-null
    wavefront; it still returns true in both cases.
  * shader.cc: Shader::prepareInvalidate() additionally calls
    doSQCInvalidate() for every CU whose index is a multiple of 4.

NOTE(review): the literal 4 is presumably the number of CUs that share one
SQC; confirm, and consider deriving it from configuration instead.
NOTE(review): the same req object is passed to doInvalidate() and then to
doSQCInvalidate(), which calls setContext() on it; confirm that sharing one
request between the two invalidates is intended.

diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index f28a8e39c7..ba4c14c4f0 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -397,9 +397,9 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
 }
 
 /**
- * trigger invalidate operation in the cu
+ * trigger invalidate operation in the CU
  *
- * req: request initialized in shader, carrying the invlidate flags
+ * req: request initialized in shader, carrying the invalidate flags
  */
 void
 ComputeUnit::doInvalidate(RequestPtr req, int kernId){
@@ -425,6 +425,26 @@ ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) {
     injectGlobalMemFence(gpuDynInst, true);
 }
 
+/**
+ * trigger SQC invalidate operation in the CU
+ *
+ * req: request initialized in shader, carrying the invalidate flags
+ */
+void
+ComputeUnit::doSQCInvalidate(RequestPtr req, int kernId){
+    GPUDynInstPtr gpuDynInst
+        = std::make_shared<GPUDynInst>(this, nullptr,
+            new KernelLaunchStaticInst(), getAndIncSeqNum());
+
+    // kern_id will be used in inv responses
+    gpuDynInst->kern_id = kernId;
+    // update contextId field
+    req->setContext(gpuDynInst->wfDynId);
+
+    gpuDynInst->staticInstruction()->setFlag(GPUStaticInst::Scalar);
+    scalarMemoryPipe.injectScalarMemFence(gpuDynInst, true, req);
+}
+
 // reseting SIMD register pools
 // I couldn't think of any other place and
 // I think it is needed in my implementation
@@ -1012,7 +1032,14 @@ ComputeUnit::DataPort::recvReqRetry()
 bool
 ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
 {
-    computeUnit->handleSQCReturn(pkt);
+    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
+    /** Process the response only if there is a wavefront associated with it.
+     * Otherwise, it is from SQC invalidate that was issued at kernel start
+     * and doesn't have a wavefront or instruction associated with it.
+     */
+    if (sender_state->wavefront != nullptr) {
+        computeUnit->handleSQCReturn(pkt);
+    }
 
     return true;
 }
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index 24324bb515..7e3f05d070 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -412,6 +412,7 @@ class ComputeUnit : public ClockedObject
 
     void doInvalidate(RequestPtr req, int kernId);
     void doFlush(GPUDynInstPtr gpuDynInst);
+    void doSQCInvalidate(RequestPtr req, int kernId);
 
     void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
     bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
index e13e7c9cf4..a83b413cf9 100644
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -221,6 +221,11 @@ Shader::prepareInvalidate(HSAQueueEntry *task) {
         // all necessary INV flags are all set now, call cu to execute
         cuList[i_cu]->doInvalidate(req, task->dispatchId());
 
+
+        if ((i_cu % 4) == 0) {
+            cuList[i_cu]->doSQCInvalidate(req, task->dispatchId());
+        }
+
         // I don't like this. This is intrusive coding.
         cuList[i_cu]->resetRegisterPool();
     }