diff --git a/src/dev/hsa/hsa_packet_processor.cc b/src/dev/hsa/hsa_packet_processor.cc
index 41430191ea..68cdcf4386 100644
--- a/src/dev/hsa/hsa_packet_processor.cc
+++ b/src/dev/hsa/hsa_packet_processor.cc
@@ -282,11 +282,11 @@ HSAPacketProcessor::CmdQueueCmdDmaEvent::process()
 }
 
 void
-HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx)
+HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx, Tick delay)
 {
     RQLEntry *queue = regdQList[rl_idx];
-    if (!queue->aqlProcessEvent.scheduled() && !queue->getBarrierBit()) {
-        Tick processingTick = curTick() + pktProcessDelay;
+    if (!queue->aqlProcessEvent.scheduled()) {
+        Tick processingTick = curTick() + delay;
         schedule(queue->aqlProcessEvent, processingTick);
         DPRINTF(HSAPacketProcessor, "AQL processing scheduled at tick: %d\n",
                 processingTick);
@@ -295,42 +295,48 @@ HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx)
     }
 }
 
-bool
+void
+HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx)
+{
+    schedAQLProcessing(rl_idx, pktProcessDelay);
+}
+
+Q_STATE
 HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
 {
-    bool is_submitted = false;
+    Q_STATE is_submitted = BLOCKED_BPKT;
     SignalState *dep_sgnl_rd_st = &(regdQList[rl_idx]->depSignalRdState);
     // Dependency signals are not read yet. And this can only be a retry.
     // The retry logic will schedule the packet processor wakeup
     if (dep_sgnl_rd_st->pendingReads != 0) {
-        return false;
+        return BLOCKED_BPKT;
     }
     // `pkt` can be typecasted to any type of AQL packet since they all
     // have header information at offset zero
     auto disp_pkt = (_hsa_dispatch_packet_t *)pkt;
     hsa_packet_type_t pkt_type = PKT_TYPE(disp_pkt);
+    if (IS_BARRIER(disp_pkt) &&
+        regdQList[rl_idx]->compltnPending() > 0) {
+        // If this packet is using the "barrier bit" to enforce ordering with
+        // previous packets, and if there are outstanding packets, set the
+        // barrier bit for this queue and block the queue.
+        DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
+                " list ID = %d\n", __FUNCTION__, rl_idx);
+        regdQList[rl_idx]->setBarrierBit(true);
+        return BLOCKED_BBIT;
+    }
     if (pkt_type == HSA_PACKET_TYPE_VENDOR_SPECIFIC) {
         DPRINTF(HSAPacketProcessor, "%s: submitting vendor specific pkt" \
                 " active list ID = %d\n", __FUNCTION__, rl_idx);
         // Submit packet to HSA device (dispatcher)
         hsa_device->submitVendorPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
-        is_submitted = true;
+        is_submitted = UNBLOCKED;
     } else if (pkt_type == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
         DPRINTF(HSAPacketProcessor, "%s: submitting kernel dispatch pkt" \
                 " active list ID = %d\n", __FUNCTION__, rl_idx);
         // Submit packet to HSA device (dispatcher)
         hsa_device->submitDispatchPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
-        is_submitted = true;
-        /*
-          If this packet is using the "barrier bit" to enforce ordering with
-          subsequent kernels, set the bit for this queue now, after
-          dispatching.
-        */
-        if (IS_BARRIER(disp_pkt)) {
-            DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
-                    " list ID = %d\n", __FUNCTION__, rl_idx);
-            regdQList[rl_idx]->setBarrierBit(true);
-        }
+        is_submitted = UNBLOCKED;
     } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_AND) {
         DPRINTF(HSAPacketProcessor, "%s: Processing barrier packet" \
                 " active list ID = %d\n", __FUNCTION__, rl_idx);
@@ -387,7 +393,7 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
             // TODO: Completion signal of barrier packet to be
             // atomically decremented here
             finishPkt((void*)bar_and_pkt, rl_idx);
-            is_submitted = true;
+            is_submitted = UNBLOCKED;
             // Reset signal values
             dep_sgnl_rd_st->resetSigVals();
             // The completion signal is connected
@@ -447,6 +453,13 @@ HSAPacketProcessor::QueueProcessEvent::process()
             " dispIdx %d, active list ID = %d\n", __FUNCTION__,
             aqlRingBuffer->rdIdx(), aqlRingBuffer->wrIdx(),
             aqlRingBuffer->dispIdx(), rqIdx);
+    // If barrier bit is set, then this wakeup is a dummy wakeup
+    // just to model the processing time. Do nothing.
+    if (hsaPP->regdQList[rqIdx]->getBarrierBit()) {
+        DPRINTF(HSAPacketProcessor,
+                "Dummy wakeup with barrier bit for rdIdx %d\n", rqIdx);
+        return;
+    }
     // In the future, we may support batch processing of packets.
     // Then, we can just remove the break statements and the code
     // will support batch processing. That is why we are using a
@@ -456,7 +469,8 @@
         DPRINTF(HSAPacketProcessor, "%s: Attempting dispatch @ dispIdx[%d]\n",
                 __FUNCTION__, aqlRingBuffer->dispIdx());
         Addr host_addr = aqlRingBuffer->hostDispAddr();
-        if (hsaPP->processPkt(pkt, rqIdx, host_addr)) {
+        Q_STATE q_state = hsaPP->processPkt(pkt, rqIdx, host_addr);
+        if (q_state == UNBLOCKED) {
             aqlRingBuffer->incDispIdx(1);
             DPRINTF(HSAPacketProcessor, "%s: Increment dispIdx[%d]\n",
                     __FUNCTION__, aqlRingBuffer->dispIdx());
@@ -464,10 +478,20 @@
                 hsaPP->schedAQLProcessing(rqIdx);
             }
             break;
-        } else {
-            // This queue is blocked, scheduled a processing event
+        } else if (q_state == BLOCKED_BPKT) {
+            // This queue is blocked by barrier packet,
+            // schedule a processing event
             hsaPP->schedAQLProcessing(rqIdx);
             break;
+        } else if (q_state == BLOCKED_BBIT) {
+            // This queue is blocked by barrier bit, and processing event
+            // should be scheduled from finishPkt(). However, to elapse
+            // "pktProcessDelay" processing time, let us schedule a dummy
+            // wakeup once which will just wakeup and will do nothing.
+            hsaPP->schedAQLProcessing(rqIdx);
+            break;
+        } else {
+            panic("Unknown queue state\n");
         }
     }
 }
@@ -647,19 +671,23 @@ HSAPacketProcessor::finishPkt(void *pvPkt, uint32_t rl_idx)
 {
     HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
 
-    // if barrier bit was set, unset it here -- we assume that finishPkt is
-    // only called after the completion of a kernel
-    if (regdQList[rl_idx]->getBarrierBit()) {
+    // if barrier bit was set and this is the last
+    // outstanding packet from that queue,
+    // unset it here
+    if (regdQList[rl_idx]->getBarrierBit() &&
+        regdQList[rl_idx]->isLastOutstandingPkt()) {
         DPRINTF(HSAPacketProcessor,
                 "Unset barrier bit for active list ID %d\n", rl_idx);
         regdQList[rl_idx]->setBarrierBit(false);
-        // if pending kernels in the queue after this kernel, reschedule
-        if (regdQList[rl_idx]->dispPending()) {
-            DPRINTF(HSAPacketProcessor,
-                "Rescheduling active list ID %d after unsetting barrier "
-                "bit\n", rl_idx);
-            schedAQLProcessing(rl_idx);
-        }
+        panic_if(!regdQList[rl_idx]->dispPending(),
+                 "There should be pending kernels in this queue\n");
+        DPRINTF(HSAPacketProcessor,
+                "Rescheduling active list ID %d after unsetting barrier "
+                "bit\n", rl_idx);
+        // Try to schedule wakeup in the next cycle. There is a minimum
+        // pktProcessDelay for queue wake up. If that processing delay is
+        // elapsed, schedAQLProcessing will wakeup next tick.
+        schedAQLProcessing(rl_idx, 1);
     }
 
     // If set, then blocked schedule, so need to reschedule
diff --git a/src/dev/hsa/hsa_packet_processor.hh b/src/dev/hsa/hsa_packet_processor.hh
index 3ff7ad27f3..551d09ddaa 100644
--- a/src/dev/hsa/hsa_packet_processor.hh
+++ b/src/dev/hsa/hsa_packet_processor.hh
@@ -52,6 +52,19 @@
 // HSA runtime supports only 5 signals per barrier packet
 #define NumSignalsPerBarrier 5
 
+// Ideally, each queue should store this status and
+// the processPkt() should make decisions based on that
+// status variable.
+typedef enum {
+    UNBLOCKED = 0, // Unblocked queue, can submit packets.
+    BLOCKED_BBIT,  // Queue blocked by barrier bit.
+                   // Can submit packets after
+                   // previous packet completes.
+    BLOCKED_BPKT,  // Queue blocked by barrier packet.
+                   // Can submit packets after
+                   // barrier packet completes.
+} Q_STATE;
+
 class HSADevice;
 class HWScheduler;
 
@@ -151,6 +164,25 @@ class AQLRingBuffer
         return (_dispIdx < _wrIdx) && packet_type != HSA_PACKET_TYPE_INVALID;
     }
 
+    /**
+     * Packets aren't guaranteed to be completed in-order, and we need
+     * to know when the last packet is finished in order to un-set
+     * the barrier bit. In order to confirm if the packet at _rdIdx
+     * is the last packet, we check if the packets ahead of _rdIdx
+     * are finished. If they are, _rdIdx is the last packet. If not,
+     * there are other outstanding packets.
+     */
+    bool
+    isLastOutstandingPkt() const
+    {
+        for (int i = _rdIdx + 1; i < _dispIdx; i++) {
+            if (!_aqlComplete[i % _aqlBuf.size()]) {
+                return false;
+            }
+        }
+        return !_aqlComplete[_rdIdx % _aqlBuf.size()] && _rdIdx != _dispIdx;
+    }
+
     uint32_t nFree() const { return _aqlBuf.size() - (_wrIdx - _rdIdx); }
     void *ptr(uint32_t ix) { return _aqlBuf.data() + (ix % _aqlBuf.size()); }
     uint32_t numObjs() const { return _aqlBuf.size(); };
@@ -162,7 +194,7 @@ class AQLRingBuffer
     void incRdIdx(uint64_t value) { _rdIdx += value; }
     void incWrIdx(uint64_t value) { _wrIdx += value; }
     void incDispIdx(uint64_t value) { _dispIdx += value; }
-
+    uint64_t compltnPending() { return (_dispIdx - _rdIdx); }
 };
 
 typedef struct QueueContext {
@@ -233,10 +265,15 @@ class HSAPacketProcessor: public DmaDevice
             : aqlProcessEvent(hsaPP, rqIdx) {}
         QCntxt qCntxt;
         bool dispPending() { return qCntxt.aqlBuf->dispPending() > 0; }
+        uint64_t compltnPending() { return qCntxt.aqlBuf->compltnPending(); }
         SignalState depSignalRdState;
        QueueProcessEvent aqlProcessEvent;
         void setBarrierBit(bool set_val) { qCntxt.barrierBit = set_val; }
         bool getBarrierBit() const { return qCntxt.barrierBit; }
+        bool isLastOutstandingPkt() const
+        {
+            return qCntxt.aqlBuf->isLastOutstandingPkt();
+        }
     };
     // Keeps track of queueDescriptors of registered queues
     std::vector<RQLEntry *> regdQList;
@@ -250,7 +287,7 @@
     void dmaWriteVirt(Addr host_addr,
                       unsigned size, Event *event, void *data,
                       Tick delay = 0);
-    bool processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr);
+    Q_STATE processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr);
     void displayQueueDescriptor(int pid, uint32_t rl_idx);
 
   public:
@@ -290,6 +327,7 @@
     void finishPkt(void *pkt, uint32_t rl_idx);
     void finishPkt(void *pkt) { finishPkt(pkt, 0); }
     void schedAQLProcessing(uint32_t rl_idx);
+    void schedAQLProcessing(uint32_t rl_idx, Tick delay);
 
     class DepSignalsReadDmaEvent : public Event
     {