diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index 404beab16c..4f98f18d16 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -441,12 +441,17 @@ void PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, SDMAQueueDesc *mqd, uint16_t vmid) { + uint32_t rlc_size = 4UL << bits(mqd->sdmax_rlcx_rb_cntl, 6, 1); + Addr rptr_wb_addr = mqd->sdmax_rlcx_rb_rptr_addr_hi; + rptr_wb_addr <<= 32; + rptr_wb_addr |= mqd->sdmax_rlcx_rb_rptr_addr_lo; + DPRINTF(PM4PacketProcessor, "SDMAMQD: rb base: %#lx rptr: %#x/%#x wptr: " - "%#x/%#x ib: %#x/%#x size: %d ctrl: %#x\n", mqd->rb_base, - mqd->sdmax_rlcx_rb_rptr, mqd->sdmax_rlcx_rb_rptr_hi, + "%#x/%#x ib: %#x/%#x size: %d ctrl: %#x rptr wb addr: %#lx\n", + mqd->rb_base, mqd->sdmax_rlcx_rb_rptr, mqd->sdmax_rlcx_rb_rptr_hi, mqd->sdmax_rlcx_rb_wptr, mqd->sdmax_rlcx_rb_wptr_hi, mqd->sdmax_rlcx_ib_base_lo, mqd->sdmax_rlcx_ib_base_hi, - mqd->sdmax_rlcx_ib_size, mqd->sdmax_rlcx_rb_cntl); + rlc_size, mqd->sdmax_rlcx_rb_cntl, rptr_wb_addr); // Engine 2 points to SDMA0 while engine 3 points to SDMA1 assert(pkt->engineSel == 2 || pkt->engineSel == 3); @@ -454,7 +459,8 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, // Register RLC queue with SDMA sdma_eng->registerRLCQueue(pkt->doorbellOffset << 2, - mqd->rb_base << 8); + mqd->rb_base << 8, rlc_size, + rptr_wb_addr); // Register doorbell with GPU device gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng); diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc index e9a4c176d8..59c5027c85 100644 --- a/src/dev/amdgpu/sdma_engine.cc +++ b/src/dev/amdgpu/sdma_engine.cc @@ -161,7 +161,8 @@ SDMAEngine::translate(Addr vaddr, Addr size) } void -SDMAEngine::registerRLCQueue(Addr doorbell, Addr rb_base) +SDMAEngine::registerRLCQueue(Addr doorbell, Addr rb_base, uint32_t size, + Addr rptr_wb_addr) { // Get first free RLC if (!rlc0.valid()) { @@ -171,19 +172,19 @@ SDMAEngine::registerRLCQueue(Addr doorbell, Addr rb_base) rlc0.base(rb_base); rlc0.rptr(0); rlc0.wptr(0); + rlc0.rptrWbAddr(rptr_wb_addr); rlc0.processing(false); - // TODO: size - I think pull from MQD 2^rb_cntrl[6:1]-1 - rlc0.size(1024*1024); + rlc0.size(size); } else if (!rlc1.valid()) { DPRINTF(SDMAEngine, "Doorbell %lx mapped to RLC1\n", doorbell); rlcInfo[1] = doorbell; rlc1.valid(true); rlc1.base(rb_base); - rlc1.rptr(1); - rlc1.wptr(1); + rlc1.rptr(0); + rlc1.wptr(0); + rlc1.rptrWbAddr(rptr_wb_addr); rlc1.processing(false); - // TODO: size - I think pull from MQD 2^rb_cntrl[6:1]-1 - rlc1.size(1024*1024); + rlc1.size(size); } else { panic("No free RLCs. Check they are properly unmapped."); } @@ -291,6 +292,17 @@ SDMAEngine::decodeNext(SDMAQueue *q) { decodeHeader(q, header); }); dmaReadVirt(q->rptr(), sizeof(uint32_t), cb, &cb->dmaBuffer); } else { + // The driver expects the rptr to be written back to host memory + // periodically. In simulation, we writeback rptr after each burst of + // packets from a doorbell, rather than using the cycle count which + // is not accurate in all simulation settings (e.g., KVM). + DPRINTF(SDMAEngine, "Writing rptr %#lx back to host addr %#lx\n", + q->globalRptr(), q->rptrWbAddr()); + if (q->rptrWbAddr()) { + auto cb = new DmaVirtCallback( + [ = ](const uint64_t &) { }, q->globalRptr()); + dmaWriteVirt(q->rptrWbAddr(), sizeof(Addr), cb, &cb->dmaBuffer); + } q->processing(false); if (q->parent()) { DPRINTF(SDMAEngine, "SDMA switching queues\n"); @@ -1158,6 +1170,7 @@ SDMAEngine::setGfxRptrLo(uint32_t data) { gfxRptr = insertBits(gfxRptr, 31, 0, 0); gfxRptr |= data; + gfx.rptrWbAddr(getGARTAddr(gfxRptr)); } void @@ -1165,6 +1178,7 @@ SDMAEngine::setGfxRptrHi(uint32_t data) { gfxRptr = insertBits(gfxRptr, 63, 32, 0); gfxRptr |= ((uint64_t)data) << 32; + gfx.rptrWbAddr(getGARTAddr(gfxRptr)); } void @@ -1236,6 +1250,7 @@ SDMAEngine::setPageRptrLo(uint32_t data) { pageRptr = insertBits(pageRptr, 31, 0, 0); pageRptr |= data; + page.rptrWbAddr(getGARTAddr(pageRptr)); } void @@ -1243,6 +1258,7 @@ SDMAEngine::setPageRptrHi(uint32_t data) { pageRptr = insertBits(pageRptr, 63, 32, 0); pageRptr |= ((uint64_t)data) << 32; + page.rptrWbAddr(getGARTAddr(pageRptr)); } void diff --git a/src/dev/amdgpu/sdma_engine.hh b/src/dev/amdgpu/sdma_engine.hh index 6fe7a8e565..d0afaf7a4a 100644 --- a/src/dev/amdgpu/sdma_engine.hh +++ b/src/dev/amdgpu/sdma_engine.hh @@ -58,6 +58,8 @@ class SDMAEngine : public DmaVirtDevice Addr _rptr; Addr _wptr; Addr _size; + Addr _rptr_wb_addr = 0; + Addr _global_rptr = 0; bool _valid; bool _processing; SDMAQueue *_parent; @@ -72,6 +74,8 @@ class SDMAEngine : public DmaVirtDevice Addr wptr() { return _base + _wptr; } Addr getWptr() { return _wptr; } Addr size() { return _size; } + Addr rptrWbAddr() { return _rptr_wb_addr; } + Addr globalRptr() { return _global_rptr; } bool valid() { return _valid; } bool processing() { return _processing; } SDMAQueue* parent() { return _parent; } @@ -82,22 +86,27 @@ class SDMAEngine : public DmaVirtDevice void incRptr(uint32_t value) { - //assert((_rptr + value) <= (_size << 1)); _rptr = (_rptr + value) % _size; + _global_rptr += value; } - void rptr(Addr value) { _rptr = value; } + void + rptr(Addr value) + { + _rptr = value; + _global_rptr = value; + } void setWptr(Addr value) { - //assert(value <= (_size << 1)); _wptr = value % _size; } void wptr(Addr value) { _wptr = value; } void size(Addr value) { _size = value; } + void rptrWbAddr(Addr value) { _rptr_wb_addr = value; } void valid(bool v) { _valid = v; } void processing(bool value) { _processing = value; } void parent(SDMAQueue* q) { _parent = q; } @@ -268,7 +277,8 @@ class SDMAEngine : public DmaVirtDevice /** * Methods for RLC queues */ - void registerRLCQueue(Addr doorbell, Addr rb_base); + void registerRLCQueue(Addr doorbell, Addr rb_base, uint32_t size, + Addr rptr_wb_addr); void unregisterRLCQueue(Addr doorbell); void deallocateRLCQueues();