dev-amdgpu: Support multiple CPs and MMIO AddrRanges

Currently gem5 assumes that there is only one command processor (CP), which contains the PM4 packet processor. Some GPU devices have multiple CPs, and the driver tests each of them individually during POST whether or not they are used. Therefore, these additional CPs need to be supported. This commit allows for multiple PM4 packet processors, each representing one CP. Each of these processors has its own independent MMIO address range. To support ranges more easily, MMIO addresses are now matched against an AddrRange to select a PM4 packet processor instead of using the hard-coded constexpr MMIO start and size pairs. By default only one PM4 packet processor is created, so the functionality of the simulation is unchanged for devices currently supported in gem5. Change-Id: I977f4fd3a169ef4a78671a4fb58c8ea0e19bf52c
2024-02-13 17:43:23 -06:00
parent 39153cd234
commit 823b5a6eb8
10 changed files with 245 additions and 151 deletions
--- a/src/dev/amdgpu/amdgpu_device.cc
+++ b/src/dev/amdgpu/amdgpu_device.cc
@@ -54,8 +54,7 @@ namespace gem5

 AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
    : PciDevice(p), gpuMemMgr(p.memory_manager), deviceIH(p.device_ih),
-      pm4PktProc(p.pm4_pkt_proc), cp(p.cp),
-      checkpoint_before_mmios(p.checkpoint_before_mmios),
+      cp(p.cp), checkpoint_before_mmios(p.checkpoint_before_mmios),
      init_interrupt_count(0), _lastVMID(0),
      deviceMem(name() + ".deviceMem", p.memories, false, "", false)
 {
@@ -81,6 +80,16 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
        romRange = RangeSize(VGA_ROM_DEFAULT, ROM_SIZE);
    }

+    if (p.device_name == "Vega10") {
+        gfx_version = GfxVersion::gfx900;
+    } else if (p.device_name == "MI100") {
+        gfx_version = GfxVersion::gfx908;
+    } else if (p.device_name == "MI200") {
+        gfx_version = GfxVersion::gfx90a;
+    } else {
+        panic("Unknown GPU device %s\n", p.device_name);
+    }
+
    if (p.trace_file != "") {
        mmioReader.readMMIOTrace(p.trace_file);
    }
@@ -126,8 +135,22 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
        panic("Unknown GPU device %s\n", p.device_name);
    }

+    // Setup PM4 packet processors and sanity check IDs
+    std::set<int> pm4_ids;
+    for (auto& pm4 : p.pm4_pkt_procs) {
+        pm4->setGPUDevice(this);
+        fatal_if(pm4_ids.count(pm4->getIpId()),
+                "Two PM4s with same IP IDs is not allowed");
+        pm4_ids.insert(pm4->getIpId());
+        pm4PktProcs.insert({pm4->getIpId(), pm4});
+
+        pm4Ranges.insert({pm4->getMMIORange(), pm4});
+    }
+
+    // There should be at least one PM4 packet processor with ID 0
+    fatal_if(!pm4PktProcs.count(0), "No default PM4 processor found");
+
    deviceIH->setGPUDevice(this);
-    pm4PktProc->setGPUDevice(this);
    cp->hsaPacketProc().setGPUDevice(this);
    cp->setGPUDevice(this);
    nbio.setGPUDevice(this);
@@ -136,6 +159,23 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
    // could possibly be anything, but these are the values used by hardware.
    uint64_t mmhubBase = 0x8000ULL << 24;
    uint64_t mmhubTop = 0x83ffULL << 24;
+    uint64_t mem_size = 0x3ff0; // 16 GB of memory
+
+    gpuvm.setMMHUBBase(mmhubBase);
+    gpuvm.setMMHUBTop(mmhubTop);
+
+    // Map other MMIO apertures based on gfx version. This must be done before
+    // any calls to get/setRegVal.
+    // NBIO               0x0     - 0x4280
+    // IH                 0x4280  - 0x4980
+    // GRBM               0x8000  - 0xC000
+    // GFX                0x28000 - 0x3F000
+    // MMHUB              0x68000 - 0x6a120
+    gpuvm.setMMIOAperture(NBIO_MMIO_RANGE, AddrRange(0x0, 0x4280));
+    gpuvm.setMMIOAperture(IH_MMIO_RANGE,   AddrRange(0x4280, 0x4980));
+    gpuvm.setMMIOAperture(GRBM_MMIO_RANGE, AddrRange(0x8000, 0xC000));
+    gpuvm.setMMIOAperture(GFX_MMIO_RANGE,  AddrRange(0x28000, 0x3F000));
+    gpuvm.setMMIOAperture(MMHUB_MMIO_RANGE,  AddrRange(0x68000, 0x6A120));

    // These are hardcoded register values to return what the driver expects
    setRegVal(AMDGPU_MP0_SMN_C2PMSG_33, 0x80000000);
@@ -145,25 +185,19 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
    if (p.device_name == "Vega10") {
        setRegVal(VEGA10_FB_LOCATION_BASE, mmhubBase >> 24);
        setRegVal(VEGA10_FB_LOCATION_TOP, mmhubTop >> 24);
-        gfx_version = GfxVersion::gfx900;
    } else if (p.device_name == "MI100") {
        setRegVal(MI100_FB_LOCATION_BASE, mmhubBase >> 24);
        setRegVal(MI100_FB_LOCATION_TOP, mmhubTop >> 24);
-        setRegVal(MI100_MEM_SIZE_REG, 0x3ff0); // 16GB of memory
-        gfx_version = GfxVersion::gfx908;
+        setRegVal(MI100_MEM_SIZE_REG, mem_size);
    } else if (p.device_name == "MI200") {
        // This device can have either 64GB or 128GB of device memory.
        // This limits to 16GB for simulation.
        setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
        setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
-        setRegVal(MI200_MEM_SIZE_REG, 0x3ff0);
-        gfx_version = GfxVersion::gfx90a;
+        setRegVal(MI200_MEM_SIZE_REG, mem_size);
    } else {
        panic("Unknown GPU device %s\n", p.device_name);
    }
-
-    gpuvm.setMMHUBBase(mmhubBase);
-    gpuvm.setMMHUBTop(mmhubTop);
 }

 void
@@ -356,29 +390,28 @@ AMDGPUDevice::readDoorbell(PacketPtr pkt, Addr offset)
 void
 AMDGPUDevice::readMMIO(PacketPtr pkt, Addr offset)
 {
-    Addr aperture = gpuvm.getMmioAperture(offset);
-    Addr aperture_offset = offset - aperture;
+    AddrRange aperture = gpuvm.getMMIOAperture(offset);
+    Addr aperture_offset = offset - aperture.start();

    // By default read from MMIO trace. Overwrite the packet for a select
    // few more dynamic MMIOs.
    DPRINTF(AMDGPUDevice, "Read MMIO %#lx\n", offset);
    mmioReader.readFromTrace(pkt, MMIO_BAR, offset);

-    switch (aperture) {
-      case NBIO_BASE:
+    if (aperture == gpuvm.getMMIORange(NBIO_MMIO_RANGE)) {
+        DPRINTF(AMDGPUDevice, "NBIO base\n");
        nbio.readMMIO(pkt, aperture_offset);
-        break;
-      case GRBM_BASE:
+    } else if (aperture == gpuvm.getMMIORange(GRBM_MMIO_RANGE)) {
+        DPRINTF(AMDGPUDevice, "GRBM base\n");
        gpuvm.readMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
-        break;
-      case GFX_BASE:
+    } else if (aperture == gpuvm.getMMIORange(GFX_MMIO_RANGE)) {
+        DPRINTF(AMDGPUDevice, "GFX base\n");
        gfx.readMMIO(pkt, aperture_offset);
-        break;
-      case MMHUB_BASE:
+    } else if (aperture == gpuvm.getMMIORange(MMHUB_MMIO_RANGE)) {
+        DPRINTF(AMDGPUDevice, "MMHUB base\n");
        gpuvm.readMMIO(pkt, aperture_offset >> MMHUB_OFFSET_SHIFT);
-        break;
-      default:
-        break;
+    } else {
+        DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for read %#x\n", offset);
    }
 }

@@ -422,17 +455,22 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
    DPRINTF(AMDGPUDevice, "Wrote doorbell %#lx\n", offset);

    if (doorbells.find(offset) != doorbells.end()) {
-        QueueType q_type = doorbells[offset];
+        QueueType q_type = doorbells[offset].qtype;
+        int ip_id = doorbells[offset].ip_id;
        DPRINTF(AMDGPUDevice, "Doorbell offset %p queue: %d\n",
                              offset, q_type);
        switch (q_type) {
          case Compute:
-            pm4PktProc->process(pm4PktProc->getQueue(offset),
-                                pkt->getLE<uint64_t>());
+            assert(pm4PktProcs.count(ip_id));
+            pm4PktProcs[ip_id]->process(
+                pm4PktProcs[ip_id]->getQueue(offset),
+                pkt->getLE<uint64_t>());
          break;
          case Gfx:
-            pm4PktProc->process(pm4PktProc->getQueue(offset, true),
-                                pkt->getLE<uint64_t>());
+            assert(pm4PktProcs.count(ip_id));
+            pm4PktProcs[ip_id]->process(
+                pm4PktProcs[ip_id]->getQueue(offset, true),
+                pkt->getLE<uint64_t>());
          break;
          case SDMAGfx: {
            SDMAEngine *sdmaEng = getSDMAEngine(offset);
@@ -443,9 +481,11 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
            sdmaEng->processPage(pkt->getLE<uint64_t>());
          } break;
          case ComputeAQL: {
+            assert(pm4PktProcs.count(ip_id));
            cp->hsaPacketProc().hwScheduler()->write(offset,
                pkt->getLE<uint64_t>() + 1);
-            pm4PktProc->updateReadIndex(offset, pkt->getLE<uint64_t>() + 1);
+            pm4PktProcs[ip_id]->updateReadIndex(offset,
+                pkt->getLE<uint64_t>() + 1);
          } break;
          case InterruptHandler:
            deviceIH->updateRptr(pkt->getLE<uint32_t>());
@@ -475,12 +515,12 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
 void
 AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset)
 {
-    Addr aperture = gpuvm.getMmioAperture(offset);
-    Addr aperture_offset = offset - aperture;
+    AddrRange aperture = gpuvm.getMMIOAperture(offset);
+    Addr aperture_offset = offset - aperture.start();

    DPRINTF(AMDGPUDevice, "Wrote MMIO %#lx\n", offset);

-    // Check SDMA functions first, then fallback to switch statement
+    // Check SDMA functions first, then fallback to MMIO ranges.
    for (int idx = 0; idx < sdmaIds.size(); ++idx) {
        if (sdmaMmios[idx].contains(offset)) {
            Addr sdma_offset = (offset - sdmaMmios[idx].start()) >> 2;
@@ -498,26 +538,31 @@ AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset)
        }
    }

-    switch (aperture) {
-      /* Write a general register to the graphics register bus manager. */
-      case GRBM_BASE:
+    // Check PM4s next, returning to avoid duplicate writes.
+    for (auto& [range, pm4_proc] : pm4Ranges) {
+        if (range.contains(offset)) {
+            // PM4 MMIOs are offset based on the MMIO range start
+            Addr ip_offset = offset - range.start();
+            pm4_proc->writeMMIO(pkt, ip_offset >> GRBM_OFFSET_SHIFT);
+
+            return;
+        }
+    }
+
+    if (aperture == gpuvm.getMMIORange(GRBM_MMIO_RANGE)) {
+        DPRINTF(AMDGPUDevice, "GRBM base\n");
        gpuvm.writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
-        pm4PktProc->writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
-        break;
-      /* Write a register to the interrupt handler. */
-      case IH_BASE:
+    } else if (aperture == gpuvm.getMMIORange(IH_MMIO_RANGE)) {
+        DPRINTF(AMDGPUDevice, "IH base\n");
        deviceIH->writeMMIO(pkt, aperture_offset >> IH_OFFSET_SHIFT);
-        break;
-      /* Write an IO space register */
-      case NBIO_BASE:
+    } else if (aperture == gpuvm.getMMIORange(NBIO_MMIO_RANGE)) {
+        DPRINTF(AMDGPUDevice, "NBIO base\n");
        nbio.writeMMIO(pkt, aperture_offset);
-        break;
-      case GFX_BASE:
+    } else if (aperture == gpuvm.getMMIORange(GFX_MMIO_RANGE)) {
+        DPRINTF(AMDGPUDevice, "GFX base\n");
        gfx.writeMMIO(pkt, aperture_offset);
-        break;
-      default:
-        DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for %#x\n", offset);
-        break;
+    } else {
+        DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for write %#x\n", offset);
    }
 }

@@ -638,10 +683,11 @@ AMDGPUDevice::setRegVal(uint64_t addr, uint32_t value)
 }

 void
-AMDGPUDevice::setDoorbellType(uint32_t offset, QueueType qt)
+AMDGPUDevice::setDoorbellType(uint32_t offset, QueueType qt, int ip_id)
 {
    DPRINTF(AMDGPUDevice, "Setting doorbell type for %x\n", offset);
-    doorbells[offset] = qt;
+    doorbells[offset].qtype = qt;
+    doorbells[offset].ip_id = ip_id;
 }

 void
@@ -692,6 +738,7 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
    // Make a c-style array of the regs to serialize
    uint32_t doorbells_offset[doorbells_size];
    QueueType doorbells_queues[doorbells_size];
+    int doorbells_ip_ids[doorbells_size];
    uint32_t sdma_engs_offset[sdma_engs_size];
    int sdma_engs[sdma_engs_size];
    int used_vmids[used_vmid_map_size];
@@ -701,7 +748,8 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
    int idx = 0;
    for (auto & it : doorbells) {
        doorbells_offset[idx] = it.first;
-        doorbells_queues[idx] = it.second;
+        doorbells_queues[idx] = it.second.qtype;
+        doorbells_ip_ids[idx] = it.second.ip_id;
        ++idx;
    }

@@ -730,6 +778,8 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
        sizeof(doorbells_offset[0]));
    SERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/
        sizeof(doorbells_queues[0]));
+    SERIALIZE_ARRAY(doorbells_ip_ids, sizeof(doorbells_ip_ids)/
+        sizeof(doorbells_ip_ids[0]));
    SERIALIZE_ARRAY(sdma_engs_offset, sizeof(sdma_engs_offset)/
        sizeof(sdma_engs_offset[0]));
    SERIALIZE_ARRAY(sdma_engs, sizeof(sdma_engs)/sizeof(sdma_engs[0]));
@@ -768,14 +818,18 @@ AMDGPUDevice::unserialize(CheckpointIn &cp)
    if (doorbells_size > 0) {
        uint32_t doorbells_offset[doorbells_size];
        QueueType doorbells_queues[doorbells_size];
+        int doorbells_ip_ids[doorbells_size];

        UNSERIALIZE_ARRAY(doorbells_offset, sizeof(doorbells_offset)/
                sizeof(doorbells_offset[0]));
        UNSERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/
                sizeof(doorbells_queues[0]));
+        UNSERIALIZE_ARRAY(doorbells_ip_ids, sizeof(doorbells_ip_ids)/
+                sizeof(doorbells_ip_ids[0]));

        for (int idx = 0; idx < doorbells_size; ++idx) {
-            doorbells[doorbells_offset[idx]] = doorbells_queues[idx];
+            doorbells[doorbells_offset[idx]].qtype = doorbells_queues[idx];
+            doorbells[doorbells_offset[idx]].ip_id = doorbells_ip_ids[idx];
        }
    }