arch-gcn3: add support for unaligned accesses

Previously, with HSAIL, the HSA specification guaranteed that the
GPU would never issue unaligned accesses.  Now that we are running
GCN directly, this is no longer true.  Accordingly, this commit adds
support for unaligned accesses.  Moreover, to reduce the replication
of nearly identical code across the different request types, I also
added new helper functions in op_encodings.hh that are called by all
of the memory-request-producing instruction types.
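
The new helper functions themselves live in op_encodings.hh and are
not shown in the hunks below.  As a rough sketch of the
address-splitting step any such helper has to perform (illustrative
only: the names splitOnCacheLine and kLineSize are assumptions, not
the gem5 API):

    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    using Addr = std::uint64_t;
    constexpr Addr kLineSize = 64;  // assumed cache-line size

    // Break one lane's access at (vaddr, size) into chunks so that no
    // chunk crosses a cache-line boundary; an aligned access yields
    // one chunk, an unaligned access yields two or more.
    std::vector<std::pair<Addr, Addr>>
    splitOnCacheLine(Addr vaddr, Addr size)
    {
        std::vector<std::pair<Addr, Addr>> chunks;
        while (size > 0) {
            Addr lineEnd = (vaddr / kLineSize + 1) * kLineSize;
            Addr chunk = std::min(size, lineEnd - vaddr);
            chunks.emplace_back(vaddr, chunk);
            vaddr += chunk;
            size -= chunk;
        }
        return chunks;
    }

Each lane then owes one memory response per chunk, which is what
motivates the statusBitVector change described next.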

Adding support for unaligned accesses requires changing the
statusBitVector, which tracks the status of each lane's outstanding
memory requests, from a bit per lane to an int per lane.  A single
bit is no longer enough because an unaligned access may span multiple
cache lines; in the worst case, every lane's access does, leaving
each lane with several outstanding requests at once.  There are
corresponding changes in the files that use the statusBitVector.
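
A minimal, self-contained sketch of an int-per-lane status vector
(the class shape is illustrative; only decrementStatusVector,
resetEntireStatusVector, and allLanesZero appear in the hunks below):

    #include <array>
    #include <cassert>

    constexpr int NumLanes = 64;  // GCN3 wavefront size

    class LaneStatus
    {
      public:
        // An aligned access sets count = 1; an unaligned access that
        // spans two cache lines sets count = 2.
        void setStatusVector(int lane, int count) { status_[lane] = count; }

        // Called once per cache-line response received for the lane.
        void decrementStatusVector(int lane)
        {
            assert(status_[lane] > 0);
            --status_[lane];
        }

        void resetEntireStatusVector() { status_.fill(0); }

        // The instruction may retire only when no lane has an
        // outstanding request.
        bool allLanesZero() const
        {
            for (int count : status_)
                if (count != 0)
                    return false;
            return true;
        }

      private:
        std::array<int, NumLanes> status_{};
    };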

Change-Id: I319bf2f0f644083e98ca546d2bfe68cf87a5f967
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29920
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
Author:    Matt Sinclair
Date:      2018-03-07 17:54:19 -05:00
Committer: Anthony Gutierrez
Commit:    8177fc4392 (parent: fbcdf880ee)
7 changed files with 298 additions and 242 deletions

@@ -832,7 +832,7 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
             gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
             gpuDynInst->disassemble(), w->outstandingReqs,
             w->outstandingReqs - 1);
-    if (gpuDynInst->statusBitVector.none()) {
+    if (gpuDynInst->allLanesZero()) {
         // ask gm pipe to decrement request counters, instead of directly
         // performing here, to avoid asynchronous counter update and
         // instruction retirement (which may hurt waitcnt effects)
@@ -1078,7 +1078,6 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
     gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
     gpuDynInst->tlbHitLevel[index] = hit_level;
-
     // translation is done. Schedule the mem_req_event at the
     // appropriate cycle to send the timing memory request to ruby
     EventFunctionWrapper *mem_req_event =
@@ -1116,9 +1115,9 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
         }
     } else {
         if (pkt->cmd == MemCmd::MemSyncReq) {
-            gpuDynInst->statusBitVector = VectorMask(0);
+            gpuDynInst->resetEntireStatusVector();
         } else {
-            gpuDynInst->statusBitVector &= (~(1ll << index));
+            gpuDynInst->decrementStatusVector(index);
         }
         // New SenderState for the memory access
@@ -1289,12 +1288,10 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
     gpuDynInst->memStatusVector[paddr].pop_back();
     gpuDynInst->pAddr = pkt->req->getPaddr();
-    gpuDynInst->statusBitVector &= (~(1ULL << index));
-    DPRINTF(GPUMem, "bitvector is now %#x\n",
-            gpuDynInst->statusBitVector);
-    if (gpuDynInst->statusBitVector == VectorMask(0)) {
+    gpuDynInst->decrementStatusVector(index);
+    DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());
+    if (gpuDynInst->allLanesZero()) {
         auto iter = gpuDynInst->memStatusVector.begin();
         auto end = gpuDynInst->memStatusVector.end();