diff --git a/src/arch/gcn3/gpu_mem_helpers.hh b/src/arch/gcn3/gpu_mem_helpers.hh new file mode 100644 index 0000000000..40ca56561b --- /dev/null +++ b/src/arch/gcn3/gpu_mem_helpers.hh @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Authors: Matt Sinclair + */ + +#ifndef __ARCH_GCN3_GPU_MEM_HELPERS_HH__ +#define __ARCH_GCN3_GPU_MEM_HELPERS_HH__ + +#include "arch/gcn3/insts/gpu_static_inst.hh" +#include "arch/gcn3/insts/op_encodings.hh" +#include "debug/GPUMem.hh" +#include "gpu-compute/gpu_dyn_inst.hh" + +/** + * Helper function for instructions declared in op_encodings. This function + * takes in all of the arguments for a given memory request we are trying to + * initialize, then submits the request or requests depending on if the + * original request is aligned or unaligned. + */ +template<typename T, int N> +inline void +initMemReqHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type, + bool is_atomic=false) +{ + // local variables + int req_size = N * sizeof(T); + int block_size = gpuDynInst->computeUnit()->cacheLineSize(); + Addr vaddr = 0, split_addr = 0; + bool misaligned_acc = false; + RequestPtr req = nullptr, req1 = nullptr, req2 = nullptr; + PacketPtr pkt = nullptr, pkt1 = nullptr, pkt2 = nullptr; + + gpuDynInst->resetEntireStatusVector(); + for (int lane = 0; lane < Gcn3ISA::NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vaddr = gpuDynInst->addr[lane]; + + /** + * the base address of the cache line where the last + * byte of the request will be stored. + */ + split_addr = roundDown(vaddr + req_size - 1, block_size); + + assert(split_addr <= vaddr || split_addr - vaddr < block_size); + /** + * if the base cache line address of the last byte is + * greater than the address of the first byte then we have + * a misaligned access.
+ */ + misaligned_acc = split_addr > vaddr; + + if (is_atomic) { + req = std::make_shared<Request>(vaddr, sizeof(T), 0, + gpuDynInst->computeUnit()->masterId(), 0, + gpuDynInst->wfDynId, + gpuDynInst->makeAtomicOpFunctor( + &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane], + &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane])); + } else { + req = std::make_shared<Request>(vaddr, req_size, 0, + gpuDynInst->computeUnit()->masterId(), 0, + gpuDynInst->wfDynId); + } + + if (misaligned_acc) { + gpuDynInst->setStatusVector(lane, 2); + req->splitOnVaddr(split_addr, req1, req2); + gpuDynInst->setRequestFlags(req1); + gpuDynInst->setRequestFlags(req2); + pkt1 = new Packet(req1, mem_req_type); + pkt2 = new Packet(req2, mem_req_type); + pkt1->dataStatic(&(reinterpret_cast<T*>( + gpuDynInst->d_data))[lane * N]); + pkt2->dataStatic(&(reinterpret_cast<T*>( + gpuDynInst->d_data))[lane * N + req1->getSize()]); + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index: %d unaligned memory " + "request for %#x\n", gpuDynInst->cu_id, + gpuDynInst->simdId, gpuDynInst->wfSlotId, lane, + split_addr); + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt1); + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt2); + } else { + gpuDynInst->setStatusVector(lane, 1); + gpuDynInst->setRequestFlags(req); + pkt = new Packet(req, mem_req_type); + pkt->dataStatic(&(reinterpret_cast<T*>( + gpuDynInst->d_data))[lane * N]); + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt); + } + } else { // if lane is not active, then no pending requests + gpuDynInst->setStatusVector(lane, 0); + } + } +} + +/** + * Helper function for scalar instructions declared in op_encodings. This + * function takes in all of the arguments for a given memory request we are + * trying to initialize, then submits the request or requests depending on if + * the original request is aligned or unaligned.
+ */ +template<typename T, int N> +inline void +initMemReqScalarHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type) +{ + int req_size = N * sizeof(T); + int block_size = gpuDynInst->computeUnit()->cacheLineSize(); + Addr vaddr = gpuDynInst->scalarAddr; + + /** + * the base address of the cache line where the last byte of + * the request will be stored. + */ + Addr split_addr = roundDown(vaddr + req_size - 1, block_size); + + assert(split_addr <= vaddr || split_addr - vaddr < block_size); + /** + * if the base cache line address of the last byte is greater + * than the address of the first byte then we have a misaligned + * access. + */ + bool misaligned_acc = split_addr > vaddr; + + RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0, + gpuDynInst->computeUnit()->masterId(), 0, + gpuDynInst->wfDynId); + + if (misaligned_acc) { + RequestPtr req1, req2; + req->splitOnVaddr(split_addr, req1, req2); + gpuDynInst->numScalarReqs = 2; + gpuDynInst->setRequestFlags(req1); + gpuDynInst->setRequestFlags(req2); + PacketPtr pkt1 = new Packet(req1, mem_req_type); + PacketPtr pkt2 = new Packet(req2, mem_req_type); + pkt1->dataStatic(gpuDynInst->scalar_data); + pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize()); + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: unaligned scalar memory request for" + " %#x\n", gpuDynInst->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, split_addr); + gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1); + gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2); + } else { + gpuDynInst->numScalarReqs = 1; + gpuDynInst->setRequestFlags(req); + PacketPtr pkt = new Packet(req, mem_req_type); + pkt->dataStatic(gpuDynInst->scalar_data); + gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt); + } +} + +#endif // __ARCH_GCN3_GPU_MEM_HELPERS_HH__ diff --git a/src/arch/gcn3/insts/op_encodings.hh b/src/arch/gcn3/insts/op_encodings.hh index 3197dc078f..308560a5f7 100644 --- a/src/arch/gcn3/insts/op_encodings.hh +++
b/src/arch/gcn3/insts/op_encodings.hh @@ -37,6 +37,7 @@ #define __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__ #include "arch/gcn3/gpu_decoder.hh" +#include "arch/gcn3/gpu_mem_helpers.hh" #include "arch/gcn3/insts/gpu_static_inst.hh" #include "arch/gcn3/operand.hh" #include "debug/GPUExec.hh" @@ -174,47 +175,8 @@ namespace Gcn3ISA void initMemRead(GPUDynInstPtr gpuDynInst) { - int block_size = gpuDynInst->computeUnit()->cacheLineSize(); - int req_size = N * sizeof(ScalarRegU32); - Addr vaddr = gpuDynInst->scalarAddr; - - /** - * the base address of the cache line where the the last byte of - * the request will be stored. - */ - Addr split_addr = roundDown(vaddr + req_size - 1, block_size); - - assert(split_addr <= vaddr || split_addr - vaddr < block_size); - /** - * if the base cache line address of the last byte is greater - * than the address of the first byte then we have a misaligned - * access. - */ - bool misaligned_acc = split_addr > vaddr; - - RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0, - gpuDynInst->computeUnit()->masterId(), 0, - gpuDynInst->wfDynId); - - if (misaligned_acc) { - RequestPtr req1, req2; - req->splitOnVaddr(split_addr, req1, req2); - gpuDynInst->numScalarReqs = 2; - gpuDynInst->setRequestFlags(req1); - gpuDynInst->setRequestFlags(req2); - PacketPtr pkt1 = new Packet(req1, MemCmd::ReadReq); - PacketPtr pkt2 = new Packet(req2, MemCmd::ReadReq); - pkt1->dataStatic(gpuDynInst->scalar_data); - pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize()); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2); - } else { - gpuDynInst->numScalarReqs = 1; - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->dataStatic(gpuDynInst->scalar_data); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt); - } + initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst, + MemCmd::ReadReq); } /** @@ -224,47 +186,8 @@ void
initMemWrite(GPUDynInstPtr gpuDynInst) { - int block_size = gpuDynInst->computeUnit()->cacheLineSize(); - int req_size = N * sizeof(ScalarRegU32); - Addr vaddr = gpuDynInst->scalarAddr; - - /** - * the base address of the cache line where the the last byte of - * the request will be stored. - */ - Addr split_addr = roundDown(vaddr + req_size - 1, block_size); - - assert(split_addr <= vaddr || split_addr - vaddr < block_size); - /** - * if the base cache line address of the last byte is greater - * than the address of the first byte then we have a misaligned - * access. - */ - bool misaligned_acc = split_addr > vaddr; - - RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0, - gpuDynInst->computeUnit()->masterId(), 0, - gpuDynInst->wfDynId); - - if (misaligned_acc) { - RequestPtr req1, req2; - req->splitOnVaddr(split_addr, req1, req2); - gpuDynInst->numScalarReqs = 2; - gpuDynInst->setRequestFlags(req1); - gpuDynInst->setRequestFlags(req2); - PacketPtr pkt1 = new Packet(req1, MemCmd::WriteReq); - PacketPtr pkt2 = new Packet(req2, MemCmd::WriteReq); - pkt1->dataStatic(gpuDynInst->scalar_data); - pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize()); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2); - } else { - gpuDynInst->numScalarReqs = 1; - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->dataStatic(gpuDynInst->scalar_data); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt); - } + initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst, + MemCmd::WriteReq); } void @@ -566,59 +489,22 @@ namespace Gcn3ISA void initMemRead(GPUDynInstPtr gpuDynInst) { - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared<Request>(vaddr, - sizeof(T), 0, - gpuDynInst->computeUnit()->masterId(), 0, -
gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->dataStatic(&(reinterpret_cast<T*>( - gpuDynInst->d_data))[lane]); - - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, - pkt); - } - } + initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq); } template<typename T> void initMemWrite(GPUDynInstPtr gpuDynInst) { - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared<Request>(vaddr, - sizeof(T), 0, - gpuDynInst->computeUnit()->masterId(), - 0, gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->dataStatic(&(reinterpret_cast<T*>( - gpuDynInst->d_data))[lane]); - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, - pkt); - } - } + initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq); } void injectGlobalMemFence(GPUDynInstPtr gpuDynInst) { // create request and set flags - gpuDynInst->statusBitVector = VectorMask(1); + gpuDynInst->resetEntireStatusVector(); + gpuDynInst->setStatusVector(0, 1); RequestPtr req = std::make_shared<Request>(0, 0, 0, gpuDynInst->computeUnit()-> masterId(), 0, @@ -771,133 +657,35 @@ namespace Gcn3ISA void initMemRead(GPUDynInstPtr gpuDynInst) { - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared<Request>(vaddr, - sizeof(T), 0, - gpuDynInst->computeUnit()->masterId(), 0, - gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->dataStatic(&(reinterpret_cast<T*>( - gpuDynInst->d_data))[lane]); - gpuDynInst->computeUnit() - ->sendRequest(gpuDynInst, lane, pkt); - } - } + initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq); } template<int N> void
initMemRead(GPUDynInstPtr gpuDynInst) { - int req_size = N * sizeof(VecElemU32); - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared<Request>(vaddr, req_size, - 0, - gpuDynInst->computeUnit()->masterId(), 0, - gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->dataStatic(&(reinterpret_cast<VecElemU32*>( - gpuDynInst->d_data))[lane * N]); - gpuDynInst->computeUnit() - ->sendRequest(gpuDynInst, lane, pkt); - } - } + initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq); } template<typename T> void initMemWrite(GPUDynInstPtr gpuDynInst) { - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared<Request>(vaddr, - sizeof(T), 0, - gpuDynInst->computeUnit()->masterId(), - 0, gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->dataStatic(&(reinterpret_cast<T*>( - gpuDynInst->d_data))[lane]); - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, - pkt); - } - } + initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq); } template<int N> void initMemWrite(GPUDynInstPtr gpuDynInst) { - int req_size = N * sizeof(VecElemU32); - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared<Request>(vaddr, req_size, - 0, - gpuDynInst->computeUnit()->masterId(), - 0, gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->dataStatic(&(reinterpret_cast<VecElemU32*>( - gpuDynInst->d_data))[lane * N]); - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, -
pkt); - } - } + initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq); } template<typename T> void initAtomicAccess(GPUDynInstPtr gpuDynInst) { - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared<Request>(vaddr, - sizeof(T), 0, - gpuDynInst->computeUnit()->masterId(), 0, - gpuDynInst->wfDynId, - gpuDynInst->makeAtomicOpFunctor( - &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane], - &(reinterpret_cast<T*>( - gpuDynInst->x_data))[lane])); - - gpuDynInst->setRequestFlags(req); - - PacketPtr pkt = new Packet(req, MemCmd::SwapReq); - pkt->dataStatic(&(reinterpret_cast<T*>( - gpuDynInst->d_data))[lane]); - - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, - pkt); - } - } + initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true); } void diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index feeb803e19..b0616d677b 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -832,7 +832,7 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) gpuDynInst->wfSlotId, gpuDynInst->wfDynId, gpuDynInst->disassemble(), w->outstandingReqs, w->outstandingReqs - 1); - if (gpuDynInst->statusBitVector.none()) { + if (gpuDynInst->allLanesZero()) { // ask gm pipe to decrement request counters, instead of directly // performing here, to avoid asynchronous counter update and // instruction retirement (which may hurt waincnt effects) @@ -1078,7 +1078,6 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index); gpuDynInst->tlbHitLevel[index] = hit_level; - // translation is done.
Schedule the mem_req_event at the // appropriate cycle to send the timing memory request to ruby EventFunctionWrapper *mem_req_event = @@ -1116,9 +1115,9 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) } } else { if (pkt->cmd == MemCmd::MemSyncReq) { - gpuDynInst->statusBitVector = VectorMask(0); + gpuDynInst->resetEntireStatusVector(); } else { - gpuDynInst->statusBitVector &= (~(1ll << index)); + gpuDynInst->decrementStatusVector(index); } // New SenderState for the memory access @@ -1289,12 +1288,10 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt) gpuDynInst->memStatusVector[paddr].pop_back(); gpuDynInst->pAddr = pkt->req->getPaddr(); - gpuDynInst->statusBitVector &= (~(1ULL << index)); + gpuDynInst->decrementStatusVector(index); + DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector()); - DPRINTF(GPUMem, "bitvector is now %#x\n", - gpuDynInst->statusBitVector); - - if (gpuDynInst->statusBitVector == VectorMask(0)) { + if (gpuDynInst->allLanesZero()) { auto iter = gpuDynInst->memStatusVector.begin(); auto end = gpuDynInst->memStatusVector.end(); diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 74b963b73c..2a49522da9 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -42,9 +42,10 @@ GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst, InstSeqNum instSeqNum) : GPUExecContext(_cu, _wf), scalarAddr(0), addr(computeUnit()->wfSize(), - (Addr)0), statusBitVector(0), numScalarReqs(0), isSaveRestore(false), + (Addr)0), numScalarReqs(0), isSaveRestore(false), _staticInst(static_inst), _seqNum(instSeqNum) { + statusVector.assign(TheGpuISA::NumVecElemPerVecReg, 0); tlbHitLevel.assign(computeUnit()->wfSize(), -1); // vector instructions can have up to 4 source/destination operands d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)]; diff --git a/src/gpu-compute/gpu_dyn_inst.hh 
b/src/gpu-compute/gpu_dyn_inst.hh index 392b57d12d..3d2fa0d3f3 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -39,6 +39,8 @@ #include "base/amo.hh" #include "base/logging.hh" +#include "base/trace.hh" +#include "debug/GPUMem.hh" #include "enums/StorageClassType.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_exec_context.hh" @@ -307,13 +309,103 @@ class GPUDynInst : public GPUExecContext } } + // reset the number of pending memory requests for all lanes + void + resetEntireStatusVector() + { + assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg); + for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) { + resetStatusVector(lane); + } + } + + // reset the number of pending memory requests for the inputted lane + void + resetStatusVector(int lane) + { + setStatusVector(lane, 0); + } + + // set the number of pending memory requests for the inputted lane + void + setStatusVector(int lane, int newVal) + { + // currently we can have up to 2 memory requests per lane (if the + // lane's request goes across multiple cache lines) + assert((newVal >= 0) && (newVal <= 2)); + statusVector[lane] = newVal; + } + + // subtracts the number of pending memory requests for the inputted lane + // by 1 + void + decrementStatusVector(int lane) + { + // this lane may have multiple requests, so only subtract one for + // this request + assert(statusVector[lane] >= 1); + statusVector[lane]--; + } + + // return the current number of pending memory requests for the inputted + // lane + int + getLaneStatus(int lane) const + { + return statusVector[lane]; + } + + // returns true if all memory requests from all lanes have been received, + // else returns false + bool + allLanesZero() const + { + // local variables + bool allZero = true; + + // iterate over all lanes, checking the number of pending memory + // requests they have + for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) { + // if any lane 
still has pending requests, return false + if (statusVector[lane] > 0) { + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d pending " + "request(s) for %#x\n", cu_id, simdId, wfSlotId, lane, + statusVector[lane], addr[lane]); + allZero = false; + } + } + + if (allZero) { + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending" + " requests for %#x\n", cu_id, simdId, wfSlotId, addr[0]); + } + return allZero; + } + + // returns a string representing the current state of the statusVector + std::string + printStatusVector() const + { + std::string statusVec_str = "["; + + // iterate over all lanes, adding the current number of pending + // requests for this lane to the string + for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) { + statusVec_str += std::to_string(statusVector[lane]); + } + statusVec_str += "]"; + + return statusVec_str; + } + // Map returned packets and the addresses they satisfy with which lane they // were requested from typedef std::unordered_map<Addr, std::vector<int>> StatusVector; StatusVector memStatusVector; - // Track the status of memory requests per lane, a bit per lane - VectorMask statusBitVector; + // Track the status of memory requests per lane, an int per lane to allow + // unaligned accesses + std::vector<int> statusVector; // for ld_v# or st_v# std::vector<int> tlbHitLevel; diff --git a/src/mem/ruby/common/DataBlock.cc b/src/mem/ruby/common/DataBlock.cc index a4d7f4916b..359f6bb0f7 100644 --- a/src/mem/ruby/common/DataBlock.cc +++ b/src/mem/ruby/common/DataBlock.cc @@ -107,7 +107,6 @@ DataBlock::getDataMod(int offset) void DataBlock::setData(const uint8_t *data, int offset, int len) { - assert(offset + len <= RubySystem::getBlockSizeBytes()); memcpy(&m_data[offset], data, len); } diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc index 83aaa1a507..92fed81dd2 100644 --- a/src/mem/ruby/system/RubyPort.cc +++ b/src/mem/ruby/system/RubyPort.cc @@ -267,9 +267,6 @@ RubyPort::MemSlavePort::recvTimingReq(PacketPtr pkt)
curTick() + rs->clockPeriod()); return true; } - - assert(getOffset(pkt->getAddr()) + pkt->getSize() <= - RubySystem::getBlockSizeBytes()); } // Save the port in the sender state object to be used later to