diff --git a/configs/example/gpufs/Disjoint_VIPER.py b/configs/example/gpufs/Disjoint_VIPER.py index 28f0768c2a..0fd258e0fd 100644 --- a/configs/example/gpufs/Disjoint_VIPER.py +++ b/configs/example/gpufs/Disjoint_VIPER.py @@ -58,6 +58,8 @@ class Disjoint_VIPER(RubySystem): self.network_cpu = DisjointSimple(self) self.network_gpu = DisjointSimple(self) + self.block_size_bytes = options.cacheline_size + # Construct CPU controllers cpu_dir_nodes = construct_dirs(options, system, self, self.network_cpu) (cp_sequencers, cp_cntrl_nodes) = construct_corepairs( diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc index 02b1bb174a..3737f8a6ff 100644 --- a/src/gpu-compute/gpu_command_processor.cc +++ b/src/gpu-compute/gpu_command_processor.cc @@ -118,7 +118,10 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr) { _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt; - assert(!(disp_pkt->kernel_object & (system()->cacheLineSize() - 1))); + // The kernel object should be aligned to a 64B boundary, but not + // necessarily a cache line boundary. + unsigned akc_alignment_granularity = 64; + assert(!(disp_pkt->kernel_object & (akc_alignment_granularity - 1))); /** * Need to use a raw pointer for DmaVirtDevice API. This is deleted @@ -201,7 +204,7 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, // Read from GPU memory manager one cache line at a time to prevent // rare cases where the AKC spans two memory pages. 
ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode), - system()->cacheLineSize()); + akc_alignment_granularity); for (; !gen.done(); gen.next()) { Addr chunk_addr = gen.addr(); int vmid = 1; @@ -212,10 +215,13 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, Request::Flags flags = Request::PHYSICAL; RequestPtr request = std::make_shared<Request>(chunk_addr, - system()->cacheLineSize(), flags, + akc_alignment_granularity, flags, walker->getDevRequestor()); Packet *readPkt = new Packet(request, MemCmd::ReadReq); readPkt->dataStatic((uint8_t *)akc + gen.complete()); + // If the request spans two device memories, the device memory + // returned will be null. + assert(system()->getDeviceMemory(readPkt) != nullptr); system()->getDeviceMemory(readPkt)->access(readPkt); delete readPkt; }