The RequestDesc was previously implemented as a std::pair, which made the implementation overly complex and error prone. Here we encapsulate the packet, primary, and secondary request types in a single data structure, with all members properly initialized in a constructor.
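For reference, a minimal sketch of the shape such a structure could take, inferred from this description and from how the members are accessed in the file below (reqCoalescer[...].pkt, .primaryType, .secondaryType); the actual definition lives in GPUCoalescer.hh and may differ in detail:

    class RequestDesc
    {
      public:
        RequestDesc(PacketPtr pkt, RubyRequestType p_type,
                    RubyRequestType s_type)
            : pkt(pkt), primaryType(p_type), secondaryType(s_type)
        {
        }

        PacketPtr pkt;                  // coalesced memory packet
        RubyRequestType primaryType;    // request type as seen by the coalescer
        RubyRequestType secondaryType;  // request type issued to the Ruby protocol
    };

With this in place, makeRequest() can record a coalesced access with a single reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type) instead of nesting std::pair objects.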
/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Sooraj Puthoor
 */

#include "base/misc.hh"
#include "base/str.hh"
#include "config/the_isa.hh"

#if THE_ISA == X86_ISA
#include "arch/x86/insts/microldstop.hh"

#endif // X86_ISA
#include "mem/ruby/system/GPUCoalescer.hh"

#include "cpu/testers/rubytest/RubyTester.hh"
#include "debug/GPUCoalescer.hh"
#include "debug/MemoryAccess.hh"
#include "debug/ProtocolTrace.hh"
#include "debug/RubyPort.hh"
#include "debug/RubyStats.hh"
#include "gpu-compute/shader.hh"
#include "mem/packet.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/SubBlock.hh"
#include "mem/ruby/network/MessageBuffer.hh"
#include "mem/ruby/profiler/Profiler.hh"
#include "mem/ruby/slicc_interface/AbstractController.hh"
#include "mem/ruby/slicc_interface/RubyRequest.hh"
#include "mem/ruby/structures/CacheMemory.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "params/RubyGPUCoalescer.hh"

using namespace std;

GPUCoalescer *
RubyGPUCoalescerParams::create()
{
    return new GPUCoalescer(this);
}

HSAScope
reqScopeToHSAScope(Request* req)
{
    HSAScope accessScope = HSAScope_UNSPECIFIED;
    if (req->isScoped()) {
        if (req->isWavefrontScope()) {
            accessScope = HSAScope_WAVEFRONT;
        } else if (req->isWorkgroupScope()) {
            accessScope = HSAScope_WORKGROUP;
        } else if (req->isDeviceScope()) {
            accessScope = HSAScope_DEVICE;
        } else if (req->isSystemScope()) {
            accessScope = HSAScope_SYSTEM;
        } else {
            fatal("Bad scope type");
        }
    }
    return accessScope;
}

HSASegment
reqSegmentToHSASegment(Request* req)
{
    HSASegment accessSegment = HSASegment_GLOBAL;

    if (req->isGlobalSegment()) {
        accessSegment = HSASegment_GLOBAL;
    } else if (req->isGroupSegment()) {
        accessSegment = HSASegment_GROUP;
    } else if (req->isPrivateSegment()) {
        accessSegment = HSASegment_PRIVATE;
    } else if (req->isKernargSegment()) {
        accessSegment = HSASegment_KERNARG;
    } else if (req->isReadonlySegment()) {
        accessSegment = HSASegment_READONLY;
    } else if (req->isSpillSegment()) {
        accessSegment = HSASegment_SPILL;
    } else if (req->isArgSegment()) {
        accessSegment = HSASegment_ARG;
    } else {
        fatal("Bad segment type");
    }

    return accessSegment;
}

GPUCoalescer::GPUCoalescer(const Params *p)
    : RubyPort(p), issueEvent(this), deadlockCheckEvent(this)
{
    m_store_waiting_on_load_cycles = 0;
    m_store_waiting_on_store_cycles = 0;
    m_load_waiting_on_store_cycles = 0;
    m_load_waiting_on_load_cycles = 0;

    m_outstanding_count = 0;

    m_max_outstanding_requests = 0;
    m_deadlock_threshold = 0;
    m_instCache_ptr = nullptr;
    m_dataCache_ptr = nullptr;

    m_instCache_ptr = p->icache;
    m_dataCache_ptr = p->dcache;
    m_max_outstanding_requests = p->max_outstanding_requests;
    m_deadlock_threshold = p->deadlock_threshold;

    assert(m_max_outstanding_requests > 0);
    assert(m_deadlock_threshold > 0);
    assert(m_instCache_ptr);
    assert(m_dataCache_ptr);

    m_data_cache_hit_latency = p->dcache_hit_latency;

    m_runningGarnetStandalone = p->garnet_standalone;
    assumingRfOCoherence = p->assume_rfo;
}

GPUCoalescer::~GPUCoalescer()
{
}

void
GPUCoalescer::wakeup()
{
    // Check for deadlock of any of the requests
    Cycles current_time = curCycle();

    // Check across all outstanding requests
    int total_outstanding = 0;

    RequestTable::iterator read = m_readRequestTable.begin();
    RequestTable::iterator read_end = m_readRequestTable.end();
    for (; read != read_end; ++read) {
        GPUCoalescerRequest* request = read->second;
        if (current_time - request->issue_time < m_deadlock_threshold)
            continue;

        panic("Possible Deadlock detected. Aborting!\n"
              "version: %d request.paddr: 0x%x m_readRequestTable: %d "
              "current time: %u issue_time: %d difference: %d\n", m_version,
              request->pkt->getAddr(), m_readRequestTable.size(),
              current_time * clockPeriod(), request->issue_time * clockPeriod(),
              (current_time - request->issue_time) * clockPeriod());
    }

    RequestTable::iterator write = m_writeRequestTable.begin();
    RequestTable::iterator write_end = m_writeRequestTable.end();
    for (; write != write_end; ++write) {
        GPUCoalescerRequest* request = write->second;
        if (current_time - request->issue_time < m_deadlock_threshold)
            continue;

        panic("Possible Deadlock detected. Aborting!\n"
              "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
              "current time: %u issue_time: %d difference: %d\n", m_version,
              request->pkt->getAddr(), m_writeRequestTable.size(),
              current_time * clockPeriod(), request->issue_time * clockPeriod(),
              (current_time - request->issue_time) * clockPeriod());
    }

    total_outstanding += m_writeRequestTable.size();
    total_outstanding += m_readRequestTable.size();

    assert(m_outstanding_count == total_outstanding);

    if (m_outstanding_count > 0) {
        // If there are still outstanding requests, keep checking
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() +
                 curTick());
    }
}

void
GPUCoalescer::resetStats()
{
    m_latencyHist.reset();
    m_missLatencyHist.reset();
    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist[i]->reset();
        m_missTypeLatencyHist[i]->reset();
        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i][j]->reset();
        }
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist[i]->reset();

        m_IssueToInitialDelayHist[i]->reset();
        m_InitialToForwardDelayHist[i]->reset();
        m_ForwardToFirstResponseDelayHist[i]->reset();
        m_FirstResponseToCompletionDelayHist[i]->reset();
    }
}

void
GPUCoalescer::printProgress(ostream& out) const
{
}

RequestStatus
GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
{
    Addr line_addr = makeLineAddress(pkt->getAddr());

    if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
        return RequestStatus_BufferFull;
    }

    if (m_controller->isBlocked(line_addr) &&
        request_type != RubyRequestType_Locked_RMW_Write) {
        return RequestStatus_Aliased;
    }

    if ((request_type == RubyRequestType_ST) ||
        (request_type == RubyRequestType_ATOMIC) ||
        (request_type == RubyRequestType_ATOMIC_RETURN) ||
        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
        (request_type == RubyRequestType_RMW_Read) ||
        (request_type == RubyRequestType_RMW_Write) ||
        (request_type == RubyRequestType_Load_Linked) ||
        (request_type == RubyRequestType_Store_Conditional) ||
        (request_type == RubyRequestType_Locked_RMW_Read) ||
        (request_type == RubyRequestType_Locked_RMW_Write) ||
        (request_type == RubyRequestType_FLUSH)) {

        // Check if there is any outstanding read request for the same
        // cache line.
        if (m_readRequestTable.count(line_addr) > 0) {
            m_store_waiting_on_load_cycles++;
            return RequestStatus_Aliased;
        }

        if (m_writeRequestTable.count(line_addr) > 0) {
            // There is an outstanding write request for the cache line
            m_store_waiting_on_store_cycles++;
            return RequestStatus_Aliased;
        }
    } else {
        // Check if there is any outstanding write request for the same
        // cache line.
        if (m_writeRequestTable.count(line_addr) > 0) {
            m_load_waiting_on_store_cycles++;
            return RequestStatus_Aliased;
        }

        if (m_readRequestTable.count(line_addr) > 0) {
            // There is an outstanding read request for the cache line
            m_load_waiting_on_load_cycles++;
            return RequestStatus_Aliased;
        }
    }

    return RequestStatus_Ready;

}

// sets the kernelEndList
void
GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
{
    // It is not clear this can ever happen, but be careful here
    // so it does not turn into a simulator hang in the future
    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
    assert(kernelEndList.count(wavefront_id) == 0);

    kernelEndList[wavefront_id] = pkt;
    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
            kernelEndList.size());
}

// Insert the request on the correct request table. Return true if
// the entry was already present.
bool
GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
{
    assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
           pkt->req->isLockedRMW() ||
           !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));

    int total_outstanding M5_VAR_USED =
        m_writeRequestTable.size() + m_readRequestTable.size();

    assert(m_outstanding_count == total_outstanding);

    // See if we should schedule a deadlock check
    if (!deadlockCheckEvent.scheduled()) {
        schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
    }

    Addr line_addr = makeLineAddress(pkt->getAddr());
    if ((request_type == RubyRequestType_ST) ||
        (request_type == RubyRequestType_ATOMIC) ||
        (request_type == RubyRequestType_ATOMIC_RETURN) ||
        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
        (request_type == RubyRequestType_RMW_Read) ||
        (request_type == RubyRequestType_RMW_Write) ||
        (request_type == RubyRequestType_Load_Linked) ||
        (request_type == RubyRequestType_Store_Conditional) ||
        (request_type == RubyRequestType_Locked_RMW_Read) ||
        (request_type == RubyRequestType_Locked_RMW_Write) ||
        (request_type == RubyRequestType_FLUSH)) {

        pair<RequestTable::iterator, bool> r =
            m_writeRequestTable.insert(RequestTable::value_type(line_addr,
                (GPUCoalescerRequest*) NULL));
        if (r.second) {
            RequestTable::iterator i = r.first;
            i->second = new GPUCoalescerRequest(pkt, request_type,
                                                curCycle());
            DPRINTF(GPUCoalescer,
                    "Inserting write request for paddr %#x for type %d\n",
                    pkt->req->getPaddr(), i->second->m_type);
            m_outstanding_count++;
        } else {
            return true;
        }
    } else {
        pair<RequestTable::iterator, bool> r =
            m_readRequestTable.insert(RequestTable::value_type(line_addr,
                (GPUCoalescerRequest*) NULL));

        if (r.second) {
            RequestTable::iterator i = r.first;
            i->second = new GPUCoalescerRequest(pkt, request_type,
                                                curCycle());
            DPRINTF(GPUCoalescer,
                    "Inserting read request for paddr %#x for type %d\n",
                    pkt->req->getPaddr(), i->second->m_type);
            m_outstanding_count++;
        } else {
            return true;
        }
    }

    m_outstandReqHist.sample(m_outstanding_count);

    total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
    assert(m_outstanding_count == total_outstanding);

    return false;
}

void
GPUCoalescer::markRemoved()
{
    m_outstanding_count--;
    assert(m_outstanding_count ==
           m_writeRequestTable.size() + m_readRequestTable.size());
}

void
GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
{
    assert(m_outstanding_count ==
           m_writeRequestTable.size() + m_readRequestTable.size());

    Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
    if ((srequest->m_type == RubyRequestType_ST) ||
        (srequest->m_type == RubyRequestType_RMW_Read) ||
        (srequest->m_type == RubyRequestType_RMW_Write) ||
        (srequest->m_type == RubyRequestType_Load_Linked) ||
        (srequest->m_type == RubyRequestType_Store_Conditional) ||
        (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
        (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
        m_writeRequestTable.erase(line_addr);
    } else {
        m_readRequestTable.erase(line_addr);
    }

    markRemoved();
}

bool
GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
{
    //
    // The success flag indicates whether the LLSC operation was successful.
    // LL ops will always succeed, but SC may fail if the cache line is no
    // longer locked.
    //
    bool success = true;
    if (request->m_type == RubyRequestType_Store_Conditional) {
        if (!m_dataCache_ptr->isLocked(address, m_version)) {
            //
            // For failed SC requests, indicate the failure to the cpu by
            // setting the extra data to zero.
            //
            request->pkt->req->setExtraData(0);
            success = false;
        } else {
            //
            // For successful SC requests, indicate the success to the cpu by
            // setting the extra data to one.
            //
            request->pkt->req->setExtraData(1);
        }
        //
        // Independent of success, all SC operations must clear the lock
        //
        m_dataCache_ptr->clearLocked(address);
    } else if (request->m_type == RubyRequestType_Load_Linked) {
        //
        // Note: To fully follow Alpha LLSC semantics, should the LL clear any
        // previously locked cache lines?
        //
        m_dataCache_ptr->setLocked(address, m_version);
    } else if ((m_dataCache_ptr->isTagPresent(address)) &&
               (m_dataCache_ptr->isLocked(address, m_version))) {
        //
        // Normal writes should clear the locked address
        //
        m_dataCache_ptr->clearLocked(address);
    }
    return success;
}

void
GPUCoalescer::writeCallback(Addr address, DataBlock& data)
{
    writeCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data)
{
    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime)
{
    writeCallback(address, mach, data,
                  initialRequestTime, forwardRequestTime, firstResponseTime,
                  false);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime,
                            bool isRegion)
{
    assert(address == makeLineAddress(address));

    DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
    assert(m_writeRequestTable.count(makeLineAddress(address)));

    RequestTable::iterator i = m_writeRequestTable.find(address);
    assert(i != m_writeRequestTable.end());
    GPUCoalescerRequest* request = i->second;

    m_writeRequestTable.erase(i);
    markRemoved();

    assert((request->m_type == RubyRequestType_ST) ||
           (request->m_type == RubyRequestType_ATOMIC) ||
           (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
           (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
           (request->m_type == RubyRequestType_RMW_Read) ||
           (request->m_type == RubyRequestType_RMW_Write) ||
           (request->m_type == RubyRequestType_Load_Linked) ||
           (request->m_type == RubyRequestType_Store_Conditional) ||
           (request->m_type == RubyRequestType_Locked_RMW_Read) ||
           (request->m_type == RubyRequestType_Locked_RMW_Write) ||
           (request->m_type == RubyRequestType_FLUSH));

    //
    // For Alpha, properly handle LL, SC, and write requests with respect to
    // locked cache blocks.
    //
    // Not valid for the Garnet_standalone protocol
    //
    bool success = true;
    if (!m_runningGarnetStandalone)
        success = handleLlsc(address, request);

    if (request->m_type == RubyRequestType_Locked_RMW_Read) {
        m_controller->blockOnQueue(address, m_mandatory_q_ptr);
    } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
        m_controller->unblock(address);
    }

    hitCallback(request, mach, data, success,
                request->issue_time, forwardRequestTime, firstResponseTime,
                isRegion);
}

void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
    readCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data)
{
    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime)
{
    readCallback(address, mach, data,
                 initialRequestTime, forwardRequestTime, firstResponseTime,
                 false);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime,
                           bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(m_readRequestTable.count(makeLineAddress(address)));

    DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
    RequestTable::iterator i = m_readRequestTable.find(address);
    assert(i != m_readRequestTable.end());
    GPUCoalescerRequest* request = i->second;

    m_readRequestTable.erase(i);
    markRemoved();

    assert((request->m_type == RubyRequestType_LD) ||
           (request->m_type == RubyRequestType_IFETCH));

    hitCallback(request, mach, data, true,
                request->issue_time, forwardRequestTime, firstResponseTime,
                isRegion);
}

void
GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
                          MachineType mach,
                          DataBlock& data,
                          bool success,
                          Cycles initialRequestTime,
                          Cycles forwardRequestTime,
                          Cycles firstResponseTime,
                          bool isRegion)
{
    PacketPtr pkt = srequest->pkt;
    Addr request_address = pkt->getAddr();
    Addr request_line_address = makeLineAddress(request_address);

    RubyRequestType type = srequest->m_type;

    // Set this cache entry to the most recently used
    if (type == RubyRequestType_IFETCH) {
        if (m_instCache_ptr->isTagPresent(request_line_address))
            m_instCache_ptr->setMRU(request_line_address);
    } else {
        if (m_dataCache_ptr->isTagPresent(request_line_address))
            m_dataCache_ptr->setMRU(request_line_address);
    }

    recordMissLatency(srequest, mach,
                      initialRequestTime,
                      forwardRequestTime,
                      firstResponseTime,
                      success, isRegion);
    // update the data
    //
    // Must do this for each request in the coalescer
    int len = reqCoalescer[request_line_address].size();
    std::vector<PacketPtr> mylist;
    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
        assert(type == reqCoalescer[request_line_address][i].primaryType);
        request_address = pkt->getAddr();
        request_line_address = makeLineAddress(pkt->getAddr());
        if (pkt->getPtr<uint8_t>()) {
            if ((type == RubyRequestType_LD) ||
                (type == RubyRequestType_ATOMIC) ||
                (type == RubyRequestType_ATOMIC_RETURN) ||
                (type == RubyRequestType_IFETCH) ||
                (type == RubyRequestType_RMW_Read) ||
                (type == RubyRequestType_Locked_RMW_Read) ||
                (type == RubyRequestType_Load_Linked)) {
                memcpy(pkt->getPtr<uint8_t>(),
                       data.getData(getOffset(request_address),
                                    pkt->getSize()),
                       pkt->getSize());
            } else {
                data.setData(pkt->getPtr<uint8_t>(),
                             getOffset(request_address), pkt->getSize());
            }
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING. Data not transferred from Ruby to M5 for type " \
                    "%s\n",
                    RubyRequestType_to_string(type));
        }

        // If using the RubyTester, update the RubyTester sender state's
        // subBlock with the received data. The tester will later access
        // this state.
        // Note: RubyPort will access its sender state before the
        // RubyTester.
        if (m_usingRubyTester) {
            RubyPort::SenderState *requestSenderState =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);
            RubyTester::SenderState* testerSenderState =
                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
            testerSenderState->subBlock.mergeFrom(data);
        }

        mylist.push_back(pkt);
    }
    delete srequest;
    reqCoalescer.erase(request_line_address);
    assert(!reqCoalescer.count(request_line_address));

    completeHitCallback(mylist, len);
}

bool
GPUCoalescer::empty() const
{
    return m_writeRequestTable.empty() && m_readRequestTable.empty();
}

// Analyzes the packet to see if this request can be coalesced.
// If the request can be coalesced, it is added to the reqCoalescer table
// and makeRequest returns RequestStatus_Issued.
// If this is the first request to a cacheline, the request is added to both
// the newRequests queue and the reqCoalescer table; makeRequest
// returns RequestStatus_Issued.
// If there is a pending request to this cacheline and this request
// can't be coalesced, RequestStatus_Aliased is returned and
// the packet needs to be reissued.
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
    // Check for GPU Barrier Kernel End or Kernel Begin
    // Leave these to be handled by the child class
    // Kernel End/Barrier = isFlush + isRelease
    // Kernel Begin = isFlush + isAcquire
    if (pkt->req->isKernel()) {
        if (pkt->req->isAcquire()) {
            // This is a Kernel Begin; leave handling to
            // virtual xCoalescer::makeRequest
            return RequestStatus_Issued;
        } else if (pkt->req->isRelease()) {
            // This is a Kernel End; leave handling to
            // virtual xCoalescer::makeRequest
            // If we are here then we didn't call
            // a virtual version of this function
            // so we will also schedule the callback
            int wf_id = 0;
            if (pkt->req->hasContextId()) {
                wf_id = pkt->req->contextId();
            }
            insertKernel(wf_id, pkt);
            newKernelEnds.push_back(wf_id);
            if (!issueEvent.scheduled()) {
                schedule(issueEvent, curTick());
            }
            return RequestStatus_Issued;
        }
    }

    // If the number of outstanding requests is greater than the max allowed,
    // return RequestStatus_BufferFull. This logic can be extended to
    // support proper backpressure.
    if (m_outstanding_count >= m_max_outstanding_requests) {
        return RequestStatus_BufferFull;
    }

    RubyRequestType primary_type = RubyRequestType_NULL;
    RubyRequestType secondary_type = RubyRequestType_NULL;

    if (pkt->isLLSC()) {
        //
        // Alpha LL/SC instructions need to be handled carefully by the cache
        // coherence protocol to ensure they follow the proper semantics. In
        // particular, by identifying the operations as atomic, the protocol
        // should understand that migratory sharing optimizations should not
        // be performed (i.e. a load between the LL and SC should not steal
        // away exclusive permission).
        //
        if (pkt->isWrite()) {
            primary_type = RubyRequestType_Store_Conditional;
        } else {
            assert(pkt->isRead());
            primary_type = RubyRequestType_Load_Linked;
        }
        secondary_type = RubyRequestType_ATOMIC;
    } else if (pkt->req->isLockedRMW()) {
        //
        // x86 locked instructions are translated to store cache coherence
        // requests because these requests should always be treated as read
        // exclusive operations and should leverage any migratory sharing
        // optimization built into the protocol.
        //
        if (pkt->isWrite()) {
            primary_type = RubyRequestType_Locked_RMW_Write;
        } else {
            assert(pkt->isRead());
            primary_type = RubyRequestType_Locked_RMW_Read;
        }
        secondary_type = RubyRequestType_ST;
    } else if (pkt->isAtomicOp()) {
        //
        // GPU Atomic Operation
        //
        primary_type = RubyRequestType_ATOMIC;
        secondary_type = RubyRequestType_ATOMIC;
    } else {
        if (pkt->isRead()) {
            if (pkt->req->isInstFetch()) {
                primary_type = secondary_type = RubyRequestType_IFETCH;
            } else {
#if THE_ISA == X86_ISA
                uint32_t flags = pkt->req->getFlags();
                bool storeCheck = flags &
                        (TheISA::StoreCheck << TheISA::FlagShift);
#else
                bool storeCheck = false;
#endif // X86_ISA
                if (storeCheck) {
                    primary_type = RubyRequestType_RMW_Read;
                    secondary_type = RubyRequestType_ST;
                } else {
                    primary_type = secondary_type = RubyRequestType_LD;
                }
            }
        } else if (pkt->isWrite()) {
            //
            // Note: M5 packets do not differentiate ST from RMW_Write
            //
            primary_type = secondary_type = RubyRequestType_ST;
        } else if (pkt->isFlush()) {
            primary_type = secondary_type = RubyRequestType_FLUSH;
        } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
            if (assumingRfOCoherence) {
                // If we reached here, this request must be a memFence,
                // and since the protocol implements RfO, the coalescer can
                // assume sequential consistency and schedule the callback
                // immediately.
                // Currently the code implements fence callbacks
                // by reusing the mechanism for kernel completions.
                // This should be fixed.
                int wf_id = 0;
                if (pkt->req->hasContextId()) {
                    wf_id = pkt->req->contextId();
                }
                insertKernel(wf_id, pkt);
                newKernelEnds.push_back(wf_id);
                if (!issueEvent.scheduled()) {
                    schedule(issueEvent, curTick());
                }
                return RequestStatus_Issued;
            } else {
                // If not RfO, return issued here and let the child coalescer
                // take care of it.
                return RequestStatus_Issued;
            }
        } else {
            panic("Unsupported ruby packet type\n");
        }
    }

    // Check if there is any pending request to this cache line from
    // previous cycles.
    // If there is a pending request, return aliased. Since coalescing
    // across time is not permitted, aliased requests are not coalesced.
    // If a request for this address has already been issued, we must block.
    RequestStatus status = getRequestStatus(pkt, primary_type);
    if (status != RequestStatus_Ready)
        return status;

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Check if this request can be coalesced with previous
    // requests from this cycle.
    if (!reqCoalescer.count(line_addr)) {
        // This is the first access to this cache line.
        // A new request to the memory subsystem has to be
        // made in the next cycle for this cache line, so
        // add this line addr to the "newRequests" queue
        newRequests.push_back(line_addr);

    // There was a request to this cache line in this cycle,
    // let us see if we can coalesce this request with the previous
    // requests from this cycle
    } else if (primary_type !=
               reqCoalescer[line_addr][0].primaryType) {
        // can't coalesce loads, stores and atomics!
        return RequestStatus_Aliased;
    } else if (pkt->req->isLockedRMW() ||
               reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) {
        // can't coalesce locked accesses, but can coalesce atomics!
        return RequestStatus_Aliased;
    } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
               pkt->req->contextId() !=
               reqCoalescer[line_addr][0].pkt->req->contextId()) {
        // can't coalesce releases from different wavefronts
        return RequestStatus_Aliased;
    }

    // in addition to the packet, we need to save both request types
    reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type);
    if (!issueEvent.scheduled())
        schedule(issueEvent, curTick());
    // TODO: issue hardware prefetches here
    return RequestStatus_Issued;
}

void
GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
{

    int proc_id = -1;
    if (pkt != NULL && pkt->req->hasContextId()) {
        proc_id = pkt->req->contextId();
    }

    // If valid, copy the pc to the ruby request
    Addr pc = 0;
    if (pkt->req->hasPC()) {
        pc = pkt->req->getPC();
    }

    // At the moment setting scopes only counts
    // for GPU spill space accesses,
    // which is pkt->req->isStack().
    // This scope is REPLACE since it
    // does not need to be flushed at the end
    // of a kernel. Private and local may need
    // to be visible at the end of the kernel.
    HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
    HSAScope accessScope = reqScopeToHSAScope(pkt->req);

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Creating a WriteMask that records written bytes
    // and atomic operations. This enables partial writes
    // and partial reads of those writes
    DataBlock dataBlock;
    dataBlock.clear();
    uint32_t blockSize = RubySystem::getBlockSizeBytes();
    std::vector<bool> accessMask(blockSize, false);
    std::vector< std::pair<int, AtomicOpFunctor*> > atomicOps;
    uint32_t tableSize = reqCoalescer[line_addr].size();
    for (int i = 0; i < tableSize; i++) {
        PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt;
        uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
        uint32_t tmpSize = tmpPkt->getSize();
        if (tmpPkt->isAtomicOp()) {
            std::pair<int, AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
                                                          tmpPkt->getAtomicOp());
            atomicOps.push_back(tmpAtomicOp);
        } else if (tmpPkt->isWrite()) {
            dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
                              tmpOffset, tmpSize);
        }
        for (int j = 0; j < tmpSize; j++) {
            accessMask[tmpOffset + j] = true;
        }
    }
    std::shared_ptr<RubyRequest> msg;
    if (pkt->isAtomicOp()) {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                              pkt->getPtr<uint8_t>(),
                              pkt->getSize(), pc, secondary_type,
                              RubyAccessMode_Supervisor, pkt,
                              PrefetchBit_No, proc_id, 100,
                              blockSize, accessMask,
                              dataBlock, atomicOps,
                              accessScope, accessSegment);
    } else {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                              pkt->getPtr<uint8_t>(),
                              pkt->getSize(), pc, secondary_type,
                              RubyAccessMode_Supervisor, pkt,
                              PrefetchBit_No, proc_id, 100,
                              blockSize, accessMask,
                              dataBlock,
                              accessScope, accessSegment);
    }
    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
             curTick(), m_version, "Coal", "Begin", "", "",
             printAddress(msg->getPhysicalAddress()),
             RubyRequestType_to_string(secondary_type));

    fatal_if(secondary_type == RubyRequestType_IFETCH,
             "there should not be any I-Fetch requests in the GPU Coalescer");

    // Send the message to the cache controller
    fatal_if(m_data_cache_hit_latency == 0,
             "should not have a latency of zero");

    assert(m_mandatory_q_ptr);
    m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
}

template <class KEY, class VALUE>
std::ostream &
operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
    out << "[";
    for (auto i = map.begin(); i != map.end(); ++i)
        out << " " << i->first << "=" << i->second;
    out << " ]";

    return out;
}

void
GPUCoalescer::print(ostream& out) const
{
    out << "[GPUCoalescer: " << m_version
        << ", outstanding requests: " << m_outstanding_count
        << ", read request table: " << m_readRequestTable
        << ", write request table: " << m_writeRequestTable
        << "]";
}

// this can be called from setState whenever coherence permissions are
// upgraded; when invoked, coherence violations will be checked for the
// given block
void
GPUCoalescer::checkCoherence(Addr addr)
{
#ifdef CHECK_COHERENCE
    m_ruby_system->checkGlobalCoherenceInvariant(addr);
#endif
}

void
GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
    DPRINTF(RubyStats, "Recorded statistic: %s\n",
            SequencerRequestType_to_string(requestType));
}

GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq)
    : Event(Progress_Event_Pri), seq(_seq)
{
}

void
GPUCoalescer::completeIssue()
{
    // newRequests has the cacheline addresses of all the
    // requests which need to be issued to the memory subsystem
    // in this cycle
    int len = newRequests.size();
    DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
    for (int i = 0; i < len; ++i) {
        // Get the requests from the reqCoalescer table. Get only the
        // first request for each cacheline; the remaining requests
        // can be coalesced with the first request. So, only
        // one request is issued per cacheline.
        RequestDesc info = reqCoalescer[newRequests[i]][0];
        PacketPtr pkt = info.pkt;
        DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
                i, pkt->req->getPaddr());
        // Insert this request to the read/writeRequestTables. These tables
        // are used to track aliased requests in the makeRequest subroutine
        bool found = insertRequest(pkt, info.primaryType);

        if (found) {
            panic("GPUCoalescer::makeRequest should never be called if the "
                  "request is already outstanding\n");
        }

        // Issue request to ruby subsystem
        issueRequest(pkt, info.secondaryType);
    }
    newRequests.clear();

    // have Kernel End releases been issued this cycle
    len = newKernelEnds.size();
    for (int i = 0; i < len; i++) {
        kernelCallback(newKernelEnds[i]);
    }
    newKernelEnds.clear();
}

void
GPUCoalescer::IssueEvent::process()
{
    seq->completeIssue();
}

const char *
GPUCoalescer::IssueEvent::description() const
{
    return "Issue coalesced request";
}

void
GPUCoalescer::evictionCallback(Addr address)
{
    ruby_eviction_callback(address);
}

void
GPUCoalescer::kernelCallback(int wavefront_id)
{
    assert(kernelEndList.count(wavefront_id));

    ruby_hit_callback(kernelEndList[wavefront_id]);

    kernelEndList.erase(wavefront_id);
}

void
GPUCoalescer::atomicCallback(Addr address,
                             MachineType mach,
                             const DataBlock& data)
{
    assert(address == makeLineAddress(address));

    DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
    assert(m_writeRequestTable.count(makeLineAddress(address)));

    RequestTable::iterator i = m_writeRequestTable.find(address);
    assert(i != m_writeRequestTable.end());
    GPUCoalescerRequest* srequest = i->second;

    m_writeRequestTable.erase(i);
    markRemoved();

    assert((srequest->m_type == RubyRequestType_ATOMIC) ||
           (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
           (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));

    // Atomics don't write to cache, so there is no MRU update...

    recordMissLatency(srequest, mach,
                      srequest->issue_time, Cycles(0), Cycles(0), true, false);

    PacketPtr pkt = srequest->pkt;
    Addr request_address = pkt->getAddr();
    Addr request_line_address = makeLineAddress(pkt->getAddr());

    int len = reqCoalescer[request_line_address].size();
    std::vector<PacketPtr> mylist;
    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
        assert(srequest->m_type ==
               reqCoalescer[request_line_address][i].primaryType);
        request_address = (pkt->getAddr());
        request_line_address = makeLineAddress(request_address);
        if (pkt->getPtr<uint8_t>() &&
            srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
            /* atomics are done in memory, and return the data
             * *before* the atomic op... */
            memcpy(pkt->getPtr<uint8_t>(),
                   data.getData(getOffset(request_address),
                                pkt->getSize()),
                   pkt->getSize());
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING. Data not transferred from Ruby to M5 for type " \
                    "%s\n",
                    RubyRequestType_to_string(srequest->m_type));
        }

        // If using the RubyTester, update the RubyTester sender state's
        // subBlock with the received data. The tester will later access
        // this state.
        // Note: RubyPort will access its sender state before the
        // RubyTester.
        if (m_usingRubyTester) {
            RubyPort::SenderState *requestSenderState =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);
            RubyTester::SenderState* testerSenderState =
                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
            testerSenderState->subBlock.mergeFrom(data);
        }

        mylist.push_back(pkt);
    }
    delete srequest;
    reqCoalescer.erase(request_line_address);
    assert(!reqCoalescer.count(request_line_address));

    completeHitCallback(mylist, len);
}

void
GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPLdHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPLdTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCLdHits++;
    } else {
        CP_LdMiss++;
    }
}

void
GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPStHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPStTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCStHits++;
    } else {
        CP_StMiss++;
    }
}

void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
{
    for (int i = 0; i < len; ++i) {
        RubyPort::SenderState *ss =
            safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
        MemSlavePort *port = ss->port;
        assert(port != NULL);

        mylist[i]->senderState = ss->predecessor;
        delete ss;
        port->hitCallback(mylist[i]);
        trySendRetries();
    }

    testDrainComplete();
}

PacketPtr
GPUCoalescer::mapAddrToPkt(Addr address)
{
    RequestTable::iterator i = m_readRequestTable.find(address);
    assert(i != m_readRequestTable.end());
    GPUCoalescerRequest* request = i->second;
    return request->pkt;
}

void
GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
                                MachineType mach,
                                Cycles initialRequestTime,
                                Cycles forwardRequestTime,
                                Cycles firstResponseTime,
                                bool success, bool isRegion)
{
    RubyRequestType type = srequest->m_type;
    Cycles issued_time = srequest->issue_time;
    Cycles completion_time = curCycle();
    assert(completion_time >= issued_time);
    Cycles total_lat = completion_time - issued_time;

    // cache stats (valid for RfO protocol only)
    if (mach == MachineType_TCP) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdHits++;
        } else {
            GPU_TCPStHits++;
        }
    } else if (mach == MachineType_L1Cache_wCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdTransfers++;
        } else {
            GPU_TCPStTransfers++;
        }
    } else if (mach == MachineType_TCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCCLdHits++;
        } else {
            GPU_TCCStHits++;
        }
    } else {
        if (type == RubyRequestType_LD) {
            GPU_LdMiss++;
        } else {
            GPU_StMiss++;
        }
    }

    // Profile all access latency, even zero latency accesses
    m_latencyHist.sample(total_lat);
    m_typeLatencyHist[type]->sample(total_lat);

    // Profile the miss latency for all non-zero demand misses
    if (total_lat != Cycles(0)) {
        m_missLatencyHist.sample(total_lat);
        m_missTypeLatencyHist[type]->sample(total_lat);

        if (mach != MachineType_NUM) {
            m_missMachLatencyHist[mach]->sample(total_lat);
            m_missTypeMachLatencyHist[type][mach]->sample(total_lat);

            if ((issued_time <= initialRequestTime) &&
                (initialRequestTime <= forwardRequestTime) &&
                (forwardRequestTime <= firstResponseTime) &&
                (firstResponseTime <= completion_time)) {

                m_IssueToInitialDelayHist[mach]->sample(
                    initialRequestTime - issued_time);
                m_InitialToForwardDelayHist[mach]->sample(
                    forwardRequestTime - initialRequestTime);
                m_ForwardToFirstResponseDelayHist[mach]->sample(
                    firstResponseTime - forwardRequestTime);
                m_FirstResponseToCompletionDelayHist[mach]->sample(
                    completion_time - firstResponseTime);
            }
        }

    }

    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
             curTick(), m_version, "Coal",
             success ? "Done" : "SC_Failed", "", "",
             printAddress(srequest->pkt->getAddr()), total_lat);
}

void
GPUCoalescer::regStats()
{
    RubyPort::regStats();

    // These statistical variables are not for display.
    // The profiler will collate these across different
    // coalescers and display those collated statistics.
    m_outstandReqHist.init(10);
    m_latencyHist.init(10);
    m_missLatencyHist.init(10);

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist.push_back(new Stats::Histogram());
        m_typeLatencyHist[i]->init(10);

        m_missTypeLatencyHist.push_back(new Stats::Histogram());
        m_missTypeLatencyHist[i]->init(10);
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist.push_back(new Stats::Histogram());
        m_missMachLatencyHist[i]->init(10);

        m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
        m_IssueToInitialDelayHist[i]->init(10);

        m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
        m_InitialToForwardDelayHist[i]->init(10);

        m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
        m_ForwardToFirstResponseDelayHist[i]->init(10);

        m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
        m_FirstResponseToCompletionDelayHist[i]->init(10);
    }

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());

        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
            m_missTypeMachLatencyHist[i][j]->init(10);
        }
    }

    // GPU cache stats
    GPU_TCPLdHits
        .name(name() + ".gpu_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    GPU_TCPLdTransfers
        .name(name() + ".gpu_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    GPU_TCCLdHits
        .name(name() + ".gpu_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    GPU_LdMiss
        .name(name() + ".gpu_ld_misses")
        .desc("loads that miss in the GPU")
        ;

    GPU_TCPStHits
        .name(name() + ".gpu_tcp_st_hits")
        .desc("stores that hit in the TCP")
        ;
    GPU_TCPStTransfers
        .name(name() + ".gpu_tcp_st_transfers")
        .desc("TCP to TCP store transfers")
        ;
    GPU_TCCStHits
        .name(name() + ".gpu_tcc_st_hits")
        .desc("stores that hit in the TCC")
        ;
    GPU_StMiss
        .name(name() + ".gpu_st_misses")
        .desc("stores that miss in the GPU")
        ;

    // CP cache stats
    CP_TCPLdHits
        .name(name() + ".cp_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    CP_TCPLdTransfers
        .name(name() + ".cp_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    CP_TCCLdHits
        .name(name() + ".cp_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    CP_LdMiss
        .name(name() + ".cp_ld_misses")
        .desc("loads that miss in the GPU")
        ;

    CP_TCPStHits
        .name(name() + ".cp_tcp_st_hits")
        .desc("stores that hit in the TCP")
        ;
    CP_TCPStTransfers
        .name(name() + ".cp_tcp_st_transfers")
        .desc("TCP to TCP store transfers")
        ;
    CP_TCCStHits
        .name(name() + ".cp_tcc_st_hits")
        .desc("stores that hit in the TCC")
        ;
    CP_StMiss
        .name(name() + ".cp_st_misses")
        .desc("stores that miss in the GPU")
        ;
}