ruby: Added merge GETS optimization to hammer
Added an optimization that merges multiple pending GETS requests into a single request to the owner node.
This commit is contained in:
@@ -95,6 +95,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
|
||||
// Requests
|
||||
Other_GETX, desc="A GetX from another processor";
|
||||
Other_GETS, desc="A GetS from another processor";
|
||||
Merged_GETS, desc="A Merged GetS from another processor";
|
||||
Other_GETS_No_Mig, desc="A GetS from another processor";
|
||||
Invalidate, desc="Invalidate block";
|
||||
|
||||
@@ -136,6 +137,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
|
||||
int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for";
|
||||
bool Sharers, desc="On a GetS, did we find any other sharers in the system";
|
||||
MachineID LastResponder, desc="last machine to send a response for this request";
|
||||
MachineID CurOwner, desc="current owner of the block, used for UnblockS responses";
|
||||
Time InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache";
|
||||
Time ForwardRequestTime, default="0", desc="time the dir forwarded the request";
|
||||
Time FirstResponseTime, default="0", desc="the time the first response was received";
|
||||
@@ -286,6 +288,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
|
||||
peek(forwardToCache_in, RequestMsg, block_on="Address") {
|
||||
if (in_msg.Type == CoherenceRequestType:GETX) {
|
||||
trigger(Event:Other_GETX, in_msg.Address);
|
||||
} else if (in_msg.Type == CoherenceRequestType:MERGED_GETS) {
|
||||
trigger(Event:Merged_GETS, in_msg.Address);
|
||||
} else if (in_msg.Type == CoherenceRequestType:GETS) {
|
||||
if (isCacheTagPresent(in_msg.Address)) {
|
||||
if (getCacheEntry(in_msg.Address).AtomicAccessed && no_mig_atomic) {
|
||||
@@ -518,6 +522,24 @@ machine(L1Cache, "AMD Hammer-like protocol")
|
||||
}
|
||||
}
|
||||
|
||||
action(em_sendDataSharedMultiple, "em", desc="Send data from cache to all requestors") {
// Answer a MERGED_GETS while the block is still resident in the cache:
// multicast one DATA_SHARED response to the whole set of merged
// requestors the directory recorded in in_msg.MergedRequestors.
|
||||
peek(forwardToCache_in, RequestMsg) {
|
||||
enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
|
||||
out_msg.Address := address;
|
||||
out_msg.Type := CoherenceResponseType:DATA_SHARED;
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.Destination := in_msg.MergedRequestors;
|
||||
out_msg.DataBlk := getCacheEntry(address).DataBlk;
|
||||
DEBUG_EXPR(out_msg.DataBlk);
|
||||
out_msg.Dirty := getCacheEntry(address).Dirty;
|
||||
// Ack count covers every L1 cache in the system.
out_msg.Acks := machineCount(MachineType:L1Cache);
|
||||
out_msg.MessageSize := MessageSizeType:Response_Data;
|
||||
out_msg.InitialRequestTime := in_msg.InitialRequestTime;
|
||||
out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(f_sendAck, "f", desc="Send ack from cache to requestor") {
|
||||
peek(forwardToCache_in, RequestMsg) {
|
||||
enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
|
||||
@@ -575,6 +597,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
|
||||
out_msg.Address := address;
|
||||
out_msg.Type := CoherenceResponseType:UNBLOCKS;
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.CurOwner := TBEs[address].CurOwner;
|
||||
out_msg.Destination.add(map_Address_to_Directory(address));
|
||||
out_msg.MessageSize := MessageSizeType:Unblock_Control;
|
||||
}
|
||||
@@ -690,6 +713,11 @@ machine(L1Cache, "AMD Hammer-like protocol")
|
||||
}
|
||||
}
|
||||
}
|
||||
action(uo_updateCurrentOwner, "uo", desc="When moving SS state, update current owner.") {
// Remember which machine supplied the data response; the UnblockS
// message later copies TBEs[address].CurOwner so the directory knows
// which node owns the block (e.g. as the target of a merged GETS).
|
||||
peek(responseToCache_in, ResponseMsg) {
|
||||
TBEs[address].CurOwner := in_msg.Sender;
|
||||
}
|
||||
}
|
||||
|
||||
action(n_popResponseQueue, "n", desc="Pop response queue") {
|
||||
responseToCache_in.dequeue();
|
||||
@@ -745,6 +773,24 @@ machine(L1Cache, "AMD Hammer-like protocol")
|
||||
}
|
||||
}
|
||||
|
||||
action(qm_sendDataFromTBEToCache, "qm", desc="Send data from TBE to cache, multiple sharers") {
// Answer a MERGED_GETS after the cache entry has been evicted (the
// OI/MI writeback-transient transitions use this action): data and the
// dirty bit come from the TBE, and the DATA response is multicast to
// the full merged-requestor set.
|
||||
peek(forwardToCache_in, RequestMsg) {
|
||||
enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) {
|
||||
out_msg.Address := address;
|
||||
out_msg.Type := CoherenceResponseType:DATA;
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.Destination := in_msg.MergedRequestors;
|
||||
DEBUG_EXPR(out_msg.Destination);
|
||||
out_msg.DataBlk := TBEs[address].DataBlk;
|
||||
out_msg.Dirty := TBEs[address].Dirty;
|
||||
out_msg.Acks := machineCount(MachineType:L1Cache);
|
||||
out_msg.MessageSize := MessageSizeType:Response_Data;
|
||||
out_msg.InitialRequestTime := in_msg.InitialRequestTime;
|
||||
out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(qq_sendDataFromTBEToMemory, "\q", desc="Send data from TBE to memory") {
|
||||
enqueue(unblockNetwork_out, ResponseMsg, latency=cache_response_latency) {
|
||||
out_msg.Address := address;
|
||||
@@ -899,7 +945,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
|
||||
zz_recycleMandatoryQueue;
|
||||
}
|
||||
|
||||
transition({IT, ST, OT, MT, MMT}, {Other_GETX, Other_GETS, Other_GETS_No_Mig, Invalidate}) {
|
||||
transition({IT, ST, OT, MT, MMT}, {Other_GETX, Other_GETS, Merged_GETS, Other_GETS_No_Mig, Invalidate}) {
|
||||
// stall
|
||||
}
|
||||
|
||||
@@ -1111,6 +1157,11 @@ machine(L1Cache, "AMD Hammer-like protocol")
|
||||
l_popForwardQueue;
|
||||
}
|
||||
|
||||
transition(O, Merged_GETS) {
|
||||
em_sendDataSharedMultiple;
|
||||
l_popForwardQueue;
|
||||
}
|
||||
|
||||
// Transitions from Modified
|
||||
transition(MM, {Load, Ifetch}) {
|
||||
h_load_hit;
|
||||
@@ -1143,6 +1194,11 @@ machine(L1Cache, "AMD Hammer-like protocol")
|
||||
l_popForwardQueue;
|
||||
}
|
||||
|
||||
transition(MM, Merged_GETS, O) {
|
||||
em_sendDataSharedMultiple;
|
||||
l_popForwardQueue;
|
||||
}
|
||||
|
||||
// Transitions from Dirty Exclusive
|
||||
transition(M, {Load, Ifetch}) {
|
||||
h_load_hit;
|
||||
@@ -1170,6 +1226,11 @@ machine(L1Cache, "AMD Hammer-like protocol")
|
||||
l_popForwardQueue;
|
||||
}
|
||||
|
||||
transition(M, Merged_GETS, O) {
|
||||
em_sendDataSharedMultiple;
|
||||
l_popForwardQueue;
|
||||
}
|
||||
|
||||
// Transitions from IM
|
||||
|
||||
transition(IM, {Other_GETX, Other_GETS, Other_GETS_No_Mig, Invalidate}) {
|
||||
@@ -1249,6 +1310,11 @@ machine(L1Cache, "AMD Hammer-like protocol")
|
||||
l_popForwardQueue;
|
||||
}
|
||||
|
||||
transition(OM, Merged_GETS) {
|
||||
em_sendDataSharedMultiple;
|
||||
l_popForwardQueue;
|
||||
}
|
||||
|
||||
transition(OM, Ack) {
|
||||
m_decrementNumberOfMessages;
|
||||
o_checkForCompletion;
|
||||
@@ -1287,6 +1353,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
|
||||
m_decrementNumberOfMessages;
|
||||
o_checkForCompletion;
|
||||
hx_external_load_hit;
|
||||
uo_updateCurrentOwner;
|
||||
n_popResponseQueue;
|
||||
}
|
||||
|
||||
@@ -1304,6 +1371,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
|
||||
m_decrementNumberOfMessages;
|
||||
o_checkForCompletion;
|
||||
hx_external_load_hit;
|
||||
uo_updateCurrentOwner;
|
||||
n_popResponseQueue;
|
||||
}
|
||||
|
||||
@@ -1385,6 +1453,11 @@ machine(L1Cache, "AMD Hammer-like protocol")
|
||||
l_popForwardQueue;
|
||||
}
|
||||
|
||||
transition({OI, MI}, Merged_GETS, OI) {
|
||||
qm_sendDataFromTBEToCache;
|
||||
l_popForwardQueue;
|
||||
}
|
||||
|
||||
transition(MI, Writeback_Ack, I) {
|
||||
t_sendExclusiveDataFromTBEToMemory;
|
||||
s_deallocateTBE;
|
||||
|
||||
@@ -69,6 +69,9 @@ machine(Directory, "AMD Hammer-like protocol")
|
||||
NO_R, desc="Was Not Owner or Sharer, replacing probe filter entry";
|
||||
|
||||
NO_B, "NO^B", desc="Not Owner, Blocked";
|
||||
NO_B_X, "NO^B", desc="Not Owner, Blocked, next queued request GETX";
|
||||
NO_B_S, "NO^B", desc="Not Owner, Blocked, next queued request GETS";
|
||||
NO_B_S_W, "NO^B", desc="Not Owner, Blocked, forwarded merged GETS, waiting for responses";
|
||||
O_B, "O^B", desc="Owner, Blocked";
|
||||
NO_B_W, desc="Not Owner, Blocked, waiting for Dram";
|
||||
O_B_W, desc="Owner, Blocked, waiting for Dram";
|
||||
@@ -121,6 +124,7 @@ machine(Directory, "AMD Hammer-like protocol")
|
||||
All_acks_and_shared_data, desc="Received shared data and message acks";
|
||||
All_acks_and_owner_data, desc="Received shared data and message acks";
|
||||
All_acks_and_data_no_sharers, desc="Received all acks and no other processor has a shared copy";
|
||||
All_Unblocks, desc="Received all unblocks for a merged gets request";
|
||||
}
|
||||
|
||||
// TYPES
|
||||
@@ -148,6 +152,7 @@ machine(Directory, "AMD Hammer-like protocol")
|
||||
DataBlock DataBlk, desc="The current view of system memory";
|
||||
int Len, desc="...";
|
||||
MachineID DmaRequestor, desc="DMA requestor";
|
||||
NetDest GetSRequestors, desc="GETS merged requestors";
|
||||
int NumPendingMsgs, desc="Number of pending acks/messages";
|
||||
bool CacheDirty, default="false", desc="Indicates whether a cache has responded with dirty data";
|
||||
bool Sharers, default="false", desc="Indicates whether a cache has indicated it is currently a sharer";
|
||||
@@ -243,6 +248,8 @@ machine(Directory, "AMD Hammer-like protocol")
|
||||
trigger(Event:All_acks_and_shared_data, in_msg.Address);
|
||||
} else if (in_msg.Type == TriggerType:ALL_ACKS_NO_SHARERS) {
|
||||
trigger(Event:All_acks_and_data_no_sharers, in_msg.Address);
|
||||
} else if (in_msg.Type == TriggerType:ALL_UNBLOCKS) {
|
||||
trigger(Event:All_Unblocks, in_msg.Address);
|
||||
} else {
|
||||
error("Unexpected message");
|
||||
}
|
||||
@@ -487,6 +494,20 @@ machine(Directory, "AMD Hammer-like protocol")
|
||||
}
|
||||
}
|
||||
|
||||
action(mu_decrementNumberOfUnblocks, "mu", desc="Decrement the number of messages for which we're waiting") {
|
||||
peek(unblockNetwork_in, ResponseMsg) {
|
||||
assert(in_msg.Type == CoherenceResponseType:UNBLOCKS);
|
||||
DEBUG_EXPR(TBEs[address].NumPendingMsgs);
|
||||
//
|
||||
// Exactly one UNBLOCKS response is expected per merged GETS requestor
|
||||
// (NumPendingMsgs was initialized to GetSRequestors.count() by
|
||||
// sp_setPendingMsgsToMergedSharers), so decrement by one per message.
|
||||
//
|
||||
TBEs[address].NumPendingMsgs := TBEs[address].NumPendingMsgs - 1;
|
||||
DEBUG_EXPR(TBEs[address].NumPendingMsgs);
|
||||
}
|
||||
}
|
||||
|
||||
action(n_popResponseQueue, "n", desc="Pop response queue") {
|
||||
responseToDir_in.dequeue();
|
||||
}
|
||||
@@ -508,6 +529,19 @@ machine(Directory, "AMD Hammer-like protocol")
|
||||
}
|
||||
}
|
||||
|
||||
action(os_checkForMergedGetSCompletion, "os", desc="Check for merged GETS completion") {
// When the last merged requestor has unblocked (NumPendingMsgs reached
// zero), schedule the All_Unblocks trigger so the directory can leave
// the NO_B_S_W waiting state.
|
||||
if (TBEs[address].NumPendingMsgs == 0) {
|
||||
enqueue(triggerQueue_out, TriggerMsg) {
|
||||
out_msg.Address := address;
|
||||
out_msg.Type := TriggerType:ALL_UNBLOCKS;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(sp_setPendingMsgsToMergedSharers, "sp", desc="Set pending messages to waiting sharers") {
// Expect one UnblockS per requestor recorded in the merge set.
|
||||
TBEs[address].NumPendingMsgs := TBEs[address].GetSRequestors.count();
|
||||
}
|
||||
|
||||
action(spa_setPendingAcksToZeroIfPF, "spa", desc="if probe filter, no need to wait for acks") {
|
||||
if (probe_filter_enabled) {
|
||||
TBEs[address].NumPendingMsgs := 0;
|
||||
@@ -598,6 +632,12 @@ machine(Directory, "AMD Hammer-like protocol")
|
||||
}
|
||||
}
|
||||
|
||||
action(rs_recordGetSRequestor, "rs", desc="Record GETS requestor in TBE") {
// Accumulate this GETS requestor into the TBE merge set; the whole set
// is later serviced by a single MERGED_GETS forward to the owner.
|
||||
peek(requestQueue_in, RequestMsg) {
|
||||
TBEs[address].GetSRequestors.add(in_msg.Requestor);
|
||||
}
|
||||
}
|
||||
|
||||
action(r_setSharerBit, "r", desc="We saw other sharers") {
|
||||
TBEs[address].Sharers := true;
|
||||
}
|
||||
@@ -694,6 +734,29 @@ machine(Directory, "AMD Hammer-like protocol")
|
||||
}
|
||||
}
|
||||
|
||||
action(fr_forwardMergeReadRequestsToOwner, "frr", desc="Forward coalesced read request to owner") {
|
||||
assert(machineCount(MachineType:L1Cache) > 1);
|
||||
//
|
||||
// Fixme! The unblock network should not stall on the forward network. Add a trigger queue to
|
||||
// decouple the two.
|
||||
//
|
||||
peek(unblockNetwork_in, ResponseMsg) {
|
||||
enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) {
|
||||
out_msg.Address := address;
|
||||
out_msg.Type := CoherenceRequestType:MERGED_GETS;
|
||||
// Carry the entire merge set so the owner can multicast its reply.
out_msg.MergedRequestors := TBEs[address].GetSRequestors;
|
||||
// An UnblockS names the current owner explicitly in its CurOwner
// field; for any other unblock, target the sender itself.
if (in_msg.Type == CoherenceResponseType:UNBLOCKS) {
|
||||
out_msg.Destination.add(in_msg.CurOwner);
|
||||
} else {
|
||||
out_msg.Destination.add(in_msg.Sender);
|
||||
}
|
||||
out_msg.MessageSize := MessageSizeType:Request_Control;
|
||||
out_msg.InitialRequestTime := zero_time();
|
||||
out_msg.ForwardRequestTime := get_time();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(fc_forwardRequestConditionalOwner, "fc", desc="Forward request to one or more nodes") {
|
||||
assert(machineCount(MachineType:L1Cache) > 1);
|
||||
if (probe_filter_enabled) {
|
||||
@@ -1058,31 +1121,81 @@ machine(Directory, "AMD Hammer-like protocol")
|
||||
}
|
||||
|
||||
// Blocked transient states
|
||||
transition({NO_B, O_B, NO_DR_B_W, NO_DW_B_W, NO_B_W, NO_DR_B_D,
|
||||
NO_DR_B, O_DR_B, O_B_W, O_DR_B_W, NO_DW_W,
|
||||
transition({NO_B_X, O_B, NO_DR_B_W, NO_DW_B_W, NO_B_W, NO_DR_B_D,
|
||||
NO_DR_B, O_DR_B, O_B_W, O_DR_B_W, NO_DW_W, NO_B_S_W,
|
||||
NO_W, O_W, WB, WB_E_W, WB_O_W, O_R, S_R, NO_R},
|
||||
{GETS, GETX, PUT, Pf_Replacement}) {
|
||||
z_stallAndWaitRequest;
|
||||
}
|
||||
|
||||
transition({NO_B, O_B, NO_DR_B_W, NO_DW_B_W, NO_B_W, NO_DR_B_D,
|
||||
NO_DR_B, O_DR_B, O_B_W, O_DR_B_W, NO_DW_W,
|
||||
transition(NO_B, GETX, NO_B_X) {
|
||||
z_stallAndWaitRequest;
|
||||
}
|
||||
|
||||
transition(NO_B, {PUT, Pf_Replacement}) {
|
||||
z_stallAndWaitRequest;
|
||||
}
|
||||
|
||||
transition(NO_B_S, {GETX, PUT, Pf_Replacement}) {
|
||||
z_stallAndWaitRequest;
|
||||
}
|
||||
|
||||
transition({NO_B, NO_B_S, O_B, NO_DR_B_W, NO_DW_B_W, NO_B_W, NO_DR_B_D,
|
||||
NO_DR_B, O_DR_B, O_B_W, O_DR_B_W, NO_DW_W, NO_B_S_W,
|
||||
NO_W, O_W, WB, WB_E_W, WB_O_W, O_R, S_R, NO_R},
|
||||
{DMA_READ, DMA_WRITE}) {
|
||||
zd_stallAndWaitDMARequest;
|
||||
}
|
||||
|
||||
transition(NO_B, UnblockS, NX) {
|
||||
//
// Merge GETS: while a GETS is already outstanding (NO_B), record any
// further GETS requestors in the TBE (GetSRequestors) instead of
// stalling them; the whole set is later answered via one MERGED_GETS
// forward to the owner.
//
|
||||
transition(NO_B, GETS, NO_B_S) {
|
||||
v_allocateTBE;
|
||||
rs_recordGetSRequestor;
|
||||
i_popIncomingRequestQueue;
|
||||
}
|
||||
|
||||
transition(NO_B_S, GETS) {
|
||||
rs_recordGetSRequestor;
|
||||
i_popIncomingRequestQueue;
|
||||
}
|
||||
|
||||
// unblock responses
|
||||
transition({NO_B, NO_B_X}, UnblockS, NX) {
|
||||
k_wakeUpDependents;
|
||||
j_popIncomingUnblockQueue;
|
||||
}
|
||||
|
||||
transition(NO_B, UnblockM, NO) {
|
||||
transition({NO_B, NO_B_X}, UnblockM, NO) {
|
||||
uo_updateOwnerIfPf;
|
||||
k_wakeUpDependents;
|
||||
j_popIncomingUnblockQueue;
|
||||
}
|
||||
|
||||
//
// Once the original GETS unblocks, forward a single MERGED_GETS to the
// current owner and wait in NO_B_S_W for an UnblockS from each merged
// requestor (counted by sp_setPendingMsgsToMergedSharers); the
// All_Unblocks trigger then reopens the directory entry (-> NX).
//
transition(NO_B_S, UnblockS, NO_B_S_W) {
|
||||
fr_forwardMergeReadRequestsToOwner;
|
||||
sp_setPendingMsgsToMergedSharers;
|
||||
j_popIncomingUnblockQueue;
|
||||
}
|
||||
|
||||
transition(NO_B_S, UnblockM, NO_B_S_W) {
|
||||
uo_updateOwnerIfPf;
|
||||
fr_forwardMergeReadRequestsToOwner;
|
||||
sp_setPendingMsgsToMergedSharers;
|
||||
j_popIncomingUnblockQueue;
|
||||
}
|
||||
|
||||
transition(NO_B_S_W, UnblockS) {
|
||||
mu_decrementNumberOfUnblocks;
|
||||
os_checkForMergedGetSCompletion;
|
||||
j_popIncomingUnblockQueue;
|
||||
}
|
||||
|
||||
transition(NO_B_S_W, All_Unblocks, NX) {
|
||||
w_deallocateTBE;
|
||||
k_wakeUpDependents;
|
||||
g_popTriggerQueue;
|
||||
}
|
||||
|
||||
transition(O_B, UnblockS, O) {
|
||||
k_wakeUpDependents;
|
||||
j_popIncomingUnblockQueue;
|
||||
@@ -1315,7 +1428,12 @@ machine(Directory, "AMD Hammer-like protocol")
|
||||
l_popMemQueue;
|
||||
}
|
||||
|
||||
transition(NO_B_W, {UnblockM, UnblockS}, NO_W) {
|
||||
transition(NO_B_W, UnblockM, NO_W) {
|
||||
uo_updateOwnerIfPf;
|
||||
j_popIncomingUnblockQueue;
|
||||
}
|
||||
|
||||
transition(NO_B_W, UnblockS, NO_W) {
|
||||
j_popIncomingUnblockQueue;
|
||||
}
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
enumeration(CoherenceRequestType, desc="...") {
|
||||
GETX, desc="Get eXclusive";
|
||||
GETS, desc="Get Shared";
|
||||
MERGED_GETS, desc="Get Shared";
|
||||
PUT, desc="Put Ownership";
|
||||
WB_ACK, desc="Writeback ack";
|
||||
WB_NACK, desc="Writeback neg. ack";
|
||||
@@ -62,6 +63,7 @@ enumeration(TriggerType, desc="...") {
|
||||
ALL_ACKS, desc="See corresponding event";
|
||||
ALL_ACKS_OWNER_EXISTS,desc="See corresponding event";
|
||||
ALL_ACKS_NO_SHARERS, desc="See corresponding event";
|
||||
ALL_UNBLOCKS, desc="all unblockS received";
|
||||
}
|
||||
|
||||
// TriggerMsg
|
||||
@@ -75,6 +77,7 @@ structure(RequestMsg, desc="...", interface="NetworkMessage") {
|
||||
Address Address, desc="Physical address for this request";
|
||||
CoherenceRequestType Type, desc="Type of request (GetS, GetX, PutX, etc)";
|
||||
MachineID Requestor, desc="Node who initiated the request";
|
||||
NetDest MergedRequestors, desc="Merge set of read requestors";
|
||||
NetDest Destination, desc="Multicast destination mask";
|
||||
MessageSizeType MessageSize, desc="size category of the message";
|
||||
bool DirectedProbe, default="false", desc="probe filter directed probe";
|
||||
@@ -87,6 +90,7 @@ structure(ResponseMsg, desc="...", interface="NetworkMessage") {
|
||||
Address Address, desc="Physical address for this request";
|
||||
CoherenceResponseType Type, desc="Type of response (Ack, Data, etc)";
|
||||
MachineID Sender, desc="Node who sent the data";
|
||||
MachineID CurOwner, desc="current owner of the block, used for UnblockS responses";
|
||||
NetDest Destination, desc="Node to whom the data is sent";
|
||||
DataBlock DataBlk, desc="data for the cache line";
|
||||
bool Dirty, desc="Is the data dirty (different than memory)?";
|
||||
|
||||
Reference in New Issue
Block a user