mem-ruby: Ensure MOESI_AMD_Base-dir has probe destinations

The directory has an assert that this is at least one destination for a
probe when sending an invalidation or shared probe to coherence end
points in the protocol (TCC, LLC). This is not necessarily request and
for certain configurations there will be no probes required and none
will be sent. One such configuration is the GPU protocol tester which
would not require a probe to the CPU if it does not exist.

To fix this we first collect the probe destinations. Then we check if
any destinations exist. If so, we send the probe message. Otherwise we
immediately enqueue a probe complete message to the trigger queue. This
reorganization prevents messages with no destinations from being
enqueued, meeting the criteria for the assertion.

Change-Id: If016f457cb8c9e0277a910ac2c3f315c25b50ce8
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/55543
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Matthew Poremba
2022-01-21 16:55:51 -06:00
parent b354e1a252
commit 54fc137945

View File

@@ -646,202 +646,274 @@ machine(MachineType:Directory, "AMD Baseline protocol")
action(icd_probeInvCoreDataForDMA, "icd", desc="Probe inv cores, return data for DMA") {
peek(dmaRequestQueue_in, DMARequestMsg) {
enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
out_msg.addr := address;
out_msg.Type := ProbeRequestType:PrbInv;
out_msg.ReturnData := true;
out_msg.MessageSize := MessageSizeType:Control;
NetDest probe_dests;
// Add relevant machine types based on CPUonly, GPUonly and noTCCdir:
// CPUonly && GPUonly -> Invalid
// CPUonly && !GPUonly -> Add CorePairs
// !CPUonly && GPUonly -> Add TCCs or TCC dirs
// !CPUonly && !GPUonly -> Add CorePairs and TCCs or TCC dirs
if (CPUonly) {
assert(!GPUonly);
probe_dests.broadcast(MachineType:CorePair);
} else {
// CPU + GPU system
if (!GPUonly) {
out_msg.Destination.broadcast(MachineType:CorePair);
probe_dests.broadcast(MachineType:CorePair);
}
// Add relevant TCC node to list. This replaces all TCPs and SQCs
if (CPUonly) {
// CPU only has neither TCC nor TCC directory to add.
} else if (noTCCdir) {
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
// CPU + GPU or GPU only system
if (noTCCdir) {
probe_dests.add(mapAddressToRange(address, MachineType:TCC,
TCC_select_low_bit,
TCC_select_num_bits));
} else {
out_msg.Destination.add(mapAddressToRange(address,
MachineType:TCCdir,
TCC_select_low_bit, TCC_select_num_bits));
probe_dests.add(mapAddressToRange(address, MachineType:TCCdir,
TCC_select_low_bit,
TCC_select_num_bits));
}
out_msg.Destination.remove(in_msg.Requestor);
tbe.NumPendingAcks := out_msg.Destination.count();
if (tbe.NumPendingAcks == 0) {
enqueue(triggerQueue_out, TriggerMsg, 1) {
out_msg.addr := address;
out_msg.Type := TriggerType:AcksComplete;
}
}
probe_dests.remove(in_msg.Requestor);
if (probe_dests.count() > 0) {
enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
out_msg.addr := address;
out_msg.Type := ProbeRequestType:PrbInv;
out_msg.ReturnData := true;
out_msg.MessageSize := MessageSizeType:Control;
out_msg.Destination := probe_dests;
tbe.NumPendingAcks := out_msg.Destination.count();
DPRINTF(RubySlicc, "%s\n", out_msg);
APPEND_TRANSITION_COMMENT(" dc: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
tbe.ProbeRequestStartTime := curCycle();
assert(out_msg.Destination.count() > 0);
}
}
if (probe_dests.count() == 0) {
enqueue(triggerQueue_out, TriggerMsg, 1) {
out_msg.addr := address;
out_msg.Type := TriggerType:AcksComplete;
}
DPRINTF(RubySlicc, "%s\n", out_msg);
APPEND_TRANSITION_COMMENT(" dc: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
tbe.ProbeRequestStartTime := curCycle();
assert(out_msg.Destination.count() > 0);
}
}
}
action(dc_probeInvCoreData, "dc", desc="probe inv cores, return data") {
peek(requestNetwork_in, CPURequestMsg) {
enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
out_msg.addr := address;
out_msg.Type := ProbeRequestType:PrbInv;
out_msg.ReturnData := true;
out_msg.MessageSize := MessageSizeType:Control;
NetDest probe_dests;
// Add relevant machine types based on CPUonly, GPUonly and noTCCdir:
// CPUonly && GPUonly -> Invalid
// CPUonly && !GPUonly -> Add CorePairs
// !CPUonly && GPUonly -> Add TCCs or TCC dirs if no write conflict
// !CPUonly && !GPUonly -> Add CorePairs and TCCs or TCC dirs
// if no write conflict
if (CPUonly) {
assert(!GPUonly);
probe_dests.broadcast(MachineType:CorePair);
} else {
// CPU + GPU system
if (!GPUonly) {
// won't be realistic for multisocket
out_msg.Destination.broadcast(MachineType:CorePair);
probe_dests.broadcast(MachineType:CorePair);
}
// add relevant TCC node to list. This replaces all TCPs and SQCs
if (((in_msg.Type == CoherenceRequestType:WriteThrough ||
in_msg.Type == CoherenceRequestType:Atomic) &&
in_msg.NoWriteConflict) ||
CPUonly) {
} else if (noTCCdir) {
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
} else {
out_msg.Destination.add(mapAddressToRange(address,
MachineType:TCCdir,
TCC_select_low_bit, TCC_select_num_bits));
}
out_msg.Destination.remove(in_msg.Requestor);
tbe.NumPendingAcks := out_msg.Destination.count();
if (tbe.NumPendingAcks == 0) {
enqueue(triggerQueue_out, TriggerMsg, 1) {
out_msg.addr := address;
out_msg.Type := TriggerType:AcksComplete;
// CPU + GPU or GPU only system
if ((in_msg.Type != CoherenceRequestType:WriteThrough &&
in_msg.Type != CoherenceRequestType:Atomic) ||
!in_msg.NoWriteConflict) {
if (noTCCdir) {
probe_dests.add(mapAddressToRange(address, MachineType:TCC,
TCC_select_low_bit,
TCC_select_num_bits));
} else {
probe_dests.add(mapAddressToRange(address, MachineType:TCCdir,
TCC_select_low_bit,
TCC_select_num_bits));
}
}
DPRINTF(RubySlicc, "%s\n", out_msg);
APPEND_TRANSITION_COMMENT(" dc: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
tbe.ProbeRequestStartTime := curCycle();
assert(out_msg.Destination.count() > 0);
}
probe_dests.remove(in_msg.Requestor);
if (probe_dests.count() > 0) {
enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
out_msg.addr := address;
out_msg.Type := ProbeRequestType:PrbInv;
out_msg.ReturnData := true;
out_msg.MessageSize := MessageSizeType:Control;
out_msg.Destination := probe_dests;
tbe.NumPendingAcks := out_msg.Destination.count();
DPRINTF(RubySlicc, "%s\n", out_msg);
APPEND_TRANSITION_COMMENT(" dc: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
tbe.ProbeRequestStartTime := curCycle();
assert(out_msg.Destination.count() > 0);
}
}
if (probe_dests.count() == 0) {
enqueue(triggerQueue_out, TriggerMsg, 1) {
out_msg.addr := address;
out_msg.Type := TriggerType:AcksComplete;
}
}
}
}
action(scd_probeShrCoreDataForDma, "dsc", desc="probe shared cores, return data for DMA") {
peek(dmaRequestQueue_in, DMARequestMsg) {
enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
out_msg.addr := address;
out_msg.Type := ProbeRequestType:PrbDowngrade;
out_msg.ReturnData := true;
out_msg.MessageSize := MessageSizeType:Control;
NetDest probe_dests;
// Add relevant machine types based on CPUonly, GPUonly and noTCCdir:
// CPUonly && GPUonly -> Invalid
// CPUonly && !GPUonly -> Add CorePairs
// !CPUonly && GPUonly -> Add TCCs or TCC dirs
// !CPUonly && !GPUonly -> Add CorePairs and TCCs or TCC dirs
if (CPUonly) {
assert(!GPUonly);
probe_dests.broadcast(MachineType:CorePair);
} else {
// CPU + GPU system
if (!GPUonly) {
out_msg.Destination.broadcast(MachineType:CorePair);
probe_dests.broadcast(MachineType:CorePair);
}
// add relevant TCC node to the list. This replaces all TCPs and SQCs
if (noTCCdir || CPUonly) {
//Don't need to notify TCC about reads
} else {
out_msg.Destination.add(mapAddressToRange(address,
MachineType:TCCdir,
TCC_select_low_bit, TCC_select_num_bits));
// CPU + GPU or GPU only system
// We don't need to notify TCC about reads
if (!noTCCdir) {
probe_dests.add(mapAddressToRange(address, MachineType:TCCdir,
TCC_select_low_bit,
TCC_select_num_bits));
}
if (noTCCdir && !CPUonly) {
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
}
probe_dests.remove(in_msg.Requestor);
if (probe_dests.count() > 0) {
enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
out_msg.addr := address;
out_msg.Type := ProbeRequestType:PrbDowngrade;
out_msg.ReturnData := true;
out_msg.MessageSize := MessageSizeType:Control;
out_msg.Destination := probe_dests;
tbe.NumPendingAcks := out_msg.Destination.count();
DPRINTF(RubySlicc, "%s\n", (out_msg));
APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
tbe.ProbeRequestStartTime := curCycle();
assert(out_msg.Destination.count() > 0);
}
out_msg.Destination.remove(in_msg.Requestor);
tbe.NumPendingAcks := out_msg.Destination.count();
if (tbe.NumPendingAcks == 0) {
enqueue(triggerQueue_out, TriggerMsg, 1) {
out_msg.addr := address;
out_msg.Type := TriggerType:AcksComplete;
}
}
if (probe_dests.count() == 0) {
enqueue(triggerQueue_out, TriggerMsg, 1) {
out_msg.addr := address;
out_msg.Type := TriggerType:AcksComplete;
}
DPRINTF(RubySlicc, "%s\n", (out_msg));
APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
tbe.ProbeRequestStartTime := curCycle();
assert(out_msg.Destination.count() > 0);
}
}
}
action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") {
peek(requestNetwork_in, CPURequestMsg) { // not the right network?
enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
out_msg.addr := address;
out_msg.Type := ProbeRequestType:PrbDowngrade;
out_msg.ReturnData := true;
out_msg.MessageSize := MessageSizeType:Control;
NetDest probe_dests;
// Add relevant machine types based on CPUonly, GPUonly and noTCCdir:
// CPUonly && GPUonly -> Invalid
// CPUonly && !GPUonly -> Add CorePairs
// !CPUonly && GPUonly -> Add TCCs or TCC dirs
// !CPUonly && !GPUonly -> Add CorePairs and TCCs or TCC dirs
if (CPUonly) {
assert(!GPUonly);
probe_dests.broadcast(MachineType:CorePair);
} else {
// CPU + GPU system
if (!GPUonly) {
// won't be realistic for multisocket
out_msg.Destination.broadcast(MachineType:CorePair);
probe_dests.broadcast(MachineType:CorePair);
}
// add relevant TCC node to the list. This replaces all TCPs and SQCs
if (noTCCdir || CPUonly) {
//Don't need to notify TCC about reads
} else {
out_msg.Destination.add(mapAddressToRange(address,
MachineType:TCCdir,
TCC_select_low_bit, TCC_select_num_bits));
tbe.NumPendingAcks := tbe.NumPendingAcks + 1;
// CPU + GPU or GPU only system
// We don't need to notify TCC about reads
if (!noTCCdir) {
probe_dests.add(mapAddressToRange(address, MachineType:TCCdir,
TCC_select_low_bit,
TCC_select_num_bits));
}
if (noTCCdir && !CPUonly) {
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
}
probe_dests.remove(in_msg.Requestor);
if (probe_dests.count() > 0) {
enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
out_msg.addr := address;
out_msg.Type := ProbeRequestType:PrbDowngrade;
out_msg.ReturnData := true;
out_msg.MessageSize := MessageSizeType:Control;
out_msg.Destination := probe_dests;
tbe.NumPendingAcks := out_msg.Destination.count();
DPRINTF(RubySlicc, "%s\n", (out_msg));
APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
tbe.ProbeRequestStartTime := curCycle();
assert(out_msg.Destination.count() > 0);
}
out_msg.Destination.remove(in_msg.Requestor);
tbe.NumPendingAcks := out_msg.Destination.count();
if (tbe.NumPendingAcks == 0) {
enqueue(triggerQueue_out, TriggerMsg, 1) {
out_msg.addr := address;
out_msg.Type := TriggerType:AcksComplete;
}
}
if (probe_dests.count() == 0) {
enqueue(triggerQueue_out, TriggerMsg, 1) {
out_msg.addr := address;
out_msg.Type := TriggerType:AcksComplete;
}
DPRINTF(RubySlicc, "%s\n", (out_msg));
APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
tbe.ProbeRequestStartTime := curCycle();
assert(out_msg.Destination.count() > 0);
}
}
}
action(ic_probeInvCore, "ic", desc="probe invalidate core, no return data needed") {
peek(requestNetwork_in, CPURequestMsg) { // not the right network?
enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
out_msg.addr := address;
out_msg.Type := ProbeRequestType:PrbInv;
out_msg.ReturnData := false;
out_msg.MessageSize := MessageSizeType:Control;
NetDest probe_dests;
// Add relevant machine types based on CPUonly, GPUonly and noTCCdir:
// CPUonly && GPUonly -> Invalid
// CPUonly && !GPUonly -> Add CorePairs
// !CPUonly && GPUonly -> Add TCCs or TCC dirs
// !CPUonly && !GPUonly -> Add CorePairs and TCCs or TCC dirs
if (CPUonly) {
assert(!GPUonly);
probe_dests.broadcast(MachineType:CorePair);
} else {
// CPU + GPU system
if (!GPUonly) {
// won't be realistic for multisocket
out_msg.Destination.broadcast(MachineType:CorePair);
probe_dests.broadcast(MachineType:CorePair);
}
// add relevant TCC node to the list. This replaces all TCPs and SQCs
if (noTCCdir && !CPUonly) {
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
// CPU + GPU or GPU only system
if (noTCCdir) {
probe_dests.add(mapAddressToRange(address, MachineType:TCC,
TCC_select_low_bit,
TCC_select_num_bits));
} else {
if (!noTCCdir) {
out_msg.Destination.add(mapAddressToRange(address,
MachineType:TCCdir,
TCC_select_low_bit,
TCC_select_num_bits));
}
probe_dests.add(mapAddressToRange(address, MachineType:TCCdir,
TCC_select_low_bit,
TCC_select_num_bits));
}
out_msg.Destination.remove(in_msg.Requestor);
tbe.NumPendingAcks := out_msg.Destination.count();
if (tbe.NumPendingAcks == 0) {
enqueue(triggerQueue_out, TriggerMsg, 1) {
out_msg.addr := address;
out_msg.Type := TriggerType:AcksComplete;
}
}
probe_dests.remove(in_msg.Requestor);
if (probe_dests.count() > 0) {
enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
out_msg.addr := address;
out_msg.Type := ProbeRequestType:PrbInv;
out_msg.ReturnData := false;
out_msg.MessageSize := MessageSizeType:Control;
out_msg.Destination := probe_dests;
tbe.NumPendingAcks := out_msg.Destination.count();
APPEND_TRANSITION_COMMENT(" ic: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
DPRINTF(RubySlicc, "%s\n", out_msg);
tbe.ProbeRequestStartTime := curCycle();
assert(out_msg.Destination.count() > 0);
}
}
if (probe_dests.count() == 0) {
enqueue(triggerQueue_out, TriggerMsg, 1) {
out_msg.addr := address;
out_msg.Type := TriggerType:AcksComplete;
}
APPEND_TRANSITION_COMMENT(" ic: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
DPRINTF(RubySlicc, "%s\n", out_msg);
tbe.ProbeRequestStartTime := curCycle();
assert(out_msg.Destination.count() > 0);
}
}
}