Implementation additions

This commit is contained in:
2022-07-06 19:32:34 +02:00
parent 888a57c4bc
commit d890c4cc79
5 changed files with 121 additions and 95 deletions

View File

@@ -2,38 +2,38 @@
\begin{pgfonlayer}{nodelayer} \begin{pgfonlayer}{nodelayer}
\node [style=none] (0) at (0.5, 0) {}; \node [style=none] (0) at (0.5, 0) {};
\node [style=none] (1) at (-2.25, 4) {}; \node [style=none] (1) at (-2.25, 4) {};
\node [style=none] (2) at (-1.75, 8.5) {}; \node [style=none] (2) at (-1.5, 9.25) {};
\node [style=none] (3) at (3, 11.75) {}; \node [style=none] (3) at (3, 12.75) {};
\node [style=none] (4) at (9.25, 12.25) {}; \node [style=none] (4) at (9.25, 13) {};
\node [style=none] (5) at (15.75, 11) {}; \node [style=none] (5) at (15.5, 12.25) {};
\node [style=none] (6) at (17.75, 2.25) {}; \node [style=none] (6) at (17.25, 1.75) {};
\node [style=none] (7) at (13.75, -0.5) {}; \node [style=none] (7) at (12.5, -0.5) {};
\node [style=none] (8) at (6.75, -0.75) {}; \node [style=none] (8) at (6.75, -0.75) {};
\node [style=wrap text] (10) at (6.25, 11.25) {Scheduling\\Policy}; \node [style=wrap text] (10) at (6.25, 12) {Scheduling\\Policy};
\node [style=wrap text] (11) at (11.25, 8) {Number\\of\\Devices}; \node [style=wrap text] (11) at (11, 8.25) {Number\\of\\Devices};
\node [style=wrap text] (13) at (0, 5.75) {Refresh\\Policy}; \node [style=wrap text] (13) at (-0.25, 6.5) {Refresh\\Policy};
\node [style=wrap text] (14) at (7, 1.5) {Number\\of\\Channels}; \node [style=wrap text] (14) at (7, 1.5) {Number\\of\\Channels};
\node [style=wrap text] (16) at (16.25, 8.5) {Powerdown\\Policy}; \node [style=wrap text] (16) at (16.25, 9.25) {Power\\Down\\Policy};
\node [style=wrap text] (17) at (9, 4.75) {DRAM\\Speed}; \node [style=wrap text] (17) at (10.5, 5) {DRAM\\Speed};
\node [style=wrap text] (18) at (4.25, 5.25) {Page\\Policy}; \node [style=wrap text] (18) at (5, 5) {Page\\Policy};
\node [style=wrap text] (19) at (1.5, 2) {Command\\Multiplexer\\Policy}; \node [style=wrap text] (19) at (1.5, 2) {Command\\Multiplexer\\Policy};
\node [style=wrap text] (20) at (12.5, 1.75) {Response\\Queue\\Policy}; \node [style=wrap text] (20) at (12.5, 1.75) {Response\\Queue\\Policy};
\node [style=wrap text] (21) at (5.75, 8.25) {Address\\Mapping}; \node [style=wrap text] (21) at (5.25, 8.5) {Address\\Mapping};
\node [style=wrap text] (22) at (1, 9.5) {DRAM\\Standard}; \node [style=wrap text] (22) at (1.25, 10.75) {DRAM\\Standard};
\node [style=wrap text] (23) at (12.25, 11) {DRAM\\Capacity}; \node [style=wrap text] (23) at (12.25, 11.5) {DRAM\\Capacity};
\node [style=wrap text] (24) at (16.25, 4.75) {Timing\\Parameters}; \node [style=wrap text] (24) at (16.75, 4.75) {Timing\\Parameters};
\node [style=none] (25) at (18.75, 6.75) {}; \node [style=none] (25) at (18.75, 6.75) {};
\end{pgfonlayer} \end{pgfonlayer}
\begin{pgfonlayer}{edgelayer} \begin{pgfonlayer}{edgelayer}
\draw [style=very thick line, bend right=285, looseness=1.25] (0.center) to (1.center); \draw [style=very thick line, bend left=75, looseness=1.25] (0.center) to (1.center);
\draw [style=very thick line, bend left=75] (1.center) to (2.center); \draw [style=very thick line, bend left=90, looseness=1.25] (1.center) to (2.center);
\draw [style=very thick line, bend left=60, looseness=1.25] (2.center) to (3.center); \draw [style=very thick line, bend left=75, looseness=1.25] (2.center) to (3.center);
\draw [style=very thick line, bend left=60] (3.center) to (4.center); \draw [style=very thick line, bend left=75] (3.center) to (4.center);
\draw [style=very thick line, bend left=60] (4.center) to (5.center); \draw [style=very thick line, bend left=75] (4.center) to (5.center);
\draw [style=very thick line, bend left=60, looseness=1.25] (6.center) to (7.center); \draw [style=very thick line, bend left=60, looseness=1.25] (6.center) to (7.center);
\draw [style=very thick line, bend left=60] (7.center) to (8.center); \draw [style=very thick line, bend left=60] (7.center) to (8.center);
\draw [style=very thick line, bend right=300] (8.center) to (0.center); \draw [style=very thick line, bend left=75] (8.center) to (0.center);
\draw [style=very thick line, bend right=60, looseness=1.25] (6.center) to (25.center); \draw [style=very thick line, bend right=75, looseness=1.25] (6.center) to (25.center);
\draw [style=very thick line, bend right=60, looseness=1.25] (25.center) to (5.center); \draw [style=very thick line, bend right=75, looseness=1.25] (25.center) to (5.center);
\end{pgfonlayer} \end{pgfonlayer}
\end{tikzpicture} \end{tikzpicture}

View File

@@ -83,15 +83,15 @@
to (46.center) to (46.center)
to (45.center) to (45.center)
to cycle; to cycle;
\draw [style=arrow without head] (55.center) to (57.center); \draw [style=very thick line] (55.center) to (57.center);
\draw [style=arrow without head] (57.center) to (56.center); \draw [style=very thick line] (57.center) to (56.center);
\draw [style=arrow] (57.center) to (44); \draw [style=very thick latex arrow] (57.center) to (44);
\draw [style=arrow] (56.center) to (50); \draw [style=very thick latex arrow] (56.center) to (50);
\draw [style=arrow] (9) to (18.center); \draw [style=very thick latex arrow] (9) to (18.center);
\draw [style=arrow without head] (30.center) to (60.center); \draw [style=very thick line] (30.center) to (60.center);
\draw [style=arrow without head] (60.center) to (61.center); \draw [style=very thick line] (60.center) to (61.center);
\draw [style=arrow without head] (63.center) to (62.center); \draw [style=very thick line] (63.center) to (62.center);
\draw [style=arrow] (61.center) to (36.center); \draw [style=very thick latex arrow] (61.center) to (36.center);
\draw [style=arrow] (62.center) to (55.center); \draw [style=very thick latex arrow] (62.center) to (55.center);
\end{pgfonlayer} \end{pgfonlayer}
\end{tikzpicture} \end{tikzpicture}

View File

@@ -8,7 +8,7 @@
\node [style=none] (5) at (16, 2.25) {Target}; \node [style=none] (5) at (16, 2.25) {Target};
\node [style=none] (6) at (-1, 0.5) {}; \node [style=none] (6) at (-1, 0.5) {};
\node [style=none] (7) at (-1, -20) {}; \node [style=none] (7) at (-1, -20) {};
\node [style=align text] (8) at (-2, -8) {Time}; \node [style=align text] (8) at (-2, -9) {Time};
\node [style=none] (9) at (0, -0.5) {}; \node [style=none] (9) at (0, -0.5) {};
\node [style=none] (10) at (16, -0.5) {}; \node [style=none] (10) at (16, -0.5) {};
\node [style=none] (11) at (16, -1) {}; \node [style=none] (11) at (16, -1) {};

View File

@@ -71,7 +71,7 @@ An example subdivision of the address in the index, tag and byte offset is shown
\begin{figure}[!ht] \begin{figure}[!ht]
\begin{center} \begin{center}
\tikzfig{img/address} \tikzfig{img/address}
\caption{Example address mapping for the tag, index and byte offset.} \caption{Exemplary address mapping for the tag, index and byte offset.}
\label{fig:address_mapping} \label{fig:address_mapping}
\end{center} \end{center}
\end{figure} \end{figure}

View File

@@ -1,52 +1,51 @@
\section{Implementation} \section{Implementation}
\label{sec:implementation} \label{sec:implementation}
In this section, the new components that were developed, which enable the tracing of an arbitrary application in real-time, as well as the replay of those traces in DRAMSys, will be introduced. In this section, the components developed in this thesis for the new simulator frontend, which enable the tracing of an arbitrary application in real-time, as well as the replay of the recorded traces in DRAMSys, will be introduced.
At first, the DynamoRIO analyzer tool that produces the memory access traces and its place in the DrCacheSim-Framework will be explained. To summarize which components are necessary to implement the new simulation frontend, they are briefly listed below:
Furthermore, the new trace player for DRAMSys will acquire special focus as well as the mandatory cache model that is used to model the cache-filtering in a real system.
% Oder auch nicht: ? \begin{itemize}
\item A DynamoRIO client that traces memory accesses from a running application.
\item A simplified core model that replays those traces by sending transactions to DRAMSys.
\item A cache model that simulates the cache-filtering of memory requests of the processor.
\end{itemize}
The following sections will first explain the DynamoRIO analysis tool that generates the memory access traces and its place in the DrMemtrace framework.
Furthermore, the new trace player for DRAMSys will receive special attention as well as the mandatory cache model that is used to model the cache-filtering in a real system.
The last part will concentrate on the special architecture of the new trace player interface and challenges the internal interconnection solves. The last part will concentrate on the special architecture of the new trace player interface and challenges the internal interconnection solves.
\subsection{Analysis Tool} \subsection{Analysis Tool}
\label{sec:analysis_tool} \label{sec:analysis_tool}
As described in section \ref{sec:dynamorio} the dynamic binary instrumentation tool DynamoRIO will be used to trace the memory accesses while the target application is running. As described in section \ref{sec:dynamorio} the dynamic binary instrumentation tool DynamoRIO will be used to trace the memory accesses while the target application is running.
Instead of writing a DynamoRIO client from the ground up, the DrCacheSim framework is used. Instead of writing a DynamoRIO client from the ground up, the DrMemtrace framework, that comes bundled with DynamoRIO, is used.
DrCacheSim is a DynamoRIO client that gathers memory and instruction access traces and forwards them to an analyzer tool. DrCacheSim is a DynamoRIO client that builds on the DrMemtrace framework and gathers memory and instruction access traces from the target application and forwards them to one or multiple analyzer tools.
It is a purely observational client and does not modify the behavior of the application. In addition, so-called marker records are sent to an analyzer on certain events, with which meta information such as the cpu core used, kernel events or a timestamp are transmitted.
These markers are also essential for a processor simulation, for example to reconstruct the thread interleaving, as it is intended for the new simulator frontend.
DrCacheSim is a purely observational client and does not alter the behavior of the application.
Optionally, DrCacheSim converts the addresses of the memory accesses from virtual addresses into physical addresses, which is an important step for simulating a real memory system: Using one of many possible configuration parameters, it is possible for DrCacheSim to convert the addresses of the memory accesses from virtual addresses into the corresponding physical addresses, which is an important step for simulating a real memory system:
As the virtual address space is unique for every running process and need to be translated to physical addresses by the operating system kernel to access the real memory, these physical addresses should be traced instead of the virtuals. As the virtual address space is unique for every running process and does not match the true address space of the processor, where the memory and also peripherals are mapped into, it needs to be translated to physical addresses by the operating system kernel to access the real memory.
These physical addresses should be traced instead of the virtual addresses to account for effects such as paging in the simulated system.
It is to note that the physical addresses do not directly correspond into the internal addresses of the memory subsystem: It should be noted that in most systems the physical addresses do not directly represent the addresses that the memory subsystem perceives.
The physical memory is mapped at a specific address in the physical address space, so this address offset has to be considered. The physical memory is mapped at a specific address region in the physical address space, so an address offset also has to be considered.
On Linux systems, this mapping can be obtained by reading the contents of the virtual file \texttt{/proc/iomem}. On Linux systems, this mapping can be obtained by investigating the contents of the virtual file \texttt{/proc/iomem}, which is provided by the kernel.
The trace player then needs to subtract this offset as it will be explained in section \ref{sec:dbiplayer_functionality}. The trace player then subtracts this offset as it will be explained in more detail in section \ref{sec:dbiplayer_functionality}.
The physical address conversion only works on Linux and, in modern kernel versions, requires root privileges (or alternatively the CAP\_SYS\_ADMIN capability).
The physical address conversion only works on Linux and requires root privileges (or alternatively the CAP\_SYS\_ADMIN capability) in modern kernel versions. There are two different operation modes for an analyzer tool that DrCacheSim provides:
The analyzer tool can either be running alongside with DrCacheSim (online) or operate on an internal trace format (offline). The analyzer tool can either be running alongside with DrCacheSim (online) or run after the target application has exited and operate on an internal trace format (offline).
Offline tracing has the additional advantage of being able to disassemble the executed instructions afterwards. Offline tracing has the additional advantage of being able to disassemble the executed instructions afterwards.
For this, the mapping of the executable binaries and shared libraries is stored alongside with the trace, enabling the decoding of the instructions from the traced program counter values. For this, the mapping of the executable binaries and shared libraries is stored alongside with the trace, enabling the decoding of the instructions from the traced program counter values.
As of writing this thesis, the offline tracing mode has recently gained support for the physical address conversion, but the online mode will be used throughout this thesis as its support is still limited. The instruction decoding is currently not natively supported for the online execution model, but this feature received limited attention in the development of the new frontend.
As of writing this thesis, the offline tracing mode has only recently gained support for the physical address conversion.
In case of the online tracing, DrCacheSim consists of two separate processes: Nevertheless, the online execution model will be used throughout this thesis as the physical address support is still limited for offline tracing.
\begin{itemize}
\item
A client-side process (the DynamoRIO client) which injects observational instructions into the application's code cache.
For every instruction or memory access, a data packet of the type \texttt{memref\_t} is generated.
\item
An analyzer-side process which connects to the client and processes the \texttt{memref\_t} data packets.
The analyzer-side can contain many analysis tools that operate on those stream of records.
\end{itemize}
The \revabbr{inter-process communication}{IPC} between the two parts is achieved through a \textit{named\ pipe}.
Figure \ref{fig:drcachesim} illustrates the structure of the individual parts.
\input{img/thesis.tikzstyles} \input{img/thesis.tikzstyles}
\begin{figure}[!ht] \begin{figure}
\begin{center} \begin{center}
\tikzfig{img/drcachesim} \tikzfig{img/drcachesim}
\caption{Structure of the DrCacheSim online tracing.} \caption{Structure of the DrCacheSim online tracing.}
@@ -54,20 +53,46 @@ Figure \ref{fig:drcachesim} illustrates the structure of the individual parts.
\end{center} \end{center}
\end{figure} \end{figure}
In case of the online tracing, DrCacheSim consists of two separate processes:
\begin{itemize}
\item
A client-side process (the DynamoRIO client) which injects observational instructions into the application's code cache.
For every instruction, memory access or marker event, a data packet of the type \texttt{memref\_t} is generated and sent to the analyzer process.
\item
An analyzer-side process which is connected to the client and processes the \texttt{memref\_t} data packets.
The analyzer-side can contain many analysis tools that operate on this stream of records.
\end{itemize}
The \revabbr{inter-process communication}{IPC} between the two processes is achieved through a \textit{named\ pipe}.
Figure \ref{fig:drcachesim} illustrates the structure of the online tracing mechanism.
A \texttt{memref\_t} can either represent an instruction, a data reference or a metadata event such as a timestamp or a CPU identifier. A \texttt{memref\_t} can either represent an instruction, a data reference or a metadata event such as a timestamp or a CPU identifier.
Besides of the type, the \revabbr{process identifier}{PID} and \revabbr{thread identifier}{TID} of the initiating process and thread is included in every record. Besides of the type, the \revabbr{process identifier}{PID} and \revabbr{thread identifier}{TID} of the initiating process and thread is included in every record.
For an instruction marker, the size of the instruction as well as the virtual address of the instruction in the memory map is provided. For an instruction marker, the size of the instruction as well as the address of the instruction in the virtual address space of the application is provided.
For data references, the address and size of the desired access is provided as well as the \revabbr{program counter}{PC} from where it was initiated. For data references, the address and size of the desired access is provided as well as the \revabbr{program counter}{PC} from where it was initiated.
In offline mode, DrCacheSim stores the current mapping of all binary executables and shared libraries in a separate file, so that it is possible to decode named instructions even after the application has exited. In offline mode, DrCacheSim stores the current mapping of all binary executables and shared libraries in a separate file, so that it is possible to decode and disassemble the traced instructions even after the application has exited.
In case of online tracing, the analyzer has to inspect the memory of the client-side process for this. As mentioned earlier, instruction decoding is not natively supported for online tracing, but to work around the problem, the analyzer can examine the memory map of the client-side process and read the encoded instructions from there.
Analysis tools implement the \texttt{analysis\_tool\_t} interface as this enables the analyzer to forward a received record to multiple tools in a polymorphic manner. Using command line options, it is also possible to instruct DrCacheSim to trace only a portion of an application, rather than everything from start to finish.
This region of interest can be specified by the number of instructions after which the tracing should start or stop.
All analysis tools implement the common \texttt{analysis\_tool\_t} interface as this enables the analyzer to forward a received record to multiple tools in a polymorphic manner.
In particular, the \texttt{process\_memref\_t()} method of any tool is called for every incoming record. In particular, the \texttt{process\_memref\_t()} method of any tool is called for every incoming record.
The newly developed DRAMTracer tool creates for every application thread a separate trace file. It is possible for an analysis tool to implement parallel processing of the received \texttt{memref\_t} types by splitting up the trace into \textit{shards}.
As it is not known how many threads an application will spawn, the tool will listen for records with new TIDs that it did not register yet. However, in this thesis the sequential processing of a single sorted and interleaved trace was used because of missing support for parallel processing for the online execution model.
For every data reference, a new entry in the corresponding trace file is made which contains the size and the physical address of the access, whether it was a read or write, and also a count of (computational) instructions that have been executed since the last reference.
This instruction count is used to approximate the delay between the memory accesses when the trace is replayed by DRAMSys. The newly developed DRAMTracer tool creates a separate trace file for every application thread.
Since it is not known a priori how many threads an application will spawn, the tool will listen for records with new TIDs that it did not register yet.
For every data reference, a new entry in the corresponding trace file is made which contains the size and the physical address of the access, whether it was a read or write, and also a count of (computational) instructions that have been executed since the last data reference.
To compute the instruction count, a counter is incremented for every registered instruction record and reset again for any data reference.
This instruction count is used, together with the clock period, to approximate the delay between two memory accesses when the trace is replayed by DRAMSys.
Lastly, the analysis tool inserts a timestamp into the trace for every received timestamp marker.
The use of this timestamp will be further explained in section \ref{sec:dbiplayer_functionality}.
Listing \ref{list:memtrace} presents an exemplary memory trace.
Lines consisting of a number between two angle brackets represent a timestamp whereas lines for memory references consist of the instruction count, a character denoting a read or write, the size and the address of the access.
Also, comments which are ignored by the trace player are possible by starting the line with a number sign.
\begin{listing} \begin{listing}
\begin{textcode} \begin{textcode}
@@ -87,24 +112,23 @@ This instruction count is used to approximate the delay between the memory acces
\label{list:memtrace} \label{list:memtrace}
\end{listing} \end{listing}
As of writing this thesis, there is no application binary interface for analysis tools defined in the DrCacheSim-Framework. As of writing this thesis, there is no application binary interface for analysis tools defined for the DrMemtrace framework.
Therefore it is not possible to load the DRAMTracer tool as a shared library but rather it is required to modify the DynamoRIO source code to integrate the tool. Therefore it is not possible to load the DRAMTracer tool as a shared library but rather it is required to modify the DynamoRIO source code to integrate the tool.
Also, to be able to decode the instructions in the online tracing, a set of patches had to be applied to DynamoRIO.
\subsection{Trace Player Architecture} \subsection{Trace Player Architecture}
\label{sec:dbiplayer_architecture} \label{sec:dbiplayer_architecture}
This section covers the general architecture of the \textit{DbiPlayer}, the new trace player for DRAMSys that replays the captured trace files. This section covers the general architecture of the \textit{DbiPlayer}, the new trace player for DRAMSys that replays the captured trace files.
For every recorded thread, a new so-called DbiThreadPlayer is spawned, which is a standalone initiator for transactions. For every recorded thread, a traffic initiator thread, a so-called \textit{DbiThreadPlayer}, is spawned, which is a standalone initiator for memory transactions.
Because those threads need to be synchronized to approximate the real behavior, they need to communicate among each other. Because those threads need to be synchronized to approximate real thread interleaving, they need to communicate among each other.
The detailed mechanism behind this synchronization will be further explained in section \ref{sec:dbiplayer_functionality}. The detailed mechanism behind this synchronization will be further explained in section \ref{sec:dbiplayer_functionality}.
This communication, however, brings up the necessity to containerize the thread players into a single module that can directly be connected to DRAMSys. This communication, however, brings up the necessity to containerize the thread players into a single module that can directly be connected to DRAMSys.
With the old DRAMSys interface for trace players this was not easily realizable, so a new generic initiator interface was developed that makes it possible to connect components to DRAMSys whose internal architecture can be arbitrary. With the old DRAMSys interface for trace players this was not easily realizable, so a new generic initiator interface was developed which allows components to be connected to DRAMSys whose internal architecture can be arbitrary.
This new interface will be further discussed in section \ref{sec:traceplayer_interface}. This new interface will be further discussed in section \ref{sec:traceplayer_interface}.
For the \textit{DbiPlayer}, an additional interconnect module will bundle up all \\ \texttt{simple\_initiator\_sockets} to a single \texttt{multi\_passthrough\_initiator\_socket} as presented in figure \ref{fig:dbiplayer_without_caches}. For the \textit{DbiPlayer}, an additional interconnect module will bundle up all \\ \texttt{simple\_initiator\_sockets} in a single \texttt{multi\_passthrough\_initiator\_socket}.
So the \textit{DbiPlayer} is a hierarchical module that consists of a more complex architecture with multiple traffic initiators, illustrated in figure \ref{fig:dbiplayer_without_caches}.
\begin{figure} \begin{figure}
\begin{center} \begin{center}
@@ -114,10 +138,11 @@ For the \textit{DbiPlayer}, an additional interconnect module will bundle up all
\end{center} \end{center}
\end{figure} \end{figure}
As the memory accesses are directly extracted from the executed instructions, simply sending a transaction to the DRAM subsystem for every data reference would neglect the caches of today's processors completely. As the memory accesses are directly extracted from the executed instructions, simply sending a transaction to the DRAM subsystem for every data reference would completely neglect the caches of today's processors.
Therefore, also a cache model is required whose implementation will be explained in more detail in section \ref{sec:cache_implementation}. Therefore, also a cache model is required whose implementation will be explained in more detail in section \ref{sec:cache_implementation}.
Many modern cache hierarchies are composed of 3 cache levels: 2 caches for every processor core, the L1 and L2 cache, and one cache that is shared across all cores, the L3 cache. Many modern cache hierarchies are composed of 3 cache levels: 2 caches for every processor core, the L1 and L2 cache, and one cache that is shared across all cores, the L3 cache.
This hierarchy is also reflected in the \textit{DbiPlayer} as shown in Figure \ref{fig:dbiplayer_with_caches}. This cache hierarchy is also reflected in the \textit{DbiPlayer} as shown in Figure \ref{fig:dbiplayer_with_caches}, but also simpler hierarchies such as an L1 cache for every processor core and one shared L2 cache are configurable.
In order to connect the different SystemC socket types, one additional interconnect is required which is explained in more detail in section \ref{sec:interconnect}.
\begin{landscape} \begin{landscape}
\begin{figure} \begin{figure}
@@ -132,32 +157,33 @@ This hierarchy is also reflected in the \textit{DbiPlayer} as shown in Figure \r
\subsection{Trace Player Functionality} \subsection{Trace Player Functionality}
\label{sec:dbiplayer_functionality} \label{sec:dbiplayer_functionality}
With the overall architecture of the initiator introduced, this section explains the internal functionality of the \textit{DbiPlayer} and its threads. With the overall architecture of the main initiator module introduced, this section explains the internal functionality of the \textit{DbiPlayer} and its threads.
The threads of the \textit{DbiPlayer} are specialized initiator modules that inherit from the more generic \texttt{TrafficInitiatorThread} class. The threads of the \textit{DbiPlayer} are specialized initiator modules that inherit from the more generic \texttt{TrafficInitiatorThread} class.
Each \texttt{TrafficInitiatorThread} consists of a \texttt{sendNextPayloadThread()} \texttt{SC\_THREAD} that in turn calls the virtual method \texttt{sendNextPayload()}, which is implemented in the \texttt{DbiThreadPlayer}, each time the \texttt{sc\_event\_queue} \texttt{sendNextPayloadEvent} is being notified. Each \texttt{TrafficInitiatorThread} consists of a \texttt{sendNextPayloadThread()} \texttt{SC\_THREAD} that in turn calls the virtual method \texttt{sendNextPayload()}, which is implemented in the \texttt{DbiThreadPlayer}, each time the \texttt{sc\_event\_queue} \texttt{sendNextPayloadEvent} is being notified.
Each \texttt{DbiThreadPlayer} iterates through its trace file and stores the entries in an internal buffer. Each \texttt{DbiThreadPlayer} iterates through the lines of its trace file and stores the entries in an internal buffer.
In \texttt{sendNextPayload()} then, a new generic payload object is created from the next entry of this buffer. In \texttt{sendNextPayload()} then, a new generic payload object is created from the following entry of this buffer.
The address of the payload is calculated from the physical address stored in the trace file entry. The address of the payload is calculated from the physical address stored in the trace file entry.
As previously discussed, the trace player now needs to account for the offset the RAM was placed at in the physical memory map and subtract this offset from the physical address. As previously discussed, the trace player now needs to account for the offset the RAM was placed at in the physical memory map and subtract this offset from the physical address.
The instruction count field of the trace is used to approximate the delay between two consecutive memory accesses: The instruction count field of the trace is used to approximate the delay between two consecutive memory accesses:
The count is multiplied with the trace player clock period and a constant to defer the initiation of the next transaction by the resulting value. The count is multiplied with the trace player clock period and a constant to defer the initiation of the next transaction by the resulting value.
While this does not take the type of the executed instructions into account, it is still a simple approximation that can be made. While this does not take the type of the executed instructions into account, it is still a simple approximation that can be made.
As mentioned previously, the threads cannot run by themselves, rather they require synchronization to ensure the simulated system replicates the real running application as well as possible. As mentioned previously, the threads cannot simply run by themselves without paying attention to the others; rather, they require synchronization to ensure the simulated system replicates the real running application as well as possible.
The analysis tool appends timestamps into the memory access traces that will be used to pause the execution of a thread, when the global time has not yet reached this far, or to advance the global time, when the thread is allowed to run. The analysis tool appends timestamps into the memory access traces.
It is to note that the term global time in this context does not correspond to the SystemC simulation time but denotes a loose time variable that the \textit{DbiPlayer} uses to schedule its threads. When such a timestamp is reached, it will be used to pause the execution of a thread, if the global time has not yet reached this far, or to advance the global time, when the thread is allowed to continue.
It is to note that the term global time in this context does not correspond to the SystemC simulation time but denotes a loose time variable that only the \textit{DbiPlayer} uses to schedule its threads.
A set of rules determine if a thread is allowed to make progress beyond a timestamp that is further than the current global time: A set of rules determine if a thread is allowed to make progress beyond a timestamp that is further than the current global time:
\begin{enumerate} \begin{enumerate}
\item The main thread at the start of the program is always allowed to run. \item The main thread at the start of the program is always allowed to run.
\item Threads don't go to sleep when they would produce a deadlock. This is the case when they are the only thread currently running. \item Threads do not suspend themselves when they would produce a deadlock. This is the case when they are the only thread currently running.
\item When a previous running thread exits and all other threads are sleeping, then they will be woken up. \item When a previous running thread exits and all other threads are suspended, then they will be resumed.
\item As a fallback, when currently all threads are waiting, one thread will be woken up. \item As a fallback, when currently all threads are suspended, one thread will be resumed.
\end{enumerate} \end{enumerate}
Those rules reconstruct the thread interleaving of the application as it was running while being traced. Those rules reconstruct the thread interleaving of the instrumented application as it was running while being traced.
The two latter rules ensure that always at least one thread is running so that the simulation does not come to a premature halt. The two latter rules ensure that always at least one thread is running so that the simulation does not come to a premature halt.
\subsection{Non-Blocking Cache} \subsection{Non-Blocking Cache}