diff --git a/img/thesis.tikzstyles b/img/thesis.tikzstyles index 84f9b24..07c7f5f 100644 --- a/img/thesis.tikzstyles +++ b/img/thesis.tikzstyles @@ -48,3 +48,9 @@ \tikzstyle{dashed arrow}=[dashed, ->] \tikzstyle{latex double arrow}=[-, latex-latex] \tikzstyle{cache assoc}=[-, fill={rgb,255: red,230; green,230; blue,230}] +\tikzstyle{latex arrow red}=[-, -latex, draw={rgb,255: red,185; green,41; blue,25}, very thick] +\tikzstyle{latex arrow green}=[-, -latex, draw={rgb,255: red,0; green,128; blue,0}, very thick] +\tikzstyle{latex arrow blue}=[-, -latex, draw={rgb,255: red,8; green,94; blue,138}, very thick] +\tikzstyle{very thick line}=[-, very thick] +\tikzstyle{very thick latex arrow}=[-, very thick, -latex] +\tikzstyle{dotted very thick green latex arrow}=[-, very thick, -latex, dashed, draw={rgb,255: red,0; green,128; blue,0}] diff --git a/img/tlm_at.tikz b/img/tlm_at.tikz index ff07ff2..59a4d3d 100644 --- a/img/tlm_at.tikz +++ b/img/tlm_at.tikz @@ -1,42 +1,78 @@ \begin{tikzpicture} \begin{pgfonlayer}{nodelayer} - \node [style=none] (0) at (0, 1) {}; - \node [style=none] (1) at (0, -12) {}; - \node [style=none] (2) at (12, 1) {}; - \node [style=none] (3) at (12, -12) {}; - \node [style=none] (4) at (0, 1.75) {Initiator}; - \node [style=none] (5) at (12, 1.75) {Target}; - \node [style=none] (6) at (-1, 0) {}; - \node [style=none] (7) at (-1, -11) {}; - \node [style=align text] (8) at (-1.5, -6) {Time}; - \node [style=none] (9) at (0, -1) {}; - \node [style=none] (10) at (12, -1) {}; - \node [style=none] (11) at (12, -2.25) {}; - \node [style=none] (12) at (0, -2.25) {}; - \node [style=none] (13) at (0, -6) {}; - \node [style=none] (14) at (12, -6) {}; - \node [style=none] (15) at (0, -10) {}; - \node [style=none] (16) at (12, -10) {}; - \node [style=none] (17) at (2.5, -0.5) {BEGIN\_REQ}; - \node [style=none] (18) at (9.75, -1.75) {END\_REQ}; - \node [style=none] (19) at (9.5, -5.5) {BEGIN\_RESP}; - \node [style=none] (20) at (2.25, -9.5) {END\_RESP}; - \node 
[style=none] (21) at (15.25, -6) {}; - \node [style=none] (22) at (15.25, -10) {}; - \node [style=align text] (23) at (15.75, -8) {Back Pressure}; - \node [style=none] (24) at (15, -6) {}; - \node [style=none] (25) at (15, -10) {}; + \node [style=none] (0) at (0, 1.5) {}; + \node [style=none] (1) at (0, -21) {}; + \node [style=none] (2) at (14, 1.5) {}; + \node [style=none] (3) at (14, -21) {}; + \node [style=none] (4) at (0, 2.25) {Initiator}; + \node [style=none] (5) at (14, 2.25) {Target}; + \node [style=none] (6) at (-1, 0.5) {}; + \node [style=none] (7) at (-1, -18.5) {}; + \node [style=align text] (8) at (-2, -8) {Time}; + \node [style=none] (9) at (0, -0.5) {}; + \node [style=none] (10) at (14, -0.5) {}; + \node [style=none] (11) at (14, -0.75) {}; + \node [style=none] (12) at (0, -0.75) {}; + \node [style=none] (13) at (0, -5.5) {}; + \node [style=none] (14) at (14, -5.5) {}; + \node [style=none] (15) at (0, -10.5) {}; + \node [style=none] (16) at (14, -10.5) {}; + \node [style=none] (17) at (2.5, 0) {BEGIN\_REQ}; + \node [style=none] (18) at (12, -1.5) {END\_REQ}; + \node [style=none] (19) at (11.5, -5) {BEGIN\_RESP}; + \node [style=none] (20) at (2.25, -10) {END\_RESP}; + \node [style=none] (21) at (17.5, -3) {}; + \node [style=none] (22) at (17.5, -7) {}; + \node [style=align text] (23) at (18.25, -5) {Back-\\pressure}; + \node [style=none] (24) at (17, -3) {}; + \node [style=none] (25) at (17, -7) {}; + \node [style=none] (26) at (0, -3) {}; + \node [style=none] (27) at (14, -3) {}; + \node [style=none] (28) at (0, -7) {}; + \node [style=none] (29) at (14, -7) {}; + \node [style=none] (30) at (12, -6.5) {END\_REQ}; + \node [style=none] (31) at (2.5, -2.5) {BEGIN\_REQ}; + \node [style=none] (32) at (0, -7.25) {}; + \node [style=none] (33) at (14, -7.25) {}; + \node [style=none] (34) at (2.5, -8) {BEGIN\_REQ}; + \node [style=none] (38) at (0, -12.75) {}; + \node [style=none] (39) at (14, -12.75) {}; + \node [style=none] (40) at (8.25, -12.25) {BEGIN\_RESP 
(Skip END\_REQ)}; + \node [style=none] (41) at (0, -17) {}; + \node [style=none] (42) at (14, -17) {}; + \node [style=none] (43) at (2.25, -16.5) {END\_RESP}; + \node [style=none] (44) at (-3.5, -12.75) {}; + \node [style=none] (45) at (-3.5, -17) {}; + \node [style=align text] (46) at (-4.5, -15) {Back-\\pressure}; + \node [style=none] (47) at (-3, -12.75) {}; + \node [style=none] (48) at (-3, -17) {}; + \node [style=none] (49) at (0, -19) {}; + \node [style=none] (50) at (14, -19) {}; + \node [style=none] (51) at (0, -18.5) {}; + \node [style=none] (52) at (14, -18.5) {}; + \node [style=none] (53) at (11.5, -18) {BEGIN\_RESP}; + \node [style=none] (54) at (3.75, -19.75) {TLM\_COMPLETED}; \end{pgfonlayer} \begin{pgfonlayer}{edgelayer} - \draw (0.center) to (1.center); - \draw (2.center) to (3.center); - \draw [style=latex arrow] (6.center) to (7.center); - \draw [style=latex arrow] (9.center) to (10.center); - \draw [style=latex arrow] (11.center) to (12.center); - \draw [style=latex arrow] (14.center) to (13.center); - \draw [style=latex arrow] (15.center) to (16.center); - \draw [style=dashed line] (22.center) to (16.center); - \draw [style=dashed line] (21.center) to (14.center); + \draw [style=very thick line, in=90, out=-90] (0.center) to (1.center); + \draw [style=very thick line] (2.center) to (3.center); + \draw [style=very thick latex arrow] (6.center) to (7.center); + \draw [style=latex arrow blue] (9.center) to (10.center); + \draw [style=latex arrow blue] (11.center) to (12.center); + \draw [style=latex arrow blue] (14.center) to (13.center); + \draw [style=latex arrow blue] (15.center) to (16.center); \draw [style=latex double arrow] (24.center) to (25.center); + \draw [style=latex arrow green] (26.center) to (27.center); + \draw [style=latex arrow green] (29.center) to (28.center); + \draw [style=latex arrow red] (32.center) to (33.center); + \draw [style=latex arrow red] (39.center) to (38.center); + \draw [style=latex double arrow] (47.center) to 
(48.center); + \draw [style=dashed line] (38.center) to (44.center); + \draw [style=dashed line] (41.center) to (45.center); + \draw [style=dotted very thick green latex arrow] (49.center) to (50.center); + \draw [style=latex arrow green] (52.center) to (51.center); + \draw [style=dashed line] (21.center) to (27.center); + \draw [style=dashed line] (29.center) to (22.center); + \draw [style=latex arrow red] (41.center) to (42.center); \end{pgfonlayer} \end{tikzpicture} diff --git a/inc/0.titlepage.tex b/inc/0.titlepage.tex index 260f9bc..a7ecb6d 100644 --- a/inc/0.titlepage.tex +++ b/inc/0.titlepage.tex @@ -14,7 +14,7 @@ Microelectronic Systems Design Research Group \\[3ex] \textsc{\Huge Bachelor Thesis}\\[6ex] \centerline{\Large Development of a System-Level Simulation Frontend for DRAMSys} \vspace{20pt} -\centerline{\Large Should be here} +\centerline{\Large Entwicklung eines System-Level-Simulations-Frontends für DRAMSys} \vfill \vfill diff --git a/inc/3.systemc.tex b/inc/3.systemc.tex index 988abe5..099ce29 100644 --- a/inc/3.systemc.tex +++ b/inc/3.systemc.tex @@ -36,41 +36,47 @@ TLM abstracts the modeling of the communication between modules using so-called In contrast to pin and cycle accurate models, this greatly reduces the simulation overhead at the cost of reduced accuracy. Modules communicate with each other through \textit{initiator} sockets and \textit{target} sockets. -A processor for example sends requests to a memory using its initiator socket, whereas the memory responds trough its target socket. -Interconnect modules, which can be used to model a bus, use both sockets to communicate with both the initiator and the target modules. +A processor, for example, sends requests to a memory using its initiator socket, whereas the memory responds through its target socket. +Interconnect modules, which can be used to model a bus, use both sockets to communicate with both the initiator and the target modules. This concept is illustrated in figure \ref{fig:tlm}. 
-The transaction object itself is a \revabbr{generic payload}{GP}, which consists of address, command, status and other information as well as the actual data to transfer. -GPs are transferred as references, avoiding the need to copy them between the modules. +The transaction object itself is a \revabbr{generic payload}{GP}, which consists of the target address, whether the transaction is a read or write command, status information and other transaction parameters as well as the actual data to transfer. +GPs are passed along as references, avoiding the need to copy them between the modules. \input{img/thesis.tikzstyles} \begin{figure}[!ht] \begin{center} \tikzfig{img/tlm} -\caption{Forward and backward path between TLM sockets\cite{Menard2017}. $\blacksquare$ denotes an initiator socket, $\square$ denotes a target socket.} +\caption[Forward and backward path between TLM sockets\cite{Menard2017}.]{Forward and backward path between TLM sockets\cite{Menard2017}. $\blacksquare$ denotes an initiator socket, $\square$ denotes a target socket.} \label{fig:tlm} \end{center} \end{figure} -In the \revabbr{loosley-timed}{LT} coding style, a transaction is blocking, meaning it will be modeled by only one function call. -This comes at the cost of limited timing accuracy as only the beginning and the end of the transaction are modeled as timing points and no other module can initiate transcations during this time. +SystemC defines two coding styles for the use of TLM, called \revabbr{loosely-timed}{LT} and \revabbr{approximately-timed}{AT}. +In the LT coding style, a transaction is blocking, meaning it will be modeled by only one function call. +This comes at the cost of limited timing accuracy as only the beginning and the end of the transaction are modeled as timing points and the initiator has to wait for the transaction to complete until it can make the next request. 
+However, the fast simulation time, especially when \textit{temporal decoupling} with \textit{quanta} is used, makes it possible to use this coding style for rapid software development, like developing drivers for a simulated hardware component. +For such a task the timing accuracy is sufficient. -The \revabbr{approximateley-timed}{AT} coding style is non-blocking and consists of a four-phase handshake: +The AT coding style is non-blocking and therefore can be used to model with a higher timing accuracy than LT. +This high accuracy makes it possible to use AT to conduct design space exploration on a hardware level. +With AT, a special protocol is used that consists of a four-phase handshake: \texttt{BEGIN\_REQ}, \texttt{END\_REQ}, \texttt{BEGIN\_RESP} and \texttt{END\_RESP}. When an initiator requests certain data from a target, it starts the transaction with the \texttt{BEGIN\_REQ} phase using its \texttt{nb\_transport\_fw()} method. -The target now enqueues the payload into its \revabbr{payload event queue}{PEQ} and pretends it has received the payload after the delay, the initiator has specified. -When the target is not ready yet to accept the requests, it defers its \texttt{END\_REQ} until it is. -During this time, the initiator is blocked from sending further requests to other modules as the target applies \textit{back pressure} on the initiator. +The target now enqueues the payload into its \revabbr{payload event queue}{PEQ} and pretends it has received the payload after the delay the initiator has specified. +When the target is not yet ready to accept a new request, it defers its \texttt{END\_REQ} phase until it is. +During this time, the initiator is blocked from sending further requests either to this or to other modules as the target applies \textit{backpressure} on the initiator. This concept is called the \textit{exclusion rule}. 
The target now prepares the response and sends the \texttt{BEGIN\_RESP} phase through its \texttt{nb\_transport\_bw()} method when the data is available. -The initiator can now also apply back pressure to the target by deferring its \texttt{END\_RESP} phase. +The initiator can now also apply backpressure to the target by deferring its \texttt{END\_RESP} phase. When the \texttt{END\_RESP} phase is received by the target, the transaction is completed. -Figure \ref{fig:tlm_at} shows an exemplary handshake sequence diagram with all four phases. +% TODO: add a more complex handshake example here +Figure \ref{fig:tlm_at} shows an exemplary handshake sequence diagram of three different transactions. \begin{figure}[!ht] \begin{center} @@ -80,8 +86,26 @@ Figure \ref{fig:tlm_at} shows an exemplary handshake sequence diagram with all f \end{center} \end{figure} -SystemC also supports additional user-defined phases through its \texttt{DECLARE\_EXTENDED\_PHASE()} macro. -In contrast to the TLM-LT protocol, TLM-AT makes it possible model pipelining of transactions; multiple transactions can be processed by a target at one time. +SystemC defines various special cases and shortcuts that can be used throughout the protocol. +Both in the \texttt{BEGIN\_REQ} phase as well as in the \texttt{BEGIN\_RESP} phase, it is possible for the target to skip the \texttt{END\_REQ} phase or for the initiator to skip the \texttt{END\_RESP} phase respectively using the return value of the forward or backward transport function call. +For this, the return value of type \texttt{tlm\_sync\_enum} has to be set to \texttt{TLM\_UPDATED} instead of \texttt{TLM\_ACCEPTED}, which is used in the normal case. + +Besides using the return path to skip the \texttt{END\_REQ} phase, it is possible for the target to directly respond with the \texttt{BEGIN\_RESP} phase. +The initiator has to react accordingly and must detect that the \texttt{END\_REQ} has been skipped. 
+However, since the initiator is blocked due to backpressure, this shortcut should only be used if the response is ready to send after a very short delay. +Another form of this shortcut is the combination with the return path of the forward transport function call. +Here, the return path is used to directly send the \texttt{BEGIN\_RESP} phase, without invoking the backward transport function, reducing the required number of transport calls to only two. + +The last shortcut that can be made is the so-called \textit{early completion}. +When the target receives the \texttt{BEGIN\_REQ} phase, it can already place the requested data into the payload and pass \texttt{TLM\_COMPLETED} as the return value back to the initiator. +This notifies the initiator that the whole transaction is already completed at this point, so no further transport calls are required. +Note that this form of early completion is very similar to the LT coding style, where a transaction is modeled using only one function call. +Early completion can also be used by the initiator to skip the \texttt{END\_RESP} phase. +Here, \texttt{TLM\_COMPLETED} is returned during the backward transport call and thus the target experiences no backpressure from the initiator. + +SystemC also supports additional user-defined phases through its \texttt{DECLARE\_EXTENDED\_\\PHASE()} macro. +In contrast to the TLM-LT protocol, TLM-AT allows the pipelining of transactions to be modeled; multiple transactions can be processed simultaneously by one target. The responses also do not need to be in the same order as the initiator has sent them: they can be \textit{out out order}. -The TLM-AT protol is the used protocol to implement the processor model and the cache model in section \ref{sec:implementation} of this thesis. +The TLM-AT coding style is the protocol used to implement the processor model and the cache model in section \ref{sec:implementation} of this thesis. 
+Also, some of the earlier described shortcuts are taken advantage of throughout those models. diff --git a/inc/4.caches.tex b/inc/4.caches.tex index 5e572ef..20f4b76 100644 --- a/inc/4.caches.tex +++ b/inc/4.caches.tex @@ -155,7 +155,7 @@ Such a cache is called \textit{virtually indexed} and \textit{physically tagged} \begin{figure}[!ht] \begin{center} \tikzfig{img/virtual_address_conversion} -\caption{Virtually indexed, physically tagged cache.\cite{Jacob2008} ASID refers to address-space identifier.} +\caption[Virtually indexed, physically tagged cache\cite{Jacob2008}.]{Virtually indexed, physically tagged cache\cite{Jacob2008}. ASID refers to address-space identifier.} \label{fig:virtual_address_conversion} \end{center} \end{figure} @@ -198,7 +198,7 @@ An architecture of an MSHR file is illustrated in figure \ref{fig:mshr_file}. \begin{figure}[!ht] \begin{center} \tikzfig{img/mshr_file} -\caption{Miss Holding Status Register File.\cite{Jahre2007} V refers to a valid bit.} +\caption[Miss Holding Status Register File\cite{Jahre2007}.]{Miss Holding Status Register File\cite{Jahre2007}. V refers to a valid bit.} \label{fig:mshr_file} \end{center} \end{figure} diff --git a/inc/5.dramsys.tex b/inc/5.dramsys.tex index 195dadd..bd42b94 100644 --- a/inc/5.dramsys.tex +++ b/inc/5.dramsys.tex @@ -4,7 +4,7 @@ DRAMSys is an open-source design space exploration framework, capable of simulating the latest \revabbr{Joint Electron Device Engineering Council}{JEDEC} DRAM standards. It is optimized to achieve high simulation speeds and utilizes the TLM-AT coding style while still achieving cycle accurate results \cite{Steiner2020}. -DRAMSys is composed of an arbitration \& mapping unit (also called arbiter) and independent channel controllers with a DRAM device each. +DRAMSys is composed of an arbitration \& mapping unit (also called arbiter) and independent channel controllers, each driving one DRAM device. 
The general architecture of DRAMSys is illustrated in figure \ref{fig:dramsys}. \begin{figure}[!ht] @@ -14,15 +14,44 @@ The general architecture of DRAMSys is illustrated in figure \ref{fig:dramsys}. \label{fig:dramsys} \end{center} \end{figure} - -Several initiators can be connected to the arbiter, sending requests to the DRAM subsystem. +% doch noch über interne funktionen schreiben +Several initiators can be connected to DRAMSys at the same time, sending requests independently to the DRAM subsystem. An initiator can either be a sophisticated processor model like the gem5 out of order processor model \cite{Binkert2011} or a trace player that simply replays a trace file containing a sequence of memory requests and timestamps. To support a large variety of DRAM standards robustly and error-free, DRAMSys uses a formal domain specific language based on Petri nets called DRAMml. This language includes a standards timing dependencies between all DRAM commands and compiles to source code of the internal timing checkers that ensure compliance to the specific standard \cite{Jung2017a}. -Because a single memory access can cause the issuance of multiple commands (e.g. precharge (\texttt{PRE}), activate (\texttt{ACT}), read (\texttt{RD}) or write (\texttt{WR})), the four phase handshake of the TLM-AT protocol is not sufficient enough. +Since a single memory access can result in the issuance of multiple commands (e.g. a precharge (\texttt{PRE}), an activate (\texttt{ACT}), a read (\texttt{RD}) or a write (\texttt{WR})), the four phase handshake of the TLM-AT protocol is not sufficient. Therefore, a custom TLM protocol called DRAM-AT is used as the communication protocol between the channel controller and the DRAM device \cite{Steiner2020}. +This custom protocol introduces a \texttt{BEGIN} and \texttt{END} phase for every available DRAM command. +Which commands can be issued depends on the used DRAM standard. 
+ +Some of the internal modules and their functionality will now be explained. +The task of the \textit{arbiter} is to accept the incoming transactions from the various initiators and decode the address according to the configured address mapping. +From there, the transactions are passed to the channel controller. + +The channel controller is the centerpiece of the DRAM simulation, consisting of a \textit{scheduler}, \textit{bank machines}, \textit{refresh managers}, \textit{power down managers}, a \textit{response queue} and a \textit{command multiplexer}. +New incoming requests get placed into the scheduler. +The purpose of the scheduler is to group transactions by their accessed memory bank and reorder the payloads according to a predefined policy. +Available policies are, for example, the \textit{first-in, first-out} or the \textit{first-ready -- first-come, first-served} policy. +The former policy does not reorder payloads and therefore optimizes for a short response latency, whereas the latter policy does reorder payloads and optimizes for a high memory bandwidth. + +A bank machine, whose responsibility is to manage the state of its corresponding memory bank, then fetches the next transaction from the scheduler. +There are also a number of available policies for the bank machines, each of which determines in which state the bank is being held after a completed memory request. + +With the fetched transaction, the bank machine then selects the command that it needs to send to its memory bank to enforce its policy. +However, the selected command cannot be sent instantaneously to the DRAM, as complex timing constraints need to be satisfied before the issuance of a specific command. +To obey those timing constraints, the bank machine verifies through the so-called \textit{timing checker} that the selected command is allowed to be sent to the memory. +The bank machine then tries to enqueue the command, so that the controller can send it out. 
+ +The task of the command multiplexer is to select one command out of all commands that have been enqueued by the bank machines, the refresh managers or the power down managers. +The command multiplexer also has a set of configurable policies that determine the commands' individual priorities. +The selected command will then be sent out to the DRAM by the controller. + +The last important module to mention is the response queue. +The completed DRAM transactions are enqueued into the response queue to send the responses back to the initiators. +In the response queue, the responses can either be passed to the initiator using the \textit{first-in, first-out} scheme, or first be reordered in the queue itself. +A reordering might be necessary to be able to support initiators that cannot handle \textit{out-of-order} responses. % Evtl TA falls Bilder genutzt werden? DRAMSys also provides the so-called \textit{Trace Analyzer}, a graphical tool that visualizes database files created by DRAMSys. @@ -30,10 +59,10 @@ It shows the \texttt{REQ} and \texttt{RESP} phases between the initiator and the An example trace database, visualized in the Trace Analyzer is shown in figure \ref{fig:traceanalyzer}. Furthermore, the Trace Analyzer is capable of calculating numerous metrics and creating plots of interesting characteristics. -\begin{figure}[!ht] +\begin{figure}%[!ht] \begin{center} -\includegraphics{img/traceanalyzer.pdf} -\caption{Exemplary visualization of a trace database in the Trace Analyzer.} +\includegraphics[width=\linewidth]{img/traceanalyzer.pdf} +\caption[Exemplary visualization of a trace database in the Trace Analyzer.]{Exemplary visualization of a trace database in the Trace Analyzer. 
The used DRAM consists of one rank and eight bank groups with two banks each.} \label{fig:traceanalyzer} \end{center} \end{figure} diff --git a/inc/6.implementation.tex b/inc/6.implementation.tex index 5b039fc..fb1fafa 100644 --- a/inc/6.implementation.tex +++ b/inc/6.implementation.tex @@ -83,7 +83,7 @@ This instruction count is used to approximate the delay between the memory acces 1,w,16,1190cf2b0 0,w,16,1190cf2c0 \end{textcode} -\caption{Example of a memory access trace with a timestamp. For each thread, a seperate trace file is generated.} +\caption[Example of a memory access trace with a timestamp.]{Example of a memory access trace with a timestamp. For each thread, a separate trace file is generated.} \label{list:memtrace} \end{listing} @@ -172,16 +172,16 @@ It has a configurable size, associativity, cache line size, MSHR buffer depth, w To understand how the cache model works, a hypothetical request from the CPU will be assumed to explain the internal processing of the transaction in detail: When the transaction arrives, it will be placed in the PEQ of the cache from where, after the specified amount of delay, the handler for the \texttt{BEGIN\_REQ} phase is called. -The handler verifies that the cache buffers are not full\footnote{Otherwise the cache will apply back pressure on the CPU and postpone the handling of the transaction.} and checks if the requested data is stored in the cache. +The handler verifies that the cache buffers are not full\footnote{Otherwise the cache will apply backpressure on the CPU and postpone the handling of the transaction.} and checks if the requested data is stored in the cache. If it is the case (i.e. a cache hit), the cache model sends immediately an \texttt{END\_REQ} and, when the target socket is not currently occupied with a response, accesses the cache and sends the \texttt{BEGIN\_RESP} phase to the processor. 
During a cache access, the content of the cache line is copied into the transaction in case of a read request, or the cache line is updated with the new value in case of a write request. Further, in both cases the timestamp of the last access is updated to the current simulation time. -The processor then finalizes the transaction with the \texttt{END\_RESP} phase, the target back pressure of the cache will be cleared and the postponed request from the CPU (if it exists) is now placed into the PEQ once again. +The processor then finalizes the transaction with the \texttt{END\_RESP} phase, the target backpressure of the cache will be cleared and the postponed request from the CPU (if it exists) is now placed into the PEQ once again. If, on the other hand, the requested data is not in the cache (i.e. a cache miss), first it will be checked if there is already an existing MSHR entry for the corresponding cache line. If this is the case\footnote{And if the target list of the MSHR entry is not full. Otherwise the transaction will be postponed.}, the transaction is appended to it as an additional target. If not, a cache line is evicted to make space for the new cache line that will be fetched from the underlying memory. -The cache model implements the optimal replacement policy \revabbr{least recently used}{LRU}, so the cache line with the oldest last access time is chosen to be evicted. +The cache model implements the optimal replacement policy LRU, so the cache line with the oldest last access time is chosen to be evicted. When an eviction is not possible, the transaction will be postponed. An eviction is not possible when the selected cache line is allocated but not yet filled with requested data from the underlying cache, the cache line is currently present in the MSHR queue, or a hit for this cache line is yet to be handled. When the \texttt{dirty} flag of the old cache line is set, it has to be placed into the write buffer and written back to the memory. 
@@ -192,7 +192,7 @@ To process the entries in the MSHR and in the write buffer, the \texttt{processM In the former, a not yet issued MSHR entry is selected for which a new fetch transaction is generated and sent to the underlying memory. Note that special care has to be taken when the requested cache line is also present in the write buffer: To ensure consistency, no new request is sent to the DRAM and instead the value is snooped out of the write buffer. -In the latter, the processing of the write back buffer, a not yet issued entry is selected and a new write transaction is sent to the memory.\footnote{Both \texttt{processMshrQueue()} and \texttt{processWriteBuffer()} also need to ensure that currently no back pressure is applied onto the cache from the memory.} +In the latter, the processing of the write back buffer, a not yet issued entry is selected and a new write transaction is sent to the memory.\footnote{Both \texttt{processMshrQueue()} and \texttt{processWriteBuffer()} also need to ensure that currently no backpressure is applied onto the cache from the memory.} Incoming transactions from the memory side are accepted with a \texttt{END\_RESP} and, in case of a fetch transaction, used to update the cache contents and possibly preparing a new response transaction for the processor as described before. @@ -203,7 +203,7 @@ The rough internal structure of the cache model is shown again in figure \ref{fi \begin{figure} \begin{center} \tikzfig{img/cache} -\caption{Internal architecture of the cache model. \textit{V} stands for \textit{valid}, \textit{D} for \textit{dirty}, \textit{A} for \textit{allocated}, \textit{T} for \textit{tag}, \textit{AT} for \textit{access time}, \textit{I} for \textit{issued} and \textit{Idx} for \textit{index}. In the cache line array, adjacent lines with the same addressing index are colored in the same gray shade. 
The size of such a group is the \textit{associativity}.} +\caption[Internal architecture of the cache model.]{Internal architecture of the cache model. \textit{V} stands for \textit{valid}, \textit{D} for \textit{dirty}, \textit{A} for \textit{allocated}, \textit{T} for \textit{tag}, \textit{AT} for \textit{access time}, \textit{I} for \textit{issued} and \textit{Idx} for \textit{index}. In the cache line array, adjacent lines with the same addressing index are colored in the same gray shade. The size of such a group is the \textit{associativity}.} \label{fig:cache} \end{center} \end{figure} @@ -236,7 +236,7 @@ So with the new trace player interface, a top-level initiator can either be a si As already seen in figure \ref{fig:dbiplayer_with_caches}, interconnection modules are needed to connect the caches with each other. While the implementation of the \textit{MultiCoupler} component is trivial as it only passes the transactions from its so-called \texttt{multi\_passthrough\_target\_socket} to its \texttt{multi\_passthrough\_initiator\_socket}, the \textit{MultiSimpleCoupler} is more complex because it has to internally buffer transactions. -In order to understand why this buffering needed, consider scenario where the L3 cache applies back pressure to one L2 cache. +In order to understand why this buffering is needed, consider a scenario where the L3 cache applies backpressure to one L2 cache. The L2 cache is not allowed to send further requests due to the exclusion rule. But since the target socket of the L3 cache is occupied, this also applies to all other other L2 caches. This information, however, is not propagated to the other caches, leading to an incorrect behavior if not addressed, as the other caches will send further requests. @@ -249,7 +249,7 @@ For illustrating this further, a simple example can be assumed: One L2 cache needs to request a cache line from the underlying L3 cache. 
The MultiSimpleCoupler receives the \texttt{BEGIN\_REQ} phase and places it into its PEQ. From there, an internal routing table is updated to be able to send the response back through the correct multi-socket binding afterwards. -As the L3 cache is currently not applying back pressure onto the interconnect, it can forward the transaction with the \texttt{BEGIN\_REQ} phase to the L3 cache. +As the L3 cache is currently not applying backpressure onto the interconnect, it can forward the transaction with the \texttt{BEGIN\_REQ} phase to the L3 cache. Until the L3 cache responds with the \texttt{END\_REQ} phase, the interconnect defers any new request from any L2 cache and buffers the payload objects in an internal data structure. When the \texttt{END\_REQ} phase is received, the next transaction from this request buffer is sent to the L3 cache. After some time the, L3 cache will respond with the requested cache lines. diff --git a/inc/7.simulation_results.tex b/inc/7.simulation_results.tex index 79da579..4f658f6 100644 --- a/inc/7.simulation_results.tex +++ b/inc/7.simulation_results.tex @@ -83,7 +83,7 @@ Their access patterns are as followed: In the following, the simulation results of the new simulation frontend, the gem5 full-system emulation and the gem5 syscall-emulation will now be presented. \begin{table}[!ht] -\caption{Results for bandwidth and bytes read/written with DDR3-1600. FS denotes gem5 full-system, SE denotes gem5 syscall-emulation, DS denotes DRAMSys.} +\caption[Results for bandwidth and bytes read/written with DDR3-1600.]{Results for bandwidth and bytes read/written with DDR3-1600. FS denotes gem5 full-system, SE denotes gem5 syscall-emulation, DS denotes DRAMSys.} \begin{center} \begin{tabular}{|c|c|c|c|c|c|c|c|c|c|} \hline