Apply Lukas' corrections
This commit is contained in:
10
doc.bib
10
doc.bib
@@ -1,5 +1,5 @@
|
||||
@Article{Bruening2003,
|
||||
author = {Bruening, D. and Garnett, T. and Amarasinghe, S.},
|
||||
author = {Bruening, Derek and Garnett, Timothy and Amarasinghe, Saman P.},
|
||||
journal = {International Symposium on Code Generation and Optimization, 2003. CGO 2003.},
|
||||
title = {An infrastructure for adaptive dynamic optimization},
|
||||
year = {2003},
|
||||
@@ -7,7 +7,7 @@
|
||||
}
|
||||
|
||||
@Article{Bruening2004,
|
||||
author = {Bruening, D.},
|
||||
author = {Bruening, Derek},
|
||||
journal = {Massachusetts Institute of Technology},
|
||||
title = {Efficient, transparent, and comprehensive runtime code manipulation},
|
||||
year = {2004},
|
||||
@@ -31,14 +31,14 @@
|
||||
}
|
||||
|
||||
@Book{Jacob2008,
|
||||
author = {B. Jacob and S. W. Ng and D. T. Wang},
|
||||
author = {Bruce Jacob and Spencer W. Ng and David T. Wang},
|
||||
publisher = {Morgan Kaufmann},
|
||||
title = {Memory Systems: Cache, DRAM, Disk},
|
||||
year = {2008},
|
||||
}
|
||||
|
||||
@Article{Jahre2007,
|
||||
author = {Jahre, M. and Natvig, L.},
|
||||
author = {Jahre, Magnus and Natvig, Lasse},
|
||||
title = {Performance Effects of a Cache Miss Handling Architecture in a Multi-core Processor},
|
||||
year = {2007},
|
||||
}
|
||||
@@ -86,7 +86,7 @@
|
||||
}
|
||||
|
||||
@Book{Jung2017,
|
||||
author = {Jung, M.},
|
||||
author = {Jung, Matthias},
|
||||
publisher = {Technische Universit{\"a}t Kaiserslautern},
|
||||
title = {System-level Modeling, Analysis and Optimization of DRAM Memories and Controller Architectures},
|
||||
year = {2017},
|
||||
|
||||
50
img/associativity.tikz
Normal file
50
img/associativity.tikz
Normal file
@@ -0,0 +1,50 @@
|
||||
\begin{tikzpicture}
|
||||
\begin{pgfonlayer}{nodelayer}
|
||||
\node [style=cache entry] (0) at (0, 0.5) {};
|
||||
\node [style=cache entry] (1) at (0, 1) {};
|
||||
\node [style=cache entry] (2) at (0, 1.5) {};
|
||||
\node [style=cache entry] (3) at (0, 2) {};
|
||||
\node [style=cache entry] (4) at (0, 2.5) {};
|
||||
\node [style=cache entry] (5) at (0, 3) {};
|
||||
\node [style=cache entry] (6) at (0, 3.5) {};
|
||||
\node [style=cache entry] (7) at (0, 4) {};
|
||||
\node [style=cache entry] (8) at (5, 2.5) {};
|
||||
\node [style=cache entry] (9) at (5, 3) {};
|
||||
\node [style=cache entry] (10) at (5, 3.5) {};
|
||||
\node [style=cache entry] (11) at (5, 4) {};
|
||||
\node [style=cache entry] (12) at (7, 2.5) {};
|
||||
\node [style=cache entry] (13) at (7, 3) {};
|
||||
\node [style=cache entry] (14) at (7, 3.5) {};
|
||||
\node [style=cache entry] (15) at (7, 4) {};
|
||||
\node [style=cache entry] (16) at (5, 0.5) {};
|
||||
\node [style=cache entry] (17) at (7, 0.5) {};
|
||||
\node [style=cache entry] (18) at (9, 0.5) {};
|
||||
\node [style=cache entry] (19) at (11, 0.5) {};
|
||||
\node [style=cache entry] (20) at (13, 0.5) {};
|
||||
\node [style=cache entry] (21) at (15, 0.5) {};
|
||||
\node [style=cache entry] (22) at (17, 0.5) {};
|
||||
\node [style=cache entry] (23) at (19, 0.5) {};
|
||||
\node [style=cache entry] (24) at (17, 3.5) {};
|
||||
\node [style=cache entry] (25) at (17, 4) {};
|
||||
\node [style=cache entry] (26) at (13, 3.5) {};
|
||||
\node [style=cache entry] (27) at (13, 4) {};
|
||||
\node [style=cache entry] (28) at (19, 3.5) {};
|
||||
\node [style=cache entry] (29) at (19, 4) {};
|
||||
\node [style=cache entry] (30) at (15, 3.5) {};
|
||||
\node [style=cache entry] (31) at (15, 4) {};
|
||||
\node [style=none] (32) at (21, 1) {};
|
||||
\node [style=none] (33) at (21, 4.5) {};
|
||||
\node [style=none] (34) at (25, 1) {};
|
||||
\node [style=none] (35) at (0.75, -0.25) {};
|
||||
\node [style=none] (36) at (1, -0.25) {\scriptsize Direct-mapped};
|
||||
\node [style=none] (38) at (7.25, 1.75) {\scriptsize Two-way set associative};
|
||||
\node [style=none] (39) at (6.25, -0.25) {\scriptsize Fully associative};
|
||||
\node [style=none] (40) at (15.25, 2.75) {\scriptsize Four-way set associative};
|
||||
\node [style=align left] (41) at (22.75, 4) {\scriptsize equivalence\\\scriptsize classes\\\scriptsize ("sets")};
|
||||
\node [style=align left] (42) at (23.75, 0) {\scriptsize items in same set or\\ \scriptsize equivalence class};
|
||||
\end{pgfonlayer}
|
||||
\begin{pgfonlayer}{edgelayer}
|
||||
\draw [style=latex arrow] (32.center) to (33.center);
|
||||
\draw [style=latex arrow] (32.center) to (34.center);
|
||||
\end{pgfonlayer}
|
||||
\end{tikzpicture}
|
||||
@@ -1,28 +1,27 @@
|
||||
\begin{tikzpicture}
|
||||
\begin{pgfonlayer}{nodelayer}
|
||||
\node [style=none] (0) at (0.5, 0) {};
|
||||
\node [style=none] (1) at (-2.25, 4) {};
|
||||
\node [style=none] (2) at (-1.5, 9.25) {};
|
||||
\node [style=none] (3) at (3, 12.75) {};
|
||||
\node [style=none] (4) at (9.25, 13) {};
|
||||
\node [style=none] (5) at (15.5, 12.25) {};
|
||||
\node [style=none] (6) at (17.25, 1.75) {};
|
||||
\node [style=none] (7) at (12.5, -0.5) {};
|
||||
\node [style=none] (8) at (6.75, -0.75) {};
|
||||
\node [style=wrap text] (10) at (6.25, 12) {Scheduling\\Policy};
|
||||
\node [style=wrap text] (11) at (11, 8.25) {Number\\of\\Devices};
|
||||
\node [style=wrap text] (13) at (-0.25, 6.5) {Refresh\\Policy};
|
||||
\node [style=wrap text] (14) at (7, 1.5) {Number\\of\\Channels};
|
||||
\node [style=wrap text] (16) at (16.25, 9.25) {Power\\Down\\Policy};
|
||||
\node [style=wrap text] (17) at (10.5, 5) {DRAM\\Speed};
|
||||
\node [style=wrap text] (18) at (5, 5) {Page\\Policy};
|
||||
\node [style=wrap text] (19) at (1.5, 2) {Command\\Multiplexer\\Policy};
|
||||
\node [style=wrap text] (20) at (12.5, 1.75) {Response\\Queue\\Policy};
|
||||
\node [style=wrap text] (21) at (5.25, 8.5) {Address\\Mapping};
|
||||
\node [style=wrap text] (22) at (1.25, 10.75) {DRAM\\Standard};
|
||||
\node [style=wrap text] (23) at (12.25, 11.5) {DRAM\\Capacity};
|
||||
\node [style=wrap text] (24) at (16.75, 4.75) {Timing\\Parameters};
|
||||
\node [style=none] (25) at (18.75, 6.75) {};
|
||||
\node [style=none] (0) at (0.25, -0.5) {};
|
||||
\node [style=none] (1) at (-2.5, 4) {};
|
||||
\node [style=none] (2) at (-1.75, 9.25) {};
|
||||
\node [style=none] (3) at (3, 13.25) {};
|
||||
\node [style=none] (4) at (9.25, 13.5) {};
|
||||
\node [style=none] (5) at (15.75, 12.75) {};
|
||||
\node [style=none] (6) at (17.5, 1.25) {};
|
||||
\node [style=none] (7) at (12.5, -1) {};
|
||||
\node [style=none] (8) at (6.75, -1.25) {};
|
||||
\node [style=cloud elements] (10) at (6.25, 12.5) {Scheduling\\Policy};
|
||||
\node [style=cloud elements] (11) at (12.25, 8) {Number\\of\\Devices};
|
||||
\node [style=cloud elements] (13) at (-0.5, 6.5) {Refresh\\Policy};
|
||||
\node [style=cloud elements] (14) at (5.25, 7.25) {Number\\of\\Channels};
|
||||
\node [style=cloud elements] (16) at (16.5, 9.5) {Power\\Down\\Policy};
|
||||
\node [style=cloud elements] (17) at (6.75, 1.25) {DRAM\\Speed};
|
||||
\node [style=cloud elements] (18) at (9, 5) {Page\\Policy};
|
||||
\node [style=cloud elements] (20) at (12.5, 1.25) {Number\\of\\Ranks};
|
||||
\node [style=cloud elements] (21) at (1.5, 2.5) {Address\\Mapping};
|
||||
\node [style=cloud elements] (22) at (1, 10.75) {DRAM\\Standard};
|
||||
\node [style=cloud elements] (23) at (12.25, 12.5) {DRAM\\Capacity};
|
||||
\node [style=cloud elements] (24) at (17, 4.5) {Timing\\Parameters};
|
||||
\node [style=none] (25) at (19, 6.75) {};
|
||||
\end{pgfonlayer}
|
||||
\begin{pgfonlayer}{edgelayer}
|
||||
\draw [style=very thick line, bend left=75, looseness=1.25] (0.center) to (1.center);
|
||||
|
||||
@@ -29,6 +29,9 @@
|
||||
\tikzstyle{generic block}=[fill=white, draw=black, shape=rectangle, minimum height=1.25cm, minimum width=1.75cm, align=center]
|
||||
\tikzstyle{payload}=[fill=white, draw=black, shape=rectangle, dashed, align=center]
|
||||
\tikzstyle{wrap text}=[fill=none, draw=none, shape=circle, align=center]
|
||||
\tikzstyle{cloud elements}=[fill=none, draw={rgb,255: red,180; green,180; blue,180}, shape=circle, dashed, align=center, very thin]
|
||||
\tikzstyle{cache entry}=[fill=white, draw=black, shape=rectangle, minimum width=1cm, minimum height=0.25cm]
|
||||
\tikzstyle{align left}=[fill=none, draw=none, shape=circle, align=left]
|
||||
|
||||
% Edge styles
|
||||
\tikzstyle{dashed line}=[-, dashed]
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
\section{Introduction}
|
||||
\label{sec:introduction}
|
||||
|
||||
Todays computing systems accompany us in almost all areas of life in the form of smart devices, computers, or game consoles.
|
||||
%vlt noch warum DRAMs immer mehr eingesetzt werden
|
||||
Today's computing systems accompany us in almost all areas of life in the form of smart devices, computers, or game consoles.
|
||||
With the increasing performance requirements on these devices, not only faster processors are needed, but also high-performance memory systems, namely dynamic random access memories, which are supposed to deliver a lot of bandwidth at low latency.
|
||||
While these storage systems are very complex and offer a lot of room for configuration, as the used DRAM standard, the memory controller configuration or the address mapping, there are different requirements for the very different applications\cite{Gomony2012}.
|
||||
Consequently, system designers are commissioned with the complex task of finding the most effective configurations that match the performance and power contraints with good optimizations applied for the specific use case.
|
||||
While these storage systems are very complex and offer a lot of room for configuration, e.g., the \revabbr{dynamic random-access memory}{DRAM} standard, the memory controller configuration or the address mapping, there are different requirements for the very different applications\cite{Gomony2012}.
|
||||
Consequently, system designers are entrusted with the complex task of finding the most effective configurations that match the performance and power constraints with good optimizations applied for the specific use case.
|
||||
|
||||
\input{img/thesis.tikzstyles}
|
||||
\begin{figure}[!ht]
|
||||
@@ -15,24 +15,24 @@ Consequently, system designers are commissioned with the complex task of finding
|
||||
\end{center}
|
||||
\end{figure}
|
||||
|
||||
For the exploration of the design space for these configurations it is impractical to use real systems as they expensive and are not suitable for rapid prototyping.
|
||||
For the exploration of the design space of these configurations it is impractical to use real systems as they are too cost-intensive and not modifiable and therefore not suitable for rapid prototyping.
|
||||
To overcome this limitation, it is important to simulate the memory system using a simulation framework with sufficient accuracy.
|
||||
|
||||
Such a simulation framework is DRAMSys\cite{Steiner2020}\cite{Jung2017}, which is based on transaction level modeling and enables the fast simulation of numerous DRAM standards and controller configurations with cycle-accuracy.
|
||||
Stimuli for the memory system can either be generated using a prerecorded trace file with fixed or relative timestamps, a traffic generator that acts as a state machine and initiates different request patterns or a detailed processor model of the gem5\cite{Binkert2011} simulation framework.
|
||||
Such a simulation framework is DRAMSys\cite{Steiner2020}\cite{Jung2017}, which is based on SystemC \revabbr{transaction level modeling}{TLM} and enables the fast simulation of numerous DRAM standards and controller configurations with cycle-accuracy.
|
||||
Stimuli for the memory system can either be generated using a prerecorded trace file with timestamps, a traffic generator that acts as a state machine and initiates different request patterns, or a detailed processor model of the gem5\cite{Binkert2011} simulation framework.
|
||||
|
||||
However, the two former methods lack in accurary whereas the latter may provide the sufficient precision but represents a very time-consuming effort.
|
||||
However, the two former methods lack accuracy whereas the latter may provide the sufficient precision but is a very time-consuming effort.
|
||||
To fill this gap of fast but accurate traffic generation, a new simulation frontend for DRAMSys is developed and presented in this thesis.
|
||||
|
||||
The methology this new framwork is based on is dynamic binary instrumentation.
|
||||
It allows the extraction of memory accesses of multi-threaded applications as they are executed on real hardware.
|
||||
These memory access traces then are played back using a simplified core model and filtered by a cache model before the memory requests are passed to the DRAM.
|
||||
This allows an accurate modeling of the system and the variing of numerous configuration parameters in a short time.
|
||||
The methodology this new framework is based on is dynamic binary instrumentation.
|
||||
It allows the extraction of memory accesses of multi-threaded applications while they are executed on real hardware.
|
||||
These memory access traces are then played back using a simplified core model and are filtered by a cache model before the memory requests are passed to the DRAM.
|
||||
This allows an accurate modeling of the system and the variation of numerous configuration parameters in a short time.
|
||||
|
||||
The remainder of the thesis is structured as follows:
|
||||
In section \ref{sec:dynamorio} the used dynamic binary instrumentation framework, DynamoRIO, is introduced.
|
||||
The section \ref{sec:systemc} presents the modeling language SystemC, the developed core and cache models are based on.
|
||||
After that, the section \ref{sec:caches} gives a short overview over modern cache architectures and their high-level implementations.
|
||||
Section \ref{sec:dramsys} introduces the DRAMSys simulator framework and its basic functionalities.
|
||||
Section \ref{sec:implementation} concerns with the implementation of the cache model, the processor model and the instrumentation tool.
|
||||
In section \ref{sec:simulation_results} the accuracy of the new framwork is compared against the gem5 and Ramulator\cite{Kim2016} simulators, whereas section \ref{sec:future_work} denotes which future improvements can be achieved.
|
||||
In Section \ref{sec:dynamorio} the used dynamic binary instrumentation framework, DynamoRIO, is introduced.
|
||||
Section \ref{sec:systemc} presents the modeling language SystemC, on which the developed core and cache models are based on.
|
||||
After that, Section \ref{sec:caches} gives a short overview of modern cache architectures and their high-level implementations.
|
||||
Section \ref{sec:dramsys} introduces the DRAMSys simulation framework and its basic functionalities.
|
||||
Section \ref{sec:implementation} explains the implementation of the cache model, the processor model and the instrumentation tool in detail.
|
||||
In Section \ref{sec:simulation_results} the accuracy of the new framework is compared against the gem5 and Ramulator\cite{Kim2016} simulators, whereas Section \ref{sec:future_work} denotes future improvements that can be achieved.
|
||||
|
||||
@@ -2,71 +2,70 @@
|
||||
\label{sec:dynamorio}
|
||||
|
||||
This section will give a short overview of the dynamic binary instrumentation tool DynamoRIO, which will be used throughout this thesis.
|
||||
The exlained topics are mainly based on the chapter \textit{DynamoRIO} and \textit{Code Cache} of \cite{Bruening2004} as well as on \cite{Bruening2003}.
|
||||
The explained topics are mainly based on the chapters \textit{``DynamoRIO''}, \textit{``Code Cache''} and \textit{``Transparency''} of \cite{Bruening2004} as well as on \cite{Bruening2003}.
|
||||
|
||||
\subsection{Dynamic Binary Instrumentation}
|
||||
\label{sec:dbi}
|
||||
|
||||
\revabbr{Dynamic binary instrumentation}{DBI} is a method for analyzing, profiling, manipulating and optimizng the behavior of a binary application while it is executing.
|
||||
This is achieved through the injection of additional instructions into the instruction trace of the target application, that either accumulate statistics or intervene the instruction trace.
|
||||
\revabbr{Dynamic binary instrumentation}{DBI} is a method to analyze, profile, manipulate and optimize the behavior of a binary application while it is executed.
|
||||
This is achieved through the injection of additional instructions into the instruction trace of the target application, which either accumulate statistics or intervene in the instruction trace.
|
||||
|
||||
In comparison, debuggers use special breakpoint instructions (e.g. INT3 on x86 or BKPT on ARM) that get injected at specific places in the code, raising a debug exception when reaching it.
|
||||
In comparison, debuggers use special breakpoint instructions (e.g. INT3 on x86 or BKPT on ARM) that are injected at specific places in the code, raising a debug exception when reaching it.
|
||||
At those exceptions a context switch to the operating system kernel will be performed.
|
||||
However, those context switches result in a significant performance penalty as the processor state has to be saved and restored afterwards, making it less efficient than DBI.
|
||||
|
||||
DBI tools can either start the target application by themselfes or attach to the applications process dynamically.
|
||||
DBI tools can either invoke the target application by themselves or are attached to the application's process dynamically.
|
||||
The former method allows instrumentation of even the early startup stage of the application whereas the latter method might be used if the application has to be first brought into a certain state or the process cannot be restarted due to reliability reasons.
|
||||
Some DBI tools also allow for directly implementing the DBI framework into the applications source code.
|
||||
While this removes the flexibility of observing applications that are only available in binary form, this enables the control over the DBI tool using its application interface.
|
||||
Some DBI tools also allow directly implementing the DBI framework into the application's source code.
|
||||
While this removes the flexibility of observing applications that are only available in binary form, it enables the control over the DBI tool using its application interface.
|
||||
With this method, it is possible to precisely instrument only a specific code region of interest and otherwise disable the tool for performance reasons.
|
||||
|
||||
In all cases, the instrumentation tool executes in the same process as the target application and implants itself into the its address space.
|
||||
While this enables great control of the DBI tool over the target application, it becomes important that it operates transparently, meaning that it will not affect the applications behavior in unintended ways.
|
||||
In all cases, the instrumentation tool executes in the same process and address space as the target application.
|
||||
While this enables great control of the DBI tool over the target application, it becomes important that the tool operates transparently, meaning that the application's behavior is not affected in unintended ways.
|
||||
This is a special challenge as the instrumentation tool as well as the user-written instrumentation clients are not allowed to use library routines for memory operations/allocation, synchronization or input/output buffering that interfere with the target application \cite{Bruening2003}.
|
||||
This is especially the case with library routines that are not reentrant, which means they are unsafe to call concurrently.
|
||||
In particular, this is the case with library routines that are not \textit{reentrant}, which means they are unsafe to call concurrently.
|
||||
The dispatcher of the DBI tool can run in arbitrary places, also during non-reentrant routines.
|
||||
When the instrumentation tool or user-written client then calls the same non-reentrant routine, undefined behavior would be the consequence.
|
||||
When the instrumentation tool or user-written client calls the same non-reentrant routine concurrently, undefined behavior would be the consequence.
|
||||
|
||||
Although it is evident, the user-written client should make no assumptions over the running systems behavior and should restore all modified registers and processor states unless it is an intentional interference with the application.
|
||||
Most DBI tools offer the use of two distinct methods of injecting user code into the applications trace:
|
||||
In one case the framework saves all relevant registers and flags by itself and dispatches the execution to a user-defined function.
|
||||
Although it is evident, the user-written client should make no assumptions on the running system's behavior and should restore all modified registers and processor states unless it is an intentional interference with the application.
|
||||
Most DBI tools offer the use of two distinct methods of injecting user code into the application's trace; in one case, the framework saves all relevant registers and flags by itself and dispatches the execution to a user-defined function.
|
||||
This is the easiest method, but comes at the cost of the described context switch.
|
||||
The more advanced approach is the injection of a few but sufficient instructions directly into the application's instruction trace.
|
||||
Here, it is the responsibility of the user to save and restore all altered states.
|
||||
|
||||
Generally speaking, the application should have no possibility to be able to detect that it is being instrumented by a DBI tool and should execute the same way as it would normally, even when the application itself commits incorrect behavior such as accessing invalid memory regions.
|
||||
Generally speaking, the application should have no possibility to detect that it is being instrumented by a DBI tool and should execute the same way as it would do normally, even when the application itself exhibits incorrect behavior such as accessing invalid memory regions.
|
||||
|
||||
In summary, dynamic code analysis has the full runtime information available, unlike static code analysis, which cannot predict the execution path of the program.
|
||||
So DBI can be a mature choice for examining the runtime behavior of a binary application in a performant way.
|
||||
|
||||
The following section \ref{sec:dynamorio_core} will explain how the core functionality of the DBI tool DynamoRIO works.
|
||||
The following Section \ref{sec:dynamorio_core} will explain how the core functionality of the DBI tool DynamoRIO works.
|
||||
|
||||
\subsection{Core Functionality}
|
||||
\label{sec:dynamorio_core}
|
||||
|
||||
A simple way observe and potentially modify the instructions of an application during execution is the use of an interpretation engine that emulates the binary executable in its entirety.
|
||||
A simple way to observe and potentially modify the instructions of an application during execution is the use of an interpretation engine that emulates the binary executable in its entirety.
|
||||
One widely used framework that uses this technique is for example Valgrind\cite{Valgrind}.
|
||||
At its core, Valgrind uses a virtual machine and just-in-time compilation to instrument the target application.
|
||||
However, this approach might be powerful but comes at the cost of significantly reduced performance.
|
||||
This approach might be powerful, but it comes at the cost of significantly reduced performance.
|
||||
|
||||
DynamoRIO on the other hand uses a so-called \textit{code cache} where \textit{basic blocks} get copied into prior to execution.
|
||||
A basic block is a sequence of instructions extracted from the target applications binary that end with a single control transfer instruction.
|
||||
DynamoRIO, on the other hand, uses a so-called \textit{code cache} where \textit{basic blocks} are copied into prior to execution.
|
||||
A basic block is a sequence of instructions extracted from the target application's binary that end with a single control transfer instruction.
|
||||
In the code cache, the instrumentation instructions will directly be inserted.
|
||||
|
||||
To be able to execute the modified code, basic blocks in the code cache get extended by two \textit{exit stubs}, ensuring that at the end the control is transferred back to DynamoRIO via a context switch.
|
||||
To be able to execute the modified code, basic blocks in the code cache are extended by two \textit{exit stubs}, ensuring that at the end the control is transferred back to DynamoRIO via a context switch.
|
||||
From there the applications and processor state is saved and the next basic block will be copied into the code cache, modified and executed after restoring the previously saved state.
|
||||
Basic blocks that are already in the code cache get directly executed without copying, however, a context switch is still needed to determine the next basic block to execute.
|
||||
Basic blocks that are already located in the code cache are directly executed without copying, however, a context switch is still needed to determine the next basic block to execute.
|
||||
|
||||
To reduce this overhead and avoid a context switch, DynamoRIO can \textit{link} two basic blocks together that were targeted by a direct branch, i.e. branches whose target address will not change during runtime.
|
||||
To achieve this, the target address has to be convertes in-place to point to the new address in the code cache and not the original one in mapped binary executable.
|
||||
For indirect branches, branches whose target address is calculated at runtime, it is not possible to link them as their target basic blocks may vary.
|
||||
However, basic blocks that are often executed in a sequence are be merged into a \textit{trace}.
|
||||
To reduce this overhead and avoid a context switch, DynamoRIO can \textit{link} two basic blocks together that were targeted by a direct branch, i.e., branches whose target address will not change during runtime.
|
||||
To achieve this, the target address has to be converted in-place to point to the new address in the code cache and not the original one in the mapped binary executable.
|
||||
For indirect branches, i.e., branches whose target address is calculated at runtime, it is not possible to link them as their target basic blocks may vary.
|
||||
However, basic blocks that are often executed in a sequence are merged into a \textit{trace}.
|
||||
At the end of each basic block, an additional check is performed to determine if the indirect branch target will stay in the same trace, possibly preventing the context switch.
|
||||
Those often executed parts of the application code are also referred to as \textit{hot code} and their optimization using traces is worthwhile, even if this results in multiple copies of the same basic block in the code cache.
|
||||
Those regularly executed parts of the application code are also referred to as \textit{hot code} and their optimization using traces improves the performance but introduces the minor disadvantage of multiple copies of the same basic block in the code cache.
|
||||
The generic term for a basic block or a trace is \textit{fragment}.
|
||||
|
||||
Figure \ref{fig:dynamorio} illustrates the internal architecture and functionality of DynamoRIO.
|
||||
The application code gets loaded by the dispatcher, modified by the basic block builder, copied into the code cache and finally be executed.
|
||||
The application code is loaded by the dispatcher, modified by the basic block builder, copied into the code cache and finally executed from there.
|
||||
|
||||
\input{img/thesis.tikzstyles}
|
||||
\begin{figure}
|
||||
@@ -77,22 +76,22 @@ The application code gets loaded by the dispatcher, modified by the basic block
|
||||
\end{center}
|
||||
\end{figure}
|
||||
|
||||
As mentioned in section \ref{sec:dbi}, it is important for a DBI tool to operate transparently.
|
||||
As mentioned in Section \ref{sec:dbi}, it is important for a DBI tool to operate transparently.
|
||||
DynamoRIO takes a number of measures to achieve this goal, some of which will now be explained \cite{Bruening2004}.
|
||||
As sharing libraries with the target application can cause transparency issues, especially when using non-reentrant routines or routines that alter static state such as error codes, DynamoRIO directly interfaces with the system using system calls and even avoids to use the C standard library (e.g. \textit{glibc} on Linux).
|
||||
The same should also apply for user-written instrumentation clients (introduced in more detail in section \ref{sec:dynamorio_client}) but the direct usage of system calls is discouraged as this bypasses the internal monitoring of DynamoRIO for changes that affect the processes address space.
|
||||
As sharing libraries with the target application can cause transparency issues, especially when using non-reentrant routines or routines that alter static state such as error codes, DynamoRIO directly interfaces with the system using system calls and even avoids using the C standard library (e.g., \textit{glibc} on Linux).
|
||||
The same should also apply for user-written instrumentation clients (introduced in more detail in Section \ref{sec:dynamorio_client}), but the direct usage of system calls is discouraged as this bypasses the internal monitoring of DynamoRIO for changes that affect the process's address space.
|
||||
Instead, DynamoRIO provides a cross-platform API for generic routines as file system operations and memory allocation.
|
||||
To guarantee thread-transparency, DynamoRIO does not spawn new threads by itself, but uses the application threads instead and creates one DynamoRIO context for each.
|
||||
When a instrumentation client needs to spawn threads, they should be hidden from introspection from the application.
|
||||
To guarantee thread transparency, DynamoRIO does not spawn new threads by itself, but uses the application threads instead and creates one DynamoRIO context for each.
|
||||
When an instrumentation client needs to spawn threads, they should be hidden from introspection of the application.
|
||||
Client code should also not alter the application stack in any way, as some specialized applications access data beyond the top of the stack.
|
||||
Alternatively, DynamoRIO provides a separate stack that should be used instead to store temporary data.
|
||||
To remain undetected, it is also required for DynamoRIO to protect its own memory from malicious reads or writes from the application.
|
||||
Those should, like in the native case, raise an exception as unallocated data is accessed.
|
||||
However, as these memory regions are fact allocated, DynamoRIO has to produce those execption itself to remain transparent.
|
||||
When the application branches to an dynamically calculated address, DynamoRIO has to translate this address to the corresponding address of the basic block in the code cache.
|
||||
But also in the backward case, whenever a code cache address is exposed to the application, it has to be converted back to the corresponding address into the mapped address region of the binary executable.
|
||||
However, as these memory regions are actually allocated, DynamoRIO has to produce those exceptions itself to remain transparent.
|
||||
When the application branches to a dynamically calculated address, DynamoRIO has to translate this address to the corresponding address of the basic block in the code cache.
|
||||
But also in the backward case, whenever a code cache address is exposed to the application, it has to be converted back to the corresponding address to the mapped address region of the binary executable.
|
||||
|
||||
As can be seen, DynamoRIO makes significant effort to ensure transparency.
|
||||
As can be seen, DynamoRIO makes a significant effort to ensure transparency.
|
||||
However, factors such as timing deviations cannot be taken into account, since the instrumentation code consists of additional instructions that must be executed.
|
||||
So a sophisticated application could try to detect the presence of an instrumentation tool by estimating and comparing the execution time of its own routines.
|
||||
|
||||
@@ -103,13 +102,12 @@ With the inner workings introduced so far, the presence of DynamoRIO does not ha
|
||||
DynamoRIO provides a programming interface to develop external so-called \textit{clients}\cite{Bruening2004}.
|
||||
Clients are user-written instrumentation tools and make it possible to dynamically modify the basic blocks, either to alter the application behavior or to insert observational instructions.
|
||||
A DynamoRIO client is compiled into a shared library and passed to the \textit{drrun} utility using a command line option.
|
||||
Clients implement a number of hook functions that will called by DynamoRIO for certain events such as the creation of a basic block or of a trace.
|
||||
Generally, there are two classes of hooks:
|
||||
Those that execute on basic block creation instrument all of the application code and those that execute on trace generation are only interested in frequently executed code.
|
||||
It is important to note that the hooks for basic block and trace generation are not called every time when this code sequence is executed but when these basic blocks are generated and placed into the code cache.
|
||||
Clients implement a number of hook functions that will be called by DynamoRIO for certain events such as the creation of a basic block or of a trace.
|
||||
Generally, there are two classes of hooks: those that execute on basic block creation instrument all of the application code and those that execute on trace generation are only interested in frequently executed code.
|
||||
It is important to note that the hooks for basic block and trace generation are not called every time when this code sequence is executed, but only when these basic blocks are generated and placed into the code cache.
|
||||
So the required instructions have to be inserted into the basic block instruction stream in this stage, rather than implementing the observational or manipulative behavior in the hook function itself.
|
||||
|
||||
The table \ref{tab:dynamorio_api} lists some of the most important hooks that a client can implement.
|
||||
Table \ref{tab:dynamorio_api} lists some of the most important hooks that a client can implement.
|
||||
|
||||
\begin{table}
|
||||
\caption{Client routines that get called by DynamoRIO \cite{Bruening2003}.}
|
||||
@@ -141,16 +139,16 @@ The table \ref{tab:dynamorio_api} lists some of the most important hooks that a
|
||||
\end{table}
|
||||
|
||||
Most of the hooks receive a \texttt{void *context} pointer to the thread-local machine context through its parameter list, which then needs to be passed to the code manipulation routines.
|
||||
Those routines are available through DynamoRIOs rich code manipulation API that enables the generation, the encoding and the decoding of instructions.
|
||||
Since the processors flag and general purpose registers might be altered by executing those new instructions, it is necessary to store them before and restoring them after executing them to guarantee transparency.
|
||||
DynamoRIO also provides client routines to store those flags and registers in a thread-local slots.
|
||||
An alternative to manually storing and restoring are, as previously mentioned in section \ref{sec:dbi}, so-called \textit{clean calls} where DynamoRIO takes the responsibility for storing and restoring the processors state.
|
||||
Those routines are available through DynamoRIO's rich code manipulation API that enables the generation, the encoding and the decoding of instructions.
|
||||
Since the processor's flag and general purpose registers might be altered by executing those new instructions, it is necessary to store them before and restore them after execution to guarantee transparency.
|
||||
DynamoRIO also provides client routines to store those flags and registers in thread-local slots.
|
||||
An alternative to manually storing and restoring are, as previously mentioned in Section \ref{sec:dbi}, so-called \textit{clean calls} where DynamoRIO takes the responsibility for storing and restoring the processor's state.
|
||||
The clean call then dispatches to a user-defined function that will be run every time the basic block executes by modifying the program counter.
|
||||
This comes at the great advantage of not having to implement the observational or manipulative behavior using assembly instructions, instead the compiler of the client takes care of converting the clean call function into machine code.
|
||||
However, since DynamoRIO can not know which registers do need to be stored as this depends on the user code, it has to preserve the whole processors state.
|
||||
This comes at the great advantage of not having to implement the observational or manipulative behavior using assembly instructions; instead the compiler of the client takes care of converting the clean call function into machine code.
|
||||
However, since DynamoRIO cannot know which registers have to be stored as this depends on the user code, it has to preserve the whole processor's state.
|
||||
The dispatching to the clean call function is essentially a context switch and therefore has a great impact on the performance.
|
||||
So it is up to the user to decide whether the gain in performance by avoiding clean calls outweighs the higher development effort.
|
||||
|
||||
An exemplary client that already comes with DynamoRIO is \textit{DrCacheSim}.
|
||||
Together with the \textit{DrMemtrace-Framework}, this client provides an easy way to trace the executed instructions of the application and the memory accesses it makes.
|
||||
This framework will be further explained in section \ref{sec:analysis_tool}.
|
||||
This framework will be further explained in Section \ref{sec:analysis_tool}.
|
||||
|
||||
@@ -23,10 +23,10 @@ There are two ways to implement a process in a module:
|
||||
% \end{itemize}
|
||||
Moreover, there is \texttt{sc\_event\_queue} which makes it possible to queue multiple pending events, whereas an \texttt{sc\_event} ignores further notifications until it is waited on.
|
||||
|
||||
Those concepts being introduced will become important in section \ref{sec:implementation} where the implementation of several SystemC modules will be discussed.
|
||||
Those concepts being introduced will become important in Section \ref{sec:implementation} where the implementation of several SystemC modules will be discussed.
|
||||
|
||||
SystemC supports numerous abstraction levels for modeling systems, namely \textit{cycle-accurate}, which is the most accurate abstraction but also the slowest, \textit{approximately-timed} and \textit{loosely-timed}.
|
||||
The latter two abstraction levels belong to \revabbr{transaction level modeling}{TLM}, which will be discussed in the next section \ref{sec:tlm}.
|
||||
The latter two abstraction levels belong to \revabbr{transaction level modeling}{TLM}, which will be discussed in the next Section \ref{sec:tlm}.
|
||||
One further abstraction level, \textit{untimed}, will not be topic of this thesis.
|
||||
|
||||
\subsection{Transaction Level Modeling}
|
||||
@@ -38,7 +38,7 @@ In contrast to pin and cycle accurate models, this greatly reduces the simulatio
|
||||
Modules communicate with each other through \textit{initiator} sockets and \textit{target} sockets.
|
||||
A processor, for example, sends requests to a memory using its initiator socket, whereas the memory responds through its target socket.
|
||||
Interconnect modules, which can be used to model a bus, use both sockets to communicate with both initiator and the target modules.
|
||||
This concept is illustrated in figure \ref{fig:tlm}.
|
||||
This concept is illustrated in Figure \ref{fig:tlm}.
|
||||
|
||||
The transaction object itself is a \revabbr{generic payload}{GP}, which consists of the target address, whether the transaction is a read or write command, status information and other transaction parameters as well as the actual data to transfer.
|
||||
GPs are passed along as references, avoiding the need to copy them between the modules.
|
||||
@@ -81,7 +81,7 @@ Figure \ref{fig:tlm_at} shows an exemplary handshake sequence diagram of three d
|
||||
\begin{figure}[!ht]
|
||||
\begin{center}
|
||||
\tikzfig{img/tlm_at}
|
||||
\caption{Sequence diagram of an exemplary transaction.}
|
||||
\caption{Sequence diagram of exemplary transactions.}
|
||||
\label{fig:tlm_at}
|
||||
\end{center}
|
||||
\end{figure}
|
||||
@@ -107,5 +107,5 @@ SystemC also supports additional user-defined phases through its \texttt{DECLARE
|
||||
In contrast to the TLM-LT protocol, TLM-AT allows modeling the pipelining of transactions; multiple transactions can be processed simultaneously by one target.
|
||||
The responses also do not need to be in the same order as the initiator has sent them: they can be \textit{out of order}.
|
||||
|
||||
The TLM-AT coding style is the used protocol to implement the processor model and the cache model in section \ref{sec:implementation} of this thesis.
|
||||
The TLM-AT coding style is the used protocol to implement the processor model and the cache model in Section \ref{sec:implementation} of this thesis.
|
||||
Also, some of the earlier described shortcuts are taken advantage of throughout those models.
|
||||
|
||||
107
inc/4.caches.tex
107
inc/4.caches.tex
@@ -1,42 +1,41 @@
|
||||
\section{Caches}
|
||||
\label{sec:caches}
|
||||
|
||||
In this section, the necessity and functionality of caches in modern computing systems will be explained as well as the required considerations resulting from virtual memory addressing.
|
||||
A special focus will also be placed on non-blocking caches.
|
||||
The theory will be based on the chapters \textit{An Overview of Cache Principles} and \textit{Logical Organization} of \cite{Jacob2008} and on \cite{Jahre2007}.
|
||||
In this section, the necessity and functionality of caches in modern computing systems is explained as well as the required considerations resulting from virtual memory addressing.
|
||||
A special focus is also placed on non-blocking caches.
|
||||
The theory is based on the chapters \textit{``An Overview of Cache Principles''} and \textit{``Logical Organization''} of \cite{Jacob2008} and on \cite{Jahre2007}.
|
||||
|
||||
With the advancement of faster multi-core processors, the performance difference to the main \revabbr{dynamic random-access memory}{DRAM} is increasing, commonly referred to as the \textit{memory wall}.
|
||||
Therefore caches, whose goal is to decrease the latency and increase the bandwidth of an memory access, play an important role when it comes to the overall performance of computing systems.
|
||||
With the advancement of faster multi-core processors, the performance difference to the main memory is increasing, commonly referred to as the \textit{memory wall}.
|
||||
Therefore, caches, whose goal is to decrease the latency and increase the bandwidth of a memory access, play an important role when it comes to the overall performance of computing systems.
|
||||
|
||||
Caches are faster than DRAM, but only provide a small capacity, as the per-bit cost is larger.
|
||||
Caches are faster than DRAM, but only provide a small capacity, as the area cost is a lot higher.
|
||||
For this reason, at least the \textit{working set}, the data that the currently running application is working on, should be stored in the cache to improve performance.
|
||||
|
||||
The two most important heuristics that make this possible will be explained in section \ref{sec:caches_locality_principles}.
|
||||
After that the typical structure of a cache will be discussed in \ref{sec:caches_logical_organization}.
|
||||
Replacement policies will be explained in \ref{sec:replacement_policies} and write policies in \ref{sec:write_policies}, followed by the considerations to make when it comes to virtual addressing in section \ref{sec:caches_virtual_addressing}.
|
||||
The two most important heuristics that make this possible will be explained in Section \ref{sec:caches_locality_principles}.
|
||||
After that, the typical structure of a cache will be discussed in \ref{sec:caches_logical_organization}.
|
||||
Replacement policies will be explained in \ref{sec:replacement_policies} and write policies in \ref{sec:write_policies}, followed by the considerations to make when it comes to virtual addressing in Section \ref{sec:caches_virtual_addressing}.
|
||||
Section \ref{sec:caches_coherency} gives a short introduction on cache coherency and snooping.
|
||||
Finally, the advantage of non-blocking caches is the topic of section \ref{sec:caches_non_blocking_caches}.
|
||||
Finally, the advantage of non-blocking caches is the topic of Section \ref{sec:caches_non_blocking_caches}.
|
||||
|
||||
\subsection{Locality Principles}
|
||||
\label{sec:caches_locality_principles}
|
||||
|
||||
Access patterns of a typical application are not random.
|
||||
They tend to repeat themselves in time or are located in the near surrounding of previous accesses.
|
||||
They tend to repeat in time or are located in the near surrounding of previous accesses.
|
||||
Those two heuristics are called \textit{temporal locality} and \textit{spatial locality}.
|
||||
|
||||
\subsubsection{Temporal Locality}
|
||||
|
||||
Temporal locality is the concept of referenced data being likely to be referenced again in the near future.
|
||||
Taking advantage of this is the main idea behind a cache:
|
||||
When new data is referenced, it will be read from the main memory and buffered in the cache.
|
||||
The processor can now perform operations on this data and use its end result further without needing to access the main memory.
|
||||
Taking advantage of this is the main idea behind a cache: when new data is referenced, it will be read from the main memory and buffered in the cache.
|
||||
The processor can now perform operations on this data and use its end result without needing to access the main memory.
|
||||
|
||||
\subsubsection{Spatial Locality}
|
||||
|
||||
Programs have a tendency to reference data that is nearby in the memory space of already referenced data.
|
||||
This tendency, spatial locality, arises because related data is often clustered together, for example in arrays or structures.
|
||||
When calculations are performed on those arrays, sequential access patterns can be observed as one element is processed after the other.
|
||||
Spatial locality can be exploited by organizing blocks of data in so called \textit{cache blocks} or \textit{cache lines} which are larger than a single data word.
|
||||
Spatial locality can be exploited by organizing blocks of data in so called \textit{cache blocks} or \textit{cache lines}, which are larger than a single data word.
|
||||
This is a passive form of making use of spatial locality, as referenced data will also cause nearby words to be loaded into the same cache line, making them available for further accesses.
|
||||
|
||||
An active form of exploiting spatial locality is the use of \textit{prefetching}.
|
||||
@@ -45,9 +44,9 @@ Here, the program causes the cache to fetch more than one cache line from the un
|
||||
\subsection{Logical Organization}
|
||||
\label{sec:caches_logical_organization}
|
||||
|
||||
This section concerns the question where to store the fetched data in the cache.
|
||||
This section revolves around the question of where to store the retrieved data in the cache.
|
||||
Because the cache is much smaller than the DRAM, only a subset of the memory can be held in the cache at a time.
|
||||
Into which cache line a block of memory placed is determined by the \textit{placement policy}.
|
||||
Into which cache line a block of memory is placed is determined by the \textit{placement policy}.
|
||||
There are three main policies:
|
||||
|
||||
\begin{itemize}
|
||||
@@ -55,19 +54,28 @@ There are three main policies:
|
||||
In \textit{direct-mapped caches} the cache is divided into multiple sets with a single cache line in each set.
|
||||
For every address there is only one cache line where the data can be placed in.
|
||||
\item
|
||||
A \textit{fully associative cache} there is only one large set, containing all available cache lines.
|
||||
In a \textit{fully associative cache} there is only one large set, containing all available cache lines.
|
||||
Referenced data has no restriction on which cache line it can be placed in.
|
||||
\item
|
||||
\textit{Set-associative caches} are a hybrid form of the former two: There are multiple sets containing several cache lines each.
|
||||
\textit{Set-associative caches} are a hybrid form of the former two: there are multiple sets containing several cache lines each.
|
||||
The address determines the corresponding set, in that the data can be placed in any of the cache lines.
|
||||
\end{itemize}
|
||||
|
||||
\input{img/thesis.tikzstyles}
|
||||
\begin{figure}
|
||||
\begin{center}
|
||||
\tikzfig{img/associativity}
|
||||
\caption{Four organizations for a cache of eight blocks \cite{Jacob2008}.}
|
||||
\label{fig:associativity}
|
||||
\end{center}
|
||||
\end{figure}
|
||||
|
||||
Figure \ref{fig:associativity} illustrates four different organizations for a cache of eight cache lines.
|
||||
In all three cases, the least significant portion of the physical address of the referenced data, the \textit{index}, determines the set in which the data is to be stored.
|
||||
However, several entries in the DRAM map to the same set, so the remaining most significant portion of the address is used as a \textit{tag} and is stored next to the actual data in the cache line.
|
||||
After an entry is fetched from the cache, the tag is used to determine if the entry actually corresponds to the referenced data.
|
||||
An example subdivision of the address in the index, tag and byte offset is shown in figure \ref{fig:address_mapping}.
|
||||
An example subdivision of the address in the index, tag and byte offset is shown in Figure \ref{fig:address_mapping}.
|
||||
|
||||
\input{img/thesis.tikzstyles}
|
||||
\begin{figure}[!ht]
|
||||
\begin{center}
|
||||
\tikzfig{img/address}
|
||||
@@ -77,11 +85,11 @@ An example subdivision of the address in the index, tag and byte offset is shown
|
||||
\end{figure}
|
||||
|
||||
Direct-mapped caches have the advantage that only one tag has to be compared with the address.
|
||||
However, every time new data is referenced that gets placed into the same set, the cache line will be evicted.
|
||||
This leads to an overall lower cache hit rate as the other two policies.
|
||||
However, every time new data is referenced that is placed into the same set, the cache line needs to be evicted.
|
||||
This leads to an overall lower cache hit rate compared to the other two policies.
|
||||
|
||||
In a fully associative cache, a memory reference can be placed anywhere, consequently all cache lines have to be fetched and compared to the tag.
|
||||
Although this policy has the highest potential cache hit rate, the high space consumption due to comparators and high power consumption due to the lookup process, makes it non-feasible for many systems.
|
||||
Although this policy has the highest potential cache hit rate, the area cost due to additional comparators and high power consumption due to the lookup process, makes it non-feasible for many systems.
|
||||
|
||||
The hybrid approach of set-associative caches offers a trade-off between both policies.
|
||||
The term \textit{associativity} denotes the number of cache lines that are contained in a set.
|
||||
@@ -90,14 +98,14 @@ The term \textit{associativity} denotes the number of cache lines that are conta
|
||||
\label{sec:replacement_policies}
|
||||
|
||||
In case of contention, cache lines have to be evicted.
|
||||
To determine which one of the corresponding set, there are several replacement policies:
|
||||
To determine which cache line in the corresponding set is evicted, there are several replacement policies:
|
||||
|
||||
\begin{itemize}
|
||||
\item
|
||||
The random policy selects a cache line of a set at random.
|
||||
\item
|
||||
The \revabbr{least recently used}{LRU} policy selects the cache line whose last usage is the longest time ago.
|
||||
A LRU algorithm is expensive to implement, a counter value for every cache line of a set has to be updated every time the set is accessed.
|
||||
An LRU algorithm is expensive to implement, as a counter value for every cache line of a set has to be updated every time the set is accessed.
|
||||
\item
|
||||
An alternative is a \revabbr{pseudo LRU}{PLRU} policy, where an extra bit is set to 1 every time a cache line is accessed.
|
||||
When the extra bit of every cache line in a set is set to 1, they will get reset to 0.
|
||||
@@ -113,15 +121,14 @@ To determine which one of the corresponding set, there are several replacement p
|
||||
\label{sec:write_policies}
|
||||
|
||||
To maintain consistency to the underlying memory subsystem, special care has to be taken when a write access occurs.
|
||||
In case of a \textit{write-through} cache, the underlying memory is updated immediately, meaning the updated value will also directly be written in the DRAM.
|
||||
In case of a \textit{write-through} cache, the underlying memory is updated immediately, meaning the updated value will also directly be written into the DRAM.
|
||||
Because the DRAM provides a significantly lower bandwidth than the cache, this comes at a performance penalty.
|
||||
To mitigate the problem, a write buffer can be used, which allows the processor to make further progress as the data is written.
|
||||
To mitigate the problem, a write buffer can be used, which allows the processor to make further progress while the data is written.
|
||||
|
||||
An alternative is a so called \textit{write-back} cache.
|
||||
Instead of writing the updated value immediately to the underlying memory, it will be written back when the corresponding cache line is evicted.
|
||||
To identify if a cache line has to be written back, a so-called \textit{dirty-bit} is used:
|
||||
It denotes if the value has been updated while it has been in the cache.
|
||||
If it is the case, it has to be written back to ensure consistency, otherwise it is not needed.
|
||||
To identify if a cache line has to be written back, a so-called \textit{dirty-bit} is used; it denotes if the value has been updated while it has been in the cache.
|
||||
If this is the case, it must be written back to ensure consistency, otherwise it is not necessary.
|
||||
Also here, a write buffer can be used to place the actual write back requests into a queue.
|
||||
|
||||
\subsection{Virtual Addressing}
|
||||
@@ -143,16 +150,16 @@ Figure \ref{fig:virtual_address} shows an exemplary division of a virtual addres
|
||||
|
||||
Before a process can access a specific region in memory, the kernel has to translate the virtual page number into a physical page number.
|
||||
For conversions, so called \textit{page tables} are used to look up the physical page number.
|
||||
Page tables are usually multiple levels deep (e.g. 4-levels on x86), so a single conversion can cause up to 4 memory accesses, which is expensive.
|
||||
To improve performance, a \revabbr{translation lookaside buffer}{TLB} is used that acts like a cache on its own for physical page numbers.
|
||||
Page tables are usually multiple levels deep (e.g. 4-levels on x86), so a single conversion can cause a number of memory accesses, which is expensive.
|
||||
To improve performance, a \revabbr{translation lookaside buffer}{TLB} is used, which acts like a cache on its own for physical page numbers.
|
||||
|
||||
However, as long as the physical address is not present, the data cache cannot look up its entries as the index is not known yet.
|
||||
So the cache has to wait on the TLB, or worse on multiple memory accesses.
|
||||
To circumvent this problem, the cache can be indexed by the virtual address what makes it possible to parallelize both procedures.
|
||||
Such a cache is called \textit{virtually indexed} and \textit{physically tagged} and is illustrated in figure \ref{fig:virtual_address_conversion}.
|
||||
So the cache has to wait for the TLB or even multiple memory accesses if the physical page number is not stored in it.
|
||||
To circumvent this problem, the cache can be indexed by the virtual address, which makes it possible to parallelize both procedures.
|
||||
Such a cache is called \textit{virtually indexed} and \textit{physically tagged} and is illustrated in Figure \ref{fig:virtual_address_conversion}.
|
||||
|
||||
% Ist die Darstellung aus dem Buch richtig? Sollte der Cache Index wirklich über den Page Offset hinaus gehen?
|
||||
\begin{figure}[!ht]
|
||||
\begin{figure}
|
||||
\begin{center}
|
||||
\tikzfig{img/virtual_address_conversion}
|
||||
\caption[Virtually indexed, physically tagged cache\cite{Jacob2008}.]{Virtually indexed, physically tagged cache\cite{Jacob2008}. ASID refers to address-space identifier.}
|
||||
@@ -160,42 +167,40 @@ Such a cache is called \textit{virtually indexed} and \textit{physically tagged}
|
||||
\end{center}
|
||||
\end{figure}
|
||||
|
||||
The result from the TLB, the physical page number, needs to be compared to tag that is stored in the cache.
|
||||
The result from the TLB, which is the physical page number, needs to be compared to the tag that is stored in the cache.
|
||||
When the tag and the physical page number match, then the cache entry is valid for this virtual address.
|
||||
Note that when the cache index is completely contained in the page offset, another problem called \textit{aliasing} can be resolved, which will not further be discussed in this thesis.
|
||||
|
||||
\subsection{Cache Coherency}
|
||||
\label{sec:caches_coherency}
|
||||
|
||||
In multi-core environments, caches become a distributed system.
|
||||
As every core uses its own set of caches and possibly shares a cache at the last stage with the other processors, a new problem arises.
|
||||
Should two or more cores operate on the same shared data, multiple copies of the data will be placed in the private caches and it must be guaranteed that all cores agree on the actual value the data has at any point in time.
|
||||
Divergent perceptions of the same data are to be regarded as errors.
|
||||
If two or more cores operate on the same shared data, multiple copies of the data will be placed in the private caches and it must be guaranteed that all cores agree on the actual value the data has at any point in time.
|
||||
Different perceptions of the same data should be considered as errors.
|
||||
|
||||
Therefore, it is important to guarantee \textit{cache coherency}.
|
||||
One of the solutions for cache coherency is the use of a so-called snooping protocol.
|
||||
A cache will snoop the cache coherence bus to examine if it already has a copy of requested data.
|
||||
Snooping packets then are used to update or invalidate other copies of the data.
|
||||
Snooping protocols can be very complex and hard to formally verify that they in fact guarantee cache coherence.
|
||||
For this reason, they will not further discussed in this thesis.
|
||||
Snooping packets are then used to update or invalidate other copies of the data.
|
||||
Snooping protocols are complex, and it is difficult to formally verify that they in fact guarantee cache coherence.
|
||||
For this reason, they are not further discussed in this thesis.
|
||||
|
||||
\subsection{Non-blocking Caches}
|
||||
\subsection{Non-Blocking Caches}
|
||||
\label{sec:caches_non_blocking_caches}
|
||||
|
||||
In blocking caches, cache misses require the processor to stall until the data is fetched from the underlying memory.
|
||||
As this is a major slowdown, non-blocking caches try to solve this problem, making it possible for the processor to make further progress while waiting on the value.
|
||||
|
||||
Similarly to the write buffer, previously discussed in \ref{sec:write_policies}, a new buffer will be introduced: the \revabbr{miss status hold register}{MSHR}.
|
||||
Similarly to the write buffer, previously discussed in Section \ref{sec:write_policies}, a new buffer will be introduced: the \revabbr{miss status hold register}{MSHR}.
|
||||
The number of MSHRs correspond to the number of misses the cache can handle concurrently; when all available MSHRs are occupied and a further miss occurs, the cache will block.
|
||||
An MSHR entry always corresponds to one cache line that is currently being fetched from the underlying memory subsystem.
|
||||
|
||||
There are two variants of cache misses:
|
||||
\textit{Primary misses} are misses that lead to another occupation of an MSHR, where as \textit{secondary misses} are added to an existing MSHR entry and therefore cannot cause the cache to block.
|
||||
There are two variants of cache misses: \textit{primary misses} are misses that lead to another occupation of an MSHR, whereas \textit{secondary misses} are added to an existing MSHR entry and therefore cannot cause the cache to block.
|
||||
This is the case when the same cache line is accessed.
|
||||
|
||||
An architecture of an MSHR file is illustrated in figure \ref{fig:mshr_file}.
|
||||
An architecture of an MSHR file is illustrated in Figure \ref{fig:mshr_file}.
|
||||
|
||||
\begin{figure}[!ht]
|
||||
\begin{figure}
|
||||
\begin{center}
|
||||
\tikzfig{img/mshr_file}
|
||||
\caption[Miss Holding Status Register File\cite{Jahre2007}.]{Miss Holding Status Register File\cite{Jahre2007}. V refers to a valid bit.}
|
||||
@@ -203,4 +208,4 @@ An architecture of an MSHR file is illustrated in figure \ref{fig:mshr_file}.
|
||||
\end{center}
|
||||
\end{figure}
|
||||
|
||||
When the data for a cache miss is returned from the underlying memory, the cache will be updated, all targets of the MSHR entry will be served with the value and the MSHR entry will eventually get deallocated.
|
||||
When the data for a cache miss is returned from the underlying memory, the cache will be updated, all targets of the MSHR entry will be served with the value and the MSHR entry will eventually become deallocated.
|
||||
|
||||
@@ -56,7 +56,7 @@ A reordering might be necessary to be able to support initiators that can not ha
|
||||
% Evtl TA falls Bilder genutzt werden?
|
||||
DRAMSys also provides the so-called \textit{Trace Analyzer}, a graphical tool that visualizes database files created by DRAMSys.
|
||||
It shows the \texttt{REQ} and \texttt{RESP} phases between the initiator and the arbiter, the occupation of the command bus and data bus as well as representations of the different phases in the DRAM banks.
|
||||
An example trace database, visualized in the Trace Analyzer is shown in figure \ref{fig:traceanalyzer}.
|
||||
An example trace database, visualized in the Trace Analyzer is shown in Figure \ref{fig:traceanalyzer}.
|
||||
Furthermore, the Trace Analyzer is capable of calculating numerous metrics and creating plots of interesting characteristics.
|
||||
|
||||
\begin{figure}%[!ht]
|
||||
@@ -67,4 +67,4 @@ Furthermore, the Trace Analyzer is capable of calculating numerous metrics and c
|
||||
\end{center}
|
||||
\end{figure}
|
||||
|
||||
In section \ref{sec:implementation} of this thesis the new special traffic generator for DRAMSys will be developed.
|
||||
In Section \ref{sec:implementation} of this thesis the new special traffic generator for DRAMSys will be developed.
|
||||
|
||||
@@ -1,28 +1,28 @@
|
||||
\section{Implementation}
|
||||
\label{sec:implementation}
|
||||
|
||||
In this section, the the components developed in this thesis for the new simulator frontend, that enable the tracing of an arbitrary application in real-time, as well as the replay of the recorded traces in DRAMSys, will be introduced.
|
||||
In this section, the developed components for the new simulator frontend, which enable the tracing of an arbitrary application in real-time, as well as the replay of the recorded traces in DRAMSys, will be introduced.
|
||||
|
||||
To briefly summarize which components are necessary to implement the new simulation frontend, they are listed below:
|
||||
|
||||
\begin{itemize}
|
||||
\item A DynamoRIO client that traces memory accesses from an running application.
|
||||
\item A DynamoRIO client that traces memory accesses from a running application.
|
||||
\item A simplified core model that replays those traces by sending transactions to DRAMSys.
|
||||
\item A cache model that simulates the cache-filtering of memory requests of the processor.
|
||||
\item A cache model that simulates the cache filtering of memory requests of the processor.
|
||||
\end{itemize}
|
||||
|
||||
The following sections will first explain the DynamoRIO analysis tool that generates the memory access traces and its place in the DrMemtrace framework.
|
||||
Furthermore, the new trace player for DRAMSys will receive special attention as well as the mandatory cache model that is used to model the cache-filtering in a real system.
|
||||
The last part will concentrate on the special architecture of the new trace player interface and challenges the internal interconnection solves.
|
||||
The last part will concentrate on the special architecture of the new trace player interface and the challenges that the internal interconnection solves.
|
||||
|
||||
\subsection{Analysis Tool}
|
||||
\label{sec:analysis_tool}
|
||||
|
||||
As described in section \ref{sec:dynamorio} the dynamic binary instrumentation tool DynamoRIO will be used to trace the memory accesses while the target application is running.
|
||||
Instead of writing a DynamoRIO client from the ground up, the DrMemtrace framework, that comes bundled with DynamoRIO, is used.
|
||||
As described in Section \ref{sec:dynamorio} the dynamic binary instrumentation tool DynamoRIO will be used to trace the memory accesses while the target application is running.
|
||||
Instead of writing a DynamoRIO client from the ground up, the DrMemtrace framework, which comes bundled with DynamoRIO, is used.
|
||||
|
||||
DrCacheSim is a DynamoRIO client that build on the DrMemtrace framework and gathers memory and instruction access traces from the target application and forwards them to one or multiple analyzer tools.
|
||||
In addition, so-called marker records are sent to an analyzer on certain events, with which meta information such as the cpu core used, kernel events or a timestamp are transmitted.
|
||||
DrCacheSim is a DynamoRIO client that builds on top of the DrMemtrace framework, which gathers memory and instruction access traces from the target application and forwards them to one or multiple analysis tools.
|
||||
In addition, so-called marker records are sent to the analysis tools when certain events occur, which are used to transmit meta information such as the CPU core used, kernel events or timestamps.
|
||||
These markers are also essential for a processor simulation, for example to reconstruct the thread interleaving, as it is intended for the new simulator frontend.
|
||||
DrCacheSim is a purely observational client and does not alter the behavior of the application.
|
||||
|
||||
@@ -33,16 +33,16 @@ These physical addresses should be traced instead of the virtual addresses to a
|
||||
It should be noted that in most systems the physical addresses do not directly represent the addresses that the memory subsystem perceives.
|
||||
The physical memory is mapped at a specific address region in the physical address space, so an address offset also has to be considered.
|
||||
On Linux systems, this mapping can be obtained by investigating the contents of the virtual file \texttt{/proc/iomem}, which is provided by the kernel.
|
||||
The trace player then substracts this offset as it will be explained in more detail in section \ref{sec:dbiplayer_functionality}.
|
||||
The physical address conversion only works on Linux and requires in modern kernel versions root privileges (or alternatively the CAP\_SYS\_ADMIN capability).
|
||||
The trace player then subtracts this offset as it will be explained in more detail in Section \ref{sec:dbiplayer_functionality}.
|
||||
The physical address conversion only works on Linux and, in modern kernel versions, requires root privileges (or alternatively the CAP\_SYS\_ADMIN capability).
|
||||
|
||||
There are two different operation modes for an analyzer tool that DrCacheSim provides:
|
||||
The analyzer tool can either be running alongside with DrCacheSim (online) or run after the target application has exited and operate on an internal trace format (offline).
|
||||
The analyzer tool can either run alongside with DrCacheSim (online) or run after the target application has exited and operate on an internal trace format (offline).
|
||||
Offline tracing has the additional advantage of being able to disassemble the executed instructions afterwards.
|
||||
For this, the mapping of the executable binaries and shared libraries is stored alongside with the trace, enabling the decoding of the instructions from the traced program counter values.
|
||||
The instruction decoding is currently not natively supported for the online execution model, but this feature received limited attention in the development of the new frontend.
|
||||
The instruction decoding is currently not natively supported by the online execution model, but this feature received limited attention in the development of the new frontend.
|
||||
As of writing this thesis, the offline tracing mode has only recently gained support for the physical address conversion.
|
||||
Nnevertheless, the online execution model will be used throughout this thesis as the physical address support is still limited for offline tracing.
|
||||
Nevertheless, the online execution model will be used throughout this thesis as the physical address support is still limited for offline tracing.
|
||||
|
||||
\input{img/thesis.tikzstyles}
|
||||
\begin{figure}
|
||||
@@ -53,7 +53,7 @@ Nnevertheless, the online execution model will be used throughout this thesis as
|
||||
\end{center}
|
||||
\end{figure}
|
||||
|
||||
In case of the online tracing, DrCacheSim consists of two separate processes:
|
||||
In the case of online tracing, DrCacheSim consists of two separate processes:
|
||||
\begin{itemize}
|
||||
\item
|
||||
A client-side process (the DynamoRIO client) which injects observational instructions into the application's code cache.
|
||||
@@ -65,12 +65,12 @@ In case of the online tracing, DrCacheSim consists of two separate processes:
|
||||
\end{itemize}
|
||||
|
||||
The \revabbr{inter-process communication}{IPC} between the two processes is achieved through a \textit{named\ pipe}.
|
||||
Figure \ref{fig:drcachesim} illustrates the structure of online tracing mechanism.
|
||||
Figure \ref{fig:drcachesim} illustrates the structure of the online tracing mechanism.
|
||||
|
||||
A \texttt{memref\_t} can either represent an instruction, a data reference or a metadata event such as a timestamp or a CPU identifier.
|
||||
Besides of the type, the \revabbr{process identifier}{PID} and \revabbr{thread identifier}{TID} of the initiating process and thread is included in every record.
|
||||
Besides the type, the \revabbr{process identifier}{PID} and \revabbr{thread identifier}{TID} of the initiating process and thread is included in every record.
|
||||
For an instruction marker, the size of the instruction as well as the address of the instruction in the virtual address space of the application is provided.
|
||||
For data references, the address and size of the desired access is provided as well the \revabbr{program counter}{PC} from where it was initiated from.
|
||||
For data references, the address and size of the desired access is provided as well as the \revabbr{program counter}{PC} from where it was initiated.
|
||||
In offline mode, DrCacheSim stores the current mapping of all binary executables and shared libraries in a separate file, so that it is possible to decode and disassemble the traced instructions even after the application has exited.
|
||||
As mentioned earlier, instruction decoding is not natively supported for online tracing, but to work around the problem, the analyzer can examine the memory map of the client-side process and read the encoded instructions from there.
|
||||
|
||||
@@ -79,20 +79,21 @@ This region of interest can be specified by the number of instructions after whi
|
||||
|
||||
All analysis tools implement the common \texttt{analysis\_tool\_t} interface as this enables the analyzer to forward a received record to multiple tools in a polymorphic manner.
|
||||
In particular, the \texttt{process\_memref\_t()} method of any tool is called for every incoming record.
|
||||
Virtual functions, such as \texttt{initialize()} and \texttt{print\_results()}, which are called by the analyzer in appropriate places, should also be implemented.
|
||||
|
||||
It is possible for an analysis tool to implement parallel processing of the received \texttt{memref\_t} types by splitting up the trace into \textit{shards}.
|
||||
However, in this thesis the sequential processing of a single sorted and interleaved trace was used because of missing support for parallel processing for the online execution model.
|
||||
|
||||
The newly developed DRAMTracer tool creates a separate trace file for every application thread.
|
||||
Since it is not known a priori how many threads an application will spawn, the tool will listen for records with new TIDs that it has not yet registered.
|
||||
For every data reference, a new entry in the corresponding trace file is made which contains the size and the physical address of the access, whether it was a read or write, and also a count of (computational) instructions that have been executed since the last data reference.
|
||||
For every data reference, a new entry in the corresponding trace file is created which contains the size and the physical address of the access, whether it was a read or write, and also a count of (computational) instructions that have been executed since the last data reference.
|
||||
To compute the instruction count, a counter is incremented for every registered instruction record and reset again for any data reference.
|
||||
This instruction count is used, together with the clock period, to approximate the delay between two memory accesses when the trace is replayed by DRAMSys.
|
||||
This instruction count is used together with the clock period to approximate the delay between two memory accesses when the trace is replayed by DRAMSys.
|
||||
Lastly, the analysis tool inserts a timestamp into the trace for every received timestamp marker.
|
||||
The use of this timestamp will be further explained in section \ref{sec:dbiplayer_functionality}.
|
||||
The use of this timestamp will be further explained in Section \ref{sec:dbiplayer_functionality}.
|
||||
Listing \ref{list:memtrace} presents an exemplary memory trace.
|
||||
Lines consisting of a number between two angle brackets represent a timestamp whereas lines for memory references consist of the instruction count, a character denoting a read or write, the size and the address of the access.
|
||||
Also, comments which are ignored by the trace player are possible by starting the line with a number sign.
|
||||
Lines consisting of a number between two angle brackets represent a timestamp whereas lines for memory references consist of the instruction count, a character denoting a read or write, the size and the physical address of the access.
|
||||
Also, comments, which are ignored by the trace player, can be added by starting the line with a number sign.
|
||||
|
||||
\begin{listing}
|
||||
\begin{textcode}
|
||||
@@ -122,13 +123,13 @@ This section covers the general architecture of the \textit{DbiPlayer}, the new
|
||||
|
||||
For every recorded thread, a traffic initiator thread, a so-called \textit{DbiThreadPlayer}, is spawned, which is a standalone initiator for memory transactions.
|
||||
Because those threads need to be synchronized to approximate real thread interleaving, they need to communicate among each other.
|
||||
The detailed mechanism behind this synchronization will be further explained in section \ref{sec:dbiplayer_functionality}.
|
||||
The detailed mechanism behind this synchronization will be further explained in Section \ref{sec:dbiplayer_functionality}.
|
||||
This communication, however, brings up the necessity to containerize the thread players into a single module that can directly be connected to DRAMSys.
|
||||
With the old DRAMSys interface for trace players this was not easily realizable, so a new generic initiator interface was developed which allows components to be connected to DRAMSys whose internal architecture can be arbitrary.
|
||||
This new interface will be further discussed in section \ref{sec:traceplayer_interface}.
|
||||
With the old DRAMSys interface for trace players this was not easily realizable, so a new generic initiator interface was developed that allows components to be connected to DRAMSys whose internal architecture can be arbitrary.
|
||||
This new interface will be further discussed in Section \ref{sec:traceplayer_interface}.
|
||||
|
||||
For the \textit{DbiPlayer}, an additional interconnect module will bundle up all \\ \texttt{simple\_initiator\_sockets} in a single \texttt{multi\_passthrough\_initiator\_socket}.
|
||||
So the \textit{DbiPlayer} is a hierarchical module that consists of a more complex architecture with multiple traffic initiators, illustrated in figure \ref{fig:dbiplayer_without_caches}.
|
||||
For the \textit{DbiPlayer}, an additional interconnect module will bundle up all \\ \texttt{simple\_initiator\_sockets} into a single \texttt{multi\_passthrough\_initiator\_socket}.
|
||||
So the \textit{DbiPlayer} is a hierarchical module that consists of a more complex architecture with multiple traffic initiators, illustrated in Figure \ref{fig:dbiplayer_without_caches}.
|
||||
|
||||
\begin{figure}
|
||||
\begin{center}
|
||||
@@ -139,10 +140,10 @@ So the \textit{DbiPlayer} is a hierarchical module that consists of a more compl
|
||||
\end{figure}
|
||||
|
||||
As the memory accesses are directly extracted from the executed instructions, simply sending a transaction to the DRAM subsystem for every data reference would completely neglect the caches of today's processors.
|
||||
Therefore, also a cache model is required whose implementation will be explained in more detail in section \ref{sec:cache_implementation}.
|
||||
Many modern cache hierarchies compose of 3 cache levels: 2 caches for every processor core, the L1 and L2 cache, and one cache that is shared across all cores, the L3 cache.
|
||||
This cache hierarchy is also reflected in the \textit{DbiPlayer} as shown in Figure \ref{fig:dbiplayer_with_caches}, but also more simple hierarchies such as a L1 cache for every processor core and one shared L2 cache are configurable.
|
||||
In order to connect the different SystemC socket types, one additional interconnect is required which is explained in more detail in section \ref{sec:interconnect}.
|
||||
Therefore, also a cache model is required whose implementation will be explained in more detail in Section \ref{sec:cache_implementation}.
|
||||
Many modern cache hierarchies are composed of 3 cache levels: 2 caches for every processor core, the L1 and L2 cache, and one cache that is shared across all cores, the L3 cache.
|
||||
This cache hierarchy is also reflected in the \textit{DbiPlayer} shown in Figure \ref{fig:dbiplayer_with_caches}, but also more simplistic hierarchies such as an L1 cache for every processor core and one shared L2 cache are configurable.
|
||||
In order to connect the different SystemC socket types, one additional interconnect is required which is explained in more detail in Section \ref{sec:interconnect}.
|
||||
|
||||
\begin{landscape}
|
||||
\begin{figure}
|
||||
@@ -160,26 +161,27 @@ In order to connect the different SystemC socket types, one additional interconn
|
||||
With the overall architecture of the main initiator module introduced, this section explains the internal functionality of the \textit{DbiPlayer} and its threads.
|
||||
|
||||
The threads of the \textit{DbiPlayer} are specialized initiator modules that inherit from the more generic \texttt{TrafficInitiatorThread} class.
|
||||
Each \texttt{TrafficInitiatorThread} consists of an \texttt{sendNextPayloadThread()} \texttt{SC\_THREAD} that inturn calls the virtual method \texttt{sendNextPayload()}, that is implemented in the \texttt{DbiThreadPlayer}, each time the \texttt{sc\_event\_queue} \texttt{sendNextPayloadEvent} is being notified.
|
||||
Each \texttt{TrafficInitiatorThread} consists of a \texttt{sendNextPayloadThread()} \texttt{SC\_THREAD}, which in turn calls the virtual method \texttt{sendNextPayload()} each time the \texttt{sc\_event\_queue} \texttt{sendNextPayloadEvent} is being notified.
|
||||
\texttt{sendNextPayload()} is implemented in the \texttt{DbiThreadPlayer}.
|
||||
|
||||
Each \texttt{DbiThreadPlayer} iterates through the lines of its trace file and stores the entries in an internal buffer.
|
||||
In \texttt{sendNextPayload()} then, a new generic payload object is created from the following entry of this buffer.
|
||||
In \texttt{sendNextPayload()}, a new generic payload object is created from the following entry of this buffer.
|
||||
The address of the payload is calculated from the physical address stored in the trace file entry.
|
||||
As previously discussed, the trace player now needs to account for the offset the RAM was placed at in the physical memory map and subtract this offset from the physical address.
|
||||
The instruction count field of the trace is used to approximate the delay between two consecutive memory accesses:
|
||||
The count is multiplied with the trace player clock period and a constant to defer the initiation of the next transaction by the resulting value.
|
||||
While this does not take the type of the executed instructions into account, it is still a simple approximation that can be made.
|
||||
The instruction count field of the trace is used to approximate the delay between two consecutive memory accesses: the count is multiplied with the trace player clock period to defer the initiation of the next transaction by the resulting value.
|
||||
Additionally, this count can be multiplied by an approximation of the \revabbr{clocks per instruction}{CPI} value.
|
||||
While this does not take into account the type of the instructions executed, it is still a simple approximation that can be used to model the system more accurately.
|
||||
|
||||
As mentioned previously, the threads should run by themselves without paying attention to the others, rather they require synchronization to ensure the simulated system replicates the real running application as good as possible.
|
||||
The individual initator threads should run by themselves without paying attention to the others; rather, they require synchronization to ensure the simulated system replicates the real running application as closely as possible.
|
||||
The analysis tool appends timestamps into the memory access traces.
|
||||
When such a timestamp is reached, it will be used to pause the execution of a thread, if the global time has not yet reached this far, or to advance the global time, when the thread is allowed to continue.
|
||||
It is to note that the term global time in this context does not correspond to the SystemC simulation time but denotes a loose time variable that only the \textit{DbiPlayer} uses to schedule its threads.
|
||||
When such a timestamp is reached, it will be used to pause the execution of a thread if the global time has not yet reached this far, or to advance the global time when the thread is allowed to continue.
|
||||
Note that the term global time in this context does not correspond to the SystemC simulation time, but denotes a loose time variable that only the \textit{DbiPlayer} uses to schedule its threads.
|
||||
|
||||
A set of rules determine if a thread is allowed to make progress beyond a timestamp that is further than the current global time:
|
||||
A set of rules determines if a thread is allowed to make progress beyond a timestamp that is greater than the current global time:
|
||||
\begin{enumerate}
|
||||
\item The main thread at the start of the program is always allowed to run.
|
||||
\item Threads do not suspend themselves when they would produce a deadlock. This is the case when they are the only thread currently running.
|
||||
\item When a previous running thread exits and all other threads are suspended, then they will be resumed.
|
||||
\item When a previously running thread exits and all other threads are suspended, then they will be resumed.
|
||||
\item As a fallback, when currently all threads are suspended, one thread will be resumed.
|
||||
\end{enumerate}
|
||||
|
||||
@@ -189,42 +191,43 @@ The two latter rules ensure that always at least one thread is running so that t
|
||||
\subsection{Non-Blocking Cache}
|
||||
\label{sec:cache_implementation}
|
||||
|
||||
This section gives an overview over the cache model that is used by the new trace player.
|
||||
It is implemented as a non-blocking cache that, as explained in section \ref{sec:caches_non_blocking_caches}, can accept new requests even when multiple cache misses are being handled.
|
||||
This section gives an overview of the cache model that is used by the new trace player.
|
||||
It is implemented as a non-blocking cache that, as explained in Section \ref{sec:caches_non_blocking_caches}, can accept new requests even when multiple cache misses are being handled.
|
||||
|
||||
The cache inherits from the \texttt{sc\_module} base class and consists of a target socket, to accept requests from the processor or a higher level cache, as well as an initiator socket, to send requests to a lower level cache or to the DRAM subsystem.
|
||||
The cache inherits from the \texttt{sc\_module} base class and consists of a target socket to accept requests from the processor or a higher level cache as well as an initiator socket to send requests to a lower level cache or to the DRAM subsystem.
|
||||
It has a configurable size, associativity, cache line size, MSHR buffer depth, write buffer depth and target depth for one MSHR entry.
|
||||
|
||||
To understand how the cache model works, a hypothetical request from the CPU will be assumed to explain the internal processing of the transaction in detail:
|
||||
|
||||
When the transaction arrives, it will be placed in the PEQ of the cache from where, after the specified amount of delay, the handler for the \texttt{BEGIN\_REQ} phase is called.
|
||||
When the transaction arrives, it will be placed in the PEQ of the cache from where, after the specified amount of delay has elapsed, the handler for the \texttt{BEGIN\_REQ} phase is called.
|
||||
The handler verifies that the cache buffers are not full\footnote{Otherwise the cache will apply backpressure on the CPU and postpone the handling of the transaction.} and checks if the requested data is stored in the cache.
|
||||
If it is the case (i.e. a cache hit), the cache model sends immediately an \texttt{END\_REQ} and, when the target socket is not currently occupied with a response, accesses the cache and sends the \texttt{BEGIN\_RESP} phase to the processor.
|
||||
If it is the case (i.e., a cache hit), the cache model sends immediately an \texttt{END\_REQ} and, when the target socket is not currently occupied with a response, accesses the cache and sends the \texttt{BEGIN\_RESP} phase to the processor.
|
||||
During a cache access, the content of the cache line is copied into the transaction in case of a read request, or the cache line is updated with the new value in case of a write request.
|
||||
Further, in both cases the timestamp of the last access is updated to the current simulation time.
|
||||
The processor then finalizes the transaction with the \texttt{END\_RESP} phase, the target backpressure of the cache will be cleared and the postponed request from the CPU (if it exists) is now placed into the PEQ once again.
|
||||
The processor then finalizes the transaction with the \texttt{END\_RESP} phase, the target backpressure of the cache will be cleared and the postponed request from the CPU (if it exists) is placed into the PEQ once again.
|
||||
|
||||
If, on the other hand, the requested data is not in the cache (i.e. a cache miss), first it will be checked if there is already an existing MSHR entry for the corresponding cache line.
|
||||
If, on the other hand, the requested data is not in the cache (i.e., a cache miss), first it will be checked if there is already an existing MSHR entry for the corresponding cache line.
|
||||
If this is the case\footnote{And if the target list of the MSHR entry is not full. Otherwise the transaction will be postponed.}, the transaction is appended to it as an additional target.
|
||||
If not, a cache line is evicted to make space for the new cache line that will be fetched from the underlying memory.
|
||||
The cache model implements the optimal replacement policy LRU, so the cache line with the oldest last access time is chosen to be evicted.
|
||||
The cache model implements the optimal replacement policy LRU, so the cache line with the last access time, which lies furthest back in the past, is chosen to be evicted.
|
||||
When an eviction is not possible, the transaction will be postponed.
|
||||
An eviction is not possible when the selected cache line is allocated but not yet filled with requested data from the underlying cache, the cache line is currently present in the MSHR queue, or a hit for this cache line is yet to be handled.
|
||||
When the \texttt{dirty} flag of the old cache line is set, it has to be placed into the write buffer and written back to the memory.
|
||||
The newly evicted cache line is now \textit{allocated}, but not \textit{valid}.
|
||||
Then, the transaction is put in an MSHR entry and the \texttt{END\_REQ} phase is sent back to the processor.
|
||||
Then, the transaction is put into an MSHR entry and the \texttt{END\_REQ} phase is sent back to the processor.
|
||||
|
||||
To process the entries in the MSHR and in the write buffer, the \texttt{processMshrQueue()} and \texttt{processWriteBuffer()} methods are called at appropriate times.
|
||||
In the former, a not yet issued MSHR entry is selected for which a new fetch transaction is generated and sent to the underlying memory.
|
||||
Note that special care has to be taken when the requested cache line is also present in the write buffer:
|
||||
To ensure consistency, no new request is sent to the DRAM and instead the value is snooped out of the write buffer.
|
||||
Since the cache line in the write buffer is now allocated again in the cache, the entry in the write buffer can be removed to prevent an unnecessary write-back.
|
||||
In the latter, the processing of the write back buffer, a not yet issued entry is selected and a new write transaction is sent to the memory.\footnote{Both \texttt{processMshrQueue()} and \texttt{processWriteBuffer()} also need to ensure that currently no backpressure is applied onto the cache from the memory.}
|
||||
|
||||
Incoming transactions from the memory side are accepted with a \texttt{END\_RESP} and, in case of a fetch transaction, used to update the cache contents and possibly preparing a new response transaction for the processor as described before.
|
||||
Incoming transactions from the memory side are accepted with an \texttt{END\_RESP} and, in case of a fetch transaction, used to update the cache contents and possibly prepare a new response transaction for the processor as described before.
|
||||
|
||||
This example works analogously with another cache as the requesting module or another cache as the target module for fetch or write-back accesses.
|
||||
|
||||
The rough internal structure of the cache model is shown again in figure \ref{fig:cache}.
|
||||
The rough internal structure of the cache model is shown again in Figure \ref{fig:cache}.
|
||||
|
||||
\begin{figure}
|
||||
\begin{center}
|
||||
@@ -242,46 +245,44 @@ The implementation of a snooping protocol is a candidate for future improvements
|
||||
\subsection{Trace Player Interface}
|
||||
\label{sec:traceplayer_interface}
|
||||
|
||||
Previously, initiators could only represent one thread when they are connected to DRAMSys.
|
||||
This, however, conflicted with the goal to develop an trace player module that internally composes of multiple threads that communicate with each other and initiate transactions to DRAMSys independently.
|
||||
Previously, initiators could only represent one thread when they were connected to \pbox{3cm}{DRAMSys}.
|
||||
This, however, conflicted with the goal to develop a trace player module that is internally composed of multiple threads, which communicate with each other and initiate transactions to DRAMSys independently.
|
||||
|
||||
To be able to couple such hierarchical initiator modules with DRAMSys, a new trace player interface was developed:
|
||||
To be able to couple such hierarchical initiator modules with DRAMSys, a new trace player interface was developed.
|
||||
The \texttt{TrafficInitiatorIF} is a generic interface that every module that connects to DRAMSys needs to implement.
|
||||
It requires to implement the \texttt{bindTargetSocket()} method so that top-level initiators can be coupled regardless of the used initiator socket type (e.g. \texttt{simple\_initiator\_socket} or \texttt{multi\_passthrough\_initiator\_socket}).
|
||||
It requires to implement the \texttt{bindTargetSocket()} method so that top-level initiators can be coupled regardless of the used initiator socket type (e.g., \texttt{simple\_initiator\_socket} or \texttt{multi\_passthrough\_initiator\_socket}).
|
||||
|
||||
When coupling a \texttt{multi\_passthrough\_initiator\_socket} to a \texttt{multi\_passthrough\_\\target\_socket}, the SystemC \texttt{bind()} method has to be called multiple times -- once for each thread.
|
||||
Because of this, a wrapper module also has to override the \\ \texttt{getNumberOfThreads()} method of the new interface and use this number to bind the target socket in \texttt{bindTargetSocket()} the correct number of times.
|
||||
|
||||
This makes it possible to polymorphically treat all initiator modules, whether they are simple threads or more complex wrapper modules, as this interface and connect them to DRAMSys with the provided bind method, abstracting away the concrete type of initiator socket used.
|
||||
This makes it possible to polymorphically treat all initiator modules as this interface, whether they are simple threads or more complex wrapper modules, and connect them to DRAMSys with the provided bind method, abstracting away the concrete type of initiator socket used.
|
||||
|
||||
So with the new trace player interface, a top-level initiator can either be a single thread, like in previous versions, or a more complex hierarchical module with many internal threads.
|
||||
With the new trace player interface, a top-level initiator can either be a single thread, like in previous versions, or a more complex hierarchical module with many internal threads.
|
||||
|
||||
\subsection{Interconnect}
|
||||
\label{sec:interconnect}
|
||||
|
||||
As already seen in figure \ref{fig:dbiplayer_with_caches}, interconnection modules are needed to connect the caches with each other.
|
||||
As already seen in Figure \ref{fig:dbiplayer_with_caches}, interconnection modules are needed to connect the caches to each other.
|
||||
While the implementation of the \textit{MultiCoupler} component is trivial as it only passes the transactions from its so-called \texttt{multi\_passthrough\_target\_socket} to its \texttt{multi\_passthrough\_initiator\_socket}, the \textit{MultiSimpleCoupler} is more complex because it has to internally buffer transactions.
|
||||
|
||||
In order to understand why this buffering needed, consider scenario where the L3 cache applies backpressure to one L2 cache.
|
||||
In order to understand why this buffering is needed, consider the scenario where the L3 cache applies backpressure to one L2 cache.
|
||||
The L2 cache is not allowed to send further requests due to the exclusion rule.
|
||||
But since the target socket of the L3 cache is occupied, this also applies to all other L2 caches.
|
||||
This information, however, is not propagated to the other caches, leading to an incorrect behavior if not addressed, as the other caches will send further requests.
|
||||
|
||||
To solve this problem, the MultiSimpleCoupler only forwards requests to the L3 cache when it is able to accept them.
|
||||
If this is not the case, the request gets internally buffered and forwarded when an earlier request is being completed with the \texttt{END\_REQ} phase.
|
||||
If this is not the case, the request is internally buffered and forwarded when an earlier request is being completed with the \texttt{END\_REQ} phase.
|
||||
|
||||
% Beispiel
|
||||
For illustrating this further, a simple example can be assumed:
|
||||
One L2 cache needs to request a cache line from the underlying L3 cache.
|
||||
For illustrating this further, a simple example can be assumed: one L2 cache needs to request a cache line from the underlying L3 cache.
|
||||
The MultiSimpleCoupler receives the \texttt{BEGIN\_REQ} phase and places it into its PEQ.
|
||||
From there, a hash table used as an internal routing table is updated to be able to send the response back through the correct multi-socket binding afterwards.
|
||||
As the L3 cache is currently not applying backpressure onto the interconnect, it can forward the transaction with the \texttt{BEGIN\_REQ} phase to the L3 cache.
|
||||
Until the L3 cache responds with the \texttt{END\_REQ} phase, the interconnect defers any new request from any L2 cache and buffers the payload objects in an internal data structure.
|
||||
When the \texttt{END\_REQ} phase is received, the next transaction from this request buffer is sent to the L3 cache.
|
||||
After some time the, L3 cache will respond with the requested cache lines.
|
||||
After some time, the L3 cache will respond with the requested cache lines.
|
||||
During this \texttt{BEGIN\_RESP} phase, the L2 cache that requested this line is looked up using the routing table and the payload is sent back to it.
|
||||
Until the L2 cache responds with an \texttt{END\_RESP}, the exclusion rule has to be honored also here:
|
||||
When a new response from the L3 cache is received, it has to be buffered into another internal data structure until the corresponding target socket binding is clear again.
|
||||
Until the L2 cache responds with an \texttt{END\_RESP}, the exclusion rule has to be honored also here: when a new response from the L3 cache is received, it has to be buffered in another internal data structure until the corresponding target socket binding is clear again.
|
||||
Once the L2 cache sends out the \texttt{END\_RESP} phase, the interconnect will forward the \texttt{END\_RESP} to the L3 cache, and initiate new response transactions in case the response buffer is not empty.
|
||||
|
||||
In conclusion, this special interconnect module with an multi-target socket and a simple-initiator socket ensures that the exclusion rule is respected in both directions.
|
||||
In conclusion, this special interconnect module with a multi-target socket and a simple-initiator socket ensures that the exclusion rule is respected in both directions.
|
||||
|
||||
@@ -44,7 +44,7 @@ In this configuration, every processor core has its own L1 data cache (in case o
|
||||
The gem5 simulator uses four ARM CPU core models (TimingSimpleCPU, an in-order core model) at 1 GHz, whereas the DynamoRIO traces are obtained using a QEMU~\cite{Qemu} ARM virtual machine, configured to use four cores as well.
|
||||
The DRAM subsystem will be varied between a single-channel DDR3 memory (1600 MT/s) and a single-channel DDR4 memory (2400 MT/s).
|
||||
% Hier die DRAMSys Configuration erklären!
|
||||
To match the same configuration as in gem5, the memory controller in DRAMSys is set to use a \revabbr{first-ready - first-come, first-served}{FR-FCFS} scheduling policy, a \revabbr{first-in, first-out}{FIFO} response queue policy, and a row-rank-bank-column-channel address mapping (explained in more detail in appendix \ref{sec:address_mappings}).
|
||||
To match the same configuration as in gem5, the memory controller in DRAMSys is set to use a \revabbr{first-ready - first-come, first-served}{FR-FCFS} scheduling policy, a \revabbr{first-in, first-out}{FIFO} response queue policy, and a row-rank-bank-column-channel address mapping (explained in more detail in Appendix \ref{sec:address_mappings}).
|
||||
The trace player operates at the same clock frequency as the gem5 core models.
|
||||
|
||||
The micro-benchmarks themselves are multi-threaded and use all four cores.
|
||||
|
||||
@@ -24,7 +24,7 @@ Although this can be a complex task, it is possible to implement this in future
|
||||
A less impactful inaccuracy results from the scheduling of the application's threads in the new simplified core models.
|
||||
While an application can spawn an arbitrary number of threads, the platform may not be able to process them all in parallel.
|
||||
Currently, the new trace player does not take this into account and runs all threads in parallel.
|
||||
This, however, could be prevented by recording used processor cores on the initial system and using this information to better match the scheduling.
|
||||
This deviation could be prevented by recording used processor cores on the initial system and using this information to better match the scheduling.
|
||||
|
||||
Another inaccuracy can be caused by the hyperthreading of some of today's processors:
|
||||
While hyperthreading enables the parallel processing of two pipelines in a processor core, those threads do share the same first level cache.
|
||||
|
||||
Reference in New Issue
Block a user