From f542b2c03451f201426ebe47c3f9b861b6a3cf75 Mon Sep 17 00:00:00 2001
From: Derek Christ <dchrist@rhrk.uni-kl.de>
Date: Sat, 16 Jul 2022 14:47:30 +0200
Subject: [PATCH] Simulation additions

---
 doc.bib                      |  30 +--
 doc.tex                      |   1 +
 inc/1.introduction.tex       |   6 +-
 inc/2.dynamorio.tex          |   4 +-
 inc/3.systemc.tex            |   2 +-
 inc/4.caches.tex             |   4 +-
 inc/5.dramsys.tex            |   2 +-
 inc/6.implementation.tex     |   2 +-
 inc/7.simulation_results.tex | 430 ++++++++++++++++++++++-------------
 inc/appendix.tex             | 137 +++++++++++
 10 files changed, 435 insertions(+), 183 deletions(-)

diff --git a/doc.bib b/doc.bib
index 312847e..ba47278 100644
--- a/doc.bib
+++ b/doc.bib
@@ -16,7 +16,7 @@
 @InProceedings{Abel19a,
   author    = {Abel, Andreas and Reineke, Jan},
   booktitle = {ASPLOS},
-  title     = {uops.info: Characterizing Latency, Throughput, and Port Usage of Instructions on Intel Microarchitectures},
+  title     = {uops.info: {C}haracterizing {L}atency, {T}hroughput, and {P}ort {U}sage of {I}nstructions on {I}ntel {M}icroarchitectures},
   year      = {2019},
   address   = {New York, NY, USA},
   pages     = {673--686},
@@ -33,20 +33,20 @@
 @Book{Jacob2008,
   author    = {Bruce Jacob and Spencer W. Ng and David T. Wang},
   publisher = {Morgan Kaufmann},
-  title     = {Memory Systems: Cache, DRAM, Disk},
+  title     = {{Memory Systems: Cache, DRAM, Disk}},
   year      = {2008},
 }
 
 @Article{Jahre2007,
   author = {Jahre, Magnus and Natvig, Lasse},
-  title  = {Performance Effects of a Cache Miss Handling Architecture in a Multi-core Processor},
+  title  = {{Performance Effects of a Cache Miss Handling Architecture in a Multi-core Processor}},
   year   = {2007},
 }
 
 @InProceedings{Antonino2018,
   author    = {Antonino, Pablo Oliveira and Jung, Matthias and Morgenstern, Andreas and Fa{\ss}nacht, Florian and Bauer, Thomas and Bachorek, Adam and Kuhn, Thomas and Nakagawa, Elisa Yumi},
   booktitle = {Software Architecture},
-  title     = {Enabling Continuous Software Engineering for Embedded Systems Architectures with Virtual Prototypes},
+  title     = {{Enabling Continuous Software Engineering for Embedded Systems Architectures with Virtual Prototypes}},
   year      = {2018},
   address   = {Cham},
   editor    = {Cuesta, Carlos E. and Garlan, David and P{\'e}rez, Jennifer},
@@ -58,7 +58,7 @@
 
 @Article{IEEE2012,
   journal = {IEEE Std 1666-2011 (Revision of IEEE Std 1666-2005)},
-  title   = {IEEE Standard for Standard SystemC Language Reference Manual},
+  title   = {{IEEE} {S}tandard for {S}tandard {S}ystem{C} {L}anguage {R}eference {M}anual},
   year    = {2012},
   doi     = {10.1109/IEEESTD.2012.6134619},
 }
@@ -66,7 +66,7 @@
 @InProceedings{Menard2017,
   author    = {Menard, Christian and Castrillon, Jeronimo and Jung, Matthias and Wehn, Norbert},
   booktitle = {2017 International Conference on Embedded Computer Systems: Architectures, Modeling, and Simulation (SAMOS)},
-  title     = {System simulation with gem5 and SystemC: The keystone for full interoperability},
+  title     = {{System simulation with gem5 and SystemC: The keystone for full interoperability}},
   year      = {2017},
   pages     = {62-69},
   doi       = {10.1109/SAMOS.2017.8344612},
@@ -75,7 +75,7 @@
 @InProceedings{Steiner2020,
   author    = {Steiner, Lukas and Jung, Matthias and Prado, Felipe S. and Bykov, Kirill and Wehn, Norbert},
   booktitle = {Embedded Computer Systems: Architectures, Modeling, and Simulation},
-  title     = {DRAMSys4.0: A Fast and Cycle-Accurate SystemC/TLM-Based DRAM Simulator},
+  title     = {{DRAMSys4.0: A Fast and Cycle-Accurate SystemC/TLM-Based DRAM Simulator}},
   year      = {2020},
   address   = {Cham},
   editor    = {Orailoglu, Alex and Jung, Matthias and Reichenbach, Marc},
@@ -88,7 +88,7 @@
 @Book{Jung2017,
   author    = {Jung, Matthias},
   publisher = {Technische Universit{\"a}t Kaiserslautern},
-  title     = {System-level Modeling, Analysis and Optimization of DRAM Memories and Controller Architectures},
+  title     = {{System-level Modeling, Analysis and Optimization of DRAM Memories and Controller Architectures}},
   year      = {2017},
   isbn      = {9783959740517},
   series    = {Forschungsberichte Mikroelektronik},
@@ -97,7 +97,7 @@
 @Article{Binkert2011,
   author     = {Binkert, Nathan and Beckmann, Bradford and Black, Gabriel and Reinhardt, Steven K. and Saidi, Ali and Basu, Arkaprava and Hestness, Joel and Hower, Derek R. and Krishna, Tushar and Sardashti, Somayeh and Sen, Rathijit and Sewell, Korey and Shoaib, Muhammad and Vaish, Nilay and Hill, Mark D. and Wood, David A.},
   journal    = {SIGARCH Comput. Archit. News},
-  title      = {The Gem5 Simulator},
+  title      = {{The Gem5 Simulator}},
   year       = {2011},
   issn       = {0163-5964},
   month      = aug,
@@ -115,7 +115,7 @@
 @InProceedings{Jung2017a,
   author    = {Jung, Matthias and Kraft, Kira and Wehn, Norbert},
   booktitle = {2017 International Conference on Embedded Computer Systems: Architectures, Modeling, and Simulation (SAMOS)},
-  title     = {A new state model for DRAMs using Petri Nets},
+  title     = {{A new state model for DRAMs using Petri Nets}},
   year      = {2017},
   doi       = {10.1109/SAMOS.2017.8344631},
 }
@@ -123,7 +123,7 @@
 @Book{Hennessy2011,
   author    = {Hennessy, John L. and Patterson, David A.},
   publisher = {Morgan Kaufmann Publishers Inc.},
-  title     = {Computer Architecture, Fifth Edition: A Quantitative Approach},
+  title     = {{Computer Architecture, Fifth Edition: A Quantitative Approach}},
   year      = {2011},
   address   = {San Francisco, CA, USA},
   edition   = {5th},
@@ -134,7 +134,7 @@
 @Article{Ghose2019,
   author     = {Ghose, Saugata and Li, Tianshi and Hajinazar, Nastaran and Cali, Damla Senol and Mutlu, Onur},
   journal    = {Proc. ACM Meas. Anal. Comput. Syst.},
-  title      = {Demystifying Complex Workload-DRAM Interactions: An Experimental Study},
+  title      = {{Demystifying Complex Workload-DRAM Interactions: An Experimental Study}},
   year       = {2019},
   month      = {dec},
   number     = {3},
@@ -153,7 +153,7 @@
 @InProceedings{Gomony2012,
   author    = {Gomony, Manil Dev and Weis, Christian and Akesson, Benny and Wehn, Norbert and Goossens, Kees},
   booktitle = {2012 Design, Automation \& Test in Europe Conference \& Exhibition (DATE)},
-  title     = {DRAM selection and configuration for real-time mobile systems},
+  title     = {{DRAM selection and configuration for real-time mobile systems}},
   year      = {2012},
   pages     = {51-56},
   doi       = {10.1109/DATE.2012.6176432},
@@ -162,7 +162,7 @@
 @Article{Kim2016,
   author  = {Kim, Yoongu and Yang, Weikun and Mutlu, Onur},
   journal = {IEEE Computer Architecture Letters},
-  title   = {Ramulator: A Fast and Extensible DRAM Simulator},
+  title   = {{Ramulator: A Fast and Extensible DRAM Simulator}},
   year    = {2016},
   number  = {1},
   pages   = {45-49},
@@ -200,7 +200,7 @@
   note   = {https://github.com/RRZE-HPC/TheBandwidthBenchmark. Accessed: 2022-06-28},
 }
 
-@Article{,
+@Article{Qemu,
   journal = {A generic and open source machine emulator and virtualizer},
   title   = {Q{E}{M}{U}},
   note    = {https://www.qemu.org/. Accessed: 2022-06-28},
diff --git a/doc.tex b/doc.tex
index ebbab54..144fa7b 100644
--- a/doc.tex
+++ b/doc.tex
@@ -36,6 +36,7 @@
 \usepackage{tikzit}
 \usepackage{lscape}
 \usepackage{pbox}
+\usepackage{pgfplots}
 %\usepackage{listings}
 %\input{subsections.sty}
 \setcounter{secnumdepth}{5}
diff --git a/inc/1.introduction.tex b/inc/1.introduction.tex
index d47a968..3bbc08f 100644
--- a/inc/1.introduction.tex
+++ b/inc/1.introduction.tex
@@ -18,8 +18,8 @@ Consequently, system designers are entrusted with the complex task of finding th
 For the exploration of the design space of these configurations it is impractical to use real systems as they are too cost-intensive and not modifyable and therefore not suitable for rapid prototyping.
 To overcome this limitation, it is important to simulate the memory system using a simulation framework with sufficient accuracy.
 
-Such a simulation framework is DRAMSys\cite{Steiner2020}\cite{Jung2017}, which is based on SystemC \revabbr{transaction level modeling}{TLM} and enables the fast simulation of numerous DRAM standards and controller configurations with cycle-accuracy.
-Stimuli for the memory system can either be generated using a prerecorded trace file with timestamps, a traffic generator that acts as a state machine and initiates different request patterns, or a detailed processor model of the gem5\cite{Binkert2011} simulation framework.
+Such a simulation framework is DRAMSys \cite{Steiner2020, Jung2017}, which is based on SystemC \revabbr{transaction level modeling}{TLM} and enables the fast simulation of numerous DRAM standards and controller configurations with cycle-accuracy.
+Stimuli for the memory system can either be generated using a prerecorded trace file with timestamps, a traffic generator that acts as a state machine and initiates different request patterns, or a detailed processor model of the gem5 \cite{Binkert2011} simulation framework.
 
 However, the two former methods lack in accurary whereas the latter may provide the sufficient precision but is a very time-consuming effort.
 To fill this gap of fast but accurate traffic generation, a new simulation frontend for DRAMSys is developed and presented in this thesis.
@@ -35,4 +35,4 @@ Section \ref{sec:systemc} presents the modeling language SystemC, on which the d
 After that, Section \ref{sec:caches} gives a short overview of modern cache architectures and their high-level implementations.
 Section \ref{sec:dramsys} introduces the DRAMSys simulation framework and its basic functionalities.
 Section \ref{sec:implementation} explains the implementation of the cache model, the processor model and the instrumentation tool in detail.
-In Section \ref{sec:simulation_results} the accuracy of the new framework is compared against the gem5 and Ramulator\cite{Kim2016} simulators, whereas Section \ref{sec:future_work} denotes future improvements that can be achieved.
+In Section \ref{sec:simulation_results} the accuracy of the new framework is compared against the gem5 and Ramulator \cite{Kim2016} simulators, whereas Section \ref{sec:future_work} denotes future improvements that can be achieved.
diff --git a/inc/2.dynamorio.tex b/inc/2.dynamorio.tex
index 84c88b0..33d5cd4 100644
--- a/inc/2.dynamorio.tex
+++ b/inc/2.dynamorio.tex
@@ -44,7 +44,7 @@ The following Section \ref{sec:dynamorio_core} will explain how the core functio
 \label{sec:dynamorio_core}
 
 A simple way to observe and potentially modify the instructions of an application during execution is the use of an interpretation engine that emulates the binary executable in its entirety.
-One widely used framework that uses this technique is for example Valgrind\cite{Valgrind}.
+One widely used framework that uses this technique is for example Valgrind \cite{Valgrind}.
 At its core, Valgrind uses a virtual machine and just-in-time compilation to instrument the target application.
 This approach might be powerful, but it comes at the cost of significantly reduced performance.
 
@@ -99,7 +99,7 @@ So a sophisticated application could try to detect the presence of an instrument
 \label{sec:dynamorio_client}
 
 With the inner workings introduced so far, the presence of DynamoRIO does not have an effect other than that the application is executed from the code cache.
-DynamoRIO provides a programming interface to develop external so-called \textit{clients}\cite{Bruening2004}.
+DynamoRIO provides a programming interface to develop external so-called \textit{clients} \cite{Bruening2004}.
 Clients are user-written instrumentation tools and make it possible to dynamically modify the basic blocks, either to alter the application behavior or to insert observational instructions.
 A DynamoRIO client is compiled into a shared library and passed to the \textit{drrun} utility using a command line option.
 Clients implement a number of hook functions that will be called by DynamoRIO for certain events such as the creation of a basic block or of a trace.
diff --git a/inc/3.systemc.tex b/inc/3.systemc.tex
index 2e1c0da..22e1b4c 100644
--- a/inc/3.systemc.tex
+++ b/inc/3.systemc.tex
@@ -47,7 +47,7 @@ GPs are passed as references, so they do not need to be copied between modules.
 \begin{figure}
 \begin{center}
 \tikzfig{img/tlm}
-\caption[Forward and backward path between TLM sockets\cite{Menard2017}.]{Forward and backward path between TLM sockets\cite{Menard2017}. $\blacksquare$ denotes an initiator socket, $\square$ denotes a target socket.}
+\caption[Forward and backward path between TLM sockets \cite{Menard2017}.]{Forward and backward path between TLM sockets \cite{Menard2017}. $\blacksquare$ denotes an initiator socket, $\square$ denotes a target socket.}
 \label{fig:tlm}
 \end{center}
 \end{figure}
diff --git a/inc/4.caches.tex b/inc/4.caches.tex
index 6625cae..9f89bb3 100644
--- a/inc/4.caches.tex
+++ b/inc/4.caches.tex
@@ -162,7 +162,7 @@ Such a cache is called \textit{virtually indexed} and \textit{physically tagged}
 \begin{figure}
 \begin{center}
 \tikzfig{img/virtual_address_conversion}
-\caption[Virtually indexed, physically tagged cache\cite{Jacob2008}.]{Virtually indexed, physically tagged cache\cite{Jacob2008}. ASID refers to address-space identifier.}
+\caption[Virtually indexed, physically tagged cache \cite{Jacob2008}.]{Virtually indexed, physically tagged cache \cite{Jacob2008}. ASID refers to address-space identifier.}
 \label{fig:virtual_address_conversion}
 \end{center}
 \end{figure}
@@ -203,7 +203,7 @@ An architecture of an MSHR file is illustrated in Figure \ref{fig:mshr_file}.
 \begin{figure}
 \begin{center}
 \tikzfig{img/mshr_file}
-\caption[Miss Holding Status Register File\cite{Jahre2007}.]{Miss Holding Status Register File\cite{Jahre2007}. V refers to a valid bit.}
+\caption[Miss Holding Status Register File \cite{Jahre2007}.]{Miss Holding Status Register File \cite{Jahre2007}. V refers to a valid bit.}
 \label{fig:mshr_file}
 \end{center}
 \end{figure}
diff --git a/inc/5.dramsys.tex b/inc/5.dramsys.tex
index c719f77..8708c6d 100644
--- a/inc/5.dramsys.tex
+++ b/inc/5.dramsys.tex
@@ -56,7 +56,7 @@ A reordering might be necessary to be able to support initiators that can not ha
 
 % Evtl TA falls Bilder genutzt werden?
 DRAMSys also provides the so-called \textit{Trace Analyzer}, a graphical tool that visualizes database files created by DRAMSys.
-It makes visible the \texttt{REQ} and \texttt{RESP} phases between the initiator and the arbiter, the occupation of the command bus and data bus as well as representations of the different phases in the DRAM banks.
+% It makes visible the \texttt{REQ} and \texttt{RESP} phases between the initiator and the arbiter, the occupation of the command bus and data bus as well as representations of the different phases in the DRAM banks.
 An example trace database, visualized in the Trace Analyzer is shown in Figure \ref{fig:traceanalyzer}.
 Furthermore, the Trace Analyzer is capable of calculating numerous metrics and creating plots of interesting characteristics.
 
diff --git a/inc/6.implementation.tex b/inc/6.implementation.tex
index 9864ff9..86ec39c 100644
--- a/inc/6.implementation.tex
+++ b/inc/6.implementation.tex
@@ -169,7 +169,7 @@ In \texttt{sendNextPayload()}, a new generic payload object is created from the
 The address of the payload is calculated from the physical address stored in the trace file entry.
 As previously discussed, the trace player now needs to account for the offset the RAM was placed at in the physical memory map and substract this offset from the physical address.
 The instruction count field of the trace is used to approximate the delay between two consecutive memory accesses: the count is multiplied with the trace player clock period to defer the initiation of the next transaction by the resulting value.
-Additionally, this count can be multiplied by an approximation of the \revabbr{clocks per instruction}{CPI} value.
+Additionally, this count can be multiplied by an approximation of the average \revabbr{cycles per instruction}{CPI} value.
 While this does not take into account the type of the instructions executed, it is still a simple approximation that can be used to model the system more accuratly.
 
 The individual initator threads should run by themselves without paying attention to the others; rather, they require synchronization to ensure the simulated system replicates the real running application as closely as possible.
diff --git a/inc/7.simulation_results.tex b/inc/7.simulation_results.tex
index bc75fd3..d792068 100644
--- a/inc/7.simulation_results.tex
+++ b/inc/7.simulation_results.tex
@@ -4,26 +4,31 @@
 In this section the accuracy of the new simulation frontend will be evaluated.
 After a short discussion about the general expections regarding the accuracy and considerations to make, the simulation results will be presented.
 The presentation is structured into two parts:
-At first simulation statistics of numerous benchmarks are compared against the gem5\cite{Binkert2011} simulator that uses detailed processor models and can be considered as a reference.
-Secondly, the new simulation frontend is compared against the memory access trace generator tool of the Ramulator DRAM simulator\cite{Ghose2019}.
+At first simulation statistics of numerous benchmarks are compared against the gem5 \cite{Binkert2011} simulator that uses detailed processor models and can be considered as a reference.
+Secondly, the new simulation frontend is compared against the memory access trace generator tool of the Ramulator DRAM simulator \cite{Ghose2019}.
 
 \subsection{Accuracy}
 Generating memory access traces using dynamic binary instrumentation as a faster alternative to the simulation of detailed processor models introduces several inaccuracies, which of some will now be enumerated.
 
 The most important aspect to consider is that DBI can only instrument the target application but fails to also take the operating system the application is running on into account.
-That includes the inability to observe the execution of kernel routines that are directly invoked by the target application through system calls, but also the preemtive scheduling of other programs that are running on the system at the same time.
+That includes the inability to observe the execution of kernel routines that are directly invoked by the application through system calls, but also the preemptive scheduling of other programs that are running on the system at the same time.
 
-What is also to concern is the fetching of the instructions itself:
-In a real system the binary executable of the target application is placed in the DRAM, along with its data, and gets fetched into the instruction cache while executing.
+The fetching of the instructions themselves should also be considered:
+In a real system the binary executable of the target application is placed in the DRAM, along with its data, and is loaded into the instruction cache while executing.
 Since the DBI cannot observe the fetching of those instructions, the new simulator frontend cannot model this memory traffic.
 
 \subsection{Comparison to the gem5 Simulator}
 
-At first, the micro-benchmark suite TheBandwithBenchmark\cite{TheBandwidthBenchmark}, containing various streaming kernels, will be used to compare the gem5 full-system simulation as well as the gem5 syscall-emulation simulation with the newly developed frontend.
+At first, the micro-benchmark suite TheBandwidthBenchmark \cite{TheBandwidthBenchmark}, consisting of various streaming kernels, will be used to compare the gem5 full-system simulation as well as the gem5 syscall-emulation simulation with the newly developed frontend.
+
+The gem5 syscall-emulation does not simulate a whole operating system; rather, it utilizes the host system's Linux kernel and therefore only simulates the binary application.
+In contrast, the gem5 full-system simulation boots into a complete Linux system including all processes running in the background.
+Therefore, syscall-emulation is conceptually closer to the DynamoRIO approach than full-system simulation.
+
 The simulation setup consists in both cases of a two-level cache hierarchy with the following parameters:
 
 \begin{table}[!ht]
-\caption{Cache parameters.}
+\caption{Cache parameters used in simulations.}
 \begin{center}
 \begin{tabular}{|c|c|c|c|c|c|c|}
  \hline
@@ -41,21 +46,26 @@ The simulation setup consists in both cases of a two-level cache hierarchy with
 \end{table}
 
 In this configuration, every processor core has its own L1 data cache (in case of gem5 also a L1 instruction cache) whereas the L2 cache is shared between all cores.
-The gem5 simulator uses four ARM CPU core models (TimingSimpleCPU, an in-order core model) at 1 GHz, whereas the DynamoRIO traces are obtained using a QEMU\cite{Qemu} ARM virtual machine, configured to use four cores as well.
+The gem5 simulator uses four ARM CPU core models (TimingSimpleCPU, an in-order core model) at \textit{1000 MHz}, whereas the DynamoRIO traces are obtained using a QEMU \cite{Qemu} ARM virtual machine, configured to use four cores as well.
 The DRAM subsystem will be varied between a single-channel DDR3 memory (1600 MT/s) and a single-channel DDR4 memory (2400 MT/s).
 % Hier die DRAMSys Configuration erklären!
 To match the same configuration as in gem5, the memory controller in DRAMSys is set to use a \revabbr{first-ready - first-come, first-served}{FR-FCFS} scheduling policy, a \revabbr{first-in, first-out}{FIFO} response queue policy, and a row-rank-bank-column-channel address mapping (explained in more detail in Appendix \ref{sec:address_mappings}).
 The trace player operates at the same clock frequency as the gem5 core models.
 
-The micro-benchmarks itself are multi-threaded and use all four cores.
+It is important to configure the CPI value of the new trace player to a sensible value to approximate the delay between two consecutive memory accesses.
+For the simulations, the CPI value that gem5 SE reports in its statistics is used.
+It has been found that the CPI is approximately \textit{10} if only computation instructions are considered and load and store operations are ignored, since those are affected by the latency of the memory subsystem.
+
+The micro-benchmarks themselves are multi-threaded and make use of all available cores.
+Furthermore, the compiler optimizations are set to \texttt{-Ofast} for all benchmarks.
 Their access patterns are as followed:
 
 \begin{table}[!ht]
-\caption{Access patterns of the micro-benchmark kernels\cite{TheBandwidthBenchmark}.}
+\caption{Access patterns of the micro-benchmark kernels \cite{TheBandwidthBenchmark}.}
 \begin{center}
 \begin{tabular}{|c|c|c|}
  \hline
- Benchmark kernel & Description & Access pattern \\
+ Benchmark Kernel & Description & Access Pattern \\
  \hline
  \hline
  INIT & Initialize an array & a = s (store, write allocate) \\
@@ -82,38 +92,6 @@ Their access patterns are as followed:
 
 In the following, the simulation results of the new simulation frontend, the gem5 full-system emulation and the gem5 syscall-emulation will now be presented.
 
-\begin{table}[!ht]
-\caption[Results for bandwidth and bytes read/written with DDR3-1600.]{Results for bandwidth and bytes read/written with DDR3-1600. FS denotes gem5 full-system, SE denotes gem5 syscall-emulation, DS denotes DRAMSys.}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|c|c|c|c|}
- \hline
- \multirow{2}*{Benchmark} & \multicolumn{3}{|c|}{Avg. Bandwidth [GB/s]} & \multicolumn{3}{|c|}{Bytes Read [MB]} & \multicolumn{3}{|c|}{Bytes Written [MB]} \\
- \cline{2-10}
- & FS & SE & DS & FS & SE & DS & FS & SE & DS\\
- \hline
- \hline
- COPY & 2.031 & 2.698& 4 & 238.3 & 268.8& 7 & 140.3 & 134.3 & 10\\
- \hline
- DAXPY & 2.070 & 2.627& 4 & 238.2 & 268.9 & 7 & 140.2 & 134.4 & 10\\
- \hline
- INIT & 2.028 & 2.629& 4 & 141.9 & 172.9 & 7 & 140.1 & 134.4 & 10\\
- \hline
- SDAXPY & 2.101 & 2.755& 4 & 335.1 & 364.8 & 7 & 140.4 & 134.4 & 10\\
- \hline
- STRIAD & 2.228 & 2.613& 4 & 431.6& 460.9 & 7 & 140.4 & 134.4 & 10\\
- \hline
- SUM & 1.393 & 1.969& 4 & 142.0 & 172.9 & 7 & 44.1 & 38.5 & 10\\
- \hline
- TRIAD & 2.162 & 2.725& 4 & 335.1 & 364.9 & 7 & 140.4 & 134.4 & 10\\
- \hline
- UPDATE & 1.938 & 2.528& 4 & 142.0& 172.8 & 7 & 140.1 & 134.3 & 10\\
- \hline
-
-\end{tabular}
-\end{center}
-\label{tab:benchmark_bandwidth_ddr3}
-\end{table}
-
 \begin{table}[!ht]
 \caption{Results for bandwidth and bytes read/written with DDR4-2400.}
 \begin{center}
@@ -124,165 +102,301 @@ In the following, the simulation results of the new simulation frontend, the gem
  & FS & SE & DS & FS & SE & DS & FS & SE & DS\\
  \hline
  \hline
- COPY & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
+
+ COPY & 2.201 & 2.794 & 2.130 & 238.4 & 268.8 & 307.8 & 140.2 & 134.3 & 134.4 \\
  \hline
- DAXPY & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
+ DAXPY & 2.157 & 2.721 & 1.600 & 238.2 & 268.8 & 302.0 & 140.2 & 134.4 & 134.4 \\
  \hline
- INIT & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
+ INIT & 2.058 & 2.737 & 2.040 & 141.9 & 172.6 & 216.1 & 140.0 & 134.1 & 134.4 \\
  \hline
- SDAXPY & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
+ SDAXPY & 2.239 & 2.813 & 2.080 & 335.1 & 364.8 & 403.0 & 140.3 & 134.4 & 134.4 \\
  \hline
- STRIAD & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
+ STRIAD & 2.246 & 2.803 & 2.350 & 335.1 & 460.9 & 494.4 & 140.4 & 134.4 & 134.4 \\
  \hline
- SUM & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
+ SUM & 1.429 & 1.982 & 1.110 & 142.0 & 172.7 & 189.1 & 44.0 & 38.4 & 38.5 \\
  \hline
- TRIAD & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
+ TRIAD & 2.246 & 2.853 & 2.110 & 335.1 & 364.9 & 402.6 & 140.4 & 134.4 & 134.4 \\
  \hline
- UPDATE & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
+ UPDATE & 1.995 & 2.611 & 1.430 & 142.0 & 172.7 & 220.0 & 140.1 & 134.2 & 134.4 \\
  \hline
 
 \end{tabular}
 \end{center}
-\label{tab:benchmark_bandwidth_ddr4}
+\label{tab:benchmark_gem5_bandwidth_ddr4}
 \end{table}
 
-Tables \ref{tab:benchmark_bandwidth_ddr3} and \ref{tab:benchmark_bandwidth_ddr4}
+Listed in Table \ref{tab:benchmark_gem5_bandwidth_ddr4} are three key parameters, specifically the average memory bandwidth and the number of bytes that have been read or written for the DDR4-2400 configuration.
+The results show that all parameters of DRAMSys correlate well with the gem5 statistics.
+While for the average bandwidth the DynamoRIO results are on average 31.0\% lower compared to gem5 SE, this deviation is only 11.1\% for gem5 FS.
+The numbers for the total amount of bytes read result in a deviation of 35.5\% in comparison to gem5 FS and of only 14.6\% in comparison to gem5 SE.
+The amount of bytes written, on the other hand, shows a very small deviation of 5.2\% for gem5 FS and only 0.07\% for gem5 SE.
+Therefore, it can be stated that almost the same number of bytes were written back to DRAM due to cache write-backs.
 
-\begin{table}[!ht]
-\caption{Results for memory access latency and data bus utilization with DDR3-1600.}
+Those numbers are also illustrated in Figure \ref{fig:benchmark_gem5_bandwidth_ddr4}.
+
+\begin{figure}[!ht]
 \begin{center}
-\begin{tabular}{|c|c|c|c|c|c|c|}
- \hline
- \multirow{2}*{Benchmark} & \multicolumn{3}{|c|}{Avg. Access Latency [ns]} & \multicolumn{3}{|c|}{Data Bus Utilization [\%]} \\
- \cline{2-7}
- & FS & SE & DS & FS & SE & DS\\
- \hline
- \hline
- COPY & 2 & 3 & 4 & 5 & 6 & 7\\
- \hline
- DAXPY & 2 & 3 & 4 & 5 & 6 & 7\\
- \hline
- INIT & 2 & 3 & 4 & 5 & 6 & 7\\
- \hline
- SDAXPY & 2 & 3 & 4 & 5 & 6 & 7\\
- \hline
- STRIAD & 2 & 3 & 4 & 5 & 6 & 7\\
- \hline
- SUM & 2 & 3 & 4 & 5 & 6 & 7\\
- \hline
- TRIAD & 2 & 3 & 4 & 5 & 6 & 7\\
- \hline
- UPDATE & 2 & 3 & 4 & 5 & 6 & 7\\
- \hline
+\begin{tikzpicture}
+\begin{axis}[
+    width=\textwidth-0.5cm,
+    ybar=1pt,
+    bar width = 8pt,
+    ymin=0,
+    ymajorgrids,
+    yminorgrids,
+    ylabel={Avg. Bandwidth [GB/s]},
+    symbolic x coords = {COPY, DAXPY, INIT, SDAXPY, STRIAD, SUM, TRIAD, UPDATE},
+    legend style={
+        at={(current bounding box.south-|current axis.south)},
+        anchor=north,
+        legend columns=-1,
+        draw=none,
+        /tikz/every even column/.append style={column sep=0.5cm}
+    },
+    x tick label style={/pgf/number format/1000 sep=},
+    x tick label style={rotate=90,anchor=east},
+    enlargelimits=0.075,
+]
 
-\end{tabular}
+    \addplot
+        coordinates {(COPY,2.201) (DAXPY,2.157) (INIT,2.058) (SDAXPY,2.239) (STRIAD,2.246) (SUM,1.429) (TRIAD,2.246) (UPDATE,1.995)};
+    \addplot
+        coordinates {(COPY,2.794) (DAXPY,2.721) (INIT,2.737) (SDAXPY,2.813) (STRIAD,2.803) (SUM,1.982) (TRIAD,2.853) (UPDATE,2.611)};
+    \addplot
+        coordinates {(COPY,2.130) (DAXPY,1.600) (INIT,2.040) (SDAXPY,2.080) (STRIAD,2.350) (SUM,1.110) (TRIAD,2.110) (UPDATE,1.430)};
+
+    \legend{gem5 FS,gem5 SE,DRAMSys}
+\end{axis}
+\end{tikzpicture}
 \end{center}
-\label{tab:benchmark_access_ddr3}
-\end{table}
+\caption{Average Bandwidth with DDR4-2400.}
+\label{fig:benchmark_gem5_bandwidth_ddr4}
+\end{figure}
 
 \begin{table}[!ht]
-\caption{Results for memory access latency and data bus utilization with DDR4-2400.}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|c|}
- \hline
- \multirow{2}*{Benchmark} & \multicolumn{3}{|c|}{Avg. Access Latency [ns]} & \multicolumn{3}{|c|}{Data Bus Utilization [\%]} \\
- \cline{2-7}
- & FS & SE & DS & FS & SE & DS\\
- \hline
- \hline
- COPY & 2 & 3 & 4 & 5 & 6 & 7\\
- \hline
- DAXPY & 2 & 3 & 4 & 5 & 6 & 7\\
- \hline
- INIT & 2 & 3 & 4 & 5 & 6 & 7\\
- \hline
- SDAXPY & 2 & 3 & 4 & 5 & 6 & 7\\
- \hline
- STRIAD & 2 & 3 & 4 & 5 & 6 & 7\\
- \hline
- SUM & 2 & 3 & 4 & 5 & 6 & 7\\
- \hline
- TRIAD & 2 & 3 & 4 & 5 & 6 & 7\\
- \hline
- UPDATE & 2 & 3 & 4 & 5 & 6 & 7\\
- \hline
-
-\end{tabular}
-\end{center}
-\label{tab:benchmark_access_ddr4}
-\end{table}
-
-\begin{table}[!ht]
-\caption{Results last-level cache (L2) statistics with DDR3-1600.}
+\caption[Results for bandwidth and bytes read/written with DDR3-1600.]{Results for bandwidth and bytes read/written with DDR3-1600. FS denotes gem5 full-system, SE denotes gem5 syscall-emulation, DS denotes DRAMSys.}
 \begin{center}
 \begin{tabular}{|c|c|c|c|c|c|c|c|c|c|}
  \hline
- \multirow{2}*{Benchmark} & \multicolumn{3}{|c|}{Hits} & \multicolumn{3}{|c|}{Misses} & \multicolumn{3}{|c|}{Miss Rate [\%]} \\
+ \multirow{2}*{Benchmark} & \multicolumn{3}{|c|}{Avg. Bandwidth [GB/s]} & \multicolumn{3}{|c|}{Bytes Read [MB]} & \multicolumn{3}{|c|}{Bytes Written [MB]} \\
  \cline{2-10}
  & FS & SE & DS & FS & SE & DS & FS & SE & DS\\
  \hline
  \hline
- COPY & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
+ COPY & 2.031 & 2.698& 2.160 & 238.3 & 268.8& 310.1 & 140.3 & 134.3 & 134.4\\
  \hline
- DAXPY & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
+ DAXPY & 2.070 & 2.627& 1.610 & 238.2 & 268.9 & 301.9 & 140.2 & 134.4 & 134.4\\
  \hline
- INIT & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
+ INIT & 2.028 & 2.629& 2.070 & 141.9 & 172.9 & 216.0 & 140.1 & 134.4 & 134.4\\
  \hline
- SDAXPY & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
+ SDAXPY & 2.101 & 2.755& 2.110 & 335.1 & 364.8 & 404.0 & 140.4 & 134.4 & 134.4\\
  \hline
- STRIAD & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
+ STRIAD & 2.228 & 2.613& 2.370 & 431.6& 460.9 & 494.7 & 140.4 & 134.4 & 134.4\\
  \hline
- SUM & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
+ SUM & 1.393 & 1.969& 1.120 & 142.0 & 172.9 & 189.1 & 44.1 & 38.5 & 38.5\\
  \hline
- TRIAD & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
+ TRIAD & 2.162 & 2.725& 2.140 & 335.1 & 364.9 & 403.8 & 140.4 & 134.4 & 134.4\\
  \hline
- UPDATE & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
+ UPDATE & 1.938 & 2.528& 1.430 & 142.0& 172.8 & 220.0 & 140.1 & 134.3 & 134.4\\
  \hline
 
 \end{tabular}
 \end{center}
-\label{tab:benchmark_cache_ddr3}
+\label{tab:benchmark_gem5_bandwidth_ddr3}
 \end{table}
 
-\begin{table}[!ht]
-\caption{Results last-level cache (L2) statistics with DDR4-2400.}
+\begin{figure}[!ht]
 \begin{center}
-\begin{tabular}{|c|c|c|c|c|c|c|c|c|c|}
- \hline
- \multirow{2}*{Benchmark} & \multicolumn{3}{|c|}{Hits} & \multicolumn{3}{|c|}{Misses} & \multicolumn{3}{|c|}{Miss Rate [\%]} \\
- \cline{2-10}
- & FS & SE & DS & FS & SE & DS & FS & SE & DS\\
- \hline
- \hline
- COPY & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
- \hline
- DAXPY & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
- \hline
- INIT & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
- \hline
- SDAXPY & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
- \hline
- STRIAD & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
- \hline
- SUM & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
- \hline
- TRIAD & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
- \hline
- UPDATE & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
- \hline
+\begin{tikzpicture}
+\begin{axis}[
+    width=\textwidth-0.5cm,
+    ybar=1pt,
+    bar width = 8pt,
+    ymin=0,
+    ymajorgrids,
+    yminorgrids,
+    ylabel={Avg. Bandwidth [GB/s]},
+    symbolic x coords = {COPY, DAXPY, INIT, SDAXPY, STRIAD, SUM, TRIAD, UPDATE},
+    legend style={
+        at={(current bounding box.south-|current axis.south)},
+        anchor=north,
+        legend columns=-1,
+        draw=none,
+        /tikz/every even column/.append style={column sep=0.5cm}
+    },
+    x tick label style={/pgf/number format/1000 sep=},
+    x tick label style={rotate=90,anchor=east},
+    enlargelimits=0.075,
+]
+    \addplot
+        coordinates {(COPY,2.031) (DAXPY,2.070) (INIT,2.028) (SDAXPY,2.101) (STRIAD,2.276) (SUM,1.393) (TRIAD,2.162) (UPDATE,1.938)};
+    \addplot
+        coordinates {(COPY,2.698) (DAXPY,2.627) (INIT,2.629) (SDAXPY,2.755) (STRIAD,2.613) (SUM,1.969) (TRIAD,2.725) (UPDATE,2.528)};
+    \addplot
+        coordinates {(COPY,2.160) (DAXPY,1.610) (INIT,2.070) (SDAXPY,2.110) (STRIAD,2.370) (SUM,1.120) (TRIAD,2.140) (UPDATE,1.430)};
 
-\end{tabular}
+    \legend{gem5 FS,gem5 SE,DRAMSys}
+\end{axis}
+\end{tikzpicture}
 \end{center}
-\label{tab:benchmark_cache_ddr4}
-\end{table}
+\caption{Average Bandwidth with DDR3-1600.}
+\label{fig:benchmark_gem5_bandwidth_ddr3}
+\end{figure}
 
-% \subsubsection{New simulation frontend}
-%
-% \subsubsection{gem5 full-system mode}
-%
-% \subsubsection{gem5 syscall-emulation mode}
+Table \ref{tab:benchmark_gem5_bandwidth_ddr3} and Figure \ref{fig:benchmark_gem5_bandwidth_ddr3} show those same key parameters for the DDR3 configuration.
+Here, the absolute deviations in the average memory bandwidth amount to 27.5\% and 7.0\% for gem5 SE and gem5 FS, respectively.
+The differences in the number of bytes read amount to 31.6\% for gem5 FS and to 14.7\% for gem5 SE.
+Here too, the bytes written show only small deviations of 5.2\% for gem5 FS and 0.02\% for gem5 SE.
+
+It has to be noted that the average memory bandwidth for the new trace player is highly influenced by the configured CPI value.
+Therefore, to match a real system, this value has to be chosen carefully in order to achieve good simulation results for the memory bandwidth.
+
+
+% Latency and simulation time
+
+\begin{figure}[!ht]
+\begin{center}
+\begin{tikzpicture}
+\begin{axis}[
+    width=\textwidth-0.5cm,
+    ybar=1pt,
+    bar width = 8pt,
+    ymin=0,
+    ymajorgrids,
+    yminorgrids,
+    ylabel={Avg. Latency [ns]},
+    symbolic x coords = {COPY, DAXPY, INIT, SDAXPY, STRIAD, SUM, TRIAD, UPDATE},
+    legend style={
+        at={(current bounding box.south-|current axis.south)},
+        anchor=north,
+        legend columns=-1,
+        draw=none,
+        /tikz/every even column/.append style={column sep=0.5cm}
+    },
+    x tick label style={/pgf/number format/1000 sep=},
+    x tick label style={rotate=90,anchor=east},
+    enlargelimits=0.075,
+]
+    \addplot
+        coordinates {(COPY,32.5) (DAXPY,31.4) (INIT,36.0) (SDAXPY,32.7) (STRIAD,34.5) (SUM,24.1) (TRIAD,34.5) (UPDATE,33.0)};
+    \addplot
+        coordinates {(COPY,29.8) (DAXPY,29.5) (INIT,34.8) (SDAXPY,26.4) (STRIAD,29.1) (SUM,27.0) (TRIAD,26.7) (UPDATE,34.2)};
+    \addplot
+        coordinates {(COPY,43.4) (DAXPY,38.8) (INIT,39.5) (SDAXPY,40.4) (STRIAD,40.1) (SUM,37.1) (TRIAD,40.4) (UPDATE,40.5)};
+
+    \legend{gem5 FS,gem5 SE,DRAMSys}
+\end{axis}
+\end{tikzpicture}
+\end{center}
+\caption{Average latency with DDR4-2400.}
+\end{figure}
 
 
 \subsection{Comparison to Ramulator}
 
-\subsection{Simulation Runtime}
+% TODO: Describe the configuration used for this comparison again here (clock frequencies in MHz, etc.).
+
+\begin{table}[!ht]
+\caption{Results for bandwidth and bytes read/written with DDR3-1600 in comparison to Ramulator.}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|c|}
+ \hline
+ \multirow{2}*{Benchmark} & \multicolumn{2}{|c|}{Avg. Bandwidth [GB/s]} & \multicolumn{2}{|c|}{Bytes Read [MB]} & \multicolumn{2}{|c|}{Bytes Written [MB]} \\
+ \cline{2-7}
+ & Ramulator & DRAMSys & Ramulator & DRAMSys & Ramulator & DRAMSys\\
+ \hline
+ \hline
+
+ COPY & 3.053 & 2.93 & 60.2 & 420.3 & 26.4 & 210.1 \\
+ \hline
+ DAXPY & 3.049 & 2.94 & 60.3 & 420.2 & 26.5 & 210.1 \\
+ \hline
+ INIT & 3.063 & 2.76 & 60.9 & 271.6 & 26.8 & 210.1 \\
+ \hline
+ SDAXPY & 3.047 & 2.84 & 60.6 & 570.1 & 26.9 & 210.1 \\
+ \hline
+ STRIAD & 3.058 & 3.18 & 60.7 & 720.4 & 26.7 & 210.1 \\
+ \hline
+ SUM & 3.039 & 2.65 & 61.4 & 270.1 & 27.2 & 60.1 \\
+ \hline
+ TRIAD & 3.057 & 3.31 & 60.6 & 570.1 & 26.7 & 210.1 \\
+ \hline
+ UPDATE & 3.064 & 2.48 & 61.0 & 271.6 & 26.7 & 210.1 \\
+ \hline
+
+\end{tabular}
+\end{center}
+\label{tab:benchmark_ramulator_bandwidth_ddr3}
+\end{table}
+
+\begin{table}[!ht]
+\caption{Results for bandwidth and bytes read/written with DDR4-2400 in comparison to Ramulator.}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|c|}
+ \hline
+ \multirow{2}*{Benchmark} & \multicolumn{2}{|c|}{Avg. Bandwidth [GB/s]} & \multicolumn{2}{|c|}{Bytes Read [MB]} & \multicolumn{2}{|c|}{Bytes Written [MB]} \\
+ \cline{2-7}
+ & Ramulator & DRAMSys & Ramulator & DRAMSys & Ramulator & DRAMSys\\
+ \hline
+ \hline
+
+ COPY & 3.462 & 3.740 & 60.2 & 269.0 & 26.4 & 134.4 \\
+ \hline
+ DAXPY & 3.454 & 3.240 & 60.3 & 268.9 & 26.5 & 134.4 \\
+ \hline
+ INIT & 3.480 & 3.340 & 60.9 & 173.8 & 26.8 & 134.4 \\
+ \hline
+ SDAXPY & 3.475 & 3.430 & 60.6 & 364.9 & 26.9 & 134.4 \\
+ \hline
+ STRIAD & 3.490 & 3.830 & 60.7 & 461.0 & 26.7 & 134.4 \\
+ \hline
+ SUM & 3.496 & 3.040 & 61.4 & 172.9 & 27.2 & 38.4 \\
+ \hline
+ TRIAD & 3.468 & 4.210 & 60.6 & 364.9 & 26.7 & 134.4 \\
+ \hline
+ UPDATE & 3.478 & 3.130 & 61.0 & 173.9 & 26.7 & 134.4 \\
+ \hline
+
+\end{tabular}
+\end{center}
+\label{tab:benchmark_ramulator_bandwidth_ddr4}
+\end{table}
+
+\subsection{Simulation Runtime Analysis}
+
+\begin{figure}[!ht]
+\begin{center}
+\begin{tikzpicture}
+\begin{axis}[
+    width=\textwidth-0.5cm,
+    ybar=1pt,
+    bar width = 8pt,
+    ymin=0,
+    ymajorgrids,
+    yminorgrids,
+    ylabel={Runtime [s]},
+    symbolic x coords = {COPY, DAXPY, INIT, SDAXPY, STRIAD, SUM, TRIAD, UPDATE},
+    legend style={
+        at={(current bounding box.south-|current axis.south)},
+        anchor=north,
+        legend columns=-1,
+        draw=none,
+        /tikz/every even column/.append style={column sep=0.5cm}
+    },
+    x tick label style={/pgf/number format/1000 sep=},
+    x tick label style={rotate=90,anchor=east},
+    enlargelimits=0.075,
+]
+    \addplot
+        coordinates {(COPY,265.07) (DAXPY,301.15) (INIT,216.9) (SDAXPY,338.08) (STRIAD,352.47) (SUM,213.43) (TRIAD,315.63) (UPDATE,262.51)};
+    \addplot
+        coordinates {(COPY,129.4) (DAXPY,149.87) (INIT,97.77) (SDAXPY,180.52) (STRIAD,195.25) (SUM,88.57) (TRIAD,166.9) (UPDATE,122.3)};
+    \addplot
+        coordinates {(COPY,73.096731) (DAXPY,80.801838) (INIT,54.796846) (SDAXPY,97.89146) (STRIAD,113.816785) (SUM,37.074149) (TRIAD,92.063386) (UPDATE,58.63603)};
+
+    \legend{gem5 FS,gem5 SE,DRAMSys}
+\end{axis}
+\end{tikzpicture}
+\end{center}
+\caption{Simulation runtime with DDR4-2400.}
+\end{figure}
diff --git a/inc/appendix.tex b/inc/appendix.tex
index 51bb341..5dcdc49 100644
--- a/inc/appendix.tex
+++ b/inc/appendix.tex
@@ -37,3 +37,140 @@
 \end{tabular}
 \end{center}
 \end{table}
+
+\begin{table}[!hbt]
+\caption{Memory configuration used in comparison simulations against Ramulator.}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|c|}
+ \hline
+ DRAM & \parbox{2.2cm}{\centering Ranks\\per Channel} & \parbox{2cm}{\centering Banks\\per Rank} & Rows & Columns & \parbox{2cm}{\centering Devices\\per Rank} & Width\\
+ \hline
+ \hline
+ DDR3 & 1 & 8 & 32768 & 1024 & 8 & 8\\
+ \hline
+ DDR4 & 1 & 16 & 32768 & 1024 & 8 & 8\\
+ \hline
+
+\end{tabular}
+\end{center}
+\end{table}
+
+\begin{table}[!hbt]
+\caption{Address mappings used in comparison simulations against Ramulator.}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|c|}
+ \hline
+ DRAM & Byte & Column & Bankgroup & Bank & Rank & Row\\
+ \hline
+ \hline
+ DDR3 & 0-2 & 3-12 & - & 13-15 & - & 16-30 \\
+ \hline
+ DDR4 & 0-2 & 3-12 & 13-14 & 15-16 & - & 17-31 \\
+ \hline
+
+\end{tabular}
+\end{center}
+\end{table}
+
+\subsection{Simulation Results}
+\label{sec:appendix_sim_results}
+
+\begin{table}[!ht]
+\caption{Last-level cache (L2) statistics.}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|c|}
+ \hline
+ \multirow{2}*{Benchmark} & \multicolumn{3}{|c|}{\parbox{2.3cm}{Miss Rate [\%]\\(DDR3-1600)}} & \multicolumn{3}{|c|}{\parbox{2.3cm}{Miss Rate [\%]\\(DDR4-2400)}} \\
+ \cline{2-7}
+ & FS & SE & DS & FS & SE & DS \\
+ \hline
+ \hline
+
+ COPY & 96.8 & 100.0 & 76.9 & 96.7 & 100.0 & 76.3 \\
+ \hline
+ DAXPY & 96.7 & 100.0 & 74.8 & 96.7 & 100.0 & 74.9 \\
+ \hline
+ INIT & 94.9 & 100.0 & 70.3 & 94.8 & 99.8 & 70.3 \\
+ \hline
+ SDAXPY & 96.7 & 100.0 & 80.9 & 96.6 & 100.0 & 80.7 \\
+ \hline
+ STRIAD & 97.1 & 100.0 & 83.1 & 96.6 & 100.0 & 83.0 \\
+ \hline
+ SUM & 94.8 & 100.0 & 89.4 & 94.8 & 99.9 & 89.5 \\
+ \hline
+ TRIAD & 96.6 & 100.0 & 80.9 & 96.6 & 100.0 & 80.6 \\
+ \hline
+ UPDATE & 94.9 & 100.0 & 71.6 & 94.8 & 99.9 & 71.6 \\
+ \hline
+
+\end{tabular}
+\end{center}
+\label{tab:benchmark_gem5_cache_ddr4}
+\end{table}
+
+\begin{table}[!ht]
+\caption{Results for simulation time and average memory access latency with DDR3-1600.}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|c|}
+ \hline
+ \multirow{2}*{Benchmark} & \multicolumn{3}{|c|}{Simulation Time [s]} & \multicolumn{3}{|c|}{Avg. Latency [ns]} \\
+ \cline{2-7}
+ & FS & SE & DS & FS & SE & DS\\
+ \hline
+ \hline
+
+ COPY & 0.186 & 0.149 & 0.206 & 47.4 & 35.2 & 55.8 \\
+ \hline
+ DAXPY & 0.183 & 0.153 & 0.271 & 39.1 & 34.8 & 49.4 \\
+ \hline
+ INIT & 0.139 & 0.117 & 0.169 & 39.0 & 42.2 & 48.1 \\
+ \hline
+ SDAXPY & 0.226 & 0.181 & 0.255 & 43.2 & 29.3 & 50.6 \\
+ \hline
+ STRIAD & 0.251 & 0.228 & 0.266 & 36.6 & 37.6 & 50.4 \\
+ \hline
+ SUM & 0.134 & 0.107 & 0.204 & 29.0 & 28.4 & 44.2 \\
+ \hline
+ TRIAD & 0.220 & 0.183 & 0.251 & 40.6 & 32.5 & 51.2 \\
+ \hline
+ UPDATE & 0.146 & 0.121 & 0.248 & 39.8 & 40.3 & 49.9 \\
+ \hline
+
+\end{tabular}
+\end{center}
+\label{tab:benchmark_access_ddr3}
+\end{table}
+
+\begin{table}[!ht]
+\caption{Results for simulation time and average memory access latency with DDR4-2400.}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|c|}
+ \hline
+ \multirow{2}*{Benchmark} & \multicolumn{3}{|c|}{Simulation Time [s]} & \multicolumn{3}{|c|}{Avg. Latency [ns]} \\
+ \cline{2-7}
+ & FS & SE & DS & FS & SE & DS\\
+ \hline
+ \hline
+
+ COPY & 0.172 & 0.144 & 0.208 & 32.5 & 29.8 & 43.4 \\
+ \hline
+ DAXPY & 0.175 & 0.148 & 0.273 & 31.4 & 29.5 & 38.8 \\
+ \hline
+ INIT & 0.137 & 0.112 & 0.172 & 36.0 & 34.8 & 39.5 \\
+ \hline
+ SDAXPY & 0.212 & 0.177 & 0.259 & 32.7 & 26.4 & 40.4 \\
+ \hline
+ STRIAD & 0.210 & 0.212 & 0.268 & 34.5 & 29.1 & 40.1 \\
+ \hline
+ SUM & 0.130 & 0.107 & 0.205 & 24.1 & 27.0 & 37.1 \\
+ \hline
+ TRIAD & 0.212 & 0.175 & 0.254 & 34.5 & 26.7 & 40.4 \\
+ \hline
+ UPDATE & 0.141 & 0.118 & 0.247 & 33.0 & 34.2 & 40.5 \\
+ \hline
+
+\end{tabular}
+\end{center}
+\label{tab:benchmark_gem5_access_ddr4}
+\end{table}
+