From 6d781f5cd8c1e216d85d5111164eaf3d8f11f006 Mon Sep 17 00:00:00 2001 From: Derek Christ Date: Tue, 28 Jun 2022 15:54:15 +0200 Subject: [PATCH] Start of simulation chapter --- Bachelorarbeit.kilepr | 6 +- doc.bib | 12 ++ inc/7.simulation_results.tex | 268 ++++++++++++++++++++++++++++++++++- 3 files changed, 282 insertions(+), 4 deletions(-) diff --git a/Bachelorarbeit.kilepr b/Bachelorarbeit.kilepr index bdbf3df..571bd5e 100644 --- a/Bachelorarbeit.kilepr +++ b/Bachelorarbeit.kilepr @@ -92,9 +92,9 @@ mode=LaTeX [item:inc/8.future_work.tex] archive=true -encoding= -highlight= -mode= +encoding=UTF-8 +highlight=LaTeX +mode=LaTeX [item:inc/appendix.tex] archive=true diff --git a/doc.bib b/doc.bib index 5c5ecf2..145b634 100644 --- a/doc.bib +++ b/doc.bib @@ -188,4 +188,16 @@ doi = {10.1109/SAMOS.2016.7818336}, } +@Article{Qemu, + journal = {A generic and open source machine emulator and virtualizer}, + title = {Q{E}{M}{U}}, + note = {https://www.qemu.org/. Accessed: 2022-06-28}, +} + +@Article{TheBandwidthBenchmark, + author = {Erlangen National High Performance Computing Center}, + title = {The {B}andwidth {B}enchmark}, + note = {https://github.com/RRZE-HPC/TheBandwidthBenchmark. Accessed: 2022-06-28}, +} + @Comment{jabref-meta: databaseType:bibtex;} diff --git a/inc/7.simulation_results.tex b/inc/7.simulation_results.tex index e768a65..9a03f6a 100644 --- a/inc/7.simulation_results.tex +++ b/inc/7.simulation_results.tex @@ -19,4 +19,270 @@ Since the DBI cannot observe the fetching of those instructions, the new simulat \subsection{Comparison to the gem5 Simulator} -At first, the micro-benchmark suite TheBandwithBenchmark\cite{} will be used to compare the gem5 full-system simulation as well as the gem5 syscall-emulation simulation modes with the newly developed frontend. 
+At first, the micro-benchmark suite TheBandwidthBenchmark\cite{TheBandwidthBenchmark}, containing various streaming kernels, will be used to compare the gem5 full-system simulation as well as the gem5 syscall-emulation simulation with the newly developed frontend. +In both cases, the simulation setup consists of a two-level cache hierarchy with the following parameters: + +\begin{table}[!ht] +\caption{Cache parameters.} +\begin{center} +\begin{tabular}{|c|c|c|c|c|c|c|} + \hline + Cache & Size & Associativity & Line size & MSHRs & MSHR targets & WB entries\\ + \hline + \hline + L1 & 32 KiB & 8 & 64 & 4 & 20 & 8\\ + \hline + L2 & 256 KiB & 4 & 64 & 20 & 12 & 8\\ + \hline + +\end{tabular} +\end{center} +\label{tab:cache_parameters} +\end{table} + +In this configuration, every processor core has its own L1 data cache (in the case of gem5 also an L1 instruction cache), whereas the L2 cache is shared between all cores. +The gem5 simulator uses four ARM CPU core models (TimingSimpleCPU, an in-order core model) at 1 GHz, whereas the DynamoRIO traces are obtained using a QEMU\cite{Qemu} ARM virtual machine, configured to use four cores as well. +The DRAM subsystem will be varied between a single-channel DDR3 memory (1600 MT/s) and a single-channel DDR4 memory (2400 MT/s). +% Explain the DRAMSys configuration here! +To match the configuration used in gem5, the memory controller in DRAMSys is set to use a \revabbr{first-ready - first-come, first-served}{FR-FCFS} scheduling policy, a \revabbr{first-in, first-out}{FIFO} response queue policy, and a row-rank-bank-column-channel address mapping. +The trace player operates at the same clock frequency as the gem5 core models. + +The micro-benchmarks themselves are multi-threaded and use all four cores. 
+Their access patterns are as follows: + +\begin{table}[!ht] +\caption{Access patterns of the micro-benchmark kernels\cite{TheBandwidthBenchmark}.} +\begin{center} +\begin{tabular}{|c|c|c|} + \hline + Benchmark kernel & Description & Access pattern \\ + \hline + \hline + INIT & Initialize an array & a = s (store, write allocate) \\ + \hline + SUM & Vector reduction & s += a (load)\\ + \hline + COPY & Memory copy & a = b (load, store, write allocate)\\ + \hline + UPDATE & Update vector & a = a * scalar (load, store)\\ + \hline + TRIAD & Stream triad & a = b + c * scalar (load, store, write allocate)\\ + \hline + DAXPY & Daxpy & a = a + b * scalar (load, store)\\ + \hline + STRIAD & Schönauer triad & a = b + c * d (load, store, write allocate)\\ + \hline + SDAXPY & Schönauer triad & a = a + b * c (load, store)\\ + \hline + +\end{tabular} +\end{center} +\label{tab:benchmark_description} +\end{table} + +In the following, the simulation results of the new simulation frontend, the gem5 full-system simulation and the gem5 syscall-emulation simulation are presented. + +\begin{table}[!ht] +\caption{Results for bandwidth and bytes read/written with DDR3-1600. FS denotes gem5 full-system, SE denotes gem5 syscall-emulation, DS denotes DRAMSys.} +\begin{center} +\begin{tabular}{|c|c|c|c|c|c|c|c|c|c|} + \hline + \multirow{2}*{Benchmark} & \multicolumn{3}{|c|}{Avg. 
Bandwidth [GB/s]} & \multicolumn{3}{|c|}{Bytes Read [MB]} & \multicolumn{3}{|c|}{Bytes Written [MB]} \\ + \cline{2-10} + & FS & SE & DS & FS & SE & DS & FS & SE & DS\\ + \hline + \hline + COPY & 2.031 & 2.698& 4 & 238.3 & 268.8& 7 & 140.3 & 134.3 & 10\\ + \hline + DAXPY & 2.070 & 2.627& 4 & 238.2 & 268.9 & 7 & 140.2 & 134.4 & 10\\ + \hline + INIT & 2.028 & 2.629& 4 & 141.9 & 172.9 & 7 & 140.1 & 134.4 & 10\\ + \hline + SDAXPY & 2.101 & 2.755& 4 & 335.1 & 364.8 & 7 & 140.4 & 134.4 & 10\\ + \hline + STRIAD & 2.228 & 2.613& 4 & 431.6& 460.9 & 7 & 140.4 & 134.4 & 10\\ + \hline + SUM & 1.393 & 1.969& 4 & 142.0 & 172.9 & 7 & 44.1 & 38.5 & 10\\ + \hline + TRIAD & 2.162 & 2.725& 4 & 335.1 & 364.9 & 7 & 140.4 & 134.4 & 10\\ + \hline + UPDATE & 1.938 & 2.528& 4 & 142.0& 172.8 & 7 & 140.1 & 134.3 & 10\\ + \hline + +\end{tabular} +\end{center} +\label{tab:benchmark_bandwidth_ddr3} +\end{table} + +\begin{table}[!ht] +\caption{Results for bandwidth and bytes read/written with DDR4-2400.} +\begin{center} +\begin{tabular}{|c|c|c|c|c|c|c|c|c|c|} + \hline + \multirow{2}*{Benchmark} & \multicolumn{3}{|c|}{Avg. 
Bandwidth [GB/s]} & \multicolumn{3}{|c|}{Bytes Read [MB]} & \multicolumn{3}{|c|}{Bytes Written [MB]} \\ + \cline{2-10} + & FS & SE & DS & FS & SE & DS & FS & SE & DS\\ + \hline + \hline + COPY & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + DAXPY & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + INIT & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + SDAXPY & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + STRIAD & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + SUM & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + TRIAD & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + UPDATE & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + +\end{tabular} +\end{center} +\label{tab:benchmark_bandwidth_ddr4} +\end{table} + +Tables \ref{tab:benchmark_bandwidth_ddr3} and \ref{tab:benchmark_bandwidth_ddr4} + +\begin{table}[!ht] +\caption{Results for memory access latency and data bus utilization with DDR3-1600.} +\begin{center} +\begin{tabular}{|c|c|c|c|c|c|c|} + \hline + \multirow{2}*{Benchmark} & \multicolumn{3}{|c|}{Avg. Access Latency [ns]} & \multicolumn{3}{|c|}{Data Bus Utilization [\%]} \\ + \cline{2-7} + & FS & SE & DS & FS & SE & DS\\ + \hline + \hline + COPY & 2 & 3 & 4 & 5 & 6 & 7\\ + \hline + DAXPY & 2 & 3 & 4 & 5 & 6 & 7\\ + \hline + INIT & 2 & 3 & 4 & 5 & 6 & 7\\ + \hline + SDAXPY & 2 & 3 & 4 & 5 & 6 & 7\\ + \hline + STRIAD & 2 & 3 & 4 & 5 & 6 & 7\\ + \hline + SUM & 2 & 3 & 4 & 5 & 6 & 7\\ + \hline + TRIAD & 2 & 3 & 4 & 5 & 6 & 7\\ + \hline + UPDATE & 2 & 3 & 4 & 5 & 6 & 7\\ + \hline + +\end{tabular} +\end{center} +\label{tab:benchmark_access_ddr3} +\end{table} + +\begin{table}[!ht] +\caption{Results for memory access latency and data bus utilization with DDR4-2400.} +\begin{center} +\begin{tabular}{|c|c|c|c|c|c|c|} + \hline + \multirow{2}*{Benchmark} & \multicolumn{3}{|c|}{Avg. 
Access Latency [ns]} & \multicolumn{3}{|c|}{Data Bus Utilization [\%]} \\ + \cline{2-7} + & FS & SE & DS & FS & SE & DS\\ + \hline + \hline + COPY & 2 & 3 & 4 & 5 & 6 & 7\\ + \hline + DAXPY & 2 & 3 & 4 & 5 & 6 & 7\\ + \hline + INIT & 2 & 3 & 4 & 5 & 6 & 7\\ + \hline + SDAXPY & 2 & 3 & 4 & 5 & 6 & 7\\ + \hline + STRIAD & 2 & 3 & 4 & 5 & 6 & 7\\ + \hline + SUM & 2 & 3 & 4 & 5 & 6 & 7\\ + \hline + TRIAD & 2 & 3 & 4 & 5 & 6 & 7\\ + \hline + UPDATE & 2 & 3 & 4 & 5 & 6 & 7\\ + \hline + +\end{tabular} +\end{center} +\label{tab:benchmark_access_ddr4} +\end{table} + +\begin{table}[!ht] +\caption{Results for last-level cache (L2) statistics with DDR3-1600.} +\begin{center} +\begin{tabular}{|c|c|c|c|c|c|c|c|c|c|} + \hline + \multirow{2}*{Benchmark} & \multicolumn{3}{|c|}{Hits} & \multicolumn{3}{|c|}{Misses} & \multicolumn{3}{|c|}{Miss Rate [\%]} \\ + \cline{2-10} + & FS & SE & DS & FS & SE & DS & FS & SE & DS\\ + \hline + \hline + COPY & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + DAXPY & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + INIT & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + SDAXPY & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + STRIAD & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + SUM & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + TRIAD & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + UPDATE & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + +\end{tabular} +\end{center} +\label{tab:benchmark_cache_ddr3} +\end{table} + +\begin{table}[!ht] +\caption{Results for last-level cache (L2) statistics with DDR4-2400.} +\begin{center} +\begin{tabular}{|c|c|c|c|c|c|c|c|c|c|} + \hline + \multirow{2}*{Benchmark} & \multicolumn{3}{|c|}{Hits} & \multicolumn{3}{|c|}{Misses} & \multicolumn{3}{|c|}{Miss Rate [\%]} \\ + \cline{2-10} + & FS & SE & DS & FS & SE & DS & FS & SE & DS\\ + \hline + \hline + COPY & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + DAXPY & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + INIT & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + SDAXPY & 2 & 
3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + STRIAD & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + SUM & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + TRIAD & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + UPDATE & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\ + \hline + +\end{tabular} +\end{center} +\label{tab:benchmark_cache_ddr4} +\end{table} + +% \subsubsection{New simulation frontend} +% +% \subsubsection{gem5 full-system mode} +% +% \subsubsection{gem5 syscall-emulation mode} + + +\subsection{Comparison to Ramulator} + +\subsection{Simulation Runtime}