From 443ac6bf30d2f2ebe885f5a9f7f0f71165ff9599 Mon Sep 17 00:00:00 2001 From: Derek Christ Date: Sat, 25 Jun 2022 16:50:50 +0200 Subject: [PATCH] Future work --- doc.bib | 18 ++++++++++++++++ inc/2.dynamorio.tex | 2 +- inc/6.implementation.tex | 22 ++++++++++++-------- inc/7.simulation_results.tex | 4 ++-- inc/8.future_work.tex | 40 ++++++++++++++++++++++++++++++++++++ 5 files changed, 74 insertions(+), 12 deletions(-) diff --git a/doc.bib b/doc.bib index db623a1..5c5ecf2 100644 --- a/doc.bib +++ b/doc.bib @@ -170,4 +170,22 @@ doi = {10.1109/LCA.2015.2414456}, } +@TechReport{Fog2022, + author = {Fog, Agner}, + institution = {Technical University of Denmark}, + title = {Instruction tables}, + year = {2022}, + month = jun, + note = {Lists of instruction latencies, throughputs and micro-operation breakdowns for Intel, AMD, and VIA CPUs}, +} + +@InProceedings{Jagtap2016, + author = {Jagtap, Radhika and Diestelhorst, Stephan and Hansson, Andreas and Jung, Matthias and Wehn, Norbert}, + booktitle = {2016 International Conference on Embedded Computer Systems: Architectures, Modeling and Simulation (SAMOS)}, + title = {Exploring system performance using elastic traces: Fast, accurate and portable}, + year = {2016}, + pages = {96--105}, + doi = {10.1109/SAMOS.2016.7818336}, +} + @Comment{jabref-meta: databaseType:bibtex;} diff --git a/inc/2.dynamorio.tex b/inc/2.dynamorio.tex index e1510b2..50db940 100644 --- a/inc/2.dynamorio.tex +++ b/inc/2.dynamorio.tex @@ -69,7 +69,7 @@ A client that already comes with DynamoRIO is DrCacheSim with the DrMemtrace-Fra \begin{table}[!ht] \caption{Client routines that get called by DynamoRIO \cite{Bruening2003}.} \begin{center} -\begin{tabular}{|p{0.6\linewidth} | p{0.4\linewidth}|} +\begin{tabular}{|p{0.55\linewidth} | p{0.35\linewidth}|} \hline Client Routine & Description\\ \hline diff --git a/inc/6.implementation.tex b/inc/6.implementation.tex index 5092e47..4bd760e 100644 --- a/inc/6.implementation.tex +++ b/inc/6.implementation.tex @@ -20,7 
+20,9 @@ It is a purely observational client and does not modify the behavior of the appl Optionally, DrCacheSim converts the addresses of the memory accesses from virtual addresses into physical addresses, which is an important step for simulating a real memory system. The physical address conversion only works on Linux and requires root privileges (or alternatively the CAP\_SYS\_ADMIN capability) in modern kernel versions. The analyzer tool can either be running alongside with DrCacheSim (online) or operate on an internal trace format (offline). -As of writing this thesis, the offline tracing mode does not yet support the physical address conversation, so the online mode has to be used. +Offline tracing has the additional advantage of being able to disassemble the executed instructions afterwards. +For this, the mapping of the executable binaries and shared libraries is stored alongside the trace, enabling the decoding of the instructions from the traced program counter values. +As of writing this thesis, the offline tracing mode has recently gained support for the physical address conversion, but the online mode will be used throughout this thesis as its support is still limited. In case of the online tracing, DrCacheSim consists of two separate processes: \begin{itemize} @@ -64,15 +66,17 @@ This instruction count is used to approximate the delay between the memory acces \begin{textcode} # instruction count,read/write,data size,data address # -<13295366593324052> -4,r,8,1774ef30 -0,r,8,1774ef38 -1,w,8,1774ef28 -2,w,8,1774ee88 -0,r,8,17744728 -1,r,8,238c3fb0 +<13300116157764414> +3,r,8,1190cf3f0 +9,w,16,1190cf270 +2,r,8,10200be48 +0,w,16,1190cf280 +1,w,16,1190cf290 +2,w,16,1190cf2a0 +1,w,16,1190cf2b0 +0,w,16,1190cf2c0 \end{textcode} -\caption{Example of a memory access trace with a timestamp.} +\caption{Example of a memory access trace with a timestamp. 
For each thread, a separate trace file is generated.} \label{list:memtrace} \end{listing} diff --git a/inc/7.simulation_results.tex b/inc/7.simulation_results.tex index 00d0daf..e768a65 100644 --- a/inc/7.simulation_results.tex +++ b/inc/7.simulation_results.tex @@ -4,7 +4,7 @@ In this section the accuracy of the new simulation frontend will be evaluated. After a short discussion about the general expections regarding the accuracy and considerations to make, the simulation results will be presented. The presentation is structured into two parts: -At first simulation statistics of numerous benchmarks are compared against the gem5\cite{Binkert2011} simulator that uses detailed processor models and can be considered as a reference. . +At first simulation statistics of numerous benchmarks are compared against the gem5\cite{Binkert2011} simulator that uses detailed processor models and can be considered as a reference. Secondly, the new simulation frontend is compared against the memory access trace generator tool of the Ramulator DRAM simulator\cite{Ghose2019}. \subsection{Accuracy} @@ -19,4 +19,4 @@ Since the DBI cannot observe the fetching of those instructions, the new simulat \subsection{Comparison to the gem5 Simulator} - +At first, the micro-benchmark suite TheBandwidthBenchmark\cite{} will be used to compare the gem5 full-system simulation as well as the gem5 syscall-emulation simulation modes with the newly developed frontend. diff --git a/inc/8.future_work.tex b/inc/8.future_work.tex index 27dbe2c..ca50c2c 100644 --- a/inc/8.future_work.tex +++ b/inc/8.future_work.tex @@ -1,2 +1,42 @@ \section{Future Work} \label{sec:future_work} + +Due to the complexity of possible memory sub-system configurations, simulation is an indispensable part of the development process of today's systems. +It not only has a high impact on the development cost but also significantly reduces the time-to-market and enables the rapid release of new products. 
+However, the accurate simulation of a specific application takes a long time because of the detailed processor core models. +On the other hand, fixed or relative time memory traces allow faster simulation at the expense of accuracy, which often makes them unsuitable. +To fill this gap, this thesis introduced a new simulation frontend for DRAMSys that is fast and makes only few compromises on accuracy. + +In conclusion, the newly developed instrumentation tool provides a flexible way of generating traces for arbitrary multi-threaded applications. +The mature DRAMSys simulator framework then can be used to explore the design space and vary numerous configuration parameters of the DRAM subsystem to find a well-suited set of options. + +It was shown that in comparison to the well-established full-system simulation framework gem5, only small deviations have to be accepted. +Also, the Pin-Tool based memory access tracing of the Ramulator DRAM simulator was compared to the new frontend. % TODO: briefly summarize the results here +A noteworthy advantage of the newly developed tool is its support for all hardware architectures that DynamoRIO provides (currently IA-32, x86-64, ARM, and AArch64) in contrast to the supported architectures of Pin (IA-32 and x86-64). + +Still, there is room for improvement. +As mentioned in Section~\ref{sec:cache_implementation}, the cache models do not yet guarantee cache coherency due to the lack of a snooping protocol. +Although this can be a complex task, it is possible to implement this in future work. + +A less impactful inaccuracy results from the scheduling of the application's threads in the new simplified core models. +While an application can spawn an arbitrary number of threads, the platform may not be able to process them all in parallel. +Currently, the new trace player does not take this into account and runs all threads in parallel. 
+This, however, could be prevented by recording the processor cores used on the initial system and using this information to better match the scheduling. + +Another inaccuracy can be caused by the hyperthreading of some of today's processors: +While hyperthreading enables the parallel processing of two pipelines in a processor core, those threads do share the same first level cache. +Currently, this is not taken into account and every application thread gets its own first level cache assigned. + +Further room for improvement lies in the consideration of the special prefetch instructions the architectures provide. +DynamoRIO already offers an interface to catch those instructions without much effort. +Support for this would have to be added to the core and cache models as well as the memory trace format. + +The recorded number of computational instructions between each memory access, which is used to estimate the time between those accesses, is multiplied by the clock period of the trace player. +However, this is a vast simplification of the real timing behavior of a processor. +In the future, the DynamoRIO tool could decode those computational instructions and create a better estimate of the execution time of those instructions, based on statistical estimates that have been published before\cite{Abel19a,Fog2022}. + +One significant improvement that still could be applied is the consideration of dependencies between the memory accesses. +Similarly to the elastic trace player of gem5\cite{Jagtap2016}, which captures data load and store dependencies by instrumenting a detailed out-of-order processor model, the DynamoRIO tool could create a dependency graph of the memory accesses using the decoded instructions. +By using this technique, it is possible to also model out-of-order behavior of modern processors and make the simulation more accurate, whereas the current implementation is entirely in-order. 
+ +These mentioned potential improvements could make the new simulation frontend for DRAMSys even more accurate.