diff --git a/Bachelorarbeit.kilepr b/Bachelorarbeit.kilepr index 65c0b5c..bdbf3df 100644 --- a/Bachelorarbeit.kilepr +++ b/Bachelorarbeit.kilepr @@ -86,9 +86,9 @@ mode=LaTeX [item:inc/7.simulation_results.tex] archive=true -encoding= -highlight= -mode= +encoding=UTF-8 +highlight=LaTeX +mode=LaTeX [item:inc/8.future_work.tex] archive=true diff --git a/doc.bib b/doc.bib index 011102c..db623a1 100644 --- a/doc.bib +++ b/doc.bib @@ -120,4 +120,54 @@ doi = {10.1109/SAMOS.2017.8344631}, } +@Book{Hennessy2011, + author = {Hennessy, John L. and Patterson, David A.}, + publisher = {Morgan Kaufmann Publishers Inc.}, + title = {Computer Architecture, Fifth Edition: A Quantitative Approach}, + year = {2011}, + address = {San Francisco, CA, USA}, + edition = {5th}, + isbn = {012383872X}, + abstract = {The computing world today is in the middle of a revolution: mobile clients and cloud computing have emerged as the dominant paradigms driving programming and hardware innovation today. The Fifth Edition of Computer Architecture focuses on this dramatic shift, exploring the ways in which software and technology in the "cloud" are accessed by cell phones, tablets, laptops, and other mobile computing devices. Each chapter includes two real-world examples, one mobile and one datacenter, to illustrate this revolutionary change. Updated to cover the mobile computing revolutionEmphasizes the two most important topics in architecture today: memory hierarchy and parallelism in all its forms.Develops common themes throughout each chapter: power, performance, cost, dependability, protection, programming models, and emerging trends ("What's Next")Includes three review appendices in the printed text. Additional reference appendices are available online.Includes updated Case Studies and completely new exercises.}, +} + +@Article{Ghose2019, + author = {Ghose, Saugata and Li, Tianshi and Hajinazar, Nastaran and Cali, Damla Senol and Mutlu, Onur}, + journal = {Proc. ACM Meas. Anal. Comput. 
Syst.}, + title = {Demystifying Complex Workload-DRAM Interactions: An Experimental Study}, + year = {2019}, + month = {dec}, + number = {3}, + volume = {3}, + abstract = {It has become increasingly difficult to understand the complex interactions between modern applications and main memory, composed of Dynamic Random Access Memory (DRAM) chips. Manufacturers are now selling and proposing many different types of DRAM, with each DRAM type catering to different needs (e.g., high throughput, low power, high memory density). At the same time, memory access patterns of prevalent and emerging applications are rapidly diverging, as these applications manipulate larger data sets in very different ways. As a result, the combined DRAM-workload behavior is often difficult to intuitively determine today, which can hinder memory optimizations in both hardware and software. In this work, we identify important families of workloads, as well as prevalent types of DRAM chips, and rigorously analyze the combined DRAM-workload behavior. To this end, we perform a comprehensive experimental study of the interaction between nine different DRAM types and 115 modern applications and multiprogrammed workloads. We draw 12 key observations from our characterization, enabled in part by our development of new metrics that take into account contention between memory requests due to hardware design. 
Notably, we find that (1) newer DRAM technologies such as DDR4 and HMC often do not outperform older technologies such as DDR3, due to higher access latencies and, also in the case of HMC, poor exploitation of locality; (2) there is no single memory type that can effectively cater to all of the components of a heterogeneous system (e.g., GDDR5 significantly outperforms other memories for multimedia acceleration, while HMC significantly outperforms other memories for network acceleration); and (3) there is still a strong need to lower DRAM latency, but unfortunately the current design trend of commodity DRAM is toward higher latencies to obtain other benefits. We hope that the trends we identify can drive optimizations in both hardware and software design. To aid further study, we open-source our extensively-modified simulator, as well as a benchmark suite containing our applications.}, + address = {New York, NY, USA}, + articleno = {60}, + doi = {10.1145/3366708}, + issue_date = {December 2019}, + keywords = {power consumption, memory systems, performance modeling, experimental characterization, dram, low-power memory, energy, 3d-stacked memory}, + numpages = {50}, + publisher = {Association for Computing Machinery}, + url = {https://doi.org/10.1145/3366708}, +} + +@InProceedings{Gomony2012, + author = {Gomony, Manil Dev and Weis, Christian and Akesson, Benny and Wehn, Norbert and Goossens, Kees}, + booktitle = {2012 Design, Automation \& Test in Europe Conference \& Exhibition (DATE)}, + title = {DRAM selection and configuration for real-time mobile systems}, + year = {2012}, + pages = {51--56}, + doi = {10.1109/DATE.2012.6176432}, +} + +@Article{Kim2016, + author = {Kim, Yoongu and Yang, Weikun and Mutlu, Onur}, + journal = {IEEE Computer Architecture Letters}, + title = {Ramulator: A Fast and Extensible DRAM Simulator}, + year = {2016}, + number = {1}, + pages = {45--49}, + volume = {15}, + doi = {10.1109/LCA.2015.2414456}, +} + @Comment{jabref-meta: 
databaseType:bibtex;} diff --git a/inc/1.introduction.tex b/inc/1.introduction.tex index 5feb017..149b9fe 100644 --- a/inc/1.introduction.tex +++ b/inc/1.introduction.tex @@ -1,3 +1,28 @@ \section{Introduction} \label{sec:introduction} +Today's computing systems accompany us in almost all areas of life in the form of smart devices, computers, or game consoles. With the increasing performance requirements on these devices, not only faster processors are needed, but also high-performance memory systems, especially DRAMs, which are supposed to deliver a lot of bandwidth at low latency. +While these memory systems are very complex and offer a lot of room for configuration, such as the used DRAM standard, the memory controller configuration or the address mapping, there are different requirements for the very different applications\cite{Gomony2012}. +Consequently, system designers are commissioned with the complex task of finding the most effective configurations that match the performance and power constraints with good optimizations applied for the specific use case. + +For the exploration of the design space for these configurations it is impractical to use real systems as they are expensive and are not suitable for rapid prototyping. +To overcome this limitation, it is important to simulate the memory system using a simulation framework with sufficient accuracy. + +Such a simulation framework is DRAMSys\cite{Steiner2020}\cite{Jung2017}, which is based on transaction level modeling and enables the fast simulation of numerous DRAM standards and controller configurations with cycle-accuracy. +Stimuli for the memory system can either be generated using a prerecorded trace file with fixed or relative timestamps, a traffic generator that acts as a state machine and initiates different request patterns, or a detailed processor model of the gem5\cite{Binkert2011} simulation framework. 
+ +However, the two former methods lack in accuracy whereas the latter may provide the sufficient precision but represents a very time-consuming effort. +To fill this gap of fast but accurate traffic generation, a new simulation frontend for DRAMSys is developed and presented in this thesis. + +The methodology this new framework is based on is dynamic binary instrumentation. +It allows the extraction of memory accesses of multi-threaded applications as they are executed on real hardware. +These memory access traces are then played back using a simplified core model and filtered by a cache model before the memory requests are passed to the DRAM. +This allows an accurate modeling of the system and the varying of numerous configuration parameters in a short time. + +The remainder of the thesis is structured as follows: +In section \ref{sec:dynamorio} the used dynamic binary instrumentation framework, DynamoRIO, is introduced. +Section \ref{sec:systemc} presents the modeling language SystemC, which the developed core and cache models are based on. +After that, section \ref{sec:caches} gives a short overview over modern cache architectures and their high-level implementations. +Section \ref{sec:dramsys} introduces the DRAMSys simulator framework and its basic functionalities. +Section \ref{sec:implementation} is concerned with the implementation of the cache model, the processor model and the instrumentation tool. +In section \ref{sec:simulation_results} the accuracy of the new framework is compared against the gem5 and Ramulator\cite{Kim2016} simulators, whereas section \ref{sec:future_work} denotes which future improvements can be achieved. 
diff --git a/inc/5.dramsys.tex b/inc/5.dramsys.tex index 0e57b57..195dadd 100644 --- a/inc/5.dramsys.tex +++ b/inc/5.dramsys.tex @@ -38,4 +38,4 @@ Furthermore, the Trace Analyzer is capable of calculating numerous metrics and c \end{center} \end{figure} -In section \ref{sec:implementation} of this thesis a special trace player for DRAMSys will be developed. +In section \ref{sec:implementation} of this thesis the new special traffic generator for DRAMSys will be developed. diff --git a/inc/6.implementation.tex b/inc/6.implementation.tex index cf4375a..5092e47 100644 --- a/inc/6.implementation.tex +++ b/inc/6.implementation.tex @@ -187,16 +187,18 @@ This, however, conflicted with the goal to develop an trace player module that i To be able to couple such hierarchical initiator modules with DRAMSys, a new trace player interface was developed. The \texttt{TrafficInitiatorIF} interface requires to implement the \texttt{bindTargetSocket()} method for every top-level initiator. -A top-level initiator can either be a single thread, like in previous versions, or a more complex hierarchical module with many internal threads. -This makes it possible to polymorphically threat all initiator modules like this interface and connect them to DRAMSys with the provided bind method. +This makes it possible to polymorphically treat all initiator modules like this interface and connect them to DRAMSys with the provided bind method, abstracting away the concrete initiator socket used. + +So with the new trace player interface, a top-level initiator can either be a single thread, like in previous versions, or a more complex hierarchical module with many internal threads. \subsection{Interconnect} \label{sec:interconnect} As already seen in figure \ref{fig:dbiplayer_with_caches}, interconnection modules are needed to connect the caches with each other. 
-While the implementation of the \textit{MultiCoupler} component is trivial as it only passes the transactions from its so-called target multi-socket to its initiator multi-socket, the \textit{MultiSimpleCoupler} is more complex because it has to internally buffer transactions. +While the implementation of the \textit{MultiCoupler} component is trivial as it only passes the transactions from its so-called \texttt{multi\_passthrough\_target\_socket} to its \texttt{multi\_passthrough\_initiator\_socket}, the \textit{MultiSimpleCoupler} is more complex because it has to internally buffer transactions. + In order to understand why this buffering needed, consider scenario where the L3 cache applies back pressure to one L2 cache. -The L2 cache is not allowed to send further requests but since the target socket of the L3 cache is occupied, this also applies to all other other L2 caches. -This information, however, is not propagated to the other caches, leading to an incorrect behavior. +The L2 cache is not allowed to send further requests due to the exclusion rule but since the target socket of the L3 cache is occupied, this also applies to all other other L2 caches. +This information, however, is not propagated to the other caches, leading to an incorrect behavior if not addressed. To solve this problem, the MultiSimpleCoupler only forwards requests to the L3 cache when it is able to accept them. If this is not the case, the request gets internally buffered and forwarded when an earlier request is being completed with the \texttt{END\_REQ} phase. diff --git a/inc/7.simulation_results.tex b/inc/7.simulation_results.tex index ae2571e..00d0daf 100644 --- a/inc/7.simulation_results.tex +++ b/inc/7.simulation_results.tex @@ -1,2 +1,22 @@ \section{Simulation Results} \label{sec:simulation_results} + +In this section the accuracy of the new simulation frontend will be evaluated. 
+After a short discussion about the general expectations regarding the accuracy and considerations to make, the simulation results will be presented. +The presentation is structured into two parts: +At first, simulation statistics of numerous benchmarks are compared against the gem5\cite{Binkert2011} simulator that uses detailed processor models and can be considered as a reference. +Secondly, the new simulation frontend is compared against the memory access trace generator tool of the Ramulator DRAM simulator\cite{Ghose2019}. + +\subsection{Accuracy} +Generating memory access traces using dynamic binary instrumentation as a faster alternative to the simulation of detailed processor models introduces several inaccuracies, some of which will now be enumerated. + +The most important aspect to consider is that DBI can only instrument the target application but fails to also take the operating system the application is running on into account. +That includes the inability to observe the execution of kernel routines that are directly invoked by the target application through system calls, but also the preemptive scheduling of other programs that are running on the system at the same time. + +Another aspect to consider is the fetching of the instructions itself: +In a real system the binary executable of the target application is placed in the DRAM, along with its data, and gets fetched into the instruction cache while executing. +Since the DBI cannot observe the fetching of those instructions, the new simulator frontend cannot model this memory traffic. + +\subsection{Comparison to the gem5 Simulator} + +