From c97099c7b0697e93c095b34bf11852f0504c3c02 Mon Sep 17 00:00:00 2001 From: Derek Christ Date: Tue, 5 Jul 2022 18:30:52 +0200 Subject: [PATCH] DynamoRIO additions --- doc.bib | 14 +++-- img/tlm_at.tikz | 56 ++++++++--------- inc/0.titlepage.tex | 2 +- inc/2.dynamorio.tex | 126 +++++++++++++++++++++++++++++---------- inc/6.implementation.tex | 2 +- 5 files changed, 133 insertions(+), 67 deletions(-) diff --git a/doc.bib b/doc.bib index 145b634..43d5f80 100644 --- a/doc.bib +++ b/doc.bib @@ -188,10 +188,10 @@ doi = {10.1109/SAMOS.2016.7818336}, } -@Article{Qemu, - journal = {A generic and open source machine emulator and virtualizer}, - title = {Q{E}{M}{U}}, - note = {https://www.qemu.org/. Accessed: 2022-06-28}, +@Article{Valgrind, + journal = {Valgrind is an instrumentation framework for building dynamic analysis tools.}, + title = {Valgrind}, + note = {https://valgrind.org/. Accessed: 2022-07-05}, } @Article{TheBandwidthBenchmark, @@ -200,4 +200,10 @@ note = {https://github.com/RRZE-HPC/TheBandwidthBenchmark. Accessed: 2022-06-28}, } +@Article{Qemu, + journal = {A generic and open source machine emulator and virtualizer}, + title = {Q{E}{M}{U}}, + note = {https://www.qemu.org/. 
Accessed: 2022-06-28}, +} + @Comment{jabref-meta: databaseType:bibtex;} diff --git a/img/tlm_at.tikz b/img/tlm_at.tikz index 59a4d3d..6bb5b90 100644 --- a/img/tlm_at.tikz +++ b/img/tlm_at.tikz @@ -2,44 +2,44 @@ \begin{pgfonlayer}{nodelayer} \node [style=none] (0) at (0, 1.5) {}; \node [style=none] (1) at (0, -21) {}; - \node [style=none] (2) at (14, 1.5) {}; - \node [style=none] (3) at (14, -21) {}; + \node [style=none] (2) at (16, 1.5) {}; + \node [style=none] (3) at (16, -21) {}; \node [style=none] (4) at (0, 2.25) {Initiator}; - \node [style=none] (5) at (14, 2.25) {Target}; + \node [style=none] (5) at (16, 2.25) {Target}; \node [style=none] (6) at (-1, 0.5) {}; - \node [style=none] (7) at (-1, -18.5) {}; + \node [style=none] (7) at (-1, -20) {}; \node [style=align text] (8) at (-2, -8) {Time}; \node [style=none] (9) at (0, -0.5) {}; - \node [style=none] (10) at (14, -0.5) {}; - \node [style=none] (11) at (14, -0.75) {}; - \node [style=none] (12) at (0, -0.75) {}; + \node [style=none] (10) at (16, -0.5) {}; + \node [style=none] (11) at (16, -1) {}; + \node [style=none] (12) at (0, -1) {}; \node [style=none] (13) at (0, -5.5) {}; - \node [style=none] (14) at (14, -5.5) {}; + \node [style=none] (14) at (16, -5.5) {}; \node [style=none] (15) at (0, -10.5) {}; - \node [style=none] (16) at (14, -10.5) {}; + \node [style=none] (16) at (16, -10.5) {}; \node [style=none] (17) at (2.5, 0) {BEGIN\_REQ}; - \node [style=none] (18) at (12, -1.5) {END\_REQ}; - \node [style=none] (19) at (11.5, -5) {BEGIN\_RESP}; + \node [style=none] (18) at (14, -1.75) {END\_REQ}; + \node [style=none] (19) at (13.5, -5) {BEGIN\_RESP}; \node [style=none] (20) at (2.25, -10) {END\_RESP}; - \node [style=none] (21) at (17.5, -3) {}; - \node [style=none] (22) at (17.5, -7) {}; - \node [style=align text] (23) at (18.25, -5) {Back-\\pressure}; - \node [style=none] (24) at (17, -3) {}; - \node [style=none] (25) at (17, -7) {}; + \node [style=none] (21) at (19.5, -3) {}; + \node [style=none] (22) at 
(19.5, -7) {}; + \node [style=align text] (23) at (20.25, -5) {Back-\\pressure}; + \node [style=none] (24) at (19, -3) {}; + \node [style=none] (25) at (19, -7) {}; \node [style=none] (26) at (0, -3) {}; - \node [style=none] (27) at (14, -3) {}; + \node [style=none] (27) at (16, -3) {}; \node [style=none] (28) at (0, -7) {}; - \node [style=none] (29) at (14, -7) {}; - \node [style=none] (30) at (12, -6.5) {END\_REQ}; + \node [style=none] (29) at (16, -7) {}; + \node [style=none] (30) at (14, -6.5) {END\_REQ}; \node [style=none] (31) at (2.5, -2.5) {BEGIN\_REQ}; - \node [style=none] (32) at (0, -7.25) {}; - \node [style=none] (33) at (14, -7.25) {}; - \node [style=none] (34) at (2.5, -8) {BEGIN\_REQ}; + \node [style=none] (32) at (0, -7.5) {}; + \node [style=none] (33) at (16, -7.5) {}; + \node [style=none] (34) at (2.5, -8.25) {BEGIN\_REQ}; \node [style=none] (38) at (0, -12.75) {}; - \node [style=none] (39) at (14, -12.75) {}; - \node [style=none] (40) at (8.25, -12.25) {BEGIN\_RESP (Skip END\_REQ)}; + \node [style=none] (39) at (16, -12.75) {}; + \node [style=none] (40) at (10.25, -12.25) {BEGIN\_RESP (Skip END\_REQ)}; \node [style=none] (41) at (0, -17) {}; - \node [style=none] (42) at (14, -17) {}; + \node [style=none] (42) at (16, -17) {}; \node [style=none] (43) at (2.25, -16.5) {END\_RESP}; \node [style=none] (44) at (-3.5, -12.75) {}; \node [style=none] (45) at (-3.5, -17) {}; @@ -47,10 +47,10 @@ \node [style=none] (47) at (-3, -12.75) {}; \node [style=none] (48) at (-3, -17) {}; \node [style=none] (49) at (0, -19) {}; - \node [style=none] (50) at (14, -19) {}; + \node [style=none] (50) at (16, -19) {}; \node [style=none] (51) at (0, -18.5) {}; - \node [style=none] (52) at (14, -18.5) {}; - \node [style=none] (53) at (11.5, -18) {BEGIN\_RESP}; + \node [style=none] (52) at (16, -18.5) {}; + \node [style=none] (53) at (13.5, -18) {BEGIN\_RESP}; \node [style=none] (54) at (3.75, -19.75) {TLM\_COMPLETED}; \end{pgfonlayer} \begin{pgfonlayer}{edgelayer} diff 
--git a/inc/0.titlepage.tex b/inc/0.titlepage.tex index a7ecb6d..5adaf42 100644 --- a/inc/0.titlepage.tex +++ b/inc/0.titlepage.tex @@ -1,6 +1,6 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{titlepage} -\setcounter{page}{-1} +\setcounter{page}{1} \begin{center} \includegraphics*[scale=2.5]{img/TUKL_LOGO.pdf}\\[3ex] diff --git a/inc/2.dynamorio.tex b/inc/2.dynamorio.tex index 50db940..1cef265 100644 --- a/inc/2.dynamorio.tex +++ b/inc/2.dynamorio.tex @@ -2,22 +2,41 @@ \label{sec:dynamorio} This section will give a short overview of the dynamic binary instrumentation tool DynamoRIO, which will be used throughout this thesis. -It is mainly based on on the chapter \textit{DynamoRIO} and \textit{Code Cache} of \cite{Bruening2004} as well as on \cite{Bruening2003}. +The explained topics are mainly based on the chapter \textit{DynamoRIO} and \textit{Code Cache} of \cite{Bruening2004} as well as on \cite{Bruening2003}. \subsection{Dynamic Binary Instrumentation} \label{sec:dbi} -\revabbr{Dynamic binary instrumentation}{DBI} is a method for analyzing and manipulating the behavior of a binary application while it is running. -This is achieved through the injection of additional instructions into the instruction trace of the target application. +\revabbr{Dynamic binary instrumentation}{DBI} is a method for analyzing, profiling, manipulating and optimizing the behavior of a binary application while it is executing. +This is achieved through the injection of additional instructions into the instruction trace of the target application, that either accumulate statistics or intervene in the instruction trace. -Debuggers, on the other hand, use special breakpoint instructions (e.g. INT3 on x86 or BKPT on ARM) that get injected at specific places in the code, raising a debug exception when reaching it. 
-At those exceptions a context switch to the operating system kernel will be performed, however, those context switches result in a significant performance penalty as the processor state has to be saved and restored afterwards, making it slower than DBI. +In comparison, debuggers use special breakpoint instructions (e.g. INT3 on x86 or BKPT on ARM) that get injected at specific places in the code, raising a debug exception when reaching it. +At those exceptions a context switch to the operating system kernel will be performed. +However, those context switches result in a significant performance penalty as the processor state has to be saved and restored afterwards, making it less efficient than DBI. -Because the instrumentation tool runs in the same process as the target application, it is important that it operates transparently, meaning that it will not affect the application behavior in unintended ways. -This is a special challenge as the dynamic instrumentation is not allowed to use the same memory routines or input/output buffering as the target application \cite{Bruening2003}. +DBI tools can either start the target application by themselves or attach to the application's process dynamically. +The former method allows instrumentation of even the early startup stage of the application whereas the latter method might be used if the application has to be first brought into a certain state or the process cannot be restarted due to reliability reasons. +Some DBI tools also allow for directly implementing the DBI framework into the application's source code. +While this removes the flexibility of observing applications that are only available in binary form, this enables control over the DBI tool using its application interface. +With this method, it is possible to precisely instrument only a specific code region of interest and otherwise disable the tool for performance reasons. 
-In contrast to static code analysis, which cannot predict the execution path of the program, the full runtime information is available to the dynamic instrumentation. +In all cases, the instrumentation tool executes in the same process as the target application and implants itself into its address space. +While this enables great control of the DBI tool over the target application, it becomes important that it operates transparently, meaning that it will not affect the application's behavior in unintended ways. +This is a special challenge as the instrumentation tool as well as the user-written instrumentation clients are not allowed to use library routines for memory operations/allocation, synchronization or input/output buffering that interfere with the target application \cite{Bruening2003}. +This is especially the case with library routines that are not reentrant, which means they are unsafe to call concurrently. +The dispatcher of the DBI tool can run in arbitrary places, also during non-reentrant routines. +When the instrumentation tool or user-written client then calls the same non-reentrant routine, undefined behavior would be the consequence. +Although it is evident, the user-written client should make no assumptions about the running system's behavior and should restore all modified registers and processor states unless it is an intentional interference with the application. +Most DBI tools offer the use of two distinct methods of injecting user code into the application's trace: +In one case the framework saves all relevant registers and flags by itself and dispatches the execution to a user-defined function. +This is the easiest method, but comes at the cost of the described context switch. +The more advanced approach is the injection of few but sufficient instructions directly into the application's instruction trace. +Here, it is the responsibility of the user to save and restore all altered states. 
+ +Generally speaking, the application should have no way to detect that it is being instrumented by a DBI tool and should execute the same way as it would normally, even when the application itself commits incorrect behavior such as accessing invalid memory regions. + +In summary, dynamic code analysis has the full runtime information available, unlike static code analysis, which cannot predict the execution path of the program. So DBI can be a mature choice for examining the runtime behavior of a binary application in a performant way. The following section \ref{sec:dynamorio_core} will explain how the core functionality of the DBI tool DynamoRIO works. @@ -25,48 +44,74 @@ The following section \ref{sec:dynamorio_core} will explain how the core functio \subsection{Core Functionality} \label{sec:dynamorio_core} -A simple way observe and potentially modify the instructions of an application during execution is the use of a interpretation engine, where the binary gets emulated. -This approach, however, might be powerful but is very slow. +A simple way to observe and potentially modify the instructions of an application during execution is the use of an interpretation engine that emulates the binary executable in its entirety. +One widely used framework that uses this technique is for example Valgrind~\cite{Valgrind}. +At its core, Valgrind uses a virtual machine and just-in-time compilation to instrument the target application. +However, this approach might be powerful but comes at the cost of significantly reduced performance. -DynamoRIO on the other hand uses a so called \textit{code cache} where \textit{basic blocks} get copied into prior to execution. -A basic block is a sequence of instructions that end with a single control transfer instruction. -In the code cache basic blocks get extended by two \textit{exit stubs}, ensuring that at the end the control is transferred back to DynamoRIO via a context switch. 
-From there the applications state is saved and the next basic block will be copied into the code cache, modified and executed after restoring the applications state. -Basic blocks that are already in the code cache get directly executed, however, a context switch is still needed to determine the next basic block. -To reduce this overhead, DynamoRIO can \textit{link} two basic blocks together that were targeted by a direct branch, avoiding the context switch. -For indirect branches it is not possible to link them as their target basic blocks may vary and DynamoRIO needs to translate the branch address to the address of the basic block in the code cache. -However, basic block that are often executed in a sequence are be merged into a \textit{trace}. -At the end of each basic block, a additional check is performed to determine if the indirect branch target will stay in the same trace, possibly preventing the context switch. +DynamoRIO on the other hand uses a so-called \textit{code cache} where \textit{basic blocks} get copied into prior to execution. +A basic block is a sequence of instructions extracted from the target application's binary that end with a single control transfer instruction. +In the code cache, the instrumentation instructions will directly be inserted. + +To be able to execute the modified code, basic blocks in the code cache get extended by two \textit{exit stubs}, ensuring that at the end the control is transferred back to DynamoRIO via a context switch. +From there the application's and processor state is saved and the next basic block will be copied into the code cache, modified and executed after restoring the previously saved state. +Basic blocks that are already in the code cache get directly executed without copying, however, a context switch is still needed to determine the next basic block to execute. 
+ +To reduce this overhead and avoid a context switch, DynamoRIO can \textit{link} two basic blocks together that were targeted by a direct branch, i.e. branches whose target address will not change during runtime. +To achieve this, the target address has to be converted in-place to point to the new address in the code cache and not the original one in the mapped binary executable. +For indirect branches, branches whose target address is calculated at runtime, it is not possible to link them as their target basic blocks may vary. +However, basic blocks that are often executed in a sequence are merged into a \textit{trace}. +At the end of each basic block, an additional check is performed to determine if the indirect branch target will stay in the same trace, possibly preventing the context switch. +Those often executed parts of the application code are also referred to as \textit{hot code} and their optimization using traces is worthwhile, even if this results in multiple copies of the same basic block in the code cache. The generic term for a basic block or a trace is \textit{fragment}. -Figure \ref{fig:dynamorio} illustrates the functionality of DynamoRIO. -The application code will get loaded by the dispatcher, modified by the basic block builder and finally be executed in the code cache. - -% vlt noch auf transparenz eingehen wie dies gelöst wird. +Figure \ref{fig:dynamorio} illustrates the internal architecture and functionality of DynamoRIO. +The application code gets loaded by the dispatcher, modified by the basic block builder, copied into the code cache and finally executed. 
\input{img/thesis.tikzstyles} -\begin{figure}[!ht] +\begin{figure} \begin{center} \tikzfig{img/dynamorio} -\caption{DynamoRIO runtime code manipulation layer \cite{Bruening2003}.} +\caption{DynamoRIO runtime code manipulation layer \cite{Bruening2004}.} \label{fig:dynamorio} \end{center} \end{figure} +As mentioned in section \ref{sec:dbi}, it is important for a DBI tool to operate transparently. +DynamoRIO takes a number of measures to achieve this goal, some of which will now be explained~\cite{Bruening2004}. +As sharing libraries with the target application can cause transparency issues, especially when using non-reentrant routines or routines that alter static state such as error codes, DynamoRIO directly interfaces with the system using system calls and even avoids using the C standard library (e.g. \textit{glibc} on Linux). +The same should also apply for user-written instrumentation clients (introduced in more detail in section \ref{sec:dynamorio_client}) but the direct usage of system calls is discouraged as this bypasses the internal monitoring of DynamoRIO for changes that affect the process's address space. +Instead, DynamoRIO provides a cross-platform API for generic routines such as file system operations and memory allocation. +To guarantee thread-transparency, DynamoRIO does not spawn new threads by itself, but uses the application threads instead and creates one DynamoRIO context for each. +When an instrumentation client needs to spawn threads, they should be hidden from introspection by the application. +Client code should also not alter the application stack in any way, as some specialized applications access data beyond the top of the stack. +Alternatively, DynamoRIO provides a separate stack that should be used instead to store temporary data. +To remain undetected, it is also required for DynamoRIO to protect its own memory from malicious reads or writes from the application. 
+Those should, like in the native case, raise an exception as unallocated data is accessed. +However, as these memory regions are in fact allocated, DynamoRIO has to produce those exceptions itself to remain transparent. +When the application branches to a dynamically calculated address, DynamoRIO has to translate this address to the corresponding address of the basic block in the code cache. +But also in the backward case, whenever a code cache address is exposed to the application, it has to be converted back to the corresponding address in the mapped address region of the binary executable. + +As can be seen, DynamoRIO makes significant effort to ensure transparency. +However, factors such as timing deviations cannot be taken into account, since the instrumentation code consists of additional instructions that must be executed. +So a sophisticated application could try to detect the presence of an instrumentation tool by estimating and comparing the execution time of its own routines. + \subsection{Clients} \label{sec:dynamorio_client} -Currently, the presence of DynamoRIO does not have an effect other than that the application is executed from the code cache. -Clients make it possible to dynamically modify the basic blocks, either to alter the application behavior or to insert observational instructions. +With the inner workings introduced so far, the presence of DynamoRIO does not have an effect other than that the application is executed from the code cache. +DynamoRIO provides a programming interface to develop external so-called \textit{clients}~\cite{Bruening2004}. +Clients are user-written instrumentation tools and make it possible to dynamically modify the basic blocks, either to alter the application behavior or to insert observational instructions. A DynamoRIO client is compiled into a shared library and passed to the \textit{drrun} utility using a command line option. 
-It then can implement a number of hook functions that get called by DynamoRIO such as the basic block creation or the trace creation event. -It is important to note that hooks like the basic block creation function do not get called when this basic block is executed but when it is generated and placed into the code cache. +Clients implement a number of hook functions that will be called by DynamoRIO for certain events such as the creation of a basic block or of a trace. +Generally, there are two classes of hooks: +Those that execute on basic block creation instrument all of the application code and those that execute on trace generation are only interested in frequently executed code. +It is important to note that the hooks for basic block and trace generation are not called every time this code sequence is executed but when these basic blocks are generated and placed into the code cache. +So the required instructions have to be inserted into the basic block instruction stream in this stage, rather than implementing the observational or manipulative behavior in the hook function itself. -The table \ref{tab:dynamorio_api} lists the most important hooks that a client can implement. +The table \ref{tab:dynamorio_api} lists some of the most important hooks that a client can implement. -A client that already comes with DynamoRIO is DrCacheSim with the DrMemtrace-Framework, which will be further explained in section \ref{sec:analysis_tool}. - -\begin{table}[!ht] +\begin{table} \caption{Client routines that get called by DynamoRIO \cite{Bruening2003}.} \begin{center} \begin{tabular}{|p{0.55\linewidth} | p{0.35\linewidth}|} @@ -94,3 +139,18 @@ A client that already comes with DynamoRIO is DrCacheSim with the DrMemtrace-Fra \end{center} \label{tab:dynamorio_api} \end{table} + +Most of the hooks receive a \texttt{void *context} pointer to the thread-local machine context through their parameter lists, which then needs to be passed to the code manipulation routines. 
+Those routines are available through DynamoRIO's rich code manipulation API that enables the generation, the encoding and the decoding of instructions. +Since the processor's flag and general purpose registers might be altered by executing those new instructions, it is necessary to store them before and restore them after executing them to guarantee transparency. +DynamoRIO also provides client routines to store those flags and registers in thread-local slots. +An alternative to manually storing and restoring is, as previously mentioned in section \ref{sec:dbi}, so-called \textit{clean calls} where DynamoRIO takes the responsibility for storing and restoring the processor's state. +The clean call then dispatches to a user-defined function that will be run every time the basic block executes by modifying the program counter. +This comes with the great advantage of not having to implement the observational or manipulative behavior using assembly instructions, instead the compiler of the client takes care of converting the clean call function into machine code. +However, since DynamoRIO cannot know which registers need to be stored as this depends on the user code, it has to preserve the whole processor's state. +The dispatching to the clean call function is essentially a context switch and therefore has a great impact on the performance. +So it is up to the user to decide whether the gain in performance by avoiding clean calls outweighs the higher development effort. + +An exemplary client that already comes with DynamoRIO is \textit{DrCacheSim}. +Together with the \textit{DrMemtrace-Framework}, this client provides an easy way to trace the executed instructions of the application and the memory accesses it makes. +This framework will be further explained in section \ref{sec:analysis_tool}. 
diff --git a/inc/6.implementation.tex b/inc/6.implementation.tex index fb1fafa..2ad9e17 100644 --- a/inc/6.implementation.tex +++ b/inc/6.implementation.tex @@ -248,7 +248,7 @@ If this is not the case, the request gets internally buffered and forwarded when For illustrating this further, a simple example can be assumed: One L2 cache needs to request a cache line from the underlying L3 cache. The MultiSimpleCoupler receives the \texttt{BEGIN\_REQ} phase and places it into its PEQ. -From there, an internal routing table is updated to be able to send the response back through the correct multi-socket binding afterwards. +From there, a hash table used as an internal routing table is updated to be able to send the response back through the correct multi-socket binding afterwards. As the L3 cache is currently not applying backpressure onto the interconnect, it can forward the transaction with the \texttt{BEGIN\_REQ} phase to the L3 cache. Until the L3 cache responds with the \texttt{END\_REQ} phase, the interconnect defers any new request from any L2 cache and buffers the payload objects in an internal data structure. When the \texttt{END\_REQ} phase is received, the next transaction from this request buffer is sent to the L3 cache.