From b8d75bf8f10aad965e4f045c00c642494a5e3067 Mon Sep 17 00:00:00 2001 From: Derek Christ Date: Wed, 25 May 2022 19:13:05 +0200 Subject: [PATCH] Non-blocking caches --- doc.tex | 2 +- img/mshr_file.tikz | 56 +++++++++++++++++++++ img/thesis.tikzstyles | 10 ++++ img/virtual_address.tikz | 32 ++++++++++++ img/virtual_address_conversion.tikz | 66 ++++++++++++++++++++++++ inc/2.dynamorio.tex | 8 +-- inc/4.caches.tex | 78 ++++++++++++++++++++++++----- inc/6.implementation.tex | 4 +- 8 files changed, 237 insertions(+), 19 deletions(-) create mode 100644 img/mshr_file.tikz create mode 100644 img/virtual_address.tikz create mode 100644 img/virtual_address_conversion.tikz diff --git a/doc.tex b/doc.tex index 60f805e..ee328cf 100644 --- a/doc.tex +++ b/doc.tex @@ -38,7 +38,7 @@ %\usepackage{listings} %\input{subsections.sty} \setcounter{secnumdepth}{5} -\setcounter{tocdepth}{5} +\setcounter{tocdepth}{5} \numberwithin{equation}{section} \numberwithin{figure}{section} diff --git a/img/mshr_file.tikz b/img/mshr_file.tikz new file mode 100644 index 0000000..f6930d2 --- /dev/null +++ b/img/mshr_file.tikz @@ -0,0 +1,56 @@ +\begin{tikzpicture} + \begin{pgfonlayer}{nodelayer} + \node [style=block address] (19) at (7.5, -8.5) {}; + \node [style=block address] (18) at (7.25, -8.75) {}; + \node [style=none] (0) at (-0.5, 0) {}; + \node [style=none] (1) at (-0.5, -10) {}; + \node [style=none] (2) at (19, 0) {}; + \node [style=none] (3) at (19, -10) {}; + \node [style=none] (4) at (1.5, -1) {MSHR 1}; + \node [style=none] (5) at (1.5, -3) {MSHR 2}; + \node [style=none] (6) at (1.5, -4) {\vdots}; + \node [style=none] (7) at (1.5, -5.5) {MSHR n}; + \node [style=block address] (8) at (7, -1) {Block Address}; + \node [style=block address] (9) at (14, -1) {Target Information}; + \node [style=valid] (10) at (18, -1) {V}; + \node [style=block address] (11) at (7, -3) {Block Address}; + \node [style=block address] (12) at (14, -3) {Target Information}; + \node [style=valid] (13) at (18, -3) {V}; 
+ \node [style=block address] (14) at (7, -5.5) {Block Address}; + \node [style=block address] (15) at (14, -5.5) {Target Information}; + \node [style=valid] (16) at (18, -5.5) {V}; + \node [style=block address] (17) at (7, -9) {Comparators}; + \node [style=none] (20) at (5.5, -1.75) {}; + \node [style=none] (21) at (5.5, -2.25) {}; + \node [style=none] (22) at (5.5, -3.75) {}; + \node [style=none] (23) at (5.5, -4.75) {}; + \node [style=none] (24) at (5.5, -6.25) {}; + \node [style=none] (25) at (5.5, -7.75) {}; + \node [style=none] (26) at (7, -3.75) {}; + \node [style=none] (27) at (7, -4.75) {}; + \node [style=none] (28) at (7, -6.25) {}; + \node [style=none] (29) at (7, -7.75) {}; + \node [style=none] (30) at (9.5, -6.25) {}; + \node [style=none] (31) at (9.5, -7.75) {}; + \node [style=none] (32) at (8.25, -7) {\dots}; + \node [style=none] (33) at (7, -11) {}; + \node [style=none] (34) at (7.75, -10.5) {Hit}; + \node [style=none] (35) at (9.25, 0.5) {MSHR File}; + \node [style=none] (36) at (-1.5, -9) {}; + \node [style=none] (37) at (1.25, -8.5) {Address}; + \end{pgfonlayer} + \begin{pgfonlayer}{edgelayer} + \draw (20.center) to (21.center); + \draw (22.center) to (23.center); + \draw (24.center) to (25.center); + \draw (26.center) to (27.center); + \draw (28.center) to (29.center); + \draw (30.center) to (31.center); + \draw (17) to (33.center); + \draw (0.center) to (2.center); + \draw (2.center) to (3.center); + \draw (3.center) to (1.center); + \draw (1.center) to (0.center); + \draw (36.center) to (17); + \end{pgfonlayer} +\end{tikzpicture} diff --git a/img/thesis.tikzstyles b/img/thesis.tikzstyles index 41945ce..39fa7cb 100644 --- a/img/thesis.tikzstyles +++ b/img/thesis.tikzstyles @@ -19,6 +19,13 @@ \tikzstyle{os}=[fill={rgb,255: red,207; green,207; blue,207}, draw=black, shape=rectangle, minimum width=6cm] \tikzstyle{hardware}=[fill={rgb,255: red,174; green,169; blue,174}, draw=black, shape=rectangle, minimum width=6cm] \tikzstyle{align 
text}=[fill=none, draw=none, rotate=90, align=center] +\tikzstyle{tlb}=[fill={rgb,255: red,230; green,230; blue,230}, draw=black, shape=rectangle, minimum width=3.5cm, minimum height=2cm] +\tikzstyle{asid}=[fill={rgb,255: red,230; green,230; blue,230}, draw=black, shape=rectangle] +\tikzstyle{cache 2}=[fill={rgb,255: red,230; green,230; blue,230}, draw=black, shape=rectangle, minimum width=3.5cm, minimum height=2.75cm] +\tikzstyle{page frame}=[fill={rgb,255: red,179; green,179; blue,179}, draw=black, shape=rectangle, minimum width=6.5cm] +\tikzstyle{cache data}=[fill={rgb,255: red,230; green,230; blue,230}, draw=black, shape=rectangle, minimum width=10cm] +\tikzstyle{block address}=[fill={rgb,255: red,200; green,200; blue,255}, draw=black, shape=rectangle, minimum height=0.75cm, minimum width=3.5cm] +\tikzstyle{valid}=[fill={rgb,255: red,200; green,200; blue,255}, draw=black, shape=rectangle, minimum height=0.75cm, minimum width=0.6cm] % Edge styles \tikzstyle{dashed line}=[-, dashed] @@ -32,3 +39,6 @@ \tikzstyle{latex arrow}=[-, -latex] \tikzstyle{context switch}=[-, dashed, fill={rgb,255: red,222; green,222; blue,222}] \tikzstyle{latex arrow dashed}=[-, -latex, dashed] +\tikzstyle{thin}=[-, very thin] +\tikzstyle{virtual page number}=[-, fill={rgb,255: red,179; green,179; blue,179}] +\tikzstyle{page offset}=[-, fill={rgb,255: red,247; green,247; blue,247}] diff --git a/img/virtual_address.tikz b/img/virtual_address.tikz new file mode 100644 index 0000000..2fa2348 --- /dev/null +++ b/img/virtual_address.tikz @@ -0,0 +1,32 @@ +\begin{tikzpicture} + \begin{pgfonlayer}{nodelayer} + \node [style=none] (0) at (0, 0) {}; + \node [style=none] (2) at (16, 0) {}; + \node [style=none] (3) at (16, -1) {}; + \node [style=none] (4) at (19, 0) {}; + \node [style=none] (5) at (19, -1) {}; + \node [style=none] (6) at (24, 0) {}; + \node [style=none] (7) at (24, -1) {}; + \node [style=none] (9) at (0, -1) {}; + \node [style=none] (10) at (8, -0.5) {Virtual Page Number}; + \node 
[style=none] (13) at (0.5, 0.5) {63}; + \node [style=none] (14) at (23.5, 0.5) {0}; + \node [style=none] (17) at (16.5, 0.5) {11}; + \node [style=none] (18) at (15.5, 0.5) {12}; + \node [style=none] (19) at (8, 0.5) {\dots}; + \node [style=none] (20) at (20, 0.5) {\dots}; + \node [style=none] (22) at (20, -0.5) {Page Offset}; + \end{pgfonlayer} + \begin{pgfonlayer}{edgelayer} + \draw (3.center) + to (9.center) + to [in=270, out=90] (0.center) + to (2.center) + to (4.center) + to (6.center) + to (7.center) + to (5.center) + to cycle; + \draw (2.center) to (3.center); + \end{pgfonlayer} +\end{tikzpicture} diff --git a/img/virtual_address_conversion.tikz b/img/virtual_address_conversion.tikz new file mode 100644 index 0000000..3734141 --- /dev/null +++ b/img/virtual_address_conversion.tikz @@ -0,0 +1,66 @@ +\begin{tikzpicture} + \begin{pgfonlayer}{nodelayer} + \node [style=none] (0) at (0, 0) {}; + \node [style=none] (1) at (0, -1) {}; + \node [style=none] (2) at (12, 0) {}; + \node [style=none] (3) at (12, -1) {}; + \node [style=none] (4) at (17, 0) {}; + \node [style=none] (5) at (17, -1) {}; + \node [style=none] (6) at (6, -0.5) {Virtual Page Number}; + \node [style=none] (7) at (14.5, -0.5) {Page Offset}; + \node [style=none] (8) at (2.5, 0.75) {Virtual Address}; + \node [style=tlb] (9) at (3, -7) {TLB}; + \node [style=asid] (10) at (1.5, -3.5) {ASID}; + \node [style=cache 2] (11) at (14, -6.25) {Cache}; + \node [style=page frame] (12) at (3, -11) {Page Frame Number}; + \node [style=page frame] (13) at (3, -13) {Tag: Page Frame Number}; + \node [style=cache data] (14) at (10, -16.5) {Cache Data}; + \node [style=none] (15) at (1.5, -5) {}; + \node [style=none] (16) at (6, -1) {}; + \node [style=none] (17) at (6, -3) {}; + \node [style=none] (18) at (4.5, -3) {}; + \node [style=none] (19) at (4.5, -5) {}; + \node [style=none] (20) at (3, -9) {}; + \node [style=none] (21) at (10, -1.25) {}; + \node [style=none] (22) at (10.25, -1.5) {}; + \node [style=none] (23) at 
(17, -1.25) {}; + \node [style=none] (24) at (16.75, -1.5) {}; + \node [style=none] (25) at (13.5, -1.5) {}; + \node [style=none] (26) at (9.5, -4.5) {}; + \node [style=none] (27) at (9.5, -3) {}; + \node [style=none] (28) at (13.5, -3) {}; + \node [style=none] (29) at (10.5, -4.5) {}; + \node [style=none] (30) at (11.25, -2.5) {Cache Index}; + \node [style=none] (31) at (14, -16) {}; + \node [style=none] (32) at (13, -9) {}; + \node [style=none] (33) at (13, -13) {}; + \node [style=none] (34) at (12, 0) {}; + \node [style=none] (35) at (12, -1) {}; + \end{pgfonlayer} + \begin{pgfonlayer}{edgelayer} + \draw [style=page offset] (3.center) + to (5.center) + to (4.center) + to (2.center); + \draw [style=virtual page number] (0.center) + to (1.center) + to (3.center) + to (2.center) + to [in=0, out=180] cycle; + \draw [style=latex arrow] (10) to (15.center); + \draw (16.center) to (17.center); + \draw (17.center) to (18.center); + \draw [style=latex arrow] (18.center) to (19.center); + \draw [style=latex arrow] (20.center) to (12); + \draw [style=thin] (21.center) to (22.center); + \draw [style=thin] (22.center) to (24.center); + \draw [style=thin] (24.center) to (23.center); + \draw (25.center) to (28.center); + \draw (28.center) to (27.center); + \draw (27.center) to (26.center); + \draw [style=latex arrow] (26.center) to (29.center); + \draw [style=latex arrow] (11) to (31.center); + \draw [style=latex arrow] (33.center) to (13); + \draw (32.center) to (33.center); + \end{pgfonlayer} +\end{tikzpicture} diff --git a/inc/2.dynamorio.tex b/inc/2.dynamorio.tex index e0977cf..8ac0f0c 100644 --- a/inc/2.dynamorio.tex +++ b/inc/2.dynamorio.tex @@ -10,10 +10,10 @@ It is mainly based on on the chapter \textit{DynamoRIO} and \textit{Code Cache} \revabbr{Dynamic binary instrumentation}{DBI} is a method for analyzing and manipulating the behavior of a binary application while it is running. 
This is achieved through the injection of additional instructions into the instruction trace of the target application. -Debuggers on the other hand, use special breakpoint instructions (e.g. INT3 on x86 or BKPT on ARM) that get injected at specific places in the code, raising a debug exception when reaching it. -At those exceptions a context switch to the operating system kernel will be performed, however, those context switches result in a significant performance penalty as the processor state has to be saved and restored afterwards. +Debuggers, on the other hand, use special breakpoint instructions (e.g. INT3 on x86 or BKPT on ARM) that get injected at specific places in the code, raising a debug exception when reaching it. +At those exceptions a context switch to the operating system kernel will be performed, however, those context switches result in a significant performance penalty as the processor state has to be saved and restored afterwards, making it slower than DBI. -Because the instrumentation tool runs in the same process as the application, it is important that it operates transparently, meaning that it will not affect the application behavior in unintended ways. +Because the instrumentation tool runs in the same process as the target application, it is important that it operates transparently, meaning that it will not affect the application behavior in unintended ways. This is a special challenge as the dynamic instrumentation is not allowed to use the same memory routines or input/output buffering as the target application \cite{Bruening2003}. In contrast to static code analysis, which cannot predict the execution path of the program, the full runtime information is available to the dynamic instrumentation. 
@@ -37,7 +37,7 @@ To reduce this overhead, DynamoRIO can \textit{link} two basic blocks together t For indirect branches it is not possible to link them as their target basic blocks may vary and DynamoRIO needs to translate the branch address to the address of the basic block in the code cache. However, basic block that are often executed in a sequence are be merged into a \textit{trace}. At the end of each basic block, a additional check is performed to determine if the indirect branch target will stay in the same trace, possibly preventing the context switch. -The generic term for a basic block or a trace is a \textit{fragment}. +The generic term for a basic block or a trace is \textit{fragment}. Figure \ref{fig:dynamorio} illustrates the functionality of DynamoRIO. The application code will get loaded by the dispatcher, modified by the basic block builder and finally be executed in the code cache. diff --git a/inc/4.caches.tex b/inc/4.caches.tex index 9b5b1d3..e3fa34e 100644 --- a/inc/4.caches.tex +++ b/inc/4.caches.tex @@ -10,10 +10,11 @@ Therefore caches, whose goal is to decrease the latency and increase the bandwid Caches are faster than DRAM, but only provide a small capacity, as the per-bit cost is larger. For this reason, at least the \textit{working set}, the data that the currently running application is working on, should be stored in the cache. + The two most important heuristics that make this possible will be explained in section \ref{sec:caches_locality_principles}. -After that the typical structure of a cache will be discussed in \ref{sec:caches_logical_organization}, followed by the considerations to make when it comes to virtual addressing in section \ref{sec:caches_virtual_addressing}. +After that the typical structure of a cache will be discussed in \ref{sec:caches_logical_organization}. 
+Replacement policies will be explained in \ref{sec:replacement_policies} and write policies in \ref{sec:write_policies}, followed by the considerations to make when it comes to virtual addressing in section \ref{sec:caches_virtual_addressing}. Finally, the advantage of non-blocking caches is the topic of section \ref{sec:caches_non_blocking_caches}. -TODO update \subsection{Locality Principles} \label{sec:caches_locality_principles} @@ -24,18 +25,17 @@ Those two heuristics are called \textit{temporal locality} and \textit{spatial l \subsubsection{Temporal Locality} -Referenced data is likely to be referenced again by the application in the future. -This is most important characteristic that make it possible for a cache to optimize the access latency. -When new data is referenced, it will be fetched from the main memory and kept in the cache. -Operations using this data can now perform calculations and use the end result further by only accessing the cache, exploiting this tendency of the application. +Temporal locality is the concept of referenced data being likely to be referenced again in the near future. +Taking advantage of this is the main idea behind a cache: +When new data is referenced, it will be read from the main memory and buffered in the cache. +The processor can now perform operations on this data and use its end result further without needing to access the main memory. \subsubsection{Spatial Locality} -Programs have a tendency to reference data that is nearby already referenced data in the memory space. -This is because related data is often clustered together, for example in arrays or structures. +Programs have a tendency to reference data that is nearby in the memory space of already referenced data. +This tendency, spatial locality, arises because related data is often clustered together, for example in arrays or structures. 
When calculations are performed on those arrays, sequential access patterns can be observed as one element is processed after the other. - -This tendency can be exploited by organizing blocks of data in so called \textit{cache blocks} or \textit{cache lines} which are larger than a single data word. +Spatial locality can be exploited by organizing blocks of data in so called \textit{cache blocks} or \textit{cache lines} which are larger than a single data word. This is a passive form of making use of spatial locality, as referenced data will also cause nearby words to be loaded into the same cache line, making them available for further accesses. An active form of exploiting spatial locality is the use of \textit{prefetching}. @@ -97,9 +97,9 @@ To determine which one of the corresponding set, there are several replacement p The \revabbr{least recently used}{LRU} policy selects the cache line whose last usage is the longest time ago. A LRU algorithm is expensive to implement, a counter value for every cache line of a set has to be updated every time the set is accessed. \item - An alternative is a \revabbr{Pseudo-LRU}{PLRU} policy, where one bit is set to 1 every time a cache line is accessed. + An alternative is a \revabbr{pseudo LRU}{PLRU} policy, where an extra bit is set to 1 every time a cache line is accessed. When the extra bit of every cache line in a set is set to 1, they will get reset to 0. - In case of an contention, the first cache line whose extra bit is 0 will be evicted, which indicates that the last usage was some time ago. + In case of contention, the first cache line whose extra bit is 0 will be evicted, which indicates that the last usage was likely some time ago. \item In the \revabbr{least frequently used}{LFU} policy, every time a cache line is accessed, a counter value will be increased. The cache line with the lowest value, the least frequently used one, will be chosen to be evicted. 
@@ -122,9 +122,63 @@ Also here, a write buffer can be used to place the actual write back requests in \subsection{Virtual Addressing} \label{sec:caches_virtual_addressing} +Operating systems use virtual addressing to isolate the memory spaces of user space programs from each other, giving each process its own virtual address space. +\textit{Virtual addresses} are composed of a \textit{virtual page number} and a \textit{page offset}. +The virtual page number is the actual part that is virtual; the page offset is the same for the virtual and the physical address. +Figure \ref{fig:virtual_address} shows an exemplary division of a virtual address into its components. + +\begin{figure}[!ht] +\begin{center} +\tikzfig{img/virtual_address} +\caption{Exemplary division of the virtual address into a virtual page number and page offset.} +\label{fig:virtual_address} +\end{center} +\end{figure} + +Before a process can access a specific region in memory, the kernel has to translate the virtual page number into a physical page number. +For conversions, so-called \textit{page tables} are used to look up the physical page number. +Page tables are usually multiple levels deep (e.g. 4 levels on x86), so a single conversion can cause up to 4 memory accesses, which is expensive. +To improve performance, a \revabbr{translation lookaside buffer}{TLB} is used that acts like a cache on its own for physical page numbers. + +However, as long as the physical address is not present, the data cache cannot look up its entries as the index is not known yet. +So the cache has to wait on the TLB, or worse on multiple memory accesses. +To circumvent this problem, the cache can be indexed by the virtual address, which makes it possible to parallelize both procedures. +Such a cache is called \textit{virtually indexed} and \textit{physically tagged} and is illustrated in figure \ref{fig:virtual_address_conversion}. + +% Ist die Darstellung aus dem Buch richtig? 
Sollte der Cache Index wirklich über den Page Offset hinaus gehen? +\begin{figure}[!ht] +\begin{center} +\tikzfig{img/virtual_address_conversion} +\caption{Virtually indexed, physically tagged cache.\cite{Jacob2008} ASID refers to address-space identifier.} +\label{fig:virtual_address_conversion} +\end{center} +\end{figure} + +The result from the TLB, the physical page number, needs to be compared to the tag that is stored in the cache. +When the tag and the physical page number match, then the cache entry is valid for this virtual address. +Note that when the cache index is completely contained in the page offset, another problem called \textit{aliasing} is resolved, which will not be discussed further in this thesis. \subsection{Non-blocking Caches} \label{sec:caches_non_blocking_caches} +In blocking caches, cache misses require the processor to stall until the data is fetched from the underlying memory. +As this is a major slowdown, non-blocking caches try to solve this problem, making it possible for the processor to make further progress while waiting on the value. +Similarly to the write buffer, previously discussed in \ref{sec:write_policies}, a new buffer will be introduced: the \revabbr{miss status holding register}{MSHR}. +The number of MSHRs corresponds to the number of misses the cache can handle concurrently; when all available MSHRs are occupied and a further miss occurs, the cache will block. +An MSHR entry always corresponds to one cache line that is currently being fetched from the underlying memory subsystem. + +There are two variants of cache misses: +\textit{Primary misses} are misses that lead to another occupation of an MSHR, whereas \textit{secondary misses} are added to an existing MSHR entry and therefore cannot cause the cache to block. +The latter is the case when a cache line that is already being fetched is accessed again. + +An architecture of an MSHR file is illustrated in figure \ref{fig:mshr_file}. 
+ +\begin{figure}[!ht] +\begin{center} +\tikzfig{img/mshr_file} +\caption{Miss Status Holding Register File.\cite{Jahre2007} V refers to a valid bit.} +\label{fig:mshr_file} +\end{center} +\end{figure} diff --git a/inc/6.implementation.tex b/inc/6.implementation.tex index eff2c5b..b76ca37 100644 --- a/inc/6.implementation.tex +++ b/inc/6.implementation.tex @@ -54,8 +54,8 @@ In particular, the \texttt{process\_memref\_t()} method of a tool is called for The newly developed DRAMTracer tool creates for every thread of the application a seperate trace file. As it is not known how many threads an application will spawn, the tool will listen for records with new TIDs that it did not register yet. -For every data reference, a new entry in the corresponding trace file is made which contains the size and the address of the access, whether it was a read or write, and also a count of (computational) instructions that have been executed since the last reference. -This instruction count is used to approximate the delay between the memory accesses when the trace is replayed by DRAMSys as described in section TODO. +For every data reference, a new entry in the corresponding trace file is made which contains the size and the physical address of the access, whether it was a read or write, and also a count of (computational) instructions that have been executed since the last reference. +This instruction count is used to approximate the delay between the memory accesses when the trace is replayed by DRAMSys. \begin{listing} \begin{textcode}