mem-cache: Standardize data parsing in compressors

The compressors are not able to process a whole line at once,
so they must divide it into multiple same-sized chunks. This
patch makes the base compressor responsible for this division,
so that the derived classes are mostly agnostic to this
translation.

This change has been coupled with a change of the signature
of the public compress() to avoid introducing a temporary
function rename. Previously, this function did not return
the compressed data, under the assumption that everything
related to the compressed data would be handled by the
compressor. However, sometimes the units using the compressor
could need to know or store the compressed data.

For example, when sharing dictionaries the compressed data
must be checked to determine if two blocks can co-allocate
(DISH, Panda et al. 2016).

Change-Id: Id8dbf68936b1457ca8292cc0a852b0f0a2eeeb51
Signed-off-by: Daniel R. Carvalho <odanrc@yahoo.com.br>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/33379
Reviewed-by: Nikos Nikoleris <nikos.nikoleris@arm.com>
Maintainer: Nikos Nikoleris <nikos.nikoleris@arm.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Daniel R. Carvalho
2020-06-04 13:42:56 +02:00
committed by Daniel Carvalho
parent 3fc4c0a415
commit de94a29f85
20 changed files with 179 additions and 82 deletions

11
src/mem/cache/base.cc vendored
View File

@@ -844,11 +844,11 @@ BaseCache::updateCompressionData(CacheBlk *blk, const uint64_t* data,
// The compressor is called to compress the updated data, so that its
// metadata can be updated.
std::size_t compression_size = 0;
Cycles compression_lat = Cycles(0);
Cycles decompression_lat = Cycles(0);
compressor->compress(data, compression_lat, decompression_lat,
compression_size);
const auto comp_data =
compressor->compress(data, compression_lat, decompression_lat);
std::size_t compression_size = comp_data->getSizeBits();
// If block's compression factor increased, it may not be co-allocatable
// anymore. If so, some blocks might need to be evicted to make room for
@@ -1421,8 +1421,9 @@ BaseCache::allocateBlock(const PacketPtr pkt, PacketList &writebacks)
// calculate the amount of extra cycles needed to read or write compressed
// blocks.
if (compressor && pkt->hasData()) {
compressor->compress(pkt->getConstPtr<uint64_t>(), compression_lat,
decompression_lat, blk_size_bits);
const auto comp_data = compressor->compress(
pkt->getConstPtr<uint64_t>(), compression_lat, decompression_lat);
blk_size_bits = comp_data->getSizeBits();
}
// Find replacement victim

View File

@@ -35,6 +35,8 @@ class BaseCacheCompressor(SimObject):
cxx_header = "mem/cache/compressors/base.hh"
block_size = Param.Int(Parent.cache_line_size, "Block size in bytes")
chunk_size_bits = Param.Unsigned(32,
"Size of a parsing data chunk (in bits)")
size_threshold = Param.Unsigned(Parent.cache_line_size, "Minimum size, "
"in bytes, in which a block must be compressed to. Otherwise it is "
"stored in its uncompressed state")
@@ -53,31 +55,43 @@ class Base64Delta8(BaseDictionaryCompressor):
cxx_class = 'Compressor::Base64Delta8'
cxx_header = "mem/cache/compressors/base_delta.hh"
chunk_size_bits = 64
class Base64Delta16(BaseDictionaryCompressor):
    # Base-delta compressor variant; per its name, presumably 64-bit bases
    # with 16-bit deltas (implementation in base_delta.hh) — confirm there.
    type = 'Base64Delta16'
    cxx_class = 'Compressor::Base64Delta16'
    cxx_header = "mem/cache/compressors/base_delta.hh"
    # Parse the input cache line in 64-bit chunks
    chunk_size_bits = 64
class Base64Delta32(BaseDictionaryCompressor):
    # Base-delta compressor variant; per its name, presumably 64-bit bases
    # with 32-bit deltas (implementation in base_delta.hh) — confirm there.
    type = 'Base64Delta32'
    cxx_class = 'Compressor::Base64Delta32'
    cxx_header = "mem/cache/compressors/base_delta.hh"
    # Parse the input cache line in 64-bit chunks
    chunk_size_bits = 64
class Base32Delta8(BaseDictionaryCompressor):
    # Base-delta compressor variant; per its name, presumably 32-bit bases
    # with 8-bit deltas (implementation in base_delta.hh) — confirm there.
    type = 'Base32Delta8'
    cxx_class = 'Compressor::Base32Delta8'
    cxx_header = "mem/cache/compressors/base_delta.hh"
    # Parse the input cache line in 32-bit chunks
    chunk_size_bits = 32
class Base32Delta16(BaseDictionaryCompressor):
    # Base-delta compressor variant; per its name, presumably 32-bit bases
    # with 16-bit deltas (implementation in base_delta.hh) — confirm there.
    type = 'Base32Delta16'
    cxx_class = 'Compressor::Base32Delta16'
    cxx_header = "mem/cache/compressors/base_delta.hh"
    # Parse the input cache line in 32-bit chunks
    chunk_size_bits = 32
class Base16Delta8(BaseDictionaryCompressor):
    # Base-delta compressor variant; per its name, presumably 16-bit bases
    # with 8-bit deltas (implementation in base_delta.hh) — confirm there.
    type = 'Base16Delta8'
    cxx_class = 'Compressor::Base16Delta8'
    cxx_header = "mem/cache/compressors/base_delta.hh"
    # Parse the input cache line in 16-bit chunks
    chunk_size_bits = 16
class CPack(BaseDictionaryCompressor):
type = 'CPack'
cxx_class = 'Compressor::CPack'
@@ -105,6 +119,7 @@ class PerfectCompressor(BaseCacheCompressor):
cxx_class = 'Compressor::Perfect'
cxx_header = "mem/cache/compressors/perfect.hh"
chunk_size_bits = 64
max_compression_ratio = Param.Int(2,
"Maximum compression ratio allowed")
compression_latency = Param.Cycles(1,
@@ -117,11 +132,15 @@ class RepeatedQwordsCompressor(BaseDictionaryCompressor):
cxx_class = 'Compressor::RepeatedQwords'
cxx_header = "mem/cache/compressors/repeated_qwords.hh"
chunk_size_bits = 64
class ZeroCompressor(BaseDictionaryCompressor):
    # Dictionary compressor whose C++ class lives in zero.hh; per its name,
    # presumably targets all-zero cache lines — confirm against zero.hh.
    type = 'ZeroCompressor'
    cxx_class = 'Compressor::Zero'
    cxx_header = "mem/cache/compressors/zero.hh"
    # Parse the input cache line in 64-bit chunks
    chunk_size_bits = 64
class BDI(MultiCompressor):
compressors = [ZeroCompressor(), RepeatedQwordsCompressor(),
Base64Delta8(), Base64Delta16(), Base64Delta32(), Base32Delta8(),

View File

@@ -33,6 +33,7 @@
#include "mem/cache/compressors/base.hh"
#include <algorithm>
#include <climits>
#include <cmath>
#include <cstdint>
#include <string>
@@ -75,19 +76,58 @@ Base::CompressionData::getSize() const
}
Base::Base(const Params *p)
: SimObject(p), blkSize(p->block_size), sizeThreshold(p->size_threshold),
: SimObject(p), blkSize(p->block_size), chunkSizeBits(p->chunk_size_bits),
sizeThreshold(p->size_threshold),
stats(*this)
{
fatal_if(64 % chunkSizeBits,
"64 must be a multiple of the chunk granularity.");
fatal_if(blkSize < sizeThreshold, "Compressed data must fit in a block");
}
std::vector<Base::Chunk>
Base::toChunks(const uint64_t* data) const
{
// Number of chunks in a 64-bit value
const unsigned num_chunks_per_64 =
(sizeof(uint64_t) * CHAR_BIT) / chunkSizeBits;
// Turn a 64-bit array into a chunkSizeBits-array
std::vector<Chunk> chunks((blkSize * CHAR_BIT) / chunkSizeBits, 0);
for (int i = 0; i < chunks.size(); i++) {
const int index_64 = std::floor(i / (double)num_chunks_per_64);
const unsigned start = i % num_chunks_per_64;
chunks[i] = bits(data[index_64],
(start + 1) * chunkSizeBits - 1, start * chunkSizeBits);
}
return chunks;
}
void
Base::compress(const uint64_t* data, Cycles& comp_lat,
Cycles& decomp_lat, std::size_t& comp_size_bits)
Base::fromChunks(const std::vector<Chunk>& chunks, uint64_t* data) const
{
// Number of chunks in a 64-bit value
const unsigned num_chunks_per_64 =
(sizeof(uint64_t) * CHAR_BIT) / chunkSizeBits;
// Turn a chunkSizeBits-array into a 64-bit array
std::memset(data, 0, blkSize);
for (int i = 0; i < chunks.size(); i++) {
const int index_64 = std::floor(i / (double)num_chunks_per_64);
const unsigned start = i % num_chunks_per_64;
replaceBits(data[index_64], (start + 1) * chunkSizeBits - 1,
start * chunkSizeBits, chunks[i]);
}
}
std::unique_ptr<Base::CompressionData>
Base::compress(const uint64_t* data, Cycles& comp_lat, Cycles& decomp_lat)
{
// Apply compression
std::unique_ptr<CompressionData> comp_data =
compress(data, comp_lat, decomp_lat);
compress(toChunks(data), comp_lat, decomp_lat);
// If we are in debug mode apply decompression just after the compression.
// If the results do not match, we've got an error
@@ -104,9 +144,10 @@ Base::compress(const uint64_t* data, Cycles& comp_lat,
// Get compression size. If compressed size is greater than the size
// threshold, the compression is seen as unsuccessful
comp_size_bits = comp_data->getSizeBits();
if (comp_size_bits >= sizeThreshold * 8) {
comp_size_bits = blkSize * 8;
std::size_t comp_size_bits = comp_data->getSizeBits();
if (comp_size_bits > sizeThreshold * CHAR_BIT) {
comp_size_bits = blkSize * CHAR_BIT;
comp_data->setSizeBits(comp_size_bits);
}
// Update stats
@@ -118,6 +159,8 @@ Base::compress(const uint64_t* data, Cycles& comp_lat,
DPRINTF(CacheComp, "Compressed cache line from %d to %d bits. " \
"Compression latency: %llu, decompression latency: %llu\n",
blkSize*8, comp_size_bits, comp_lat, decomp_lat);
return std::move(comp_data);
}
Cycles

View File

@@ -50,27 +50,45 @@ namespace Compressor {
/**
* Base cache compressor interface. Every cache compressor must implement a
* compression and a decompression method.
*
* Compressors usually cannot parse all the input data at once. Therefore,
* they typically divide the input into multiple *chunks*, and parse them
* one at a time (typically one chunk per cycle).
*/
class Base : public SimObject
{
public:
/**
* Forward declaration of compression data. Every new compressor must
* create a new compression data based on it.
*/
class CompressionData;
protected:
/**
* A chunk is a basic lexical unit. The data being compressed is received
* by the compressor as a raw pointer. In order to parse this data, the
* compressor must divide it into smaller units. Typically, state-of-the-
* art compressors interpret cache lines as sequences of 32-bit words
* (chunks), but any chunk size is valid.
* @sa chunkSizeBits
*/
typedef uint64_t Chunk;
/**
* This compressor must be able to access the protected functions of
* its sub-compressors.
*/
friend class Multi;
/**
* Forward declaration of compression data. Every new compressor must
* create a new compression data based on it.
*/
class CompressionData;
/**
* Uncompressed cache line size (in bytes).
*/
const std::size_t blkSize;
/** Chunk size, in number of bits. */
const unsigned chunkSizeBits;
/**
* Size in bytes at which a compression is classified as bad and therefore
* the compressed block is restored to its uncompressed format.
@@ -101,6 +119,23 @@ class Base : public SimObject
Stats::Scalar decompressions;
} stats;
/**
* This function splits the raw data into chunks, so that it can be
* parsed by the compressor.
*
* @param data The raw pointer to the data being compressed.
* @return The raw data divided into a vector of sequential chunks.
*/
std::vector<Chunk> toChunks(const uint64_t* data) const;
/**
* This function re-joins the chunks to recreate the original data.
*
* @param chunks The raw data divided into a vector of sequential chunks.
* @param data The raw pointer to the data.
*/
void fromChunks(const std::vector<Chunk>& chunks, uint64_t* data) const;
/**
* Apply the compression process to the cache line.
* Returns the number of cycles used by the compressor, however it is
@@ -108,13 +143,14 @@ class Base : public SimObject
* The decompression latency is also returned, in order to avoid
* increasing simulation time and memory consumption.
*
* @param cache_line The cache line to be compressed.
* @param chunks The cache line to be compressed, divided into chunks.
* @param comp_lat Compression latency in number of cycles.
* @param decomp_lat Decompression latency in number of cycles.
* @return Cache line after compression.
*/
virtual std::unique_ptr<CompressionData> compress(
const uint64_t* cache_line, Cycles& comp_lat, Cycles& decomp_lat) = 0;
const std::vector<Chunk>& chunks, Cycles& comp_lat,
Cycles& decomp_lat) = 0;
/**
* Apply the decompression process to the compressed data.
@@ -137,10 +173,10 @@ class Base : public SimObject
* @param data The cache line to be compressed.
* @param comp_lat Compression latency in number of cycles.
* @param decomp_lat Decompression latency in number of cycles.
* @param comp_size_bits Compressed data size (in bits).
* @return Cache line after compression.
*/
void compress(const uint64_t* data, Cycles& comp_lat,
Cycles& decomp_lat, std::size_t& comp_size_bits);
std::unique_ptr<CompressionData>
compress(const uint64_t* data, Cycles& comp_lat, Cycles& decomp_lat);
/**
* Get the decompression latency if the block is compressed. Latency is 0

View File

@@ -115,9 +115,9 @@ class BaseDelta : public DictionaryCompressor<BaseType>
void addToDictionary(DictionaryEntry data) override;
std::unique_ptr<Base::CompressionData>
compress(const uint64_t* data, Cycles& comp_lat,
Cycles& decomp_lat) override;
std::unique_ptr<Base::CompressionData> compress(
const std::vector<Base::Chunk>& chunks,
Cycles& comp_lat, Cycles& decomp_lat) override;
public:
typedef BaseDictionaryCompressorParams Params;

View File

@@ -67,11 +67,12 @@ BaseDelta<BaseType, DeltaSizeBits>::addToDictionary(DictionaryEntry data)
template <class BaseType, std::size_t DeltaSizeBits>
std::unique_ptr<Base::CompressionData>
BaseDelta<BaseType, DeltaSizeBits>::compress(const uint64_t* data,
Cycles& comp_lat, Cycles& decomp_lat)
BaseDelta<BaseType, DeltaSizeBits>::compress(
const std::vector<Base::Chunk>& chunks, Cycles& comp_lat,
Cycles& decomp_lat)
{
std::unique_ptr<Base::CompressionData> comp_data =
DictionaryCompressor<BaseType>::compress(data);
DictionaryCompressor<BaseType>::compress(chunks);
// If there are more bases than the maximum, the compressor failed.
// Otherwise, we have to take into account all bases that have not

View File

@@ -50,10 +50,11 @@ CPack::addToDictionary(DictionaryEntry data)
}
std::unique_ptr<Base::CompressionData>
CPack::compress(const uint64_t* data, Cycles& comp_lat, Cycles& decomp_lat)
CPack::compress(const std::vector<Chunk>& chunks,
Cycles& comp_lat, Cycles& decomp_lat)
{
std::unique_ptr<Base::CompressionData> comp_data =
DictionaryCompressor<uint32_t>::compress(data);
DictionaryCompressor<uint32_t>::compress(chunks);
// Set compression latency (Accounts for pattern matching, length
// generation, packaging and shifting)

View File

@@ -98,16 +98,9 @@ class CPack : public DictionaryCompressor<uint32_t>
void addToDictionary(DictionaryEntry data) override;
/**
* Apply compression.
*
* @param data The cache line to be compressed.
* @param comp_lat Compression latency in number of cycles.
* @param decomp_lat Decompression latency in number of cycles.
* @return Cache line after compression.
*/
std::unique_ptr<Base::CompressionData> compress(
const uint64_t* data, Cycles& comp_lat, Cycles& decomp_lat) override;
const std::vector<Base::Chunk>& chunks,
Cycles& comp_lat, Cycles& decomp_lat) override;
public:
/** Convenience typedef. */

View File

@@ -232,10 +232,11 @@ class DictionaryCompressor : public BaseDictionaryCompressor
/**
* Apply compression.
*
* @param data The cache line to be compressed.
* @param chunks The cache line to be compressed.
* @return Cache line after compression.
*/
std::unique_ptr<Base::CompressionData> compress(const uint64_t* data);
std::unique_ptr<Base::CompressionData> compress(
const std::vector<Chunk>& chunks);
using BaseDictionaryCompressor::compress;

View File

@@ -123,7 +123,7 @@ DictionaryCompressor<T>::compressValue(const T data)
template <class T>
std::unique_ptr<Base::CompressionData>
DictionaryCompressor<T>::compress(const uint64_t* data)
DictionaryCompressor<T>::compress(const std::vector<Chunk>& chunks)
{
std::unique_ptr<Base::CompressionData> comp_data =
instantiateDictionaryCompData();
@@ -133,8 +133,7 @@ DictionaryCompressor<T>::compress(const uint64_t* data)
// Compress every value sequentially
CompData* const comp_data_ptr = static_cast<CompData*>(comp_data.get());
const std::vector<T> values((T*)data, (T*)data + blkSize / sizeof(T));
for (const auto& value : values) {
for (const auto& value : chunks) {
std::unique_ptr<Pattern> pattern = compressValue(value);
DPRINTF(CacheComp, "Compressed %016x to %s\n", value,
pattern->print());

View File

@@ -55,10 +55,11 @@ FPCD::addToDictionary(DictionaryEntry data)
}
std::unique_ptr<Base::CompressionData>
FPCD::compress(const uint64_t* data, Cycles& comp_lat, Cycles& decomp_lat)
FPCD::compress(const std::vector<Chunk>& chunks,
Cycles& comp_lat, Cycles& decomp_lat)
{
std::unique_ptr<Base::CompressionData> comp_data =
DictionaryCompressor<uint32_t>::compress(data);
DictionaryCompressor<uint32_t>::compress(chunks);
// Set compression latency (Accounts for zero checks, ones check, match
// previous check, match penultimate check, repeated values check, pattern

View File

@@ -140,7 +140,8 @@ class FPCD : public DictionaryCompressor<uint32_t>
void addToDictionary(DictionaryEntry data) override;
std::unique_ptr<Base::CompressionData> compress(
const uint64_t* data, Cycles& comp_lat, Cycles& decomp_lat) override;
const std::vector<Base::Chunk>& chunks,
Cycles& comp_lat, Cycles& decomp_lat) override;
public:
typedef FPCDParams Params;

View File

@@ -72,7 +72,7 @@ Multi::~Multi()
}
std::unique_ptr<Base::CompressionData>
Multi::compress(const uint64_t* cache_line, Cycles& comp_lat,
Multi::compress(const std::vector<Chunk>& chunks, Cycles& comp_lat,
Cycles& decomp_lat)
{
struct Results
@@ -114,6 +114,12 @@ Multi::compress(const uint64_t* cache_line, Cycles& comp_lat,
}
};
// Each sub-compressor can have its own chunk size; therefore, revert
// the chunks to raw data, so that they handle the conversion internally
uint64_t data[blkSize / sizeof(uint64_t)];
std::memset(data, 0, blkSize);
fromChunks(chunks, data);
// Find the ranking of the compressor outputs
std::priority_queue<std::shared_ptr<Results>,
std::vector<std::shared_ptr<Results>>, ResultsComparator> results;
@@ -121,7 +127,7 @@ Multi::compress(const uint64_t* cache_line, Cycles& comp_lat,
for (unsigned i = 0; i < compressors.size(); i++) {
Cycles temp_decomp_lat;
auto temp_comp_data =
compressors[i]->compress(cache_line, comp_lat, temp_decomp_lat);
compressors[i]->compress(data, comp_lat, temp_decomp_lat);
results.push(std::make_shared<Results>(i, std::move(temp_comp_data),
temp_decomp_lat, blkSize));
max_comp_lat = std::max(max_comp_lat, comp_lat);

View File

@@ -78,7 +78,8 @@ class Multi : public Base
~Multi();
std::unique_ptr<Base::CompressionData> compress(
const uint64_t* data, Cycles& comp_lat, Cycles& decomp_lat) override;
const std::vector<Base::Chunk>& chunks,
Cycles& comp_lat, Cycles& decomp_lat) override;
void decompress(const CompressionData* comp_data, uint64_t* data) override;
};

View File

@@ -40,27 +40,19 @@
namespace Compressor {
Perfect::CompData::CompData(const uint64_t* data,
std::size_t num_entries)
: CompressionData(), entries(data, data + num_entries)
{
}
Perfect::Perfect(const Params *p)
: Base(p),
compressedSize(8 * blkSize / p->max_compression_ratio),
compressionLatency(p->compression_latency),
decompressionLatency(p->decompression_latency)
: Base(p), compressedSize(8 * blkSize / p->max_compression_ratio),
compressionLatency(p->compression_latency),
decompressionLatency(p->decompression_latency)
{
}
std::unique_ptr<Base::CompressionData>
Perfect::compress(const uint64_t* cache_line, Cycles& comp_lat,
Cycles& decomp_lat)
Perfect::compress(const std::vector<Chunk>& chunks,
Cycles& comp_lat, Cycles& decomp_lat)
{
// Compress every word sequentially
std::unique_ptr<Base::CompressionData> comp_data(
new CompData(cache_line, blkSize/8));
std::unique_ptr<Base::CompressionData> comp_data(new CompData(chunks));
// Set relevant metadata
comp_data->setSizeBits(compressedSize);
@@ -75,10 +67,7 @@ Perfect::decompress(const CompressionData* comp_data,
uint64_t* data)
{
// Decompress every entry sequentially
const std::vector<uint64_t>& entries =
static_cast<const CompData*>(comp_data)->entries;
assert(entries.size() == (blkSize/8));
std::copy(entries.begin(), entries.end(), data);
fromChunks(static_cast<const CompData*>(comp_data)->chunks, data);
}
} // namespace Compressor

View File

@@ -59,8 +59,9 @@ class Perfect : public Base
/** Number of cycles needed to perform decompression. */
const Cycles decompressionLatency;
std::unique_ptr<CompressionData> compress(const uint64_t* cache_line,
Cycles& comp_lat, Cycles& decomp_lat) override;
std::unique_ptr<CompressionData> compress(
const std::vector<Chunk>& chunks, Cycles& comp_lat,
Cycles& decomp_lat) override;
void decompress(const CompressionData* comp_data, uint64_t* data) override;
@@ -74,15 +75,17 @@ class Perfect::CompData : public CompressionData
{
public:
/** The original data is simply copied over to this vector. */
std::vector<uint64_t> entries;
std::vector<Chunk> chunks;
/**
* Default constructor that creates a copy of the original data.
*
* @param data The data to be compressed.
* @param num_entries The number of qwords in the data.
* @param chunks The data to be compressed.
*/
CompData(const uint64_t* data, std::size_t num_entries);
CompData(const std::vector<Chunk>& chunks)
: CompressionData(), chunks(chunks)
{
}
~CompData() = default;
};

View File

@@ -53,11 +53,11 @@ RepeatedQwords::addToDictionary(DictionaryEntry data)
}
std::unique_ptr<Base::CompressionData>
RepeatedQwords::compress(const uint64_t* data, Cycles& comp_lat,
Cycles& decomp_lat)
RepeatedQwords::compress(const std::vector<Chunk>& chunks,
Cycles& comp_lat, Cycles& decomp_lat)
{
std::unique_ptr<Base::CompressionData> comp_data =
DictionaryCompressor::compress(data);
DictionaryCompressor::compress(chunks);
// Since there is a single value repeated over and over, there should be
// a single dictionary entry. If there are more, the compressor failed

View File

@@ -92,7 +92,8 @@ class RepeatedQwords : public DictionaryCompressor<uint64_t>
void addToDictionary(DictionaryEntry data) override;
std::unique_ptr<Base::CompressionData> compress(
const uint64_t* data, Cycles& comp_lat, Cycles& decomp_lat) override;
const std::vector<Base::Chunk>& chunks,
Cycles& comp_lat, Cycles& decomp_lat) override;
public:
typedef RepeatedQwordsCompressorParams Params;

View File

@@ -53,11 +53,11 @@ Zero::addToDictionary(DictionaryEntry data)
}
std::unique_ptr<Base::CompressionData>
Zero::compress(const uint64_t* data, Cycles& comp_lat,
Zero::compress(const std::vector<Chunk>& chunks, Cycles& comp_lat,
Cycles& decomp_lat)
{
std::unique_ptr<Base::CompressionData> comp_data =
DictionaryCompressor::compress(data);
DictionaryCompressor::compress(chunks);
// If there is any non-zero entry, the compressor failed
if (numEntries > 0) {

View File

@@ -92,7 +92,8 @@ class Zero : public DictionaryCompressor<uint64_t>
void addToDictionary(DictionaryEntry data) override;
std::unique_ptr<Base::CompressionData> compress(
const uint64_t* data, Cycles& comp_lat, Cycles& decomp_lat) override;
const std::vector<Base::Chunk>& chunks,
Cycles& comp_lat, Cycles& decomp_lat) override;
public:
typedef ZeroCompressorParams Params;