util-gem5art: Decouple artifacts from mongodb

This commit does a few things to decouple the artifacts from mongodb.
- Creates an ArtifactFileDB which stores artifacts in a json file on the
local machine
- Adds tests for the artifact file DB
- Sets the file database to be default if pymongo isn't installed
- Extends the Artifact class to prepare for downloading artifacts from
gem5-resources

Change-Id: I1bceef94dc53c066d1c0475e79c9a1ad1f1a6202
Signed-off-by: Jason Lowe-Power <jason@lowepower.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/47463
Reviewed-by: Hoa Nguyen <hoanguyen@ucdavis.edu>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Hoa Nguyen
2021-07-01 15:33:24 -07:00
committed by Jason Lowe-Power
parent bb596f55e3
commit d7cf6be93a
4 changed files with 425 additions and 32 deletions

View File

@@ -36,14 +36,24 @@ Some common queries can be found in common_queries.py
from abc import ABC, abstractmethod
import gridfs # type: ignore
import copy
import json
import os
from pathlib import Path
from pymongo import MongoClient # type: ignore
from typing import Any, Dict, Iterable, Union, Type
import shutil
from typing import Any, Dict, Iterable, Union, Type, List, Tuple
from urllib.parse import urlparse
from uuid import UUID
try:
import gridfs # type: ignore
from pymongo import MongoClient # type: ignore
MONGO_SUPPORT = True
except ModuleNotFoundError:
# If pymongo isn't installed, then disable support for it
MONGO_SUPPORT = False
class ArtifactDB(ABC):
"""
@@ -51,7 +61,7 @@ class ArtifactDB(ABC):
"""
@abstractmethod
def __init__(self, uri: str):
def __init__(self, uri: str) -> None:
"""Initialize the database with a URI"""
pass
@@ -205,11 +215,194 @@ class ArtifactMongoDB(ArtifactDB):
yield d
class ArtifactFileDB(ArtifactDB):
"""
This is a file-based database where Artifacts (as defined in artifacts.py)
are stored in a JSON file.
This database stores a list of serialized artifacts in a JSON file.
This database is not thread-safe.
If the user specifies a valid path in the environment variable
GEM5ART_STORAGE then this database will copy all artifacts to that
directory named with their UUIDs.
"""
class ArtifactEncoder(json.JSONEncoder):
    """JSON encoder that also knows how to serialize UUID objects."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for `obj`.

        UUIDs are emitted as their string form. Any other type the
        standard encoder cannot handle is passed to the base class, which
        raises TypeError.
        """
        if isinstance(obj, UUID):
            return str(obj)
        # Defer to the base implementation so unsupported types fail with
        # the standard "not JSON serializable" TypeError.
        return super().default(obj)
_json_file: Path
_uuid_artifact_map: Dict[str, Dict[str, str]]
_hash_uuid_map: Dict[str, List[str]]
_storage_enabled: bool
_storage_path: Path
def __init__(self, uri: str) -> None:
    """Initialize the file-driven database from a JSON file.

    The JSON file location is taken from `uri`, written either as
    "file://relative/path" or "file:///absolute/path". The records are
    read through `_load_from_file()`.

    If the GEM5ART_STORAGE environment variable is non-empty, storage is
    enabled and that directory is created when it does not exist yet.

    Raises:
        Exception: if GEM5ART_STORAGE names an existing path that is not
            a directory.
    """
    parsed_uri = urlparse(uri)
    # urlparse splits the two accepted spellings differently:
    #   abs path: "file:///path/to/file" -> netloc='', path='/path/to/file'
    #   rel path: "file://path/to/file" -> netloc='path', path='/to/file'
    # The file path is netloc + path in both cases. Concatenate the
    # strings: joining two Path objects would drop netloc, because a
    # non-empty `path` component always starts with "/" (absolute).
    self._json_file = Path(parsed_uri.netloc + parsed_uri.path)

    storage_path = os.environ.get("GEM5ART_STORAGE", "")
    self._storage_enabled = bool(storage_path)
    self._storage_path = Path(storage_path)
    if self._storage_enabled:
        if self._storage_path.exists() and not self._storage_path.is_dir():
            raise Exception(
                f"GEM5ART_STORAGE={storage_path} exists and is not a directory"
            )
        os.makedirs(self._storage_path, exist_ok=True)

    self._uuid_artifact_map, self._hash_uuid_map = self._load_from_file(
        self._json_file
    )
def put(self, key: UUID, artifact: Dict[str, Union[str, UUID]]) -> None:
    """Store `artifact` in the database under `key`.

    The record's "_id" must equal `key` and its "hash" must be a string;
    both are checked with assertions before the record is handed to
    `insert_artifact()`.
    """
    assert artifact["_id"] == key
    digest = artifact["hash"]
    assert isinstance(digest, str)
    self.insert_artifact(key, digest, artifact)
def upload(self, key: UUID, path: Path) -> None:
    """Copy the file at `path` into the storage directory, named by `key`.

    Does nothing when storage is disabled, and leaves an already stored
    file for that key untouched.
    """
    if self._storage_enabled:
        target = self._storage_path / str(key)
        if not target.exists():
            shutil.copy2(path, target)
def __contains__(self, key: Union[UUID, str]) -> bool:
    """Report whether `key` is present in the database.

    A UUID is checked with `has_uuid()`; any other key is treated as a
    hash and checked with `has_hash()`.
    """
    lookup = self.has_uuid if isinstance(key, UUID) else self.has_hash
    return lookup(key)
def get(self, key: Union[UUID, str]) -> Dict[str, str]:
    """Return the first stored record matching `key`.

    `key` is either an artifact UUID or a hash string. The result is the
    dictionary used to construct an artifact. Raises IndexError when no
    record matches.
    """
    if isinstance(key, UUID):
        matches = self.get_artifact_by_uuid(key)
    else:
        # Anything that is not a UUID is treated as a hash.
        matches = self.get_artifact_by_hash(key)
    return list(matches)[0]
def downloadFile(self, key: UUID, path: Path) -> None:
    """Copy the stored file for `key` to `path`.

    `path` must already exist (checked with an assertion). Nothing is
    copied when storage is disabled.
    """
    assert path.exists()
    if self._storage_enabled:
        shutil.copy2(self._storage_path / str(key), path)
def _load_from_file(
    self, json_file: Path
) -> Tuple[Dict[str, Dict[str, str]], Dict[str, List[str]]]:
    """Read the serialized artifact list and build both lookup tables.

    Returns a pair: a map from UUID string to record, and a map from hash
    to the list of UUID strings sharing that hash (in file order). A
    missing file yields two empty maps.
    """
    by_uuid: Dict[str, Dict[str, str]] = {}
    by_hash: Dict[str, List[str]] = {}
    if not json_file.exists():
        return by_uuid, by_hash

    with open(json_file, "r") as handle:
        records = json.load(handle)
    for record in records:
        record_id = record["_id"]
        by_uuid[record_id] = record
        by_hash.setdefault(record["hash"], []).append(record_id)
    return by_uuid, by_hash
def _save_to_file(self, json_file: Path) -> None:
    """Write every stored record to `json_file` as an indented JSON list.

    The file is opened in "w" mode, so any previous content is replaced.
    """
    records = [*self._uuid_artifact_map.values()]
    with open(json_file, "w") as handle:
        json.dump(
            records, handle, indent=4, cls=ArtifactFileDB.ArtifactEncoder
        )
def has_uuid(self, the_uuid: UUID) -> bool:
    """Return True when a record is stored under this UUID."""
    known_ids = self._uuid_artifact_map
    return str(the_uuid) in known_ids
def has_hash(self, the_hash: str) -> bool:
    """Return True when at least one record was stored with this hash."""
    known_hashes = self._hash_uuid_map
    return the_hash in known_hashes
def get_artifact_by_uuid(self, the_uuid: UUID) -> Iterable[Dict[str, str]]:
    """Yield the record stored under `the_uuid`, or nothing if absent."""
    record_key = str(the_uuid)
    if record_key in self._uuid_artifact_map:
        yield self._uuid_artifact_map[record_key]
def get_artifact_by_hash(self, the_hash: str) -> Iterable[Dict[str, str]]:
    """Yield every stored record whose hash equals `the_hash`.

    Records come out in the order their UUIDs are listed for that hash;
    an unknown hash yields nothing.
    """
    for record_key in self._hash_uuid_map.get(the_hash, []):
        yield self._uuid_artifact_map[record_key]
def insert_artifact(
    self,
    the_uuid: UUID,
    the_hash: str,
    the_artifact: Dict[str, Union[str, UUID]],
) -> bool:
    """
    Add the artifact to the database and rewrite the JSON file.

    Returns False, changing nothing, when a record with this UUID is
    already stored; returns True after a successful insert.
    """
    record_key = str(the_uuid)
    if record_key in self._uuid_artifact_map:
        return False

    # Keep a private deep copy whose "_id" is a string, so the caller's
    # dictionary is left unmodified.
    stored = copy.deepcopy(the_artifact)
    stored["_id"] = str(stored["_id"])
    self._uuid_artifact_map[record_key] = stored  # type: ignore

    self._hash_uuid_map.setdefault(the_hash, []).append(record_key)
    self._save_to_file(self._json_file)
    return True
def find_exact(
    self, attr: Dict[str, str], limit: int
) -> Iterable[Dict[str, Any]]:
    """
    Yield at most `limit` artifacts such that, for every yielded artifact
    and for every (k, v) in attr, the attribute `k` of the artifact has
    the value of `v`.

    Artifacts are visited in the iteration order of the UUID map. A
    `limit` of zero or less yields nothing.
    """
    count = 0
    for artifact in self._uuid_artifact_map.values():
        # Re-check on every iteration so the generator stops as soon as
        # `limit` matches have been produced.
        if count >= limit:
            return
        # Subset test on the items views:
        # https://docs.python.org/3/library/stdtypes.html#frozenset.issubset
        if attr.items() <= artifact.items():
            count += 1
            yield artifact
_db = None
_default_uri = "mongodb://localhost:27017"
if MONGO_SUPPORT:
_default_uri = "mongodb://localhost:27017"
else:
_default_uri = "file://db.json"
_db_schemes: Dict[str, Type[ArtifactDB]] = {"mongodb": ArtifactMongoDB}
_db_schemes: Dict[str, Type[ArtifactDB]] = {"file": ArtifactFileDB}
if MONGO_SUPPORT:
_db_schemes["mongodb"] = ArtifactMongoDB
def _getDBType(uri: str) -> Type[ArtifactDB]:
@@ -220,6 +413,10 @@ def _getDBType(uri: str) -> Type[ArtifactDB]:
Supported types:
**ArtifactMongoDB**: mongodb://...
See http://dochub.mongodb.org/core/connections for details.
**ArtifactFileDB**: file://...
A simple flat file database with optional storage for the binary
artifacts. The filepath is where the json file is stored and the
data storage can be specified with GEM5ART_STORAGE
"""
result = urlparse(uri)
if result.scheme in _db_schemes:

View File

@@ -29,12 +29,13 @@
import hashlib
from inspect import cleandoc
import os
import json
from pathlib import Path
import subprocess
import time
from typing import Any, Dict, Iterator, List, Union
from typing import Any, Dict, List, Union, Optional
from uuid import UUID, uuid4
import json
from ._artifactdb import getDBConnection
@@ -108,6 +109,19 @@ class Artifact:
6) ID: unique identifier of the artifact
7) inputs: list of the input artifacts used to create this artifact stored
as a list of uuids
Optional fields:
a) architecture: name of the ISA (e.g. x86, riscv) ("" by default)
b) size: size of the artifact in bytes (None by default)
c) is_zipped: True when the artifact must be decompressed before using,
False otherwise (False by default)
d) md5sum: the md5 checksum of the artifact, used for integrity checking
("" by default)
e) url: URL to download the artifact ("" by default)
f) supported_gem5_versions: a list of supported gem5 versions that the
artifact should be used with (an empty list by default)
g) version: version of the artifact, e.g. "v21-0" ("" by default)
h) **kwargs: other fields, values must have __str__() defined.
"""
_id: UUID
@@ -122,8 +136,19 @@ class Artifact:
cwd: Path
inputs: List["Artifact"]
# Optional fields
architecture: str
size: Optional[int]
is_zipped: bool
md5sum: str
url: str
supported_gem5_versions: List[str]
version: str
extra: Dict[str, str]
@classmethod
def registerArtifact(
def createArtifact(
cls,
command: str,
name: str,
@@ -132,15 +157,24 @@ class Artifact:
path: Union[str, Path],
documentation: str,
inputs: List["Artifact"] = [],
architecture: str = "",
size: int = None,
is_zipped: bool = False,
md5sum: str = "",
url: str = "",
supported_gem5_versions: List[str] = [],
version: str = "",
**kwargs: str,
) -> "Artifact":
"""Constructs a new artifact.
This assume either it's not in the database or it is the exact same as
when it was added to the database
"""Constructs a new artifact without using the database.
Different from registerArtifact(), this method won't use database.
As a result, this method won't check whether the artifact has
already existed in the database, as well as it won't add the artifact
to the database.
"""
_db = getDBConnection()
# Dictionary with all of the kwargs for construction.
data: Dict[str, Any] = {}
@@ -180,20 +214,75 @@ class Artifact:
data["inputs"] = [i._id for i in inputs]
if data["hash"] in _db:
old_artifact = Artifact(_db.get(data["hash"]))
data["_id"] = old_artifact._id
data["architecture"] = architecture
data["size"] = size
data["is_zipped"] = is_zipped
data["md5sum"] = md5sum
data["url"] = url
data["supported_gem5_versions"] = supported_gem5_versions[:]
data["version"] = version
data["extra"] = kwargs
data["_id"] = uuid4()
# Now that we have a complete object, construct it
self = cls(data)
return self
@classmethod
def registerArtifact(
cls,
command: str,
name: str,
cwd: str,
typ: str,
path: Union[str, Path],
documentation: str,
inputs: List["Artifact"] = [],
architecture: str = "",
size: Optional[int] = None,
is_zipped: bool = False,
md5sum: str = "",
url: str = "",
supported_gem5_versions: List[str] = [],
version: str = "",
**kwargs: str,
) -> "Artifact":
"""Constructs a new artifact and adds to the database.
This assume either it's not in the database or it is the exact same as
when it was added to the database
"""
self = cls.createArtifact(
command,
name,
cwd,
typ,
path,
documentation,
inputs,
architecture,
size,
is_zipped,
md5sum,
url,
supported_gem5_versions,
version,
**kwargs,
)
_db = getDBConnection()
if self.hash in _db:
old_artifact = Artifact(_db.get(self.hash))
self._id = old_artifact._id
# Now that we have a complete object, construct it
self = cls(data)
self._checkSimilar(old_artifact)
else:
data["_id"] = uuid4()
# Now that we have a complete object, construct it
self = cls(data)
# Upload the file if there is one.
if self.path.is_file():
_db.upload(self._id, self.path)
@@ -204,18 +293,23 @@ class Artifact:
return self
def __init__(self, other: Union[str, UUID, Dict[str, Any]]) -> None:
"""Constructs the object from the database based on a UUID or
dictionary from the database
"""Constructs an artifact object from the database based on a UUID or
dictionary from the database. Note that if the variable `other` is of
type `Dict[str, Any]`, this function will not try to establish a
connection to the database.
"""
_db = getDBConnection()
if isinstance(other, str):
other = UUID(other)
if isinstance(other, UUID):
other = _db.get(other)
if not isinstance(other, Dict):
_db = getDBConnection()
if isinstance(other, str):
other = UUID(other)
if isinstance(other, UUID):
other = _db.get(other)
if not other:
raise Exception("Cannot construct artifact")
if isinstance(other["_id"], str):
other["_id"] = UUID(other["_id"]) # type: ignore
assert isinstance(other["_id"], UUID)
self._id = other["_id"]
self.name = other["name"]
@@ -229,6 +323,35 @@ class Artifact:
self.cwd = Path(other["cwd"])
self.inputs = [Artifact(i) for i in other["inputs"]]
# Optional fields
self.architecture = other.get("architecture", "")
if "size" in other:
if isinstance(other["size"], int):
self.size = other["size"]
else:
self.size = None
self.is_zipped = bool(other.get("is_zipped", False))
self.md5sum = other.get("md5sum", "")
self.url = other.get("url", "")
self.supported_gem5_versions = []
if "supported_gem5_versions" in other:
if isinstance(other["supported_gem5_versions"], list):
self.supported_gem5_versions = other[
"supported_gem5_versions"
][:]
elif isinstance(other["supported_gem5_versions"], str):
self.supported_gem5_versions = json.loads(
other["supported_gem5_versions"]
)
self.version = other.get("version", "")
self.extra = {}
if "extra" in other:
if isinstance(other["extra"], dict):
self.extra = {k: v for k, v in other["extra"].items()}
elif isinstance(other["extra"], str):
self.extra = json.loads(other["extra"])
def __str__(self) -> str:
inputs = ", ".join([i.name + ":" + str(i._id) for i in self.inputs])
return "\n ".join(
@@ -250,6 +373,10 @@ class Artifact:
data["inputs"] = [input._id for input in self.inputs]
data["cwd"] = str(data["cwd"])
data["path"] = str(data["path"])
data["supported_gem5_versions"] = json.dumps(
self.supported_gem5_versions
)
data["extra"] = json.dumps(self.extra)
return data
def __eq__(self, other: object) -> bool:

View File

@@ -48,7 +48,6 @@ class MockDB(ArtifactDB):
self.hashes = {}
def put(self, key, metadata):
print("putting an entry in the mock database")
self.db[key] = metadata
self.hashes[metadata["hash"]] = key

View File

@@ -0,0 +1,70 @@
# Copyright (c) 2021 The Regents of the University of California
# All Rights Reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""Tests for ArtifactFileDB"""
import json
import os
from pathlib import Path
import unittest
from uuid import UUID
from gem5art.artifact import Artifact
from gem5art.artifact._artifactdb import getDBConnection
class TestArtifactFileDB(unittest.TestCase):
    """Checks that registering an artifact records it in the JSON file DB."""

    def setUp(self):
        # Open the file-backed database first. NOTE(review): presumably
        # getDBConnection caches this connection for the later
        # registerArtifact call; the assertions below rely on test.json
        # being written -- confirm against getDBConnection.
        getDBConnection("file://test.json")

        with open("test-file.txt", "w") as f:
            f.write("This is a test file.")

        self.artifact = Artifact.registerArtifact(
            name="test-artifact",
            typ="text",
            path="test-file.txt",
            cwd="./",
            command='echo "This is a test file" > test-file.txt',
            inputs=[],
            documentation="This artifact is made for testing.",
        )

    def tearDown(self):
        # Remove the files created in setUp so each test starts clean.
        os.remove("test-file.txt")
        os.remove("test.json")

    def test_init_function(self):
        """The JSON database file exists after an artifact is registered."""
        self.assertTrue(Path("test.json").exists())

    def test_json_content(self):
        """The JSON file holds exactly the registered artifact."""
        with open("test.json", "r") as f:
            artifacts = json.load(f)
        # assertEqual reports both operands when the check fails.
        self.assertEqual(len(artifacts), 1)
        artifact = artifacts[0]
        self.assertEqual(artifact["hash"], self.artifact.hash)
        self.assertEqual(UUID(artifact["_id"]), self.artifact._id)