371 lines
14 KiB
Python
371 lines
14 KiB
Python
# Copyright (c) 2021-2023 The Regents of the University of California
|
|
# All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are
|
|
# met: redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer;
|
|
# redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# documentation and/or other materials provided with the distribution;
|
|
# neither the name of the copyright holders nor the names of its
|
|
# contributors may be used to endorse or promote products derived from
|
|
# this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
import urllib.request
|
|
import urllib.parse
|
|
import os
|
|
import shutil
|
|
import gzip
|
|
import time
|
|
import random
|
|
from pathlib import Path
|
|
import tarfile
|
|
from urllib.error import HTTPError
|
|
from typing import List, Optional, Dict
|
|
|
|
from _m5 import core
|
|
|
|
from .client import (
|
|
get_resource_json_obj,
|
|
list_resources as client_list_resources,
|
|
)
|
|
from .md5_utils import md5_file, md5_dir
|
|
from ..utils.progress_bar import tqdm, progress_hook
|
|
|
|
from ..utils.filelock import FileLock
|
|
|
|
"""
|
|
This Python module contains functions used to download, list, and obtain
|
|
information about resources from resources.gem5.org.
|
|
"""
|
|
|
|
|
|
def _download(url: str, download_to: str, max_attempts: int = 6) -> None:
    """
    Downloads a file.

    The function will run a Truncated Exponential Backoff algorithm to retry
    the download if the HTTP Status Code returned is deemed retryable (408,
    429, or 5xx) or if the connection is reset (`ConnectionResetError` with
    errno 104).

    If the `GEM5_USE_PROXY` environment variable is set, it must have the
    form `<host>:<port>` and the download is routed through a SOCKS5 proxy at
    that address.

    :param url: The URL of the file to download.

    :param download_to: The location the downloaded file is to be stored.

    :param max_attempts: The max number of download attempts before stopping.
    The default is 6. This translates to roughly 1 minute of retrying before
    stopping.

    :raises Exception: If `GEM5_USE_PROXY` is malformed, if the SOCKS client
    module cannot be imported, or if `max_attempts` retryable failures have
    occurred. Non-retryable errors raised by the download itself are
    propagated unchanged.
    """

    # TODO: This whole setup will only work for single files we can get via
    # wget. We also need to support git clones going forward.

    # Check to see if the user requests a proxy connection. The proxy is
    # configured once, before the retry loop, so that configuration errors are
    # reported as such and are never confused with errors from the download.
    use_proxy = os.getenv("GEM5_USE_PROXY")
    ssl_context = _configure_socks5_proxy(use_proxy) if use_proxy else None

    attempt = 0
    while True:
        # The loop will be broken on a successful download, via a `return`, or
        # if an exception is raised. An exception will be raised if the maximum
        # number of download attempts has been reached or if a HTTP status code
        # other than 408, 429, or 5xx is received.
        try:
            if use_proxy:
                _download_via_proxy(url, download_to, ssl_context)
            else:
                with tqdm(
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    miniters=1,
                    desc=f"Downloading {download_to}",
                ) as t:
                    urllib.request.urlretrieve(
                        url, download_to, reporthook=progress_hook(t)
                    )
            return
        except HTTPError as e:
            # We consider HTTP status codes 408, 429, and 5xx as retryable.
            # If any other is retrieved we re-raise the error.
            if not (e.code in (408, 429) or 500 <= e.code < 600):
                raise
            attempt += 1
            _backoff_or_give_up(
                attempt,
                max_attempts,
                f"HTTP Status Code retrieved: {e.code}",
                e,
            )
        except ConnectionResetError as e:
            # This catches the ConnectionResetError we occasionally see when
            # accessing resources on GitHub Actions. Any other errno is
            # re-raised.
            if e.errno != 104:
                raise
            attempt += 1
            _backoff_or_give_up(
                attempt,
                max_attempts,
                f"OS Error Code retrieved: {e.errno}",
                e,
            )


def _configure_socks5_proxy(use_proxy: str):
    """
    Routes all new sockets through the SOCKS5 proxy described by `use_proxy`.

    :param use_proxy: The proxy address, in the form `<host>:<port>`.

    :returns: The SSL context to pass to `urllib.request.urlopen` for
    downloads made through the proxy.

    :raises Exception: If `use_proxy` is not of the form `<host>:<port>` or if
    the `socks` module cannot be imported.
    """
    # The address is validated before anything is imported or patched so a
    # malformed value has no side effects.
    try:
        ip_addr, host_port = use_proxy.split(":")
        port = int(host_port)
    except ValueError as e:
        raise Exception(
            f"ValueError: {e}\n"
            "Environment variable GEM5_USE_PROXY is set to "
            f"'{use_proxy}'. The expected form is "
            "'<host>:<port>'."
        ) from e

    try:
        import socks
    except ImportError as e:
        raise Exception(
            f"ImportError: {e}\n"
            "An import error has occurred. This is likely due "
            "the Python SOCKS client module not being "
            "installed. It can be installed with "
            "`pip install PySocks`."
        ) from e

    import socket
    import ssl

    socks.set_default_proxy(socks.SOCKS5, ip_addr, port)
    # This replaces the socket class for the whole process, not just for this
    # download.
    socket.socket = socks.socksocket

    # Base SSL context for the https connection.
    # NOTE(review): hostname checking and certificate verification are
    # switched off here, so the peer is not authenticated when a proxy is in
    # use. Kept as-is to preserve behavior -- confirm this is still required.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    return ctx


def _download_via_proxy(url: str, download_to: str, ssl_context) -> None:
    """
    Streams `url` to `download_to` using `ssl_context`, updating a progress
    bar as data is written.
    """
    request = urllib.request.Request(url)
    with urllib.request.urlopen(request, context=ssl_context) as fr:
        # The output file is opened in its own context manager:
        # `tqdm.wrapattr` only wraps the stream, it does not close it.
        with open(download_to, "wb") as raw, tqdm.wrapattr(
            raw,
            "write",
            miniters=1,
            desc=f"Downloading {download_to}",
            total=getattr(fr, "length", None),
        ) as fw:
            for chunk in fr:
                fw.write(chunk)


def _backoff_or_give_up(
    attempt: int, max_attempts: int, detail: str, cause: BaseException
) -> None:
    """
    Sleeps before the next download attempt, or raises an `Exception` chained
    to `cause` if `attempt` has reached `max_attempts`.
    """
    if attempt >= max_attempts:
        raise Exception(
            f"After {attempt} attempts, the resource json could "
            f"not be retrieved. {detail}"
        ) from cause
    # Exponential backoff with up to one second of random jitter.
    time.sleep((2**attempt) + random.uniform(0, 1))
|
|
|
|
|
|
def list_resources(
    clients: Optional[List] = None, gem5_version: Optional[str] = None
) -> Dict[str, List[str]]:
    """
    Lists all available resources.

    **Note**: This function is kept here for legacy reasons. `list_resources`
    was originally implemented in this module; to remain backwards compatible
    it now simply forwards its arguments to `client_list_resources` and
    returns the result.

    :param clients: A list of clients to use when listing resources. If None,
    all clients will be used. None by default.

    :param gem5_version: The gem5 version with which all listed resources
    should be compatible. If None, compatibility is not considered and all
    resources will be returned.

    :returns: A dictionary where each key is the id of a resource and each
    value is a list of that resource's versions.
    """
    forwarded = {"clients": clients, "gem5_version": gem5_version}
    return client_list_resources(**forwarded)
|
|
|
|
|
|
def get_resource(
    resource_name: str,
    to_path: str,
    unzip: bool = True,
    untar: bool = True,
    download_md5_mismatch: bool = True,
    resource_version: Optional[str] = None,
    clients: Optional[List] = None,
    gem5_version: Optional[str] = core.gem5Version,
) -> None:
    """
    Obtains a gem5 resource and stores it to a specified location. If the
    specified resource is already at the location, no action is taken.

    :param resource_name: The resource to be obtained.

    :param to_path: The location in the file system the resource is to be
    stored. The filename should be included.

    :param unzip: If true, gzipped resources will be unzipped prior to saving
    to `to_path`. True by default.

    :param untar: If true, tar archive resources will be unpacked prior to
    saving to `to_path`. True by default.

    :param download_md5_mismatch: If a resource is present with an incorrect
    hash (e.g., an outdated version of the resource is present), `get_resource`
    will delete this local resource and re-download it if this parameter is
    True. True by default.

    :param resource_version: The version of the resource to be obtained. If
    None, the latest version of the resource compatible with the working
    directory's gem5 version will be obtained. None by default.

    :param clients: A list of clients to use when obtaining the resource. If
    None, all clients will be used. None by default.

    :param gem5_version: The gem5 version to use when obtaining the resource.
    By default, the version of gem5 being used is used. This is used primarily
    for testing purposes.

    :raises Exception: An exception is thrown if a file or directory is
    already present at `to_path`, it does not have the correct md5 sum, and
    `download_md5_mismatch` is False. An exception is also thrown if the
    resource's 'is_zipped' field is neither a string nor a boolean, or if a
    member of a downloaded tar archive would be extracted outside of the
    target directory.
    """

    # We apply a lock for a specific resource. This is to avoid circumstances
    # where multiple instances of gem5 are running and trying to obtain the
    # same resources at once. The timeout here is somewhat arbitrarily put at
    # 15 minutes. Most resources should be downloaded and decompressed in this
    # timeframe, even on the most constrained of systems.
    with FileLock(f"{to_path}.lock", timeout=900):
        resource_json = get_resource_json_obj(
            resource_name,
            resource_version=resource_version,
            clients=clients,
            gem5_version=gem5_version,
        )

        if os.path.exists(to_path):
            if os.path.isfile(to_path):
                md5 = md5_file(Path(to_path))
            else:
                md5 = md5_dir(Path(to_path))

            if md5 == resource_json["md5sum"]:
                # In this case, the file has already been downloaded, no need
                # to do so again.
                return
            elif download_md5_mismatch:
                # The local copy is stale or corrupt: remove it so that it can
                # be downloaded afresh below.
                if os.path.isfile(to_path):
                    os.remove(to_path)
                else:
                    shutil.rmtree(to_path)
            else:
                raise Exception(
                    "There already a file present at '{}' but "
                    "its md5 value is invalid.".format(to_path)
                )

        download_dest = to_path

        # This if-statement is to remain backwards compatible with the older,
        # string-based way of doing things. It can be refactored away over
        # time:
        # https://gem5-review.googlesource.com/c/public/gem5-resources/+/51168
        run_unzip = False
        if "is_zipped" in resource_json:
            if isinstance(resource_json["is_zipped"], str):
                run_unzip = (
                    unzip and resource_json["is_zipped"].lower() == "true"
                )
            elif isinstance(resource_json["is_zipped"], bool):
                run_unzip = unzip and resource_json["is_zipped"]
            else:
                raise Exception(
                    "The resource.json entry for '{}' has a value for the "
                    "'is_zipped' field which is neither a string or a boolean.".format(
                        resource_name
                    )
                )

        run_tar_extract = (
            untar
            and "is_tar_archive" in resource_json
            and resource_json["is_tar_archive"]
        )

        # The download is first stored under `to_path` plus the extensions of
        # the processing steps still to be applied. Each step below strips its
        # own extension again, so the final result ends up at `to_path`.
        tar_extension = ".tar"
        if run_tar_extract:
            download_dest += tar_extension

        zip_extension = ".gz"
        if run_unzip:
            download_dest += zip_extension

        # TODO: Might be nice to have some kind of download status bar here.
        # TODO: There might be a case where this should be silenced.
        print(
            "Resource '{}' was not found locally. Downloading to '{}'...".format(
                resource_name, download_dest
            )
        )

        # Get the URL.
        url = resource_json["url"]

        _download(url=url, download_to=download_dest)
        print(f"Finished downloading resource '{resource_name}'.")

        if run_unzip:
            print(
                f"Decompressing resource '{resource_name}' ('{download_dest}')..."
            )
            unzip_to = download_dest[: -len(zip_extension)]
            with gzip.open(download_dest, "rb") as f:
                with open(unzip_to, "wb") as o:
                    shutil.copyfileobj(f, o)
            os.remove(download_dest)
            download_dest = unzip_to
            print(f"Finished decompressing resource '{resource_name}'.")

        if run_tar_extract:
            print(
                f"Unpacking the the resource '{resource_name}' "
                f"('{download_dest}')"
            )
            unpack_to = download_dest[: -len(tar_extension)]
            with tarfile.open(download_dest) as f:
                _safe_extract(f, unpack_to)
            os.remove(download_dest)


def _is_within_directory(directory: str, target: str) -> bool:
    """
    Returns True if `target` is `directory` itself or is located inside it.
    """
    abs_directory = os.path.abspath(directory)
    abs_target = os.path.abspath(target)

    # `os.path.commonpath` compares whole path components. A character-wise
    # prefix check would wrongly accept a sibling such as '/x/foo_evil' as
    # being inside '/x/foo'.
    try:
        common = os.path.commonpath([abs_directory, abs_target])
    except ValueError:
        # Raised when the paths cannot be compared (e.g., different drives),
        # in which case `target` cannot be inside `directory`.
        return False

    return common == abs_directory


def _safe_extract(
    tar: tarfile.TarFile, path=".", members=None, *, numeric_owner=False
) -> None:
    """
    Extracts `tar` to `path`, refusing archives with members whose names
    resolve to a location outside of `path`.

    :raises Exception: If a member's path would be outside of `path`.
    """
    # NOTE(review): only member names are validated here; the targets of
    # symbolic or hard links inside the archive are not checked.
    for member in tar.getmembers():
        member_path = os.path.join(path, member.name)
        if not _is_within_directory(path, member_path):
            raise Exception("Attempted Path Traversal in Tar File")

    tar.extractall(path, members, numeric_owner=numeric_owner)
|