# Copyright (c) 2023 The Regents of the University of California
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import base64
import json
import os

import requests
from jsonschema import validate


class ResourceJsonCreator:
    """
    This class generates the JSON which is pushed onto MongoDB.
    On a high-level, it does the following:
        - Adds certain fields to the JSON.
        - Populates those fields.
        - Makes sure the JSON follows the schema.
    """

    # Global Variables
    base_url = "https://github.com/gem5/gem5/tree/develop"  # gem5 GitHub URL
    # Maps a gem5 version key to the URL of that version's resources.json.
    # The two googlesource URLs return base64 text (``?format=TEXT``); the
    # resources.gem5.org URLs return plain JSON.
    # The file name of each archived JSON carries the version it belongs to
    # (resources-22-0.json is for 22.0, resources-21-2.json is for 21.2).
    resource_url_map = {
        "dev": (
            "https://gem5.googlesource.com/public/gem5-resources/+/refs/heads/"
            "develop/resources.json?format=TEXT"
        ),
        "22.1": (
            "https://gem5.googlesource.com/public/gem5-resources/+/refs/heads/"
            "stable/resources.json?format=TEXT"
        ),
        "22.0": (
            "http://resources.gem5.org/prev-resources-json/"
            "resources-22-0.json"
        ),
        "21.2": (
            "http://resources.gem5.org/prev-resources-json/"
            "resources-21-2.json"
        ),
    }

    def __init__(self):
        """
        Load the JSON schema from ``schema/schema.json``.

        The path is relative to the current working directory.
        """
        self.schema = {}
        with open("schema/schema.json", "r") as f:
            self.schema = json.load(f)

    def _get_file_data(self, url):
        """
        Download ``url`` and parse it as JSON.

        The body is first treated as base64-encoded JSON; if that fails it
        is parsed as plain JSON instead.

        :param url: URL of the resources JSON
        :returns: The parsed JSON document
        """
        response = requests.get(url)
        try:
            decoded = base64.b64decode(response.text).decode("utf-8")
            return json.loads(decoded)
        except ValueError:
            # binascii.Error, UnicodeDecodeError and json.JSONDecodeError
            # are all ValueError subclasses: the body was not base64 JSON,
            # so fall back to parsing the same response as plain JSON.
            return response.json()

    def _get_size(self, url):
        """
        Helper function to return the size of a download through its URL.
        Returns 0 if URL has an error.

        :param url: Download URL
        :returns: Value of the ``content-length`` header as an int, or 0
        """
        try:
            response = requests.head(url)
            return int(response.headers.get("content-length", 0))
        except (requests.RequestException, ValueError):
            # Best effort: an unreachable URL or a malformed header simply
            # means the size is reported as 0.
            return 0

    def _search_folder(self, folder_path, id):
        """
        Helper function to find the instance of a string in a folder.
        This is recursive, i.e., subfolders will also be searched.

        :param folder_path: Path to the folder to begin searching
        :param id: Phrase to search in the folder
        :returns matching_files: List of file paths to the files containing id
        """
        matching_files = []
        for entry in os.listdir(folder_path):
            entry_path = os.path.join(folder_path, entry)
            if os.path.isfile(entry_path):
                # Undecodable bytes are ignored so binary files do not
                # abort the search.
                with open(
                    entry_path, "r", encoding="utf-8", errors="ignore"
                ) as f:
                    contents = f.read()
                if id in contents:
                    # Normalise Windows separators to forward slashes.
                    matching_files.append(entry_path.replace("\\", "/"))
            elif os.path.isdir(entry_path):
                matching_files.extend(self._search_folder(entry_path, id))
        return matching_files

    @staticmethod
    def _set_root_partition(resource):
        """
        Set ``resource["root_partition"]`` from its ``additional_metadata``.

        Falls back to an empty string when the metadata is absent or its
        ``root_partition`` is missing or None.
        """
        additional_metadata = resource.get("additional_metadata") or {}
        root_partition = additional_metadata.get("root_partition")
        resource["root_partition"] = (
            "" if root_partition is None else root_partition
        )

    def _change_type(self, resource):
        """
        Rewrite ``resource["type"]`` in place based on its other fields.

        Workloads keep their type and gain an ``architecture`` field.  Other
        resources are classified, in order, by their name (kernel,
        bootloader), their documentation (benchmark), their URL
        (``.img.gz``) and again their documentation (binary, checkpoint,
        simpoint).  Resources matching none of these keep their type.

        :param resource: Resource dictionary; modified in place
        :returns: The same ``resource`` dictionary
        """
        if resource["type"] == "workload":
            # get the architecture from the name and remove 64 from it
            resource["architecture"] = (
                resource["name"].split("-")[0].replace("64", "").upper()
            )
            return resource

        # "url" is optional (see _add_fields), so a missing key is treated
        # the same as an explicit None.
        url = resource.get("url")

        if "kernel" in resource["name"]:
            resource["type"] = "kernel"
        elif "bootloader" in resource["name"]:
            resource["type"] = "bootloader"
        elif "benchmark" in resource["documentation"]:
            resource["type"] = "disk-image"
            if "tags" not in resource:
                resource["tags"] = []
            resource["tags"].append("benchmark")
            self._set_root_partition(resource)
        elif url is not None and ".img.gz" in url:
            resource["type"] = "disk-image"
            self._set_root_partition(resource)
        elif "binary" in resource["documentation"]:
            resource["type"] = "binary"
        elif "checkpoint" in resource["documentation"]:
            resource["type"] = "checkpoint"
        elif "simpoint" in resource["documentation"]:
            resource["type"] = "simpoint"
        return resource

    def _extract_code_examples(self, resource, source):
        """
        Build the list of code examples for a single resource.

        It finds which files under ``<source>/configs`` contain the
        resource's ID in double quotes.  For each such file, it records the
        file path with ``source`` replaced by the gem5 GitHub URL under
        "example", and records under "tested" whether the file's base name
        appears in any file under ``<source>/tests/gem5``.

        :param resource: Resource dictionary; its "id" key is read.
        :param source: Path to gem5
        :returns: List of ``{"example": ..., "tested": ...}`` dictionaries.
        """
        resource_id = resource["id"]

        # search for files in the folder tree that contain the 'id' value
        matching_files = self._search_folder(
            source + "/configs", '"' + resource_id + '"'
        )

        code_examples = []
        for path in matching_files:
            references = self._search_folder(
                source + "/tests/gem5", os.path.basename(path)
            )
            code_examples.append(
                {
                    "example": path.replace(source, self.base_url),
                    "tested": bool(references),
                }
            )
        return code_examples

    def unwrap_resources(self, ver):
        """
        Download the resources JSON for ``ver`` and flatten its groups.

        :param ver: Key of ``resource_url_map`` selecting the JSON to fetch
        :returns: List of resources in which every resource of type
                  "group" is replaced by the entries of its "contents".
        """
        data = self._get_file_data(self.resource_url_map[ver])
        new_resources = []
        for resource in data["resources"]:
            if resource["type"] == "group":
                new_resources.extend(resource["contents"])
            else:
                new_resources.append(resource)
        return new_resources

    def _get_example_usage(self, resource):
        """
        Return the code snippet showing how the resource is obtained.

        :param resource: Resource dictionary with "category" and "id" keys
        :returns: ``Workload("<id>")`` for workloads, otherwise
                  ``obtain_resource(resource_id="<id>")``
        """
        resource_id = resource["id"]
        if resource["category"] == "workload":
            return f'Workload("{resource_id}")'
        return f'obtain_resource(resource_id="{resource_id}")'

    def _parse_readme(self, url):
        """
        Extract tags, author and license from a README's front matter.

        The front matter is the text between the first two ``---`` markers.
        This is best effort: if the README cannot be fetched or has no
        front matter, the fields parsed so far (or the empty defaults) are
        returned.

        :param url: URL of the raw README; an empty string skips the fetch
        :returns: Dictionary with "tags" (list), "author" (list) and
                  "license" (str)
        """
        metadata = {
            "tags": [],
            "author": [],
            "license": "",
        }
        if not url:
            # No README to look at; avoid a request that can only fail.
            return metadata

        try:
            content = requests.get(url).text
            content = content.split("---")[1]

            if "tags:" in content:
                tags = content.split("tags:\n")[1]
                # Everything up to the next "key:"; the last line of that
                # span is the next key's name (or an empty tail), so it is
                # dropped.
                tags = tags.split(":")[0]
                tags = tags.split("\n")[:-1]
                tags = [tag.strip().replace("- ", "") for tag in tags]
                if tags == [""]:
                    tags = []
                metadata["tags"] = tags

            if "author:" in content:
                author = content.split("author:")[1].split("\n")[0]
                for char in ("[", "]", '"'):
                    author = author.replace(char, "")
                metadata["author"] = [a.strip() for a in author.split(",")]

            if "license:" in content:
                # NOTE: the value is taken verbatim (not stripped).
                license_text = content.split("license:")[1].split("\n")[0]
                metadata["license"] = license_text
        except (requests.RequestException, IndexError):
            # Unreachable README, or front matter not in the expected
            # layout: keep whatever was parsed so far.
            pass
        return metadata

    def _add_fields(self, resources, source):
        """
        Convert raw resources into the format checked against the schema.

        For each resource this renames type/name/documentation to
        category/id/description, flattens ``additional_metadata`` into the
        resource, and adds version, example usage, source URL, README
        metadata, code examples and download size.  Keys whose value is
        None are removed from the result.  The input dictionaries are
        modified in place.

        :param resources: List of resource dictionaries
        :param source: Path to gem5
        :returns: List of the converted resource dictionaries
        """
        new_resources = []
        for resource in resources:
            res = self._change_type(resource)
            res["gem5_versions"] = ["23.0"]
            res["resource_version"] = "1.0.0"
            res["category"] = res.pop("type")
            res["id"] = res.pop("name")
            res["description"] = res.pop("documentation")

            if "additional_metadata" in res:
                # NOTE(review): this can overwrite the "" root_partition
                # set by _change_type with None, which the None filter
                # below then drops -- confirm that is intended.
                res.update(res.pop("additional_metadata"))

            res["example_usage"] = self._get_example_usage(res)

            # A None source is treated like a missing one so that no
            # ".../None" URLs are generated.
            if res.get("source") is not None:
                source_path = str(res["source"])
                url = (
                    "https://raw.githubusercontent.com/gem5/"
                    "gem5-resources/develop/" + source_path + "/README.md"
                )
                res["source_url"] = (
                    "https://github.com/gem5/gem5-resources/tree/develop/"
                    + source_path
                )
            else:
                url = ""
                res["source_url"] = ""

            metadata = self._parse_readme(url)
            if "tags" in res:
                res["tags"].extend(metadata["tags"])
            else:
                res["tags"] = metadata["tags"]
            res["author"] = metadata["author"]
            res["license"] = metadata["license"]

            res["code_examples"] = self._extract_code_examples(res, source)

            # "url" may be absent or None (see _change_type); both mean
            # there is nothing to download.
            download_url = res.get("url")
            if download_url is not None:
                download_url = download_url.replace(
                    "{url_base}", "http://dist.gem5.org/dist/develop"
                )
                res["url"] = download_url
                res["size"] = self._get_size(download_url)
            else:
                res["size"] = 0

            res = {k: v for k, v in res.items() if v is not None}
            new_resources.append(res)
        return new_resources

    def _validate_schema(self, resources):
        """
        Validate every resource against the loaded schema.

        The first offending resource is printed and the validation error
        is re-raised.

        :param resources: List of resource dictionaries
        """
        for resource in resources:
            try:
                validate(resource, schema=self.schema)
            except Exception:
                print(resource)
                raise

    def create_json(self, version, source, output):
        """
        Generate the resources JSON for ``version`` and write it to disk.

        :param version: Key of ``resource_url_map`` selecting the input JSON
        :param source: Path to gem5
        :param output: Path of the JSON file to write
        """
        resources = self.unwrap_resources(version)
        resources = self._add_fields(resources, source)
        self._validate_schema(resources)
        with open(output, "w") as f:
            json.dump(resources, f, indent=4)