util-gem5art: Simplify rerunning failing runs

Often, a gem5 experiment will fail for some reason. This happens
non-deterministically when fastforwarding with KVM making it more
difficult to handle.

This change allows the user to specify a function `check_failure` to
check to see if the test has failed. An example would be to open the
terminal and check to see if the kernel has panicked.

Additionally, this change adds a rerun function to rerun a particular
run that has failed.

Change-Id: Ib4a8d47c824254ae89ac9e1593ebd2710e263146
Signed-off-by: Jason Lowe-Power <jason@lowepower.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/47464
Reviewed-by: Hoa Nguyen <hoanguyen@ucdavis.edu>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Hoa Nguyen
2021-07-01 15:41:44 -07:00
committed by Jason Lowe-Power
parent d7cf6be93a
commit 43004d9506

View File

@@ -39,7 +39,7 @@ from pathlib import Path
import signal
import subprocess
import time
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
from uuid import UUID, uuid4
import zipfile
@@ -64,6 +64,7 @@ class gem5Run:
run_script_git_artifact: Artifact
params: Tuple[str, ...]
timeout: int
check_failure: Callable[["gem5Run"], bool]
gem5_name: str
script_name: str
@@ -93,6 +94,8 @@ class gem5Run:
results: Optional[Artifact]
artifacts: List[Artifact]
rerunnable: bool
@classmethod
def _create(
cls,
@@ -105,6 +108,7 @@ class gem5Run:
run_script_git_artifact: Artifact,
params: Tuple[str, ...],
timeout: int,
check_failure: Callable[["gem5Run"], bool],
) -> "gem5Run":
"""
Shared code between SE and FS when creating a run object.
@@ -119,6 +123,9 @@ class gem5Run:
run.params = params
run.timeout = timeout
# Note: Mypy doesn't support monkey patching like this
run.check_failure = check_failure # type: ignore
run._id = uuid4()
run.outdir = outdir.resolve() # ensure this is absolute
@@ -142,6 +149,8 @@ class gem5Run:
# Initially, there are no results
run.results = None
run.rerunnable = False
return run
@classmethod
@@ -156,6 +165,7 @@ class gem5Run:
run_script_git_artifact: Artifact,
*params: str,
timeout: int = 60 * 15,
check_failure: Callable[["gem5Run"], bool] = lambda run: False,
) -> "gem5Run":
"""
name is the name of the run. The name is not necessarily unique. The
@@ -186,6 +196,7 @@ class gem5Run:
run_script_git_artifact,
params,
timeout,
check_failure,
)
run.artifacts = [
@@ -230,6 +241,7 @@ class gem5Run:
disk_image_artifact: Artifact,
*params: str,
timeout: int = 60 * 15,
check_failure: Callable[["gem5Run"], bool] = lambda run: False,
) -> "gem5Run":
"""
name is the name of the run. The name is not necessarily unique. The
@@ -244,6 +256,10 @@ class gem5Run:
Further parameters can be passed via extra arguments. These
parameters will be passed in order to the gem5 run script.
check_failure is a user-defined function that will be executed
periodically (e.g., every 10 seconds) to check the health of the
simulation. When it returns True, the simulation will be killed
Note: When instantiating this class for the first time, it will create
a file `info.json` in the outdir which contains a serialized version
of this class.
@@ -259,6 +275,7 @@ class gem5Run:
run_script_git_artifact,
params,
timeout,
check_failure,
)
run.linux_binary = Path(linux_binary)
run.disk_image = Path(disk_image)
@@ -397,6 +414,10 @@ class gem5Run:
# Remove list of artifacts
del d["artifacts"]
# Doesn't make sense to serialize the user-specified fail function
if "check_failure" in d.keys():
del d["check_failure"]
# Replace the artifacts with their UUIDs
for k, v in d.items():
if isinstance(v, Artifact):
@@ -438,7 +459,7 @@ class gem5Run:
d = self._convertForJson(self._getSerializable())
return json.dumps(d)
def run(self, task: Any = None, cwd: str = ".") -> None:
def _run(self, task: Any = None, cwd: str = ".") -> None:
"""Actually run the test.
Calls Popen with the command to fork a new process.
@@ -452,11 +473,8 @@ class gem5Run:
process to run in a different directory than the running process. Note
that only the spawned process runs in the new directory.
"""
# Check if the run is already in the database
# Connect to the database
db = artifact.getDBConnection()
if self.hash in db:
print(f"Error: Have already run {self.command}. Exiting!")
return
self.status = "Begin run"
self.dumpJson("info.json")
@@ -503,6 +521,15 @@ class gem5Run:
proc.kill()
self.kill_reason = "kernel panic"
# Assigning a function/lambda to an object variable does not make
# the function/lambda become a bound one. Therefore, the
# user-defined function must pass `self` in.
# Here, mypy classifies self.check_failure() as a bound function,
# so we tell mypy to ignore it./
if self.check_failure(self): # type: ignore
proc.kill()
self.kill_reason = "User defined kill"
self.dumpJson("info.json")
# Check again in five seconds
@@ -529,6 +556,44 @@ class gem5Run:
print("Done storing the results of {}".format(" ".join(self.command)))
def run(self, task: Any = None, cwd: str = ".") -> None:
"""Actually run the test.
Calls Popen with the command to fork a new process.
Then, this function polls the process every 5 seconds to check if it
has finished or not. Each time it checks, it dumps the json info so
other applications can poll those files.
task is the celery task that is running this gem5 instance.
cwd is the directory to change to before running. This allows a server
process to run in a different directory than the running process. Note
that only the spawned process runs in the new directory.
"""
# Check if the run is already in the database
db = artifact.getDBConnection()
if self.hash in db:
print(f"Error: Have already run {self.command}. Exiting!")
return
self._run(task, cwd)
def rerun(self, task: Any = None, cwd: str = ".") -> None:
"""Rerun the test.
Calls Popen with the command to fork a new process.
Then, this function polls the process every 5 seconds to check if it
has finished or not. Each time it checks, it dumps the json info so
other applications can poll those files.
task is the celery task that is running this gem5 instance.
cwd is the directory to change to before running. This allows a server
process to run in a different directory than the running process. Note
that only the spawned process runs in the new directory.
"""
# TODO: remove the old runs?
self._run(task, cwd)
def saveResults(self) -> None:
"""Zip up the output directory and store the results in the
database."""
@@ -616,3 +681,20 @@ def getRunsByNameLike(
for run in fsruns:
yield gem5Run.loadFromDict(run)
def getRerunnableRunsByNameLike(
db: ArtifactDB, name: str, fs_only: bool = False, limit: int = 0
) -> Iterable[gem5Run]:
"""Returns a generator of gem5Run objects having rerunnable as true
and the object "name" containing the name parameter as a substring. The
parameter is case sensitive.
If fs_only is True, then only full system runs will be returned.
Limit specifies the maximum number of runs to return.
"""
for run in getRunsByNameLike(db, name, fs_only, limit):
if run.rerunnable:
yield run