Message ID | 20241101205446.41310-1-crwood@redhat.com |
---|---|
State | New |
Headers | show |
Series | rteval: Print useful exception info and exit on missing measurement tool | expand |
On Fri, 1 Nov 2024, Crystal Wood wrote: > Currently, if cyclictest or rtla (whichever rteval is configured to use) > is missing, the error output only says "measurement threads did not use > the full time slot" and then rteval hangs. > > Avoid catching these exceptions so that they will be printed out, > pointing to the actual problem. Also set up a thread exception hook to > do a prompt and (relatively) graceful exit if a thread has any uncaught > exception (this requires python >= 3.8). This can happen when developers run from source. When people install from packages these dependencies are taken care of. The hang is especially bad, so I like that your patch exits gracefully. However, we are really throwing an exception because of one reason, either cyclictest or rtla timerlat are not available on the machine, or at least not in a location that rteval knows about. In this case we should report that this is the case instead of printing an exception / traceback which looks like a problem with the software to the enduser. I would only print a traceback if the exception is one that we're not aware of. A few comments on the code follows. > > Signed-off-by: Crystal Wood <crwood@redhat.com> > --- > README | 2 +- > rteval/__init__.py | 12 ++++++++++++ > rteval/modules/__init__.py | 14 +++++++++++--- > rteval/modules/measurement/cyclictest.py | 14 +++++--------- > rteval/modules/measurement/timerlat.py | 13 +++++-------- > 5 files changed, 34 insertions(+), 21 deletions(-) > > diff --git a/README b/README > index b352d7f..19704b4 100644 > --- a/README > +++ b/README > @@ -16,7 +16,7 @@ The rteval source may be pulled from it's git tree on kernel.org: > > Rteval requires the following packages to run: > > -Python >= 3.0 > +Python >= 3.8 > http://www.python.org/download/ > > python-lxml > diff --git a/rteval/__init__.py b/rteval/__init__.py > index 7c13e84..6097ddf 100644 > --- a/rteval/__init__.py > +++ b/rteval/__init__.py > @@ -19,6 +19,7 @@ import threading > import time > from datetime import datetime > import sysconfig > +from traceback import format_exception > from rteval.modules.loads import LoadModules > from rteval.modules.measurement import MeasurementModules > from rteval.rtevalReport import rtevalReport > @@ -29,6 +30,7 @@ from rteval import version > RTEVAL_VERSION = version.RTEVAL_VERSION > > EARLYSTOP = False > +threaderr = False If this is a global we should follow the convention and capitalize it. Yes there is legacy code in rteval that doesn't follow the convention. Even better, does this have to be a global? Would perhaps a class variable work here instead? > > stopsig = threading.Event() > def sig_handler(signum, frame): > @@ -39,9 +41,17 @@ def sig_handler(signum, frame): > else: > raise RuntimeError(f"SIGNAL received! ({signum})") > > +def except_hook(args): > + global threaderr > + > + threading.__excepthook__(args) > + threaderr = True > + stopsig.set() > + > class RtEval(rtevalReport): > def __init__(self, config, loadmods, measuremods, logger): > self.__version = RTEVAL_VERSION > + threading.excepthook = except_hook > > if not isinstance(config, rtevalConfig.rtevalConfig): > raise TypeError("config variable is not an rtevalConfig object") > @@ -237,6 +247,8 @@ class RtEval(rtevalReport): > global EARLYSTOP > rtevalres = 0 > measure_start = self.__RunMeasurement() > + if threaderr: > + return 1 > > self._report(measure_start, self.__rtevcfg.xslt_report) > if self.__rtevcfg.sysreport: > diff --git a/rteval/modules/__init__.py b/rteval/modules/__init__.py > index eb29db8..9827651 100644 > --- a/rteval/modules/__init__.py > +++ b/rteval/modules/__init__.py > @@ -124,8 +124,8 @@ class rtevalModulePrototype(threading.Thread): > > def WaitForCompletion(self, wtime=None): > """ Blocks until the module has completed its workload """ > - if not self.shouldStart(): > - # If it hasn't been started yet, nothing to wait for > + if self.hadRuntimeError() or not self.shouldStart(): > + # If it failed or hasn't been started yet, nothing to wait for > return None > return self.__events["finished"].wait(wtime) > > @@ -175,7 +175,7 @@ class rtevalModulePrototype(threading.Thread): > return self._donotrun is False > > > - def run(self): > + def __run(self): > "Workload thread runner - takes care of keeping the workload running as long as needed" > if self.shouldStop(): > return > @@ -215,6 +215,12 @@ class rtevalModulePrototype(threading.Thread): > > self._WorkloadCleanup() > > + def run(self): > + try: > + self.__run() > + except Exception as e: > + self._setRuntimeError() > + raise e > > def MakeReport(self): > """ required module method, needs to return an libxml2.xmlNode object > @@ -532,6 +538,8 @@ class RtEvalModules: > rep_n = libxml2.newNode(self._report_tag) > > for (modname, mod) in self.__modules: > + if mod.hadRuntimeError(): > + continue > self._logger.log(Log.DEBUG, f"Getting report from {modname}") > modrep_n = mod.MakeReport() > if modrep_n is not None: > diff --git a/rteval/modules/measurement/cyclictest.py b/rteval/modules/measurement/cyclictest.py > index 2e8f6f1..3d25c20 100644 > --- a/rteval/modules/measurement/cyclictest.py > +++ b/rteval/modules/measurement/cyclictest.py > @@ -285,15 +285,11 @@ class Cyclictest(rtevalModulePrototype): > fp.flush() > > self.__cyclicoutput.seek(0) > - try: > - self.__cyclicprocess = subprocess.Popen(self.__cmd, > - stdout=self.__cyclicoutput, > - stderr=self.__nullfp, > - stdin=self.__nullfp) > - self.__started = True > - except OSError: > - self.__started = False > - > + self.__cyclicprocess = subprocess.Popen(self.__cmd, > + stdout=self.__cyclicoutput, > + stderr=self.__nullfp, > + stdin=self.__nullfp) > + self.__started = True > > def WorkloadAlive(self): > if self.__started: > diff --git a/rteval/modules/measurement/timerlat.py b/rteval/modules/measurement/timerlat.py > index 92bc070..df42777 100644 > --- a/rteval/modules/measurement/timerlat.py > +++ b/rteval/modules/measurement/timerlat.py > @@ -252,14 +252,11 @@ class Timerlat(rtevalModulePrototype): > > self.__timerlat_out.seek(0) > self.__timerlat_err.seek(0) > - try: > - self.__timerlat_process = subprocess.Popen(self.__cmd, > - stdout=self.__timerlat_out, > - stderr=self.__timerlat_err, > - stdin=None) > - self.__started = True > - except OSError: > - self.__started = False > + self.__timerlat_process = subprocess.Popen(self.__cmd, > + stdout=self.__timerlat_out, > + stderr=self.__timerlat_err, > + stdin=None) > + self.__started = True > > def WorkloadAlive(self): > if self.__started: > -- > 2.47.0 > > >
On Fri, 1 Nov 2024, Crystal Wood wrote: > Currently, if cyclictest or rtla (whichever rteval is configured to use) > is missing, the error output only says "measurement threads did not use > the full time slot" and then rteval hangs. > > Avoid catching these exceptions so that they will be printed out, > pointing to the actual problem. Also set up a thread exception hook to > do a prompt and (relatively) graceful exit if a thread has any uncaught > exception (this requires python >= 3.8). > > Signed-off-by: Crystal Wood <crwood@redhat.com> > --- > README | 2 +- > rteval/__init__.py | 12 ++++++++++++ > rteval/modules/__init__.py | 14 +++++++++++--- > rteval/modules/measurement/cyclictest.py | 14 +++++--------- > rteval/modules/measurement/timerlat.py | 13 +++++-------- > 5 files changed, 34 insertions(+), 21 deletions(-) > > diff --git a/README b/README > index b352d7f..19704b4 100644 > --- a/README > +++ b/README > @@ -16,7 +16,7 @@ The rteval source may be pulled from it's git tree on kernel.org: > > Rteval requires the following packages to run: > > -Python >= 3.0 > +Python >= 3.8 > http://www.python.org/download/ > > python-lxml > diff --git a/rteval/__init__.py b/rteval/__init__.py > index 7c13e84..6097ddf 100644 > --- a/rteval/__init__.py > +++ b/rteval/__init__.py > @@ -19,6 +19,7 @@ import threading > import time > from datetime import datetime > import sysconfig > +from traceback import format_exception > from rteval.modules.loads import LoadModules > from rteval.modules.measurement import MeasurementModules > from rteval.rtevalReport import rtevalReport > @@ -29,6 +30,7 @@ from rteval import version > RTEVAL_VERSION = version.RTEVAL_VERSION > > EARLYSTOP = False > +threaderr = False > > stopsig = threading.Event() > def sig_handler(signum, frame): > @@ -39,9 +41,17 @@ def sig_handler(signum, frame): > else: > raise RuntimeError(f"SIGNAL received! ({signum})") > > +def except_hook(args): > + global threaderr > + > + threading.__excepthook__(args) > + threaderr = True > + stopsig.set() > + > class RtEval(rtevalReport): > def __init__(self, config, loadmods, measuremods, logger): > self.__version = RTEVAL_VERSION > + threading.excepthook = except_hook > > if not isinstance(config, rtevalConfig.rtevalConfig): > raise TypeError("config variable is not an rtevalConfig object") > @@ -237,6 +247,8 @@ class RtEval(rtevalReport): > global EARLYSTOP > rtevalres = 0 > measure_start = self.__RunMeasurement() > + if threaderr: > + return 1 > > self._report(measure_start, self.__rtevcfg.xslt_report) > if self.__rtevcfg.sysreport: > diff --git a/rteval/modules/__init__.py b/rteval/modules/__init__.py > index eb29db8..9827651 100644 > --- a/rteval/modules/__init__.py > +++ b/rteval/modules/__init__.py > @@ -124,8 +124,8 @@ class rtevalModulePrototype(threading.Thread): > > def WaitForCompletion(self, wtime=None): > """ Blocks until the module has completed its workload """ > - if not self.shouldStart(): > - # If it hasn't been started yet, nothing to wait for > + if self.hadRuntimeError() or not self.shouldStart(): > + # If it failed or hasn't been started yet, nothing to wait for > return None > return self.__events["finished"].wait(wtime) > > @@ -175,7 +175,7 @@ class rtevalModulePrototype(threading.Thread): > return self._donotrun is False > > > - def run(self): > + def __run(self): > "Workload thread runner - takes care of keeping the workload running as long as needed" > if self.shouldStop(): > return > @@ -215,6 +215,12 @@ class rtevalModulePrototype(threading.Thread): > > self._WorkloadCleanup() > > + def run(self): > + try: > + self.__run() > + except Exception as e: > + self._setRuntimeError() > + raise e > > def MakeReport(self): > """ required module method, needs to return an libxml2.xmlNode object > @@ -532,6 +538,8 @@ class RtEvalModules: > rep_n = libxml2.newNode(self._report_tag) > > for (modname, mod) in self.__modules: > + if mod.hadRuntimeError(): > + continue > self._logger.log(Log.DEBUG, f"Getting report from {modname}") > modrep_n = mod.MakeReport() > if modrep_n is not None: > diff --git a/rteval/modules/measurement/cyclictest.py b/rteval/modules/measurement/cyclictest.py > index 2e8f6f1..3d25c20 100644 > --- a/rteval/modules/measurement/cyclictest.py > +++ b/rteval/modules/measurement/cyclictest.py > @@ -285,15 +285,11 @@ class Cyclictest(rtevalModulePrototype): > fp.flush() > > self.__cyclicoutput.seek(0) > - try: > - self.__cyclicprocess = subprocess.Popen(self.__cmd, > - stdout=self.__cyclicoutput, > - stderr=self.__nullfp, > - stdin=self.__nullfp) > - self.__started = True > - except OSError: > - self.__started = False > - > + self.__cyclicprocess = subprocess.Popen(self.__cmd, > + stdout=self.__cyclicoutput, > + stderr=self.__nullfp, > + stdin=self.__nullfp) > + self.__started = True > > def WorkloadAlive(self): > if self.__started: > diff --git a/rteval/modules/measurement/timerlat.py b/rteval/modules/measurement/timerlat.py > index 92bc070..df42777 100644 > --- a/rteval/modules/measurement/timerlat.py > +++ b/rteval/modules/measurement/timerlat.py > @@ -252,14 +252,11 @@ class Timerlat(rtevalModulePrototype): > > self.__timerlat_out.seek(0) > self.__timerlat_err.seek(0) > - try: > - self.__timerlat_process = subprocess.Popen(self.__cmd, > - stdout=self.__timerlat_out, > - stderr=self.__timerlat_err, > - stdin=None) > - self.__started = True > - except OSError: > - self.__started = False > + self.__timerlat_process = subprocess.Popen(self.__cmd, > + stdout=self.__timerlat_out, > + stderr=self.__timerlat_err, > + stdin=None) > + self.__started = True > > def WorkloadAlive(self): > if self.__started: > -- > 2.47.0 > > > Signed-off-by: John Kacur <jkacur@redhat.com> Thank you
diff --git a/README b/README index b352d7f..19704b4 100644 --- a/README +++ b/README @@ -16,7 +16,7 @@ The rteval source may be pulled from it's git tree on kernel.org: Rteval requires the following packages to run: -Python >= 3.0 +Python >= 3.8 http://www.python.org/download/ python-lxml diff --git a/rteval/__init__.py b/rteval/__init__.py index 7c13e84..6097ddf 100644 --- a/rteval/__init__.py +++ b/rteval/__init__.py @@ -19,6 +19,7 @@ import threading import time from datetime import datetime import sysconfig +from traceback import format_exception from rteval.modules.loads import LoadModules from rteval.modules.measurement import MeasurementModules from rteval.rtevalReport import rtevalReport @@ -29,6 +30,7 @@ from rteval import version RTEVAL_VERSION = version.RTEVAL_VERSION EARLYSTOP = False +threaderr = False stopsig = threading.Event() def sig_handler(signum, frame): @@ -39,9 +41,17 @@ def sig_handler(signum, frame): else: raise RuntimeError(f"SIGNAL received! ({signum})") +def except_hook(args): + global threaderr + + threading.__excepthook__(args) + threaderr = True + stopsig.set() + class RtEval(rtevalReport): def __init__(self, config, loadmods, measuremods, logger): self.__version = RTEVAL_VERSION + threading.excepthook = except_hook if not isinstance(config, rtevalConfig.rtevalConfig): raise TypeError("config variable is not an rtevalConfig object") @@ -237,6 +247,8 @@ class RtEval(rtevalReport): global EARLYSTOP rtevalres = 0 measure_start = self.__RunMeasurement() + if threaderr: + return 1 self._report(measure_start, self.__rtevcfg.xslt_report) if self.__rtevcfg.sysreport: diff --git a/rteval/modules/__init__.py b/rteval/modules/__init__.py index eb29db8..9827651 100644 --- a/rteval/modules/__init__.py +++ b/rteval/modules/__init__.py @@ -124,8 +124,8 @@ class rtevalModulePrototype(threading.Thread): def WaitForCompletion(self, wtime=None): """ Blocks until the module has completed its workload """ - if not self.shouldStart(): - # If it hasn't been started yet, nothing to wait for + if self.hadRuntimeError() or not self.shouldStart(): + # If it failed or hasn't been started yet, nothing to wait for return None return self.__events["finished"].wait(wtime) @@ -175,7 +175,7 @@ class rtevalModulePrototype(threading.Thread): return self._donotrun is False - def run(self): + def __run(self): "Workload thread runner - takes care of keeping the workload running as long as needed" if self.shouldStop(): return @@ -215,6 +215,12 @@ class rtevalModulePrototype(threading.Thread): self._WorkloadCleanup() + def run(self): + try: + self.__run() + except Exception as e: + self._setRuntimeError() + raise e def MakeReport(self): """ required module method, needs to return an libxml2.xmlNode object @@ -532,6 +538,8 @@ class RtEvalModules: rep_n = libxml2.newNode(self._report_tag) for (modname, mod) in self.__modules: + if mod.hadRuntimeError(): + continue self._logger.log(Log.DEBUG, f"Getting report from {modname}") modrep_n = mod.MakeReport() if modrep_n is not None: diff --git a/rteval/modules/measurement/cyclictest.py b/rteval/modules/measurement/cyclictest.py index 2e8f6f1..3d25c20 100644 --- a/rteval/modules/measurement/cyclictest.py +++ b/rteval/modules/measurement/cyclictest.py @@ -285,15 +285,11 @@ class Cyclictest(rtevalModulePrototype): fp.flush() self.__cyclicoutput.seek(0) - try: - self.__cyclicprocess = subprocess.Popen(self.__cmd, - stdout=self.__cyclicoutput, - stderr=self.__nullfp, - stdin=self.__nullfp) - self.__started = True - except OSError: - self.__started = False - + self.__cyclicprocess = subprocess.Popen(self.__cmd, + stdout=self.__cyclicoutput, + stderr=self.__nullfp, + stdin=self.__nullfp) + self.__started = True def WorkloadAlive(self): if self.__started: diff --git a/rteval/modules/measurement/timerlat.py b/rteval/modules/measurement/timerlat.py index 92bc070..df42777 100644 --- a/rteval/modules/measurement/timerlat.py +++ b/rteval/modules/measurement/timerlat.py @@ -252,14 +252,11 @@ class Timerlat(rtevalModulePrototype): self.__timerlat_out.seek(0) self.__timerlat_err.seek(0) - try: - self.__timerlat_process = subprocess.Popen(self.__cmd, - stdout=self.__timerlat_out, - stderr=self.__timerlat_err, - stdin=None) - self.__started = True - except OSError: - self.__started = False + self.__timerlat_process = subprocess.Popen(self.__cmd, + stdout=self.__timerlat_out, + stderr=self.__timerlat_err, + stdin=None) + self.__started = True def WorkloadAlive(self): if self.__started:
Currently, if cyclictest or rtla (whichever rteval is configured to use) is missing, the error output only says "measurement threads did not use the full time slot" and then rteval hangs. Avoid catching these exceptions so that they will be printed out, pointing to the actual problem. Also set up a thread exception hook to do a prompt and (relatively) graceful exit if a thread has any uncaught exception (this requires python >= 3.8). Signed-off-by: Crystal Wood <crwood@redhat.com> --- README | 2 +- rteval/__init__.py | 12 ++++++++++++ rteval/modules/__init__.py | 14 +++++++++++--- rteval/modules/measurement/cyclictest.py | 14 +++++--------- rteval/modules/measurement/timerlat.py | 13 +++++-------- 5 files changed, 34 insertions(+), 21 deletions(-)