diff mbox series

rteval: Print useful exception info and exit on missing measurement tool

Message ID 20241101205446.41310-1-crwood@redhat.com
State New
Headers show
Series rteval: Print useful exception info and exit on missing measurement tool | expand

Commit Message

Crystal Wood Nov. 1, 2024, 8:54 p.m. UTC
Currently, if cyclictest or rtla (whichever rteval is configured to use)
is missing, the error output only says "measurement threads did not use
the full time slot" and then rteval hangs.

Avoid catching these exceptions so that they will be printed out,
pointing to the actual problem.  Also set up a thread exception hook to
do a prompt and (relatively) graceful exit if a thread has any uncaught
exception (this requires python >= 3.8).

Signed-off-by: Crystal Wood <crwood@redhat.com>
---
 README                                   |  2 +-
 rteval/__init__.py                       | 12 ++++++++++++
 rteval/modules/__init__.py               | 14 +++++++++++---
 rteval/modules/measurement/cyclictest.py | 14 +++++---------
 rteval/modules/measurement/timerlat.py   | 13 +++++--------
 5 files changed, 34 insertions(+), 21 deletions(-)

Comments

John Kacur Nov. 6, 2024, 7:25 p.m. UTC | #1
On Fri, 1 Nov 2024, Crystal Wood wrote:

> Currently, if cyclictest or rtla (whichever rteval is configured to use)
> is missing, the error output only says "measurement threads did not use
> the full time slot" and then rteval hangs.
> 
> Avoid catching these exceptions so that they will be printed out,
> pointing to the actual problem.  Also set up a thread exception hook to
> do a prompt and (relatively) graceful exit if a thread has any uncaught
> exception (this requires python >= 3.8).

This can happen when developers run from source. When people install from 
packages these dependencies are taken care of. The hang is especially bad, 
so I like that your patch exits gracefully. However, we are really 
throwing an exception because of one reason, either cyclictest or rtla 
timerlat are not available on the machine, or at least not in a location 
that rteval knows about. In this case we should report that this is the 
case instead of printing an exception / traceback which looks like a 
problem with the software to the enduser. I would only print a traceback 
if the exception is one that we're not aware of.
A few comments on the code follows.

> 
> Signed-off-by: Crystal Wood <crwood@redhat.com>
> ---
>  README                                   |  2 +-
>  rteval/__init__.py                       | 12 ++++++++++++
>  rteval/modules/__init__.py               | 14 +++++++++++---
>  rteval/modules/measurement/cyclictest.py | 14 +++++---------
>  rteval/modules/measurement/timerlat.py   | 13 +++++--------
>  5 files changed, 34 insertions(+), 21 deletions(-)
> 
> diff --git a/README b/README
> index b352d7f..19704b4 100644
> --- a/README
> +++ b/README
> @@ -16,7 +16,7 @@ The rteval source may be pulled from it's git tree on kernel.org:
>  
>  Rteval requires the following packages to run:
>  
> -Python >= 3.0
> +Python >= 3.8
>      http://www.python.org/download/
>  
>  python-lxml
> diff --git a/rteval/__init__.py b/rteval/__init__.py
> index 7c13e84..6097ddf 100644
> --- a/rteval/__init__.py
> +++ b/rteval/__init__.py
> @@ -19,6 +19,7 @@ import threading
>  import time
>  from datetime import datetime
>  import sysconfig
> +from traceback import format_exception
>  from rteval.modules.loads import LoadModules
>  from rteval.modules.measurement import MeasurementModules
>  from rteval.rtevalReport import rtevalReport
> @@ -29,6 +30,7 @@ from rteval import version
>  RTEVAL_VERSION = version.RTEVAL_VERSION
>  
>  EARLYSTOP = False
> +threaderr = False

If this is a global we should follow the convention and 
capitalize it. Yes there is legacy code in rteval that doesn't follow the 
convention.

Even better, does this have to be a global? Would perhaps a class variable 
work here instead?

>  
>  stopsig = threading.Event()
>  def sig_handler(signum, frame):
> @@ -39,9 +41,17 @@ def sig_handler(signum, frame):
>      else:
>          raise RuntimeError(f"SIGNAL received! ({signum})")
>  
> +def except_hook(args):
> +    global threaderr
> +
> +    threading.__excepthook__(args)
> +    threaderr = True
> +    stopsig.set()
> +
>  class RtEval(rtevalReport):
>      def __init__(self, config, loadmods, measuremods, logger):
>          self.__version = RTEVAL_VERSION
> +        threading.excepthook = except_hook
>  
>          if not isinstance(config, rtevalConfig.rtevalConfig):
>              raise TypeError("config variable is not an rtevalConfig object")
> @@ -237,6 +247,8 @@ class RtEval(rtevalReport):
>          global EARLYSTOP
>          rtevalres = 0
>          measure_start = self.__RunMeasurement()
> +        if threaderr:
> +            return 1
>  
>          self._report(measure_start, self.__rtevcfg.xslt_report)
>          if self.__rtevcfg.sysreport:
> diff --git a/rteval/modules/__init__.py b/rteval/modules/__init__.py
> index eb29db8..9827651 100644
> --- a/rteval/modules/__init__.py
> +++ b/rteval/modules/__init__.py
> @@ -124,8 +124,8 @@ class rtevalModulePrototype(threading.Thread):
>  
>      def WaitForCompletion(self, wtime=None):
>          """ Blocks until the module has completed its workload """
> -        if not self.shouldStart():
> -            # If it hasn't been started yet, nothing to wait for
> +        if self.hadRuntimeError() or not self.shouldStart():
> +            # If it failed or hasn't been started yet, nothing to wait for
>              return None
>          return self.__events["finished"].wait(wtime)
>  
> @@ -175,7 +175,7 @@ class rtevalModulePrototype(threading.Thread):
>          return self._donotrun is False
>  
>  
> -    def run(self):
> +    def __run(self):
>          "Workload thread runner - takes care of keeping the workload running as long as needed"
>          if self.shouldStop():
>              return
> @@ -215,6 +215,12 @@ class rtevalModulePrototype(threading.Thread):
>  
>          self._WorkloadCleanup()
>  
> +    def run(self):
> +        try:
> +            self.__run()
> +        except Exception as e:
> +            self._setRuntimeError()
> +            raise e
>  
>      def MakeReport(self):
>          """ required module method, needs to return an libxml2.xmlNode object
> @@ -532,6 +538,8 @@ class RtEvalModules:
>          rep_n = libxml2.newNode(self._report_tag)
>  
>          for (modname, mod) in self.__modules:
> +            if mod.hadRuntimeError():
> +                continue
>              self._logger.log(Log.DEBUG, f"Getting report from {modname}")
>              modrep_n = mod.MakeReport()
>              if modrep_n is not None:
> diff --git a/rteval/modules/measurement/cyclictest.py b/rteval/modules/measurement/cyclictest.py
> index 2e8f6f1..3d25c20 100644
> --- a/rteval/modules/measurement/cyclictest.py
> +++ b/rteval/modules/measurement/cyclictest.py
> @@ -285,15 +285,11 @@ class Cyclictest(rtevalModulePrototype):
>                  fp.flush()
>  
>          self.__cyclicoutput.seek(0)
> -        try:
> -            self.__cyclicprocess = subprocess.Popen(self.__cmd,
> -                                                    stdout=self.__cyclicoutput,
> -                                                    stderr=self.__nullfp,
> -                                                    stdin=self.__nullfp)
> -            self.__started = True
> -        except OSError:
> -            self.__started = False
> -
> +        self.__cyclicprocess = subprocess.Popen(self.__cmd,
> +                                                stdout=self.__cyclicoutput,
> +                                                stderr=self.__nullfp,
> +                                                stdin=self.__nullfp)
> +        self.__started = True
>  
>      def WorkloadAlive(self):
>          if self.__started:
> diff --git a/rteval/modules/measurement/timerlat.py b/rteval/modules/measurement/timerlat.py
> index 92bc070..df42777 100644
> --- a/rteval/modules/measurement/timerlat.py
> +++ b/rteval/modules/measurement/timerlat.py
> @@ -252,14 +252,11 @@ class Timerlat(rtevalModulePrototype):
>  
>          self.__timerlat_out.seek(0)
>          self.__timerlat_err.seek(0)
> -        try:
> -            self.__timerlat_process = subprocess.Popen(self.__cmd,
> -                                                       stdout=self.__timerlat_out,
> -                                                       stderr=self.__timerlat_err,
> -                                                       stdin=None)
> -            self.__started = True
> -        except OSError:
> -            self.__started = False
> +        self.__timerlat_process = subprocess.Popen(self.__cmd,
> +                                                   stdout=self.__timerlat_out,
> +                                                   stderr=self.__timerlat_err,
> +                                                   stdin=None)
> +        self.__started = True
>  
>      def WorkloadAlive(self):
>          if self.__started:
> -- 
> 2.47.0
> 
> 
>
John Kacur Nov. 13, 2024, 3:52 p.m. UTC | #2
On Fri, 1 Nov 2024, Crystal Wood wrote:

> Currently, if cyclictest or rtla (whichever rteval is configured to use)
> is missing, the error output only says "measurement threads did not use
> the full time slot" and then rteval hangs.
> 
> Avoid catching these exceptions so that they will be printed out,
> pointing to the actual problem.  Also set up a thread exception hook to
> do a prompt and (relatively) graceful exit if a thread has any uncaught
> exception (this requires python >= 3.8).
> 
> Signed-off-by: Crystal Wood <crwood@redhat.com>
> ---
>  README                                   |  2 +-
>  rteval/__init__.py                       | 12 ++++++++++++
>  rteval/modules/__init__.py               | 14 +++++++++++---
>  rteval/modules/measurement/cyclictest.py | 14 +++++---------
>  rteval/modules/measurement/timerlat.py   | 13 +++++--------
>  5 files changed, 34 insertions(+), 21 deletions(-)
> 
> diff --git a/README b/README
> index b352d7f..19704b4 100644
> --- a/README
> +++ b/README
> @@ -16,7 +16,7 @@ The rteval source may be pulled from it's git tree on kernel.org:
>  
>  Rteval requires the following packages to run:
>  
> -Python >= 3.0
> +Python >= 3.8
>      http://www.python.org/download/
>  
>  python-lxml
> diff --git a/rteval/__init__.py b/rteval/__init__.py
> index 7c13e84..6097ddf 100644
> --- a/rteval/__init__.py
> +++ b/rteval/__init__.py
> @@ -19,6 +19,7 @@ import threading
>  import time
>  from datetime import datetime
>  import sysconfig
> +from traceback import format_exception
>  from rteval.modules.loads import LoadModules
>  from rteval.modules.measurement import MeasurementModules
>  from rteval.rtevalReport import rtevalReport
> @@ -29,6 +30,7 @@ from rteval import version
>  RTEVAL_VERSION = version.RTEVAL_VERSION
>  
>  EARLYSTOP = False
> +threaderr = False
>  
>  stopsig = threading.Event()
>  def sig_handler(signum, frame):
> @@ -39,9 +41,17 @@ def sig_handler(signum, frame):
>      else:
>          raise RuntimeError(f"SIGNAL received! ({signum})")
>  
> +def except_hook(args):
> +    global threaderr
> +
> +    threading.__excepthook__(args)
> +    threaderr = True
> +    stopsig.set()
> +
>  class RtEval(rtevalReport):
>      def __init__(self, config, loadmods, measuremods, logger):
>          self.__version = RTEVAL_VERSION
> +        threading.excepthook = except_hook
>  
>          if not isinstance(config, rtevalConfig.rtevalConfig):
>              raise TypeError("config variable is not an rtevalConfig object")
> @@ -237,6 +247,8 @@ class RtEval(rtevalReport):
>          global EARLYSTOP
>          rtevalres = 0
>          measure_start = self.__RunMeasurement()
> +        if threaderr:
> +            return 1
>  
>          self._report(measure_start, self.__rtevcfg.xslt_report)
>          if self.__rtevcfg.sysreport:
> diff --git a/rteval/modules/__init__.py b/rteval/modules/__init__.py
> index eb29db8..9827651 100644
> --- a/rteval/modules/__init__.py
> +++ b/rteval/modules/__init__.py
> @@ -124,8 +124,8 @@ class rtevalModulePrototype(threading.Thread):
>  
>      def WaitForCompletion(self, wtime=None):
>          """ Blocks until the module has completed its workload """
> -        if not self.shouldStart():
> -            # If it hasn't been started yet, nothing to wait for
> +        if self.hadRuntimeError() or not self.shouldStart():
> +            # If it failed or hasn't been started yet, nothing to wait for
>              return None
>          return self.__events["finished"].wait(wtime)
>  
> @@ -175,7 +175,7 @@ class rtevalModulePrototype(threading.Thread):
>          return self._donotrun is False
>  
>  
> -    def run(self):
> +    def __run(self):
>          "Workload thread runner - takes care of keeping the workload running as long as needed"
>          if self.shouldStop():
>              return
> @@ -215,6 +215,12 @@ class rtevalModulePrototype(threading.Thread):
>  
>          self._WorkloadCleanup()
>  
> +    def run(self):
> +        try:
> +            self.__run()
> +        except Exception as e:
> +            self._setRuntimeError()
> +            raise e
>  
>      def MakeReport(self):
>          """ required module method, needs to return an libxml2.xmlNode object
> @@ -532,6 +538,8 @@ class RtEvalModules:
>          rep_n = libxml2.newNode(self._report_tag)
>  
>          for (modname, mod) in self.__modules:
> +            if mod.hadRuntimeError():
> +                continue
>              self._logger.log(Log.DEBUG, f"Getting report from {modname}")
>              modrep_n = mod.MakeReport()
>              if modrep_n is not None:
> diff --git a/rteval/modules/measurement/cyclictest.py b/rteval/modules/measurement/cyclictest.py
> index 2e8f6f1..3d25c20 100644
> --- a/rteval/modules/measurement/cyclictest.py
> +++ b/rteval/modules/measurement/cyclictest.py
> @@ -285,15 +285,11 @@ class Cyclictest(rtevalModulePrototype):
>                  fp.flush()
>  
>          self.__cyclicoutput.seek(0)
> -        try:
> -            self.__cyclicprocess = subprocess.Popen(self.__cmd,
> -                                                    stdout=self.__cyclicoutput,
> -                                                    stderr=self.__nullfp,
> -                                                    stdin=self.__nullfp)
> -            self.__started = True
> -        except OSError:
> -            self.__started = False
> -
> +        self.__cyclicprocess = subprocess.Popen(self.__cmd,
> +                                                stdout=self.__cyclicoutput,
> +                                                stderr=self.__nullfp,
> +                                                stdin=self.__nullfp)
> +        self.__started = True
>  
>      def WorkloadAlive(self):
>          if self.__started:
> diff --git a/rteval/modules/measurement/timerlat.py b/rteval/modules/measurement/timerlat.py
> index 92bc070..df42777 100644
> --- a/rteval/modules/measurement/timerlat.py
> +++ b/rteval/modules/measurement/timerlat.py
> @@ -252,14 +252,11 @@ class Timerlat(rtevalModulePrototype):
>  
>          self.__timerlat_out.seek(0)
>          self.__timerlat_err.seek(0)
> -        try:
> -            self.__timerlat_process = subprocess.Popen(self.__cmd,
> -                                                       stdout=self.__timerlat_out,
> -                                                       stderr=self.__timerlat_err,
> -                                                       stdin=None)
> -            self.__started = True
> -        except OSError:
> -            self.__started = False
> +        self.__timerlat_process = subprocess.Popen(self.__cmd,
> +                                                   stdout=self.__timerlat_out,
> +                                                   stderr=self.__timerlat_err,
> +                                                   stdin=None)
> +        self.__started = True
>  
>      def WorkloadAlive(self):
>          if self.__started:
> -- 
> 2.47.0
> 
> 
> 
Signed-off-by: John Kacur <jkacur@redhat.com>

Thank you
diff mbox series

Patch

diff --git a/README b/README
index b352d7f..19704b4 100644
--- a/README
+++ b/README
@@ -16,7 +16,7 @@  The rteval source may be pulled from it's git tree on kernel.org:
 
 Rteval requires the following packages to run:
 
-Python >= 3.0
+Python >= 3.8
     http://www.python.org/download/
 
 python-lxml
diff --git a/rteval/__init__.py b/rteval/__init__.py
index 7c13e84..6097ddf 100644
--- a/rteval/__init__.py
+++ b/rteval/__init__.py
@@ -19,6 +19,7 @@  import threading
 import time
 from datetime import datetime
 import sysconfig
+from traceback import format_exception
 from rteval.modules.loads import LoadModules
 from rteval.modules.measurement import MeasurementModules
 from rteval.rtevalReport import rtevalReport
@@ -29,6 +30,7 @@  from rteval import version
 RTEVAL_VERSION = version.RTEVAL_VERSION
 
 EARLYSTOP = False
+threaderr = False
 
 stopsig = threading.Event()
 def sig_handler(signum, frame):
@@ -39,9 +41,17 @@  def sig_handler(signum, frame):
     else:
         raise RuntimeError(f"SIGNAL received! ({signum})")
 
+def except_hook(args):
+    global threaderr
+
+    threading.__excepthook__(args)
+    threaderr = True
+    stopsig.set()
+
 class RtEval(rtevalReport):
     def __init__(self, config, loadmods, measuremods, logger):
         self.__version = RTEVAL_VERSION
+        threading.excepthook = except_hook
 
         if not isinstance(config, rtevalConfig.rtevalConfig):
             raise TypeError("config variable is not an rtevalConfig object")
@@ -237,6 +247,8 @@  class RtEval(rtevalReport):
         global EARLYSTOP
         rtevalres = 0
         measure_start = self.__RunMeasurement()
+        if threaderr:
+            return 1
 
         self._report(measure_start, self.__rtevcfg.xslt_report)
         if self.__rtevcfg.sysreport:
diff --git a/rteval/modules/__init__.py b/rteval/modules/__init__.py
index eb29db8..9827651 100644
--- a/rteval/modules/__init__.py
+++ b/rteval/modules/__init__.py
@@ -124,8 +124,8 @@  class rtevalModulePrototype(threading.Thread):
 
     def WaitForCompletion(self, wtime=None):
         """ Blocks until the module has completed its workload """
-        if not self.shouldStart():
-            # If it hasn't been started yet, nothing to wait for
+        if self.hadRuntimeError() or not self.shouldStart():
+            # If it failed or hasn't been started yet, nothing to wait for
             return None
         return self.__events["finished"].wait(wtime)
 
@@ -175,7 +175,7 @@  class rtevalModulePrototype(threading.Thread):
         return self._donotrun is False
 
 
-    def run(self):
+    def __run(self):
         "Workload thread runner - takes care of keeping the workload running as long as needed"
         if self.shouldStop():
             return
@@ -215,6 +215,12 @@  class rtevalModulePrototype(threading.Thread):
 
         self._WorkloadCleanup()
 
+    def run(self):
+        try:
+            self.__run()
+        except Exception as e:
+            self._setRuntimeError()
+            raise e
 
     def MakeReport(self):
         """ required module method, needs to return an libxml2.xmlNode object
@@ -532,6 +538,8 @@  class RtEvalModules:
         rep_n = libxml2.newNode(self._report_tag)
 
         for (modname, mod) in self.__modules:
+            if mod.hadRuntimeError():
+                continue
             self._logger.log(Log.DEBUG, f"Getting report from {modname}")
             modrep_n = mod.MakeReport()
             if modrep_n is not None:
diff --git a/rteval/modules/measurement/cyclictest.py b/rteval/modules/measurement/cyclictest.py
index 2e8f6f1..3d25c20 100644
--- a/rteval/modules/measurement/cyclictest.py
+++ b/rteval/modules/measurement/cyclictest.py
@@ -285,15 +285,11 @@  class Cyclictest(rtevalModulePrototype):
                 fp.flush()
 
         self.__cyclicoutput.seek(0)
-        try:
-            self.__cyclicprocess = subprocess.Popen(self.__cmd,
-                                                    stdout=self.__cyclicoutput,
-                                                    stderr=self.__nullfp,
-                                                    stdin=self.__nullfp)
-            self.__started = True
-        except OSError:
-            self.__started = False
-
+        self.__cyclicprocess = subprocess.Popen(self.__cmd,
+                                                stdout=self.__cyclicoutput,
+                                                stderr=self.__nullfp,
+                                                stdin=self.__nullfp)
+        self.__started = True
 
     def WorkloadAlive(self):
         if self.__started:
diff --git a/rteval/modules/measurement/timerlat.py b/rteval/modules/measurement/timerlat.py
index 92bc070..df42777 100644
--- a/rteval/modules/measurement/timerlat.py
+++ b/rteval/modules/measurement/timerlat.py
@@ -252,14 +252,11 @@  class Timerlat(rtevalModulePrototype):
 
         self.__timerlat_out.seek(0)
         self.__timerlat_err.seek(0)
-        try:
-            self.__timerlat_process = subprocess.Popen(self.__cmd,
-                                                       stdout=self.__timerlat_out,
-                                                       stderr=self.__timerlat_err,
-                                                       stdin=None)
-            self.__started = True
-        except OSError:
-            self.__started = False
+        self.__timerlat_process = subprocess.Popen(self.__cmd,
+                                                   stdout=self.__timerlat_out,
+                                                   stderr=self.__timerlat_err,
+                                                   stdin=None)
+        self.__started = True
 
     def WorkloadAlive(self):
         if self.__started: