Merge pull request #2742 from boegel/pytorch_allow_failed_tests

allow some PyTorch tests to fail + print warning if one or more tests fail
easybuilders · Jul 7, 2022 · ac68b02 · ac68b02
2 parents 7883ec7 + a5b55a0
commit ac68b02
Show file tree

Hide file tree

Showing 2 changed files with 66 additions and 11 deletions.
diff --git a/easybuild/easyblocks/generic/pythonpackage.py b/easybuild/easyblocks/generic/pythonpackage.py
@@ -639,8 +639,12 @@ def build_step(self):
             # We consider the build and install output together as downloads likely happen here if this is run
             self.install_cmd_output += out
 
-    def test_step(self):
-        """Test the built Python package."""
+    def test_step(self, return_output_ec=False):
+        """
+        Test the built Python package.
+
+        :param return_output_ec: return output and exit code of test command
+        """
 
         if isinstance(self.cfg['runtest'], string_type):
             self.testcmd = self.cfg['runtest']
@@ -649,6 +653,8 @@ def test_step(self):
             extrapath = ""
             testinstalldir = None
 
+            out, ec = (None, None)
+
             if self.testinstall:
                 # install in test directory and export PYTHONPATH
 
@@ -670,12 +676,24 @@ def test_step(self):
 
             if self.testcmd:
                 testcmd = self.testcmd % {'python': self.python_cmd}
-                cmd = ' '.join([extrapath, self.cfg['pretestopts'], testcmd, self.cfg['testopts']])
-                run_cmd(cmd, log_all=True, simple=True)
+                cmd = ' '.join([
+                    extrapath,
+                    self.cfg['pretestopts'],
+                    testcmd,
+                    self.cfg['testopts'],
+                ])
+
+                if return_output_ec:
+                    (out, ec) = run_cmd(cmd, log_all=False, log_ok=False, simple=False)
+                else:
+                    run_cmd(cmd, log_all=True, simple=True)
 
             if testinstalldir:
                 remove_dir(testinstalldir)
 
+            if return_output_ec:
+                return (out, ec)
+
     def install_step(self):
         """Install Python package to a custom path using setup.py"""
 

diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py
@@ -31,15 +31,16 @@
 import os
 import re
 import tempfile
+import easybuild.tools.environment as env
 from distutils.version import LooseVersion
 from easybuild.easyblocks.generic.pythonpackage import PythonPackage
 from easybuild.framework.easyconfig import CUSTOM
-from easybuild.tools.build_log import EasyBuildError
+from easybuild.tools.build_log import EasyBuildError, print_warning
 from easybuild.tools.config import build_option
-import easybuild.tools.environment as env
+from easybuild.tools.filetools import symlink, apply_regex_substitutions
 from easybuild.tools.modules import get_software_root, get_software_version
 from easybuild.tools.systemtools import POWER, get_cpu_architecture
-from easybuild.tools.filetools import symlink, apply_regex_substitutions
+from easybuild.tools.utilities import nub
 
 
 class EB_PyTorch(PythonPackage):
@@ -49,9 +50,10 @@ class EB_PyTorch(PythonPackage):
     def extra_options():
         extra_vars = PythonPackage.extra_options()
         extra_vars.update({
-            'excluded_tests': [{}, 'Mapping of architecture strings to list of tests to be excluded', CUSTOM],
-            'custom_opts': [[], 'List of options for the build/install command. Can be used to change the defaults ' +
-                                'set by the PyTorch EasyBlock, for example ["USE_MKLDNN=0"].', CUSTOM],
+            'custom_opts': [[], "List of options for the build/install command. Can be used to change the defaults " +
+                                "set by the PyTorch EasyBlock, for example ['USE_MKLDNN=0'].", CUSTOM],
+            'excluded_tests': [{}, "Mapping of architecture strings to list of tests to be excluded", CUSTOM],
+            'max_failed_tests': [0, "Maximum number of failing tests", CUSTOM],
         })
         extra_vars['download_dep_fail'][0] = True
         extra_vars['sanity_pip_check'][0] = True
@@ -253,7 +255,42 @@ def test_step(self):
             'python': self.python_cmd,
             'excluded_tests': ' '.join(excluded_tests)
         })
-        super(EB_PyTorch, self).test_step()
+
+        (tests_out, tests_ec) = super(EB_PyTorch, self).test_step(return_output_ec=True)
+
+        ran_tests_hits = re.findall(r"^Ran (?P<test_cnt>[0-9]+) tests in", tests_out, re.M)
+        test_cnt = 0
+        for hit in ran_tests_hits:
+            test_cnt += int(hit)
+
+        failed_tests = nub(re.findall(r"^(?P<failed_test_name>.*) failed!\s*$", tests_out, re.M))
+        failed_test_cnt = len(failed_tests)
+
+        if failed_test_cnt:
+            max_failed_tests = self.cfg['max_failed_tests']
+
+            test_or_tests = 'tests' if failed_test_cnt > 1 else 'test'
+            msg = "%d %s (out of %d) failed:\n" % (failed_test_cnt, test_or_tests, test_cnt)
+            msg += '\n'.join('* %s' % t for t in sorted(failed_tests))
+
+            if max_failed_tests == 0:
+                raise EasyBuildError(msg)
+            else:
+                msg += '\n\n' + ' '.join([
+                    "The PyTorch test suite is known to include some flaky tests,",
+                    "which may fail depending on the specifics of the system or the context in which they are run.",
+                    "For this PyTorch installation, EasyBuild allows up to %d tests to fail." % max_failed_tests,
+                    "We recommend to double check that the failing tests listed above ",
+                    "are known to be flaky, or do not affect your intended usage of PyTorch.",
+                    "In case of doubt, reach out to the EasyBuild community (via GitHub, Slack, or mailing list).",
+                ])
+                print_warning(msg)
+
+                if failed_test_cnt > max_failed_tests:
+                    raise EasyBuildError("Too many failed tests (%d), maximum allowed is %d",
+                                         failed_test_cnt, max_failed_tests)
+        elif tests_ec:
+            raise EasyBuildError("Test command had non-zero exit code (%s), but no failed tests found?!", tests_ec)
 
     def test_cases_step(self):
         # Make PyTorch tests not use the user home