Style fixes for main #544

Open
wants to merge 3 commits into main

Changes from all commits
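The PR carries no description on the page; summarizing from the diff itself, the changes are mechanical style cleanups across the measurements, metrics, evaluator, and test modules: the stray blank line between a function signature and its body is removed, redundant parentheses around tuple targets in for loops are dropped, and one overlong noqa comment in loading.py is split across two lines. A minimal before/after sketch of the two recurring patterns (illustrative snippet with made-up names, not taken from the diff):

    # Before: blank line after the signature, parenthesized loop target
    def _compute(self, predictions, references):

        for (pattern, repl) in rules:
            ...

    # After: body starts immediately, bare tuple unpacking
    def _compute(self, predictions, references):
        for pattern, repl in rules:
            ...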
1 change: 0 additions & 1 deletion measurements/perplexity/perplexity.py
@@ -104,7 +104,6 @@ def _info(self):
def _compute(
self, data, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None
):

if device is not None:
assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
if device == "gpu":
1 change: 0 additions & 1 deletion metrics/bertscore/bertscore.py
@@ -143,7 +143,6 @@ def _compute(
baseline_path=None,
use_fast_tokenizer=False,
):

if isinstance(references[0], str):
references = [[ref] for ref in references]

2 changes: 1 addition & 1 deletion metrics/bleu/tokenizer_13a.py
@@ -61,7 +61,7 @@ def __call__(self, line):
:param line: a segment to tokenize
:return: the tokenized line
"""
for (_re, repl) in self._re:
for _re, repl in self._re:
line = _re.sub(repl, line)

# no leading or trailing spaces, single space within words
2 changes: 0 additions & 2 deletions metrics/bleurt/bleurt.py
@@ -79,7 +79,6 @@
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class BLEURT(evaluate.Metric):
def _info(self):

return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
@@ -96,7 +95,6 @@ def _info(self):
)

def _download_and_prepare(self, dl_manager):

# check that config name specifies a valid BLEURT model
if self.config_name == "default":
logger.warning(
1 change: 0 additions & 1 deletion metrics/brier_score/brier_score.py
@@ -128,7 +128,6 @@ def _get_feature_types(self):
]

def _compute(self, references, predictions, sample_weight=None, pos_label=1):

brier_score = brier_score_loss(references, predictions, sample_weight=sample_weight, pos_label=pos_label)

return {"brier_score": brier_score}
2 changes: 0 additions & 2 deletions metrics/code_eval/execute.py
@@ -54,9 +54,7 @@ def check_correctness(check_program, timeout, task_id, completion_id):


def unsafe_execute(check_program, result, timeout):

with create_tempdir():

# These system calls are needed when cleaning up tempdir.
import os
import shutil
1 change: 0 additions & 1 deletion metrics/comet/comet.py
@@ -127,7 +127,6 @@
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class COMET(evaluate.Metric):
def _info(self):

return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
1 change: 0 additions & 1 deletion metrics/coval/coval.py
@@ -168,7 +168,6 @@
def get_coref_infos(
key_lines, sys_lines, NP_only=False, remove_nested=False, keep_singletons=True, min_span=False, doc="dummy_doc"
):

key_doc_lines = {doc: key_lines}
sys_doc_lines = {doc: sys_lines}

1 change: 0 additions & 1 deletion metrics/exact_match/exact_match.py
@@ -108,7 +108,6 @@ def _compute(
ignore_punctuation=False,
ignore_numbers=False,
):

if regexes_to_ignore is not None:
for s in regexes_to_ignore:
predictions = np.array([re.sub(s, "", x) for x in predictions])
2 changes: 1 addition & 1 deletion metrics/google_bleu/tokenizer_13a.py
@@ -61,7 +61,7 @@ def __call__(self, line):
:param line: a segment to tokenize
:return: the tokenized line
"""
for (_re, repl) in self._re:
for _re, repl in self._re:
line = _re.sub(repl, line)

# no leading or trailing spaces, single space within words
1 change: 0 additions & 1 deletion metrics/mae/mae.py
@@ -107,7 +107,6 @@ def _get_feature_types(self):
}

def _compute(self, predictions, references, sample_weight=None, multioutput="uniform_average"):

mae_score = mean_absolute_error(references, predictions, sample_weight=sample_weight, multioutput=multioutput)

return {"mae": mae_score}
1 change: 0 additions & 1 deletion metrics/mahalanobis/mahalanobis.py
@@ -72,7 +72,6 @@ def _info(self):
)

def _compute(self, X, reference_distribution):

# convert to numpy arrays
X = np.array(X)
reference_distribution = np.array(reference_distribution)
1 change: 0 additions & 1 deletion metrics/mape/mape.py
@@ -107,7 +107,6 @@ def _get_feature_types(self):
}

def _compute(self, predictions, references, sample_weight=None, multioutput="uniform_average"):

mape_score = mean_absolute_percentage_error(
references,
predictions,
1 change: 0 additions & 1 deletion metrics/mase/mase.py
@@ -123,7 +123,6 @@ def _compute(
sample_weight=None,
multioutput="uniform_average",
):

y_pred_naive = training[:-periodicity]
mae_naive = mean_absolute_error(training[periodicity:], y_pred_naive, multioutput=multioutput)

1 change: 0 additions & 1 deletion metrics/mse/mse.py
@@ -111,7 +111,6 @@ def _get_feature_types(self):
}

def _compute(self, predictions, references, sample_weight=None, multioutput="uniform_average", squared=True):

mse = mean_squared_error(
references, predictions, sample_weight=sample_weight, multioutput=multioutput, squared=squared
)
Expand Down
1 change: 0 additions & 1 deletion metrics/perplexity/perplexity.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ def _info(self):
def _compute(
self, predictions, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None
):

if device is not None:
assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
if device == "gpu":
2 changes: 0 additions & 2 deletions metrics/sari/sari.py
@@ -228,7 +228,6 @@ def SARIsent(ssent, csent, rsents):


def normalize(sentence, lowercase: bool = True, tokenizer: str = "13a", return_str: bool = True):

# Normalization is requried for the ASSET dataset (one of the primary
# datasets in sentence simplification) to allow using space
# to split the sentence. Even though Wiki-Auto and TURK datasets,
@@ -279,7 +278,6 @@ def _info(self):
)

def _compute(self, sources, predictions, references):

if not (len(sources) == len(predictions) == len(references)):
raise ValueError("Sources length must match predictions and references lengths.")
sari_score = 0
1 change: 0 additions & 1 deletion metrics/smape/smape.py
@@ -147,7 +147,6 @@ def _get_feature_types(self):
}

def _compute(self, predictions, references, sample_weight=None, multioutput="uniform_average"):

smape_score = symmetric_mean_absolute_percentage_error(
references,
predictions,
2 changes: 0 additions & 2 deletions metrics/wiki_split/wiki_split.py
@@ -255,7 +255,6 @@ def SARIsent(ssent, csent, rsents):


def normalize(sentence, lowercase: bool = True, tokenizer: str = "13a", return_str: bool = True):

# Normalization is requried for the ASSET dataset (one of the primary
# datasets in sentence simplification) to allow using space
# to split the sentence. Even though Wiki-Auto and TURK datasets,
@@ -285,7 +284,6 @@ def normalize(sentence, lowercase: bool = True, tokenizer: str = "13a", return_s


def compute_sari(sources, predictions, references):

if not (len(sources) == len(predictions) == len(references)):
raise ValueError("Sources length must match predictions and references lengths.")
sari_score = 0
1 change: 0 additions & 1 deletion metrics/xtreme_s/xtreme_s.py
@@ -239,7 +239,6 @@ def _info(self):
)

def _compute(self, predictions, references, bleu_kwargs=None, wer_kwargs=None):

bleu_kwargs = bleu_kwargs if bleu_kwargs is not None else {}
wer_kwargs = wer_kwargs if wer_kwargs is not None else {}

2 changes: 0 additions & 2 deletions src/evaluate/evaluation_suite/__init__.py
@@ -102,12 +102,10 @@ def assert_suite_nonempty(self):
def run(
self, model_or_pipeline: Union[str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"] # noqa: F821
) -> Dict[str, float]:

self.assert_suite_nonempty()

results_all = []
for task in self.suite:

task_name = task.data

if task.data_preprocessor: # task requires extra preprocessing
1 change: 0 additions & 1 deletion src/evaluate/evaluator/audio_classification.py
@@ -119,7 +119,6 @@ def compute(
label_column: str = "label",
label_mapping: Optional[Dict[str, Number]] = None,
) -> Tuple[Dict[str, float], Any]:

"""
input_column (`str`, defaults to `"file"`):
The name of the column containing either the audio files or a raw waveform, represented as a numpy array, in the dataset specified by `data`.
1 change: 0 additions & 1 deletion src/evaluate/evaluator/base.py
@@ -235,7 +235,6 @@ def compute(
label_column: str = "label",
label_mapping: Optional[Dict[str, Number]] = None,
) -> Dict[str, float]:

result = {}

self.check_for_mismatch_in_device_setup(device, model_or_pipeline)
1 change: 0 additions & 1 deletion src/evaluate/evaluator/image_classification.py
@@ -87,7 +87,6 @@ def compute(
label_column: str = "label",
label_mapping: Optional[Dict[str, Number]] = None,
) -> Tuple[Dict[str, float], Any]:

"""
input_column (`str`, defaults to `"image"`):
The name of the column containing the images as PIL ImageFile in the dataset specified by `data`.
1 change: 0 additions & 1 deletion src/evaluate/inspect.py
@@ -72,7 +72,6 @@ def list_evaluation_modules(module_type=None, include_community=True, with_detai


def _list_evaluation_modules_type(module_type, include_community=True, with_details=False):

r = requests.get(HF_LIST_ENDPOINT.format(type=module_type))
r.raise_for_status()
d = r.json()
3 changes: 2 additions & 1 deletion src/evaluate/loading.py
@@ -658,7 +658,8 @@ def evaluation_module_factory(
download_mode=download_mode,
dynamic_modules_path=dynamic_modules_path,
).get_module()
except Exception as e1: # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
except Exception as e1: # all the attempts have failed
# before raising the error we should check if the module is already cached.
# if it's a canonical module we need to check if it's any of the types
if path.count("/") == 0:
for current_type in ["metric", "comparison", "measurement"]:
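For context on this hunk: the comment being split describes a fallback path, i.e. once every attempt to load the module script has failed, check whether the module is already present in the local cache of dynamic modules before surfacing the error. A rough sketch of that control flow, using hypothetical helper names rather than the actual evaluate internals:

    def load_with_cache_fallback(path, load_remote, load_cached):
        """Try to fetch a module script; fall back to a locally cached copy on failure."""
        try:
            return load_remote(path)
        except Exception as err:  # all the attempts have failed
            # Before raising, check whether the module is already cached locally.
            cached = load_cached(path)
            if cached is not None:
                return cached
            raise FileNotFoundError(f"Couldn't find a module script at {path}.") from err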
1 change: 0 additions & 1 deletion src/evaluate/utils/file_utils.py
@@ -535,7 +535,6 @@ def get_from_cache(
# Prevent parallel downloads of the same file with a lock.
lock_path = cache_path + ".lock"
with FileLock(lock_path):

if resume_download:
incomplete_path = cache_path + ".incomplete"

2 changes: 0 additions & 2 deletions src/evaluate/visualization.py
@@ -22,7 +22,6 @@ class ComplexRadar:
"""

def __init__(self, fig, variables, ranges, n_ring_levels=5, show_scales=True, format_cfg=None):

self.format_cfg = format_cfg

# Calculate angles and create for each variable an axes
@@ -41,7 +40,6 @@ def __init__(self, fig, variables, ranges, n_ring_levels=5, show_scales=True, fo

# Writing the ranges on each axes
for i, ax in enumerate(axes):

# Here we do the trick by repeating the first iteration
j = 0 if (i == 0 or i == 1) else i - 1
ax.set_ylim(*ranges[j])
2 changes: 0 additions & 2 deletions tests/test_evaluation_suite.py
@@ -13,7 +13,6 @@ def setUp(self):
self.dummy_model = DummyTextClassificationPipeline()

def test_running_evaluation_suite(self):

# Check that the evaluation suite successfully runs
results = self.evaluation_suite.run(self.dummy_model)

@@ -25,7 +24,6 @@ def test_running_evaluation_suite(self):
self.assertEqual(len(results), 2)

def test_empty_suite(self):

self.empty_suite = self.evaluation_suite
self.empty_suite.suite = []
self.assertRaises(ValueError, self.empty_suite.run, self.dummy_model)
1 change: 0 additions & 1 deletion tests/test_evaluator.py
@@ -306,7 +306,6 @@ def test_default_pipe_init(self):
self.assertEqual(results["accuracy"], 1.0)

def test_data_loading(self):

# Test passing in dataset by name with split
data = self.evaluator.load_data("evaluate/imdb-ci", split="test[:1]")
self.evaluator.prepare_data(data=data, input_column="text", label_column="label", second_input_column=None)
1 change: 0 additions & 1 deletion tests/test_metric.py
@@ -516,7 +516,6 @@ def test_string_casting(self):
metric.compute(predictions=["a"], references=["a"])

def test_string_casting_tested_once(self):

self.counter = 0

def checked_fct(fct): # wrapper function that increases a counter on each call