Rough idea for warning about potentially conflicting merges

opensanctions · Jun 11, 2024 · 5eaf5ab · 5eaf5ab
1 parent 4faf42a
commit 5eaf5ab
Show file tree

Hide file tree

Showing 2 changed files with 98 additions and 2 deletions.
diff --git a/nomenklatura/xref.py b/nomenklatura/xref.py
@@ -1,10 +1,12 @@
 import logging
-from typing import List, Optional, Type
+from typing import List, Optional, Type, Dict, Set
 from followthemoney.schema import Schema
+from itertools import combinations
+from collections import defaultdict
 
 from nomenklatura.dataset import DS
 from nomenklatura.entity import CE
-from nomenklatura.store import Store
+from nomenklatura.store import Store, View
 from nomenklatura.judgement import Judgement
 from nomenklatura.resolver import Resolver
 from nomenklatura.index import Index
@@ -25,6 +27,23 @@ def _print_stats(pairs: int, suggested: int, scores: List[float]) -> None:
     )
 
 
+def report_potential_conflicts(
+    view: View[DS, CE],
+    negative_check_matches: Dict[str, Set[str]],
+    resolver: Resolver[CE],
+) -> None:
+    """Log pairs of strong matches that the resolver has judged negative.
+
+    For every candidate ID in ``negative_check_matches``, each unordered pair
+    of its recorded matches is looked up in ``resolver``; pairs whose
+    judgement is ``Judgement.NEGATIVE`` are logged at INFO level together
+    with the candidate that matched both of them.
+
+    ``view`` is accepted but not read here. Matches are stored in sets, so
+    which ID of a pair is logged first follows set iteration order.
+    """
+    for candidate_id in negative_check_matches:
+        strong_matches = negative_check_matches[candidate_id]
+        for pair in combinations(strong_matches, 2):
+            left_id, right_id = pair
+            if resolver.get_judgement(left_id, right_id) == Judgement.NEGATIVE:
+                log.info(
+                    "Potential conflict: %s <> %s for %s",
+                    left_id,
+                    right_id,
+                    candidate_id,
+                )
+
+
 def xref(
     resolver: Resolver[CE],
     store: Store[DS, CE],
@@ -33,6 +52,7 @@ def xref(
     external: bool = True,
     range: Optional[Schema] = None,
     auto_threshold: Optional[float] = None,
+    negative_check_threshold: Optional[float] = None,
     focus_dataset: Optional[str] = None,
     algorithm: Type[ScoringAlgorithm] = DefaultAlgorithm,
     user: Optional[str] = None,
@@ -41,6 +61,8 @@ def xref(
     view = store.default_view(external=external)
     index = Index(view)
     index.build()
+    negative_check_threshold = negative_check_threshold or auto_threshold or 0.98
+    negative_check_matches: Dict[str, Set[str]] = defaultdict(set)
     try:
         scores: List[float] = []
         suggested = 0
@@ -67,12 +89,17 @@ def xref(
             if scored:
                 result = algorithm.compare(left, right)
                 score = result.score
+
             scores.append(score)
 
             # Not sure this is globally a good idea.
             if len(left.datasets.intersection(right.datasets)) > 0:
                 score = score * 0.7
 
+            if score > negative_check_threshold:
+                negative_check_matches[left_id.id].add(right_id.id)
+                negative_check_matches[right_id.id].add(left_id.id)
+
             if auto_threshold is not None and score > auto_threshold:
                 log.info("Auto-merge [%.2f]: %s <> %s", score, left, right)
                 canonical_id = resolver.decide(
@@ -92,5 +119,7 @@ def xref(
             suggested += 1
         _print_stats(idx, suggested, scores)
 
+        report_potential_conflicts(view, negative_check_matches, resolver)
+
     except KeyboardInterrupt:
         log.info("User cancelled, xref will end gracefully.")
diff --git a/tests/test_xref.py b/tests/test_xref.py
@@ -1,3 +1,8 @@
+import logging
+from nomenklatura.dataset.dataset import Dataset
+from nomenklatura.judgement import Judgement
+from nomenklatura.matching.regression_v1.model import RegressionV1
+from nomenklatura.store.memory import MemoryStore
 from nomenklatura.xref import xref
 from nomenklatura.store import SimpleMemoryStore
 from nomenklatura.resolver import Resolver
@@ -20,3 +25,65 @@ def test_xref_candidates(
         if left.caption == "Johanna Quandt":
             assert right.caption == "Frau Johanna Quandt"
         assert score > 0.0
+
+
+def test_xref_potential_conflicts(
+    test_dataset: Dataset,
+    caplog,
+):
+    """xref should log a potential conflict for an entity that matches two
+    entities which carry a negative judgement against each other.
+
+    Entities "a" and "b" are judged NEGATIVE; entity "c" has a name close to
+    both of them, so running xref is expected to report the a/b pair as a
+    potential conflict for "c".
+    """
+    resolver = Resolver[CompositeEntity]()
+    store = MemoryStore(test_dataset, resolver)
+    names = {
+        "a": "The AAA Weapons and Munitions Factory Joint Stock Company",
+        "b": "The BBB Weapons and Munitions Factory Joint Stock Company",
+        "c": "The AAA Weapons and Ammunition Factory Joint Stock Company",
+    }
+    writer = store.writer()
+    for entity_id, name in names.items():
+        entity = CompositeEntity.from_data(
+            test_dataset,
+            {
+                "id": entity_id,
+                "schema": "Company",
+                "properties": {
+                    "name": name,
+                    "address": "Moscow",
+                },
+            },
+        )
+        writer.add_entity(entity)
+    writer.flush()
+
+    resolver.decide("a", "b", Judgement.NEGATIVE, user="test")
+
+    with caplog.at_level(logging.INFO):
+        xref(
+            resolver,
+            store,
+            # Not the default, but easily gets the scores where this is a problem
+            algorithm=RegressionV1,
+            # Lower than usual just because we're testing with one dataset
+            negative_check_threshold=0.6,
+        )
+    logs = {r.message for r in caplog.records}
+
+    # The matches for "c" are collected in a set of strings, so whether "a"
+    # or "b" is reported first depends on set iteration order, which is not
+    # stable across interpreter runs. Accept either orientation of the pair.
+    expected = {
+        "Potential conflict: a <> b for c",
+        "Potential conflict: b <> a for c",
+    }
+    assert logs & expected