Skip to content

Commit

Permalink
Rough idea of warning of potential conflicting merges
Browse files Browse the repository at this point in the history
  • Loading branch information
jbothma committed Jun 11, 2024
1 parent 4faf42a commit 5eaf5ab
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 2 deletions.
33 changes: 31 additions & 2 deletions nomenklatura/xref.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import logging
from typing import List, Optional, Type
from typing import List, Optional, Type, Dict, Set
from followthemoney.schema import Schema
from itertools import combinations
from collections import defaultdict

from nomenklatura.dataset import DS
from nomenklatura.entity import CE
from nomenklatura.store import Store
from nomenklatura.store import Store, View
from nomenklatura.judgement import Judgement
from nomenklatura.resolver import Resolver
from nomenklatura.index import Index
Expand All @@ -25,6 +27,23 @@ def _print_stats(pairs: int, suggested: int, scores: List[float]) -> None:
)


def report_potential_conflicts(
    view: View[DS, CE],
    negative_check_matches: Dict[str, Set[str]],
    resolver: Resolver[CE],
) -> None:
    """Log match pairs that the resolver has already judged to be distinct.

    For every key in ``negative_check_matches``, each unordered pair drawn
    from its set of matched IDs is looked up in ``resolver``. When the stored
    judgement for that pair is ``Judgement.NEGATIVE``, an INFO line naming the
    pair and the key they both matched is written to the module logger.

    Args:
        view: Accepted for the caller's convenience; not read by this function.
        negative_check_matches: Maps an entity ID to the IDs it matched.
        resolver: Source of the stored judgement for each pair of IDs.
    """
    for pivot_id, matched_ids in negative_check_matches.items():
        # Sets have no guaranteed iteration order, so which ID ends up on the
        # left or right of a reported pair may differ between runs.
        for first_id, second_id in combinations(matched_ids, 2):
            is_conflict = (
                resolver.get_judgement(first_id, second_id) == Judgement.NEGATIVE
            )
            if not is_conflict:
                continue
            log.info(
                "Potential conflict: %s <> %s for %s",
                first_id,
                second_id,
                pivot_id,
            )


def xref(
resolver: Resolver[CE],
store: Store[DS, CE],
Expand All @@ -33,6 +52,7 @@ def xref(
external: bool = True,
range: Optional[Schema] = None,
auto_threshold: Optional[float] = None,
negative_check_threshold: Optional[float] = None,
focus_dataset: Optional[str] = None,
algorithm: Type[ScoringAlgorithm] = DefaultAlgorithm,
user: Optional[str] = None,
Expand All @@ -41,6 +61,8 @@ def xref(
view = store.default_view(external=external)
index = Index(view)
index.build()
negative_check_threshold = negative_check_threshold or auto_threshold or 0.98
negative_check_matches: Dict[str, Set[str]] = defaultdict(set)
try:
scores: List[float] = []
suggested = 0
Expand All @@ -67,12 +89,17 @@ def xref(
if scored:
result = algorithm.compare(left, right)
score = result.score

scores.append(score)

# Not sure this is globally a good idea.
if len(left.datasets.intersection(right.datasets)) > 0:
score = score * 0.7

if score > negative_check_threshold:
negative_check_matches[left_id.id].add(right_id.id)
negative_check_matches[right_id.id].add(left_id.id)

if auto_threshold is not None and score > auto_threshold:
log.info("Auto-merge [%.2f]: %s <> %s", score, left, right)
canonical_id = resolver.decide(
Expand All @@ -92,5 +119,7 @@ def xref(
suggested += 1
_print_stats(idx, suggested, scores)

report_potential_conflicts(view, negative_check_matches, resolver)

except KeyboardInterrupt:
log.info("User cancelled, xref will end gracefully.")
67 changes: 67 additions & 0 deletions tests/test_xref.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
import logging
from nomenklatura.dataset.dataset import Dataset
from nomenklatura.judgement import Judgement
from nomenklatura.matching.regression_v1.model import RegressionV1
from nomenklatura.store.memory import MemoryStore
from nomenklatura.xref import xref
from nomenklatura.store import SimpleMemoryStore
from nomenklatura.resolver import Resolver
Expand All @@ -20,3 +25,65 @@ def test_xref_candidates(
if left.caption == "Johanna Quandt":
assert right.caption == "Frau Johanna Quandt"
assert score > 0.0


def test_xref_potential_conflicts(
    test_dataset: Dataset,
    caplog,
):
    """xref should log a potential conflict for two negatively-judged matches.

    Three near-identical companies are stored; ``a`` and ``b`` are explicitly
    judged NEGATIVE. The test expects xref to log that ``a`` and ``b`` form a
    potential conflict with respect to ``c``.
    """
    resolver = Resolver[CompositeEntity]()
    store = MemoryStore(test_dataset, resolver)

    company_names = {
        "a": "The AAA Weapons and Munitions Factory Joint Stock Company",
        "b": "The BBB Weapons and Munitions Factory Joint Stock Company",
        "c": "The AAA Weapons and Ammunition Factory Joint Stock Company",
    }
    writer = store.writer()
    for entity_id, name in company_names.items():
        entity = CompositeEntity.from_data(
            test_dataset,
            {
                "id": entity_id,
                "schema": "Company",
                "properties": {
                    "name": name,
                    "address": "Moscow",
                },
            },
        )
        writer.add_entity(entity)
    writer.flush()

    resolver.decide("a", "b", Judgement.NEGATIVE, user="test")

    with caplog.at_level(logging.INFO):
        xref(
            resolver,
            store,
            # Not the default, but easily gets the scores where this is a problem
            algorithm=RegressionV1,
            # Lower than usual just because we're testing with one dataset
            negative_check_threshold=0.6,
        )
    logs = {r.message for r in caplog.records}

    # The conflicting pair is drawn from a set of string IDs, whose iteration
    # order varies with per-process hash randomization, so either left/right
    # ordering of the pair is a valid report.
    expected = {
        "Potential conflict: a <> b for c",
        "Potential conflict: b <> a for c",
    }
    assert logs & expected, logs

0 comments on commit 5eaf5ab

Please sign in to comment.