Skip to content

Commit

Permalink
Replace multiple date features with one that combines the ideas
Browse files Browse the repository at this point in the history
  • Loading branch information
jbothma committed Sep 10, 2024
1 parent ad82f9e commit 1c96cfd
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 5 deletions.
Binary file modified nomenklatura/data/regression-v3.pkl
Binary file not shown.
52 changes: 52 additions & 0 deletions nomenklatura/matching/compare/dates.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
from typing import Iterable, Set
from prefixdate import Precision
from followthemoney.proxy import E
from rigour.text.distance import dam_levenshtein
from itertools import product
import numpy as np

from nomenklatura.matching.compare.util import has_overlap, is_disjoint
from nomenklatura.matching.util import props_pair


# Largest gap, in years, at which dob_similarity still treats two birth
# years as "close" rather than unrelated.
MAX_YEARS = 2


def _dates_precision(values: Iterable[str], precision: Precision) -> Set[str]:
dates = set()
for value in values:
Expand Down Expand Up @@ -70,3 +76,49 @@ def dob_year_disjoint(query: E, result: E) -> float:
if is_disjoint(query_years, result_years):
return 1.0
return 0.0


def dob_similarity(query: E, result: E) -> float:
    """Score how well the ``birthDate`` values of two entities agree.

    The checks run in the order listed; the first one that applies decides
    the score:

    0.0: either entity has no birth date (no evidence either way)
    1.0: precise (day-level) dates match
    0.5: precise dates within 1 edit from each other
    -0.3: precise dates on both sides are disjoint
    0.75: years match
    0.25: years within ``MAX_YEARS`` years from each other
    -0.2: imprecise dates are disjoint

    NOTE(review): this value is used as a model feature; a model trained on
    different feature values presumably needs retraining - confirm.
    """
    query_dates, result_dates = props_pair(query, result, ["birthDate"])

    # Missing data: return a neutral score. NaN would compare unequal to
    # every value (itself included), and the sibling date features appear
    # to report 0.0 when a birth date is absent.
    if not query_dates or not result_dates:
        return 0.0

    # exact match on precise dates
    result_days = _dates_precision(result_dates, Precision.DAY)
    query_days = _dates_precision(query_dates, Precision.DAY)
    if has_overlap(query_days, result_days):
        return 1.0

    # Clerical errors on precise dates. This has to run before the disjoint
    # check: two precise dates that differ by a single edit share no common
    # value, so the disjoint check would claim them first.
    if any(dam_levenshtein(qd, rd) <= 1 for qd, rd in product(query_days, result_days)):
        return 0.5

    # precise dates available but have no common values
    if is_disjoint(query_days, result_days):
        return -0.3

    # years overlap
    query_years = _dates_precision(query_dates, Precision.YEAR)
    result_years = _dates_precision(result_dates, Precision.YEAR)
    if has_overlap(query_years, result_years):
        return 0.75

    # years are close
    for qy, ry in product(query_years, result_years):
        if abs(int(qy) - int(ry)) <= MAX_YEARS:
            return 0.25

    # dates exist but are disjoint other than above options
    return -0.2
6 changes: 2 additions & 4 deletions nomenklatura/matching/regression_v3/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from nomenklatura.matching.regression_v3.misc import country_mismatch
from nomenklatura.matching.compare.gender import gender_mismatch
from nomenklatura.matching.compare.dates import dob_matches, dob_year_matches
from nomenklatura.matching.compare.dates import dob_year_disjoint
from nomenklatura.matching.compare.dates import dob_year_disjoint, dob_similarity
from nomenklatura.matching.types import FeatureDocs, FeatureDoc, MatchingResult
from nomenklatura.matching.types import CompareFunction, Encoded, ScoringAlgorithm
from nomenklatura.matching.util import make_github_url
Expand All @@ -38,9 +38,7 @@ class RegressionV3(ScoringAlgorithm):
phone_match,
email_match,
identifier_match,
dob_matches,
dob_year_matches,
dob_year_disjoint,
dob_similarity,
first_name_match,
family_name_match,
birth_place,
Expand Down
16 changes: 15 additions & 1 deletion tests/matching/test_dates.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from nomenklatura.matching.compare.dates import dob_matches, dob_year_matches
from nomenklatura.matching.compare.dates import dob_matches, dob_similarity, dob_year_matches
from nomenklatura.matching.compare.dates import dob_day_disjoint, dob_year_disjoint

from .util import e
Expand All @@ -11,23 +11,37 @@ def test_dob_matches():
assert dob_year_matches(left, right) == 1.0
assert dob_day_disjoint(left, right) == 0.0
assert dob_year_disjoint(left, right) == 0.0
assert dob_similarity(left, right) == 1.0
right = e("Person", birthDate="1980-04-15")
assert dob_similarity(left, right) == 0.5
right = e("Person", birthDate="1980-03-16")
assert dob_similarity(left, right) == 0.5
right = e("Person", birthDate="1981-04-16")
assert dob_similarity(left, right) == 0.5
right = e("Person", birthDate="1980")
assert dob_year_matches(left, right) == 1.0
assert dob_day_disjoint(left, right) == 0.0
assert dob_similarity(left, right) == 0.75
right = e("Person", birthDate="1980-04")
assert dob_year_matches(left, right) == 1.0
assert dob_day_disjoint(left, right) == 0.0
assert dob_similarity(left, right) == 0.75
right = e("Person", birthDate="1980-04-16T19:00:00")
assert dob_matches(left, right) == 1.0
assert dob_year_matches(left, right) == 1.0
assert dob_day_disjoint(left, right) == 0.0
assert dob_similarity(left, right) == 1.0
right = e("Person", birthDate="1965-04-16")
assert dob_matches(left, right) == 0.0
assert dob_year_matches(left, right) == 0.0
assert dob_day_disjoint(left, right) == 1.0
assert dob_year_disjoint(left, right) == 1.0
assert dob_similarity(left, right) == -1.0
none = e("Person", name="Harry")
assert dob_matches(left, none) == 0.0
assert dob_year_matches(left, none) == 0.0
assert dob_day_disjoint(left, none) == 0.0
assert dob_year_disjoint(left, none) == 0.0
assert dob_similarity(left, none) == 0.0


0 comments on commit 1c96cfd

Please sign in to comment.