diff --git a/nomenklatura/data/regression-v3.pkl b/nomenklatura/data/regression-v3.pkl
index 1e088ed6..1c79beec 100644
Binary files a/nomenklatura/data/regression-v3.pkl and b/nomenklatura/data/regression-v3.pkl differ
diff --git a/nomenklatura/matching/compare/dates.py b/nomenklatura/matching/compare/dates.py
index 8f7f3165..924840cb 100644
--- a/nomenklatura/matching/compare/dates.py
+++ b/nomenklatura/matching/compare/dates.py
@@ -1,11 +1,16 @@
 from typing import Iterable, Set
 from prefixdate import Precision
 from followthemoney.proxy import E
+from rigour.text.distance import dam_levenshtein
+from itertools import product
 
 from nomenklatura.matching.compare.util import has_overlap, is_disjoint
 from nomenklatura.matching.util import props_pair
 
 
+MAX_YEARS = 2
+
+
 def _dates_precision(values: Iterable[str], precision: Precision) -> Set[str]:
     dates = set()
     for value in values:
@@ -70,3 +75,53 @@ def dob_year_disjoint(query: E, result: E) -> float:
     if is_disjoint(query_years, result_years):
         return 1.0
     return 0.0
+
+
+def dob_similarity(query: E, result: E) -> float:
+    """Score how well the birth dates of two entities agree.
+
+    Checks run in the order listed; the first one that applies wins:
+
+    1.0: precise dates match
+    0.5: precise dates within 1 edit from each other
+    -0.3: precise dates are disjoint
+    0.75: years match
+    0.25: years within MAX_YEARS years from each other
+    -0.2: imprecise dates are disjoint
+    0.0: birth date missing on either side
+    """
+    query_dates, result_dates = props_pair(query, result, ["birthDate"])
+
+    # missing data: neutral and finite, like the other dob_* features
+    if not query_dates or not result_dates:
+        return 0.0
+
+    # exact match on precise dates
+    result_days = _dates_precision(result_dates, Precision.DAY)
+    query_days = _dates_precision(query_dates, Precision.DAY)
+    if has_overlap(query_days, result_days):
+        return 1.0
+
+    # clerical errors on precise dates; checked before the disjoint test,
+    # because near-miss dates share no value and would stop there
+    for qd, rd in product(query_days, result_days):
+        if dam_levenshtein(qd, rd) <= 1:
+            return 0.5
+
+    # precise dates available but have no common values
+    if is_disjoint(query_days, result_days):
+        return -0.3
+
+    # years overlap
+    query_years = _dates_precision(query_dates, Precision.YEAR)
+    result_years = _dates_precision(result_dates, Precision.YEAR)
+    if has_overlap(query_years, result_years):
+        return 0.75
+
+    # years are close
+    for qy, ry in product(query_years, result_years):
+        if abs(int(qy) - int(ry)) <= MAX_YEARS:
+            return 0.25
+
+    # dates exist but are disjoint other than above options
+    return -0.2
diff --git a/nomenklatura/matching/regression_v3/model.py b/nomenklatura/matching/regression_v3/model.py
index a682fcbf..a3165a3b 100644
--- a/nomenklatura/matching/regression_v3/model.py
+++ b/nomenklatura/matching/regression_v3/model.py
@@ -17,7 +17,7 @@ from nomenklatura.matching.regression_v3.misc import country_mismatch
 
 from nomenklatura.matching.compare.gender import gender_mismatch
 from nomenklatura.matching.compare.dates import dob_matches, dob_year_matches
-from nomenklatura.matching.compare.dates import dob_year_disjoint
+from nomenklatura.matching.compare.dates import dob_year_disjoint, dob_similarity
 from nomenklatura.matching.types import FeatureDocs, FeatureDoc, MatchingResult
 from nomenklatura.matching.types import CompareFunction, Encoded, ScoringAlgorithm
 from nomenklatura.matching.util import make_github_url
@@ -38,9 +38,7 @@ class RegressionV3(ScoringAlgorithm):
         phone_match,
         email_match,
         identifier_match,
-        dob_matches,
-        dob_year_matches,
-        dob_year_disjoint,
+        dob_similarity,
         first_name_match,
         family_name_match,
         birth_place,
diff --git a/tests/matching/test_dates.py b/tests/matching/test_dates.py
index 9ff1fe85..c6d5c57f 100644
--- a/tests/matching/test_dates.py
+++ b/tests/matching/test_dates.py
@@ -1,4 +1,4 @@
-from nomenklatura.matching.compare.dates import dob_matches, dob_year_matches
+from nomenklatura.matching.compare.dates import dob_matches, dob_similarity, dob_year_matches
 from nomenklatura.matching.compare.dates import dob_day_disjoint, dob_year_disjoint
 from .util import e
 
@@ -11,23 +11,39 @@ def test_dob_matches():
     assert dob_year_matches(left, right) == 1.0
     assert dob_day_disjoint(left, right) == 0.0
     assert dob_year_disjoint(left, right) == 0.0
+    assert dob_similarity(left, right) == 1.0
+    right = e("Person", birthDate="1980-04-15")
+    assert dob_similarity(left, right) == 0.5
+    right = e("Person", birthDate="1980-03-16")
+    assert dob_similarity(left, right) == 0.5
+    right = e("Person", birthDate="1981-04-16")
+    assert dob_similarity(left, right) == 0.5
     right = e("Person", birthDate="1980")
     assert dob_year_matches(left, right) == 1.0
     assert dob_day_disjoint(left, right) == 0.0
+    assert dob_similarity(left, right) == 0.75
     right = e("Person", birthDate="1980-04")
     assert dob_year_matches(left, right) == 1.0
     assert dob_day_disjoint(left, right) == 0.0
+    assert dob_similarity(left, right) == 0.75
     right = e("Person", birthDate="1980-04-16T19:00:00")
     assert dob_matches(left, right) == 1.0
     assert dob_year_matches(left, right) == 1.0
     assert dob_day_disjoint(left, right) == 0.0
+    assert dob_similarity(left, right) == 1.0
     right = e("Person", birthDate="1965-04-16")
     assert dob_matches(left, right) == 0.0
     assert dob_year_matches(left, right) == 0.0
     assert dob_day_disjoint(left, right) == 1.0
     assert dob_year_disjoint(left, right) == 1.0
+    assert dob_similarity(left, right) == -0.3
     none = e("Person", name="Harry")
     assert dob_matches(left, none) == 0.0
     assert dob_year_matches(left, none) == 0.0
     assert dob_day_disjoint(left, none) == 0.0
     assert dob_year_disjoint(left, none) == 0.0
+    assert dob_similarity(left, none) == 0.0
+    right = e("Person", birthDate="1982")
+    assert dob_similarity(left, right) == 0.25
+    right = e("Person", birthDate="1965")
+    assert dob_similarity(left, right) == -0.2