Skip to content

Commit

Permalink
Replace multiple date features with one that combines the ideas
Browse files Browse the repository at this point in the history
  • Loading branch information
jbothma committed Aug 30, 2024
1 parent d8de3b0 commit 0853e13
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 10 deletions.
Binary file modified nomenklatura/data/regression-v3.pkl
Binary file not shown.
52 changes: 52 additions & 0 deletions nomenklatura/matching/compare/dates.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
from typing import Iterable, Set
from prefixdate import Precision
from followthemoney.proxy import E
from rigour.text.distance import dam_levenshtein
from itertools import product

from nomenklatura.matching.compare.util import has_overlap, is_disjoint
from nomenklatura.matching.util import props_pair


# Largest gap, in years, at which two birth years are still scored as "close"
# by dob_similarity.
MAX_YEARS = 2


def _dates_precision(values: Iterable[str], precision: Precision) -> Set[str]:
dates = set()
for value in values:
Expand Down Expand Up @@ -71,9 +76,56 @@ def dob_year_disjoint(query: E, result: E) -> float:
return 1.0
return 0.0


def both_have_dob(query: E, result: E) -> float:
    """Both entities have a birth date."""
    # Fetch the birthDate values of each side in one call.
    left_dobs, right_dobs = props_pair(query, result, ["birthDate"])
    # Guard clause: if either side has no birth date, the feature is off.
    if not left_dobs or not right_dobs:
        return 0.0
    return 1.0


def dob_similarity(query: E, result: E) -> float:
    """
    Score how well the birth dates of two entities agree.

    The checks run in the order listed and the first one that applies wins:

    1.0: precise dates match
    0.5: precise dates within 1 edit from each other
    0.75: years match
    0.25: years within MAX_YEARS years from each other
    -0.3: precise dates are disjoint
    -0.2: imprecise dates are disjoint

    Returns 0.0 when either entity has no birth date at all.
    """
    query_dates, result_dates = props_pair(query, result, ["birthDate"])

    # missing data: nothing to compare, stay neutral
    if not query_dates or not result_dates:
        return 0.0

    # exact match on precise dates
    query_days = _dates_precision(query_dates, Precision.DAY)
    result_days = _dates_precision(result_dates, Precision.DAY)
    if has_overlap(query_days, result_days):
        return 1.0

    # clerical errors on precise dates (a single edit between two dates).
    # This must run before any "disjoint" verdict: two dates that differ by
    # one character are by definition not equal, so checking disjointness
    # first would make this branch unreachable.
    if any(dam_levenshtein(qd, rd) <= 1 for qd, rd in product(query_days, result_days)):
        return 0.5

    # years overlap
    query_years = _dates_precision(query_dates, Precision.YEAR)
    result_years = _dates_precision(result_dates, Precision.YEAR)
    if has_overlap(query_years, result_years):
        return 0.75

    # years are close
    # NOTE(review): assumes year-precision values are plain integer strings
    # such as "1980" - confirm against _dates_precision.
    if any(
        abs(int(qy) - int(ry)) <= MAX_YEARS
        for qy, ry in product(query_years, result_years)
    ):
        return 0.25

    # Nothing matched. Penalise more strongly when both sides supplied
    # precise dates that share no value than when only imprecise dates
    # disagree.
    if is_disjoint(query_days, result_days):
        return -0.3
    return -0.2
14 changes: 5 additions & 9 deletions nomenklatura/matching/regression_v3/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,18 @@
from sklearn.pipeline import Pipeline # type: ignore
from followthemoney.proxy import E

from nomenklatura.matching.compare.names import name_fingerprint_levenshtein
from nomenklatura.matching.regression_v3.names import first_name_match
from nomenklatura.matching.regression_v3.names import family_name_match
from nomenklatura.matching.regression_v3.names import name_match
from nomenklatura.matching.regression_v3.names import name_levenshtein, name_match
from nomenklatura.matching.regression_v3.names import name_token_overlap, name_numbers
from nomenklatura.matching.regression_v3.misc import phone_match, email_match
from nomenklatura.matching.regression_v3.misc import address_match, address_numbers
from nomenklatura.matching.regression_v3.misc import identifier_match, birth_place
from nomenklatura.matching.regression_v3.misc import org_identifier_match
from nomenklatura.matching.compare.countries import country_mismatch, country_overlap
from nomenklatura.matching.compare.countries import country_mismatch
from nomenklatura.matching.compare.gender import gender_mismatch
from nomenklatura.matching.compare.dates import dob_matches, dob_year_matches
from nomenklatura.matching.compare.dates import dob_year_disjoint
from nomenklatura.matching.compare.dates import dob_year_disjoint, dob_similarity
from nomenklatura.matching.types import FeatureDocs, FeatureDoc, MatchingResult
from nomenklatura.matching.types import CompareFunction, Encoded, ScoringAlgorithm
from nomenklatura.matching.util import make_github_url
Expand All @@ -33,19 +32,16 @@ class RegressionV3(ScoringAlgorithm):
name_match,
name_token_overlap,
name_numbers,
name_fingerprint_levenshtein,
name_levenshtein,
phone_match,
email_match,
identifier_match,
dob_matches,
dob_year_matches,
dob_year_disjoint,
dob_similarity,
first_name_match,
family_name_match,
birth_place,
gender_mismatch,
country_mismatch,
#country_overlap,
org_identifier_match,
address_match,
address_numbers,
Expand Down
16 changes: 15 additions & 1 deletion tests/matching/test_dates.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from nomenklatura.matching.compare.dates import dob_matches, dob_year_matches
from nomenklatura.matching.compare.dates import dob_matches, dob_similarity, dob_year_matches
from nomenklatura.matching.compare.dates import dob_day_disjoint, dob_year_disjoint

from .util import e
Expand All @@ -11,23 +11,37 @@ def test_dob_matches():
assert dob_year_matches(left, right) == 1.0
assert dob_day_disjoint(left, right) == 0.0
assert dob_year_disjoint(left, right) == 0.0
assert dob_similarity(left, right) == 1.0
right = e("Person", birthDate="1980-04-15")
assert dob_similarity(left, right) == 0.5
right = e("Person", birthDate="1980-03-16")
assert dob_similarity(left, right) == 0.5
right = e("Person", birthDate="1981-04-16")
assert dob_similarity(left, right) == 0.5
right = e("Person", birthDate="1980")
assert dob_year_matches(left, right) == 1.0
assert dob_day_disjoint(left, right) == 0.0
assert dob_similarity(left, right) == 0.75
right = e("Person", birthDate="1980-04")
assert dob_year_matches(left, right) == 1.0
assert dob_day_disjoint(left, right) == 0.0
assert dob_similarity(left, right) == 0.75
right = e("Person", birthDate="1980-04-16T19:00:00")
assert dob_matches(left, right) == 1.0
assert dob_year_matches(left, right) == 1.0
assert dob_day_disjoint(left, right) == 0.0
assert dob_similarity(left, right) == 1.0
right = e("Person", birthDate="1965-04-16")
assert dob_matches(left, right) == 0.0
assert dob_year_matches(left, right) == 0.0
assert dob_day_disjoint(left, right) == 1.0
assert dob_year_disjoint(left, right) == 1.0
assert dob_similarity(left, right) == -1.0
none = e("Person", name="Harry")
assert dob_matches(left, none) == 0.0
assert dob_year_matches(left, none) == 0.0
assert dob_day_disjoint(left, none) == 0.0
assert dob_year_disjoint(left, none) == 0.0
assert dob_similarity(left, none) == 0.0


0 comments on commit 0853e13

Please sign in to comment.