diff --git a/nomenklatura/data/regression-v3.pkl b/nomenklatura/data/regression-v3.pkl index 39078a8..7a95c01 100644 Binary files a/nomenklatura/data/regression-v3.pkl and b/nomenklatura/data/regression-v3.pkl differ diff --git a/nomenklatura/xref.py b/nomenklatura/xref.py index b3d8447..da0a845 100644 --- a/nomenklatura/xref.py +++ b/nomenklatura/xref.py @@ -78,7 +78,6 @@ def xref( if scored: result = algorithm.compare(left, right) - #print("xref", result) score = result.score scores.append(score) diff --git a/tests/matching/test_regression_v3.py b/tests/matching/test_regression_v3.py index d98c38e..3407111 100644 --- a/tests/matching/test_regression_v3.py +++ b/tests/matching/test_regression_v3.py @@ -132,7 +132,7 @@ def test_name_country(): data["id"] = "mike2" e2 = Entity.from_dict(model, data) res = RegressionV3.compare(e1, e2) - assert 0.89 < res.score < 0.93, res + assert 0.92 < res.score < 0.95, res def test_name_match(): @@ -171,7 +171,7 @@ def test_name_address(): "id": "a", "schema": "Company", "properties": { - "name": ["The AAA Weapons and Munitions Factory Joint Stock Company"], + "name": ["The AAA Weapons and MunitionS Factory Joint Stock Company"], "address": ["Moscow"], }, }, @@ -182,7 +182,7 @@ def test_name_address(): "id": "b", "schema": "Company", "properties": { - "name": ["The BBB Weapons and Munitions Factory Joint Stock Company"], + "name": ["The BBB Weapons and MunitionS Factory Joint Stock Company"], "address": ["Moscow"], }, }, @@ -193,16 +193,17 @@ def test_name_address(): "id": "c", "schema": "Company", "properties": { - "name": ["The AAA Weapons and Ammunition Factory Joint Stock Company"], + "name": ["The AAA Weapons and MunitioN Factory Joint Stock Company"], "address": ["Moscow"], }, }, ) ac = RegressionV3.compare(a, c) - assert 0.5 < ac.score < 0.9 + assert 0.87 < ac.score < 0.93 ab = RegressionV3.compare(a, b) - assert 0.5 < ab.score < 0.9 - + assert 0.87 < ab.score < 0.93 + bc = RegressionV3.compare(b, c) + assert 0.84 < bc.score < 0.93 def test_isin(): """name and country together shouldn't be too strong""" diff --git a/tests/test_xref.py b/tests/test_xref.py index 2928143..4a99e50 100644 --- a/tests/test_xref.py +++ b/tests/test_xref.py @@ -43,7 +43,7 @@ def test_xref_potential_conflicts( "id": "a", "schema": "Company", "properties": { - "name": ["The AAA Weapons and Munitions Factory Joint Stock Company"], + "name": ["The AAA Weapons and MunitionS Factory Joint Stock Company"], "address": ["Moscow"], }, }, @@ -54,7 +54,7 @@ def test_xref_potential_conflicts( "id": "b", "schema": "Company", "properties": { - "name": ["The BBB Weapons and Munitions Factory Joint Stock Company"], + "name": ["The BBB Weapons and MunitionS Factory Joint Stock Company"], "address": ["Moscow"], }, }, @@ -65,7 +65,7 @@ def test_xref_potential_conflicts( "id": "c", "schema": "Company", "properties": { - "name": ["The AAA Weapons and Ammunition Factory Joint Stock Company"], + "name": ["The AAA Weapons and MunitioN Factory Joint Stock Company"], "address": ["Moscow"], }, }, @@ -83,7 +83,7 @@ def test_xref_potential_conflicts( store, index_path, algorithm=RegressionV3, - conflicting_match_threshold=0.9, + conflicting_match_threshold=0.8, ) stdout = capsys.readouterr().out @@ -95,5 +95,3 @@ def test_xref_potential_conflicts( assert a.get("name")[0] in flat, stdout assert b.get("name")[0] in flat, stdout assert c.get("name")[0] in flat, stdout - print(stdout) - assert False