From 5d59c8da2c6f78cf91c03cb94794b236e402fbc2 Mon Sep 17 00:00:00 2001 From: JD Bothma Date: Thu, 19 Sep 2024 09:12:13 +0100 Subject: [PATCH] Update expected scores --- nomenklatura/data/regression-v3.pkl | Bin 2603 -> 2603 bytes nomenklatura/xref.py | 1 - tests/matching/test_regression_v3.py | 15 ++++++++------- tests/test_xref.py | 10 ++++------ 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/nomenklatura/data/regression-v3.pkl b/nomenklatura/data/regression-v3.pkl index 39078a8cc25b30f546c93b377e7cb5a1b677b895..7a95c01cc2de40065f8102901b3ff4600ab7fa80 100644 GIT binary patch delta 984 zcma)&c}$XV7{?XRmp5mS({439@u-=(a=P|pmYLcC%bKp#UeeJ9QV!QKgUX}q#^1YI zF3YsmA8dG*>6l<61WFL-HZsejOYLwSv$9QI|8T3V|G#^l&*!7`R(jVrxXO>&*Ig90 zr2wm8>nxS>19W!c5wp2G7(P?n0W3cB^>N3nGtF=<=^5fK9EQ@`V!ml)1P0uWcD1IC z!YAWFTa%~)Zg9n3l1=Gw&vlU{>(DUduHCQL{8I#LcD0?p5LW~JtVGlE#@ArI_|)Fs z(*oB>nSHS#8b#5pG=e~M`r`jaiHUWhJZyKO7-)Y-Iq?sa-?YD@)X_ua)xJE7y*UL` z%WLfc#s=8N3)L3|@<4W;5gs~&542H<%`u?|wT@>b`fEX_ef0fSmI4O(^Abt}O2ATh zve(C44huX#pV^w64ja>~^i9XLFlMC#>y-!!l2$D(Ag@9ZJ19f6s~p6(6;X`zk1)zD zWq0YL(G~_*{w-+U+OGI2_}(7ED0|TjB?{S!m1nEr$M|XjepSQi-OI+@IXz&m>5lD* zGs3=`jg=irEku*|X+cCQ_*4cDiq&c`MBaJnCar-nO!#I+Nw%;zCME;*^^}lADKe7~XQ%RCXCD z^es<3+^o{`AMDU+IPX~{{sTkswpg!tRwUEZC0yE@kyotom&S&vo84FSBwOQ_yy*wM zsba28W&jgbw84~xbHGOD@Hy^IBXp4Ihfz}K3ez8>#L!*l zB8-wlADHtoN)UPCeqmFR&U2muNnyUU;J~zxPJgro55uSgs00ths0>JjFT|)6$cp=6 VlQ}r`=qnz~riy4LuNM|B`2%FK$?X6D delta 984 zcma)&Ye-XZ7>7MNcQ&shT~_22XO}@*Rz#HbYFZg8>6&IlnAvKA)1@1wg_pGH%G>%M zOGzzP%hIxG*veVQQm2z-ji8a;BvWdcZBqK73xPX+5d6^hAMg7-Pm6bpx26rt*C*_8 zHe!)r_6l>x@+LrS4Dp#keBhckH-~$sK+;=l$+g!*5NG;&pO~2dlK$qnr)3hXjKu;& z$anCaS*q@fxd@$=Yd^;d62Yct^_IqdgO!dymtlG7(3Q&HcN0{Q*Ryx(UD7B#GzPd` zy3hu@GnyU;KjNWvjwZ)l$B>g#(0`-c92~m2%#mNL;KPh1NX(lc!?Z@it8?Z)+oD$9QwLsFcxuY-|Dx$J54MAY10f}jlsTtn3Xh?~H1B*Y znk6ShzKV!~ouRSg!`r*bPwjZ*NwB~(t~LIPR!JUaZ#n7Ni%9Ink3f|dJ*+xAx$j`z zIEXuk&Fl^Xa2paD6K|S8b4}t=*VhT$9$TgMwT5)feqU~G9*2%BNpx3>i9CJ?8>Y2A zg{El(5q-U#6zJmq2wj8*$g_O-^|7rNrQ>VpRW+N=^h;3#z6Ymx&?L@@pJO8A1fM8# zm54GkgvsI*X{v}2Z58=OFy>&Xfl!>1mOgJVXJH?(P!-dMZZ|@?3=U37p{oo(oDxHQ z41b)GLlcZ;I3~)ze_#b8e$T9!` diff --git a/nomenklatura/xref.py b/nomenklatura/xref.py index b3d8447e..da0a845c 100644 --- a/nomenklatura/xref.py +++ b/nomenklatura/xref.py @@ -78,7 +78,6 @@ def xref( if scored: result = algorithm.compare(left, right) - #print("xref", result) score = result.score scores.append(score) diff --git a/tests/matching/test_regression_v3.py b/tests/matching/test_regression_v3.py index d98c38e3..3407111e 100644 --- a/tests/matching/test_regression_v3.py +++ b/tests/matching/test_regression_v3.py @@ -132,7 +132,7 @@ def test_name_country(): data["id"] = "mike2" e2 = Entity.from_dict(model, data) res = RegressionV3.compare(e1, e2) - assert 0.89 < res.score < 0.93, res + assert 0.92 < res.score < 0.95, res def test_name_match(): @@ -171,7 +171,7 @@ def test_name_address(): "id": "a", "schema": "Company", "properties": { - "name": ["The AAA Weapons and Munitions Factory Joint Stock Company"], + "name": ["The AAA Weapons and MunitionS Factory Joint Stock Company"], "address": ["Moscow"], }, }, @@ -182,7 +182,7 @@ def test_name_address(): "id": "b", "schema": "Company", "properties": { - "name": ["The BBB Weapons and Munitions Factory Joint Stock Company"], + "name": ["The BBB Weapons and MunitionS Factory Joint Stock Company"], "address": ["Moscow"], }, }, @@ -193,16 +193,17 @@ def test_name_address(): "id": "c", "schema": "Company", "properties": { - "name": ["The AAA Weapons and Ammunition Factory Joint Stock Company"], + "name": ["The AAA Weapons and MunitioN Factory Joint Stock Company"], "address": ["Moscow"], }, }, ) ac = RegressionV3.compare(a, c) - assert 0.5 < ac.score < 0.9 + assert 0.87 < ac.score < 0.93 ab = RegressionV3.compare(a, b) - assert 0.5 < ab.score < 0.9 - + assert 0.87 < ab.score < 0.93 + bc = RegressionV3.compare(b, c) + assert 0.84 < bc.score < 0.93 def test_isin(): """name and country together shouldn't be too strong""" diff --git a/tests/test_xref.py b/tests/test_xref.py index 29281433..4a99e504 100644 --- a/tests/test_xref.py +++ b/tests/test_xref.py @@ -43,7 +43,7 @@ def test_xref_potential_conflicts( "id": "a", "schema": "Company", "properties": { - "name": ["The AAA Weapons and Munitions Factory Joint Stock Company"], + "name": ["The AAA Weapons and MunitionS Factory Joint Stock Company"], "address": ["Moscow"], }, }, @@ -54,7 +54,7 @@ def test_xref_potential_conflicts( "id": "b", "schema": "Company", "properties": { - "name": ["The BBB Weapons and Munitions Factory Joint Stock Company"], + "name": ["The BBB Weapons and MunitionS Factory Joint Stock Company"], "address": ["Moscow"], }, }, @@ -65,7 +65,7 @@ def test_xref_potential_conflicts( "id": "c", "schema": "Company", "properties": { - "name": ["The AAA Weapons and Ammunition Factory Joint Stock Company"], + "name": ["The AAA Weapons and MunitioN Factory Joint Stock Company"], "address": ["Moscow"], }, }, @@ -83,7 +83,7 @@ def test_xref_potential_conflicts( store, index_path, algorithm=RegressionV3, - conflicting_match_threshold=0.9, + conflicting_match_threshold=0.8, ) stdout = capsys.readouterr().out @@ -95,5 +95,3 @@ def test_xref_potential_conflicts( assert a.get("name")[0] in flat, stdout assert b.get("name")[0] in flat, stdout assert c.get("name")[0] in flat, stdout - print(stdout) - assert False