Skip to content

Commit

Permalink
Use tantivy index for dedupe
Browse files (browse the repository at this point in the history)
  • Loading branch information
jbothma committed Jun 10, 2024
1 parent d67b61b commit 97791ae
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 4 deletions.
2 changes: 2 additions & 0 deletions nomenklatura/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
 from nomenklatura.resolver import Resolver
 from nomenklatura.store import Store, View
 from nomenklatura.index import Index
+from nomenklatura.index import TantivyIndex

 __version__ = "3.11.4"
 __all__ = [
     "Dataset",
     "CompositeEntity",
     "Resolver",
     "Index",
+    "TantivyIndex",
     "Store",
     "View",
 ]
3 changes: 2 additions & 1 deletion nomenklatura/index/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
 from nomenklatura.index.index import Index
+from nomenklatura.index.tantivy_index import TantivyIndex

-__all__ = ["Index"]
+__all__ = ["Index", "TantivyIndex"]
10 changes: 8 additions & 2 deletions nomenklatura/xref.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
 import logging
 from typing import List, Optional, Type
 from followthemoney.schema import Schema
+from tempfile import mkdtemp
+from pathlib import Path
+import shutil

 from nomenklatura.dataset import DS
 from nomenklatura.entity import CE
 from nomenklatura.store import Store
 from nomenklatura.judgement import Judgement
 from nomenklatura.resolver import Resolver
-from nomenklatura.index import Index
+from nomenklatura.index import TantivyIndex
 from nomenklatura.matching import DefaultAlgorithm, ScoringAlgorithm

 log = logging.getLogger(__name__)
Expand Down Expand Up @@ -39,7 +42,8 @@ def xref(
 ) -> None:
     log.info("Begin xref: %r, resolver: %s", store, resolver)
     view = store.default_view(external=external)
-    index = Index(view)
+    working_dir = Path(mkdtemp())
+    index = TantivyIndex(view, working_dir)
     index.build()
     try:
         scores: List[float] = []
Expand Down Expand Up @@ -94,3 +98,5 @@ def xref(

     except KeyboardInterrupt:
         log.info("User cancelled, xref will end gracefully.")
+    finally:
+        shutil.rmtree(working_dir, ignore_errors=True)
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
 from tempfile import mkdtemp

 from nomenklatura import settings
-from nomenklatura.index.tantivy_index import TantivyIndex
+from nomenklatura.index import TantivyIndex
 from nomenklatura.store import load_entity_file_store, SimpleMemoryStore
 from nomenklatura.kv import get_redis
 from nomenklatura.db import get_engine, get_metadata
Expand Down

0 comments on commit 97791ae

Please sign in to comment.