diff --git a/nomenklatura/cli.py b/nomenklatura/cli.py
index c99575bc..51b2b3e8 100644
--- a/nomenklatura/cli.py
+++ b/nomenklatura/cli.py
@@ -23,6 +23,7 @@
 from nomenklatura.stream import StreamEntity
 from nomenklatura.xref import xref as run_xref
 from nomenklatura.tui import dedupe_ui
+from nomenklatura.matching.bench import bench_matcher
 
 INDEX_SEGMENT = "xref-index"
 
@@ -308,5 +309,14 @@ def statements_aggregate(
             write_entity(outfh, entity)
 
 
+@cli.command("bench", help="Benchmark a matching algorithm")
+@click.argument("name", type=str)
+@click.argument("pairs_file", type=InPath)
+@click.option("-n", "--number", type=int, default=1000)
+def bench(name: str, pairs_file: Path, number: int = 1000) -> None:
+    """Time ``number`` comparisons of matcher ``name`` over ``pairs_file``."""
+    bench_matcher(name, pairs_file, number)
+
+
 if __name__ == "__main__":
     cli()
diff --git a/nomenklatura/matching/bench.py b/nomenklatura/matching/bench.py
new file mode 100644
index 00000000..fc87d154
--- /dev/null
+++ b/nomenklatura/matching/bench.py
@@ -0,0 +1,44 @@
+import datetime
+from timeit import timeit
+from itertools import cycle
+import logging
+
+from nomenklatura.matching import get_algorithm
+from nomenklatura.matching.pairs import read_pairs
+from nomenklatura.util import PathLike
+
+
+log = logging.getLogger(__name__)
+
+
+def bench_matcher(name: str, pairs_file: PathLike, number: int) -> None:
+    """Time ``number`` comparisons of a matching algorithm.
+
+    All pairs are read from ``pairs_file`` up front and then cycled through,
+    so ``number`` may exceed the number of pairs in the file. The total
+    elapsed time is reported through the module logger.
+
+    Raises:
+        ValueError: if ``pairs_file`` yields no pairs, or if
+            ``get_algorithm`` returns ``None`` for ``name``.
+    """
+    log.info("Loading pairs from %s", pairs_file)
+    pairs = list(read_pairs(pairs_file))
+    log.info("Read %d pairs", len(pairs))
+    if not pairs:
+        # cycle() over an empty list raises StopIteration on the first next().
+        raise ValueError(f"No pairs found in {pairs_file}")
+    matcher = get_algorithm(name)
+    if matcher is None:
+        # Exceptions do not %-interpolate extra arguments the way logging does.
+        raise ValueError(f"No matcher named {name}")
+    log.info("Loaded %s", matcher.NAME)
+    infinite_pairs = cycle(pairs)
+
+    def compare_one_pair() -> None:
+        pair = next(infinite_pairs)
+        matcher.compare(pair.left, pair.right)
+
+    log.info("Running benchmark for %d iterations", number)
+    seconds = timeit(compare_one_pair, number=number)
+    log.info("Total time %s", datetime.timedelta(seconds=seconds))