From 2f33178be8c0f30533271e4db6a7aa23f0c8888b Mon Sep 17 00:00:00 2001 From: Richard5678 Date: Sun, 16 Jul 2023 13:07:57 -0700 Subject: [PATCH] caught up with original main branch --- README.md | 65 +++++----- docs/regressions-log.md | 13 ++ ...ressions-msmarco-passage-cos-dpr-distil.md | 111 ++++++++++++++++++ docs/regressions.md | 12 +- pom.xml | 2 +- .../io/anserini/util/DumpAnalyzedQueries.java | 42 ++++++- src/main/python/regressions-batch03.txt | 12 ++ .../dl19-doc-segmented-unicoil-noexp.template | 6 +- .../dl19-doc-segmented-unicoil.template | 6 +- .../templates/dl19-passage-bm25-b8.template | 6 +- ...e-splade-distil-cocodenser-medium.template | 6 +- .../dl19-passage-splade-pp-ed-onnx.template | 6 +- .../dl19-passage-splade-pp-ed.template | 6 +- .../dl19-passage-splade-pp-sd-onnx.template | 6 +- .../dl19-passage-splade-pp-sd.template | 6 +- .../dl19-passage-unicoil-noexp.template | 6 +- .../templates/dl19-passage-unicoil.template | 6 +- .../dl20-doc-segmented-unicoil-noexp.template | 6 +- .../dl20-doc-segmented-unicoil.template | 6 +- .../templates/dl20-passage-bm25-b8.template | 6 +- ...e-splade-distil-cocodenser-medium.template | 6 +- .../dl20-passage-splade-pp-ed-onnx.template | 6 +- .../dl20-passage-splade-pp-ed.template | 6 +- .../dl20-passage-splade-pp-sd-onnx.template | 6 +- .../dl20-passage-splade-pp-sd.template | 6 +- .../dl20-passage-unicoil-noexp.template | 6 +- .../templates/dl20-passage-unicoil.template | 6 +- ...21-doc-segmented-unicoil-0shot-v2.template | 8 +- .../dl21-doc-segmented-unicoil-0shot.template | 8 +- ...-segmented-unicoil-noexp-0shot-v2.template | 8 +- ...doc-segmented-unicoil-noexp-0shot.template | 8 +- .../dl21-passage-splade-pp-ed.template | 6 +- .../dl21-passage-splade-pp-sd.template | 6 +- .../dl21-passage-unicoil-0shot.template | 8 +- .../dl21-passage-unicoil-noexp-0shot.template | 8 +- .../dl22-passage-splade-pp-ed.template | 6 +- .../dl22-passage-splade-pp-sd.template | 6 +- .../dl22-passage-unicoil-0shot.template | 8 +- 
.../dl22-passage-unicoil-noexp-0shot.template | 8 +- ...marco-doc-segmented-unicoil-noexp.template | 6 +- .../msmarco-doc-segmented-unicoil.template | 6 +- .../msmarco-passage-bm25-b8.template | 6 +- .../msmarco-passage-cos-dpr-distil.template | 89 ++++++++++++++ .../msmarco-passage-deepimpact.template | 6 +- ...smarco-passage-distill-splade-max.template | 6 +- ...e-splade-distil-cocodenser-medium.template | 6 +- ...msmarco-passage-splade-pp-ed-onnx.template | 6 +- .../msmarco-passage-splade-pp-ed.template | 6 +- ...msmarco-passage-splade-pp-sd-onnx.template | 6 +- .../msmarco-passage-splade-pp-sd.template | 6 +- .../msmarco-passage-unicoil-noexp.template | 6 +- ...o-passage-unicoil-tilde-expansion.template | 6 +- .../msmarco-passage-unicoil.template | 6 +- ...v2-doc-segmented-unicoil-0shot-v2.template | 8 +- ...co-v2-doc-segmented-unicoil-0shot.template | 8 +- ...-segmented-unicoil-noexp-0shot-v2.template | 8 +- ...doc-segmented-unicoil-noexp-0shot.template | 8 +- .../msmarco-v2-passage-splade-pp-ed.template | 6 +- .../msmarco-v2-passage-splade-pp-sd.template | 6 +- .../msmarco-v2-passage-unicoil-0shot.template | 8 +- ...co-v2-passage-unicoil-noexp-0shot.template | 8 +- .../msmarco-passage-cos-dpr-distil.yaml | 5 +- src/test/java/io/anserini/doc/DataModel.java | 23 +++- .../doc/GenerateRegressionDocsTest.java | 2 + 64 files changed, 505 insertions(+), 217 deletions(-) create mode 100644 docs/regressions-msmarco-passage-cos-dpr-distil.md create mode 100644 src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil.template diff --git a/README.md b/README.md index f8ceb50fe1..df544d0f88 100644 --- a/README.md +++ b/README.md @@ -62,18 +62,18 @@ See individual pages for details! 
| | dev | DL19 | DL20 | |---------------------------------------------|:------------------------------------------------------------------------:|:---------------------------------------------------------------------:|:---------------------------------------------------------------------:| -| **Unsupervised Lexical** | | | | +| **Unsupervised Sparse Lexical** | | | | | BoW baselines | [+](docs/regressions-msmarco-passage.md) | [+](docs/regressions-dl19-passage.md) | [+](docs/regressions-dl20-passage.md) | | Quantized BM25 | [✓](docs/regressions-msmarco-passage-bm25-b8.md) | [✓](docs/regressions-dl19-passage-bm25-b8.md) | [✓](docs/regressions-dl20-passage-bm25-b8.md) | | WP baselines | [+](docs/regressions-msmarco-passage-wp.md) | [+](docs/regressions-dl19-passage-wp.md) | [+](docs/regressions-dl20-passage-wp.md) | | Huggingface WP baselines | [+](docs/regressions-msmarco-passage-hgf-wp.md) | [+](docs/regressions-dl19-passage-hgf-wp.md) | [+](docs/regressions-dl20-passage-hgf-wp.md) | | doc2query | [+](docs/regressions-msmarco-passage-doc2query.md) | | | | doc2query-T5 | [+](docs/regressions-msmarco-passage-docTTTTTquery.md) | [+](docs/regressions-dl19-passage-docTTTTTquery.md) | [+](docs/regressions-dl20-passage-docTTTTTquery.md) | -| **Learned sparse lexical (uniCOIL family)** | | | | +| **Learned Sparse Lexical (uniCOIL family)** | | | | | uniCOIL noexp | [✓](docs/regressions-msmarco-passage-unicoil-noexp.md) | [✓](docs/regressions-dl19-passage-unicoil-noexp.md) | [✓](docs/regressions-dl20-passage-unicoil-noexp.md) | | uniCOIL with doc2query-T5 | [✓](docs/regressions-msmarco-passage-unicoil.md) | [✓](docs/regressions-dl19-passage-unicoil.md) | [✓](docs/regressions-dl20-passage-unicoil.md) | | uniCOIL with TILDE | [✓](docs/regressions-msmarco-passage-unicoil-tilde-expansion.md) | | | -| **Learned sparse lexical (other)** | | | | +| **Learned Sparse Lexical (other)** | | | | | DeepImpact | [✓](docs/regressions-msmarco-passage-deepimpact.md) | | | | SPLADEv2 | 
[✓](docs/regressions-msmarco-passage-distill-splade-max.md) | | | | SPLADE-distill CoCodenser-medium | [✓](docs/regressions-msmarco-passage-splade-distil-cocodenser-medium.md) | [✓](docs/regressions-dl19-passage-splade-distil-cocodenser-medium.md) | [✓](docs/regressions-dl20-passage-splade-distil-cocodenser-medium.md) | @@ -81,6 +81,8 @@ See individual pages for details! | SPLADE++ CoCondenser-EnsembleDistil (ONNX) | [✓](docs/regressions-msmarco-passage-splade-pp-ed-onnx.md) | [✓](docs/regressions-dl19-passage-splade-pp-ed-onnx.md) | [✓](docs/regressions-dl20-passage-splade-pp-ed-onnx.md) | | SPLADE++ CoCondenser-SelfDistil | [✓](docs/regressions-msmarco-passage-splade-pp-sd.md) | [✓](docs/regressions-dl19-passage-splade-pp-sd.md) | [✓](docs/regressions-dl20-passage-splade-pp-sd.md) | | SPLADE++ CoCondenser-SelfDistil (ONNX) | [✓](docs/regressions-msmarco-passage-splade-pp-sd-onnx.md) | [✓](docs/regressions-dl19-passage-splade-pp-sd-onnx.md) | [✓](docs/regressions-dl20-passage-splade-pp-sd-onnx.md) | +| **Learned Dense** | | | | +| cosDPR-distil | [✓](docs/regressions-msmarco-passage-cos-dpr-distil.md) | | | | ### Available Corpora for Download @@ -95,6 +97,7 @@ See individual pages for details! | [SPLADE-distill CoCodenser-medium](https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-splade_distil_cocodenser_medium.tar) | 4.9 GB | `f77239a26d08856e6491a34062893b0c` | | [SPLADE++ CoCondenser-EnsembleDistil](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-ed.tar) | 4.2 GB | `e489133bdc54ee1e7c62a32aa582bc77` | | [SPLADE++ CoCondenser-SelfDistil](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-sd.tar) | 4.8 GB | `cb7e264222f2bf2221dd2c9d28190be1` | +| [cosDPR-distil](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar) | 57 GB | `e20ffbc8b5e7f760af31298aefeaebbd` |
@@ -102,20 +105,20 @@ See individual pages for details! ### MS MARCO V1 Document Regressions -| | dev | DL19 | DL20 | -|---|:---:|:----:|:----:| -| **Unsupervised lexical, complete doc**[*](docs/experiments-msmarco-doc-doc2query-details.md) | -| BoW baselines | [+](docs/regressions-msmarco-doc.md) | [+](docs/regressions-dl19-doc.md) | [+](docs/regressions-dl20-doc.md) | -| WP baselines | [+](docs/regressions-msmarco-doc-wp.md) | [+](docs/regressions-dl19-doc-wp.md) | [+](docs/regressions-dl20-doc-wp.md) | -| Huggingface WP baselines | [+](docs/regressions-msmarco-doc-hgf-wp.md) | [+](docs/regressions-dl19-doc-hgf-wp.md) | [+](docs/regressions-dl20-doc-hgf-wp.md) | -| doc2query-T5 | [+](docs/regressions-msmarco-doc-docTTTTTquery.md) | [+](docs/regressions-dl19-doc-docTTTTTquery.md) | [+](docs/regressions-dl20-doc-docTTTTTquery.md) | -| **Unsupervised lexical, segmented doc**[*](docs/experiments-msmarco-doc-doc2query-details.md) | -| BoW baselines | [+](docs/regressions-msmarco-doc-segmented.md) | [+](docs/regressions-dl19-doc-segmented.md) | [+](docs/regressions-dl20-doc-segmented.md) | -| WP baselines | [+](docs/regressions-msmarco-doc-segmented-wp.md) | [+](docs/regressions-dl19-doc-segmented-wp.md) | [+](docs/regressions-dl20-doc-segmented-wp.md) | -| doc2query-T5 | [+](docs/regressions-msmarco-doc-segmented-docTTTTTquery.md) | [+](docs/regressions-dl19-doc-segmented-docTTTTTquery.md) | [+](docs/regressions-dl20-doc-segmented-docTTTTTquery.md) | -| **Learned sparse lexical** | -| uniCOIL noexp | [✓](docs/regressions-msmarco-doc-segmented-unicoil-noexp.md) | [✓](docs/regressions-dl19-doc-segmented-unicoil-noexp.md) | [✓](docs/regressions-dl20-doc-segmented-unicoil-noexp.md) | -| uniCOIL with doc2query-T5 | [✓](docs/regressions-msmarco-doc-segmented-unicoil.md) | [✓](docs/regressions-dl19-doc-segmented-unicoil.md) | [✓](docs/regressions-dl20-doc-segmented-unicoil.md) | +| | dev | DL19 | DL20 | 
+|-----------------------------------------------------------------------------------------------|:------------------------------------------------------------:|:---------------------------------------------------------:|:---------------------------------------------------------:| +| **Unsupervised Lexical, Complete Doc**[*](docs/experiments-msmarco-doc-doc2query-details.md) | +| BoW baselines | [+](docs/regressions-msmarco-doc.md) | [+](docs/regressions-dl19-doc.md) | [+](docs/regressions-dl20-doc.md) | +| WP baselines | [+](docs/regressions-msmarco-doc-wp.md) | [+](docs/regressions-dl19-doc-wp.md) | [+](docs/regressions-dl20-doc-wp.md) | +| Huggingface WP baselines | [+](docs/regressions-msmarco-doc-hgf-wp.md) | [+](docs/regressions-dl19-doc-hgf-wp.md) | [+](docs/regressions-dl20-doc-hgf-wp.md) | +| doc2query-T5 | [+](docs/regressions-msmarco-doc-docTTTTTquery.md) | [+](docs/regressions-dl19-doc-docTTTTTquery.md) | [+](docs/regressions-dl20-doc-docTTTTTquery.md) | +| **Unsupervised Lexical, Segmented Doc**[*](docs/experiments-msmarco-doc-doc2query-details.md) | +| BoW baselines | [+](docs/regressions-msmarco-doc-segmented.md) | [+](docs/regressions-dl19-doc-segmented.md) | [+](docs/regressions-dl20-doc-segmented.md) | +| WP baselines | [+](docs/regressions-msmarco-doc-segmented-wp.md) | [+](docs/regressions-dl19-doc-segmented-wp.md) | [+](docs/regressions-dl20-doc-segmented-wp.md) | +| doc2query-T5 | [+](docs/regressions-msmarco-doc-segmented-docTTTTTquery.md) | [+](docs/regressions-dl19-doc-segmented-docTTTTTquery.md) | [+](docs/regressions-dl20-doc-segmented-docTTTTTquery.md) | +| **Learned Sparse Lexical** | +| uniCOIL noexp | [✓](docs/regressions-msmarco-doc-segmented-unicoil-noexp.md) | [✓](docs/regressions-dl19-doc-segmented-unicoil-noexp.md) | [✓](docs/regressions-dl20-doc-segmented-unicoil-noexp.md) | +| uniCOIL with doc2query-T5 | [✓](docs/regressions-msmarco-doc-segmented-unicoil.md) | [✓](docs/regressions-dl19-doc-segmented-unicoil.md) | 
[✓](docs/regressions-dl20-doc-segmented-unicoil.md) | ### Available Corpora for Download @@ -132,13 +135,13 @@ See individual pages for details! | | dev | DL21 | DL22 | |--------------------------------------------|:---------------------------------------------------------------:|:---------------------------------------------------------:|:---------------------------------------------------------:| -| **Unsupervised lexical, original corpus** | +| **Unsupervised Lexical, Original Corpus** | | baselines | [+](docs/regressions-msmarco-v2-passage.md) | [+](docs/regressions-dl21-passage.md) | [+](docs/regressions-dl22-passage.md) | | doc2query-T5 | [+](docs/regressions-msmarco-v2-passage-d2q-t5.md) | [+](docs/regressions-dl21-passage-d2q-t5.md) | [+](docs/regressions-dl22-passage-d2q-t5.md) | -| **Unsupervised lexical, augmented corpus** | +| **Unsupervised Lexical, Augmented Corpus** | | baselines | [+](docs/regressions-msmarco-v2-passage-augmented.md) | [+](docs/regressions-dl21-passage-augmented.md) | [+](docs/regressions-dl22-passage-augmented.md) | | doc2query-T5 | [+](docs/regressions-msmarco-v2-passage-augmented-d2q-t5.md) | [+](docs/regressions-dl21-passage-augmented-d2q-t5.md) | [+](docs/regressions-dl22-passage-augmented-d2q-t5.md) | -| **Learned sparse lexical** | +| **Learned Sparse Lexical** | | uniCOIL noexp zero-shot | [✓](docs/regressions-msmarco-v2-passage-unicoil-noexp-0shot.md) | [✓](docs/regressions-dl21-passage-unicoil-noexp-0shot.md) | [✓](docs/regressions-dl22-passage-unicoil-noexp-0shot.md) | | uniCOIL with doc2query-T5 zero-shot | [✓](docs/regressions-msmarco-v2-passage-unicoil-0shot.md) | [✓](docs/regressions-dl21-passage-unicoil-0shot.md) | [✓](docs/regressions-dl22-passage-unicoil-0shot.md) | | SPLADE++ CoCondenser-EnsembleDistil | [✓](docs/regressions-msmarco-v2-passage-splade-pp-ed.md) | [✓](docs/regressions-dl21-passage-splade-pp-ed.md) | [✓](docs/regressions-dl22-passage-splade-pp-ed.md) | @@ -159,17 +162,17 @@ See individual pages for 
details! ### MS MARCO V2 Document Regressions -| | dev | DL21 | -|---|:---:|:----:| -| **Unsupervised lexical, complete doc** | -| baselines | [+](docs/regressions-msmarco-v2-doc.md) | [+](docs/regressions-dl21-doc.md) | -| doc2query-T5 | [+](docs/regressions-msmarco-v2-doc-d2q-t5.md) | [+](docs/regressions-dl21-doc-d2q-t5.md) | -| **Unsupervised lexical, segmented doc** | -| baselines | [+](docs/regressions-msmarco-v2-doc-segmented.md) | [+](docs/regressions-dl21-doc-segmented.md) | -| doc2query-T5 | [+](docs/regressions-msmarco-v2-doc-segmented-d2q-t5.md) | [+](docs/regressions-dl21-doc-segmented-d2q-t5.md) | -| **Learned sparse lexical** | -| uniCOIL noexp zero-shot | [✓](docs/regressions-msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.md) | [✓](docs/regressions-dl21-doc-segmented-unicoil-noexp-0shot-v2.md) | -| uniCOIL with doc2query-T5 zero-shot | [✓](docs/regressions-msmarco-v2-doc-segmented-unicoil-0shot-v2.md) | [✓](docs/regressions-dl21-doc-segmented-unicoil-0shot-v2.md) | +| | dev | DL21 | +|-----------------------------------------|:------------------------------------------------------------------------:|:------------------------------------------------------------------:| +| **Unsupervised Lexical, Complete Doc** | +| baselines | [+](docs/regressions-msmarco-v2-doc.md) | [+](docs/regressions-dl21-doc.md) | +| doc2query-T5 | [+](docs/regressions-msmarco-v2-doc-d2q-t5.md) | [+](docs/regressions-dl21-doc-d2q-t5.md) | +| **Unsupervised Lexical, Segmented Doc** | +| baselines | [+](docs/regressions-msmarco-v2-doc-segmented.md) | [+](docs/regressions-dl21-doc-segmented.md) | +| doc2query-T5 | [+](docs/regressions-msmarco-v2-doc-segmented-d2q-t5.md) | [+](docs/regressions-dl21-doc-segmented-d2q-t5.md) | +| **Learned Sparse Lexical** | +| uniCOIL noexp zero-shot | [✓](docs/regressions-msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.md) | [✓](docs/regressions-dl21-doc-segmented-unicoil-noexp-0shot-v2.md) | +| uniCOIL with doc2query-T5 zero-shot | 
[✓](docs/regressions-msmarco-v2-doc-segmented-unicoil-0shot-v2.md) | [✓](docs/regressions-dl21-doc-segmented-unicoil-0shot-v2.md) | ### Available Corpora for Download diff --git a/docs/regressions-log.md b/docs/regressions-log.md index 1b3c766f00..cc65f15b1f 100644 --- a/docs/regressions-log.md +++ b/docs/regressions-log.md @@ -3,6 +3,19 @@ The following change log details commits to regression tests that alter effectiveness and the addition of new regression tests. This documentation is useful for figuring why results may have changed over time. +### June 27, 2023 + +Summarizing new regressions since last entry, see [PR #2140](https://github.com/castorini/anserini/pull/2140): + + ++ `msmarco-passage-cos-dpr-distil` ++ `msmarco-v2-passage-splade-pp-ed` ++ `msmarco-v2-passage-splade-pp-sd` ++ `dl21-passage-splade-pp-ed` ++ `dl21-passage-splade-pp-sd` ++ `dl22-passage-splade-pp-ed` ++ `dl22-passage-splade-pp-sd` + ### April 5, 2023 + commit [`a7df7f`](https://github.com/castorini/anserini/commit/a7df7fc5d527ede8f34ee60afa41dec4f6b0e93a) (4/5/2023) diff --git a/docs/regressions-msmarco-passage-cos-dpr-distil.md b/docs/regressions-msmarco-passage-cos-dpr-distil.md new file mode 100644 index 0000000000..17bc104b76 --- /dev/null +++ b/docs/regressions-msmarco-passage-cos-dpr-distil.md @@ -0,0 +1,111 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://arxiv.org/abs/2304.12139) _arXiv:2304.12139_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). 
+ +The exact configurations for these regressions are stored in [this YAML file](../src/main/resources/regression/msmarco-passage-cos-dpr-distil.yaml). +Note that this page is automatically generated from [this template](../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-cos-dpr-distil +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil \ + --corpus-path collections/msmarco-passage-cos-dpr-distil +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +target/appassembler/bin/IndexHnswDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ + -generator LuceneDenseVectorDocumentGenerator \ + -threads 16 -M 16 -efC 100 \ + >& logs/log.msmarco-passage-cos-dpr-distil & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + + + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](experiments-msmarco-passage.md) for more details. 
+ +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz \ + -topicreader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt \ + -querygenerator VectorQueryGenerator -topicfield vector -threads 16 -hits 1000 -efSearch 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.392 | +| **RR@10** | **cosDPR-distil**| +| [MS MARCO Passage: 
Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.387 | +| **R@100** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.900 | +| **R@1000** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.970 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally stable to the third digit after the decimal point. + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions.md b/docs/regressions.md index 7dbb6eb135..c555887356 100644 --- a/docs/regressions.md +++ b/docs/regressions.md @@ -50,8 +50,9 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-distil-cocodenser-medium >& logs/log.msmarco-passage-splade-distil-cocodenser-medium & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-ed >& logs/log.msmarco-passage-splade-pp-ed & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-sd >& logs/log.msmarco-passage-splade-pp-sd & -nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-ed-onnx >& logs/log.msmarco-passage-splade-pp-ed-onnx -nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-sd-onnx >& logs/log.msmarco-passage-splade-pp-sd-onnx +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-ed-onnx >& 
logs/log.msmarco-passage-splade-pp-ed-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-sd-onnx >& logs/log.msmarco-passage-splade-pp-sd-onnx & +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil >& logs/log.msmarco-passage-cos-dpr-distil & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc >& logs/log.msmarco-doc & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc-wp >& logs/log.msmarco-doc-wp & @@ -127,9 +128,10 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-passage-d2q-t5 >& logs/log.msmarco-v2-passage-d2q-t5 & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-passage-augmented >& logs/log.msmarco-v2-passage-augmented & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-passage-augmented-d2q-t5 >& logs/log.msmarco-v2-passage-augmented-d2q-t5 & - nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-passage-unicoil-noexp-0shot >& logs/log.msmarco-v2-passage-unicoil-noexp-0shot & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-passage-unicoil-0shot >& logs/log.msmarco-v2-passage-unicoil-0shot & +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-passage-splade-pp-ed >& logs/log.msmarco-v2-passage-splade-pp-ed & +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-passage-splade-pp-sd >& logs/log.msmarco-v2-passage-splade-pp-sd & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-doc >& 
logs/log.msmarco-v2-doc & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-doc-d2q-t5 >& logs/log.msmarco-v2-doc-d2q-t5 & @@ -147,6 +149,8 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre nohup python src/main/python/run_regression.py --index --verify --search --regression dl21-passage-augmented-d2q-t5 >& logs/log.dl21-passage-augmented-d2q-t5 & nohup python src/main/python/run_regression.py --index --verify --search --regression dl21-passage-unicoil-noexp-0shot >& logs/log.dl21-passage-unicoil-noexp-0shot & nohup python src/main/python/run_regression.py --index --verify --search --regression dl21-passage-unicoil-0shot >& logs/log.dl21-passage-unicoil-0shot & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl21-passage-splade-pp-ed >& logs/log.dl21-passage-splade-pp-ed & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl21-passage-splade-pp-sd >& logs/log.dl21-passage-splade-pp-sd & nohup python src/main/python/run_regression.py --index --verify --search --regression dl21-doc >& logs/log.dl21-doc & nohup python src/main/python/run_regression.py --index --verify --search --regression dl21-doc-d2q-t5 >& logs/log.dl21-doc-d2q-t5 & @@ -163,6 +167,8 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre nohup python src/main/python/run_regression.py --index --verify --search --regression dl22-passage-augmented-d2q-t5 >& logs/log.dl22-passage-augmented-d2q-t5 & nohup python src/main/python/run_regression.py --index --verify --search --regression dl22-passage-unicoil-noexp-0shot >& logs/log.dl22-passage-unicoil-noexp-0shot & nohup python src/main/python/run_regression.py --index --verify --search --regression dl22-passage-unicoil-0shot >& logs/log.dl22-passage-unicoil-0shot & +nohup python src/main/python/run_regression.py --index --verify --search --regression 
dl22-passage-splade-pp-ed >& logs/log.dl22-passage-splade-pp-ed & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl22-passage-splade-pp-sd >& logs/log.dl22-passage-splade-pp-sd & ```
diff --git a/pom.xml b/pom.xml index ee0d9fb0de..49e163899a 100644 --- a/pom.xml +++ b/pom.xml @@ -467,7 +467,7 @@ com.google.guava guava - 30.1.1-jre + 32.0.0-jre ai.djl.huggingface diff --git a/src/main/java/io/anserini/util/DumpAnalyzedQueries.java b/src/main/java/io/anserini/util/DumpAnalyzedQueries.java index b2e35425ff..9419343dee 100644 --- a/src/main/java/io/anserini/util/DumpAnalyzedQueries.java +++ b/src/main/java/io/anserini/util/DumpAnalyzedQueries.java @@ -17,14 +17,21 @@ package io.anserini.util; import io.anserini.analysis.AnalyzerUtils; +import io.anserini.analysis.AnalyzerMap; +import io.anserini.analysis.DefaultEnglishAnalyzer; import io.anserini.index.IndexCollection; import io.anserini.search.topicreader.TopicReader; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.commons.lang3.StringUtils; import org.kohsuke.args4j.CmdLineException; import org.kohsuke.args4j.CmdLineParser; import org.kohsuke.args4j.Option; import org.kohsuke.args4j.ParserProperties; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + import java.io.FileOutputStream; import java.io.IOException; import java.nio.file.Path; @@ -39,6 +46,8 @@ */ public class DumpAnalyzedQueries { + private static final Logger LOG = LogManager.getLogger(DumpAnalyzedQueries.class); + public static class Args { @Option(name = "-topicreader", metaVar = "[class]", usage = "topic reader") public String topicReader = null; @@ -48,12 +57,35 @@ public static class Args { @Option(name = "-output", metaVar = "[file]", required = true, usage = "queries") public String output; + + @Option(name = "-language", usage = "Analyzer Language") + public String language = "en"; + } + + static Analyzer getAnalyzer(Args args) { + try { + if (AnalyzerMap.analyzerMap.containsKey(args.language)) { + LOG.info("Using language-specific analyzer"); + LOG.info("Language: " + args.language); + return 
AnalyzerMap.getLanguageSpecificAnalyzer(args.language); + } else if (args.language.equals("sw") || args.language.equals("te")) { + LOG.info("Using WhitespaceAnalyzer"); + return new WhitespaceAnalyzer(); + } else { + // Default to English + LOG.info("Using DefaultEnglishAnalyzer"); + return IndexCollection.DEFAULT_ANALYZER; + } + } catch (Exception e) { + return null; + } } @SuppressWarnings("unchecked") public static void main(String[] argv) throws IOException { Args args = new Args(); CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(90)); + Analyzer analyzer; try { parser.parseArgument(argv); @@ -68,10 +100,10 @@ public static void main(String[] argv) throws IOException { // Can we infer the TopicReader? Class clazz = TopicReader.getTopicReaderClassByFile(args.topicsFile.toString()); if (clazz != null) { - System.out.println(String.format("Inferring %s has TopicReader class %s.", args.topicsFile, clazz)); + LOG.warn(String.format("Inferring %s has TopicReader class %s.", args.topicsFile, clazz)); } else { // If not, get it from the command-line argument. 
- System.out.println(String.format("Unable to infer TopicReader class for %s, using specified class %s.", + LOG.info(String.format("Unable to infer TopicReader class for %s, using specified class %s.", args.topicsFile, args.topicReader)); if (args.topicReader == null) { System.err.println("Must specify TopicReader with -topicreader!"); @@ -87,16 +119,16 @@ public static void main(String[] argv) throws IOException { e.printStackTrace(); throw new IllegalArgumentException("Unable to load TopicReader: " + args.topicReader); } - SortedMap> topics = tr.read(); + analyzer = getAnalyzer(args); FileOutputStream out = new FileOutputStream(args.output); for (Map.Entry> entry : topics.entrySet()) { - List tokens = AnalyzerUtils.analyze(IndexCollection.DEFAULT_ANALYZER, entry.getValue().get("title")); + List tokens = AnalyzerUtils.analyze(analyzer, entry.getValue().get("title")); out.write((entry.getKey() + "\t" + StringUtils.join(tokens, " ") + "\n").getBytes()); } out.close(); - System.out.println("Done!"); + LOG.info("Done!"); } } diff --git a/src/main/python/regressions-batch03.txt b/src/main/python/regressions-batch03.txt index eb42685ff2..0eff24e557 100644 --- a/src/main/python/regressions-batch03.txt +++ b/src/main/python/regressions-batch03.txt @@ -1,3 +1,9 @@ + +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-passage-splade-pp-ed > logs/log.msmarco-v2-passage-splade-pp-ed 2>&1 +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v2-passage-splade-pp-sd > logs/log.msmarco-v2-passage-splade-pp-sd 2>&1 + +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil > logs/log.msmarco-passage-cos-dpr-distil 2>&1 + # ONNX runs write to the same indexes as the encoded queries, so we need to spread out python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-ed-onnx > 
logs/log.msmarco-passage-splade-pp-ed-onnx 2>&1 python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-sd-onnx > logs/log.msmarco-passage-splade-pp-sd-onnx 2>&1 @@ -116,6 +122,9 @@ python src/main/python/run_regression.py --verify --search --regression dl21-pas python src/main/python/run_regression.py --verify --search --regression dl21-passage-unicoil-noexp-0shot > logs/log.dl21-passage-unicoil-noexp-0shot 2>&1 python src/main/python/run_regression.py --verify --search --regression dl21-passage-unicoil-0shot > logs/log.dl21-passage-unicoil-0shot 2>&1 +python src/main/python/run_regression.py --verify --search --regression dl21-passage-splade-pp-ed > logs/log.dl21-passage-splade-pp-ed 2>&1 +python src/main/python/run_regression.py --verify --search --regression dl21-passage-splade-pp-sd > logs/log.dl21-passage-splade-pp-sd 2>&1 + python src/main/python/run_regression.py --verify --search --regression dl21-doc > logs/log.dl21-doc 2>&1 python src/main/python/run_regression.py --verify --search --regression dl21-doc-d2q-t5 > logs/log.dl21-doc-d2q-t5 2>&1 python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented > logs/log.dl21-doc-segmented 2>&1 @@ -132,6 +141,9 @@ python src/main/python/run_regression.py --verify --search --regression dl22-pas python src/main/python/run_regression.py --verify --search --regression dl22-passage-unicoil-noexp-0shot > logs/log.dl22-passage-unicoil-noexp-0shot 2>&1 python src/main/python/run_regression.py --verify --search --regression dl22-passage-unicoil-0shot > logs/log.dl22-passage-unicoil-0shot 2>&1 +python src/main/python/run_regression.py --verify --search --regression dl22-passage-splade-pp-ed > logs/log.dl22-passage-splade-pp-ed 2>&1 +python src/main/python/run_regression.py --verify --search --regression dl22-passage-splade-pp-sd > logs/log.dl22-passage-splade-pp-sd 2>&1 + # MIRACL python src/main/python/run_regression.py --index --verify 
--search --regression miracl-v1.0-ar > logs/log.miracl-v1.0-ar 2>&1 python src/main/python/run_regression.py --index --verify --search --regression miracl-v1.0-bn > logs/log.miracl-v1.0-bn 2>&1 diff --git a/src/main/resources/docgen/templates/dl19-doc-segmented-unicoil-noexp.template b/src/main/resources/docgen/templates/dl19-doc-segmented-unicoil-noexp.template index de645b06b4..12cb2f15f7 100644 --- a/src/main/resources/docgen/templates/dl19-doc-segmented-unicoil-noexp.template +++ b/src/main/resources/docgen/templates/dl19-doc-segmented-unicoil-noexp.template @@ -36,11 +36,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-doc-segmented-unicoil-noexp.tar -P collections/ -tar xvf collections/msmarco-doc-segmented-unicoil-noexp.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-doc-segmented-unicoil-noexp.tar` is 11 GB and has MD5 checksum `11b226e1cacd9c8ae0a660fd14cdd710`. +To confirm, `${corpus}.tar` is 11 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl19-doc-segmented-unicoil.template b/src/main/resources/docgen/templates/dl19-doc-segmented-unicoil.template index 05e1ee971b..bcf48dbe91 100644 --- a/src/main/resources/docgen/templates/dl19-doc-segmented-unicoil.template +++ b/src/main/resources/docgen/templates/dl19-doc-segmented-unicoil.template @@ -36,11 +36,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-doc-segmented-unicoil.tar -P collections/ -tar xvf collections/msmarco-doc-segmented-unicoil.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-doc-segmented-unicoil.tar` is 19 GB and has MD5 checksum `6a00e2c0c375cb1e52c83ae5ac377ebb`. +To confirm, `${corpus}.tar` is 19 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl19-passage-bm25-b8.template b/src/main/resources/docgen/templates/dl19-passage-bm25-b8.template index dfe7653950..355484f0e9 100644 --- a/src/main/resources/docgen/templates/dl19-passage-bm25-b8.template +++ b/src/main/resources/docgen/templates/dl19-passage-bm25-b8.template @@ -29,11 +29,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-bm25-b8.tar -P collections/ -tar xvf collections/msmarco-passage-bm25-b8.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-bm25-b8.tar` is 1.2 GB and has MD5 checksum `0a623e2c97ac6b7e814bf1323a97b435`. +To confirm, `${corpus}.tar` is 1.2 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl19-passage-splade-distil-cocodenser-medium.template b/src/main/resources/docgen/templates/dl19-passage-splade-distil-cocodenser-medium.template index 55054dbce4..9d339301e6 100644 --- a/src/main/resources/docgen/templates/dl19-passage-splade-distil-cocodenser-medium.template +++ b/src/main/resources/docgen/templates/dl19-passage-splade-distil-cocodenser-medium.template @@ -33,11 +33,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-splade_distil_cocodenser_medium.tar -P collections/ -tar xvf collections/msmarco-passage-splade_distil_cocodenser_medium.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-splade_distil_cocodenser_medium.tar` is 4.9 GB and has MD5 checksum `f77239a26d08856e6491a34062893b0c`. +To confirm, `${corpus}.tar` is 4.9 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl19-passage-splade-pp-ed-onnx.template b/src/main/resources/docgen/templates/dl19-passage-splade-pp-ed-onnx.template index bdf035b277..d7c142ec2b 100644 --- a/src/main/resources/docgen/templates/dl19-passage-splade-pp-ed-onnx.template +++ b/src/main/resources/docgen/templates/dl19-passage-splade-pp-ed-onnx.template @@ -35,11 +35,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-ed.tar -P collections/ -tar xvf collections/msmarco-passage-splade-pp-ed.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-splade-pp-ed.tar` is 4.2 GB and has MD5 checksum `e489133bdc54ee1e7c62a32aa582bc77`. +To confirm, `${corpus}.tar` is 4.2 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl19-passage-splade-pp-ed.template b/src/main/resources/docgen/templates/dl19-passage-splade-pp-ed.template index 6bd8702149..421158889c 100644 --- a/src/main/resources/docgen/templates/dl19-passage-splade-pp-ed.template +++ b/src/main/resources/docgen/templates/dl19-passage-splade-pp-ed.template @@ -35,11 +35,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-ed.tar -P collections/ -tar xvf collections/msmarco-passage-splade-pp-ed.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-splade-pp-ed.tar` is 4.2 GB and has MD5 checksum `e489133bdc54ee1e7c62a32aa582bc77`. +To confirm, `${corpus}.tar` is 4.2 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl19-passage-splade-pp-sd-onnx.template b/src/main/resources/docgen/templates/dl19-passage-splade-pp-sd-onnx.template index 2051b6bafa..99a3680cda 100644 --- a/src/main/resources/docgen/templates/dl19-passage-splade-pp-sd-onnx.template +++ b/src/main/resources/docgen/templates/dl19-passage-splade-pp-sd-onnx.template @@ -35,11 +35,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-sd.tar -P collections/ -tar xvf collections/msmarco-passage-splade-pp-sd.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-splade-pp-sd.tar` is 4.8 GB and has MD5 checksum `cb7e264222f2bf2221dd2c9d28190be1`. +To confirm, `${corpus}.tar` is 4.8 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl19-passage-splade-pp-sd.template b/src/main/resources/docgen/templates/dl19-passage-splade-pp-sd.template index ac6ea3741b..02d52a99b4 100644 --- a/src/main/resources/docgen/templates/dl19-passage-splade-pp-sd.template +++ b/src/main/resources/docgen/templates/dl19-passage-splade-pp-sd.template @@ -35,11 +35,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-sd.tar -P collections/ -tar xvf collections/msmarco-passage-splade-pp-sd.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-splade-pp-sd.tar` is 4.8 GB and has MD5 checksum `cb7e264222f2bf2221dd2c9d28190be1`. +To confirm, `${corpus}.tar` is 4.8 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl19-passage-unicoil-noexp.template b/src/main/resources/docgen/templates/dl19-passage-unicoil-noexp.template index c32f87be33..0572e40703 100644 --- a/src/main/resources/docgen/templates/dl19-passage-unicoil-noexp.template +++ b/src/main/resources/docgen/templates/dl19-passage-unicoil-noexp.template @@ -38,11 +38,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil-noexp.tar -P collections/ -tar xvf collections/msmarco-passage-unicoil-noexp.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-unicoil-noexp.tar` is 2.7 GB and has MD5 checksum `f17ddd8c7c00ff121c3c3b147d2e17d8`. +To confirm, `${corpus}.tar` is 2.7 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl19-passage-unicoil.template b/src/main/resources/docgen/templates/dl19-passage-unicoil.template index 7be33112e2..44452634bd 100644 --- a/src/main/resources/docgen/templates/dl19-passage-unicoil.template +++ b/src/main/resources/docgen/templates/dl19-passage-unicoil.template @@ -38,11 +38,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil.tar -P collections/ -tar xvf collections/msmarco-passage-unicoil.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-unicoil.tar` is 3.4 GB and has MD5 checksum `78eef752c78c8691f7d61600ceed306f`. +To confirm, `${corpus}.tar` is 3.4 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl20-doc-segmented-unicoil-noexp.template b/src/main/resources/docgen/templates/dl20-doc-segmented-unicoil-noexp.template index 85d816d84f..552521c66e 100644 --- a/src/main/resources/docgen/templates/dl20-doc-segmented-unicoil-noexp.template +++ b/src/main/resources/docgen/templates/dl20-doc-segmented-unicoil-noexp.template @@ -36,11 +36,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-doc-segmented-unicoil-noexp.tar -P collections/ -tar xvf collections/msmarco-doc-segmented-unicoil-noexp.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-doc-segmented-unicoil-noexp.tar` is 11 GB and has MD5 checksum `11b226e1cacd9c8ae0a660fd14cdd710`. +To confirm, `${corpus}.tar` is 11 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl20-doc-segmented-unicoil.template b/src/main/resources/docgen/templates/dl20-doc-segmented-unicoil.template index 31f2fafd21..dfe208c0b2 100644 --- a/src/main/resources/docgen/templates/dl20-doc-segmented-unicoil.template +++ b/src/main/resources/docgen/templates/dl20-doc-segmented-unicoil.template @@ -36,11 +36,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-doc-segmented-unicoil.tar -P collections/ -tar xvf collections/msmarco-doc-segmented-unicoil.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-doc-segmented-unicoil.tar` is 19 GB and has MD5 checksum `6a00e2c0c375cb1e52c83ae5ac377ebb`. +To confirm, `${corpus}.tar` is 19 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl20-passage-bm25-b8.template b/src/main/resources/docgen/templates/dl20-passage-bm25-b8.template index 0816158a8e..333e935627 100644 --- a/src/main/resources/docgen/templates/dl20-passage-bm25-b8.template +++ b/src/main/resources/docgen/templates/dl20-passage-bm25-b8.template @@ -29,11 +29,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-bm25-b8.tar -P collections/ -tar xvf collections/msmarco-passage-bm25-b8.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-bm25-b8.tar` is 1.2 GB and has MD5 checksum `0a623e2c97ac6b7e814bf1323a97b435`. +To confirm, `${corpus}.tar` is 1.2 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl20-passage-splade-distil-cocodenser-medium.template b/src/main/resources/docgen/templates/dl20-passage-splade-distil-cocodenser-medium.template index 8c10bbdf5d..5a00106642 100644 --- a/src/main/resources/docgen/templates/dl20-passage-splade-distil-cocodenser-medium.template +++ b/src/main/resources/docgen/templates/dl20-passage-splade-distil-cocodenser-medium.template @@ -33,11 +33,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-splade_distil_cocodenser_medium.tar -P collections/ -tar xvf collections/msmarco-passage-splade_distil_cocodenser_medium.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-splade_distil_cocodenser_medium.tar` is 4.9 GB and has MD5 checksum `f77239a26d08856e6491a34062893b0c`. +To confirm, `${corpus}.tar` is 4.9 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl20-passage-splade-pp-ed-onnx.template b/src/main/resources/docgen/templates/dl20-passage-splade-pp-ed-onnx.template index c586226458..5fa28d601d 100644 --- a/src/main/resources/docgen/templates/dl20-passage-splade-pp-ed-onnx.template +++ b/src/main/resources/docgen/templates/dl20-passage-splade-pp-ed-onnx.template @@ -35,11 +35,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-ed.tar -P collections/ -tar xvf collections/msmarco-passage-splade-pp-ed.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-splade-pp-ed.tar` is 4.2 GB and has MD5 checksum `e489133bdc54ee1e7c62a32aa582bc77`. +To confirm, `${corpus}.tar` is 4.2 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl20-passage-splade-pp-ed.template b/src/main/resources/docgen/templates/dl20-passage-splade-pp-ed.template index d5e081fa5e..77344b165b 100644 --- a/src/main/resources/docgen/templates/dl20-passage-splade-pp-ed.template +++ b/src/main/resources/docgen/templates/dl20-passage-splade-pp-ed.template @@ -35,11 +35,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-ed.tar -P collections/ -tar xvf collections/msmarco-passage-splade-pp-ed.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-splade-pp-ed.tar` is 4.2 GB and has MD5 checksum `e489133bdc54ee1e7c62a32aa582bc77`. +To confirm, `${corpus}.tar` is 4.2 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl20-passage-splade-pp-sd-onnx.template b/src/main/resources/docgen/templates/dl20-passage-splade-pp-sd-onnx.template index 2f6aed7aff..c92d160bdb 100644 --- a/src/main/resources/docgen/templates/dl20-passage-splade-pp-sd-onnx.template +++ b/src/main/resources/docgen/templates/dl20-passage-splade-pp-sd-onnx.template @@ -35,11 +35,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-sd.tar -P collections/ -tar xvf collections/msmarco-passage-splade-pp-sd.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-splade-pp-sd.tar` is 4.8 GB and has MD5 checksum `cb7e264222f2bf2221dd2c9d28190be1`. +To confirm, `${corpus}.tar` is 4.8 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl20-passage-splade-pp-sd.template b/src/main/resources/docgen/templates/dl20-passage-splade-pp-sd.template index 077143544d..b17cdce1d0 100644 --- a/src/main/resources/docgen/templates/dl20-passage-splade-pp-sd.template +++ b/src/main/resources/docgen/templates/dl20-passage-splade-pp-sd.template @@ -35,11 +35,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-sd.tar -P collections/ -tar xvf collections/msmarco-passage-splade-pp-sd.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-splade-pp-sd.tar` is 4.8 GB and has MD5 checksum `cb7e264222f2bf2221dd2c9d28190be1`. +To confirm, `${corpus}.tar` is 4.8 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl20-passage-unicoil-noexp.template b/src/main/resources/docgen/templates/dl20-passage-unicoil-noexp.template index 65622b3985..7fd5ae38e4 100644 --- a/src/main/resources/docgen/templates/dl20-passage-unicoil-noexp.template +++ b/src/main/resources/docgen/templates/dl20-passage-unicoil-noexp.template @@ -38,11 +38,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil-noexp.tar -P collections/ -tar xvf collections/msmarco-passage-unicoil-noexp.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-unicoil-noexp.tar` is 2.7 GB and has MD5 checksum `f17ddd8c7c00ff121c3c3b147d2e17d8`. +To confirm, `${corpus}.tar` is 2.7 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl20-passage-unicoil.template b/src/main/resources/docgen/templates/dl20-passage-unicoil.template index 897a15cd41..d48b1b0c3b 100644 --- a/src/main/resources/docgen/templates/dl20-passage-unicoil.template +++ b/src/main/resources/docgen/templates/dl20-passage-unicoil.template @@ -38,11 +38,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil.tar -P collections/ -tar xvf collections/msmarco-passage-unicoil.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-unicoil.tar` is 3.4 GB and has MD5 checksum `78eef752c78c8691f7d61600ceed306f`. +To confirm, `${corpus}.tar` is 3.4 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-0shot-v2.template b/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-0shot-v2.template index 737f52c797..9ecc949766 100644 --- a/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-0shot-v2.template +++ b/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-0shot-v2.template @@ -44,16 +44,16 @@ Download, unpack, and prepare the corpus: ```bash # Download -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_0shot_v2.tar -P collections/ +wget ${download_url} -P collections/ # Unpack -tar -xvf collections/msmarco_v2_doc_segmented_unicoil_0shot_v2.tar -C collections/ +tar -xvf collections/${download_corpus}.tar -C collections/ # Rename (indexer is expecting corpus under a slightly different name) -mv collections/msmarco_v2_doc_segmented_unicoil_0shot_v2 collections/msmarco-v2-doc-segmented-unicoil-0shot-v2 +mv collections/${download_corpus} collections/${corpus} ``` -To confirm, `msmarco_v2_doc_segmented_unicoil_0shot_v2.tar` is 72 GB and has an MD5 checksum of `c5639748c2cbad0152e10b0ebde3b804`. +To confirm, `${download_corpus}.tar` is 72 GB and has an MD5 checksum of `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-0shot.template b/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-0shot.template index 5ba0872278..9aa1161dc8 100644 --- a/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-0shot.template +++ b/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-0shot.template @@ -43,16 +43,16 @@ Download, unpack, and prepare the corpus: ```bash # Download -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_0shot.tar -P collections/ +wget ${download_url} -P collections/ # Unpack -tar -xvf collections/msmarco_v2_doc_segmented_unicoil_0shot.tar -C collections/ +tar -xvf collections/${download_corpus}.tar -C collections/ # Rename (indexer is expecting corpus under a slightly different name) -mv collections/msmarco_v2_doc_segmented_unicoil_0shot collections/msmarco-v2-doc-segmented-unicoil-0shot +mv collections/${download_corpus} collections/${corpus} ``` -To confirm, `msmarco_v2_doc_segmented_unicoil_0shot.tar` is 62 GB and has an MD5 checksum of `889db095113cc4fe152382ccff73304a`. +To confirm, `${download_corpus}.tar` is 62 GB and has an MD5 checksum of `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-noexp-0shot-v2.template b/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-noexp-0shot-v2.template index 69d1b63dfe..1a92562766 100644 --- a/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-noexp-0shot-v2.template +++ b/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-noexp-0shot-v2.template @@ -44,16 +44,16 @@ Download, unpack, and prepare the corpus: ```bash # Download -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar -P collections/ +wget ${download_url} -P collections/ # Unpack -tar -xvf collections/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar -C collections/ +tar -xvf collections/${download_corpus}.tar -C collections/ # Rename (indexer is expecting corpus under a slightly different name) -mv collections/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2 collections/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2 +mv collections/${download_corpus} collections/${corpus} ``` -To confirm, `msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar` is 55 GB and has an MD5 checksum of `97ba262c497164de1054f357caea0c63`. +To confirm, `${download_corpus}.tar` is 55 GB and has an MD5 checksum of `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-noexp-0shot.template b/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-noexp-0shot.template index d77265f816..97beebe3d7 100644 --- a/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-noexp-0shot.template +++ b/src/main/resources/docgen/templates/dl21-doc-segmented-unicoil-noexp-0shot.template @@ -43,16 +43,16 @@ Download, unpack, and prepare the corpus: ```bash # Download -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_noexp_0shot.tar -P collections/ +wget ${download_url} -P collections/ # Unpack -tar -xvf collections/msmarco_v2_doc_segmented_unicoil_noexp_0shot.tar -C collections/ +tar -xvf collections/${download_corpus}.tar -C collections/ # Rename (indexer is expecting corpus under a slightly different name) -mv collections/msmarco_v2_doc_segmented_unicoil_noexp_0shot collections/msmarco-v2-doc-segmented-unicoil-noexp-0shot +mv collections/${download_corpus} collections/${corpus} ``` -To confirm, `msmarco_v2_doc_segmented_unicoil_noexp_0shot.tar` is 54 GB and has an MD5 checksum of `28261587d6afde56efd8df4f950e7fb4`. +To confirm, `${download_corpus}.tar` is 54 GB and has an MD5 checksum of `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl21-passage-splade-pp-ed.template b/src/main/resources/docgen/templates/dl21-passage-splade-pp-ed.template index bab695a7bc..3d1592b742 100644 --- a/src/main/resources/docgen/templates/dl21-passage-splade-pp-ed.template +++ b/src/main/resources/docgen/templates/dl21-passage-splade-pp-ed.template @@ -34,11 +34,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco_v2_passage_splade_pp_ed.tar -P collections/ -tar xvf collections/msmarco_v2_passage_splade_pp_ed.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${download_corpus}.tar -C collections/ ``` -To confirm, `msmarco_v2_passage_splade_pp_ed.tar` is 66 GB and has MD5 checksum `2cdb2adc259b8fa6caf666b20ebdc0e8`. +To confirm, `${download_corpus}.tar` is 66 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl21-passage-splade-pp-sd.template b/src/main/resources/docgen/templates/dl21-passage-splade-pp-sd.template index b1e03500f2..bd64ec32fa 100644 --- a/src/main/resources/docgen/templates/dl21-passage-splade-pp-sd.template +++ b/src/main/resources/docgen/templates/dl21-passage-splade-pp-sd.template @@ -34,11 +34,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco_v2_passage_splade_pp_sd.tar -P collections/ -tar xvf collections/msmarco_v2_passage_splade_pp_sd.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${download_corpus}.tar -C collections/ ``` -To confirm, `msmarco_v2_passage_splade_pp_sd.tar` is 76 GB and has MD5 checksum `061930dd615c7c807323ea7fc7957877`. +To confirm, `${download_corpus}.tar` is 76 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl21-passage-unicoil-0shot.template b/src/main/resources/docgen/templates/dl21-passage-unicoil-0shot.template index ade0daeada..1b551dfe13 100644 --- a/src/main/resources/docgen/templates/dl21-passage-unicoil-0shot.template +++ b/src/main/resources/docgen/templates/dl21-passage-unicoil-0shot.template @@ -38,16 +38,16 @@ Download, unpack, and prepare the corpus: ```bash # Download -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_0shot.tar -P collections/ +wget ${download_url} -P collections/ # Unpack -tar -xvf collections/msmarco_v2_passage_unicoil_0shot.tar -C collections/ +tar -xvf collections/${download_corpus}.tar -C collections/ # Rename (indexer is expecting corpus under a slightly different name) -mv collections/msmarco_v2_passage_unicoil_0shot collections/msmarco-v2-passage-unicoil-0shot +mv collections/${download_corpus} collections/${corpus} ``` -To confirm, `msmarco_v2_passage_unicoil_0shot.tar` is 41 GB and has an MD5 checksum of `1949a00bfd5e1f1a230a04bbc1f01539`. +To confirm, `${download_corpus}.tar` is 41 GB and has an MD5 checksum of `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl21-passage-unicoil-noexp-0shot.template b/src/main/resources/docgen/templates/dl21-passage-unicoil-noexp-0shot.template index 15718b35c5..7457ef88d5 100644 --- a/src/main/resources/docgen/templates/dl21-passage-unicoil-noexp-0shot.template +++ b/src/main/resources/docgen/templates/dl21-passage-unicoil-noexp-0shot.template @@ -38,16 +38,16 @@ Download, unpack, and prepare the corpus: ```bash # Download -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_noexp_0shot.tar -P collections/ +wget ${download_url} -P collections/ # Unpack -tar -xvf collections/msmarco_v2_passage_unicoil_noexp_0shot.tar -C collections/ +tar -xvf collections/${download_corpus}.tar -C collections/ # Rename (indexer is expecting corpus under a slightly different name) -mv collections/msmarco_v2_passage_unicoil_noexp_0shot collections/msmarco-v2-passage-unicoil-noexp-0shot +mv collections/${download_corpus} collections/${corpus} ``` -To confirm, `msmarco_v2_passage_unicoil_noexp_0shot.tar` is 24 GB and has an MD5 checksum of `d9cc1ed3049746e68a2c91bf90e5212d`. +To confirm, `${download_corpus}.tar` is 24 GB and has an MD5 checksum of `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl22-passage-splade-pp-ed.template b/src/main/resources/docgen/templates/dl22-passage-splade-pp-ed.template index 6861dfa807..a98ecb36b6 100644 --- a/src/main/resources/docgen/templates/dl22-passage-splade-pp-ed.template +++ b/src/main/resources/docgen/templates/dl22-passage-splade-pp-ed.template @@ -34,11 +34,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco_v2_passage_splade_pp_ed.tar -P collections/ -tar xvf collections/msmarco_v2_passage_splade_pp_ed.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${download_corpus}.tar -C collections/ ``` -To confirm, `msmarco_v2_passage_splade_pp_ed.tar` is 66 GB and has MD5 checksum `2cdb2adc259b8fa6caf666b20ebdc0e8`. +To confirm, `${download_corpus}.tar` is 66 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl22-passage-splade-pp-sd.template b/src/main/resources/docgen/templates/dl22-passage-splade-pp-sd.template index 78b82bf194..f08ada810e 100644 --- a/src/main/resources/docgen/templates/dl22-passage-splade-pp-sd.template +++ b/src/main/resources/docgen/templates/dl22-passage-splade-pp-sd.template @@ -34,11 +34,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco_v2_passage_splade_pp_sd.tar -P collections/ -tar xvf collections/msmarco_v2_passage_splade_pp_sd.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${download_corpus}.tar -C collections/ ``` -To confirm, `msmarco_v2_passage_splade_pp_sd.tar` is 76 GB and has MD5 checksum `061930dd615c7c807323ea7fc7957877`. +To confirm, `${download_corpus}.tar` is 76 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl22-passage-unicoil-0shot.template b/src/main/resources/docgen/templates/dl22-passage-unicoil-0shot.template index d87499be67..365c880e9e 100644 --- a/src/main/resources/docgen/templates/dl22-passage-unicoil-0shot.template +++ b/src/main/resources/docgen/templates/dl22-passage-unicoil-0shot.template @@ -38,16 +38,16 @@ Download, unpack, and prepare the corpus: ```bash # Download -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_0shot.tar -P collections/ +wget ${download_url} -P collections/ # Unpack -tar -xvf collections/msmarco_v2_passage_unicoil_0shot.tar -C collections/ +tar -xvf collections/${download_corpus}.tar -C collections/ # Rename (indexer is expecting corpus under a slightly different name) -mv collections/msmarco_v2_passage_unicoil_0shot collections/msmarco-v2-passage-unicoil-0shot +mv collections/${download_corpus} collections/${corpus} ``` -To confirm, `msmarco_v2_passage_unicoil_0shot.tar` is 41 GB and has an MD5 checksum of `1949a00bfd5e1f1a230a04bbc1f01539`. +To confirm, `${download_corpus}.tar` is 41 GB and has an MD5 checksum of `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl22-passage-unicoil-noexp-0shot.template b/src/main/resources/docgen/templates/dl22-passage-unicoil-noexp-0shot.template index c66f5e31ed..ae6cc78be7 100644 --- a/src/main/resources/docgen/templates/dl22-passage-unicoil-noexp-0shot.template +++ b/src/main/resources/docgen/templates/dl22-passage-unicoil-noexp-0shot.template @@ -38,16 +38,16 @@ Download, unpack, and prepare the corpus: ```bash # Download -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_noexp_0shot.tar -P collections/ +wget ${download_url} -P collections/ # Unpack -tar -xvf collections/msmarco_v2_passage_unicoil_noexp_0shot.tar -C collections/ +tar -xvf collections/${download_corpus}.tar -C collections/ # Rename (indexer is expecting corpus under a slightly different name) -mv collections/msmarco_v2_passage_unicoil_noexp_0shot collections/msmarco-v2-passage-unicoil-noexp-0shot +mv collections/${download_corpus} collections/${corpus} ``` -To confirm, `msmarco_v2_passage_unicoil_noexp_0shot.tar` is 24 GB and has an MD5 checksum of `d9cc1ed3049746e68a2c91bf90e5212d`. +To confirm, `${download_corpus}.tar` is 24 GB and has an MD5 checksum of `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-doc-segmented-unicoil-noexp.template b/src/main/resources/docgen/templates/msmarco-doc-segmented-unicoil-noexp.template index c295dd7fa9..a50bc50587 100644 --- a/src/main/resources/docgen/templates/msmarco-doc-segmented-unicoil-noexp.template +++ b/src/main/resources/docgen/templates/msmarco-doc-segmented-unicoil-noexp.template @@ -36,11 +36,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-doc-segmented-unicoil-noexp.tar -P collections/ -tar xvf collections/msmarco-doc-segmented-unicoil-noexp.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-doc-segmented-unicoil-noexp.tar` is 11 GB and has MD5 checksum `11b226e1cacd9c8ae0a660fd14cdd710`. +To confirm, `${corpus}.tar` is 11 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-doc-segmented-unicoil.template b/src/main/resources/docgen/templates/msmarco-doc-segmented-unicoil.template index f4da0450dd..fa174b39c8 100644 --- a/src/main/resources/docgen/templates/msmarco-doc-segmented-unicoil.template +++ b/src/main/resources/docgen/templates/msmarco-doc-segmented-unicoil.template @@ -36,11 +36,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-doc-segmented-unicoil.tar -P collections/ -tar xvf collections/msmarco-doc-segmented-unicoil.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-doc-segmented-unicoil.tar` is 19 GB and has MD5 checksum `6a00e2c0c375cb1e52c83ae5ac377ebb`. +To confirm, `${corpus}.tar` is 19 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-passage-bm25-b8.template b/src/main/resources/docgen/templates/msmarco-passage-bm25-b8.template index 020f09cada..4e07c6e189 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-bm25-b8.template +++ b/src/main/resources/docgen/templates/msmarco-passage-bm25-b8.template @@ -27,11 +27,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-bm25-b8.tar -P collections/ -tar xvf collections/msmarco-passage-bm25-b8.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-bm25-b8.tar` is 1.2 GB and has MD5 checksum `0a623e2c97ac6b7e814bf1323a97b435`. +To confirm, `${corpus}.tar` is 1.2 GB and has MD5 checksum `${download_checksum}`. With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil.template b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil.template new file mode 100644 index 0000000000..7c022ae3f0 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil.template @@ -0,0 +1,89 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. 
[Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://arxiv.org/abs/2304.12139) _arXiv:2304.12139_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + + + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between experimental runs. +Nevertheless, scores are generally stable to the third digit after the decimal point. + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/src/main/resources/docgen/templates/msmarco-passage-deepimpact.template b/src/main/resources/docgen/templates/msmarco-passage-deepimpact.template index 8eca7f8854..c4848780e3 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-deepimpact.template +++ b/src/main/resources/docgen/templates/msmarco-passage-deepimpact.template @@ -31,11 +31,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-deepimpact.tar -P collections/ -tar xvf collections/msmarco-passage-deepimpact.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-deepimpact.tar` is 3.6 GB and has MD5 checksum `73843885b503af3c8b3ee62e5f5a9900`. +To confirm, `${corpus}.tar` is 3.6 GB and has MD5 checksum `${download_checksum}`. With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-passage-distill-splade-max.template b/src/main/resources/docgen/templates/msmarco-passage-distill-splade-max.template index dfbe966d92..8fc97927e3 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-distill-splade-max.template +++ b/src/main/resources/docgen/templates/msmarco-passage-distill-splade-max.template @@ -32,11 +32,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-distill-splade-max.tar -P collections/ -tar xvf collections/msmarco-passage-distill-splade-max.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-distill-splade-max.tar` is 9.9 GB and has MD5 checksum 
`b5d126f5d9a8e1b3ef3f5cb0ba651725`. +To confirm, `${corpus}.tar` is 9.9 GB and has MD5 checksum `${download_checksum}`. With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-passage-splade-distil-cocodenser-medium.template b/src/main/resources/docgen/templates/msmarco-passage-splade-distil-cocodenser-medium.template index a3498b9dc9..25c4505e6b 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-splade-distil-cocodenser-medium.template +++ b/src/main/resources/docgen/templates/msmarco-passage-splade-distil-cocodenser-medium.template @@ -30,11 +30,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-splade_distil_cocodenser_medium.tar -P collections/ -tar xvf collections/msmarco-passage-splade_distil_cocodenser_medium.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-splade_distil_cocodenser_medium.tar` is 4.9 GB and has MD5 checksum `f77239a26d08856e6491a34062893b0c`. +To confirm, `${corpus}.tar` is 4.9 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-passage-splade-pp-ed-onnx.template b/src/main/resources/docgen/templates/msmarco-passage-splade-pp-ed-onnx.template index 8715ba7577..f08d3f4b44 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-splade-pp-ed-onnx.template +++ b/src/main/resources/docgen/templates/msmarco-passage-splade-pp-ed-onnx.template @@ -32,11 +32,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-ed.tar -P collections/ -tar xvf collections/msmarco-passage-splade-pp-ed.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-splade-pp-ed.tar` is 4.2 GB and has MD5 checksum `e489133bdc54ee1e7c62a32aa582bc77`. +To confirm, `${corpus}.tar` is 4.2 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-passage-splade-pp-ed.template b/src/main/resources/docgen/templates/msmarco-passage-splade-pp-ed.template index ce9a817141..f3bb706a7a 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-splade-pp-ed.template +++ b/src/main/resources/docgen/templates/msmarco-passage-splade-pp-ed.template @@ -32,11 +32,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-ed.tar -P collections/ -tar xvf collections/msmarco-passage-splade-pp-ed.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-splade-pp-ed.tar` is 4.2 GB and has MD5 checksum `e489133bdc54ee1e7c62a32aa582bc77`. +To confirm, `${corpus}.tar` is 4.2 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-passage-splade-pp-sd-onnx.template b/src/main/resources/docgen/templates/msmarco-passage-splade-pp-sd-onnx.template index b273b0b53b..44df0f0ea8 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-splade-pp-sd-onnx.template +++ b/src/main/resources/docgen/templates/msmarco-passage-splade-pp-sd-onnx.template @@ -32,11 +32,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-sd.tar -P collections/ -tar xvf collections/msmarco-passage-splade-pp-sd.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-splade-pp-sd.tar` is 4.8 GB and has MD5 checksum `cb7e264222f2bf2221dd2c9d28190be1`. +To confirm, `${corpus}.tar` is 4.8 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-passage-splade-pp-sd.template b/src/main/resources/docgen/templates/msmarco-passage-splade-pp-sd.template index adf224b5d2..f215dc741a 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-splade-pp-sd.template +++ b/src/main/resources/docgen/templates/msmarco-passage-splade-pp-sd.template @@ -32,11 +32,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-sd.tar -P collections/ -tar xvf collections/msmarco-passage-splade-pp-sd.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-splade-pp-sd.tar` is 4.8 GB and has MD5 checksum `cb7e264222f2bf2221dd2c9d28190be1`. +To confirm, `${corpus}.tar` is 4.8 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-passage-unicoil-noexp.template b/src/main/resources/docgen/templates/msmarco-passage-unicoil-noexp.template index a054518ec2..03ee48f139 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-unicoil-noexp.template +++ b/src/main/resources/docgen/templates/msmarco-passage-unicoil-noexp.template @@ -35,11 +35,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil-noexp.tar -P collections/ -tar xvf collections/msmarco-passage-unicoil-noexp.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-unicoil-noexp.tar` is 2.7 GB and has MD5 checksum `f17ddd8c7c00ff121c3c3b147d2e17d8`. +To confirm, `${corpus}.tar` is 2.7 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-passage-unicoil-tilde-expansion.template b/src/main/resources/docgen/templates/msmarco-passage-unicoil-tilde-expansion.template index 2d96003c11..f37a90b242 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-unicoil-tilde-expansion.template +++ b/src/main/resources/docgen/templates/msmarco-passage-unicoil-tilde-expansion.template @@ -32,11 +32,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil-tilde-expansion.tar -P collections/ -tar xvf collections/msmarco-passage-unicoil-tilde-expansion.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-unicoil-tilde-expansion.tar` is 3.9 GB and has MD5 checksum `12a9c289d94e32fd63a7d39c9677d75c`. +To confirm, `${corpus}.tar` is 3.9 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-passage-unicoil.template b/src/main/resources/docgen/templates/msmarco-passage-unicoil.template index e6960a1ac1..a2f29bdca6 100644 --- a/src/main/resources/docgen/templates/msmarco-passage-unicoil.template +++ b/src/main/resources/docgen/templates/msmarco-passage-unicoil.template @@ -32,11 +32,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil.tar -P collections/ -tar xvf collections/msmarco-passage-unicoil.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-unicoil.tar` is 3.4 GB and has MD5 checksum `78eef752c78c8691f7d61600ceed306f`. +To confirm, `${corpus}.tar` is 3.4 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-0shot-v2.template b/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-0shot-v2.template index 240ec210bd..54036106ca 100644 --- a/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-0shot-v2.template +++ b/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-0shot-v2.template @@ -41,16 +41,16 @@ Download, unpack, and prepare the corpus: ```bash # Download -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_0shot_v2.tar -P collections/ +wget ${download_url} -P collections/ # Unpack -tar -xvf collections/msmarco_v2_doc_segmented_unicoil_0shot_v2.tar -C collections/ +tar -xvf collections/${download_corpus}.tar -C collections/ # Rename (indexer is expecting corpus under a slightly different name) -mv collections/msmarco_v2_doc_segmented_unicoil_0shot_v2 collections/msmarco-v2-doc-segmented-unicoil-0shot-v2 +mv collections/${download_corpus} collections/${corpus} ``` -To confirm, `msmarco_v2_doc_segmented_unicoil_0shot_v2.tar` is 72 GB and has an MD5 checksum of `c5639748c2cbad0152e10b0ebde3b804`. +To confirm, `${download_corpus}.tar` is 72 GB and has an MD5 checksum of `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-0shot.template b/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-0shot.template index 82521f154f..0f7b8944ef 100644 --- a/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-0shot.template +++ b/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-0shot.template @@ -40,16 +40,16 @@ Download, unpack, and prepare the corpus: ```bash # Download -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_0shot.tar -P collections/ +wget ${download_url} -P collections/ # Unpack -tar -xvf collections/msmarco_v2_doc_segmented_unicoil_0shot.tar -C collections/ +tar -xvf collections/${download_corpus}.tar -C collections/ # Rename (indexer is expecting corpus under a slightly different name) -mv collections/msmarco_v2_doc_segmented_unicoil_0shot collections/msmarco-v2-doc-segmented-unicoil-0shot +mv collections/${download_corpus} collections/${corpus} ``` -To confirm, `msmarco_v2_doc_segmented_unicoil_0shot.tar` is 62 GB and has an MD5 checksum of `889db095113cc4fe152382ccff73304a`. +To confirm, `${download_corpus}.tar` is 62 GB and has an MD5 checksum of `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.template b/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.template index c819b6a863..1b28c3ad42 100644 --- a/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.template +++ b/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2.template @@ -41,16 +41,16 @@ Download, unpack, and prepare the corpus: ```bash # Download -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar -P collections/ +wget ${download_url} -P collections/ # Unpack -tar -xvf collections/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar -C collections/ +tar -xvf collections/${download_corpus}.tar -C collections/ # Rename (indexer is expecting corpus under a slightly different name) -mv collections/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2 collections/msmarco-v2-doc-segmented-unicoil-noexp-0shot-v2 +mv collections/${download_corpus} collections/${corpus} ``` -To confirm, `msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar` is 55 GB and has an MD5 checksum of `97ba262c497164de1054f357caea0c63`. +To confirm, `${download_corpus}.tar` is 55 GB and has an MD5 checksum of `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-noexp-0shot.template b/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-noexp-0shot.template index 960b18a5d1..74b7b377ac 100644 --- a/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-noexp-0shot.template +++ b/src/main/resources/docgen/templates/msmarco-v2-doc-segmented-unicoil-noexp-0shot.template @@ -40,16 +40,16 @@ Download, unpack, and prepare the corpus: ```bash # Download -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_noexp_0shot.tar -P collections/ +wget ${download_url} -P collections/ # Unpack -tar -xvf collections/msmarco_v2_doc_segmented_unicoil_noexp_0shot.tar -C collections/ +tar -xvf collections/${download_corpus}.tar -C collections/ # Rename (indexer is expecting corpus under a slightly different name) -mv collections/msmarco_v2_doc_segmented_unicoil_noexp_0shot collections/msmarco-v2-doc-segmented-unicoil-noexp-0shot +mv collections/${download_corpus} collections/${corpus} ``` -To confirm, `msmarco_v2_doc_segmented_unicoil_noexp_0shot.tar` is 54 GB and has an MD5 checksum of `28261587d6afde56efd8df4f950e7fb4`. +To confirm, `${download_corpus}.tar` is 54 GB and has an MD5 checksum of `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-v2-passage-splade-pp-ed.template b/src/main/resources/docgen/templates/msmarco-v2-passage-splade-pp-ed.template index feb28094ae..5306211fd3 100644 --- a/src/main/resources/docgen/templates/msmarco-v2-passage-splade-pp-ed.template +++ b/src/main/resources/docgen/templates/msmarco-v2-passage-splade-pp-ed.template @@ -33,11 +33,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco_v2_passage_splade_pp_ed.tar -P collections/ -tar xvf collections/msmarco_v2_passage_splade_pp_ed.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${download_corpus}.tar -C collections/ ``` -To confirm, `msmarco_v2_passage_splade_pp_ed.tar` is 66 GB and has MD5 checksum `2cdb2adc259b8fa6caf666b20ebdc0e8`. +To confirm, `${download_corpus}.tar` is 66 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-v2-passage-splade-pp-sd.template b/src/main/resources/docgen/templates/msmarco-v2-passage-splade-pp-sd.template index acdfc65cd1..8ad0b74923 100644 --- a/src/main/resources/docgen/templates/msmarco-v2-passage-splade-pp-sd.template +++ b/src/main/resources/docgen/templates/msmarco-v2-passage-splade-pp-sd.template @@ -33,11 +33,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco_v2_passage_splade_pp_sd.tar -P collections/ -tar xvf collections/msmarco_v2_passage_splade_pp_sd.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${download_corpus}.tar -C collections/ ``` -To confirm, `msmarco_v2_passage_splade_pp_sd.tar` is 76 GB and has MD5 checksum `061930dd615c7c807323ea7fc7957877`. +To confirm, `${download_corpus}.tar` is 76 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-v2-passage-unicoil-0shot.template b/src/main/resources/docgen/templates/msmarco-v2-passage-unicoil-0shot.template index 08bd5cdc49..c880ece60e 100644 --- a/src/main/resources/docgen/templates/msmarco-v2-passage-unicoil-0shot.template +++ b/src/main/resources/docgen/templates/msmarco-v2-passage-unicoil-0shot.template @@ -35,16 +35,16 @@ Download, unpack, and prepare the corpus: ```bash # Download -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_0shot.tar -P collections/ +wget ${download_url} -P collections/ # Unpack -tar -xvf collections/msmarco_v2_passage_unicoil_0shot.tar -C collections/ +tar -xvf collections/${download_corpus}.tar -C collections/ # Rename (indexer is expecting corpus under a slightly different name) -mv collections/msmarco_v2_passage_unicoil_0shot collections/msmarco-v2-passage-unicoil-0shot +mv collections/${download_corpus} collections/${corpus} ``` -To confirm, `msmarco_v2_passage_unicoil_0shot.tar` is 41 GB and has an MD5 checksum of `1949a00bfd5e1f1a230a04bbc1f01539`. +To confirm, `${download_corpus}.tar` is 41 GB and has an MD5 checksum of `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/msmarco-v2-passage-unicoil-noexp-0shot.template b/src/main/resources/docgen/templates/msmarco-v2-passage-unicoil-noexp-0shot.template index 96f945a143..436d22ac64 100644 --- a/src/main/resources/docgen/templates/msmarco-v2-passage-unicoil-noexp-0shot.template +++ b/src/main/resources/docgen/templates/msmarco-v2-passage-unicoil-noexp-0shot.template @@ -35,16 +35,16 @@ Download, unpack, and prepare the corpus: ```bash # Download -wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_passage_unicoil_noexp_0shot.tar -P collections/ +wget ${download_url} -P collections/ # Unpack -tar -xvf collections/msmarco_v2_passage_unicoil_noexp_0shot.tar -C collections/ +tar -xvf collections/${download_corpus}.tar -C collections/ # Rename (indexer is expecting corpus under a slightly different name) -mv collections/msmarco_v2_passage_unicoil_noexp_0shot collections/msmarco-v2-passage-unicoil-noexp-0shot +mv collections/${download_corpus} collections/${corpus} ``` -To confirm, `msmarco_v2_passage_unicoil_noexp_0shot.tar` is 24 GB and has an MD5 checksum of `d9cc1ed3049746e68a2c91bf90e5212d`. +To confirm, `${download_corpus}.tar` is 24 GB and has an MD5 checksum of `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil.yaml index 5d831504da..51a0dd3855 100644 --- a/src/main/resources/regression/msmarco-passage-cos-dpr-distil.yaml +++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil.yaml @@ -1,7 +1,10 @@ --- -corpus: msmarco-passage +corpus: msmarco-passage-cos-dpr-distil corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ collection_class: JsonDenseVectorCollection generator_class: LuceneDenseVectorDocumentGenerator diff --git a/src/test/java/io/anserini/doc/DataModel.java b/src/test/java/io/anserini/doc/DataModel.java index 51cc803e44..144c0d7b9d 100755 --- a/src/test/java/io/anserini/doc/DataModel.java +++ b/src/test/java/io/anserini/doc/DataModel.java @@ -26,7 +26,10 @@ public class DataModel { private static final String INDEX_COMMAND = "target/appassembler/bin/IndexCollection"; + private static final String INDEX_HNSW_COMMAND = "target/appassembler/bin/IndexHnswDenseVectors"; + private static final String SEARCH_COMMAND = "target/appassembler/bin/SearchCollection"; + private static final String SEARCH_HNSW_COMMAND = "target/appassembler/bin/SearchHnswDenseVectors"; private String corpus; private String corpus_path; @@ -259,8 +262,13 @@ static class Metric { } public String generateIndexingCommand(String collection) { + String indexCommand = INDEX_COMMAND; + if (getCollection_class().equals("JsonDenseVectorCollection")) { + indexCommand = INDEX_HNSW_COMMAND; + } + StringBuilder builder = new StringBuilder(); - builder.append(INDEX_COMMAND).append(" \\\n"); + builder.append(indexCommand).append(" \\\n"); 
builder.append(" -collection ").append(getCollection_class()).append(" \\\n"); builder.append(" -input ").append("/path/to/"+collection).append(" \\\n"); builder.append(" -index ").append(getIndex_path()).append(" \\\n"); @@ -291,7 +299,11 @@ public String generateRankingCommand(String collection) { StringBuilder builder = new StringBuilder(); for (Model model : getModels()) { for (Topic topic : getTopics()) { - builder.append(SEARCH_COMMAND).append(" \\\n"); + String searchCommand = SEARCH_COMMAND; + if (model.getParams().contains("VectorQueryGenerator")) { + searchCommand = SEARCH_HNSW_COMMAND; + } + builder.append(searchCommand).append(" \\\n"); builder.append(" -index").append(" ").append(getIndex_path()).append(" \\\n"); builder.append(" -topics").append(" ").append(Paths.get("tools/topics-and-qrels", topic.getPath())).append(" \\\n"); builder.append(" -topicreader").append(" ").append((topic.getTopic_reader() == null) ? getTopic_reader() : topic.getTopic_reader()).append(" \\\n"); @@ -403,7 +415,12 @@ public String generateEffectiveness(String collection) { Topic topic = getTopics().get(i); builder.append(String.format("| %1$-109s|", topic.getName())); for (Model model : getModels()) { - builder.append(String.format(" %-10.4f|", model.getResults().get(eval.getMetric()).get(i))); + // 3 digits for HNSW, 4 otherwise: + if (getCollection_class().equals("JsonDenseVectorCollection")) { + builder.append(String.format(" %-10.3f|", model.getResults().get(eval.getMetric()).get(i))); + } else { + builder.append(String.format(" %-10.4f|", model.getResults().get(eval.getMetric()).get(i))); + } } builder.append("\n"); } diff --git a/src/test/java/io/anserini/doc/GenerateRegressionDocsTest.java b/src/test/java/io/anserini/doc/GenerateRegressionDocsTest.java index 8afe996bc5..f5c6ea24cf 100755 --- a/src/test/java/io/anserini/doc/GenerateRegressionDocsTest.java +++ b/src/test/java/io/anserini/doc/GenerateRegressionDocsTest.java @@ -50,6 +50,8 @@ public void main() throws 
Exception { valuesMap.put("template", String.format("../src/main/resources/docgen/templates/%s.template", testName)); valuesMap.put("test_name", testName); valuesMap.put("corpus", corpus); + valuesMap.put("download_url", data.getDownload_url()); + valuesMap.put("download_checksum", data.getDownload_checksum()); valuesMap.put("download_corpus", download_corpus); valuesMap.put("index_cmds", data.generateIndexingCommand(corpus)); valuesMap.put("ranking_cmds", data.generateRankingCommand(corpus));