From 05b806553b0eee715f8a8468f7af9d2fdc4ba68f Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 18 Mar 2020 14:09:02 -0400 Subject: [PATCH 1/3] Compares lossy vs. exact terms. --- .../java/io/anserini/util/ExtractDocumentLengths.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/main/java/io/anserini/util/ExtractDocumentLengths.java b/src/main/java/io/anserini/util/ExtractDocumentLengths.java index cd1619d95c..ced7ed3ac9 100644 --- a/src/main/java/io/anserini/util/ExtractDocumentLengths.java +++ b/src/main/java/io/anserini/util/ExtractDocumentLengths.java @@ -66,6 +66,9 @@ public static void main(String[] args) throws Exception { PrintStream out = new PrintStream(new FileOutputStream(new File(myArgs.output))); int numDocs = reader.numDocs(); + long lossyTotalTerms = 0; + long exactTotalTerms = 0; + out.println("docid\tdoc_length\tunique_term_count\tlossy_doc_length\tlossy_unique_term_count"); for (int i = 0; i < numDocs; i++) { Terms terms = reader.getTermVector(i, "contents"); @@ -80,7 +83,14 @@ public static void main(String[] args) throws Exception { int lossyDoclength = SmallFloat.byte4ToInt(SmallFloat.intToByte4((int) exactDoclength)); int lossyTermCount = SmallFloat.byte4ToInt(SmallFloat.intToByte4((int) exactTermCount)); out.println(String.format("%d\t%d\t%d\t%d\t%d", i, exactDoclength, exactTermCount, lossyDoclength, lossyTermCount)); + lossyTotalTerms += lossyDoclength; + exactTotalTerms += exactDoclength; } + + System.out.println("Total number of terms in collection (sum of doclengths):"); + System.out.println("Lossy: " + lossyTotalTerms); + System.out.println("Exact: " + exactTotalTerms); + out.flush(); out.close(); reader.close(); From e1f51391e93b3489f1f7589e08728f99178ae9af Mon Sep 17 00:00:00 2001 From: lintool Date: Thu, 19 Mar 2020 21:23:18 -0400 Subject: [PATCH 2/3] Tweaked tests. --- .../io/anserini/integration/EndToEndTest.java | 10 +++------- .../integration/MultiThreadingSearchTest.java | 1 + .../anserini/search/SearchCollectionTest.java | 2 -- .../util/ExtractDocumentLengthsTest.java | 20 +++++++++++++++++++ 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/test/java/io/anserini/integration/EndToEndTest.java b/src/test/java/io/anserini/integration/EndToEndTest.java index 4929d3ec4a..c54f39b543 100644 --- a/src/test/java/io/anserini/integration/EndToEndTest.java +++ b/src/test/java/io/anserini/integration/EndToEndTest.java @@ -85,6 +85,7 @@ public abstract class EndToEndTest extends LuceneTestCase { public void setUp() throws Exception { super.setUp(); init(); + testIndexing(); } @After @@ -196,7 +197,8 @@ protected SearchArgs createDefaultSearchArgs() { return searchArgs; } - protected void testSearching() { + @Test + public void testSearching() { try { for (Map.Entry entry : testQueries.entrySet()) { SearchCollection searcher = new SearchCollection(entry.getValue()); @@ -224,10 +226,4 @@ protected void checkRankingResults(String key, String output) throws IOException assertEquals(cnt, ref.length); } - - @Test - public void testAll() { - testIndexing(); - testSearching(); - } } diff --git a/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java b/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java index 3e84843fd0..c74240afb0 100644 --- a/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java +++ b/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java @@ -153,6 +153,7 @@ protected void checkRankingResults(String key, String output) throws IOException assertEquals(groundTruthRuns.get(run)[cnt], s); cnt++; } + assertEquals(cnt, groundTruthRuns.get(run).length); // Add the file to the cleanup list. cleanup.add(runfile); diff --git a/src/test/java/io/anserini/search/SearchCollectionTest.java b/src/test/java/io/anserini/search/SearchCollectionTest.java index 08f13be7dd..7c790e52eb 100644 --- a/src/test/java/io/anserini/search/SearchCollectionTest.java +++ b/src/test/java/io/anserini/search/SearchCollectionTest.java @@ -1,12 +1,10 @@ package io.anserini.search; -import org.junit.After; import org.junit.Test; import java.io.ByteArrayOutputStream; import java.io.PrintStream; -import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; public class SearchCollectionTest { diff --git a/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java b/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java index 3e712fe0dc..73118933db 100644 --- a/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java +++ b/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java @@ -21,6 +21,8 @@ import org.junit.Before; import org.junit.Test; +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; import java.nio.file.Files; import java.nio.file.Paths; import java.util.List; @@ -31,6 +33,19 @@ public class ExtractDocumentLengthsTest extends IndexerTestBase { private static final Random rand = new Random(); private String randomFileName; + private final ByteArrayOutputStream out = new ByteArrayOutputStream(); + private PrintStream save; + + private void redirectStdout() { + save = System.out; + out.reset(); + System.setOut(new PrintStream(out)); + } + + private void restoreStdout() { + System.setOut(save); + } + @Before @Override public void setUp() throws Exception { @@ -49,7 +64,12 @@ public void tearDown() throws Exception { public void test() throws Exception { // See: https://github.com/castorini/anserini/issues/903 Locale.setDefault(Locale.US); + redirectStdout(); ExtractDocumentLengths.main(new String[] {"-index", tempDir1.toString(), "-output", randomFileName}); + restoreStdout(); + + assertEquals("Total number of terms in collection (sum of doclengths):\nLossy: 12\nExact: 12\n", + out.toString()); List lines = Files.readAllLines(Paths.get(randomFileName)); assertEquals(4, lines.size()); From a5bd731edce0cb1bee26efcfbbb37104e749896d Mon Sep 17 00:00:00 2001 From: lintool Date: Fri, 20 Mar 2020 06:59:36 -0400 Subject: [PATCH 3/3] Addressed CR. --- .../anserini/util/ExtractDocumentLengths.java | 10 +- .../java/io/anserini/IndexerTestBase.java | 25 ++-- .../IndexerWithEmptyDocumentTestBase.java | 117 ++++++++++++++++++ .../util/ExtractDocumentLengthsTest.java | 6 +- .../io/anserini/util/ExtractNormsTest.java | 6 +- 5 files changed, 146 insertions(+), 18 deletions(-) create mode 100644 src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java diff --git a/src/main/java/io/anserini/util/ExtractDocumentLengths.java b/src/main/java/io/anserini/util/ExtractDocumentLengths.java index ced7ed3ac9..2cd3440e10 100644 --- a/src/main/java/io/anserini/util/ExtractDocumentLengths.java +++ b/src/main/java/io/anserini/util/ExtractDocumentLengths.java @@ -16,6 +16,7 @@ package io.anserini.util; +import io.anserini.index.IndexArgs; import io.anserini.index.NotStoredException; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; @@ -71,10 +72,15 @@ public static void main(String[] args) throws Exception { out.println("docid\tdoc_length\tunique_term_count\tlossy_doc_length\tlossy_unique_term_count"); for (int i = 0; i < numDocs; i++) { - Terms terms = reader.getTermVector(i, "contents"); + Terms terms = reader.getTermVector(i, IndexArgs.CONTENTS); if (terms == null) { - throw new NotStoredException("Term vectors not available!"); + // It could be the case that TermVectors weren't stored when constructing the index, or we're just missing a + // TermVector for a zero-length document. Warn, but don't throw exception. + System.err.println(String.format("Warning: TermVector not available for docid %d.", i)); + out.println(String.format("%d\t0\t0\t0\t0", i)); + continue; } + long exactDoclength = terms.getSumTotalTermFreq(); long exactTermCount = terms.size(); // Uses Lucene's method of encoding an integer into a byte, and the decoding it again. diff --git a/src/test/java/io/anserini/IndexerTestBase.java b/src/test/java/io/anserini/IndexerTestBase.java index 6373ede542..917f684ae3 100644 --- a/src/test/java/io/anserini/IndexerTestBase.java +++ b/src/test/java/io/anserini/IndexerTestBase.java @@ -16,6 +16,7 @@ package io.anserini; +import io.anserini.index.IndexArgs; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.document.Document; @@ -59,26 +60,26 @@ private void buildTestIndex() throws IOException { Document doc1 = new Document(); String doc1Text = "here is some text here is some more text. city."; - doc1.add(new StringField("id", "doc1", Field.Store.YES)); - doc1.add(new SortedDocValuesField("id", new BytesRef("doc1".getBytes()))); - doc1.add(new Field("contents", doc1Text , textOptions)); - doc1.add(new StoredField("raw", doc1Text)); + doc1.add(new StringField(IndexArgs.ID, "doc1", Field.Store.YES)); + doc1.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc1".getBytes()))); + doc1.add(new Field(IndexArgs.CONTENTS, doc1Text , textOptions)); + doc1.add(new StoredField(IndexArgs.RAW, doc1Text)); writer.addDocument(doc1); Document doc2 = new Document(); String doc2Text = "more texts"; - doc2.add(new StringField("id", "doc2", Field.Store.YES)); - doc2.add(new SortedDocValuesField("id", new BytesRef("doc2".getBytes()))); - doc2.add(new Field("contents", doc2Text, textOptions)); // Note plural, to test stemming - doc2.add(new StoredField("raw", doc2Text)); + doc2.add(new StringField(IndexArgs.ID, "doc2", Field.Store.YES)); + doc2.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc2".getBytes()))); + doc2.add(new Field(IndexArgs.CONTENTS, doc2Text, textOptions)); // Note plural, to test stemming + doc2.add(new StoredField(IndexArgs.RAW, doc2Text)); writer.addDocument(doc2); Document doc3 = new Document(); String doc3Text = "here is a test"; - doc3.add(new StringField("id", "doc3", Field.Store.YES)); - doc3.add(new SortedDocValuesField("id", new BytesRef("doc3".getBytes()))); - doc3.add(new Field("contents", doc3Text, textOptions)); - doc3.add(new StoredField("raw", doc3Text)); + doc3.add(new StringField(IndexArgs.ID, "doc3", Field.Store.YES)); + doc3.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc3".getBytes()))); + doc3.add(new Field(IndexArgs.CONTENTS, doc3Text, textOptions)); + doc3.add(new StoredField(IndexArgs.RAW, doc3Text)); writer.addDocument(doc3); writer.commit(); diff --git a/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java b/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java new file mode 100644 index 0000000000..e9e91a244b --- /dev/null +++ b/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java @@ -0,0 +1,117 @@ +/* + * Anserini: A Lucene toolkit for replicable information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini; + +import io.anserini.index.IndexArgs; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.en.EnglishAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.StoredField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.After; +import org.junit.Before; + +import java.io.IOException; +import java.nio.file.Path; + +public class IndexerWithEmptyDocumentTestBase extends LuceneTestCase { + protected Path tempDir1; + + // A very simple example of how to build an index. + // Creates an index similar to IndexerTestBase, but adds an empty document to test error handling. + private void buildTestIndex() throws IOException { + Directory dir = FSDirectory.open(tempDir1); + + Analyzer analyzer = new EnglishAnalyzer(); + IndexWriterConfig config = new IndexWriterConfig(analyzer); + config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); + + IndexWriter writer = new IndexWriter(dir, config); + + FieldType textOptions = new FieldType(); + textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); + textOptions.setStored(true); + textOptions.setTokenized(true); + textOptions.setStoreTermVectors(true); + textOptions.setStoreTermVectorPositions(true); + + Document doc1 = new Document(); + String doc1Text = "here is some text here is some more text. city."; + doc1.add(new StringField(IndexArgs.ID, "doc1", Field.Store.YES)); + doc1.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc1".getBytes()))); + doc1.add(new Field(IndexArgs.CONTENTS, doc1Text , textOptions)); + doc1.add(new StoredField(IndexArgs.RAW, doc1Text)); + writer.addDocument(doc1); + + Document doc2 = new Document(); + String doc2Text = "more texts"; + doc2.add(new StringField(IndexArgs.ID, "doc2", Field.Store.YES)); + doc2.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc2".getBytes()))); + doc2.add(new Field(IndexArgs.CONTENTS, doc2Text, textOptions)); // Note plural, to test stemming + doc2.add(new StoredField(IndexArgs.RAW, doc2Text)); + writer.addDocument(doc2); + + Document doc3 = new Document(); + String doc3Text = "here is a test"; + doc3.add(new StringField(IndexArgs.ID, "doc3", Field.Store.YES)); + doc3.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc3".getBytes()))); + doc3.add(new Field(IndexArgs.CONTENTS, doc3Text, textOptions)); + doc3.add(new StoredField(IndexArgs.RAW, doc3Text)); + writer.addDocument(doc3); + + Document doc4 = new Document(); + String doc4Text = ""; + doc4.add(new StringField(IndexArgs.ID, "doc4", Field.Store.YES)); + doc4.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc4".getBytes()))); + doc4.add(new Field(IndexArgs.CONTENTS, doc4Text, textOptions)); + doc4.add(new StoredField(IndexArgs.RAW, doc4Text)); + writer.addDocument(doc4); + + writer.commit(); + writer.forceMerge(1); + writer.close(); + + dir.close(); + } + + @Before + @Override + public void setUp() throws Exception { + super.setUp(); + + tempDir1 = createTempDir(); + buildTestIndex(); + } + + @After + @Override + public void tearDown() throws Exception { + // Call garbage collector for Windows compatibility + System.gc(); + super.tearDown(); + } +} diff --git a/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java b/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java index 73118933db..5d912dfd7e 100644 --- a/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java +++ b/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java @@ -17,6 +17,7 @@ package io.anserini.util; import io.anserini.IndexerTestBase; +import io.anserini.IndexerWithEmptyDocumentTestBase; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -29,7 +30,7 @@ import java.util.Locale; import java.util.Random; -public class ExtractDocumentLengthsTest extends IndexerTestBase { +public class ExtractDocumentLengthsTest extends IndexerWithEmptyDocumentTestBase { private static final Random rand = new Random(); private String randomFileName; @@ -72,9 +73,10 @@ public void test() throws Exception { out.toString()); List lines = Files.readAllLines(Paths.get(randomFileName)); - assertEquals(4, lines.size()); + assertEquals(5, lines.size()); assertEquals("0\t8\t5\t8\t5", lines.get(1)); assertEquals("1\t2\t2\t2\t2", lines.get(2)); assertEquals("2\t2\t2\t2\t2", lines.get(3)); + assertEquals("3\t0\t0\t0\t0", lines.get(4)); } } diff --git a/src/test/java/io/anserini/util/ExtractNormsTest.java b/src/test/java/io/anserini/util/ExtractNormsTest.java index 0ce8373065..8bed615e65 100644 --- a/src/test/java/io/anserini/util/ExtractNormsTest.java +++ b/src/test/java/io/anserini/util/ExtractNormsTest.java @@ -17,6 +17,7 @@ package io.anserini.util; import io.anserini.IndexerTestBase; +import io.anserini.IndexerWithEmptyDocumentTestBase; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -27,7 +28,7 @@ import java.util.Locale; import java.util.Random; -public class ExtractNormsTest extends IndexerTestBase { +public class ExtractNormsTest extends IndexerWithEmptyDocumentTestBase { private static final Random rand = new Random(); private String randomFileName; @@ -52,9 +53,10 @@ public void test() throws Exception { ExtractNorms.main(new String[] {"-index", tempDir1.toString(), "-output", randomFileName}); List lines = Files.readAllLines(Paths.get(randomFileName)); - assertEquals(4, lines.size()); + assertEquals(5, lines.size()); assertEquals("0\t8", lines.get(1)); assertEquals("1\t2", lines.get(2)); assertEquals("2\t2", lines.get(3)); + assertEquals("3\t0", lines.get(4)); } }