From 05b806553b0eee715f8a8468f7af9d2fdc4ba68f Mon Sep 17 00:00:00 2001
From: lintool <jimmylin@uwaterloo.ca>
Date: Wed, 18 Mar 2020 14:09:02 -0400
Subject: [PATCH 1/3] Compares lossy vs. exact terms.

---
 .../java/io/anserini/util/ExtractDocumentLengths.java  | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/main/java/io/anserini/util/ExtractDocumentLengths.java b/src/main/java/io/anserini/util/ExtractDocumentLengths.java
index cd1619d95c..ced7ed3ac9 100644
--- a/src/main/java/io/anserini/util/ExtractDocumentLengths.java
+++ b/src/main/java/io/anserini/util/ExtractDocumentLengths.java
@@ -66,6 +66,9 @@ public static void main(String[] args) throws Exception {
     PrintStream out = new PrintStream(new FileOutputStream(new File(myArgs.output)));
 
     int numDocs = reader.numDocs();
+    long lossyTotalTerms = 0;
+    long exactTotalTerms = 0;
+
     out.println("docid\tdoc_length\tunique_term_count\tlossy_doc_length\tlossy_unique_term_count");
     for (int i = 0; i < numDocs; i++) {
       Terms terms = reader.getTermVector(i, "contents");
@@ -80,7 +83,14 @@ public static void main(String[] args) throws Exception {
       int lossyDoclength = SmallFloat.byte4ToInt(SmallFloat.intToByte4((int) exactDoclength));
       int lossyTermCount = SmallFloat.byte4ToInt(SmallFloat.intToByte4((int) exactTermCount));
       out.println(String.format("%d\t%d\t%d\t%d\t%d", i, exactDoclength, exactTermCount, lossyDoclength, lossyTermCount));
+      lossyTotalTerms += lossyDoclength;
+      exactTotalTerms += exactDoclength;
     }
+
+    System.out.println("Total number of terms in collection (sum of doclengths):");
+    System.out.println("Lossy: " + lossyTotalTerms);
+    System.out.println("Exact: " + exactTotalTerms);
+
     out.flush();
     out.close();
     reader.close();

From e1f51391e93b3489f1f7589e08728f99178ae9af Mon Sep 17 00:00:00 2001
From: lintool <jimmylin@uwaterloo.ca>
Date: Thu, 19 Mar 2020 21:23:18 -0400
Subject: [PATCH 2/3] Tweaked tests.

---
 .../io/anserini/integration/EndToEndTest.java | 10 +++-------
 .../integration/MultiThreadingSearchTest.java |  1 +
 .../anserini/search/SearchCollectionTest.java |  2 --
 .../util/ExtractDocumentLengthsTest.java      | 20 +++++++++++++++++++
 4 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/test/java/io/anserini/integration/EndToEndTest.java b/src/test/java/io/anserini/integration/EndToEndTest.java
index 4929d3ec4a..c54f39b543 100644
--- a/src/test/java/io/anserini/integration/EndToEndTest.java
+++ b/src/test/java/io/anserini/integration/EndToEndTest.java
@@ -85,6 +85,7 @@ public abstract class EndToEndTest extends LuceneTestCase {
   public void setUp() throws Exception {
     super.setUp();
     init();
+    testIndexing();
   }
 
   @After
@@ -196,7 +197,8 @@ protected SearchArgs createDefaultSearchArgs() {
     return searchArgs;
   }
 
-  protected void testSearching() {
+  @Test
+  public void testSearching() {
     try {
       for (Map.Entry<String, SearchArgs> entry : testQueries.entrySet()) {
         SearchCollection searcher = new SearchCollection(entry.getValue());
@@ -224,10 +226,4 @@ protected void checkRankingResults(String key, String output) throws IOException
 
     assertEquals(cnt, ref.length);
   }
-
-  @Test
-  public void testAll() {
-    testIndexing();
-    testSearching();
-  }
 }
diff --git a/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java b/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java
index 3e84843fd0..c74240afb0 100644
--- a/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java
+++ b/src/test/java/io/anserini/integration/MultiThreadingSearchTest.java
@@ -153,6 +153,7 @@ protected void checkRankingResults(String key, String output) throws IOException
         assertEquals(groundTruthRuns.get(run)[cnt], s);
         cnt++;
       }
+      assertEquals(cnt, groundTruthRuns.get(run).length);
 
       // Add the file to the cleanup list.
       cleanup.add(runfile);
diff --git a/src/test/java/io/anserini/search/SearchCollectionTest.java b/src/test/java/io/anserini/search/SearchCollectionTest.java
index 08f13be7dd..7c790e52eb 100644
--- a/src/test/java/io/anserini/search/SearchCollectionTest.java
+++ b/src/test/java/io/anserini/search/SearchCollectionTest.java
@@ -1,12 +1,10 @@
 package io.anserini.search;
 
-import org.junit.After;
 import org.junit.Test;
 
 import java.io.ByteArrayOutputStream;
 import java.io.PrintStream;
 
-import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
 public class SearchCollectionTest {
diff --git a/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java b/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java
index 3e712fe0dc..73118933db 100644
--- a/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java
+++ b/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java
@@ -21,6 +21,8 @@
 import org.junit.Before;
 import org.junit.Test;
 
+import java.io.ByteArrayOutputStream;
+import java.io.PrintStream;
 import java.nio.file.Files;
 import java.nio.file.Paths;
 import java.util.List;
@@ -31,6 +33,19 @@ public class ExtractDocumentLengthsTest extends IndexerTestBase {
   private static final Random rand = new Random();
   private String randomFileName;
 
+  private final ByteArrayOutputStream out = new ByteArrayOutputStream();
+  private PrintStream save;
+
+  private void redirectStdout() {
+    save = System.out;
+    out.reset();
+    System.setOut(new PrintStream(out));
+  }
+
+  private void restoreStdout() {
+    System.setOut(save);
+  }
+
   @Before
   @Override
   public void setUp() throws Exception {
@@ -49,7 +64,12 @@ public void tearDown() throws Exception {
   public void test() throws Exception {
     // See: https://github.com/castorini/anserini/issues/903
     Locale.setDefault(Locale.US);
+    redirectStdout();
     ExtractDocumentLengths.main(new String[] {"-index", tempDir1.toString(), "-output", randomFileName});
+    restoreStdout();
+
+    assertEquals("Total number of terms in collection (sum of doclengths):\nLossy: 12\nExact: 12\n",
+        out.toString());
 
     List<String> lines = Files.readAllLines(Paths.get(randomFileName));
     assertEquals(4, lines.size());

From a5bd731edce0cb1bee26efcfbbb37104e749896d Mon Sep 17 00:00:00 2001
From: lintool <jimmylin@uwaterloo.ca>
Date: Fri, 20 Mar 2020 06:59:36 -0400
Subject: [PATCH 3/3] Addressed CR.

---
 .../anserini/util/ExtractDocumentLengths.java |  10 +-
 .../java/io/anserini/IndexerTestBase.java     |  25 ++--
 .../IndexerWithEmptyDocumentTestBase.java     | 117 ++++++++++++++++++
 .../util/ExtractDocumentLengthsTest.java      |   6 +-
 .../io/anserini/util/ExtractNormsTest.java    |   6 +-
 5 files changed, 146 insertions(+), 18 deletions(-)
 create mode 100644 src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java

diff --git a/src/main/java/io/anserini/util/ExtractDocumentLengths.java b/src/main/java/io/anserini/util/ExtractDocumentLengths.java
index ced7ed3ac9..2cd3440e10 100644
--- a/src/main/java/io/anserini/util/ExtractDocumentLengths.java
+++ b/src/main/java/io/anserini/util/ExtractDocumentLengths.java
@@ -16,6 +16,7 @@
 
 package io.anserini.util;
 
+import io.anserini.index.IndexArgs;
 import io.anserini.index.NotStoredException;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
@@ -71,10 +72,15 @@ public static void main(String[] args) throws Exception {
 
     out.println("docid\tdoc_length\tunique_term_count\tlossy_doc_length\tlossy_unique_term_count");
     for (int i = 0; i < numDocs; i++) {
-      Terms terms = reader.getTermVector(i, "contents");
+      Terms terms = reader.getTermVector(i, IndexArgs.CONTENTS);
       if (terms == null) {
-        throw new NotStoredException("Term vectors not available!");
+        // Either term vectors weren't stored when the index was built, or this document is zero-length and so has
+        // no term vector. Warn and emit an all-zero row instead of throwing an exception.
+        System.err.println(String.format("Warning: TermVector not available for docid %d.", i));
+        out.println(String.format("%d\t0\t0\t0\t0", i));
+        continue;
       }
+
       long exactDoclength = terms.getSumTotalTermFreq();
       long exactTermCount = terms.size();
       // Uses Lucene's method of encoding an integer into a byte, and the decoding it again.
diff --git a/src/test/java/io/anserini/IndexerTestBase.java b/src/test/java/io/anserini/IndexerTestBase.java
index 6373ede542..917f684ae3 100644
--- a/src/test/java/io/anserini/IndexerTestBase.java
+++ b/src/test/java/io/anserini/IndexerTestBase.java
@@ -16,6 +16,7 @@
 
 package io.anserini;
 
+import io.anserini.index.IndexArgs;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.en.EnglishAnalyzer;
 import org.apache.lucene.document.Document;
@@ -59,26 +60,26 @@ private void buildTestIndex() throws IOException {
 
     Document doc1 = new Document();
     String doc1Text = "here is some text here is some more text. city.";
-    doc1.add(new StringField("id", "doc1", Field.Store.YES));
-    doc1.add(new SortedDocValuesField("id", new BytesRef("doc1".getBytes())));
-    doc1.add(new Field("contents", doc1Text , textOptions));
-    doc1.add(new StoredField("raw", doc1Text));
+    doc1.add(new StringField(IndexArgs.ID, "doc1", Field.Store.YES));
+    doc1.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc1".getBytes())));
+    doc1.add(new Field(IndexArgs.CONTENTS, doc1Text , textOptions));
+    doc1.add(new StoredField(IndexArgs.RAW, doc1Text));
     writer.addDocument(doc1);
 
     Document doc2 = new Document();
     String doc2Text = "more texts";
-    doc2.add(new StringField("id", "doc2", Field.Store.YES));
-    doc2.add(new SortedDocValuesField("id", new BytesRef("doc2".getBytes())));
-    doc2.add(new Field("contents", doc2Text, textOptions));  // Note plural, to test stemming
-    doc2.add(new StoredField("raw", doc2Text));
+    doc2.add(new StringField(IndexArgs.ID, "doc2", Field.Store.YES));
+    doc2.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc2".getBytes())));
+    doc2.add(new Field(IndexArgs.CONTENTS, doc2Text, textOptions));  // Note plural, to test stemming
+    doc2.add(new StoredField(IndexArgs.RAW, doc2Text));
     writer.addDocument(doc2);
 
     Document doc3 = new Document();
     String doc3Text = "here is a test";
-    doc3.add(new StringField("id", "doc3", Field.Store.YES));
-    doc3.add(new SortedDocValuesField("id", new BytesRef("doc3".getBytes())));
-    doc3.add(new Field("contents", doc3Text, textOptions));
-    doc3.add(new StoredField("raw", doc3Text));
+    doc3.add(new StringField(IndexArgs.ID, "doc3", Field.Store.YES));
+    doc3.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc3".getBytes())));
+    doc3.add(new Field(IndexArgs.CONTENTS, doc3Text, textOptions));
+    doc3.add(new StoredField(IndexArgs.RAW, doc3Text));
     writer.addDocument(doc3);
 
     writer.commit();
diff --git a/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java b/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java
new file mode 100644
index 0000000000..e9e91a244b
--- /dev/null
+++ b/src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java
@@ -0,0 +1,117 @@
+/*
+ * Anserini: A Lucene toolkit for replicable information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini;
+
+import io.anserini.index.IndexArgs;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.SortedDocValuesField;
+import org.apache.lucene.document.StoredField;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.After;
+import org.junit.Before;
+
+import java.io.IOException;
+import java.nio.file.Path;
+
+public class IndexerWithEmptyDocumentTestBase extends LuceneTestCase {
+  protected Path tempDir1;
+
+  // Builds a small test index in tempDir1. Uses the same three documents as IndexerTestBase, plus a fourth
+  // document (doc4) whose contents are the empty string, to exercise handling of zero-length documents.
+  private void buildTestIndex() throws IOException {
+    Directory dir = FSDirectory.open(tempDir1);
+
+    Analyzer analyzer = new EnglishAnalyzer();
+    IndexWriterConfig config = new IndexWriterConfig(analyzer);
+    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
+
+    IndexWriter writer = new IndexWriter(dir, config);
+
+    FieldType textOptions = new FieldType();
+    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
+    textOptions.setStored(true);
+    textOptions.setTokenized(true);
+    textOptions.setStoreTermVectors(true);
+    textOptions.setStoreTermVectorPositions(true);
+
+    Document doc1 = new Document();
+    String doc1Text = "here is some text here is some more text. city.";
+    doc1.add(new StringField(IndexArgs.ID, "doc1", Field.Store.YES));
+    doc1.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc1".getBytes())));
+    doc1.add(new Field(IndexArgs.CONTENTS, doc1Text , textOptions));
+    doc1.add(new StoredField(IndexArgs.RAW, doc1Text));
+    writer.addDocument(doc1);
+
+    Document doc2 = new Document();
+    String doc2Text = "more texts";
+    doc2.add(new StringField(IndexArgs.ID, "doc2", Field.Store.YES));
+    doc2.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc2".getBytes())));
+    doc2.add(new Field(IndexArgs.CONTENTS, doc2Text, textOptions));  // Note plural, to test stemming
+    doc2.add(new StoredField(IndexArgs.RAW, doc2Text));
+    writer.addDocument(doc2);
+
+    Document doc3 = new Document();
+    String doc3Text = "here is a test";
+    doc3.add(new StringField(IndexArgs.ID, "doc3", Field.Store.YES));
+    doc3.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc3".getBytes())));
+    doc3.add(new Field(IndexArgs.CONTENTS, doc3Text, textOptions));
+    doc3.add(new StoredField(IndexArgs.RAW, doc3Text));
+    writer.addDocument(doc3);
+
+    Document doc4 = new Document();
+    String doc4Text = "";
+    doc4.add(new StringField(IndexArgs.ID, "doc4", Field.Store.YES));
+    doc4.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc4".getBytes())));
+    doc4.add(new Field(IndexArgs.CONTENTS, doc4Text, textOptions));
+    doc4.add(new StoredField(IndexArgs.RAW, doc4Text));
+    writer.addDocument(doc4);
+
+    writer.commit();
+    writer.forceMerge(1);
+    writer.close();
+
+    dir.close();
+  }
+
+  @Before
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+
+    tempDir1 = createTempDir();
+    buildTestIndex();
+  }
+
+  @After
+  @Override
+  public void tearDown() throws Exception {
+    // Call garbage collector for Windows compatibility
+    System.gc();
+    super.tearDown();
+  }
+}
diff --git a/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java b/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java
index 73118933db..5d912dfd7e 100644
--- a/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java
+++ b/src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java
@@ -17,6 +17,7 @@
 package io.anserini.util;
 
 import io.anserini.IndexerTestBase;
+import io.anserini.IndexerWithEmptyDocumentTestBase;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -29,7 +30,7 @@
 import java.util.Locale;
 import java.util.Random;
 
-public class ExtractDocumentLengthsTest extends IndexerTestBase {
+public class ExtractDocumentLengthsTest extends IndexerWithEmptyDocumentTestBase {
   private static final Random rand = new Random();
   private String randomFileName;
 
@@ -72,9 +73,10 @@ public void test() throws Exception {
         out.toString());
 
     List<String> lines = Files.readAllLines(Paths.get(randomFileName));
-    assertEquals(4, lines.size());
+    assertEquals(5, lines.size());
     assertEquals("0\t8\t5\t8\t5", lines.get(1));
     assertEquals("1\t2\t2\t2\t2", lines.get(2));
     assertEquals("2\t2\t2\t2\t2", lines.get(3));
+    assertEquals("3\t0\t0\t0\t0", lines.get(4));
   }
 }
diff --git a/src/test/java/io/anserini/util/ExtractNormsTest.java b/src/test/java/io/anserini/util/ExtractNormsTest.java
index 0ce8373065..8bed615e65 100644
--- a/src/test/java/io/anserini/util/ExtractNormsTest.java
+++ b/src/test/java/io/anserini/util/ExtractNormsTest.java
@@ -17,6 +17,7 @@
 package io.anserini.util;
 
 import io.anserini.IndexerTestBase;
+import io.anserini.IndexerWithEmptyDocumentTestBase;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -27,7 +28,7 @@
 import java.util.Locale;
 import java.util.Random;
 
-public class ExtractNormsTest extends IndexerTestBase {
+public class ExtractNormsTest extends IndexerWithEmptyDocumentTestBase {
   private static final Random rand = new Random();
   private String randomFileName;
 
@@ -52,9 +53,10 @@ public void test() throws Exception {
     ExtractNorms.main(new String[] {"-index", tempDir1.toString(), "-output", randomFileName});
 
     List<String> lines = Files.readAllLines(Paths.get(randomFileName));
-    assertEquals(4, lines.size());
+    assertEquals(5, lines.size());
     assertEquals("0\t8", lines.get(1));
     assertEquals("1\t2", lines.get(2));
     assertEquals("2\t2", lines.get(3));
+    assertEquals("3\t0", lines.get(4));
   }
 }