Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ExtractDocumentLengths: prints out sum of doclengths, both lossy and lossless #1040

Merged
merged 4 commits into from
Mar 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions src/main/java/io/anserini/util/ExtractDocumentLengths.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

package io.anserini.util;

import io.anserini.index.IndexArgs;
import io.anserini.index.NotStoredException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
Expand Down Expand Up @@ -66,12 +67,20 @@ public static void main(String[] args) throws Exception {
PrintStream out = new PrintStream(new FileOutputStream(new File(myArgs.output)));

int numDocs = reader.numDocs();
long lossyTotalTerms = 0;
long exactTotalTerms = 0;

out.println("docid\tdoc_length\tunique_term_count\tlossy_doc_length\tlossy_unique_term_count");
for (int i = 0; i < numDocs; i++) {
Terms terms = reader.getTermVector(i, "contents");
Terms terms = reader.getTermVector(i, IndexArgs.CONTENTS);
if (terms == null) {
throw new NotStoredException("Term vectors not available!");
// It could be the case that TermVectors weren't stored when constructing the index, or we're just missing a
// TermVector for a zero-length document. Warn, but don't throw exception.
System.err.println(String.format("Warning: TermVector not available for docid %d.", i));
out.println(String.format("%d\t0\t0\t0\t0", i));
continue;
}

long exactDoclength = terms.getSumTotalTermFreq();
long exactTermCount = terms.size();
// Uses Lucene's method of encoding an integer into a byte, and the decoding it again.
Expand All @@ -80,7 +89,14 @@ public static void main(String[] args) throws Exception {
int lossyDoclength = SmallFloat.byte4ToInt(SmallFloat.intToByte4((int) exactDoclength));
int lossyTermCount = SmallFloat.byte4ToInt(SmallFloat.intToByte4((int) exactTermCount));
out.println(String.format("%d\t%d\t%d\t%d\t%d", i, exactDoclength, exactTermCount, lossyDoclength, lossyTermCount));
lossyTotalTerms += lossyDoclength;
exactTotalTerms += exactDoclength;
}

System.out.println("Total number of terms in collection (sum of doclengths):");
System.out.println("Lossy: " + lossyTotalTerms);
System.out.println("Exact: " + exactTotalTerms);

out.flush();
out.close();
reader.close();
Expand Down
25 changes: 13 additions & 12 deletions src/test/java/io/anserini/IndexerTestBase.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

package io.anserini;

import io.anserini.index.IndexArgs;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.document.Document;
Expand Down Expand Up @@ -59,26 +60,26 @@ private void buildTestIndex() throws IOException {

Document doc1 = new Document();
String doc1Text = "here is some text here is some more text. city.";
doc1.add(new StringField("id", "doc1", Field.Store.YES));
doc1.add(new SortedDocValuesField("id", new BytesRef("doc1".getBytes())));
doc1.add(new Field("contents", doc1Text , textOptions));
doc1.add(new StoredField("raw", doc1Text));
doc1.add(new StringField(IndexArgs.ID, "doc1", Field.Store.YES));
doc1.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc1".getBytes())));
doc1.add(new Field(IndexArgs.CONTENTS, doc1Text , textOptions));
doc1.add(new StoredField(IndexArgs.RAW, doc1Text));
writer.addDocument(doc1);

Document doc2 = new Document();
String doc2Text = "more texts";
doc2.add(new StringField("id", "doc2", Field.Store.YES));
doc2.add(new SortedDocValuesField("id", new BytesRef("doc2".getBytes())));
doc2.add(new Field("contents", doc2Text, textOptions)); // Note plural, to test stemming
doc2.add(new StoredField("raw", doc2Text));
doc2.add(new StringField(IndexArgs.ID, "doc2", Field.Store.YES));
doc2.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc2".getBytes())));
doc2.add(new Field(IndexArgs.CONTENTS, doc2Text, textOptions)); // Note plural, to test stemming
doc2.add(new StoredField(IndexArgs.RAW, doc2Text));
writer.addDocument(doc2);

Document doc3 = new Document();
String doc3Text = "here is a test";
doc3.add(new StringField("id", "doc3", Field.Store.YES));
doc3.add(new SortedDocValuesField("id", new BytesRef("doc3".getBytes())));
doc3.add(new Field("contents", doc3Text, textOptions));
doc3.add(new StoredField("raw", doc3Text));
doc3.add(new StringField(IndexArgs.ID, "doc3", Field.Store.YES));
doc3.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc3".getBytes())));
doc3.add(new Field(IndexArgs.CONTENTS, doc3Text, textOptions));
doc3.add(new StoredField(IndexArgs.RAW, doc3Text));
writer.addDocument(doc3);

writer.commit();
Expand Down
117 changes: 117 additions & 0 deletions src/test/java/io/anserini/IndexerWithEmptyDocumentTestBase.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
/*
* Anserini: A Lucene toolkit for replicable information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini;

import io.anserini.index.IndexArgs;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.After;
import org.junit.Before;

import java.io.IOException;
import java.nio.file.Path;

public class IndexerWithEmptyDocumentTestBase extends LuceneTestCase {
protected Path tempDir1;

// A very simple example of how to build an index.
// Creates an index similar to IndexerTestBase, but adds an empty document to test error handling.
private void buildTestIndex() throws IOException {
Directory dir = FSDirectory.open(tempDir1);

Analyzer analyzer = new EnglishAnalyzer();
IndexWriterConfig config = new IndexWriterConfig(analyzer);
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

IndexWriter writer = new IndexWriter(dir, config);

FieldType textOptions = new FieldType();
textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
textOptions.setStored(true);
textOptions.setTokenized(true);
textOptions.setStoreTermVectors(true);
textOptions.setStoreTermVectorPositions(true);

Document doc1 = new Document();
String doc1Text = "here is some text here is some more text. city.";
doc1.add(new StringField(IndexArgs.ID, "doc1", Field.Store.YES));
doc1.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc1".getBytes())));
doc1.add(new Field(IndexArgs.CONTENTS, doc1Text , textOptions));
doc1.add(new StoredField(IndexArgs.RAW, doc1Text));
writer.addDocument(doc1);

Document doc2 = new Document();
String doc2Text = "more texts";
doc2.add(new StringField(IndexArgs.ID, "doc2", Field.Store.YES));
doc2.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc2".getBytes())));
doc2.add(new Field(IndexArgs.CONTENTS, doc2Text, textOptions)); // Note plural, to test stemming
doc2.add(new StoredField(IndexArgs.RAW, doc2Text));
writer.addDocument(doc2);

Document doc3 = new Document();
String doc3Text = "here is a test";
doc3.add(new StringField(IndexArgs.ID, "doc3", Field.Store.YES));
doc3.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc3".getBytes())));
doc3.add(new Field(IndexArgs.CONTENTS, doc3Text, textOptions));
doc3.add(new StoredField(IndexArgs.RAW, doc3Text));
writer.addDocument(doc3);

Document doc4 = new Document();
String doc4Text = "";
doc4.add(new StringField(IndexArgs.ID, "doc4", Field.Store.YES));
doc4.add(new SortedDocValuesField(IndexArgs.ID, new BytesRef("doc4".getBytes())));
doc4.add(new Field(IndexArgs.CONTENTS, doc4Text, textOptions));
doc4.add(new StoredField(IndexArgs.RAW, doc4Text));
writer.addDocument(doc4);

writer.commit();
writer.forceMerge(1);
writer.close();

dir.close();
}

@Before
@Override
public void setUp() throws Exception {
super.setUp();

tempDir1 = createTempDir();
buildTestIndex();
}

@After
@Override
public void tearDown() throws Exception {
// Call garbage collector for Windows compatibility
System.gc();
super.tearDown();
}
}
10 changes: 3 additions & 7 deletions src/test/java/io/anserini/integration/EndToEndTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ public abstract class EndToEndTest extends LuceneTestCase {
public void setUp() throws Exception {
super.setUp();
init();
testIndexing();
}

@After
Expand Down Expand Up @@ -196,7 +197,8 @@ protected SearchArgs createDefaultSearchArgs() {
return searchArgs;
}

protected void testSearching() {
@Test
public void testSearching() {
try {
for (Map.Entry<String, SearchArgs> entry : testQueries.entrySet()) {
SearchCollection searcher = new SearchCollection(entry.getValue());
Expand Down Expand Up @@ -224,10 +226,4 @@ protected void checkRankingResults(String key, String output) throws IOException

assertEquals(cnt, ref.length);
}

@Test
public void testAll() {
testIndexing();
testSearching();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ protected void checkRankingResults(String key, String output) throws IOException
assertEquals(groundTruthRuns.get(run)[cnt], s);
cnt++;
}
assertEquals(cnt, groundTruthRuns.get(run).length);

// Add the file to the cleanup list.
cleanup.add(runfile);
Expand Down
2 changes: 0 additions & 2 deletions src/test/java/io/anserini/search/SearchCollectionTest.java
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
package io.anserini.search;

import org.junit.After;
import org.junit.Test;

import java.io.ByteArrayOutputStream;
import java.io.PrintStream;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

public class SearchCollectionTest {
Expand Down
26 changes: 24 additions & 2 deletions src/test/java/io/anserini/util/ExtractDocumentLengthsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,36 @@
package io.anserini.util;

import io.anserini.IndexerTestBase;
import io.anserini.IndexerWithEmptyDocumentTestBase;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Locale;
import java.util.Random;

public class ExtractDocumentLengthsTest extends IndexerTestBase {
public class ExtractDocumentLengthsTest extends IndexerWithEmptyDocumentTestBase {
private static final Random rand = new Random();
private String randomFileName;

private final ByteArrayOutputStream out = new ByteArrayOutputStream();
private PrintStream save;

// Swaps System.out for an in-memory buffer so the test can assert on what the
// tool under test prints; the previous stream is kept for restoreStdout().
private void redirectStdout() {
  final PrintStream original = System.out;
  out.reset();
  System.setOut(new PrintStream(out));
  save = original;
}

// Reinstates the real System.out previously captured by redirectStdout().
private void restoreStdout() {
System.setOut(save);
}

@Before
@Override
public void setUp() throws Exception {
Expand All @@ -49,12 +65,18 @@ public void tearDown() throws Exception {
public void test() throws Exception {
  // See: https://github.com/castorini/anserini/issues/903
  Locale.setDefault(Locale.US);

  // Capture stdout while the tool runs so we can assert on the summary it prints.
  redirectStdout();
  ExtractDocumentLengths.main(new String[] {"-index", tempDir1.toString(), "-output", randomFileName});
  restoreStdout();

  // Build the expected output with the platform line separator:
  // PrintStream.println emits "\r\n" on Windows, so hard-coding "\n" would
  // make this assertion fail there.
  String sep = System.lineSeparator();
  assertEquals("Total number of terms in collection (sum of doclengths):" + sep
      + "Lossy: 12" + sep + "Exact: 12" + sep, out.toString());

  // One header line plus one row per document; doc4 is empty, hence all zeros.
  List<String> lines = Files.readAllLines(Paths.get(randomFileName));
  assertEquals(5, lines.size());
  assertEquals("0\t8\t5\t8\t5", lines.get(1));
  assertEquals("1\t2\t2\t2\t2", lines.get(2));
  assertEquals("2\t2\t2\t2\t2", lines.get(3));
  assertEquals("3\t0\t0\t0\t0", lines.get(4));
}
}
6 changes: 4 additions & 2 deletions src/test/java/io/anserini/util/ExtractNormsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package io.anserini.util;

import io.anserini.IndexerTestBase;
import io.anserini.IndexerWithEmptyDocumentTestBase;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
Expand All @@ -27,7 +28,7 @@
import java.util.Locale;
import java.util.Random;

public class ExtractNormsTest extends IndexerTestBase {
public class ExtractNormsTest extends IndexerWithEmptyDocumentTestBase {
private static final Random rand = new Random();
private String randomFileName;

Expand All @@ -52,9 +53,10 @@ public void test() throws Exception {
ExtractNorms.main(new String[] {"-index", tempDir1.toString(), "-output", randomFileName});

List<String> lines = Files.readAllLines(Paths.get(randomFileName));
assertEquals(4, lines.size());
assertEquals(5, lines.size());
assertEquals("0\t8", lines.get(1));
assertEquals("1\t2", lines.get(2));
assertEquals("2\t2", lines.get(3));
assertEquals("3\t0", lines.get(4));
}
}