Skip to content

Commit

Permalink
Merge pull request #187 from BobLd/dla-example-1
Browse files Browse the repository at this point in the history
Add AdvancedTextExtraction example
  • Loading branch information
EliotJones committed Jul 1, 2020
2 parents 7d16613 + a60be8d commit 5caf5f2
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 0 deletions.
91 changes: 91 additions & 0 deletions examples/AdvancedTextExtraction.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
namespace UglyToad.Examples
{
using PdfPig;
using System;
using System.Text;
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;

/// <summary>
/// Example of text extraction driven by the document layout analysis tools:
/// letters are grouped into words, words into text blocks, and the blocks are
/// put through a reading order detector before their text is printed.
/// </summary>
public static class AdvancedTextExtraction
{
    /// <summary>
    /// Opens the PDF at <paramref name="filePath"/>, extracts the text of every page
    /// block by block and writes the accumulated result to the console.
    /// </summary>
    /// <param name="filePath">Path of the PDF document to open.</param>
    public static void Run(string filePath)
    {
        var output = new StringBuilder();

        using (var document = PdfDocument.Open(filePath))
        {
            foreach (var page in document.GetPages())
            {
                // Step 0 - preprocessing: none, the page letters are used as they are.
                var pageLetters = page.Letters;

                // Step 1 - word extraction with a custom letter-pairing filter.
                var nearestNeighbour = NearestNeighbourWordExtractor.Instance;
                var nearestNeighbourOptions = new NearestNeighbourWordExtractor.NearestNeighbourWordExtractorOptions
                {
                    // Returns true when the pivot and candidate letters are allowed
                    // to end up in the same word.
                    Filter = (pivot, candidate) =>
                    {
                        // White space rule (this is what the default 'Filter' does):
                        // a null or white space candidate never joins the pivot's word.
                        // The pivot itself is already checked by 'FilterPivot' by default.
                        if (string.IsNullOrWhiteSpace(candidate.Value))
                        {
                            return false;
                        }

                        // Size rule: reject the pair when one letter is more than
                        // twice the point size of the other. A zero smaller size
                        // skips the ratio test altogether.
                        var largerSize = Math.Max(pivot.PointSize, candidate.PointSize);
                        var smallerSize = Math.Min(pivot.PointSize, candidate.PointSize);
                        var sizesTooFarApart = smallerSize != 0 && largerSize / smallerSize > 2.0;
                        if (sizesTooFarApart)
                        {
                            return false;
                        }

                        // Colour rule: both letters must have the same RGB values.
                        return pivot.Color.ToRGBValues().Equals(candidate.Color.ToRGBValues());
                    }
                };

                var pageWords = nearestNeighbour.GetWords(pageLetters, nearestNeighbourOptions);

                // Step 2 - page segmentation; the options object is left unmodified.
                var docstrum = DocstrumBoundingBoxes.Instance;
                var docstrumOptions = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions();
                var blocks = docstrum.GetBlocks(pageWords, docstrumOptions);

                // Step 3 - postprocessing: run the blocks through the reading order detector.
                var orderDetector = UnsupervisedReadingOrderDetector.Instance;
                var blocksInReadingOrder = orderDetector.Get(blocks);

                // Step 4 - text extraction: one line per block, normalised to form KC.
                foreach (var textBlock in blocksInReadingOrder)
                {
                    output.AppendLine(textBlock.Text.Normalize(NormalizationForm.FormKC));
                }

                // Blank line between pages.
                output.AppendLine();
            }
        }

        Console.WriteLine(output.ToString());
    }
}
}
4 changes: 4 additions & 0 deletions examples/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ public static void Main()
("Generate PDF/A-2A compliant file",
() => GeneratePdfA2AFile.Run(Path.Combine(filesDirectory, "..", "..", "Fonts", "TrueType", "Roboto-Regular.ttf"),
Path.Combine(filesDirectory, "smile-250-by-160.jpg")))
},
{7,
("Advance text extraction using layout analysis algorithms",
() => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf")))
}
};

Expand Down

0 comments on commit 5caf5f2

Please sign in to comment.