-
Notifications
You must be signed in to change notification settings - Fork 238
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #187 from BobLd/dla-example-1
Add AdvancedTextExtraction example
- Loading branch information
Showing
2 changed files
with
95 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
namespace UglyToad.Examples | ||
{ | ||
using PdfPig; | ||
using System; | ||
using System.Text; | ||
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; | ||
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector; | ||
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor; | ||
|
||
public static class AdvancedTextExtraction | ||
{ | ||
public static void Run(string filePath) | ||
{ | ||
var sb = new StringBuilder(); | ||
|
||
using (var document = PdfDocument.Open(filePath)) | ||
{ | ||
foreach (var page in document.GetPages()) | ||
{ | ||
// 0. Preprocessing | ||
var letters = page.Letters; // no preprocessing | ||
|
||
// 1. Extract words | ||
var wordExtractor = NearestNeighbourWordExtractor.Instance; | ||
var wordExtractorOptions = new NearestNeighbourWordExtractor.NearestNeighbourWordExtractorOptions() | ||
{ | ||
Filter = (pivot, candidate) => | ||
{ | ||
// check if white space (default implementation of 'Filter') | ||
if (string.IsNullOrWhiteSpace(candidate.Value)) | ||
{ | ||
// pivot and candidate letters cannot belong to the same word | ||
// if candidate letter is null or white space. | ||
// ('FilterPivot' already checks if the pivot is null or white space by default) | ||
return false; | ||
} | ||
// check for height difference | ||
var maxHeight = Math.Max(pivot.PointSize, candidate.PointSize); | ||
var minHeight = Math.Min(pivot.PointSize, candidate.PointSize); | ||
if (minHeight != 0 && maxHeight / minHeight > 2.0) | ||
{ | ||
// pivot and candidate letters cannot belong to the same word | ||
// if one letter is more than twice the size of the other. | ||
return false; | ||
} | ||
// check for colour difference | ||
var pivotRgb = pivot.Color.ToRGBValues(); | ||
var candidateRgb = candidate.Color.ToRGBValues(); | ||
if (!pivotRgb.Equals(candidateRgb)) | ||
{ | ||
// pivot and candidate letters cannot belong to the same word | ||
// if they don't have the same colour. | ||
return false; | ||
} | ||
return true; | ||
} | ||
}; | ||
|
||
var words = wordExtractor.GetWords(letters, wordExtractorOptions); | ||
|
||
// 2. Segment page | ||
var pageSegmenter = DocstrumBoundingBoxes.Instance; | ||
var pageSegmenterOptions = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions() | ||
{ | ||
|
||
}; | ||
|
||
var textBlocks = pageSegmenter.GetBlocks(words, pageSegmenterOptions); | ||
|
||
// 3. Postprocessing | ||
var readingOrder = UnsupervisedReadingOrderDetector.Instance; | ||
var orderedTextBlocks = readingOrder.Get(textBlocks); | ||
|
||
// 4. Extract text | ||
foreach (var block in orderedTextBlocks) | ||
{ | ||
sb.Append(block.Text.Normalize(NormalizationForm.FormKC)); // normalise text | ||
sb.AppendLine(); | ||
} | ||
|
||
sb.AppendLine(); | ||
} | ||
} | ||
|
||
Console.WriteLine(sb.ToString()); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters