Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -53,16 +53,15 @@ public class MatchedPattern {
private final float[] coordinates = new float[4];

/**
* Constructor to pair a strip of text with its bounding box coordinates inside a page. The coordinate system has
* the origin (0, 0) in the lower left point of the page and uses PDF points as the unit of measure.
*
* @param text the strip of text
* @param page the page the text is on
* @param llx  float lower left x coordinate
* @param lly  float lower left y coordinate
* @param urx  float upper right x coordinate
* @param ury  float upper right y coordinate
*/
MatchedPattern(String text, int page, float llx, float lly, float urx, float ury) {
this.text = text;
Expand All @@ -85,13 +84,21 @@ public float[] getCoordinates() {
return coordinates;
}

/**
 * Formats the bounding box as a labelled string.
 *
 * @return the four coordinates in the form {@code [llx: .., lly: .., urx: .., ury: ..]}
 */
public String printCoordinates() {
    StringBuilder box = new StringBuilder("[llx: ");
    box.append(coordinates[0]);
    box.append(", lly: ").append(coordinates[1]);
    box.append(", urx: ").append(coordinates[2]);
    box.append(", ury: ").append(coordinates[3]);
    return box.append("]").toString();
}

/**
 * Describes this match by its text, bounding box and page.
 *
 * @return a string of the form {@code Text: [<text>] - boundingBox: [llx: .., ...] - page: [<page>]}
 */
@Override
public String toString() {
    // The bounding box part is delegated to printCoordinates() so both renderings stay in sync.
    return "Text: [" + this.text
            + "] - boundingBox: " + this.printCoordinates()
            + " - page: [" + this.page + "]";
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -41,31 +41,10 @@ LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
*/
package org.openpdf.text.pdf.parser;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Stack;
import org.openpdf.text.ExceptionConverter;
import org.openpdf.text.error_messages.MessageLocalization;
import org.openpdf.text.pdf.CMapAwareDocumentFont;
import org.openpdf.text.pdf.PRIndirectReference;
import org.openpdf.text.pdf.PRStream;
import org.openpdf.text.pdf.PRTokeniser;
import org.openpdf.text.pdf.PdfArray;
import org.openpdf.text.pdf.PdfContentParser;
import org.openpdf.text.pdf.PdfDictionary;
import org.openpdf.text.pdf.PdfIndirectReference;
import org.openpdf.text.pdf.PdfLiteral;
import org.openpdf.text.pdf.PdfName;
import org.openpdf.text.pdf.PdfNumber;
import org.openpdf.text.pdf.PdfObject;
import org.openpdf.text.pdf.PdfReader;
import org.openpdf.text.pdf.PdfStream;
import org.openpdf.text.pdf.PdfString;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,14 @@ LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
*/
package org.openpdf.text.pdf.parser;

import org.openpdf.text.ExceptionConverter;
import org.openpdf.text.error_messages.MessageLocalization;
import org.openpdf.text.pdf.*;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.openpdf.text.pdf.BaseFont;
import org.openpdf.text.pdf.PdfString;

/**
* @author dgd
Expand All @@ -63,16 +62,59 @@ public class PdfContentTextLocator extends PdfContentStreamHandler {
private final ArrayList<Float> fragmentsWidths = new ArrayList<>();

private final int page;
private final Pattern p;

private Pattern p;
private float[] coordinates;
private final int mode;

/**
 * Construct a PdfContentStreamHandler for regex-based text extraction.
 *
 * @param renderListener the text assembler
 * @param pattern        the regular expression to match text against; must be neither null nor empty
 * @param page           the page to inspect
 * @throws IllegalArgumentException               if {@code pattern} is null or empty
 * @throws java.util.regex.PatternSyntaxException if {@code pattern} is not a valid regular expression
 */
public PdfContentTextLocator(TextAssembler renderListener, String pattern, int page) {
    super(renderListener);
    if (pattern == null) {
        throw new IllegalArgumentException("Pattern cannot be null");
    }
    // Only emptiness is rejected (not blankness) so that whitespace-only patterns remain searchable.
    if (pattern.isEmpty()) {
        throw new IllegalArgumentException("Pattern sequence must be longer than 0");
    }
    this.p = Pattern.compile(pattern);
    this.page = page;
    // Mode 1 selects the regex branch of displayPdfString.
    this.mode = 1;
    installDefaultOperators();
    reset();
}

/**
 * Construct a PdfContentStreamHandler for coordinates-based text extraction.
 *
 * @param renderListener the text assembler
 * @param coordinates    the bounding box to search text within, as {@code [x1, y1, x2, y2]} (lower left point,
 *                       upper right point); the array is copied, so later changes by the caller have no effect
 * @param page           the page to inspect
 * @throws IllegalArgumentException if {@code coordinates} is null, does not hold exactly four values, or has an
 *                                  upper right point below or to the left of the lower left point
 */
public PdfContentTextLocator(TextAssembler renderListener, float[] coordinates, int page) {
    super(renderListener);
    // Reject null explicitly so both constructors report bad arguments the same way.
    if (coordinates == null) {
        throw new IllegalArgumentException("Coordinates bounding box cannot be null");
    }
    // Validate and keep a private copy so the search box cannot change underneath the handler.
    float[] box = coordinates.clone();
    if (box.length != 4) {
        throw new IllegalArgumentException("Coordinates bounding box must be an array of "
                + "four floats, "
                + "[x1, y1, x2, y2] {lower left point, upper right point}");
    }
    if (box[2] < box[0]) {
        throw new IllegalArgumentException("x2 {coordinates[2]} must be greater than or equal to x1 "
                + "{coordinates[0]}");
    }
    if (box[3] < box[1]) {
        throw new IllegalArgumentException("y2 {coordinates[3]} must be greater than or equal to y1 "
                + "{coordinates[1]}");
    }
    this.coordinates = box;
    this.page = page;
    // Mode 2 selects the bounding-box branch of displayPdfString.
    this.mode = 2;
    installDefaultOperators();
    reset();
}
Expand Down Expand Up @@ -103,13 +145,12 @@ public void reset() {
}

/**
* Extract a PdfString's content and coordinates based on the handler's extraction mode: the text either matches a
* given regex or intersects a given bounding box.
*
* @param string the text to inspect
*/
void displayPdfString(PdfString string) {

String decoded;
byte[] bytes;
if (BaseFont.IDENTITY_H.equals(graphicsState().getFont().getEncoding())) {
Expand All @@ -135,20 +176,72 @@ void displayPdfString(PdfString string) {
counter++;
}

float pdfStringWidth = startWidth + totalWidth;
float y = new Vector(0, 0, 1f).cross(textMatrix).get(1);
float y1 = y + graphicsState().getFontDescentDescriptor();
float y2 = y + graphicsState().getFontAscentDescriptor();
float fontFloor = y + graphicsState().getFontDescentDescriptor();
float fontCeiling = y + graphicsState().getFontAscentDescriptor();

switch (this.mode) {
case 1: {
matchPdfString(decoded, widths, totalWidth, fontFloor, fontCeiling);
break;
}
case 2: {
locatePdfString(decoded, startWidth, totalWidth, fontFloor, fontCeiling);
break;
}
default: {
//do nothing for now
}
}
}

/**
 * Search for a pattern in a PdfString and, for every occurrence found, collect its bounding box.
 *
 * @param decoded     the text to inspect
 * @param widths      array of prefix widths of each char
 * @param totalWidth  width of the text
 * @param fontFloor   lowest y-coordinate of the font
 * @param fontCeiling highest y-coordinate of the font
 */
private void matchPdfString(String decoded, float[] widths, float totalWidth, float fontFloor, float fontCeiling) {
    Matcher m = p.matcher(decoded);
    while (m.find()) {
        // widths holds prefix widths, so the horizontal extent of a match is read at its start and end offsets.
        float x1 = widths[m.start()];
        float x2 = widths[m.end()];
        // NOTE(review): the whole decoded string, not m.group(), is stored as the match text - confirm intended.
        MatchedPattern mp = new MatchedPattern(decoded, this.page, x1, fontFloor, x2, fontCeiling);
        accumulator.add(mp);
    }
    // The text matrix is updated by the string's total width whether or not anything matched.
    textMatrix = new Matrix(totalWidth, 0).multiply(textMatrix);
}

/**
 * Collect a strip of text when its coordinates intersect the handler's bounding box.
 *
 * @param decoded     the text to inspect
 * @param startWidth  left-most x-coordinate of the text
 * @param totalWidth  width of the text
 * @param fontFloor   lowest y-coordinate of the font
 * @param fontCeiling highest y-coordinate of the font
 */
private void locatePdfString(String decoded, float startWidth, float totalWidth, float fontFloor,
        float fontCeiling) {
    final float endWidth = startWidth + totalWidth;
    // The text matrix is updated first, so it changes even when the text is not collected.
    textMatrix = new Matrix(totalWidth, 0).multiply(textMatrix);

    final float boxLeft = this.coordinates[0];
    final float boxBottom = this.coordinates[1];
    final float boxRight = this.coordinates[2];
    final float boxTop = this.coordinates[3];

    final boolean leftOfBox = startWidth < boxLeft && endWidth < boxLeft;
    final boolean rightOfBox = startWidth > boxRight;
    final boolean belowBox = fontFloor < boxBottom && fontCeiling < boxBottom;
    final boolean aboveBox = fontFloor > boxTop;
    if (leftOfBox || rightOfBox || belowBox || aboveBox) {
        return;
    }

    accumulator.add(new MatchedPattern(decoded, this.page, startWidth, fontFloor, endWidth, fontCeiling));
}

private float convertHeightToUser(float height) {
Expand All @@ -164,7 +257,7 @@ public String getResultantText() {
/**
* @return list of text strips that matches
*/
public ArrayList<MatchedPattern> getMatchedPatterns() {
public List<MatchedPattern> getMatchedPatterns() {
return this.accumulator;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,11 @@
package org.openpdf.text.pdf.parser;


import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.openpdf.text.ExceptionConverter;
import org.openpdf.text.pdf.PRIndirectReference;
import org.openpdf.text.pdf.PRStream;
import org.openpdf.text.pdf.PRTokeniser;
import org.openpdf.text.pdf.PdfArray;
import org.openpdf.text.pdf.PdfContentParser;
import org.openpdf.text.pdf.PdfDictionary;
import org.openpdf.text.pdf.PdfLiteral;
Expand Down Expand Up @@ -149,7 +145,7 @@ private byte[] getContentBytesFromContentObject(PdfObject contentObject) throws
* @return {@code List<MatchedPattern>} list of matched text patterns with coordinates.
* @throws IOException on error
*/
public ArrayList<MatchedPattern> searchPage(int page, String pattern) throws IOException {
public List<MatchedPattern> searchPage(int page, String pattern) throws IOException {
PdfDictionary pageDict = reader.getPageN(page);
if (pageDict == null) {
return new ArrayList<>();
Expand All @@ -169,14 +165,50 @@ public ArrayList<MatchedPattern> searchPage(int page, String pattern) throws IOE
* @return {@code List<MatchedPattern>} list of matched text patterns with coordinates.
* @throws IOException on error
*/
public ArrayList<MatchedPattern> searchFile(String pattern) throws IOException {
ArrayList<MatchedPattern> res = new ArrayList<>();
public List<MatchedPattern> searchFile(String pattern) throws IOException {
List<MatchedPattern> res = new ArrayList<>();
for (int page = 1; page <= reader.getNumberOfPages(); page++) {
res.addAll(searchPage(page, pattern));
}
return res;
}

/**
 * Locates text within a bounding box inside a page.
 *
 * @param page        page number we are interested in
 * @param coordinates bounding box to extract text from
 * @return {@code List<MatchedPattern>} list of located text strips with coordinates; an empty list when the
 *         reader has no dictionary for the page
 * @throws IOException on error
 */
public List<MatchedPattern> searchPage(int page, float[] coordinates) throws IOException {
    final PdfDictionary pageDictionary = reader.getPageN(page);
    if (pageDictionary == null) {
        return new ArrayList<>();
    }

    final PdfDictionary pageResources = pageDictionary.getAsDict(PdfName.RESOURCES);

    // The shared listener is reset and pointed at this page before the handler is built around it.
    renderListener.reset();
    renderListener.setPage(page);

    final PdfContentTextLocator locator = new PdfContentTextLocator(renderListener, coordinates, page);
    processContent(getContentBytesForPage(page), pageResources, locator);
    return locator.getMatchedPatterns();
}

/**
 * Locates text within a bounding box on every page of a PDF.
 *
 * @param coordinates bounding box to extract text from
 * @return {@code List<MatchedPattern>} list of located text strips with coordinates, gathered in page order
 * @throws IOException on error
 */
public List<MatchedPattern> searchFile(float[] coordinates) throws IOException {
    final List<MatchedPattern> located = new ArrayList<>();
    // Page numbers start at 1; the same box is applied to each page in turn.
    int pageNumber = 1;
    while (pageNumber <= reader.getNumberOfPages()) {
        located.addAll(searchPage(pageNumber, coordinates));
        pageNumber++;
    }
    return located;
}

/**
* Processes PDF syntax
*
Expand Down
Loading
Loading