diff --git a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/MatchedPattern.java b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/MatchedPattern.java index 7d062d59e..0510d3a3b 100644 --- a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/MatchedPattern.java +++ b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/MatchedPattern.java @@ -53,16 +53,15 @@ public class MatchedPattern { private final float[] coordinates = new float[4]; /** - * Constructor to pair a strip of text with its bounding box coordinates inside a page. - * The coordinates system has the origin (0, 0) in the lower left point of the page - * and uses PDF points as unit measure. + * Constructor to pair a strip of text with its bounding box coordinates inside a page. The coordinates system has + * the origin (0, 0) in the lower left point of the page and uses PDF points as unit measure. * - * @param text string - * @param page int - * @param llx float lower left x coordinate - * @param lly float lower left y coordinate - * @param urx float upper right x coordinate - * @param ury float upper right y coordinate + * @param text string + * @param page int + * @param llx float lower left x coordinate + * @param lly float lower left y coordinate + * @param urx float upper right x coordinate + * @param ury float upper right y coordinate */ MatchedPattern(String text, int page, float llx, float lly, float urx, float ury) { this.text = text; @@ -85,13 +84,21 @@ public float[] getCoordinates() { return coordinates; } + public String printCoordinates() { + return "[llx: " + coordinates[0] + ", lly: " + coordinates[1] + ", urx: " + coordinates[2] + ", ury: " + coordinates[3] + "]"; + } + @Override public String toString() { - String[] c = new String[4]; - for(int i = 0; i < 4; i++) { - c[i] = String.valueOf(coordinates[i]); - } - return "[" + String.join(", ", c) + "]"; + StringBuilder sb = new StringBuilder(); + sb.append("Text: [") + .append(this.text) + .append("] - boundingBox: ") 
+ .append(this.printCoordinates()) + .append(" - page: [") + .append(this.page) + .append("]"); + return sb.toString(); } } diff --git a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentTextExtractor.java b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentTextExtractor.java index 99f18d640..05297dc97 100644 --- a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentTextExtractor.java +++ b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentTextExtractor.java @@ -41,31 +41,10 @@ LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the */ package org.openpdf.text.pdf.parser; -import java.io.ByteArrayOutputStream; -import java.io.IOException; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Locale; -import java.util.Map; import java.util.Optional; import java.util.Stack; -import org.openpdf.text.ExceptionConverter; -import org.openpdf.text.error_messages.MessageLocalization; -import org.openpdf.text.pdf.CMapAwareDocumentFont; -import org.openpdf.text.pdf.PRIndirectReference; -import org.openpdf.text.pdf.PRStream; -import org.openpdf.text.pdf.PRTokeniser; -import org.openpdf.text.pdf.PdfArray; -import org.openpdf.text.pdf.PdfContentParser; -import org.openpdf.text.pdf.PdfDictionary; -import org.openpdf.text.pdf.PdfIndirectReference; -import org.openpdf.text.pdf.PdfLiteral; -import org.openpdf.text.pdf.PdfName; -import org.openpdf.text.pdf.PdfNumber; -import org.openpdf.text.pdf.PdfObject; -import org.openpdf.text.pdf.PdfReader; -import org.openpdf.text.pdf.PdfStream; import org.openpdf.text.pdf.PdfString; /** diff --git a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentTextLocator.java b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentTextLocator.java index 49e0761ce..d7812a55d 100644 --- a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentTextLocator.java +++ 
b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentTextLocator.java @@ -41,15 +41,14 @@ LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the */ package org.openpdf.text.pdf.parser; -import org.openpdf.text.ExceptionConverter; -import org.openpdf.text.error_messages.MessageLocalization; -import org.openpdf.text.pdf.*; -import java.io.ByteArrayOutputStream; -import java.io.IOException; import java.nio.charset.StandardCharsets; -import java.util.*; +import java.util.ArrayList; +import java.util.List; +import java.util.Stack; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.openpdf.text.pdf.BaseFont; +import org.openpdf.text.pdf.PdfString; /** * @author dgd @@ -63,16 +62,59 @@ public class PdfContentTextLocator extends PdfContentStreamHandler { private final ArrayList fragmentsWidths = new ArrayList<>(); private final int page; - private final Pattern p; - + private Pattern p; + private float[] coordinates; + private final int mode; + /** + * Construct a content PdfContentStreamHandler for regex-based text extraction pattern + * + * @param renderListener the text assembler + * @param pattern the pattern to match text against + * @param page PdfPage to inspect + */ public PdfContentTextLocator(TextAssembler renderListener, String pattern, int page) { super(renderListener); - if(pattern == null) throw new IllegalArgumentException("Pattern cannot be null"); + if (pattern == null) { + throw new IllegalArgumentException("Pattern cannot be null"); + } //We check for length because we want to include whitespaces as possible patterns - if(pattern.isEmpty()) throw new IllegalArgumentException("Pattern sequence must be longer than 0"); + if (pattern.isEmpty()) { + throw new IllegalArgumentException("Pattern sequence must be longer than 0"); + } this.p = Pattern.compile(pattern); this.page = page; + this.mode = 1; + installDefaultOperators(); + reset(); + } + + /** + * Construct a content PdfContentStreamHandler for 
coordinates-based text extraction pattern + * + * @param renderListener the text assembler + * @param coordinates the bounding box to search text within + * @param page PdfPage to inspect + */ + public PdfContentTextLocator(TextAssembler renderListener, float[] coordinates, int page) { + super(renderListener); + if (coordinates.length != 4) { + throw new IllegalArgumentException("Coordinates bounding box must be an array of " + + "four floats, " + + "[x1, y1, x2, y2] {lower left point, upper right point}"); + } + if (coordinates[2] < coordinates[0]) { + throw new IllegalArgumentException("x2 {coordinates[2]} must be greater than or equal to x1 " + + "{coordinates[0]}"); + } + if (coordinates[3] < coordinates[1]) { + throw new IllegalArgumentException("y2 {coordinates[3]} must be greater than or equal to y1 " + + "{coordinates[1]}"); + } + this.coordinates = coordinates; + //We check for length because we want to include whitespaces as possible patterns + this.page = page; + this.mode = 2; installDefaultOperators(); reset(); } @@ -103,13 +145,12 @@ public void reset() { } /** - * Search for a pattern in a PdfString - * and if found, collect its bounding box + * Extract a PdfString content and coordinates based on the handler extraction pattern: either matches a given regex + * or intersects a given bounding box * * @param string the text to inspect */ void displayPdfString(PdfString string) { - String decoded; byte[] bytes; if (BaseFont.IDENTITY_H.equals(graphicsState().getFont().getEncoding())) { @@ -135,20 +176,72 @@ void displayPdfString(PdfString string) { counter++; } - float pdfStringWidth = startWidth + totalWidth; float y = new Vector(0, 0, 1f).cross(textMatrix).get(1); - float y1 = y + graphicsState().getFontDescentDescriptor(); - float y2 = y + graphicsState().getFontAscentDescriptor(); + float fontFloor = y + graphicsState().getFontDescentDescriptor(); + float fontCeiling = y + graphicsState().getFontAscentDescriptor(); + + switch (this.mode) { + case 1: 
{ + matchPdfString(decoded, widths, totalWidth, fontFloor, fontCeiling); + break; + } + case 2: { + locatePdfString(decoded, startWidth, totalWidth, fontFloor, fontCeiling); + break; + } + default: { + //do nothing for now + } + } + } + /** + * Search for a pattern in a PdfString and if found, collect its bounding box + * + * @param decoded the text to inspect + * @param widths array of prefix widths of each char + * @param totalWidth width of the text + * @param fontFloor lowest y-coordinate of the font + * @param fontCeiling highest y-coordinate of the font + */ + private void matchPdfString(String decoded, float[] widths, float totalWidth, float fontFloor, float fontCeiling) { Matcher m = p.matcher(decoded); while (m.find()) { float x1 = widths[m.start()]; float x2 = widths[m.end()]; - MatchedPattern mp = new MatchedPattern(decoded, this.page, x1, y1, x2, y2); + MatchedPattern mp = new MatchedPattern(decoded, this.page, x1, fontFloor, x2, fontCeiling); accumulator.add(mp); } + textMatrix = new Matrix(totalWidth, 0).multiply(textMatrix); + } + /** + * Extract text if its coordinates intersect with the given bounding box + * + * @param decoded the text to inspect + * @param startWidth left-most x-coordinate of the text + * @param totalWidth width of the text + * @param fontFloor lowest y-coordinate of the font + * @param fontCeiling highest y-coordinate of the font + */ + private void locatePdfString(String decoded, float startWidth, float totalWidth, float fontFloor, + float fontCeiling) { + float endWidth = startWidth + totalWidth; textMatrix = new Matrix(totalWidth, 0).multiply(textMatrix); + if (startWidth < this.coordinates[0] && endWidth < this.coordinates[0]) { + return; + } + if (startWidth > this.coordinates[2]) { + return; + } + if (fontFloor < this.coordinates[1] && fontCeiling < this.coordinates[1]) { + return; + } + if (fontFloor > this.coordinates[3]) { + return; + } + MatchedPattern mp = new MatchedPattern(decoded, this.page, startWidth, fontFloor, 
endWidth, fontCeiling); + accumulator.add(mp); } private float convertHeightToUser(float height) { @@ -164,7 +257,7 @@ public String getResultantText() { /** * @return list of text strips that matches */ - public ArrayList getMatchedPatterns() { + public List getMatchedPatterns() { return this.accumulator; } } diff --git a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfTextLocator.java b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfTextLocator.java index c9672671c..4be2bd3b7 100644 --- a/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfTextLocator.java +++ b/openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfTextLocator.java @@ -49,15 +49,11 @@ package org.openpdf.text.pdf.parser; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.openpdf.text.ExceptionConverter; -import org.openpdf.text.pdf.PRIndirectReference; -import org.openpdf.text.pdf.PRStream; import org.openpdf.text.pdf.PRTokeniser; -import org.openpdf.text.pdf.PdfArray; import org.openpdf.text.pdf.PdfContentParser; import org.openpdf.text.pdf.PdfDictionary; import org.openpdf.text.pdf.PdfLiteral; @@ -149,7 +145,7 @@ private byte[] getContentBytesFromContentObject(PdfObject contentObject) throws * @return ArrayList List of matched text patterns with coordinates. * @throws IOException on error */ - public ArrayList searchPage(int page, String pattern) throws IOException { + public List searchPage(int page, String pattern) throws IOException { PdfDictionary pageDict = reader.getPageN(page); if (pageDict == null) { return new ArrayList<>(); @@ -169,14 +165,50 @@ public ArrayList searchPage(int page, String pattern) throws IOE * @return ArrayList List of matched text patterns with coordinates. 
* @throws IOException on error */ - public ArrayList searchFile(String pattern) throws IOException { - ArrayList res = new ArrayList<>(); + public List searchFile(String pattern) throws IOException { + List res = new ArrayList<>(); for (int page = 1; page <= reader.getNumberOfPages(); page++) { res.addAll(searchPage(page, pattern)); } return res; } + /** + * Locates text within a bounding box inside a page + * + * @param page page number we are interested in + * @param coordinates bounding box to extract text from + * @return ArrayList List of matched text patterns with coordinates. + * @throws IOException on error + */ + public List searchPage(int page, float[] coordinates) throws IOException { + PdfDictionary pageDict = reader.getPageN(page); + if (pageDict == null) { + return new ArrayList<>(); + } + PdfDictionary resources = pageDict.getAsDict(PdfName.RESOURCES); + renderListener.reset(); + renderListener.setPage(page); + PdfContentTextLocator handler = new PdfContentTextLocator(renderListener, coordinates, page); + processContent(getContentBytesForPage(page), resources, handler); + return handler.getMatchedPatterns(); + } + + /** + * Locates text within a bounding box inside a PDF + * + * @param coordinates bounding box to extract text from + * @return ArrayList List of matched text patterns with coordinates. 
+ * @throws IOException on error + */ + public List searchFile(float[] coordinates) throws IOException { + List res = new ArrayList<>(); + for (int page = 1; page <= reader.getNumberOfPages(); page++) { + res.addAll(searchPage(page, coordinates)); + } + return res; + } + /** * Processes PDF syntax * diff --git a/openpdf-core/src/test/java/org/openpdf/text/pdf/parser/PdfTextExtractorTest.java b/openpdf-core/src/test/java/org/openpdf/text/pdf/parser/PdfTextExtractorTest.java index 50338d31f..4e78307d8 100644 --- a/openpdf-core/src/test/java/org/openpdf/text/pdf/parser/PdfTextExtractorTest.java +++ b/openpdf-core/src/test/java/org/openpdf/text/pdf/parser/PdfTextExtractorTest.java @@ -12,6 +12,7 @@ import org.openpdf.text.PageSize; import org.openpdf.text.Paragraph; import org.openpdf.text.Phrase; +import org.openpdf.text.pdf.ColumnText; import org.openpdf.text.pdf.FontSelector; import org.openpdf.text.pdf.PdfPTable; import org.openpdf.text.pdf.PdfReader; @@ -22,6 +23,7 @@ import java.io.InputStream; import java.net.URL; import java.nio.file.Files; +import java.util.List; import org.junit.jupiter.api.Test; @@ -46,6 +48,23 @@ static byte[] createSimpleDocumentWithElements(Element... 
elements) { return baos.toByteArray(); } + static byte[] createSimpleDocumentWithPositionedStrings(String testString) { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + Document document = new Document(PageSize.A4); + PdfWriter wr = PdfWriter.getInstance(document, baos); + document.open(); + wr.open(); + ColumnText.showTextAligned(wr.getDirectContent(), + Element.ALIGN_LEFT, new Phrase(testString), 300, 780, 0); + ColumnText.showTextAligned(wr.getDirectContent(), + Element.ALIGN_LEFT, new Phrase(testString), 150, 400, 0); + ColumnText.showTextAligned(wr.getDirectContent(), + Element.ALIGN_LEFT, new Phrase(testString), 10, 10, 0); + document.close(); + wr.close(); + return baos.toByteArray(); + } + protected static byte[] readDocument(final File file) throws IOException { try (ByteArrayOutputStream fileBytes = new ByteArrayOutputStream(); InputStream inputStream = Files.newInputStream(file.toPath())) { @@ -173,27 +192,27 @@ void getTextFromPageInTablesWithSingleWords_expectsWordsAreSeparatedBySpaces() // then assertEquals("One Two Three", extracted); } - + @Test - void testTextLocatorFindsTextWithCoordinates() throws Exception { + void testTextLocatorFindsTextWithRegex() throws Exception { // given: Create a simple document with known text final String testText = "Hello World Test"; final Paragraph paragraph = new Paragraph(testText); byte[] pdfBytes = createSimpleDocumentWithElements(paragraph); - + // when: Search for text pattern using PdfTextLocator final PdfReader pdfReader = new PdfReader(pdfBytes); java.util.List matches = new PdfTextLocator(pdfReader).searchPage(1, "Hello"); - + // then: Verify that text was found with coordinates assertNotNull(matches); assertFalse(matches.isEmpty(), "Should find at least one match for 'Hello'"); MatchedPattern match = matches.get(0); // The matched text contains "Hello" (the pattern we're searching for) - assertTrue(match.getText().contains("Hello"), + assertTrue(match.getText().contains("Hello"), "Matched 
text should contain 'Hello', got: " + match.getText()); assertEquals(1, match.getPage()); - + // Verify coordinates are reasonable (not all zeros) float[] coords = match.getCoordinates(); assertNotNull(coords); @@ -202,22 +221,23 @@ void testTextLocatorFindsTextWithCoordinates() throws Exception { assertTrue(coords[0] > 0 || coords[1] > 0 || coords[2] > 0 || coords[3] > 0, "Coordinates should have at least one non-zero value"); } - + @Test - void testTextLocatorFindsMultipleMatches() throws Exception { + void testTextLocatorFindsMultipleMatchesWithRegex() throws Exception { // given: Create a document with repeated text final String testText = "Test word. Another Test word."; final Paragraph paragraph = new Paragraph(testText); byte[] pdfBytes = createSimpleDocumentWithElements(paragraph); - + // when: Search for pattern that appears multiple times final PdfReader pdfReader = new PdfReader(pdfBytes); java.util.List matches = new PdfTextLocator(pdfReader).searchPage(1, "Test"); - + // then: Verify matches found (Note: implementation may return fewer matches than expected) assertNotNull(matches); - assertFalse(matches.isEmpty(), "Should find at least one match for 'Test'"); - + assertFalse(matches.isEmpty(), "Should find exactly two matches for 'Test'"); + assertEquals(2, matches.size(), "Should find exactly two matches for 'Test'"); + // Verify each match contains the pattern for (MatchedPattern match : matches) { assertTrue(match.getText().contains("Test"), @@ -225,6 +245,60 @@ void testTextLocatorFindsMultipleMatches() throws Exception { } } + @Test + void testTextLocatorFindsTextWithCoordinates() throws Exception { + // given: Create a simple document with known text + final String testText = "Hello World Test"; + byte[] pdfBytes = createSimpleDocumentWithPositionedStrings(testText); + + // when: Search for text pattern using PdfTextLocator + final PdfReader pdfReader = new PdfReader(pdfBytes); + float[] boundingBox = new float[]{0, 0, 20, 20}; + List matches = 
new PdfTextLocator(pdfReader).searchPage(1, boundingBox); + + // then: Verify that text was found with coordinates + assertNotNull(matches); + assertFalse(matches.isEmpty(), "Should find exactly one match"); + MatchedPattern match = matches.get(0); + // The matched text contains "Hello" (the pattern we're searching for) + assertTrue(match.getText().contains("Hello"), + "Matched text should contain 'Hello', got: " + match.getText()); + assertEquals(1, match.getPage()); + assertEquals(1, matches.size(), "Should find exactly one match"); + + // Verify coordinates are reasonable + float[] coords = match.getCoordinates(); + assertNotNull(coords); + assertEquals(4, coords.length); + } + + @Test + void testTextLocatorFindsMultipleMatchesWithCoordinates() throws Exception { + // given: Create a simple document with known text + final String testText = "Hello World Test"; + byte[] pdfBytes = createSimpleDocumentWithPositionedStrings(testText); + + // when: Search for text pattern using PdfTextLocator + final PdfReader pdfReader = new PdfReader(pdfBytes); + float[] boundingBox = new float[]{0, 0, 180, 450}; + List matches = new PdfTextLocator(pdfReader).searchPage(1, boundingBox); + + // then: Verify that text was found with coordinates + assertNotNull(matches); + assertFalse(matches.isEmpty(), "Should find exactly two matches"); + assertEquals(2, matches.size(), "Should find exactly two matches"); + for (MatchedPattern match : matches) { + // The matched text contains "Hello" (the pattern we're searching for) + assertTrue(match.getText().contains("Hello"), + "Matched text should contain 'Hello', got: " + match.getText()); + assertEquals(1, match.getPage()); + // Verify coordinates are reasonable + float[] coords = match.getCoordinates(); + assertNotNull(coords); + assertEquals(4, coords.length); + } + } + private String getString(String fileName, int pageNumber) throws Exception { URL resource = getClass().getResource("/" + fileName); return getString(new 
File(resource.toURI()), pageNumber);