Dependencies:
- Guava > 19
- PDFBox > 2.0
- A helper class (`tools.RuntimeIOException`) which wraps an IOException in a RuntimeException
| import java.awt.geom.Rectangle2D; | |
| import org.apache.pdfbox.text.PDFTextStripperByArea; | |
| public class PageRegion { | |
| private String name; | |
| private Rectangle2D rect; | |
| // Coordinates are in the order in which the Apache PDF-Box Debugger displays them | |
| public PageRegion(String name, double y1, double x1, double y2, double x2) { | |
| this.name = name; | |
| double x = Math.min(x1, x2); | |
| double y = Math.min(y1, y2); | |
| double width = Math.abs(x1 - x2) + 1; | |
| double height = Math.abs(y1 - y2) + 1; | |
| this.rect = new Rectangle2D.Double(x, y, width, height); | |
| } | |
| public String getName() { | |
| return name; | |
| } | |
| public void addTo(PDFTextStripperByArea stripper) { | |
| stripper.addRegion(name, rect); | |
| } | |
| } |
| import static org.junit.Assert.assertEquals; | |
| import java.io.File; | |
| import java.io.IOException; | |
| import java.util.ArrayList; | |
| import java.util.List; | |
| import java.util.stream.Collectors; | |
| import org.apache.pdfbox.pdmodel.PDDocument; | |
| import org.apache.pdfbox.text.PDFTextStripper; | |
| import org.apache.pdfbox.text.PDFTextStripperByArea; | |
| import com.google.common.base.Preconditions; | |
| import tools.RuntimeIOException; | |
| public class ReportTextExtractor { | |
| private PDDocument document; | |
| private File file; | |
| public ReportTextExtractor(File file) { | |
| this.file = file; | |
| try { | |
| document = PDDocument.load(file); | |
| } catch (IOException e) { | |
| throw new RuntimeIOException("Error loading " + file.getAbsolutePath(), e); | |
| } | |
| } | |
| private PDFTextStripperByArea byAreaStripper; | |
| protected PDFTextStripperByArea createByAreaStripper() { | |
| PDFTextStripperByArea result; | |
| try { | |
| result = new PDFTextStripperByArea(); | |
| } catch (IOException e) { | |
| throw new RuntimeIOException(e); | |
| } | |
| result.setAddMoreFormatting(true); | |
| result.setSortByPosition(true); | |
| return result; | |
| } | |
| public ReportTextExtractor addRegion(PageRegion region) { | |
| if (byAreaStripper == null) { | |
| byAreaStripper = createByAreaStripper(); | |
| } | |
| region.addTo(byAreaStripper); | |
| return this; | |
| } | |
| private boolean processed = false; | |
| private int page = 0; | |
| public String getText(PageRegion region) { | |
| if (byAreaStripper == null) { | |
| throw new IllegalStateException("Add some region, first!"); | |
| } | |
| Preconditions.checkArgument(byAreaStripper.getRegions().contains(region.getName()), | |
| "Unknown region [%s]; valid names are: %s", region.getName(), byAreaStripper.getRegions()); | |
| if (!processed) { | |
| try { | |
| byAreaStripper.extractRegions(document.getPage(page)); | |
| } catch (IOException e) { | |
| throw new RuntimeIOException("Error processing page " + page, e); | |
| } | |
| processed = true; | |
| } | |
| String result = byAreaStripper.getTextForRegion(region.getName()); | |
| result = postProcess(result); | |
| return result; | |
| } | |
| /** Get all the text of the document at once */ | |
| public static String getText(File file) { | |
| return new ReportTextExtractor(file).getText(); | |
| } | |
| public String getText() { | |
| PDFTextStripper stripper; | |
| try { | |
| stripper = new PDFTextStripper(); | |
| } catch (IOException e) { | |
| throw new RuntimeIOException(e); | |
| } | |
| stripper.setAddMoreFormatting(true); | |
| stripper.setSortByPosition(true); | |
| try { | |
| String result = stripper.getText(document); | |
| result = postProcess(result); | |
| return result; | |
| } catch (IOException e) { | |
| throw new RuntimeIOException("Error getting text from " + file.getAbsolutePath(), e); | |
| } finally { | |
| close(); | |
| } | |
| } | |
| protected String postProcess(String result) { | |
| return result.trim() | |
| .replace("\r\n", "\n"); | |
| } | |
| public void mustContain(PageRegion region, String... fragments) { | |
| String text = getText(region); | |
| mustContain(text, fragments); | |
| } | |
| public void mustContain(String text, String... fragments) { | |
| List<String> missing = new ArrayList<>(); | |
| for(String fragment: fragments) { | |
| if (!text.contains(fragment)) { | |
| missing.add(fragment); | |
| } | |
| } | |
| if (!missing.isEmpty()) { | |
| String expected = missing.stream() | |
| .collect(Collectors.joining("\n---\n")); | |
| assertEquals(expected, text); | |
| } | |
| } | |
| public void close() { | |
| if (document != null) { | |
| try { | |
| document.close(); | |
| } catch (IOException e) { | |
| throw new RuntimeIOException(e); | |
| } | |
| document = null; | |
| } | |
| } | |
| } |