digulla · January 25, 2017 09:32
diff --git a/README.md b/README.md
diff --git a/PageRegion b/PageRegion
 import java.awt.geom.Rectangle2D;

 import org.apache.pdfbox.text.PDFTextStripperByArea;

 /**
  * A named rectangular area of a PDF page that can be registered with a
  * {@link PDFTextStripperByArea}.
  *
  * <p>Instances are immutable: name and rectangle are fixed at construction.
  */
 public class PageRegion {

    private final String name;
    private final Rectangle2D rect;

    /**
     * Creates a region from two corner points, given in either order.
     *
     * <p>Coordinates are in the order in which the Apache PDF-Box Debugger displays them,
     * i.e. the y value of each point comes before its x value.
     *
     * <p>The rectangle starts at the smaller x and smaller y of the two points. Its width
     * and height are the absolute coordinate differences plus one.
     *
     * @param name name under which the region is registered with a stripper
     * @param y1 y coordinate of the first corner
     * @param x1 x coordinate of the first corner
     * @param y2 y coordinate of the second corner
     * @param x2 x coordinate of the second corner
     */
    public PageRegion(String name, double y1, double x1, double y2, double x2) {
        this.name = name;

        double left = Math.min(x1, x2);
        double top = Math.min(y1, y2);
        double width = Math.abs(x1 - x2) + 1;
        double height = Math.abs(y1 - y2) + 1;
        this.rect = new Rectangle2D.Double(left, top, width, height);
    }

    /**
     * Returns the name given at construction.
     *
     * @return the region name
     */
    public String getName() {
        return name;
    }

    /**
     * Registers this region's rectangle with the given stripper under this region's name.
     *
     * @param stripper the stripper to add the region to
     */
    public void addTo(PDFTextStripperByArea stripper) {
        stripper.addRegion(name, rect);
    }
 }
diff --git a/ReportTextExtractor.java b/ReportTextExtractor.java
 import static org.junit.Assert.assertEquals;

 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.stream.Collectors;

 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.text.PDFTextStripperByArea;

 import com.google.common.base.Preconditions;

 import tools.RuntimeIOException;

 /**
  * Extracts text from a PDF file, either from the whole document at once or from named
  * {@link PageRegion}s, and offers assertion helpers for checking the extracted text.
  *
  * <p>The instance owns the loaded {@link PDDocument}. Call {@link #close()} (or use
  * try-with-resources) when done. Note that {@link #getText()} closes the document itself
  * once the text has been read.
  */
 public class ReportTextExtractor implements AutoCloseable {

    private PDDocument document;
    private final File file;

    private PDFTextStripperByArea byAreaStripper;

    // True once the registered regions have been extracted from the page.
    private boolean processed = false;
    // Index of the page passed to PDDocument.getPage for region extraction.
    private int page = 0;

    /**
     * Loads the given PDF file.
     *
     * @param file the PDF file to read
     * @throws RuntimeIOException if the file cannot be loaded
     */
    public ReportTextExtractor(File file) {
        this.file = file;
        try {
            document = PDDocument.load(file);
        } catch (IOException e) {
            throw new RuntimeIOException("Error loading " + file.getAbsolutePath(), e);
        }
    }

    /**
     * Creates the stripper used for region extraction, with "add more formatting" and
     * "sort by position" switched on. Subclasses may override to configure it differently.
     *
     * @return a new, configured stripper
     * @throws RuntimeIOException if the stripper cannot be created
     */
    protected PDFTextStripperByArea createByAreaStripper() {
        PDFTextStripperByArea result;
        try {
            result = new PDFTextStripperByArea();
        } catch (IOException e) {
            throw new RuntimeIOException(e);
        }

        result.setAddMoreFormatting(true);
        result.setSortByPosition(true);

        return result;
    }

    /**
     * Registers a region whose text can later be fetched with {@link #getText(PageRegion)}.
     *
     * @param region the region to register
     * @return this extractor, for chaining
     */
    public ReportTextExtractor addRegion(PageRegion region) {
        if (byAreaStripper == null) {
            byAreaStripper = createByAreaStripper();
        }

        region.addTo(byAreaStripper);
        // Any earlier extraction does not cover this region (or covers an outdated
        // rectangle of the same name), so the next getText(region) must extract again.
        processed = false;
        return this;
    }

    /**
     * Returns the post-processed text of a previously added region.
     *
     * <p>The regions are extracted from the page lazily on the first call and again after
     * every {@link #addRegion(PageRegion)}.
     *
     * @param region a region that was registered via {@link #addRegion(PageRegion)}
     * @return the region's text after {@link #postProcess(String)}
     * @throws IllegalStateException if no region was added or the document is closed
     * @throws IllegalArgumentException if the region's name is not registered
     * @throws RuntimeIOException if extracting the page fails
     */
    public String getText(PageRegion region) {
        if (byAreaStripper == null) {
            throw new IllegalStateException("Add some region, first!");
        }
        Preconditions.checkArgument(byAreaStripper.getRegions().contains(region.getName()),
                "Unknown region [%s]; valid names are: %s", region.getName(), byAreaStripper.getRegions());

        if (!processed) {
            PDDocument openDocument = requireOpenDocument();
            try {
                byAreaStripper.extractRegions(openDocument.getPage(page));
            } catch (IOException e) {
                throw new RuntimeIOException("Error processing page " + page, e);
            }

            processed = true;
        }

        String result = byAreaStripper.getTextForRegion(region.getName());
        result = postProcess(result);
        return result;
    }

    /** Get all the text of the document at once */
    public static String getText(File file) {
        return new ReportTextExtractor(file).getText();
    }

    /**
     * Returns the post-processed text of the whole document and then closes the document.
     *
     * @return the document text after {@link #postProcess(String)}
     * @throws IllegalStateException if the document is already closed
     * @throws RuntimeIOException if the text cannot be read
     */
    public String getText() {
        PDDocument openDocument = requireOpenDocument();

        PDFTextStripper stripper;
        try {
            stripper = new PDFTextStripper();
        } catch (IOException e) {
            throw new RuntimeIOException(e);
        }

        stripper.setAddMoreFormatting(true);
        stripper.setSortByPosition(true);

        try {
            String result = stripper.getText(openDocument);
            result = postProcess(result);
            return result;
        } catch (IOException e) {
            throw new RuntimeIOException("Error getting text from " + file.getAbsolutePath(), e);
        } finally {
            close();
        }
    }

    /**
     * Normalizes extracted text: trims leading/trailing whitespace and converts
     * {@code \r\n} line ends to {@code \n}.
     *
     * @param result the raw extracted text
     * @return the normalized text
     */
    protected String postProcess(String result) {
        return result.trim()
                .replace("\r\n", "\n");
    }

    /**
     * Asserts that the text of the given region contains every fragment.
     *
     * @param region a registered region
     * @param fragments the fragments that must all occur in the region's text
     */
    public void mustContain(PageRegion region, String... fragments) {
        String text = getText(region);
        mustContain(text, fragments);
    }

    /**
     * Asserts that {@code text} contains every fragment.
     *
     * @param text the text to search
     * @param fragments the fragments that must all occur in {@code text}
     */
    public void mustContain(String text, String... fragments) {
        List<String> missing = new ArrayList<>();
        for (String fragment : fragments) {
            if (!text.contains(fragment)) {
                missing.add(fragment);
            }
        }

        if (!missing.isEmpty()) {
            String expected = missing.stream()
                    .collect(Collectors.joining("\n---\n"));
            // This comparison always fails here: every missing fragment is absent from
            // text, so the joined fragments cannot equal it. The assertion failure then
            // reports the missing fragments alongside the actual text.
            assertEquals(expected, text);
        }
    }

    /**
     * Closes the underlying document. Calling it again is a no-op.
     *
     * @throws RuntimeIOException if closing the document fails
     */
    @Override
    public void close() {
        if (document == null) {
            return;
        }

        // Mark as closed before closing so a failing close() is not retried on a
        // document in an unknown state.
        PDDocument closing = document;
        document = null;
        try {
            closing.close();
        } catch (IOException e) {
            throw new RuntimeIOException(e);
        }
    }

    /** Returns the open document or fails with a clear message when it was closed. */
    private PDDocument requireOpenDocument() {
        if (document == null) {
            throw new IllegalStateException("Document already closed: " + file.getAbsolutePath());
        }
        return document;
    }

 }
	import java.awt.geom.Rectangle2D;

	import org.apache.pdfbox.text.PDFTextStripperByArea;

	/**
	 * A named rectangular area of a PDF page that can be registered with a
	 * {@link PDFTextStripperByArea}.
	 *
	 * <p>Instances are immutable: name and rectangle are fixed at construction.
	 */
	public class PageRegion {

	    private final String name;
	    private final Rectangle2D rect;

	    /**
	     * Creates a region from two corner points, given in either order.
	     *
	     * <p>Coordinates are in the order in which the Apache PDF-Box Debugger displays them,
	     * i.e. the y value of each point comes before its x value.
	     *
	     * <p>The rectangle starts at the smaller x and smaller y of the two points. Its width
	     * and height are the absolute coordinate differences plus one.
	     *
	     * @param name name under which the region is registered with a stripper
	     * @param y1 y coordinate of the first corner
	     * @param x1 x coordinate of the first corner
	     * @param y2 y coordinate of the second corner
	     * @param x2 x coordinate of the second corner
	     */
	    public PageRegion(String name, double y1, double x1, double y2, double x2) {
	        this.name = name;

	        double originX = Math.min(x1, x2);
	        double originY = Math.min(y1, y2);
	        double spanX = Math.abs(x1 - x2) + 1;
	        double spanY = Math.abs(y1 - y2) + 1;
	        this.rect = new Rectangle2D.Double(originX, originY, spanX, spanY);
	    }

	    /**
	     * Returns the name given at construction.
	     *
	     * @return the region name
	     */
	    public String getName() {
	        return name;
	    }

	    /**
	     * Registers this region's rectangle with the given stripper under this region's name.
	     *
	     * @param stripper the stripper to add the region to
	     */
	    public void addTo(PDFTextStripperByArea stripper) {
	        stripper.addRegion(name, rect);
	    }
	}
	import static org.junit.Assert.assertEquals;

	import java.io.File;
	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.List;
	import java.util.stream.Collectors;

	import org.apache.pdfbox.pdmodel.PDDocument;
	import org.apache.pdfbox.text.PDFTextStripper;
	import org.apache.pdfbox.text.PDFTextStripperByArea;

	import com.google.common.base.Preconditions;

	import tools.RuntimeIOException;

	/**
	 * Extracts text from a PDF file, either from the whole document at once or from named
	 * {@link PageRegion}s, and offers assertion helpers for checking the extracted text.
	 *
	 * <p>The instance owns the loaded {@link PDDocument}. Call {@link #close()} (or use
	 * try-with-resources) when done. Note that {@link #getText()} closes the document itself
	 * once the text has been read.
	 */
	public class ReportTextExtractor implements AutoCloseable {

	    private PDDocument document;
	    private final File file;

	    /**
	     * Loads the given PDF file.
	     *
	     * @param file the PDF file to read
	     * @throws RuntimeIOException if the file cannot be loaded
	     */
	    public ReportTextExtractor(File file) {
	        this.file = file;
	        try {
	            document = PDDocument.load(file);
	        } catch (IOException e) {
	            throw new RuntimeIOException("Error loading " + file.getAbsolutePath(), e);
	        }
	    }

	    private PDFTextStripperByArea byAreaStripper;

	    /**
	     * Creates the stripper used for region extraction, with "add more formatting" and
	     * "sort by position" switched on. Subclasses may override to configure it differently.
	     *
	     * @return a new, configured stripper
	     * @throws RuntimeIOException if the stripper cannot be created
	     */
	    protected PDFTextStripperByArea createByAreaStripper() {
	        PDFTextStripperByArea result;
	        try {
	            result = new PDFTextStripperByArea();
	        } catch (IOException e) {
	            throw new RuntimeIOException(e);
	        }

	        result.setAddMoreFormatting(true);
	        result.setSortByPosition(true);

	        return result;
	    }

	    /**
	     * Registers a region whose text can later be fetched with {@link #getText(PageRegion)}.
	     *
	     * @param region the region to register
	     * @return this extractor, for chaining
	     */
	    public ReportTextExtractor addRegion(PageRegion region) {
	        if (byAreaStripper == null) {
	            byAreaStripper = createByAreaStripper();
	        }

	        region.addTo(byAreaStripper);
	        // Any earlier extraction does not cover this region (or covers an outdated
	        // rectangle of the same name), so the next getText(region) must extract again.
	        processed = false;
	        return this;
	    }

	    // True once the registered regions have been extracted from the page.
	    private boolean processed = false;
	    // Index of the page passed to PDDocument.getPage for region extraction.
	    private int page = 0;

	    /**
	     * Returns the post-processed text of a previously added region.
	     *
	     * <p>The regions are extracted from the page lazily on the first call and again after
	     * every {@link #addRegion(PageRegion)}.
	     *
	     * @param region a region that was registered via {@link #addRegion(PageRegion)}
	     * @return the region's text after {@link #postProcess(String)}
	     * @throws IllegalStateException if no region was added or the document is closed
	     * @throws IllegalArgumentException if the region's name is not registered
	     * @throws RuntimeIOException if extracting the page fails
	     */
	    public String getText(PageRegion region) {
	        if (byAreaStripper == null) {
	            throw new IllegalStateException("Add some region, first!");
	        }
	        Preconditions.checkArgument(byAreaStripper.getRegions().contains(region.getName()),
	                "Unknown region [%s]; valid names are: %s", region.getName(), byAreaStripper.getRegions());

	        if (!processed) {
	            PDDocument openDocument = requireOpenDocument();
	            try {
	                byAreaStripper.extractRegions(openDocument.getPage(page));
	            } catch (IOException e) {
	                throw new RuntimeIOException("Error processing page " + page, e);
	            }

	            processed = true;
	        }

	        String result = byAreaStripper.getTextForRegion(region.getName());
	        result = postProcess(result);
	        return result;
	    }

	    /** Get all the text of the document at once */
	    public static String getText(File file) {
	        return new ReportTextExtractor(file).getText();
	    }

	    /**
	     * Returns the post-processed text of the whole document and then closes the document.
	     *
	     * @return the document text after {@link #postProcess(String)}
	     * @throws IllegalStateException if the document is already closed
	     * @throws RuntimeIOException if the text cannot be read
	     */
	    public String getText() {
	        PDDocument openDocument = requireOpenDocument();

	        PDFTextStripper stripper;
	        try {
	            stripper = new PDFTextStripper();
	        } catch (IOException e) {
	            throw new RuntimeIOException(e);
	        }

	        stripper.setAddMoreFormatting(true);
	        stripper.setSortByPosition(true);

	        try {
	            String result = stripper.getText(openDocument);
	            result = postProcess(result);
	            return result;
	        } catch (IOException e) {
	            throw new RuntimeIOException("Error getting text from " + file.getAbsolutePath(), e);
	        } finally {
	            close();
	        }
	    }

	    /**
	     * Normalizes extracted text: trims leading/trailing whitespace and converts
	     * {@code \r\n} line ends to {@code \n}.
	     *
	     * @param result the raw extracted text
	     * @return the normalized text
	     */
	    protected String postProcess(String result) {
	        return result.trim()
	                .replace("\r\n", "\n");
	    }

	    /**
	     * Asserts that the text of the given region contains every fragment.
	     *
	     * @param region a registered region
	     * @param fragments the fragments that must all occur in the region's text
	     */
	    public void mustContain(PageRegion region, String... fragments) {
	        String text = getText(region);
	        mustContain(text, fragments);
	    }

	    /**
	     * Asserts that {@code text} contains every fragment.
	     *
	     * @param text the text to search
	     * @param fragments the fragments that must all occur in {@code text}
	     */
	    public void mustContain(String text, String... fragments) {
	        List<String> missing = new ArrayList<>();
	        for (String fragment : fragments) {
	            if (!text.contains(fragment)) {
	                missing.add(fragment);
	            }
	        }

	        if (!missing.isEmpty()) {
	            String expected = missing.stream()
	                    .collect(Collectors.joining("\n---\n"));
	            // This comparison always fails here: every missing fragment is absent from
	            // text, so the joined fragments cannot equal it. The assertion failure then
	            // reports the missing fragments alongside the actual text.
	            assertEquals(expected, text);
	        }
	    }

	    /**
	     * Closes the underlying document. Calling it again is a no-op.
	     *
	     * @throws RuntimeIOException if closing the document fails
	     */
	    @Override
	    public void close() {
	        if (document == null) {
	            return;
	        }

	        // Mark as closed before closing so a failing close() is not retried on a
	        // document in an unknown state.
	        PDDocument closing = document;
	        document = null;
	        try {
	            closing.close();
	        } catch (IOException e) {
	            throw new RuntimeIOException(e);
	        }
	    }

	    /** Returns the open document or fails with a clear message when it was closed. */
	    private PDDocument requireOpenDocument() {
	        if (document == null) {
	            throw new IllegalStateException("Document already closed: " + file.getAbsolutePath());
	        }
	        return document;
	    }

	}