首页 文章

在提取文本位置时从PdfBox获得的负X或Y.

提问于
浏览
0

我试图提取PDF格式的所有文本及其坐标 . 我正在使用 Apache PDFBox 2.0.8 并按照示例程序DrawPrintTextLocations .

它基本上可以正常工作,但对于某些PDF,我得到的边界框的x和y坐标是负值 . 例如,请参阅pdf file .

我的应用程序假设坐标系是普通的pdf坐标系(x从左到右,y从上到下) . 所以这些负值打乱了我的计算 .

以下是相关的代码 .

import org.apache.fontbox.util.BoundingBox;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.List;

/**
 * This is an example on how to get some x/y coordinates of text and to show them in a rendered
 * image.
 *
 * <p>Each page is rendered to an image at {@code DPI}. For every {@link TextPosition} a blue
 * rectangle, built from the font bounding box and the width of the first character code, is drawn
 * onto that image, and the image is written as {@code <name>-marked-<page>.png}, where
 * {@code <name>} is the input file name without its extension. Rectangles whose bounds have a
 * negative x or y are printed to standard output.
 *
 * @author Ben Litchfield
 * @author Tilman Hausherr
 */
public class DrawPrintTextLocations extends PDFTextStripper {
    private AffineTransform flipAT;
    private AffineTransform rotateAT;
    private AffineTransform transAT;

    /** Resolution, in dots per inch, used when rendering a page to an image. */
    private static final float DPI = 200.0f;
    /** Factor converting 1/72-inch units to pixels at {@link #DPI}. */
    private static final double PT2PX = DPI / 72.0;
    private final AffineTransform dpiAT = AffineTransform.getScaleInstance(PT2PX, PT2PX);

    private final String filename;
    static final int SCALE = 1;
    private Graphics2D g2d;
    private final PDDocument document;

    /**
     * Instantiate a new PDFTextStripper object.
     *
     * @param document the document whose pages are rendered and stripped
     * @param filename the path of the PDF; used to derive the names of the output images
     * @throws IOException If there is an error loading the properties.
     */
    public DrawPrintTextLocations(PDDocument document, String filename) throws IOException {
        this.document = document;
        this.filename = filename;
    }

    /**
     * This will print the documents data.
     *
     * @param args The command line arguments; a single argument overrides the default PDF path.
     * @throws IOException If there is an error parsing the document.
     */
    public static void main(String[] args) throws IOException {
        String pdfLoc = "/debug/pdfbox/p2_VS008PI.pdf";

        if (args.length == 1) {
            pdfLoc = args[0];
        }

        try (PDDocument document = PDDocument.load(new File(pdfLoc))) {
            DrawPrintTextLocations stripper = new DrawPrintTextLocations(document, pdfLoc);
            stripper.setSortByPosition(true);

            for (int page = 0; page < document.getNumberOfPages(); ++page) {
                stripper.stripPage(page);
            }
        }
    }

    /**
     * Renders one page, draws the text rectangles on top of it and writes the result as a PNG.
     *
     * @param page zero-based index of the page to process
     * @throws IOException if rendering, text extraction or writing the image fails
     */
    private void stripPage(int page) throws IOException {
        PDFRenderer pdfRenderer = new PDFRenderer(document);
        BufferedImage image = pdfRenderer.renderImageWithDPI(page, DPI);

        PDPage pdPage = document.getPage(page);
        PDRectangle cropBox = pdPage.getCropBox();

        // flip y-axis
        flipAT = new AffineTransform();
        flipAT.translate(0, pdPage.getBBox().getHeight());
        flipAT.scale(1, -1);

        // page may be rotated
        rotateAT = new AffineTransform();
        int rotation = pdPage.getRotation();
        if (rotation != 0) {
            PDRectangle mediaBox = pdPage.getMediaBox();
            switch (rotation) {
                case 90:
                    rotateAT.translate(mediaBox.getHeight(), 0);
                    break;
                case 270:
                    rotateAT.translate(0, mediaBox.getWidth());
                    break;
                case 180:
                    rotateAT.translate(mediaBox.getWidth(), mediaBox.getHeight());
                    break;
                default:
                    break;
            }
            rotateAT.rotate(Math.toRadians(rotation));
        }

        // cropbox
        // NOTE(review): transAT is computed here but never applied to the shapes built in
        // writeString, so a crop box with a non-zero origin is not compensated for. Confirm
        // whether it should be part of the transform chain.
        transAT = AffineTransform.getTranslateInstance(-cropBox.getLowerLeftX(), cropBox.getLowerLeftY());

        g2d = image.createGraphics();
        try {
            g2d.setStroke(new BasicStroke(0.1f));
            g2d.scale(SCALE, SCALE);

            setStartPage(page + 1);
            setEndPage(page + 1);

            // The extracted text itself is discarded; writeText is only run so that
            // writeString gets called with the text positions of this page.
            try (Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) {
                writeText(document, dummy);
            }
        } finally {
            // Release the graphics context even when text extraction fails.
            g2d.dispose();
        }

        ImageIO.write(image, "png", new File(markedImageName(filename, page + 1)));
    }

    /**
     * Builds the output image name for a page: the PDF path without its extension, followed by
     * {@code -marked-<pageNumber>.png}.
     *
     * <p>A dot only counts as the start of an extension when it appears after the last path
     * separator; a path without an extension is used unchanged as the base name.
     *
     * @param pdfName path of the source PDF
     * @param pageNumber one-based page number
     * @return the path of the PNG to write
     */
    private static String markedImageName(String pdfName, int pageNumber) {
        int dot = pdfName.lastIndexOf('.');
        int separator = Math.max(pdfName.lastIndexOf('/'), pdfName.lastIndexOf('\\'));
        String base = dot > separator ? pdfName.substring(0, dot) : pdfName;
        return base + "-marked-" + pageNumber + ".png";
    }

    /**
     * Override the default functionality of PDFTextStripper.
     *
     * <p>For each text position, a rectangle is built from the font bounding box and the width of
     * the first character code, pushed through the text matrix, the y-flip, the page rotation and
     * the DPI scale, and drawn in blue. Rectangles whose resulting bounds have a negative x or y
     * are printed to standard output.
     *
     * @param string the text being written (not used here)
     * @param textPositions the positions of the individual characters of {@code string}
     * @throws IOException if font metrics cannot be read
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {

        for (TextPosition text : textPositions) {

            AffineTransform at = text.getTextMatrix().createAffineTransform();
            PDFont font = text.getFont();

            BoundingBox bbox = font.getBoundingBox();

            float xadvance = font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars
            Rectangle2D.Float rect1 = new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight());

            if (font instanceof PDType3Font) {
                // Type 3 fonts supply their own font matrix.
                at.concatenate(font.getFontMatrix().createAffineTransform());
            } else {
                // Presumably glyph metrics are in 1/1000 units for all other fonts.
                at.scale(1 / 1000f, 1 / 1000f);
            }

            Shape s1 = at.createTransformedShape(rect1);
            s1 = flipAT.createTransformedShape(s1);
            s1 = rotateAT.createTransformedShape(s1);
            s1 = dpiAT.createTransformedShape(s1);

            g2d.setColor(Color.blue);
            g2d.draw(s1);

            Rectangle bounds = s1.getBounds();
            if (bounds.getX() < 0 || bounds.getY() < 0) {
                // THIS is where things go wrong
                // i need these coordinates to be +ve
                System.out.println(bounds.toString());
                System.out.println(rect1.toString());
            }
        }
    }
}

以下是上述pdf第一页输出的一些片段 .

第10节 - 保险及其他财务资源java.awt.Rectangle [x = -3237,y = 40,width = 19,height = 43] java.awt.Rectangle [x = -3216,y = 40,width = 20 ,height = 43] java.awt.Rectangle [x = -3194,y = 40,width = 23,height = 43] java.awt.Rectangle [x = -3170,y = 40,width = 22,height = 43 ]

1 回答

  • 1

    带负坐标的字符位于裁剪框之外(坐标大于裁剪框高度/宽度的字符也是如此) . 可以把裁剪框看作是从更大的页面上切出的一块区域 . 要查看整个内容,请运行此代码

    pdPage.setCropBox(pdPage.getMediaBox());
    

    对于PDF的每个页面,然后保存并查看它 .

    根据你的评论

    按照你的建议将裁剪框设置为媒体框后,pdf在屏幕上的整体外观确实改变了,现在我看到3页被合并显示为一页 .

    这表明在物理上,这是一张折叠的纸张,每面有3页 . 在线PDF显示为6页,便于在计算机上查看 .

相关问题