Skip to content
Guides

Extract Text in PDF

This sample shows how to extract all the text of a PDF

Java
/**
 * Copyright © 2014-2023 PDF Technologies, Inc. All Rights Reserved.
 *
 * THIS SOURCE CODE AND ANY ACCOMPANYING DOCUMENTATION ARE PROTECTED BY INTERNATIONAL COPYRIGHT LAW
 * AND MAY NOT BE RESOLD OR REDISTRIBUTED. USAGE IS BOUND TO THE ComPDFKit LICENSE AGREEMENT.
 * UNAUTHORIZED REPRODUCTION OR DISTRIBUTION IS SUBJECT TO CIVIL AND CRIMINAL PENALTIES.
 * This notice may not be removed from this file.
 */


package com.compdfkit.samples.samples;

import android.graphics.RectF;

import com.compdfkit.core.document.CPDFDocument;
import com.compdfkit.core.page.CPDFPage;
import com.compdfkit.core.page.CPDFTextPage;
import com.compdfkit.core.page.CPDFTextRange;
import com.compdfkit.core.page.CPDFTextSelection;
import com.compdfkit.samples.PDFSamples;
import com.compdfkit.samples.R;
import com.compdfkit.samples.util.FileUtils;
import com.compdfkit.samples.util.OutputListener;


public class TextExtractTest extends PDFSamples {

    public TextExtractTest(){
        setTitle(R.string.text_extract_test_title);
        setDescription(R.string.text_extract_test_desc);
    }

    @Override
    protected void run(OutputListener outputListener) {
        super.run(outputListener);
        printHead();
        extractPageText();
        extractAllPageText();
        extractRectRangeText();
        printFooter();
    }

    /**
     * Samples 1: Extract all text content in the specified page
     */
    private void extractPageText(){
        outputListener.println("Samples 1: Extract all text content in the specified page");
        outputListener.println("Opening the Samples PDF File");
        outputListener.println("The text content of the first page of the document:");
        printDividingLine();
        outputListener.println("Text : ");
        CPDFDocument document = new CPDFDocument(context);
        document.open(FileUtils.getAssetsTempFile(context, "text.pdf"));
        CPDFPage page = document.pageAtIndex(0);
        CPDFTextPage textPage = page.getTextPage();
        String pageText = textPage.getText(new CPDFTextRange(0, textPage.getCountChars() - 1) );
        outputListener.print(pageText);
        outputListener.println("\nDone!");
        printDividingLine();
    }

    /**
     * Samples 2: Extract all text content of the document
     */
    private void extractAllPageText(){
        outputListener.println("Samples 2: Extract all text content of the document");
        outputListener.println("Opening the Samples PDF File");
        printDividingLine();
        outputListener.println("Text : ");
        CPDFDocument document = new CPDFDocument(context);
        document.open(FileUtils.getAssetsTempFile(context, "text.pdf"));
        for (int i = 0; i < document.getPageCount(); i++) {
            CPDFPage page = document.pageAtIndex(i);
            CPDFTextPage textPage = page.getTextPage();
            String pageText = textPage.getText(new CPDFTextRange(0, textPage.getCountChars() - 1) );
            outputListener.print(pageText);
        }
        outputListener.println("Done!");
        printDividingLine();
    }

    private void extractRectRangeText(){
        outputListener.println("Samples 3: Extract Select Rect Range Text");
        outputListener.println("Opening the Samples PDF File");
        CPDFDocument document = new CPDFDocument(context);
        document.open(FileUtils.getAssetsTempFile(context, "text.pdf"));
        CPDFPage page = document.pageAtIndex(0);
        CPDFTextPage textPage = page.getTextPage();
        // Extract the text within the range of (0, 0, 500, 500) on the first page
        RectF selectRect = new RectF(0f, 0f, 300f, 300f);
        RectF size = page.getSize();
        selectRect = page.convertRectFromPage(false, size.width(), size.height(), selectRect);
        CPDFTextSelection[] selections = textPage.getSelectionsByLineForRect(selectRect);
        outputListener.println("Range : (0, 0, 300, 300)");
        outputListener.println("Text : ");
        for (int i = 0; i < selections.length; i++) {
            CPDFTextSelection textSelection = selections[i];
            if (textSelection == null) {
                continue;
            }
            String text = textPage.getText(textSelection.getTextRange());
            outputListener.print(text);
        }
        outputListener.println("\nDone!");
        printDividingLine();
    }
}