Extracting text from a PDF with Python

This example uses PyMuPDF to print the text of every page in a PDF and to collect information from its highlight annotations.

Install PyMuPDF

python -m pip install --upgrade pymupdf

Here is the source code. It prints each page's text and collects the "subject" field of every highlight annotation:

import fitz  # PyMuPDF

def extract_highlighted_text(pdf_path):
    """Collect the ``subject`` entry of every highlight annotation in a PDF.

    While scanning, each page object and its full text are printed, and every
    annotation found on the page is printed as well.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list holding ``annot.info["subject"]`` for each highlight
        annotation, in the order the pages and annotations are visited.
    """
    highlighted_text = []

    # Opening the document as a context manager guarantees it is closed even
    # when reading a page or an annotation raises part-way through the scan.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            print('Page -> ', page)
            print('Text -> ', page.get_text())

            # Walk every annotation on the page; all of them are printed,
            # but only highlights contribute to the result.
            for annot in page.annots():
                print(annot)
                # annot.type[0] is the numeric annotation type code; the named
                # constant replaces the bare literal 8.
                if annot.type[0] == fitz.PDF_ANNOT_HIGHLIGHT:
                    # NOTE(review): "subject" is a metadata field of the
                    # annotation and may not contain the text lying under the
                    # highlight - confirm this is the value callers want.
                    highlighted_text.append(annot.info["subject"])

    return highlighted_text

# Usage example
if __name__ == "__main__":
    # Guarded so that importing this module does not try to open the sample
    # PDF or print anything; running the file as a script behaves as before.
    pdf_path = 'INPUT_FILE.pdf'
    highlighted_text = extract_highlighted_text(pdf_path)

    # Print one collected entry per line.
    for text in highlighted_text:
        print(text)

Read more about:

1. More About Extract Document Text

2. Mark Extracted Text

3. Annotations