Not all valuable data lives on web pages. Reports, invoices, research papers, and government filings often come as PDFs and documents. Python has excellent libraries for extracting structured data from these formats.
In this guide, I'll show you practical techniques for parsing PDFs, extracting tables, and handling scanned documents with OCR.
PDF Parsing Libraries
Python offers several PDF parsing options, each with different strengths:
| Library | Best For | Tables | OCR | Speed |
|---|---|---|---|---|
| PyPDF2 | Text extraction | No | No | Fast |
| pdfplumber | Tables & layout | Yes | No | Medium |
| Camelot | Table extraction | Yes | No | Medium |
| pytesseract | Scanned PDFs | No | Yes | Slow |
| pymupdf (fitz) | Full-featured | Yes | Yes | Fast |
Basic Text Extraction
For simple text PDFs, PyPDF2 or pymupdf works well:
import fitz # pymupdf
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A single string containing the text of all pages in order.
    """
    doc = fitz.open(pdf_path)
    try:
        # join() avoids quadratic string concatenation on large documents
        return "".join(page.get_text() for page in doc)
    finally:
        # Close the document even if text extraction raises
        doc.close()
# Usage: extract all text from the PDF and preview the first 500 characters
text = extract_text_from_pdf("report.pdf")
print(text[:500])
For more control over layout:
def extract_text_by_page(pdf_path):
    """Extract text page by page, restoring a sensible reading order.

    Text blocks on each page are sorted top-to-bottom, then left-to-right,
    which also handles simple multi-column layouts.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        List of dicts of the form {"page": 1-based page number,
        "text": newline-joined block text}.
    """
    doc = fitz.open(pdf_path)
    try:
        pages = []
        for page_num, page in enumerate(doc, start=1):
            # Each block tuple is (x0, y0, x1, y1, text, block_no, block_type)
            blocks = page.get_text("blocks")
            # Sort by vertical position (y0) first, then horizontal (x0)
            ordered = sorted(blocks, key=lambda b: (b[1], b[0]))
            pages.append({
                "page": page_num,
                "text": "\n".join(b[4].strip() for b in ordered),
            })
        return pages
    finally:
        # Release the document even if block extraction raises
        doc.close()
# Print a short preview of each extracted page
pages = extract_text_by_page("report.pdf")
for entry in pages:
    print(f"--- Page {entry['page']} ---")
    print(entry['text'][:200])
Extracting Tables from PDFs
pdfplumber is excellent for table extraction:
import pdfplumber
import csv
def extract_tables(pdf_path, output_csv=None):
    """Extract every table pdfplumber can find in a PDF.

    Args:
        pdf_path: Path to the PDF file.
        output_csv: Optional CSV path; when given, all tables are also
            written there, separated by blank rows.

    Returns:
        List of dicts of the form {"page": 1-based page number,
        "table_index": index of the table on that page,
        "headers": first extracted row, "rows": remaining rows}.
    """
    all_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            for table_idx, table in enumerate(page.extract_tables()):
                if not table:
                    continue  # skip empty detections
                all_tables.append({
                    "page": page_num,
                    "table_index": table_idx,
                    # assumes the first row is a header row — TODO confirm per document
                    "headers": table[0],
                    "rows": table[1:],
                })
    if output_csv and all_tables:
        # newline="" is required by the csv module; explicit utf-8 for portability
        with open(output_csv, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            for table in all_tables:
                writer.writerow(table["headers"])
                writer.writerows(table["rows"])
                writer.writerow([])  # blank-line separator between tables
    return all_tables
# Extract all tables (also writing them to output.csv) and summarize each one
tables = extract_tables("financial_report.pdf", "output.csv")
for tbl in tables:
    print(f"Page {tbl['page']}, Table {tbl['table_index']}:")
    print(f" Headers: {tbl['headers']}")
    print(f" Rows: {len(tbl['rows'])}")
OCR for Scanned Documents
Scanned PDFs contain images, not text. Use OCR to extract text:
import fitz
from PIL import Image
import pytesseract
import io
def ocr_pdf(pdf_path, language="eng", min_text_chars=50):
    """Extract text from a PDF, falling back to OCR for scanned pages.

    Direct text extraction is tried first (fast and accurate). A page whose
    embedded text is shorter than ``min_text_chars`` is assumed to be a
    scanned image and is rasterized at 300 DPI for Tesseract OCR.

    Args:
        pdf_path: Path to the PDF file.
        language: Tesseract language code (e.g. "eng").
        min_text_chars: Threshold below which a page is treated as scanned.

    Returns:
        List of dicts of the form {"page": 1-based page number,
        "text": extracted text, "method": "direct" or "ocr"}.
    """
    doc = fitz.open(pdf_path)
    try:
        full_text = []
        for page_num, page in enumerate(doc, start=1):
            # Always try direct extraction first — OCR is slow and less accurate
            text = page.get_text().strip()
            # Remember the decision instead of re-extracting the text later
            is_scanned = len(text) < min_text_chars
            if is_scanned:
                # 300 DPI keeps OCR accuracy acceptable; lower DPI produces garbage
                pix = page.get_pixmap(dpi=300)
                img = Image.open(io.BytesIO(pix.tobytes("png")))
                text = pytesseract.image_to_string(img, lang=language)
            full_text.append({
                "page": page_num,
                "text": text,
                "method": "ocr" if is_scanned else "direct",
            })
        return full_text
    finally:
        # Close the document even if rendering or OCR raises
        doc.close()
# Report which extraction method (direct or OCR) produced each page's text
results = ocr_pdf("scanned_document.pdf")
for r in results:
    print(f"Page {r['page']} ({r['method']}): {r['text'][:100]}...")
Parsing Structured Data from PDFs
For invoices, receipts, and forms, use regex patterns on extracted text:
import re
def parse_invoice_text(text):
    """Extract invoice fields from raw invoice text using regex patterns.

    Args:
        text: Plain text of an invoice document.

    Returns:
        Dict with any of "invoice_number", "date", "total", "email" that
        could be found; fields that don't match are simply absent.
    """
    invoice_data = {}
    # Invoice number: tolerates forms like "Invoice # 123" or "invoice: AB-42"
    inv_match = re.search(r"Invoice\s*#?\s*:?\s*(\w+[-]?\w+)", text, re.IGNORECASE)
    if inv_match:
        invoice_data["invoice_number"] = inv_match.group(1)
    # Numeric date with / or - separators (D/M/Y or M/D/Y — format not disambiguated here)
    date_match = re.search(r"Date\s*:?\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})", text)
    if date_match:
        invoice_data["date"] = date_match.group(1)
    # Total amount: strip thousands separators before converting to float
    total_match = re.search(r"Total\s*:?\s*\$?([\d,]+\.\d{2})", text, re.IGNORECASE)
    if total_match:
        invoice_data["total"] = float(total_match.group(1).replace(",", ""))
    # First email address found anywhere in the document
    email_match = re.search(r"[\w.+-]+@[\w-]+\.[\w.-]+", text)
    if email_match:
        invoice_data["email"] = email_match.group()
    return invoice_data


def parse_invoice(pdf_path):
    """Parse key fields (number, date, total, email) from an invoice PDF.

    Args:
        pdf_path: Path to the invoice PDF.

    Returns:
        Dict of extracted fields; see parse_invoice_text for the keys.
    """
    return parse_invoice_text(extract_text_from_pdf(pdf_path))
# Parse a sample invoice and print the extracted fields
invoice = parse_invoice("invoice_2024.pdf")
print(invoice)
Batch Processing Multiple PDFs
from pathlib import Path
import json
def batch_extract(pdf_directory, output_file="extracted_data.json"):
    """Run text and table extraction over every PDF in a directory.

    Per-file failures are recorded in the results instead of aborting the
    whole batch, so one corrupt PDF doesn't stop processing.

    Args:
        pdf_directory: Directory to scan (non-recursive) for *.pdf files.
        output_file: Path of the JSON summary written when done.

    Returns:
        List of per-file summary dicts; failed files get
        {"file": name, "error": message} entries instead.
    """
    pdf_dir = Path(pdf_directory)
    results = []
    # sorted() makes the processing order (and output) deterministic
    for pdf_file in sorted(pdf_dir.glob("*.pdf")):
        print(f"Processing: {pdf_file.name}")
        try:
            text = extract_text_from_pdf(str(pdf_file))
            tables = extract_tables(str(pdf_file))
            results.append({
                "file": pdf_file.name,
                "text_length": len(text),
                "tables_found": len(tables),
                "text_preview": text[:200],
            })
        except Exception as e:
            # Best-effort batch: record the failure and keep going
            results.append({
                "file": pdf_file.name,
                "error": str(e),
            })
    # Explicit encoding so the summary file is portable across platforms
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"Processed {len(results)} PDFs -> {output_file}")
    return results
batch_extract("/path/to/pdfs/")
Tips for Better Extraction
- Always try direct text extraction first — OCR is slow and less accurate
- Use high DPI (300+) for OCR — low resolution produces garbage
- Pre-process images before OCR — deskew, denoise, and threshold for better results
- Validate extracted data — use regex patterns to verify formats
- Handle multi-column layouts — sort text blocks by position for correct reading order
Scaling PDF Processing
For large-scale document processing, you may want to combine PDF extraction with web scraping to collect documents automatically. ScrapeOps provides monitoring and proxy management that can help you download PDFs at scale before processing them.
Conclusion
PDF and document parsing is a critical data engineering skill. Start with pymupdf for text, pdfplumber for tables, and pytesseract for scanned documents. Combine these tools with regex patterns to extract structured data from any document format.
Happy scraping!
Top comments (0)