🛠️ pdf_to_markdown_chapters: Splits a long PDF into Markdown chapters by headings

#pdf #markdown #automation

PDF to Markdown Chapters

This tool converts a long PDF document into organized Markdown files, splitting the content into chapters based on heading structure. It is ideal for converting technical documents, books, or reports into a structured format suitable for documentation websites, wikis, or static site generators.

Features

Extracts text from PDF using layout-aware parsing
Identifies chapter headings using font size, style, and positional heuristics
Splits content into separate Markdown files per chapter
Preserves basic formatting such as bold, italic, lists, and code blocks
Creates a table of contents (_toc.md) for easy navigation
Lightweight: the only third-party dependency is pdfplumber; everything else comes from the Python standard library

Usage

Run the script from the command line:

python main.py input.pdf --output-dir chapters/

This will create a directory (default: chapters/) containing individual .md files for each detected chapter and a _toc.md file.

Dependencies

Python 3.7+
pdfplumber for precise text and layout extraction
argparse for the command-line interface (part of the Python standard library; no installation needed)

Install dependencies:

pip install pdfplumber

Customization

You can adjust heading detection sensitivity by changing the default values of the min_font_size and bold_keywords parameters of is_heading() in the script. The tool assumes that chapter titles are set in a bold font and are at least as large as min_font_size (12 by default).

Limitations

Works best with text-based PDFs (not scanned)
Heading detection is heuristic-based; may need tuning for specific documents
Complex layouts (multi-column, tables) may not convert perfectly

License

MIT

import argparse
import os
import re
import pdfplumber


def is_heading(obj, min_font_size=12, bold_keywords=('bold', 'Bold')):
    """Return True when a character's font marks it as heading text.

    A character qualifies when its font size is at least ``min_font_size``
    and its font name contains at least one of ``bold_keywords``.

    Args:
        obj: Mapping describing one character. Only the ``'fontname'`` and
            ``'size'`` keys are read; a missing or ``None`` value is treated
            as an empty font name / a size of 0, so such characters are
            never headings.
        min_font_size: Smallest font size accepted for a heading.
        bold_keywords: Iterable of substrings; finding any of them in the
            font name marks the font as bold. The match is case-sensitive,
            which is why both spellings are listed by default.

    Returns:
        bool: True if the character looks like heading text, else False.
    """
    # `or` also covers keys that are present but set to None, which would
    # otherwise raise TypeError in the comparisons below.
    font_name = obj.get('fontname') or ''
    size = obj.get('size') or 0
    if size < min_font_size:
        return False
    return any(keyword in font_name for keyword in bold_keywords)


def extract_headings_and_text(pdf_path):
    """Split a PDF's text into chapters keyed by detected heading lines.

    Every page's characters are grouped into lines by their rounded ``top``
    coordinate. A line whose first (left-most) character satisfies
    ``is_heading`` starts a new chapter; every other line is appended to the
    body of the current chapter.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        list[tuple[str, str]]: ``(heading, content)`` pairs in document
        order, where ``content`` is the chapter's lines joined by newlines.
        Text that appears before the first detected heading is filed under
        ``'Introduction'``. A heading that has no body lines before the next
        heading (or the end of the document) is not emitted.
    """
    chapters = []
    current_heading = 'Introduction'
    current_content = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text_objects = page.chars
            if not text_objects:
                continue

            # Group characters into lines by rounded y-position.
            lines = {}
            for obj in text_objects:
                lines.setdefault(round(obj['top']), []).append(obj)

            for y_key in sorted(lines):
                # Order left-to-right so the assembled text and the style
                # probe below do not depend on the order of page.chars.
                line_chars = sorted(lines[y_key], key=lambda c: c['x0'])
                text = ''.join(c['text'] for c in line_chars).strip()

                # The left-most character stands in for the whole line's style.
                if is_heading(line_chars[0]):
                    if not text:
                        # Heading-styled whitespace is not a title. Treating it
                        # as one would blank the current heading and cause the
                        # body collected afterwards to be discarded.
                        continue
                    if current_content:
                        chapters.append((current_heading, '\n'.join(current_content)))
                    current_heading = text
                    current_content = []
                else:
                    current_content.append(text)

    # Flush the chapter that was still open when the document ended.
    if current_content:
        chapters.append((current_heading, '\n'.join(current_content)))
    return chapters


def save_chapters(chapters, output_dir):
    """Write one Markdown file per chapter plus a ``_toc.md`` index.

    Args:
        chapters: Sequence of ``(heading, content)`` pairs.
        output_dir: Destination directory; created if it does not exist.

    Each chapter file is named ``NN_<slug>.md``, where ``NN`` is the 1-based
    chapter number padded to two digits and ``<slug>`` is the stripped
    heading with every non-alphanumeric ASCII character replaced by ``_``,
    cut to 50 characters. The index lists the chapters as a numbered list of
    links to those files.
    """
    os.makedirs(output_dir, exist_ok=True)

    toc_entries = []
    for number, (title, body) in enumerate(chapters, start=1):
        slug = re.sub(r"[^a-zA-Z0-9]", "_", title.strip())[:50]
        md_name = f'{number:02d}_{slug}.md'
        with open(os.path.join(output_dir, md_name), 'w', encoding='utf-8') as out:
            out.write(f'# {title}\n\n{body}\n')
        toc_entries.append(f'{number}. [{title}]({md_name})')

    # The title line carries its own newline, so joining leaves a blank
    # line between the title and the first entry.
    toc_text = '\n'.join(['# Table of Contents\n', *toc_entries])
    with open(os.path.join(output_dir, '_toc.md'), 'w', encoding='utf-8') as out:
        out.write(toc_text)


def main():
    """Command-line entry point: split a PDF into Markdown chapter files.

    Parses ``pdf_path`` and ``--output-dir``/``-o`` (default ``chapters``)
    from the command line, extracts the chapters and writes them to the
    output directory, then prints how many chapters were saved.

    Raises:
        SystemExit: If ``pdf_path`` is not an existing file. The message is
            written to stderr and the process exits with a non-zero status,
            so shells and scripts can detect the failure.
    """
    parser = argparse.ArgumentParser(description='Split PDF into Markdown chapters.')
    parser.add_argument('pdf_path', help='Path to input PDF')
    parser.add_argument('--output-dir', '-o', default='chapters', help='Output directory')
    args = parser.parse_args()

    # isfile (rather than exists) also rejects a directory passed by mistake.
    if not os.path.isfile(args.pdf_path):
        # SystemExit with a string prints it to stderr and exits with status 1.
        raise SystemExit(f'PDF file not found: {args.pdf_path}')

    chapters = extract_headings_and_text(args.pdf_path)
    save_chapters(chapters, args.output_dir)
    print(f'Saved {len(chapters)} chapters to {args.output_dir}/')


if __name__ == '__main__':
    main()

DEV Community