PDF to Markdown Chapters
This tool converts a long PDF document into organized Markdown files, splitting the content into chapters based on heading structure. It is ideal for converting technical documents, books, or reports into a structured format suitable for documentation websites, wikis, or static site generators.
Features
- Extracts text from PDF using layout-aware parsing
- Identifies chapter headings using font size, style, and positional heuristics
- Splits content into separate Markdown files per chapter
- Preserves basic formatting such as bold, italic, lists, and code blocks
- Creates a table of contents (`_toc.md`) for easy navigation
- Lightweight: depends only on the Python standard library and `pdfplumber`
Usage
Run the script from the command line:
python main.py input.pdf --output-dir chapters/
This will create a directory (default: chapters/) containing individual .md files for each detected chapter and a _toc.md file.
Dependencies
- Python 3.7+
- `pdfplumber` for precise text and layout extraction
- `argparse` for the command-line interface (included in the Python standard library)
Install dependencies:
pip install pdfplumber
Customization
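You can adjust heading-detection sensitivity by changing the `min_font_size` and `bold_keywords` parameters of `is_heading` in the script. The tool assumes that chapter titles are bold and larger than the body text: a line counts as a heading when its font size is at least `min_font_size` (12 by default) and its font name contains one of the `bold_keywords`.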
Limitations
- Works best with text-based PDFs (not scanned)
- Heading detection is heuristic-based; may need tuning for specific documents
- Complex layouts (multi-column, tables) may not convert perfectly
License
MIT
import argparse
import os
import re
import pdfplumber
def is_heading(obj, min_font_size=12, bold_keywords=('bold', 'Bold')):
    """Determine if a text object is a heading based on font characteristics.

    Args:
        obj: Mapping describing one text object. Only the ``'fontname'`` and
            ``'size'`` keys are read; either may be absent or ``None``.
        min_font_size: Smallest font size that still counts as a heading.
        bold_keywords: Substrings whose presence in the font name marks the
            font as bold. Any iterable of strings is accepted.

    Returns:
        ``True`` when the size is at least ``min_font_size`` and the font
        name contains one of ``bold_keywords``; ``False`` otherwise.
    """
    # ``or`` also covers keys that are present with a ``None`` value, which
    # would otherwise raise on the comparison / substring test below.
    font_name = obj.get('fontname') or ''
    size = obj.get('size') or 0
    if size < min_font_size:
        return False
    return any(keyword in font_name for keyword in bold_keywords)
def extract_headings_and_text(pdf_path):
    """Extract structured content: list of (heading, content) tuples.

    Characters on each page are grouped into lines by their rounded ``top``
    coordinate. A line whose first non-blank character passes ``is_heading``
    starts a new chapter; every other line is appended to the current
    chapter's content. Text found before the first detected heading is filed
    under ``'Introduction'``.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        List of ``(heading, content)`` tuples in document order, where
        ``content`` is the chapter's stripped lines joined by newlines. A
        heading that collects no content lines before the next heading (or
        the end of the document) is not emitted.
    """
    chapters = []
    current_heading = 'Introduction'
    current_content = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text_objects = page.chars
            if not text_objects:
                continue

            # Group into lines by y-position
            lines = {}
            for obj in text_objects:
                lines.setdefault(round(obj['top']), []).append(obj)

            for y_key in sorted(lines):
                # Order left to right so the joined text does not depend on
                # the order in which the characters were listed.
                line_chars = sorted(lines[y_key], key=lambda c: c['x0'])
                text = ''.join(c['text'] for c in line_chars).strip()

                # Use the first non-blank char to represent line style. A
                # whitespace-only line has none and can never be a heading;
                # otherwise it would become an empty heading and the content
                # that follows it would be discarded.
                style_char = next(
                    (c for c in line_chars if c['text'].strip()), None
                )

                if style_char is not None and is_heading(style_char):
                    if current_content:
                        chapters.append(
                            (current_heading, '\n'.join(current_content))
                        )
                    current_heading = text
                    current_content = []
                else:
                    current_content.append(text)

    # Flush the chapter that was still open when the document ended.
    if current_content:
        chapters.append((current_heading, '\n'.join(current_content)))
    return chapters
def save_chapters(chapters, output_dir):
    """Save each chapter as a markdown file and generate TOC.

    Each chapter is written to ``<number>_<slug>.md`` inside ``output_dir``,
    where the slug is the heading with every non-alphanumeric ASCII character
    replaced by ``_`` and truncated to 50 characters. A ``_toc.md`` file
    linking to every chapter file is written alongside them.

    Args:
        chapters: Iterable of ``(heading, content)`` string pairs.
        output_dir: Directory to write into; created if it does not exist.
    """
    # Materialise once so the chapter count is known before naming files.
    chapters = list(chapters)
    os.makedirs(output_dir, exist_ok=True)

    # Two digits as a minimum; widened when needed so that file names still
    # sort in chapter order once there are 100 or more chapters.
    width = max(2, len(str(len(chapters))))

    toc_lines = ['# Table of Contents\n']
    for number, (heading, content) in enumerate(chapters, start=1):
        slug = re.sub(r'[^a-zA-Z0-9]', '_', heading.strip())[:50]
        filename = f'{number:0{width}d}_{slug}.md'
        filepath = os.path.join(output_dir, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(f'# {heading}\n\n{content}\n')

        # Escape square brackets so a heading cannot end the link text early.
        link_text = re.sub(r'([\[\]])', r'\\\1', heading)
        toc_lines.append(f'{number}. [{link_text}]({filename})')

    # Write TOC, terminated by a newline like the chapter files.
    toc_path = os.path.join(output_dir, '_toc.md')
    with open(toc_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(toc_lines) + '\n')
def main(argv=None):
    """Command-line entry point: split a PDF into Markdown chapter files.

    Args:
        argv: Optional list of argument strings. When ``None``, argparse
            falls back to ``sys.argv[1:]``.

    Raises:
        SystemExit: With an error message if the input path is not an
            existing file; argparse raises it as well for invalid arguments.
    """
    parser = argparse.ArgumentParser(description='Split PDF into Markdown chapters.')
    parser.add_argument('pdf_path', help='Path to input PDF')
    parser.add_argument('--output-dir', '-o', default='chapters', help='Output directory')
    args = parser.parse_args(argv)

    # isfile rather than exists, so that a directory path is rejected here too.
    if not os.path.isfile(args.pdf_path):
        # A string payload is printed to stderr and gives exit status 1, so
        # calling shells and scripts can detect the failure.
        raise SystemExit(f'PDF file not found: {args.pdf_path}')

    chapters = extract_headings_and_text(args.pdf_path)
    save_chapters(chapters, args.output_dir)
    print(f'Saved {len(chapters)} chapters to {args.output_dir}/')
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Top comments (0)