Most ebook pipelines produce one language output. A bilingual pipeline costs roughly 30% more in API time and reaches twice the addressable market. The Spanish-language technical book market is underserved relative to English — the competition is thinner, and search volume on KDP's .com.mx, .com.co, and .com.es storefronts is real.
Here is how to build one pipeline run that produces both book_en.epub and book_es.epub.
The Core Problem: Code Fences Must Survive Translation
Naive translation — pass the whole chapter to an LLM, ask for Spanish — will corrupt code blocks. Variable names get translated, comments change syntactically, and indentation sometimes breaks. The solution is fence-preserving translation: extract code blocks before translation, replace them with stable placeholders, translate the prose, then restore the blocks.
import re
import anthropic
# Matches one complete fenced code block: opening ``` with an optional
# language tag, the body (re.DOTALL lets `.` span newlines, `.*?` stays
# minimal so adjacent fences don't merge), and the closing ```.
# BUG FIX: the original raw string contained literal newlines
# (r"(```\n[\w]*\n.*?\n```)"), which required a newline *before* the
# language tag — so it never matched a normal fence like
# ```python\ncode\n``` and the whole fence-preserving step was a no-op.
FENCE_PATTERN = re.compile(r"(```[\w]*\n.*?```)", re.DOTALL)

def extract_fences(content: str) -> tuple[str, dict[str, str]]:
    """Replace code fences with stable placeholders.

    Returns:
        A (sanitized, fences) pair: `sanitized` is the Markdown with each
        fence replaced by a token like __CODE_BLOCK_0__, and `fences` maps
        each token back to the original fence text so restore_fences()
        can reinsert the code untouched after translation.
    """
    fences: dict[str, str] = {}
    counter = 0

    def replacer(match: re.Match) -> str:
        nonlocal counter
        token = f"__CODE_BLOCK_{counter}__"
        fences[token] = match.group(0)
        counter += 1
        return token

    sanitized = FENCE_PATTERN.sub(replacer, content)
    return sanitized, fences
def restore_fences(translated: str, fences: dict[str, str]) -> str:
    """Swap each placeholder token back for its original fenced code block."""
    restored = translated
    for placeholder, fence_text in fences.items():
        restored = restored.replace(placeholder, fence_text)
    return restored
def translate_chapter(
    client: anthropic.Anthropic,
    content: str,
    target_lang: str = "es",
    model: str = "claude-sonnet-4-5"
) -> str:
    """Fence-preserving translation of a Markdown chapter.

    Code fences are swapped for placeholder tokens before the prose is
    sent to the model, then restored verbatim afterwards, so code blocks
    are never altered by translation.

    Args:
        client: Anthropic API client.
        content: Markdown source of one chapter.
        target_lang: language key; only "es" has a prompt template here.
        model: Claude model id used for the translation call.

    Raises:
        ValueError: if `target_lang` has no prompt template defined.
    """
    lang_instructions = {
        "es": (
            "Translate the following technical Markdown text to Spanish (Latin American). "
            "Preserve all Markdown formatting. "
            "Translate inline code variable names only if they are comment text. "
            "Do not translate the placeholder tokens like __CODE_BLOCK_0__."
        )
    }
    # Fail fast with a clear message instead of the opaque KeyError the
    # original raised mid-call for unsupported languages.
    if target_lang not in lang_instructions:
        raise ValueError(
            f"Unsupported target language {target_lang!r}; "
            f"known languages: {sorted(lang_instructions)}"
        )
    sanitized, fences = extract_fences(content)
    # NOTE(review): max_tokens=4096 may truncate long chapters — confirm
    # chapter sizes or raise this limit for large inputs.
    response = client.messages.create(
        model=model,
        max_tokens=4096,
        messages=[{
            "role": "user",
            "content": f"{lang_instructions[target_lang]}\n\n---\n\n{sanitized}"
        }]
    )
    translated_sanitized = response.content[0].text
    return restore_fences(translated_sanitized, fences)
Semantic QA Layer
After translation, you need to verify the Spanish output is not just syntactically intact but semantically equivalent. In production, the lightweight approach is to embed both versions and check cosine similarity chapter by chapter; the code below approximates that with a model-rated equivalence score for illustration.
import numpy as np
def embed_text(client: "anthropic.Anthropic", text: str) -> str:
    """Produce a normalized 3-sentence English summary of `text`.

    BUG FIX: the original signature declared `-> list[float]`, but the
    function returns the model's summary TEXT, not a numeric vector.
    Feed the returned string to a real embedding model before calling
    cosine_similarity(); this heuristic alone is not an embedding.

    Args:
        client: Anthropic API client (annotation quoted so this module
            section stands alone without the anthropic import resolved).
        text: source text; only the first 3000 characters are summarized.
    """
    # Using a dedicated embedding model is faster and cheaper here.
    # This example uses a simple heuristic via Claude for illustration.
    response = client.messages.create(
        model="claude-haiku-4-5",
        max_tokens=256,
        messages=[{
            "role": "user",
            "content": (
                f"Summarize the following text in exactly 3 sentences in English, "
                f"preserving all technical concepts:\n\n{text[:3000]}"
            )
        }]
    )
    return response.content[0].text  # use with external embedder in production
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Cosine of the angle between two equal-length vectors (1.0 = identical direction)."""
    vec_a = np.array(a)
    vec_b = np.array(b)
    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return float((vec_a @ vec_b) / denom)
def qa_translation(
    client: anthropic.Anthropic,
    original: str,
    translated: str,
    threshold: float = 0.85
) -> tuple[bool, float]:
    """
    Heuristic QA: ask Claude to rate semantic equivalence on 0-1 scale.
    In production, use sentence-transformers or OpenAI embeddings.
    """
    # Both passages are truncated to 1500 chars to keep the QA call cheap.
    prompt = (
        "Rate the semantic equivalence of these two technical passages on a scale "
        "from 0.0 to 1.0. Reply with only a decimal number.\n\n"
        f"ORIGINAL:\n{original[:1500]}\n\n"
        f"TRANSLATION:\n{translated[:1500]}"
    )
    response = client.messages.create(
        model="claude-haiku-4-5",
        max_tokens=64,
        messages=[{"role": "user", "content": prompt}]
    )
    raw_reply = response.content[0].text.strip()
    try:
        score = float(raw_reply)
    except ValueError:
        # The model replied with something other than a bare number;
        # treat as a failed check so the chapter goes to human review.
        score = 0.0
    return score >= threshold, score
The Full Bilingual Pipeline
import subprocess
from pathlib import Path
import json
# Persisted pipeline state (per-chapter QA scores); lets re-runs resume
# without re-translating chapters that already passed.
CHECKPOINT = Path("checkpoint_bilingual.json")
def compile_epub(chapters_dir: Path, output: Path, lang: str, metadata: dict) -> Path:
    """Compile all `*_{lang}.md` chapters in `chapters_dir` into an EPUB3 via pandoc.

    Args:
        chapters_dir: directory holding per-language chapter Markdown files.
        output: destination EPUB path.
        lang: "en" or "es"; selects chapter files and the EPUB language metadata.
        metadata: must contain "title", "author", and f"cover_{lang}" keys.

    Returns:
        `output`, for convenient chaining.

    Raises:
        RuntimeError: if no chapter files are found, or pandoc exits non-zero.
    """
    chapter_files = sorted(chapters_dir.glob(f"*_{lang}.md"))
    # Guard: with no inputs pandoc would fail with a cryptic message (or
    # produce an empty book); fail early with an actionable error instead.
    if not chapter_files:
        raise RuntimeError(f"No '*_{lang}.md' chapter files found in {chapters_dir}")
    cmd = [
        "pandoc",
        "--from", "markdown+fenced_code_blocks",
        "--to", "epub3",
        "--metadata", f"title={metadata['title']}",
        "--metadata", f"author={metadata['author']}",
        "--metadata", f"lang={'en-US' if lang == 'en' else 'es-419'}",
        "--epub-cover-image", metadata[f"cover_{lang}"],
        "--toc",
        "-o", str(output),
        *[str(f) for f in chapter_files]
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(result.stderr)
    return output
def run_bilingual_pipeline(config: dict):
    """Translate every EN chapter to ES, QA each translation, then compile both EPUBs.

    Expected config shape (as read below):
        config["chapters"]: list of dicts each containing a "slug" key
        config["metadata"]: dict consumed by compile_epub ("title",
            "author", "cover_en", "cover_es")

    Side effects: reads/writes files under ./chapters, updates the
    CHECKPOINT JSON after every QA check, writes book_en.epub and
    book_es.epub in the current directory.
    """
    client = anthropic.Anthropic()
    # Resume support: per-chapter QA results persist across runs.
    state = json.loads(CHECKPOINT.read_text()) if CHECKPOINT.exists() else {}
    chapters_dir = Path("chapters")
    chapters_dir.mkdir(exist_ok=True)
    for chapter in config["chapters"]:
        slug = chapter["slug"]
        # English chapter (already generated by main pipeline)
        en_path = chapters_dir / f"{slug}_en.md"
        if not en_path.exists():
            print(f" Missing EN source for {slug}, skipping")
            continue
        es_path = chapters_dir / f"{slug}_es.md"
        qa_key = f"{slug}_qa"
        # Translate if not cached (an existing {slug}_es.md is trusted as-is;
        # its QA result is whatever the checkpoint recorded on a prior run)
        if not es_path.exists():
            print(f" Translating: {slug}")
            en_content = en_path.read_text()
            es_content = translate_chapter(client, en_content, target_lang="es")
            # QA check
            ok, score = qa_translation(client, en_content, es_content)
            print(f" QA score: {score:.2f} ({'PASS' if ok else 'REVIEW'})")
            state[qa_key] = {"score": score, "pass": ok}
            # Checkpoint is rewritten after every chapter so a crash
            # mid-run loses at most one QA result.
            CHECKPOINT.write_text(json.dumps(state, indent=2))
            if not ok:
                # Failed QA: park the draft as {slug}_es.REVIEW.md for a
                # human pass instead of publishing it.
                # NOTE(review): the skipped chapter is simply absent from
                # the ES EPUB compiled below — confirm that's intended.
                es_path.with_suffix(".REVIEW.md").write_text(es_content)
                print(f" Low QA score, saved for review: {slug}")
                continue
            es_path.write_text(es_content)
    # Compile both EPUBs
    metadata = config["metadata"]
    en_epub = compile_epub(chapters_dir, Path("book_en.epub"), "en", metadata)
    es_epub = compile_epub(chapters_dir, Path("book_es.epub"), "es", metadata)
    print(f"EN EPUB: {en_epub}")
    print(f"ES EPUB: {es_epub}")
Why Bilingual Doubles the Addressable Market
KDP has separate storefronts for Mexico (.com.mx), Colombia (.com.co), and Spain (.com.es). A Spanish EPUB with proper lang=es-419 metadata appears in those stores automatically. The English version targets .com (US/UK/AU/CA).
One pipeline run, two listings. Competition on Spanish-language technical titles is sparse — many technical niches have fewer than 10 direct competitors in Spanish versus hundreds in English.
Publishing Both Versions
# Gumroad: upload both files to the same product
# (buyers get access to both EPUBs in their library)
# BUG FIX: curl cannot mix -d (urlencoded body) with -F (multipart form)
# in one request — the original command errors out. Send the token as a
# multipart field too.
curl -X PUT "https://api.gumroad.com/v2/products/xhxkzz" \
  -F "access_token=$GUMROAD_TOKEN" \
  -F "url=@book_en.epub" \
  -F "url_es=@book_es.epub"

# KDP: two separate listings, cross-link in description
# EN listing: standard upload
# ES listing: set language to "Spanish", upload book_es.epub
For KDP, the Spanish listing is a separate ASIN. Link between them in the product descriptions. Some buyers purchase both.
Cost Overhead for Bilingual
- Translation API: ~$0.30–0.60 per book (Sonnet for translation, Haiku for QA, fence-preserving)
- QA passes: ~$0.10 per book
- Extra Pandoc compile: under 1 second
Total additional cost per bilingual book: under $1.00. The market expansion justifies it at any positive sales rate.
The complete bilingual pipeline — fence extraction, translation, semantic QA, dual EPUB compilation, and Gumroad + KDP multilingual listing workflow — is in the Python Ebook Automation Pipeline ($12.99, 30-day refund).
📋 Free: AI Publishing Checklist — 7 steps to ship a technical ebook with Python (PDF, free)
Full pipeline + 10 scripts: germy5.gumroad.com/l/xhxkzz — $12.99 launch price
Top comments (0)