Accounts payable teams spend an average of 3.7 minutes manually processing each invoice. At 200 invoices per month, that's 12+ hours of data entry. Here's how to build an automated pipeline that brings this to under 10 seconds per document.
Pipeline Architecture
Email/SFTP/API → Receive → Extract → Validate → Enrich → Store → Notify
Each stage is independent and can fail gracefully without losing the document.
Stage 1: Document Ingestion
Accept invoices from multiple sources:
const fs = require('fs');
const path = require('path');
const express = require('express');
const multer = require('multer');
// Multer middleware for invoice uploads: files land in /tmp/invoices, are
// capped at 20MB each, and are accepted only when the original filename's
// extension (case-insensitive) is one of the listed document/image types.
const upload = multer({
  dest: '/tmp/invoices',
  limits: { fileSize: 20 * 1024 * 1024 }, // 20MB
  fileFilter(req, file, cb) {
    const extension = path.extname(file.originalname).toLowerCase();
    const isAccepted = ['.pdf', '.docx', '.xlsx', '.png', '.jpg'].includes(extension);
    // Passing `false` makes the middleware skip the file rather than raise an error.
    cb(null, isAccepted);
  },
});
// POST /api/invoices/upload
// Accepts up to 20 files in the `files` field, creates one queued job per
// accepted file, and responds with the job ids so the client can poll status.
app.post('/api/invoices/upload', upload.array('files', 20), async (req, res, next) => {
  // `req.files` is absent when the request carried no multipart file parts, and
  // empty when every uploaded file was rejected by the upload filter.
  const files = req.files ?? [];
  if (files.length === 0) {
    res.status(400).json({ error: 'No supported invoice files were uploaded' });
    return;
  }

  const jobs = files.map((file) => ({
    id: generateJobId(),
    path: file.path,
    filename: file.originalname,
    status: 'queued',
    // Initialise the retry counter so the job processor can increment it safely.
    attempts: 0,
  }));

  try {
    await queue.addBatch(jobs);
  } catch (err) {
    // A rejected promise inside an async handler must be handed to Express
    // explicitly; otherwise the request never receives a response.
    next(err);
    return;
  }

  res.json({ jobs: jobs.map(({ id, status }) => ({ id, status })) });
});
Stage 2: Extraction
/**
 * Sends the job's file to the ParseFlow extraction endpoint and returns the
 * parsed JSON response.
 *
 * Uses the runtime's global `fetch`, `FormData` and `Blob`.
 *
 * @param {{ path: string, filename: string }} job - Job whose file is uploaded.
 * @returns {Promise<object>} The JSON body returned by the extraction API.
 * @throws {Error} "Extraction failed: …" when the API responds with a non-2xx
 *   status; also rejects if the file cannot be read or the request fails.
 */
async function extractInvoiceData(job) {
  // The global FormData only accepts a Blob/File when a filename is supplied;
  // a ReadStream is rejected, so the file is read into memory first.
  const fileBytes = await fs.promises.readFile(job.path);

  const formData = new FormData();
  formData.append('file', new Blob([fileBytes]), job.filename);
  formData.append('fields', JSON.stringify([
    'invoice_number', 'invoice_date', 'due_date',
    'vendor_name', 'vendor_address', 'vendor_tax_id',
    'line_items', 'subtotal', 'tax_amount', 'total_amount',
    'currency', 'payment_terms',
  ]));

  const response = await fetch('https://parseflow.dev/api/extract', {
    method: 'POST',
    headers: { 'Authorization': `Bearer ${process.env.PARSEFLOW_KEY}` },
    body: formData,
  });

  if (!response.ok) {
    // Error bodies are not guaranteed to be JSON (e.g. a proxy's HTML/plain-text
    // 502), so read the text once and fall back to it when parsing fails.
    const raw = await response.text();
    let detail = raw || `HTTP ${response.status}`;
    try {
      detail = JSON.parse(raw).message ?? detail;
    } catch {
      // Body was not JSON; keep the raw text / status as the detail.
    }
    throw new Error(`Extraction failed: ${detail}`);
  }

  return response.json();
}
Stage 3: Validation
Never trust extracted data without validation:
/**
 * Checks extracted invoice data for required fields and arithmetic consistency.
 *
 * @param {object} data - Extracted invoice fields (invoice_number, vendor_name,
 *   total_amount, subtotal, tax_amount, line_items[].total).
 * @returns {{ valid: boolean, errors: string[] }} `valid` is true only when no
 *   error messages were collected.
 */
function validateInvoice(data) {
  const TOLERANCE = 0.02; // 2 cents tolerance for rounding
  const isAmount = (value) => typeof value === 'number' && Number.isFinite(value);
  const errors = [];

  // Required fields
  if (!data.invoice_number) errors.push('Missing invoice number');
  if (!data.vendor_name) errors.push('Missing vendor name');
  // A total of 0 is a real value (e.g. a fully credited invoice), so only
  // absent/empty/NaN totals count as missing.
  const total = data.total_amount;
  if (total == null || total === '' || Number.isNaN(total)) {
    errors.push('Missing total amount');
  }

  // Math validation
  if (data.line_items?.length > 0) {
    const lineTotal = data.line_items.reduce((sum, item) => sum + item?.total, 0);
    if (!isAmount(lineTotal) || !isAmount(data.subtotal)) {
      // Without this guard a missing value produces NaN, and every NaN
      // comparison is false, so the mismatch check would silently pass.
      errors.push('Cannot verify line items: subtotal or a line total is missing or non-numeric');
    } else if (Math.abs(lineTotal - data.subtotal) > TOLERANCE) {
      errors.push(`Line items sum (${lineTotal}) != subtotal (${data.subtotal})`);
    }
  }

  // Compare on numeric presence rather than truthiness so that a zero tax
  // amount (or zero subtotal) still gets its totals checked.
  if (isAmount(data.subtotal) && isAmount(data.tax_amount) && isAmount(data.total_amount)) {
    const expected = data.subtotal + data.tax_amount;
    if (Math.abs(expected - data.total_amount) > TOLERANCE) {
      errors.push(`Subtotal + tax (${expected}) != total (${data.total_amount})`);
    }
  }

  // Duplicate detection
  // (check against your DB for same invoice_number + vendor)

  return { valid: errors.length === 0, errors };
}
Stage 4: Enrichment
Match the vendor to your supplier database:
/**
 * Adds supplier defaults to the invoice, or flags it for manual review.
 *
 * Looks up the best supplier match for `data.vendor_name` through
 * `db.suppliers.findBestMatch`. The passed-in object is modified in place and
 * also returned.
 *
 * @param {object} data - Extracted invoice data; must carry `vendor_name`.
 * @returns {Promise<object>} The same `data` object, enriched or flagged.
 */
async function enrichInvoice(data) {
  const supplier = await db.suppliers.findBestMatch(data.vendor_name);

  // No match: hand the invoice to a human instead of guessing the supplier.
  if (!supplier) {
    data.requires_review = true;
    data.review_reason = 'Unknown vendor — manual matching required';
    return data;
  }

  // Copy the supplier's accounting and payment defaults onto the invoice.
  const {
    id,
    default_gl_account,
    default_cost_center,
    approver_email,
    preferred_payment_method,
  } = supplier;
  Object.assign(data, {
    supplier_id: id,
    gl_account: default_gl_account,
    cost_center: default_cost_center,
    approver_email,
    payment_method: preferred_payment_method,
  });
  return data;
}
Stage 5: Notifications
/**
 * Emails an approval request when an invoice exceeds the amount threshold or
 * has been flagged for review; otherwise does nothing.
 *
 * @param {object} invoice - Enriched invoice (total_amount, requires_review,
 *   approver_email, invoice_number, vendor_name).
 * @param {object} [options]
 * @param {number} [options.threshold=5000] - Amounts strictly above this need approval.
 * @param {string} [options.fallbackRecipient] - Address used when the invoice
 *   has no `approver_email`; defaults to the INVOICE_REVIEW_EMAIL environment
 *   variable.
 * @returns {Promise<void>}
 * @throws {Error} When approval is required but no recipient can be determined.
 */
async function notifyApprover(
  invoice,
  { threshold = 5000, fallbackRecipient = process.env.INVOICE_REVIEW_EMAIL } = {},
) {
  // Only for invoices above threshold or from unknown vendors
  const needsApproval = invoice.total_amount > threshold || invoice.requires_review;
  if (!needsApproval) return;

  // Invoices flagged for review may carry no approver address at all, so fall
  // back to a shared reviewer address instead of sending to `undefined`.
  // `||` (not `??`) so an empty string also counts as "no approver".
  const recipient = invoice.approver_email || fallbackRecipient;
  if (!recipient) {
    throw new Error(
      `No approver email for invoice ${invoice.invoice_number}: set approver_email or configure a fallback recipient`,
    );
  }

  await emailService.send({
    to: recipient,
    subject: `Invoice approval required: ${invoice.invoice_number} — ${invoice.vendor_name}`,
    template: 'invoice-approval',
    data: invoice,
  });
}
Error Handling and Dead Letter Queue
/**
 * Runs one job through extract → validate → enrich → store → notify and
 * records the outcome on the job.
 *
 * Resulting `job.status` values:
 *  - 'completed'         every stage succeeded
 *  - 'validation_failed' validation errors stored in `job.errors`, job sent to review
 *  - 'retry'             a stage threw; `job.retry_after` is set (attempts × 15 min)
 *  - 'dead_letter'       a stage threw on the 3rd attempt; ops are alerted
 *
 * The job is persisted with `db.jobs.update` on every path.
 *
 * @param {object} job - Mutable job record (id, path, filename, status, attempts).
 * @returns {Promise<void>}
 */
async function processJob(job) {
  const MAX_ATTEMPTS = 3;
  const RETRY_STEP_MINUTES = 15;

  try {
    job.status = 'processing';
    const extracted = await extractInvoiceData(job);

    const validation = validateInvoice(extracted);
    if (!validation.valid) {
      job.status = 'validation_failed';
      job.errors = validation.errors;
      await moveToReview(job);
      return; // the `finally` below still persists the job
    }

    const enriched = await enrichInvoice(extracted);
    await db.invoices.create({ ...enriched, job_id: job.id });
    // NOTE(review): if notification throws, the retry re-runs the whole pipeline
    // and creates the invoice again — confirm db.invoices.create de-duplicates
    // on job_id or make this step idempotent.
    await notifyApprover(enriched);
    job.status = 'completed';
  } catch (err) {
    // Jobs may arrive without an `attempts` counter; `undefined++` would give
    // NaN, which never reaches the limit and produces an invalid retry date.
    job.attempts = (job.attempts ?? 0) + 1;
    // Keep the failure reason on the job so retries are diagnosable.
    job.errors = [String(err?.message ?? err)];

    if (job.attempts >= MAX_ATTEMPTS) {
      job.status = 'dead_letter';
      await alertOps(job, err);
    } else {
      job.status = 'retry';
      job.retry_after = addMinutes(new Date(), job.attempts * RETRY_STEP_MINUTES);
    }
  } finally {
    // Persist on every path, including the validation-failure early return.
    await db.jobs.update(job);
  }
}
Results
A pipeline like this, using ParseFlow for the extraction stage, processes a typical invoice in 4-8 seconds with 94%+ field accuracy across variable formats. The validation stage catches the remaining edge cases and routes them to a human reviewer queue rather than silently accepting bad data.
The full pipeline handles PDF, Word, and Excel files, as well as PNG and JPG images, with the same code path — no special-casing per format.
Top comments (0)