You've found a table on a website. You need that data in a spreadsheet. The obvious path—copy, paste, clean up in Excel—works once. But what if you need this data weekly? Or from 50 different pages?
This guide shows you how to extract HTML tables programmatically with JavaScript, handle the edge cases that break naive approaches, and export to formats your tools actually accept.
The Naive Approach (And Why It Fails)
The simplest extraction looks like this:
/**
 * Naive table extraction: reads every row's cells in DOM order and returns
 * their trimmed text. Rowspan and colspan attributes are not consulted.
 *
 * @param {HTMLTableElement} table - Table whose `rows` / `cells` are read.
 * @returns {string[][]} One array of trimmed cell texts per table row.
 */
function extractTable(table) {
  const result = [];
  for (const rowEl of Array.from(table.rows)) {
    const texts = [];
    for (const cellEl of Array.from(rowEl.cells)) {
      texts.push(cellEl.textContent.trim());
    }
    result.push(texts);
  }
  return result;
}
This works for simple tables. It breaks immediately when you hit:
- Rowspan/colspan — Cells that span multiple rows or columns
- Nested tables — Tables inside table cells
- Hidden content — <style>, <script>, or display:none elements
- Special characters — Newlines, tabs, and quotes in cell content
Let's fix each one.
Handling Rowspan and Colspan
When a cell has rowspan="2", it occupies space in the current row AND the next row. A naive extractor sees fewer cells than expected and misaligns columns.
The solution: build a virtual grid that tracks occupied positions.
/**
 * Converts a table into a grid of cell texts, expanding rowspan/colspan so
 * that a spanning cell's trimmed text is repeated in every position it covers.
 *
 * Row spans are clipped to the last row of the table, so an oversized
 * rowspan (for example rowspan="5" in a two-row table) cannot create phantom
 * rows that do not exist in the markup.
 *
 * @param {HTMLTableElement} table - Table whose `rows` are read; each row's
 *   `cells` must expose `textContent`, `rowSpan` and `colSpan`.
 * @returns {string[][]} One array per table row. Rows can differ in length
 *   when the source table itself is ragged.
 */
function extractTableMatrix(table) {
  const rows = Array.from(table.rows);
  // Pre-create one grid row per table row; spans never reach beyond these.
  const grid = rows.map(() => []);

  rows.forEach((rowEl, rowIndex) => {
    let colIndex = 0;

    for (const cell of Array.from(rowEl.cells)) {
      // Skip columns already occupied by a rowspan from an earlier row.
      while (grid[rowIndex][colIndex] !== undefined) {
        colIndex++;
      }

      const text = cell.textContent.trim();
      const colSpan = Number.parseInt(cell.colSpan, 10) || 1;
      const requestedRowSpan = Number.parseInt(cell.rowSpan, 10) || 1;
      // Clip the span so it ends at the table's last row.
      const rowSpan = Math.min(requestedRowSpan, rows.length - rowIndex);

      // Fill the rectangular block this cell occupies, never overwriting a
      // position that an earlier cell has already claimed.
      for (let r = 0; r < rowSpan; r++) {
        const targetRow = grid[rowIndex + r];
        for (let c = 0; c < colSpan; c++) {
          if (targetRow[colIndex + c] === undefined) {
            targetRow[colIndex + c] = text;
          }
        }
      }

      colIndex += colSpan;
    }
  });

  return grid;
}
Now a table like this:
<table>
<tr><td rowspan="2">A</td><td>B</td></tr>
<tr><td>C</td></tr>
</table>
Correctly becomes:
[
["A", "B"],
["A", "C"] // "A" appears in both rows
]
Extracting Clean Text
textContent grabs everything—including CSS rules in <style> tags and JavaScript in <script> tags that some pages inject into table cells.
Clean extraction requires filtering:
/**
 * Returns a cell's text with <style>, <script>, <noscript>, <template> and
 * <link> descendants stripped and every whitespace run collapsed to a single
 * space. The work is done on a deep clone, so the cell passed in is not
 * modified.
 *
 * @param {Element|null|undefined} cell - Cell to read.
 * @returns {string} Cleaned text, or "" when `cell` is falsy.
 */
function extractCellText(cell) {
  if (!cell) {
    return "";
  }

  const detached = cell.cloneNode(true);

  // Drop elements whose text is never rendered
  for (const hidden of detached.querySelectorAll("style, script, noscript, template, link")) {
    hidden.remove();
  }

  // Collapse whitespace runs and trim the ends
  const rawText = detached.textContent || "";
  return rawText.replace(/\s+/g, " ").trim();
}
Detecting Nested Tables
When a table contains another table in a cell, you typically want the outer table's data, not a recursive mess.
Detection is straightforward:
/**
 * Reports whether `table` sits inside another table, by walking up its
 * `parentElement` chain looking for an element whose tagName is "TABLE".
 *
 * @param {Element} table - Table element to test.
 * @param {*} allTables - Not read by this function.
 * @returns {boolean} true when some ancestor is a TABLE, false otherwise.
 */
function isNestedTable(table, allTables) {
  for (let ancestor = table.parentElement; ancestor; ancestor = ancestor.parentElement) {
    if (ancestor.tagName === "TABLE") {
      return true; // Found an enclosing table
    }
  }
  return false;
}
// When scanning a page, keep only the tables that are not inside another table
const allTables = document.querySelectorAll("table");
const topLevelTables = [...allTables].filter(
  (candidate) => !isNestedTable(candidate, allTables)
);
Converting to CSV
CSV looks simple until you need to handle:
- Commas inside values
- Quotes inside values
- Newlines inside values
The RFC 4180-compliant approach:
/**
 * Serializes a two-dimensional array as delimiter-separated text.
 *
 * A field is wrapped in double quotes when it contains the delimiter, a
 * double quote, a carriage return or a line feed; embedded double quotes are
 * always doubled. null/undefined cells become empty fields. Records are
 * joined with CRLF.
 *
 * @param {Array<Array<*>>} rows - Rows of cell values; each value is stringified.
 * @param {string} [delimiter=","] - Field separator.
 * @returns {string} The serialized text, without a trailing line break.
 */
function toCSV(rows, delimiter = ",") {
  const forcesQuoting = /["\r\n]/;

  const encodeField = (value) => {
    const text = String(value == null ? "" : value);
    const doubled = text.split('"').join('""');
    if (text.includes(delimiter) || forcesQuoting.test(text)) {
      return `"${doubled}"`;
    }
    return doubled;
  };

  const encodeRecord = (row) => row.map((value) => encodeField(value)).join(delimiter);

  return rows.map((row) => encodeRecord(row)).join("\r\n");
}
This correctly handles the nightmare case:
// Example: the first field contains double quotes and a comma, so it is
// wrapped in quotes with its inner quotes doubled; the second field has no
// special characters and is emitted as-is.
toCSV([['Say "Hello, World"', "Normal"]])
// '"Say ""Hello, World""",Normal'
For a complete guide on CSV exports, see How to Export HTML Tables to CSV in Chrome.
Converting to JSON
For JSON export, the first row becomes keys:
/**
 * Serializes extracted rows as a pretty-printed JSON array of objects.
 *
 * The first row supplies the property names (each passed through
 * `sanitizeKey`); every following row becomes one object. When two columns
 * produce the same key — common after a colspan header has been expanded —
 * later occurrences get a numeric suffix (`name`, `name_2`, `name_3`, …) so
 * no column silently overwrites another.
 *
 * @param {Array<Array<*>>} rows - Header row followed by data rows.
 * @returns {string} JSON text indented by two spaces; "[]" when there are
 *   fewer than two rows (i.e. no data rows).
 */
function toJSON(rows) {
  if (rows.length < 2) return "[]";

  const usedKeys = new Set();
  const headers = rows[0].map((header, index) => {
    const baseKey = sanitizeKey(header, index);
    let key = baseKey;
    // Keep bumping the suffix until the key is unused; this also covers a
    // generated "name_2" clashing with a real "name_2" column.
    for (let suffix = 2; usedKeys.has(key); suffix++) {
      key = `${baseKey}_${suffix}`;
    }
    usedKeys.add(key);
    return key;
  });

  const objects = rows.slice(1).map((row) => {
    const obj = {};
    headers.forEach((key, i) => {
      // Short rows: missing cells become empty strings
      obj[key] = row[i] ?? "";
    });
    return obj;
  });

  return JSON.stringify(objects, null, 2);
}
/**
 * Turns a header cell into a lowercase snake_case object key.
 *
 * Accents are stripped (NFD decomposition, then removal of combining marks),
 * every run of characters outside a-z / 0-9 becomes a single underscore, and
 * leading/trailing underscores are dropped. If nothing usable remains — the
 * header is empty, null/undefined, or consists only of characters that get
 * stripped (e.g. "$$$" or non-Latin text) — a positional name is returned.
 *
 * @param {*} header - Header value; stringified before processing.
 * @param {number} index - Zero-based column index, used for the fallback.
 * @returns {string} The sanitized key, or `col_<index + 1>` as fallback.
 */
function sanitizeKey(header, index) {
  const key = String(header ?? "")
    .trim()
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "") // Remove combining accents
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "_")
    .replace(/^_+|_+$/g, "");

  // Apply the fallback after normalization so a header that sanitizes to ""
  // still yields a usable, column-specific key.
  return key || `col_${index + 1}`;
}
Input:
| Product Name | Price ($) |
|--------------|-----------|
| Widget | 29.99 |
Output:
[
{
"product_name": "Widget",
"price": "29.99"
}
]
Triggering the Download
In a browser context, you can trigger a download without a server:
/**
 * Offers `content` to the user as a file download, entirely client-side.
 *
 * Wraps the content in a Blob, points a temporary <a download> element at an
 * object URL for it, clicks that element programmatically, then revokes the
 * object URL.
 *
 * @param {BlobPart} content - Data to place in the file.
 * @param {string} filename - Name assigned to the anchor's `download` property.
 * @param {string} mimeType - MIME type given to the Blob.
 * @returns {void}
 */
function downloadFile(content, filename, mimeType) {
  const objectUrl = URL.createObjectURL(new Blob([content], { type: mimeType }));

  const anchor = Object.assign(document.createElement("a"), {
    href: objectUrl,
    download: filename,
  });
  anchor.click();

  URL.revokeObjectURL(objectUrl);
}
// Usage: expand the table into a grid, serialize the grid as CSV, then offer
// the result as "data.csv".
// NOTE: `table` is not defined in this snippet; it must already reference the
// table element to export.
const csv = toCSV(extractTableMatrix(table));
downloadFile(csv, "data.csv", "text/csv;charset=utf-8");
Putting It All Together
Here's a minimal bookmarklet that exports the first table on any page:
javascript:(function(){
  /* Exports the first <table> on the current page as "table.csv". */
  /* Only block comments are used here: if this code is collapsed onto a */
  /* single line when saved as a bookmark, a line comment would comment out */
  /* everything that follows it. */
  const table = document.querySelector("table");
  if (!table) { alert("No table found"); return; }

  /* Expands rowspan/colspan into a grid of trimmed cell texts, repeating a */
  /* spanning cell's text in every position it covers. Row spans are clipped */
  /* to the table's last row so an oversized rowspan adds no phantom rows. */
  function extractTableMatrix(table) {
    const rows = Array.from(table.rows);
    const grid = rows.map(() => []);
    rows.forEach((rowEl, ri) => {
      let ci = 0;
      Array.from(rowEl.cells).forEach(cell => {
        /* Skip columns already taken by a rowspan from an earlier row */
        while (grid[ri][ci] !== undefined) ci++;
        const text = cell.textContent.trim();
        const rs = Math.min(parseInt(cell.rowSpan, 10) || 1, rows.length - ri);
        const cs = parseInt(cell.colSpan, 10) || 1;
        for (let r = 0; r < rs; r++) {
          for (let c = 0; c < cs; c++) {
            if (grid[ri + r][ci + c] === undefined) grid[ri + r][ci + c] = text;
          }
        }
        ci += cs;
      });
    });
    return grid;
  }

  /* Quotes a field when it contains a comma, a double quote or a line */
  /* break, doubling any embedded double quotes (RFC 4180). */
  function csvField(value) {
    const s = String(value);
    return /[",\r\n]/.test(s) ? `"${s.replace(/"/g, '""')}"` : s;
  }

  const data = extractTableMatrix(table);
  const csv = data.map(row => row.map(c => csvField(c)).join(",")).join("\r\n");

  /* Hand the CSV to the browser as a download via a temporary link */
  const blob = new Blob([csv], {type: "text/csv"});
  const link = document.createElement("a");
  link.href = URL.createObjectURL(blob);
  link.download = "table.csv";
  link.click();
})();
When to Use a Browser Extension Instead
This code works, but maintaining it across different sites is tedious. If you're extracting tables regularly, a browser extension handles:
- Multiple tables per page
- Format selection (CSV, JSON, Excel)
- Data cleaning (number normalization, null handling)
- Column selection and reordering
I built HTML Table Exporter for exactly this workflow. The core algorithms are similar to what's shown here, packaged in a usable UI.
Learn more at gauchogrid.com/html-table-exporter or try it free on the Chrome Web Store.
Questions about table extraction edge cases? Drop a comment; I've probably hit it.
Top comments (0)