Parsing HTML tables seems straightforward until you encounter real-world data. Wikipedia tables have navigation rows. Financial sites use complex rowspans. Sports statistics sites nest headers two levels deep.
After building HTML Table Exporter, a table extraction tool used on thousands of different sites, I've catalogued the edge cases that break most parsers. Here's how to handle each one.
Problem 1: Rowspan Expansion
A cell with rowspan="3" occupies vertical space in the current row and the next two rows. If you iterate through row.cells naively, your columns misalign.
The broken output:
| Country | 2020 | 2021 | 2022 | <- Header
| USA | 100 | 200 | 300 | <- Expected
| 150 | 250 | 350 | <- Missing "USA" (rowspan continued)
The fix: Track occupied positions in a virtual grid.
/**
 * Expand a table's rowspans and colspans into a rectangular grid of strings.
 *
 * Every cell's trimmed text is copied into each grid position the cell
 * covers, so a cell with rowspan="3" shows up in three consecutive rows.
 *
 * @param {HTMLTableElement} table - Object exposing `rows`; each row exposes
 *   `cells`, and each cell exposes `textContent`, `rowSpan` and `colSpan`.
 * @returns {string[][]} One array per entry in `table.rows`, all padded with
 *   "" to the width of the widest row. Returns [] for a table with no rows.
 */
function expandRowspans(table) {
  const rows = Array.from(table.rows);
  // Pre-allocate one grid row per real <tr>; the grid never grows beyond this.
  const grid = rows.map(() => []);

  rows.forEach((rowEl, rowIndex) => {
    let colIndex = 0;

    Array.from(rowEl.cells).forEach((cell) => {
      // Skip columns already claimed by a rowspan from an earlier row.
      while (grid[rowIndex][colIndex] !== undefined) {
        colIndex++;
      }

      const text = (cell.textContent ?? "").trim();
      // A span of 0 or NaN falls back to 1 (rowspan="0" is not given special
      // treatment here); negative values are also forced to 1.
      const rowSpan = Math.max(1, Number.parseInt(cell.rowSpan, 10) || 1);
      const colSpan = Math.max(1, Number.parseInt(cell.colSpan, 10) || 1);

      // Clamp the vertical span to the rows that actually exist. Without the
      // clamp, a rowspan that overshoots the table appends phantom rows.
      const endRow = Math.min(rowIndex + rowSpan, rows.length);
      for (let r = rowIndex; r < endRow; r++) {
        for (let c = 0; c < colSpan; c++) {
          grid[r][colIndex + c] = text;
        }
      }

      colIndex += colSpan;
    });
  });

  // Normalize row lengths. reduce() avoids spreading a huge array into
  // Math.max and yields 0 (not -Infinity) for an empty grid.
  const maxCols = grid.reduce((widest, row) => Math.max(widest, row.length), 0);
  return grid.map((row) =>
    Array.from({ length: maxCols }, (_, i) => row[i] ?? "")
  );
}
Key insight: The virtual grid is the source of truth. The DOM cells are just instructions for populating it.
Problem 2: Nested Tables
Wikipedia infoboxes often contain tables within table cells. A recursive approach extracts garbage:
<table>
<tr>
<td>Country</td>
<td>
<table> <!-- Nested! -->
<tr><td>Population</td><td>330M</td></tr>
</table>
</td>
</tr>
</table>
Detection strategy: Check if a table's ancestor is also a table.
/**
 * Report whether a table lives inside another table.
 *
 * Walks the `parentElement` chain upward from `table` and stops at the first
 * ancestor whose `tagName` is exactly "TABLE".
 *
 * @param {Element} table - The table element to test.
 * @returns {boolean} true if any ancestor is a TABLE element, false otherwise.
 */
function isNestedTable(table) {
  for (let ancestor = table.parentElement; ancestor; ancestor = ancestor.parentElement) {
    if (ancestor.tagName === "TABLE") return true;
  }
  return false;
}
// When scanning a page
/**
 * Collect every <table> in the current document that is not nested inside
 * another table, in document order.
 *
 * @returns {Element[]} The tables for which isNestedTable() returns false.
 */
function getTopLevelTables() {
  const topLevel = [];
  for (const candidate of Array.from(document.querySelectorAll("table"))) {
    if (!isNestedTable(candidate)) {
      topLevel.push(candidate);
    }
  }
  return topLevel;
}
But what about the nested table's content?
For the outer table, I flatten nested tables to their text content:
/**
 * Return a cell's visible text as a single whitespace-normalized line.
 *
 * Works on a deep clone, so the live DOM is never modified.
 *
 * NOTE(review): nested <table> elements are removed from the clone before
 * the text is read, so their text does NOT appear in the result. Confirm
 * that dropping nested-table content (rather than flattening it into the
 * parent cell) is the intended behavior.
 *
 * @param {Element} cell - The table cell to read.
 * @returns {string} Text with runs of whitespace collapsed to one space and
 *   leading/trailing whitespace removed; "" when there is no text.
 */
function extractCellText(cell) {
  const scratch = cell.cloneNode(true);

  // First nested tables, then elements whose text is never rendered.
  for (const selector of ["table", "style, script"]) {
    scratch.querySelectorAll(selector).forEach((node) => node.remove());
  }

  const rawText = scratch.textContent || "";
  return rawText.replace(/\s+/g, " ").trim();
}
Problem 3: Wikipedia Navigation Rows
Wikipedia tables often start with a navigation row:
| v t e List of countries by population |
| Rank | Country | Population |
| 1 | China | 1.4B |
That "v t e" row (View/Talk/Edit links) isn't data—it's UI. A parser that treats it as the header row produces garbage.
For a practical guide on handling Wikipedia tables, see Export Wikipedia Tables to Excel in 30 Seconds.
Detection:
/**
 * Decide whether a row looks like a Wikipedia "v t e" navigation row.
 *
 * Only the first cell is inspected. The "v t e" pattern accepts either
 * trailing text ("v t e List of ...") or the end of the string, so a trimmed
 * cell that contains nothing but "v t e" is also recognized.
 *
 * @param {string[]} row - Cell texts of one table row.
 * @returns {boolean} true if the first cell matches a known nav pattern;
 *   false otherwise, including for a missing or empty row.
 */
function isWikipediaNavRow(row) {
  const firstCell = String(row?.[0] ?? "");

  // Common patterns for nav rows
  const patterns = [
    /^v\s+t\s+e(?:\s|$)/i, // "v t e" or "v t e <title>"
    /^\s*v\s*\|\s*t\s*\|\s*e/i, // "v | t | e"
    /^\[v\]\s*\[t\]\s*\[e\]/i, // "[v] [t] [e]"
  ];

  return patterns.some((pattern) => pattern.test(firstCell));
}
/**
 * Locate the header row of an expanded table matrix.
 *
 * Scans at most the first three rows, never the last row, for a Wikipedia
 * navigation row. The row right after the first one found is the header.
 *
 * @param {string[][]} matrix - Table rows as arrays of cell text.
 * @returns {number} Index of the header row; 0 when no nav row is found.
 */
function detectHeaderRowIndex(matrix) {
  const scanLimit = Math.min(3, matrix.length - 1);

  let candidate = 0;
  while (candidate < scanLimit) {
    if (isWikipediaNavRow(matrix[candidate])) {
      return candidate + 1; // Header is the next row
    }
    candidate += 1;
  }

  return 0; // Default: first row is header
}
Problem 4: Title Rows (Spanning All Columns)
Some tables have a title row that spans the entire width:
<table>
<tr><td colspan="4">Quarterly Revenue ($ millions)</td></tr>
<tr><td>Q1</td><td>Q2</td><td>Q3</td><td>Q4</td></tr>
<tr><td>100</td><td>120</td><td>115</td><td>130</td></tr>
</table>
After colspan expansion, the first row becomes ["Quarterly Revenue...", "Quarterly Revenue...", ...]—the same value repeated.
Detection:
/**
 * Decide whether `row` is a full-width title row sitting above the headers.
 *
 * A title row has these characteristics:
 *   1. Only one unique non-blank value (repeated via colspan).
 *   2. The next row has more than two unique non-blank values (the headers).
 *   3. The first cell is longer than `minTitleLength` characters.
 *
 * Always returns a strict boolean. With the default threshold, a title of
 * exactly 30 characters (e.g. "Quarterly Revenue ($ millions)") is NOT
 * treated as a title row; pass a smaller `minTitleLength` to loosen this.
 *
 * @param {string[]} row - Candidate title row (cell texts).
 * @param {string[]} nextRow - The row that follows it.
 * @param {number} [minTitleLength=30] - The first cell must be strictly
 *   longer than this many characters.
 * @returns {boolean} true if `row` looks like a title row.
 */
function isTitleRow(row, nextRow, minTitleLength = 30) {
  if (!row || !nextRow) return false;

  const uniqueValues = new Set(row.filter((v) => v.trim()));
  const nextUniqueValues = new Set(nextRow.filter((v) => v.trim()));
  const firstCell = row[0] ?? "";

  return (
    uniqueValues.size === 1 &&
    nextUniqueValues.size > 2 &&
    firstCell.length > minTitleLength
  );
}
Problem 5: Grouped Column Headers (FBREF Style)
Sports statistics sites like FBREF use two-level headers:
| | | Playing Time | Performance |
| Player | Nation | MP | Starts | Min | Gls | Ast | xG |
| Haaland| Norway | 35 | 33 | 2950| 36 | 8 | 32 |
The first row contains group names. The second row contains actual column names. Both are "headers."
The challenge: After colspan expansion, row 0 looks like:
["", "", "Playing Time", "Playing Time", "Playing Time", "Performance", "Performance", "Performance"]
Detection heuristics:
/**
 * Heuristically detect a group-header row (the upper level of a two-level
 * header) sitting above `nextRow`.
 *
 * The row qualifies when more than 40% of its adjacent cell pairs carry the
 * same non-empty value and `nextRow` has more distinct non-blank values.
 *
 * @param {string[]} row - Candidate group-header row.
 * @param {string[]} nextRow - The row below it; must have the same length.
 * @returns {boolean} true if `row` looks like a group-header row.
 */
function isGroupHeaderRow(row, nextRow) {
  if (!row || !nextRow) return false;
  if (row.length !== nextRow.length) return false;

  // Number of cells that repeat the (non-empty) value of their left neighbor.
  const adjacentRepeats = row.reduce(
    (count, value, i) => (i > 0 && value && value === row[i - 1] ? count + 1 : count),
    0
  );
  const repeatShare = adjacentRepeats / (row.length - 1);

  const countDistinct = (cells) => new Set(cells.filter((v) => v.trim())).size;
  const distinctHere = countDistinct(row);
  const distinctBelow = countDistinct(nextRow);

  return repeatShare > 0.4 && distinctBelow > distinctHere;
}
Merging group + sub-headers:
/**
 * Combine a group-header row and the sub-header row beneath it into one
 * header row, column by column.
 *
 * For each column: a blank group yields the sub-header, a blank sub-header
 * yields the group, a case-insensitive match yields the sub-header alone, and
 * anything else yields "<group> - <sub>".
 *
 * @param {string[]} groupRow - Upper-level (group) header cells.
 * @param {string[]} subHeaderRow - Lower-level header cells.
 * @returns {string[]} Merged headers, one per entry of `subHeaderRow`.
 */
function mergeGroupAndSubHeaders(groupRow, subHeaderRow) {
  return subHeaderRow.map((rawSub, column) => {
    const groupLabel = (groupRow[column] || "").trim();
    const subLabel = (rawSub || "").trim();

    if (groupLabel && subLabel) {
      const sameLabel = subLabel.toLowerCase() === groupLabel.toLowerCase();
      return sameLabel ? subLabel : `${groupLabel} - ${subLabel}`;
    }

    // At most one of the two is non-empty here.
    return groupLabel || subLabel;
  });
}
// Result: ["Player", "Nation", "Playing Time - MP", "Playing Time - Starts", ...]
Problem 6: Horizontally Duplicated Tables
Wikipedia population tables often have this structure:
| Rank | Name | Pop | Rank | Name | Pop |
| 1 | Tokyo | 37M | 11 | Paris | 11M |
| 2 | Delhi | 32M | 12 | Cairo | 10M |
This is ONE logical table displayed in two columns to save vertical space.
Detection:
/**
 * Detect a header row whose right half repeats its left half, i.e. one
 * logical table laid out twice side by side.
 *
 * Detection requires:
 *   - an even number of headers. With an odd count the trailing column
 *     belongs to neither half and would be dropped by a later split;
 *   - at least two columns per half;
 *   - at least one non-blank header, since all-blank headers carry no
 *     evidence of duplication;
 *   - a case-insensitive match, ignoring surrounding whitespace, between
 *     each header and its counterpart in the other half.
 *
 * @param {string[]} headers - Header cell texts.
 * @returns {{detected: true, repeatCount: number, baseColumns: number}|null}
 *   Description of the duplication, or null when none is detected.
 */
function detectHorizontalDuplication(headers) {
  if (!Array.isArray(headers) || headers.length % 2 !== 0) return null;

  const half = headers.length / 2;
  if (half < 2) return null;

  const normalize = (header) => String(header ?? "").trim().toLowerCase();
  const firstHalf = headers.slice(0, half).map(normalize);
  const secondHalf = headers.slice(half).map(normalize);

  if (!firstHalf.some((header) => header !== "")) return null;

  // Check if second half matches first half
  const matches = firstHalf.every((header, i) => header === secondHalf[i]);
  return matches ? { detected: true, repeatCount: 2, baseColumns: half } : null;
}
Normalization: Split each row and stack vertically:
/**
 * Turn a horizontally duplicated table into a single tall table.
 *
 * Each data row is cut into `repeatCount` segments of `baseColumns` cells.
 * Segments are stacked block by block: every row's first segment, then every
 * row's second segment, and so on. This keeps the reading order of a table
 * laid out as 1..10 on the left and 11..20 on the right. Interleaving the
 * halves row by row would produce 1, 11, 2, 12, ...
 *
 * First-block segments are always kept. Segments from later blocks are
 * dropped when every cell in them is blank (padding in a shorter last block).
 *
 * @param {string[][]} matrix - Header row followed by data rows.
 * @param {number} baseColumns - Column count of one logical table copy.
 * @param {number} [repeatCount=2] - Number of side-by-side copies.
 * @returns {string[][]} Header (first `baseColumns` cells) plus the stacked
 *   rows; [] when `matrix` is empty.
 */
function normalizeHorizontallyDuplicatedTable(matrix, baseColumns, repeatCount = 2) {
  if (matrix.length === 0) return [];

  const header = matrix[0].slice(0, baseColumns);
  const dataRows = matrix.slice(1);
  const normalizedRows = [header];

  for (let block = 0; block < repeatCount; block++) {
    const start = block * baseColumns;
    for (const row of dataRows) {
      const segment = row.slice(start, start + baseColumns);
      const hasContent = segment.some((cell) => String(cell ?? "").trim() !== "");
      if (block === 0 || hasContent) {
        normalizedRows.push(segment);
      }
    }
  }

  return normalizedRows;
}
The Combined Algorithm
Real-world parsing requires checking all these cases in sequence:
/**
 * Convert a table element into a clean matrix whose first row is the header.
 *
 * Applies the edge-case handlers in sequence: span expansion, nav-row and
 * title-row skipping, group-header merging, and horizontal de-duplication.
 *
 * @param {HTMLTableElement} table - The table to parse.
 * @returns {string[][]} Header row followed by data rows; an empty array if
 *   nothing is left after skipping leading rows.
 */
function parseTable(table) {
  // 1. Expand rowspans/colspans to virtual grid
  let matrix = expandRowspans(table);

  // 2. Detect and skip nav/title rows
  const headerIndex = detectHeaderRowIndex(matrix);
  if (headerIndex > 0) {
    matrix = matrix.slice(headerIndex);
  }
  if (isTitleRow(matrix[0], matrix[1])) {
    matrix = matrix.slice(1);
  }

  // Nothing left to interpret; the steps below assume matrix[0] exists.
  if (matrix.length === 0) return matrix;

  // 3. Handle grouped headers (FBREF style)
  if (isGroupHeaderRow(matrix[0], matrix[1])) {
    const mergedHeaders = mergeGroupAndSubHeaders(matrix[0], matrix[1]);
    matrix = [mergedHeaders, ...matrix.slice(2)];
  }

  // 4. Handle horizontal duplication
  const duplication = detectHorizontalDuplication(matrix[0]);
  if (duplication) {
    matrix = normalizeHorizontallyDuplicatedTable(matrix, duplication.baseColumns);
  }

  return matrix;
}
Testing These Edge Cases
Every pattern above came from a real bug report. I maintain a test suite with HTML fixtures for each:
// Test: Wikipedia-style nav row
// Fixture: a three-column table whose first row is a single colspan="3"
// cell starting with "v t e", followed by a header row and one data row.
const navRowHtml = `
<table>
<tr><td colspan="3">v t e Countries</td></tr>
<tr><td>Rank</td><td>Country</td><td>Pop</td></tr>
<tr><td>1</td><td>China</td><td>1.4B</td></tr>
</table>
`;
// NOTE(review): parseHtml is not defined in this snippet; presumably it turns
// the HTML string into a table element that parseTable accepts. Confirm.
const result = parseTable(parseHtml(navRowHtml));
// The nav row must be skipped, so row 0 of the result is the "Rank" header row.
assert(result[0][0] === "Rank"); // Header correctly identified
// Row 1 is then the first data row; its second column holds the country name.
assert(result[1][1] === "China"); // Data correctly aligned
The test suite has 24 cases covering combinations of these patterns. New bug reports become new test cases.
Try It Yourself
If you're building table extraction, I hope this saves you debugging time. If you just need to export tables without writing code, HTML Table Exporter handles all these cases automatically.
Learn more at gauchogrid.com/html-table-exporter or try it free on the Chrome Web Store.
Found a table that breaks your parser? Share the URL; I collect these edge cases.
Top comments (0)