Automating Web Table Exports with JavaScript: A Practical Guide

#javascript #datascience #webdev #tutorial

You've found a table on a website. You need that data in a spreadsheet. The obvious path—copy, paste, clean up in Excel—works once. But what if you need this data weekly? Or from 50 different pages?

This guide shows you how to extract HTML tables programmatically with JavaScript, handle the edge cases that break naive approaches, and export to formats your tools actually accept.

The Naive Approach (And Why It Fails)

The simplest extraction looks like this:

/**
 * Naive table extraction: one array per row, one trimmed string per cell.
 *
 * Takes the cells exactly as they appear in the markup, so rowspan/colspan,
 * nested tables and hidden content are NOT accounted for.
 *
 * @param {{rows: ArrayLike<{cells: ArrayLike<{textContent: string}>}>}} table
 *   A table element (or any object exposing the same `rows`/`cells` shape).
 * @returns {string[][]} Trimmed text of every cell, row by row.
 */
function extractTable(table) {
  const readCell = (cell) => cell.textContent.trim();
  const readRow = (row) => Array.from(row.cells, readCell);
  return Array.from(table.rows, readRow);
}

This works for simple tables. It breaks immediately when you hit:

Rowspan/colspan — Cells that span multiple rows or columns
Nested tables — Tables inside table cells
Hidden content — <style>, <script>, or display:none elements
Special characters — Newlines, tabs, and quotes in cell content

Let's fix each one.

Handling Rowspan and Colspan

When a cell has rowspan="2", it occupies space in the current row AND the next row. A naive extractor sees fewer cells than expected and misaligns columns.

The solution: build a virtual grid that tracks occupied positions.

/**
 * Expand a table into a grid of cell texts in which every cell that spans
 * several rows or columns is copied into each position it occupies.
 *
 * The result always has exactly one grid row per table row: a rowspan that
 * reaches past the last row is clamped instead of creating phantom rows.
 *
 * @param {{rows: ArrayLike<{cells: ArrayLike<object>}>}} table
 *   A table element; each cell is read through `textContent`, `rowSpan`
 *   and `colSpan`.
 * @returns {string[][]} Grid of trimmed cell texts. Rows may differ in length
 *   when the markup itself is ragged.
 */
function extractTableMatrix(table) {
  const rows = Array.from(table.rows);
  // One grid row per <tr>, created up front so spans never add rows on demand.
  const grid = rows.map(() => []);

  rows.forEach((rowEl, rowIndex) => {
    let colIndex = 0;

    for (const cell of Array.from(rowEl.cells)) {
      // Skip columns already occupied by rowspans from earlier rows
      while (grid[rowIndex][colIndex] !== undefined) {
        colIndex++;
      }

      const text = cell.textContent.trim();
      const declaredRowSpan = Number.parseInt(cell.rowSpan, 10) || 1;
      // Never let a span run past the last row of the table.
      const rowSpan = Math.min(declaredRowSpan, rows.length - rowIndex);
      const colSpan = Number.parseInt(cell.colSpan, 10) || 1;

      // Fill the rectangular block this cell occupies; positions already
      // claimed by another cell are left untouched.
      for (let r = 0; r < rowSpan; r++) {
        const targetRow = grid[rowIndex + r];
        for (let c = 0; c < colSpan; c++) {
          if (targetRow[colIndex + c] === undefined) {
            targetRow[colIndex + c] = text;
          }
        }
      }

      colIndex += colSpan;
    }
  });

  return grid;
}

Now a table like this:

<table>
  <tr><td rowspan="2">A</td><td>B</td></tr>
  <tr><td>C</td></tr>
</table>

Correctly becomes:

[
  ["A", "B"],
  ["A", "C"]  // "A" appears in both rows
]

Extracting Clean Text

textContent grabs everything—including CSS rules in <style> tags and JavaScript in <script> tags that some pages inject into table cells.

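Clean extraction requires filtering out these non-rendered tags. Note that a tag-based filter does not catch elements hidden with CSS such as display:none; detecting those requires checking computed styles on the live element rather than on a detached clone: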

/**
 * Return the human-readable text of a table cell.
 *
 * Works on a deep clone so the page is never modified, drops tags whose
 * content is not meant to be read (style, script, noscript, template, link),
 * and collapses every run of whitespace into a single space.
 *
 * @param {Element|null|undefined} cell - The cell to read.
 * @returns {string} Cleaned text, or "" when no cell is given.
 */
function extractCellText(cell) {
  if (!cell) {
    return "";
  }

  const copy = cell.cloneNode(true);

  // Strip elements that carry no visible text before reading textContent
  for (const hidden of copy.querySelectorAll("style, script, noscript, template, link")) {
    hidden.remove();
  }

  const rawText = copy.textContent || "";
  return rawText.replace(/\s+/g, " ").trim();
}

Detecting Nested Tables

When a table contains another table in a cell, you typically want the outer table's data, not a recursive mess.

Detection is straightforward:

/**
 * Tell whether a table sits inside another table.
 *
 * Walks up the `parentElement` chain and reports true as soon as an
 * ancestor whose `tagName` is "TABLE" is found.
 *
 * @param {Element} table - The table to test.
 * @param {*} allTables - Not used by the check; kept so existing callers
 *   that pass it keep working.
 * @returns {boolean} true when some ancestor is a table, false otherwise.
 */
function isNestedTable(table, allTables) {
  for (
    let ancestor = table.parentElement;
    ancestor;
    ancestor = ancestor.parentElement
  ) {
    if (ancestor.tagName === "TABLE") {
      return true;
    }
  }
  return false;
}

// When scanning a page, keep only the tables that are not inside another table
const allTables = document.querySelectorAll("table");
const topLevelTables = [...allTables].filter(
  (candidate) => !isNestedTable(candidate, allTables)
);

Converting to CSV

CSV looks simple until you need to handle:

Commas inside values
Quotes inside values
Newlines inside values

The RFC 4180-compliant approach:

/**
 * Serialize a matrix of values as CSV text.
 *
 * null/undefined cells become empty fields. A field is wrapped in double
 * quotes when it contains the delimiter, a double quote, or a CR/LF, and any
 * double quote inside a field is doubled. Records are joined with CRLF.
 *
 * @param {Array<Array<*>>} rows - The records to write, one array per line.
 * @param {string} [delimiter=","] - Field separator.
 * @returns {string} The CSV document (no trailing line break).
 */
function toCSV(rows, delimiter = ",") {
  const quoteOrBreak = /["\r\n]/;

  const formatField = (cell) => {
    const text = cell == null ? "" : String(cell);
    const doubled = text.replace(/"/g, '""');

    if (text.includes(delimiter) || quoteOrBreak.test(text)) {
      return `"${doubled}"`;
    }
    return doubled;
  };

  const lines = rows.map((row) => row.map(formatField).join(delimiter));
  return lines.join("\r\n");
}

This correctly handles the nightmare case:

toCSV([['Say "Hello, World"', "Normal"]])
// Returns: '"Say ""Hello, World""",Normal'
// (the first field is quoted because it contains a comma and quotes; its inner quotes are doubled)

For a complete guide on CSV exports, see "How to Export HTML Tables to CSV in Chrome."

Converting to JSON

For JSON export, the first row becomes keys:

/**
 * Convert a row matrix into pretty-printed JSON: an array with one object
 * per data row, keyed by the (sanitized) values of the first row.
 *
 * Keys are produced by sanitizeKey. When two header cells produce the same
 * key, later ones get a numeric suffix (`price`, `price_2`, `price_3`, ...)
 * so that no column overwrites another. Cells missing from a data row are
 * written as "".
 *
 * @param {Array<Array<*>>} rows - Header row followed by the data rows.
 * @returns {string} JSON text indented with two spaces, or "[]" when there
 *   is no data row.
 */
function toJSON(rows) {
  if (rows.length < 2) return "[]";

  const [headerRow, ...dataRows] = rows;

  // Different headers can collapse to one key, and repeated header text is
  // common; track the keys handed out and suffix any repeat.
  const usedKeys = new Set();
  const headers = Array.from(headerRow, (header, i) => {
    const base = sanitizeKey(header, i);
    let key = base;
    for (let n = 2; usedKeys.has(key); n++) {
      key = `${base}_${n}`;
    }
    usedKeys.add(key);
    return key;
  });

  const objects = dataRows.map((row) =>
    Object.fromEntries(headers.map((key, i) => [key, row[i] ?? ""]))
  );

  return JSON.stringify(objects, null, 2);
}

/**
 * Turn a header cell into a lowercase snake_case object key.
 *
 * Accents are stripped, every run of characters outside a-z / 0-9 becomes a
 * single underscore, and leading/trailing underscores are removed. If nothing
 * usable is left (empty header, or a header made only of symbols or of
 * characters outside a-z / 0-9), the positional name `col_N` is returned so
 * the key is never empty.
 *
 * @param {*} header - Header cell value; null/undefined count as empty.
 * @param {number} index - Zero-based column index, used for the fallback.
 * @returns {string} A non-empty key.
 */
function sanitizeKey(header, index) {
  // `??` rather than `||` so a numeric 0 header is kept as "0".
  const key = String(header ?? "")
    .trim()
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "") // Remove combining accents left by NFD
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "_")
    .replace(/^_+|_+$/g, "");

  // Checked after sanitizing: a header like "$" is non-empty but yields "".
  return key || `col_${index + 1}`;
}

Input:

| Product Name | Price ($) |
|--------------|-----------|
| Widget       | 29.99     |

Output:

[
  {
    "product_name": "Widget",
    "price": "29.99"
  }
]

Triggering the Download

In a browser context, you can trigger a download without a server:

/**
 * Offer `content` to the user as a file download, entirely client-side.
 *
 * Wraps the content in a Blob, points a temporary anchor with a `download`
 * attribute at the Blob's object URL, and clicks it.
 *
 * @param {BlobPart} content - Data to save (e.g. a CSV or JSON string).
 * @param {string} filename - Suggested file name.
 * @param {string} mimeType - MIME type recorded on the Blob.
 * @returns {void}
 */
function downloadFile(content, filename, mimeType) {
  const blob = new Blob([content], { type: mimeType });
  const url = URL.createObjectURL(blob);

  const link = document.createElement("a");
  link.href = url;
  link.download = filename;

  // Some browsers ignore click() on an anchor that is not in the document,
  // so attach it for the duration of the click.
  document.body.appendChild(link);
  try {
    link.click();
  } finally {
    link.remove();
    // Revoking right away can cancel a download that has not started yet;
    // release the object URL a moment later instead.
    setTimeout(() => URL.revokeObjectURL(url), 1000);
  }
}

// Usage: expand the table into a grid, serialize the grid as CSV, then
// hand the text to the browser as a download named "data.csv".
// `table` is not defined in this snippet; it must already hold the table
// element to export.
const csv = toCSV(extractTableMatrix(table));
downloadFile(csv, "data.csv", "text/csv;charset=utf-8");

Putting It All Together

Here's a minimal bookmarklet that exports the first table on any page:

javascript:(function(){
  /* Bookmarklet: downloads the first table on the page as table.csv. */
  /* Only block comments are used: a line comment would swallow the rest */
  /* of the code once the bookmarklet is collapsed onto a single line. */
  const table = document.querySelector("table");
  if (!table) { alert("No table found"); return; }

  /* Expand rowspan and colspan cells so every grid row lines up by column. */
  function extractTableMatrix(table) {
    const rows = Array.from(table.rows);
    /* One grid row per table row, created up front. */
    const grid = rows.map(() => []);
    rows.forEach((rowEl, ri) => {
      let ci = 0;
      Array.from(rowEl.cells).forEach(cell => {
        /* Skip columns already filled by a rowspan from an earlier row. */
        while (grid[ri][ci] !== undefined) ci++;
        const text = cell.textContent.trim();
        /* Clamp the rowspan so it cannot add rows past the end of the table. */
        const rs = Math.min(parseInt(cell.rowSpan, 10) || 1, rows.length - ri);
        const cs = parseInt(cell.colSpan, 10) || 1;
        for (let r = 0; r < rs; r++) {
          for (let c = 0; c < cs; c++) {
            if (grid[ri+r][ci+c] === undefined) grid[ri+r][ci+c] = text;
          }
        }
        ci += cs;
      });
    });
    return grid;
  }

  /* Quote a field that holds a comma, a quote or a line break, doubling inner quotes. */
  function csvField(value) {
    const s = String(value ?? "");
    return /[",\r\n]/.test(s) ? `"${s.replace(/"/g, '""')}"` : s;
  }

  const data = extractTableMatrix(table);
  /* Array.from also visits unfilled grid slots, which become empty fields. */
  const csv = data.map(row =>
    Array.from(row, value => csvField(value)).join(",")
  ).join("\r\n");

  const blob = new Blob([csv], {type: "text/csv"});
  const url = URL.createObjectURL(blob);
  const link = document.createElement("a");
  link.href = url;
  link.download = "table.csv";
  /* Some browsers ignore click() on an anchor that is not in the document. */
  document.body.appendChild(link);
  link.click();
  link.remove();
  /* Release the blob shortly after, once the download has had time to start. */
  setTimeout(() => URL.revokeObjectURL(url), 1000);
})();