Wikipedia is the most common source for web table data. It's also a minefield of edge cases that break naive scrapers.
While building HTML Table Exporter, I collected the five patterns that cause the most issues, with detection code and a fix for each.
Pattern 1: Navigation Rows ("v t e")
The Problem:
<table>
<tr>
<td colspan="5">v t e Countries by population</td>
</tr>
<tr>
<td>Rank</td><td>Country</td><td>Population</td>...
</tr>
...
</table>
The first row contains "v t e" (View/Talk/Edit) links to Wikipedia template pages. If your scraper treats row 0 as headers, everything breaks.
What pd.read_html produces:
v t e Countries by population
0 Rank Country ...
1 1 China ...
Detection:
def is_nav_row(row_values):
    """Detect a Wikipedia navigation-prefix ("v t e") row.

    Parameters:
        row_values: sequence of cell values for one row (a list or a
            numpy array such as ``df.iloc[0].values``).

    Returns:
        True if the first cell starts with a "v t e" (View/Talk/Edit)
        navigation marker in any of its common renderings.
    """
    # len() instead of truthiness: `not row_values` raises ValueError when
    # row_values is a multi-element numpy array (exactly what callers pass).
    if len(row_values) == 0:
        return False
    import re
    first_cell = str(row_values[0]).strip().lower()
    patterns = [
        r'^v\s+t\s+e\b',            # "v t e ..." (\b also matches a bare "v t e")
        r'^v\s*\|\s*t\s*\|\s*e',    # "v | t | e"
        r'^\[v\]\s*\[t\]\s*\[e\]',  # "[v] [t] [e]"
    ]
    return any(re.match(p, first_cell) for p in patterns)
Fix:
import pandas as pd
def read_wikipedia_table(url, table_index=0):
    """Read one table from a Wikipedia page, dropping a leading nav row.

    Parameters:
        url: page URL (or anything ``pd.read_html`` accepts).
        table_index: which table on the page to return (default: first).

    Returns:
        DataFrame with any leading "v t e" navigation row removed and the
        following row promoted to the header.
    """
    tables = pd.read_html(url)
    df = tables[table_index]
    # Length guard: an empty or single-row table has no header row to
    # promote, and df.iloc[0]/df.iloc[1] would raise IndexError.
    if len(df) > 1 and is_nav_row(df.iloc[0].values):
        # The real header sits on the row after the navigation prefix.
        df.columns = df.iloc[1]
        df = df.iloc[2:].reset_index(drop=True)
    return df
Pattern 2: Horizontally Duplicated Tables
The Problem:
To save vertical space, Wikipedia displays some tables in multiple columns:
| Rank | Name | Pop | Rank | Name | Pop |
|------|--------|------|------|---------|------|
| 1 | Tokyo | 37M | 11 | Paris | 11M |
| 2 | Delhi | 32M | 12 | Cairo | 10M |
This is logically ONE table with repeated column structure.
What pd.read_html produces:
Rank Name Pop Rank.1 Name.1 Pop.1
0 1 Tokyo 37M 11 Paris 11M
1 2 Delhi 32M 12 Cairo 10M
Pandas sees it as 6 columns. If you filter by "Name", you miss half the data.
Detection:
def detect_horizontal_duplication(columns):
    """Return the width of the repeating column group, or None.

    Detects tables laid out side by side (e.g. Rank, Name, Pop, Rank.1,
    Name.1, Pop.1) by checking whether the column labels repeat once the
    pandas de-duplication suffix (".1", ".2", ...) is removed.

    Parameters:
        columns: iterable of column labels (not necessarily strings).

    Returns:
        The chunk size (number of base columns) if the labels repeat
        2, 3, or 4 times; otherwise None.
    """
    import re
    # Strip only the ".<digits>" suffix pandas appends to duplicate names.
    # rstrip('.0123456789') would also mangle legitimate names such as
    # "GDP2020" (-> "GDP") and false-positive on e.g. ['A1', 'B', 'A2', 'B'].
    normalized = [re.sub(r'\.\d+$', '', str(c)) for c in columns]
    n = len(normalized)
    for divisor in (2, 3, 4):
        if n % divisor:
            continue
        chunk_size = n // divisor
        base_pattern = normalized[:chunk_size]
        if all(
            normalized[i * chunk_size:(i + 1) * chunk_size] == base_pattern
            for i in range(1, divisor)
        ):
            return chunk_size
    return None
Fix:
def normalize_duplicated_table(df, base_columns):
    """Stack a horizontally duplicated table into a single vertical table.

    Parameters:
        df: DataFrame whose columns repeat in groups of *base_columns*.
        base_columns: width of one repeated group.

    Returns:
        A new DataFrame with one column group, all groups stacked row-wise.
    """
    base_headers = df.columns[:base_columns]
    repeats = len(df.columns) // base_columns
    pieces = []
    for rep in range(repeats):
        lo = rep * base_columns
        piece = df.iloc[:, lo:lo + base_columns].copy()
        # Give every slice the same header so concat aligns them.
        piece.columns = base_headers
        # Drop all-NaN rows (a half-empty trailing group).
        pieces.append(piece.dropna(how='all'))
    return pd.concat(pieces, ignore_index=True)
# Usage: read the first table, then collapse repeated column groups if found.
df = pd.read_html(url)[0]
# chunk_size is the width of one repeated group, or None if no repetition.
chunk_size = detect_horizontal_duplication(df.columns)
if chunk_size:
    df = normalize_duplicated_table(df, chunk_size)
Pattern 3: Title Rows (Spanning All Columns)
The Problem:
<table>
<tr>
<td colspan="4">List of tallest buildings in the world</td>
</tr>
<tr>
<td>Rank</td><td>Building</td><td>City</td><td>Height</td>
</tr>
...
</table>
The first row is a title, not data. After colspan expansion, it becomes:
['List of tallest...', 'List of tallest...', 'List of tallest...', 'List of tallest...']
Detection:
def is_title_row(row_values, next_row_values):
    """Detect a colspan-expanded, full-width title row.

    Parameters:
        row_values: candidate row's cell values (list or numpy array).
        next_row_values: the following row's cell values, or None.

    Returns:
        True if the row collapses to one long repeated value (> 20 chars)
        while the next row has several distinct values (a real header).
    """
    # Explicit None/length checks: `not row_values` raises ValueError for
    # multi-element numpy arrays (what df.iloc[i].values produces).
    if row_values is None or next_row_values is None:
        return False
    if len(row_values) == 0 or len(next_row_values) == 0:
        return False
    # All cells identical -> a single colspan cell expanded by the parser.
    unique_values = set(str(v).strip() for v in row_values if str(v).strip())
    if len(unique_values) == 1:
        title = next(iter(unique_values))
        next_unique = len(set(str(v).strip() for v in next_row_values if str(v).strip()))
        # Heuristic: titles are long; a real header row has varied values.
        return len(title) > 20 and next_unique > 2
    return False
Fix:
def skip_title_rows(df):
    """Drop title rows from the top of *df* and promote the next row to header.

    Checks at most the first three rows; stops at the first non-title row.
    Returns the (possibly re-headered) DataFrame.
    """
    n_titles = 0
    for idx in range(min(3, len(df) - 1)):
        nxt = df.iloc[idx + 1].values if idx + 1 < len(df) else None
        if not is_title_row(df.iloc[idx].values, nxt):
            break
        n_titles = idx + 1
    if n_titles > 0:
        # The first row after the titles is the real header.
        df.columns = df.iloc[n_titles]
        df = df.iloc[n_titles + 1:].reset_index(drop=True)
    return df
Pattern 4: Grouped Headers (Two-Level)
The Problem:
| | | Statistics | Statistics |
| Rank | Country | GDP (nominal) | GDP (PPP) |
|--------|---------|-------------------|------------------|
| 1 | USA | 25.5 trillion | 25.5 trillion |
Row 0 is category headers. Row 1 is actual column headers. Both are semantically "headers."
What pd.read_html produces:
The output is often corrupted, or comes back with a MultiIndex that is awkward to work with.
Detection:
def has_grouped_headers(df):
    """Heuristic check for a two-level (category row + column row) header.

    Returns True when row 0 has many adjacent repeated values (colspan
    groups) and row 1 carries more distinct values (the real headers).
    """
    if len(df) < 3:
        return False
    top = [str(v).strip() for v in df.iloc[0].values]
    second = [str(v).strip() for v in df.iloc[1].values]
    # Fraction of adjacent, non-empty duplicate pairs in the top row.
    repeats = sum(1 for prev, cur in zip(top, top[1:]) if cur and prev == cur)
    repeat_ratio = repeats / max(1, len(top) - 1)
    distinct_top = len({v for v in top if v})
    distinct_second = len({v for v in second if v})
    # Grouped headers: noticeable repetition up top, richer row beneath.
    return repeat_ratio > 0.3 and distinct_second > distinct_top
Fix:
def merge_grouped_headers(df):
    """Collapse a two-row (category, column) header into a single level.

    Uses "category - column" names where both parts exist, and whichever
    part is non-empty otherwise. Returns the table with the two header
    rows removed.
    """
    categories = df.iloc[0].values
    names = df.iloc[1].values
    combined = []
    for category, name in zip(categories, names):
        g = str(category).strip()
        h = str(name).strip()
        if not g or g == h:
            combined.append(h)       # no category, or redundant one
        elif not h:
            combined.append(g)       # category only
        else:
            combined.append(f"{g} - {h}")
    df.columns = combined
    return df.iloc[2:].reset_index(drop=True)
# Usage: detect the two-level header, then merge it before using the table.
if has_grouped_headers(df):
    df = merge_grouped_headers(df)
Pattern 5: Nested Infobox Tables
The Problem:
Wikipedia infoboxes contain tables within table cells:
<table class="infobox">
<tr>
<td>Population</td>
<td>
<table> <!-- Nested! -->
<tr><td>Urban</td><td>8.3M</td></tr>
<tr><td>Metro</td><td>20.1M</td></tr>
</table>
</td>
</tr>
</table>
What pd.read_html produces:
Both the outer and inner tables are returned. If you're looking for "all tables on the page," you get duplicates and nested junk.
Detection and Filtering:
from bs4 import BeautifulSoup
import requests
def get_top_level_tables(url, timeout=30):
    """Return only the top-level <table> tags on a page (nested ones excluded).

    Wikipedia infoboxes nest tables inside table cells; pd.read_html returns
    both levels, so the filtering has to happen at the tag level.

    Parameters:
        url: page to fetch.
        timeout: seconds before the HTTP request is aborted. requests has no
            default timeout, so without this an unresponsive server hangs forever.

    Returns:
        List of BeautifulSoup <table> tags not contained in another table.

    Raises:
        requests.HTTPError: on a non-2xx response — better than silently
            scraping tables out of an error page.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # find_parent('table') walks the ancestor chain, replacing the manual loop.
    return [t for t in soup.find_all('table') if t.find_parent('table') is None]
def read_top_level_tables(url):
    """Parse every top-level table on the page into a DataFrame."""
    import pandas as pd
    frames = []
    for tbl in get_top_level_tables(url):
        try:
            # One table tag -> one DataFrame (read_html always returns a list).
            frames.append(pd.read_html(str(tbl))[0])
        except Exception:
            # Best-effort: skip tables pandas cannot parse (e.g. layout tables).
            continue
    return frames
Complete Wikipedia Table Reader
Combining all fixes:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
class WikipediaTableReader:
    """Fetch a Wikipedia page and return its data tables as cleaned DataFrames.

    Combines the individual fixes: skips nested tables and infoboxes, and
    removes "v t e" navigation rows and colspan-expanded title rows.
    """

    def __init__(self, url):
        self.url = url
        self.soup = None  # parsed page; populated lazily by _fetch()

    def _fetch(self):
        """Download and parse the page once; later calls are no-ops."""
        if self.soup is None:
            # timeout: requests has no default and would otherwise hang forever.
            response = requests.get(self.url, timeout=30)
            response.raise_for_status()  # fail fast instead of parsing an error page
            self.soup = BeautifulSoup(response.text, 'html.parser')

    def _is_nav_row(self, values):
        """True if the row starts with Wikipedia's "v t e" navigation prefix."""
        # len() check: `not values` raises ValueError for multi-element numpy
        # arrays (df.iloc[0].values), which made get_tables() silently drop
        # every table via its broad except.
        if len(values) == 0:
            return False
        first = str(values[0]).strip().lower()
        return bool(re.match(r'^v\s+t\s+e\s', first))

    def _is_title_row(self, values, next_values):
        """True if the row looks like a colspan-expanded, full-width title."""
        if values is None or next_values is None:
            return False
        unique = set(str(v).strip() for v in values if str(v).strip())
        if len(unique) != 1:
            return False
        title = next(iter(unique))
        next_unique = len(set(str(v).strip() for v in next_values if str(v).strip()))
        # Heuristic: titles are long; a real header row has varied values.
        return len(title) > 20 and next_unique > 2

    def get_tables(self, skip_infobox=True):
        """Return every cleaned, top-level data table on the page.

        Parameters:
            skip_infobox: when True, tables with the "infobox" class are skipped.
        """
        self._fetch()
        results = []
        for table in self.soup.find_all('table'):
            # Skip tables nested inside other tables (e.g. infobox internals).
            if table.find_parent('table'):
                continue
            if skip_infobox and 'infobox' in table.get('class', []):
                continue
            try:
                df = pd.read_html(str(table))[0]
                df = self._clean_table(df)
                # Keep only plausible data tables.
                if len(df) > 0 and len(df.columns) > 1:
                    results.append(df)
            except Exception:
                # Best-effort: layout tables pandas cannot parse are skipped.
                continue
        return results

    def _clean_table(self, df):
        """Strip leading navigation and title rows, promoting real headers."""
        # Navigation ("v t e") rows: drop them; the next row is the header.
        while len(df) > 0 and self._is_nav_row(df.iloc[0].values):
            if len(df) > 1:
                df.columns = df.iloc[1]
                df = df.iloc[2:].reset_index(drop=True)
            else:
                df = df.iloc[1:]
        # Title rows: keeping `len(df) > 1` INSIDE the loop condition prevents
        # the IndexError that occurred when slicing emptied the frame.
        while len(df) > 1 and self._is_title_row(df.iloc[0].values, df.iloc[1].values):
            df.columns = df.iloc[1]
            df = df.iloc[2:].reset_index(drop=True)
        return df
# Usage: fetch the page once and extract every cleaned, top-level data table.
reader = WikipediaTableReader("https://en.wikipedia.org/wiki/List_of_countries_by_population")
tables = reader.get_tables()
When to Use an Extension Instead
If you're doing ad-hoc extraction (not building a pipeline), a browser extension handles all these patterns automatically.
HTML Table Exporter detects these patterns and normalizes the output. One click vs. debugging edge cases.
For a practical walkthrough, see our guide on exporting Wikipedia tables to Excel in 30 seconds.
For automated pipelines, use the code above. For occasional exports, use the right tool for the job.
Learn more at gauchogrid.com/html-table-exporter or try it free on the Chrome Web Store.
Found a Wikipedia table that breaks this code? Share the URL—I'll add it to my test suite.
Top comments (0)