WDSEGA

Posted on May 29

Python Data Processing with Pandas: Tips and Tricks

#python #datascience #tutorial #pandas

Pandas is the backbone of data processing in Python. Whether you're cleaning messy CSVs, aggregating millions of rows, or preparing datasets for machine learning, Pandas has the tools you need. But knowing which tools to use — and how to use them efficiently — makes all the difference.

Here are practical tips and tricks that will make your Pandas code faster, cleaner, and more maintainable.

1. Read Data Efficiently

Specify dtypes to Save Memory

import pandas as pd

# Bad: Pandas infers types, often using 64-bit by default
df = pd.read_csv('large_file.csv')

# Good: Specify optimal dtypes
dtypes = {
    'id': 'int32',
    'category': 'category',
    'price': 'float32',
    'is_active': 'bool',
    'name': 'string',
    'date': 'str',  # Parse dates separately
}

df = pd.read_csv('large_file.csv', dtype=dtypes, parse_dates=['date'])

Read Only Needed Columns

# Only load the columns you need
df = pd.read_csv(
    'large_file.csv',
    usecols=['id', 'name', 'price', 'date'],
    dtype={'id': 'int32', 'price': 'float32'}
)

Chunk Processing for Large Files

# Process a huge file in chunks
chunk_size = 100_000
results = []

for chunk in pd.read_csv('huge_file.csv', chunksize=chunk_size):
    # Process each chunk
    filtered = chunk[chunk['price'] > 100]
    results.append(filtered.groupby('category')['price'].mean())

# Combine results
final = pd.concat(results).groupby(level=0).mean()

2. Vectorized Operations Over Loops

The single biggest performance improvement you can make is replacing loops with vectorized operations:

import numpy as np

# Slow: Row-by-row iteration
def categorize_price_slow(df):
    categories = []
    for _, row in df.iterrows():
        if row['price'] < 10:
            categories.append('budget')
        elif row['price'] < 50:
            categories.append('mid-range')
        else:
            categories.append('premium')
    df['category'] = categories
    return df

# Fast: Vectorized with np.where
def categorize_price_fast(df):
    df['category'] = np.where(
        df['price'] < 10, 'budget',
        np.where(df['price'] < 50, 'mid-range', 'premium')
    )
    return df

# Even better: pd.cut for binning
df['category'] = pd.cut(
    df['price'],
    bins=[0, 10, 50, float('inf')],
    labels=['budget', 'mid-range', 'premium']
)

String Operations

# Vectorized string methods
df['name_clean'] = (
    df['name']
    .str.strip()
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str[:50]  # Truncate to 50 characters
)

# Check for patterns
has_email = df['text'].str.contains(r'\S+@\S+\.\S+', regex=True)

3. Method Chaining for Readable Pipelines

result = (
    pd.read_csv('sales.csv')
    .assign(
        date=lambda df: pd.to_datetime(df['date']),
        revenue=lambda df: df['quantity'] * df['price']
    )
    .query('revenue > 0 & quantity > 0')
    .assign(
        category=lambda df: pd.cut(
            df['revenue'],
            bins=[0, 100, 1000, float('inf')],
            labels=['low', 'medium', 'high']
        )
    )
    .groupby(['region', 'category'], observed=True)
    .agg(
        total_revenue=('revenue', 'sum'),
        avg_revenue=('revenue', 'mean'),
        order_count=('order_id', 'nunique'),
        top_product=('product', lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else None)
    )
    .reset_index()
    .sort_values('total_revenue', ascending=False)
    .head(20)
)

4. Handle Missing Data Like a Pro

# Diagnose missing data
missing = (
    df.isnull()
    .sum()
    .sort_values(ascending=False)
    .head(20)
)

missing_pct = (missing / len(df) * 100).round(1)
print(f"Missing values:\n{missing_pct[missing_pct > 0]}")

# Strategic filling
df['price'] = df['price'].fillna(df.groupby('category')['price'].transform('median'))
df['name'] = df['name'].fillna('Unknown')
df['is_active'] = df['is_active'].fillna(False)

# Forward fill for time series
df['value'] = df['value'].ffill(limit=7)  # Fill up to 7 consecutive gaps

# Interpolation for numeric data
df['temperature'] = df['temperature'].interpolate(method='linear', limit=5)

# Drop columns with too many missing values
threshold = 0.5  # Drop columns with >50% missing
df = df.loc[:, df.isnull().mean() < threshold]

5. Efficient GroupBy Operations

# Multiple aggregations at once
agg_result = df.groupby('department').agg(
    total_sales=('sales', 'sum'),
    avg_sales=('sales', 'mean'),
    num_employees=('employee_id', 'nunique'),
    best_month=('sales', lambda x: x.idxmax()),
    sales_std=('sales', 'std'),
).round(2)

# Named aggregations (cleaner output)
agg_result = df.groupby('region', observed=True).agg(
    revenue_total=pd.NamedAgg(column='revenue', aggfunc='sum'),
    revenue_avg=pd.NamedAgg(column='revenue', aggfunc='mean'),
    orders=pd.NamedAgg(column='order_id', aggfunc='nunique'),
)

# Transform: add group-level stats back to each row
df['dept_avg'] = df.groupby('department')['salary'].transform('mean')
df['salary_rank'] = df.groupby('department')['salary'].rank(pct=True)

# Filter groups based on conditions
large_departments = (
    df.groupby('department')
    .filter(lambda x: len(x) >= 10)
)

6. Merge and Join Strategies

# Different merge types for different use cases
# Inner merge: only matching rows
merged = pd.merge(orders, customers, on='customer_id', how='inner')

# Left merge: keep all from left
merged = pd.merge(orders, customers, on='customer_id', how='left')

# Handle duplicate keys with suffixes
merged = pd.merge(
    df1, df2,
    on='id',
    how='outer',
    suffixes=('_left', '_right'),
    indicator=True  # Adds '_merge' column showing merge origin
)

# Merge on multiple columns
merged = pd.merge(
    sales, products,
    left_on=['product_code', 'region'],
    right_on=['code', 'region'],
    how='left'
)

# Anti-join: find rows in A not in B
anti_join = df1[~df1['key'].isin(df2['key'])]

7. Time Series Tricks

# Set datetime index
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date').sort_index()

# Resampling
daily = df.resample('D')['sales'].sum()
weekly = df.resample('W')['sales'].mean()
monthly = df.resample('ME')['revenue'].agg(['sum', 'mean', 'count'])

# Rolling windows
df['rolling_7d_avg'] = df['sales'].rolling(window=7).mean()
df['rolling_30d_std'] = df['sales'].rolling(window=30).std()

# Expanding windows
df['cumulative_avg'] = df['sales'].expanding().mean()

# Shift and difference
df['sales_change'] = df['sales'].diff()
df['sales_yoy'] = df['sales'].shift(365)  # Year over year
df['pct_change'] = df['sales'].pct_change()

For the complete guide with all code examples and advanced patterns, read the full article on our blog.

Originally published at WD Tech Blog. Follow for more Python tutorials, AI tools, and developer resources.

DEV Community