Pandas Data Cleaning Utilities

Common pandas operations for data cleaning and exploration

pandasdata-cleaningdata-explorationpython

Pandas Data Cleaning Utilities

Common data cleaning and exploration functions using pandas.

import pandas as pd
import numpy as np

def data_overview(df):
    """
    Comprehensive data overview with statistics.

    Args:
        df: Input DataFrame

    Returns:
        Dictionary with overview statistics
    """
    overview = {
        'shape': df.shape,
        'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024**2,
        'missing_values': df.isnull().sum(),
        'missing_percentage': (df.isnull().sum() / len(df)) * 100,
        'duplicate_rows': df.duplicated().sum(),
        'data_types': df.dtypes,
        'numeric_summary': df.describe() if len(df.select_dtypes(include=[np.number]).columns) > 0 else None,
    }

    print("=" * 70)
    print("DATA OVERVIEW")
    print("=" * 70)
    print(f"Shape: {overview['shape']}")
    print(f"Memory Usage: {overview['memory_usage_mb']:.2f} MB")
    print(f"Duplicate Rows: {overview['duplicate_rows']}")
    print("\nMissing Values:")
    missing_df = pd.DataFrame({
        'Count': overview['missing_values'],
        'Percentage': overview['missing_percentage']
    }).sort_values('Count', ascending=False)
    print(missing_df[missing_df['Count'] > 0])

    return overview

def clean_dataframe(df,
                   drop_duplicates=True,
                   handle_missing='drop',
                   remove_outliers=False,
                   outlier_method='iqr'):
    """
    Clean DataFrame with common operations.

    Args:
        df: Input DataFrame
        drop_duplicates: Remove duplicate rows
        handle_missing: Strategy ('drop', 'fill_mean', 'fill_median', 'fill_mode')
        remove_outliers: Whether to remove outliers
        outlier_method: Method for outlier detection ('iqr' or 'zscore')
    """
    df = df.copy()
    initial_shape = df.shape

    # Remove duplicates
    if drop_duplicates:
        df = df.drop_duplicates()
        print(f"Removed {initial_shape[0] - df.shape[0]} duplicate rows")

    # Handle missing values
    if handle_missing == 'drop':
        df = df.dropna()
    elif handle_missing == 'fill_mean':
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    elif handle_missing == 'fill_median':
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
    elif handle_missing == 'fill_mode':
        for col in df.columns:
            df[col].fillna(df[col].mode()[0] if not df[col].mode().empty else 'Unknown',
                          inplace=True)

    # Remove outliers
    if remove_outliers:
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if outlier_method == 'iqr':
            for col in numeric_cols:
                Q1 = df[col].quantile(0.25)
                Q3 = df[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
        elif outlier_method == 'zscore':
            from scipy import stats
            z_scores = np.abs(stats.zscore(df[numeric_cols]))
            df = df[(z_scores < 3).all(axis=1)]

    print(f"Final shape: {df.shape} (removed {initial_shape[0] - df.shape[0]} rows)")
    return df

# Usage Example
# overview = data_overview(df)
# df_clean = clean_dataframe(df, handle_missing='fill_mean', remove_outliers=True)