Pandas Data Cleaning Utilities
Common data cleaning and exploration functions using pandas.
import pandas as pd
import numpy as np
def data_overview(df):
"""
Comprehensive data overview with statistics.
Args:
df: Input DataFrame
Returns:
Dictionary with overview statistics
"""
overview = {
'shape': df.shape,
'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024**2,
'missing_values': df.isnull().sum(),
'missing_percentage': (df.isnull().sum() / len(df)) * 100,
'duplicate_rows': df.duplicated().sum(),
'data_types': df.dtypes,
'numeric_summary': df.describe() if len(df.select_dtypes(include=[np.number]).columns) > 0 else None,
}
print("=" * 70)
print("DATA OVERVIEW")
print("=" * 70)
print(f"Shape: {overview['shape']}")
print(f"Memory Usage: {overview['memory_usage_mb']:.2f} MB")
print(f"Duplicate Rows: {overview['duplicate_rows']}")
print("\nMissing Values:")
missing_df = pd.DataFrame({
'Count': overview['missing_values'],
'Percentage': overview['missing_percentage']
}).sort_values('Count', ascending=False)
print(missing_df[missing_df['Count'] > 0])
return overview
def clean_dataframe(df,
drop_duplicates=True,
handle_missing='drop',
remove_outliers=False,
outlier_method='iqr'):
"""
Clean DataFrame with common operations.
Args:
df: Input DataFrame
drop_duplicates: Remove duplicate rows
handle_missing: Strategy ('drop', 'fill_mean', 'fill_median', 'fill_mode')
remove_outliers: Whether to remove outliers
outlier_method: Method for outlier detection ('iqr' or 'zscore')
"""
df = df.copy()
initial_shape = df.shape
# Remove duplicates
if drop_duplicates:
df = df.drop_duplicates()
print(f"Removed {initial_shape[0] - df.shape[0]} duplicate rows")
# Handle missing values
if handle_missing == 'drop':
df = df.dropna()
elif handle_missing == 'fill_mean':
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
elif handle_missing == 'fill_median':
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
elif handle_missing == 'fill_mode':
for col in df.columns:
df[col].fillna(df[col].mode()[0] if not df[col].mode().empty else 'Unknown',
inplace=True)
# Remove outliers
if remove_outliers:
numeric_cols = df.select_dtypes(include=[np.number]).columns
if outlier_method == 'iqr':
for col in numeric_cols:
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
elif outlier_method == 'zscore':
from scipy import stats
z_scores = np.abs(stats.zscore(df[numeric_cols]))
df = df[(z_scores < 3).all(axis=1)]
print(f"Final shape: {df.shape} (removed {initial_shape[0] - df.shape[0]} rows)")
return df
# Usage Example
# overview = data_overview(df)
# df_clean = clean_dataframe(df, handle_missing='fill_mean', remove_outliers=True)