Data Preprocessing Pipeline
A comprehensive data preprocessing function that handles missing values, duplicates, scaling, and encoding.
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
def _make_imputer(strategy):
    """Return an unfitted numeric imputer for *strategy*, or None if unrecognised."""
    if strategy == 'knn':
        return KNNImputer(n_neighbors=5)
    # Map the public option names onto SimpleImputer's own strategy names.
    simple_strategies = {'mean': 'mean', 'median': 'median', 'mode': 'most_frequent'}
    if strategy in simple_strategies:
        return SimpleImputer(strategy=simple_strategies[strategy])
    return None


def _fill_categorical_missing(series):
    """Return *series* with missing values replaced by its most frequent value.

    Falls back to the literal 'Unknown' when the column has no observed
    values at all (its mode is empty).
    """
    modes = series.mode()
    fill_value = 'Unknown' if modes.empty else modes.iloc[0]
    # A 'category' column rejects values outside its category set, so the
    # fallback has to be registered as a category before filling.
    if (isinstance(series.dtype, pd.CategoricalDtype)
            and fill_value not in series.cat.categories):
        series = series.cat.add_categories([fill_value])
    return series.fillna(fill_value)


def preprocess_data(df,
                    handle_missing='mean',
                    remove_duplicates=True,
                    scale_features=True,
                    scale_method='standard',
                    encode_categorical=True,
                    drop_columns=None):
    """
    Comprehensive data preprocessing pipeline.

    Works on a copy of ``df``; the caller's DataFrame is never modified.
    Steps run in this order: drop columns, remove duplicates, handle missing
    values, encode categorical columns, scale numeric columns.

    Args:
        df: Input DataFrame.
        handle_missing: Strategy for missing values in numeric columns:
            'mean', 'median', 'mode', 'knn' (5 neighbours) or 'drop' (drop
            every row containing a missing value). Any other value (e.g.
            ``None``) skips numeric imputation. Numeric columns with no
            observed values cannot be imputed and are left as they are.
            Missing values in categorical columns are always filled with the
            column's most frequent value ('Unknown' if the column is empty).
        remove_duplicates: Whether to remove duplicate rows.
        scale_features: Whether to scale numerical features.
        scale_method: Scaling method ('standard' or 'minmax').
        encode_categorical: Whether to label-encode object/category columns.
        drop_columns: List of columns to drop; names not present are ignored.

    Returns:
        Tuple ``(df, scaler, label_encoders)``: the preprocessed DataFrame,
        the fitted scaler (``None`` if no scaling was performed) and a dict
        mapping each encoded column name to its fitted ``LabelEncoder``
        (empty if encoding is disabled). The fitted imputer is not returned.

    Raises:
        ValueError: If scaling is performed and ``scale_method`` is neither
            'standard' nor 'minmax'.
    """
    df = df.copy()

    # Drop specified columns
    if drop_columns:
        df = df.drop(columns=drop_columns, errors='ignore')

    # Remove duplicates
    if remove_duplicates:
        initial_rows = len(df)
        df = df.drop_duplicates()
        print(f"Removed {initial_rows - len(df)} duplicate rows")

    # Column groups are fixed here, before encoding, so label-encoded
    # categorical columns are never treated as numeric features later on.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns

    # Handle missing values
    if handle_missing == 'drop':
        df = df.dropna()
    else:
        # The sklearn imputers reject empty input and silently drop columns
        # that are entirely NaN (which would break the assignment back into
        # df), so only columns with at least one observed value are imputed.
        imputable = [col for col in numeric_cols if df[col].notna().any()]
        if imputable:
            imputer = _make_imputer(handle_missing)
            if imputer is not None:
                df[imputable] = imputer.fit_transform(df[imputable])

    # Fill categorical missing values with mode. Assign the result back
    # instead of calling fillna(inplace=True) on df[col]: the chained in-place
    # form does not update df under pandas Copy-on-Write.
    for col in categorical_cols:
        df[col] = _fill_categorical_missing(df[col])

    # Encode categorical variables
    label_encoders = {}
    if encode_categorical:
        for col in categorical_cols:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
            label_encoders[col] = le

    # Scale features
    scaler = None
    if scale_features and len(numeric_cols) > 0:
        if scale_method == 'standard':
            scaler = StandardScaler()
        elif scale_method == 'minmax':
            scaler = MinMaxScaler()
        else:
            raise ValueError(
                f"Unknown scale_method {scale_method!r}; "
                "expected 'standard' or 'minmax'"
            )
        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    print(f"Preprocessing complete. Shape: {df.shape}")
    return df, scaler, label_encoders
# Usage Example
# df_processed, scaler, encoders = preprocess_data(
# df,
# handle_missing='mean',
# scale_method='standard'
# )