Data Preprocessing Pipeline

A complete data preprocessing pipeline for machine learning that handles missing values, scaling, and categorical encoding

data-preprocessing, pandas, sklearn, python

A comprehensive data preprocessing function that handles missing values, duplicates, scaling, and encoding.

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer

def preprocess_data(df,
                   handle_missing='mean',
                   remove_duplicates=True,
                   scale_features=True,
                   scale_method='standard',
                   encode_categorical=True,
                   drop_columns=None):
    """
    Comprehensive data preprocessing pipeline.

    Args:
        df: Input DataFrame
        handle_missing: Strategy for missing values ('mean', 'median', 'mode', 'knn', 'drop')
        remove_duplicates: Whether to remove duplicate rows
        scale_features: Whether to scale numerical features
        scale_method: Scaling method ('standard' or 'minmax')
        encode_categorical: Whether to encode categorical variables
        drop_columns: List of columns to drop

    Returns:
        Tuple of (preprocessed DataFrame, fitted scaler or None, dict of fitted LabelEncoders)
    """
    df = df.copy()

    # Drop specified columns
    if drop_columns:
        df = df.drop(columns=drop_columns, errors='ignore')

    # Remove duplicates
    if remove_duplicates:
        initial_rows = len(df)
        df = df.drop_duplicates()
        print(f"Removed {initial_rows - len(df)} duplicate rows")

    # Handle missing values
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns

    if handle_missing == 'drop':
        df = df.dropna()
    elif len(numeric_cols) > 0:
        if handle_missing in ('mean', 'median'):
            imputer = SimpleImputer(strategy=handle_missing)
        elif handle_missing == 'mode':
            imputer = SimpleImputer(strategy='most_frequent')
        elif handle_missing == 'knn':
            # KNN imputation is distance-based and runs on the raw (unscaled) values here
            imputer = KNNImputer(n_neighbors=5)
        else:
            raise ValueError(f"Unknown handle_missing strategy: {handle_missing}")
        df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

    # Fill categorical missing values with the column mode ('Unknown' if the column is all-NaN)
    for col in categorical_cols:
        fill_value = df[col].mode()[0] if not df[col].mode().empty else 'Unknown'
        df[col] = df[col].fillna(fill_value)

    # Encode categorical variables
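    # Note: LabelEncoder maps each category to an arbitrary integer code; for
    # nominal features used with linear models, one-hot encoding is often preferable.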
    label_encoders = {}
    if encode_categorical:
        for col in categorical_cols:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
            label_encoders[col] = le

    # Scale numeric features. Note: the scaler is fit on the full DataFrame here;
    # to avoid leakage, fit on the training split only and reuse it via transform().
    scaler = None
    if scale_features and len(numeric_cols) > 0:
        if scale_method == 'standard':
            scaler = StandardScaler()
        elif scale_method == 'minmax':
            scaler = MinMaxScaler()
        else:
            raise ValueError(f"Unknown scale_method: {scale_method}")

        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    print(f"Preprocessing complete. Shape: {df.shape}")
    return df, scaler, label_encoders

# Usage Example
# df_processed, scaler, encoders = preprocess_data(
#     df,
#     handle_missing='mean',
#     scale_method='standard'
# )
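
The sketch below is an illustrative end-to-end run on a small synthetic DataFrame (the column names and values are invented for the example). It also shows how to reuse the returned scaler and label encoders on new rows with transform(); note that the fitted imputer is not returned by the function, so imputation cannot be replayed on new data without extending the return value.

# Illustrative example on synthetic data (column names/values are made up)
raw = pd.DataFrame({
    'age': [25, 32, np.nan, 41, 25],
    'income': [50000, 64000, 58000, np.nan, 50000],
    'city': ['Paris', 'Berlin', None, 'Paris', 'Paris'],
    'id': [1, 2, 3, 4, 1],
})

df_processed, scaler, encoders = preprocess_data(
    raw,
    handle_missing='median',
    scale_method='standard',
    drop_columns=['id'],
)
print(df_processed)

# Apply the already-fitted objects to new rows: use transform(), not fit_transform(),
# so the new data is scaled/encoded consistently with the data seen during fitting.
new_rows = pd.DataFrame({
    'age': [29.0, 35.0],
    'income': [52000.0, 61000.0],
    'city': ['Berlin', 'Paris'],
})
new_rows['city'] = encoders['city'].transform(new_rows['city'].astype(str))
new_rows[['age', 'income']] = scaler.transform(new_rows[['age', 'income']])
print(new_rows)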