"""Data Visualization Utilities.

Common plotting functions for exploratory data analysis.
"""
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
def plot_correlation_matrix(df, figsize=(12, 10), annot=True):
    """Plot a heatmap of the correlations between numeric columns.

    Non-numeric columns are dropped before the correlation is computed.
    The figure is displayed with ``plt.show()`` before returning.

    Args:
        df: DataFrame; only its numeric columns are used.
        figsize: Figure size passed to ``plt.figure``.
        annot: Whether to annotate each cell with its correlation value
            (formatted to two decimals).

    Returns:
        The correlation matrix produced by ``DataFrame.corr()`` on the
        numeric columns.

    Raises:
        ValueError: If ``df`` has no numeric columns.
    """
    numeric_df = df.select_dtypes(include=[np.number])

    # Fail early, before a figure is opened: with no numeric columns there
    # is nothing to draw and the heatmap call cannot succeed.
    if numeric_df.shape[1] == 0:
        raise ValueError("df has no numeric columns to correlate")

    corr_matrix = numeric_df.corr()

    plt.figure(figsize=figsize)
    sns.heatmap(
        corr_matrix,
        annot=annot,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )
    plt.title('Correlation Matrix', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

    return corr_matrix
def plot_distribution(df, columns=None, figsize=(15, 5)):
    """Plot a histogram for each selected column on a three-wide grid.

    Missing values are dropped from each column before plotting. Grid
    cells left over in the last row are hidden. The figure is displayed
    with ``plt.show()``.

    Args:
        df: DataFrame
        columns: List of columns to plot (default: all numeric columns)
        figsize: Figure size for the whole grid

    Raises:
        ValueError: If there are no columns to plot.
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()

    # A zero-row subplot grid is invalid; report the real cause instead.
    if len(columns) == 0:
        raise ValueError("no columns to plot")

    grid_width = 3
    n_rows = (len(columns) + grid_width - 1) // grid_width

    # squeeze=False always yields a 2-D array of axes, so flattening gives
    # one axes object per grid cell regardless of how many columns there are.
    _, axes = plt.subplots(n_rows, grid_width, figsize=figsize, squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, columns):
        ax.hist(df[col].dropna(), bins=30, edgecolor='black', alpha=0.7)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        ax.grid(True, alpha=0.3)

    # Hide unused subplots
    for ax in axes[len(columns):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()
def plot_boxplots(df, columns=None, figsize=(15, 5)):
    """Plot a boxplot for each selected column on a three-wide grid.

    Missing values are dropped from each column before plotting. Grid
    cells left over in the last row are hidden. The figure is displayed
    with ``plt.show()``.

    Args:
        df: DataFrame
        columns: List of columns to plot (default: all numeric columns)
        figsize: Figure size for the whole grid

    Raises:
        ValueError: If there are no columns to plot.
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()

    # A zero-row subplot grid is invalid; report the real cause instead.
    if len(columns) == 0:
        raise ValueError("no columns to plot")

    grid_width = 3
    n_rows = (len(columns) + grid_width - 1) // grid_width

    # squeeze=False always yields a 2-D array of axes, so flattening gives
    # one axes object per grid cell regardless of how many columns there are.
    _, axes = plt.subplots(n_rows, grid_width, figsize=figsize, squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, columns):
        ax.boxplot(df[col].dropna())
        ax.set_title(f'Boxplot of {col}')
        ax.set_ylabel(col)
        ax.grid(True, alpha=0.3)

    # Hide unused subplots
    for ax in axes[len(columns):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()
# Usage Example
# plot_correlation_matrix(df)
# plot_distribution(df, columns=['age', 'income', 'score'])