Scale features, encode categories, handle missing values, and engineer new features for better ML models
Garbage in, garbage out. The quality of your data preprocessing directly determines model performance. This lesson covers the essential transformations every ML practitioner must know.
Raw data is messy: features sit on wildly different scales, categories arrive as text, and values go missing. Each of these problems has a standard fix.
Standardization rescales each feature to zero mean and unit variance. Best for: algorithms sensitive to scale (SVM, KNN, neural networks, PCA)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit on the training data only
X_test_scaled = scaler.transform(X_test)        # reuse the same fitted scaler!
Result: mean=0, std=1 for each feature
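A quick numeric sanity check, using a made-up two-feature array (the X_train here is toy data, not the lesson's dataset):
import numpy as np
from sklearn.preprocessing import StandardScaler
X_train = np.array([[1.0, 100.0], [2.0, 200.0], [3.0, 300.0]])  # toy data, two scales
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
print(X_train_scaled.mean(axis=0))  # ~[0. 0.]
print(X_train_scaled.std(axis=0))   # [1. 1.]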
Min-max scaling squeezes each feature into the [0, 1] range. Best for: neural networks, image data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)  # again: fit on the training split only
X_test_scaled = scaler.transform(X_test)
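Min-max scaling maps each value via (x - min) / (max - min). The same kind of sanity check, again on toy data:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
X_train = np.array([[1.0, 100.0], [2.0, 200.0], [3.0, 300.0]])  # toy data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
print(X_train_scaled.min(axis=0))  # [0. 0.]
print(X_train_scaled.max(axis=0))  # [1. 1.]
One caveat: a test-set value outside the training range will land outside [0, 1], since min and max come from the training split.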
Label encoding maps each category to an integer. Beware: LabelEncoder sorts classes alphabetically, so the result may not match the natural order.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['size_encoded'] = le.fit_transform(df['size'])  # alphabetical: L→0, M→1, S→2
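When the categories have a natural order, it is safer to spell that order out instead of relying on alphabetical sorting. A sketch using OrdinalEncoder with an explicit category list (the toy df below stands in for your data):
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
df = pd.DataFrame({'size': ['S', 'M', 'L', 'M']})     # toy column
enc = OrdinalEncoder(categories=[['S', 'M', 'L']])    # explicit order: S < M < L
df['size_encoded'] = enc.fit_transform(df[['size']]).ravel()  # S→0, M→1, L→2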
One-hot encoding creates a binary column per category and is the safer choice for nominal (unordered) categories:
from sklearn.preprocessing import OneHotEncoder
# or use pandas directly:
df = pd.get_dummies(df, columns=['colour'], drop_first=True)
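Inside a train/test workflow, the sklearn encoder has the edge over pd.get_dummies: it remembers the categories seen during fit and can ignore unseen ones later. A sketch on toy data (sparse_output needs scikit-learn ≥ 1.2; on older versions the parameter is sparse):
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
train = pd.DataFrame({'colour': ['red', 'blue', 'green']})
test = pd.DataFrame({'colour': ['blue', 'purple']})  # 'purple' never seen in training
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_oh = enc.fit_transform(train[['colour']])
X_test_oh = enc.transform(test[['colour']])  # unseen 'purple' becomes an all-zero row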
from sklearn.impute import SimpleImputer
# Numerical: fill with mean/median
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
# Categorical: fill with most frequent
cat_imputer = SimpleImputer(strategy='most_frequent')
X_cat_imputed = cat_imputer.fit_transform(X_cat)  # X_cat: just the categorical columns
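To impute numerical and categorical columns in one step, a ColumnTransformer can route each strategy to the right columns. A sketch, where num_cols and cat_cols are hypothetical column lists:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
num_cols = ['age', 'salary']   # hypothetical numerical columns
cat_cols = ['colour', 'size']  # hypothetical categorical columns
preprocess = ColumnTransformer([
    ('num', SimpleImputer(strategy='median'), num_cols),
    ('cat', SimpleImputer(strategy='most_frequent'), cat_cols),
])
X_imputed = preprocess.fit_transform(df)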
Creating new features from existing ones:
import numpy as np
# Polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)  # adds x1², x2², x1·x2
# Log transform for skewed features (log1p handles zeros safely)
df['log_salary'] = np.log1p(df['salary'])
# Interaction features
df['age_experience'] = df['age'] * df['years_experience']
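To see exactly which columns PolynomialFeatures generated, get_feature_names_out helps (available since scikit-learn 1.0); the x1/x2 names below are illustrative:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
X = np.array([[2.0, 3.0]])  # one toy row with features x1, x2
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
print(poly.get_feature_names_out(['x1', 'x2']))  # ['x1' 'x2' 'x1^2' 'x1 x2' 'x2^2']
print(X_poly)  # [[2. 3. 4. 6. 9.]]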
Critical rule: Never fit preprocessing on test data! Fitting a scaler, encoder, or imputer on the test set leaks information about its distribution into the model. Split first, then fit transformers on the training split only.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
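The easiest way to enforce this rule is a Pipeline: calling fit on the pipeline fits every preprocessing step on the training data only. A minimal sketch with LogisticRegression as a stand-in model:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression()),
])
pipe.fit(X_train, y_train)         # the scaler is fit on training data only
print(pipe.score(X_test, y_test))  # test data is transformed, never fit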