Load, explore, clean, and transform datasets using Pandas DataFrames for ML preparation
Pandas is the go-to Python library for data manipulation and analysis. In ML projects, data preparation typically takes most of the time (a commonly cited estimate is around 80%), and Pandas makes that work manageable.
import pandas as pd

# --- Core data structures -------------------------------------------------
# A Series is a one-dimensional array with an index label per value.
s = pd.Series([10, 20, 30], index=['a', 'b', 'c'])

# A DataFrame is a table of named columns built here from a dict of lists.
df = pd.DataFrame({
    'age': [25, 30, 35],
    'salary': [50000, 60000, 75000],
    'dept': ['HR', 'Eng', 'Eng'],
})

# --- Exploring ------------------------------------------------------------
# These are bare expressions: a notebook/REPL displays each result, while a
# plain script discards the return values (only df.info() prints by itself).
df.head()  # First 5 rows
df.info()  # Column types, null counts
df.describe()  # Statistical summary
df.shape  # (rows, columns)
df.dtypes  # Data types
df.isnull().sum()  # Count missing values

# --- Selecting ------------------------------------------------------------
df['age']  # Single column (Series)
df[['age', 'salary']]  # Multiple columns
df.iloc[0]  # First row by position
df.loc[df['age'] > 28]  # Filter rows by condition

# --- Cleaning -------------------------------------------------------------
# dropna / fillna / drop_duplicates return NEW frames and leave df untouched;
# assign the result (e.g. df = df.dropna()) to keep the change.
df.dropna()  # Remove rows with NaN
# numeric_only=True is required: 'dept' holds strings, and since pandas 2.0
# df.mean() raises TypeError instead of silently skipping such columns.
df.fillna(df.mean(numeric_only=True))  # Fill NaN with column mean
df.drop_duplicates()  # Remove duplicate rows
df['age'] = df['age'].astype(float)  # Convert type

# --- Feature engineering --------------------------------------------------
# Create new features.
# salary / 12 is a monthly figure, so the column is named accordingly.
df['salary_per_month'] = df['salary'] / 12
df['is_engineer'] = (df['dept'] == 'Eng').astype(int)

# One-hot encoding for categorical variables: 'dept' is replaced by one
# indicator column per distinct value (dept_Eng, dept_HR).
df_encoded = pd.get_dummies(df, columns=['dept'])

# --- Aggregating ----------------------------------------------------------
df.groupby('dept')['salary'].mean()  # Mean salary per department
# Several aggregations at once: mean and max salary, mean age, per department.
df.groupby('dept').agg({'salary': ['mean', 'max'], 'age': 'mean'})
to use AI code explanations