3.1 Prerequisites
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import PowerTransformer, StandardScaler
# Download the OpenML dataset named "house_prices"; as_frame=True makes the
# returned bunch expose the data as a pandas DataFrame under `.frame`.
# NOTE(review): no `version=` is passed, so the library default is used —
# consider pinning a version for reproducibility; confirm which one.
ames = fetch_openml(name="house_prices", as_frame=True)
# Work on a copy so later edits do not touch the frame held by `ames`.
df = ames.frame.copy()
print("Dataset shape:", df.shape)
## Dataset shape: (1460, 81)
# Bare expression: displays the first five rows when run as a notebook cell.
df.head()
## Id MSSubClass MSZoning ... SaleType SaleCondition SalePrice
## 0 1 60 RL ... WD Normal 208500
## 1 2 20 RL ... WD Normal 181500
## 2 3 60 RL ... WD Normal 223500
## 3 4 70 RL ... WD Abnorml 140000
## 4 5 60 RL ... WD Normal 250000
##
## [5 rows x 81 columns]
3.2 Target Engineering
# Build both versions of the target up front: SalePrice coerced to numeric
# (anything unparseable becomes NaN) and its log(1 + x) transform.
y = pd.to_numeric(df.loc[:, "SalePrice"], errors="coerce")
y_log = np.log1p(y)

# Summary statistics on the original scale.
y_summary = y.describe()
print(y_summary)
## count 1460.000000
## mean 180921.195890
## std 79442.502883
## min 34900.000000
## 25% 129975.000000
## 50% 163000.000000
## 75% 214000.000000
## max 755000.000000
## Name: SalePrice, dtype: float64
# Summary statistics on the log scale.
y_log_summary = y_log.describe()
print(y_log_summary)
## count 1460.000000
## mean 12.024057
## std 0.399449
## min 10.460271
## 25% 11.775105
## 50% 12.001512
## 75% 12.273736
## max 13.534474
## Name: SalePrice, dtype: float64
# One 30-bin histogram per target scale; each plot is followed by its own
# plt.show() call, in the same order as the (series, title) pairs below.
histogram_specs = (
    (y, "Original SalePrice Distribution"),
    (y_log, "Log Transformed SalePrice"),
)
for series, heading in histogram_specs:
    plt.hist(series.dropna(), bins=30)
    plt.title(heading)
    plt.show()

3.3 Dealing with Missingness
# Per-column NaN counts, largest first; the grand total is simply their sum.
missing_per_column = df.isna().sum().sort_values(ascending=False)
total_missing = missing_per_column.sum()
print("Total missing values:", total_missing)
## Total missing values: 6965
# Display up to 20 of the columns that have at least one missing value.
has_missing = missing_per_column > 0
missing_per_column[has_missing].head(20)
## PoolQC 1453
## MiscFeature 1406
## Alley 1369
## Fence 1179
## FireplaceQu 690
## LotFrontage 259
## GarageFinish 81
## GarageQual 81
## GarageYrBlt 81
## GarageType 81
## GarageCond 81
## BsmtExposure 38
## BsmtFinType2 38
## BsmtCond 37
## BsmtFinType1 37
## BsmtQual 37
## MasVnrArea 8
## MasVnrType 8
## Electrical 1
## dtype: int64
# Heatmap of the boolean NaN mask (rows x columns), drawn without a colour bar.
missing_mask = df.isna()
plt.figure(figsize=(14, 8))
heatmap_ax = sns.heatmap(missing_mask, cbar=False)
# sns.heatmap draws on the current Axes and returns it, so title it directly.
heatmap_ax.set_title("Missing Values Heatmap")
plt.show()

3.4 Feature Filtering
# Flag columns that carry no information. DataFrame.nunique() ignores NaN by
# default, so a count of 1 means a constant column and a count of 0 means an
# entirely-missing column; `<= 1` (rather than `== 1`) flags both.
distinct_counts = df.nunique()
zero_var = distinct_counts <= 1
zero_var_features = df.columns[zero_var]
print("Zero variance features:", list(zero_var_features))
## Zero variance features: []
# Near-zero-variance screen, applied to the numeric columns only.
numeric_df = df.select_dtypes(include=np.number)
selector = VarianceThreshold(threshold=0.01)
selector.fit(numeric_df)
filtered_numeric = selector.transform(numeric_df)

# get_support() is the boolean keep-mask; fetch it once and split on it.
keep_mask = selector.get_support()
selected_features = numeric_df.columns[keep_mask]
removed_features = numeric_df.columns[~keep_mask]

print("Near-zero variance features removed:")
## Near-zero variance features removed:
print(list(removed_features))
## []
3.5 Numeric Feature Engineering
# Rank the numeric columns by skewness, largest value first.
numeric_df = df.select_dtypes(include=np.number)
skewness = numeric_df.skew()
skewness = skewness.sort_values(ascending=False)
print("Top skewed features:")
## Top skewed features:
# First ten entries of the descending ranking.
print(skewness.iloc[:10])
## MiscVal 24.476794
## PoolArea 14.828374
## LotArea 12.207688
## 3SsnPorch 10.304342
## LowQualFinSF 9.011341
## KitchenAbvGr 4.488397
## BsmtFinSF2 4.255261
## ScreenPorch 4.122214
## BsmtHalfBath 4.103403
## EnclosedPorch 3.089872
## dtype: float64
Standardization
# Reduce skew before scaling: Yeo-Johnson power transform of every numeric
# column. standardize=False leaves centring/scaling to the StandardScaler
# below.
# NOTE(review): this step is reconstructed — `numeric_transformed` was used
# here without being defined anywhere in the script; confirm that Yeo-Johnson
# on all of `numeric_df` is the intended transform.
power = PowerTransformer(method="yeo-johnson", standardize=False)
numeric_transformed = pd.DataFrame(
    power.fit_transform(numeric_df),
    columns=numeric_df.columns,
    index=numeric_df.index,
)

# Centre each column to mean 0 and scale it to unit variance, keeping the
# column labels and the row index of the input frame.
scaler = StandardScaler()
numeric_standardized = pd.DataFrame(
    scaler.fit_transform(numeric_transformed),
    columns=numeric_transformed.columns,
    index=numeric_transformed.index,
)
numeric_standardized.head()
## Id MSSubClass LotFrontage ... MoSold YrSold SalePrice
## 0 -2.161263 0.493460 -0.146062 ... -1.694229 0.140472 0.571155
## 1 -2.149711 -1.164269 0.546852 ... -0.444891 -0.613646 0.227627
## 2 -2.139242 0.493460 -0.000631 ... 0.987490 0.140472 0.741869
## 3 -2.129513 0.698191 -0.397299 ... -1.694229 -1.369085 -0.425386
## 4 -2.120335 0.493460 0.718695 ... 1.963461 0.140472 1.015293
##
## [5 rows x 38 columns]
# Sanity check 1: column means should be (numerically) zero after scaling.
column_means = numeric_standardized.mean()
print("Means after standardization:")
## Means after standardization:
print(column_means.head())
## Id 0.000000e+00
## MSSubClass 3.285043e-17
## LotFrontage 3.285043e-17
## LotArea 1.216683e-18
## OverallQual -1.460019e-17
## dtype: float64
# Sanity check 2: column standard deviations. pandas .std() defaults to the
# sample estimate (ddof=1) while StandardScaler scales by the population
# estimate (ddof=0), so values come out as sqrt(n / (n - 1)), not exactly 1.
column_stds = numeric_standardized.std()
print("Standard deviation after standardization:")
## Standard deviation after standardization:
print(column_stds.head())
## Id 1.000343
## MSSubClass 1.000343
## LotFrontage 1.000343
## LotArea 1.000343
## OverallQual 1.000343
## dtype: float64