3.1 Prerequisites

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_openml
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import PowerTransformer, StandardScaler
# Fetch the OpenML dataset named "house_prices"; as_frame=True makes the
# returned bunch expose the full table as a pandas DataFrame under "frame".
ames = fetch_openml(name="house_prices", as_frame=True)

# Take a copy so that later edits to `df` leave the frame held by `ames` untouched.
df = ames["frame"].copy()

print("Dataset shape:", df.shape)
## Dataset shape: (1460, 81)
df.head(n=5)
##    Id  MSSubClass MSZoning  ...  SaleType  SaleCondition SalePrice
## 0   1          60       RL  ...        WD         Normal    208500
## 1   2          20       RL  ...        WD         Normal    181500
## 2   3          60       RL  ...        WD         Normal    223500
## 3   4          70       RL  ...        WD        Abnorml    140000
## 4   5          60       RL  ...        WD         Normal    250000
## 
## [5 rows x 81 columns]

3.2 Target Engineering

# Target as a numeric Series; errors="coerce" turns unparseable entries into
# NaN instead of raising.
y = pd.to_numeric(df.loc[:, "SalePrice"], errors="coerce")

target_summary = y.describe()
print(target_summary)
## count      1460.000000
## mean     180921.195890
## std       79442.502883
## min       34900.000000
## 25%      129975.000000
## 50%      163000.000000
## 75%      214000.000000
## max      755000.000000
## Name: SalePrice, dtype: float64
# log1p computes log(1 + y); np.expm1 is its inverse when mapping values
# back to the original price scale.
y_log = np.log1p(y)

log_target_summary = y_log.describe()
print(log_target_summary)
## count    1460.000000
## mean       12.024057
## std         0.399449
## min        10.460271
## 25%        11.775105
## 50%        12.001512
## 75%        12.273736
## max        13.534474
## Name: SalePrice, dtype: float64
# Draw one 30-bin histogram per version of the target (raw, then log1p),
# showing each figure before starting the next. NaNs are dropped first.
for target_values, plot_title in (
    (y, "Original SalePrice Distribution"),
    (y_log, "Log Transformed SalePrice"),
):
    plt.hist(target_values.dropna(), bins=30)
    plt.title(plot_title)
    plt.show()


3.3 Dealing with Missingness

# Per-column NaN counts, computed once and reused for both summaries below.
na_counts = df.isna().sum()

# Grand total of missing cells across the whole frame.
total_missing = na_counts.sum()

print("Total missing values:", total_missing)
## Total missing values: 6965
# Columns ranked from most to least missing.
missing_per_column = na_counts.sort_values(ascending=False)

# Show only columns that actually have gaps (at most the top 20).
missing_per_column.loc[missing_per_column.gt(0)].head(20)
## PoolQC          1453
## MiscFeature     1406
## Alley           1369
## Fence           1179
## FireplaceQu      690
## LotFrontage      259
## GarageFinish      81
## GarageQual        81
## GarageYrBlt       81
## GarageType        81
## GarageCond        81
## BsmtExposure      38
## BsmtFinType2      38
## BsmtCond          37
## BsmtFinType1      37
## BsmtQual          37
## MasVnrArea         8
## MasVnrType         8
## Electrical         1
## dtype: int64
# Heatmap of the boolean missingness mask of `df` (rows x columns),
# drawn without a colour bar.
plt.figure(figsize=(14, 8))
missing_ax = sns.heatmap(df.isna(), cbar=False)
missing_ax.set_title("Missing Values Heatmap")
plt.show()


Median Imputation

# Impute on a copy so the raw `df` keeps its NaNs for the later sections.
df_median = df.copy()

# Split columns by dtype: numeric columns get the median, everything else
# gets the most frequent value.
num_cols = df_median.select_dtypes(include=[np.number]).columns
cat_cols = df_median.select_dtypes(exclude=[np.number]).columns

# NOTE(review): several heavily-missing columns (PoolQC, Alley, Fence, ...)
# may use NA to mean "feature absent" - confirm against the data dictionary
# before relying on mode-filling for them.
for column_group, fill_strategy in (
    (num_cols, "median"),
    (cat_cols, "most_frequent"),
):
    imputer = SimpleImputer(strategy=fill_strategy)
    df_median[column_group] = imputer.fit_transform(df_median[column_group])

remaining_missing = df_median.isna().sum().sum()
print("Remaining missing values:", remaining_missing)
## Remaining missing values: 0

3.4 Feature Filtering

# A column is treated as zero-variance when it holds exactly one distinct
# value. nunique() ignores NaN by default, so "one value plus gaps" is
# flagged too, while an all-NaN column (0 distinct values) is not.
distinct_counts = df.nunique()
zero_var = distinct_counts.eq(1)

zero_var_features = df.columns[zero_var]

print("Zero variance features:", zero_var_features.tolist())
## Zero variance features: []
# Restrict the variance filter to numeric columns.
numeric_df = df.select_dtypes(include=[np.number])

# The 0.01 threshold is compared against the variance of the raw, unscaled
# columns, so which features count as "near-zero variance" depends on each
# column's units.
selector = VarianceThreshold(threshold=0.01).fit(numeric_df)

filtered_numeric = selector.transform(numeric_df)

# Boolean mask over numeric_df.columns: True where the column was kept.
kept_mask = selector.get_support()

selected_features = numeric_df.columns[kept_mask]

removed_features = numeric_df.columns[~kept_mask]

print("Near-zero variance features removed:")
## Near-zero variance features removed:
print(removed_features.tolist())
## []

3.5 Numeric Feature Engineering

# Numeric columns of the raw frame (NaNs still present at this point).
numeric_df = df.select_dtypes(include=[np.number])

# Per-column sample skewness, ordered from most right-skewed downwards.
raw_skew = numeric_df.skew()
skewness = raw_skew.sort_values(ascending=False)

print("Top skewed features:")
## Top skewed features:
print(skewness.iloc[:10])
## MiscVal          24.476794
## PoolArea         14.828374
## LotArea          12.207688
## 3SsnPorch        10.304342
## LowQualFinSF      9.011341
## KitchenAbvGr      4.488397
## BsmtFinSF2        4.255261
## ScreenPorch       4.122214
## BsmtHalfBath      4.103403
## EnclosedPorch     3.089872
## dtype: float64

Yeo-Johnson Transformation

# PowerTransformer cannot fit on NaN-bearing columns the way we want here,
# so fill each numeric column's gaps with that column's median first.
numeric_filled = numeric_df.fillna(numeric_df.median())

# Yeo-Johnson handles zero and negative values (unlike Box-Cox). With the
# default standardize=True the output is also centred and scaled to unit
# variance, so no separate scaling is needed for these columns.
# NOTE(review): in the recorded output YrSold comes out at ~1e-15, which
# suggests the fitted transform is numerically degenerate for that column -
# presumably because of its large values and narrow range; verify.
pt = PowerTransformer(method='yeo-johnson')

# fit_transform returns a bare ndarray; restore both the column labels and
# the original row index so the result stays aligned with `df` and the
# target even if the frame does not carry a default RangeIndex.
numeric_transformed = pd.DataFrame(
    pt.fit_transform(numeric_filled),
    columns=numeric_df.columns,
    index=numeric_df.index,
)

numeric_transformed.head()
##          Id  MSSubClass  LotFrontage  ...    MoSold        YrSold  SalePrice
## 0 -2.161263    0.493460    -0.146062  ... -1.694229  5.939693e-15   0.571155
## 1 -2.149711   -1.164269     0.546852  ... -0.444891 -2.575717e-14   0.227627
## 2 -2.139242    0.493460    -0.000631  ...  0.987490  5.939693e-15   0.741869
## 3 -2.129513    0.698191    -0.397299  ... -1.694229 -5.750955e-14  -0.425386
## 4 -2.120335    0.493460     0.718695  ...  1.963461  5.939693e-15   1.015293
## 
## [5 rows x 38 columns]

Standardization

# Centre each column to mean 0 and scale to unit (population) variance.
# PowerTransformer's default standardize=True has already done this for
# numeric_transformed, so in the recorded output most columns are unchanged
# by this step; only YrSold is visibly rescaled.
scaler = StandardScaler()

# fit_transform returns a bare ndarray; carry over the column labels and
# the row index of the input so the standardized frame stays row-aligned
# with the frames it was derived from.
numeric_standardized = pd.DataFrame(
    scaler.fit_transform(numeric_transformed),
    columns=numeric_transformed.columns,
    index=numeric_transformed.index,
)

numeric_standardized.head()
##          Id  MSSubClass  LotFrontage  ...    MoSold    YrSold  SalePrice
## 0 -2.161263    0.493460    -0.146062  ... -1.694229  0.140472   0.571155
## 1 -2.149711   -1.164269     0.546852  ... -0.444891 -0.613646   0.227627
## 2 -2.139242    0.493460    -0.000631  ...  0.987490  0.140472   0.741869
## 3 -2.129513    0.698191    -0.397299  ... -1.694229 -1.369085  -0.425386
## 4 -2.120335    0.493460     0.718695  ...  1.963461  0.140472   1.015293
## 
## [5 rows x 38 columns]
# Sanity check: column means should be ~0 up to floating-point noise.
column_means = numeric_standardized.mean()
print("Means after standardization:")
## Means after standardization:
print(column_means.head())
## Id             0.000000e+00
## MSSubClass     3.285043e-17
## LotFrontage    3.285043e-17
## LotArea        1.216683e-18
## OverallQual   -1.460019e-17
## dtype: float64
# pandas' std() defaults to the sample estimator (ddof=1) whereas
# StandardScaler scales by the population std (ddof=0). With n = 1460 rows
# the printed value is therefore sqrt(n / (n - 1)) ~= 1.000343, not exactly 1.
column_stds = numeric_standardized.std()
print("Standard deviation after standardization:")
## Standard deviation after standardization:
print(column_stds.head())
## Id             1.000343
## MSSubClass     1.000343
## LotFrontage    1.000343
## LotArea        1.000343
## OverallQual    1.000343
## dtype: float64