Setup

This is where we will add all our dependencies.

packages <- c("caret", "ggplot2", "corrplot", "dplyr", "VIM", "e1071")
installed <- packages %in% rownames(installed.packages())
if (any(!installed)) install.packages(packages[!installed])

library(caret)
library(ggplot2)
library(corrplot)
library(dplyr)
library(VIM)
library(e1071)
library(xgboost)
library(gridExtra)
library(patchwork)

Load and Inspect Data

The Housing data set contains many missing values and categorical variables. Our main task involves imputing or dropping missing values, encoding categoricals, and log transforming the target variable SalePrice since it is right skewed.

data_path <- "C:/Users/Owner/Desktop/Predictive Modeling/housing_model/data"

list.files(data_path)
## [1] "data_description.txt" "test.csv"             "test_clean.csv"      
## [4] "test_ids.csv"         "train.csv"            "train_clean.csv"
train <- read.csv(file.path(data_path, "train.csv"), stringsAsFactors = FALSE)
test  <- read.csv(file.path(data_path, "test.csv"), stringsAsFactors = FALSE)

cat("Train dimensions:", dim(train), "\n")
## Train dimensions: 1460 81
cat("Test dimensions:", dim(test), "\n")
## Test dimensions: 1459 80
# Save the target (Sales Price) and IDs before merging
train_ids <- train$Id
test_ids <- test$Id
sale_price <- train$SalePrice

#Remove ID and target from train set, ID from test
train$Id <- NULL
test$Id <- NULL
train$SalePrice <- NULL

Merging Train and Test

Merging allows us to ensure any factor levels or imputed medians are computed on the full data distribution, all train and test columns should match.

# Tag each set so we can come split them later
train$set <- "train"
test$set <- "test"
all_data <- bind_rows(train, test)

cat("Combined dimensions", dim(all_data), "\n")
## Combined dimensions 2919 80

Inspect Missing Values

We have some variables such as GarageType that have observations marked as NA. However, that just means there is no garage, not that there is a missing distinction. This means we have got to check the none_cols and zero_cols list against the data description file.

missing_counts <- sort(colSums(is.na(all_data)),decreasing = TRUE)

missing_pct <- missing_counts / nrow(all_data) * 100

missing_df <- data.frame(
  variable = names(missing_counts),
  count = missing_counts,
  percentage = round(missing_pct, 1)
)

cat("\nTop 40 variables with missing values:\n")
## 
## Top 40 variables with missing values:
print(head(missing_df[missing_df$count > 0, ], 40))
##                  variable count percentage
## PoolQC             PoolQC  2909       99.7
## MiscFeature   MiscFeature  2814       96.4
## Alley               Alley  2721       93.2
## Fence               Fence  2348       80.4
## FireplaceQu   FireplaceQu  1420       48.6
## LotFrontage   LotFrontage   486       16.6
## GarageYrBlt   GarageYrBlt   159        5.4
## GarageFinish GarageFinish   159        5.4
## GarageQual     GarageQual   159        5.4
## GarageCond     GarageCond   159        5.4
## GarageType     GarageType   157        5.4
## BsmtCond         BsmtCond    82        2.8
## BsmtExposure BsmtExposure    82        2.8
## BsmtQual         BsmtQual    81        2.8
## BsmtFinType2 BsmtFinType2    80        2.7
## BsmtFinType1 BsmtFinType1    79        2.7
## MasVnrType     MasVnrType    24        0.8
## MasVnrArea     MasVnrArea    23        0.8
## MSZoning         MSZoning     4        0.1
## Utilities       Utilities     2        0.1
## BsmtFullBath BsmtFullBath     2        0.1
## BsmtHalfBath BsmtHalfBath     2        0.1
## Functional     Functional     2        0.1
## Exterior1st   Exterior1st     1        0.0
## Exterior2nd   Exterior2nd     1        0.0
## BsmtFinSF1     BsmtFinSF1     1        0.0
## BsmtFinSF2     BsmtFinSF2     1        0.0
## BsmtUnfSF       BsmtUnfSF     1        0.0
## TotalBsmtSF   TotalBsmtSF     1        0.0
## Electrical     Electrical     1        0.0
## KitchenQual   KitchenQual     1        0.0
## GarageCars     GarageCars     1        0.0
## GarageArea     GarageArea     1        0.0
## SaleType         SaleType     1        0.0

So we can see that PoolQC and Misc Feature are missing in almost every row, this isn’t an error. That just means that 99% of those houses don’t have those items. The columns referring to Garage all have exactly 159 missing values, meaning that 159 houses don’t have garages at all. This is similar to the Basement group however there is a slight variation 80 vs 82 that suggests that a rating might have been forgotten.

Fix NAs that mean “None”

Now that we have inspected our None values, we can remove the true NA’s and save the “None” values.

none_cols <- c(
  "Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1",
  "BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish",
  "GarageQual", "GarageCond", "PoolQC", "Fence", "MiscFeature",
  "MasVnrType"
)
for (col in none_cols) {
  if (col %in% names(all_data)) {
    all_data[[col]][is.na(all_data[[col]])] <- "None"
  }
}
 
# Numeric "None" equivalents (e.g. no basement = 0 sq ft)
zero_cols <- c(
  "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
  "BsmtFullBath", "BsmtHalfBath", "GarageYrBlt", "GarageArea",
  "GarageCars", "MasVnrArea"
)
for (col in zero_cols) {
  if (col %in% names(all_data)) {
    all_data[[col]][is.na(all_data[[col]])] <- 0
  }
}

# Impute Random Categorical Data with the Mode
cat_cols <- names(all_data)[sapply(all_data, is.character)]
for (col in cat_cols) {
  if (any(is.na(all_data[[col]]))) {
    mode_val <- names(sort(table(all_data[[col]]), decreasing = TRUE))[1]
    all_data[[col]][is.na(all_data[[col]])] <- mode_val
  }
}
 
# Numerics: fill with median
num_cols <- names(all_data)[sapply(all_data, is.numeric)]
num_cols <- setdiff(num_cols, "set")
for (col in num_cols) {
  if (any(is.na(all_data[[col]]))) {
    all_data[[col]][is.na(all_data[[col]])] <- median(all_data[[col]], na.rm = TRUE)
  }
}
 
cat("\nMissing values remaining:", sum(is.na(all_data[, names(all_data) != "set"])), "\n")
## 
## Missing values remaining: 0

Feature Engineering

# Temp Data Frame for EDA
train_analysis <- all_data[all_data$set == "train", ]
train_analysis$SalePrice <- sale_price

# Identify strongest predictors
numeric_vars <- sapply(train_analysis, is.numeric)
cor_matrix <- cor(train_analysis[, numeric_vars], use = "pairwise.complete.obs")

price_correlations <- sort(cor_matrix[,"SalePrice"], decreasing = TRUE)
print(head(price_correlations, 15))
##    SalePrice  OverallQual    GrLivArea   GarageCars   GarageArea  TotalBsmtSF 
##    1.0000000    0.7909816    0.7086245    0.6404092    0.6234314    0.6135806 
##    X1stFlrSF     FullBath TotRmsAbvGrd    YearBuilt YearRemodAdd   MasVnrArea 
##    0.6058522    0.5606638    0.5337232    0.5228973    0.5071010    0.4726145 
##   Fireplaces   BsmtFinSF1  LotFrontage 
##    0.4669288    0.3864198    0.3345439
ggplot(train_analysis, aes(x = as.factor(OverallQual), y = SalePrice)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Sale Price vs Overall Quality")

ggplot(train_analysis, aes(x = reorder(Neighborhood, SalePrice, FUN = median), y = SalePrice)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "Sale Price by Neighborhood (Ordered by Median)")

grep("Porch", names(all_data), value = TRUE)
## [1] "OpenPorchSF"   "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"
# --------- Feature Engineering ----------

# Total square footage
all_data$TotalSF <- all_data$TotalBsmtSF + all_data$X1stFlrSF + all_data$X2ndFlrSF
 
# Total bathrooms
all_data$TotalBaths <- all_data$FullBath + (0.5 * all_data$HalfBath) +
                       all_data$BsmtFullBath + (0.5 * all_data$BsmtHalfBath)
 
# Age
all_data$HouseAge   <- all_data$YrSold - all_data$YearBuilt
all_data$RemodelAge <- all_data$YrSold - all_data$YearRemodAdd
 
# Has features flags
all_data$HasPool     <- as.integer(all_data$PoolArea > 0)
all_data$HasGarage   <- as.integer(all_data$GarageArea > 0)
all_data$HasFireplace<- as.integer(all_data$Fireplaces > 0)
all_data$HasBasement <- as.integer(all_data$TotalBsmtSF > 0)

# Try Squaring OverallQual for polynomial feature
all_data$OverallQual_Squared <- all_data$OverallQual^2

#TotalPorchSF
all_data$TotalPorchSF <- all_data$OpenPorchSF + all_data$EnclosedPorch +
                         all_data$X3SsnPorch + all_data$ScreenPorch

#TotalHomeQuality
all_data$TotalHomeQuality <- all_data$OverallQual + all_data$OverallCond

#GarageAge
all_data$GarageAge <- all_data$YrSold - all_data$GarageYrBlt
all_data$GarageAge[is.na(all_data$GarageAge)] <- 0

#IsRemodeled
all_data$IsRemodeled <- ifelse(all_data$YearBuilt == all_data$YearRemodAdd, 0, 1)

#TotalOutsideSF

all_data$TotalOutsideSF <- all_data$WoodDeckSF + all_data$OpenPorchSF +
                           all_data$EnclosedPorch + all_data$X3SsnPorch +
                           all_data$ScreenPorch + all_data$PoolArea
#Qual_x_SF
all_data$Qual_x_SF <- all_data$OverallQual * all_data$TotalSF
all_data$Qual_x_Age <- all_data$OverallQual * all_data$HouseAge

Encoding Oridinal Variables

Some categories have a specific rank that we need to map to numericals so our models can understand them.

quality_map <- c("None" = 0, "Po" = 1, "Fa" = 2, "TA" = 3, "Gd" = 4, "Ex" = 5)

ordinal_cols <- c(
  "ExterQual", "ExterCond", "BsmtQual", "BsmtCond",
  "HeatingQC", "KitchenQual", "FireplaceQu",
  "GarageQual", "GarageCond", "PoolQC"
)
for (col in ordinal_cols){
  if(col %in% names(all_data)) {
    all_data[[col]] <- as.integer(quality_map[all_data[[col]]])
    all_data[[col]][is.na(all_data[[col]])] <- 0 
  }
}

Factor Conversion

For the remaining categories that don’t have an order, we can convert them into factors instead of a random string of letters.

cat_cols_remaining <- names(all_data)[sapply(all_data, is.character)]
cat_cols_remaining <- setdiff(cat_cols_remaining, "set")
all_data[cat_cols_remaining] <- lapply(all_data[cat_cols_remaining], as.factor)

Remove NZV Predictors

nzv <- nearZeroVar(all_data[, names(all_data) != "set"], saveMetrics = TRUE)
nzv_cols <- rownames(nzv[nzv$nzv == TRUE, ])

# 2. Define the "Protected" columns - we can add more or remove as we train the model
protected_cols <- c("PoolQC", "PoolArea", "HasPool", "BsmtCond", "FireplaceQu", "HasBasement")

nzv_cols_to_remove <- setdiff(nzv_cols, protected_cols)

cat("\nActually removing:", paste(nzv_cols_to_remove, collapse = ", "), "\n")
## 
## Actually removing: Street, Alley, LandContour, Utilities, LandSlope, Condition2, RoofMatl, BsmtFinType2, BsmtFinSF2, Heating, LowQualFinSF, KitchenAbvGr, Functional, OpenPorchSF, EnclosedPorch, X3SsnPorch, ScreenPorch, MiscFeature, MiscVal
cat("Keeping (Protected):", paste(intersect(nzv_cols, protected_cols), collapse = ", "), "\n")
## Keeping (Protected): BsmtCond, PoolArea, PoolQC, HasPool, HasBasement
all_data <- all_data[, !(names(all_data) %in% nzv_cols_to_remove)]

Split Into Train/Test

train_clean <- all_data[all_data$set == "train", ]
test_clean  <- all_data[all_data$set == "test",  ]
 
train_clean$set <- NULL
test_clean$set  <- NULL
 
train_clean$SalePrice <- log(sale_price)
 
cat("\nFinal train dimensions:", dim(train_clean), "\n")
## 
## Final train dimensions: 1460 77
cat("Final test dimensions: ", dim(test_clean),  "\n")
## Final test dimensions:  1459 76
# Distribution of SalePrice
p1 <- ggplot(train_clean, aes(x = SalePrice)) +
  geom_histogram(bins = 50, fill = "#1D9E75", color = "white", alpha = 0.85) +
  labs(title = "Distribution of log(SalePrice)",
       x = "log(SalePrice)", y = "Count") +
  theme_minimal()
print(p1)

# Correlation matrix
num_train <- train_clean[, sapply(train_clean, is.numeric)]
cor_matrix <- cor(num_train, use = "pairwise.complete.obs")
top_vars <- names(sort(abs(cor_matrix[, "SalePrice"]), decreasing = TRUE))[1:15]
corrplot(cor_matrix[top_vars, top_vars],
         method = "color", type = "upper",
         tl.cex = 0.7, tl.col = "black",
         title = "Top 15 correlations with SalePrice",
         mar = c(0, 0, 2, 0))

# SalePrice vs TotalSF
p2 <- ggplot(train_clean, aes(x = TotalSF, y = SalePrice)) +
  geom_point(alpha = 0.4, color = "#185FA5") +
  geom_smooth(method = "lm", color = "#E85D24", se = FALSE) +
  labs(title = "log(SalePrice) vs Total Square Footage",
       x = "Total SF", y = "log(SalePrice)") +
  theme_minimal()
print(p2)

write.csv(train_clean, "train_clean.csv", row.names = FALSE)
write.csv(test_clean,  "test_clean.csv",  row.names = FALSE)
write.csv(data.frame(Id = test_ids), "test_ids.csv", row.names = FALSE)
 
cat("\nPreprocessing complete. Files saved:\n")
## 
## Preprocessing complete. Files saved:
cat("  train_clean.csv\n")
##   train_clean.csv
cat("  test_clean.csv\n")
##   test_clean.csv
cat("  test_ids.csv\n")
##   test_ids.csv

K-Fold Cross Validation (5-Fold)

library(caret)

set.seed(123)

ctrl <- trainControl(
  method = "cv",
  number = 5,
  verboseIter = TRUE,
  savePredictions = "final"
)

Model Building

print_results <- function(model, name) {
  rmse <- min(model$results$RMSE)
  r2   <- model$results$Rsquared[which.min(model$results$RMSE)]
  cat(sprintf("\n%-20s | CV RMSE: %.5f | R²: %.4f\n", name, rmse, r2))
}

Linear Regression

#Linear Regression (CV)

lm_cv <- train(
  SalePrice ~ .,
  data = train_clean,
  method = "lm",
  trControl = ctrl,
  metric = "RMSE"
)
## + Fold1: intercept=TRUE
## - Fold1: intercept=TRUE 
## + Fold2: intercept=TRUE
## - Fold2: intercept=TRUE 
## + Fold3: intercept=TRUE
## - Fold3: intercept=TRUE 
## + Fold4: intercept=TRUE
## - Fold4: intercept=TRUE 
## + Fold5: intercept=TRUE
## - Fold5: intercept=TRUE 
## Aggregating results
## Fitting final model on full training set
lm_cv
## Linear Regression 
## 
## 1460 samples
##   76 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1169, 1169, 1167, 1168, 1167 
## Resampling results:
## 
##   RMSE       Rsquared   MAE       
##   0.1386561  0.8813315  0.09088272
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Ridge Regression

# Ridge Regression (CV)

ridge_cv <- train(
  SalePrice ~ .,
  data = train_clean,
  method = "glmnet",
  trControl = ctrl,
  tuneGrid = expand.grid(
    alpha = 0,
    lambda = seq(0.0001, 1, length = 20)
  ),
  metric = "RMSE"
)
## + Fold1: alpha=0, lambda=1 
## - Fold1: alpha=0, lambda=1 
## + Fold2: alpha=0, lambda=1 
## - Fold2: alpha=0, lambda=1 
## + Fold3: alpha=0, lambda=1 
## - Fold3: alpha=0, lambda=1 
## + Fold4: alpha=0, lambda=1 
## - Fold4: alpha=0, lambda=1 
## + Fold5: alpha=0, lambda=1 
## - Fold5: alpha=0, lambda=1 
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0, lambda = 0.211 on full training set
ridge_cv
## glmnet 
## 
## 1460 samples
##   76 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1169, 1167, 1169, 1166, 1169 
## Resampling results across tuning parameters:
## 
##   lambda      RMSE       Rsquared   MAE       
##   0.00010000  0.1497821  0.8615818  0.09173744
##   0.05272632  0.1484390  0.8638806  0.09116622
##   0.10535263  0.1466683  0.8668366  0.09038469
##   0.15797895  0.1459137  0.8682679  0.09017397
##   0.21060526  0.1456376  0.8690547  0.09021962
##   0.26323158  0.1456433  0.8694857  0.09048455
##   0.31585789  0.1458374  0.8696949  0.09087689
##   0.36848421  0.1461660  0.8697538  0.09136630
##   0.42111053  0.1465942  0.8697036  0.09191673
##   0.47373684  0.1471039  0.8695797  0.09249693
##   0.52636316  0.1476758  0.8694008  0.09310544
##   0.57898947  0.1482929  0.8691785  0.09375381
##   0.63161579  0.1489496  0.8689251  0.09443883
##   0.68424211  0.1496383  0.8686495  0.09515841
##   0.73686842  0.1503549  0.8683570  0.09587747
##   0.78949474  0.1510974  0.8680495  0.09662074
##   0.84212105  0.1518623  0.8677312  0.09736930
##   0.89474737  0.1526387  0.8674115  0.09812138
##   0.94737368  0.1534347  0.8670835  0.09888043
##   1.00000000  0.1542448  0.8667520  0.09963692
## 
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 0.2106053.
plot(ridge_cv)

print_results(ridge_cv, "Ridge")
## 
## Ridge                | CV RMSE: 0.14564 | R²: 0.8691

Lasso Regression

## Lasso Regression (CV)

lasso_cv <- train(
  SalePrice ~ .,
  data = train_clean,
  method = "glmnet",
  trControl = ctrl,
  tuneGrid = expand.grid(
    alpha = 1,
    lambda = seq(0.0001, 1, length = 20)
  ),
  metric = "RMSE"
)
## + Fold1: alpha=1, lambda=1 
## - Fold1: alpha=1, lambda=1 
## + Fold2: alpha=1, lambda=1 
## - Fold2: alpha=1, lambda=1 
## + Fold3: alpha=1, lambda=1 
## - Fold3: alpha=1, lambda=1 
## + Fold4: alpha=1, lambda=1 
## - Fold4: alpha=1, lambda=1 
## + Fold5: alpha=1, lambda=1 
## - Fold5: alpha=1, lambda=1
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 1, lambda = 1e-04 on full training set
lasso_cv
## glmnet 
## 
## 1460 samples
##   76 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1167, 1169, 1169, 1168, 1167 
## Resampling results across tuning parameters:
## 
##   lambda      RMSE       Rsquared   MAE       
##   0.00010000  0.1473712  0.8645838  0.09193695
##   0.05272632  0.1761562  0.8332837  0.11849910
##   0.10535263  0.2183456  0.7876140  0.15534116
##   0.15797895  0.2617639  0.7423225  0.19423700
##   0.21060526  0.3019468  0.7136306  0.22990522
##   0.26323158  0.3426594  0.7065192  0.26431969
##   0.31585789  0.3866483  0.6913196  0.29994059
##   0.36848421  0.3991334        NaN  0.30981966
##   0.42111053  0.3991334        NaN  0.30981966
##   0.47373684  0.3991334        NaN  0.30981966
##   0.52636316  0.3991334        NaN  0.30981966
##   0.57898947  0.3991334        NaN  0.30981966
##   0.63161579  0.3991334        NaN  0.30981966
##   0.68424211  0.3991334        NaN  0.30981966
##   0.73686842  0.3991334        NaN  0.30981966
##   0.78949474  0.3991334        NaN  0.30981966
##   0.84212105  0.3991334        NaN  0.30981966
##   0.89474737  0.3991334        NaN  0.30981966
##   0.94737368  0.3991334        NaN  0.30981966
##   1.00000000  0.3991334        NaN  0.30981966
## 
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 1e-04.
plot(lasso_cv)

print_results(lasso_cv, "Lasso")
## 
## Lasso                | CV RMSE: 0.14737 | R²: 0.8646

Random Forest

(Target is “SalePrice, which is continuous.)

## Random Forest (CV)

rf_cv <- train(
  SalePrice ~ .,
  data = train_clean,
  method = "rf",
  trControl = ctrl,
  metric = "RMSE",
  ntree = 200
)
## + Fold1: mtry=  2 
## - Fold1: mtry=  2 
## + Fold1: mtry= 97 
## - Fold1: mtry= 97 
## + Fold1: mtry=193 
## - Fold1: mtry=193 
## + Fold2: mtry=  2 
## - Fold2: mtry=  2 
## + Fold2: mtry= 97 
## - Fold2: mtry= 97 
## + Fold2: mtry=193 
## - Fold2: mtry=193 
## + Fold3: mtry=  2 
## - Fold3: mtry=  2 
## + Fold3: mtry= 97 
## - Fold3: mtry= 97 
## + Fold3: mtry=193 
## - Fold3: mtry=193 
## + Fold4: mtry=  2 
## - Fold4: mtry=  2 
## + Fold4: mtry= 97 
## - Fold4: mtry= 97 
## + Fold4: mtry=193 
## - Fold4: mtry=193 
## + Fold5: mtry=  2 
## - Fold5: mtry=  2 
## + Fold5: mtry= 97 
## - Fold5: mtry= 97 
## + Fold5: mtry=193 
## - Fold5: mtry=193 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 97 on full training set
rf_cv
## Random Forest 
## 
## 1460 samples
##   76 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1169, 1168, 1169, 1166, 1168 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared   MAE       
##     2   0.1871029  0.8464283  0.12708234
##    97   0.1401785  0.8760907  0.09253003
##   193   0.1431676  0.8704771  0.09552178
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 97.
plot(rf_cv)

print_results(rf_cv, "Random Forest")
## 
## Random Forest        | CV RMSE: 0.14018 | R²: 0.8761

Gradient Boosting

# Gradient Boost
set.seed(42)
gbm_grid <- expand.grid(
  n.trees           = c(500, 1000),
  interaction.depth = c(3, 5),
  shrinkage         = 0.05,
  n.minobsinnode    = 10
)
gbm_model <- train(
  SalePrice ~ .,
  data = train_clean,
  method    = "gbm",
  trControl = ctrl,
  tuneGrid  = gbm_grid,
  verbose   = FALSE
)
## + Fold1: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000
## - Fold1: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000 
## + Fold1: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000
## - Fold1: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000 
## + Fold2: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000 
## - Fold2: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000 
## + Fold2: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000 
## - Fold2: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000 
## + Fold3: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000
## - Fold3: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000 
## + Fold3: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000
## - Fold3: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000 
## + Fold4: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000
## - Fold4: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000 
## + Fold4: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000
## - Fold4: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000 
## + Fold5: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000
## - Fold5: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000 
## + Fold5: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000
## - Fold5: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000 
## Aggregating results
## Selecting tuning parameters
## Fitting n.trees = 1000, interaction.depth = 5, shrinkage = 0.05, n.minobsinnode = 10 on full training set
print_results(gbm_model, "Gradient Boosting")
## 
## Gradient Boosting    | CV RMSE: 0.12104 | R²: 0.9089
cat("Best tune:\n"); print(gbm_model$bestTune)
## Best tune:
##   n.trees interaction.depth shrinkage n.minobsinnode
## 4    1000                 5      0.05             10

Evaluation

Evaluate based off of RMSE. Because our data is continuous, we cannot fit an ROC curve.

Feature Importance (FI)

library(caret)

par(mfrow = c(1,2))

imp <- varImp(rf_cv)
imp_df <- as.data.frame(imp$importance)

# keep row names as feature names
imp_df$Feature <- rownames(imp_df)

# sort top 15
top_imp <- imp_df[order(imp_df$Overall, decreasing = TRUE), ]
top_imp <- head(top_imp, 15)

par(mar = c(5, 10, 4, 2))
barplot(top_imp$Overall,
        names.arg = top_imp$Feature,
        horiz = TRUE,
        las = 1,
        cex.names = 0.8,
        xlab = "Importance",
        main = "RF - Top 15 FI")

library(gbm)
gbm_imp         <- varImp(gbm_model)$importance
gbm_imp$Feature <- rownames(gbm_imp)
gbm_top         <- gbm_imp[order(gbm_imp$Overall, decreasing = TRUE), ]
gbm_top         <- head(gbm_top, 15)

par(mar = c(5, 10, 4, 2))
barplot(gbm_top$Overall,
        names.arg = gbm_top$Feature,
        horiz     = TRUE,
        las       = 1,
        cex.names = 0.8,
        xlab      = "Importance",
        main      = "GBM - Top 15 FI")

Note :

• Qual_x_SF (100) is the most important for prediction the “SalePrice” (Our Traget) in both the RF and GDM Models.

RMSE Comparison

cat("\n============ MODEL COMPARISON ============\n")
## 
## ============ MODEL COMPARISON ============
results <- resamples(list(
  Ridge  = ridge_cv,
  Lasso  = lasso_cv,
  RF = rf_cv,
  Linear = lm_cv,
  GBM = gbm_model
))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Ridge, Lasso, RF, Linear, GBM 
## Number of resamples: 5 
## 
## MAE 
##              Min.    1st Qu.     Median       Mean    3rd Qu.       Max. NA's
## Ridge  0.08768168 0.08806005 0.08898255 0.09021962 0.09264194 0.09373191    0
## Lasso  0.08202779 0.08642971 0.08685955 0.09193695 0.09300864 0.11135904    0
## RF     0.08604880 0.08804656 0.09354529 0.09253003 0.09468822 0.10032127    0
## Linear 0.08710717 0.08837334 0.09049914 0.09088272 0.09189125 0.09654269    0
## GBM    0.07402854 0.08105690 0.08115421 0.08157934 0.08535717 0.08629987    0
## 
## RMSE 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## Ridge  0.1277099 0.1342214 0.1376598 0.1456376 0.1547423 0.1738546    0
## Lasso  0.1201136 0.1253025 0.1428855 0.1473712 0.1594107 0.1891438    0
## RF     0.1205043 0.1241178 0.1415921 0.1401785 0.1438907 0.1707875    0
## Linear 0.1202906 0.1353803 0.1389952 0.1386561 0.1466466 0.1519676    0
## GBM    0.1078427 0.1169457 0.1202677 0.1210388 0.1242888 0.1358491    0
## 
## Rsquared 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## Ridge  0.8013240 0.8516059 0.8823707 0.8690547 0.9015780 0.9083949    0
## Lasso  0.7971306 0.8435090 0.8794070 0.8645838 0.9012352 0.9016374    0
## RF     0.8094766 0.8629716 0.8904951 0.8760907 0.9079069 0.9096032    0
## Linear 0.8517515 0.8808915 0.8811216 0.8813315 0.8840826 0.9088103    0
## GBM    0.8997141 0.9028349 0.9042397 0.9088507 0.9087651 0.9286996    0
# Plot comparison
comparison_plot <- dotplot(results, metric = "RMSE",
                           main = "5-Fold CV RMSE Comparison")
print(comparison_plot)

Final Ranking

model_summary <- data.frame(
  Model = c("Ridge", "Lasso", "Random Forest", "Linear Regression", "GBM"),
  CV_RMSE = c(
    min(ridge_cv$results$RMSE),
    min(lasso_cv$results$RMSE),
    min(rf_cv$results$RMSE),
    min(lm_cv$results$RMSE),
    min(gbm_model$results$RMSE)
  )
) %>% arrange(CV_RMSE)
model_summary$CV_RMSE <- round(model_summary$CV_RMSE, 5)
cat("\nFinal ranking:\n")
## 
## Final ranking:
print(model_summary)
##               Model CV_RMSE
## 1               GBM 0.12104
## 2 Linear Regression 0.13866
## 3     Random Forest 0.14018
## 4             Ridge 0.14564
## 5             Lasso 0.14737

Predicted vs Actual

par(mfrow = c(2,3))
pred_train <- predict(gbm_model, newdata = train_clean)
plot(train_clean$SalePrice, pred_train,
     xlab = "Actual log(SalePrice)",
     ylab = "Predicted log(SalePrice)",
     main = "Predicted vs Actual (GBM)",
     pch  = 16, col = adjustcolor("steelblue", alpha = 0.4))
abline(0, 1, col = "red", lwd = 2)

pred_train <- predict(rf_cv, newdata = train_clean)
plot(train_clean$SalePrice, pred_train,
     xlab = "Actual log(SalePrice)",
     ylab = "Predicted log(SalePrice)",
     main = "Predicted vs Actual (RF)",
     pch  = 16, col = adjustcolor("steelblue", alpha = 0.4))
abline(0, 1, col = "red", lwd = 2)

pred_train <- predict(lasso_cv, newdata = train_clean)
plot(train_clean$SalePrice, pred_train,
     xlab = "Actual log(SalePrice)",
     ylab = "Predicted log(SalePrice)",
     main = "Predicted vs Actual (Lasso)",
     pch  = 16, col = adjustcolor("steelblue", alpha = 0.4))
abline(0, 1, col = "red", lwd = 2)

pred_train <- predict(ridge_cv, newdata = train_clean)
plot(train_clean$SalePrice, pred_train,
     xlab = "Actual log(SalePrice)",
     ylab = "Predicted log(SalePrice)",
     main = "Predicted vs Actual (Ridge)",
     pch  = 16, col = adjustcolor("steelblue", alpha = 0.4))
abline(0, 1, col = "red", lwd = 2)

pred_train <- predict(lm_cv, newdata = train_clean)
plot(train_clean$SalePrice, pred_train,
     xlab = "Actual log(SalePrice)",
     ylab = "Predicted log(SalePrice)",
     main = "Predicted vs Actual (LR)",
     pch  = 16, col = adjustcolor("steelblue", alpha = 0.4))
abline(0, 1, col = "red", lwd = 2)

### Residuals

residuals <- train_clean$SalePrice - pred_train
p2 <- plot(pred_train, residuals,
     xlab = "Fitted Values",
     ylab = "Residuals",
     main = "Residual Plot (GBM)",
     pch  = 16, col = adjustcolor("steelblue", alpha = 0.4))
abline(h = 0, col = "red", lwd = 2)

Kaggle Submission

## Predict Test Set (GBM)
final_pred <- predict(gbm_model, newdata = test_clean)
final_pred[is.na(final_pred)] <- median(final_pred, na.rm = TRUE)

# Convert back from log scale
submission <- data.frame(
  Id        = test_ids,
  SalePrice = exp(final_pred)
)

nrow(submission)
## [1] 1459
sum(is.na(submission$SalePrice))
## [1] 0
head(submission)
##     Id SalePrice
## 1 1461  121672.5
## 2 1462  165933.5
## 3 1463  176875.8
## 4 1464  197317.8
## 5 1465  178172.7
## 6 1466  174780.7
write.csv(submission, "submission.csv", row.names = FALSE)