This is where we will add all our dependencies.
packages <- c("caret", "ggplot2", "corrplot", "dplyr", "VIM", "e1071")
installed <- packages %in% rownames(installed.packages())
if (any(!installed)) install.packages(packages[!installed])
library(caret)
library(ggplot2)
library(corrplot)
library(dplyr)
library(VIM)
library(e1071)
library(xgboost)
library(gridExtra)
library(patchwork)
The Housing data set contains many missing values and categorical variables. Our main task involves imputing or dropping missing values, encoding categoricals, and log transforming the target variable SalePrice since it is right skewed.
data_path <- "C:/Users/Owner/Desktop/Predictive Modeling/housing_model/data"
list.files(data_path)
## [1] "data_description.txt" "test.csv" "test_clean.csv"
## [4] "test_ids.csv" "train.csv" "train_clean.csv"
train <- read.csv(file.path(data_path, "train.csv"), stringsAsFactors = FALSE)
test <- read.csv(file.path(data_path, "test.csv"), stringsAsFactors = FALSE)
cat("Train dimensions:", dim(train), "\n")
## Train dimensions: 1460 81
cat("Test dimensions:", dim(test), "\n")
## Test dimensions: 1459 80
# Save the target (Sales Price) and IDs before merging
train_ids <- train$Id
test_ids <- test$Id
sale_price <- train$SalePrice
#Remove ID and target from train set, ID from test
train$Id <- NULL
test$Id <- NULL
train$SalePrice <- NULL
Merging allows us to ensure any factor levels or imputed medians are computed on the full data distribution, all train and test columns should match.
# Tag each set so we can come split them later
train$set <- "train"
test$set <- "test"
all_data <- bind_rows(train, test)
cat("Combined dimensions", dim(all_data), "\n")
## Combined dimensions 2919 80
We have some variables such as GarageType that have observations marked as NA. However, that just means there is no garage, not that there is a missing distinction. This means we have got to check the none_cols and zero_cols list against the data description file.
missing_counts <- sort(colSums(is.na(all_data)),decreasing = TRUE)
missing_pct <- missing_counts / nrow(all_data) * 100
missing_df <- data.frame(
variable = names(missing_counts),
count = missing_counts,
percentage = round(missing_pct, 1)
)
cat("\nTop 40 variables with missing values:\n")
##
## Top 40 variables with missing values:
print(head(missing_df[missing_df$count > 0, ], 40))
## variable count percentage
## PoolQC PoolQC 2909 99.7
## MiscFeature MiscFeature 2814 96.4
## Alley Alley 2721 93.2
## Fence Fence 2348 80.4
## FireplaceQu FireplaceQu 1420 48.6
## LotFrontage LotFrontage 486 16.6
## GarageYrBlt GarageYrBlt 159 5.4
## GarageFinish GarageFinish 159 5.4
## GarageQual GarageQual 159 5.4
## GarageCond GarageCond 159 5.4
## GarageType GarageType 157 5.4
## BsmtCond BsmtCond 82 2.8
## BsmtExposure BsmtExposure 82 2.8
## BsmtQual BsmtQual 81 2.8
## BsmtFinType2 BsmtFinType2 80 2.7
## BsmtFinType1 BsmtFinType1 79 2.7
## MasVnrType MasVnrType 24 0.8
## MasVnrArea MasVnrArea 23 0.8
## MSZoning MSZoning 4 0.1
## Utilities Utilities 2 0.1
## BsmtFullBath BsmtFullBath 2 0.1
## BsmtHalfBath BsmtHalfBath 2 0.1
## Functional Functional 2 0.1
## Exterior1st Exterior1st 1 0.0
## Exterior2nd Exterior2nd 1 0.0
## BsmtFinSF1 BsmtFinSF1 1 0.0
## BsmtFinSF2 BsmtFinSF2 1 0.0
## BsmtUnfSF BsmtUnfSF 1 0.0
## TotalBsmtSF TotalBsmtSF 1 0.0
## Electrical Electrical 1 0.0
## KitchenQual KitchenQual 1 0.0
## GarageCars GarageCars 1 0.0
## GarageArea GarageArea 1 0.0
## SaleType SaleType 1 0.0
So we can see that PoolQC and Misc Feature are missing in almost every row, this isn’t an error. That just means that 99% of those houses don’t have those items. The columns referring to Garage all have exactly 159 missing values, meaning that 159 houses don’t have garages at all. This is similar to the Basement group however there is a slight variation 80 vs 82 that suggests that a rating might have been forgotten.
Now that we have inspected our None values, we can remove the true NA’s and save the “None” values.
none_cols <- c(
"Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1",
"BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish",
"GarageQual", "GarageCond", "PoolQC", "Fence", "MiscFeature",
"MasVnrType"
)
for (col in none_cols) {
if (col %in% names(all_data)) {
all_data[[col]][is.na(all_data[[col]])] <- "None"
}
}
# Numeric "None" equivalents (e.g. no basement = 0 sq ft)
zero_cols <- c(
"BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
"BsmtFullBath", "BsmtHalfBath", "GarageYrBlt", "GarageArea",
"GarageCars", "MasVnrArea"
)
for (col in zero_cols) {
if (col %in% names(all_data)) {
all_data[[col]][is.na(all_data[[col]])] <- 0
}
}
# Impute Random Categorical Data with the Mode
cat_cols <- names(all_data)[sapply(all_data, is.character)]
for (col in cat_cols) {
if (any(is.na(all_data[[col]]))) {
mode_val <- names(sort(table(all_data[[col]]), decreasing = TRUE))[1]
all_data[[col]][is.na(all_data[[col]])] <- mode_val
}
}
# Numerics: fill with median
num_cols <- names(all_data)[sapply(all_data, is.numeric)]
num_cols <- setdiff(num_cols, "set")
for (col in num_cols) {
if (any(is.na(all_data[[col]]))) {
all_data[[col]][is.na(all_data[[col]])] <- median(all_data[[col]], na.rm = TRUE)
}
}
cat("\nMissing values remaining:", sum(is.na(all_data[, names(all_data) != "set"])), "\n")
##
## Missing values remaining: 0
# Temp Data Frame for EDA
train_analysis <- all_data[all_data$set == "train", ]
train_analysis$SalePrice <- sale_price
# Identify strongest predictors
numeric_vars <- sapply(train_analysis, is.numeric)
cor_matrix <- cor(train_analysis[, numeric_vars], use = "pairwise.complete.obs")
price_correlations <- sort(cor_matrix[,"SalePrice"], decreasing = TRUE)
print(head(price_correlations, 15))
## SalePrice OverallQual GrLivArea GarageCars GarageArea TotalBsmtSF
## 1.0000000 0.7909816 0.7086245 0.6404092 0.6234314 0.6135806
## X1stFlrSF FullBath TotRmsAbvGrd YearBuilt YearRemodAdd MasVnrArea
## 0.6058522 0.5606638 0.5337232 0.5228973 0.5071010 0.4726145
## Fireplaces BsmtFinSF1 LotFrontage
## 0.4669288 0.3864198 0.3345439
ggplot(train_analysis, aes(x = as.factor(OverallQual), y = SalePrice)) +
geom_boxplot(fill = "steelblue") +
labs(title = "Sale Price vs Overall Quality")
ggplot(train_analysis, aes(x = reorder(Neighborhood, SalePrice, FUN = median), y = SalePrice)) +
geom_boxplot() +
coord_flip() +
labs(title = "Sale Price by Neighborhood (Ordered by Median)")
grep("Porch", names(all_data), value = TRUE)
## [1] "OpenPorchSF" "EnclosedPorch" "X3SsnPorch" "ScreenPorch"
# --------- Feature Engineering ----------
# Total square footage
all_data$TotalSF <- all_data$TotalBsmtSF + all_data$X1stFlrSF + all_data$X2ndFlrSF
# Total bathrooms
all_data$TotalBaths <- all_data$FullBath + (0.5 * all_data$HalfBath) +
all_data$BsmtFullBath + (0.5 * all_data$BsmtHalfBath)
# Age
all_data$HouseAge <- all_data$YrSold - all_data$YearBuilt
all_data$RemodelAge <- all_data$YrSold - all_data$YearRemodAdd
# Has features flags
all_data$HasPool <- as.integer(all_data$PoolArea > 0)
all_data$HasGarage <- as.integer(all_data$GarageArea > 0)
all_data$HasFireplace<- as.integer(all_data$Fireplaces > 0)
all_data$HasBasement <- as.integer(all_data$TotalBsmtSF > 0)
# Try Squaring OverallQual for polynomial feature
all_data$OverallQual_Squared <- all_data$OverallQual^2
#TotalPorchSF
all_data$TotalPorchSF <- all_data$OpenPorchSF + all_data$EnclosedPorch +
all_data$X3SsnPorch + all_data$ScreenPorch
#TotalHomeQuality
all_data$TotalHomeQuality <- all_data$OverallQual + all_data$OverallCond
#GarageAge
all_data$GarageAge <- all_data$YrSold - all_data$GarageYrBlt
all_data$GarageAge[is.na(all_data$GarageAge)] <- 0
#IsRemodeled
all_data$IsRemodeled <- ifelse(all_data$YearBuilt == all_data$YearRemodAdd, 0, 1)
#TotalOutsideSF
all_data$TotalOutsideSF <- all_data$WoodDeckSF + all_data$OpenPorchSF +
all_data$EnclosedPorch + all_data$X3SsnPorch +
all_data$ScreenPorch + all_data$PoolArea
#Qual_x_SF
all_data$Qual_x_SF <- all_data$OverallQual * all_data$TotalSF
all_data$Qual_x_Age <- all_data$OverallQual * all_data$HouseAge
Some categories have a specific rank that we need to map to numericals so our models can understand them.
quality_map <- c("None" = 0, "Po" = 1, "Fa" = 2, "TA" = 3, "Gd" = 4, "Ex" = 5)
ordinal_cols <- c(
"ExterQual", "ExterCond", "BsmtQual", "BsmtCond",
"HeatingQC", "KitchenQual", "FireplaceQu",
"GarageQual", "GarageCond", "PoolQC"
)
for (col in ordinal_cols){
if(col %in% names(all_data)) {
all_data[[col]] <- as.integer(quality_map[all_data[[col]]])
all_data[[col]][is.na(all_data[[col]])] <- 0
}
}
For the remaining categories that don’t have an order, we can convert them into factors instead of a random string of letters.
cat_cols_remaining <- names(all_data)[sapply(all_data, is.character)]
cat_cols_remaining <- setdiff(cat_cols_remaining, "set")
all_data[cat_cols_remaining] <- lapply(all_data[cat_cols_remaining], as.factor)
nzv <- nearZeroVar(all_data[, names(all_data) != "set"], saveMetrics = TRUE)
nzv_cols <- rownames(nzv[nzv$nzv == TRUE, ])
# 2. Define the "Protected" columns - we can add more or remove as we train the model
protected_cols <- c("PoolQC", "PoolArea", "HasPool", "BsmtCond", "FireplaceQu", "HasBasement")
nzv_cols_to_remove <- setdiff(nzv_cols, protected_cols)
cat("\nActually removing:", paste(nzv_cols_to_remove, collapse = ", "), "\n")
##
## Actually removing: Street, Alley, LandContour, Utilities, LandSlope, Condition2, RoofMatl, BsmtFinType2, BsmtFinSF2, Heating, LowQualFinSF, KitchenAbvGr, Functional, OpenPorchSF, EnclosedPorch, X3SsnPorch, ScreenPorch, MiscFeature, MiscVal
cat("Keeping (Protected):", paste(intersect(nzv_cols, protected_cols), collapse = ", "), "\n")
## Keeping (Protected): BsmtCond, PoolArea, PoolQC, HasPool, HasBasement
all_data <- all_data[, !(names(all_data) %in% nzv_cols_to_remove)]
train_clean <- all_data[all_data$set == "train", ]
test_clean <- all_data[all_data$set == "test", ]
train_clean$set <- NULL
test_clean$set <- NULL
train_clean$SalePrice <- log(sale_price)
cat("\nFinal train dimensions:", dim(train_clean), "\n")
##
## Final train dimensions: 1460 77
cat("Final test dimensions: ", dim(test_clean), "\n")
## Final test dimensions: 1459 76
# Distribution of SalePrice
p1 <- ggplot(train_clean, aes(x = SalePrice)) +
geom_histogram(bins = 50, fill = "#1D9E75", color = "white", alpha = 0.85) +
labs(title = "Distribution of log(SalePrice)",
x = "log(SalePrice)", y = "Count") +
theme_minimal()
print(p1)
# Correlation matrix
num_train <- train_clean[, sapply(train_clean, is.numeric)]
cor_matrix <- cor(num_train, use = "pairwise.complete.obs")
top_vars <- names(sort(abs(cor_matrix[, "SalePrice"]), decreasing = TRUE))[1:15]
corrplot(cor_matrix[top_vars, top_vars],
method = "color", type = "upper",
tl.cex = 0.7, tl.col = "black",
title = "Top 15 correlations with SalePrice",
mar = c(0, 0, 2, 0))
# SalePrice vs TotalSF
p2 <- ggplot(train_clean, aes(x = TotalSF, y = SalePrice)) +
geom_point(alpha = 0.4, color = "#185FA5") +
geom_smooth(method = "lm", color = "#E85D24", se = FALSE) +
labs(title = "log(SalePrice) vs Total Square Footage",
x = "Total SF", y = "log(SalePrice)") +
theme_minimal()
print(p2)
write.csv(train_clean, "train_clean.csv", row.names = FALSE)
write.csv(test_clean, "test_clean.csv", row.names = FALSE)
write.csv(data.frame(Id = test_ids), "test_ids.csv", row.names = FALSE)
cat("\nPreprocessing complete. Files saved:\n")
##
## Preprocessing complete. Files saved:
cat(" train_clean.csv\n")
## train_clean.csv
cat(" test_clean.csv\n")
## test_clean.csv
cat(" test_ids.csv\n")
## test_ids.csv
library(caret)
set.seed(123)
ctrl <- trainControl(
method = "cv",
number = 5,
verboseIter = TRUE,
savePredictions = "final"
)
print_results <- function(model, name) {
rmse <- min(model$results$RMSE)
r2 <- model$results$Rsquared[which.min(model$results$RMSE)]
cat(sprintf("\n%-20s | CV RMSE: %.5f | R²: %.4f\n", name, rmse, r2))
}
#Linear Regression (CV)
lm_cv <- train(
SalePrice ~ .,
data = train_clean,
method = "lm",
trControl = ctrl,
metric = "RMSE"
)
## + Fold1: intercept=TRUE
## - Fold1: intercept=TRUE
## + Fold2: intercept=TRUE
## - Fold2: intercept=TRUE
## + Fold3: intercept=TRUE
## - Fold3: intercept=TRUE
## + Fold4: intercept=TRUE
## - Fold4: intercept=TRUE
## + Fold5: intercept=TRUE
## - Fold5: intercept=TRUE
## Aggregating results
## Fitting final model on full training set
lm_cv
## Linear Regression
##
## 1460 samples
## 76 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1169, 1169, 1167, 1168, 1167
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1386561 0.8813315 0.09088272
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Ridge Regression (CV)
ridge_cv <- train(
SalePrice ~ .,
data = train_clean,
method = "glmnet",
trControl = ctrl,
tuneGrid = expand.grid(
alpha = 0,
lambda = seq(0.0001, 1, length = 20)
),
metric = "RMSE"
)
## + Fold1: alpha=0, lambda=1
## - Fold1: alpha=0, lambda=1
## + Fold2: alpha=0, lambda=1
## - Fold2: alpha=0, lambda=1
## + Fold3: alpha=0, lambda=1
## - Fold3: alpha=0, lambda=1
## + Fold4: alpha=0, lambda=1
## - Fold4: alpha=0, lambda=1
## + Fold5: alpha=0, lambda=1
## - Fold5: alpha=0, lambda=1
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0, lambda = 0.211 on full training set
ridge_cv
## glmnet
##
## 1460 samples
## 76 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1169, 1167, 1169, 1166, 1169
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.00010000 0.1497821 0.8615818 0.09173744
## 0.05272632 0.1484390 0.8638806 0.09116622
## 0.10535263 0.1466683 0.8668366 0.09038469
## 0.15797895 0.1459137 0.8682679 0.09017397
## 0.21060526 0.1456376 0.8690547 0.09021962
## 0.26323158 0.1456433 0.8694857 0.09048455
## 0.31585789 0.1458374 0.8696949 0.09087689
## 0.36848421 0.1461660 0.8697538 0.09136630
## 0.42111053 0.1465942 0.8697036 0.09191673
## 0.47373684 0.1471039 0.8695797 0.09249693
## 0.52636316 0.1476758 0.8694008 0.09310544
## 0.57898947 0.1482929 0.8691785 0.09375381
## 0.63161579 0.1489496 0.8689251 0.09443883
## 0.68424211 0.1496383 0.8686495 0.09515841
## 0.73686842 0.1503549 0.8683570 0.09587747
## 0.78949474 0.1510974 0.8680495 0.09662074
## 0.84212105 0.1518623 0.8677312 0.09736930
## 0.89474737 0.1526387 0.8674115 0.09812138
## 0.94737368 0.1534347 0.8670835 0.09888043
## 1.00000000 0.1542448 0.8667520 0.09963692
##
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 0.2106053.
plot(ridge_cv)
print_results(ridge_cv, "Ridge")
##
## Ridge | CV RMSE: 0.14564 | R²: 0.8691
## Lasso Regression (CV)
lasso_cv <- train(
SalePrice ~ .,
data = train_clean,
method = "glmnet",
trControl = ctrl,
tuneGrid = expand.grid(
alpha = 1,
lambda = seq(0.0001, 1, length = 20)
),
metric = "RMSE"
)
## + Fold1: alpha=1, lambda=1
## - Fold1: alpha=1, lambda=1
## + Fold2: alpha=1, lambda=1
## - Fold2: alpha=1, lambda=1
## + Fold3: alpha=1, lambda=1
## - Fold3: alpha=1, lambda=1
## + Fold4: alpha=1, lambda=1
## - Fold4: alpha=1, lambda=1
## + Fold5: alpha=1, lambda=1
## - Fold5: alpha=1, lambda=1
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 1, lambda = 1e-04 on full training set
lasso_cv
## glmnet
##
## 1460 samples
## 76 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1167, 1169, 1169, 1168, 1167
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.00010000 0.1473712 0.8645838 0.09193695
## 0.05272632 0.1761562 0.8332837 0.11849910
## 0.10535263 0.2183456 0.7876140 0.15534116
## 0.15797895 0.2617639 0.7423225 0.19423700
## 0.21060526 0.3019468 0.7136306 0.22990522
## 0.26323158 0.3426594 0.7065192 0.26431969
## 0.31585789 0.3866483 0.6913196 0.29994059
## 0.36848421 0.3991334 NaN 0.30981966
## 0.42111053 0.3991334 NaN 0.30981966
## 0.47373684 0.3991334 NaN 0.30981966
## 0.52636316 0.3991334 NaN 0.30981966
## 0.57898947 0.3991334 NaN 0.30981966
## 0.63161579 0.3991334 NaN 0.30981966
## 0.68424211 0.3991334 NaN 0.30981966
## 0.73686842 0.3991334 NaN 0.30981966
## 0.78949474 0.3991334 NaN 0.30981966
## 0.84212105 0.3991334 NaN 0.30981966
## 0.89474737 0.3991334 NaN 0.30981966
## 0.94737368 0.3991334 NaN 0.30981966
## 1.00000000 0.3991334 NaN 0.30981966
##
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 1e-04.
plot(lasso_cv)
print_results(lasso_cv, "Lasso")
##
## Lasso | CV RMSE: 0.14737 | R²: 0.8646
(Target is “SalePrice, which is continuous.)
## Random Forest (CV)
rf_cv <- train(
SalePrice ~ .,
data = train_clean,
method = "rf",
trControl = ctrl,
metric = "RMSE",
ntree = 200
)
## + Fold1: mtry= 2
## - Fold1: mtry= 2
## + Fold1: mtry= 97
## - Fold1: mtry= 97
## + Fold1: mtry=193
## - Fold1: mtry=193
## + Fold2: mtry= 2
## - Fold2: mtry= 2
## + Fold2: mtry= 97
## - Fold2: mtry= 97
## + Fold2: mtry=193
## - Fold2: mtry=193
## + Fold3: mtry= 2
## - Fold3: mtry= 2
## + Fold3: mtry= 97
## - Fold3: mtry= 97
## + Fold3: mtry=193
## - Fold3: mtry=193
## + Fold4: mtry= 2
## - Fold4: mtry= 2
## + Fold4: mtry= 97
## - Fold4: mtry= 97
## + Fold4: mtry=193
## - Fold4: mtry=193
## + Fold5: mtry= 2
## - Fold5: mtry= 2
## + Fold5: mtry= 97
## - Fold5: mtry= 97
## + Fold5: mtry=193
## - Fold5: mtry=193
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 97 on full training set
rf_cv
## Random Forest
##
## 1460 samples
## 76 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1169, 1168, 1169, 1166, 1168
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 0.1871029 0.8464283 0.12708234
## 97 0.1401785 0.8760907 0.09253003
## 193 0.1431676 0.8704771 0.09552178
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 97.
plot(rf_cv)
print_results(rf_cv, "Random Forest")
##
## Random Forest | CV RMSE: 0.14018 | R²: 0.8761
# Gradient Boost
set.seed(42)
gbm_grid <- expand.grid(
n.trees = c(500, 1000),
interaction.depth = c(3, 5),
shrinkage = 0.05,
n.minobsinnode = 10
)
gbm_model <- train(
SalePrice ~ .,
data = train_clean,
method = "gbm",
trControl = ctrl,
tuneGrid = gbm_grid,
verbose = FALSE
)
## + Fold1: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000
## - Fold1: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000
## + Fold1: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000
## - Fold1: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000
## + Fold2: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000
## - Fold2: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000
## + Fold2: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000
## - Fold2: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000
## + Fold3: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000
## - Fold3: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000
## + Fold3: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000
## - Fold3: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000
## + Fold4: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000
## - Fold4: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000
## + Fold4: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000
## - Fold4: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000
## + Fold5: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000
## - Fold5: shrinkage=0.05, interaction.depth=3, n.minobsinnode=10, n.trees=1000
## + Fold5: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000
## - Fold5: shrinkage=0.05, interaction.depth=5, n.minobsinnode=10, n.trees=1000
## Aggregating results
## Selecting tuning parameters
## Fitting n.trees = 1000, interaction.depth = 5, shrinkage = 0.05, n.minobsinnode = 10 on full training set
print_results(gbm_model, "Gradient Boosting")
##
## Gradient Boosting | CV RMSE: 0.12104 | R²: 0.9089
cat("Best tune:\n"); print(gbm_model$bestTune)
## Best tune:
## n.trees interaction.depth shrinkage n.minobsinnode
## 4 1000 5 0.05 10
Evaluate based off of RMSE. Because our data is continuous, we cannot fit an ROC curve.
library(caret)
par(mfrow = c(1,2))
imp <- varImp(rf_cv)
imp_df <- as.data.frame(imp$importance)
# keep row names as feature names
imp_df$Feature <- rownames(imp_df)
# sort top 15
top_imp <- imp_df[order(imp_df$Overall, decreasing = TRUE), ]
top_imp <- head(top_imp, 15)
par(mar = c(5, 10, 4, 2))
barplot(top_imp$Overall,
names.arg = top_imp$Feature,
horiz = TRUE,
las = 1,
cex.names = 0.8,
xlab = "Importance",
main = "RF - Top 15 FI")
library(gbm)
gbm_imp <- varImp(gbm_model)$importance
gbm_imp$Feature <- rownames(gbm_imp)
gbm_top <- gbm_imp[order(gbm_imp$Overall, decreasing = TRUE), ]
gbm_top <- head(gbm_top, 15)
par(mar = c(5, 10, 4, 2))
barplot(gbm_top$Overall,
names.arg = gbm_top$Feature,
horiz = TRUE,
las = 1,
cex.names = 0.8,
xlab = "Importance",
main = "GBM - Top 15 FI")
Note :
• Qual_x_SF (100) is the most important for prediction the “SalePrice” (Our Traget) in both the RF and GDM Models.
cat("\n============ MODEL COMPARISON ============\n")
##
## ============ MODEL COMPARISON ============
results <- resamples(list(
Ridge = ridge_cv,
Lasso = lasso_cv,
RF = rf_cv,
Linear = lm_cv,
GBM = gbm_model
))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Ridge, Lasso, RF, Linear, GBM
## Number of resamples: 5
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Ridge 0.08768168 0.08806005 0.08898255 0.09021962 0.09264194 0.09373191 0
## Lasso 0.08202779 0.08642971 0.08685955 0.09193695 0.09300864 0.11135904 0
## RF 0.08604880 0.08804656 0.09354529 0.09253003 0.09468822 0.10032127 0
## Linear 0.08710717 0.08837334 0.09049914 0.09088272 0.09189125 0.09654269 0
## GBM 0.07402854 0.08105690 0.08115421 0.08157934 0.08535717 0.08629987 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Ridge 0.1277099 0.1342214 0.1376598 0.1456376 0.1547423 0.1738546 0
## Lasso 0.1201136 0.1253025 0.1428855 0.1473712 0.1594107 0.1891438 0
## RF 0.1205043 0.1241178 0.1415921 0.1401785 0.1438907 0.1707875 0
## Linear 0.1202906 0.1353803 0.1389952 0.1386561 0.1466466 0.1519676 0
## GBM 0.1078427 0.1169457 0.1202677 0.1210388 0.1242888 0.1358491 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Ridge 0.8013240 0.8516059 0.8823707 0.8690547 0.9015780 0.9083949 0
## Lasso 0.7971306 0.8435090 0.8794070 0.8645838 0.9012352 0.9016374 0
## RF 0.8094766 0.8629716 0.8904951 0.8760907 0.9079069 0.9096032 0
## Linear 0.8517515 0.8808915 0.8811216 0.8813315 0.8840826 0.9088103 0
## GBM 0.8997141 0.9028349 0.9042397 0.9088507 0.9087651 0.9286996 0
# Plot comparison
comparison_plot <- dotplot(results, metric = "RMSE",
main = "5-Fold CV RMSE Comparison")
print(comparison_plot)
model_summary <- data.frame(
Model = c("Ridge", "Lasso", "Random Forest", "Linear Regression", "GBM"),
CV_RMSE = c(
min(ridge_cv$results$RMSE),
min(lasso_cv$results$RMSE),
min(rf_cv$results$RMSE),
min(lm_cv$results$RMSE),
min(gbm_model$results$RMSE)
)
) %>% arrange(CV_RMSE)
model_summary$CV_RMSE <- round(model_summary$CV_RMSE, 5)
cat("\nFinal ranking:\n")
##
## Final ranking:
print(model_summary)
## Model CV_RMSE
## 1 GBM 0.12104
## 2 Linear Regression 0.13866
## 3 Random Forest 0.14018
## 4 Ridge 0.14564
## 5 Lasso 0.14737
par(mfrow = c(2,3))
pred_train <- predict(gbm_model, newdata = train_clean)
plot(train_clean$SalePrice, pred_train,
xlab = "Actual log(SalePrice)",
ylab = "Predicted log(SalePrice)",
main = "Predicted vs Actual (GBM)",
pch = 16, col = adjustcolor("steelblue", alpha = 0.4))
abline(0, 1, col = "red", lwd = 2)
pred_train <- predict(rf_cv, newdata = train_clean)
plot(train_clean$SalePrice, pred_train,
xlab = "Actual log(SalePrice)",
ylab = "Predicted log(SalePrice)",
main = "Predicted vs Actual (RF)",
pch = 16, col = adjustcolor("steelblue", alpha = 0.4))
abline(0, 1, col = "red", lwd = 2)
pred_train <- predict(lasso_cv, newdata = train_clean)
plot(train_clean$SalePrice, pred_train,
xlab = "Actual log(SalePrice)",
ylab = "Predicted log(SalePrice)",
main = "Predicted vs Actual (Lasso)",
pch = 16, col = adjustcolor("steelblue", alpha = 0.4))
abline(0, 1, col = "red", lwd = 2)
pred_train <- predict(ridge_cv, newdata = train_clean)
plot(train_clean$SalePrice, pred_train,
xlab = "Actual log(SalePrice)",
ylab = "Predicted log(SalePrice)",
main = "Predicted vs Actual (Ridge)",
pch = 16, col = adjustcolor("steelblue", alpha = 0.4))
abline(0, 1, col = "red", lwd = 2)
pred_train <- predict(lm_cv, newdata = train_clean)
plot(train_clean$SalePrice, pred_train,
xlab = "Actual log(SalePrice)",
ylab = "Predicted log(SalePrice)",
main = "Predicted vs Actual (LR)",
pch = 16, col = adjustcolor("steelblue", alpha = 0.4))
abline(0, 1, col = "red", lwd = 2)
### Residuals
residuals <- train_clean$SalePrice - pred_train
p2 <- plot(pred_train, residuals,
xlab = "Fitted Values",
ylab = "Residuals",
main = "Residual Plot (GBM)",
pch = 16, col = adjustcolor("steelblue", alpha = 0.4))
abline(h = 0, col = "red", lwd = 2)
## Predict Test Set (GBM)
final_pred <- predict(gbm_model, newdata = test_clean)
final_pred[is.na(final_pred)] <- median(final_pred, na.rm = TRUE)
# Convert back from log scale
submission <- data.frame(
Id = test_ids,
SalePrice = exp(final_pred)
)
nrow(submission)
## [1] 1459
sum(is.na(submission$SalePrice))
## [1] 0
head(submission)
## Id SalePrice
## 1 1461 121672.5
## 2 1462 165933.5
## 3 1463 176875.8
## 4 1464 197317.8
## 5 1465 178172.7
## 6 1466 174780.7
write.csv(submission, "submission.csv", row.names = FALSE)