This report compares three models, Multiple Linear Regression (MLR), Elastic Net Regression (ENR), and a Neural Network (NN), for predicting Ames housing prices. Models are compared on root mean squared error (RMSE) estimated by 10-fold cross-validation, paired statistical tests are used to check whether the differences are significant, and the selected model's final fit on the full modeling sample is then examined.
Compare three predictive modeling approaches (MLR, ENR, NN) on the Ames Housing dataset to determine the most accurate model based on RMSE.
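For reference, the comparison metric is the fold-level RMSE on the original dollar scale, computed by back-transforming the log-scale predictions and observations (this matches the calculation carried out in the code below):

$$
\mathrm{RMSE}_{\text{fold}} = \sqrt{\frac{1}{n_{\text{fold}}} \sum_{i=1}^{n_{\text{fold}}} \left( e^{\hat{y}_i} - e^{y_i} \right)^2 }
$$

where $\hat{y}_i$ and $y_i$ are the predicted and observed log prices for the held-out observations in a fold.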
# Packages used throughout the report
library(AmesHousing)  # make_ames()
library(dplyr)
library(tidyr)        # pivot_wider()
library(caret)        # train(), trainControl(), varImp()
library(ggplot2)
library(scales)       # dollar_format()
library(knitr)        # kable()
# The glmnet and nnet packages must also be installed; caret loads them as needed.

data <- make_ames()
# Keep only the numeric columns and drop any rows with missing values
data <- data[sapply(data, is.numeric)]
data <- na.omit(data)
# Rename the target from Sale_Price to Price
data$Price <- data$Sale_Price
data$Sale_Price <- NULL
# Log transform the target
data$Price_Log <- log(data$Price)
# Sample 500 rows (seed set first so the subsample is reproducible)
set.seed(123)
data <- data %>% slice_sample(n = 500)
predictors <- setdiff(names(data), c("Price", "Price_Log"))
data_model <- data %>% select(all_of(predictors), Price_Log)
set.seed(123)
n_folds <- 10
# 10-fold CV; keep the held-out predictions of the best tune so RMSE can be
# recomputed on the original dollar scale later
cv_control <- trainControl(method = "cv", number = n_folds, savePredictions = "final")
# MLR
mlr_model <- train(
  Price_Log ~ ., data = data_model, method = "lm",
  trControl = cv_control, preProcess = c("center", "scale", "nzv"), metric = "RMSE"
)
# ENR
enr_grid <- expand.grid(alpha = c(0.1, 0.5, 1.0), lambda = seq(0.001, 0.1, length = 5))
enr_model <- train(
  Price_Log ~ ., data = data_model, method = "glmnet",
  trControl = cv_control, tuneGrid = enr_grid,
  preProcess = c("center", "scale", "nzv"), metric = "RMSE"
)
# NN
nn_grid <- expand.grid(size = c(3, 5), decay = c(0.01, 0.1))
nn_model <- train(
  Price_Log ~ ., data = data_model, method = "nnet",
  trControl = cv_control, tuneGrid = nn_grid,
  preProcess = c("center", "scale", "nzv"), metric = "RMSE",
  linout = TRUE, trace = FALSE, maxit = 300
)
model_list <- list(MLR = mlr_model, ENR = enr_model, NN = nn_model)
# Calculate RMSE on original scale
rmse_results <- lapply(model_list, function(model) {
  model$pred %>%
    mutate(Pred_Orig = exp(pred), Actual_Orig = exp(obs)) %>%
    group_by(Resample) %>%
    summarise(RMSE_Orig = sqrt(mean((Pred_Orig - Actual_Orig)^2)), .groups = "drop")
})
for (name in names(rmse_results)) rmse_results[[name]]$Model <- name
rmse_df <- bind_rows(rmse_results)
summary_rmse <- rmse_df %>%
  group_by(Model) %>%
  summarise(Mean_RMSE = mean(RMSE_Orig),
            Median_RMSE = median(RMSE_Orig),
            SD_RMSE = sd(RMSE_Orig)) %>%
  arrange(Mean_RMSE)
kable(summary_rmse, caption = "Cross-Validation RMSE Summary by Model (original dollar scale)",
      digits = 2, col.names = c("Model", "Mean RMSE ($)", "Median RMSE ($)", "SD RMSE ($)"))
| Model | Mean RMSE ($) | Median RMSE ($) | SD RMSE ($) |
|---|---|---|---|
| MLR | 30093.70 | 26962.11 | 7611.22 |
| ENR | 31820.28 | 28173.81 | 6367.20 |
| NN | 34657.50 | 33792.37 | 7479.67 |
rmse_wide <- rmse_df %>%
  select(Resample, Model, RMSE_Orig) %>%
  pivot_wider(names_from = Model, values_from = RMSE_Orig)
ttest_mlr_enr <- t.test(rmse_wide$MLR, rmse_wide$ENR, paired = TRUE)
ttest_mlr_nn <- t.test(rmse_wide$MLR, rmse_wide$NN, paired = TRUE)
ttest_enr_nn <- t.test(rmse_wide$ENR, rmse_wide$NN, paired = TRUE)
cat("MLR vs ENR: p =", signif(ttest_mlr_enr$p.value, 3), ifelse(ttest_mlr_enr$p.value < 0.05, "(Significant)", "(Not Significant)"), "\n")
## MLR vs ENR: p = 0.641 (Not Significant)
cat("MLR vs NN: p =", signif(ttest_mlr_nn$p.value, 3), ifelse(ttest_mlr_nn$p.value < 0.05, "(Significant)", "(Not Significant)"), "\n")
## MLR vs NN: p = 0.25 (Not Significant)
cat("ENR vs NN: p =", signif(ttest_enr_nn$p.value, 3), ifelse(ttest_enr_nn$p.value < 0.05, "(Significant)", "(Not Significant)"), "\n")
## ENR vs NN: p = 0.333 (Not Significant)
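As a complementary check (not part of the comparison above), caret's own resampling utilities can summarise and compare the three fitted models directly; note that these comparisons use the log-price scale the models were trained on, not the dollar scale reported elsewhere. A minimal sketch using the `model_list` object defined earlier:

```r
# Collect the per-fold metrics recorded by caret (log-scale RMSE, R-squared, MAE)
resamps <- resamples(model_list)
summary(resamps)

# Pairwise differences between models, with p-values for each metric
summary(diff(resamps))
```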
rmse_df$Model <- factor(rmse_df$Model, levels = summary_rmse$Model)
# Boxplot
ggplot(rmse_df, aes(x = Model, y = RMSE_Orig, fill = Model)) +
  geom_boxplot(alpha = 0.8) +
  labs(title = "Distribution of RMSE by Model", y = "RMSE ($)") +
  scale_y_continuous(labels = dollar_format()) +
  theme_minimal(base_size = 14)
best_model_name <- summary_rmse$Model[1]
cat("Final selected model:", best_model_name, "\n")
## Final selected model: MLR
final_model <- model_list[[best_model_name]]
print(final_model$finalModel)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Coefficients:
## (Intercept) Lot_Frontage Lot_Area Year_Built Year_Remod_Add
## 12.020055 0.005904 0.009472 0.102506 0.054393
## Mas_Vnr_Area BsmtFin_SF_1 BsmtFin_SF_2 Bsmt_Unf_SF Total_Bsmt_SF
## 0.007937 0.002856 -0.022152 -0.026291 0.115778
## First_Flr_SF Second_Flr_SF Gr_Liv_Area Bsmt_Full_Bath Bsmt_Half_Bath
## 0.056097 0.081258 0.087174 0.014878 0.006354
## Full_Bath Half_Bath Bedroom_AbvGr TotRms_AbvGrd Fireplaces
## 0.003175 -0.016310 -0.023977 0.021430 0.021836
## Garage_Cars Garage_Area Wood_Deck_SF Open_Porch_SF Enclosed_Porch
## 0.011831 0.027402 0.019770 0.005958 0.016024
## Mo_Sold Year_Sold Longitude Latitude
## -0.008497 -0.025948 -0.008451 0.012988
# Variable Importance
varImp(final_model) %>% plot(top = 15)
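Before concluding, a short sketch of how the selected model would be used to produce price predictions in dollars. The `new_rows` object here is just the first few rows of `data_model`, standing in for genuinely new observations:

```r
# Predict on the log scale with the final caret model, then back-transform to dollars
new_rows <- head(data_model, 5)                       # placeholder for new data
pred_log <- predict(final_model, newdata = new_rows)  # preProcess steps are applied automatically
pred_price <- exp(pred_log)
round(pred_price)
```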
Multiple Linear Regression achieved the lowest mean cross-validated RMSE (about $30,094), followed by Elastic Net and the Neural Network. Paired t-tests found no statistically significant differences between any pair of models (all p > 0.05), so MLR was selected on the grounds of lowest observed error and greater simplicity and interpretability. Among the numeric predictors, total basement area, year built, above-ground living area, and second-floor area had the largest standardized coefficients and were the strongest drivers of log sale price.
The MLR model is recommended for prediction tasks on similar data, with the caveat that this comparison used a 500-row subsample and numeric predictors only; the ranking could change with the full dataset or with categorical features such as overall quality included.