1 Executive Summary

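This report compares three models—Multiple Linear Regression (MLR), Elastic Net Regression (ENR), and a Neural Network (NN)—for predicting Ames housing sale prices, using a random sample of 500 homes and numeric predictors only. The models are fit to log-transformed price and compared by Root Mean Squared Error (RMSE), converted back to the dollar scale, across 10-fold cross-validation. The model with the lowest mean cross-validated RMSE is selected, paired t-tests are used to check whether the differences between models are statistically significant, and the selected model's final fit on all sampled rows is reported.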

2 Objective

Compare three predictive modeling approaches (MLR, ENR, NN) on the Ames Housing dataset to determine the most accurate model based on RMSE.

3 Data Description & Preparation

# Load the Ames housing data and keep only the numeric columns with no
# missing values. vapply() guarantees a logical mask regardless of input.
data <- make_ames()
data <- data[vapply(data, is.numeric, logical(1))]
data <- na.omit(data)

# Expose the target under the name `Price`.
data$Price <- data$Sale_Price
data$Sale_Price <- NULL

# Log transform the target
data$Price_Log <- log(data$Price)

# Sample 500 rows. The RNG is seeded first so the same subset is drawn on
# every run; otherwise every downstream number changes between renders.
set.seed(123)
data <- data %>% slice_sample(n = 500)

# Model on every remaining numeric column. The raw `Price` is dropped from the
# modeling frame so it cannot leak into the predictors of `Price_Log ~ .`.
predictors <- setdiff(names(data), c("Price", "Price_Log"))
data_model <- data %>% select(all_of(predictors), Price_Log)

4 Modeling Approach

# Seed the RNG so the fold assignment and the neural network's random
# starting weights are reproducible.
set.seed(123)
n_folds <- 10

# Draw the CV folds once and hand the same training indices to every model.
# Without `index`, each train() call draws its own folds, so "Fold01" would
# refer to different rows for each model and the per-fold paired comparison
# of RMSE would not be a true pairing.
cv_folds <- createFolds(data_model$Price_Log, k = n_folds, returnTrain = TRUE)
cv_control <- trainControl(
  method = "cv",
  number = n_folds,
  index = cv_folds,
  savePredictions = "final" # keep held-out predictions for the best tuning
)

# MLR
# Ordinary least squares on centered/scaled predictors, with near-zero-variance
# columns removed.
mlr_model <- train(
  Price_Log ~ .,
  data = data_model,
  method = "lm",
  trControl = cv_control,
  preProcess = c("center", "scale", "nzv"),
  metric = "RMSE"
)

# ENR
# Elastic net tuned over 3 mixing values (alpha) x 5 penalty strengths (lambda).
enr_grid <- expand.grid(
  alpha = c(0.1, 0.5, 1.0),
  lambda = seq(0.001, 0.1, length = 5)
)
enr_model <- train(
  Price_Log ~ .,
  data = data_model,
  method = "glmnet",
  trControl = cv_control,
  tuneGrid = enr_grid,
  preProcess = c("center", "scale", "nzv"),
  metric = "RMSE"
)

# NN
# Single-hidden-layer network tuned over hidden units (size) and weight decay.
nn_grid <- expand.grid(size = c(3, 5), decay = c(0.01, 0.1))
nn_model <- train(
  Price_Log ~ .,
  data = data_model,
  method = "nnet",
  trControl = cv_control,
  tuneGrid = nn_grid,
  preProcess = c("center", "scale", "nzv"),
  metric = "RMSE",
  linout = TRUE, # linear output unit: this is regression, not classification
  trace = FALSE,
  maxit = 300
)

5 Results and Comparison

# Fitted caret models, keyed by the short labels used in tables and plots.
model_list <- list(MLR = mlr_model, ENR = enr_model, NN = nn_model)

# Calculate RMSE on original scale
# The models predict log price, so both the held-out predictions and the
# observed values are exponentiated before the per-fold RMSE is computed.
rmse_results <- lapply(model_list, function(fit) {
  held_out <- mutate(fit$pred, Pred_Orig = exp(pred), Actual_Orig = exp(obs))
  by_fold <- group_by(held_out, Resample)
  summarise(
    by_fold,
    RMSE_Orig = sqrt(mean((Pred_Orig - Actual_Orig)^2)),
    .groups = 'drop'
  )
})

# Tag each per-fold table with its model label, then stack them into one frame.
rmse_results <- Map(
  function(fold_tbl, label) {
    fold_tbl$Model <- label
    fold_tbl
  },
  rmse_results,
  names(rmse_results)
)
rmse_df <- bind_rows(rmse_results)

# Mean, median and standard deviation of the fold-level RMSE for each model,
# ordered so that the model with the lowest mean RMSE is in the first row.
summary_rmse <- summarise(
  group_by(rmse_df, Model),
  Mean_RMSE = mean(RMSE_Orig),
  Median_RMSE = median(RMSE_Orig),
  SD_RMSE = sd(RMSE_Orig)
)
summary_rmse <- arrange(summary_rmse, Mean_RMSE)

kable(
  summary_rmse,
  caption = "Cross-Validation RMSE Summary by Model",
  digits = 2
)
Cross-Validation RMSE Summary by Model
Model Mean_RMSE Median_RMSE SD_RMSE
MLR 30093.70 26962.11 7611.22
ENR 31820.28 28173.81 6367.20
NN 34657.50 33792.37 7479.67

6 Paired T-Tests

# One row per resample, one RMSE column per model, so that rows line up the
# fold-level errors being paired.
rmse_wide <- pivot_wider(
  rmse_df,
  id_cols = Resample,
  names_from = Model,
  values_from = RMSE_Orig
)

# Paired t-tests on the fold-level RMSE for each pair of models.
ttest_mlr_enr <- t.test(x = rmse_wide$MLR, y = rmse_wide$ENR, paired = TRUE)
ttest_mlr_nn <- t.test(x = rmse_wide$MLR, y = rmse_wide$NN, paired = TRUE)
ttest_enr_nn <- t.test(x = rmse_wide$ENR, y = rmse_wide$NN, paired = TRUE)

# Print one comparison as "<label> <p-value> <verdict>".
#   label: text printed before the p-value (including the "p =" part).
#   test:  an object with a `p.value` element, as returned by t.test().
#   alpha: significance threshold; p-values below it are reported as significant.
# The p-value is a single number, so a scalar if/else is used rather than the
# vectorised ifelse().
report_ttest <- function(label, test, alpha = 0.05) {
  verdict <- if (isTRUE(test$p.value < alpha)) {
    "(Significant)"
  } else {
    "(Not Significant)"
  }
  cat(label, signif(test$p.value, 3), verdict, "\n")
  invisible(NULL)
}

report_ttest("MLR vs ENR: p =", ttest_mlr_enr)
## MLR vs ENR: p = 0.641 (Not Significant)
report_ttest("MLR vs NN:  p =", ttest_mlr_nn)
## MLR vs NN:  p = 0.25 (Not Significant)
report_ttest("ENR vs NN:  p =", ttest_enr_nn)
## ENR vs NN:  p = 0.333 (Not Significant)

7 Visualizations

# Order the model factor by the rows of summary_rmse (ascending mean RMSE) so
# the boxes appear from lowest to highest mean error.
rmse_df[["Model"]] <- factor(rmse_df[["Model"]], levels = summary_rmse$Model)

# Boxplot
# One box per model showing the spread of fold-level RMSE in dollars.
ggplot(data = rmse_df, mapping = aes(x = Model, y = RMSE_Orig, fill = Model)) +
  geom_boxplot(alpha = 0.8) +
  scale_y_continuous(labels = dollar_format()) +
  labs(title = "Distribution of RMSE by Model", y = "RMSE ($)") +
  theme_minimal(base_size = 14)

8 Final Model Summary

# Select the model in the first row of summary_rmse, which is sorted by
# ascending mean RMSE. as.character() guards the lookup below: if Model is
# ever a factor, `[[` would index model_list by the integer level code rather
# than by name and could silently return the wrong model.
best_model_name <- as.character(summary_rmse$Model[1])
cat("Final selected model:", best_model_name, "\n")
## Final selected model: MLR
final_model <- model_list[[best_model_name]]
print(final_model$finalModel)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Coefficients:
##    (Intercept)    Lot_Frontage        Lot_Area      Year_Built  Year_Remod_Add  
##      12.020055        0.005904        0.009472        0.102506        0.054393  
##   Mas_Vnr_Area    BsmtFin_SF_1    BsmtFin_SF_2     Bsmt_Unf_SF   Total_Bsmt_SF  
##       0.007937        0.002856       -0.022152       -0.026291        0.115778  
##   First_Flr_SF   Second_Flr_SF     Gr_Liv_Area  Bsmt_Full_Bath  Bsmt_Half_Bath  
##       0.056097        0.081258        0.087174        0.014878        0.006354  
##      Full_Bath       Half_Bath   Bedroom_AbvGr   TotRms_AbvGrd      Fireplaces  
##       0.003175       -0.016310       -0.023977        0.021430        0.021836  
##    Garage_Cars     Garage_Area    Wood_Deck_SF   Open_Porch_SF  Enclosed_Porch  
##       0.011831        0.027402        0.019770        0.005958        0.016024  
##        Mo_Sold       Year_Sold       Longitude        Latitude  
##      -0.008497       -0.025948       -0.008451        0.012988
# Variable Importance
# Plot the 15 highest-ranked predictors of the selected model.
plot(varImp(final_model), top = 15)

9 Conclusion

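MLR achieved the lowest mean cross-validated RMSE (about $30,094), followed by ENR (about $31,820) and the Neural Network (about $34,658). However, none of the paired t-tests was significant at the 5% level (p = 0.641, 0.25, and 0.333), so the differences between the models cannot be distinguished from fold-to-fold variation. In the final MLR fit, the largest standardized coefficients belong to total basement area, year built, above-ground living area, and second-floor area; overall quality is not among the fitted terms because only numeric columns were retained.

MLR is recommended for future prediction tasks on similar datasets: it had the lowest mean RMSE, it is the simplest of the three models, and neither alternative was significantly more accurate.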