1 Distribution Visualizations

Show code

TPOSampleJoined %>%
  ggplot(aes(log10(TOT_MCF))) +
  geom_density() +
  scale_x_continuous(limits = c(-2,5), breaks = c(seq(-2,5,1))) +
  scale_y_continuous(limits = c, breaks = c(seq(0,.7,.1))) +
  theme_fivethirtyeight(base_size = 20, base_family = "serif") +
  theme(axis.title = element_text(family = 'serif', size = 30)) +
  labs(title = "Logged Volume Distribution - Unfiltered", x = "Log(10) MCF", y = "Density")

Show code

TPOSampleJoined %>%
  filter(TOT_MCF < 16000 & Pro_Rad < 200 & !is.na(Pro_Rad)) %>%
  ggplot(aes(log10(TOT_MCF))) +
  geom_density() +
  scale_x_continuous(limits = c(-2,5), breaks = c(seq(-2,5,1))) +
  scale_y_continuous(limits = c, breaks = c(seq(0,.7,.1))) +
  theme_fivethirtyeight(base_size = 20, base_family = "serif") +
  theme(axis.title = element_text(family = 'serif', size = 30)) +
  labs(title = "Logged Volume Distribution - Model Obs. Only", x = "Log(10) MCF", y = "Density")

Show code

TPOSampleJoined %>%
  ggplot(aes(sqrt(Pro_Rad)))+
  geom_density() + 
  scale_x_continuous(limits = c(0, 20), breaks = c(seq(0,20,2))) +
  scale_y_continuous(limits = c(0,.25), breaks = c(seq(0,.25,.05))) +
  theme_fivethirtyeight(base_size = 20, base_family = "serif") +
  theme(axis.title = element_text(family = 'serif', size = 30)) +
  labs(title = "SqRt Radius Distribution", x = "SqRt Procurement Radius", y = "Density")

2 Normal-QQ Plots

Show code

Agg <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad)) %>%
ggplot(aes(sample = log10(TOT_MCF))) +
  stat_qq(geom = "point", size = 2) +
  stat_qq_line() +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(-2,5,1)), limits = c(-2,5.2)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))


L <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "Lake") %>%
ggplot(aes(sample = log10(TOT_MCF))) +
  stat_qq(geom = "point", size = 2, color = "blue") +
  stat_qq_line(color = "blue") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(-2,5,1)), limits = c(-2,5.2)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

NE <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "NewEngland") %>%
ggplot(aes(sample = log10(TOT_MCF))) +
  stat_qq(geom = "point", size = 2, color = "red") +
  stat_qq_line(color = "red") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(-2,5,1)), limits = c(-2,5.2)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

MA <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "MidAtl") %>%
ggplot(aes(sample = log10(TOT_MCF))) +
  stat_qq(geom = "point", size = 2, color = "orange") +
  stat_qq_line(color = "orange") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(-2,5,1)), limits = c(-2,5.2)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

P <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "Plains") %>%
ggplot(aes(sample = log10(TOT_MCF))) +
  stat_qq(geom = "point", size = 2, color = "brown") +
  stat_qq_line(color = "brown") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(-2,5,1)), limits = c(-2,5.2)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

C <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "Central") %>%
ggplot(aes(sample = log10(TOT_MCF))) +
  stat_qq(geom = "point", size = 2, color = "darkgreen") +
  stat_qq_line(color = "darkgreen") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(-2,5,1)), limits = c(-2,5.2)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

Test <- ggarrange(plotlist = list(Agg,C,L,MA,NE,P), labels = list("Agg.", "Central", "Lake", "MidAtl", "NewEng", "Plains"), font.label = list(size = 50, face = "plain", color ="black", family = 'serif')) 

annotate_figure(Test,
                left = textGrob("Log10(MCF)", rot = 90, vjust = .4, gp = gpar(cex = 4, fontfamily = "serif")),
                bottom = textGrob("Theoretical Quantiles",gp = gpar(cex = 4, fontfamily = "serif")),
                top = textGrob("Normal-QQ Plots - Log10(MCF)", gp = gpar(cex = 6, fontfamily = "serif")))

Show code

rm(Agg,C,L,MA,NE,P)


Agg <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Pro_Rad < 200) %>%
ggplot(aes(sample = sqrt(Pro_Rad))) +
  stat_qq(geom = "point", size = 2) +
  stat_qq_line() +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,5)) +
  scale_y_continuous(breaks = c(seq(0,15,3)), limits = c(-.1,15)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

L <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "Lake") %>%
ggplot(aes(sample = sqrt(Pro_Rad))) +
  stat_qq(geom = "point", size = 2, color = "blue") +
  stat_qq_line(color = "blue") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(0,15,3)), limits = c(-.4,15)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

NE <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "NewEngland") %>%
ggplot(aes(sample = sqrt(Pro_Rad))) +
  stat_qq(geom = "point", size = 2, color = "red") +
  stat_qq_line(color = "red") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(0,15,3)), limits = c(-.4,15)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

MA <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "MidAtl") %>%
ggplot(aes(sample = sqrt(Pro_Rad))) +
  stat_qq(geom = "point", size = 2, color = "orange") +
  stat_qq_line(color = "orange") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(0,15,3)), limits = c(-.4,15)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

P <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "Plains") %>%
ggplot(aes(sample = sqrt(Pro_Rad))) +
  stat_qq(geom = "point", size = 2, color = "brown") +
  stat_qq_line(color = "brown") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(0,15,3)), limits = c(-.4,15)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

C <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "Central") %>%
ggplot(aes(sample = sqrt(Pro_Rad))) +
  stat_qq(geom = "point", size = 2, color = "darkgreen") +
  stat_qq_line(color = "darkgreen") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(0,15,3)), limits = c(-.4,15)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

Test <- ggarrange(plotlist = list(Agg,C,L,MA,NE,P), labels = list("Agg.", "Central", "Lake", "MidAtl", "NewEng", "Plains"), font.label = list(size = 50, face = "plain", color ="black", family = 'serif')) 

annotate_figure(Test,
                left = textGrob("Sqrt(Procurement Radius)", rot = 90, gp = gpar(cex = 4, fontfamily = "serif")),
                bottom = textGrob("Theoretical Quantiles",gp = gpar(cex = 4, fontfamily = "serif")),
                top = textGrob("Normal-QQ Plots - Procurement Radius", gp = gpar(cex = 6, fontfamily = "serif")))

Show code

rm(Agg,C,L,MA,NE,P)

Show code

# ggplot(TPOSampleJoined, aes(LOG_TOT_MCF, Pro_Rad_SqRt)) +
#     geom_point() +
#     geom_smooth(se = FALSE, method = "lm", fullrange = TRUE) +
#     scale_x_continuous(limits = c(-2, 4.5), breaks = c(seq(-2,4.5,.5))) +
#     scale_y_continuous(limits = c(0, 14), breaks = c(seq(0,14,2))) +
#     facet_wrap(~Region, ncol = 3) +
#     theme_fivethirtyeight(base_size = 20, base_family = "serif") +
#     theme(legend.position = "none", title = element_text(size = 20, family = "serif")) +
#     labs(title = "Regression Observations") +
#     ggpubr::stat_regline_equation(label.x = 1, label.y = 14, aes(label = ..rr.label..))


Agg <- TPOSampleJoined %>%
    ggplot(aes(LOG_TOT_MCF, Pro_Rad_SqRt)) +
    geom_point(size = 3) +
    geom_smooth(se = FALSE, method = "lm", fullrange = TRUE, color = "black") +
    scale_x_continuous(limits = c(-2, 4.5), breaks = c(seq(-2,4.5,.5))) +
    scale_y_continuous(limits = c(0, 14), breaks = c(seq(0,14,2))) +
    theme_fivethirtyeight(base_size = 30, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_blank(), axis.title.y = element_blank()) +
    ggpubr::stat_regline_equation(label.x = 1, label.y = 14, size = 10, aes(label = ..rr.label..))

C <- TPOSampleJoined %>%
  filter(Region == "Central") %>%
  ggplot(aes(LOG_TOT_MCF, Pro_Rad_SqRt)) +
    geom_point(size = 3, color = "darkgreen") +
    geom_smooth(se = FALSE, method = "lm", fullrange = TRUE, color = "darkgreen") +
    scale_x_continuous(limits = c(-2, 4.5), breaks = c(seq(-2,4.5,.5))) +
    scale_y_continuous(limits = c(0, 14), breaks = c(seq(0,14,2))) +
    theme_fivethirtyeight(base_size = 30, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_blank(), axis.title.y = element_blank()) +
    ggpubr::stat_regline_equation(label.x = 1, label.y = 14, size = 10, aes(label = ..rr.label..))

L <- TPOSampleJoined %>%
  filter(Region == "Lake") %>%
  ggplot(aes(LOG_TOT_MCF, Pro_Rad_SqRt)) +
    geom_point(size = 3, color = "blue") +
    geom_smooth(se = FALSE, method = "lm", fullrange = TRUE, color = "blue") +
    scale_x_continuous(limits = c(-2, 4.5), breaks = c(seq(-2,4.5,.5))) +
    scale_y_continuous(limits = c(0, 14), breaks = c(seq(0,14,2))) +
    theme_fivethirtyeight(base_size = 30, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_blank(), axis.title.y = element_blank()) +
    ggpubr::stat_regline_equation(label.x = 1, label.y = 14, size = 10, aes(label = ..rr.label..))

MA <- TPOSampleJoined %>%
  filter(Region == "MidAtl") %>%
  ggplot(aes(LOG_TOT_MCF, Pro_Rad_SqRt)) +
    geom_point(size = 3, color = "orange") +
    geom_smooth(se = FALSE, method = "lm", fullrange = TRUE, color = "orange") +
    scale_x_continuous(limits = c(-2, 4.5), breaks = c(seq(-2,4.5,.5))) +
    scale_y_continuous(limits = c(0, 14), breaks = c(seq(0,14,2))) +
    theme_fivethirtyeight(base_size = 30, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_blank(), axis.title.y = element_blank()) +
    ggpubr::stat_regline_equation(label.x = 1, label.y = 14, size = 10, aes(label = ..rr.label..))

NE <- TPOSampleJoined %>%
  filter(Region == "NewEngland") %>%
  ggplot(aes(LOG_TOT_MCF, Pro_Rad_SqRt)) +
    geom_point(size = 3, color = "red") +
    geom_smooth(se = FALSE, method = "lm", fullrange = TRUE, color = "red") +
    scale_x_continuous(limits = c(-2, 4.5), breaks = c(seq(-2,4.5,.5))) +
    scale_y_continuous(limits = c(0, 14), breaks = c(seq(0,14,2))) +
    theme_fivethirtyeight(base_size = 30, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_blank(), axis.title.y = element_blank()) +
    ggpubr::stat_regline_equation(label.x = 1, label.y = 14, size = 10, aes(label = ..rr.label..))

P <- TPOSampleJoined %>%
  filter(Region == "Plains") %>%
  ggplot(aes(LOG_TOT_MCF, Pro_Rad_SqRt)) +
    geom_point(size = 3, color = "brown") +
    geom_smooth(se = FALSE, method = "lm", fullrange = TRUE, color = "brown") +
    scale_x_continuous(limits = c(-2, 4.5), breaks = c(seq(-2,4.5,.5))) +
    scale_y_continuous(limits = c(0, 14), breaks = c(seq(0,14,2))) +
    theme_fivethirtyeight(base_size = 30, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_blank(), axis.title.y = element_blank()) +
    ggpubr::stat_regline_equation(label.x = 1, label.y = 14, size = 10, aes(label = ..rr.label..))


Test <- ggarrange(plotlist = list(Agg,C,L,MA,NE,P), labels = list("Agg.", "Central", "Lake", "MidAtl", "NewEng", "Plains"), font.label = list(size = 50, face = "plain", color ="black", family = 'serif')) 

annotate_figure(Test,
                left = textGrob("Sqrt(Procurement Radius)", rot = 90, gp = gpar(cex = 4, fontfamily = "serif")),
                bottom = textGrob("Log(10) MCF",gp = gpar(cex = 4, fontfamily = "serif")),
                top = textGrob("Regression Observations", gp = gpar(cex = 6, fontfamily = "serif")))

Show code

rm(Agg,C,L,MA,NE,P)

3 Linear Regression

Show code

# Initialize a linear regression object, linear_model
linear_model <- linear_reg() %>% 
  set_engine("lm") %>% 
  set_mode("regression")
 
lm_fit <- linear_model %>% 
  fit(Pro_Rad_SqRt ~ LOG_TOT_MCF * Region, data = data_training)

VI <- as.data.frame(lm_fit$fit$coefficients) %>%
  rownames_to_column(var = "variable") %>%
  rename(scaled_importance = 'lm_fit$fit$coefficients')

VI <- VI[2:10,]

VI <- VI %>%
  mutate(scaled_importance = abs(1/(1-(scaled_importance^2))),
         scaled_importance = scaled_importance/max(scaled_importance))


VI %>%
  mutate(variable = as.factor(variable),
         variable = fct_reorder(variable, scaled_importance)) %>%
  filter(scaled_importance > 0) %>%
ggplot(aes(y = variable, x = scaled_importance)) + 
  geom_col() +
  scale_x_continuous(breaks = c(seq(0,1,.1)), limits = c(0,1)) + 
  labs(title = "Linear Regression - Scaled Variable Importance Scores", subtitle = "Scores calculated using test data through R-Squared values.",y = "Scaled VI Score", x = "Variable") +
  theme_fivethirtyeight(base_size = 20, base_family = 'serif') +
  theme(axis.title = element_text(family = 'serif', size = 30))

Show code

# Predict outcome
predictions <- predict(lm_fit, new_data = data_test)
 
# Combine test data with predictions
data_test_results <- data_test %>% 
  select(LOG_TOT_MCF, Region, Pro_Rad_SqRt) %>% 
  bind_cols(predictions)

# data_test_results <- data_test_results %>%
#   mutate(Pro_Rad_SqRt = Pro_Rad_SqRt^2,
#          .pred = .pred^2)

lin_testdata <- data_test_results %>%
  mutate(residuals = abs(Pro_Rad_SqRt - .pred))

lin_test_results <- lin_testdata %>%
  select(Pro_Rad_SqRt, .pred) %>%
  bind_cols(mean_res = mean(lin_testdata$residuals))

lin_rmse <- round(rmse(lin_test_results$.pred, lin_test_results$Pro_Rad_SqRt), 3)
lin_mae <- round(mae_vec(lin_test_results$.pred, lin_test_results$Pro_Rad_SqRt), 3)
lin_r2 <- round(R2_Score(lin_test_results$.pred, lin_test_results$Pro_Rad_SqRt), 3)

paste0("Overall RMSE = ", round(rmse(lin_test_results$.pred, lin_test_results$Pro_Rad_SqRt), 3))

[1] "Overall RMSE = 2.074"

Show code

test <- lin_testdata %>%
  dplyr::group_by(Region) %>%
  summarise_at(vars(residuals), list(MeanRes = mean))
test2 <- lin_testdata %>%
  dplyr::group_by(Region) %>%
  summarise(rmse = rmse(Pro_Rad_SqRt, .pred))

test <- test %>%
  left_join(test2, by = 'Region') %>%
  mutate(MeanRes = round(MeanRes, 2),
         rmse = round(rmse, 2))
rm(test2)


  ggplot(lin_testdata, aes(.pred, Pro_Rad_SqRt)) +
    geom_point(size = 1) +
    geom_abline() +
    scale_x_continuous(limits = c(1, 9), breaks = c(seq(1,9,1))) +
    scale_y_continuous(limits = c(0, 13), breaks = c(seq(0,13,1))) +
    facet_wrap(~Region, nrow = 2) +
    theme_fivethirtyeight(base_size = 20, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_text(family = 'serif', size = 30), 
          axis.title.y = element_text(family = 'serif', size = 30)) +
    labs(title = "Linear Regression - Test Data Residual Plot", subtitle = "SqRt(Procurement Radius)" , x = "Prediction", y = "Observation") +
  geom_text(x = 4.3, y = 11, aes(label = MeanRes, size = 4), data = test) +
  annotate("text", x = 3.5, y = 11, label = "Mean Res: ") +
  geom_text(x = 4.3, y = 12, aes(label = rmse, size = 4), data = test) +
  annotate("text", x = 3.65, y = 12, label = "RMSE: ") +
  theme(legend.position="none") +
    ggpubr::stat_regline_equation(label.x = 5.75, label.y = 13, aes(label = ..rr.label..))

Show code

#EVALUATE THE MODEL BEFORE PROCEEDING, then use last_fit() to fit on the test dataset
# Define a linear regression model
linear_model <- linear_reg() %>% 
  set_engine('lm') %>% 
  set_mode('regression')
 
# Train linear_model with last_fit()
linear_fit <- linear_model %>% 
  last_fit(Pro_Rad_SqRt ~ LOG_TOT_MCF * Region, split = data_split)
 
# Collect predictions
lin_finalpreds <- linear_fit %>% 
  collect_predictions() %>%
  mutate(Res = abs(Pro_Rad_SqRt - .pred))

rm(linear_fit, linear_model, lm_fit, predictions, data_test_results, lin_testdata)

4 Additive Regression

Show code

# Initialize a linear regression object, linear_model
mgcv_model <- gen_additive_mod() %>% 
  set_engine("mgcv") %>% 
  set_mode("regression")
 
mgcv_fit <- mgcv_model %>% 
  fit(Pro_Rad_SqRt ~ s(LOG_TOT_MCF, by = Region) + Region, 
      data = data_training, method = "REML")
 
VI <- as.data.frame(mgcv_fit$fit$coefficients) %>%
  rownames_to_column(var = "variable") %>%
  rename(scaled_importance = 'mgcv_fit$fit$coefficients')

VI <- VI[2:10,]

VI <- VI %>%
  mutate(scaled_importance = abs(1/(1-(scaled_importance^2))),
         scaled_importance = scaled_importance/max(scaled_importance))


VI %>%
  mutate(variable = as.factor(variable),
         variable = fct_reorder(variable, scaled_importance)) %>%
  filter(scaled_importance > 0) %>%
ggplot(aes(y = variable, x = scaled_importance)) + 
  geom_col() +
  scale_x_continuous(breaks = c(seq(0,1,.1)), limits = c(0,1)) + 
  labs(title = "Additive Regression - Scaled Variable Importance Scores", subtitle = "Scores calculated using test data through R-Squared values.",y = "Scaled VI Score", x = "Variable") +
  theme_fivethirtyeight(base_size = 20, base_family = 'serif') +
  theme(axis.title = element_text(family = 'serif', size = 30))

Show code

# Predict outcome
mgcv_testpreds <- predict(mgcv_fit, new_data = data_test)
mgcv_finalpreds <- predict(mgcv_fit, new_data = Test_Data)
 

# Combine test data with predictions
mgcv_testpreds <- data_test %>% 
  select(LOG_TOT_MCF, Region, Pro_Rad_SqRt) %>% 
  bind_cols(mgcv_testpreds) %>%
  mutate(residuals = abs(Pro_Rad_SqRt - .pred))

mgcv_rmse <- round(rmse(mgcv_testpreds$.pred, mgcv_testpreds$Pro_Rad_SqRt), 3)
mgcv_mae <- round(mae_vec(mgcv_testpreds$.pred, mgcv_testpreds$Pro_Rad_SqRt), 3)
mgcv_r2 <- round(R2_Score(mgcv_testpreds$.pred, mgcv_testpreds$Pro_Rad_SqRt), 3)
paste0("Overall RMSE = ", round(rmse(mgcv_testpreds$.pred, mgcv_testpreds$Pro_Rad_SqRt), 3))

[1] "Overall RMSE = 2.098"

Show code

# Combine final data with predictions
mgcv_finalpreds <- Test_Data %>% 
  select(LOG_TOT_MCF, Region, Pro_Rad_SqRt) %>% 
  bind_cols(mgcv_finalpreds) %>%
  mutate(residuals = abs(Pro_Rad_SqRt - .pred))

test <- mgcv_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise_at(vars(residuals), list(MeanRes = mean))
test2 <- mgcv_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise(rmse = rmse(Pro_Rad_SqRt, .pred))

test <- test %>%
  left_join(test2, by = 'Region') %>%
  mutate(MeanRes = round(MeanRes, 2),
         rmse = round(rmse, 2))
rm(test2)


ggplot(mgcv_testpreds, aes(.pred, Pro_Rad_SqRt)) +
    geom_point(size = 1) +
    geom_abline() +
    scale_x_continuous(limits = c(1, 9), breaks = c(seq(1,9,1))) +
    scale_y_continuous(limits = c(0, 13), breaks = c(seq(0,13,1))) +
    facet_wrap(~Region, nrow = 2) +
    theme_fivethirtyeight(base_size = 20, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_text(family = 'serif', size = 30), 
          axis.title.y = element_text(family = 'serif', size = 30)) +
    labs(title = "Additive Regression - Test Data Residual Plot", subtitle = "SqRt(Procurement Radius)", 
         x = "Prediction", y = "Observation") +
  geom_text(x = 4.3, y = 11, aes(label = MeanRes, size = 4), data = test) +
  annotate("text", x = 3.5, y = 11, label = "Mean Res: ") +
  geom_text(x = 4.3, y = 12, aes(label = rmse, size = 4), data = test) +
  annotate("text", x = 3.65, y = 12, label = "RMSE: ") +
  theme(legend.position="none") +
      ggpubr::stat_regline_equation(label.x = 5.75, label.y = 13, aes(label = ..rr.label..))

Show code

# mgcv_testpreds <- mgcv_testpreds %>%
#   rmse(Pro_Rad_SqRt, .pred) %>%
#   bind_cols(mean_res = mean(mgcv_testpreds$residuals))
# 
# mgcv_finalpreds <- mgcv_finalpreds %>%
#   rmse(Pro_Rad_SqRt, .pred) %>%
#   bind_cols(mean_res = mean(mgcv_finalpreds$residuals))

rm(mgcv_fit, mgcv_model)

5 Decsision Tree Regression

Show code

dt_model <- decision_tree() %>% 
  set_engine('rpart') %>% 
  set_mode('regression')
 
# Build feature engineering pipeline
data_recipe <- recipe(Pro_Rad_SqRt ~ ., data = data_training) %>% 
  # Correlation filter
  step_corr(all_numeric(), threshold = .8) %>%
  # Create dummy variables
  step_dummy(all_nominal(), -all_outcomes())
 
# Create a workflow
data_wkfl <- workflow() %>% 
  # Include the model object
  add_model(dt_model) %>% 
  # Include the recipe object
  add_recipe(data_recipe)
 
# Train the workflow
data_wkfl_fit <- data_wkfl %>% 
  last_fit(split = data_split)

# Calculate performance metrics on test data
data_wkfl_fit %>% 
  collect_metrics(summarize = FALSE)

# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard      2.18   Preprocessor1_Model1
2 rsq     standard      0.0738 Preprocessor1_Model1

Show code

# Create cross validation folds
set.seed(260)
data_folds <- vfold_cv(data_training, v = 5, strata = Pro_Rad_SqRt)
 
# Create custom metrics function
data_metrics <- metric_set(yardstick::rmse)
 
# Fit resamples
data_rs <- data_wkfl %>% 
  fit_resamples(resamples = data_folds)
 
# Detailed cross validation results
data_results <- data_rs %>% 
  collect_metrics(summarize = FALSE)
 
# Set tuning hyperparameters
dt_tune_model <- decision_tree(tree_depth = tune(), min_n = tune()) %>% 
  set_engine('rpart') %>% 
  set_mode('regression')
 
# Create a tuning workflow
data_tune_wkfl <- data_wkfl %>% 
  # Replace model
  update_model(dt_tune_model)
 
# Hyperparameter tuning with grid search
set.seed(284)
dt_grid <- grid_random(parameters(dt_tune_model), size = 5)
# Hyperparameter tuning
dt_tuning <- data_tune_wkfl %>% 
  tune_grid(resamples = data_folds, grid = dt_grid)
 
# Collect detailed tuning results
dt_tuning_results <- dt_tuning %>% 
  collect_metrics(summarize = FALSE)
 
# Display 5 best performing models
Test <- dt_tuning %>% show_best(metric = 'rmse', n = 5)
 
# Select based on best performance
best_dt_model <- dt_tuning %>% 
  # Choose the best model based on roc_auc
  select_best(metric = 'rmse')
 
# Finalize your workflow
final_data_wkfl <- data_tune_wkfl %>% 
  finalize_workflow(best_dt_model)
 
# Train finalized decision tree workflow
data_final_fit <- final_data_wkfl %>% 
  last_fit(split = data_split)
 
# View performance metrics
data_final_fit %>% collect_metrics()

# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard      2.18   Preprocessor1_Model1
2 rsq     standard      0.0738 Preprocessor1_Model1

Show code

# Create an ROC curve
dt_testpreds <- data_final_fit %>% 
  # Collect predictions
  collect_predictions() %>%
  mutate(Res = abs(Pro_Rad_SqRt - .pred)) %>%
  cbind(data_test[,3]) %>%
  select(-c(id, .row, .config)) 

test <- dt_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise_at(vars(Res), list(MeanRes = mean))
test2 <- dt_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise(rmse = rmse(Pro_Rad_SqRt, .pred))

test <- test %>%
  left_join(test2, by = 'Region') %>%
  mutate(MeanRes = round(MeanRes, 2),
         rmse = round(rmse, 2))
rm(test2)

dtree_rmse <- round(rmse(dt_testpreds$.pred, dt_testpreds$Pro_Rad_SqRt), 3)
dtree_mae <- round(mae_vec(dt_testpreds$.pred, dt_testpreds$Pro_Rad_SqRt), 3)
dtree_r2 <- round(R2_Score(dt_testpreds$.pred, dt_testpreds$Pro_Rad_SqRt), 3)

paste0("Overall RMSE = ", round(rmse(dt_testpreds$.pred, dt_testpreds$Pro_Rad_SqRt), 3))

[1] "Overall RMSE = 2.18"

Show code

dt_testpreds %>%
  ggplot(aes(.pred, Pro_Rad_SqRt)) +
    geom_point(size = 1) +
    geom_abline() +
    scale_x_continuous(limits = c(1, 9), breaks = c(seq(1,9,1))) +
    scale_y_continuous(limits = c(0, 13), breaks = c(seq(0,13,1))) +
    facet_wrap(~Region, nrow = 2) +
    theme_fivethirtyeight(base_size = 20, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_text(family = 'serif', size = 30), 
          axis.title.y = element_text(family = 'serif', size = 30)) +
    labs(title = "Decision Tree Regression - Test Data Residual Plot", 
         subtitle = "SqRt(Procurement Radius)" ,x = "Prediction", y = "Observation") +
  geom_text(x = 4.3, y = 11, aes(label = MeanRes, size = 4), data = test) +
  annotate("text", x = 3.5, y = 11, label = "Mean Res: ") +
  geom_text(x = 4.3, y = 12, aes(label = rmse, size = 4), data = test) +
  annotate("text", x = 3.65, y = 12, label = "RMSE: ") +
  theme(legend.position="none") +
      ggpubr::stat_regline_equation(label.x = 5.75, label.y = 13, aes(label = ..rr.label..))

6 Random Forest Regression

Show code

rf_model <- rand_forest(trees = 1000) %>% 
  set_engine('ranger') %>% 
  set_mode('regression')
 
# Build feature engineering pipeline
data_recipe <- recipe(Pro_Rad_SqRt ~ ., data = data_training) %>% 
  # Correlation filter
  step_corr(all_numeric(), threshold = .8) %>%
  # Create dummy variables
  step_dummy(all_nominal(), -all_outcomes())

# Create a workflow
data_wkfl <- workflow() %>% 
  # Include the model object
  add_model(rf_model) %>% 
  # Include the recipe object
  add_recipe(data_recipe)

# Train the workflow
data_wkfl_fit <- data_wkfl %>% 
  last_fit(split = data_split)

# Calculate performance metrics on test data
data_wkfl_fit %>% 
  collect_metrics(summarize = FALSE)

# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard       2.06  Preprocessor1_Model1
2 rsq     standard       0.150 Preprocessor1_Model1

Show code

# Create cross validation folds
set.seed(260)
data_folds <- vfold_cv(data_training, v = 5, strata = Pro_Rad_SqRt)
 
# Create custom metrics function
data_metrics <- metric_set(yardstick::rmse)

# Fit resamples
data_rs <- data_wkfl %>% 
  fit_resamples(resamples = data_folds)
 
# Detailed cross validation results
data_results <- data_rs %>% 
  collect_metrics(summarize = FALSE)
 
# Set tuning hyperparameters
rf_tune_model <- rand_forest(trees = tune(), mtry = tune()) %>% 
  set_engine('ranger') %>% 
  set_mode('regression')
# Create a tuning workflow
data_tune_wkfl <- data_wkfl %>% 
  # Replace model
  update_model(rf_tune_model)

# Hyperparameter tuning with grid search
set.seed(284)

# Hyperparameter tuning
rf_tuning <- data_tune_wkfl %>% 
  tune_grid(resamples = data_folds)
 
# Collect detailed tuning results
rf_tuning_results <- rf_tuning %>% 
  collect_metrics(summarize = FALSE)
 
# Display 5 best performing models
Test <- rf_tuning %>% show_best(metric = 'rmse', n = 5)
 
# Select based on best performance
best_rf_model <- rf_tuning %>% 
  # Choose the best model based on rmse
  select_best(metric = 'rmse')
 
# Finalize your workflow
final_data_wkfl <- data_tune_wkfl %>% 
  finalize_workflow(best_rf_model)
 
# Train finalized decision tree workflow
data_final_fit <- final_data_wkfl %>% 
  last_fit(split = data_split)
 
# View performance metrics
data_final_fit %>% collect_metrics()

# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard       2.06  Preprocessor1_Model1
2 rsq     standard       0.147 Preprocessor1_Model1

Show code

# Create an ROC curve
rf_testpreds <- data_final_fit %>% 
  # Collect predictions
  collect_predictions() %>%
  mutate(Res = abs(Pro_Rad_SqRt - .pred)) %>%
  cbind(data_test[,3]) %>%
  select(-c(id, .row, .config)) 

test <- rf_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise_at(vars(Res), list(MeanRes = mean))

test2 <- rf_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise(rmse = rmse(Pro_Rad_SqRt, .pred))

test <- test %>%
  left_join(test2, by = 'Region') %>%
  mutate(MeanRes = round(MeanRes, 2),
         rmse = round(rmse, 2))
rm(test2)

rf_rmse <- round(rmse(rf_testpreds$.pred, rf_testpreds$Pro_Rad_SqRt), 3)
rf_mae <- round(mae_vec(rf_testpreds$.pred, rf_testpreds$Pro_Rad_SqRt), 3)
rf_r2 <- round(R2_Score(rf_testpreds$.pred, rf_testpreds$Pro_Rad_SqRt), 3)

rf_testpreds %>%
  ggplot(aes(.pred, Pro_Rad_SqRt)) +
    geom_point(size = 1) +
    geom_abline() +
    scale_x_continuous(limits = c(1, 9), breaks = c(seq(1,9,1))) +
    scale_y_continuous(limits = c(0, 13), breaks = c(seq(0,13,1))) +
    facet_wrap(~Region, nrow = 2) +
    theme_fivethirtyeight(base_size = 20, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_text(family = 'serif', size = 30), 
          axis.title.y = element_text(family = 'serif', size = 30)) +
    labs(title = "Random Forest Regression - Test Data Residual Plot", 
         subtitle = "SqRt(Procurement Radius)" , x = "Prediction", y = "Observation") +
  geom_text(x = 4.3, y = 11, aes(label = MeanRes, size = 4), data = test) +
  annotate("text", x = 3.5, y = 11, label = "Mean Res: ") +
  geom_text(x = 4.3, y = 12, aes(label = rmse, size = 4), data = test) +
  annotate("text", x = 3.65, y = 12, label = "RMSE: ") +
  theme(legend.position="none") +
      ggpubr::stat_regline_equation(label.x = 5.75, label.y = 13, aes(label = ..rr.label..))

7 Gradient Boosted Regression

Show code

# Specify the model class
xg_model <- boost_tree(
                trees = 1000,
                learn_rate = tune(),
                tree_depth = tune(),
                sample_size = tune()) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

# Create the tuning grid
tunegrid_boost <- grid_regular(parameters(xg_model), 
                      levels = 5)

# Create CV folds of training data
folds <- vfold_cv(data_training, 5, strata = Pro_Rad_SqRt)

# Tune along the grid
tune_results <- tune_grid(xg_model,
                    Pro_Rad_SqRt ~ .,
                    resamples = folds,
                    grid = tunegrid_boost,
                    metrics = metric_set(yardstick::rmse))

best_params <- select_best(tune_results)

# Finalize the specification
final_spec <- finalize_model(xg_model, best_params)

# Train the final model on the full training data
final_model <- final_spec %>% fit(Pro_Rad_SqRt ~ ., data_training)

# Specify, fit, predict, and combine with training data
xg_testpreds <- final_model %>%
  predict(data_test) 

xg_testpreds <- data_test %>%
  cbind(xg_testpreds) %>%
mutate(Res = abs(Pro_Rad_SqRt - .pred)) 


test <- xg_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise_at(vars(Res), list(MeanRes = mean))

test2 <- xg_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise(rmse = rmse(Pro_Rad_SqRt, .pred))

test <- test %>%
  left_join(test2, by = 'Region') %>%
  mutate(MeanRes = round(MeanRes, 2),
         rmse = round(rmse, 2))
rm(test2)

xg_rmse <- round(rmse(xg_testpreds$.pred, xg_testpreds$Pro_Rad_SqRt), 3)
xg_mae <- round(mae_vec(xg_testpreds$.pred, xg_testpreds$Pro_Rad_SqRt), 3)
xg_r2 <- round(R2_Score(xg_testpreds$.pred, xg_testpreds$Pro_Rad_SqRt), 3)

xg_testpreds %>%
  ggplot(aes(.pred, Pro_Rad_SqRt)) +
    geom_point(size = 1) +
    geom_abline() +
    scale_x_continuous(limits = c(1, 9), breaks = c(seq(1,9,1))) +
    scale_y_continuous(limits = c(0, 13), breaks = c(seq(0,13,1))) +
    facet_wrap(~Region, nrow = 2) +
    theme_fivethirtyeight(base_size = 20, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), 
          axis.title.x = element_text(family = 'serif', size = 30), 
          axis.title.y = element_text(family = 'serif', size = 30)) +
    labs(title = "Gradient Boosted Regression - Test Data Residual Plot", 
         subtitle = "SqRt(Procurement Radius)" , x = "Prediction", y = "Observation") +
  geom_text(x = 4.3, y = 11, aes(label = MeanRes, size = 4), data = test) +
  annotate("text", x = 3.5, y = 11, label = "Mean Res: ") +
  geom_text(x = 4.3, y = 12, aes(label = rmse, size = 4), data = test) +
  annotate("text", x = 3.65, y = 12, label = "RMSE: ") +
  theme(legend.position="none") +
      ggpubr::stat_regline_equation(label.x = 5.75, label.y = 13, aes(label = ..rr.label..))

8 Unsupervised Regression - Machine Learning using ‘h2o’

Show code

VI <- h2o.varimp(leader)

VI <- VI %>%
  mutate(variable = case_when(variable == "LOG_TOT_MCF" ~ "Log10(MCF)",
                              variable == "MILL_STATE" ~ "State",
                              variable == "MILL_TYPE_CD" ~ "Mill Type",
                              variable == "Region" ~ "Region"))


VI %>%
  mutate(variable = as.factor(variable),
         variable = fct_reorder(variable, scaled_importance)) %>%
  filter(scaled_importance > 0) %>%
ggplot(aes(y = variable, x = scaled_importance)) + 
  geom_col() +
  scale_x_continuous(breaks = c(seq(0,1,.1)), limits = c(0,1)) + 
  labs(title = "h2o Regression - Scaled Variable Importance Scores", 
       subtitle = "Scores calculated using test data.", y = "Scaled VI Score", x = "Variable") +
  theme_fivethirtyeight(base_size = 20, base_family = 'serif') +
  theme(axis.title = element_text(family = 'serif', size = 30))

Show code

head(lb)

                                      model_id     rmse      mse
1 GBM_grid_1_AutoML_1_20221026_163026_model_10 2.029082 4.117172
2  GBM_grid_1_AutoML_1_20221026_163026_model_2 2.044099 4.178339
3  GBM_grid_1_AutoML_1_20221026_163026_model_8 2.049108 4.198843
4 GBM_grid_1_AutoML_1_20221026_163026_model_11 2.050137 4.203060
5               GBM_1_AutoML_1_20221026_163026 2.056010 4.227176
6               GLM_1_AutoML_1_20221026_163026 2.058550 4.237629
       mae     rmsle mean_residual_deviance
1 1.550014 0.2761806               4.117172
2 1.565655 0.2810973               4.178339
3 1.536726 0.2791560               4.198843
4 1.579500 0.2793606               4.203060
5 1.553396 0.2796475               4.227176
6 1.562654 0.2794918               4.237629

Show code

h2o_testpreds <- as.data.frame(valid) %>%
  cbind(pred) %>%
  mutate(Res = abs(predict - Pro_Rad_SqRt)) 

test <- h2o_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise_at(vars(Res), list(MeanRes = mean))

test2 <- h2o_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise(rmse = rmse(Pro_Rad_SqRt, predict))

test <- test %>%
  left_join(test2, by = 'Region') %>%
  mutate(MeanRes = round(MeanRes, 2),
         rmse = round(rmse, 2))
rm(test2)

h2o_rmse <- round(rmse(h2o_testpreds$predict, h2o_testpreds$Pro_Rad_SqRt), 3)
h2o_mae <- round(mae_vec(h2o_testpreds$predict, h2o_testpreds$Pro_Rad_SqRt), 3)
h2o_r2 <- round(R2_Score(h2o_testpreds$predict, h2o_testpreds$Pro_Rad_SqRt), 3)

paste0("Overall RMSE = ", h2o_rmse)

[1] "Overall RMSE = 2.029"

Show code

h2o_testpreds %>%
  ggplot(aes(predict, Pro_Rad_SqRt)) +
    geom_point(size = 1) +
    geom_abline() +
    scale_x_continuous(limits = c(3, 9), breaks = c(seq(3,9,1))) +
    scale_y_continuous(limits = c(0, 13), breaks = c(seq(0,13,1))) +
    facet_wrap(~Region, nrow = 2) +
    theme_fivethirtyeight(base_size = 20, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_text(family = 'serif', size = 30), 
          axis.title.y = element_text(family = 'serif', size = 30)) +
    labs(title = "h2o Regression - Test Data Residual Plot", 
         subtitle = "SqRt(Procurement Radius)" , x = "Prediction", y = "Observation") +
  geom_text(x = 4.3, y = 11, aes(label = MeanRes, size = 4), data = test) +
  annotate("text", x = 3.5, y = 11, label = "Mean Res: ") +
  geom_text(x = 4.3, y = 12, aes(label = rmse, size = 4), data = test) +
  annotate("text", x = 3.65, y = 12, label = "RMSE: ") +
  theme(legend.position="none") +
      ggpubr::stat_regline_equation(label.x = 5.75, label.y = 13, aes(label = ..rr.label..))

Show code

RMSEs <- lin_rmse %>%
  cbind(mgcv_rmse, dtree_rmse, rf_rmse, xg_rmse, h2o_rmse) 
RMSEs <- as.data.frame(RMSEs) %>% 
 rename('Linear' = '.', 'Additive' = 'mgcv_rmse', 'Decision Tree' = 'dtree_rmse', 
        'Random Forest' = 'rf_rmse', 'Gradient Boosted' = 'xg_rmse' ,'Machine Learning (h2o)' = 'h2o_rmse')
RMSEs <- RMSEs %>%
  pivot_longer(names_to = "Model", values_to = 'RMSE', cols = c(1:6)) %>%
  mutate(Model = as.factor(Model),
         Model = fct_reorder(Model, RMSE))


MAEs <- lin_mae %>%
  cbind(mgcv_mae, dtree_mae, rf_mae, xg_mae, h2o_mae) 
MAEs <- as.data.frame(MAEs) %>% 
 rename('Linear' = '.', 'Additive' = 'mgcv_mae', 'Decision Tree' = 'dtree_mae', 
        'Random Forest' = 'rf_mae', 'Gradient Boosted' = 'xg_mae' , 'Machine Learning (h2o)' = 'h2o_mae')
MAEs <- MAEs %>%
  pivot_longer(names_to = "Model", values_to = 'MAE', cols = c(1:6)) %>%
  mutate(Model = as.factor(Model),
         Model = fct_reorder(Model, MAE))


r2s <- lin_r2 %>%
  cbind(mgcv_r2, dtree_r2, rf_r2, xg_r2, h2o_r2) 
r2s <- as.data.frame(r2s) %>% 
 rename('Linear' = '.', 'Additive' = 'mgcv_r2', 'Decision Tree' = 'dtree_r2', 
        'Random Forest' = 'rf_r2', 'Gradient Boosted' = 'xg_r2' ,'Machine Learning (h2o)' = 'h2o_r2')
r2s <- r2s %>%
  pivot_longer(names_to = "Model", values_to = 'r2', cols = c(1:6)) %>%
  mutate(Model = as.factor(Model),
         Model = fct_reorder(Model, r2))

ModelStats <- RMSEs %>%
  left_join(MAEs, by = 'Model') %>%
  left_join(r2s, by = 'Model')

rm(lin_rmse, lin_mae, lin_r2, mgcv_rmse, mgcv_mae, mgcv_r2, dtree_rmse, dtree_mae, 
   dtree_r2, rf_rmse, rf_mae, rf_r2, xg_rmse, xg_mae, xg_r2, h2o_rmse, h2o_mae, h2o_r2, RMSEs, MAEs, r2s, x, y)


pal <- brewer.pal(6, name = "Spectral")
pal <- rev(pal)

ggplot(ModelStats, aes(x = Model, y = RMSE, fill = Model)) + 
  geom_col(position = "identity") +
  scale_y_continuous(limits = c(0,max(ModelStats$RMSE)+.05)) + 
  scale_fill_manual(values = pal, name = "Model") +
  labs(title = "Test Data RMSEs for Assessed Models", y = "RMSE", x = "Model", 
       subtitle = "Leading models shown for models requiring hyperparameter tuning (all but linear)") +
  theme_fivethirtyeight(base_size = 20, base_family = 'serif') +
theme(axis.title = element_text(family = 'serif', size = 30), axis.text.x = element_blank(), 
      axis.title.x = element_blank(), panel.grid.major.x = element_blank(), 
      legend.text = element_text(size = 24, family = "serif"), 
      legend.title = element_text(size = 30, family = "serif")) +
  geom_text(aes(label = paste0("RMSE: ", RMSE)), size = 6, nudge_y = .05)

Show code

ggplot(ModelStats, aes(x = Model, y = MAE, fill = Model)) + 
  geom_col(position = "identity") +
  scale_y_continuous(limits = c(0,max(ModelStats$MAE)+.05)) + 
  scale_fill_manual(values = pal, name = "Model") +
  labs(title = "Test Data MAEs for Assessed Models", y = "MAE", x = "Model", 
       subtitle = "Leading models shown for models requiring hyperparameter tuning (all but linear)") +
  theme_fivethirtyeight(base_size = 20, base_family = 'serif') +
  theme(axis.title = element_text(family = 'serif', size = 30), axis.text.x = element_blank(), 
        axis.title.x = element_blank(), panel.grid.major.x = element_blank(), 
        legend.text = element_text(size = 24, family = "serif"), 
        legend.title = element_text(size = 30, family = "serif")) +
  geom_text(aes(label = paste0("MAE: ", MAE)), size = 6 , nudge_y = .05)

Show code

ggplot(ModelStats, aes(x = Model, y = r2, fill = Model)) + 
  geom_col(position = "identity") +
  scale_y_continuous(limits = c(0,.25)) + 
  scale_fill_manual(values = pal, name = "Model") +
  labs(title = "Test Data R-Squared Values for Assessed Models", y = "R-Squared", x = "Model",
       subtitle = "Leading models shown for models requiring hyperparameter tuning (all but linear)") +
  theme_fivethirtyeight(base_size = 20, base_family = 'serif') +
  theme(axis.title = element_text(family = 'serif', size = 20), axis.text.x = element_blank(), 
        axis.title.x = element_blank(), panel.grid.major.x = element_blank(), 
        legend.text = element_text(size = 24, family = "serif"), 
        legend.title = element_text(size = 30, family = "serif")) +
  geom_text(aes(label = paste0("R-Squared: " ,r2)), size = 6 , nudge_y = .005)

Show code

knitr::kable(ModelStats, digits = 3, align = "cccc", col.names = c("Model", "RMSE", "MAE", "R-Squared"), caption = "Model Fit Statistics for Assessed Models. 
      All statistics calculated using test data.") %>%
  kable_styling(font_size = 16)

Table 1: Model Fit Statistics for Assessed Models. All statistics calculated using test data.
Model	RMSE	MAE	R-Squared
Linear	2.074	1.525	0.130
Additive	2.098	1.523	0.110
Decision Tree	2.180	1.636	0.039
Random Forest	2.062	1.503	0.141
Gradient Boosted	2.053	1.489	0.148
Machine Learning (h2o)	2.029	1.550	0.148

TPO Heat Map Analysis

1 Distribution Visualizations

2 Normal-QQ Plots

3 Linear Regression

4 Additive Regression

5 Decsision Tree Regression

6 Random Forest Regression

7 Gradient Boosted Regression

8 Unsupervised Regression - Machine Learning using ‘h2o’