TPO Heat Map Analysis

TPO Heat Map Analysis

Ian Kennedy
2022-10-26

1 Distribution Visualizations

Show code
TPOSampleJoined %>%
  ggplot(aes(log10(TOT_MCF))) +
  geom_density() +
  scale_x_continuous(limits = c(-2,5), breaks = c(seq(-2,5,1))) +
  scale_y_continuous(limits = c, breaks = c(seq(0,.7,.1))) +
  theme_fivethirtyeight(base_size = 20, base_family = "serif") +
  theme(axis.title = element_text(family = 'serif', size = 30)) +
  labs(title = "Logged Volume Distribution - Unfiltered", x = "Log(10) MCF", y = "Density") 
Show code
TPOSampleJoined %>%
  filter(TOT_MCF < 16000 & Pro_Rad < 200 & !is.na(Pro_Rad)) %>%
  ggplot(aes(log10(TOT_MCF))) +
  geom_density() +
  scale_x_continuous(limits = c(-2,5), breaks = c(seq(-2,5,1))) +
  scale_y_continuous(limits = c, breaks = c(seq(0,.7,.1))) +
  theme_fivethirtyeight(base_size = 20, base_family = "serif") +
  theme(axis.title = element_text(family = 'serif', size = 30)) +
  labs(title = "Logged Volume Distribution - Model Obs. Only", x = "Log(10) MCF", y = "Density") 
Show code
TPOSampleJoined %>%
  ggplot(aes(sqrt(Pro_Rad)))+
  geom_density() + 
  scale_x_continuous(limits = c(0, 20), breaks = c(seq(0,20,2))) +
  scale_y_continuous(limits = c(0,.25), breaks = c(seq(0,.25,.05))) +
  theme_fivethirtyeight(base_size = 20, base_family = "serif") +
  theme(axis.title = element_text(family = 'serif', size = 30)) +
  labs(title = "SqRt Radius Distribution", x = "SqRt Procurement Radius", y = "Density")

2 Normal-QQ Plots

Show code
Agg <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad)) %>%
ggplot(aes(sample = log10(TOT_MCF))) +
  stat_qq(geom = "point", size = 2) +
  stat_qq_line() +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(-2,5,1)), limits = c(-2,5.2)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))


L <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "Lake") %>%
ggplot(aes(sample = log10(TOT_MCF))) +
  stat_qq(geom = "point", size = 2, color = "blue") +
  stat_qq_line(color = "blue") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(-2,5,1)), limits = c(-2,5.2)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

NE <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "NewEngland") %>%
ggplot(aes(sample = log10(TOT_MCF))) +
  stat_qq(geom = "point", size = 2, color = "red") +
  stat_qq_line(color = "red") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(-2,5,1)), limits = c(-2,5.2)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

MA <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "MidAtl") %>%
ggplot(aes(sample = log10(TOT_MCF))) +
  stat_qq(geom = "point", size = 2, color = "orange") +
  stat_qq_line(color = "orange") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(-2,5,1)), limits = c(-2,5.2)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

P <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "Plains") %>%
ggplot(aes(sample = log10(TOT_MCF))) +
  stat_qq(geom = "point", size = 2, color = "brown") +
  stat_qq_line(color = "brown") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(-2,5,1)), limits = c(-2,5.2)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

C <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "Central") %>%
ggplot(aes(sample = log10(TOT_MCF))) +
  stat_qq(geom = "point", size = 2, color = "darkgreen") +
  stat_qq_line(color = "darkgreen") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(-2,5,1)), limits = c(-2,5.2)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

Test <- ggarrange(plotlist = list(Agg,C,L,MA,NE,P), labels = list("Agg.", "Central", "Lake", "MidAtl", "NewEng", "Plains"), font.label = list(size = 50, face = "plain", color ="black", family = 'serif')) 

annotate_figure(Test,
                left = textGrob("Log10(MCF)", rot = 90, vjust = .4, gp = gpar(cex = 4, fontfamily = "serif")),
                bottom = textGrob("Theoretical Quantiles",gp = gpar(cex = 4, fontfamily = "serif")),
                top = textGrob("Normal-QQ Plots - Log10(MCF)", gp = gpar(cex = 6, fontfamily = "serif")))
Show code
rm(Agg,C,L,MA,NE,P)


Agg <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Pro_Rad < 200) %>%
ggplot(aes(sample = sqrt(Pro_Rad))) +
  stat_qq(geom = "point", size = 2) +
  stat_qq_line() +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,5)) +
  scale_y_continuous(breaks = c(seq(0,15,3)), limits = c(-.1,15)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

L <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "Lake") %>%
ggplot(aes(sample = sqrt(Pro_Rad))) +
  stat_qq(geom = "point", size = 2, color = "blue") +
  stat_qq_line(color = "blue") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(0,15,3)), limits = c(-.4,15)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

NE <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "NewEngland") %>%
ggplot(aes(sample = sqrt(Pro_Rad))) +
  stat_qq(geom = "point", size = 2, color = "red") +
  stat_qq_line(color = "red") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(0,15,3)), limits = c(-.4,15)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

MA <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "MidAtl") %>%
ggplot(aes(sample = sqrt(Pro_Rad))) +
  stat_qq(geom = "point", size = 2, color = "orange") +
  stat_qq_line(color = "orange") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(0,15,3)), limits = c(-.4,15)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

P <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "Plains") %>%
ggplot(aes(sample = sqrt(Pro_Rad))) +
  stat_qq(geom = "point", size = 2, color = "brown") +
  stat_qq_line(color = "brown") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(0,15,3)), limits = c(-.4,15)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

C <- TPOSampleJoined %>%
  filter(!is.na(Pro_Rad) & Region == "Central") %>%
ggplot(aes(sample = sqrt(Pro_Rad))) +
  stat_qq(geom = "point", size = 2, color = "darkgreen") +
  stat_qq_line(color = "darkgreen") +
  scale_x_continuous(breaks = c(seq(-2,2,1)), limits = c(-4,4)) +
  scale_y_continuous(breaks = c(seq(0,15,3)), limits = c(-.4,15)) +
  theme_fivethirtyeight(base_size = 30, base_family = "serif") +
  theme(legend.position = "none", title = element_text(size = 20, family = "serif"))

Test <- ggarrange(plotlist = list(Agg,C,L,MA,NE,P), labels = list("Agg.", "Central", "Lake", "MidAtl", "NewEng", "Plains"), font.label = list(size = 50, face = "plain", color ="black", family = 'serif')) 

annotate_figure(Test,
                left = textGrob("Sqrt(Procurement Radius)", rot = 90, gp = gpar(cex = 4, fontfamily = "serif")),
                bottom = textGrob("Theoretical Quantiles",gp = gpar(cex = 4, fontfamily = "serif")),
                top = textGrob("Normal-QQ Plots - Procurement Radius", gp = gpar(cex = 6, fontfamily = "serif")))
Show code
rm(Agg,C,L,MA,NE,P)
Show code
# ggplot(TPOSampleJoined, aes(LOG_TOT_MCF, Pro_Rad_SqRt)) +
#     geom_point() +
#     geom_smooth(se = FALSE, method = "lm", fullrange = TRUE) +
#     scale_x_continuous(limits = c(-2, 4.5), breaks = c(seq(-2,4.5,.5))) +
#     scale_y_continuous(limits = c(0, 14), breaks = c(seq(0,14,2))) +
#     facet_wrap(~Region, ncol = 3) +
#     theme_fivethirtyeight(base_size = 20, base_family = "serif") +
#     theme(legend.position = "none", title = element_text(size = 20, family = "serif")) +
#     labs(title = "Regression Observations") +
#     ggpubr::stat_regline_equation(label.x = 1, label.y = 14, aes(label = ..rr.label..))


Agg <- TPOSampleJoined %>%
    ggplot(aes(LOG_TOT_MCF, Pro_Rad_SqRt)) +
    geom_point(size = 3) +
    geom_smooth(se = FALSE, method = "lm", fullrange = TRUE, color = "black") +
    scale_x_continuous(limits = c(-2, 4.5), breaks = c(seq(-2,4.5,.5))) +
    scale_y_continuous(limits = c(0, 14), breaks = c(seq(0,14,2))) +
    theme_fivethirtyeight(base_size = 30, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_blank(), axis.title.y = element_blank()) +
    ggpubr::stat_regline_equation(label.x = 1, label.y = 14, size = 10, aes(label = ..rr.label..))

C <- TPOSampleJoined %>%
  filter(Region == "Central") %>%
  ggplot(aes(LOG_TOT_MCF, Pro_Rad_SqRt)) +
    geom_point(size = 3, color = "darkgreen") +
    geom_smooth(se = FALSE, method = "lm", fullrange = TRUE, color = "darkgreen") +
    scale_x_continuous(limits = c(-2, 4.5), breaks = c(seq(-2,4.5,.5))) +
    scale_y_continuous(limits = c(0, 14), breaks = c(seq(0,14,2))) +
    theme_fivethirtyeight(base_size = 30, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_blank(), axis.title.y = element_blank()) +
    ggpubr::stat_regline_equation(label.x = 1, label.y = 14, size = 10, aes(label = ..rr.label..))

L <- TPOSampleJoined %>%
  filter(Region == "Lake") %>%
  ggplot(aes(LOG_TOT_MCF, Pro_Rad_SqRt)) +
    geom_point(size = 3, color = "blue") +
    geom_smooth(se = FALSE, method = "lm", fullrange = TRUE, color = "blue") +
    scale_x_continuous(limits = c(-2, 4.5), breaks = c(seq(-2,4.5,.5))) +
    scale_y_continuous(limits = c(0, 14), breaks = c(seq(0,14,2))) +
    theme_fivethirtyeight(base_size = 30, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_blank(), axis.title.y = element_blank()) +
    ggpubr::stat_regline_equation(label.x = 1, label.y = 14, size = 10, aes(label = ..rr.label..))

MA <- TPOSampleJoined %>%
  filter(Region == "MidAtl") %>%
  ggplot(aes(LOG_TOT_MCF, Pro_Rad_SqRt)) +
    geom_point(size = 3, color = "orange") +
    geom_smooth(se = FALSE, method = "lm", fullrange = TRUE, color = "orange") +
    scale_x_continuous(limits = c(-2, 4.5), breaks = c(seq(-2,4.5,.5))) +
    scale_y_continuous(limits = c(0, 14), breaks = c(seq(0,14,2))) +
    theme_fivethirtyeight(base_size = 30, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_blank(), axis.title.y = element_blank()) +
    ggpubr::stat_regline_equation(label.x = 1, label.y = 14, size = 10, aes(label = ..rr.label..))

NE <- TPOSampleJoined %>%
  filter(Region == "NewEngland") %>%
  ggplot(aes(LOG_TOT_MCF, Pro_Rad_SqRt)) +
    geom_point(size = 3, color = "red") +
    geom_smooth(se = FALSE, method = "lm", fullrange = TRUE, color = "red") +
    scale_x_continuous(limits = c(-2, 4.5), breaks = c(seq(-2,4.5,.5))) +
    scale_y_continuous(limits = c(0, 14), breaks = c(seq(0,14,2))) +
    theme_fivethirtyeight(base_size = 30, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_blank(), axis.title.y = element_blank()) +
    ggpubr::stat_regline_equation(label.x = 1, label.y = 14, size = 10, aes(label = ..rr.label..))

P <- TPOSampleJoined %>%
  filter(Region == "Plains") %>%
  ggplot(aes(LOG_TOT_MCF, Pro_Rad_SqRt)) +
    geom_point(size = 3, color = "brown") +
    geom_smooth(se = FALSE, method = "lm", fullrange = TRUE, color = "brown") +
    scale_x_continuous(limits = c(-2, 4.5), breaks = c(seq(-2,4.5,.5))) +
    scale_y_continuous(limits = c(0, 14), breaks = c(seq(0,14,2))) +
    theme_fivethirtyeight(base_size = 30, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_blank(), axis.title.y = element_blank()) +
    ggpubr::stat_regline_equation(label.x = 1, label.y = 14, size = 10, aes(label = ..rr.label..))


Test <- ggarrange(plotlist = list(Agg,C,L,MA,NE,P), labels = list("Agg.", "Central", "Lake", "MidAtl", "NewEng", "Plains"), font.label = list(size = 50, face = "plain", color ="black", family = 'serif')) 

annotate_figure(Test,
                left = textGrob("Sqrt(Procurement Radius)", rot = 90, gp = gpar(cex = 4, fontfamily = "serif")),
                bottom = textGrob("Log(10) MCF",gp = gpar(cex = 4, fontfamily = "serif")),
                top = textGrob("Regression Observations", gp = gpar(cex = 6, fontfamily = "serif")))
Show code
rm(Agg,C,L,MA,NE,P)

3 Linear Regression

Show code
# Initialize a linear regression object, linear_model
linear_model <- linear_reg() %>% 
  set_engine("lm") %>% 
  set_mode("regression")
 
lm_fit <- linear_model %>% 
  fit(Pro_Rad_SqRt ~ LOG_TOT_MCF * Region, data = data_training)

VI <- as.data.frame(lm_fit$fit$coefficients) %>%
  rownames_to_column(var = "variable") %>%
  rename(scaled_importance = 'lm_fit$fit$coefficients')

VI <- VI[2:10,]

VI <- VI %>%
  mutate(scaled_importance = abs(1/(1-(scaled_importance^2))),
         scaled_importance = scaled_importance/max(scaled_importance))


VI %>%
  mutate(variable = as.factor(variable),
         variable = fct_reorder(variable, scaled_importance)) %>%
  filter(scaled_importance > 0) %>%
ggplot(aes(y = variable, x = scaled_importance)) + 
  geom_col() +
  scale_x_continuous(breaks = c(seq(0,1,.1)), limits = c(0,1)) + 
  labs(title = "Linear Regression - Scaled Variable Importance Scores", subtitle = "Scores calculated using test data through R-Squared values.",y = "Scaled VI Score", x = "Variable") +
  theme_fivethirtyeight(base_size = 20, base_family = 'serif') +
  theme(axis.title = element_text(family = 'serif', size = 30))
Show code
# Predict outcome
predictions <- predict(lm_fit, new_data = data_test)
 
# Combine test data with predictions
data_test_results <- data_test %>% 
  select(LOG_TOT_MCF, Region, Pro_Rad_SqRt) %>% 
  bind_cols(predictions)

# data_test_results <- data_test_results %>%
#   mutate(Pro_Rad_SqRt = Pro_Rad_SqRt^2,
#          .pred = .pred^2)

lin_testdata <- data_test_results %>%
  mutate(residuals = abs(Pro_Rad_SqRt - .pred))

lin_test_results <- lin_testdata %>%
  select(Pro_Rad_SqRt, .pred) %>%
  bind_cols(mean_res = mean(lin_testdata$residuals))

lin_rmse <- round(rmse(lin_test_results$.pred, lin_test_results$Pro_Rad_SqRt), 3)
lin_mae <- round(mae_vec(lin_test_results$.pred, lin_test_results$Pro_Rad_SqRt), 3)
lin_r2 <- round(R2_Score(lin_test_results$.pred, lin_test_results$Pro_Rad_SqRt), 3)

paste0("Overall RMSE = ", round(rmse(lin_test_results$.pred, lin_test_results$Pro_Rad_SqRt), 3))
[1] "Overall RMSE = 2.074"
Show code
test <- lin_testdata %>%
  dplyr::group_by(Region) %>%
  summarise_at(vars(residuals), list(MeanRes = mean))
test2 <- lin_testdata %>%
  dplyr::group_by(Region) %>%
  summarise(rmse = rmse(Pro_Rad_SqRt, .pred))

test <- test %>%
  left_join(test2, by = 'Region') %>%
  mutate(MeanRes = round(MeanRes, 2),
         rmse = round(rmse, 2))
rm(test2)


  ggplot(lin_testdata, aes(.pred, Pro_Rad_SqRt)) +
    geom_point(size = 1) +
    geom_abline() +
    scale_x_continuous(limits = c(1, 9), breaks = c(seq(1,9,1))) +
    scale_y_continuous(limits = c(0, 13), breaks = c(seq(0,13,1))) +
    facet_wrap(~Region, nrow = 2) +
    theme_fivethirtyeight(base_size = 20, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_text(family = 'serif', size = 30), 
          axis.title.y = element_text(family = 'serif', size = 30)) +
    labs(title = "Linear Regression - Test Data Residual Plot", subtitle = "SqRt(Procurement Radius)" , x = "Prediction", y = "Observation") +
  geom_text(x = 4.3, y = 11, aes(label = MeanRes, size = 4), data = test) +
  annotate("text", x = 3.5, y = 11, label = "Mean Res: ") +
  geom_text(x = 4.3, y = 12, aes(label = rmse, size = 4), data = test) +
  annotate("text", x = 3.65, y = 12, label = "RMSE: ") +
  theme(legend.position="none") +
    ggpubr::stat_regline_equation(label.x = 5.75, label.y = 13, aes(label = ..rr.label..))
Show code
#EVALUATE THE MODEL BEFORE PROCEEDING, then use last_fit() to fit on the test dataset
# Define a linear regression model
linear_model <- linear_reg() %>% 
  set_engine('lm') %>% 
  set_mode('regression')
 
# Train linear_model with last_fit()
linear_fit <- linear_model %>% 
  last_fit(Pro_Rad_SqRt ~ LOG_TOT_MCF * Region, split = data_split)
 
# Collect predictions
lin_finalpreds <- linear_fit %>% 
  collect_predictions() %>%
  mutate(Res = abs(Pro_Rad_SqRt - .pred))

rm(linear_fit, linear_model, lm_fit, predictions, data_test_results, lin_testdata)

4 Additive Regression

Show code
# Initialize a linear regression object, linear_model
mgcv_model <- gen_additive_mod() %>% 
  set_engine("mgcv") %>% 
  set_mode("regression")
 
mgcv_fit <- mgcv_model %>% 
  fit(Pro_Rad_SqRt ~ s(LOG_TOT_MCF, by = Region) + Region, 
      data = data_training, method = "REML")
 
VI <- as.data.frame(mgcv_fit$fit$coefficients) %>%
  rownames_to_column(var = "variable") %>%
  rename(scaled_importance = 'mgcv_fit$fit$coefficients')

VI <- VI[2:10,]

VI <- VI %>%
  mutate(scaled_importance = abs(1/(1-(scaled_importance^2))),
         scaled_importance = scaled_importance/max(scaled_importance))


VI %>%
  mutate(variable = as.factor(variable),
         variable = fct_reorder(variable, scaled_importance)) %>%
  filter(scaled_importance > 0) %>%
ggplot(aes(y = variable, x = scaled_importance)) + 
  geom_col() +
  scale_x_continuous(breaks = c(seq(0,1,.1)), limits = c(0,1)) + 
  labs(title = "Additive Regression - Scaled Variable Importance Scores", subtitle = "Scores calculated using test data through R-Squared values.",y = "Scaled VI Score", x = "Variable") +
  theme_fivethirtyeight(base_size = 20, base_family = 'serif') +
  theme(axis.title = element_text(family = 'serif', size = 30))
Show code
# Predict outcome
mgcv_testpreds <- predict(mgcv_fit, new_data = data_test)
mgcv_finalpreds <- predict(mgcv_fit, new_data = Test_Data)
 

# Combine test data with predictions
mgcv_testpreds <- data_test %>% 
  select(LOG_TOT_MCF, Region, Pro_Rad_SqRt) %>% 
  bind_cols(mgcv_testpreds) %>%
  mutate(residuals = abs(Pro_Rad_SqRt - .pred))

mgcv_rmse <- round(rmse(mgcv_testpreds$.pred, mgcv_testpreds$Pro_Rad_SqRt), 3)
mgcv_mae <- round(mae_vec(mgcv_testpreds$.pred, mgcv_testpreds$Pro_Rad_SqRt), 3)
mgcv_r2 <- round(R2_Score(mgcv_testpreds$.pred, mgcv_testpreds$Pro_Rad_SqRt), 3)
paste0("Overall RMSE = ", round(rmse(mgcv_testpreds$.pred, mgcv_testpreds$Pro_Rad_SqRt), 3))
[1] "Overall RMSE = 2.098"
Show code
# Combine final data with predictions
mgcv_finalpreds <- Test_Data %>% 
  select(LOG_TOT_MCF, Region, Pro_Rad_SqRt) %>% 
  bind_cols(mgcv_finalpreds) %>%
  mutate(residuals = abs(Pro_Rad_SqRt - .pred))

test <- mgcv_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise_at(vars(residuals), list(MeanRes = mean))
test2 <- mgcv_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise(rmse = rmse(Pro_Rad_SqRt, .pred))

test <- test %>%
  left_join(test2, by = 'Region') %>%
  mutate(MeanRes = round(MeanRes, 2),
         rmse = round(rmse, 2))
rm(test2)


ggplot(mgcv_testpreds, aes(.pred, Pro_Rad_SqRt)) +
    geom_point(size = 1) +
    geom_abline() +
    scale_x_continuous(limits = c(1, 9), breaks = c(seq(1,9,1))) +
    scale_y_continuous(limits = c(0, 13), breaks = c(seq(0,13,1))) +
    facet_wrap(~Region, nrow = 2) +
    theme_fivethirtyeight(base_size = 20, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_text(family = 'serif', size = 30), 
          axis.title.y = element_text(family = 'serif', size = 30)) +
    labs(title = "Additive Regression - Test Data Residual Plot", subtitle = "SqRt(Procurement Radius)", 
         x = "Prediction", y = "Observation") +
  geom_text(x = 4.3, y = 11, aes(label = MeanRes, size = 4), data = test) +
  annotate("text", x = 3.5, y = 11, label = "Mean Res: ") +
  geom_text(x = 4.3, y = 12, aes(label = rmse, size = 4), data = test) +
  annotate("text", x = 3.65, y = 12, label = "RMSE: ") +
  theme(legend.position="none") +
      ggpubr::stat_regline_equation(label.x = 5.75, label.y = 13, aes(label = ..rr.label..))
Show code
# mgcv_testpreds <- mgcv_testpreds %>%
#   rmse(Pro_Rad_SqRt, .pred) %>%
#   bind_cols(mean_res = mean(mgcv_testpreds$residuals))
# 
# mgcv_finalpreds <- mgcv_finalpreds %>%
#   rmse(Pro_Rad_SqRt, .pred) %>%
#   bind_cols(mean_res = mean(mgcv_finalpreds$residuals))

rm(mgcv_fit, mgcv_model)

5 Decsision Tree Regression

Show code
dt_model <- decision_tree() %>% 
  set_engine('rpart') %>% 
  set_mode('regression')
 
# Build feature engineering pipeline
data_recipe <- recipe(Pro_Rad_SqRt ~ ., data = data_training) %>% 
  # Correlation filter
  step_corr(all_numeric(), threshold = .8) %>%
  # Create dummy variables
  step_dummy(all_nominal(), -all_outcomes())
 
# Create a workflow
data_wkfl <- workflow() %>% 
  # Include the model object
  add_model(dt_model) %>% 
  # Include the recipe object
  add_recipe(data_recipe)
 
# Train the workflow
data_wkfl_fit <- data_wkfl %>% 
  last_fit(split = data_split)

# Calculate performance metrics on test data
data_wkfl_fit %>% 
  collect_metrics(summarize = FALSE)
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard      2.18   Preprocessor1_Model1
2 rsq     standard      0.0738 Preprocessor1_Model1
Show code
# Create cross validation folds
set.seed(260)
data_folds <- vfold_cv(data_training, v = 5, strata = Pro_Rad_SqRt)
 
# Create custom metrics function
data_metrics <- metric_set(yardstick::rmse)
 
# Fit resamples
data_rs <- data_wkfl %>% 
  fit_resamples(resamples = data_folds)
 
# Detailed cross validation results
data_results <- data_rs %>% 
  collect_metrics(summarize = FALSE)
 
# Set tuning hyperparameters
dt_tune_model <- decision_tree(tree_depth = tune(), min_n = tune()) %>% 
  set_engine('rpart') %>% 
  set_mode('regression')
 
# Create a tuning workflow
data_tune_wkfl <- data_wkfl %>% 
  # Replace model
  update_model(dt_tune_model)
 
# Hyperparameter tuning with grid search
set.seed(284)
dt_grid <- grid_random(parameters(dt_tune_model), size = 5)
# Hyperparameter tuning
dt_tuning <- data_tune_wkfl %>% 
  tune_grid(resamples = data_folds, grid = dt_grid)
 
# Collect detailed tuning results
dt_tuning_results <- dt_tuning %>% 
  collect_metrics(summarize = FALSE)
 
# Display 5 best performing models
Test <- dt_tuning %>% show_best(metric = 'rmse', n = 5)
 
# Select based on best performance
best_dt_model <- dt_tuning %>% 
  # Choose the best model based on roc_auc
  select_best(metric = 'rmse')
 
# Finalize your workflow
final_data_wkfl <- data_tune_wkfl %>% 
  finalize_workflow(best_dt_model)
 
# Train finalized decision tree workflow
data_final_fit <- final_data_wkfl %>% 
  last_fit(split = data_split)
 
# View performance metrics
data_final_fit %>% collect_metrics()
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard      2.18   Preprocessor1_Model1
2 rsq     standard      0.0738 Preprocessor1_Model1
Show code
# Create an ROC curve
dt_testpreds <- data_final_fit %>% 
  # Collect predictions
  collect_predictions() %>%
  mutate(Res = abs(Pro_Rad_SqRt - .pred)) %>%
  cbind(data_test[,3]) %>%
  select(-c(id, .row, .config)) 

test <- dt_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise_at(vars(Res), list(MeanRes = mean))
test2 <- dt_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise(rmse = rmse(Pro_Rad_SqRt, .pred))

test <- test %>%
  left_join(test2, by = 'Region') %>%
  mutate(MeanRes = round(MeanRes, 2),
         rmse = round(rmse, 2))
rm(test2)

dtree_rmse <- round(rmse(dt_testpreds$.pred, dt_testpreds$Pro_Rad_SqRt), 3)
dtree_mae <- round(mae_vec(dt_testpreds$.pred, dt_testpreds$Pro_Rad_SqRt), 3)
dtree_r2 <- round(R2_Score(dt_testpreds$.pred, dt_testpreds$Pro_Rad_SqRt), 3)

paste0("Overall RMSE = ", round(rmse(dt_testpreds$.pred, dt_testpreds$Pro_Rad_SqRt), 3))
[1] "Overall RMSE = 2.18"
Show code
dt_testpreds %>%
  ggplot(aes(.pred, Pro_Rad_SqRt)) +
    geom_point(size = 1) +
    geom_abline() +
    scale_x_continuous(limits = c(1, 9), breaks = c(seq(1,9,1))) +
    scale_y_continuous(limits = c(0, 13), breaks = c(seq(0,13,1))) +
    facet_wrap(~Region, nrow = 2) +
    theme_fivethirtyeight(base_size = 20, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_text(family = 'serif', size = 30), 
          axis.title.y = element_text(family = 'serif', size = 30)) +
    labs(title = "Decision Tree Regression - Test Data Residual Plot", 
         subtitle = "SqRt(Procurement Radius)" ,x = "Prediction", y = "Observation") +
  geom_text(x = 4.3, y = 11, aes(label = MeanRes, size = 4), data = test) +
  annotate("text", x = 3.5, y = 11, label = "Mean Res: ") +
  geom_text(x = 4.3, y = 12, aes(label = rmse, size = 4), data = test) +
  annotate("text", x = 3.65, y = 12, label = "RMSE: ") +
  theme(legend.position="none") +
      ggpubr::stat_regline_equation(label.x = 5.75, label.y = 13, aes(label = ..rr.label..))

6 Random Forest Regression

Show code
rf_model <- rand_forest(trees = 1000) %>% 
  set_engine('ranger') %>% 
  set_mode('regression')
 
# Build feature engineering pipeline
data_recipe <- recipe(Pro_Rad_SqRt ~ ., data = data_training) %>% 
  # Correlation filter
  step_corr(all_numeric(), threshold = .8) %>%
  # Create dummy variables
  step_dummy(all_nominal(), -all_outcomes())

# Create a workflow
data_wkfl <- workflow() %>% 
  # Include the model object
  add_model(rf_model) %>% 
  # Include the recipe object
  add_recipe(data_recipe)

# Train the workflow
data_wkfl_fit <- data_wkfl %>% 
  last_fit(split = data_split)

# Calculate performance metrics on test data
data_wkfl_fit %>% 
  collect_metrics(summarize = FALSE)
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard       2.06  Preprocessor1_Model1
2 rsq     standard       0.150 Preprocessor1_Model1
Show code
# Create cross validation folds
set.seed(260)
data_folds <- vfold_cv(data_training, v = 5, strata = Pro_Rad_SqRt)
 
# Create custom metrics function
data_metrics <- metric_set(yardstick::rmse)

# Fit resamples
data_rs <- data_wkfl %>% 
  fit_resamples(resamples = data_folds)
 
# Detailed cross validation results
data_results <- data_rs %>% 
  collect_metrics(summarize = FALSE)
 
# Set tuning hyperparameters
rf_tune_model <- rand_forest(trees = tune(), mtry = tune()) %>% 
  set_engine('ranger') %>% 
  set_mode('regression')
# Create a tuning workflow
data_tune_wkfl <- data_wkfl %>% 
  # Replace model
  update_model(rf_tune_model)

# Hyperparameter tuning with grid search
set.seed(284)

# Hyperparameter tuning
rf_tuning <- data_tune_wkfl %>% 
  tune_grid(resamples = data_folds)
 
# Collect detailed tuning results
rf_tuning_results <- rf_tuning %>% 
  collect_metrics(summarize = FALSE)
 
# Display 5 best performing models
Test <- rf_tuning %>% show_best(metric = 'rmse', n = 5)
 
# Select based on best performance
best_rf_model <- rf_tuning %>% 
  # Choose the best model based on rmse
  select_best(metric = 'rmse')
 
# Finalize your workflow
final_data_wkfl <- data_tune_wkfl %>% 
  finalize_workflow(best_rf_model)
 
# Train finalized decision tree workflow
data_final_fit <- final_data_wkfl %>% 
  last_fit(split = data_split)
 
# View performance metrics
data_final_fit %>% collect_metrics()
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard       2.06  Preprocessor1_Model1
2 rsq     standard       0.147 Preprocessor1_Model1
Show code
# Create an ROC curve
rf_testpreds <- data_final_fit %>% 
  # Collect predictions
  collect_predictions() %>%
  mutate(Res = abs(Pro_Rad_SqRt - .pred)) %>%
  cbind(data_test[,3]) %>%
  select(-c(id, .row, .config)) 

test <- rf_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise_at(vars(Res), list(MeanRes = mean))

test2 <- rf_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise(rmse = rmse(Pro_Rad_SqRt, .pred))

test <- test %>%
  left_join(test2, by = 'Region') %>%
  mutate(MeanRes = round(MeanRes, 2),
         rmse = round(rmse, 2))
rm(test2)

rf_rmse <- round(rmse(rf_testpreds$.pred, rf_testpreds$Pro_Rad_SqRt), 3)
rf_mae <- round(mae_vec(rf_testpreds$.pred, rf_testpreds$Pro_Rad_SqRt), 3)
rf_r2 <- round(R2_Score(rf_testpreds$.pred, rf_testpreds$Pro_Rad_SqRt), 3)

rf_testpreds %>%
  ggplot(aes(.pred, Pro_Rad_SqRt)) +
    geom_point(size = 1) +
    geom_abline() +
    scale_x_continuous(limits = c(1, 9), breaks = c(seq(1,9,1))) +
    scale_y_continuous(limits = c(0, 13), breaks = c(seq(0,13,1))) +
    facet_wrap(~Region, nrow = 2) +
    theme_fivethirtyeight(base_size = 20, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_text(family = 'serif', size = 30), 
          axis.title.y = element_text(family = 'serif', size = 30)) +
    labs(title = "Random Forest Regression - Test Data Residual Plot", 
         subtitle = "SqRt(Procurement Radius)" , x = "Prediction", y = "Observation") +
  geom_text(x = 4.3, y = 11, aes(label = MeanRes, size = 4), data = test) +
  annotate("text", x = 3.5, y = 11, label = "Mean Res: ") +
  geom_text(x = 4.3, y = 12, aes(label = rmse, size = 4), data = test) +
  annotate("text", x = 3.65, y = 12, label = "RMSE: ") +
  theme(legend.position="none") +
      ggpubr::stat_regline_equation(label.x = 5.75, label.y = 13, aes(label = ..rr.label..))

7 Gradient Boosted Regression

Show code
# Specify the model class
xg_model <- boost_tree(
                trees = 1000,
                learn_rate = tune(),
                tree_depth = tune(),
                sample_size = tune()) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

# Create the tuning grid
tunegrid_boost <- grid_regular(parameters(xg_model), 
                      levels = 5)

# Create CV folds of training data
folds <- vfold_cv(data_training, 5, strata = Pro_Rad_SqRt)

# Tune along the grid
tune_results <- tune_grid(xg_model,
                    Pro_Rad_SqRt ~ .,
                    resamples = folds,
                    grid = tunegrid_boost,
                    metrics = metric_set(yardstick::rmse))

best_params <- select_best(tune_results)

# Finalize the specification
final_spec <- finalize_model(xg_model, best_params)

# Train the final model on the full training data
final_model <- final_spec %>% fit(Pro_Rad_SqRt ~ ., data_training)

# Specify, fit, predict, and combine with training data
xg_testpreds <- final_model %>%
  predict(data_test) 

xg_testpreds <- data_test %>%
  cbind(xg_testpreds) %>%
mutate(Res = abs(Pro_Rad_SqRt - .pred)) 


test <- xg_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise_at(vars(Res), list(MeanRes = mean))

test2 <- xg_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise(rmse = rmse(Pro_Rad_SqRt, .pred))

test <- test %>%
  left_join(test2, by = 'Region') %>%
  mutate(MeanRes = round(MeanRes, 2),
         rmse = round(rmse, 2))
rm(test2)

xg_rmse <- round(rmse(xg_testpreds$.pred, xg_testpreds$Pro_Rad_SqRt), 3)
xg_mae <- round(mae_vec(xg_testpreds$.pred, xg_testpreds$Pro_Rad_SqRt), 3)
xg_r2 <- round(R2_Score(xg_testpreds$.pred, xg_testpreds$Pro_Rad_SqRt), 3)

xg_testpreds %>%
  ggplot(aes(.pred, Pro_Rad_SqRt)) +
    geom_point(size = 1) +
    geom_abline() +
    scale_x_continuous(limits = c(1, 9), breaks = c(seq(1,9,1))) +
    scale_y_continuous(limits = c(0, 13), breaks = c(seq(0,13,1))) +
    facet_wrap(~Region, nrow = 2) +
    theme_fivethirtyeight(base_size = 20, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), 
          axis.title.x = element_text(family = 'serif', size = 30), 
          axis.title.y = element_text(family = 'serif', size = 30)) +
    labs(title = "Gradient Boosted Regression - Test Data Residual Plot", 
         subtitle = "SqRt(Procurement Radius)" , x = "Prediction", y = "Observation") +
  geom_text(x = 4.3, y = 11, aes(label = MeanRes, size = 4), data = test) +
  annotate("text", x = 3.5, y = 11, label = "Mean Res: ") +
  geom_text(x = 4.3, y = 12, aes(label = rmse, size = 4), data = test) +
  annotate("text", x = 3.65, y = 12, label = "RMSE: ") +
  theme(legend.position="none") +
      ggpubr::stat_regline_equation(label.x = 5.75, label.y = 13, aes(label = ..rr.label..))

8 Unsupervised Regression - Machine Learning using ‘h2o’

Show code
VI <- h2o.varimp(leader)

VI <- VI %>%
  mutate(variable = case_when(variable == "LOG_TOT_MCF" ~ "Log10(MCF)",
                              variable == "MILL_STATE" ~ "State",
                              variable == "MILL_TYPE_CD" ~ "Mill Type",
                              variable == "Region" ~ "Region"))


VI %>%
  mutate(variable = as.factor(variable),
         variable = fct_reorder(variable, scaled_importance)) %>%
  filter(scaled_importance > 0) %>%
ggplot(aes(y = variable, x = scaled_importance)) + 
  geom_col() +
  scale_x_continuous(breaks = c(seq(0,1,.1)), limits = c(0,1)) + 
  labs(title = "h2o Regression - Scaled Variable Importance Scores", 
       subtitle = "Scores calculated using test data.", y = "Scaled VI Score", x = "Variable") +
  theme_fivethirtyeight(base_size = 20, base_family = 'serif') +
  theme(axis.title = element_text(family = 'serif', size = 30))
Show code
head(lb)
                                      model_id     rmse      mse
1 GBM_grid_1_AutoML_1_20221026_163026_model_10 2.029082 4.117172
2  GBM_grid_1_AutoML_1_20221026_163026_model_2 2.044099 4.178339
3  GBM_grid_1_AutoML_1_20221026_163026_model_8 2.049108 4.198843
4 GBM_grid_1_AutoML_1_20221026_163026_model_11 2.050137 4.203060
5               GBM_1_AutoML_1_20221026_163026 2.056010 4.227176
6               GLM_1_AutoML_1_20221026_163026 2.058550 4.237629
       mae     rmsle mean_residual_deviance
1 1.550014 0.2761806               4.117172
2 1.565655 0.2810973               4.178339
3 1.536726 0.2791560               4.198843
4 1.579500 0.2793606               4.203060
5 1.553396 0.2796475               4.227176
6 1.562654 0.2794918               4.237629
Show code
h2o_testpreds <- as.data.frame(valid) %>%
  cbind(pred) %>%
  mutate(Res = abs(predict - Pro_Rad_SqRt)) 

test <- h2o_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise_at(vars(Res), list(MeanRes = mean))

test2 <- h2o_testpreds %>%
  dplyr::group_by(Region) %>%
  summarise(rmse = rmse(Pro_Rad_SqRt, predict))

test <- test %>%
  left_join(test2, by = 'Region') %>%
  mutate(MeanRes = round(MeanRes, 2),
         rmse = round(rmse, 2))
rm(test2)

h2o_rmse <- round(rmse(h2o_testpreds$predict, h2o_testpreds$Pro_Rad_SqRt), 3)
h2o_mae <- round(mae_vec(h2o_testpreds$predict, h2o_testpreds$Pro_Rad_SqRt), 3)
h2o_r2 <- round(R2_Score(h2o_testpreds$predict, h2o_testpreds$Pro_Rad_SqRt), 3)

paste0("Overall RMSE = ", h2o_rmse)
[1] "Overall RMSE = 2.029"
Show code
h2o_testpreds %>%
  ggplot(aes(predict, Pro_Rad_SqRt)) +
    geom_point(size = 1) +
    geom_abline() +
    scale_x_continuous(limits = c(3, 9), breaks = c(seq(3,9,1))) +
    scale_y_continuous(limits = c(0, 13), breaks = c(seq(0,13,1))) +
    facet_wrap(~Region, nrow = 2) +
    theme_fivethirtyeight(base_size = 20, base_family = "serif") +
    theme(axis.title = element_text(family = 'serif', size = 30), axis.title.x = element_text(family = 'serif', size = 30), 
          axis.title.y = element_text(family = 'serif', size = 30)) +
    labs(title = "h2o Regression - Test Data Residual Plot", 
         subtitle = "SqRt(Procurement Radius)" , x = "Prediction", y = "Observation") +
  geom_text(x = 4.3, y = 11, aes(label = MeanRes, size = 4), data = test) +
  annotate("text", x = 3.5, y = 11, label = "Mean Res: ") +
  geom_text(x = 4.3, y = 12, aes(label = rmse, size = 4), data = test) +
  annotate("text", x = 3.65, y = 12, label = "RMSE: ") +
  theme(legend.position="none") +
      ggpubr::stat_regline_equation(label.x = 5.75, label.y = 13, aes(label = ..rr.label..))

Show code
RMSEs <- lin_rmse %>%
  cbind(mgcv_rmse, dtree_rmse, rf_rmse, xg_rmse, h2o_rmse) 
RMSEs <- as.data.frame(RMSEs) %>% 
 rename('Linear' = '.', 'Additive' = 'mgcv_rmse', 'Decision Tree' = 'dtree_rmse', 
        'Random Forest' = 'rf_rmse', 'Gradient Boosted' = 'xg_rmse' ,'Machine Learning (h2o)' = 'h2o_rmse')
RMSEs <- RMSEs %>%
  pivot_longer(names_to = "Model", values_to = 'RMSE', cols = c(1:6)) %>%
  mutate(Model = as.factor(Model),
         Model = fct_reorder(Model, RMSE))


MAEs <- lin_mae %>%
  cbind(mgcv_mae, dtree_mae, rf_mae, xg_mae, h2o_mae) 
MAEs <- as.data.frame(MAEs) %>% 
 rename('Linear' = '.', 'Additive' = 'mgcv_mae', 'Decision Tree' = 'dtree_mae', 
        'Random Forest' = 'rf_mae', 'Gradient Boosted' = 'xg_mae' , 'Machine Learning (h2o)' = 'h2o_mae')
MAEs <- MAEs %>%
  pivot_longer(names_to = "Model", values_to = 'MAE', cols = c(1:6)) %>%
  mutate(Model = as.factor(Model),
         Model = fct_reorder(Model, MAE))


r2s <- lin_r2 %>%
  cbind(mgcv_r2, dtree_r2, rf_r2, xg_r2, h2o_r2) 
r2s <- as.data.frame(r2s) %>% 
 rename('Linear' = '.', 'Additive' = 'mgcv_r2', 'Decision Tree' = 'dtree_r2', 
        'Random Forest' = 'rf_r2', 'Gradient Boosted' = 'xg_r2' ,'Machine Learning (h2o)' = 'h2o_r2')
r2s <- r2s %>%
  pivot_longer(names_to = "Model", values_to = 'r2', cols = c(1:6)) %>%
  mutate(Model = as.factor(Model),
         Model = fct_reorder(Model, r2))

ModelStats <- RMSEs %>%
  left_join(MAEs, by = 'Model') %>%
  left_join(r2s, by = 'Model')

rm(lin_rmse, lin_mae, lin_r2, mgcv_rmse, mgcv_mae, mgcv_r2, dtree_rmse, dtree_mae, 
   dtree_r2, rf_rmse, rf_mae, rf_r2, xg_rmse, xg_mae, xg_r2, h2o_rmse, h2o_mae, h2o_r2, RMSEs, MAEs, r2s, x, y)


pal <- brewer.pal(6, name = "Spectral")
pal <- rev(pal)

ggplot(ModelStats, aes(x = Model, y = RMSE, fill = Model)) + 
  geom_col(position = "identity") +
  scale_y_continuous(limits = c(0,max(ModelStats$RMSE)+.05)) + 
  scale_fill_manual(values = pal, name = "Model") +
  labs(title = "Test Data RMSEs for Assessed Models", y = "RMSE", x = "Model", 
       subtitle = "Leading models shown for models requiring hyperparameter tuning (all but linear)") +
  theme_fivethirtyeight(base_size = 20, base_family = 'serif') +
theme(axis.title = element_text(family = 'serif', size = 30), axis.text.x = element_blank(), 
      axis.title.x = element_blank(), panel.grid.major.x = element_blank(), 
      legend.text = element_text(size = 24, family = "serif"), 
      legend.title = element_text(size = 30, family = "serif")) +
  geom_text(aes(label = paste0("RMSE: ", RMSE)), size = 6, nudge_y = .05)
Show code
ggplot(ModelStats, aes(x = Model, y = MAE, fill = Model)) + 
  geom_col(position = "identity") +
  scale_y_continuous(limits = c(0,max(ModelStats$MAE)+.05)) + 
  scale_fill_manual(values = pal, name = "Model") +
  labs(title = "Test Data MAEs for Assessed Models", y = "MAE", x = "Model", 
       subtitle = "Leading models shown for models requiring hyperparameter tuning (all but linear)") +
  theme_fivethirtyeight(base_size = 20, base_family = 'serif') +
  theme(axis.title = element_text(family = 'serif', size = 30), axis.text.x = element_blank(), 
        axis.title.x = element_blank(), panel.grid.major.x = element_blank(), 
        legend.text = element_text(size = 24, family = "serif"), 
        legend.title = element_text(size = 30, family = "serif")) +
  geom_text(aes(label = paste0("MAE: ", MAE)), size = 6 , nudge_y = .05)
Show code
ggplot(ModelStats, aes(x = Model, y = r2, fill = Model)) + 
  geom_col(position = "identity") +
  scale_y_continuous(limits = c(0,.25)) + 
  scale_fill_manual(values = pal, name = "Model") +
  labs(title = "Test Data R-Squared Values for Assessed Models", y = "R-Squared", x = "Model",
       subtitle = "Leading models shown for models requiring hyperparameter tuning (all but linear)") +
  theme_fivethirtyeight(base_size = 20, base_family = 'serif') +
  theme(axis.title = element_text(family = 'serif', size = 20), axis.text.x = element_blank(), 
        axis.title.x = element_blank(), panel.grid.major.x = element_blank(), 
        legend.text = element_text(size = 24, family = "serif"), 
        legend.title = element_text(size = 30, family = "serif")) +
  geom_text(aes(label = paste0("R-Squared: " ,r2)), size = 6 , nudge_y = .005) 
Show code
knitr::kable(ModelStats, digits = 3, align = "cccc", col.names = c("Model", "RMSE", "MAE", "R-Squared"), caption = "Model Fit Statistics for Assessed Models. 
      All statistics calculated using test data.") %>%
  kable_styling(font_size = 16)
Table 1: Model Fit Statistics for Assessed Models. All statistics calculated using test data.
Model RMSE MAE R-Squared
Linear 2.074 1.525 0.130
Additive 2.098 1.523 0.110
Decision Tree 2.180 1.636 0.039
Random Forest 2.062 1.503 0.141
Gradient Boosted 2.053 1.489 0.148
Machine Learning (h2o) 2.029 1.550 0.148