library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(tidyr)
library(openxlsx)
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.5.2
library(stringr)
dod <- read_excel("DoD_manpower.xlsx")
joint <- read_excel("JointBases_Master_Combined_Only.xlsx",
sheet = "Master_Combined")
dod <- dod %>% arrange(Year)
dod_ts <- ts(dod$TotalDoD,
start = min(dod$Year),
frequency = 1)
fit_dod <- auto.arima(dod_ts)
current_year <- max(dod$Year)
current_total <- dod$TotalDoD[dod$Year == current_year]
# These are the exact future years you want:
target_years <- c(2025, 2030, 2035, 2040, 2045)
# Forecast far enough to cover the max target year
h <- max(target_years) - current_year
dod_fc <- forecast(fit_dod, h = h)
# All forecast years from (current_year + 1) up to max(target_years)
fc_all <- data.frame(
Year = (current_year + 1):(current_year + h),
DoD_Total_Forecast = as.numeric(dod_fc$mean)
)
# Keep only the exact target years
fc_df <- fc_all %>%
filter(Year %in% target_years) %>%
mutate(
DoD_Growth_Factor = DoD_Total_Forecast / current_total
)
fc_df
## Year DoD_Total_Forecast DoD_Growth_Factor
## 1 2025 2235.273 0.9978896
## 2 2030 2248.027 1.0035835
## 3 2035 2239.220 0.9996518
## 4 2040 2245.281 1.0023578
## 5 2045 2240.797 1.0003558
base_now <- joint %>%
transmute(
InstallationName,
StateTerritory,
LeadService,
Current_DirectWorkforce = as.numeric(Current_DirectWorkforce),
Base_Military = as.numeric(PostBRAC_Military),
Base_Civilians = as.numeric(PostBRAC_Civilians)
)
## Warning: There were 2 warnings in `transmute()`.
## The first warning was:
## ℹ In argument: `Base_Military = as.numeric(PostBRAC_Military)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
base_now
## # A tibble: 12 × 6
## InstallationName StateTerritory LeadService Current_DirectWorkfo…¹
## <chr> <chr> <chr> <dbl>
## 1 Joint Base San Antonio TX Air Force … 88760
## 2 Joint Base Lewis-McChord WA Army (I Co… 45000
## 3 Joint Base Pearl Harbor-Hi… HI Navy (PACF… 53000
## 4 Joint Base Charleston SC Air Force … 26000
## 5 Joint Base Langley-Eustis VA Air Force … 34000
## 6 Joint Base McGuire-Dix-Lak… NJ Air Force … 33000
## 7 Joint Base Elmendorf-Richa… AK Air Force … 22000
## 8 Joint Base Andrews MD Air Force … 17500
## 9 Joint Base Anacostia-Bolli… DC Navy (hist… 8000
## 10 Joint Expeditionary Base L… VA Navy (NECC… 20000
## 11 Joint Region Marianas GU Navy (regi… 10500
## 12 Joint Base Myer-Henderson … VA/DC Army (MDW)… 6800
## # ℹ abbreviated name: ¹​Current_DirectWorkforce
## # ℹ 2 more variables: Base_Military <dbl>, Base_Civilians <dbl>
pred_df_basic <- base_now %>%
crossing(fc_df) %>%
mutate(
Pred_Military = Base_Military * DoD_Growth_Factor,
Pred_Civilians = Base_Civilians * DoD_Growth_Factor,
Pred_Total = Current_DirectWorkforce * DoD_Growth_Factor,
Growth_Military = Pred_Military - Base_Military,
Growth_Civilians= Pred_Civilians - Base_Civilians,
Growth_Total = Pred_Total - Current_DirectWorkforce
)
pred_df_basic
## # A tibble: 60 × 15
## InstallationName StateTerritory LeadService Current_DirectWorkfo…¹
## <chr> <chr> <chr> <dbl>
## 1 Joint Base Anacostia-Bolli… DC Navy (hist… 8000
## 2 Joint Base Anacostia-Bolli… DC Navy (hist… 8000
## 3 Joint Base Anacostia-Bolli… DC Navy (hist… 8000
## 4 Joint Base Anacostia-Bolli… DC Navy (hist… 8000
## 5 Joint Base Anacostia-Bolli… DC Navy (hist… 8000
## 6 Joint Base Andrews MD Air Force … 17500
## 7 Joint Base Andrews MD Air Force … 17500
## 8 Joint Base Andrews MD Air Force … 17500
## 9 Joint Base Andrews MD Air Force … 17500
## 10 Joint Base Andrews MD Air Force … 17500
## # ℹ 50 more rows
## # ℹ abbreviated name: ¹​Current_DirectWorkforce
## # ℹ 11 more variables: Base_Military <dbl>, Base_Civilians <dbl>, Year <int>,
## # DoD_Total_Forecast <dbl>, DoD_Growth_Factor <dbl>, Pred_Military <dbl>,
## # Pred_Civilians <dbl>, Pred_Total <dbl>, Growth_Military <dbl>,
## # Growth_Civilians <dbl>, Growth_Total <dbl>
elasticity_df <- joint %>%
mutate(
Base_Growth_Factor = Current_DirectWorkforce / Baseline_DirectWorkforce
) %>%
left_join(
dod %>% select(Year, TotalDoD) %>% rename(Baseline_DoD = TotalDoD),
by = c("BaselineYear" = "Year")
) %>%
left_join(
dod %>% select(Year, TotalDoD) %>% rename(Current_DoD = TotalDoD),
by = c("CurrentYear" = "Year")
) %>%
mutate(
DoD_Growth_Factor = Current_DoD / Baseline_DoD,
Elasticity = Base_Growth_Factor / DoD_Growth_Factor
) %>%
select(InstallationName, Elasticity)
elasticity_df
## # A tibble: 12 × 2
## InstallationName Elasticity
## <chr> <dbl>
## 1 Joint Base San Antonio 2.47
## 2 Joint Base Lewis-McChord 1.49
## 3 Joint Base Pearl Harbor-Hickam 2.42
## 4 Joint Base Charleston 1.66
## 5 Joint Base Langley-Eustis 1.42
## 6 Joint Base McGuire-Dix-Lakehurst 1.54
## 7 Joint Base Elmendorf-Richardson 1.29
## 8 Joint Base Andrews 1.45
## 9 Joint Base Anacostia-Bolling 1.22
## 10 Joint Expeditionary Base Little Creek-Fort Story 1.36
## 11 Joint Region Marianas 1.53
## 12 Joint Base Myer-Henderson Hall 1.14
# Adjust skip if needed after inspecting the raw file
census_raw <- read_excel("NST-EST2024-POP.xlsx", skip = 2)
## New names:
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
head(census_raw)
## # A tibble: 6 × 7
## `Geographic Area` April 1, 2020 Estimat…¹ Population Estimate …² ...4 ...5
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 <NA> NA 2020 2.02e3 2.02e3
## 2 United States 331515736 331577720 3.32e8 3.34e8
## 3 Northeast 57617706 57431458 5.73e7 5.72e7
## 4 Midwest 68998970 68984258 6.89e7 6.89e7
## 5 South 126281537 126476549 1.27e8 1.29e8
## 6 West 78617523 78685455 7.86e7 7.89e7
## # ℹ abbreviated names: ¹​`April 1, 2020 Estimates Base`,
## # ²​`Population Estimate (as of July 1)`
## # ℹ 2 more variables: ...6 <dbl>, ...7 <dbl>
# Clean and reshape Census dataset
census_clean <- census_raw %>%
rename(State = 1) %>%
mutate(
State = gsub("^\\.", "", State) # <-- Remove leading dot
) %>%
filter(
!State %in% c("United States", "Northeast", "Midwest", "South", "West"),
!grepl("Region", State, ignore.case = TRUE),
!is.na(State)
) %>%
rename(
Pop_2020 = 2,
Pop_2021 = 3,
Pop_2022 = 4,
Pop_2023 = 5,
Pop_2024 = 6
) %>%
mutate(across(starts_with("Pop_"), as.numeric))
# Wide -> long
census_long <- census_clean %>%
pivot_longer(
cols = starts_with("Pop_"),
names_to = "Year",
values_to = "Population"
) %>%
mutate(
Year = as.numeric(gsub("Pop_", "", Year))
) %>%
arrange(State, Year)
# Year-over-year population growth
census_long <- census_long %>%
group_by(State) %>%
mutate(Pop_Growth = Population / lag(Population) - 1) %>%
ungroup()
# Keep latest available year per state (e.g., 2024)
census_latest <- census_long %>%
group_by(State) %>%
filter(Year == max(Year)) %>%
ungroup()
head(census_latest)
## # A tibble: 6 × 5
## State ...7 Year Population Pop_Growth
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Alabama 5157699 2024 5117673 0.00817
## 2 Alaska 740133 2024 736510 0.00282
## 3 Arizona 7582384 2024 7473027 0.0129
## 4 Arkansas 3088354 2024 3069463 0.00714
## 5 California 39431263 2024 39198693 0.00144
## 6 Colorado 5957493 2024 5901339 0.00861
joint <- joint %>%
mutate(
StateFull = case_when(
StateTerritory == "DC" ~ "District of Columbia",
TRUE ~ state.name[match(StateTerritory, state.abb)]
)
)
unique(joint$StateFull)
## [1] "Texas" "Washington" "Hawaii"
## [4] "South Carolina" "Virginia" "New Jersey"
## [7] "Alaska" "Maryland" "District of Columbia"
## [10] NA
clean_name <- function(x) {
x %>%
trimws() %>%
toupper() %>%
gsub("[^A-Z0-9 ]", "", .) %>%
gsub("[[:space:]]+", " ", .)
}
joint <- joint %>%
mutate(InstallClean = clean_name(InstallationName))
elasticity_df <- elasticity_df %>%
mutate(InstallClean = clean_name(InstallationName))
joint_ml <- joint %>%
left_join(
census_latest,
by = c("StateFull" = "State") # join by state only (latest year)
) %>%
left_join(
elasticity_df %>% select(InstallClean, Elasticity),
by = "InstallClean"
) %>%
arrange(CurrentYear) %>%
mutate(
DoD_Growth = (Current_Year_TotalDoD / lag(Current_Year_TotalDoD)) - 1,
DoD_Growth = ifelse(is.na(DoD_Growth), 0, DoD_Growth),
Pop_Growth = ifelse(is.na(Pop_Growth), 0, Pop_Growth)
)
# Quick check of missingness in key predictors
colSums(is.na(joint_ml)[, c("Current_DirectWorkforce",
"DoD_Growth",
"Population",
"Pop_Growth",
"Elasticity")])
## Current_DirectWorkforce DoD_Growth Population
## 0 0 2
## Pop_Growth Elasticity
## 0 0
# Build machine-learning dataset from joint_ml
ml_df <- joint_ml %>%
mutate(
# Sensible defaults for missing predictors
Population = ifelse(is.na(Population), 0, Population),
DoD_Growth = ifelse(is.na(DoD_Growth), 0, DoD_Growth),
Pop_Growth = ifelse(is.na(Pop_Growth), 0, Pop_Growth),
Elasticity = ifelse(is.na(Elasticity), 1, Elasticity) # neutral elasticity if missing
) %>%
# Only require that the TARGET exists
filter(!is.na(Current_DirectWorkforce)) %>%
select(
InstallationName,
StateTerritory,
CurrentYear,
Current_DirectWorkforce,
DoD_Growth,
Population,
Pop_Growth,
Elasticity
)
nrow(ml_df)
## [1] 12
head(ml_df)
## # A tibble: 6 × 8
## InstallationName StateTerritory CurrentYear Current_DirectWorkfo…¹ DoD_Growth
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Joint Base San A… TX 2020 88760 0
## 2 Joint Base Lewis… WA 2020 45000 0
## 3 Joint Base Pearl… HI 2020 53000 0
## 4 Joint Base Charl… SC 2020 26000 0
## 5 Joint Base Langl… VA 2020 34000 0
## 6 Joint Base McGui… NJ 2020 33000 0
## # ℹ abbreviated name: ¹​Current_DirectWorkforce
## # ℹ 3 more variables: Population <dbl>, Pop_Growth <dbl>, Elasticity <dbl>
set.seed(997)
rf_model <- train(
Current_DirectWorkforce ~ DoD_Growth + Population + Pop_Growth + Elasticity,
data = ml_df,
method = "rf",
trControl = trainControl(method = "cv", number = 10),
tuneLength = 5
)
## note: only 3 unique complexity parameters in default grid. Truncating the grid to 3 .
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
rf_model
## Random Forest
##
## 12 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 11, 10, 11, 11, 11, 11, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 16957.26 1 16816.60
## 3 16914.21 1 16777.91
## 4 16930.97 1 16789.37
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 3.
# Variable importance for the RF model
varImp(rf_model)
## rf variable importance
##
## Overall
## Elasticity 100.00
## Population 65.07
## Pop_Growth 49.91
## DoD_Growth 0.00
# Add model predictions + predicted year
ml_df <- ml_df %>%
mutate(
Predicted_Total = predict(rf_model, newdata = ml_df),
Predicted_Year = CurrentYear # <-- NEW COLUMN
)
# View results
ml_df %>%
select(
InstallationName,
Predicted_Year,
Current_DirectWorkforce,
Predicted_Total
) %>%
head()
## # A tibble: 6 × 4
## InstallationName Predicted_Year Current_DirectWorkfo…¹ Predicted_Total
## <chr> <dbl> <dbl> <dbl>
## 1 Joint Base San Antonio 2020 88760 66826.
## 2 Joint Base Lewis-McChord 2020 45000 33782.
## 3 Joint Base Pearl Harbor… 2020 53000 48521.
## 4 Joint Base Charleston 2020 26000 32714.
## 5 Joint Base Langley-Eust… 2020 34000 28475.
## 6 Joint Base McGuire-Dix-… 2020 33000 33706.
## # ℹ abbreviated name: ¹​Current_DirectWorkforce
# 1) Base-level features (one row per installation)
# assumes ml_df and rf_model already exist
base_features <- ml_df %>%
dplyr::group_by(InstallationName, StateTerritory) %>%
dplyr::slice_tail(n = 1) %>% # keep one row per base
dplyr::ungroup() %>%
dplyr::select(
InstallationName,
StateTerritory,
Current_DirectWorkforce,
Population,
Pop_Growth,
Elasticity
)
# 2) Future DoD scenarios from fc_df (e.g., 2025, 2030, 2035, 2040, 2045)
# OPTION 2: make DoD_Growth base-specific by scaling with Elasticity
future_scenarios <- base_features %>%
tidyr::crossing(
fc_df %>%
dplyr::transmute(
Predicted_Year = Year,
Raw_DoD_Growth = DoD_Growth_Factor - 1 # national DoD growth rate
)
) %>%
dplyr::mutate(
# Base-specific effective DoD growth used as RF input:
DoD_Growth = Raw_DoD_Growth * Elasticity
)
# 3) Predict future total personnel using the RF model
future_scenarios <- future_scenarios %>%
dplyr::mutate(
Predicted_Total = predict(rf_model, newdata = future_scenarios),
Predicted_Growth_Pct =
(Predicted_Total / Current_DirectWorkforce - 1) * 100
)
# 4) Inspect a sample of the results
future_scenarios %>%
dplyr::arrange(InstallationName, Predicted_Year) %>%
dplyr::select(
InstallationName,
StateTerritory,
Predicted_Year,
Current_DirectWorkforce,
Predicted_Total,
Predicted_Growth_Pct
) %>%
head(20)
## # A tibble: 20 × 6
## InstallationName StateTerritory Predicted_Year Current_DirectWorkfo…¹
## <chr> <chr> <int> <dbl>
## 1 Joint Base Anacostia-Bo… DC 2025 8000
## 2 Joint Base Anacostia-Bo… DC 2030 8000
## 3 Joint Base Anacostia-Bo… DC 2035 8000
## 4 Joint Base Anacostia-Bo… DC 2040 8000
## 5 Joint Base Anacostia-Bo… DC 2045 8000
## 6 Joint Base Andrews MD 2025 17500
## 7 Joint Base Andrews MD 2030 17500
## 8 Joint Base Andrews MD 2035 17500
## 9 Joint Base Andrews MD 2040 17500
## 10 Joint Base Andrews MD 2045 17500
## 11 Joint Base Charleston SC 2025 26000
## 12 Joint Base Charleston SC 2030 26000
## 13 Joint Base Charleston SC 2035 26000
## 14 Joint Base Charleston SC 2040 26000
## 15 Joint Base Charleston SC 2045 26000
## 16 Joint Base Elmendorf-Ri… AK 2025 22000
## 17 Joint Base Elmendorf-Ri… AK 2030 22000
## 18 Joint Base Elmendorf-Ri… AK 2035 22000
## 19 Joint Base Elmendorf-Ri… AK 2040 22000
## 20 Joint Base Elmendorf-Ri… AK 2045 22000
## # ℹ abbreviated name: ¹​Current_DirectWorkforce
## # ℹ 2 more variables: Predicted_Total <dbl>, Predicted_Growth_Pct <dbl>
library(openxlsx)
# Create workbook
wb <- createWorkbook()
addWorksheet(wb, "RF_Future_Predictions")
# Write full dataset
writeData(wb, "RF_Future_Predictions", future_scenarios)
# Save file
saveWorkbook(
wb,
file = "RF_JointBase_Personnel_Predictions_2030_2045.xlsx",
overwrite = TRUE
)
"Export complete: RF_JointBase_Personnel_Predictions_2030_2045.xlsx"
## [1] "Export complete: RF_JointBase_Personnel_Predictions_2030_2045.xlsx"
library(ggplot2)
library(dplyr)
library(tidyr)
# 1) Years you want RF predictions for
predict_years <- c(2010, 2015, 2020, 2025)
# 2) Base-level features for RF input (last available row per base)
base_features <- ml_df %>%
group_by(InstallationName, StateTerritory) %>%
slice_tail(n = 1) %>%
ungroup()
# 3) RF predictions for selected years (use '.' as newdata so sizes match)
rf_preds <- base_features %>%
tidyr::crossing(Pred_Year = predict_years) %>%
mutate(
CurrentYear = Pred_Year,
Personnel = predict(rf_model, newdata = .), # <-- FIX HERE
Type = "Predicted"
) %>%
select(InstallationName, StateTerritory, CurrentYear, Personnel, Type)
# 4) Historical actual values (all years in ml_df)
actual_df <- ml_df %>%
mutate(
Type = "Actual",
Personnel = Current_DirectWorkforce
) %>%
select(InstallationName, StateTerritory, CurrentYear, Personnel, Type)
# 5) Combined dataset for plotting
plot_df <- bind_rows(actual_df, rf_preds) %>%
arrange(InstallationName, CurrentYear)
# Global max for common y-axis
global_max <- max(plot_df$Personnel, na.rm = TRUE)
# 6) One PNG per base, with side-by-side panels
out_dir <- "plots_actual_predicted_panels"
if (!dir.exists(out_dir)) dir.create(out_dir)
bases <- unique(plot_df$InstallationName)
for (base in bases) {
df <- plot_df %>%
filter(InstallationName == base) %>%
arrange(CurrentYear) %>%
mutate(
YearLabel = as.character(CurrentYear), # <-- FIXED
YearNum = CurrentYear
)
p <- ggplot(df, aes(x = YearNum, y = Personnel)) +
geom_col(aes(fill = Type), width = 0.7, position = "dodge") +
geom_line(aes(group = 1), color = "grey30", linewidth = 0.6) +
geom_point(color = "grey30", size = 2) +
scale_fill_manual(
values = c("Actual" = "#F28E2B", "Predicted" = "#4E79A7"),
name = ""
) +
scale_x_continuous(
breaks = unique(df$YearNum),
labels = unique(df$YearLabel)
) +
coord_cartesian(ylim = c(0, global_max * 1.1)) +
labs(
title = paste("Actual vs RF-Predicted Personnel\n", base),
x = "Year",
y = "Personnel"
) +
theme_minimal(base_size = 13) +
theme(
axis.text.x = element_text(size = 11, angle = 0, hjust = 0.5), # <-- readable
panel.grid.major.x = element_blank(),
plot.title = element_text(face = "bold", hjust = 0.5)
)
ggsave(
filename = paste0(out_dir, "/", gsub(" ", "_", base), "_Actual_vs_Predicted.png"),
plot = p,
width = 10,
height = 6,
dpi = 300
)
}
"Side-by-side panel PNGs complete."
## [1] "Side-by-side panel PNGs complete."
Joint Base Anacostia-Bolling

Joint Base Andrews

Joint Base Charleston

Joint Base Elmendorf-Richardson

Joint Base Langley-Eustis

Joint Base Lewis-McChord

Joint Base McGuire-Dix-Lakehurst

Joint Base Myer-Henderson Hall

Joint Base Pearl Harbor-Hickam

Joint Base San Antonio

Joint Expeditionary Base Little Creek-Fort Story

Joint Region Marianas
