library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tidyr)
library(openxlsx)
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.5.2
library(stringr)
dod   <- read_excel("DoD_manpower.xlsx")
joint <- read_excel("JointBases_Master_Combined_Only.xlsx",
sheet = "Master_Combined")
dod <- dod %>% arrange(Year)

dod_ts <- ts(dod$TotalDoD,
             start     = min(dod$Year),
             frequency = 1)

fit_dod <- auto.arima(dod_ts)

current_year  <- max(dod$Year)
current_total <- dod$TotalDoD[dod$Year == current_year]

# These are the exact future years you want:
target_years <- c(2025, 2030, 2035, 2040, 2045)

# Forecast far enough to cover the max target year
h <- max(target_years) - current_year

dod_fc <- forecast(fit_dod, h = h)

# All forecast years from (current_year + 1) up to max(target_years)
fc_all <- data.frame(
  Year               = (current_year + 1):(current_year + h),
  DoD_Total_Forecast = as.numeric(dod_fc$mean)
)

# Keep only the exact target years
fc_df <- fc_all %>%
  filter(Year %in% target_years) %>%
  mutate(
    DoD_Growth_Factor = DoD_Total_Forecast / current_total
  )

fc_df
##   Year DoD_Total_Forecast DoD_Growth_Factor
## 1 2025           2235.273         0.9978896
## 2 2030           2248.027         1.0035835
## 3 2035           2239.220         0.9996518
## 4 2040           2245.281         1.0023578
## 5 2045           2240.797         1.0003558
base_now <- joint %>%
transmute(
InstallationName,
StateTerritory,
LeadService,
Current_DirectWorkforce = as.numeric(Current_DirectWorkforce),
Base_Military           = as.numeric(PostBRAC_Military),
Base_Civilians          = as.numeric(PostBRAC_Civilians)
)
## Warning: There were 2 warnings in `transmute()`.
## The first warning was:
## ℹ In argument: `Base_Military = as.numeric(PostBRAC_Military)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
base_now
## # A tibble: 12 × 6
##    InstallationName            StateTerritory LeadService Current_DirectWorkfo…¹
##    <chr>                       <chr>          <chr>                        <dbl>
##  1 Joint Base San Antonio      TX             Air Force …                  88760
##  2 Joint Base Lewis-McChord    WA             Army (I Co…                  45000
##  3 Joint Base Pearl Harbor-Hi… HI             Navy (PACF…                  53000
##  4 Joint Base Charleston       SC             Air Force …                  26000
##  5 Joint Base Langley-Eustis   VA             Air Force …                  34000
##  6 Joint Base McGuire-Dix-Lak… NJ             Air Force …                  33000
##  7 Joint Base Elmendorf-Richa… AK             Air Force …                  22000
##  8 Joint Base Andrews          MD             Air Force …                  17500
##  9 Joint Base Anacostia-Bolli… DC             Navy (hist…                   8000
## 10 Joint Expeditionary Base L… VA             Navy (NECC…                  20000
## 11 Joint Region Marianas       GU             Navy (regi…                  10500
## 12 Joint Base Myer-Henderson … VA/DC          Army (MDW)…                   6800
## # ℹ abbreviated name: ¹​Current_DirectWorkforce
## # ℹ 2 more variables: Base_Military <dbl>, Base_Civilians <dbl>
pred_df_basic <- base_now %>%
crossing(fc_df) %>%
mutate(
Pred_Military   = Base_Military  * DoD_Growth_Factor,
Pred_Civilians  = Base_Civilians * DoD_Growth_Factor,
Pred_Total      = Current_DirectWorkforce * DoD_Growth_Factor,
Growth_Military = Pred_Military  - Base_Military,
Growth_Civilians= Pred_Civilians - Base_Civilians,
Growth_Total    = Pred_Total     - Current_DirectWorkforce
)

pred_df_basic
## # A tibble: 60 × 15
##    InstallationName            StateTerritory LeadService Current_DirectWorkfo…¹
##    <chr>                       <chr>          <chr>                        <dbl>
##  1 Joint Base Anacostia-Bolli… DC             Navy (hist…                   8000
##  2 Joint Base Anacostia-Bolli… DC             Navy (hist…                   8000
##  3 Joint Base Anacostia-Bolli… DC             Navy (hist…                   8000
##  4 Joint Base Anacostia-Bolli… DC             Navy (hist…                   8000
##  5 Joint Base Anacostia-Bolli… DC             Navy (hist…                   8000
##  6 Joint Base Andrews          MD             Air Force …                  17500
##  7 Joint Base Andrews          MD             Air Force …                  17500
##  8 Joint Base Andrews          MD             Air Force …                  17500
##  9 Joint Base Andrews          MD             Air Force …                  17500
## 10 Joint Base Andrews          MD             Air Force …                  17500
## # ℹ 50 more rows
## # ℹ abbreviated name: ¹​Current_DirectWorkforce
## # ℹ 11 more variables: Base_Military <dbl>, Base_Civilians <dbl>, Year <int>,
## #   DoD_Total_Forecast <dbl>, DoD_Growth_Factor <dbl>, Pred_Military <dbl>,
## #   Pred_Civilians <dbl>, Pred_Total <dbl>, Growth_Military <dbl>,
## #   Growth_Civilians <dbl>, Growth_Total <dbl>
elasticity_df <- joint %>%
mutate(
Base_Growth_Factor = Current_DirectWorkforce / Baseline_DirectWorkforce
) %>%
left_join(
dod %>% select(Year, TotalDoD) %>% rename(Baseline_DoD = TotalDoD),
by = c("BaselineYear" = "Year")
) %>%
left_join(
dod %>% select(Year, TotalDoD) %>% rename(Current_DoD = TotalDoD),
by = c("CurrentYear" = "Year")
) %>%
mutate(
DoD_Growth_Factor = Current_DoD / Baseline_DoD,
Elasticity        = Base_Growth_Factor / DoD_Growth_Factor
) %>%
select(InstallationName, Elasticity)

elasticity_df
## # A tibble: 12 × 2
##    InstallationName                                 Elasticity
##    <chr>                                                 <dbl>
##  1 Joint Base San Antonio                                 2.47
##  2 Joint Base Lewis-McChord                               1.49
##  3 Joint Base Pearl Harbor-Hickam                         2.42
##  4 Joint Base Charleston                                  1.66
##  5 Joint Base Langley-Eustis                              1.42
##  6 Joint Base McGuire-Dix-Lakehurst                       1.54
##  7 Joint Base Elmendorf-Richardson                        1.29
##  8 Joint Base Andrews                                     1.45
##  9 Joint Base Anacostia-Bolling                           1.22
## 10 Joint Expeditionary Base Little Creek-Fort Story       1.36
## 11 Joint Region Marianas                                  1.53
## 12 Joint Base Myer-Henderson Hall                         1.14
# Adjust skip if needed after inspecting the raw file

census_raw <- read_excel("NST-EST2024-POP.xlsx", skip = 2)
## New names:
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
head(census_raw)
## # A tibble: 6 × 7
##   `Geographic Area` April 1, 2020 Estimat…¹ Population Estimate …²   ...4   ...5
##   <chr>                               <dbl>                  <dbl>  <dbl>  <dbl>
## 1 <NA>                                   NA                   2020 2.02e3 2.02e3
## 2 United States                   331515736              331577720 3.32e8 3.34e8
## 3 Northeast                        57617706               57431458 5.73e7 5.72e7
## 4 Midwest                          68998970               68984258 6.89e7 6.89e7
## 5 South                           126281537              126476549 1.27e8 1.29e8
## 6 West                             78617523               78685455 7.86e7 7.89e7
## # ℹ abbreviated names: ¹​`April 1, 2020 Estimates Base`,
## #   ²​`Population Estimate (as of July 1)`
## # ℹ 2 more variables: ...6 <dbl>, ...7 <dbl>
# Clean and reshape Census dataset

census_clean <- census_raw %>%
  rename(State = 1) %>%
  mutate(
    State = gsub("^\\.", "", State)   # <-- Remove leading dot
  ) %>%
  filter(
    !State %in% c("United States", "Northeast", "Midwest", "South", "West"),
    !grepl("Region", State, ignore.case = TRUE),
    !is.na(State)
  ) %>%
  rename(
    Pop_2020 = 2,
    Pop_2021 = 3,
    Pop_2022 = 4,
    Pop_2023 = 5,
    Pop_2024 = 6
  ) %>%
  mutate(across(starts_with("Pop_"), as.numeric))


# Wide -> long

census_long <- census_clean %>%
  pivot_longer(
    cols = starts_with("Pop_"),
    names_to = "Year",
    values_to = "Population"
  ) %>%
  mutate(
    Year = as.numeric(gsub("Pop_", "", Year))
  ) %>%
  arrange(State, Year)

# Year-over-year population growth

census_long <- census_long %>%
group_by(State) %>%
mutate(Pop_Growth = Population / lag(Population) - 1) %>%
ungroup()

# Keep latest available year per state (e.g., 2024)

census_latest <- census_long %>%
group_by(State) %>%
filter(Year == max(Year)) %>%
ungroup()

head(census_latest)
## # A tibble: 6 × 5
##   State          ...7  Year Population Pop_Growth
##   <chr>         <dbl> <dbl>      <dbl>      <dbl>
## 1 Alabama     5157699  2024    5117673    0.00817
## 2 Alaska       740133  2024     736510    0.00282
## 3 Arizona     7582384  2024    7473027    0.0129 
## 4 Arkansas    3088354  2024    3069463    0.00714
## 5 California 39431263  2024   39198693    0.00144
## 6 Colorado    5957493  2024    5901339    0.00861
joint <- joint %>%
  mutate(
    StateFull = case_when(
      StateTerritory == "DC" ~ "District of Columbia",
      TRUE ~ state.name[match(StateTerritory, state.abb)]
    )
  )


unique(joint$StateFull)
##  [1] "Texas"                "Washington"           "Hawaii"              
##  [4] "South Carolina"       "Virginia"             "New Jersey"          
##  [7] "Alaska"               "Maryland"             "District of Columbia"
## [10] NA
clean_name <- function(x) {
x %>%
trimws() %>%
toupper() %>%
gsub("[^A-Z0-9 ]", "", .) %>%
gsub("[[:space:]]+", " ", .)
}

joint <- joint %>%
mutate(InstallClean = clean_name(InstallationName))

elasticity_df <- elasticity_df %>%
mutate(InstallClean = clean_name(InstallationName))
joint_ml <- joint %>%
left_join(
census_latest,
by = c("StateFull" = "State")   # join by state only (latest year)
) %>%
left_join(
elasticity_df %>% select(InstallClean, Elasticity),
by = "InstallClean"
) %>%
arrange(CurrentYear) %>%
mutate(
DoD_Growth = (Current_Year_TotalDoD / lag(Current_Year_TotalDoD)) - 1,
DoD_Growth = ifelse(is.na(DoD_Growth), 0, DoD_Growth),
Pop_Growth = ifelse(is.na(Pop_Growth), 0, Pop_Growth)
)

# Quick check of missingness in key predictors

colSums(is.na(joint_ml)[, c("Current_DirectWorkforce",
"DoD_Growth",
"Population",
"Pop_Growth",
"Elasticity")])
## Current_DirectWorkforce              DoD_Growth              Population 
##                       0                       0                       2 
##              Pop_Growth              Elasticity 
##                       0                       0
# Build machine-learning dataset from joint_ml
ml_df <- joint_ml %>%
  mutate(
    # Sensible defaults for missing predictors
    Population = ifelse(is.na(Population), 0, Population),
    DoD_Growth = ifelse(is.na(DoD_Growth), 0, DoD_Growth),
    Pop_Growth = ifelse(is.na(Pop_Growth), 0, Pop_Growth),
    Elasticity = ifelse(is.na(Elasticity), 1, Elasticity)  # neutral elasticity if missing
  ) %>%
  # Only require that the TARGET exists
  filter(!is.na(Current_DirectWorkforce)) %>%
  select(
    InstallationName,
    StateTerritory,
    CurrentYear,
    Current_DirectWorkforce,
    DoD_Growth,
    Population,
    Pop_Growth,
    Elasticity
  )

nrow(ml_df)
## [1] 12
head(ml_df)
## # A tibble: 6 × 8
##   InstallationName  StateTerritory CurrentYear Current_DirectWorkfo…¹ DoD_Growth
##   <chr>             <chr>                <dbl>                  <dbl>      <dbl>
## 1 Joint Base San A… TX                    2020                  88760          0
## 2 Joint Base Lewis… WA                    2020                  45000          0
## 3 Joint Base Pearl… HI                    2020                  53000          0
## 4 Joint Base Charl… SC                    2020                  26000          0
## 5 Joint Base Langl… VA                    2020                  34000          0
## 6 Joint Base McGui… NJ                    2020                  33000          0
## # ℹ abbreviated name: ¹​Current_DirectWorkforce
## # ℹ 3 more variables: Population <dbl>, Pop_Growth <dbl>, Elasticity <dbl>
set.seed(997)

rf_model <- train(
  Current_DirectWorkforce ~ DoD_Growth + Population + Pop_Growth + Elasticity,
  data = ml_df,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 5
)
## note: only 3 unique complexity parameters in default grid. Truncating the grid to 3 .
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
rf_model
## Random Forest 
## 
## 12 samples
##  4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 11, 10, 11, 11, 11, 11, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared  MAE     
##   2     16957.26  1         16816.60
##   3     16914.21  1         16777.91
##   4     16930.97  1         16789.37
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 3.
# Variable importance for the RF model
varImp(rf_model)
## rf variable importance
## 
##            Overall
## Elasticity  100.00
## Population   65.07
## Pop_Growth   49.91
## DoD_Growth    0.00
# Add model predictions + predicted year
ml_df <- ml_df %>%
  mutate(
    Predicted_Total = predict(rf_model, newdata = ml_df),
    Predicted_Year  = CurrentYear     # <-- NEW COLUMN
  )

# View results
ml_df %>%
  select(
    InstallationName,
    Predicted_Year,
    Current_DirectWorkforce,
    Predicted_Total
  ) %>%
  head()
## # A tibble: 6 × 4
##   InstallationName         Predicted_Year Current_DirectWorkfo…¹ Predicted_Total
##   <chr>                             <dbl>                  <dbl>           <dbl>
## 1 Joint Base San Antonio             2020                  88760          66826.
## 2 Joint Base Lewis-McChord           2020                  45000          33782.
## 3 Joint Base Pearl Harbor…           2020                  53000          48521.
## 4 Joint Base Charleston              2020                  26000          32714.
## 5 Joint Base Langley-Eust…           2020                  34000          28475.
## 6 Joint Base McGuire-Dix-…           2020                  33000          33706.
## # ℹ abbreviated name: ¹​Current_DirectWorkforce
# 1) Base-level features (one row per installation)
#    assumes ml_df and rf_model already exist
base_features <- ml_df %>%
  dplyr::group_by(InstallationName, StateTerritory) %>%
  dplyr::slice_tail(n = 1) %>%          # keep one row per base
  dplyr::ungroup() %>%
  dplyr::select(
    InstallationName,
    StateTerritory,
    Current_DirectWorkforce,
    Population,
    Pop_Growth,
    Elasticity
  )

# 2) Future DoD scenarios from fc_df (e.g., 2025, 2030, 2035, 2040, 2045)
#    OPTION 2: make DoD_Growth base-specific by scaling with Elasticity
future_scenarios <- base_features %>%
  tidyr::crossing(
    fc_df %>%
      dplyr::transmute(
        Predicted_Year     = Year,
        Raw_DoD_Growth     = DoD_Growth_Factor - 1   # national DoD growth rate
      )
  ) %>%
  dplyr::mutate(
    # Base-specific effective DoD growth used as RF input:
    DoD_Growth = Raw_DoD_Growth * Elasticity
  )

# 3) Predict future total personnel using the RF model
future_scenarios <- future_scenarios %>%
  dplyr::mutate(
    Predicted_Total = predict(rf_model, newdata = future_scenarios),
    Predicted_Growth_Pct =
      (Predicted_Total / Current_DirectWorkforce - 1) * 100
  )

# 4) Inspect a sample of the results
future_scenarios %>%
  dplyr::arrange(InstallationName, Predicted_Year) %>%
  dplyr::select(
    InstallationName,
    StateTerritory,
    Predicted_Year,
    Current_DirectWorkforce,
    Predicted_Total,
    Predicted_Growth_Pct
  ) %>%
  head(20)
## # A tibble: 20 × 6
##    InstallationName         StateTerritory Predicted_Year Current_DirectWorkfo…¹
##    <chr>                    <chr>                   <int>                  <dbl>
##  1 Joint Base Anacostia-Bo… DC                       2025                   8000
##  2 Joint Base Anacostia-Bo… DC                       2030                   8000
##  3 Joint Base Anacostia-Bo… DC                       2035                   8000
##  4 Joint Base Anacostia-Bo… DC                       2040                   8000
##  5 Joint Base Anacostia-Bo… DC                       2045                   8000
##  6 Joint Base Andrews       MD                       2025                  17500
##  7 Joint Base Andrews       MD                       2030                  17500
##  8 Joint Base Andrews       MD                       2035                  17500
##  9 Joint Base Andrews       MD                       2040                  17500
## 10 Joint Base Andrews       MD                       2045                  17500
## 11 Joint Base Charleston    SC                       2025                  26000
## 12 Joint Base Charleston    SC                       2030                  26000
## 13 Joint Base Charleston    SC                       2035                  26000
## 14 Joint Base Charleston    SC                       2040                  26000
## 15 Joint Base Charleston    SC                       2045                  26000
## 16 Joint Base Elmendorf-Ri… AK                       2025                  22000
## 17 Joint Base Elmendorf-Ri… AK                       2030                  22000
## 18 Joint Base Elmendorf-Ri… AK                       2035                  22000
## 19 Joint Base Elmendorf-Ri… AK                       2040                  22000
## 20 Joint Base Elmendorf-Ri… AK                       2045                  22000
## # ℹ abbreviated name: ¹​Current_DirectWorkforce
## # ℹ 2 more variables: Predicted_Total <dbl>, Predicted_Growth_Pct <dbl>
library(openxlsx)

# Create workbook
wb <- createWorkbook()

addWorksheet(wb, "RF_Future_Predictions")

# Write full dataset
writeData(wb, "RF_Future_Predictions", future_scenarios)

# Save file
saveWorkbook(
  wb,
  file = "RF_JointBase_Personnel_Predictions_2030_2045.xlsx",
  overwrite = TRUE
)

"Export complete: RF_JointBase_Personnel_Predictions_2030_2045.xlsx"
## [1] "Export complete: RF_JointBase_Personnel_Predictions_2030_2045.xlsx"
library(ggplot2)
library(dplyr)
library(tidyr)

# 1) Years you want RF predictions for
predict_years <- c(2010, 2015, 2020, 2025)

# 2) Base-level features for RF input (last available row per base)
base_features <- ml_df %>%
  group_by(InstallationName, StateTerritory) %>%
  slice_tail(n = 1) %>%
  ungroup()

# 3) RF predictions for selected years (use '.' as newdata so sizes match)
rf_preds <- base_features %>%
  tidyr::crossing(Pred_Year = predict_years) %>%
  mutate(
    CurrentYear = Pred_Year,
    Personnel   = predict(rf_model, newdata = .),  # <-- FIX HERE
    Type        = "Predicted"
  ) %>%
  select(InstallationName, StateTerritory, CurrentYear, Personnel, Type)

# 4) Historical actual values (all years in ml_df)
actual_df <- ml_df %>%
  mutate(
    Type      = "Actual",
    Personnel = Current_DirectWorkforce
  ) %>%
  select(InstallationName, StateTerritory, CurrentYear, Personnel, Type)

# 5) Combined dataset for plotting
plot_df <- bind_rows(actual_df, rf_preds) %>%
  arrange(InstallationName, CurrentYear)

# Global max for common y-axis
global_max <- max(plot_df$Personnel, na.rm = TRUE)

# 6) One PNG per base, with side-by-side panels
out_dir <- "plots_actual_predicted_panels"
if (!dir.exists(out_dir)) dir.create(out_dir)

bases <- unique(plot_df$InstallationName)

for (base in bases) {

  df <- plot_df %>%
    filter(InstallationName == base) %>%
    arrange(CurrentYear) %>%
    mutate(
      YearLabel = as.character(CurrentYear),  # <-- FIXED
      YearNum   = CurrentYear
    )

  p <- ggplot(df, aes(x = YearNum, y = Personnel)) +
    geom_col(aes(fill = Type), width = 0.7, position = "dodge") +
    geom_line(aes(group = 1), color = "grey30", linewidth = 0.6) +
    geom_point(color = "grey30", size = 2) +
    scale_fill_manual(
      values = c("Actual" = "#F28E2B", "Predicted" = "#4E79A7"),
      name = ""
    ) +
    scale_x_continuous(
      breaks = unique(df$YearNum),
      labels = unique(df$YearLabel)
    ) +
    coord_cartesian(ylim = c(0, global_max * 1.1)) +
    labs(
      title = paste("Actual vs RF-Predicted Personnel\n", base),
      x = "Year",
      y = "Personnel"
    ) +
    theme_minimal(base_size = 13) +
    theme(
      axis.text.x = element_text(size = 11, angle = 0, hjust = 0.5),  # <-- readable
      panel.grid.major.x = element_blank(),
      plot.title = element_text(face = "bold", hjust = 0.5)
    )

  ggsave(
    filename = paste0(out_dir, "/", gsub(" ", "_", base), "_Actual_vs_Predicted.png"),
    plot     = p,
    width    = 10,
    height   = 6,
    dpi      = 300
  )
}


"Side-by-side panel PNGs complete."
## [1] "Side-by-side panel PNGs complete."

Joint Base Anacostia-Bolling

Joint Base Andrews

Joint Base Charleston

Joint Base Elmendorf-Richardson

Joint Base Langley-Eustis

Joint Base Lewis-McChord

Joint Base McGuire-Dix-Lakehurst

Joint Base Myer-Henderson Hall

Joint Base Pearl Harbor-Hickam

Joint Base San Antonio

Joint Expeditionary Base Little Creek-Fort Story

Joint Region Marianas