Packages Used : running list

library(tidyverse)
library(readxl)
library(janitor)
library(psych)

Data Cleaning

Specify target columns

# Taking out only the targeted variables needed for the analysis as there were much more availible in the health outcomes datasets. 

columns_to_extract <- c("year","fips", "state", "county", "percent_food_insecure","percent_frequent_mental_distress",
                        "percent_uninsured_children", "percent_disconnected_youth", "spending_per_pupil","school_funding_adequacy", "high_school_graduation_rate", "median_household_income", "gender_pay_gap", "percent_enrolled_in_free_or_reduced_lunch", "percent_household_income_required_for_child_care_expenses", "percent_households_with_severe_cost_burden", "percent_rural", "percent_65_and_over", "percent_black","percent_not_proficient_in_english", "segregation_index", "percent_disconnected_youth", "food_environment_index", "teen_birth_rate", "percent_fair_or_poor_health", "percent_unemployed", "percent_children_in_single_parent_households", "percent_children_in_poverty", "percent_severe_housing_problems", "percent_completed_high_school","percent_completed_high_school", "percent_low_birthweight")

Pull in data

main_25 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_25.csv"
supp_25 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_25.csv"
main_24 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_24.csv"
supp_24 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_24.csv"
main_23 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_23.csv"
supp_23 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_23.csv"
main_22 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_22.csv"
supp_22 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_22.csv"
main_21 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_21.csv"
supp_21 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_21.csv"
main_20 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_20.csv"
supp_20 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_20.csv"

Ensure naming conventions are all the same

clean_and_select <- function(file_path, columns_to_extract) {
  data <- read.csv(file_path, check.names = FALSE) %>%
    janitor::clean_names() %>%
    select(any_of(columns_to_extract))
  return(data)
}
m_25 <- clean_and_select(main_25, columns_to_extract)
s_25 <- clean_and_select(supp_25, columns_to_extract)

m_24 <- clean_and_select(main_24, columns_to_extract)
s_24 <- clean_and_select(supp_24, columns_to_extract)

m_23 <- clean_and_select(main_23, columns_to_extract)
s_23 <- clean_and_select(supp_23, columns_to_extract)

m_22 <- clean_and_select(main_22, columns_to_extract)
s_22 <- clean_and_select(supp_22, columns_to_extract)

m_21 <- clean_and_select(main_21, columns_to_extract)
s_21 <- clean_and_select(supp_21, columns_to_extract)

m_20 <- clean_and_select(main_20, columns_to_extract)
s_20 <- clean_and_select(supp_20, columns_to_extract)

Join in Main and Supplemental Data by Year

f_25 <- m_25 %>% 
  left_join(s_25, by = c("year", "fips")) %>% 
  select(-state.y, -county.y)

f_24 <- m_24 %>% 
  left_join(s_24, by = c("year", "fips")) %>% 
  select(-state.y, -county.y)

f_23 <- m_23 %>% 
  left_join(s_23, by = c("year", "fips")) %>% 
  select(-state.y, -county.y)

f_22 <- m_22 %>% 
  left_join(s_22, by = c("year", "fips")) %>% 
  select(-state.y, -county.y)

f_21 <- m_21 %>% 
  left_join(s_21, by = c("year", "fips")) %>% 
  select(-state.y, -county.y)

f_20 <- m_20 %>% 
  left_join(s_20, by = c("year", "fips")) %>% 
  select(-state.y, -county.y)

Bind Yearly Data Together

final <- bind_rows(f_25, f_24, f_23, f_22, f_21, f_20)
final_data_clean <- final %>%
  select(where(~ !any(is.na(.))))

Rural | Urban Category Creation

final <- final %>%
  mutate(
    rural_urban = case_when(
      percent_rural == 100 ~ "Completely Rural",
      percent_rural >= 50 & percent_rural < 100 ~ "Mostly Rural",
      percent_rural < 50 ~ "Mostly Urban",
      TRUE ~ NA_character_ 
    )
  )

Dropping NA Columns vs Imputing

total_rows <- nrow(final)


missing_counts <- colSums(is.na(final))

missing_percentage <- (missing_counts / total_rows) * 100


missing_table <- data.frame(
  Column = names(missing_counts),
  MissingCount = missing_counts,
  MissingPercentage = round(missing_percentage, 2) 
)

missing_table <- missing_table[missing_table$MissingCount > 0, ]

print(missing_table)

##                                                                                                              Column
## percent_household_income_required_for_child_care_expenses percent_household_income_required_for_child_care_expenses
## percent_completed_high_school                                                         percent_completed_high_school
## percent_uninsured_children                                                               percent_uninsured_children
## percent_disconnected_youth                                                               percent_disconnected_youth
## spending_per_pupil                                                                               spending_per_pupil
## school_funding_adequacy                                                                     school_funding_adequacy
## high_school_graduation_rate                                                             high_school_graduation_rate
## gender_pay_gap                                                                                       gender_pay_gap
## percent_households_with_severe_cost_burden                               percent_households_with_severe_cost_burden
## segregation_index                                                                                 segregation_index
## teen_birth_rate                                                                                     teen_birth_rate
## percent_children_in_single_parent_households                           percent_children_in_single_parent_households
## percent_low_birthweight                                                                     percent_low_birthweight
## percent_black                                                                                         percent_black
## percent_children_in_single_parent_households.x                       percent_children_in_single_parent_households.x
## percent_children_in_single_parent_households.y                       percent_children_in_single_parent_households.y
##                                                           MissingCount
## percent_household_income_required_for_child_care_expenses          189
## percent_completed_high_school                                       63
## percent_uninsured_children                                         189
## percent_disconnected_youth                                          59
## spending_per_pupil                                                 143
## school_funding_adequacy                                            202
## high_school_graduation_rate                                          7
## gender_pay_gap                                                     126
## percent_households_with_severe_cost_burden                         189
## segregation_index                                                   16
## teen_birth_rate                                                      5
## percent_children_in_single_parent_households                       189
## percent_low_birthweight                                             64
## percent_black                                                       63
## percent_children_in_single_parent_households.x                     189
## percent_children_in_single_parent_households.y                     189
##                                                           MissingPercentage
## percent_household_income_required_for_child_care_expenses             50.00
## percent_completed_high_school                                         16.67
## percent_uninsured_children                                            50.00
## percent_disconnected_youth                                            15.61
## spending_per_pupil                                                    37.83
## school_funding_adequacy                                               53.44
## high_school_graduation_rate                                            1.85
## gender_pay_gap                                                        33.33
## percent_households_with_severe_cost_burden                            50.00
## segregation_index                                                      4.23
## teen_birth_rate                                                        1.32
## percent_children_in_single_parent_households                          50.00
## percent_low_birthweight                                               16.93
## percent_black                                                         16.67
## percent_children_in_single_parent_households.x                        50.00
## percent_children_in_single_parent_households.y                        50.00

EDA

Summary Statistics

describe(final, fast = T)

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

##                                                           vars   n     mean
## year                                                         1 378  2022.50
## fips                                                         2 378 36061.02
## state.x                                                      3 378      NaN
## county.x                                                     4 378      NaN
## percent_household_income_required_for_child_care_expenses    5 189    36.26
## food_environment_index                                       6 378     8.28
## percent_fair_or_poor_health                                  7 378    15.28
## percent_unemployed                                           8 378     4.98
## percent_children_in_poverty                                  9 378    17.60
## percent_severe_housing_problems                             10 378    16.04
## percent_completed_high_school                               11 315    89.60
## percent_food_insecure                                       12 378    11.16
## percent_frequent_mental_distress                            13 378    14.85
## percent_uninsured_children                                  14 189     2.90
## percent_disconnected_youth                                  15 319     7.76
## spending_per_pupil                                          16 235 24725.92
## school_funding_adequacy                                     17 176 12738.98
## high_school_graduation_rate                                 18 371    85.94
## median_household_income                                     19 378 67423.84
## gender_pay_gap                                              20 252     0.84
## percent_enrolled_in_free_or_reduced_lunch                   21 378    48.38
## percent_households_with_severe_cost_burden                  22 189    14.04
## percent_rural                                               23 378    45.01
## percent_65_and_over                                         24 378    19.72
## percent_not_proficient_in_english                           25 378     2.23
## segregation_index                                           26 362    18.10
## teen_birth_rate                                             27 373    14.47
## percent_children_in_single_parent_households                28 189    22.25
## percent_low_birthweight                                     29 314     7.29
## percent_black                                               30 315     6.13
## percent_children_in_single_parent_households.x              31 189    22.71
## percent_children_in_single_parent_households.y              32 189    22.35
## rural_urban                                                 33 378      NaN
##                                                                 sd      min
## year                                                          1.71  2020.00
## fips                                                         36.39 36000.00
## state.x                                                         NA      Inf
## county.x                                                        NA      Inf
## percent_household_income_required_for_child_care_expenses     6.19    24.00
## food_environment_index                                        0.53     6.20
## percent_fair_or_poor_health                                   2.85     9.00
## percent_unemployed                                            1.92     2.70
## percent_children_in_poverty                                   5.14     6.00
## percent_severe_housing_problems                               5.60     8.00
## percent_completed_high_school                                 3.32    73.00
## percent_food_insecure                                         2.25     4.00
## percent_frequent_mental_distress                              2.08    10.00
## percent_uninsured_children                                    1.05     2.00
## percent_disconnected_youth                                    3.94     1.00
## spending_per_pupil                                         4491.46 18229.00
## school_funding_adequacy                                    4128.12   958.00
## high_school_graduation_rate                                   4.63    67.00
## median_household_income                                   16548.19 38566.00
## gender_pay_gap                                                0.05     0.69
## percent_enrolled_in_free_or_reduced_lunch                    10.23    22.00
## percent_households_with_severe_cost_burden                    4.40     7.00
## percent_rural                                                28.81     0.00
## percent_65_and_over                                           3.11    12.80
## percent_not_proficient_in_english                             3.27     0.00
## segregation_index                                            27.28     0.03
## teen_birth_rate                                               6.24     3.00
## percent_children_in_single_parent_households                  5.89    12.00
## percent_low_birthweight                                       0.95     4.00
## percent_black                                                 6.06     0.60
## percent_children_in_single_parent_households.x                6.16    10.00
## percent_children_in_single_parent_households.y                5.88    12.00
## rural_urban                                                     NA      Inf
##                                                                 max     range
## year                                                        2025.00      5.00
## fips                                                       36123.00    123.00
## state.x                                                        -Inf      -Inf
## county.x                                                       -Inf      -Inf
## percent_household_income_required_for_child_care_expenses     80.00     56.00
## food_environment_index                                         9.90      3.70
## percent_fair_or_poor_health                                   30.00     21.00
## percent_unemployed                                            16.00     13.30
## percent_children_in_poverty                                   38.00     32.00
## percent_severe_housing_problems                               39.00     31.00
## percent_completed_high_school                                 96.00     23.00
## percent_food_insecure                                         20.00     16.00
## percent_frequent_mental_distress                              20.00     10.00
## percent_uninsured_children                                    10.00      8.00
## percent_disconnected_youth                                    27.00     26.00
## spending_per_pupil                                         50849.00  32620.00
## school_funding_adequacy                                    34981.00  34023.00
## high_school_graduation_rate                                   96.00     29.00
## median_household_income                                   140466.00 101900.00
## gender_pay_gap                                                 0.99      0.30
## percent_enrolled_in_free_or_reduced_lunch                     88.00     66.00
## percent_households_with_severe_cost_burden                    31.00     24.00
## percent_rural                                                100.00    100.00
## percent_65_and_over                                           34.80     22.00
## percent_not_proficient_in_english                             16.00     16.00
## segregation_index                                             81.00     80.97
## teen_birth_rate                                               36.00     33.00
## percent_children_in_single_parent_households                  52.00     40.00
## percent_low_birthweight                                       10.00      6.00
## percent_black                                                 29.90     29.30
## percent_children_in_single_parent_households.x                51.00     41.00
## percent_children_in_single_parent_households.y                52.00     40.00
## rural_urban                                                    -Inf      -Inf
##                                                               se
## year                                                        0.09
## fips                                                        1.87
## state.x                                                       NA
## county.x                                                      NA
## percent_household_income_required_for_child_care_expenses   0.45
## food_environment_index                                      0.03
## percent_fair_or_poor_health                                 0.15
## percent_unemployed                                          0.10
## percent_children_in_poverty                                 0.26
## percent_severe_housing_problems                             0.29
## percent_completed_high_school                               0.19
## percent_food_insecure                                       0.12
## percent_frequent_mental_distress                            0.11
## percent_uninsured_children                                  0.08
## percent_disconnected_youth                                  0.22
## spending_per_pupil                                        292.99
## school_funding_adequacy                                   311.17
## high_school_graduation_rate                                 0.24
## median_household_income                                   851.15
## gender_pay_gap                                              0.00
## percent_enrolled_in_free_or_reduced_lunch                   0.53
## percent_households_with_severe_cost_burden                  0.32
## percent_rural                                               1.48
## percent_65_and_over                                         0.16
## percent_not_proficient_in_english                           0.17
## segregation_index                                           1.43
## teen_birth_rate                                             0.32
## percent_children_in_single_parent_households                0.43
## percent_low_birthweight                                     0.05
## percent_black                                               0.34
## percent_children_in_single_parent_households.x              0.45
## percent_children_in_single_parent_households.y              0.43
## rural_urban                                                   NA

Missingness Visualized

ggplot(missing_table, aes(x = reorder(Column, -MissingPercentage), y = MissingPercentage)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() + 
  labs(
    title = "Percentage of Missing Data by Variable",
    x = "Variable",
    y = "Missingness %"
  ) +
  theme_minimal()

##Numeric Distribution

Quick sapply function to look at histogram distributions for all of the numeric variables in the dataset
The target variable percent_food_insecure is normally distributed

library(ggplot2)
numeric_columns <- sapply(final, is.numeric)
final_numeric <- final[, numeric_columns]

for (col in names(final_numeric)) {
  print(
    ggplot(final, aes_string(x = col)) +
      geom_histogram(fill = "steelblue", bins = 30, color = "black") +
      labs(title = paste("Distribution of", col), x = col, y = "Frequency") +
      theme_minimal()
  )
}

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 63 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 59 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 143 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 202 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 126 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 64 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 63 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).

By Year:

Mean, median and standard deviation were calculated for each variable and then grouped by year.

library(dplyr)

yearly_summary <- final %>%
  group_by(year) %>%
  summarise(across(where(is.numeric), list(mean = mean, median = median, sd = sd), na.rm = TRUE))

## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(...)`.
## ℹ In group 1: `year = 2020`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))

print(yearly_summary)

## # A tibble: 6 × 88
##    year fips_mean fips_median fips_sd percent_household_income_required_for_ch…¹
##   <int>     <dbl>       <int>   <dbl>                                      <dbl>
## 1  2020    36061.       36061    36.6                                      NaN  
## 2  2021    36061.       36061    36.6                                      NaN  
## 3  2022    36061.       36061    36.6                                      NaN  
## 4  2023    36061.       36061    36.6                                       32.2
## 5  2024    36061.       36061    36.6                                       39.0
## 6  2025    36061.       36061    36.6                                       37.6
## # ℹ abbreviated name:
## #   ¹percent_household_income_required_for_child_care_expenses_mean
## # ℹ 83 more variables:
## #   percent_household_income_required_for_child_care_expenses_median <int>,
## #   percent_household_income_required_for_child_care_expenses_sd <dbl>,
## #   food_environment_index_mean <dbl>, food_environment_index_median <dbl>,
## #   food_environment_index_sd <dbl>, percent_fair_or_poor_health_mean <dbl>, …

By County

Same thing for county level averages, median and sd for each var.

region_summary <- final %>%
  group_by(county.x, year) %>%
  summarise(across(where(is.numeric), list(mean = mean, median = median, sd = sd), na.rm = TRUE))

## `summarise()` has grouped output by 'county.x'. You can override using the
## `.groups` argument.

print(region_summary)

## # A tibble: 378 × 89
## # Groups:   county.x [63]
##    county.x  year fips_mean fips_median fips_sd percent_household_income_requi…¹
##    <chr>    <int>     <dbl>       <int>   <dbl>                            <dbl>
##  1 Albany    2020     36001       36001      NA                              NaN
##  2 Albany    2021     36001       36001      NA                              NaN
##  3 Albany    2022     36001       36001      NA                              NaN
##  4 Albany    2023     36001       36001      NA                               35
##  5 Albany    2024     36001       36001      NA                               41
##  6 Albany    2025     36001       36001      NA                               37
##  7 Allegany  2020     36003       36003      NA                              NaN
##  8 Allegany  2021     36003       36003      NA                              NaN
##  9 Allegany  2022     36003       36003      NA                              NaN
## 10 Allegany  2023     36003       36003      NA                               34
## # ℹ 368 more rows
## # ℹ abbreviated name:
## #   ¹percent_household_income_required_for_child_care_expenses_mean
## # ℹ 83 more variables:
## #   percent_household_income_required_for_child_care_expenses_median <int>,
## #   percent_household_income_required_for_child_care_expenses_sd <dbl>,
## #   food_environment_index_mean <dbl>, food_environment_index_median <dbl>, …

Yearly Line Graph of Food Insecurity

Despite this graph being messy, there is a point to it.
A trend appears, there is an overall shift down in food insecurity from 2023 -> 2024, followed by a shift upward in 2025, past where is was in the previous 4 years for most counties.
Additionally, the Bronx is an outlier for food insecurity showing 1. a much higher starting point in 2020, but also the increase over the five years is sharp.

library(ggplot2)

ggplot(region_summary, aes(x = year, y = percent_food_insecure_mean, color = county.x)) +
  geom_line() +
  labs(title = "Yearly Food Insecurity Across New York State Counties", x = "Year", y = "Food Insecurity") +
  theme_minimal()

Average trend in food insecurity across NYS only.

ggplot(region_summary, aes(x = year, y = percent_food_insecure_mean)) +
  geom_line() +
  labs(title = "Yearly Average Food Insecurity Across New York State", x = "Year", y = "Value") +
  theme_minimal()

ggplot(final, aes(x = county.x, y = percent_food_insecure, color = rural_urban)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "County Level Comparison of Food Insecurity In New York State", x = "County", y = "Food Insecurity Percentage") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 7))

Saving out final csv for cross use in my python code (for LSTM) and the xGBoost.

final_csv <- write_csv(final, "C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2025\\DATA 698\\Masters Project\\final_data.csv")

Model Testing:

XGBoost 1:

This first XGBoost model aims to be a lighter weight model, only selecting 10 variables from the dataset. Additionally, I only created two lag features and recoded my categorical rural_urban column as numeric

library(tidyverse)
library(xgboost)

## Warning: package 'xgboost' was built under R version 4.3.3

## 
## Attaching package: 'xgboost'

## The following object is masked from 'package:dplyr':
## 
##     slice

library(caret)

## Warning: package 'caret' was built under R version 4.3.3

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

data <- read.csv("C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2025\\DATA 698\\Masters Project\\final_data.csv")

df <- data %>%
  select(year, fips, county.x, percent_food_insecure,
    percent_unemployed, percent_children_in_poverty,
    median_household_income, percent_fair_or_poor_health,
    high_school_graduation_rate, rural_urban
  ) %>%
  drop_na(percent_food_insecure)  #just making sure there are no na's left in for this XGBoost model

Create Lag Features and code rural_urban on

Using a standard 1 and 2 year lag variable for the entire model

df <- df %>%
  arrange(fips, year) %>% 
  group_by(fips) %>%
  mutate(
    food_insecure_lag1 = lag(percent_food_insecure, 1),  # Previous year
    food_insecure_lag2 = lag(percent_food_insecure, 2)   # Two years back
  ) %>%
  ungroup()

# recode rural_urban as numeric
df <- df %>%
  mutate(rural_urban = as.factor(rural_urban),
         rural_urban = as.numeric(rural_urban))

Split Data

train <- df %>% filter(year < 2024)
test <- df %>% filter(year == 2024)

# dropping out the non-important features, including the dependent variable
X_train <- train %>% select(-c(year, fips, county.x, percent_food_insecure))
y_train <- train$percent_food_insecure

X_test <- test %>% select(-c(year, fips, county.x, percent_food_insecure))
y_test <- test$percent_food_insecure

Experiment 1: Train Model

# cross val
ctrl <- trainControl(
  method = "cv",
  number = 10,
  verboseIter = TRUE
)

# Standard xgboost tuning grid
xgb_grid <- expand.grid(
  nrounds = 100,
  max_depth = 6,
  eta = 0.3,
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)


xgb_model <- train(
  x = X_train,
  y = y_train,
  method = "xgbTree",
  trControl = ctrl,
  tuneGrid = xgb_grid,
  verbosity = 0
)

## + Fold01: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1

## Warning: Setting row names on a tibble is deprecated.

## - Fold01: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold02: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1

## Warning: Setting row names on a tibble is deprecated.

## - Fold02: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold03: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1

## Warning: Setting row names on a tibble is deprecated.

## - Fold03: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold04: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1

## Warning: Setting row names on a tibble is deprecated.

## - Fold04: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold05: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1

## Warning: Setting row names on a tibble is deprecated.

## - Fold05: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold06: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1

## Warning: Setting row names on a tibble is deprecated.

## - Fold06: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold07: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1

## Warning: Setting row names on a tibble is deprecated.

## - Fold07: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold08: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1

## Warning: Setting row names on a tibble is deprecated.

## - Fold08: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold09: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1

## Warning: Setting row names on a tibble is deprecated.

## - Fold09: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold10: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1

## Warning: Setting row names on a tibble is deprecated.

## - Fold10: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## Aggregating results
## Fitting final model on full training set

## Warning: Setting row names on a tibble is deprecated.

print(xgb_model)

## eXtreme Gradient Boosting 
## 
## 252 samples
##   8 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 226, 227, 228, 227, 228, 226, ... 
## Resampling results:
## 
##   RMSE       Rsquared  MAE      
##   0.9396357  0.782687  0.7290635
## 
## Tuning parameter 'nrounds' was held constant at a value of 100
## Tuning
##  held constant at a value of 1
## Tuning parameter 'subsample' was held
##  constant at a value of 1

Apply Experiment 1 Model:

# predicting 2024 data
X_2024 <- df %>% 
  filter(year == 2024) %>%
  select(-c(year, fips, county.x, percent_food_insecure))

# Predicting 2024 food insecurity
predictions_2024 <- predict(xgb_model, newdata = X_2024)

# bringing in county and fips codes 
results <- df %>%
  filter(year == 2024) %>%
  select(fips, county.x) %>%
  mutate(pred_2024_food_insecure = predictions_2024)

# View predictions
print(results)

## # A tibble: 63 × 3
##     fips county.x    pred_2024_food_insecure
##    <int> <chr>                         <dbl>
##  1 36000 Total                         10.0 
##  2 36001 Albany                         9.34
##  3 36003 Allegany                      13.1 
##  4 36005 Bronx                         17.9 
##  5 36007 Broome                        12.3 
##  6 36009 Cattaraugus                   12.7 
##  7 36011 Cayuga                        10.5 
##  8 36013 Chautauqua                    13.2 
##  9 36015 Chemung                       11.9 
## 10 36017 Chenango                      10.6 
## # ℹ 53 more rows

Experiment 1: Predicted vs Actual Food Insecurity

# Predict 2024 (for validPrediPredir)
pred_2024 <- predict(xgb_model, newdata = X_test)

# Calculate RMSE
rmse <- sqrt(mean((pred_2024 - y_test)^2))
print(paste("RMSE:", rmse))

## [1] "RMSE: 1.07624761613868"

ggplot(data.frame(Actual = y_test, Predicted = pred_2024), aes(Actual, Predicted)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  labs(title = "Actual vs. Predicted Food Insecurity (2024)")

Experiment 1 Data Metrics Table

library(caret)
library(Metrics)

## Warning: package 'Metrics' was built under R version 4.3.3

## 
## Attaching package: 'Metrics'

## The following objects are masked from 'package:caret':
## 
##     precision, recall

test_predictions <- predict(xgb_model, newdata = X_test)
actual_values <- y_test 

metrics_table <- data.frame(
  RMSE = RMSE(test_predictions, actual_values),
  MAPE = mape(actual_values, test_predictions) * 100,
  MSE = mean((test_predictions - actual_values)^2),
  R2 = R2(test_predictions, actual_values)
)

print(metrics_table)

##       RMSE    MAPE      MSE        R2
## 1 1.076248 10.3129 1.158309 0.8556726

XGBoost 2:

XGBoost model 2 brings in a few different changes to the model, the lag variables are still created but this time the model is tuned to be faster.
By dropping the number of folds from 10 to 5 this should cut the number of boosted models ran for this experiment.

library(tidyverse)
library(xgboost)
library(caret)


data <- read.csv("C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2025\\DATA 698\\Masters Project\\final_data.csv")


df <- data %>%
  select(where(~ all(!is.na(.)))) %>%
  drop_na(percent_food_insecure)

df <- df %>%
  arrange(fips, year) %>% 
  group_by(fips) %>%
  mutate(
    food_insecure_lag1 = lag(percent_food_insecure, 1),  # Previous year
    food_insecure_lag2 = lag(percent_food_insecure, 2)   # Two years back
  ) %>%
  select(-food_environment_index) %>% 
  ungroup()


df <- df %>%
  mutate(rural_urban = as.factor(rural_urban),
         rural_urban = as.numeric(rural_urban))

train <- df %>% filter(year < 2024)
test <- df %>% filter(year == 2024)


X_train <- train %>% select(-c(year, fips, county.x, percent_food_insecure, state.x))
y_train <- train$percent_food_insecure

X_test <- test %>% select(-c(year, fips, county.x, percent_food_insecure, state.x))
y_test <- test$percent_food_insecure

ctrl_fast <- trainControl(
  method = "cv",
  number = 5,  # Reduced from 10 folds
  verboseIter = TRUE,
  allowParallel = TRUE  # Enable parallel processing
)

# Tuning grid set
xgb_grid_fast <- expand.grid(
  nrounds = c(100, 150, 200, 300),          # Reduced options
  max_depth = c(4, 6, 8),            # Most useful range
  eta = c(0.05, 0.1, .15),            # Optimal learning rates
  gamma = 0,                     
  colsample_bytree = 0.8,       
  min_child_weight = 1,          
  subsample = 0.8               
)

# Parallel processing to speed things up
library(doParallel)

## Loading required package: foreach

## 
## Attaching package: 'foreach'

## The following objects are masked from 'package:purrr':
## 
##     accumulate, when

## Loading required package: iterators

## Loading required package: parallel

cl <- makePSOCKcluster(4)
registerDoParallel(cl)

xgb_fast <- train(
  x = X_train,
  y = y_train,
  method = "xgbTree",
  trControl = ctrl_fast,
  tuneGrid = xgb_grid_fast,
  verbosity = 0,
  metric = "RMSE"
)

## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 300, max_depth = 4, eta = 0.05, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1, subsample = 0.8 on full training set

stopCluster(cl)


print(xgb_fast)

## eXtreme Gradient Boosting 
## 
## 252 samples
##  13 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 201, 202, 201, 202, 202 
## Resampling results across tuning parameters:
## 
##   eta   max_depth  nrounds  RMSE       Rsquared   MAE      
##   0.05  4          100      0.8781404  0.8355296  0.6707104
##   0.05  4          150      0.8540096  0.8402291  0.6462976
##   0.05  4          200      0.8453597  0.8426345  0.6345746
##   0.05  4          300      0.8451813  0.8424344  0.6312022
##   0.05  6          100      0.8923034  0.8322891  0.6678132
##   0.05  6          150      0.8754636  0.8340372  0.6504066
##   0.05  6          200      0.8773633  0.8331453  0.6492893
##   0.05  6          300      0.8772838  0.8334252  0.6496573
##   0.05  8          100      0.8974352  0.8291845  0.6731898
##   0.05  8          150      0.8816944  0.8291804  0.6630502
##   0.05  8          200      0.8796597  0.8293649  0.6624847
##   0.05  8          300      0.8796704  0.8292728  0.6624413
##   0.10  4          100      0.8648257  0.8364921  0.6439529
##   0.10  4          150      0.8642269  0.8362099  0.6417462
##   0.10  4          200      0.8645774  0.8362313  0.6412801
##   0.10  4          300      0.8656470  0.8356974  0.6410426
##   0.10  6          100      0.8825171  0.8286300  0.6591104
##   0.10  6          150      0.8806580  0.8292312  0.6567807
##   0.10  6          200      0.8797769  0.8296094  0.6567192
##   0.10  6          300      0.8796946  0.8296352  0.6566413
##   0.10  8          100      0.8979305  0.8253533  0.6732380
##   0.10  8          150      0.8972889  0.8256395  0.6722391
##   0.10  8          200      0.8970448  0.8257292  0.6718342
##   0.10  8          300      0.8970231  0.8257341  0.6718000
##   0.15  4          100      0.8607434  0.8359365  0.6525811
##   0.15  4          150      0.8637856  0.8354407  0.6524942
##   0.15  4          200      0.8638682  0.8354469  0.6534405
##   0.15  4          300      0.8643632  0.8352971  0.6540152
##   0.15  6          100      0.8865801  0.8275857  0.6848609
##   0.15  6          150      0.8874475  0.8272966  0.6852978
##   0.15  6          200      0.8874683  0.8272873  0.6853752
##   0.15  6          300      0.8874747  0.8272835  0.6853918
##   0.15  8          100      0.8980507  0.8237776  0.6916497
##   0.15  8          150      0.8981906  0.8237612  0.6917727
##   0.15  8          200      0.8981609  0.8237745  0.6917438
##   0.15  8          300      0.8981608  0.8237731  0.6917376
## 
## Tuning parameter 'gamma' was held constant at a value of 0
## Tuning
## 
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## 
## Tuning parameter 'subsample' was held constant at a value of 0.8
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 300, max_depth = 4, eta
##  = 0.05, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1 and
##  subsample = 0.8.

plot(xgb_fast)

library(doParallel)
library(Metrics)
cl <- makePSOCKcluster(4)
registerDoParallel(cl)


ctrl_fast <- trainControl(
  method = "cv",
  number = 5,  # Reduced from 10 folds
  verboseIter = TRUE,
  allowParallel = TRUE,  # Enable parallel processing
  savePredictions = "final"
)

xgb_grid_fast <- expand.grid(
  nrounds = c(100, 150, 200, 300),
  max_depth = c(4, 6, 8),
  eta = c(0.05, 0.1, 0.15),
  gamma = 0,
  colsample_bytree = 0.8,
  min_child_weight = 1,
  subsample = 0.8
)

# Train the  model
xgb_optimized <- train(
  x = X_train,
  y = y_train,
  method = "xgbTree",
  trControl = ctrl_fast,
  tuneGrid = xgb_grid_fast,
  verbosity = 0,
  metric = "RMSE"
)

## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 100, max_depth = 6, eta = 0.15, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1, subsample = 0.8 on full training set

## Warning: Setting row names on a tibble is deprecated.

stopCluster(cl)


print(xgb_optimized)

## eXtreme Gradient Boosting 
## 
## 252 samples
##  13 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 200, 201, 202, 203, 202 
## Resampling results across tuning parameters:
## 
##   eta   max_depth  nrounds  RMSE       Rsquared   MAE      
##   0.05  4          100      0.9011120  0.8215231  0.6883451
##   0.05  4          150      0.8883722  0.8238125  0.6832788
##   0.05  4          200      0.8871622  0.8248628  0.6815495
##   0.05  4          300      0.8860341  0.8259349  0.6812098
##   0.05  6          100      0.8703363  0.8340119  0.6657812
##   0.05  6          150      0.8621970  0.8342835  0.6608138
##   0.05  6          200      0.8620012  0.8347651  0.6608094
##   0.05  6          300      0.8631944  0.8346652  0.6618495
##   0.05  8          100      0.9131806  0.8173697  0.7086733
##   0.05  8          150      0.9046190  0.8175442  0.7055069
##   0.05  8          200      0.9046224  0.8174260  0.7065193
##   0.05  8          300      0.9061115  0.8170087  0.7079110
##   0.10  4          100      0.8734851  0.8303399  0.6738947
##   0.10  4          150      0.8790684  0.8286507  0.6803869
##   0.10  4          200      0.8830418  0.8275311  0.6840421
##   0.10  4          300      0.8849291  0.8270416  0.6873362
##   0.10  6          100      0.9129718  0.8158589  0.7000536
##   0.10  6          150      0.9154483  0.8151201  0.7024064
##   0.10  6          200      0.9157415  0.8151300  0.7026213
##   0.10  6          300      0.9159481  0.8151109  0.7027467
##   0.10  8          100      0.8852688  0.8266064  0.6896946
##   0.10  8          150      0.8863866  0.8262701  0.6895852
##   0.10  8          200      0.8867005  0.8261605  0.6898820
##   0.10  8          300      0.8868083  0.8261222  0.6900218
##   0.15  4          100      0.9059950  0.8168906  0.7002083
##   0.15  4          150      0.9104842  0.8148878  0.7046985
##   0.15  4          200      0.9115235  0.8147664  0.7060750
##   0.15  4          300      0.9118689  0.8147551  0.7064888
##   0.15  6          100      0.8571253  0.8368995  0.6665441
##   0.15  6          150      0.8575728  0.8368456  0.6669733
##   0.15  6          200      0.8577060  0.8367976  0.6672284
##   0.15  6          300      0.8577302  0.8367929  0.6672692
##   0.15  8          100      0.8979386  0.8207182  0.7209347
##   0.15  8          150      0.8978211  0.8207909  0.7207613
##   0.15  8          200      0.8978151  0.8207947  0.7207717
##   0.15  8          300      0.8978187  0.8207937  0.7207718
## 
## Tuning parameter 'gamma' was held constant at a value of 0
## Tuning
## 
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## 
## Tuning parameter 'subsample' was held constant at a value of 0.8
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 100, max_depth = 6, eta
##  = 0.15, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1 and
##  subsample = 0.8.

plot(xgb_optimized)

best_params <- xgb_optimized$bestTune


X_2024 <- df %>% 
  filter(year == 2024) %>%
  select(-c(year, fips, county.x, percent_food_insecure)) %>%
 
  mutate(
    rural_urban = as.numeric(factor(rural_urban))
  )


predictions_2024 <- predict(xgb_optimized, newdata = X_2024)


results_2024 <- df %>%
  filter(year == 2024) %>%
  select(fips, county.x, percent_food_insecure) %>%  
  mutate(
    pred_2024_food_insecure = predictions_2024,
    predicted_change = pred_2024_food_insecure - percent_food_insecure,
    risk_category = case_when(
      pred_2024_food_insecure > quantile(predictions_2024, 0.75) ~ "High Risk",
      pred_2024_food_insecure > quantile(predictions_2024, 0.5) ~ "Medium Risk",
      TRUE ~ "Low Risk"
    )
  ) %>%
  arrange(desc(pred_2024_food_insecure)) 


write.csv(
  results_2024,
  file = paste0("food_insecurity_predictions_", format(Sys.Date(), "%Y%m%d"), ".csv"),
  row.names = FALSE
)


library(ggplot2)
results_2024 %>%
  head(10) %>%
  ggplot(aes(x = reorder(county.x, -pred_2024_food_insecure), y = pred_2024_food_insecure)) +
  geom_col(fill = "firebrick") +
  labs(title = "Top 10 High-Risk Counties for 2024 Food Insecurity",
       x = "County",
       y = "Predicted Food Insecurity Rate") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(caret)
library(Metrics)

test_predictions <- predict(xgb_optimized, newdata = X_test)
actual_values <- y_test 

metrics_table <- data.frame(
  RMSE = RMSE(test_predictions, actual_values),
  MAPE = mape(actual_values, test_predictions) * 100,
  MSE = mean((test_predictions - actual_values)^2),
  R2 = R2(test_predictions, actual_values)
)

print(metrics_table)

##       RMSE    MAPE      MSE        R2
## 1 1.098412 10.7249 1.206509 0.8872072

library(caret)

perm_imp <- varImp(
  xgb_optimized,
  scale = TRUE,  
  type = "permutation",  
  numPermutations = 30  
)

ggplot(perm_imp, aes(x = reorder(rownames(perm_imp), Overall), y = Overall)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(x = "Features", y = "Importance (Permutation)", 
       title = "Permutation Importance") +
  theme_minimal()

## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

perm_imp_plot <- ggplot(perm_imp, aes(x = reorder(rownames(perm_imp), Overall), y = Overall)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(x = "Features", y = "Importance (Permutation)", 
       title = "Permutation Importance (XGBoost Experiment 3)") +
  theme_minimal()

## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

ggsave(
  filename = "permutation_importance.png",
  plot = perm_imp_plot,
  device = "png",
  width = 8,         
  height = 6,         
  units = "in",       
  dpi = 300,          
  bg = "white"        
)

XGBoost 3:

This XGBoost model will attempt to do four things: * Make sure the lag variables are actually backward-looking by adjusting how they are set. * Address overfitting by increasing regularization * increase early stopping. * Lastly, add in more interaction terms.

Imputation Step:

library(tidyverse)
library(mice)  # For advanced imputation

## Warning: package 'mice' was built under R version 4.3.3

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

data <- read.csv("C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2025\\DATA 698\\Masters Project\\final_data.csv")

# This sapply function runs through the data and calculates the missing percentage across all columns
missing_pct <- sapply(data, function(x) mean(is.na(x)) * 100)

# 2. Rules I set (arbitrary) to decide what columns to keep and impute and which ones to drop
cols_to_impute <- names(missing_pct[missing_pct <= 25 & missing_pct > 0])
cols_to_drop <- names(missing_pct[missing_pct > 25])

# 3. Drop columns with 25% or more missingness
data_clean <- data %>% 
  select(-all_of(cols_to_drop))

# 4.
# Numeric columns: median imputation
num_cols <- data_clean %>% 
  select(where(is.numeric)) %>% 
  names() %>% 
  intersect(cols_to_impute)

data_clean <- data_clean %>%
  mutate(across(all_of(num_cols), ~ifelse(is.na(.), median(., na.rm = TRUE), .)))

# Categorical Columns: mode imputation
cat_cols <- data_clean %>% 
  select(where(is.character) | where(is.factor)) %>% 
  names() %>% 
  intersect(cols_to_impute)

get_mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

data_clean <- data_clean %>%
  mutate(across(all_of(cat_cols), ~ifelse(is.na(.), get_mode(.), .)))

# 5. Drop rows with >25% missingness in remaining columns
row_missing_pct <- apply(data_clean, 1, function(x) mean(is.na(x)) * 100)
data_final <- data_clean[row_missing_pct <= 25, ]

Combination variable creation

df <- data_final %>%
  arrange(fips, year) %>% 
  group_by(fips) %>%
  mutate(
    food_insecure_lag1 = lag(percent_food_insecure, 1), 
    food_insecure_lag2 = lag(percent_food_insecure, 2)   
  ) %>%
  select(-food_environment_index) %>% 
  ungroup()


df <- df %>%
  mutate(rural_urban = as.factor(rural_urban),
         rural_urban = as.numeric(rural_urban),
    economic_stress = (
      percent_unemployed + 
      percent_children_in_poverty + 
      percent_severe_housing_problems
    ) / 3)

train <- df %>% filter(year < 2024)
test <- df %>% filter(year == 2024)

X_train <- train %>% select(-c(year, fips, county.x, percent_food_insecure, state.x))
y_train <- train$percent_food_insecure

X_test <- test %>% select(-c(year, fips, county.x, percent_food_insecure, state.x))
y_test <- test$percent_food_insecure

ctrl_fast <- trainControl(
  method = "cv",
  number = 5,  # Reduce from 10 folds
  verboseIter = TRUE,
  allowParallel = TRUE  
)

# Focused tuning grid
xgb_grid_fast <- expand.grid(
  nrounds = c(100, 150),          # Reduced amount of options for nrounds
  max_depth = c(3, 4),            # drop down from 6 and 8 tree depth
  eta = c(0.01, 0.05),            # dropped down from 3 set options to 2
  gamma = c(0,1),                     # 0-1 to add regularization to help overfitting
  colsample_bytree = 0.8,         
  min_child_weight = 2,          # Increased from 1 to help prevent overfitting
  subsample = 0.8                # Fixed
)


library(doParallel)
cl <- makePSOCKcluster(4) 
registerDoParallel(cl)

xgb_fast <- train(
  x = X_train,
  y = y_train,
  method = "xgbTree",
  trControl = ctrl_fast,
  tuneGrid = xgb_grid_fast,
  verbosity = 0,                 
  metric = "RMSE"
)

## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 150, max_depth = 3, eta = 0.05, gamma = 0, colsample_bytree = 0.8, min_child_weight = 2, subsample = 0.8 on full training set

stopCluster(cl)


print(xgb_fast)

## eXtreme Gradient Boosting 
## 
## 252 samples
##  21 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 201, 201, 202, 201, 203 
## Resampling results across tuning parameters:
## 
##   eta   max_depth  gamma  nrounds  RMSE       Rsquared   MAE      
##   0.01  3          0      100      4.2379903  0.7198368  3.9998665
##   0.01  3          0      150      2.7371900  0.7785458  2.4712494
##   0.01  3          1      100      4.2328913  0.7198437  3.9916388
##   0.01  3          1      150      2.7387545  0.7705294  2.4669134
##   0.01  4          0      100      4.2374726  0.7277374  3.9988035
##   0.01  4          0      150      2.7326659  0.7846968  2.4708225
##   0.01  4          1      100      4.2310054  0.7229323  3.9910180
##   0.01  4          1      150      2.7352692  0.7767372  2.4674334
##   0.05  3          0      100      0.8615489  0.8491815  0.6658737
##   0.05  3          0      150      0.8560428  0.8509305  0.6553860
##   0.05  3          1      100      0.8801298  0.8424100  0.6831377
##   0.05  3          1      150      0.8777957  0.8415492  0.6771662
##   0.05  4          0      100      0.8782500  0.8442661  0.6767610
##   0.05  4          0      150      0.8705214  0.8452782  0.6631440
##   0.05  4          1      100      0.8682171  0.8465492  0.6743046
##   0.05  4          1      150      0.8657537  0.8461813  0.6723059
## 
## Tuning parameter 'colsample_bytree' was held constant at a value of 0.8
## 
## Tuning parameter 'min_child_weight' was held constant at a value of 2
## 
## Tuning parameter 'subsample' was held constant at a value of 0.8
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 150, max_depth = 3, eta
##  = 0.05, gamma = 0, colsample_bytree = 0.8, min_child_weight = 2 and
##  subsample = 0.8.

plot(xgb_fast)

library(doParallel)
library(Metrics)
cl <- makePSOCKcluster(4)  
registerDoParallel(cl)

# Reduced tuning grid with most impactful parameters
ctrl_fast <- trainControl(
  method = "cv",
  number = 5,  # Reduced from 10 folds
  verboseIter = TRUE,
  allowParallel = TRUE,  
  savePredictions = "final"
)


xgb_grid_fast <- expand.grid(
  nrounds = c(100, 150, 200, 300),
  max_depth = c(4, 6, 8), # Back to 4,6,8 because 3,4 RMSE was not great
  eta = c(0.05, 0.1, 0.15),
  gamma = c(0,1),
  colsample_bytree = 0.8,
  min_child_weight = 2,
  subsample = 0.8
)


xgb_optimized <- train(
  x = X_train,
  y = y_train,
  method = "xgbTree",
  trControl = ctrl_fast,
  tuneGrid = xgb_grid_fast,
  verbosity = 0,
  metric = "RMSE"
)

## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 300, max_depth = 6, eta = 0.05, gamma = 1, colsample_bytree = 0.8, min_child_weight = 2, subsample = 0.8 on full training set

## Warning: Setting row names on a tibble is deprecated.

stopCluster(cl)


print(xgb_optimized)

## eXtreme Gradient Boosting 
## 
## 252 samples
##  21 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 203, 201, 202, 201, 201 
## Resampling results across tuning parameters:
## 
##   eta   max_depth  gamma  nrounds  RMSE       Rsquared   MAE      
##   0.05  4          0      100      0.8907448  0.8225203  0.6889024
##   0.05  4          0      150      0.8878283  0.8264637  0.6838817
##   0.05  4          0      200      0.8863493  0.8280422  0.6821354
##   0.05  4          0      300      0.8864141  0.8289281  0.6800962
##   0.05  4          1      100      0.8692329  0.8312319  0.6823221
##   0.05  4          1      150      0.8628717  0.8335070  0.6767554
##   0.05  4          1      200      0.8607236  0.8345670  0.6753222
##   0.05  4          1      300      0.8590330  0.8359100  0.6726605
##   0.05  6          0      100      0.8685797  0.8322320  0.6704659
##   0.05  6          0      150      0.8688771  0.8321535  0.6713056
##   0.05  6          0      200      0.8741929  0.8311496  0.6731855
##   0.05  6          0      300      0.8789708  0.8304619  0.6718609
##   0.05  6          1      100      0.8624194  0.8340334  0.6754635
##   0.05  6          1      150      0.8526616  0.8372510  0.6710308
##   0.05  6          1      200      0.8536788  0.8370835  0.6713847
##   0.05  6          1      300      0.8522603  0.8373855  0.6699777
##   0.05  8          0      100      0.8674734  0.8339552  0.6669022
##   0.05  8          0      150      0.8653034  0.8342388  0.6655097
##   0.05  8          0      200      0.8694120  0.8337656  0.6643586
##   0.05  8          0      300      0.8751872  0.8322398  0.6644406
##   0.05  8          1      100      0.8780920  0.8293627  0.6854085
##   0.05  8          1      150      0.8743516  0.8302680  0.6825640
##   0.05  8          1      200      0.8723100  0.8312949  0.6803162
##   0.05  8          1      300      0.8679497  0.8329429  0.6778601
##   0.10  4          0      100      0.9061672  0.8178920  0.6942083
##   0.10  4          0      150      0.9014654  0.8204167  0.6874965
##   0.10  4          0      200      0.9021623  0.8203703  0.6867485
##   0.10  4          0      300      0.9003716  0.8211243  0.6854194
##   0.10  4          1      100      0.8933117  0.8237149  0.7006024
##   0.10  4          1      150      0.8943248  0.8240572  0.6994523
##   0.10  4          1      200      0.8936999  0.8243502  0.6969232
##   0.10  4          1      300      0.8922653  0.8255438  0.6953046
##   0.10  6          0      100      0.9053462  0.8152095  0.6948502
##   0.10  6          0      150      0.9090872  0.8149734  0.6941957
##   0.10  6          0      200      0.9095758  0.8150010  0.6934913
##   0.10  6          0      300      0.9102890  0.8148544  0.6937459
##   0.10  6          1      100      0.8640531  0.8345145  0.6648236
##   0.10  6          1      150      0.8666107  0.8340867  0.6657681
##   0.10  6          1      200      0.8656854  0.8344951  0.6648222
##   0.10  6          1      300      0.8663870  0.8344562  0.6657586
##   0.10  8          0      100      0.9000383  0.8175953  0.6993600
##   0.10  8          0      150      0.9005137  0.8178369  0.6969322
##   0.10  8          0      200      0.9013356  0.8176193  0.6970689
##   0.10  8          0      300      0.9013389  0.8177036  0.6966672
##   0.10  8          1      100      0.8722699  0.8296108  0.6763781
##   0.10  8          1      150      0.8775351  0.8285041  0.6793858
##   0.10  8          1      200      0.8757207  0.8292867  0.6782922
##   0.10  8          1      300      0.8753224  0.8293795  0.6782871
##   0.15  4          0      100      0.8908477  0.8274183  0.6858406
##   0.15  4          0      150      0.8883673  0.8282211  0.6822434
##   0.15  4          0      200      0.8892147  0.8277253  0.6817583
##   0.15  4          0      300      0.8885816  0.8278455  0.6810660
##   0.15  4          1      100      0.8802390  0.8231812  0.6916522
##   0.15  4          1      150      0.8801942  0.8240044  0.6900426
##   0.15  4          1      200      0.8787756  0.8245382  0.6888212
##   0.15  4          1      300      0.8750942  0.8251346  0.6849653
##   0.15  6          0      100      0.9089206  0.8161500  0.6967861
##   0.15  6          0      150      0.9085901  0.8164424  0.6950970
##   0.15  6          0      200      0.9084784  0.8164772  0.6947123
##   0.15  6          0      300      0.9084124  0.8164925  0.6946573
##   0.15  6          1      100      0.8838684  0.8304600  0.6962923
##   0.15  6          1      150      0.8824853  0.8313396  0.6947185
##   0.15  6          1      200      0.8834375  0.8307777  0.6949300
##   0.15  6          1      300      0.8819700  0.8315343  0.6941799
##   0.15  8          0      100      0.9112751  0.8178064  0.6991255
##   0.15  8          0      150      0.9137724  0.8171472  0.6994128
##   0.15  8          0      200      0.9139029  0.8171118  0.6992585
##   0.15  8          0      300      0.9140759  0.8170649  0.6992819
##   0.15  8          1      100      0.8935081  0.8231672  0.6978100
##   0.15  8          1      150      0.8921961  0.8231839  0.6964752
##   0.15  8          1      200      0.8928237  0.8231334  0.6972326
##   0.15  8          1      300      0.8873299  0.8251900  0.6960735
## 
## Tuning parameter 'colsample_bytree' was held constant at a value of 0.8
## 
## Tuning parameter 'min_child_weight' was held constant at a value of 2
## 
## Tuning parameter 'subsample' was held constant at a value of 0.8
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 300, max_depth = 6, eta
##  = 0.05, gamma = 1, colsample_bytree = 0.8, min_child_weight = 2 and
##  subsample = 0.8.

plot(xgb_optimized)

best_params <- xgb_optimized$bestTune

X_2024 <- df %>% 
  filter(year == 2024) %>%
  select(-c(year, fips, county.x, percent_food_insecure)) %>%
  mutate(
    rural_urban = as.numeric(factor(rural_urban))
  )

predictions_2024 <- predict(xgb_optimized, newdata = X_2024)


results_2024 <- df %>%
  filter(year == 2024) %>%
  select(fips, county.x, percent_food_insecure) %>%  
  mutate(
    pred_2024_food_insecure = predictions_2024,
    predicted_change = pred_2024_food_insecure - percent_food_insecure,
    risk_category = case_when(
      pred_2024_food_insecure > quantile(predictions_2024, 0.75) ~ "High Risk",
      pred_2024_food_insecure > quantile(predictions_2024, 0.5) ~ "Medium Risk",
      TRUE ~ "Low Risk"
    )
  ) %>%
  arrange(desc(pred_2024_food_insecure))


write.csv(
  results_2024,
  file = paste0("food_insecurity_predictions_", format(Sys.Date(), "%Y%m%d"), ".csv"),
  row.names = FALSE
)


library(ggplot2)
results_2024 %>%
  head(10) %>%
  ggplot(aes(x = reorder(county.x, -pred_2024_food_insecure), y = pred_2024_food_insecure)) +
  geom_col(fill = "firebrick") +
  labs(title = "Top 10 High-Risk Counties for 2024 Food Insecurity",
       x = "County",
       y = "Predicted Food Insecurity Rate") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

* The final values used for the model were nrounds = 300, max_depth = 4, eta = 0.05, gamma = 0, colsample_bytree = 0.8, min_child_weight = 2 and subsample = 0.8.

library(caret)
library(Metrics)

test_predictions <- predict(xgb_optimized, newdata = X_test)
actual_values <- y_test 

metrics_table <- data.frame(
  RMSE = RMSE(test_predictions, actual_values),
  MAPE = mape(actual_values, test_predictions) * 100,
  MSE = mean((test_predictions - actual_values)^2),
  R2 = R2(test_predictions, actual_values)
)

print(metrics_table)

##       RMSE     MAPE      MSE        R2
## 1 1.051221 10.54083 1.105066 0.9227965

Geospacial graphing for the best performing XGBoost model:

# SF and Tigris are used to call in geospacial data for mapping purposes
library(sf)

## Warning: package 'sf' was built under R version 4.3.3

## Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE

library(tigris)

## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.

# Get counties in native projection (no transformation)
ny_counties <- counties(state = "NY", cb = TRUE, year = 2021, class = "sf") %>% 
  select(fips = GEOID, geometry)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |                                                                      |   1%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |=======                                                               |  11%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |=========                                                             |  14%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |============                                                          |  18%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |==============                                                        |  21%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |================                                                      |  24%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |===================                                                   |  28%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |=====================                                                 |  31%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |==========================                                            |  38%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |============================                                          |  41%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |=================================                                     |  48%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |===================================                                   |  51%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |=====================================                                 |  54%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |========================================                              |  58%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |==========================================                            |  61%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |============================================                          |  64%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |=============================================                         |  65%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |===============================================                       |  68%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |=================================================                     |  71%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |===================================================                   |  74%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=====================================================                 |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |======================================================                |  78%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |========================================================              |  81%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |==========================================================            |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |=================================================================     |  94%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |===================================================================   |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |====================================================================  |  98%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |===================================================================== |  99%
  |                                                                            
  |======================================================================| 100%

# Filtering out the NY Total Row
results_2024_clean <- results_2024 %>%
  filter(fips != 36000) %>%
  mutate(fips = as.character(fips))

# Merge and plot the predictions with sf and tigris mapping data
ny_counties %>%
  left_join(results_2024_clean, by = "fips") %>%
  drop_na(pred_2024_food_insecure) %>%
  ggplot() +
  geom_sf(aes(fill = pred_2024_food_insecure), color = NA) +
  scale_fill_viridis_c("Food Insecurity (%)", option = "inferno", direction = -1) +
  labs(title = "New York State 2025 Predicted Food Insecurity by County") +
  theme_void()

ny_map <- ny_counties %>%
  left_join(results_2024_clean, by = "fips") %>%
  drop_na(pred_2024_food_insecure) %>%
  ggplot() +
  geom_sf(aes(fill = pred_2024_food_insecure), color = NA) +
  scale_fill_viridis_c(
    "Food Insecurity (%)", 
    option = "inferno", 
    direction = -1,
    limits = c(0, max(results_2024_clean$pred_2024_food_insecure))
  ) +
  labs(
    title = "New York State 2024 Predicted Food Insecurity by County",
    caption = "Data: County Health Rankings"
  ) +
  theme_void() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    legend.position = "right",
    legend.title = element_text(size = 10)
  )


ggsave(
  filename = "ny_food_insecurity_2025.png",
  plot = ny_map,
  device = "png",
  width = 10,   
  height = 8,       
  units = "in", 
  dpi = 300,        
  bg = "white"      
)

Variable Importance for the best perfoming model:

library(caret)

perm_imp <- varImp(
  xgb_optimized,
  scale = TRUE,  
  type = "permutation", 
  numPermutations = 30  
)

ggplot(perm_imp, aes(x = reorder(rownames(perm_imp), Overall), y = Overall)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(x = "Features", y = "Importance (Permutation)", 
       title = "Permutation Importance") +
  theme_minimal()

## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

perm_imp_plot <- ggplot(perm_imp, aes(x = reorder(rownames(perm_imp), Overall), y = Overall)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(x = "Features", y = "Importance (Permutation)", 
       title = "Permutation Importance (XGBoost Experiment 3)") +
  theme_minimal()

## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

ggsave(
  filename = "permutation_importance.png",
  plot = perm_imp_plot,
  device = "png",
  width = 8,         
  height = 6,         
  units = "in",       
  dpi = 300,          
  bg = "white"        
)

Thesis Project

Jonathan Burns

2025-05-08