Packages Used : running list

library(tidyverse)
library(readxl)
library(janitor)
library(psych)

Data Cleaning

Specify target columns

# Taking out only the targeted variables needed for the analysis as there were much more availible in the health outcomes datasets. 

columns_to_extract <- c("year","fips", "state", "county", "percent_food_insecure","percent_frequent_mental_distress",
                        "percent_uninsured_children", "percent_disconnected_youth", "spending_per_pupil","school_funding_adequacy", "high_school_graduation_rate", "median_household_income", "gender_pay_gap", "percent_enrolled_in_free_or_reduced_lunch", "percent_household_income_required_for_child_care_expenses", "percent_households_with_severe_cost_burden", "percent_rural", "percent_65_and_over", "percent_black","percent_not_proficient_in_english", "segregation_index", "percent_disconnected_youth", "food_environment_index", "teen_birth_rate", "percent_fair_or_poor_health", "percent_unemployed", "percent_children_in_single_parent_households", "percent_children_in_poverty", "percent_severe_housing_problems", "percent_completed_high_school","percent_completed_high_school", "percent_low_birthweight")

Pull in data

main_25 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_25.csv"
supp_25 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_25.csv"
main_24 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_24.csv"
supp_24 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_24.csv"
main_23 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_23.csv"
supp_23 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_23.csv"
main_22 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_22.csv"
supp_22 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_22.csv"
main_21 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_21.csv"
supp_21 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_21.csv"
main_20 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_20.csv"
supp_20 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_20.csv"

Ensure naming conventions are all the same

clean_and_select <- function(file_path, columns_to_extract) {
  data <- read.csv(file_path, check.names = FALSE) %>%
    janitor::clean_names() %>%
    select(any_of(columns_to_extract))
  return(data)
}
m_25 <- clean_and_select(main_25, columns_to_extract)
s_25 <- clean_and_select(supp_25, columns_to_extract)

m_24 <- clean_and_select(main_24, columns_to_extract)
s_24 <- clean_and_select(supp_24, columns_to_extract)

m_23 <- clean_and_select(main_23, columns_to_extract)
s_23 <- clean_and_select(supp_23, columns_to_extract)

m_22 <- clean_and_select(main_22, columns_to_extract)
s_22 <- clean_and_select(supp_22, columns_to_extract)

m_21 <- clean_and_select(main_21, columns_to_extract)
s_21 <- clean_and_select(supp_21, columns_to_extract)

m_20 <- clean_and_select(main_20, columns_to_extract)
s_20 <- clean_and_select(supp_20, columns_to_extract)

Join in Main and Supplemental Data by Year

f_25 <- m_25 %>% 
  left_join(s_25, by = c("year", "fips")) %>% 
  select(-state.y, -county.y)

f_24 <- m_24 %>% 
  left_join(s_24, by = c("year", "fips")) %>% 
  select(-state.y, -county.y)

f_23 <- m_23 %>% 
  left_join(s_23, by = c("year", "fips")) %>% 
  select(-state.y, -county.y)

f_22 <- m_22 %>% 
  left_join(s_22, by = c("year", "fips")) %>% 
  select(-state.y, -county.y)

f_21 <- m_21 %>% 
  left_join(s_21, by = c("year", "fips")) %>% 
  select(-state.y, -county.y)

f_20 <- m_20 %>% 
  left_join(s_20, by = c("year", "fips")) %>% 
  select(-state.y, -county.y)

Bind Yearly Data Together

final <- bind_rows(f_25, f_24, f_23, f_22, f_21, f_20)
final_data_clean <- final %>%
  select(where(~ !any(is.na(.))))

Rural | Urban Category Creation

final <- final %>%
  mutate(
    rural_urban = case_when(
      percent_rural == 100 ~ "Completely Rural",
      percent_rural >= 50 & percent_rural < 100 ~ "Mostly Rural",
      percent_rural < 50 ~ "Mostly Urban",
      TRUE ~ NA_character_ 
    )
  )

Dropping NA Columns vs Imputing

total_rows <- nrow(final)


missing_counts <- colSums(is.na(final))

missing_percentage <- (missing_counts / total_rows) * 100


missing_table <- data.frame(
  Column = names(missing_counts),
  MissingCount = missing_counts,
  MissingPercentage = round(missing_percentage, 2) 
)

missing_table <- missing_table[missing_table$MissingCount > 0, ]

print(missing_table)
##                                                                                                              Column
## percent_household_income_required_for_child_care_expenses percent_household_income_required_for_child_care_expenses
## percent_completed_high_school                                                         percent_completed_high_school
## percent_uninsured_children                                                               percent_uninsured_children
## percent_disconnected_youth                                                               percent_disconnected_youth
## spending_per_pupil                                                                               spending_per_pupil
## school_funding_adequacy                                                                     school_funding_adequacy
## high_school_graduation_rate                                                             high_school_graduation_rate
## gender_pay_gap                                                                                       gender_pay_gap
## percent_households_with_severe_cost_burden                               percent_households_with_severe_cost_burden
## segregation_index                                                                                 segregation_index
## teen_birth_rate                                                                                     teen_birth_rate
## percent_children_in_single_parent_households                           percent_children_in_single_parent_households
## percent_low_birthweight                                                                     percent_low_birthweight
## percent_black                                                                                         percent_black
## percent_children_in_single_parent_households.x                       percent_children_in_single_parent_households.x
## percent_children_in_single_parent_households.y                       percent_children_in_single_parent_households.y
##                                                           MissingCount
## percent_household_income_required_for_child_care_expenses          189
## percent_completed_high_school                                       63
## percent_uninsured_children                                         189
## percent_disconnected_youth                                          59
## spending_per_pupil                                                 143
## school_funding_adequacy                                            202
## high_school_graduation_rate                                          7
## gender_pay_gap                                                     126
## percent_households_with_severe_cost_burden                         189
## segregation_index                                                   16
## teen_birth_rate                                                      5
## percent_children_in_single_parent_households                       189
## percent_low_birthweight                                             64
## percent_black                                                       63
## percent_children_in_single_parent_households.x                     189
## percent_children_in_single_parent_households.y                     189
##                                                           MissingPercentage
## percent_household_income_required_for_child_care_expenses             50.00
## percent_completed_high_school                                         16.67
## percent_uninsured_children                                            50.00
## percent_disconnected_youth                                            15.61
## spending_per_pupil                                                    37.83
## school_funding_adequacy                                               53.44
## high_school_graduation_rate                                            1.85
## gender_pay_gap                                                        33.33
## percent_households_with_severe_cost_burden                            50.00
## segregation_index                                                      4.23
## teen_birth_rate                                                        1.32
## percent_children_in_single_parent_households                          50.00
## percent_low_birthweight                                               16.93
## percent_black                                                         16.67
## percent_children_in_single_parent_households.x                        50.00
## percent_children_in_single_parent_households.y                        50.00

EDA

Summary Statistics

describe(final, fast = T)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
##                                                           vars   n     mean
## year                                                         1 378  2022.50
## fips                                                         2 378 36061.02
## state.x                                                      3 378      NaN
## county.x                                                     4 378      NaN
## percent_household_income_required_for_child_care_expenses    5 189    36.26
## food_environment_index                                       6 378     8.28
## percent_fair_or_poor_health                                  7 378    15.28
## percent_unemployed                                           8 378     4.98
## percent_children_in_poverty                                  9 378    17.60
## percent_severe_housing_problems                             10 378    16.04
## percent_completed_high_school                               11 315    89.60
## percent_food_insecure                                       12 378    11.16
## percent_frequent_mental_distress                            13 378    14.85
## percent_uninsured_children                                  14 189     2.90
## percent_disconnected_youth                                  15 319     7.76
## spending_per_pupil                                          16 235 24725.92
## school_funding_adequacy                                     17 176 12738.98
## high_school_graduation_rate                                 18 371    85.94
## median_household_income                                     19 378 67423.84
## gender_pay_gap                                              20 252     0.84
## percent_enrolled_in_free_or_reduced_lunch                   21 378    48.38
## percent_households_with_severe_cost_burden                  22 189    14.04
## percent_rural                                               23 378    45.01
## percent_65_and_over                                         24 378    19.72
## percent_not_proficient_in_english                           25 378     2.23
## segregation_index                                           26 362    18.10
## teen_birth_rate                                             27 373    14.47
## percent_children_in_single_parent_households                28 189    22.25
## percent_low_birthweight                                     29 314     7.29
## percent_black                                               30 315     6.13
## percent_children_in_single_parent_households.x              31 189    22.71
## percent_children_in_single_parent_households.y              32 189    22.35
## rural_urban                                                 33 378      NaN
##                                                                 sd      min
## year                                                          1.71  2020.00
## fips                                                         36.39 36000.00
## state.x                                                         NA      Inf
## county.x                                                        NA      Inf
## percent_household_income_required_for_child_care_expenses     6.19    24.00
## food_environment_index                                        0.53     6.20
## percent_fair_or_poor_health                                   2.85     9.00
## percent_unemployed                                            1.92     2.70
## percent_children_in_poverty                                   5.14     6.00
## percent_severe_housing_problems                               5.60     8.00
## percent_completed_high_school                                 3.32    73.00
## percent_food_insecure                                         2.25     4.00
## percent_frequent_mental_distress                              2.08    10.00
## percent_uninsured_children                                    1.05     2.00
## percent_disconnected_youth                                    3.94     1.00
## spending_per_pupil                                         4491.46 18229.00
## school_funding_adequacy                                    4128.12   958.00
## high_school_graduation_rate                                   4.63    67.00
## median_household_income                                   16548.19 38566.00
## gender_pay_gap                                                0.05     0.69
## percent_enrolled_in_free_or_reduced_lunch                    10.23    22.00
## percent_households_with_severe_cost_burden                    4.40     7.00
## percent_rural                                                28.81     0.00
## percent_65_and_over                                           3.11    12.80
## percent_not_proficient_in_english                             3.27     0.00
## segregation_index                                            27.28     0.03
## teen_birth_rate                                               6.24     3.00
## percent_children_in_single_parent_households                  5.89    12.00
## percent_low_birthweight                                       0.95     4.00
## percent_black                                                 6.06     0.60
## percent_children_in_single_parent_households.x                6.16    10.00
## percent_children_in_single_parent_households.y                5.88    12.00
## rural_urban                                                     NA      Inf
##                                                                 max     range
## year                                                        2025.00      5.00
## fips                                                       36123.00    123.00
## state.x                                                        -Inf      -Inf
## county.x                                                       -Inf      -Inf
## percent_household_income_required_for_child_care_expenses     80.00     56.00
## food_environment_index                                         9.90      3.70
## percent_fair_or_poor_health                                   30.00     21.00
## percent_unemployed                                            16.00     13.30
## percent_children_in_poverty                                   38.00     32.00
## percent_severe_housing_problems                               39.00     31.00
## percent_completed_high_school                                 96.00     23.00
## percent_food_insecure                                         20.00     16.00
## percent_frequent_mental_distress                              20.00     10.00
## percent_uninsured_children                                    10.00      8.00
## percent_disconnected_youth                                    27.00     26.00
## spending_per_pupil                                         50849.00  32620.00
## school_funding_adequacy                                    34981.00  34023.00
## high_school_graduation_rate                                   96.00     29.00
## median_household_income                                   140466.00 101900.00
## gender_pay_gap                                                 0.99      0.30
## percent_enrolled_in_free_or_reduced_lunch                     88.00     66.00
## percent_households_with_severe_cost_burden                    31.00     24.00
## percent_rural                                                100.00    100.00
## percent_65_and_over                                           34.80     22.00
## percent_not_proficient_in_english                             16.00     16.00
## segregation_index                                             81.00     80.97
## teen_birth_rate                                               36.00     33.00
## percent_children_in_single_parent_households                  52.00     40.00
## percent_low_birthweight                                       10.00      6.00
## percent_black                                                 29.90     29.30
## percent_children_in_single_parent_households.x                51.00     41.00
## percent_children_in_single_parent_households.y                52.00     40.00
## rural_urban                                                    -Inf      -Inf
##                                                               se
## year                                                        0.09
## fips                                                        1.87
## state.x                                                       NA
## county.x                                                      NA
## percent_household_income_required_for_child_care_expenses   0.45
## food_environment_index                                      0.03
## percent_fair_or_poor_health                                 0.15
## percent_unemployed                                          0.10
## percent_children_in_poverty                                 0.26
## percent_severe_housing_problems                             0.29
## percent_completed_high_school                               0.19
## percent_food_insecure                                       0.12
## percent_frequent_mental_distress                            0.11
## percent_uninsured_children                                  0.08
## percent_disconnected_youth                                  0.22
## spending_per_pupil                                        292.99
## school_funding_adequacy                                   311.17
## high_school_graduation_rate                                 0.24
## median_household_income                                   851.15
## gender_pay_gap                                              0.00
## percent_enrolled_in_free_or_reduced_lunch                   0.53
## percent_households_with_severe_cost_burden                  0.32
## percent_rural                                               1.48
## percent_65_and_over                                         0.16
## percent_not_proficient_in_english                           0.17
## segregation_index                                           1.43
## teen_birth_rate                                             0.32
## percent_children_in_single_parent_households                0.43
## percent_low_birthweight                                     0.05
## percent_black                                               0.34
## percent_children_in_single_parent_households.x              0.45
## percent_children_in_single_parent_households.y              0.43
## rural_urban                                                   NA

Missingness Visualized

ggplot(missing_table, aes(x = reorder(Column, -MissingPercentage), y = MissingPercentage)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() + 
  labs(
    title = "Percentage of Missing Data by Variable",
    x = "Variable",
    y = "Missingness %"
  ) +
  theme_minimal()

##Numeric Distribution

  • Quick sapply function to look at histogram distributions for all of the numeric variables in the dataset
  • The target variable percent_food_insecure is normally distributed
library(ggplot2)
numeric_columns <- sapply(final, is.numeric)
final_numeric <- final[, numeric_columns]

for (col in names(final_numeric)) {
  print(
    ggplot(final, aes_string(x = col)) +
      geom_histogram(fill = "steelblue", bins = 30, color = "black") +
      labs(title = paste("Distribution of", col), x = col, y = "Frequency") +
      theme_minimal()
  )
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 63 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 59 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 143 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 202 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 126 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 64 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 63 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).

By Year:

  • Mean, median and standard deviation were calculated for each variable and then grouped by year.
library(dplyr)

yearly_summary <- final %>%
  group_by(year) %>%
  summarise(across(where(is.numeric), list(mean = mean, median = median, sd = sd), na.rm = TRUE))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(...)`.
## ℹ In group 1: `year = 2020`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))
print(yearly_summary)
## # A tibble: 6 × 88
##    year fips_mean fips_median fips_sd percent_household_income_required_for_ch…¹
##   <int>     <dbl>       <int>   <dbl>                                      <dbl>
## 1  2020    36061.       36061    36.6                                      NaN  
## 2  2021    36061.       36061    36.6                                      NaN  
## 3  2022    36061.       36061    36.6                                      NaN  
## 4  2023    36061.       36061    36.6                                       32.2
## 5  2024    36061.       36061    36.6                                       39.0
## 6  2025    36061.       36061    36.6                                       37.6
## # ℹ abbreviated name:
## #   ¹​percent_household_income_required_for_child_care_expenses_mean
## # ℹ 83 more variables:
## #   percent_household_income_required_for_child_care_expenses_median <int>,
## #   percent_household_income_required_for_child_care_expenses_sd <dbl>,
## #   food_environment_index_mean <dbl>, food_environment_index_median <dbl>,
## #   food_environment_index_sd <dbl>, percent_fair_or_poor_health_mean <dbl>, …

By County

  • Same thing for county level averages, median and sd for each var.
region_summary <- final %>%
  group_by(county.x, year) %>%
  summarise(across(where(is.numeric), list(mean = mean, median = median, sd = sd), na.rm = TRUE))
## `summarise()` has grouped output by 'county.x'. You can override using the
## `.groups` argument.
print(region_summary)
## # A tibble: 378 × 89
## # Groups:   county.x [63]
##    county.x  year fips_mean fips_median fips_sd percent_household_income_requi…¹
##    <chr>    <int>     <dbl>       <int>   <dbl>                            <dbl>
##  1 Albany    2020     36001       36001      NA                              NaN
##  2 Albany    2021     36001       36001      NA                              NaN
##  3 Albany    2022     36001       36001      NA                              NaN
##  4 Albany    2023     36001       36001      NA                               35
##  5 Albany    2024     36001       36001      NA                               41
##  6 Albany    2025     36001       36001      NA                               37
##  7 Allegany  2020     36003       36003      NA                              NaN
##  8 Allegany  2021     36003       36003      NA                              NaN
##  9 Allegany  2022     36003       36003      NA                              NaN
## 10 Allegany  2023     36003       36003      NA                               34
## # ℹ 368 more rows
## # ℹ abbreviated name:
## #   ¹​percent_household_income_required_for_child_care_expenses_mean
## # ℹ 83 more variables:
## #   percent_household_income_required_for_child_care_expenses_median <int>,
## #   percent_household_income_required_for_child_care_expenses_sd <dbl>,
## #   food_environment_index_mean <dbl>, food_environment_index_median <dbl>, …

Yearly Line Graph of Food Insecurity

  • Despite this graph being messy, there is a point to it.
  • A trend appears, there is an overall shift down in food insecurity from 2023 -> 2024, followed by a shift upward in 2025, past where is was in the previous 4 years for most counties.
  • Additionally, the Bronx is an outlier for food insecurity showing 1. a much higher starting point in 2020, but also the increase over the five years is sharp.
library(ggplot2)

ggplot(region_summary, aes(x = year, y = percent_food_insecure_mean, color = county.x)) +
  geom_line() +
  labs(title = "Yearly Food Insecurity Across New York State Counties", x = "Year", y = "Food Insecurity") +
  theme_minimal()

Average trend in food insecurity across NYS only.

ggplot(region_summary, aes(x = year, y = percent_food_insecure_mean)) +
  geom_line() +
  labs(title = "Yearly Average Food Insecurity Across New York State", x = "Year", y = "Value") +
  theme_minimal()

ggplot(final, aes(x = county.x, y = percent_food_insecure, color = rural_urban)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "County Level Comparison of Food Insecurity In New York State", x = "County", y = "Food Insecurity Percentage") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 7))

Saving out final csv for cross use in my python code (for LSTM) and the xGBoost.

final_csv <- write_csv(final, "C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2025\\DATA 698\\Masters Project\\final_data.csv")

Model Testing:

XGBoost 1:

  • This first XGBoost model aims to be a lighter weight model, only selecting 10 variables from the dataset. Additionally, I only created two lag features and recoded my categorical rural_urban column as numeric
library(tidyverse)
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.3.3
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
data <- read.csv("C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2025\\DATA 698\\Masters Project\\final_data.csv")

df <- data %>%
  select(year, fips, county.x, percent_food_insecure,
    percent_unemployed, percent_children_in_poverty,
    median_household_income, percent_fair_or_poor_health,
    high_school_graduation_rate, rural_urban
  ) %>%
  drop_na(percent_food_insecure)  #just making sure there are no na's left in for this XGBoost model

Create Lag Features and code rural_urban on

  • Using a standard 1 and 2 year lag variable for the entire model
df <- df %>%
  arrange(fips, year) %>% 
  group_by(fips) %>%
  mutate(
    food_insecure_lag1 = lag(percent_food_insecure, 1),  # Previous year
    food_insecure_lag2 = lag(percent_food_insecure, 2)   # Two years back
  ) %>%
  ungroup()

# recode rural_urban as numeric
df <- df %>%
  mutate(rural_urban = as.factor(rural_urban),
         rural_urban = as.numeric(rural_urban))

Split Data

train <- df %>% filter(year < 2024)
test <- df %>% filter(year == 2024)

# dropping out the non-important features, including the dependent variable
X_train <- train %>% select(-c(year, fips, county.x, percent_food_insecure))
y_train <- train$percent_food_insecure

X_test <- test %>% select(-c(year, fips, county.x, percent_food_insecure))
y_test <- test$percent_food_insecure

Experiment 1: Train Model

# cross val
ctrl <- trainControl(
  method = "cv",
  number = 10,
  verboseIter = TRUE
)

# Standard xgboost tuning grid
xgb_grid <- expand.grid(
  nrounds = 100,
  max_depth = 6,
  eta = 0.3,
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)


xgb_model <- train(
  x = X_train,
  y = y_train,
  method = "xgbTree",
  trControl = ctrl,
  tuneGrid = xgb_grid,
  verbosity = 0
)
## + Fold01: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold01: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold02: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold02: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold03: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold03: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold04: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold04: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold05: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold05: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold06: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold06: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold07: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold07: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold08: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold08: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold09: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold09: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold10: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold10: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## Aggregating results
## Fitting final model on full training set
## Warning: Setting row names on a tibble is deprecated.
print(xgb_model)
## eXtreme Gradient Boosting 
## 
## 252 samples
##   8 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 226, 227, 228, 227, 228, 226, ... 
## Resampling results:
## 
##   RMSE       Rsquared  MAE      
##   0.9396357  0.782687  0.7290635
## 
## Tuning parameter 'nrounds' was held constant at a value of 100
## Tuning
##  held constant at a value of 1
## Tuning parameter 'subsample' was held
##  constant at a value of 1

Apply Experiment 1 Model:

# predicting 2024 data
X_2024 <- df %>% 
  filter(year == 2024) %>%
  select(-c(year, fips, county.x, percent_food_insecure))

# Predicting 2024 food insecurity
predictions_2024 <- predict(xgb_model, newdata = X_2024)

# bringing in county and fips codes 
results <- df %>%
  filter(year == 2024) %>%
  select(fips, county.x) %>%
  mutate(pred_2024_food_insecure = predictions_2024)

# View predictions
print(results)
## # A tibble: 63 × 3
##     fips county.x    pred_2024_food_insecure
##    <int> <chr>                         <dbl>
##  1 36000 Total                         10.0 
##  2 36001 Albany                         9.34
##  3 36003 Allegany                      13.1 
##  4 36005 Bronx                         17.9 
##  5 36007 Broome                        12.3 
##  6 36009 Cattaraugus                   12.7 
##  7 36011 Cayuga                        10.5 
##  8 36013 Chautauqua                    13.2 
##  9 36015 Chemung                       11.9 
## 10 36017 Chenango                      10.6 
## # ℹ 53 more rows

Experiment 1: Predicted vs Actual Food Insecurity

# Predict 2024 (for validPrediPredir)
pred_2024 <- predict(xgb_model, newdata = X_test)

# Calculate RMSE
rmse <- sqrt(mean((pred_2024 - y_test)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 1.07624761613868"
ggplot(data.frame(Actual = y_test, Predicted = pred_2024), aes(Actual, Predicted)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  labs(title = "Actual vs. Predicted Food Insecurity (2024)")

Experiment 1 Data Metrics Table

library(caret)
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.3.3
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
test_predictions <- predict(xgb_model, newdata = X_test)
actual_values <- y_test 

metrics_table <- data.frame(
  RMSE = RMSE(test_predictions, actual_values),
  MAPE = mape(actual_values, test_predictions) * 100,
  MSE = mean((test_predictions - actual_values)^2),
  R2 = R2(test_predictions, actual_values)
)

print(metrics_table)
##       RMSE    MAPE      MSE        R2
## 1 1.076248 10.3129 1.158309 0.8556726

XGBoost 2:

  • XGBoost model 2 brings in a few different changes to the model, the lag variables are still created but this time the model is tuned to be faster.
  • By dropping the number of folds from 10 to 5 this should cut the number of boosted models ran for this experiment.
library(tidyverse)
library(xgboost)
library(caret)


data <- read.csv("C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2025\\DATA 698\\Masters Project\\final_data.csv")


df <- data %>%
  select(where(~ all(!is.na(.)))) %>%
  drop_na(percent_food_insecure)
df <- df %>%
  arrange(fips, year) %>% 
  group_by(fips) %>%
  mutate(
    food_insecure_lag1 = lag(percent_food_insecure, 1),  # Previous year
    food_insecure_lag2 = lag(percent_food_insecure, 2)   # Two years back
  ) %>%
  select(-food_environment_index) %>% 
  ungroup()


df <- df %>%
  mutate(rural_urban = as.factor(rural_urban),
         rural_urban = as.numeric(rural_urban))
train <- df %>% filter(year < 2024)
test <- df %>% filter(year == 2024)


X_train <- train %>% select(-c(year, fips, county.x, percent_food_insecure, state.x))
y_train <- train$percent_food_insecure

X_test <- test %>% select(-c(year, fips, county.x, percent_food_insecure, state.x))
y_test <- test$percent_food_insecure
ctrl_fast <- trainControl(
  method = "cv",
  number = 5,  # Reduced from 10 folds
  verboseIter = TRUE,
  allowParallel = TRUE  # Enable parallel processing
)

# Tuning grid set
xgb_grid_fast <- expand.grid(
  nrounds = c(100, 150, 200, 300),          # Reduced options
  max_depth = c(4, 6, 8),            # Most useful range
  eta = c(0.05, 0.1, .15),            # Optimal learning rates
  gamma = 0,                     
  colsample_bytree = 0.8,       
  min_child_weight = 1,          
  subsample = 0.8               
)

# Parallel processing to speed things up
library(doParallel)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: parallel
cl <- makePSOCKcluster(4)
registerDoParallel(cl)

xgb_fast <- train(
  x = X_train,
  y = y_train,
  method = "xgbTree",
  trControl = ctrl_fast,
  tuneGrid = xgb_grid_fast,
  verbosity = 0,
  metric = "RMSE"
)
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 300, max_depth = 4, eta = 0.05, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1, subsample = 0.8 on full training set
stopCluster(cl)


print(xgb_fast)
## eXtreme Gradient Boosting 
## 
## 252 samples
##  13 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 201, 202, 201, 202, 202 
## Resampling results across tuning parameters:
## 
##   eta   max_depth  nrounds  RMSE       Rsquared   MAE      
##   0.05  4          100      0.8781404  0.8355296  0.6707104
##   0.05  4          150      0.8540096  0.8402291  0.6462976
##   0.05  4          200      0.8453597  0.8426345  0.6345746
##   0.05  4          300      0.8451813  0.8424344  0.6312022
##   0.05  6          100      0.8923034  0.8322891  0.6678132
##   0.05  6          150      0.8754636  0.8340372  0.6504066
##   0.05  6          200      0.8773633  0.8331453  0.6492893
##   0.05  6          300      0.8772838  0.8334252  0.6496573
##   0.05  8          100      0.8974352  0.8291845  0.6731898
##   0.05  8          150      0.8816944  0.8291804  0.6630502
##   0.05  8          200      0.8796597  0.8293649  0.6624847
##   0.05  8          300      0.8796704  0.8292728  0.6624413
##   0.10  4          100      0.8648257  0.8364921  0.6439529
##   0.10  4          150      0.8642269  0.8362099  0.6417462
##   0.10  4          200      0.8645774  0.8362313  0.6412801
##   0.10  4          300      0.8656470  0.8356974  0.6410426
##   0.10  6          100      0.8825171  0.8286300  0.6591104
##   0.10  6          150      0.8806580  0.8292312  0.6567807
##   0.10  6          200      0.8797769  0.8296094  0.6567192
##   0.10  6          300      0.8796946  0.8296352  0.6566413
##   0.10  8          100      0.8979305  0.8253533  0.6732380
##   0.10  8          150      0.8972889  0.8256395  0.6722391
##   0.10  8          200      0.8970448  0.8257292  0.6718342
##   0.10  8          300      0.8970231  0.8257341  0.6718000
##   0.15  4          100      0.8607434  0.8359365  0.6525811
##   0.15  4          150      0.8637856  0.8354407  0.6524942
##   0.15  4          200      0.8638682  0.8354469  0.6534405
##   0.15  4          300      0.8643632  0.8352971  0.6540152
##   0.15  6          100      0.8865801  0.8275857  0.6848609
##   0.15  6          150      0.8874475  0.8272966  0.6852978
##   0.15  6          200      0.8874683  0.8272873  0.6853752
##   0.15  6          300      0.8874747  0.8272835  0.6853918
##   0.15  8          100      0.8980507  0.8237776  0.6916497
##   0.15  8          150      0.8981906  0.8237612  0.6917727
##   0.15  8          200      0.8981609  0.8237745  0.6917438
##   0.15  8          300      0.8981608  0.8237731  0.6917376
## 
## Tuning parameter 'gamma' was held constant at a value of 0
## Tuning
## 
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## 
## Tuning parameter 'subsample' was held constant at a value of 0.8
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 300, max_depth = 4, eta
##  = 0.05, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1 and
##  subsample = 0.8.
plot(xgb_fast)

library(doParallel)
library(Metrics)
cl <- makePSOCKcluster(4)
registerDoParallel(cl)


ctrl_fast <- trainControl(
  method = "cv",
  number = 5,  # Reduced from 10 folds
  verboseIter = TRUE,
  allowParallel = TRUE,  # Enable parallel processing
  savePredictions = "final"
)

xgb_grid_fast <- expand.grid(
  nrounds = c(100, 150, 200, 300),
  max_depth = c(4, 6, 8),
  eta = c(0.05, 0.1, 0.15),
  gamma = 0,
  colsample_bytree = 0.8,
  min_child_weight = 1,
  subsample = 0.8
)

# Train the  model
xgb_optimized <- train(
  x = X_train,
  y = y_train,
  method = "xgbTree",
  trControl = ctrl_fast,
  tuneGrid = xgb_grid_fast,
  verbosity = 0,
  metric = "RMSE"
)
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 100, max_depth = 6, eta = 0.15, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1, subsample = 0.8 on full training set
## Warning: Setting row names on a tibble is deprecated.
stopCluster(cl)


print(xgb_optimized)
## eXtreme Gradient Boosting 
## 
## 252 samples
##  13 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 200, 201, 202, 203, 202 
## Resampling results across tuning parameters:
## 
##   eta   max_depth  nrounds  RMSE       Rsquared   MAE      
##   0.05  4          100      0.9011120  0.8215231  0.6883451
##   0.05  4          150      0.8883722  0.8238125  0.6832788
##   0.05  4          200      0.8871622  0.8248628  0.6815495
##   0.05  4          300      0.8860341  0.8259349  0.6812098
##   0.05  6          100      0.8703363  0.8340119  0.6657812
##   0.05  6          150      0.8621970  0.8342835  0.6608138
##   0.05  6          200      0.8620012  0.8347651  0.6608094
##   0.05  6          300      0.8631944  0.8346652  0.6618495
##   0.05  8          100      0.9131806  0.8173697  0.7086733
##   0.05  8          150      0.9046190  0.8175442  0.7055069
##   0.05  8          200      0.9046224  0.8174260  0.7065193
##   0.05  8          300      0.9061115  0.8170087  0.7079110
##   0.10  4          100      0.8734851  0.8303399  0.6738947
##   0.10  4          150      0.8790684  0.8286507  0.6803869
##   0.10  4          200      0.8830418  0.8275311  0.6840421
##   0.10  4          300      0.8849291  0.8270416  0.6873362
##   0.10  6          100      0.9129718  0.8158589  0.7000536
##   0.10  6          150      0.9154483  0.8151201  0.7024064
##   0.10  6          200      0.9157415  0.8151300  0.7026213
##   0.10  6          300      0.9159481  0.8151109  0.7027467
##   0.10  8          100      0.8852688  0.8266064  0.6896946
##   0.10  8          150      0.8863866  0.8262701  0.6895852
##   0.10  8          200      0.8867005  0.8261605  0.6898820
##   0.10  8          300      0.8868083  0.8261222  0.6900218
##   0.15  4          100      0.9059950  0.8168906  0.7002083
##   0.15  4          150      0.9104842  0.8148878  0.7046985
##   0.15  4          200      0.9115235  0.8147664  0.7060750
##   0.15  4          300      0.9118689  0.8147551  0.7064888
##   0.15  6          100      0.8571253  0.8368995  0.6665441
##   0.15  6          150      0.8575728  0.8368456  0.6669733
##   0.15  6          200      0.8577060  0.8367976  0.6672284
##   0.15  6          300      0.8577302  0.8367929  0.6672692
##   0.15  8          100      0.8979386  0.8207182  0.7209347
##   0.15  8          150      0.8978211  0.8207909  0.7207613
##   0.15  8          200      0.8978151  0.8207947  0.7207717
##   0.15  8          300      0.8978187  0.8207937  0.7207718
## 
## Tuning parameter 'gamma' was held constant at a value of 0
## Tuning
## 
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## 
## Tuning parameter 'subsample' was held constant at a value of 0.8
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 100, max_depth = 6, eta
##  = 0.15, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1 and
##  subsample = 0.8.
plot(xgb_optimized)

best_params <- xgb_optimized$bestTune


X_2024 <- df %>% 
  filter(year == 2024) %>%
  select(-c(year, fips, county.x, percent_food_insecure)) %>%
 
  mutate(
    rural_urban = as.numeric(factor(rural_urban))
  )


predictions_2024 <- predict(xgb_optimized, newdata = X_2024)


results_2024 <- df %>%
  filter(year == 2024) %>%
  select(fips, county.x, percent_food_insecure) %>%  
  mutate(
    pred_2024_food_insecure = predictions_2024,
    predicted_change = pred_2024_food_insecure - percent_food_insecure,
    risk_category = case_when(
      pred_2024_food_insecure > quantile(predictions_2024, 0.75) ~ "High Risk",
      pred_2024_food_insecure > quantile(predictions_2024, 0.5) ~ "Medium Risk",
      TRUE ~ "Low Risk"
    )
  ) %>%
  arrange(desc(pred_2024_food_insecure)) 


write.csv(
  results_2024,
  file = paste0("food_insecurity_predictions_", format(Sys.Date(), "%Y%m%d"), ".csv"),
  row.names = FALSE
)


library(ggplot2)
results_2024 %>%
  head(10) %>%
  ggplot(aes(x = reorder(county.x, -pred_2024_food_insecure), y = pred_2024_food_insecure)) +
  geom_col(fill = "firebrick") +
  labs(title = "Top 10 High-Risk Counties for 2024 Food Insecurity",
       x = "County",
       y = "Predicted Food Insecurity Rate") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(caret)
library(Metrics)

test_predictions <- predict(xgb_optimized, newdata = X_test)
actual_values <- y_test 

metrics_table <- data.frame(
  RMSE = RMSE(test_predictions, actual_values),
  MAPE = mape(actual_values, test_predictions) * 100,
  MSE = mean((test_predictions - actual_values)^2),
  R2 = R2(test_predictions, actual_values)
)

print(metrics_table)
##       RMSE    MAPE      MSE        R2
## 1 1.098412 10.7249 1.206509 0.8872072
library(caret)

perm_imp <- varImp(
  xgb_optimized,
  scale = TRUE,  
  type = "permutation",  
  numPermutations = 30  
)

ggplot(perm_imp, aes(x = reorder(rownames(perm_imp), Overall), y = Overall)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(x = "Features", y = "Importance (Permutation)", 
       title = "Permutation Importance") +
  theme_minimal()
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

perm_imp_plot <- ggplot(perm_imp, aes(x = reorder(rownames(perm_imp), Overall), y = Overall)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(x = "Features", y = "Importance (Permutation)", 
       title = "Permutation Importance (XGBoost Experiment 3)") +
  theme_minimal()
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
ggsave(
  filename = "permutation_importance.png",
  plot = perm_imp_plot,
  device = "png",
  width = 8,         
  height = 6,         
  units = "in",       
  dpi = 300,          
  bg = "white"        
)

XGBoost 3:

This XGBoost model will attempt to do four things: * Make sure the lag variables are actually backward-looking by adjusting how they are set. * Address overfitting by increasing regularization * increase early stopping. * Lastly, add in more interaction terms.

Imputation Step:

library(tidyverse)
library(mice)  # For advanced imputation
## Warning: package 'mice' was built under R version 4.3.3
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
data <- read.csv("C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2025\\DATA 698\\Masters Project\\final_data.csv")

# This sapply function runs through the data and calculates the missing percentage across all columns
missing_pct <- sapply(data, function(x) mean(is.na(x)) * 100)

# 2. Rules I set (arbitrary) to decide what columns to keep and impute and which ones to drop
cols_to_impute <- names(missing_pct[missing_pct <= 25 & missing_pct > 0])
cols_to_drop <- names(missing_pct[missing_pct > 25])

# 3. Drop columns with 25% or more missingness
data_clean <- data %>% 
  select(-all_of(cols_to_drop))

# 4.
# Numeric columns: median imputation
num_cols <- data_clean %>% 
  select(where(is.numeric)) %>% 
  names() %>% 
  intersect(cols_to_impute)

data_clean <- data_clean %>%
  mutate(across(all_of(num_cols), ~ifelse(is.na(.), median(., na.rm = TRUE), .)))

# Categorical Columns: mode imputation
cat_cols <- data_clean %>% 
  select(where(is.character) | where(is.factor)) %>% 
  names() %>% 
  intersect(cols_to_impute)

get_mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

data_clean <- data_clean %>%
  mutate(across(all_of(cat_cols), ~ifelse(is.na(.), get_mode(.), .)))

# 5. Drop rows with >25% missingness in remaining columns
row_missing_pct <- apply(data_clean, 1, function(x) mean(is.na(x)) * 100)
data_final <- data_clean[row_missing_pct <= 25, ]

Combination variable creation

df <- data_final %>%
  arrange(fips, year) %>% 
  group_by(fips) %>%
  mutate(
    food_insecure_lag1 = lag(percent_food_insecure, 1), 
    food_insecure_lag2 = lag(percent_food_insecure, 2)   
  ) %>%
  select(-food_environment_index) %>% 
  ungroup()


df <- df %>%
  mutate(rural_urban = as.factor(rural_urban),
         rural_urban = as.numeric(rural_urban),
    economic_stress = (
      percent_unemployed + 
      percent_children_in_poverty + 
      percent_severe_housing_problems
    ) / 3)
train <- df %>% filter(year < 2024)
test <- df %>% filter(year == 2024)

X_train <- train %>% select(-c(year, fips, county.x, percent_food_insecure, state.x))
y_train <- train$percent_food_insecure

X_test <- test %>% select(-c(year, fips, county.x, percent_food_insecure, state.x))
y_test <- test$percent_food_insecure
ctrl_fast <- trainControl(
  method = "cv",
  number = 5,  # Reduce from 10 folds
  verboseIter = TRUE,
  allowParallel = TRUE  
)

# Focused tuning grid
xgb_grid_fast <- expand.grid(
  nrounds = c(100, 150),          # Reduced amount of options for nrounds
  max_depth = c(3, 4),            # drop down from 6 and 8 tree depth
  eta = c(0.01, 0.05),            # dropped down from 3 set options to 2
  gamma = c(0,1),                     # 0-1 to add regularization to help overfitting
  colsample_bytree = 0.8,         
  min_child_weight = 2,          # Increased from 1 to help prevent overfitting
  subsample = 0.8                # Fixed
)


library(doParallel)
cl <- makePSOCKcluster(4) 
registerDoParallel(cl)

xgb_fast <- train(
  x = X_train,
  y = y_train,
  method = "xgbTree",
  trControl = ctrl_fast,
  tuneGrid = xgb_grid_fast,
  verbosity = 0,                 
  metric = "RMSE"
)
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 150, max_depth = 3, eta = 0.05, gamma = 0, colsample_bytree = 0.8, min_child_weight = 2, subsample = 0.8 on full training set
stopCluster(cl)


print(xgb_fast)
## eXtreme Gradient Boosting 
## 
## 252 samples
##  21 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 201, 201, 202, 201, 203 
## Resampling results across tuning parameters:
## 
##   eta   max_depth  gamma  nrounds  RMSE       Rsquared   MAE      
##   0.01  3          0      100      4.2379903  0.7198368  3.9998665
##   0.01  3          0      150      2.7371900  0.7785458  2.4712494
##   0.01  3          1      100      4.2328913  0.7198437  3.9916388
##   0.01  3          1      150      2.7387545  0.7705294  2.4669134
##   0.01  4          0      100      4.2374726  0.7277374  3.9988035
##   0.01  4          0      150      2.7326659  0.7846968  2.4708225
##   0.01  4          1      100      4.2310054  0.7229323  3.9910180
##   0.01  4          1      150      2.7352692  0.7767372  2.4674334
##   0.05  3          0      100      0.8615489  0.8491815  0.6658737
##   0.05  3          0      150      0.8560428  0.8509305  0.6553860
##   0.05  3          1      100      0.8801298  0.8424100  0.6831377
##   0.05  3          1      150      0.8777957  0.8415492  0.6771662
##   0.05  4          0      100      0.8782500  0.8442661  0.6767610
##   0.05  4          0      150      0.8705214  0.8452782  0.6631440
##   0.05  4          1      100      0.8682171  0.8465492  0.6743046
##   0.05  4          1      150      0.8657537  0.8461813  0.6723059
## 
## Tuning parameter 'colsample_bytree' was held constant at a value of 0.8
## 
## Tuning parameter 'min_child_weight' was held constant at a value of 2
## 
## Tuning parameter 'subsample' was held constant at a value of 0.8
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 150, max_depth = 3, eta
##  = 0.05, gamma = 0, colsample_bytree = 0.8, min_child_weight = 2 and
##  subsample = 0.8.
plot(xgb_fast)

library(doParallel)
library(Metrics)
cl <- makePSOCKcluster(4)  
registerDoParallel(cl)

# Reduced tuning grid with most impactful parameters
ctrl_fast <- trainControl(
  method = "cv",
  number = 5,  # Reduced from 10 folds
  verboseIter = TRUE,
  allowParallel = TRUE,  
  savePredictions = "final"
)


xgb_grid_fast <- expand.grid(
  nrounds = c(100, 150, 200, 300),
  max_depth = c(4, 6, 8), # Back to 4,6,8 because 3,4 RMSE was not great
  eta = c(0.05, 0.1, 0.15),
  gamma = c(0,1),
  colsample_bytree = 0.8,
  min_child_weight = 2,
  subsample = 0.8
)


xgb_optimized <- train(
  x = X_train,
  y = y_train,
  method = "xgbTree",
  trControl = ctrl_fast,
  tuneGrid = xgb_grid_fast,
  verbosity = 0,
  metric = "RMSE"
)
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 300, max_depth = 6, eta = 0.05, gamma = 1, colsample_bytree = 0.8, min_child_weight = 2, subsample = 0.8 on full training set
## Warning: Setting row names on a tibble is deprecated.
stopCluster(cl)


print(xgb_optimized)
## eXtreme Gradient Boosting 
## 
## 252 samples
##  21 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 203, 201, 202, 201, 201 
## Resampling results across tuning parameters:
## 
##   eta   max_depth  gamma  nrounds  RMSE       Rsquared   MAE      
##   0.05  4          0      100      0.8907448  0.8225203  0.6889024
##   0.05  4          0      150      0.8878283  0.8264637  0.6838817
##   0.05  4          0      200      0.8863493  0.8280422  0.6821354
##   0.05  4          0      300      0.8864141  0.8289281  0.6800962
##   0.05  4          1      100      0.8692329  0.8312319  0.6823221
##   0.05  4          1      150      0.8628717  0.8335070  0.6767554
##   0.05  4          1      200      0.8607236  0.8345670  0.6753222
##   0.05  4          1      300      0.8590330  0.8359100  0.6726605
##   0.05  6          0      100      0.8685797  0.8322320  0.6704659
##   0.05  6          0      150      0.8688771  0.8321535  0.6713056
##   0.05  6          0      200      0.8741929  0.8311496  0.6731855
##   0.05  6          0      300      0.8789708  0.8304619  0.6718609
##   0.05  6          1      100      0.8624194  0.8340334  0.6754635
##   0.05  6          1      150      0.8526616  0.8372510  0.6710308
##   0.05  6          1      200      0.8536788  0.8370835  0.6713847
##   0.05  6          1      300      0.8522603  0.8373855  0.6699777
##   0.05  8          0      100      0.8674734  0.8339552  0.6669022
##   0.05  8          0      150      0.8653034  0.8342388  0.6655097
##   0.05  8          0      200      0.8694120  0.8337656  0.6643586
##   0.05  8          0      300      0.8751872  0.8322398  0.6644406
##   0.05  8          1      100      0.8780920  0.8293627  0.6854085
##   0.05  8          1      150      0.8743516  0.8302680  0.6825640
##   0.05  8          1      200      0.8723100  0.8312949  0.6803162
##   0.05  8          1      300      0.8679497  0.8329429  0.6778601
##   0.10  4          0      100      0.9061672  0.8178920  0.6942083
##   0.10  4          0      150      0.9014654  0.8204167  0.6874965
##   0.10  4          0      200      0.9021623  0.8203703  0.6867485
##   0.10  4          0      300      0.9003716  0.8211243  0.6854194
##   0.10  4          1      100      0.8933117  0.8237149  0.7006024
##   0.10  4          1      150      0.8943248  0.8240572  0.6994523
##   0.10  4          1      200      0.8936999  0.8243502  0.6969232
##   0.10  4          1      300      0.8922653  0.8255438  0.6953046
##   0.10  6          0      100      0.9053462  0.8152095  0.6948502
##   0.10  6          0      150      0.9090872  0.8149734  0.6941957
##   0.10  6          0      200      0.9095758  0.8150010  0.6934913
##   0.10  6          0      300      0.9102890  0.8148544  0.6937459
##   0.10  6          1      100      0.8640531  0.8345145  0.6648236
##   0.10  6          1      150      0.8666107  0.8340867  0.6657681
##   0.10  6          1      200      0.8656854  0.8344951  0.6648222
##   0.10  6          1      300      0.8663870  0.8344562  0.6657586
##   0.10  8          0      100      0.9000383  0.8175953  0.6993600
##   0.10  8          0      150      0.9005137  0.8178369  0.6969322
##   0.10  8          0      200      0.9013356  0.8176193  0.6970689
##   0.10  8          0      300      0.9013389  0.8177036  0.6966672
##   0.10  8          1      100      0.8722699  0.8296108  0.6763781
##   0.10  8          1      150      0.8775351  0.8285041  0.6793858
##   0.10  8          1      200      0.8757207  0.8292867  0.6782922
##   0.10  8          1      300      0.8753224  0.8293795  0.6782871
##   0.15  4          0      100      0.8908477  0.8274183  0.6858406
##   0.15  4          0      150      0.8883673  0.8282211  0.6822434
##   0.15  4          0      200      0.8892147  0.8277253  0.6817583
##   0.15  4          0      300      0.8885816  0.8278455  0.6810660
##   0.15  4          1      100      0.8802390  0.8231812  0.6916522
##   0.15  4          1      150      0.8801942  0.8240044  0.6900426
##   0.15  4          1      200      0.8787756  0.8245382  0.6888212
##   0.15  4          1      300      0.8750942  0.8251346  0.6849653
##   0.15  6          0      100      0.9089206  0.8161500  0.6967861
##   0.15  6          0      150      0.9085901  0.8164424  0.6950970
##   0.15  6          0      200      0.9084784  0.8164772  0.6947123
##   0.15  6          0      300      0.9084124  0.8164925  0.6946573
##   0.15  6          1      100      0.8838684  0.8304600  0.6962923
##   0.15  6          1      150      0.8824853  0.8313396  0.6947185
##   0.15  6          1      200      0.8834375  0.8307777  0.6949300
##   0.15  6          1      300      0.8819700  0.8315343  0.6941799
##   0.15  8          0      100      0.9112751  0.8178064  0.6991255
##   0.15  8          0      150      0.9137724  0.8171472  0.6994128
##   0.15  8          0      200      0.9139029  0.8171118  0.6992585
##   0.15  8          0      300      0.9140759  0.8170649  0.6992819
##   0.15  8          1      100      0.8935081  0.8231672  0.6978100
##   0.15  8          1      150      0.8921961  0.8231839  0.6964752
##   0.15  8          1      200      0.8928237  0.8231334  0.6972326
##   0.15  8          1      300      0.8873299  0.8251900  0.6960735
## 
## Tuning parameter 'colsample_bytree' was held constant at a value of 0.8
## 
## Tuning parameter 'min_child_weight' was held constant at a value of 2
## 
## Tuning parameter 'subsample' was held constant at a value of 0.8
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 300, max_depth = 6, eta
##  = 0.05, gamma = 1, colsample_bytree = 0.8, min_child_weight = 2 and
##  subsample = 0.8.
plot(xgb_optimized)

best_params <- xgb_optimized$bestTune

X_2024 <- df %>% 
  filter(year == 2024) %>%
  select(-c(year, fips, county.x, percent_food_insecure)) %>%
  mutate(
    rural_urban = as.numeric(factor(rural_urban))
  )

predictions_2024 <- predict(xgb_optimized, newdata = X_2024)


results_2024 <- df %>%
  filter(year == 2024) %>%
  select(fips, county.x, percent_food_insecure) %>%  
  mutate(
    pred_2024_food_insecure = predictions_2024,
    predicted_change = pred_2024_food_insecure - percent_food_insecure,
    risk_category = case_when(
      pred_2024_food_insecure > quantile(predictions_2024, 0.75) ~ "High Risk",
      pred_2024_food_insecure > quantile(predictions_2024, 0.5) ~ "Medium Risk",
      TRUE ~ "Low Risk"
    )
  ) %>%
  arrange(desc(pred_2024_food_insecure))


write.csv(
  results_2024,
  file = paste0("food_insecurity_predictions_", format(Sys.Date(), "%Y%m%d"), ".csv"),
  row.names = FALSE
)


library(ggplot2)
results_2024 %>%
  head(10) %>%
  ggplot(aes(x = reorder(county.x, -pred_2024_food_insecure), y = pred_2024_food_insecure)) +
  geom_col(fill = "firebrick") +
  labs(title = "Top 10 High-Risk Counties for 2024 Food Insecurity",
       x = "County",
       y = "Predicted Food Insecurity Rate") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

* The final values used for the model were nrounds = 300, max_depth = 4, eta = 0.05, gamma = 0, colsample_bytree = 0.8, min_child_weight = 2 and subsample = 0.8.

library(caret)
library(Metrics)

test_predictions <- predict(xgb_optimized, newdata = X_test)
actual_values <- y_test 

metrics_table <- data.frame(
  RMSE = RMSE(test_predictions, actual_values),
  MAPE = mape(actual_values, test_predictions) * 100,
  MSE = mean((test_predictions - actual_values)^2),
  R2 = R2(test_predictions, actual_values)
)

print(metrics_table)
##       RMSE     MAPE      MSE        R2
## 1 1.051221 10.54083 1.105066 0.9227965

Geospacial graphing for the best performing XGBoost model:

# SF and Tigris are used to call in geospacial data for mapping purposes
library(sf)
## Warning: package 'sf' was built under R version 4.3.3
## Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
# Get counties in native projection (no transformation)
ny_counties <- counties(state = "NY", cb = TRUE, year = 2021, class = "sf") %>% 
  select(fips = GEOID, geometry)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |                                                                      |   1%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |=======                                                               |  11%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |=========                                                             |  14%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |============                                                          |  18%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |==============                                                        |  21%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |================                                                      |  24%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |===================                                                   |  28%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |=====================                                                 |  31%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |==========================                                            |  38%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |============================                                          |  41%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |=================================                                     |  48%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |===================================                                   |  51%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |=====================================                                 |  54%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |========================================                              |  58%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |==========================================                            |  61%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |============================================                          |  64%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |=============================================                         |  65%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |===============================================                       |  68%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |=================================================                     |  71%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |===================================================                   |  74%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=====================================================                 |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |======================================================                |  78%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |========================================================              |  81%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |==========================================================            |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |=================================================================     |  94%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |===================================================================   |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |====================================================================  |  98%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |===================================================================== |  99%
  |                                                                            
  |======================================================================| 100%
# Filtering out the NY Total Row
results_2024_clean <- results_2024 %>%
  filter(fips != 36000) %>%
  mutate(fips = as.character(fips))

# Merge and plot the predictions with sf and tigris mapping data
ny_counties %>%
  left_join(results_2024_clean, by = "fips") %>%
  drop_na(pred_2024_food_insecure) %>%
  ggplot() +
  geom_sf(aes(fill = pred_2024_food_insecure), color = NA) +
  scale_fill_viridis_c("Food Insecurity (%)", option = "inferno", direction = -1) +
  labs(title = "New York State 2025 Predicted Food Insecurity by County") +
  theme_void()

ny_map <- ny_counties %>%
  left_join(results_2024_clean, by = "fips") %>%
  drop_na(pred_2024_food_insecure) %>%
  ggplot() +
  geom_sf(aes(fill = pred_2024_food_insecure), color = NA) +
  scale_fill_viridis_c(
    "Food Insecurity (%)", 
    option = "inferno", 
    direction = -1,
    limits = c(0, max(results_2024_clean$pred_2024_food_insecure))
  ) +
  labs(
    title = "New York State 2024 Predicted Food Insecurity by County",
    caption = "Data: County Health Rankings"
  ) +
  theme_void() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    legend.position = "right",
    legend.title = element_text(size = 10)
  )


ggsave(
  filename = "ny_food_insecurity_2025.png",
  plot = ny_map,
  device = "png",
  width = 10,   
  height = 8,       
  units = "in", 
  dpi = 300,        
  bg = "white"      
)

Variable Importance for the best perfoming model:

library(caret)

perm_imp <- varImp(
  xgb_optimized,
  scale = TRUE,  
  type = "permutation", 
  numPermutations = 30  
)

ggplot(perm_imp, aes(x = reorder(rownames(perm_imp), Overall), y = Overall)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(x = "Features", y = "Importance (Permutation)", 
       title = "Permutation Importance") +
  theme_minimal()
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

perm_imp_plot <- ggplot(perm_imp, aes(x = reorder(rownames(perm_imp), Overall), y = Overall)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(x = "Features", y = "Importance (Permutation)", 
       title = "Permutation Importance (XGBoost Experiment 3)") +
  theme_minimal()
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
ggsave(
  filename = "permutation_importance.png",
  plot = perm_imp_plot,
  device = "png",
  width = 8,         
  height = 6,         
  units = "in",       
  dpi = 300,          
  bg = "white"        
)