library(tidyverse)
library(readxl)
library(janitor)
library(psych)
# Taking out only the targeted variables needed for the analysis as there were much more availible in the health outcomes datasets.
columns_to_extract <- c("year","fips", "state", "county", "percent_food_insecure","percent_frequent_mental_distress",
"percent_uninsured_children", "percent_disconnected_youth", "spending_per_pupil","school_funding_adequacy", "high_school_graduation_rate", "median_household_income", "gender_pay_gap", "percent_enrolled_in_free_or_reduced_lunch", "percent_household_income_required_for_child_care_expenses", "percent_households_with_severe_cost_burden", "percent_rural", "percent_65_and_over", "percent_black","percent_not_proficient_in_english", "segregation_index", "percent_disconnected_youth", "food_environment_index", "teen_birth_rate", "percent_fair_or_poor_health", "percent_unemployed", "percent_children_in_single_parent_households", "percent_children_in_poverty", "percent_severe_housing_problems", "percent_completed_high_school","percent_completed_high_school", "percent_low_birthweight")
main_25 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_25.csv"
supp_25 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_25.csv"
main_24 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_24.csv"
supp_24 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_24.csv"
main_23 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_23.csv"
supp_23 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_23.csv"
main_22 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_22.csv"
supp_22 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_22.csv"
main_21 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_21.csv"
supp_21 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_21.csv"
main_20 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/main_20.csv"
supp_20 <- "https://raw.githubusercontent.com/jonburns2454/DATA_698/refs/heads/main/DATA/supp_20.csv"
clean_and_select <- function(file_path, columns_to_extract) {
data <- read.csv(file_path, check.names = FALSE) %>%
janitor::clean_names() %>%
select(any_of(columns_to_extract))
return(data)
}
m_25 <- clean_and_select(main_25, columns_to_extract)
s_25 <- clean_and_select(supp_25, columns_to_extract)
m_24 <- clean_and_select(main_24, columns_to_extract)
s_24 <- clean_and_select(supp_24, columns_to_extract)
m_23 <- clean_and_select(main_23, columns_to_extract)
s_23 <- clean_and_select(supp_23, columns_to_extract)
m_22 <- clean_and_select(main_22, columns_to_extract)
s_22 <- clean_and_select(supp_22, columns_to_extract)
m_21 <- clean_and_select(main_21, columns_to_extract)
s_21 <- clean_and_select(supp_21, columns_to_extract)
m_20 <- clean_and_select(main_20, columns_to_extract)
s_20 <- clean_and_select(supp_20, columns_to_extract)
f_25 <- m_25 %>%
left_join(s_25, by = c("year", "fips")) %>%
select(-state.y, -county.y)
f_24 <- m_24 %>%
left_join(s_24, by = c("year", "fips")) %>%
select(-state.y, -county.y)
f_23 <- m_23 %>%
left_join(s_23, by = c("year", "fips")) %>%
select(-state.y, -county.y)
f_22 <- m_22 %>%
left_join(s_22, by = c("year", "fips")) %>%
select(-state.y, -county.y)
f_21 <- m_21 %>%
left_join(s_21, by = c("year", "fips")) %>%
select(-state.y, -county.y)
f_20 <- m_20 %>%
left_join(s_20, by = c("year", "fips")) %>%
select(-state.y, -county.y)
final <- bind_rows(f_25, f_24, f_23, f_22, f_21, f_20)
final_data_clean <- final %>%
select(where(~ !any(is.na(.))))
final <- final %>%
mutate(
rural_urban = case_when(
percent_rural == 100 ~ "Completely Rural",
percent_rural >= 50 & percent_rural < 100 ~ "Mostly Rural",
percent_rural < 50 ~ "Mostly Urban",
TRUE ~ NA_character_
)
)
total_rows <- nrow(final)
missing_counts <- colSums(is.na(final))
missing_percentage <- (missing_counts / total_rows) * 100
missing_table <- data.frame(
Column = names(missing_counts),
MissingCount = missing_counts,
MissingPercentage = round(missing_percentage, 2)
)
missing_table <- missing_table[missing_table$MissingCount > 0, ]
print(missing_table)
## Column
## percent_household_income_required_for_child_care_expenses percent_household_income_required_for_child_care_expenses
## percent_completed_high_school percent_completed_high_school
## percent_uninsured_children percent_uninsured_children
## percent_disconnected_youth percent_disconnected_youth
## spending_per_pupil spending_per_pupil
## school_funding_adequacy school_funding_adequacy
## high_school_graduation_rate high_school_graduation_rate
## gender_pay_gap gender_pay_gap
## percent_households_with_severe_cost_burden percent_households_with_severe_cost_burden
## segregation_index segregation_index
## teen_birth_rate teen_birth_rate
## percent_children_in_single_parent_households percent_children_in_single_parent_households
## percent_low_birthweight percent_low_birthweight
## percent_black percent_black
## percent_children_in_single_parent_households.x percent_children_in_single_parent_households.x
## percent_children_in_single_parent_households.y percent_children_in_single_parent_households.y
## MissingCount
## percent_household_income_required_for_child_care_expenses 189
## percent_completed_high_school 63
## percent_uninsured_children 189
## percent_disconnected_youth 59
## spending_per_pupil 143
## school_funding_adequacy 202
## high_school_graduation_rate 7
## gender_pay_gap 126
## percent_households_with_severe_cost_burden 189
## segregation_index 16
## teen_birth_rate 5
## percent_children_in_single_parent_households 189
## percent_low_birthweight 64
## percent_black 63
## percent_children_in_single_parent_households.x 189
## percent_children_in_single_parent_households.y 189
## MissingPercentage
## percent_household_income_required_for_child_care_expenses 50.00
## percent_completed_high_school 16.67
## percent_uninsured_children 50.00
## percent_disconnected_youth 15.61
## spending_per_pupil 37.83
## school_funding_adequacy 53.44
## high_school_graduation_rate 1.85
## gender_pay_gap 33.33
## percent_households_with_severe_cost_burden 50.00
## segregation_index 4.23
## teen_birth_rate 1.32
## percent_children_in_single_parent_households 50.00
## percent_low_birthweight 16.93
## percent_black 16.67
## percent_children_in_single_parent_households.x 50.00
## percent_children_in_single_parent_households.y 50.00
describe(final, fast = T)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean
## year 1 378 2022.50
## fips 2 378 36061.02
## state.x 3 378 NaN
## county.x 4 378 NaN
## percent_household_income_required_for_child_care_expenses 5 189 36.26
## food_environment_index 6 378 8.28
## percent_fair_or_poor_health 7 378 15.28
## percent_unemployed 8 378 4.98
## percent_children_in_poverty 9 378 17.60
## percent_severe_housing_problems 10 378 16.04
## percent_completed_high_school 11 315 89.60
## percent_food_insecure 12 378 11.16
## percent_frequent_mental_distress 13 378 14.85
## percent_uninsured_children 14 189 2.90
## percent_disconnected_youth 15 319 7.76
## spending_per_pupil 16 235 24725.92
## school_funding_adequacy 17 176 12738.98
## high_school_graduation_rate 18 371 85.94
## median_household_income 19 378 67423.84
## gender_pay_gap 20 252 0.84
## percent_enrolled_in_free_or_reduced_lunch 21 378 48.38
## percent_households_with_severe_cost_burden 22 189 14.04
## percent_rural 23 378 45.01
## percent_65_and_over 24 378 19.72
## percent_not_proficient_in_english 25 378 2.23
## segregation_index 26 362 18.10
## teen_birth_rate 27 373 14.47
## percent_children_in_single_parent_households 28 189 22.25
## percent_low_birthweight 29 314 7.29
## percent_black 30 315 6.13
## percent_children_in_single_parent_households.x 31 189 22.71
## percent_children_in_single_parent_households.y 32 189 22.35
## rural_urban 33 378 NaN
## sd min
## year 1.71 2020.00
## fips 36.39 36000.00
## state.x NA Inf
## county.x NA Inf
## percent_household_income_required_for_child_care_expenses 6.19 24.00
## food_environment_index 0.53 6.20
## percent_fair_or_poor_health 2.85 9.00
## percent_unemployed 1.92 2.70
## percent_children_in_poverty 5.14 6.00
## percent_severe_housing_problems 5.60 8.00
## percent_completed_high_school 3.32 73.00
## percent_food_insecure 2.25 4.00
## percent_frequent_mental_distress 2.08 10.00
## percent_uninsured_children 1.05 2.00
## percent_disconnected_youth 3.94 1.00
## spending_per_pupil 4491.46 18229.00
## school_funding_adequacy 4128.12 958.00
## high_school_graduation_rate 4.63 67.00
## median_household_income 16548.19 38566.00
## gender_pay_gap 0.05 0.69
## percent_enrolled_in_free_or_reduced_lunch 10.23 22.00
## percent_households_with_severe_cost_burden 4.40 7.00
## percent_rural 28.81 0.00
## percent_65_and_over 3.11 12.80
## percent_not_proficient_in_english 3.27 0.00
## segregation_index 27.28 0.03
## teen_birth_rate 6.24 3.00
## percent_children_in_single_parent_households 5.89 12.00
## percent_low_birthweight 0.95 4.00
## percent_black 6.06 0.60
## percent_children_in_single_parent_households.x 6.16 10.00
## percent_children_in_single_parent_households.y 5.88 12.00
## rural_urban NA Inf
## max range
## year 2025.00 5.00
## fips 36123.00 123.00
## state.x -Inf -Inf
## county.x -Inf -Inf
## percent_household_income_required_for_child_care_expenses 80.00 56.00
## food_environment_index 9.90 3.70
## percent_fair_or_poor_health 30.00 21.00
## percent_unemployed 16.00 13.30
## percent_children_in_poverty 38.00 32.00
## percent_severe_housing_problems 39.00 31.00
## percent_completed_high_school 96.00 23.00
## percent_food_insecure 20.00 16.00
## percent_frequent_mental_distress 20.00 10.00
## percent_uninsured_children 10.00 8.00
## percent_disconnected_youth 27.00 26.00
## spending_per_pupil 50849.00 32620.00
## school_funding_adequacy 34981.00 34023.00
## high_school_graduation_rate 96.00 29.00
## median_household_income 140466.00 101900.00
## gender_pay_gap 0.99 0.30
## percent_enrolled_in_free_or_reduced_lunch 88.00 66.00
## percent_households_with_severe_cost_burden 31.00 24.00
## percent_rural 100.00 100.00
## percent_65_and_over 34.80 22.00
## percent_not_proficient_in_english 16.00 16.00
## segregation_index 81.00 80.97
## teen_birth_rate 36.00 33.00
## percent_children_in_single_parent_households 52.00 40.00
## percent_low_birthweight 10.00 6.00
## percent_black 29.90 29.30
## percent_children_in_single_parent_households.x 51.00 41.00
## percent_children_in_single_parent_households.y 52.00 40.00
## rural_urban -Inf -Inf
## se
## year 0.09
## fips 1.87
## state.x NA
## county.x NA
## percent_household_income_required_for_child_care_expenses 0.45
## food_environment_index 0.03
## percent_fair_or_poor_health 0.15
## percent_unemployed 0.10
## percent_children_in_poverty 0.26
## percent_severe_housing_problems 0.29
## percent_completed_high_school 0.19
## percent_food_insecure 0.12
## percent_frequent_mental_distress 0.11
## percent_uninsured_children 0.08
## percent_disconnected_youth 0.22
## spending_per_pupil 292.99
## school_funding_adequacy 311.17
## high_school_graduation_rate 0.24
## median_household_income 851.15
## gender_pay_gap 0.00
## percent_enrolled_in_free_or_reduced_lunch 0.53
## percent_households_with_severe_cost_burden 0.32
## percent_rural 1.48
## percent_65_and_over 0.16
## percent_not_proficient_in_english 0.17
## segregation_index 1.43
## teen_birth_rate 0.32
## percent_children_in_single_parent_households 0.43
## percent_low_birthweight 0.05
## percent_black 0.34
## percent_children_in_single_parent_households.x 0.45
## percent_children_in_single_parent_households.y 0.43
## rural_urban NA
ggplot(missing_table, aes(x = reorder(Column, -MissingPercentage), y = MissingPercentage)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(
title = "Percentage of Missing Data by Variable",
x = "Variable",
y = "Missingness %"
) +
theme_minimal()
##Numeric Distribution
library(ggplot2)
numeric_columns <- sapply(final, is.numeric)
final_numeric <- final[, numeric_columns]
for (col in names(final_numeric)) {
print(
ggplot(final, aes_string(x = col)) +
geom_histogram(fill = "steelblue", bins = 30, color = "black") +
labs(title = paste("Distribution of", col), x = col, y = "Frequency") +
theme_minimal()
)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 63 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 59 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 143 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 202 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 126 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 64 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 63 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 189 rows containing non-finite outside the scale range
## (`stat_bin()`).
library(dplyr)
yearly_summary <- final %>%
group_by(year) %>%
summarise(across(where(is.numeric), list(mean = mean, median = median, sd = sd), na.rm = TRUE))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(...)`.
## ℹ In group 1: `year = 2020`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
print(yearly_summary)
## # A tibble: 6 × 88
## year fips_mean fips_median fips_sd percent_household_income_required_for_ch…¹
## <int> <dbl> <int> <dbl> <dbl>
## 1 2020 36061. 36061 36.6 NaN
## 2 2021 36061. 36061 36.6 NaN
## 3 2022 36061. 36061 36.6 NaN
## 4 2023 36061. 36061 36.6 32.2
## 5 2024 36061. 36061 36.6 39.0
## 6 2025 36061. 36061 36.6 37.6
## # ℹ abbreviated name:
## # ¹​percent_household_income_required_for_child_care_expenses_mean
## # ℹ 83 more variables:
## # percent_household_income_required_for_child_care_expenses_median <int>,
## # percent_household_income_required_for_child_care_expenses_sd <dbl>,
## # food_environment_index_mean <dbl>, food_environment_index_median <dbl>,
## # food_environment_index_sd <dbl>, percent_fair_or_poor_health_mean <dbl>, …
region_summary <- final %>%
group_by(county.x, year) %>%
summarise(across(where(is.numeric), list(mean = mean, median = median, sd = sd), na.rm = TRUE))
## `summarise()` has grouped output by 'county.x'. You can override using the
## `.groups` argument.
print(region_summary)
## # A tibble: 378 × 89
## # Groups: county.x [63]
## county.x year fips_mean fips_median fips_sd percent_household_income_requi…¹
## <chr> <int> <dbl> <int> <dbl> <dbl>
## 1 Albany 2020 36001 36001 NA NaN
## 2 Albany 2021 36001 36001 NA NaN
## 3 Albany 2022 36001 36001 NA NaN
## 4 Albany 2023 36001 36001 NA 35
## 5 Albany 2024 36001 36001 NA 41
## 6 Albany 2025 36001 36001 NA 37
## 7 Allegany 2020 36003 36003 NA NaN
## 8 Allegany 2021 36003 36003 NA NaN
## 9 Allegany 2022 36003 36003 NA NaN
## 10 Allegany 2023 36003 36003 NA 34
## # ℹ 368 more rows
## # ℹ abbreviated name:
## # ¹​percent_household_income_required_for_child_care_expenses_mean
## # ℹ 83 more variables:
## # percent_household_income_required_for_child_care_expenses_median <int>,
## # percent_household_income_required_for_child_care_expenses_sd <dbl>,
## # food_environment_index_mean <dbl>, food_environment_index_median <dbl>, …
library(ggplot2)
ggplot(region_summary, aes(x = year, y = percent_food_insecure_mean, color = county.x)) +
geom_line() +
labs(title = "Yearly Food Insecurity Across New York State Counties", x = "Year", y = "Food Insecurity") +
theme_minimal()
ggplot(region_summary, aes(x = year, y = percent_food_insecure_mean)) +
geom_line() +
labs(title = "Yearly Average Food Insecurity Across New York State", x = "Year", y = "Value") +
theme_minimal()
ggplot(final, aes(x = county.x, y = percent_food_insecure, color = rural_urban)) +
geom_boxplot(fill = "steelblue") +
labs(title = "County Level Comparison of Food Insecurity In New York State", x = "County", y = "Food Insecurity Percentage") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 7))
final_csv <- write_csv(final, "C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2025\\DATA 698\\Masters Project\\final_data.csv")
library(tidyverse)
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.3.3
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
data <- read.csv("C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2025\\DATA 698\\Masters Project\\final_data.csv")
df <- data %>%
select(year, fips, county.x, percent_food_insecure,
percent_unemployed, percent_children_in_poverty,
median_household_income, percent_fair_or_poor_health,
high_school_graduation_rate, rural_urban
) %>%
drop_na(percent_food_insecure) #just making sure there are no na's left in for this XGBoost model
df <- df %>%
arrange(fips, year) %>%
group_by(fips) %>%
mutate(
food_insecure_lag1 = lag(percent_food_insecure, 1), # Previous year
food_insecure_lag2 = lag(percent_food_insecure, 2) # Two years back
) %>%
ungroup()
# recode rural_urban as numeric
df <- df %>%
mutate(rural_urban = as.factor(rural_urban),
rural_urban = as.numeric(rural_urban))
train <- df %>% filter(year < 2024)
test <- df %>% filter(year == 2024)
# dropping out the non-important features, including the dependent variable
X_train <- train %>% select(-c(year, fips, county.x, percent_food_insecure))
y_train <- train$percent_food_insecure
X_test <- test %>% select(-c(year, fips, county.x, percent_food_insecure))
y_test <- test$percent_food_insecure
# cross val
ctrl <- trainControl(
method = "cv",
number = 10,
verboseIter = TRUE
)
# Standard xgboost tuning grid
xgb_grid <- expand.grid(
nrounds = 100,
max_depth = 6,
eta = 0.3,
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1
)
xgb_model <- train(
x = X_train,
y = y_train,
method = "xgbTree",
trControl = ctrl,
tuneGrid = xgb_grid,
verbosity = 0
)
## + Fold01: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold01: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## + Fold02: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold02: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## + Fold03: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold03: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## + Fold04: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold04: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## + Fold05: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold05: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## + Fold06: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold06: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## + Fold07: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold07: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## + Fold08: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold08: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## + Fold09: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold09: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## + Fold10: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Warning: Setting row names on a tibble is deprecated.
## - Fold10: nrounds=100, max_depth=6, eta=0.3, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1
## Aggregating results
## Fitting final model on full training set
## Warning: Setting row names on a tibble is deprecated.
print(xgb_model)
## eXtreme Gradient Boosting
##
## 252 samples
## 8 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 226, 227, 228, 227, 228, 226, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.9396357 0.782687 0.7290635
##
## Tuning parameter 'nrounds' was held constant at a value of 100
## Tuning
## held constant at a value of 1
## Tuning parameter 'subsample' was held
## constant at a value of 1
# predicting 2024 data
X_2024 <- df %>%
filter(year == 2024) %>%
select(-c(year, fips, county.x, percent_food_insecure))
# Predicting 2024 food insecurity
predictions_2024 <- predict(xgb_model, newdata = X_2024)
# bringing in county and fips codes
results <- df %>%
filter(year == 2024) %>%
select(fips, county.x) %>%
mutate(pred_2024_food_insecure = predictions_2024)
# View predictions
print(results)
## # A tibble: 63 × 3
## fips county.x pred_2024_food_insecure
## <int> <chr> <dbl>
## 1 36000 Total 10.0
## 2 36001 Albany 9.34
## 3 36003 Allegany 13.1
## 4 36005 Bronx 17.9
## 5 36007 Broome 12.3
## 6 36009 Cattaraugus 12.7
## 7 36011 Cayuga 10.5
## 8 36013 Chautauqua 13.2
## 9 36015 Chemung 11.9
## 10 36017 Chenango 10.6
## # ℹ 53 more rows
# Predict 2024 (for validPrediPredir)
pred_2024 <- predict(xgb_model, newdata = X_test)
# Calculate RMSE
rmse <- sqrt(mean((pred_2024 - y_test)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 1.07624761613868"
ggplot(data.frame(Actual = y_test, Predicted = pred_2024), aes(Actual, Predicted)) +
geom_point() +
geom_abline(slope = 1, intercept = 0, color = "red") +
labs(title = "Actual vs. Predicted Food Insecurity (2024)")
library(caret)
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.3.3
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
test_predictions <- predict(xgb_model, newdata = X_test)
actual_values <- y_test
metrics_table <- data.frame(
RMSE = RMSE(test_predictions, actual_values),
MAPE = mape(actual_values, test_predictions) * 100,
MSE = mean((test_predictions - actual_values)^2),
R2 = R2(test_predictions, actual_values)
)
print(metrics_table)
## RMSE MAPE MSE R2
## 1 1.076248 10.3129 1.158309 0.8556726
library(tidyverse)
library(xgboost)
library(caret)
data <- read.csv("C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2025\\DATA 698\\Masters Project\\final_data.csv")
df <- data %>%
select(where(~ all(!is.na(.)))) %>%
drop_na(percent_food_insecure)
df <- df %>%
arrange(fips, year) %>%
group_by(fips) %>%
mutate(
food_insecure_lag1 = lag(percent_food_insecure, 1), # Previous year
food_insecure_lag2 = lag(percent_food_insecure, 2) # Two years back
) %>%
select(-food_environment_index) %>%
ungroup()
df <- df %>%
mutate(rural_urban = as.factor(rural_urban),
rural_urban = as.numeric(rural_urban))
train <- df %>% filter(year < 2024)
test <- df %>% filter(year == 2024)
X_train <- train %>% select(-c(year, fips, county.x, percent_food_insecure, state.x))
y_train <- train$percent_food_insecure
X_test <- test %>% select(-c(year, fips, county.x, percent_food_insecure, state.x))
y_test <- test$percent_food_insecure
ctrl_fast <- trainControl(
method = "cv",
number = 5, # Reduced from 10 folds
verboseIter = TRUE,
allowParallel = TRUE # Enable parallel processing
)
# Tuning grid set
xgb_grid_fast <- expand.grid(
nrounds = c(100, 150, 200, 300), # Reduced options
max_depth = c(4, 6, 8), # Most useful range
eta = c(0.05, 0.1, .15), # Optimal learning rates
gamma = 0,
colsample_bytree = 0.8,
min_child_weight = 1,
subsample = 0.8
)
# Parallel processing to speed things up
library(doParallel)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: parallel
cl <- makePSOCKcluster(4)
registerDoParallel(cl)
xgb_fast <- train(
x = X_train,
y = y_train,
method = "xgbTree",
trControl = ctrl_fast,
tuneGrid = xgb_grid_fast,
verbosity = 0,
metric = "RMSE"
)
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 300, max_depth = 4, eta = 0.05, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1, subsample = 0.8 on full training set
stopCluster(cl)
print(xgb_fast)
## eXtreme Gradient Boosting
##
## 252 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 201, 202, 201, 202, 202
## Resampling results across tuning parameters:
##
## eta max_depth nrounds RMSE Rsquared MAE
## 0.05 4 100 0.8781404 0.8355296 0.6707104
## 0.05 4 150 0.8540096 0.8402291 0.6462976
## 0.05 4 200 0.8453597 0.8426345 0.6345746
## 0.05 4 300 0.8451813 0.8424344 0.6312022
## 0.05 6 100 0.8923034 0.8322891 0.6678132
## 0.05 6 150 0.8754636 0.8340372 0.6504066
## 0.05 6 200 0.8773633 0.8331453 0.6492893
## 0.05 6 300 0.8772838 0.8334252 0.6496573
## 0.05 8 100 0.8974352 0.8291845 0.6731898
## 0.05 8 150 0.8816944 0.8291804 0.6630502
## 0.05 8 200 0.8796597 0.8293649 0.6624847
## 0.05 8 300 0.8796704 0.8292728 0.6624413
## 0.10 4 100 0.8648257 0.8364921 0.6439529
## 0.10 4 150 0.8642269 0.8362099 0.6417462
## 0.10 4 200 0.8645774 0.8362313 0.6412801
## 0.10 4 300 0.8656470 0.8356974 0.6410426
## 0.10 6 100 0.8825171 0.8286300 0.6591104
## 0.10 6 150 0.8806580 0.8292312 0.6567807
## 0.10 6 200 0.8797769 0.8296094 0.6567192
## 0.10 6 300 0.8796946 0.8296352 0.6566413
## 0.10 8 100 0.8979305 0.8253533 0.6732380
## 0.10 8 150 0.8972889 0.8256395 0.6722391
## 0.10 8 200 0.8970448 0.8257292 0.6718342
## 0.10 8 300 0.8970231 0.8257341 0.6718000
## 0.15 4 100 0.8607434 0.8359365 0.6525811
## 0.15 4 150 0.8637856 0.8354407 0.6524942
## 0.15 4 200 0.8638682 0.8354469 0.6534405
## 0.15 4 300 0.8643632 0.8352971 0.6540152
## 0.15 6 100 0.8865801 0.8275857 0.6848609
## 0.15 6 150 0.8874475 0.8272966 0.6852978
## 0.15 6 200 0.8874683 0.8272873 0.6853752
## 0.15 6 300 0.8874747 0.8272835 0.6853918
## 0.15 8 100 0.8980507 0.8237776 0.6916497
## 0.15 8 150 0.8981906 0.8237612 0.6917727
## 0.15 8 200 0.8981609 0.8237745 0.6917438
## 0.15 8 300 0.8981608 0.8237731 0.6917376
##
## Tuning parameter 'gamma' was held constant at a value of 0
## Tuning
##
## Tuning parameter 'min_child_weight' was held constant at a value of 1
##
## Tuning parameter 'subsample' was held constant at a value of 0.8
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 300, max_depth = 4, eta
## = 0.05, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1 and
## subsample = 0.8.
plot(xgb_fast)
library(doParallel)
library(Metrics)
cl <- makePSOCKcluster(4)
registerDoParallel(cl)
ctrl_fast <- trainControl(
method = "cv",
number = 5, # Reduced from 10 folds
verboseIter = TRUE,
allowParallel = TRUE, # Enable parallel processing
savePredictions = "final"
)
xgb_grid_fast <- expand.grid(
nrounds = c(100, 150, 200, 300),
max_depth = c(4, 6, 8),
eta = c(0.05, 0.1, 0.15),
gamma = 0,
colsample_bytree = 0.8,
min_child_weight = 1,
subsample = 0.8
)
# Train the model
xgb_optimized <- train(
x = X_train,
y = y_train,
method = "xgbTree",
trControl = ctrl_fast,
tuneGrid = xgb_grid_fast,
verbosity = 0,
metric = "RMSE"
)
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 100, max_depth = 6, eta = 0.15, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1, subsample = 0.8 on full training set
## Warning: Setting row names on a tibble is deprecated.
stopCluster(cl)
print(xgb_optimized)
## eXtreme Gradient Boosting
##
## 252 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 200, 201, 202, 203, 202
## Resampling results across tuning parameters:
##
## eta max_depth nrounds RMSE Rsquared MAE
## 0.05 4 100 0.9011120 0.8215231 0.6883451
## 0.05 4 150 0.8883722 0.8238125 0.6832788
## 0.05 4 200 0.8871622 0.8248628 0.6815495
## 0.05 4 300 0.8860341 0.8259349 0.6812098
## 0.05 6 100 0.8703363 0.8340119 0.6657812
## 0.05 6 150 0.8621970 0.8342835 0.6608138
## 0.05 6 200 0.8620012 0.8347651 0.6608094
## 0.05 6 300 0.8631944 0.8346652 0.6618495
## 0.05 8 100 0.9131806 0.8173697 0.7086733
## 0.05 8 150 0.9046190 0.8175442 0.7055069
## 0.05 8 200 0.9046224 0.8174260 0.7065193
## 0.05 8 300 0.9061115 0.8170087 0.7079110
## 0.10 4 100 0.8734851 0.8303399 0.6738947
## 0.10 4 150 0.8790684 0.8286507 0.6803869
## 0.10 4 200 0.8830418 0.8275311 0.6840421
## 0.10 4 300 0.8849291 0.8270416 0.6873362
## 0.10 6 100 0.9129718 0.8158589 0.7000536
## 0.10 6 150 0.9154483 0.8151201 0.7024064
## 0.10 6 200 0.9157415 0.8151300 0.7026213
## 0.10 6 300 0.9159481 0.8151109 0.7027467
## 0.10 8 100 0.8852688 0.8266064 0.6896946
## 0.10 8 150 0.8863866 0.8262701 0.6895852
## 0.10 8 200 0.8867005 0.8261605 0.6898820
## 0.10 8 300 0.8868083 0.8261222 0.6900218
## 0.15 4 100 0.9059950 0.8168906 0.7002083
## 0.15 4 150 0.9104842 0.8148878 0.7046985
## 0.15 4 200 0.9115235 0.8147664 0.7060750
## 0.15 4 300 0.9118689 0.8147551 0.7064888
## 0.15 6 100 0.8571253 0.8368995 0.6665441
## 0.15 6 150 0.8575728 0.8368456 0.6669733
## 0.15 6 200 0.8577060 0.8367976 0.6672284
## 0.15 6 300 0.8577302 0.8367929 0.6672692
## 0.15 8 100 0.8979386 0.8207182 0.7209347
## 0.15 8 150 0.8978211 0.8207909 0.7207613
## 0.15 8 200 0.8978151 0.8207947 0.7207717
## 0.15 8 300 0.8978187 0.8207937 0.7207718
##
## Tuning parameter 'gamma' was held constant at a value of 0
## Tuning
##
## Tuning parameter 'min_child_weight' was held constant at a value of 1
##
## Tuning parameter 'subsample' was held constant at a value of 0.8
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 100, max_depth = 6, eta
## = 0.15, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1 and
## subsample = 0.8.
plot(xgb_optimized)
best_params <- xgb_optimized$bestTune
X_2024 <- df %>%
filter(year == 2024) %>%
select(-c(year, fips, county.x, percent_food_insecure)) %>%
mutate(
rural_urban = as.numeric(factor(rural_urban))
)
predictions_2024 <- predict(xgb_optimized, newdata = X_2024)
results_2024 <- df %>%
filter(year == 2024) %>%
select(fips, county.x, percent_food_insecure) %>%
mutate(
pred_2024_food_insecure = predictions_2024,
predicted_change = pred_2024_food_insecure - percent_food_insecure,
risk_category = case_when(
pred_2024_food_insecure > quantile(predictions_2024, 0.75) ~ "High Risk",
pred_2024_food_insecure > quantile(predictions_2024, 0.5) ~ "Medium Risk",
TRUE ~ "Low Risk"
)
) %>%
arrange(desc(pred_2024_food_insecure))
write.csv(
results_2024,
file = paste0("food_insecurity_predictions_", format(Sys.Date(), "%Y%m%d"), ".csv"),
row.names = FALSE
)
library(ggplot2)
results_2024 %>%
head(10) %>%
ggplot(aes(x = reorder(county.x, -pred_2024_food_insecure), y = pred_2024_food_insecure)) +
geom_col(fill = "firebrick") +
labs(title = "Top 10 High-Risk Counties for 2024 Food Insecurity",
x = "County",
y = "Predicted Food Insecurity Rate") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(caret)
library(Metrics)
test_predictions <- predict(xgb_optimized, newdata = X_test)
actual_values <- y_test
metrics_table <- data.frame(
RMSE = RMSE(test_predictions, actual_values),
MAPE = mape(actual_values, test_predictions) * 100,
MSE = mean((test_predictions - actual_values)^2),
R2 = R2(test_predictions, actual_values)
)
print(metrics_table)
## RMSE MAPE MSE R2
## 1 1.098412 10.7249 1.206509 0.8872072
library(caret)
perm_imp <- varImp(
xgb_optimized,
scale = TRUE,
type = "permutation",
numPermutations = 30
)
ggplot(perm_imp, aes(x = reorder(rownames(perm_imp), Overall), y = Overall)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(x = "Features", y = "Importance (Permutation)",
title = "Permutation Importance") +
theme_minimal()
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
perm_imp_plot <- ggplot(perm_imp, aes(x = reorder(rownames(perm_imp), Overall), y = Overall)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(x = "Features", y = "Importance (Permutation)",
title = "Permutation Importance (XGBoost Experiment 3)") +
theme_minimal()
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
ggsave(
filename = "permutation_importance.png",
plot = perm_imp_plot,
device = "png",
width = 8,
height = 6,
units = "in",
dpi = 300,
bg = "white"
)
This XGBoost model will attempt to do four things: * Make sure the lag variables are actually backward-looking by adjusting how they are set. * Address overfitting by increasing regularization * increase early stopping. * Lastly, add in more interaction terms.
library(tidyverse)
library(mice) # For advanced imputation
## Warning: package 'mice' was built under R version 4.3.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
data <- read.csv("C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2025\\DATA 698\\Masters Project\\final_data.csv")
# This sapply function runs through the data and calculates the missing percentage across all columns
missing_pct <- sapply(data, function(x) mean(is.na(x)) * 100)
# 2. Rules I set (arbitrary) to decide what columns to keep and impute and which ones to drop
cols_to_impute <- names(missing_pct[missing_pct <= 25 & missing_pct > 0])
cols_to_drop <- names(missing_pct[missing_pct > 25])
# 3. Drop columns with 25% or more missingness
data_clean <- data %>%
select(-all_of(cols_to_drop))
# 4.
# Numeric columns: median imputation
num_cols <- data_clean %>%
select(where(is.numeric)) %>%
names() %>%
intersect(cols_to_impute)
data_clean <- data_clean %>%
mutate(across(all_of(num_cols), ~ifelse(is.na(.), median(., na.rm = TRUE), .)))
# Categorical Columns: mode imputation
cat_cols <- data_clean %>%
select(where(is.character) | where(is.factor)) %>%
names() %>%
intersect(cols_to_impute)
get_mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
data_clean <- data_clean %>%
mutate(across(all_of(cat_cols), ~ifelse(is.na(.), get_mode(.), .)))
# 5. Drop rows with >25% missingness in remaining columns
row_missing_pct <- apply(data_clean, 1, function(x) mean(is.na(x)) * 100)
data_final <- data_clean[row_missing_pct <= 25, ]
df <- data_final %>%
arrange(fips, year) %>%
group_by(fips) %>%
mutate(
food_insecure_lag1 = lag(percent_food_insecure, 1),
food_insecure_lag2 = lag(percent_food_insecure, 2)
) %>%
select(-food_environment_index) %>%
ungroup()
df <- df %>%
mutate(rural_urban = as.factor(rural_urban),
rural_urban = as.numeric(rural_urban),
economic_stress = (
percent_unemployed +
percent_children_in_poverty +
percent_severe_housing_problems
) / 3)
train <- df %>% filter(year < 2024)
test <- df %>% filter(year == 2024)
X_train <- train %>% select(-c(year, fips, county.x, percent_food_insecure, state.x))
y_train <- train$percent_food_insecure
X_test <- test %>% select(-c(year, fips, county.x, percent_food_insecure, state.x))
y_test <- test$percent_food_insecure
ctrl_fast <- trainControl(
method = "cv",
number = 5, # Reduce from 10 folds
verboseIter = TRUE,
allowParallel = TRUE
)
# Focused tuning grid
xgb_grid_fast <- expand.grid(
nrounds = c(100, 150), # Reduced amount of options for nrounds
max_depth = c(3, 4), # drop down from 6 and 8 tree depth
eta = c(0.01, 0.05), # dropped down from 3 set options to 2
gamma = c(0,1), # 0-1 to add regularization to help overfitting
colsample_bytree = 0.8,
min_child_weight = 2, # Increased from 1 to help prevent overfitting
subsample = 0.8 # Fixed
)
library(doParallel)
cl <- makePSOCKcluster(4)
registerDoParallel(cl)
xgb_fast <- train(
x = X_train,
y = y_train,
method = "xgbTree",
trControl = ctrl_fast,
tuneGrid = xgb_grid_fast,
verbosity = 0,
metric = "RMSE"
)
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 150, max_depth = 3, eta = 0.05, gamma = 0, colsample_bytree = 0.8, min_child_weight = 2, subsample = 0.8 on full training set
stopCluster(cl)
print(xgb_fast)
## eXtreme Gradient Boosting
##
## 252 samples
## 21 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 201, 201, 202, 201, 203
## Resampling results across tuning parameters:
##
## eta max_depth gamma nrounds RMSE Rsquared MAE
## 0.01 3 0 100 4.2379903 0.7198368 3.9998665
## 0.01 3 0 150 2.7371900 0.7785458 2.4712494
## 0.01 3 1 100 4.2328913 0.7198437 3.9916388
## 0.01 3 1 150 2.7387545 0.7705294 2.4669134
## 0.01 4 0 100 4.2374726 0.7277374 3.9988035
## 0.01 4 0 150 2.7326659 0.7846968 2.4708225
## 0.01 4 1 100 4.2310054 0.7229323 3.9910180
## 0.01 4 1 150 2.7352692 0.7767372 2.4674334
## 0.05 3 0 100 0.8615489 0.8491815 0.6658737
## 0.05 3 0 150 0.8560428 0.8509305 0.6553860
## 0.05 3 1 100 0.8801298 0.8424100 0.6831377
## 0.05 3 1 150 0.8777957 0.8415492 0.6771662
## 0.05 4 0 100 0.8782500 0.8442661 0.6767610
## 0.05 4 0 150 0.8705214 0.8452782 0.6631440
## 0.05 4 1 100 0.8682171 0.8465492 0.6743046
## 0.05 4 1 150 0.8657537 0.8461813 0.6723059
##
## Tuning parameter 'colsample_bytree' was held constant at a value of 0.8
##
## Tuning parameter 'min_child_weight' was held constant at a value of 2
##
## Tuning parameter 'subsample' was held constant at a value of 0.8
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 150, max_depth = 3, eta
## = 0.05, gamma = 0, colsample_bytree = 0.8, min_child_weight = 2 and
## subsample = 0.8.
plot(xgb_fast)
library(doParallel)
library(Metrics)
cl <- makePSOCKcluster(4)
registerDoParallel(cl)
# Reduced tuning grid with most impactful parameters
ctrl_fast <- trainControl(
method = "cv",
number = 5, # Reduced from 10 folds
verboseIter = TRUE,
allowParallel = TRUE,
savePredictions = "final"
)
xgb_grid_fast <- expand.grid(
nrounds = c(100, 150, 200, 300),
max_depth = c(4, 6, 8), # Back to 4,6,8 because 3,4 RMSE was not great
eta = c(0.05, 0.1, 0.15),
gamma = c(0,1),
colsample_bytree = 0.8,
min_child_weight = 2,
subsample = 0.8
)
xgb_optimized <- train(
x = X_train,
y = y_train,
method = "xgbTree",
trControl = ctrl_fast,
tuneGrid = xgb_grid_fast,
verbosity = 0,
metric = "RMSE"
)
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 300, max_depth = 6, eta = 0.05, gamma = 1, colsample_bytree = 0.8, min_child_weight = 2, subsample = 0.8 on full training set
## Warning: Setting row names on a tibble is deprecated.
stopCluster(cl)
print(xgb_optimized)
## eXtreme Gradient Boosting
##
## 252 samples
## 21 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 203, 201, 202, 201, 201
## Resampling results across tuning parameters:
##
## eta max_depth gamma nrounds RMSE Rsquared MAE
## 0.05 4 0 100 0.8907448 0.8225203 0.6889024
## 0.05 4 0 150 0.8878283 0.8264637 0.6838817
## 0.05 4 0 200 0.8863493 0.8280422 0.6821354
## 0.05 4 0 300 0.8864141 0.8289281 0.6800962
## 0.05 4 1 100 0.8692329 0.8312319 0.6823221
## 0.05 4 1 150 0.8628717 0.8335070 0.6767554
## 0.05 4 1 200 0.8607236 0.8345670 0.6753222
## 0.05 4 1 300 0.8590330 0.8359100 0.6726605
## 0.05 6 0 100 0.8685797 0.8322320 0.6704659
## 0.05 6 0 150 0.8688771 0.8321535 0.6713056
## 0.05 6 0 200 0.8741929 0.8311496 0.6731855
## 0.05 6 0 300 0.8789708 0.8304619 0.6718609
## 0.05 6 1 100 0.8624194 0.8340334 0.6754635
## 0.05 6 1 150 0.8526616 0.8372510 0.6710308
## 0.05 6 1 200 0.8536788 0.8370835 0.6713847
## 0.05 6 1 300 0.8522603 0.8373855 0.6699777
## 0.05 8 0 100 0.8674734 0.8339552 0.6669022
## 0.05 8 0 150 0.8653034 0.8342388 0.6655097
## 0.05 8 0 200 0.8694120 0.8337656 0.6643586
## 0.05 8 0 300 0.8751872 0.8322398 0.6644406
## 0.05 8 1 100 0.8780920 0.8293627 0.6854085
## 0.05 8 1 150 0.8743516 0.8302680 0.6825640
## 0.05 8 1 200 0.8723100 0.8312949 0.6803162
## 0.05 8 1 300 0.8679497 0.8329429 0.6778601
## 0.10 4 0 100 0.9061672 0.8178920 0.6942083
## 0.10 4 0 150 0.9014654 0.8204167 0.6874965
## 0.10 4 0 200 0.9021623 0.8203703 0.6867485
## 0.10 4 0 300 0.9003716 0.8211243 0.6854194
## 0.10 4 1 100 0.8933117 0.8237149 0.7006024
## 0.10 4 1 150 0.8943248 0.8240572 0.6994523
## 0.10 4 1 200 0.8936999 0.8243502 0.6969232
## 0.10 4 1 300 0.8922653 0.8255438 0.6953046
## 0.10 6 0 100 0.9053462 0.8152095 0.6948502
## 0.10 6 0 150 0.9090872 0.8149734 0.6941957
## 0.10 6 0 200 0.9095758 0.8150010 0.6934913
## 0.10 6 0 300 0.9102890 0.8148544 0.6937459
## 0.10 6 1 100 0.8640531 0.8345145 0.6648236
## 0.10 6 1 150 0.8666107 0.8340867 0.6657681
## 0.10 6 1 200 0.8656854 0.8344951 0.6648222
## 0.10 6 1 300 0.8663870 0.8344562 0.6657586
## 0.10 8 0 100 0.9000383 0.8175953 0.6993600
## 0.10 8 0 150 0.9005137 0.8178369 0.6969322
## 0.10 8 0 200 0.9013356 0.8176193 0.6970689
## 0.10 8 0 300 0.9013389 0.8177036 0.6966672
## 0.10 8 1 100 0.8722699 0.8296108 0.6763781
## 0.10 8 1 150 0.8775351 0.8285041 0.6793858
## 0.10 8 1 200 0.8757207 0.8292867 0.6782922
## 0.10 8 1 300 0.8753224 0.8293795 0.6782871
## 0.15 4 0 100 0.8908477 0.8274183 0.6858406
## 0.15 4 0 150 0.8883673 0.8282211 0.6822434
## 0.15 4 0 200 0.8892147 0.8277253 0.6817583
## 0.15 4 0 300 0.8885816 0.8278455 0.6810660
## 0.15 4 1 100 0.8802390 0.8231812 0.6916522
## 0.15 4 1 150 0.8801942 0.8240044 0.6900426
## 0.15 4 1 200 0.8787756 0.8245382 0.6888212
## 0.15 4 1 300 0.8750942 0.8251346 0.6849653
## 0.15 6 0 100 0.9089206 0.8161500 0.6967861
## 0.15 6 0 150 0.9085901 0.8164424 0.6950970
## 0.15 6 0 200 0.9084784 0.8164772 0.6947123
## 0.15 6 0 300 0.9084124 0.8164925 0.6946573
## 0.15 6 1 100 0.8838684 0.8304600 0.6962923
## 0.15 6 1 150 0.8824853 0.8313396 0.6947185
## 0.15 6 1 200 0.8834375 0.8307777 0.6949300
## 0.15 6 1 300 0.8819700 0.8315343 0.6941799
## 0.15 8 0 100 0.9112751 0.8178064 0.6991255
## 0.15 8 0 150 0.9137724 0.8171472 0.6994128
## 0.15 8 0 200 0.9139029 0.8171118 0.6992585
## 0.15 8 0 300 0.9140759 0.8170649 0.6992819
## 0.15 8 1 100 0.8935081 0.8231672 0.6978100
## 0.15 8 1 150 0.8921961 0.8231839 0.6964752
## 0.15 8 1 200 0.8928237 0.8231334 0.6972326
## 0.15 8 1 300 0.8873299 0.8251900 0.6960735
##
## Tuning parameter 'colsample_bytree' was held constant at a value of 0.8
##
## Tuning parameter 'min_child_weight' was held constant at a value of 2
##
## Tuning parameter 'subsample' was held constant at a value of 0.8
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 300, max_depth = 6, eta
## = 0.05, gamma = 1, colsample_bytree = 0.8, min_child_weight = 2 and
## subsample = 0.8.
plot(xgb_optimized)
best_params <- xgb_optimized$bestTune
X_2024 <- df %>%
filter(year == 2024) %>%
select(-c(year, fips, county.x, percent_food_insecure)) %>%
mutate(
rural_urban = as.numeric(factor(rural_urban))
)
predictions_2024 <- predict(xgb_optimized, newdata = X_2024)
results_2024 <- df %>%
filter(year == 2024) %>%
select(fips, county.x, percent_food_insecure) %>%
mutate(
pred_2024_food_insecure = predictions_2024,
predicted_change = pred_2024_food_insecure - percent_food_insecure,
risk_category = case_when(
pred_2024_food_insecure > quantile(predictions_2024, 0.75) ~ "High Risk",
pred_2024_food_insecure > quantile(predictions_2024, 0.5) ~ "Medium Risk",
TRUE ~ "Low Risk"
)
) %>%
arrange(desc(pred_2024_food_insecure))
write.csv(
results_2024,
file = paste0("food_insecurity_predictions_", format(Sys.Date(), "%Y%m%d"), ".csv"),
row.names = FALSE
)
library(ggplot2)
results_2024 %>%
head(10) %>%
ggplot(aes(x = reorder(county.x, -pred_2024_food_insecure), y = pred_2024_food_insecure)) +
geom_col(fill = "firebrick") +
labs(title = "Top 10 High-Risk Counties for 2024 Food Insecurity",
x = "County",
y = "Predicted Food Insecurity Rate") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
* The final values used for the model were nrounds = 300, max_depth = 4,
eta = 0.05, gamma = 0, colsample_bytree = 0.8, min_child_weight = 2 and
subsample = 0.8.
library(caret)
library(Metrics)
test_predictions <- predict(xgb_optimized, newdata = X_test)
actual_values <- y_test
metrics_table <- data.frame(
RMSE = RMSE(test_predictions, actual_values),
MAPE = mape(actual_values, test_predictions) * 100,
MSE = mean((test_predictions - actual_values)^2),
R2 = R2(test_predictions, actual_values)
)
print(metrics_table)
## RMSE MAPE MSE R2
## 1 1.051221 10.54083 1.105066 0.9227965
# SF and Tigris are used to call in geospacial data for mapping purposes
library(sf)
## Warning: package 'sf' was built under R version 4.3.3
## Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
# Get counties in native projection (no transformation)
ny_counties <- counties(state = "NY", cb = TRUE, year = 2021, class = "sf") %>%
select(fips = GEOID, geometry)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======= | 11%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========= | 14%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|============== | 21%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================ | 24%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|========================== | 38%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 39%
|
|============================ | 40%
|
|============================ | 41%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 49%
|
|=================================== | 50%
|
|=================================== | 51%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 53%
|
|===================================== | 54%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 59%
|
|========================================== | 60%
|
|========================================== | 61%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 62%
|
|============================================ | 63%
|
|============================================ | 64%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 69%
|
|================================================= | 70%
|
|================================================= | 71%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|=================================================== | 74%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 79%
|
|======================================================== | 80%
|
|======================================================== | 81%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 86%
|
|============================================================= | 87%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 92%
|
|================================================================= | 92%
|
|================================================================= | 93%
|
|================================================================= | 94%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
# Filtering out the NY Total Row
results_2024_clean <- results_2024 %>%
filter(fips != 36000) %>%
mutate(fips = as.character(fips))
# Merge and plot the predictions with sf and tigris mapping data
ny_counties %>%
left_join(results_2024_clean, by = "fips") %>%
drop_na(pred_2024_food_insecure) %>%
ggplot() +
geom_sf(aes(fill = pred_2024_food_insecure), color = NA) +
scale_fill_viridis_c("Food Insecurity (%)", option = "inferno", direction = -1) +
labs(title = "New York State 2025 Predicted Food Insecurity by County") +
theme_void()
ny_map <- ny_counties %>%
left_join(results_2024_clean, by = "fips") %>%
drop_na(pred_2024_food_insecure) %>%
ggplot() +
geom_sf(aes(fill = pred_2024_food_insecure), color = NA) +
scale_fill_viridis_c(
"Food Insecurity (%)",
option = "inferno",
direction = -1,
limits = c(0, max(results_2024_clean$pred_2024_food_insecure))
) +
labs(
title = "New York State 2024 Predicted Food Insecurity by County",
caption = "Data: County Health Rankings"
) +
theme_void() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
legend.position = "right",
legend.title = element_text(size = 10)
)
ggsave(
filename = "ny_food_insecurity_2025.png",
plot = ny_map,
device = "png",
width = 10,
height = 8,
units = "in",
dpi = 300,
bg = "white"
)
library(caret)
perm_imp <- varImp(
xgb_optimized,
scale = TRUE,
type = "permutation",
numPermutations = 30
)
ggplot(perm_imp, aes(x = reorder(rownames(perm_imp), Overall), y = Overall)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(x = "Features", y = "Importance (Permutation)",
title = "Permutation Importance") +
theme_minimal()
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
perm_imp_plot <- ggplot(perm_imp, aes(x = reorder(rownames(perm_imp), Overall), y = Overall)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(x = "Features", y = "Importance (Permutation)",
title = "Permutation Importance (XGBoost Experiment 3)") +
theme_minimal()
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
ggsave(
filename = "permutation_importance.png",
plot = perm_imp_plot,
device = "png",
width = 8,
height = 6,
units = "in",
dpi = 300,
bg = "white"
)