This analysis explores the relationships between educational outcomes, funding, demographics, and other socioeconomic factors across West Virginia counties. By identifying the key factors influencing student achievement, we aim to develop predictive models that highlight which variables most significantly impact educational outcomes across the state.
library(tidyverse)
library(janitor)
library(readxl)
library(caret)
library(rpart)
library(rpart.plot)
library(cluster)
library(factoextra)
library(neuralnet)
assessment_path <- './wv ed student achievement/Historical_AssessmentResults_SY15-to-SY21.xlsx'
t_assess_raw <- read_excel(path = assessment_path, sheet = 'SY21 School & District', range = 'b2:f7312') %>%
janitor::clean_names()
t_assess_science <- read_excel(path = assessment_path, sheet = 'SY21 School & District', range = 'db3:db7312',
col_names = "science_proficiency", na = '**')
t_assess <- bind_cols(t_assess_raw, t_assess_science) %>%
filter(school == 999,
population_group == "Total Population",
county != "Statewide") %>%
mutate(science_proficiency = as.numeric(science_proficiency)) %>%
group_by(county) %>%
summarize(science_proficiency = mean(science_proficiency, na.rm = TRUE))
t_unemployed <- read_csv("unemployed.csv", skip = 4) %>%
janitor::clean_names() %>%
filter(!str_detect(county, "United States|West Virginia")) %>%
mutate(county = str_replace(county, " County", ""),
value_percent = as.numeric(value_percent)) %>%
select(county, unemployment_rate = value_percent)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 62 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): County, FIPS, Rank within US (of 3143 counties)
## dbl (2): Value (Percent), People (Unemployed)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
t_unemployed <- t_unemployed %>%
rename(county = county)
t_finance <- read_excel("./us census ed spending/elsec22.xls") %>%
janitor::clean_names()
t_finance_wv <- t_finance %>%
filter(fipst == 54) %>%
mutate(county = str_extract(name, "^\\w+")) %>%
select(county, totalrev, totalexp, tcurinst, tcurelsc)
standardize_county <- function(df, col) {
df %>%
mutate(!!sym(col) := str_to_lower(!!sym(col)),
!!sym(col) := str_replace_all(!!sym(col), " county", ""),
!!sym(col) := str_replace_all(!!sym(col), "[^a-z]", "")) # Remove special characters
}
t_assess <- standardize_county(t_assess, "county")
t_unemployed <- standardize_county(t_unemployed, "county")
t_finance_wv <- standardize_county(t_finance_wv, "county")
t_merged <- left_join(t_assess, t_unemployed, by = "county")
t_model_data <- left_join(t_merged, t_finance_wv, by = "county") %>%
drop_na()
summary(t_model_data)
## county science_proficiency unemployment_rate totalrev
## Length:54 Min. :17.03 Min. : 2.600 Min. : 13285
## Class :character 1st Qu.:21.77 1st Qu.: 5.025 1st Qu.: 27280
## Mode :character Median :24.60 Median : 6.250 Median : 50230
## Mean :25.50 Mean : 6.906 Mean : 73189
## 3rd Qu.:29.46 3rd Qu.: 8.375 3rd Qu.: 91392
## Max. :41.80 Max. :14.400 Max. :431207
## totalexp tcurinst tcurelsc
## Min. : 13954 Min. : 7380 Min. : 13014
## 1st Qu.: 26364 1st Qu.: 13432 1st Qu.: 24756
## Median : 49383 Median : 26149 Median : 46706
## Mean : 69982 Mean : 36849 Mean : 63484
## 3rd Qu.: 82272 3rd Qu.: 42984 3rd Qu.: 76314
## Max. :416491 Max. :195090 Max. :323034
ggplot(t_model_data, aes(x = tcurinst, y = science_proficiency)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "Instructional Spending vs. Science Proficiency",
x = "Instructional Spending (Total)",
y = "Science Proficiency (%)")
## `geom_smooth()` using formula = 'y ~ x'
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
t_model_data %>%
select(science_proficiency, unemployment_rate, tcurinst, totalrev) %>%
ggpairs()
t_merged %>%
arrange(desc(science_proficiency)) %>%
slice_head(n = 10) %>%
ggplot(aes(x = reorder(county, science_proficiency), y = science_proficiency)) +
geom_col(fill = "darkgreen") +
coord_flip() +
labs(title = "Top 10 WV Counties by Science Proficiency",
x = "county",
y = "Science Proficiency (%)")
set.seed(123)
kdata <- t_model_data %>%
select(unemployment_rate, tcurinst, science_proficiency) %>%
scale()
kmeans_model <- kmeans(kdata, centers = 3)
t_model_data$cluster <- as.factor(kmeans_model$cluster)
fviz_cluster(kmeans_model, data = kdata,
geom = "point",
ellipse.type = "norm",
main = "County Clusters Based on Key Indicators")
set.seed(123)
train_index <- createDataPartition(t_model_data$science_proficiency, p = 0.8, list = FALSE)
train_data <- t_model_data[train_index, ]
test_data <- t_model_data[-train_index, ]
tree_model <- rpart(science_proficiency ~ unemployment_rate + tcurinst + totalrev,
data = train_data,
method = "anova")
rpart.plot(tree_model)
tree_preds <- predict(tree_model, test_data)
tree_rmse <- sqrt(mean((tree_preds - test_data$science_proficiency)^2))
tree_rmse
## [1] 2.496782
Based on our analysis, we offer the following recommendations:
Our research successfully identified key drivers of educational achievement across West Virginia counties by analyzing assessment performance, financial investments, and demographic factors. This analysis revealed that per-pupil instructional spending and unemployment rates significantly predict science proficiency outcomes. Using K-means clustering, we identified county groupings with similar financial as well as educational profiles, revealing regional patterns and funding inequities. Both linear regression as well as decision tree models showed that higher instructional spending and lower unemployment correlate with improved science outcomes. Counties with comparable spending levels sometimes showed different results, suggesting the influence of unmeasured factors like leadership quality and community resources. Our findings identified underperforming, underfunded counties (Cluster 3) that should be prioritized for investment. The moderate negative correlation between unemployment and proficiency suggests economic development initiatives may indirectly enhance educational outcomes. The data indicates that spending efficiency, not just quantity, drives counties to achieve exceptional results with moderate funding, helping us understand that effective resource allocation may be as crucial as total investment.
I will now integrate WV checkbook spending data to further analyze the relationship between financial investment and educational outcomes.
We used county-level financial transaction records from the WV Checkbook system to estimate total education-related spending per county. The checkbook data was aggregated across all available county CSVs.
library(stringr)
checkbook_files <- list.files(path = "./checkbook_2023/", pattern = "\\.csv$", full.names = TRUE)
process_checkbook <- function(file_path) {
county_name <- str_extract(basename(file_path), "(?<=23-)[A-Za-z]+(?=-\\d+)")
df <- read_csv(file_path, show_col_types = FALSE)
df %>%
mutate(County = str_to_title(county_name),
CK_AMT = as.numeric(CK_AMT)) %>%
summarise(County = first(County),
TotalSpent = sum(CK_AMT, na.rm = TRUE))
}
checkbook_summary <- map_dfr(checkbook_files, process_checkbook)
checkbook_summary <- checkbook_summary %>%
rename(county = County) %>%
standardize_county("county")
t_model_data <- left_join(t_model_data, checkbook_summary, by = "county")
colnames(t_model_data)
## [1] "county" "science_proficiency" "unemployment_rate"
## [4] "totalrev" "totalexp" "tcurinst"
## [7] "tcurelsc" "cluster" "TotalSpent"
library(ggrepel)
t_model_data %>%
filter(!is.na(science_proficiency), !is.na(TotalSpent)) %>%
mutate(label = if_else(science_proficiency > 35 | science_proficiency < 20, county, NA_character_)) %>%
ggplot(aes(x = TotalSpent, y = science_proficiency, label = label)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
geom_text_repel(na.rm = TRUE) +
labs(
title = "Checkbook Spending vs. Science Proficiency",
x = "Total Spending (2023, from Checkbook)",
y = "Science Proficiency (%)"
)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Checkbook Conclusion
The plot shows a positive relationship between total education-related spending and science proficiency. While spending alone doesn’t guarantee performance, higher-spending counties tend to achieve better outcomes.
West Virginia Department of Education. (2023). Assessment Data SY15–SY22. Retrieved from internal class Dropbox link
United States Census Bureau. (2023). Annual Survey of School System Finances (Elsec22.xls).
WV Checkbook. (2023). West Virginia State Auditor’s Office Open Government Transparency Portal
U.S. Bureau of Labor Statistics. (2023). County-level Unemployment Rates
OpenAI. (2025, April). ChatGPT (GPT-4), personal communication. Assisted with R code debugging. https://chat.openai.com/