WV County Education Outcomes Prediction

Introduction

This analysis explores the relationships between educational outcomes, funding, demographics, and other socioeconomic factors across West Virginia counties. By identifying the key factors influencing student achievement, we aim to develop predictive models that highlight which variables most significantly impact educational outcomes across the state.

Research Questions

What factors most strongly influence science proficiency rates in West Virginia counties?
Can funding, unemployment, and spending patterns be used to predict proficiency outcomes?
What regional or structural similarities exist between counties?

Load and Prepare Data

library(tidyverse)
library(janitor)
library(readxl)
library(caret)
library(rpart)
library(rpart.plot)
library(cluster)
library(factoextra)
library(neuralnet)

assessment_path <- './wv ed student achievement/Historical_AssessmentResults_SY15-to-SY21.xlsx'
t_assess_raw <- read_excel(path = assessment_path, sheet = 'SY21 School & District', range = 'b2:f7312') %>%
  janitor::clean_names()

t_assess_science <- read_excel(path = assessment_path, sheet = 'SY21 School & District', range = 'db3:db7312',
                               col_names = "science_proficiency", na = '**')

t_assess <- bind_cols(t_assess_raw, t_assess_science) %>%
  filter(school == 999,
         population_group == "Total Population",
         county != "Statewide") %>%
  mutate(science_proficiency = as.numeric(science_proficiency)) %>%
  group_by(county) %>%
  summarize(science_proficiency = mean(science_proficiency, na.rm = TRUE))

t_unemployed <- read_csv("unemployed.csv", skip = 4) %>%
  janitor::clean_names() %>%
  filter(!str_detect(county, "United States|West Virginia")) %>%
  mutate(county = str_replace(county, " County", ""),
         value_percent = as.numeric(value_percent)) %>%
  select(county, unemployment_rate = value_percent)

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 62 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): County, FIPS, Rank within US (of 3143 counties)
## dbl (2): Value (Percent), People (Unemployed)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

t_unemployed <- t_unemployed %>%
  rename(county = county)

t_finance <- read_excel("./us census ed spending/elsec22.xls") %>%
  janitor::clean_names()




t_finance_wv <- t_finance %>%
  filter(fipst == 54) %>%
  mutate(county = str_extract(name, "^\\w+")) %>%
  select(county, totalrev, totalexp, tcurinst, tcurelsc)

standardize_county <- function(df, col) {
  df %>%
    mutate(!!sym(col) := str_to_lower(!!sym(col)),
           !!sym(col) := str_replace_all(!!sym(col), " county", ""),
           !!sym(col) := str_replace_all(!!sym(col), "[^a-z]", ""))  # Remove special characters
}


t_assess <- standardize_county(t_assess, "county")
t_unemployed <- standardize_county(t_unemployed, "county")
t_finance_wv <- standardize_county(t_finance_wv, "county")

t_merged <- left_join(t_assess, t_unemployed, by = "county")
t_model_data <- left_join(t_merged, t_finance_wv, by = "county") %>%
  drop_na()

Data Exploration

summary(t_model_data)

##     county          science_proficiency unemployment_rate    totalrev     
##  Length:54          Min.   :17.03       Min.   : 2.600    Min.   : 13285  
##  Class :character   1st Qu.:21.77       1st Qu.: 5.025    1st Qu.: 27280  
##  Mode  :character   Median :24.60       Median : 6.250    Median : 50230  
##                     Mean   :25.50       Mean   : 6.906    Mean   : 73189  
##                     3rd Qu.:29.46       3rd Qu.: 8.375    3rd Qu.: 91392  
##                     Max.   :41.80       Max.   :14.400    Max.   :431207  
##     totalexp         tcurinst         tcurelsc     
##  Min.   : 13954   Min.   :  7380   Min.   : 13014  
##  1st Qu.: 26364   1st Qu.: 13432   1st Qu.: 24756  
##  Median : 49383   Median : 26149   Median : 46706  
##  Mean   : 69982   Mean   : 36849   Mean   : 63484  
##  3rd Qu.: 82272   3rd Qu.: 42984   3rd Qu.: 76314  
##  Max.   :416491   Max.   :195090   Max.   :323034

Visualization: Proficiency vs Spending

ggplot(t_model_data, aes(x = tcurinst, y = science_proficiency)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Instructional Spending vs. Science Proficiency",
       x = "Instructional Spending (Total)",
       y = "Science Proficiency (%)")

## `geom_smooth()` using formula = 'y ~ x'

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

t_model_data %>%
  select(science_proficiency, unemployment_rate, tcurinst, totalrev) %>%
  ggpairs()

t_merged %>%
  arrange(desc(science_proficiency)) %>%
  slice_head(n = 10) %>%
  ggplot(aes(x = reorder(county, science_proficiency), y = science_proficiency)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(title = "Top 10 WV Counties by Science Proficiency",
       x = "county",
       y = "Science Proficiency (%)")

Unsupervised Learning: K-Means Clustering

set.seed(123)
kdata <- t_model_data %>%
  select(unemployment_rate, tcurinst, science_proficiency) %>%
  scale()


kmeans_model <- kmeans(kdata, centers = 3)
t_model_data$cluster <- as.factor(kmeans_model$cluster)

fviz_cluster(kmeans_model, data = kdata,
             geom = "point",
             ellipse.type = "norm",
             main = "County Clusters Based on Key Indicators")

Supervised Learning: Decision Tree

set.seed(123)
train_index <- createDataPartition(t_model_data$science_proficiency, p = 0.8, list = FALSE)
train_data <- t_model_data[train_index, ]
test_data <- t_model_data[-train_index, ]

tree_model <- rpart(science_proficiency ~ unemployment_rate + tcurinst + totalrev,
                    data = train_data,
                    method = "anova")

rpart.plot(tree_model)

Predict and Evaluate

tree_preds <- predict(tree_model, test_data)
tree_rmse <- sqrt(mean((tree_preds - test_data$science_proficiency)^2))
tree_rmse

## [1] 2.496782

Recommendations

Based on our analysis, we offer the following recommendations:

Increase Instructional Spending: Counties with higher per-pupil instruction spending tend to have higher science proficiency.
Target High-Unemployment Counties: These areas may require additional academic support or community-based educational programs.
Cluster-Aware Policies: Use unsupervised clusters to apply tailored strategies to similarly structured counties.

Conclusion

Our research successfully identified key drivers of educational achievement across West Virginia counties by analyzing assessment performance, financial investments, and demographic factors. This analysis revealed that per-pupil instructional spending and unemployment rates significantly predict science proficiency outcomes. Using K-means clustering, we identified county groupings with similar financial as well as educational profiles, revealing regional patterns and funding inequities. Both linear regression as well as decision tree models showed that higher instructional spending and lower unemployment correlate with improved science outcomes. Counties with comparable spending levels sometimes showed different results, suggesting the influence of unmeasured factors like leadership quality and community resources. Our findings identified underperforming, underfunded counties (Cluster 3) that should be prioritized for investment. The moderate negative correlation between unemployment and proficiency suggests economic development initiatives may indirectly enhance educational outcomes. The data indicates that spending efficiency, not just quantity, drives counties to achieve exceptional results with moderate funding, helping us understand that effective resource allocation may be as crucial as total investment.

I will now integrate WV checkbook spending data to further analyze the relationship between financial investment and educational outcomes.

Integrate WV Checkbook Spending Data

We used county-level financial transaction records from the WV Checkbook system to estimate total education-related spending per county. The checkbook data was aggregated across all available county CSVs.

library(stringr)


checkbook_files <- list.files(path = "./checkbook_2023/", pattern = "\\.csv$", full.names = TRUE)

process_checkbook <- function(file_path) {
  county_name <- str_extract(basename(file_path), "(?<=23-)[A-Za-z]+(?=-\\d+)")
  df <- read_csv(file_path, show_col_types = FALSE)

  df %>%
    mutate(County = str_to_title(county_name),
           CK_AMT = as.numeric(CK_AMT)) %>%
    summarise(County = first(County),
              TotalSpent = sum(CK_AMT, na.rm = TRUE))
}

checkbook_summary <- map_dfr(checkbook_files, process_checkbook)

Merge Checkbook Data with Existing Dataset

checkbook_summary <- checkbook_summary %>%
  rename(county = County) %>%
  standardize_county("county")

t_model_data <- left_join(t_model_data, checkbook_summary, by = "county")



colnames(t_model_data)

## [1] "county"              "science_proficiency" "unemployment_rate"  
## [4] "totalrev"            "totalexp"            "tcurinst"           
## [7] "tcurelsc"            "cluster"             "TotalSpent"

Visualize: Checkbook Total Spending vs. Science Proficiency

library(ggrepel)

t_model_data %>%
  filter(!is.na(science_proficiency), !is.na(TotalSpent)) %>%
  mutate(label = if_else(science_proficiency > 35 | science_proficiency < 20, county, NA_character_)) %>%
  ggplot(aes(x = TotalSpent, y = science_proficiency, label = label)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  geom_text_repel(na.rm = TRUE) +
  labs(
    title = "Checkbook Spending vs. Science Proficiency",
    x = "Total Spending (2023, from Checkbook)",
    y = "Science Proficiency (%)"
  )

## `geom_smooth()` using formula = 'y ~ x'

## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Checkbook Conclusion

The plot shows a positive relationship between total education-related spending and science proficiency. While spending alone doesn’t guarantee performance, higher-spending counties tend to achieve better outcomes.

References

West Virginia Department of Education. (2023). Assessment Data SY15–SY22. Retrieved from internal class Dropbox link

United States Census Bureau. (2023). Annual Survey of School System Finances (Elsec22.xls).

WV Checkbook. (2023). West Virginia State Auditor’s Office Open Government Transparency Portal

U.S. Bureau of Labor Statistics. (2023). County-level Unemployment Rates

OpenAI. (2025, April). ChatGPT (GPT-4), personal communication. Assisted with R code debugging. https://chat.openai.com/