WV County Education Outcomes Prediction

##Introduction This analysis explores the relationships between educational outcomes, funding, demographics, and other socioeconomic factors across West Virginia counties. We aim to identify key drivers of educational performance and develop predictive models to help understand what factors most influence student achievement

##Key Research Questions

Load assessment data

**need to fix error still

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(rpart)
library(readxl)
library(kableExtra)

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(rpart.plot)

assessment_path <- './wv ed student achievement/Historical_AssessmentResults_SY15-to-SY21.xlsx'


t_assess_raw_school <- read_excel(path = assessment_path,
                           sheet = 'SY21 School & District',
                           range = 'b2:f7312')


t_assess_raw_science <- read_excel(path = assessment_path,
                           sheet = 'SY21 School & District',
                           range = 'db3:db7312', 
                           col_names = c('science_proficiency'),
                           na = '**')

t_assess_raw <- t_assess_raw_school %>%
  bind_cols(t_assess_raw_science) %>% 
  janitor::clean_names()  


# Remove subgroups
t_assess <- t_assess_raw %>% 
  filter(school == 999) %>% 
  filter(population_group == 'Total Population') %>% 
  filter(county != 'Statewide') %>% 
  mutate(proficiency = science_proficiency)  

print(t_assess)

## # A tibble: 55 × 7
##    county    school school_name    population_group subgroup science_proficiency
##    <chr>     <chr>  <chr>          <chr>            <chr>                  <dbl>
##  1 Barbour   999    Barbour Count… Total Population Total                   26.0
##  2 Berkeley  999    Berkeley Coun… Total Population Total                   28.6
##  3 Boone     999    Boone County … Total Population Total                   19.6
##  4 Braxton   999    Braxton Count… Total Population Total                   22.6
##  5 Brooke    999    Brooke County… Total Population Total                   21.1
##  6 Cabell    999    Cabell County… Total Population Total                   30.8
##  7 Calhoun   999    Calhoun Count… Total Population Total                   27.8
##  8 Clay      999    Clay County T… Total Population Total                   23.3
##  9 Doddridge 999    Doddridge Cou… Total Population Total                   31.3
## 10 Fayette   999    Fayette Count… Total Population Total                   17.4
## # ℹ 45 more rows
## # ℹ 1 more variable: proficiency <dbl>

assessment_path_2022 <- './wv ed student achievement/SY22_AssessmentProficiency_Public_All_Final.xlsx'

tryCatch({
  t_assess_2022_raw <- read_excel(path = assessment_path_2022) %>%
    janitor::clean_names()
})

## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
## • `` -> `...16`
## • `` -> `...18`
## • `` -> `...19`
## • `` -> `...20`
## • `` -> `...21`
## • `` -> `...23`
## • `` -> `...24`
## • `` -> `...25`
## • `` -> `...26`
## • `` -> `...28`
## • `` -> `...29`
## • `` -> `...30`
## • `` -> `...31`
## • `` -> `...33`
## • `` -> `...34`
## • `` -> `...35`
## • `` -> `...36`
## • `` -> `...38`
## • `` -> `...39`
## • `` -> `...40`
## • `` -> `...41`
## • `` -> `...43`
## • `` -> `...44`
## • `` -> `...45`
## • `` -> `...46`
## • `` -> `...48`
## • `` -> `...49`
## • `` -> `...50`
## • `` -> `...51`
## • `` -> `...53`
## • `` -> `...54`
## • `` -> `...55`
## • `` -> `...56`
## • `` -> `...58`
## • `` -> `...59`
## • `` -> `...60`
## • `` -> `...61`
## • `` -> `...63`
## • `` -> `...64`
## • `` -> `...65`
## • `` -> `...66`
## • `` -> `...68`
## • `` -> `...69`
## • `` -> `...70`
## • `` -> `...71`
## • `` -> `...73`
## • `` -> `...74`
## • `` -> `...75`
## • `` -> `...76`
## • `` -> `...78`
## • `` -> `...79`
## • `` -> `...80`
## • `` -> `...81`
## • `` -> `...83`
## • `` -> `...84`
## • `` -> `...85`
## • `` -> `...86`
## • `` -> `...88`
## • `` -> `...89`
## • `` -> `...90`
## • `` -> `...91`
## • `` -> `...93`
## • `` -> `...94`
## • `` -> `...95`
## • `` -> `...96`
## • `` -> `...98`
## • `` -> `...99`
## • `` -> `...100`
## • `` -> `...101`
## • `` -> `...103`
## • `` -> `...104`
## • `` -> `...105`
## • `` -> `...106`

## t_assess_2022 <- t_assess_2022_raw %>%
   ## filter(grepl("County", district)) %>%  
   ## mutate(county = gsub(" County Schools", "", district)) %>%
   ## select(county, contains("proficiency")) %>%
    ## arrange(county)
## print(t_assess_2022)

Load spending data

spending_path <- './us census ed spending/elsec22t.xls'

t_spending_raw <- read_excel(path = spending_path,,
                           sheet = 'elsec22t',
                           range = 'a1:gb14106') %>% 
  janitor::clean_names()

## New names:
## • `` -> `...73`
## • `` -> `...74`
## • `` -> `...75`
## • `` -> `...76`
## • `` -> `...77`
## • `` -> `...78`
## • `` -> `...79`
## • `` -> `...80`
## • `` -> `...81`
## • `` -> `...82`
## • `` -> `...83`
## • `` -> `...84`
## • `` -> `...85`
## • `` -> `...86`
## • `` -> `...87`
## • `` -> `...88`
## • `` -> `...89`
## • `` -> `...90`
## • `` -> `...91`
## • `` -> `...92`
## • `` -> `...93`
## • `` -> `...94`
## • `` -> `...95`
## • `` -> `...96`
## • `` -> `...97`
## • `` -> `...98`
## • `` -> `...99`
## • `` -> `...100`
## • `` -> `...101`
## • `` -> `...102`
## • `` -> `...103`
## • `` -> `...104`
## • `` -> `...105`
## • `` -> `...106`
## • `` -> `...107`
## • `` -> `...108`
## • `` -> `...109`
## • `` -> `...110`
## • `` -> `...111`
## • `` -> `...112`
## • `` -> `...113`
## • `` -> `...114`
## • `` -> `...115`
## • `` -> `...116`
## • `` -> `...117`
## • `` -> `...118`
## • `` -> `...119`
## • `` -> `...120`
## • `` -> `...121`
## • `` -> `...122`
## • `` -> `...123`
## • `` -> `...124`
## • `` -> `...125`
## • `` -> `...126`
## • `` -> `...127`
## • `` -> `...128`
## • `` -> `...129`
## • `` -> `...130`
## • `` -> `...131`
## • `` -> `...132`
## • `` -> `...133`
## • `` -> `...134`
## • `` -> `...135`
## • `` -> `...136`
## • `` -> `...137`
## • `` -> `...138`
## • `` -> `...139`
## • `` -> `...140`
## • `` -> `...141`
## • `` -> `...142`
## • `` -> `...143`
## • `` -> `...144`
## • `` -> `...145`
## • `` -> `...146`
## • `` -> `...147`
## • `` -> `...148`
## • `` -> `...149`
## • `` -> `...150`
## • `` -> `...151`
## • `` -> `...152`
## • `` -> `...153`
## • `` -> `...154`
## • `` -> `...155`
## • `` -> `...156`
## • `` -> `...157`
## • `` -> `...158`
## • `` -> `...159`
## • `` -> `...160`
## • `` -> `...161`
## • `` -> `...162`
## • `` -> `...163`
## • `` -> `...164`
## • `` -> `...165`
## • `` -> `...166`
## • `` -> `...167`
## • `` -> `...168`
## • `` -> `...169`
## • `` -> `...170`
## • `` -> `...171`
## • `` -> `...172`
## • `` -> `...173`
## • `` -> `...174`
## • `` -> `...175`
## • `` -> `...176`
## • `` -> `...177`
## • `` -> `...178`
## • `` -> `...179`
## • `` -> `...180`
## • `` -> `...181`
## • `` -> `...182`
## • `` -> `...183`
## • `` -> `...184`

cooperates <- c('MOUNTAIN STATE EDUCATIONAL SERVICES COOPERATIVE',
                'EASTERN PANHANDLE INSTRUCTIONAL COOPERATIVE',
                'SOUTHERN EDUCATIONAL SERVICES COOPERATIVE')

t_spending <- t_spending_raw %>% 
  filter(state == 49) %>% 
  filter(!name %in% cooperates) %>% 
  select(name, enroll, tfedrev, tstrev, tlocrev, totalexp, ppcstot) %>% 
  mutate(county = str_to_title(str_split_i(name, ' ',1)),
         county = ifelse(county == 'Mc', 'McDowell', county))


print(t_spending)

## # A tibble: 55 × 8
##    name                  enroll tfedrev tstrev tlocrev totalexp ppcstot county  
##    <chr>                  <dbl>   <dbl>  <dbl>   <dbl>    <dbl>   <dbl> <chr>   
##  1 BARBOUR CO SCH DIST     2144    7559  16584    5872    28021   11885 Barbour 
##  2 BERKELEY CO SCH DIST   19722   48407 140127   86699   264253   12704 Berkeley
##  3 BOONE CO SCH DIST       3177    8194  26858   14564    48642   14663 Boone   
##  4 BRAXTON CO SCH DIST     1747    5479  12748    6404    24417   13153 Braxton 
##  5 BROOKE CO SCH DIST      2582    6791  17114   21352    41908   15642 Brooke  
##  6 CABELL CO SCH DIST     11667   42518  88337   66699   183621   14538 Cabell  
##  7 CALHOUN CO SCH DIST      861    3254   9953    3190    15154   16085 Calhoun 
##  8 CLAY CO SCH DIST        1669    6157  17655    2791    25963   13825 Clay    
##  9 DODDRIDGE CO SCH DIST   1082    3455   3999   31752    38493   23563 Doddrid…
## 10 FAYETTE CO SCH DIST     5594   15293  51759   23477    83373   13777 Fayette 
## # ℹ 45 more rows

Load demographic data

Synthetic Data as of now, going to pull and use real data sources for Educational attainment data and Income and poverty data

t_demographics_unemployed <- read_csv('unemployed.csv', 
                            skip = 4,
                            na = 'N/A') %>%
  janitor::clean_names() %>% 
  filter(county != 'West Virginia',
         county != 'United States',
         !is.na(value_percent) ) %>% 
  select(county, value_percent) %>%
  rename(unemployed = value_percent)

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 62 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): County, FIPS, Rank within US (of 3143 counties)
## dbl (2): Value (Percent), People (Unemployed)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

print(t_demographics_unemployed)

## # A tibble: 55 × 2
##    county          unemployed
##    <chr>                <dbl>
##  1 McDowell County       15.1
##  2 Braxton County        14.4
##  3 Logan County          13.3
##  4 Calhoun County        12.2
##  5 Roane County          11.7
##  6 Clay County           11.2
##  7 Mingo County          11.2
##  8 Webster County        11.1
##  9 Monroe County         10.6
## 10 Barbour County        10.1
## # ℹ 45 more rows

wv_counties <- unique(t_assess$county)

t_demographics_education <- tibble(
  county = wv_counties,
  bachelor_or_higher = runif(length(wv_counties), 10, 40), 
  high_school_grad = runif(length(wv_counties), 70, 95)    
)
t_demographics_income <- tibble(
  county = wv_counties,
  median_household_income = runif(length(wv_counties), 35000, 65000),
  poverty_rate = runif(length(wv_counties), 10, 30)
)

t_demographics <-  t_demographics_unemployed %>%
  left_join(t_demographics_education, by = "county") %>%
  left_join(t_demographics_income, by = "county")

print(t_demographics)

## # A tibble: 55 × 6
##    county  unemployed bachelor_or_higher high_school_grad median_household_inc…¹
##    <chr>        <dbl>              <dbl>            <dbl>                  <dbl>
##  1 McDowe…       15.1                 NA               NA                     NA
##  2 Braxto…       14.4                 NA               NA                     NA
##  3 Logan …       13.3                 NA               NA                     NA
##  4 Calhou…       12.2                 NA               NA                     NA
##  5 Roane …       11.7                 NA               NA                     NA
##  6 Clay C…       11.2                 NA               NA                     NA
##  7 Mingo …       11.2                 NA               NA                     NA
##  8 Webste…       11.1                 NA               NA                     NA
##  9 Monroe…       10.6                 NA               NA                     NA
## 10 Barbou…       10.1                 NA               NA                     NA
## # ℹ 45 more rows
## # ℹ abbreviated name: ¹median_household_income
## # ℹ 1 more variable: poverty_rate <dbl>

Joined data

Checkbook data

checkbook_files <- list.files(path = "./checkbook_2023/", 
                             pattern = "*.csv", 
                             full.names = TRUE)
process_checkbook <- function(file_path) {
  county_name <- str_extract(basename(file_path), "(?<=23)[A-Za-z]+(?=\\d+)")
  
 tryCatch({
    # Read the file
    df <- read_csv(file_path, show_col_types = FALSE) %>%
      janitor::clean_names()
    

    if("county" %in% colnames(df)) {
      if(is.character(df$county)) {
        df$county <- as.numeric(df$county)
      }
    }
    

    df <- df %>% mutate(county_name = county_name)
    

    return(df)
  })
}


sample_counties <- c("./checkbook_2023/23-Barbour-02_Updated.csv",
                    "./checkbook_2023/23-Grant-24_Updated.csv",
                    "./checkbook_2023/23-Jackson-35_Updated.csv")
checkbook_data_samples <- map_dfr(sample_counties, process_checkbook)

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

education_spending <- checkbook_data_samples %>%
  mutate(ck_amt = as.numeric(ck_amt)) %>%
  
filter(prog %in% c(1, 2, 3, 4) | 
         grepl("EDUCATION|SCHOOL|TEACH|INSTRUCT|CURRICULUM", toupper(invdesc))) %>%
  group_by(county_name) %>%
  summarize(
    total_education_spending = sum(ck_amt, na.rm = TRUE),
    instructional_spending = sum(ck_amt[grepl("INSTRUCT|TEACH|CLASSROOM", toupper(invdesc))], na.rm = TRUE),
    technology_spending = sum(ck_amt[grepl("COMPUTER|TECHNOLOGY|SOFTWARE", toupper(invdesc))], na.rm = TRUE),
    facility_spending = sum(ck_amt[grepl("FACILITY|BUILDING|MAINTENANCE", toupper(invdesc))], na.rm = TRUE),
    .groups = "drop"
  )
print(education_spending)

## # A tibble: 1 × 5
##   county_name total_education_spend…¹ instructional_spending technology_spending
##   <chr>                         <dbl>                  <dbl>               <dbl>
## 1 <NA>                       2271485.                 81529.               6442.
## # ℹ abbreviated name: ¹total_education_spending
## # ℹ 1 more variable: facility_spending <dbl>

standardize_county_name <- function(county) {
  county <- str_to_title(county)  
  county <- str_trim(county)      
  county <- str_replace(county, "Mcdowell", "McDowell")  
  county <- str_replace(county, " County", "")  
  return(county)
}

if (exists("t_assess")) {
  t_assess <- t_assess %>% 
    mutate(county = standardize_county_name(county))
}

if (exists("t_assess_2022")) {
  t_assess_2022 <- t_assess_2022 %>% 
    mutate(county = standardize_county_name(county))
}

if (exists("t_spending")) {
  t_spending <- t_spending %>% 
    mutate(county = standardize_county_name(county))
}

if (exists("t_demographics")) {
  t_demographics <- t_demographics %>% 
    mutate(county = standardize_county_name(county))
}

if (exists("education_spending")) {
  education_spending <- education_spending %>% 
    mutate(county = standardize_county_name(county_name)) %>%
    select(-county_name)
}


integrated_data <- t_assess %>%
  left_join(t_spending, by = "county") %>%
  left_join(t_demographics, by = "county")
if (exists("education_spending")) {
  integrated_data <- integrated_data %>%
    left_join(education_spending, by = "county")
}
if (exists("t_assess_2022") && ncol(t_assess_2022) > 1) {
  integrated_data <- integrated_data %>%
    left_join(t_assess_2022, by = "county", suffix = c("_2021", "_2022"))
}
print(integrated_data)

## # A tibble: 55 × 23
##    county    school school_name    population_group subgroup science_proficiency
##    <chr>     <chr>  <chr>          <chr>            <chr>                  <dbl>
##  1 Barbour   999    Barbour Count… Total Population Total                   26.0
##  2 Berkeley  999    Berkeley Coun… Total Population Total                   28.6
##  3 Boone     999    Boone County … Total Population Total                   19.6
##  4 Braxton   999    Braxton Count… Total Population Total                   22.6
##  5 Brooke    999    Brooke County… Total Population Total                   21.1
##  6 Cabell    999    Cabell County… Total Population Total                   30.8
##  7 Calhoun   999    Calhoun Count… Total Population Total                   27.8
##  8 Clay      999    Clay County T… Total Population Total                   23.3
##  9 Doddridge 999    Doddridge Cou… Total Population Total                   31.3
## 10 Fayette   999    Fayette Count… Total Population Total                   17.4
## # ℹ 45 more rows
## # ℹ 17 more variables: proficiency <dbl>, name <chr>, enroll <dbl>,
## #   tfedrev <dbl>, tstrev <dbl>, tlocrev <dbl>, totalexp <dbl>, ppcstot <dbl>,
## #   unemployed <dbl>, bachelor_or_higher <dbl>, high_school_grad <dbl>,
## #   median_household_income <dbl>, poverty_rate <dbl>,
## #   total_education_spending <dbl>, instructional_spending <dbl>,
## #   technology_spending <dbl>, facility_spending <dbl>

# Merge data
t <- t_assess

Summary Stats

summary_stats <- integrated_data %>%
  select(where(is.numeric)) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  group_by(variable) %>%
  summarize(
    n = sum(!is.na(value)),
    min = min(value, na.rm = TRUE),
    q1 = quantile(value, 0.25, na.rm = TRUE),
    median = median(value, na.rm = TRUE),
    mean = mean(value, na.rm = TRUE),
    q3 = quantile(value, 0.75, na.rm = TRUE),
    max = max(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE)
  )

## Warning: There were 16 warnings in `summarize()`.
## The first warning was:
## ℹ In argument: `min = min(value, na.rm = TRUE)`.
## ℹ In group 1: `variable = "bachelor_or_higher"`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 15 remaining warnings.

kable(summary_stats, caption = "Summary Statistics of Numeric Variables") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)

Summary Statistics of Numeric Variables
variable	n	min	q1	median	mean	q3	max	sd
bachelor_or_higher	0	Inf	NA	NA	NaN	NA	-Inf	NA
enroll	55	800.00	1654.00	3177.00	4585.581818	5104.000	24392.0	4615.970845
facility_spending	0	Inf	NA	NA	NaN	NA	-Inf	NA
high_school_grad	0	Inf	NA	NA	NaN	NA	-Inf	NA
instructional_spending	0	Inf	NA	NA	NaN	NA	-Inf	NA
median_household_income	0	Inf	NA	NA	NaN	NA	-Inf	NA
poverty_rate	0	Inf	NA	NA	NaN	NA	-Inf	NA
ppcstot	55	11885.00	13151.00	13777.00	14466.054545	15235.500	23563.0	2188.579900
proficiency	55	14.81	21.45	24.36	25.304727	29.395	41.8	5.817084
science_proficiency	55	14.81	21.45	24.36	25.304727	29.395	41.8	5.817084
technology_spending	0	Inf	NA	NA	NaN	NA	-Inf	NA
tfedrev	55	1511.00	4991.00	10158.00	13311.909091	14517.500	109522.0	16225.080652
tlocrev	55	1956.00	8194.50	14813.00	25031.909091	33333.000	145623.0	26739.034924
total_education_spending	0	Inf	NA	NA	NaN	NA	-Inf	NA
totalexp	55	13954.00	26486.00	48642.00	69482.090909	81171.500	416491.0	70702.975648
tstrev	55	3895.00	12668.50	26858.00	34234.363636	39496.000	176062.0	33479.121896
unemployed	55	2.60	5.05	6.40	7.054545	8.500	15.1	2.918194

ggplot(integrated_data, aes(x = proficiency)) +
  geom_histogram(binwidth = 5, fill = "steelblue", color = "white") +
  labs(title = "Distribution of Science Proficiency Across WV Counties",
       x = "Science Proficiency (%)",
       y = "Count") +
  theme_minimal()

top_counties <- integrated_data %>%
  arrange(desc(proficiency)) %>%
  select(county, proficiency) %>%
  head(5)

bottom_counties <- integrated_data %>%
  arrange(proficiency) %>%
  select(county, proficiency) %>%
  head(5)

kable(top_counties, caption = "Top 5 Counties by Science Proficiency") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)

Top 5 Counties by Science Proficiency
county	proficiency
Monongalia	41.80
Jefferson	38.71
Ohio	35.75
Wood	35.28
Putnam	34.38

kable(bottom_counties, caption = "Bottom 5 Counties by Science Proficiency") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)

Bottom 5 Counties by Science Proficiency
county	proficiency
McDowell	14.81
Lewis	17.03
Wyoming	17.36
Fayette	17.42
Upshur	18.48

library(ggcorrplot)

Regional Analysis

eastern_panhandle <- c("Berkeley", "Jefferson", "Morgan", "Hampshire", "Mineral", "Hardy", "Grant", "Pendleton")
northern_panhandle <- c("Hancock", "Brooke", "Ohio", "Marshall", "Wetzel", "Tyler")
metro_valley <- c("Kanawha", "Putnam", "Cabell", "Wayne", "Lincoln", "Boone")
southern_counties <- c("Mingo", "Logan", "Wyoming", "McDowell", "Mercer", "Raleigh")
central_counties <- c("Braxton", "Clay", "Nicholas", "Webster", "Pocahontas", "Greenbrier", "Monroe", "Summers")
north_central <- c("Monongalia", "Marion", "Harrison", "Taylor", "Preston", "Barbour", "Tucker", "Randolph", "Upshur", "Lewis")
ohio_valley <- c("Wood", "Pleasants", "Tyler", "Doddridge", "Ritchie", "Wirt", "Jackson", "Mason", "Roane", "Calhoun", "Gilmer")

integrated_data <- integrated_data %>%
  mutate(region = case_when(
    county %in% eastern_panhandle ~ "Eastern Panhandle",
    county %in% northern_panhandle ~ "Northern Panhandle",
    county %in% metro_valley ~ "Metro Valley",
    county %in% southern_counties ~ "Southern Counties",
    county %in% central_counties ~ "Central Counties",
    county %in% north_central ~ "North Central",
    county %in% ohio_valley ~ "Ohio Valley",
    TRUE ~ "Other"
  ))

ggplot(integrated_data, aes(x = region, y = proficiency, fill = region)) +
  geom_boxplot() +
  labs(title = "Science Proficiency by Region",
       x = "Region",
       y = "Science Proficiency (%)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

region_stats <- integrated_data %>%
  group_by(region) %>%
  summarize(
    mean_proficiency = mean(proficiency, na.rm = TRUE),
    median_proficiency = median(proficiency, na.rm = TRUE),
    sd_proficiency = sd(proficiency, na.rm = TRUE),
    count = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_proficiency))

kable(region_stats, caption = "Science Proficiency by Region") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)

Science Proficiency by Region
region	mean_proficiency	median_proficiency	sd_proficiency	count
Eastern Panhandle	27.79125	26.730	5.050399	8
North Central	26.60900	26.075	7.262833	10
Ohio Valley	26.42000	25.745	5.128446	10
Northern Panhandle	25.60333	24.650	5.429636	6
Metro Valley	25.35500	24.400	6.426949	6
Central Counties	24.62750	22.965	5.052512	8
Southern Counties	19.82500	18.790	3.979727	6
Other	17.42000	17.420	NA	1

Correlation Analysis

correlation_vars <- c("proficiency", "ppcstot", "unemployed", "bachelor_or_higher", 
                     "high_school_grad", "median_household_income", "poverty_rate")


cat("Available data counts:\n")

## Available data counts:

for(var in correlation_vars) {
  if(var %in% names(integrated_data)) {
    cat(var, ": ", sum(!is.na(integrated_data[[var]])), " non-NA values\n", sep="")
  } else {
    cat(var, ": not in dataset\n", sep="")
  }
}

## proficiency: 55 non-NA values
## ppcstot: 55 non-NA values
## unemployed: 55 non-NA values
## bachelor_or_higher: 0 non-NA values
## high_school_grad: 0 non-NA values
## median_household_income: 0 non-NA values
## poverty_rate: 0 non-NA values

available_vars <- correlation_vars[correlation_vars %in% names(integrated_data)]


correlation_data <- integrated_data %>%
  select(all_of(intersect(available_vars, names(integrated_data)))) %>%
  
  select_if(~ !all(is.na(.))) %>%
 
  na.omit()


if(nrow(correlation_data) >= 3 && ncol(correlation_data) >= 2) {
 
  no_var_cols <- correlation_data %>%
    summarise_all(~ var(., na.rm = TRUE) == 0) %>%
    pivot_longer(everything(), names_to = "variable", values_to = "no_variance") %>%
    filter(no_variance) %>%
    pull(variable)
  
  if(length(no_var_cols) > 0) {
    cat("Removing variables with no variance:", paste(no_var_cols, collapse=", "), "\n")
    correlation_data <- correlation_data %>% select(-all_of(no_var_cols))
  }
  
  
  correlation_matrix <- cor(correlation_data, use = "pairwise.complete.obs")
  

  if(any(is.na(correlation_matrix)) || any(is.infinite(as.matrix(correlation_matrix)))) {
    cat("Warning: Correlation matrix contains NA or Inf values.\n")
    
    corrplot::corrplot(correlation_matrix, 
                       method = "color", 
                       type = "lower", 
                       addCoef.col = "black",
                       tl.col = "black", 
                       title = "Correlation Between Educational Outcomes and Various Factors")
  } else {
    
    if(ncol(correlation_data) < 3) {
      ggcorrplot(
        correlation_matrix,
        hc.order = FALSE,  # Disable hierarchical clustering with few variables
        type = "lower",
        lab = TRUE,
        lab_size = 3,
        method = "circle",
        colors = c("#6D9EC1", "white", "#E46726"),
        title = "Correlation Between Educational Outcomes and Various Factors",
        ggtheme = theme_minimal
      )
    } else {
    
      tryCatch({
        ggcorrplot(
          correlation_matrix,
          hc.order = TRUE,
          type = "lower",
          lab = TRUE,
          lab_size = 3,
          method = "circle",
          colors = c("#6D9EC1", "white", "#E46726"),
          title = "Correlation Between Educational Outcomes and Various Factors",
          ggtheme = theme_minimal
        )
      }, error = function(e) {
       
        cat("Falling back to plot without hierarchical clustering due to error:", e$message, "\n")
        ggcorrplot(
          correlation_matrix,
          hc.order = FALSE,
          type = "lower",
          lab = TRUE,
          lab_size = 3,
          method = "circle",
          colors = c("#6D9EC1", "white", "#E46726"),
          title = "Correlation Between Educational Outcomes and Various Factors",
          ggtheme = theme_minimal
        )
      })
    }
  }
  
 
  if("proficiency" %in% colnames(correlation_matrix)) {
    proficiency_correlations <- correlation_matrix[, "proficiency"]
    proficiency_correlations <- data.frame(
      variable = names(proficiency_correlations),
      correlation = proficiency_correlations
    ) %>%
      arrange(desc(abs(correlation)))
    

    kable(proficiency_correlations, caption = "Correlations with Science Proficiency") %>%
      kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
  } else {
    cat("Proficiency variable not available for correlation analysis")
  }
} else {
  cat("Not enough complete data for correlation analysis.\n")
  cat("Number of rows in correlation data: ", nrow(correlation_data), "\n")
  cat("Number of columns in correlation data: ", ncol(correlation_data), "\n")
  cat("Please ensure you have more complete data or adjust your variable selection.")
}

Correlations with Science Proficiency
	variable	correlation
proficiency	proficiency	1.0000000
unemployed	unemployed	-0.3233399
ppcstot	ppcstot	0.0935548

Spending Analysis

ggplot(integrated_data, aes(x = ppcstot, y = proficiency)) +
  geom_point(aes(color = region), size = 3, alpha = 0.7) +
  geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
  labs(title = "Relationship Between Per-Pupil Spending and Science Proficiency",
       x = "Per-Pupil Spending ($)",
       y = "Science Proficiency (%)") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

spending_by_region <- integrated_data %>%
  group_by(region) %>%
  summarize(
    mean_spending = mean(ppcstot, na.rm = TRUE),
    mean_proficiency = mean(proficiency, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_spending))
kable(spending_by_region, caption = "Education Spending and Proficiency by Region") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)

Education Spending and Proficiency by Region
region	mean_spending	mean_proficiency
Northern Panhandle	17159.67	25.60333
Ohio Valley	15695.70	26.42000
Central Counties	14275.75	24.62750
Metro Valley	13918.17	25.35500
Other	13777.00	17.42000
Eastern Panhandle	13744.00	27.79125
Southern Counties	13516.17	19.82500
North Central	13317.70	26.60900

Bar Chart and Scatter Plot of proficiency by county

ggplot(integrated_data, aes(x = reorder(county, proficiency), y = proficiency)) +
  geom_col(fill = "steelblue") +
  coord_flip() +  # Flip coordinates for better county name display
  labs(title = "Science Proficiency by County in West Virginia",
       x = "County",
       y = "Science Proficiency (%)") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 8))  # Adjust text size as needed

if("ppcstot" %in% names(integrated_data)) {
  ggplot(integrated_data, aes(x = reorder(county, ppcstot), y = ppcstot)) +
    geom_col(fill = "darkgreen") +
    coord_flip() +
    labs(title = "Per-Pupil Spending by County in West Virginia",
         x = "County",
         y = "Per-Pupil Spending ($)") +
    theme_minimal() +
    theme(axis.text.y = element_text(size = 8))
}

if("ppcstot" %in% names(integrated_data)) {
  ggplot(integrated_data, aes(x = ppcstot, y = proficiency)) +
    geom_point(color = "blue", size = 3, alpha = 0.7) +
    # Instead of geom_text_repel, use simpler text labeling
    geom_text(aes(label = county), hjust = -0.1, vjust = 0.1, size = 2.5) +
    geom_smooth(method = "lm", se = TRUE, color = "red") +  # Add trend line
    labs(title = "Science Proficiency vs. Per-Pupil Spending",
         x = "Per-Pupil Spending ($)",
         y = "Science Proficiency (%)") +
    theme_minimal()
}

## `geom_smooth()` using formula = 'y ~ x'

## Principal Component Analysis

pca_data <- integrated_data %>%
  select(proficiency, ppcstot, unemployed) %>%
  drop_na()


print(dim(pca_data))

## [1] 55  3

if(nrow(pca_data) > 0 && ncol(pca_data) > 1) {
 
  pca_data_scaled <- scale(pca_data)
  

  pca_result <- prcomp(pca_data_scaled)
  

  print(summary(pca_result))
  

  biplot(pca_result, scale = 0)
}

## Importance of components:
##                           PC1    PC2    PC3
## Standard deviation     1.1794 0.9672 0.8207
## Proportion of Variance 0.4637 0.3118 0.2245
## Cumulative Proportion  0.4637 0.7755 1.0000

## K-means Clustering

set.seed(123)
km <- kmeans(pca_data, centers = 3)
integrated_data$cluster <- km$cluster[match(rownames(integrated_data), rownames(pca_data))]


ggplot(integrated_data, aes(x = ppcstot, y = proficiency, color = factor(cluster))) +
  geom_point(size = 3) +
  labs(title = "K-means Clustering of WV Counties",
       x = "Per-Pupil Spending ($)",
       y = "Science Proficiency (%)",
       color = "Cluster") +
  theme_minimal()

Prepare Data for Modeling

model_data <- integrated_data %>%
  select(proficiency, ppcstot, unemployed) %>%
  drop_na()

dim(model_data)

## [1] 55  3

set.seed(123)
train_index <- createDataPartition(model_data$proficiency, p = 0.8, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]


ctrl <- trainControl(method = "cv", number = 5)


lm_model <- train(
  proficiency ~ .,
  data = train_data,
  method = "lm",
  trControl = ctrl
)


summary(lm_model$finalModel)

## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.0840 -4.4404 -0.2178  3.4038 16.1935 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 23.4887305  6.2262988   3.773 0.000479 ***
## ppcstot      0.0003147  0.0003796   0.829 0.411622    
## unemployed  -0.3876326  0.3104507  -1.249 0.218410    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.733 on 44 degrees of freedom
## Multiple R-squared:  0.05533,    Adjusted R-squared:  0.01239 
## F-statistic: 1.289 on 2 and 44 DF,  p-value: 0.2859

lm_predictions <- predict(lm_model, newdata = test_data)


lm_rmse <- RMSE(lm_predictions, test_data$proficiency)
lm_r2 <- R2(lm_predictions, test_data$proficiency)

cat("Linear Regression Model:\n")

## Linear Regression Model:

cat("RMSE:", lm_rmse, "\n")

## RMSE: 5.211434

cat("R-squared:", lm_r2, "\n")

## R-squared: 0.4000578

Decision Tree Model

dt_model <- train(
  proficiency ~ .,
  data = train_data,
  method = "rpart",
  trControl = ctrl,
  tuneLength = 10
)

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.

dt_model

## CART 
## 
## 47 samples
##  2 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 37, 38, 38, 39, 36 
## Resampling results across tuning parameters:
## 
##   cp          RMSE      Rsquared    MAE     
##   0.00000000  5.699724  0.13584683  4.564666
##   0.01523184  5.699724  0.13584683  4.564666
##   0.03046368  5.699724  0.13584683  4.564666
##   0.04569552  5.699724  0.13584683  4.564666
##   0.06092736  5.699724  0.13584683  4.564666
##   0.07615920  5.889352  0.07521389  4.763857
##   0.09139104  5.889352  0.07521389  4.763857
##   0.10662288  5.889352  0.07521389  4.763857
##   0.12185472  5.819587  0.07521389  4.786079
##   0.13708656  5.926893  0.03330881  4.901399
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.06092736.

rpart.plot(dt_model$finalModel, 
           box.palette = "Blues",
           main = "Decision Tree for Science Proficiency")

dt_predictions <- predict(dt_model, newdata = test_data)

dt_rmse <- RMSE(dt_predictions, test_data$proficiency)
dt_r2 <- R2(dt_predictions, test_data$proficiency)

cat("Decision Tree Model:\n")

## Decision Tree Model:

cat("RMSE:", dt_rmse, "\n")

## RMSE: 5.540997

cat("R-squared:", dt_r2, "\n")

## R-squared: 0.1999434

ggplot(data.frame(actual = test_data$proficiency, predicted = dt_predictions), 
       aes(x = actual, y = predicted)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(title = "Decision Tree: Actual vs. Predicted Values",
       x = "Actual Proficiency",
       y = "Predicted Proficiency") +
  theme_minimal()

dt_importance <- varImp(dt_model)
plot(dt_importance, main = "Decision Tree: Variable Importance")

Neural Network Model

preproc <- preProcess(train_data, method = c("center", "scale"))
train_data_scaled <- predict(preproc, train_data)
test_data_scaled <- predict(preproc, test_data)


nn_model <- train(
  proficiency ~ .,
  data = train_data_scaled,
  method = "nnet",
  trControl = ctrl,
  tuneLength = 5,
  linout = TRUE,
  trace = FALSE,
  maxit = 500
)


nn_model

## Neural Network 
## 
## 47 samples
##  2 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 38, 38, 36, 39, 37 
## Resampling results across tuning parameters:
## 
##   size  decay  RMSE       Rsquared    MAE      
##   1     0e+00  0.9311043  0.19073588  0.7800610
##   1     1e-04  0.9214202  0.20070758  0.7712265
##   1     1e-03  1.0080998  0.13003397  0.8425091
##   1     1e-02  0.9469247  0.15423751  0.7806526
##   1     1e-01  0.9563506  0.12370917  0.7734629
##   3     0e+00  1.1019919  0.08143896  0.8503733
##   3     1e-04  1.0572604  0.18210340  0.8456033
##   3     1e-03  1.6090489  0.01778995  1.2573556
##   3     1e-02  1.1006797  0.06766182  0.9418332
##   3     1e-01  1.0167830  0.04424340  0.8194010
##   5     0e+00  7.1035214  0.20504415  4.7240752
##   5     1e-04  2.7204605  0.02376320  1.9661360
##   5     1e-03  1.8360464  0.15465438  1.4223430
##   5     1e-02  1.1947307  0.02002665  0.9961097
##   5     1e-01  1.0389848  0.02550453  0.8367510
##   7     0e+00  2.3060228  0.17638860  1.7456562
##   7     1e-04  1.8929106  0.13051393  1.4170996
##   7     1e-03  2.0223818  0.09974229  1.4855287
##   7     1e-02  1.5292394  0.02519574  1.2277665
##   7     1e-01  1.0261356  0.03989707  0.8309402
##   9     0e+00  8.5809710  0.05730084  4.7861825
##   9     1e-04  5.4014805  0.07660154  3.2829514
##   9     1e-03  3.2035228  0.04956034  2.3623827
##   9     1e-02  1.4233815  0.04889394  1.1440116
##   9     1e-01  1.0039191  0.08018853  0.8177193
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1 and decay = 1e-04.

nn_predictions <- predict(nn_model, newdata = test_data_scaled)

# Evaluate model performance
nn_rmse <- RMSE(nn_predictions, test_data$proficiency)
nn_r2 <- R2(nn_predictions, test_data$proficiency)

cat("Neural Network Model:\n")

## Neural Network Model:

cat("RMSE:", nn_rmse, "\n")

## RMSE: 25.94952

cat("R-squared:", nn_r2, "\n")

## R-squared: 0.07847733

ggplot(data.frame(actual = test_data$proficiency, predicted = nn_predictions), 
       aes(x = actual, y = predicted)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(title = "Neural Network: Actual vs. Predicted Values",
       x = "Actual Proficiency",
       y = "Predicted Proficiency") +
  theme_minimal()

nn_importance <- varImp(nn_model)
plot(nn_importance, main = "Neural Network: Variable Importance")

Model Comparison

model_comparison <- data.frame(
  Model = c("Linear Regression", "Decision Tree", "Neural Network"),
  RMSE = c(lm_rmse, dt_rmse, nn_rmse),
  R_Squared = c(lm_r2, dt_r2, nn_r2)
)
kable(model_comparison, caption = "Model Performance Comparison") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)

Model Performance Comparison
Model	RMSE	R_Squared
Linear Regression	5.211434	0.4000578
Decision Tree	5.540997	0.1999434
Neural Network	25.949519	0.0784773

model_comparison_long <- model_comparison %>%
  pivot_longer(cols = c(RMSE, R_Squared), names_to = "Metric", values_to = "Value")

ggplot(model_comparison_long, aes(x = Model, y = Value, fill = Model)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ Metric, scales = "free") +
  theme_minimal() +
  labs(title = "Model Performance Comparison",
       y = "Value") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Key Findings and Insights

key_predictors <- bind_rows(
  data.frame(model = "Linear Regression", varImp(lm_model)$importance),
  data.frame(model = "Decision Tree", varImp(dt_model)$importance),
  data.frame(model = "Neural Network", varImp(nn_model)$importance)
)
key_predictors_summary <- key_predictors %>%
  group_by(model) %>%
  mutate(row = row_number()) %>%
  ungroup() %>%
  mutate(variable = case_when(
    row == 1 ~ "ppcstot",
    row == 2 ~ "unemployed",
    row == 3 ~ "bachelor_or_higher",
    row == 4 ~ "high_school_grad",
    row == 5 ~ "median_household_income",
    row == 6 ~ "poverty_rate"
  )) %>%
  select(-row) %>%
  group_by(variable) %>%
  summarize(avg_importance = mean(Overall, na.rm = TRUE)) %>%
  arrange(desc(avg_importance))

kable(key_predictors_summary, caption = "Key Predictors of Educational Outcomes") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)

Key Predictors of Educational Outcomes
variable	avg_importance
ppcstot	66.66667
unemployed	33.33333

regional_differences <- integrated_data %>%
  group_by(region) %>%
  summarize(
    mean_proficiency = mean(proficiency, na.rm = TRUE),
    mean_ppcstot = mean(ppcstot, na.rm = TRUE),
    mean_bachelor_rate = mean(bachelor_or_higher, na.rm = TRUE),
    mean_poverty_rate = mean(poverty_rate, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_proficiency))
kable(regional_differences, caption = "Educational Outcomes and Key Factors by Region") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)

Educational Outcomes and Key Factors by Region
region	mean_proficiency	mean_ppcstot	mean_bachelor_rate	mean_poverty_rate
Eastern Panhandle	27.79125	13744.00	NaN	NaN
North Central	26.60900	13317.70	NaN	NaN
Ohio Valley	26.42000	15695.70	NaN	NaN
Northern Panhandle	25.60333	17159.67	NaN	NaN
Metro Valley	25.35500	13918.17	NaN	NaN
Central Counties	24.62750	14275.75	NaN	NaN
Southern Counties	19.82500	13516.17	NaN	NaN
Other	17.42000	13777.00	NaN	NaN