##Introduction This analysis explores the relationships between educational outcomes, funding, demographics, and other socioeconomic factors across West Virginia counties. We aim to identify key drivers of educational performance and develop predictive models to help understand what factors most influence student achievement

##Key Research Questions

Load assessment data

**need to fix error still

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(rpart)
library(readxl)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(rpart.plot)

assessment_path <- './wv ed student achievement/Historical_AssessmentResults_SY15-to-SY21.xlsx'


t_assess_raw_school <- read_excel(path = assessment_path,
                           sheet = 'SY21 School & District',
                           range = 'b2:f7312')


t_assess_raw_science <- read_excel(path = assessment_path,
                           sheet = 'SY21 School & District',
                           range = 'db3:db7312', 
                           col_names = c('science_proficiency'),
                           na = '**')

t_assess_raw <- t_assess_raw_school %>%
  bind_cols(t_assess_raw_science) %>% 
  janitor::clean_names()  


# Remove subgroups
t_assess <- t_assess_raw %>% 
  filter(school == 999) %>% 
  filter(population_group == 'Total Population') %>% 
  filter(county != 'Statewide') %>% 
  mutate(proficiency = science_proficiency)  

print(t_assess)
## # A tibble: 55 × 7
##    county    school school_name    population_group subgroup science_proficiency
##    <chr>     <chr>  <chr>          <chr>            <chr>                  <dbl>
##  1 Barbour   999    Barbour Count… Total Population Total                   26.0
##  2 Berkeley  999    Berkeley Coun… Total Population Total                   28.6
##  3 Boone     999    Boone County … Total Population Total                   19.6
##  4 Braxton   999    Braxton Count… Total Population Total                   22.6
##  5 Brooke    999    Brooke County… Total Population Total                   21.1
##  6 Cabell    999    Cabell County… Total Population Total                   30.8
##  7 Calhoun   999    Calhoun Count… Total Population Total                   27.8
##  8 Clay      999    Clay County T… Total Population Total                   23.3
##  9 Doddridge 999    Doddridge Cou… Total Population Total                   31.3
## 10 Fayette   999    Fayette Count… Total Population Total                   17.4
## # ℹ 45 more rows
## # ℹ 1 more variable: proficiency <dbl>
assessment_path_2022 <- './wv ed student achievement/SY22_AssessmentProficiency_Public_All_Final.xlsx'

tryCatch({
  t_assess_2022_raw <- read_excel(path = assessment_path_2022) %>%
    janitor::clean_names()
})
## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
## • `` -> `...16`
## • `` -> `...18`
## • `` -> `...19`
## • `` -> `...20`
## • `` -> `...21`
## • `` -> `...23`
## • `` -> `...24`
## • `` -> `...25`
## • `` -> `...26`
## • `` -> `...28`
## • `` -> `...29`
## • `` -> `...30`
## • `` -> `...31`
## • `` -> `...33`
## • `` -> `...34`
## • `` -> `...35`
## • `` -> `...36`
## • `` -> `...38`
## • `` -> `...39`
## • `` -> `...40`
## • `` -> `...41`
## • `` -> `...43`
## • `` -> `...44`
## • `` -> `...45`
## • `` -> `...46`
## • `` -> `...48`
## • `` -> `...49`
## • `` -> `...50`
## • `` -> `...51`
## • `` -> `...53`
## • `` -> `...54`
## • `` -> `...55`
## • `` -> `...56`
## • `` -> `...58`
## • `` -> `...59`
## • `` -> `...60`
## • `` -> `...61`
## • `` -> `...63`
## • `` -> `...64`
## • `` -> `...65`
## • `` -> `...66`
## • `` -> `...68`
## • `` -> `...69`
## • `` -> `...70`
## • `` -> `...71`
## • `` -> `...73`
## • `` -> `...74`
## • `` -> `...75`
## • `` -> `...76`
## • `` -> `...78`
## • `` -> `...79`
## • `` -> `...80`
## • `` -> `...81`
## • `` -> `...83`
## • `` -> `...84`
## • `` -> `...85`
## • `` -> `...86`
## • `` -> `...88`
## • `` -> `...89`
## • `` -> `...90`
## • `` -> `...91`
## • `` -> `...93`
## • `` -> `...94`
## • `` -> `...95`
## • `` -> `...96`
## • `` -> `...98`
## • `` -> `...99`
## • `` -> `...100`
## • `` -> `...101`
## • `` -> `...103`
## • `` -> `...104`
## • `` -> `...105`
## • `` -> `...106`
## t_assess_2022 <- t_assess_2022_raw %>%
   ## filter(grepl("County", district)) %>%  
   ## mutate(county = gsub(" County Schools", "", district)) %>%
   ## select(county, contains("proficiency")) %>%
    ## arrange(county)
## print(t_assess_2022)

Load spending data

spending_path <- './us census ed spending/elsec22t.xls'

t_spending_raw <- read_excel(path = spending_path,,
                           sheet = 'elsec22t',
                           range = 'a1:gb14106') %>% 
  janitor::clean_names()
## New names:
## • `` -> `...73`
## • `` -> `...74`
## • `` -> `...75`
## • `` -> `...76`
## • `` -> `...77`
## • `` -> `...78`
## • `` -> `...79`
## • `` -> `...80`
## • `` -> `...81`
## • `` -> `...82`
## • `` -> `...83`
## • `` -> `...84`
## • `` -> `...85`
## • `` -> `...86`
## • `` -> `...87`
## • `` -> `...88`
## • `` -> `...89`
## • `` -> `...90`
## • `` -> `...91`
## • `` -> `...92`
## • `` -> `...93`
## • `` -> `...94`
## • `` -> `...95`
## • `` -> `...96`
## • `` -> `...97`
## • `` -> `...98`
## • `` -> `...99`
## • `` -> `...100`
## • `` -> `...101`
## • `` -> `...102`
## • `` -> `...103`
## • `` -> `...104`
## • `` -> `...105`
## • `` -> `...106`
## • `` -> `...107`
## • `` -> `...108`
## • `` -> `...109`
## • `` -> `...110`
## • `` -> `...111`
## • `` -> `...112`
## • `` -> `...113`
## • `` -> `...114`
## • `` -> `...115`
## • `` -> `...116`
## • `` -> `...117`
## • `` -> `...118`
## • `` -> `...119`
## • `` -> `...120`
## • `` -> `...121`
## • `` -> `...122`
## • `` -> `...123`
## • `` -> `...124`
## • `` -> `...125`
## • `` -> `...126`
## • `` -> `...127`
## • `` -> `...128`
## • `` -> `...129`
## • `` -> `...130`
## • `` -> `...131`
## • `` -> `...132`
## • `` -> `...133`
## • `` -> `...134`
## • `` -> `...135`
## • `` -> `...136`
## • `` -> `...137`
## • `` -> `...138`
## • `` -> `...139`
## • `` -> `...140`
## • `` -> `...141`
## • `` -> `...142`
## • `` -> `...143`
## • `` -> `...144`
## • `` -> `...145`
## • `` -> `...146`
## • `` -> `...147`
## • `` -> `...148`
## • `` -> `...149`
## • `` -> `...150`
## • `` -> `...151`
## • `` -> `...152`
## • `` -> `...153`
## • `` -> `...154`
## • `` -> `...155`
## • `` -> `...156`
## • `` -> `...157`
## • `` -> `...158`
## • `` -> `...159`
## • `` -> `...160`
## • `` -> `...161`
## • `` -> `...162`
## • `` -> `...163`
## • `` -> `...164`
## • `` -> `...165`
## • `` -> `...166`
## • `` -> `...167`
## • `` -> `...168`
## • `` -> `...169`
## • `` -> `...170`
## • `` -> `...171`
## • `` -> `...172`
## • `` -> `...173`
## • `` -> `...174`
## • `` -> `...175`
## • `` -> `...176`
## • `` -> `...177`
## • `` -> `...178`
## • `` -> `...179`
## • `` -> `...180`
## • `` -> `...181`
## • `` -> `...182`
## • `` -> `...183`
## • `` -> `...184`
cooperates <- c('MOUNTAIN STATE EDUCATIONAL SERVICES COOPERATIVE',
                'EASTERN PANHANDLE INSTRUCTIONAL COOPERATIVE',
                'SOUTHERN EDUCATIONAL SERVICES COOPERATIVE')

t_spending <- t_spending_raw %>% 
  filter(state == 49) %>% 
  filter(!name %in% cooperates) %>% 
  select(name, enroll, tfedrev, tstrev, tlocrev, totalexp, ppcstot) %>% 
  mutate(county = str_to_title(str_split_i(name, ' ',1)),
         county = ifelse(county == 'Mc', 'McDowell', county))


print(t_spending)
## # A tibble: 55 × 8
##    name                  enroll tfedrev tstrev tlocrev totalexp ppcstot county  
##    <chr>                  <dbl>   <dbl>  <dbl>   <dbl>    <dbl>   <dbl> <chr>   
##  1 BARBOUR CO SCH DIST     2144    7559  16584    5872    28021   11885 Barbour 
##  2 BERKELEY CO SCH DIST   19722   48407 140127   86699   264253   12704 Berkeley
##  3 BOONE CO SCH DIST       3177    8194  26858   14564    48642   14663 Boone   
##  4 BRAXTON CO SCH DIST     1747    5479  12748    6404    24417   13153 Braxton 
##  5 BROOKE CO SCH DIST      2582    6791  17114   21352    41908   15642 Brooke  
##  6 CABELL CO SCH DIST     11667   42518  88337   66699   183621   14538 Cabell  
##  7 CALHOUN CO SCH DIST      861    3254   9953    3190    15154   16085 Calhoun 
##  8 CLAY CO SCH DIST        1669    6157  17655    2791    25963   13825 Clay    
##  9 DODDRIDGE CO SCH DIST   1082    3455   3999   31752    38493   23563 Doddrid…
## 10 FAYETTE CO SCH DIST     5594   15293  51759   23477    83373   13777 Fayette 
## # ℹ 45 more rows

Load demographic data

Synthetic Data as of now, going to pull and use real data sources for Educational attainment data and Income and poverty data

t_demographics_unemployed <- read_csv('unemployed.csv', 
                            skip = 4,
                            na = 'N/A') %>%
  janitor::clean_names() %>% 
  filter(county != 'West Virginia',
         county != 'United States',
         !is.na(value_percent) ) %>% 
  select(county, value_percent) %>%
  rename(unemployed = value_percent)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 62 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): County, FIPS, Rank within US (of 3143 counties)
## dbl (2): Value (Percent), People (Unemployed)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
print(t_demographics_unemployed)
## # A tibble: 55 × 2
##    county          unemployed
##    <chr>                <dbl>
##  1 McDowell County       15.1
##  2 Braxton County        14.4
##  3 Logan County          13.3
##  4 Calhoun County        12.2
##  5 Roane County          11.7
##  6 Clay County           11.2
##  7 Mingo County          11.2
##  8 Webster County        11.1
##  9 Monroe County         10.6
## 10 Barbour County        10.1
## # ℹ 45 more rows
wv_counties <- unique(t_assess$county)

t_demographics_education <- tibble(
  county = wv_counties,
  bachelor_or_higher = runif(length(wv_counties), 10, 40), 
  high_school_grad = runif(length(wv_counties), 70, 95)    
)
t_demographics_income <- tibble(
  county = wv_counties,
  median_household_income = runif(length(wv_counties), 35000, 65000),
  poverty_rate = runif(length(wv_counties), 10, 30)
)

t_demographics <-  t_demographics_unemployed %>%
  left_join(t_demographics_education, by = "county") %>%
  left_join(t_demographics_income, by = "county")

print(t_demographics)
## # A tibble: 55 × 6
##    county  unemployed bachelor_or_higher high_school_grad median_household_inc…¹
##    <chr>        <dbl>              <dbl>            <dbl>                  <dbl>
##  1 McDowe…       15.1                 NA               NA                     NA
##  2 Braxto…       14.4                 NA               NA                     NA
##  3 Logan …       13.3                 NA               NA                     NA
##  4 Calhou…       12.2                 NA               NA                     NA
##  5 Roane …       11.7                 NA               NA                     NA
##  6 Clay C…       11.2                 NA               NA                     NA
##  7 Mingo …       11.2                 NA               NA                     NA
##  8 Webste…       11.1                 NA               NA                     NA
##  9 Monroe…       10.6                 NA               NA                     NA
## 10 Barbou…       10.1                 NA               NA                     NA
## # ℹ 45 more rows
## # ℹ abbreviated name: ¹​median_household_income
## # ℹ 1 more variable: poverty_rate <dbl>

Joined data

Checkbook data

checkbook_files <- list.files(path = "./checkbook_2023/", 
                             pattern = "*.csv", 
                             full.names = TRUE)
process_checkbook <- function(file_path) {
  county_name <- str_extract(basename(file_path), "(?<=23)[A-Za-z]+(?=\\d+)")
  
 tryCatch({
    # Read the file
    df <- read_csv(file_path, show_col_types = FALSE) %>%
      janitor::clean_names()
    

    if("county" %in% colnames(df)) {
      if(is.character(df$county)) {
        df$county <- as.numeric(df$county)
      }
    }
    

    df <- df %>% mutate(county_name = county_name)
    

    return(df)
  })
}


sample_counties <- c("./checkbook_2023/23-Barbour-02_Updated.csv",
                    "./checkbook_2023/23-Grant-24_Updated.csv",
                    "./checkbook_2023/23-Jackson-35_Updated.csv")
checkbook_data_samples <- map_dfr(sample_counties, process_checkbook)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
education_spending <- checkbook_data_samples %>%
  mutate(ck_amt = as.numeric(ck_amt)) %>%
  
filter(prog %in% c(1, 2, 3, 4) | 
         grepl("EDUCATION|SCHOOL|TEACH|INSTRUCT|CURRICULUM", toupper(invdesc))) %>%
  group_by(county_name) %>%
  summarize(
    total_education_spending = sum(ck_amt, na.rm = TRUE),
    instructional_spending = sum(ck_amt[grepl("INSTRUCT|TEACH|CLASSROOM", toupper(invdesc))], na.rm = TRUE),
    technology_spending = sum(ck_amt[grepl("COMPUTER|TECHNOLOGY|SOFTWARE", toupper(invdesc))], na.rm = TRUE),
    facility_spending = sum(ck_amt[grepl("FACILITY|BUILDING|MAINTENANCE", toupper(invdesc))], na.rm = TRUE),
    .groups = "drop"
  )
print(education_spending)
## # A tibble: 1 × 5
##   county_name total_education_spend…¹ instructional_spending technology_spending
##   <chr>                         <dbl>                  <dbl>               <dbl>
## 1 <NA>                       2271485.                 81529.               6442.
## # ℹ abbreviated name: ¹​total_education_spending
## # ℹ 1 more variable: facility_spending <dbl>
standardize_county_name <- function(county) {
  county <- str_to_title(county)  
  county <- str_trim(county)      
  county <- str_replace(county, "Mcdowell", "McDowell")  
  county <- str_replace(county, " County", "")  
  return(county)
}

if (exists("t_assess")) {
  t_assess <- t_assess %>% 
    mutate(county = standardize_county_name(county))
}

if (exists("t_assess_2022")) {
  t_assess_2022 <- t_assess_2022 %>% 
    mutate(county = standardize_county_name(county))
}

if (exists("t_spending")) {
  t_spending <- t_spending %>% 
    mutate(county = standardize_county_name(county))
}

if (exists("t_demographics")) {
  t_demographics <- t_demographics %>% 
    mutate(county = standardize_county_name(county))
}

if (exists("education_spending")) {
  education_spending <- education_spending %>% 
    mutate(county = standardize_county_name(county_name)) %>%
    select(-county_name)
}


integrated_data <- t_assess %>%
  left_join(t_spending, by = "county") %>%
  left_join(t_demographics, by = "county")
if (exists("education_spending")) {
  integrated_data <- integrated_data %>%
    left_join(education_spending, by = "county")
}
if (exists("t_assess_2022") && ncol(t_assess_2022) > 1) {
  integrated_data <- integrated_data %>%
    left_join(t_assess_2022, by = "county", suffix = c("_2021", "_2022"))
}
print(integrated_data)
## # A tibble: 55 × 23
##    county    school school_name    population_group subgroup science_proficiency
##    <chr>     <chr>  <chr>          <chr>            <chr>                  <dbl>
##  1 Barbour   999    Barbour Count… Total Population Total                   26.0
##  2 Berkeley  999    Berkeley Coun… Total Population Total                   28.6
##  3 Boone     999    Boone County … Total Population Total                   19.6
##  4 Braxton   999    Braxton Count… Total Population Total                   22.6
##  5 Brooke    999    Brooke County… Total Population Total                   21.1
##  6 Cabell    999    Cabell County… Total Population Total                   30.8
##  7 Calhoun   999    Calhoun Count… Total Population Total                   27.8
##  8 Clay      999    Clay County T… Total Population Total                   23.3
##  9 Doddridge 999    Doddridge Cou… Total Population Total                   31.3
## 10 Fayette   999    Fayette Count… Total Population Total                   17.4
## # ℹ 45 more rows
## # ℹ 17 more variables: proficiency <dbl>, name <chr>, enroll <dbl>,
## #   tfedrev <dbl>, tstrev <dbl>, tlocrev <dbl>, totalexp <dbl>, ppcstot <dbl>,
## #   unemployed <dbl>, bachelor_or_higher <dbl>, high_school_grad <dbl>,
## #   median_household_income <dbl>, poverty_rate <dbl>,
## #   total_education_spending <dbl>, instructional_spending <dbl>,
## #   technology_spending <dbl>, facility_spending <dbl>
# Merge data
t <- t_assess 

Summary Stats

summary_stats <- integrated_data %>%
  select(where(is.numeric)) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  group_by(variable) %>%
  summarize(
    n = sum(!is.na(value)),
    min = min(value, na.rm = TRUE),
    q1 = quantile(value, 0.25, na.rm = TRUE),
    median = median(value, na.rm = TRUE),
    mean = mean(value, na.rm = TRUE),
    q3 = quantile(value, 0.75, na.rm = TRUE),
    max = max(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE)
  )
## Warning: There were 16 warnings in `summarize()`.
## The first warning was:
## ℹ In argument: `min = min(value, na.rm = TRUE)`.
## ℹ In group 1: `variable = "bachelor_or_higher"`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 15 remaining warnings.
kable(summary_stats, caption = "Summary Statistics of Numeric Variables") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
Summary Statistics of Numeric Variables
variable n min q1 median mean q3 max sd
bachelor_or_higher 0 Inf NA NA NaN NA -Inf NA
enroll 55 800.00 1654.00 3177.00 4585.581818 5104.000 24392.0 4615.970845
facility_spending 0 Inf NA NA NaN NA -Inf NA
high_school_grad 0 Inf NA NA NaN NA -Inf NA
instructional_spending 0 Inf NA NA NaN NA -Inf NA
median_household_income 0 Inf NA NA NaN NA -Inf NA
poverty_rate 0 Inf NA NA NaN NA -Inf NA
ppcstot 55 11885.00 13151.00 13777.00 14466.054545 15235.500 23563.0 2188.579900
proficiency 55 14.81 21.45 24.36 25.304727 29.395 41.8 5.817084
science_proficiency 55 14.81 21.45 24.36 25.304727 29.395 41.8 5.817084
technology_spending 0 Inf NA NA NaN NA -Inf NA
tfedrev 55 1511.00 4991.00 10158.00 13311.909091 14517.500 109522.0 16225.080652
tlocrev 55 1956.00 8194.50 14813.00 25031.909091 33333.000 145623.0 26739.034924
total_education_spending 0 Inf NA NA NaN NA -Inf NA
totalexp 55 13954.00 26486.00 48642.00 69482.090909 81171.500 416491.0 70702.975648
tstrev 55 3895.00 12668.50 26858.00 34234.363636 39496.000 176062.0 33479.121896
unemployed 55 2.60 5.05 6.40 7.054545 8.500 15.1 2.918194
ggplot(integrated_data, aes(x = proficiency)) +
  geom_histogram(binwidth = 5, fill = "steelblue", color = "white") +
  labs(title = "Distribution of Science Proficiency Across WV Counties",
       x = "Science Proficiency (%)",
       y = "Count") +
  theme_minimal()

top_counties <- integrated_data %>%
  arrange(desc(proficiency)) %>%
  select(county, proficiency) %>%
  head(5)

bottom_counties <- integrated_data %>%
  arrange(proficiency) %>%
  select(county, proficiency) %>%
  head(5)

kable(top_counties, caption = "Top 5 Counties by Science Proficiency") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
Top 5 Counties by Science Proficiency
county proficiency
Monongalia 41.80
Jefferson 38.71
Ohio 35.75
Wood 35.28
Putnam 34.38
kable(bottom_counties, caption = "Bottom 5 Counties by Science Proficiency") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
Bottom 5 Counties by Science Proficiency
county proficiency
McDowell 14.81
Lewis 17.03
Wyoming 17.36
Fayette 17.42
Upshur 18.48
library(ggcorrplot)

Regional Analysis

eastern_panhandle <- c("Berkeley", "Jefferson", "Morgan", "Hampshire", "Mineral", "Hardy", "Grant", "Pendleton")
northern_panhandle <- c("Hancock", "Brooke", "Ohio", "Marshall", "Wetzel", "Tyler")
metro_valley <- c("Kanawha", "Putnam", "Cabell", "Wayne", "Lincoln", "Boone")
southern_counties <- c("Mingo", "Logan", "Wyoming", "McDowell", "Mercer", "Raleigh")
central_counties <- c("Braxton", "Clay", "Nicholas", "Webster", "Pocahontas", "Greenbrier", "Monroe", "Summers")
north_central <- c("Monongalia", "Marion", "Harrison", "Taylor", "Preston", "Barbour", "Tucker", "Randolph", "Upshur", "Lewis")
ohio_valley <- c("Wood", "Pleasants", "Tyler", "Doddridge", "Ritchie", "Wirt", "Jackson", "Mason", "Roane", "Calhoun", "Gilmer")

integrated_data <- integrated_data %>%
  mutate(region = case_when(
    county %in% eastern_panhandle ~ "Eastern Panhandle",
    county %in% northern_panhandle ~ "Northern Panhandle",
    county %in% metro_valley ~ "Metro Valley",
    county %in% southern_counties ~ "Southern Counties",
    county %in% central_counties ~ "Central Counties",
    county %in% north_central ~ "North Central",
    county %in% ohio_valley ~ "Ohio Valley",
    TRUE ~ "Other"
  ))

ggplot(integrated_data, aes(x = region, y = proficiency, fill = region)) +
  geom_boxplot() +
  labs(title = "Science Proficiency by Region",
       x = "Region",
       y = "Science Proficiency (%)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

region_stats <- integrated_data %>%
  group_by(region) %>%
  summarize(
    mean_proficiency = mean(proficiency, na.rm = TRUE),
    median_proficiency = median(proficiency, na.rm = TRUE),
    sd_proficiency = sd(proficiency, na.rm = TRUE),
    count = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_proficiency))

kable(region_stats, caption = "Science Proficiency by Region") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
Science Proficiency by Region
region mean_proficiency median_proficiency sd_proficiency count
Eastern Panhandle 27.79125 26.730 5.050399 8
North Central 26.60900 26.075 7.262833 10
Ohio Valley 26.42000 25.745 5.128446 10
Northern Panhandle 25.60333 24.650 5.429636 6
Metro Valley 25.35500 24.400 6.426949 6
Central Counties 24.62750 22.965 5.052512 8
Southern Counties 19.82500 18.790 3.979727 6
Other 17.42000 17.420 NA 1

Correlation Analysis

correlation_vars <- c("proficiency", "ppcstot", "unemployed", "bachelor_or_higher", 
                     "high_school_grad", "median_household_income", "poverty_rate")


cat("Available data counts:\n")
## Available data counts:
for(var in correlation_vars) {
  if(var %in% names(integrated_data)) {
    cat(var, ": ", sum(!is.na(integrated_data[[var]])), " non-NA values\n", sep="")
  } else {
    cat(var, ": not in dataset\n", sep="")
  }
}
## proficiency: 55 non-NA values
## ppcstot: 55 non-NA values
## unemployed: 55 non-NA values
## bachelor_or_higher: 0 non-NA values
## high_school_grad: 0 non-NA values
## median_household_income: 0 non-NA values
## poverty_rate: 0 non-NA values
available_vars <- correlation_vars[correlation_vars %in% names(integrated_data)]


correlation_data <- integrated_data %>%
  select(all_of(intersect(available_vars, names(integrated_data)))) %>%
  
  select_if(~ !all(is.na(.))) %>%
 
  na.omit()


if(nrow(correlation_data) >= 3 && ncol(correlation_data) >= 2) {
 
  no_var_cols <- correlation_data %>%
    summarise_all(~ var(., na.rm = TRUE) == 0) %>%
    pivot_longer(everything(), names_to = "variable", values_to = "no_variance") %>%
    filter(no_variance) %>%
    pull(variable)
  
  if(length(no_var_cols) > 0) {
    cat("Removing variables with no variance:", paste(no_var_cols, collapse=", "), "\n")
    correlation_data <- correlation_data %>% select(-all_of(no_var_cols))
  }
  
  
  correlation_matrix <- cor(correlation_data, use = "pairwise.complete.obs")
  

  if(any(is.na(correlation_matrix)) || any(is.infinite(as.matrix(correlation_matrix)))) {
    cat("Warning: Correlation matrix contains NA or Inf values.\n")
    
    corrplot::corrplot(correlation_matrix, 
                       method = "color", 
                       type = "lower", 
                       addCoef.col = "black",
                       tl.col = "black", 
                       title = "Correlation Between Educational Outcomes and Various Factors")
  } else {
    
    if(ncol(correlation_data) < 3) {
      ggcorrplot(
        correlation_matrix,
        hc.order = FALSE,  # Disable hierarchical clustering with few variables
        type = "lower",
        lab = TRUE,
        lab_size = 3,
        method = "circle",
        colors = c("#6D9EC1", "white", "#E46726"),
        title = "Correlation Between Educational Outcomes and Various Factors",
        ggtheme = theme_minimal
      )
    } else {
    
      tryCatch({
        ggcorrplot(
          correlation_matrix,
          hc.order = TRUE,
          type = "lower",
          lab = TRUE,
          lab_size = 3,
          method = "circle",
          colors = c("#6D9EC1", "white", "#E46726"),
          title = "Correlation Between Educational Outcomes and Various Factors",
          ggtheme = theme_minimal
        )
      }, error = function(e) {
       
        cat("Falling back to plot without hierarchical clustering due to error:", e$message, "\n")
        ggcorrplot(
          correlation_matrix,
          hc.order = FALSE,
          type = "lower",
          lab = TRUE,
          lab_size = 3,
          method = "circle",
          colors = c("#6D9EC1", "white", "#E46726"),
          title = "Correlation Between Educational Outcomes and Various Factors",
          ggtheme = theme_minimal
        )
      })
    }
  }
  
 
  if("proficiency" %in% colnames(correlation_matrix)) {
    proficiency_correlations <- correlation_matrix[, "proficiency"]
    proficiency_correlations <- data.frame(
      variable = names(proficiency_correlations),
      correlation = proficiency_correlations
    ) %>%
      arrange(desc(abs(correlation)))
    

    kable(proficiency_correlations, caption = "Correlations with Science Proficiency") %>%
      kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
  } else {
    cat("Proficiency variable not available for correlation analysis")
  }
} else {
  cat("Not enough complete data for correlation analysis.\n")
  cat("Number of rows in correlation data: ", nrow(correlation_data), "\n")
  cat("Number of columns in correlation data: ", ncol(correlation_data), "\n")
  cat("Please ensure you have more complete data or adjust your variable selection.")
} 
Correlations with Science Proficiency
variable correlation
proficiency proficiency 1.0000000
unemployed unemployed -0.3233399
ppcstot ppcstot 0.0935548

Spending Analysis

ggplot(integrated_data, aes(x = ppcstot, y = proficiency)) +
  geom_point(aes(color = region), size = 3, alpha = 0.7) +
  geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
  labs(title = "Relationship Between Per-Pupil Spending and Science Proficiency",
       x = "Per-Pupil Spending ($)",
       y = "Science Proficiency (%)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

spending_by_region <- integrated_data %>%
  group_by(region) %>%
  summarize(
    mean_spending = mean(ppcstot, na.rm = TRUE),
    mean_proficiency = mean(proficiency, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_spending))
kable(spending_by_region, caption = "Education Spending and Proficiency by Region") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
Education Spending and Proficiency by Region
region mean_spending mean_proficiency
Northern Panhandle 17159.67 25.60333
Ohio Valley 15695.70 26.42000
Central Counties 14275.75 24.62750
Metro Valley 13918.17 25.35500
Other 13777.00 17.42000
Eastern Panhandle 13744.00 27.79125
Southern Counties 13516.17 19.82500
North Central 13317.70 26.60900

Bar Chart and Scatter Plot of proficiency by county

ggplot(integrated_data, aes(x = reorder(county, proficiency), y = proficiency)) +
  geom_col(fill = "steelblue") +
  coord_flip() +  # Flip coordinates for better county name display
  labs(title = "Science Proficiency by County in West Virginia",
       x = "County",
       y = "Science Proficiency (%)") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 8))  # Adjust text size as needed

if("ppcstot" %in% names(integrated_data)) {
  ggplot(integrated_data, aes(x = reorder(county, ppcstot), y = ppcstot)) +
    geom_col(fill = "darkgreen") +
    coord_flip() +
    labs(title = "Per-Pupil Spending by County in West Virginia",
         x = "County",
         y = "Per-Pupil Spending ($)") +
    theme_minimal() +
    theme(axis.text.y = element_text(size = 8))
}

if("ppcstot" %in% names(integrated_data)) {
  ggplot(integrated_data, aes(x = ppcstot, y = proficiency)) +
    geom_point(color = "blue", size = 3, alpha = 0.7) +
    # Instead of geom_text_repel, use simpler text labeling
    geom_text(aes(label = county), hjust = -0.1, vjust = 0.1, size = 2.5) +
    geom_smooth(method = "lm", se = TRUE, color = "red") +  # Add trend line
    labs(title = "Science Proficiency vs. Per-Pupil Spending",
         x = "Per-Pupil Spending ($)",
         y = "Science Proficiency (%)") +
    theme_minimal()
}
## `geom_smooth()` using formula = 'y ~ x'

## Principal Component Analysis

pca_data <- integrated_data %>%
  select(proficiency, ppcstot, unemployed) %>%
  drop_na()


print(dim(pca_data))
## [1] 55  3
if(nrow(pca_data) > 0 && ncol(pca_data) > 1) {
 
  pca_data_scaled <- scale(pca_data)
  

  pca_result <- prcomp(pca_data_scaled)
  

  print(summary(pca_result))
  

  biplot(pca_result, scale = 0)
}
## Importance of components:
##                           PC1    PC2    PC3
## Standard deviation     1.1794 0.9672 0.8207
## Proportion of Variance 0.4637 0.3118 0.2245
## Cumulative Proportion  0.4637 0.7755 1.0000

## K-means Clustering

set.seed(123)
km <- kmeans(pca_data, centers = 3)
integrated_data$cluster <- km$cluster[match(rownames(integrated_data), rownames(pca_data))]


ggplot(integrated_data, aes(x = ppcstot, y = proficiency, color = factor(cluster))) +
  geom_point(size = 3) +
  labs(title = "K-means Clustering of WV Counties",
       x = "Per-Pupil Spending ($)",
       y = "Science Proficiency (%)",
       color = "Cluster") +
  theme_minimal()

Prepare Data for Modeling

model_data <- integrated_data %>%
  select(proficiency, ppcstot, unemployed) %>%
  drop_na()

dim(model_data)
## [1] 55  3
set.seed(123)
train_index <- createDataPartition(model_data$proficiency, p = 0.8, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]


ctrl <- trainControl(method = "cv", number = 5)


lm_model <- train(
  proficiency ~ .,
  data = train_data,
  method = "lm",
  trControl = ctrl
)


summary(lm_model$finalModel)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.0840 -4.4404 -0.2178  3.4038 16.1935 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 23.4887305  6.2262988   3.773 0.000479 ***
## ppcstot      0.0003147  0.0003796   0.829 0.411622    
## unemployed  -0.3876326  0.3104507  -1.249 0.218410    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.733 on 44 degrees of freedom
## Multiple R-squared:  0.05533,    Adjusted R-squared:  0.01239 
## F-statistic: 1.289 on 2 and 44 DF,  p-value: 0.2859
lm_predictions <- predict(lm_model, newdata = test_data)


lm_rmse <- RMSE(lm_predictions, test_data$proficiency)
lm_r2 <- R2(lm_predictions, test_data$proficiency)

cat("Linear Regression Model:\n")
## Linear Regression Model:
cat("RMSE:", lm_rmse, "\n")
## RMSE: 5.211434
cat("R-squared:", lm_r2, "\n")
## R-squared: 0.4000578

Decision Tree Model

dt_model <- train(
  proficiency ~ .,
  data = train_data,
  method = "rpart",
  trControl = ctrl,
  tuneLength = 10
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
dt_model
## CART 
## 
## 47 samples
##  2 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 37, 38, 38, 39, 36 
## Resampling results across tuning parameters:
## 
##   cp          RMSE      Rsquared    MAE     
##   0.00000000  5.699724  0.13584683  4.564666
##   0.01523184  5.699724  0.13584683  4.564666
##   0.03046368  5.699724  0.13584683  4.564666
##   0.04569552  5.699724  0.13584683  4.564666
##   0.06092736  5.699724  0.13584683  4.564666
##   0.07615920  5.889352  0.07521389  4.763857
##   0.09139104  5.889352  0.07521389  4.763857
##   0.10662288  5.889352  0.07521389  4.763857
##   0.12185472  5.819587  0.07521389  4.786079
##   0.13708656  5.926893  0.03330881  4.901399
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.06092736.
rpart.plot(dt_model$finalModel, 
           box.palette = "Blues",
           main = "Decision Tree for Science Proficiency")

dt_predictions <- predict(dt_model, newdata = test_data)

dt_rmse <- RMSE(dt_predictions, test_data$proficiency)
dt_r2 <- R2(dt_predictions, test_data$proficiency)

cat("Decision Tree Model:\n")
## Decision Tree Model:
cat("RMSE:", dt_rmse, "\n")
## RMSE: 5.540997
cat("R-squared:", dt_r2, "\n")
## R-squared: 0.1999434
ggplot(data.frame(actual = test_data$proficiency, predicted = dt_predictions), 
       aes(x = actual, y = predicted)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(title = "Decision Tree: Actual vs. Predicted Values",
       x = "Actual Proficiency",
       y = "Predicted Proficiency") +
  theme_minimal()

dt_importance <- varImp(dt_model)
plot(dt_importance, main = "Decision Tree: Variable Importance")

Neural Network Model

preproc <- preProcess(train_data, method = c("center", "scale"))
train_data_scaled <- predict(preproc, train_data)
test_data_scaled <- predict(preproc, test_data)


nn_model <- train(
  proficiency ~ .,
  data = train_data_scaled,
  method = "nnet",
  trControl = ctrl,
  tuneLength = 5,
  linout = TRUE,
  trace = FALSE,
  maxit = 500
)


nn_model
## Neural Network 
## 
## 47 samples
##  2 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 38, 38, 36, 39, 37 
## Resampling results across tuning parameters:
## 
##   size  decay  RMSE       Rsquared    MAE      
##   1     0e+00  0.9311043  0.19073588  0.7800610
##   1     1e-04  0.9214202  0.20070758  0.7712265
##   1     1e-03  1.0080998  0.13003397  0.8425091
##   1     1e-02  0.9469247  0.15423751  0.7806526
##   1     1e-01  0.9563506  0.12370917  0.7734629
##   3     0e+00  1.1019919  0.08143896  0.8503733
##   3     1e-04  1.0572604  0.18210340  0.8456033
##   3     1e-03  1.6090489  0.01778995  1.2573556
##   3     1e-02  1.1006797  0.06766182  0.9418332
##   3     1e-01  1.0167830  0.04424340  0.8194010
##   5     0e+00  7.1035214  0.20504415  4.7240752
##   5     1e-04  2.7204605  0.02376320  1.9661360
##   5     1e-03  1.8360464  0.15465438  1.4223430
##   5     1e-02  1.1947307  0.02002665  0.9961097
##   5     1e-01  1.0389848  0.02550453  0.8367510
##   7     0e+00  2.3060228  0.17638860  1.7456562
##   7     1e-04  1.8929106  0.13051393  1.4170996
##   7     1e-03  2.0223818  0.09974229  1.4855287
##   7     1e-02  1.5292394  0.02519574  1.2277665
##   7     1e-01  1.0261356  0.03989707  0.8309402
##   9     0e+00  8.5809710  0.05730084  4.7861825
##   9     1e-04  5.4014805  0.07660154  3.2829514
##   9     1e-03  3.2035228  0.04956034  2.3623827
##   9     1e-02  1.4233815  0.04889394  1.1440116
##   9     1e-01  1.0039191  0.08018853  0.8177193
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1 and decay = 1e-04.
nn_predictions <- predict(nn_model, newdata = test_data_scaled)

# Evaluate model performance
nn_rmse <- RMSE(nn_predictions, test_data$proficiency)
nn_r2 <- R2(nn_predictions, test_data$proficiency)

cat("Neural Network Model:\n")
## Neural Network Model:
cat("RMSE:", nn_rmse, "\n")
## RMSE: 25.94952
cat("R-squared:", nn_r2, "\n")
## R-squared: 0.07847733
ggplot(data.frame(actual = test_data$proficiency, predicted = nn_predictions), 
       aes(x = actual, y = predicted)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(title = "Neural Network: Actual vs. Predicted Values",
       x = "Actual Proficiency",
       y = "Predicted Proficiency") +
  theme_minimal()

nn_importance <- varImp(nn_model)
plot(nn_importance, main = "Neural Network: Variable Importance")

Model Comparison

model_comparison <- data.frame(
  Model = c("Linear Regression", "Decision Tree", "Neural Network"),
  RMSE = c(lm_rmse, dt_rmse, nn_rmse),
  R_Squared = c(lm_r2, dt_r2, nn_r2)
)
kable(model_comparison, caption = "Model Performance Comparison") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
Model Performance Comparison
Model RMSE R_Squared
Linear Regression 5.211434 0.4000578
Decision Tree 5.540997 0.1999434
Neural Network 25.949519 0.0784773
model_comparison_long <- model_comparison %>%
  pivot_longer(cols = c(RMSE, R_Squared), names_to = "Metric", values_to = "Value")

ggplot(model_comparison_long, aes(x = Model, y = Value, fill = Model)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ Metric, scales = "free") +
  theme_minimal() +
  labs(title = "Model Performance Comparison",
       y = "Value") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Key Findings and Insights

key_predictors <- bind_rows(
  data.frame(model = "Linear Regression", varImp(lm_model)$importance),
  data.frame(model = "Decision Tree", varImp(dt_model)$importance),
  data.frame(model = "Neural Network", varImp(nn_model)$importance)
)
key_predictors_summary <- key_predictors %>%
  group_by(model) %>%
  mutate(row = row_number()) %>%
  ungroup() %>%
  mutate(variable = case_when(
    row == 1 ~ "ppcstot",
    row == 2 ~ "unemployed",
    row == 3 ~ "bachelor_or_higher",
    row == 4 ~ "high_school_grad",
    row == 5 ~ "median_household_income",
    row == 6 ~ "poverty_rate"
  )) %>%
  select(-row) %>%
  group_by(variable) %>%
  summarize(avg_importance = mean(Overall, na.rm = TRUE)) %>%
  arrange(desc(avg_importance))

kable(key_predictors_summary, caption = "Key Predictors of Educational Outcomes") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
Key Predictors of Educational Outcomes
variable avg_importance
ppcstot 66.66667
unemployed 33.33333
regional_differences <- integrated_data %>%
  group_by(region) %>%
  summarize(
    mean_proficiency = mean(proficiency, na.rm = TRUE),
    mean_ppcstot = mean(ppcstot, na.rm = TRUE),
    mean_bachelor_rate = mean(bachelor_or_higher, na.rm = TRUE),
    mean_poverty_rate = mean(poverty_rate, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_proficiency))
kable(regional_differences, caption = "Educational Outcomes and Key Factors by Region") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
Educational Outcomes and Key Factors by Region
region mean_proficiency mean_ppcstot mean_bachelor_rate mean_poverty_rate
Eastern Panhandle 27.79125 13744.00 NaN NaN
North Central 26.60900 13317.70 NaN NaN
Ohio Valley 26.42000 15695.70 NaN NaN
Northern Panhandle 25.60333 17159.67 NaN NaN
Metro Valley 25.35500 13918.17 NaN NaN
Central Counties 24.62750 14275.75 NaN NaN
Southern Counties 19.82500 13516.17 NaN NaN
Other 17.42000 13777.00 NaN NaN