WV County Education Outcomes Prediction

Thesis

The greater a Regions totalexp, the higher educational proficiency of students in that region.

Load assessment data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(rpart)
library(readxl)
library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(rpart)
library(rpart.plot)

assessment_path <- './wv ed student achievement/Historical_AssessmentResults_SY15-to-SY21.xlsx'


t_assess_raw_school <- read_excel(path = assessment_path,
                           sheet = 'SY21 School & District',
                           range = 'b2:f7312')


t_assess_raw_science <- read_excel(path = assessment_path,
                           sheet = 'SY21 School & District',
                           range = 'db3:db7312', 
                           col_names = c('science_proficiency'),
                           na = '**')

t_assess_raw <- t_assess_raw_school %>%
  bind_cols(t_assess_raw_science) %>% 
  janitor::clean_names()  


# Remove subgroups
t_assess <- t_assess_raw %>% 
  filter(school == 999) %>% 
  filter(population_group == 'Total Population') %>% 
  filter(county != 'Statewide') %>% 
  mutate(proficiency = science_proficiency)  

print(t_assess)

## # A tibble: 55 × 7
##    county    school school_name    population_group subgroup science_proficiency
##    <chr>     <chr>  <chr>          <chr>            <chr>                  <dbl>
##  1 Barbour   999    Barbour Count… Total Population Total                   26.0
##  2 Berkeley  999    Berkeley Coun… Total Population Total                   28.6
##  3 Boone     999    Boone County … Total Population Total                   19.6
##  4 Braxton   999    Braxton Count… Total Population Total                   22.6
##  5 Brooke    999    Brooke County… Total Population Total                   21.1
##  6 Cabell    999    Cabell County… Total Population Total                   30.8
##  7 Calhoun   999    Calhoun Count… Total Population Total                   27.8
##  8 Clay      999    Clay County T… Total Population Total                   23.3
##  9 Doddridge 999    Doddridge Cou… Total Population Total                   31.3
## 10 Fayette   999    Fayette Count… Total Population Total                   17.4
## # ℹ 45 more rows
## # ℹ 1 more variable: proficiency <dbl>

Load spending data

spending_path <- './us census ed spending/elsec22t.xls'

t_spending_raw <- read_excel(path = spending_path,
                           sheet = 'elsec22t',
                           range = 'a1:gb14106') %>% 
  janitor::clean_names()

## New names:
## • `` -> `...73`
## • `` -> `...74`
## • `` -> `...75`
## • `` -> `...76`
## • `` -> `...77`
## • `` -> `...78`
## • `` -> `...79`
## • `` -> `...80`
## • `` -> `...81`
## • `` -> `...82`
## • `` -> `...83`
## • `` -> `...84`
## • `` -> `...85`
## • `` -> `...86`
## • `` -> `...87`
## • `` -> `...88`
## • `` -> `...89`
## • `` -> `...90`
## • `` -> `...91`
## • `` -> `...92`
## • `` -> `...93`
## • `` -> `...94`
## • `` -> `...95`
## • `` -> `...96`
## • `` -> `...97`
## • `` -> `...98`
## • `` -> `...99`
## • `` -> `...100`
## • `` -> `...101`
## • `` -> `...102`
## • `` -> `...103`
## • `` -> `...104`
## • `` -> `...105`
## • `` -> `...106`
## • `` -> `...107`
## • `` -> `...108`
## • `` -> `...109`
## • `` -> `...110`
## • `` -> `...111`
## • `` -> `...112`
## • `` -> `...113`
## • `` -> `...114`
## • `` -> `...115`
## • `` -> `...116`
## • `` -> `...117`
## • `` -> `...118`
## • `` -> `...119`
## • `` -> `...120`
## • `` -> `...121`
## • `` -> `...122`
## • `` -> `...123`
## • `` -> `...124`
## • `` -> `...125`
## • `` -> `...126`
## • `` -> `...127`
## • `` -> `...128`
## • `` -> `...129`
## • `` -> `...130`
## • `` -> `...131`
## • `` -> `...132`
## • `` -> `...133`
## • `` -> `...134`
## • `` -> `...135`
## • `` -> `...136`
## • `` -> `...137`
## • `` -> `...138`
## • `` -> `...139`
## • `` -> `...140`
## • `` -> `...141`
## • `` -> `...142`
## • `` -> `...143`
## • `` -> `...144`
## • `` -> `...145`
## • `` -> `...146`
## • `` -> `...147`
## • `` -> `...148`
## • `` -> `...149`
## • `` -> `...150`
## • `` -> `...151`
## • `` -> `...152`
## • `` -> `...153`
## • `` -> `...154`
## • `` -> `...155`
## • `` -> `...156`
## • `` -> `...157`
## • `` -> `...158`
## • `` -> `...159`
## • `` -> `...160`
## • `` -> `...161`
## • `` -> `...162`
## • `` -> `...163`
## • `` -> `...164`
## • `` -> `...165`
## • `` -> `...166`
## • `` -> `...167`
## • `` -> `...168`
## • `` -> `...169`
## • `` -> `...170`
## • `` -> `...171`
## • `` -> `...172`
## • `` -> `...173`
## • `` -> `...174`
## • `` -> `...175`
## • `` -> `...176`
## • `` -> `...177`
## • `` -> `...178`
## • `` -> `...179`
## • `` -> `...180`
## • `` -> `...181`
## • `` -> `...182`
## • `` -> `...183`
## • `` -> `...184`

cooperates <- c('MOUNTAIN STATE EDUCATIONAL SERVICES COOPERATIVE',
                'EASTERN PANHANDLE INSTRUCTIONAL COOPERATIVE',
                'SOUTHERN EDUCATIONAL SERVICES COOPERATIVE')

t_spending <- t_spending_raw %>% 
  filter(state == 49) %>% 
  filter(!name %in% cooperates) %>% 
  select(name, enroll, tfedrev, tstrev, tlocrev, totalexp, ppcstot) %>% 
  mutate(county = str_to_title(str_split_i(name, ' ',1)),
         county = ifelse(county == 'Mc', 'McDowell', county))


print(t_spending)

## # A tibble: 55 × 8
##    name                  enroll tfedrev tstrev tlocrev totalexp ppcstot county  
##    <chr>                  <dbl>   <dbl>  <dbl>   <dbl>    <dbl>   <dbl> <chr>   
##  1 BARBOUR CO SCH DIST     2144    7559  16584    5872    28021   11885 Barbour 
##  2 BERKELEY CO SCH DIST   19722   48407 140127   86699   264253   12704 Berkeley
##  3 BOONE CO SCH DIST       3177    8194  26858   14564    48642   14663 Boone   
##  4 BRAXTON CO SCH DIST     1747    5479  12748    6404    24417   13153 Braxton 
##  5 BROOKE CO SCH DIST      2582    6791  17114   21352    41908   15642 Brooke  
##  6 CABELL CO SCH DIST     11667   42518  88337   66699   183621   14538 Cabell  
##  7 CALHOUN CO SCH DIST      861    3254   9953    3190    15154   16085 Calhoun 
##  8 CLAY CO SCH DIST        1669    6157  17655    2791    25963   13825 Clay    
##  9 DODDRIDGE CO SCH DIST   1082    3455   3999   31752    38493   23563 Doddrid…
## 10 FAYETTE CO SCH DIST     5594   15293  51759   23477    83373   13777 Fayette 
## # ℹ 45 more rows

Load demographic data

t_demographics_unemployed <- read_csv('./demographics/unemployed.csv', 
                            skip = 4,
                            na = 'N/A') %>%
  janitor::clean_names() %>% 
  filter(county != 'West Virginia',
         county != 'United States',
         !is.na(value_percent) ) %>% 
  select(county, value_percent) %>%
  rename(unemployed = value_percent)

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 62 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): County, FIPS, Rank within US (of 3143 counties)
## dbl (2): Value (Percent), People (Unemployed)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

t_demographics <-  t_demographics_unemployed

print(t_demographics)

## # A tibble: 55 × 2
##    county          unemployed
##    <chr>                <dbl>
##  1 McDowell County       15.1
##  2 Braxton County        14.4
##  3 Logan County          13.3
##  4 Calhoun County        12.2
##  5 Roane County          11.7
##  6 Clay County           11.2
##  7 Mingo County          11.2
##  8 Webster County        11.1
##  9 Monroe County         10.6
## 10 Barbour County        10.1
## # ℹ 45 more rows

Joined data

# Merge data
t_demographics <- t_demographics %>%
  mutate(county = str_replace(county, " County", ""))



t <- t_assess 
t <- t %>% 
  left_join(t_spending, by = 'county') %>% 
  left_join(t_demographics, by = 'county')%>% 
  mutate(proficiency = as.numeric(proficiency),
         proficiency = ifelse(proficiency < 0, NA, proficiency),
         proficiency = ifelse(proficiency > 100, NA, proficiency))
summary(t)

##     county             school          school_name        population_group  
##  Length:55          Length:55          Length:55          Length:55         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    subgroup         science_proficiency  proficiency        name          
##  Length:55          Min.   :14.81       Min.   :14.81   Length:55         
##  Class :character   1st Qu.:21.45       1st Qu.:21.45   Class :character  
##  Mode  :character   Median :24.36       Median :24.36   Mode  :character  
##                     Mean   :25.30       Mean   :25.30                     
##                     3rd Qu.:29.39       3rd Qu.:29.39                     
##                     Max.   :41.80       Max.   :41.80                     
##      enroll         tfedrev           tstrev          tlocrev      
##  Min.   :  800   Min.   :  1511   Min.   :  3895   Min.   :  1956  
##  1st Qu.: 1654   1st Qu.:  4991   1st Qu.: 12668   1st Qu.:  8194  
##  Median : 3177   Median : 10158   Median : 26858   Median : 14813  
##  Mean   : 4586   Mean   : 13312   Mean   : 34234   Mean   : 25032  
##  3rd Qu.: 5104   3rd Qu.: 14518   3rd Qu.: 39496   3rd Qu.: 33333  
##  Max.   :24392   Max.   :109522   Max.   :176062   Max.   :145623  
##     totalexp         ppcstot        unemployed    
##  Min.   : 13954   Min.   :11885   Min.   : 2.600  
##  1st Qu.: 26486   1st Qu.:13151   1st Qu.: 5.050  
##  Median : 48642   Median :13777   Median : 6.400  
##  Mean   : 69482   Mean   :14466   Mean   : 7.055  
##  3rd Qu.: 81172   3rd Qu.:15236   3rd Qu.: 8.500  
##  Max.   :416491   Max.   :23563   Max.   :15.100

## Combing all seperate data csv into one file

file_paths <- list.files('./checkbook_2023', full.names = TRUE, pattern = "\\.csv$")


get_county_from_path <- function(path) {
  path %>%
    basename() %>%
    str_remove("\\.csv$") %>%
    str_to_lower()
}

checkbook_all <- map_dfr(file_paths, function(path) {
  read_csv(path, show_col_types = FALSE) %>%
    janitor::clean_names() %>%
    mutate(county = get_county_from_path(path))
})

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

checkbook_summary <- checkbook_all %>%
  filter(!is.na(ck_amt)) %>%
  group_by(county) %>%
  summarise(total_checkbook_spending = sum(ck_amt, na.rm = TRUE)) %>%
  ungroup()



## Joining checkbook spending data to the other data sources here 

checkbook_summary <- checkbook_summary %>%
  mutate(county = str_to_title(str_extract(county, "(?<=-)[^-]+(?=-)")))
  
completely_joined <- t %>%
left_join(checkbook_summary, by = "county")

Data Exploration

## Determining Regions

completely_joined <- completely_joined %>%
  mutate(region = case_when(
    county %in% c("Berkeley", "Jefferson", "Morgan") ~ "Eastern Panhandle",
    county %in% c("Grant", "Hampshire", "Hardy", "Mineral", "Pendleton", "Tucker") ~ "Potomac Highlands",
    county %in% c("Barbour", "Doddridge", "Harrison", "Marion", "Monongalia", "Preston", "Taylor") ~ "North Central",
    county %in% c("Brooke", "Hancock", "Marshall", "Ohio", "Tyler", "Wetzel") ~ "Northern Panhandle",
    county %in% c("Braxton", "Calhoun", "Clay", "Gilmer", "Lewis", "Nicholas", "Roane", "Upshur", "Webster", "Wirt") ~ "Central WV",
    county %in% c("Boone", "Fayette", "Logan", "McDowell", "Mercer", "Mingo", "Raleigh", "Wyoming") ~ "Southern Coalfields",
    county %in% c("Cabell", "Lincoln", "Mason", "Putnam", "Wayne") ~ "Southwestern",
    county %in% c("Jackson", "Kanawha", "Pleasants", "Ritchie", "Wood") ~ "Western",
    county %in% c("Greenbrier", "Monroe", "Pocahontas", "Summers") ~ "Greenbrier Valley",
    TRUE ~ "Other"
  ))


glimpse(completely_joined)

## Rows: 55
## Columns: 17
## $ county                   <chr> "Barbour", "Berkeley", "Boone", "Braxton", "B…
## $ school                   <chr> "999", "999", "999", "999", "999", "999", "99…
## $ school_name              <chr> "Barbour County Total", "Berkeley County Tota…
## $ population_group         <chr> "Total Population", "Total Population", "Tota…
## $ subgroup                 <chr> "Total", "Total", "Total", "Total", "Total", …
## $ science_proficiency      <dbl> 25.97, 28.63, 19.58, 22.65, 21.14, 30.82, 27.…
## $ proficiency              <dbl> 25.97, 28.63, 19.58, 22.65, 21.14, 30.82, 27.…
## $ name                     <chr> "BARBOUR CO SCH DIST", "BERKELEY CO SCH DIST"…
## $ enroll                   <dbl> 2144, 19722, 3177, 1747, 2582, 11667, 861, 16…
## $ tfedrev                  <dbl> 7559, 48407, 8194, 5479, 6791, 42518, 3254, 6…
## $ tstrev                   <dbl> 16584, 140127, 26858, 12748, 17114, 88337, 99…
## $ tlocrev                  <dbl> 5872, 86699, 14564, 6404, 21352, 66699, 3190,…
## $ totalexp                 <dbl> 28021, 264253, 48642, 24417, 41908, 183621, 1…
## $ ppcstot                  <dbl> 11885, 12704, 14663, 13153, 15642, 14538, 160…
## $ unemployed               <dbl> 10.1, 4.6, 9.8, 14.4, 5.7, 6.1, 12.2, 11.2, 4…
## $ total_checkbook_spending <dbl> 25365788, 270106909, 60173833, 28709585, 3993…
## $ region                   <chr> "North Central", "Eastern Panhandle", "Southe…

summary(completely_joined)

##     county             school          school_name        population_group  
##  Length:55          Length:55          Length:55          Length:55         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    subgroup         science_proficiency  proficiency        name          
##  Length:55          Min.   :14.81       Min.   :14.81   Length:55         
##  Class :character   1st Qu.:21.45       1st Qu.:21.45   Class :character  
##  Mode  :character   Median :24.36       Median :24.36   Mode  :character  
##                     Mean   :25.30       Mean   :25.30                     
##                     3rd Qu.:29.39       3rd Qu.:29.39                     
##                     Max.   :41.80       Max.   :41.80                     
##                                                                           
##      enroll         tfedrev           tstrev          tlocrev      
##  Min.   :  800   Min.   :  1511   Min.   :  3895   Min.   :  1956  
##  1st Qu.: 1654   1st Qu.:  4991   1st Qu.: 12668   1st Qu.:  8194  
##  Median : 3177   Median : 10158   Median : 26858   Median : 14813  
##  Mean   : 4586   Mean   : 13312   Mean   : 34234   Mean   : 25032  
##  3rd Qu.: 5104   3rd Qu.: 14518   3rd Qu.: 39496   3rd Qu.: 33333  
##  Max.   :24392   Max.   :109522   Max.   :176062   Max.   :145623  
##                                                                    
##     totalexp         ppcstot        unemployed     total_checkbook_spending
##  Min.   : 13954   Min.   :11885   Min.   : 2.600   Min.   : 17516974       
##  1st Qu.: 26486   1st Qu.:13151   1st Qu.: 5.050   1st Qu.: 32887299       
##  Median : 48642   Median :13777   Median : 6.400   Median : 53524468       
##  Mean   : 69482   Mean   :14466   Mean   : 7.055   Mean   : 84148704       
##  3rd Qu.: 81172   3rd Qu.:15236   3rd Qu.: 8.500   3rd Qu.:101527678       
##  Max.   :416491   Max.   :23563   Max.   :15.100   Max.   :566916960       
##                                                    NA's   :4               
##     region         
##  Length:55         
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

## Removing NAs

completely_joined <- completely_joined %>%
  filter(!is.na(proficiency)) 

completely_joined <- completely_joined %>%
  filter(!is.na(total_checkbook_spending))

## Looking at my spending data and making it easier to read 
summary(completely_joined$total_checkbook_spending)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##  17516974  32887299  53524468  84148704 101527678 566916960

completely_joined <- completely_joined %>%
  mutate(checkbook_spending_in_millions = total_checkbook_spending / 1e6)

summary(completely_joined$checkbook_spending_in_millions)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   17.52   32.89   53.52   84.15  101.53  566.92

Correlations

library(ggcorrplot)

## Does County Total Spending Impact Proficiency?

ggplot(completely_joined, aes(x = checkbook_spending_in_millions, y = proficiency, color = region)) +
  geom_point(alpha = 0.7, size = 3) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "County Spending vs. Proficiency by Region",
    x = "Checkbook Spending (in millions)",
    y = "Proficiency (%)",
    color = "Region"
  ) +
  theme_minimal() + 
  geom_smooth(method = "lm", se = FALSE, color = "darkred")

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## What About Unemploment?


ggplot(completely_joined, aes(x = unemployed, y = proficiency, color = region)) +
  geom_point(size = 3, alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "Unemployment Rate vs. Proficiency",
    x = "Unemployment Rate (%)",
    y = "Proficiency (%)",
    color = "Region"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

Linear Regression Model

model_data_scaled <- completely_joined %>%
  mutate(across(c(enroll, tfedrev, tstrev, tlocrev, totalexp, ppcstot, unemployed, total_checkbook_spending), scale))


model1 <- lm(proficiency ~ totalexp + total_checkbook_spending + unemployed, data = model_data_scaled)

summary(model1)

## 
## Call:
## lm(formula = proficiency ~ totalexp + total_checkbook_spending + 
##     unemployed, data = model_data_scaled)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6532 -4.6023  0.0937  4.4268 11.2970 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               25.8139     0.7440  34.696   <2e-16 ***
## totalexp                   6.3655     3.9140   1.626    0.111    
## total_checkbook_spending  -4.7746     3.8839  -1.229    0.225    
## unemployed                -0.8872     0.7901  -1.123    0.267    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.313 on 47 degrees of freedom
## Multiple R-squared:  0.1722, Adjusted R-squared:  0.1193 
## F-statistic: 3.259 on 3 and 47 DF,  p-value: 0.02964

model2 <- lm(proficiency ~ totalexp + tfedrev, data = model_data_scaled)
summary(model2)

## 
## Call:
## lm(formula = proficiency ~ totalexp + tfedrev, data = model_data_scaled)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.1284 -3.8691 -0.1779  3.7889  8.9413 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  25.8139     0.6815  37.881  < 2e-16 ***
## totalexp      8.9642     2.1635   4.143 0.000138 ***
## tfedrev      -7.4081     2.1635  -3.424 0.001272 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.867 on 48 degrees of freedom
## Multiple R-squared:  0.2907, Adjusted R-squared:  0.2612 
## F-statistic: 9.839 on 2 and 48 DF,  p-value: 0.0002625

model3 <- lm(proficiency ~ tstrev + tlocrev + totalexp + ppcstot, data = model_data_scaled)
summary(model3)

## 
## Call:
## lm(formula = proficiency ~ tstrev + tlocrev + totalexp + ppcstot, 
##     data = model_data_scaled)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.7828 -4.1792 -0.3228  3.4866  9.8023 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  25.8139     0.6899  37.420  < 2e-16 ***
## tstrev       12.5133     5.5557   2.252  0.02912 *  
## tlocrev      10.9589     3.4305   3.195  0.00253 ** 
## totalexp    -20.4858     7.9339  -2.582  0.01307 *  
## ppcstot       0.2588     0.9692   0.267  0.79063    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.927 on 46 degrees of freedom
## Multiple R-squared:  0.3034, Adjusted R-squared:  0.2429 
## F-statistic:  5.01 on 4 and 46 DF,  p-value: 0.00195

Decision Tree Model

train_index <- createDataPartition(model_data_scaled$proficiency, p = 0.8, list = FALSE)

train <- model_data_scaled[train_index, ]
test <- model_data_scaled[-train_index, ]


tree_model1 <- rpart(proficiency ~ totalexp + tfedrev + tstrev + tlocrev + total_checkbook_spending + unemployed, 
                     data = model_data_scaled, 
                     method = "anova")


rpart.plot(tree_model1, 
           type = 3, 
           extra = 101, 
           fallen.leaves = TRUE, 
           main = "Decision Tree for Proficiency Prediction", 
           box.palette = "RdYlGn", 
           shadow.col = "gray", 
           nn = TRUE)

pred_tree1 <- predict(tree_model1, newdata = test)


MAE_tree <- mean(abs(pred_tree1 - test$proficiency))
RMSE_tree <- sqrt(mean((pred_tree1 - test$proficiency)^2))
MAE_tree

## [1] 2.601399

RMSE_tree

## [1] 2.706981

## Model 2


tree_model2 <- rpart(proficiency ~ tstrev + tlocrev + totalexp + ppcstot, 
                     data = model_data_scaled, 
                     method = "anova")


rpart.plot(tree_model2, 
           type = 3, 
           extra = 101, 
           fallen.leaves = TRUE, 
           main = "Decision Tree for Proficiency Prediction", 
           box.palette = "RdYlGn", 
           shadow.col = "gray", 
           nn = TRUE)

pred_tree2 <- predict(tree_model2, newdata = test)


MAE_tree <- mean(abs(pred_tree2 - test$proficiency))
RMSE_tree <- sqrt(mean((pred_tree2 - test$proficiency)^2))
MAE_tree

## [1] 2.909479

RMSE_tree

## [1] 3.236618

K-means

library(cluster)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

kmeans_data <- completely_joined %>%
  select(proficiency, checkbook_spending_in_millions, unemployed, totalexp) %>%
  na.omit() %>%
  scale()

set.seed(123)
kmeans_result <- kmeans(kmeans_data, centers = 3, nstart = 25)

clustered_data <- completely_joined %>%
  filter(!is.na(proficiency), !is.na(checkbook_spending_in_millions), !is.na(unemployed), !is.na(totalexp)) %>%
  mutate(cluster = as.factor(kmeans_result$cluster))

fviz_cluster(kmeans_result, data = kmeans_data,
             ellipse.type = "euclid",
             geom = "point",
             palette = "jco",
             ggtheme = theme_minimal())

Conclusion

In conclusion, my findings suggest that there is a positive correlation between totalexp and educational proficiency in West Virginia counties.

WV County Education Outcomes Prediction

Grace Bowersox

Thesis

Load assessment data

Load spending data

Load demographic data

Joined data

Data Exploration

Correlations

Linear Regression Model

Decision Tree Model

K-means

Conclusion