Thesis

Load assessment data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(rpart)
library(readxl)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(rpart)
library(rpart.plot)

assessment_path <- './wv ed student achievement/Historical_AssessmentResults_SY15-to-SY21.xlsx'


t_assess_raw_school <- read_excel(path = assessment_path,
                           sheet = 'SY21 School & District',
                           range = 'b2:f7312')


t_assess_raw_science <- read_excel(path = assessment_path,
                           sheet = 'SY21 School & District',
                           range = 'db3:db7312', 
                           col_names = c('science_proficiency'),
                           na = '**')

t_assess_raw <- t_assess_raw_school %>%
  bind_cols(t_assess_raw_science) %>% 
  janitor::clean_names()  


# Remove subgroups
t_assess <- t_assess_raw %>% 
  filter(school == 999) %>% 
  filter(population_group == 'Total Population') %>% 
  filter(county != 'Statewide') %>% 
  mutate(proficiency = science_proficiency)  

print(t_assess)
## # A tibble: 55 × 7
##    county    school school_name    population_group subgroup science_proficiency
##    <chr>     <chr>  <chr>          <chr>            <chr>                  <dbl>
##  1 Barbour   999    Barbour Count… Total Population Total                   26.0
##  2 Berkeley  999    Berkeley Coun… Total Population Total                   28.6
##  3 Boone     999    Boone County … Total Population Total                   19.6
##  4 Braxton   999    Braxton Count… Total Population Total                   22.6
##  5 Brooke    999    Brooke County… Total Population Total                   21.1
##  6 Cabell    999    Cabell County… Total Population Total                   30.8
##  7 Calhoun   999    Calhoun Count… Total Population Total                   27.8
##  8 Clay      999    Clay County T… Total Population Total                   23.3
##  9 Doddridge 999    Doddridge Cou… Total Population Total                   31.3
## 10 Fayette   999    Fayette Count… Total Population Total                   17.4
## # ℹ 45 more rows
## # ℹ 1 more variable: proficiency <dbl>

Load spending data

spending_path <- './us census ed spending/elsec22t.xls'

t_spending_raw <- read_excel(path = spending_path,
                           sheet = 'elsec22t',
                           range = 'a1:gb14106') %>% 
  janitor::clean_names()
## New names:
## • `` -> `...73`
## • `` -> `...74`
## • `` -> `...75`
## • `` -> `...76`
## • `` -> `...77`
## • `` -> `...78`
## • `` -> `...79`
## • `` -> `...80`
## • `` -> `...81`
## • `` -> `...82`
## • `` -> `...83`
## • `` -> `...84`
## • `` -> `...85`
## • `` -> `...86`
## • `` -> `...87`
## • `` -> `...88`
## • `` -> `...89`
## • `` -> `...90`
## • `` -> `...91`
## • `` -> `...92`
## • `` -> `...93`
## • `` -> `...94`
## • `` -> `...95`
## • `` -> `...96`
## • `` -> `...97`
## • `` -> `...98`
## • `` -> `...99`
## • `` -> `...100`
## • `` -> `...101`
## • `` -> `...102`
## • `` -> `...103`
## • `` -> `...104`
## • `` -> `...105`
## • `` -> `...106`
## • `` -> `...107`
## • `` -> `...108`
## • `` -> `...109`
## • `` -> `...110`
## • `` -> `...111`
## • `` -> `...112`
## • `` -> `...113`
## • `` -> `...114`
## • `` -> `...115`
## • `` -> `...116`
## • `` -> `...117`
## • `` -> `...118`
## • `` -> `...119`
## • `` -> `...120`
## • `` -> `...121`
## • `` -> `...122`
## • `` -> `...123`
## • `` -> `...124`
## • `` -> `...125`
## • `` -> `...126`
## • `` -> `...127`
## • `` -> `...128`
## • `` -> `...129`
## • `` -> `...130`
## • `` -> `...131`
## • `` -> `...132`
## • `` -> `...133`
## • `` -> `...134`
## • `` -> `...135`
## • `` -> `...136`
## • `` -> `...137`
## • `` -> `...138`
## • `` -> `...139`
## • `` -> `...140`
## • `` -> `...141`
## • `` -> `...142`
## • `` -> `...143`
## • `` -> `...144`
## • `` -> `...145`
## • `` -> `...146`
## • `` -> `...147`
## • `` -> `...148`
## • `` -> `...149`
## • `` -> `...150`
## • `` -> `...151`
## • `` -> `...152`
## • `` -> `...153`
## • `` -> `...154`
## • `` -> `...155`
## • `` -> `...156`
## • `` -> `...157`
## • `` -> `...158`
## • `` -> `...159`
## • `` -> `...160`
## • `` -> `...161`
## • `` -> `...162`
## • `` -> `...163`
## • `` -> `...164`
## • `` -> `...165`
## • `` -> `...166`
## • `` -> `...167`
## • `` -> `...168`
## • `` -> `...169`
## • `` -> `...170`
## • `` -> `...171`
## • `` -> `...172`
## • `` -> `...173`
## • `` -> `...174`
## • `` -> `...175`
## • `` -> `...176`
## • `` -> `...177`
## • `` -> `...178`
## • `` -> `...179`
## • `` -> `...180`
## • `` -> `...181`
## • `` -> `...182`
## • `` -> `...183`
## • `` -> `...184`
cooperates <- c('MOUNTAIN STATE EDUCATIONAL SERVICES COOPERATIVE',
                'EASTERN PANHANDLE INSTRUCTIONAL COOPERATIVE',
                'SOUTHERN EDUCATIONAL SERVICES COOPERATIVE')

t_spending <- t_spending_raw %>% 
  filter(state == 49) %>% 
  filter(!name %in% cooperates) %>% 
  select(name, enroll, tfedrev, tstrev, tlocrev, totalexp, ppcstot) %>% 
  mutate(county = str_to_title(str_split_i(name, ' ',1)),
         county = ifelse(county == 'Mc', 'McDowell', county))


print(t_spending)
## # A tibble: 55 × 8
##    name                  enroll tfedrev tstrev tlocrev totalexp ppcstot county  
##    <chr>                  <dbl>   <dbl>  <dbl>   <dbl>    <dbl>   <dbl> <chr>   
##  1 BARBOUR CO SCH DIST     2144    7559  16584    5872    28021   11885 Barbour 
##  2 BERKELEY CO SCH DIST   19722   48407 140127   86699   264253   12704 Berkeley
##  3 BOONE CO SCH DIST       3177    8194  26858   14564    48642   14663 Boone   
##  4 BRAXTON CO SCH DIST     1747    5479  12748    6404    24417   13153 Braxton 
##  5 BROOKE CO SCH DIST      2582    6791  17114   21352    41908   15642 Brooke  
##  6 CABELL CO SCH DIST     11667   42518  88337   66699   183621   14538 Cabell  
##  7 CALHOUN CO SCH DIST      861    3254   9953    3190    15154   16085 Calhoun 
##  8 CLAY CO SCH DIST        1669    6157  17655    2791    25963   13825 Clay    
##  9 DODDRIDGE CO SCH DIST   1082    3455   3999   31752    38493   23563 Doddrid…
## 10 FAYETTE CO SCH DIST     5594   15293  51759   23477    83373   13777 Fayette 
## # ℹ 45 more rows

Load demographic data

t_demographics_unemployed <- read_csv('./demographics/unemployed.csv', 
                            skip = 4,
                            na = 'N/A') %>%
  janitor::clean_names() %>% 
  filter(county != 'West Virginia',
         county != 'United States',
         !is.na(value_percent) ) %>% 
  select(county, value_percent) %>%
  rename(unemployed = value_percent)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 62 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): County, FIPS, Rank within US (of 3143 counties)
## dbl (2): Value (Percent), People (Unemployed)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
t_demographics <-  t_demographics_unemployed

print(t_demographics)
## # A tibble: 55 × 2
##    county          unemployed
##    <chr>                <dbl>
##  1 McDowell County       15.1
##  2 Braxton County        14.4
##  3 Logan County          13.3
##  4 Calhoun County        12.2
##  5 Roane County          11.7
##  6 Clay County           11.2
##  7 Mingo County          11.2
##  8 Webster County        11.1
##  9 Monroe County         10.6
## 10 Barbour County        10.1
## # ℹ 45 more rows

Joined data

# Merge data
t_demographics <- t_demographics %>%
  mutate(county = str_replace(county, " County", ""))



t <- t_assess 
t <- t %>% 
  left_join(t_spending, by = 'county') %>% 
  left_join(t_demographics, by = 'county')%>% 
  mutate(proficiency = as.numeric(proficiency),
         proficiency = ifelse(proficiency < 0, NA, proficiency),
         proficiency = ifelse(proficiency > 100, NA, proficiency))
summary(t)
##     county             school          school_name        population_group  
##  Length:55          Length:55          Length:55          Length:55         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    subgroup         science_proficiency  proficiency        name          
##  Length:55          Min.   :14.81       Min.   :14.81   Length:55         
##  Class :character   1st Qu.:21.45       1st Qu.:21.45   Class :character  
##  Mode  :character   Median :24.36       Median :24.36   Mode  :character  
##                     Mean   :25.30       Mean   :25.30                     
##                     3rd Qu.:29.39       3rd Qu.:29.39                     
##                     Max.   :41.80       Max.   :41.80                     
##      enroll         tfedrev           tstrev          tlocrev      
##  Min.   :  800   Min.   :  1511   Min.   :  3895   Min.   :  1956  
##  1st Qu.: 1654   1st Qu.:  4991   1st Qu.: 12668   1st Qu.:  8194  
##  Median : 3177   Median : 10158   Median : 26858   Median : 14813  
##  Mean   : 4586   Mean   : 13312   Mean   : 34234   Mean   : 25032  
##  3rd Qu.: 5104   3rd Qu.: 14518   3rd Qu.: 39496   3rd Qu.: 33333  
##  Max.   :24392   Max.   :109522   Max.   :176062   Max.   :145623  
##     totalexp         ppcstot        unemployed    
##  Min.   : 13954   Min.   :11885   Min.   : 2.600  
##  1st Qu.: 26486   1st Qu.:13151   1st Qu.: 5.050  
##  Median : 48642   Median :13777   Median : 6.400  
##  Mean   : 69482   Mean   :14466   Mean   : 7.055  
##  3rd Qu.: 81172   3rd Qu.:15236   3rd Qu.: 8.500  
##  Max.   :416491   Max.   :23563   Max.   :15.100
## Combing all seperate data csv into one file

file_paths <- list.files('./checkbook_2023', full.names = TRUE, pattern = "\\.csv$")


get_county_from_path <- function(path) {
  path %>%
    basename() %>%
    str_remove("\\.csv$") %>%
    str_to_lower()
}

checkbook_all <- map_dfr(file_paths, function(path) {
  read_csv(path, show_col_types = FALSE) %>%
    janitor::clean_names() %>%
    mutate(county = get_county_from_path(path))
})
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
checkbook_summary <- checkbook_all %>%
  filter(!is.na(ck_amt)) %>%
  group_by(county) %>%
  summarise(total_checkbook_spending = sum(ck_amt, na.rm = TRUE)) %>%
  ungroup()



## Joining checkbook spending data to the other data sources here 

checkbook_summary <- checkbook_summary %>%
  mutate(county = str_to_title(str_extract(county, "(?<=-)[^-]+(?=-)")))
  
completely_joined <- t %>%
left_join(checkbook_summary, by = "county")

Data Exploration

## Determining Regions

completely_joined <- completely_joined %>%
  mutate(region = case_when(
    county %in% c("Berkeley", "Jefferson", "Morgan") ~ "Eastern Panhandle",
    county %in% c("Grant", "Hampshire", "Hardy", "Mineral", "Pendleton", "Tucker") ~ "Potomac Highlands",
    county %in% c("Barbour", "Doddridge", "Harrison", "Marion", "Monongalia", "Preston", "Taylor") ~ "North Central",
    county %in% c("Brooke", "Hancock", "Marshall", "Ohio", "Tyler", "Wetzel") ~ "Northern Panhandle",
    county %in% c("Braxton", "Calhoun", "Clay", "Gilmer", "Lewis", "Nicholas", "Roane", "Upshur", "Webster", "Wirt") ~ "Central WV",
    county %in% c("Boone", "Fayette", "Logan", "McDowell", "Mercer", "Mingo", "Raleigh", "Wyoming") ~ "Southern Coalfields",
    county %in% c("Cabell", "Lincoln", "Mason", "Putnam", "Wayne") ~ "Southwestern",
    county %in% c("Jackson", "Kanawha", "Pleasants", "Ritchie", "Wood") ~ "Western",
    county %in% c("Greenbrier", "Monroe", "Pocahontas", "Summers") ~ "Greenbrier Valley",
    TRUE ~ "Other"
  ))


glimpse(completely_joined)
## Rows: 55
## Columns: 17
## $ county                   <chr> "Barbour", "Berkeley", "Boone", "Braxton", "B…
## $ school                   <chr> "999", "999", "999", "999", "999", "999", "99…
## $ school_name              <chr> "Barbour County Total", "Berkeley County Tota…
## $ population_group         <chr> "Total Population", "Total Population", "Tota…
## $ subgroup                 <chr> "Total", "Total", "Total", "Total", "Total", …
## $ science_proficiency      <dbl> 25.97, 28.63, 19.58, 22.65, 21.14, 30.82, 27.…
## $ proficiency              <dbl> 25.97, 28.63, 19.58, 22.65, 21.14, 30.82, 27.…
## $ name                     <chr> "BARBOUR CO SCH DIST", "BERKELEY CO SCH DIST"…
## $ enroll                   <dbl> 2144, 19722, 3177, 1747, 2582, 11667, 861, 16…
## $ tfedrev                  <dbl> 7559, 48407, 8194, 5479, 6791, 42518, 3254, 6…
## $ tstrev                   <dbl> 16584, 140127, 26858, 12748, 17114, 88337, 99…
## $ tlocrev                  <dbl> 5872, 86699, 14564, 6404, 21352, 66699, 3190,…
## $ totalexp                 <dbl> 28021, 264253, 48642, 24417, 41908, 183621, 1…
## $ ppcstot                  <dbl> 11885, 12704, 14663, 13153, 15642, 14538, 160…
## $ unemployed               <dbl> 10.1, 4.6, 9.8, 14.4, 5.7, 6.1, 12.2, 11.2, 4…
## $ total_checkbook_spending <dbl> 25365788, 270106909, 60173833, 28709585, 3993…
## $ region                   <chr> "North Central", "Eastern Panhandle", "Southe…
summary(completely_joined)
##     county             school          school_name        population_group  
##  Length:55          Length:55          Length:55          Length:55         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    subgroup         science_proficiency  proficiency        name          
##  Length:55          Min.   :14.81       Min.   :14.81   Length:55         
##  Class :character   1st Qu.:21.45       1st Qu.:21.45   Class :character  
##  Mode  :character   Median :24.36       Median :24.36   Mode  :character  
##                     Mean   :25.30       Mean   :25.30                     
##                     3rd Qu.:29.39       3rd Qu.:29.39                     
##                     Max.   :41.80       Max.   :41.80                     
##                                                                           
##      enroll         tfedrev           tstrev          tlocrev      
##  Min.   :  800   Min.   :  1511   Min.   :  3895   Min.   :  1956  
##  1st Qu.: 1654   1st Qu.:  4991   1st Qu.: 12668   1st Qu.:  8194  
##  Median : 3177   Median : 10158   Median : 26858   Median : 14813  
##  Mean   : 4586   Mean   : 13312   Mean   : 34234   Mean   : 25032  
##  3rd Qu.: 5104   3rd Qu.: 14518   3rd Qu.: 39496   3rd Qu.: 33333  
##  Max.   :24392   Max.   :109522   Max.   :176062   Max.   :145623  
##                                                                    
##     totalexp         ppcstot        unemployed     total_checkbook_spending
##  Min.   : 13954   Min.   :11885   Min.   : 2.600   Min.   : 17516974       
##  1st Qu.: 26486   1st Qu.:13151   1st Qu.: 5.050   1st Qu.: 32887299       
##  Median : 48642   Median :13777   Median : 6.400   Median : 53524468       
##  Mean   : 69482   Mean   :14466   Mean   : 7.055   Mean   : 84148704       
##  3rd Qu.: 81172   3rd Qu.:15236   3rd Qu.: 8.500   3rd Qu.:101527678       
##  Max.   :416491   Max.   :23563   Max.   :15.100   Max.   :566916960       
##                                                    NA's   :4               
##     region         
##  Length:55         
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
## Removing NAs

completely_joined <- completely_joined %>%
  filter(!is.na(proficiency)) 

completely_joined <- completely_joined %>%
  filter(!is.na(total_checkbook_spending))

## Looking at my spending data and making it easier to read 
summary(completely_joined$total_checkbook_spending)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##  17516974  32887299  53524468  84148704 101527678 566916960
completely_joined <- completely_joined %>%
  mutate(checkbook_spending_in_millions = total_checkbook_spending / 1e6)

summary(completely_joined$checkbook_spending_in_millions)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   17.52   32.89   53.52   84.15  101.53  566.92

Correlations

library(ggcorrplot)

## Does County Total Spending Impact Proficiency?

ggplot(completely_joined, aes(x = checkbook_spending_in_millions, y = proficiency, color = region)) +
  geom_point(alpha = 0.7, size = 3) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "County Spending vs. Proficiency by Region",
    x = "Checkbook Spending (in millions)",
    y = "Proficiency (%)",
    color = "Region"
  ) +
  theme_minimal() + 
  geom_smooth(method = "lm", se = FALSE, color = "darkred")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## What About Unemploment?


ggplot(completely_joined, aes(x = unemployed, y = proficiency, color = region)) +
  geom_point(size = 3, alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "Unemployment Rate vs. Proficiency",
    x = "Unemployment Rate (%)",
    y = "Proficiency (%)",
    color = "Region"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Linear Regression Model

model_data_scaled <- completely_joined %>%
  mutate(across(c(enroll, tfedrev, tstrev, tlocrev, totalexp, ppcstot, unemployed, total_checkbook_spending), scale))


model1 <- lm(proficiency ~ totalexp + total_checkbook_spending + unemployed, data = model_data_scaled)

summary(model1)
## 
## Call:
## lm(formula = proficiency ~ totalexp + total_checkbook_spending + 
##     unemployed, data = model_data_scaled)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6532 -4.6023  0.0937  4.4268 11.2970 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               25.8139     0.7440  34.696   <2e-16 ***
## totalexp                   6.3655     3.9140   1.626    0.111    
## total_checkbook_spending  -4.7746     3.8839  -1.229    0.225    
## unemployed                -0.8872     0.7901  -1.123    0.267    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.313 on 47 degrees of freedom
## Multiple R-squared:  0.1722, Adjusted R-squared:  0.1193 
## F-statistic: 3.259 on 3 and 47 DF,  p-value: 0.02964
model2 <- lm(proficiency ~ totalexp + tfedrev, data = model_data_scaled)
summary(model2)
## 
## Call:
## lm(formula = proficiency ~ totalexp + tfedrev, data = model_data_scaled)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.1284 -3.8691 -0.1779  3.7889  8.9413 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  25.8139     0.6815  37.881  < 2e-16 ***
## totalexp      8.9642     2.1635   4.143 0.000138 ***
## tfedrev      -7.4081     2.1635  -3.424 0.001272 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.867 on 48 degrees of freedom
## Multiple R-squared:  0.2907, Adjusted R-squared:  0.2612 
## F-statistic: 9.839 on 2 and 48 DF,  p-value: 0.0002625
model3 <- lm(proficiency ~ tstrev + tlocrev + totalexp + ppcstot, data = model_data_scaled)
summary(model3)
## 
## Call:
## lm(formula = proficiency ~ tstrev + tlocrev + totalexp + ppcstot, 
##     data = model_data_scaled)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.7828 -4.1792 -0.3228  3.4866  9.8023 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  25.8139     0.6899  37.420  < 2e-16 ***
## tstrev       12.5133     5.5557   2.252  0.02912 *  
## tlocrev      10.9589     3.4305   3.195  0.00253 ** 
## totalexp    -20.4858     7.9339  -2.582  0.01307 *  
## ppcstot       0.2588     0.9692   0.267  0.79063    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.927 on 46 degrees of freedom
## Multiple R-squared:  0.3034, Adjusted R-squared:  0.2429 
## F-statistic:  5.01 on 4 and 46 DF,  p-value: 0.00195

Decision Tree Model

train_index <- createDataPartition(model_data_scaled$proficiency, p = 0.8, list = FALSE)

train <- model_data_scaled[train_index, ]
test <- model_data_scaled[-train_index, ]


tree_model1 <- rpart(proficiency ~ totalexp + tfedrev + tstrev + tlocrev + total_checkbook_spending + unemployed, 
                     data = model_data_scaled, 
                     method = "anova")


rpart.plot(tree_model1, 
           type = 3, 
           extra = 101, 
           fallen.leaves = TRUE, 
           main = "Decision Tree for Proficiency Prediction", 
           box.palette = "RdYlGn", 
           shadow.col = "gray", 
           nn = TRUE)

pred_tree1 <- predict(tree_model1, newdata = test)


MAE_tree <- mean(abs(pred_tree1 - test$proficiency))
RMSE_tree <- sqrt(mean((pred_tree1 - test$proficiency)^2))
MAE_tree
## [1] 2.601399
RMSE_tree
## [1] 2.706981
## Model 2


tree_model2 <- rpart(proficiency ~ tstrev + tlocrev + totalexp + ppcstot, 
                     data = model_data_scaled, 
                     method = "anova")


rpart.plot(tree_model2, 
           type = 3, 
           extra = 101, 
           fallen.leaves = TRUE, 
           main = "Decision Tree for Proficiency Prediction", 
           box.palette = "RdYlGn", 
           shadow.col = "gray", 
           nn = TRUE)

pred_tree2 <- predict(tree_model2, newdata = test)


MAE_tree <- mean(abs(pred_tree2 - test$proficiency))
RMSE_tree <- sqrt(mean((pred_tree2 - test$proficiency)^2))
MAE_tree
## [1] 2.909479
RMSE_tree
## [1] 3.236618

K-means

library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
kmeans_data <- completely_joined %>%
  select(proficiency, checkbook_spending_in_millions, unemployed, totalexp) %>%
  na.omit() %>%
  scale()

set.seed(123)
kmeans_result <- kmeans(kmeans_data, centers = 3, nstart = 25)

clustered_data <- completely_joined %>%
  filter(!is.na(proficiency), !is.na(checkbook_spending_in_millions), !is.na(unemployed), !is.na(totalexp)) %>%
  mutate(cluster = as.factor(kmeans_result$cluster))

fviz_cluster(kmeans_result, data = kmeans_data,
             ellipse.type = "euclid",
             geom = "point",
             palette = "jco",
             ggtheme = theme_minimal())

Conclusion