Thesis
- The greater a Regions totalexp, the higher educational proficiency
of students in that region.
Load assessment data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(rpart)
library(readxl)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(rpart)
library(rpart.plot)
assessment_path <- './wv ed student achievement/Historical_AssessmentResults_SY15-to-SY21.xlsx'
t_assess_raw_school <- read_excel(path = assessment_path,
sheet = 'SY21 School & District',
range = 'b2:f7312')
t_assess_raw_science <- read_excel(path = assessment_path,
sheet = 'SY21 School & District',
range = 'db3:db7312',
col_names = c('science_proficiency'),
na = '**')
t_assess_raw <- t_assess_raw_school %>%
bind_cols(t_assess_raw_science) %>%
janitor::clean_names()
# Remove subgroups
t_assess <- t_assess_raw %>%
filter(school == 999) %>%
filter(population_group == 'Total Population') %>%
filter(county != 'Statewide') %>%
mutate(proficiency = science_proficiency)
print(t_assess)
## # A tibble: 55 × 7
## county school school_name population_group subgroup science_proficiency
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Barbour 999 Barbour Count… Total Population Total 26.0
## 2 Berkeley 999 Berkeley Coun… Total Population Total 28.6
## 3 Boone 999 Boone County … Total Population Total 19.6
## 4 Braxton 999 Braxton Count… Total Population Total 22.6
## 5 Brooke 999 Brooke County… Total Population Total 21.1
## 6 Cabell 999 Cabell County… Total Population Total 30.8
## 7 Calhoun 999 Calhoun Count… Total Population Total 27.8
## 8 Clay 999 Clay County T… Total Population Total 23.3
## 9 Doddridge 999 Doddridge Cou… Total Population Total 31.3
## 10 Fayette 999 Fayette Count… Total Population Total 17.4
## # ℹ 45 more rows
## # ℹ 1 more variable: proficiency <dbl>
Load spending data
spending_path <- './us census ed spending/elsec22t.xls'
t_spending_raw <- read_excel(path = spending_path,
sheet = 'elsec22t',
range = 'a1:gb14106') %>%
janitor::clean_names()
## New names:
## • `` -> `...73`
## • `` -> `...74`
## • `` -> `...75`
## • `` -> `...76`
## • `` -> `...77`
## • `` -> `...78`
## • `` -> `...79`
## • `` -> `...80`
## • `` -> `...81`
## • `` -> `...82`
## • `` -> `...83`
## • `` -> `...84`
## • `` -> `...85`
## • `` -> `...86`
## • `` -> `...87`
## • `` -> `...88`
## • `` -> `...89`
## • `` -> `...90`
## • `` -> `...91`
## • `` -> `...92`
## • `` -> `...93`
## • `` -> `...94`
## • `` -> `...95`
## • `` -> `...96`
## • `` -> `...97`
## • `` -> `...98`
## • `` -> `...99`
## • `` -> `...100`
## • `` -> `...101`
## • `` -> `...102`
## • `` -> `...103`
## • `` -> `...104`
## • `` -> `...105`
## • `` -> `...106`
## • `` -> `...107`
## • `` -> `...108`
## • `` -> `...109`
## • `` -> `...110`
## • `` -> `...111`
## • `` -> `...112`
## • `` -> `...113`
## • `` -> `...114`
## • `` -> `...115`
## • `` -> `...116`
## • `` -> `...117`
## • `` -> `...118`
## • `` -> `...119`
## • `` -> `...120`
## • `` -> `...121`
## • `` -> `...122`
## • `` -> `...123`
## • `` -> `...124`
## • `` -> `...125`
## • `` -> `...126`
## • `` -> `...127`
## • `` -> `...128`
## • `` -> `...129`
## • `` -> `...130`
## • `` -> `...131`
## • `` -> `...132`
## • `` -> `...133`
## • `` -> `...134`
## • `` -> `...135`
## • `` -> `...136`
## • `` -> `...137`
## • `` -> `...138`
## • `` -> `...139`
## • `` -> `...140`
## • `` -> `...141`
## • `` -> `...142`
## • `` -> `...143`
## • `` -> `...144`
## • `` -> `...145`
## • `` -> `...146`
## • `` -> `...147`
## • `` -> `...148`
## • `` -> `...149`
## • `` -> `...150`
## • `` -> `...151`
## • `` -> `...152`
## • `` -> `...153`
## • `` -> `...154`
## • `` -> `...155`
## • `` -> `...156`
## • `` -> `...157`
## • `` -> `...158`
## • `` -> `...159`
## • `` -> `...160`
## • `` -> `...161`
## • `` -> `...162`
## • `` -> `...163`
## • `` -> `...164`
## • `` -> `...165`
## • `` -> `...166`
## • `` -> `...167`
## • `` -> `...168`
## • `` -> `...169`
## • `` -> `...170`
## • `` -> `...171`
## • `` -> `...172`
## • `` -> `...173`
## • `` -> `...174`
## • `` -> `...175`
## • `` -> `...176`
## • `` -> `...177`
## • `` -> `...178`
## • `` -> `...179`
## • `` -> `...180`
## • `` -> `...181`
## • `` -> `...182`
## • `` -> `...183`
## • `` -> `...184`
cooperates <- c('MOUNTAIN STATE EDUCATIONAL SERVICES COOPERATIVE',
'EASTERN PANHANDLE INSTRUCTIONAL COOPERATIVE',
'SOUTHERN EDUCATIONAL SERVICES COOPERATIVE')
t_spending <- t_spending_raw %>%
filter(state == 49) %>%
filter(!name %in% cooperates) %>%
select(name, enroll, tfedrev, tstrev, tlocrev, totalexp, ppcstot) %>%
mutate(county = str_to_title(str_split_i(name, ' ',1)),
county = ifelse(county == 'Mc', 'McDowell', county))
print(t_spending)
## # A tibble: 55 × 8
## name enroll tfedrev tstrev tlocrev totalexp ppcstot county
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 BARBOUR CO SCH DIST 2144 7559 16584 5872 28021 11885 Barbour
## 2 BERKELEY CO SCH DIST 19722 48407 140127 86699 264253 12704 Berkeley
## 3 BOONE CO SCH DIST 3177 8194 26858 14564 48642 14663 Boone
## 4 BRAXTON CO SCH DIST 1747 5479 12748 6404 24417 13153 Braxton
## 5 BROOKE CO SCH DIST 2582 6791 17114 21352 41908 15642 Brooke
## 6 CABELL CO SCH DIST 11667 42518 88337 66699 183621 14538 Cabell
## 7 CALHOUN CO SCH DIST 861 3254 9953 3190 15154 16085 Calhoun
## 8 CLAY CO SCH DIST 1669 6157 17655 2791 25963 13825 Clay
## 9 DODDRIDGE CO SCH DIST 1082 3455 3999 31752 38493 23563 Doddrid…
## 10 FAYETTE CO SCH DIST 5594 15293 51759 23477 83373 13777 Fayette
## # ℹ 45 more rows
Load demographic data
t_demographics_unemployed <- read_csv('./demographics/unemployed.csv',
skip = 4,
na = 'N/A') %>%
janitor::clean_names() %>%
filter(county != 'West Virginia',
county != 'United States',
!is.na(value_percent) ) %>%
select(county, value_percent) %>%
rename(unemployed = value_percent)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 62 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): County, FIPS, Rank within US (of 3143 counties)
## dbl (2): Value (Percent), People (Unemployed)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
t_demographics <- t_demographics_unemployed
print(t_demographics)
## # A tibble: 55 × 2
## county unemployed
## <chr> <dbl>
## 1 McDowell County 15.1
## 2 Braxton County 14.4
## 3 Logan County 13.3
## 4 Calhoun County 12.2
## 5 Roane County 11.7
## 6 Clay County 11.2
## 7 Mingo County 11.2
## 8 Webster County 11.1
## 9 Monroe County 10.6
## 10 Barbour County 10.1
## # ℹ 45 more rows
Joined data
# Merge data
t_demographics <- t_demographics %>%
mutate(county = str_replace(county, " County", ""))
t <- t_assess
t <- t %>%
left_join(t_spending, by = 'county') %>%
left_join(t_demographics, by = 'county')%>%
mutate(proficiency = as.numeric(proficiency),
proficiency = ifelse(proficiency < 0, NA, proficiency),
proficiency = ifelse(proficiency > 100, NA, proficiency))
summary(t)
## county school school_name population_group
## Length:55 Length:55 Length:55 Length:55
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## subgroup science_proficiency proficiency name
## Length:55 Min. :14.81 Min. :14.81 Length:55
## Class :character 1st Qu.:21.45 1st Qu.:21.45 Class :character
## Mode :character Median :24.36 Median :24.36 Mode :character
## Mean :25.30 Mean :25.30
## 3rd Qu.:29.39 3rd Qu.:29.39
## Max. :41.80 Max. :41.80
## enroll tfedrev tstrev tlocrev
## Min. : 800 Min. : 1511 Min. : 3895 Min. : 1956
## 1st Qu.: 1654 1st Qu.: 4991 1st Qu.: 12668 1st Qu.: 8194
## Median : 3177 Median : 10158 Median : 26858 Median : 14813
## Mean : 4586 Mean : 13312 Mean : 34234 Mean : 25032
## 3rd Qu.: 5104 3rd Qu.: 14518 3rd Qu.: 39496 3rd Qu.: 33333
## Max. :24392 Max. :109522 Max. :176062 Max. :145623
## totalexp ppcstot unemployed
## Min. : 13954 Min. :11885 Min. : 2.600
## 1st Qu.: 26486 1st Qu.:13151 1st Qu.: 5.050
## Median : 48642 Median :13777 Median : 6.400
## Mean : 69482 Mean :14466 Mean : 7.055
## 3rd Qu.: 81172 3rd Qu.:15236 3rd Qu.: 8.500
## Max. :416491 Max. :23563 Max. :15.100
## Combing all seperate data csv into one file
file_paths <- list.files('./checkbook_2023', full.names = TRUE, pattern = "\\.csv$")
get_county_from_path <- function(path) {
path %>%
basename() %>%
str_remove("\\.csv$") %>%
str_to_lower()
}
checkbook_all <- map_dfr(file_paths, function(path) {
read_csv(path, show_col_types = FALSE) %>%
janitor::clean_names() %>%
mutate(county = get_county_from_path(path))
})
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
checkbook_summary <- checkbook_all %>%
filter(!is.na(ck_amt)) %>%
group_by(county) %>%
summarise(total_checkbook_spending = sum(ck_amt, na.rm = TRUE)) %>%
ungroup()
## Joining checkbook spending data to the other data sources here
checkbook_summary <- checkbook_summary %>%
mutate(county = str_to_title(str_extract(county, "(?<=-)[^-]+(?=-)")))
completely_joined <- t %>%
left_join(checkbook_summary, by = "county")
Data Exploration
## Determining Regions
completely_joined <- completely_joined %>%
mutate(region = case_when(
county %in% c("Berkeley", "Jefferson", "Morgan") ~ "Eastern Panhandle",
county %in% c("Grant", "Hampshire", "Hardy", "Mineral", "Pendleton", "Tucker") ~ "Potomac Highlands",
county %in% c("Barbour", "Doddridge", "Harrison", "Marion", "Monongalia", "Preston", "Taylor") ~ "North Central",
county %in% c("Brooke", "Hancock", "Marshall", "Ohio", "Tyler", "Wetzel") ~ "Northern Panhandle",
county %in% c("Braxton", "Calhoun", "Clay", "Gilmer", "Lewis", "Nicholas", "Roane", "Upshur", "Webster", "Wirt") ~ "Central WV",
county %in% c("Boone", "Fayette", "Logan", "McDowell", "Mercer", "Mingo", "Raleigh", "Wyoming") ~ "Southern Coalfields",
county %in% c("Cabell", "Lincoln", "Mason", "Putnam", "Wayne") ~ "Southwestern",
county %in% c("Jackson", "Kanawha", "Pleasants", "Ritchie", "Wood") ~ "Western",
county %in% c("Greenbrier", "Monroe", "Pocahontas", "Summers") ~ "Greenbrier Valley",
TRUE ~ "Other"
))
glimpse(completely_joined)
## Rows: 55
## Columns: 17
## $ county <chr> "Barbour", "Berkeley", "Boone", "Braxton", "B…
## $ school <chr> "999", "999", "999", "999", "999", "999", "99…
## $ school_name <chr> "Barbour County Total", "Berkeley County Tota…
## $ population_group <chr> "Total Population", "Total Population", "Tota…
## $ subgroup <chr> "Total", "Total", "Total", "Total", "Total", …
## $ science_proficiency <dbl> 25.97, 28.63, 19.58, 22.65, 21.14, 30.82, 27.…
## $ proficiency <dbl> 25.97, 28.63, 19.58, 22.65, 21.14, 30.82, 27.…
## $ name <chr> "BARBOUR CO SCH DIST", "BERKELEY CO SCH DIST"…
## $ enroll <dbl> 2144, 19722, 3177, 1747, 2582, 11667, 861, 16…
## $ tfedrev <dbl> 7559, 48407, 8194, 5479, 6791, 42518, 3254, 6…
## $ tstrev <dbl> 16584, 140127, 26858, 12748, 17114, 88337, 99…
## $ tlocrev <dbl> 5872, 86699, 14564, 6404, 21352, 66699, 3190,…
## $ totalexp <dbl> 28021, 264253, 48642, 24417, 41908, 183621, 1…
## $ ppcstot <dbl> 11885, 12704, 14663, 13153, 15642, 14538, 160…
## $ unemployed <dbl> 10.1, 4.6, 9.8, 14.4, 5.7, 6.1, 12.2, 11.2, 4…
## $ total_checkbook_spending <dbl> 25365788, 270106909, 60173833, 28709585, 3993…
## $ region <chr> "North Central", "Eastern Panhandle", "Southe…
summary(completely_joined)
## county school school_name population_group
## Length:55 Length:55 Length:55 Length:55
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## subgroup science_proficiency proficiency name
## Length:55 Min. :14.81 Min. :14.81 Length:55
## Class :character 1st Qu.:21.45 1st Qu.:21.45 Class :character
## Mode :character Median :24.36 Median :24.36 Mode :character
## Mean :25.30 Mean :25.30
## 3rd Qu.:29.39 3rd Qu.:29.39
## Max. :41.80 Max. :41.80
##
## enroll tfedrev tstrev tlocrev
## Min. : 800 Min. : 1511 Min. : 3895 Min. : 1956
## 1st Qu.: 1654 1st Qu.: 4991 1st Qu.: 12668 1st Qu.: 8194
## Median : 3177 Median : 10158 Median : 26858 Median : 14813
## Mean : 4586 Mean : 13312 Mean : 34234 Mean : 25032
## 3rd Qu.: 5104 3rd Qu.: 14518 3rd Qu.: 39496 3rd Qu.: 33333
## Max. :24392 Max. :109522 Max. :176062 Max. :145623
##
## totalexp ppcstot unemployed total_checkbook_spending
## Min. : 13954 Min. :11885 Min. : 2.600 Min. : 17516974
## 1st Qu.: 26486 1st Qu.:13151 1st Qu.: 5.050 1st Qu.: 32887299
## Median : 48642 Median :13777 Median : 6.400 Median : 53524468
## Mean : 69482 Mean :14466 Mean : 7.055 Mean : 84148704
## 3rd Qu.: 81172 3rd Qu.:15236 3rd Qu.: 8.500 3rd Qu.:101527678
## Max. :416491 Max. :23563 Max. :15.100 Max. :566916960
## NA's :4
## region
## Length:55
## Class :character
## Mode :character
##
##
##
##
## Removing NAs
completely_joined <- completely_joined %>%
filter(!is.na(proficiency))
completely_joined <- completely_joined %>%
filter(!is.na(total_checkbook_spending))
## Looking at my spending data and making it easier to read
summary(completely_joined$total_checkbook_spending)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17516974 32887299 53524468 84148704 101527678 566916960
completely_joined <- completely_joined %>%
mutate(checkbook_spending_in_millions = total_checkbook_spending / 1e6)
summary(completely_joined$checkbook_spending_in_millions)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.52 32.89 53.52 84.15 101.53 566.92
Correlations
library(ggcorrplot)
## Does County Total Spending Impact Proficiency?
ggplot(completely_joined, aes(x = checkbook_spending_in_millions, y = proficiency, color = region)) +
geom_point(alpha = 0.7, size = 3) +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "County Spending vs. Proficiency by Region",
x = "Checkbook Spending (in millions)",
y = "Proficiency (%)",
color = "Region"
) +
theme_minimal() +
geom_smooth(method = "lm", se = FALSE, color = "darkred")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## What About Unemploment?
ggplot(completely_joined, aes(x = unemployed, y = proficiency, color = region)) +
geom_point(size = 3, alpha = 0.8) +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "Unemployment Rate vs. Proficiency",
x = "Unemployment Rate (%)",
y = "Proficiency (%)",
color = "Region"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Linear Regression Model
model_data_scaled <- completely_joined %>%
mutate(across(c(enroll, tfedrev, tstrev, tlocrev, totalexp, ppcstot, unemployed, total_checkbook_spending), scale))
model1 <- lm(proficiency ~ totalexp + total_checkbook_spending + unemployed, data = model_data_scaled)
summary(model1)
##
## Call:
## lm(formula = proficiency ~ totalexp + total_checkbook_spending +
## unemployed, data = model_data_scaled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.6532 -4.6023 0.0937 4.4268 11.2970
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.8139 0.7440 34.696 <2e-16 ***
## totalexp 6.3655 3.9140 1.626 0.111
## total_checkbook_spending -4.7746 3.8839 -1.229 0.225
## unemployed -0.8872 0.7901 -1.123 0.267
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.313 on 47 degrees of freedom
## Multiple R-squared: 0.1722, Adjusted R-squared: 0.1193
## F-statistic: 3.259 on 3 and 47 DF, p-value: 0.02964
model2 <- lm(proficiency ~ totalexp + tfedrev, data = model_data_scaled)
summary(model2)
##
## Call:
## lm(formula = proficiency ~ totalexp + tfedrev, data = model_data_scaled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.1284 -3.8691 -0.1779 3.7889 8.9413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.8139 0.6815 37.881 < 2e-16 ***
## totalexp 8.9642 2.1635 4.143 0.000138 ***
## tfedrev -7.4081 2.1635 -3.424 0.001272 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.867 on 48 degrees of freedom
## Multiple R-squared: 0.2907, Adjusted R-squared: 0.2612
## F-statistic: 9.839 on 2 and 48 DF, p-value: 0.0002625
model3 <- lm(proficiency ~ tstrev + tlocrev + totalexp + ppcstot, data = model_data_scaled)
summary(model3)
##
## Call:
## lm(formula = proficiency ~ tstrev + tlocrev + totalexp + ppcstot,
## data = model_data_scaled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.7828 -4.1792 -0.3228 3.4866 9.8023
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.8139 0.6899 37.420 < 2e-16 ***
## tstrev 12.5133 5.5557 2.252 0.02912 *
## tlocrev 10.9589 3.4305 3.195 0.00253 **
## totalexp -20.4858 7.9339 -2.582 0.01307 *
## ppcstot 0.2588 0.9692 0.267 0.79063
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.927 on 46 degrees of freedom
## Multiple R-squared: 0.3034, Adjusted R-squared: 0.2429
## F-statistic: 5.01 on 4 and 46 DF, p-value: 0.00195
Decision Tree Model
train_index <- createDataPartition(model_data_scaled$proficiency, p = 0.8, list = FALSE)
train <- model_data_scaled[train_index, ]
test <- model_data_scaled[-train_index, ]
tree_model1 <- rpart(proficiency ~ totalexp + tfedrev + tstrev + tlocrev + total_checkbook_spending + unemployed,
data = model_data_scaled,
method = "anova")
rpart.plot(tree_model1,
type = 3,
extra = 101,
fallen.leaves = TRUE,
main = "Decision Tree for Proficiency Prediction",
box.palette = "RdYlGn",
shadow.col = "gray",
nn = TRUE)

pred_tree1 <- predict(tree_model1, newdata = test)
MAE_tree <- mean(abs(pred_tree1 - test$proficiency))
RMSE_tree <- sqrt(mean((pred_tree1 - test$proficiency)^2))
MAE_tree
## [1] 2.601399
RMSE_tree
## [1] 2.706981
## Model 2
tree_model2 <- rpart(proficiency ~ tstrev + tlocrev + totalexp + ppcstot,
data = model_data_scaled,
method = "anova")
rpart.plot(tree_model2,
type = 3,
extra = 101,
fallen.leaves = TRUE,
main = "Decision Tree for Proficiency Prediction",
box.palette = "RdYlGn",
shadow.col = "gray",
nn = TRUE)

pred_tree2 <- predict(tree_model2, newdata = test)
MAE_tree <- mean(abs(pred_tree2 - test$proficiency))
RMSE_tree <- sqrt(mean((pred_tree2 - test$proficiency)^2))
MAE_tree
## [1] 2.909479
RMSE_tree
## [1] 3.236618
K-means
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
kmeans_data <- completely_joined %>%
select(proficiency, checkbook_spending_in_millions, unemployed, totalexp) %>%
na.omit() %>%
scale()
set.seed(123)
kmeans_result <- kmeans(kmeans_data, centers = 3, nstart = 25)
clustered_data <- completely_joined %>%
filter(!is.na(proficiency), !is.na(checkbook_spending_in_millions), !is.na(unemployed), !is.na(totalexp)) %>%
mutate(cluster = as.factor(kmeans_result$cluster))
fviz_cluster(kmeans_result, data = kmeans_data,
ellipse.type = "euclid",
geom = "point",
palette = "jco",
ggtheme = theme_minimal())


Conclusion
- In conclusion, my findings suggest that there is a positive
correlation between totalexp and educational proficiency in West
Virginia counties.