library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(rpart)
library(rpart.plot)
library(readxl)
library(dplyr)
library(neuralnet)
##
## Attaching package: 'neuralnet'
##
## The following object is masked from 'package:dplyr':
##
## compute
assessment_path <- './wv ed student achievement/Historical_AssessmentResults_SY15-to-SY21.xlsx'
t_assess_raw_school <- read_excel(path = assessment_path,
sheet = 'SY21 School & District',
range = 'b2:f7312')
t_assess_raw_science <- read_excel(path = assessment_path,
sheet = 'SY21 School & District',
range = 'db3:db7312',
col_names = c('science_proficiency'),
na = '**')
t_assess_raw <- t_assess_raw_school %>%
bind_cols(t_assess_raw_science) %>%
janitor::clean_names()
# Remove subgroups
t_assess <- t_assess_raw %>%
filter(school == 999) %>%
filter(population_group == 'Total Population') %>%
filter(county != 'Statewide') %>%
mutate(proficiency = science_proficiency) %>%
mutate(proficiency_group = ifelse(proficiency < 25, "Low",
ifelse(proficiency < 35, "Medium", "High")))
print(t_assess)
## # A tibble: 55 × 8
## county school school_name population_group subgroup science_proficiency
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Barbour 999 Barbour Count… Total Population Total 26.0
## 2 Berkeley 999 Berkeley Coun… Total Population Total 28.6
## 3 Boone 999 Boone County … Total Population Total 19.6
## 4 Braxton 999 Braxton Count… Total Population Total 22.6
## 5 Brooke 999 Brooke County… Total Population Total 21.1
## 6 Cabell 999 Cabell County… Total Population Total 30.8
## 7 Calhoun 999 Calhoun Count… Total Population Total 27.8
## 8 Clay 999 Clay County T… Total Population Total 23.3
## 9 Doddridge 999 Doddridge Cou… Total Population Total 31.3
## 10 Fayette 999 Fayette Count… Total Population Total 17.4
## # ℹ 45 more rows
## # ℹ 2 more variables: proficiency <dbl>, proficiency_group <chr>
spending_path <- './us census ed spending/elsec22t.xls'
t_spending_raw <- read_excel(path = spending_path,
sheet = 'elsec22t',
range = 'a1:gb14106') %>%
janitor::clean_names()
## New names:
## • `` -> `...73`
## • `` -> `...74`
## • `` -> `...75`
## • `` -> `...76`
## • `` -> `...77`
## • `` -> `...78`
## • `` -> `...79`
## • `` -> `...80`
## • `` -> `...81`
## • `` -> `...82`
## • `` -> `...83`
## • `` -> `...84`
## • `` -> `...85`
## • `` -> `...86`
## • `` -> `...87`
## • `` -> `...88`
## • `` -> `...89`
## • `` -> `...90`
## • `` -> `...91`
## • `` -> `...92`
## • `` -> `...93`
## • `` -> `...94`
## • `` -> `...95`
## • `` -> `...96`
## • `` -> `...97`
## • `` -> `...98`
## • `` -> `...99`
## • `` -> `...100`
## • `` -> `...101`
## • `` -> `...102`
## • `` -> `...103`
## • `` -> `...104`
## • `` -> `...105`
## • `` -> `...106`
## • `` -> `...107`
## • `` -> `...108`
## • `` -> `...109`
## • `` -> `...110`
## • `` -> `...111`
## • `` -> `...112`
## • `` -> `...113`
## • `` -> `...114`
## • `` -> `...115`
## • `` -> `...116`
## • `` -> `...117`
## • `` -> `...118`
## • `` -> `...119`
## • `` -> `...120`
## • `` -> `...121`
## • `` -> `...122`
## • `` -> `...123`
## • `` -> `...124`
## • `` -> `...125`
## • `` -> `...126`
## • `` -> `...127`
## • `` -> `...128`
## • `` -> `...129`
## • `` -> `...130`
## • `` -> `...131`
## • `` -> `...132`
## • `` -> `...133`
## • `` -> `...134`
## • `` -> `...135`
## • `` -> `...136`
## • `` -> `...137`
## • `` -> `...138`
## • `` -> `...139`
## • `` -> `...140`
## • `` -> `...141`
## • `` -> `...142`
## • `` -> `...143`
## • `` -> `...144`
## • `` -> `...145`
## • `` -> `...146`
## • `` -> `...147`
## • `` -> `...148`
## • `` -> `...149`
## • `` -> `...150`
## • `` -> `...151`
## • `` -> `...152`
## • `` -> `...153`
## • `` -> `...154`
## • `` -> `...155`
## • `` -> `...156`
## • `` -> `...157`
## • `` -> `...158`
## • `` -> `...159`
## • `` -> `...160`
## • `` -> `...161`
## • `` -> `...162`
## • `` -> `...163`
## • `` -> `...164`
## • `` -> `...165`
## • `` -> `...166`
## • `` -> `...167`
## • `` -> `...168`
## • `` -> `...169`
## • `` -> `...170`
## • `` -> `...171`
## • `` -> `...172`
## • `` -> `...173`
## • `` -> `...174`
## • `` -> `...175`
## • `` -> `...176`
## • `` -> `...177`
## • `` -> `...178`
## • `` -> `...179`
## • `` -> `...180`
## • `` -> `...181`
## • `` -> `...182`
## • `` -> `...183`
## • `` -> `...184`
cooperates <- c('MOUNTAIN STATE EDUCATIONAL SERVICES COOPERATIVE',
'EASTERN PANHANDLE INSTRUCTIONAL COOPERATIVE',
'SOUTHERN EDUCATIONAL SERVICES COOPERATIVE')
t_spending <- t_spending_raw %>%
filter(state == 49) %>%
filter(!name %in% cooperates) %>%
select(name, enroll, tfedrev, tstrev, tlocrev, totalexp, ppcstot) %>%
mutate(county = str_to_title(str_split_i(name, ' ',1)),
county = ifelse(county == 'Mc', 'McDowell', county))
print(t_spending)
## # A tibble: 55 × 8
## name enroll tfedrev tstrev tlocrev totalexp ppcstot county
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 BARBOUR CO SCH DIST 2144 7559 16584 5872 28021 11885 Barbour
## 2 BERKELEY CO SCH DIST 19722 48407 140127 86699 264253 12704 Berkeley
## 3 BOONE CO SCH DIST 3177 8194 26858 14564 48642 14663 Boone
## 4 BRAXTON CO SCH DIST 1747 5479 12748 6404 24417 13153 Braxton
## 5 BROOKE CO SCH DIST 2582 6791 17114 21352 41908 15642 Brooke
## 6 CABELL CO SCH DIST 11667 42518 88337 66699 183621 14538 Cabell
## 7 CALHOUN CO SCH DIST 861 3254 9953 3190 15154 16085 Calhoun
## 8 CLAY CO SCH DIST 1669 6157 17655 2791 25963 13825 Clay
## 9 DODDRIDGE CO SCH DIST 1082 3455 3999 31752 38493 23563 Doddrid…
## 10 FAYETTE CO SCH DIST 5594 15293 51759 23477 83373 13777 Fayette
## # ℹ 45 more rows
t_demographics_poverty <- read_excel("WV Poverty.xlsx") %>%
janitor::clean_names() %>%
mutate(county = str_remove(county, " County$")) %>%
mutate(poverty_rate = value_percent)
t_demographics_unemployment <- read_excel("WV Unemployment.xlsx") %>%
janitor::clean_names() %>%
mutate(county = str_remove(county, " County$")) %>%
mutate(unemployment_rate = value_percent)
t_demographics <- t_demographics_poverty %>%
inner_join(t_demographics_unemployment, by = 'county')
print(t_demographics)
## # A tibble: 55 × 11
## county fips.x value_percent.x families_below_poverty rank_within_us_of_31…¹
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 McDowell 54047 29.2 1173 3118
## 2 Mingo 54059 23.6 1456 3080
## 3 Calhoun 54013 20.8 304 3008
## 4 Webster 54101 19.3 357 2965
## 5 Clay 54015 18.1 364 2912
## 6 Wyoming 54109 18 1029 2908
## 7 Ritchie 54085 17.1 375 2852
## 8 Raleigh 54081 16.5 3117 2792
## 9 Logan 54045 16.3 1386 2775
## 10 Roane 54087 16.2 536 2767
## # ℹ 45 more rows
## # ℹ abbreviated name: ¹rank_within_us_of_3143_counties.x
## # ℹ 6 more variables: poverty_rate <dbl>, fips.y <dbl>, value_percent.y <dbl>,
## # people_unemployed <dbl>, rank_within_us_of_3143_counties.y <dbl>,
## # unemployment_rate <dbl>
# Merge data
t <- t_assess %>%
left_join(t_spending, by = 'county') %>%
left_join(t_demographics, by = 'county') %>%
mutate(proficiency = as.numeric(proficiency),
enroll = as.numeric(enroll),
tfedrev = as.numeric(tfedrev),
tstrev = as.numeric(tstrev),
tlocrev = as.numeric(tlocrev),
totalexp = as.numeric(totalexp),
ppcstot = as.numeric(ppcstot)) %>%
mutate(region = ifelse(
county %in% c(
"Mason", "Jackson", "Roane", "Calhoun", "Braxton",
"Webster", "Pocahontas", "Nicholas", "Clay", "Kanawha",
"Putnam", "Cabell", "Wayne", "Lincoln",
"Boone", "Fayette", "Greenbrier", "Monroe", "Summers",
"Raleigh", "Logan", "Mingo", "Wirt", "Wyoming", "Mcdowell",
"Mercer"
),
0,1
)) %>%
select(county,
region,
enroll,
tfedrev,
tstrev,
tlocrev,
totalexp,
ppcstot,
poverty_rate,
unemployment_rate,
proficiency,
proficiency_group)
print(t)
## # A tibble: 55 × 12
## county region enroll tfedrev tstrev tlocrev totalexp ppcstot poverty_rate
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Barbour 1 2144 7559 16584 5872 28021 11885 16.1
## 2 Berkeley 1 19722 48407 140127 86699 264253 12704 8.2
## 3 Boone 0 3177 8194 26858 14564 48642 14663 13.4
## 4 Braxton 0 1747 5479 12748 6404 24417 13153 13.2
## 5 Brooke 1 2582 6791 17114 21352 41908 15642 8.1
## 6 Cabell 0 11667 42518 88337 66699 183621 14538 13.1
## 7 Calhoun 0 861 3254 9953 3190 15154 16085 20.8
## 8 Clay 0 1669 6157 17655 2791 25963 13825 18.1
## 9 Doddridge 1 1082 3455 3999 31752 38493 23563 11.9
## 10 Fayette 0 5594 15293 51759 23477 83373 13777 13.8
## # ℹ 45 more rows
## # ℹ 3 more variables: unemployment_rate <dbl>, proficiency <dbl>,
## # proficiency_group <chr>
ggplot(data = t) +
geom_histogram(mapping = aes(x = proficiency),
binwidth = 5,
fill = 'blue',
color = 'black')
library(ggcorrplot)
t_numbers <- t %>%
select(where(is.numeric)) %>%
scale() %>%
as_tibble()
ggcorrplot(cor(t_numbers),
lab = TRUE,
title = "Correlation Matrix",
colors = c("blue", "white", "red"))
pairs(t_numbers)
model <- lm(proficiency ~ poverty_rate + unemployment_rate + tfedrev + region, data = t_numbers)
summary(model)
##
## Call:
## lm(formula = proficiency ~ poverty_rate + unemployment_rate +
## tfedrev + region, data = t_numbers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.38338 -0.65849 -0.03395 0.44107 2.39657
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.665e-16 1.172e-01 0.000 1.00000
## poverty_rate -4.753e-01 1.533e-01 -3.099 0.00318 **
## unemployment_rate -1.248e-02 1.438e-01 -0.087 0.93123
## tfedrev 1.658e-01 1.231e-01 1.347 0.18406
## region 7.586e-02 1.332e-01 0.570 0.57154
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8695 on 50 degrees of freedom
## Multiple R-squared: 0.2999, Adjusted R-squared: 0.2439
## F-statistic: 5.355 on 4 and 50 DF, p-value: 0.001144
hist(model$residuals)
rmse <- function(model, tibble, dependent_variable) {
results <- predict(model, tibble)
errors <- results - dependent_variable
return(sqrt(mean(errors^2, na.rm = TRUE)))
}
rmse(model, t_numbers, t_numbers$proficiency)
## [1] 0.8290797
t <- t %>%
mutate(
region = factor(region),
proficiency_group = factor(proficiency_group)
)
set.seed(34)
t <- t %>%
mutate(test01 = rbinom(nrow(.), 1, 0.5))
t_train <- filter(t, test01 == 0)
t_test <- filter(t, test01 == 1)
m <- rpart(
formula = proficiency_group ~ poverty_rate + unemployment_rate + region,
data = t_train,
method = "class",
minsplit = 12,
minbucket = 2
)
rpart.plot(m)
t_pca_results <- prcomp(t_numbers, center = TRUE, scale. = TRUE)
biplot(t_pca_results, scale = 0)
summary(t_pca_results)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.2612 1.4495 0.95387 0.86520 0.80409 0.57196 0.2846
## Proportion of Variance 0.5113 0.2101 0.09099 0.07486 0.06466 0.03271 0.0081
## Cumulative Proportion 0.5113 0.7214 0.81239 0.88725 0.95190 0.98462 0.9927
## PC8 PC9 PC10
## Standard deviation 0.26167 0.05317 0.03895
## Proportion of Variance 0.00685 0.00028 0.00015
## Cumulative Proportion 0.99957 0.99985 1.00000
print(t_pca_results)
## Standard deviations (1, .., p=10):
## [1] 2.26118726 1.44950100 0.95386617 0.86520385 0.80409292 0.57196181
## [7] 0.28463961 0.26167013 0.05317102 0.03894576
##
## Rotation (n x k) = (10 x 10):
## PC1 PC2 PC3 PC4 PC5
## region -0.01430721 0.47243308 -0.11322985 -0.67824393 -0.495149364
## enroll 0.43478476 -0.08822000 -0.01967394 -0.05991401 -0.037471678
## tfedrev 0.40297148 -0.19312698 0.13128844 -0.06505171 -0.112511977
## tstrev 0.42726876 -0.13198189 -0.04895699 -0.01995800 0.005670207
## tlocrev 0.41486046 0.06796116 0.25135749 -0.06484934 -0.077414993
## totalexp 0.43620580 -0.08227874 0.08507710 -0.05510063 -0.050280895
## ppcstot -0.12236146 0.29780707 0.88669487 0.07766136 -0.052591378
## poverty_rate -0.13552103 -0.54066459 0.31435894 -0.13311166 -0.072986249
## unemployment_rate -0.18178921 -0.39475035 0.10615295 -0.68829890 0.334265638
## proficiency 0.19246457 0.40753327 0.02054298 -0.16437319 0.782549853
## PC6 PC7 PC8 PC9
## region 0.210937521 -0.022698940 0.11279087 0.0172854680
## enroll 0.008414848 0.357771164 0.12824816 -0.7020751690
## tfedrev -0.149133238 -0.812875008 0.22394529 -0.1510566309
## tstrev -0.032740402 0.402562683 0.44038788 0.2102456294
## tlocrev 0.030622799 0.059620005 -0.81131512 -0.0337565348
## totalexp -0.024217942 0.073643655 0.01589868 0.6613696304
## ppcstot -0.180610884 0.114318820 0.23282838 -0.0336881875
## poverty_rate 0.752447150 0.005081785 0.03330843 0.0015415153
## unemployment_rate -0.447114668 0.089354683 -0.08044452 0.0005782796
## proficiency 0.363165389 -0.136401064 0.08039174 -0.0091347096
## PC10
## region -0.0237299757
## enroll 0.4004627522
## tfedrev -0.1010570636
## tstrev -0.6292229891
## tlocrev -0.2933069109
## totalexp 0.5886254257
## ppcstot 0.0167529171
## poverty_rate -0.0115860301
## unemployment_rate 0.0002652595
## proficiency 0.0029274240
#Thesis
This project explored whether the common belief that Southern West Virginia is significantly different from Northern West Virginia holds true in terms of education outcomes. Specifically, I examined whether student proficiency rates in science varied between Southern and Northern counties and how those rates were influenced by poverty and unemployment levels. Additionally, I investigated differences in per-student spending between the two regions.
To analyze these questions, I built a decision tree model using poverty rate, unemployment rate, and region as predictors to estimate science proficiency rates. I also performed Principal Component Analysis (PCA) to identify any underlying patterns in the data that could explain regional differences.
The analysis revealed that region alone did not significantly impact proficiency rates. Among all variables studied, poverty rate emerged as the most influential factor and was the only one with statistical significance.
#Variables
Dependent Variable (Effect):
Proficiency Rate: The percentage of students who are proficient in science in each county, showing how well students are doing academically.
proficiency_group: A categorical variable that divides counties into three groups based on proficiency rates: Low (0-25%), Medium (25-35%), and High (35%+). This helps to understand the distribution of proficiency levels across counties.
Independent Variables (Causes):
Poverty Rate (poverty_rate): The percentage of people living below the poverty line in each county. It’s expected that counties with higher poverty will have lower proficiency, as financial challenges can make learning harder.
Unemployment Rate(unemployment_rate): The percentage of people without jobs in each county. Higher unemployment is expected to be linked to lower proficiency, as it can affect families’ financial stability and students’ ability to focus on school.
Region (region): A variable that separates Northern and Southern counties. Northern counties are expected to have better education resources, leading to higher proficiency compared to Southern counties which were identified with zeroes and ones.
#RPubs link