library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(rpart)
library(readxl)
library(dplyr)
assessment_path <- 'Historical_AssessmentResults_SY15-to-SY21.xlsx'
t_assess_raw_school <- read_excel(path = assessment_path,
sheet = 'SY21 School & District',
range = 'b2:f7312')
t_assess_raw_science <- read_excel(path = assessment_path,
sheet = 'SY21 School & District',
range = 'db3:db7312',
col_names = c('science_proficiency'),
na = '**')
t_assess_raw <- t_assess_raw_school %>%
bind_cols(t_assess_raw_science) %>%
janitor::clean_names()
# Remove subgroups
t_assess <- t_assess_raw %>%
filter(school == 999) %>%
filter(population_group == 'Total Population') %>%
filter(county != 'Statewide') %>%
mutate(proficiency = science_proficiency)
print(t_assess)
## # A tibble: 55 × 7
## county school school_name population_group subgroup science_proficiency
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Barbour 999 Barbour Count… Total Population Total 26.0
## 2 Berkeley 999 Berkeley Coun… Total Population Total 28.6
## 3 Boone 999 Boone County … Total Population Total 19.6
## 4 Braxton 999 Braxton Count… Total Population Total 22.6
## 5 Brooke 999 Brooke County… Total Population Total 21.1
## 6 Cabell 999 Cabell County… Total Population Total 30.8
## 7 Calhoun 999 Calhoun Count… Total Population Total 27.8
## 8 Clay 999 Clay County T… Total Population Total 23.3
## 9 Doddridge 999 Doddridge Cou… Total Population Total 31.3
## 10 Fayette 999 Fayette Count… Total Population Total 17.4
## # ℹ 45 more rows
## # ℹ 1 more variable: proficiency <dbl>
spending_path <- 'elsec22t.xls'
t_spending_raw <- read_excel(path = spending_path,,
sheet = 'elsec22t',
range = 'a1:gb14106') %>%
janitor::clean_names()
## New names:
## • `` -> `...73`
## • `` -> `...74`
## • `` -> `...75`
## • `` -> `...76`
## • `` -> `...77`
## • `` -> `...78`
## • `` -> `...79`
## • `` -> `...80`
## • `` -> `...81`
## • `` -> `...82`
## • `` -> `...83`
## • `` -> `...84`
## • `` -> `...85`
## • `` -> `...86`
## • `` -> `...87`
## • `` -> `...88`
## • `` -> `...89`
## • `` -> `...90`
## • `` -> `...91`
## • `` -> `...92`
## • `` -> `...93`
## • `` -> `...94`
## • `` -> `...95`
## • `` -> `...96`
## • `` -> `...97`
## • `` -> `...98`
## • `` -> `...99`
## • `` -> `...100`
## • `` -> `...101`
## • `` -> `...102`
## • `` -> `...103`
## • `` -> `...104`
## • `` -> `...105`
## • `` -> `...106`
## • `` -> `...107`
## • `` -> `...108`
## • `` -> `...109`
## • `` -> `...110`
## • `` -> `...111`
## • `` -> `...112`
## • `` -> `...113`
## • `` -> `...114`
## • `` -> `...115`
## • `` -> `...116`
## • `` -> `...117`
## • `` -> `...118`
## • `` -> `...119`
## • `` -> `...120`
## • `` -> `...121`
## • `` -> `...122`
## • `` -> `...123`
## • `` -> `...124`
## • `` -> `...125`
## • `` -> `...126`
## • `` -> `...127`
## • `` -> `...128`
## • `` -> `...129`
## • `` -> `...130`
## • `` -> `...131`
## • `` -> `...132`
## • `` -> `...133`
## • `` -> `...134`
## • `` -> `...135`
## • `` -> `...136`
## • `` -> `...137`
## • `` -> `...138`
## • `` -> `...139`
## • `` -> `...140`
## • `` -> `...141`
## • `` -> `...142`
## • `` -> `...143`
## • `` -> `...144`
## • `` -> `...145`
## • `` -> `...146`
## • `` -> `...147`
## • `` -> `...148`
## • `` -> `...149`
## • `` -> `...150`
## • `` -> `...151`
## • `` -> `...152`
## • `` -> `...153`
## • `` -> `...154`
## • `` -> `...155`
## • `` -> `...156`
## • `` -> `...157`
## • `` -> `...158`
## • `` -> `...159`
## • `` -> `...160`
## • `` -> `...161`
## • `` -> `...162`
## • `` -> `...163`
## • `` -> `...164`
## • `` -> `...165`
## • `` -> `...166`
## • `` -> `...167`
## • `` -> `...168`
## • `` -> `...169`
## • `` -> `...170`
## • `` -> `...171`
## • `` -> `...172`
## • `` -> `...173`
## • `` -> `...174`
## • `` -> `...175`
## • `` -> `...176`
## • `` -> `...177`
## • `` -> `...178`
## • `` -> `...179`
## • `` -> `...180`
## • `` -> `...181`
## • `` -> `...182`
## • `` -> `...183`
## • `` -> `...184`
cooperates <- c('MOUNTAIN STATE EDUCATIONAL SERVICES COOPERATIVE',
'EASTERN PANHANDLE INSTRUCTIONAL COOPERATIVE',
'SOUTHERN EDUCATIONAL SERVICES COOPERATIVE')
t_spending <- t_spending_raw %>%
filter(state == 49) %>%
filter(!name %in% cooperates) %>%
select(name, enroll, tfedrev, tstrev, tlocrev, totalexp, ppcstot) %>%
mutate(county = str_to_title(str_split_i(name, ' ',1)),
county = ifelse(county == 'Mc', 'McDowell', county))
print(t_spending)
## # A tibble: 55 × 8
## name enroll tfedrev tstrev tlocrev totalexp ppcstot county
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 BARBOUR CO SCH DIST 2144 7559 16584 5872 28021 11885 Barbour
## 2 BERKELEY CO SCH DIST 19722 48407 140127 86699 264253 12704 Berkeley
## 3 BOONE CO SCH DIST 3177 8194 26858 14564 48642 14663 Boone
## 4 BRAXTON CO SCH DIST 1747 5479 12748 6404 24417 13153 Braxton
## 5 BROOKE CO SCH DIST 2582 6791 17114 21352 41908 15642 Brooke
## 6 CABELL CO SCH DIST 11667 42518 88337 66699 183621 14538 Cabell
## 7 CALHOUN CO SCH DIST 861 3254 9953 3190 15154 16085 Calhoun
## 8 CLAY CO SCH DIST 1669 6157 17655 2791 25963 13825 Clay
## 9 DODDRIDGE CO SCH DIST 1082 3455 3999 31752 38493 23563 Doddrid…
## 10 FAYETTE CO SCH DIST 5594 15293 51759 23477 83373 13777 Fayette
## # ℹ 45 more rows
t_demographics_unemployed <- read_csv('unemployed.csv',
skip = 4,
na = 'N/A') %>%
janitor::clean_names() %>%
filter(county != 'West Virginia',
county != 'United States',
!is.na(value_percent) ) %>%
select(county, value_percent) %>%
rename(unemployed = value_percent) %>%
mutate(county = str_remove(county, "\\s*County$"))
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 62 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): County, FIPS, Rank within US (of 3143 counties)
## dbl (2): Value (Percent), People (Unemployed)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
t_poverty_rate <- read_excel('HDPulse_data_export.xls', skip = 3) %>%
janitor::clean_names() %>%
mutate(county = str_remove(county, "\\s*County$"))
t_income <- read_excel('Income sheet.xls', skip = 3) %>%
janitor::clean_names() %>%
mutate(county = str_remove(county, "\\s*County$"))
t_demographics <- t_demographics_unemployed %>%
full_join(t_poverty_rate, by = 'county') %>%
full_join(t_income, by = 'county') %>%
slice(1:(n() - 14)) %>%
select(county, unemployed, value_percent, value_dollars)
## Warning in full_join(., t_income, by = "county"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 58 of `x` matches multiple rows in `y`.
## ℹ Row 58 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
print(t_demographics)
## # A tibble: 55 × 4
## county unemployed value_percent value_dollars
## <chr> <dbl> <dbl> <dbl>
## 1 McDowell 15.1 29.2 39226
## 2 Braxton 14.4 13.2 48722
## 3 Logan 13.3 16.3 55258
## 4 Calhoun 12.2 20.8 47602
## 5 Roane 11.7 16.2 52124
## 6 Clay 11.2 18.1 57145
## 7 Mingo 11.2 23.6 51322
## 8 Webster 11.1 19.3 49881
## 9 Monroe 10.6 10.5 68030
## 10 Barbour 10.1 16.1 63491
## # ℹ 45 more rows
# Merge data
t2 <- t_assess %>%
full_join(t_spending, by = 'county')
t <- t2 %>%
full_join(t_demographics, by = 'county') %>%
mutate(totalrev = tfedrev + tstrev +tlocrev,
profit = totalrev - totalexp,
state = 'West Virginia')
library(ggcorrplot)
tcorr <- t %>%
select(where(is.numeric))
pairs(tcorr)
ggcorrplot(cor(tcorr),
lab = T)
library(rpart.plot)
test01 <- sample(x = 0:1,
size = 55,
replace = TRUE,
prob = c(0.6, 0.4))
table(test01)
## test01
## 0 1
## 31 24
t <- t %>%
mutate(is_test = test01)
t_train <- t %>%
filter(is_test == 0) %>%
select(-is_test)
t_test <- t %>%
filter(is_test == 1) %>%
select(-is_test)
m_train <- lm(science_proficiency ~ value_percent + tlocrev + tfedrev, data = t_train)
summary(m_train)
##
## Call:
## lm(formula = science_proficiency ~ value_percent + tlocrev +
## tfedrev, data = t_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.837 -2.296 -0.526 3.923 6.381
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.714e+01 3.509e+00 7.734 2.56e-08 ***
## value_percent -4.255e-01 2.642e-01 -1.610 0.118940
## tlocrev 2.537e-04 6.304e-05 4.024 0.000415 ***
## tfedrev -2.749e-04 9.244e-05 -2.974 0.006121 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.28 on 27 degrees of freedom
## Multiple R-squared: 0.5502, Adjusted R-squared: 0.5003
## F-statistic: 11.01 on 3 and 27 DF, p-value: 6.709e-05
rsme_train <- sqrt(mean(m_train$residuals^2))
t_test_predictions <- t_test %>%
mutate(predicted = predict(m_train, newdata = t_test),
residuals = science_proficiency - predicted,
squared_residuals = residuals^2)
hist(t_test_predictions$residuals)
pca_inputs <- t %>%
select(value_percent, tlocrev, tfedrev) %>%
mutate(across(c(value_percent, tlocrev, tfedrev), scale))
pca_results <- prcomp(pca_inputs, scale = TRUE, center = TRUE, rank = 3)
summary(pca_results)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.3810 0.9861 0.34736
## Proportion of Variance 0.6357 0.3241 0.04022
## Cumulative Proportion 0.6357 0.9598 1.00000
print(pca_results)
## Standard deviations (1, .., p=3):
## [1] 1.3809560 0.9860521 0.3473639
##
## Rotation (n x k) = (3 x 3):
## PC1 PC2 PC3
## value_percent -0.2486823 0.95016138 0.1880173
## tlocrev 0.7009861 0.04259592 0.7119017
## tfedrev 0.6684128 0.30883482 -0.6766428
m <- rpart(formula = science_proficiency ~ value_percent + tlocrev + tfedrev,
data = t,
minsplit = 8,
minbucket = 8,
method = 'anova')
rpart.plot(m)
library(usmap)
data("countypop", package = "usmap")
wv_fips <- countypop %>%
filter(abbr == "WV") %>%
select(county_name = county, fips) %>%
mutate(county = str_remove(county_name, "\\s*County$"))
t <- t %>%
mutate(county = str_to_title(county)) %>%
left_join(wv_fips, by = c("county" = "county"))
plot_usmap(data = t,
values = "proficiency",
include = 'West Virginia',
regions = "counties") +
scale_fill_continuous(name = "proficiency",
low = 'red',
high = 'blue') +
theme(legend.position = "right") +
labs(title = 'Proficiency')
This project pulls data from WV elementary and middle schools to predict the students’ science proficiency rating. The data I used showed each county’s funding for each school as well as the different demographic groups in each county. In the beginning of my code I cleaned all my data to make it easier to use. I then joined it all togther on a tibble named “t”. With this tibble I was able to design, train, and test my model. The goal of this project was that I wanted to prove that funding is the driving force for higher science proficiency.
I used the pairs and ggcorplot function to see if any of my data was correlated and useful for my model. After seeing the plots I decided to go with tlocrev, tfedrev, and value_percent. Value_percent is just what the poverty rate is in that county.
I then designed, trained, and tested my model. First I split the data into training and testing data. After I made a linear regression with the dependent variable of science proficiency and independent variables of tlocrev, tfedrev, and value_percent. All this data ended up being statistically relevant with low p-values. The model ended up having an Adjusted R-Squared of .5145. I then made a predictions tibble that predicted each county’s science proficiency. Overall my model wasn’t too great. Some counties it nearly predicted only being off by 1 or 2 proficiency points and others it was off by 10+.
I also did a Principal Component Analysis. After seeing the summary of the prcomp function I found the first two components explain 96% of the variation in the data. I then made a decision tree to see how the computer split up the data when making a prediction.
Dependent Variable - Science Proficiency: This is variable that shows the percentage of students who are doing academically good in science class.
Independent Variables - tlocrev, tfedrev, value_percent: tlocrev stands for total local revenue. This is the revenue the county is getting from their county. All towns in each county help contribute to that number. tfedrev is total federal revenue. This varibale is how much each county is receiving from the federal government for academics. Lastly, value_percent is poverty rate in the county.
Overall, I found that funding played a part in science proficiency. The model I created ended up being not amazing but not completely horrible. Like I mentioned before it was close in predicting some of the county’s proficiency but was very off about others. Funding is not the only factor leading to higher proficiency but it is a big one. Poverty rate is also a big contributing factor in science scores as well.