library(tidyverse)
── Attaching core tidyverse packages ────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2 ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(caret)
Loading required package: lattice
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: ‘caret’
The following object is masked from ‘package:purrr’:
lift
library(rpart)
library(readxl)
library(usmap)
getwd()
[1] "/Users/elireeves/Documents/LAST HALF SEMESTER/ACCT 426/Project_2"
library(readxl)
t_assess_raw_school <- read_excel("~/Documents/LAST HALF SEMESTER/ACCT 426/Project_2/Historical_AssessmentResults_SY15-to-SY21.xlsx",
sheet = 'SY21 School & District',
range = 'b2:f7312',
skip = 1)
t_assess_raw_science <- read_excel("~/Documents/LAST HALF SEMESTER/ACCT 426/Project_2/Historical_AssessmentResults_SY15-to-SY21.xlsx",
sheet = 'SY21 School & District',
range = 'db3:db7312',
col_names = c('science_proficiency'),
na = '**')
t_assess_raw <- t_assess_raw_school %>%
bind_cols(t_assess_raw_science) %>%
janitor::clean_names()
# Remove subgroups
t_assess <- t_assess_raw %>%
filter(school == 999) %>%
filter(population_group == 'Total Population') %>%
filter(county != 'Statewide') %>%
mutate(proficiency = science_proficiency)
print(t_assess)
t_spending_raw <- read_excel("~/Documents/LAST HALF SEMESTER/ACCT 426/Project_2/elsec22t.xls",
sheet = 'elsec22t',
range = 'a1:gb14106') %>%
janitor::clean_names()
New names:
cooperates <- c('MOUNTAIN STATE EDUCATIONAL SERVICES COOPERATIVE',
'EASTERN PANHANDLE INSTRUCTIONAL COOPERATIVE',
'SOUTHERN EDUCATIONAL SERVICES COOPERATIVE')
t_spending <- t_spending_raw %>%
filter(state == 49) %>%
filter(!name %in% cooperates) %>%
select(name, enroll, tfedrev, tstrev, tlocrev, totalexp, ppcstot) %>%
mutate(county = str_to_title(str_split_i(name, ' ',1)),
county = ifelse(county == 'Mc', 'McDowell', county))
print(t_spending)
t_demographics_unemployed <- read_excel("~/Documents/LAST HALF SEMESTER/ACCT 426/Project_2/unemployed.xls",
skip = 4,
na = 'N/A') %>%
janitor::clean_names() %>%
filter(county != 'West Virginia',
county != 'United States',
!is.na(value_percent) ) %>%
select(county, value_percent) %>%
rename(unemployed = value_percent) %>%
mutate(county = str_remove(county, " County$"))
t_demographics <- t_demographics_unemployed
print(t_demographics)
t_wv_poverty_rate <- read.csv("~/Documents/LAST HALF SEMESTER/ACCT 426/Project_2/WV poverty rate.csv", skip = 4) %>%
janitor::clean_names()
t_wv_poverty_rate_clean <- t_wv_poverty_rate %>%
mutate(county = str_remove(county, " County$")) %>%
slice(3:n()) %>%
slice(1:(n() - 8)) %>%
rename(poverty_percent = value_percent) %>%
select(county, poverty_percent)
print(t_wv_poverty_rate_clean)
NA
t_population_young_people <- read.csv("~/Documents/LAST HALF SEMESTER/ACCT 426/Project_2/population of young people.csv", skip = 4) %>%
janitor::clean_names()
t_population_young_people_clean <- t_population_young_people %>%
mutate(county = str_remove(county, " County$")) %>%
slice(3:n()) %>%
slice(1:(n() - 8)) %>%
rename(population_18_to_39 = value_percent) %>%
select(county, population_18_to_39)
print(t_population_young_people_clean)
#added more data
t_bachelors_degree <- read.csv("~/Documents/LAST HALF SEMESTER/ACCT 426/Project_2/bachelors degree.csv", skip = 5) %>%
janitor::clean_names()
#cleaned the added data
t_bachelors_degree_clean <- t_bachelors_degree %>%
mutate(county = str_remove(county, " County$")) %>%
slice(3:n()) %>%
slice(1:(n() - 8)) %>%
rename(bachelors_degree_percent = value_percent) %>%
select(county, bachelors_degree_percent)
print(t_bachelors_degree_clean)
# Merge data
t <- t_assess %>%
full_join(t_spending, by = "county") %>%
full_join(t_demographics, by = "county") %>%
full_join(t_wv_poverty_rate_clean, by = "county") %>%
full_join(t_population_young_people_clean, by = "county") %>%
full_join(t_bachelors_degree_clean, by = "county") %>%
mutate(fips = fips(state = "WV", county = county)) %>%
select(-school, -school_name, -population_group, -subgroup, -name)
print(t)
NA
library(ggcorrplot)
t_corr <- t %>%
select(where(is.numeric))
corr_matrix <- cor(t_corr, use = "pairwise.complete.obs")
print(corr_matrix)
science_proficiency proficiency enroll tfedrev tstrev tlocrev totalexp ppcstot unemployed
science_proficiency 1.00000000 1.00000000 0.3381350 0.17721750 0.3059379 0.43846237 0.3381153 0.09355478 -0.3233399
proficiency 1.00000000 1.00000000 0.3381350 0.17721750 0.3059379 0.43846237 0.3381153 0.09355478 -0.3233399
enroll 0.33813495 0.33813495 1.0000000 0.91314436 0.9905798 0.90451626 0.9883729 -0.34035766 -0.2846968
tfedrev 0.17721750 0.17721750 0.9131444 1.00000000 0.9104461 0.84820726 0.9448424 -0.26212428 -0.1738598
tstrev 0.30593792 0.30593792 0.9905798 0.91044615 1.0000000 0.85436386 0.9755774 -0.37809676 -0.2494595
tlocrev 0.43846237 0.43846237 0.9045163 0.84820726 0.8543639 1.00000000 0.9370710 -0.02957104 -0.3869128
totalexp 0.33811529 0.33811529 0.9883729 0.94484238 0.9755774 0.93707098 1.0000000 -0.25492361 -0.2880562
ppcstot 0.09355478 0.09355478 -0.3403577 -0.26212428 -0.3780968 -0.02957104 -0.2549236 1.00000000 -0.1362845
unemployed -0.32333991 -0.32333991 -0.2846968 -0.17385978 -0.2494595 -0.38691283 -0.2880562 -0.13628452 1.0000000
poverty_percent -0.52145690 -0.52145690 -0.1964416 -0.04702916 -0.1652928 -0.27693811 -0.1824926 -0.04903490 0.5459838
population_18_to_39 0.45581831 0.45581831 0.4256712 0.29146072 0.3829829 0.46029222 0.4152188 -0.08362189 -0.1531934
bachelors_degree_percent 0.70060125 0.70060125 0.6011222 0.44201652 0.5690950 0.65483146 0.5997281 -0.16613996 -0.4666303
poverty_percent population_18_to_39 bachelors_degree_percent
science_proficiency -0.52145690 0.45581831 0.7006012
proficiency -0.52145690 0.45581831 0.7006012
enroll -0.19644163 0.42567118 0.6011222
tfedrev -0.04702916 0.29146072 0.4420165
tstrev -0.16529281 0.38298288 0.5690950
tlocrev -0.27693811 0.46029222 0.6548315
totalexp -0.18249265 0.41521881 0.5997281
ppcstot -0.04903490 -0.08362189 -0.1661400
unemployed 0.54598384 -0.15319340 -0.4666303
poverty_percent 1.00000000 -0.13673295 -0.5522491
population_18_to_39 -0.13673295 1.00000000 0.6497486
bachelors_degree_percent -0.55224914 0.64974860 1.0000000
ggcorrplot(corr_matrix,
hc.order = TRUE,
type = "lower",
lab = TRUE,
lab_col = "black",
lab_size = 3)
model <- lm(proficiency ~ population_18_to_39 + poverty_percent + bachelors_degree_percent, data = t)
summary(model)
Call:
lm(formula = proficiency ~ population_18_to_39 + poverty_percent +
bachelors_degree_percent, data = t)
Residuals:
Min 1Q Median 3Q Max
-7.6333 -2.9653 -0.2097 2.7694 8.5352
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 18.1409 4.1937 4.326 7.08e-05 ***
population_18_to_39 0.1286 0.2036 0.632 0.53043
poverty_percent -0.2944 0.1654 -1.781 0.08095 .
bachelors_degree_percent 0.4177 0.1293 3.230 0.00217 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.144 on 51 degrees of freedom
Multiple R-squared: 0.5206, Adjusted R-squared: 0.4924
F-statistic: 18.46 on 3 and 51 DF, p-value: 3.052e-08
library(usmap)
plot_usmap(data = t,
values = "proficiency",
include = 'West Virginia') +
scale_fill_continuous(name = "Proficiency",
low = 'red',
high = 'blue') +
theme(legend.position = "right") +
labs('Proficiency')
NA
NA
NA
#PCA
library(FactoMineR)
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
library(factoextra)
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
t_pca_input <- t %>%
select(poverty_percent, bachelors_degree_percent, population_18_to_39) %>%
drop_na() %>%
scale()
pca_result <- PCA(t_pca_input, graph = TRUE)
fviz_pca_biplot(pca_result,
repel = TRUE,
col.var = "blue",
col.ind = "gray",
title = "PCA Biplot of WV Counties")
NA
NA
NA
NA
library(rpart)
library(rpart.plot)
t_tree <- t %>%
select(proficiency, poverty_percent, bachelors_degree_percent, population_18_to_39) %>%
drop_na()
tree_model <- rpart(proficiency ~ poverty_percent + bachelors_degree_percent +
population_18_to_39,
data = t_tree,
method = "anova",
control = rpart.control(cp = 0.01))
rpart.plot(tree_model,
type = 2,
extra = 101,
fallen.leaves = TRUE,
main = "Decision Tree Predicting School Proficiency")