##Introduction This analysis explores the relationships between
educational outcomes, funding, demographics, and other socioeconomic
factors across West Virginia counties. We aim to identify key drivers of
educational performance and develop predictive models to help understand
what factors most influence student achievement
##Key Research Questions
Load assessment data
**need to fix error still
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(rpart)
library(readxl)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(rpart.plot)
assessment_path <- './wv ed student achievement/Historical_AssessmentResults_SY15-to-SY21.xlsx'
t_assess_raw_school <- read_excel(path = assessment_path,
sheet = 'SY21 School & District',
range = 'b2:f7312')
t_assess_raw_science <- read_excel(path = assessment_path,
sheet = 'SY21 School & District',
range = 'db3:db7312',
col_names = c('science_proficiency'),
na = '**')
t_assess_raw <- t_assess_raw_school %>%
bind_cols(t_assess_raw_science) %>%
janitor::clean_names()
# Remove subgroups
t_assess <- t_assess_raw %>%
filter(school == 999) %>%
filter(population_group == 'Total Population') %>%
filter(county != 'Statewide') %>%
mutate(proficiency = science_proficiency)
print(t_assess)
## # A tibble: 55 × 7
## county school school_name population_group subgroup science_proficiency
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Barbour 999 Barbour Count… Total Population Total 26.0
## 2 Berkeley 999 Berkeley Coun… Total Population Total 28.6
## 3 Boone 999 Boone County … Total Population Total 19.6
## 4 Braxton 999 Braxton Count… Total Population Total 22.6
## 5 Brooke 999 Brooke County… Total Population Total 21.1
## 6 Cabell 999 Cabell County… Total Population Total 30.8
## 7 Calhoun 999 Calhoun Count… Total Population Total 27.8
## 8 Clay 999 Clay County T… Total Population Total 23.3
## 9 Doddridge 999 Doddridge Cou… Total Population Total 31.3
## 10 Fayette 999 Fayette Count… Total Population Total 17.4
## # ℹ 45 more rows
## # ℹ 1 more variable: proficiency <dbl>
assessment_path_2022 <- './wv ed student achievement/SY22_AssessmentProficiency_Public_All_Final.xlsx'
tryCatch({
t_assess_2022_raw <- read_excel(path = assessment_path_2022) %>%
janitor::clean_names()
})
## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
## • `` -> `...16`
## • `` -> `...18`
## • `` -> `...19`
## • `` -> `...20`
## • `` -> `...21`
## • `` -> `...23`
## • `` -> `...24`
## • `` -> `...25`
## • `` -> `...26`
## • `` -> `...28`
## • `` -> `...29`
## • `` -> `...30`
## • `` -> `...31`
## • `` -> `...33`
## • `` -> `...34`
## • `` -> `...35`
## • `` -> `...36`
## • `` -> `...38`
## • `` -> `...39`
## • `` -> `...40`
## • `` -> `...41`
## • `` -> `...43`
## • `` -> `...44`
## • `` -> `...45`
## • `` -> `...46`
## • `` -> `...48`
## • `` -> `...49`
## • `` -> `...50`
## • `` -> `...51`
## • `` -> `...53`
## • `` -> `...54`
## • `` -> `...55`
## • `` -> `...56`
## • `` -> `...58`
## • `` -> `...59`
## • `` -> `...60`
## • `` -> `...61`
## • `` -> `...63`
## • `` -> `...64`
## • `` -> `...65`
## • `` -> `...66`
## • `` -> `...68`
## • `` -> `...69`
## • `` -> `...70`
## • `` -> `...71`
## • `` -> `...73`
## • `` -> `...74`
## • `` -> `...75`
## • `` -> `...76`
## • `` -> `...78`
## • `` -> `...79`
## • `` -> `...80`
## • `` -> `...81`
## • `` -> `...83`
## • `` -> `...84`
## • `` -> `...85`
## • `` -> `...86`
## • `` -> `...88`
## • `` -> `...89`
## • `` -> `...90`
## • `` -> `...91`
## • `` -> `...93`
## • `` -> `...94`
## • `` -> `...95`
## • `` -> `...96`
## • `` -> `...98`
## • `` -> `...99`
## • `` -> `...100`
## • `` -> `...101`
## • `` -> `...103`
## • `` -> `...104`
## • `` -> `...105`
## • `` -> `...106`
## t_assess_2022 <- t_assess_2022_raw %>%
## filter(grepl("County", district)) %>%
## mutate(county = gsub(" County Schools", "", district)) %>%
## select(county, contains("proficiency")) %>%
## arrange(county)
## print(t_assess_2022)
Load spending data
spending_path <- './us census ed spending/elsec22t.xls'
t_spending_raw <- read_excel(path = spending_path,,
sheet = 'elsec22t',
range = 'a1:gb14106') %>%
janitor::clean_names()
## New names:
## • `` -> `...73`
## • `` -> `...74`
## • `` -> `...75`
## • `` -> `...76`
## • `` -> `...77`
## • `` -> `...78`
## • `` -> `...79`
## • `` -> `...80`
## • `` -> `...81`
## • `` -> `...82`
## • `` -> `...83`
## • `` -> `...84`
## • `` -> `...85`
## • `` -> `...86`
## • `` -> `...87`
## • `` -> `...88`
## • `` -> `...89`
## • `` -> `...90`
## • `` -> `...91`
## • `` -> `...92`
## • `` -> `...93`
## • `` -> `...94`
## • `` -> `...95`
## • `` -> `...96`
## • `` -> `...97`
## • `` -> `...98`
## • `` -> `...99`
## • `` -> `...100`
## • `` -> `...101`
## • `` -> `...102`
## • `` -> `...103`
## • `` -> `...104`
## • `` -> `...105`
## • `` -> `...106`
## • `` -> `...107`
## • `` -> `...108`
## • `` -> `...109`
## • `` -> `...110`
## • `` -> `...111`
## • `` -> `...112`
## • `` -> `...113`
## • `` -> `...114`
## • `` -> `...115`
## • `` -> `...116`
## • `` -> `...117`
## • `` -> `...118`
## • `` -> `...119`
## • `` -> `...120`
## • `` -> `...121`
## • `` -> `...122`
## • `` -> `...123`
## • `` -> `...124`
## • `` -> `...125`
## • `` -> `...126`
## • `` -> `...127`
## • `` -> `...128`
## • `` -> `...129`
## • `` -> `...130`
## • `` -> `...131`
## • `` -> `...132`
## • `` -> `...133`
## • `` -> `...134`
## • `` -> `...135`
## • `` -> `...136`
## • `` -> `...137`
## • `` -> `...138`
## • `` -> `...139`
## • `` -> `...140`
## • `` -> `...141`
## • `` -> `...142`
## • `` -> `...143`
## • `` -> `...144`
## • `` -> `...145`
## • `` -> `...146`
## • `` -> `...147`
## • `` -> `...148`
## • `` -> `...149`
## • `` -> `...150`
## • `` -> `...151`
## • `` -> `...152`
## • `` -> `...153`
## • `` -> `...154`
## • `` -> `...155`
## • `` -> `...156`
## • `` -> `...157`
## • `` -> `...158`
## • `` -> `...159`
## • `` -> `...160`
## • `` -> `...161`
## • `` -> `...162`
## • `` -> `...163`
## • `` -> `...164`
## • `` -> `...165`
## • `` -> `...166`
## • `` -> `...167`
## • `` -> `...168`
## • `` -> `...169`
## • `` -> `...170`
## • `` -> `...171`
## • `` -> `...172`
## • `` -> `...173`
## • `` -> `...174`
## • `` -> `...175`
## • `` -> `...176`
## • `` -> `...177`
## • `` -> `...178`
## • `` -> `...179`
## • `` -> `...180`
## • `` -> `...181`
## • `` -> `...182`
## • `` -> `...183`
## • `` -> `...184`
cooperates <- c('MOUNTAIN STATE EDUCATIONAL SERVICES COOPERATIVE',
'EASTERN PANHANDLE INSTRUCTIONAL COOPERATIVE',
'SOUTHERN EDUCATIONAL SERVICES COOPERATIVE')
t_spending <- t_spending_raw %>%
filter(state == 49) %>%
filter(!name %in% cooperates) %>%
select(name, enroll, tfedrev, tstrev, tlocrev, totalexp, ppcstot) %>%
mutate(county = str_to_title(str_split_i(name, ' ',1)),
county = ifelse(county == 'Mc', 'McDowell', county))
print(t_spending)
## # A tibble: 55 × 8
## name enroll tfedrev tstrev tlocrev totalexp ppcstot county
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 BARBOUR CO SCH DIST 2144 7559 16584 5872 28021 11885 Barbour
## 2 BERKELEY CO SCH DIST 19722 48407 140127 86699 264253 12704 Berkeley
## 3 BOONE CO SCH DIST 3177 8194 26858 14564 48642 14663 Boone
## 4 BRAXTON CO SCH DIST 1747 5479 12748 6404 24417 13153 Braxton
## 5 BROOKE CO SCH DIST 2582 6791 17114 21352 41908 15642 Brooke
## 6 CABELL CO SCH DIST 11667 42518 88337 66699 183621 14538 Cabell
## 7 CALHOUN CO SCH DIST 861 3254 9953 3190 15154 16085 Calhoun
## 8 CLAY CO SCH DIST 1669 6157 17655 2791 25963 13825 Clay
## 9 DODDRIDGE CO SCH DIST 1082 3455 3999 31752 38493 23563 Doddrid…
## 10 FAYETTE CO SCH DIST 5594 15293 51759 23477 83373 13777 Fayette
## # ℹ 45 more rows
Load demographic data
Synthetic Data as of now, going to pull and use real data sources
for Educational attainment data and Income and poverty data
t_demographics_unemployed <- read_csv('unemployed.csv',
skip = 4,
na = 'N/A') %>%
janitor::clean_names() %>%
filter(county != 'West Virginia',
county != 'United States',
!is.na(value_percent) ) %>%
select(county, value_percent) %>%
rename(unemployed = value_percent)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 62 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): County, FIPS, Rank within US (of 3143 counties)
## dbl (2): Value (Percent), People (Unemployed)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
print(t_demographics_unemployed)
## # A tibble: 55 × 2
## county unemployed
## <chr> <dbl>
## 1 McDowell County 15.1
## 2 Braxton County 14.4
## 3 Logan County 13.3
## 4 Calhoun County 12.2
## 5 Roane County 11.7
## 6 Clay County 11.2
## 7 Mingo County 11.2
## 8 Webster County 11.1
## 9 Monroe County 10.6
## 10 Barbour County 10.1
## # ℹ 45 more rows
wv_counties <- unique(t_assess$county)
t_demographics_education <- tibble(
county = wv_counties,
bachelor_or_higher = runif(length(wv_counties), 10, 40),
high_school_grad = runif(length(wv_counties), 70, 95)
)
t_demographics_income <- tibble(
county = wv_counties,
median_household_income = runif(length(wv_counties), 35000, 65000),
poverty_rate = runif(length(wv_counties), 10, 30)
)
t_demographics <- t_demographics_unemployed %>%
left_join(t_demographics_education, by = "county") %>%
left_join(t_demographics_income, by = "county")
print(t_demographics)
## # A tibble: 55 × 6
## county unemployed bachelor_or_higher high_school_grad median_household_inc…¹
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 McDowe… 15.1 NA NA NA
## 2 Braxto… 14.4 NA NA NA
## 3 Logan … 13.3 NA NA NA
## 4 Calhou… 12.2 NA NA NA
## 5 Roane … 11.7 NA NA NA
## 6 Clay C… 11.2 NA NA NA
## 7 Mingo … 11.2 NA NA NA
## 8 Webste… 11.1 NA NA NA
## 9 Monroe… 10.6 NA NA NA
## 10 Barbou… 10.1 NA NA NA
## # ℹ 45 more rows
## # ℹ abbreviated name: ¹​median_household_income
## # ℹ 1 more variable: poverty_rate <dbl>
Joined data
Checkbook data
checkbook_files <- list.files(path = "./checkbook_2023/",
pattern = "*.csv",
full.names = TRUE)
process_checkbook <- function(file_path) {
county_name <- str_extract(basename(file_path), "(?<=23)[A-Za-z]+(?=\\d+)")
tryCatch({
# Read the file
df <- read_csv(file_path, show_col_types = FALSE) %>%
janitor::clean_names()
if("county" %in% colnames(df)) {
if(is.character(df$county)) {
df$county <- as.numeric(df$county)
}
}
df <- df %>% mutate(county_name = county_name)
return(df)
})
}
sample_counties <- c("./checkbook_2023/23-Barbour-02_Updated.csv",
"./checkbook_2023/23-Grant-24_Updated.csv",
"./checkbook_2023/23-Jackson-35_Updated.csv")
checkbook_data_samples <- map_dfr(sample_counties, process_checkbook)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
education_spending <- checkbook_data_samples %>%
mutate(ck_amt = as.numeric(ck_amt)) %>%
filter(prog %in% c(1, 2, 3, 4) |
grepl("EDUCATION|SCHOOL|TEACH|INSTRUCT|CURRICULUM", toupper(invdesc))) %>%
group_by(county_name) %>%
summarize(
total_education_spending = sum(ck_amt, na.rm = TRUE),
instructional_spending = sum(ck_amt[grepl("INSTRUCT|TEACH|CLASSROOM", toupper(invdesc))], na.rm = TRUE),
technology_spending = sum(ck_amt[grepl("COMPUTER|TECHNOLOGY|SOFTWARE", toupper(invdesc))], na.rm = TRUE),
facility_spending = sum(ck_amt[grepl("FACILITY|BUILDING|MAINTENANCE", toupper(invdesc))], na.rm = TRUE),
.groups = "drop"
)
print(education_spending)
## # A tibble: 1 × 5
## county_name total_education_spend…¹ instructional_spending technology_spending
## <chr> <dbl> <dbl> <dbl>
## 1 <NA> 2271485. 81529. 6442.
## # ℹ abbreviated name: ¹​total_education_spending
## # ℹ 1 more variable: facility_spending <dbl>
standardize_county_name <- function(county) {
county <- str_to_title(county)
county <- str_trim(county)
county <- str_replace(county, "Mcdowell", "McDowell")
county <- str_replace(county, " County", "")
return(county)
}
if (exists("t_assess")) {
t_assess <- t_assess %>%
mutate(county = standardize_county_name(county))
}
if (exists("t_assess_2022")) {
t_assess_2022 <- t_assess_2022 %>%
mutate(county = standardize_county_name(county))
}
if (exists("t_spending")) {
t_spending <- t_spending %>%
mutate(county = standardize_county_name(county))
}
if (exists("t_demographics")) {
t_demographics <- t_demographics %>%
mutate(county = standardize_county_name(county))
}
if (exists("education_spending")) {
education_spending <- education_spending %>%
mutate(county = standardize_county_name(county_name)) %>%
select(-county_name)
}
integrated_data <- t_assess %>%
left_join(t_spending, by = "county") %>%
left_join(t_demographics, by = "county")
if (exists("education_spending")) {
integrated_data <- integrated_data %>%
left_join(education_spending, by = "county")
}
if (exists("t_assess_2022") && ncol(t_assess_2022) > 1) {
integrated_data <- integrated_data %>%
left_join(t_assess_2022, by = "county", suffix = c("_2021", "_2022"))
}
print(integrated_data)
## # A tibble: 55 × 23
## county school school_name population_group subgroup science_proficiency
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Barbour 999 Barbour Count… Total Population Total 26.0
## 2 Berkeley 999 Berkeley Coun… Total Population Total 28.6
## 3 Boone 999 Boone County … Total Population Total 19.6
## 4 Braxton 999 Braxton Count… Total Population Total 22.6
## 5 Brooke 999 Brooke County… Total Population Total 21.1
## 6 Cabell 999 Cabell County… Total Population Total 30.8
## 7 Calhoun 999 Calhoun Count… Total Population Total 27.8
## 8 Clay 999 Clay County T… Total Population Total 23.3
## 9 Doddridge 999 Doddridge Cou… Total Population Total 31.3
## 10 Fayette 999 Fayette Count… Total Population Total 17.4
## # ℹ 45 more rows
## # ℹ 17 more variables: proficiency <dbl>, name <chr>, enroll <dbl>,
## # tfedrev <dbl>, tstrev <dbl>, tlocrev <dbl>, totalexp <dbl>, ppcstot <dbl>,
## # unemployed <dbl>, bachelor_or_higher <dbl>, high_school_grad <dbl>,
## # median_household_income <dbl>, poverty_rate <dbl>,
## # total_education_spending <dbl>, instructional_spending <dbl>,
## # technology_spending <dbl>, facility_spending <dbl>
# Merge data
t <- t_assess
Summary Stats
summary_stats <- integrated_data %>%
select(where(is.numeric)) %>%
pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
group_by(variable) %>%
summarize(
n = sum(!is.na(value)),
min = min(value, na.rm = TRUE),
q1 = quantile(value, 0.25, na.rm = TRUE),
median = median(value, na.rm = TRUE),
mean = mean(value, na.rm = TRUE),
q3 = quantile(value, 0.75, na.rm = TRUE),
max = max(value, na.rm = TRUE),
sd = sd(value, na.rm = TRUE)
)
## Warning: There were 16 warnings in `summarize()`.
## The first warning was:
## ℹ In argument: `min = min(value, na.rm = TRUE)`.
## ℹ In group 1: `variable = "bachelor_or_higher"`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 15 remaining warnings.
kable(summary_stats, caption = "Summary Statistics of Numeric Variables") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
Summary Statistics of Numeric Variables
|
variable
|
n
|
min
|
q1
|
median
|
mean
|
q3
|
max
|
sd
|
|
bachelor_or_higher
|
0
|
Inf
|
NA
|
NA
|
NaN
|
NA
|
-Inf
|
NA
|
|
enroll
|
55
|
800.00
|
1654.00
|
3177.00
|
4585.581818
|
5104.000
|
24392.0
|
4615.970845
|
|
facility_spending
|
0
|
Inf
|
NA
|
NA
|
NaN
|
NA
|
-Inf
|
NA
|
|
high_school_grad
|
0
|
Inf
|
NA
|
NA
|
NaN
|
NA
|
-Inf
|
NA
|
|
instructional_spending
|
0
|
Inf
|
NA
|
NA
|
NaN
|
NA
|
-Inf
|
NA
|
|
median_household_income
|
0
|
Inf
|
NA
|
NA
|
NaN
|
NA
|
-Inf
|
NA
|
|
poverty_rate
|
0
|
Inf
|
NA
|
NA
|
NaN
|
NA
|
-Inf
|
NA
|
|
ppcstot
|
55
|
11885.00
|
13151.00
|
13777.00
|
14466.054545
|
15235.500
|
23563.0
|
2188.579900
|
|
proficiency
|
55
|
14.81
|
21.45
|
24.36
|
25.304727
|
29.395
|
41.8
|
5.817084
|
|
science_proficiency
|
55
|
14.81
|
21.45
|
24.36
|
25.304727
|
29.395
|
41.8
|
5.817084
|
|
technology_spending
|
0
|
Inf
|
NA
|
NA
|
NaN
|
NA
|
-Inf
|
NA
|
|
tfedrev
|
55
|
1511.00
|
4991.00
|
10158.00
|
13311.909091
|
14517.500
|
109522.0
|
16225.080652
|
|
tlocrev
|
55
|
1956.00
|
8194.50
|
14813.00
|
25031.909091
|
33333.000
|
145623.0
|
26739.034924
|
|
total_education_spending
|
0
|
Inf
|
NA
|
NA
|
NaN
|
NA
|
-Inf
|
NA
|
|
totalexp
|
55
|
13954.00
|
26486.00
|
48642.00
|
69482.090909
|
81171.500
|
416491.0
|
70702.975648
|
|
tstrev
|
55
|
3895.00
|
12668.50
|
26858.00
|
34234.363636
|
39496.000
|
176062.0
|
33479.121896
|
|
unemployed
|
55
|
2.60
|
5.05
|
6.40
|
7.054545
|
8.500
|
15.1
|
2.918194
|
ggplot(integrated_data, aes(x = proficiency)) +
geom_histogram(binwidth = 5, fill = "steelblue", color = "white") +
labs(title = "Distribution of Science Proficiency Across WV Counties",
x = "Science Proficiency (%)",
y = "Count") +
theme_minimal()

top_counties <- integrated_data %>%
arrange(desc(proficiency)) %>%
select(county, proficiency) %>%
head(5)
bottom_counties <- integrated_data %>%
arrange(proficiency) %>%
select(county, proficiency) %>%
head(5)
kable(top_counties, caption = "Top 5 Counties by Science Proficiency") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
Top 5 Counties by Science Proficiency
|
county
|
proficiency
|
|
Monongalia
|
41.80
|
|
Jefferson
|
38.71
|
|
Ohio
|
35.75
|
|
Wood
|
35.28
|
|
Putnam
|
34.38
|
kable(bottom_counties, caption = "Bottom 5 Counties by Science Proficiency") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
Bottom 5 Counties by Science Proficiency
|
county
|
proficiency
|
|
McDowell
|
14.81
|
|
Lewis
|
17.03
|
|
Wyoming
|
17.36
|
|
Fayette
|
17.42
|
|
Upshur
|
18.48
|
library(ggcorrplot)
Regional Analysis
eastern_panhandle <- c("Berkeley", "Jefferson", "Morgan", "Hampshire", "Mineral", "Hardy", "Grant", "Pendleton")
northern_panhandle <- c("Hancock", "Brooke", "Ohio", "Marshall", "Wetzel", "Tyler")
metro_valley <- c("Kanawha", "Putnam", "Cabell", "Wayne", "Lincoln", "Boone")
southern_counties <- c("Mingo", "Logan", "Wyoming", "McDowell", "Mercer", "Raleigh")
central_counties <- c("Braxton", "Clay", "Nicholas", "Webster", "Pocahontas", "Greenbrier", "Monroe", "Summers")
north_central <- c("Monongalia", "Marion", "Harrison", "Taylor", "Preston", "Barbour", "Tucker", "Randolph", "Upshur", "Lewis")
ohio_valley <- c("Wood", "Pleasants", "Tyler", "Doddridge", "Ritchie", "Wirt", "Jackson", "Mason", "Roane", "Calhoun", "Gilmer")
integrated_data <- integrated_data %>%
mutate(region = case_when(
county %in% eastern_panhandle ~ "Eastern Panhandle",
county %in% northern_panhandle ~ "Northern Panhandle",
county %in% metro_valley ~ "Metro Valley",
county %in% southern_counties ~ "Southern Counties",
county %in% central_counties ~ "Central Counties",
county %in% north_central ~ "North Central",
county %in% ohio_valley ~ "Ohio Valley",
TRUE ~ "Other"
))
ggplot(integrated_data, aes(x = region, y = proficiency, fill = region)) +
geom_boxplot() +
labs(title = "Science Proficiency by Region",
x = "Region",
y = "Science Proficiency (%)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

region_stats <- integrated_data %>%
group_by(region) %>%
summarize(
mean_proficiency = mean(proficiency, na.rm = TRUE),
median_proficiency = median(proficiency, na.rm = TRUE),
sd_proficiency = sd(proficiency, na.rm = TRUE),
count = n(),
.groups = "drop"
) %>%
arrange(desc(mean_proficiency))
kable(region_stats, caption = "Science Proficiency by Region") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
Science Proficiency by Region
|
region
|
mean_proficiency
|
median_proficiency
|
sd_proficiency
|
count
|
|
Eastern Panhandle
|
27.79125
|
26.730
|
5.050399
|
8
|
|
North Central
|
26.60900
|
26.075
|
7.262833
|
10
|
|
Ohio Valley
|
26.42000
|
25.745
|
5.128446
|
10
|
|
Northern Panhandle
|
25.60333
|
24.650
|
5.429636
|
6
|
|
Metro Valley
|
25.35500
|
24.400
|
6.426949
|
6
|
|
Central Counties
|
24.62750
|
22.965
|
5.052512
|
8
|
|
Southern Counties
|
19.82500
|
18.790
|
3.979727
|
6
|
|
Other
|
17.42000
|
17.420
|
NA
|
1
|
Correlation Analysis
correlation_vars <- c("proficiency", "ppcstot", "unemployed", "bachelor_or_higher",
"high_school_grad", "median_household_income", "poverty_rate")
cat("Available data counts:\n")
## Available data counts:
for(var in correlation_vars) {
if(var %in% names(integrated_data)) {
cat(var, ": ", sum(!is.na(integrated_data[[var]])), " non-NA values\n", sep="")
} else {
cat(var, ": not in dataset\n", sep="")
}
}
## proficiency: 55 non-NA values
## ppcstot: 55 non-NA values
## unemployed: 55 non-NA values
## bachelor_or_higher: 0 non-NA values
## high_school_grad: 0 non-NA values
## median_household_income: 0 non-NA values
## poverty_rate: 0 non-NA values
available_vars <- correlation_vars[correlation_vars %in% names(integrated_data)]
correlation_data <- integrated_data %>%
select(all_of(intersect(available_vars, names(integrated_data)))) %>%
select_if(~ !all(is.na(.))) %>%
na.omit()
if(nrow(correlation_data) >= 3 && ncol(correlation_data) >= 2) {
no_var_cols <- correlation_data %>%
summarise_all(~ var(., na.rm = TRUE) == 0) %>%
pivot_longer(everything(), names_to = "variable", values_to = "no_variance") %>%
filter(no_variance) %>%
pull(variable)
if(length(no_var_cols) > 0) {
cat("Removing variables with no variance:", paste(no_var_cols, collapse=", "), "\n")
correlation_data <- correlation_data %>% select(-all_of(no_var_cols))
}
correlation_matrix <- cor(correlation_data, use = "pairwise.complete.obs")
if(any(is.na(correlation_matrix)) || any(is.infinite(as.matrix(correlation_matrix)))) {
cat("Warning: Correlation matrix contains NA or Inf values.\n")
corrplot::corrplot(correlation_matrix,
method = "color",
type = "lower",
addCoef.col = "black",
tl.col = "black",
title = "Correlation Between Educational Outcomes and Various Factors")
} else {
if(ncol(correlation_data) < 3) {
ggcorrplot(
correlation_matrix,
hc.order = FALSE, # Disable hierarchical clustering with few variables
type = "lower",
lab = TRUE,
lab_size = 3,
method = "circle",
colors = c("#6D9EC1", "white", "#E46726"),
title = "Correlation Between Educational Outcomes and Various Factors",
ggtheme = theme_minimal
)
} else {
tryCatch({
ggcorrplot(
correlation_matrix,
hc.order = TRUE,
type = "lower",
lab = TRUE,
lab_size = 3,
method = "circle",
colors = c("#6D9EC1", "white", "#E46726"),
title = "Correlation Between Educational Outcomes and Various Factors",
ggtheme = theme_minimal
)
}, error = function(e) {
cat("Falling back to plot without hierarchical clustering due to error:", e$message, "\n")
ggcorrplot(
correlation_matrix,
hc.order = FALSE,
type = "lower",
lab = TRUE,
lab_size = 3,
method = "circle",
colors = c("#6D9EC1", "white", "#E46726"),
title = "Correlation Between Educational Outcomes and Various Factors",
ggtheme = theme_minimal
)
})
}
}
if("proficiency" %in% colnames(correlation_matrix)) {
proficiency_correlations <- correlation_matrix[, "proficiency"]
proficiency_correlations <- data.frame(
variable = names(proficiency_correlations),
correlation = proficiency_correlations
) %>%
arrange(desc(abs(correlation)))
kable(proficiency_correlations, caption = "Correlations with Science Proficiency") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
} else {
cat("Proficiency variable not available for correlation analysis")
}
} else {
cat("Not enough complete data for correlation analysis.\n")
cat("Number of rows in correlation data: ", nrow(correlation_data), "\n")
cat("Number of columns in correlation data: ", ncol(correlation_data), "\n")
cat("Please ensure you have more complete data or adjust your variable selection.")
}
Correlations with Science Proficiency
|
|
variable
|
correlation
|
|
proficiency
|
proficiency
|
1.0000000
|
|
unemployed
|
unemployed
|
-0.3233399
|
|
ppcstot
|
ppcstot
|
0.0935548
|
Spending Analysis
ggplot(integrated_data, aes(x = ppcstot, y = proficiency)) +
geom_point(aes(color = region), size = 3, alpha = 0.7) +
geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
labs(title = "Relationship Between Per-Pupil Spending and Science Proficiency",
x = "Per-Pupil Spending ($)",
y = "Science Proficiency (%)") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

spending_by_region <- integrated_data %>%
group_by(region) %>%
summarize(
mean_spending = mean(ppcstot, na.rm = TRUE),
mean_proficiency = mean(proficiency, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(mean_spending))
kable(spending_by_region, caption = "Education Spending and Proficiency by Region") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
Education Spending and Proficiency by Region
|
region
|
mean_spending
|
mean_proficiency
|
|
Northern Panhandle
|
17159.67
|
25.60333
|
|
Ohio Valley
|
15695.70
|
26.42000
|
|
Central Counties
|
14275.75
|
24.62750
|
|
Metro Valley
|
13918.17
|
25.35500
|
|
Other
|
13777.00
|
17.42000
|
|
Eastern Panhandle
|
13744.00
|
27.79125
|
|
Southern Counties
|
13516.17
|
19.82500
|
|
North Central
|
13317.70
|
26.60900
|
Bar Chart and Scatter Plot of proficiency by county
ggplot(integrated_data, aes(x = reorder(county, proficiency), y = proficiency)) +
geom_col(fill = "steelblue") +
coord_flip() + # Flip coordinates for better county name display
labs(title = "Science Proficiency by County in West Virginia",
x = "County",
y = "Science Proficiency (%)") +
theme_minimal() +
theme(axis.text.y = element_text(size = 8)) # Adjust text size as needed

if("ppcstot" %in% names(integrated_data)) {
ggplot(integrated_data, aes(x = reorder(county, ppcstot), y = ppcstot)) +
geom_col(fill = "darkgreen") +
coord_flip() +
labs(title = "Per-Pupil Spending by County in West Virginia",
x = "County",
y = "Per-Pupil Spending ($)") +
theme_minimal() +
theme(axis.text.y = element_text(size = 8))
}

if("ppcstot" %in% names(integrated_data)) {
ggplot(integrated_data, aes(x = ppcstot, y = proficiency)) +
geom_point(color = "blue", size = 3, alpha = 0.7) +
# Instead of geom_text_repel, use simpler text labeling
geom_text(aes(label = county), hjust = -0.1, vjust = 0.1, size = 2.5) +
geom_smooth(method = "lm", se = TRUE, color = "red") + # Add trend line
labs(title = "Science Proficiency vs. Per-Pupil Spending",
x = "Per-Pupil Spending ($)",
y = "Science Proficiency (%)") +
theme_minimal()
}
## `geom_smooth()` using formula = 'y ~ x'
## Principal Component Analysis
pca_data <- integrated_data %>%
select(proficiency, ppcstot, unemployed) %>%
drop_na()
print(dim(pca_data))
## [1] 55 3
if(nrow(pca_data) > 0 && ncol(pca_data) > 1) {
pca_data_scaled <- scale(pca_data)
pca_result <- prcomp(pca_data_scaled)
print(summary(pca_result))
biplot(pca_result, scale = 0)
}
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.1794 0.9672 0.8207
## Proportion of Variance 0.4637 0.3118 0.2245
## Cumulative Proportion 0.4637 0.7755 1.0000
## K-means Clustering
set.seed(123)
km <- kmeans(pca_data, centers = 3)
integrated_data$cluster <- km$cluster[match(rownames(integrated_data), rownames(pca_data))]
ggplot(integrated_data, aes(x = ppcstot, y = proficiency, color = factor(cluster))) +
geom_point(size = 3) +
labs(title = "K-means Clustering of WV Counties",
x = "Per-Pupil Spending ($)",
y = "Science Proficiency (%)",
color = "Cluster") +
theme_minimal()

Prepare Data for Modeling
model_data <- integrated_data %>%
select(proficiency, ppcstot, unemployed) %>%
drop_na()
dim(model_data)
## [1] 55 3
set.seed(123)
train_index <- createDataPartition(model_data$proficiency, p = 0.8, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]
ctrl <- trainControl(method = "cv", number = 5)
lm_model <- train(
proficiency ~ .,
data = train_data,
method = "lm",
trControl = ctrl
)
summary(lm_model$finalModel)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.0840 -4.4404 -0.2178 3.4038 16.1935
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4887305 6.2262988 3.773 0.000479 ***
## ppcstot 0.0003147 0.0003796 0.829 0.411622
## unemployed -0.3876326 0.3104507 -1.249 0.218410
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.733 on 44 degrees of freedom
## Multiple R-squared: 0.05533, Adjusted R-squared: 0.01239
## F-statistic: 1.289 on 2 and 44 DF, p-value: 0.2859
lm_predictions <- predict(lm_model, newdata = test_data)
lm_rmse <- RMSE(lm_predictions, test_data$proficiency)
lm_r2 <- R2(lm_predictions, test_data$proficiency)
cat("Linear Regression Model:\n")
## Linear Regression Model:
cat("RMSE:", lm_rmse, "\n")
## RMSE: 5.211434
cat("R-squared:", lm_r2, "\n")
## R-squared: 0.4000578
Decision Tree Model
dt_model <- train(
proficiency ~ .,
data = train_data,
method = "rpart",
trControl = ctrl,
tuneLength = 10
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
dt_model
## CART
##
## 47 samples
## 2 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 37, 38, 38, 39, 36
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.00000000 5.699724 0.13584683 4.564666
## 0.01523184 5.699724 0.13584683 4.564666
## 0.03046368 5.699724 0.13584683 4.564666
## 0.04569552 5.699724 0.13584683 4.564666
## 0.06092736 5.699724 0.13584683 4.564666
## 0.07615920 5.889352 0.07521389 4.763857
## 0.09139104 5.889352 0.07521389 4.763857
## 0.10662288 5.889352 0.07521389 4.763857
## 0.12185472 5.819587 0.07521389 4.786079
## 0.13708656 5.926893 0.03330881 4.901399
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.06092736.
rpart.plot(dt_model$finalModel,
box.palette = "Blues",
main = "Decision Tree for Science Proficiency")

dt_predictions <- predict(dt_model, newdata = test_data)
dt_rmse <- RMSE(dt_predictions, test_data$proficiency)
dt_r2 <- R2(dt_predictions, test_data$proficiency)
cat("Decision Tree Model:\n")
## Decision Tree Model:
cat("RMSE:", dt_rmse, "\n")
## RMSE: 5.540997
cat("R-squared:", dt_r2, "\n")
## R-squared: 0.1999434
ggplot(data.frame(actual = test_data$proficiency, predicted = dt_predictions),
aes(x = actual, y = predicted)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
labs(title = "Decision Tree: Actual vs. Predicted Values",
x = "Actual Proficiency",
y = "Predicted Proficiency") +
theme_minimal()

dt_importance <- varImp(dt_model)
plot(dt_importance, main = "Decision Tree: Variable Importance")

Neural Network Model
preproc <- preProcess(train_data, method = c("center", "scale"))
train_data_scaled <- predict(preproc, train_data)
test_data_scaled <- predict(preproc, test_data)
nn_model <- train(
proficiency ~ .,
data = train_data_scaled,
method = "nnet",
trControl = ctrl,
tuneLength = 5,
linout = TRUE,
trace = FALSE,
maxit = 500
)
nn_model
## Neural Network
##
## 47 samples
## 2 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 38, 38, 36, 39, 37
## Resampling results across tuning parameters:
##
## size decay RMSE Rsquared MAE
## 1 0e+00 0.9311043 0.19073588 0.7800610
## 1 1e-04 0.9214202 0.20070758 0.7712265
## 1 1e-03 1.0080998 0.13003397 0.8425091
## 1 1e-02 0.9469247 0.15423751 0.7806526
## 1 1e-01 0.9563506 0.12370917 0.7734629
## 3 0e+00 1.1019919 0.08143896 0.8503733
## 3 1e-04 1.0572604 0.18210340 0.8456033
## 3 1e-03 1.6090489 0.01778995 1.2573556
## 3 1e-02 1.1006797 0.06766182 0.9418332
## 3 1e-01 1.0167830 0.04424340 0.8194010
## 5 0e+00 7.1035214 0.20504415 4.7240752
## 5 1e-04 2.7204605 0.02376320 1.9661360
## 5 1e-03 1.8360464 0.15465438 1.4223430
## 5 1e-02 1.1947307 0.02002665 0.9961097
## 5 1e-01 1.0389848 0.02550453 0.8367510
## 7 0e+00 2.3060228 0.17638860 1.7456562
## 7 1e-04 1.8929106 0.13051393 1.4170996
## 7 1e-03 2.0223818 0.09974229 1.4855287
## 7 1e-02 1.5292394 0.02519574 1.2277665
## 7 1e-01 1.0261356 0.03989707 0.8309402
## 9 0e+00 8.5809710 0.05730084 4.7861825
## 9 1e-04 5.4014805 0.07660154 3.2829514
## 9 1e-03 3.2035228 0.04956034 2.3623827
## 9 1e-02 1.4233815 0.04889394 1.1440116
## 9 1e-01 1.0039191 0.08018853 0.8177193
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1 and decay = 1e-04.
nn_predictions <- predict(nn_model, newdata = test_data_scaled)
# Evaluate model performance
nn_rmse <- RMSE(nn_predictions, test_data$proficiency)
nn_r2 <- R2(nn_predictions, test_data$proficiency)
cat("Neural Network Model:\n")
## Neural Network Model:
cat("RMSE:", nn_rmse, "\n")
## RMSE: 25.94952
cat("R-squared:", nn_r2, "\n")
## R-squared: 0.07847733
ggplot(data.frame(actual = test_data$proficiency, predicted = nn_predictions),
aes(x = actual, y = predicted)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
labs(title = "Neural Network: Actual vs. Predicted Values",
x = "Actual Proficiency",
y = "Predicted Proficiency") +
theme_minimal()

nn_importance <- varImp(nn_model)
plot(nn_importance, main = "Neural Network: Variable Importance")

Model Comparison
model_comparison <- data.frame(
Model = c("Linear Regression", "Decision Tree", "Neural Network"),
RMSE = c(lm_rmse, dt_rmse, nn_rmse),
R_Squared = c(lm_r2, dt_r2, nn_r2)
)
kable(model_comparison, caption = "Model Performance Comparison") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
Model Performance Comparison
|
Model
|
RMSE
|
R_Squared
|
|
Linear Regression
|
5.211434
|
0.4000578
|
|
Decision Tree
|
5.540997
|
0.1999434
|
|
Neural Network
|
25.949519
|
0.0784773
|
model_comparison_long <- model_comparison %>%
pivot_longer(cols = c(RMSE, R_Squared), names_to = "Metric", values_to = "Value")
ggplot(model_comparison_long, aes(x = Model, y = Value, fill = Model)) +
geom_bar(stat = "identity") +
facet_wrap(~ Metric, scales = "free") +
theme_minimal() +
labs(title = "Model Performance Comparison",
y = "Value") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Key Findings and Insights
key_predictors <- bind_rows(
data.frame(model = "Linear Regression", varImp(lm_model)$importance),
data.frame(model = "Decision Tree", varImp(dt_model)$importance),
data.frame(model = "Neural Network", varImp(nn_model)$importance)
)
key_predictors_summary <- key_predictors %>%
group_by(model) %>%
mutate(row = row_number()) %>%
ungroup() %>%
mutate(variable = case_when(
row == 1 ~ "ppcstot",
row == 2 ~ "unemployed",
row == 3 ~ "bachelor_or_higher",
row == 4 ~ "high_school_grad",
row == 5 ~ "median_household_income",
row == 6 ~ "poverty_rate"
)) %>%
select(-row) %>%
group_by(variable) %>%
summarize(avg_importance = mean(Overall, na.rm = TRUE)) %>%
arrange(desc(avg_importance))
kable(key_predictors_summary, caption = "Key Predictors of Educational Outcomes") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
Key Predictors of Educational Outcomes
|
variable
|
avg_importance
|
|
ppcstot
|
66.66667
|
|
unemployed
|
33.33333
|
regional_differences <- integrated_data %>%
group_by(region) %>%
summarize(
mean_proficiency = mean(proficiency, na.rm = TRUE),
mean_ppcstot = mean(ppcstot, na.rm = TRUE),
mean_bachelor_rate = mean(bachelor_or_higher, na.rm = TRUE),
mean_poverty_rate = mean(poverty_rate, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(mean_proficiency))
kable(regional_differences, caption = "Educational Outcomes and Key Factors by Region") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
Educational Outcomes and Key Factors by Region
|
region
|
mean_proficiency
|
mean_ppcstot
|
mean_bachelor_rate
|
mean_poverty_rate
|
|
Eastern Panhandle
|
27.79125
|
13744.00
|
NaN
|
NaN
|
|
North Central
|
26.60900
|
13317.70
|
NaN
|
NaN
|
|
Ohio Valley
|
26.42000
|
15695.70
|
NaN
|
NaN
|
|
Northern Panhandle
|
25.60333
|
17159.67
|
NaN
|
NaN
|
|
Metro Valley
|
25.35500
|
13918.17
|
NaN
|
NaN
|
|
Central Counties
|
24.62750
|
14275.75
|
NaN
|
NaN
|
|
Southern Counties
|
19.82500
|
13516.17
|
NaN
|
NaN
|
|
Other
|
17.42000
|
13777.00
|
NaN
|
NaN
|