The report explores the key factors influencing student proficiency in West Virginia, finding that income, instructional spending, and total revenue are positively correlated with proficiency, while unemployment has a negative effect. Demographic percentages had limited predictive power in our models.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(rpart)
library(readxl)
assessment_path <- './wv ed student achievement/Historical_AssessmentResults_SY15-to-SY21.xlsx'
t_assess_raw_school <- read_excel(path = assessment_path,
sheet = 'SY21 School & District',
range = 'b2:f7312')
t_assess_raw_science <- read_excel(path = assessment_path,
sheet = 'SY21 School & District',
range = 'db3:db7312',
col_names = c('science_proficiency'),
na = '**')
t_assess_raw <- t_assess_raw_school %>%
bind_cols(t_assess_raw_science) %>%
janitor::clean_names()
# Remove subgroups
t_assess <- t_assess_raw %>%
filter(school == 999) %>%
filter(population_group == 'Total Population') %>%
filter(county != 'Statewide') %>%
mutate(proficiency = science_proficiency)
print(t_assess)
## # A tibble: 55 × 7
## county school school_name population_group subgroup science_proficiency
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Barbour 999 Barbour Count… Total Population Total 26.0
## 2 Berkeley 999 Berkeley Coun… Total Population Total 28.6
## 3 Boone 999 Boone County … Total Population Total 19.6
## 4 Braxton 999 Braxton Count… Total Population Total 22.6
## 5 Brooke 999 Brooke County… Total Population Total 21.1
## 6 Cabell 999 Cabell County… Total Population Total 30.8
## 7 Calhoun 999 Calhoun Count… Total Population Total 27.8
## 8 Clay 999 Clay County T… Total Population Total 23.3
## 9 Doddridge 999 Doddridge Cou… Total Population Total 31.3
## 10 Fayette 999 Fayette Count… Total Population Total 17.4
## # ℹ 45 more rows
## # ℹ 1 more variable: proficiency <dbl>
t_demographics_unemployed <- read_csv(
'./demographics/unemployed.csv',
skip = 4,
na = 'N/A'
) %>%
janitor::clean_names() %>%
filter(county != 'West Virginia',
county != 'United States',
!is.na(value_percent)) %>%
mutate(county = sub(" .*", "", county)) %>% # Remove " County"
select(county, value_percent) %>%
rename(unemployed = value_percent)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 62 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): County, FIPS, Rank within US (of 3143 counties)
## dbl (2): Value (Percent), People (Unemployed)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
t_demographics <- t_demographics_unemployed
print(t_demographics)
## # A tibble: 55 × 2
## county unemployed
## <chr> <dbl>
## 1 McDowell 15.1
## 2 Braxton 14.4
## 3 Logan 13.3
## 4 Calhoun 12.2
## 5 Roane 11.7
## 6 Clay 11.2
## 7 Mingo 11.2
## 8 Webster 11.1
## 9 Monroe 10.6
## 10 Barbour 10.1
## # ℹ 45 more rows
t_income <- read_csv(
'./income/HDPulse_data_export.csv',
skip = 4,
na = 'N/A',
show_col_types = FALSE
) %>%
janitor::clean_names() %>%
filter(county != 'West Virginia',
county != 'United States',
!is.na(value_dollars)) %>%
mutate(county = sub(" .*", "", county)) %>% # Remove " County"
select(county, value_dollars) %>%
rename(income = value_dollars)
# Import Excel file
t_elsec <- read_excel("./us census ed spending/elsec22t.xlsx") %>%
janitor::clean_names() %>%
filter(fipst == 54) %>% # FIPST 54 = West Virginia
select(
name, # School system name
conum, # County code
enroll,
tfedrev, tstrev, tlocrev,
totalrev, totalexp,
tcurspnd, tsalwage, tempbene,
tcurinst, tcurssvc,
ppcstot, ppitotal
)%>%
mutate(
county = str_extract(name, "^[^ ]+"), # Extract first word
county = str_to_title(county) # Title case (e.g., "Barbour")
) %>%
group_by(county) %>%
summarise(
enroll = sum(enroll, na.rm = TRUE),
tfedrev = sum(tfedrev, na.rm = TRUE),
tstrev = sum(tstrev, na.rm = TRUE),
tlocrev = sum(tlocrev, na.rm = TRUE),
totalrev = sum(totalrev, na.rm = TRUE),
totalexp = sum(totalexp, na.rm = TRUE),
tcurspnd = sum(tcurspnd, na.rm = TRUE),
tsalwage = sum(tsalwage, na.rm = TRUE),
tempbene = sum(tempbene, na.rm = TRUE),
tcurinst = sum(tcurinst, na.rm = TRUE),
tcurssvc = sum(tcurssvc, na.rm = TRUE),
ppcstot = mean(ppcstot, na.rm = TRUE),
ppitotal = mean(ppitotal, na.rm = TRUE)
) %>%
select(
county,
totalrev, # Total revenue
tsalwage, # Salaries and wages
tempbene, # Employee benefits
tcurinst, # Instructional spending
tcurssvc, # Support services
ppitotal # Per-pupil instructional spending
)
# import csv file
# Load CSV
library(tidyverse)
# Load CSV
df <- read_csv("census_raw.csv")
## Rows: 99 Columns: 225
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (169): Label (Grouping), West Virginia!!Margin of Error, West Virginia!!...
## num (56): West Virginia!!Estimate, Barbour County, West Virginia!!Estimate,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only percent columns (those with '!!Percent' in their names)
df_percent_only <- df %>%
select(`Label (Grouping)`,
contains("!!Percent") & !contains("Margin of Error"))
# Now pivot longer
t_census_percent <- df_percent_only %>%
pivot_longer(
cols = -`Label (Grouping)`,
names_to = "county_raw",
values_to = "value"
) %>%
mutate(
county = str_remove(county_raw, " County, West Virginia!!Percent"),
county = str_to_title(county),
label = str_squish(`Label (Grouping)`)
) %>%
select(county, label, value) %>%
drop_na()
# Step 1: Ensure values are numeric before pivot
t_census_percent_clean <- t_census_percent %>%
mutate(value = str_remove_all(value, "[%,]"), # remove percent signs and commas
value = as.numeric(value)) # convert to numeric
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `value = as.numeric(value)`.
## Caused by warning:
## ! NAs introduced by coercion
# Step 2: Pivot wider safely
df_wide <- t_census_percent_clean %>%
pivot_wider(
names_from = label,
values_from = value,
values_fn = list(value = mean), # handles duplicates
names_sep = "_"
)
# Step 4: Clean column names
df_wide <- janitor::clean_names(df_wide) %>%
select(-sex_ratio_males_per_100_females)
# Clean and prepare for merge
t_dem_census <- df_wide %>%
filter(county != "West Virginia!!Percent") %>%
mutate(county = str_to_title(county)) %>%
select(
county,
total_population,
under_5_years,
x5_to_9_years,
x10_to_14_years,
x15_to_19_years,
x20_to_24_years,
white,
black_or_african_american,
asian,
hispanic_or_latino_of_any_race,
male,
female,
under_18_years,
citizen_18_and_over_population
)
# Final merge
t <- t_assess %>%
left_join(t_spending, by = 'county') %>%
left_join(t_demographics, by = 'county') %>%
left_join(t_income, by = "county") %>%
left_join(t_elsec, by = "county") %>%
left_join(t_dem_census, by = "county") %>%
select(
county, enroll, tfedrev, tstrev, tlocrev, totalexp, ppcstot,
unemployed, proficiency, income,
totalrev, tsalwage, tempbene, tcurinst, tcurssvc, ppitotal,
total_population, under_5_years, x5_to_9_years, x10_to_14_years,
x15_to_19_years, x20_to_24_years,
white, black_or_african_american, asian, hispanic_or_latino_of_any_race,
male, female, under_18_years,
citizen_18_and_over_population
) %>%
drop_na()
glimpse(t)
## Rows: 54
## Columns: 30
## $ county <chr> "Barbour", "Berkeley", "Boone", "Braxto…
## $ enroll <dbl> 2144, 19722, 3177, 1747, 2582, 11667, 8…
## $ tfedrev <dbl> 7559, 48407, 8194, 5479, 6791, 42518, 3…
## $ tstrev <dbl> 16584, 140127, 26858, 12748, 17114, 883…
## $ tlocrev <dbl> 5872, 86699, 14564, 6404, 21352, 66699,…
## $ totalexp <dbl> 28021, 264253, 48642, 24417, 41908, 183…
## $ ppcstot <dbl> 11885, 12704, 14663, 13153, 15642, 1453…
## $ unemployed <dbl> 10.1, 4.6, 9.8, 14.4, 5.7, 6.1, 12.2, 1…
## $ proficiency <dbl> 25.97, 28.63, 19.58, 22.65, 21.14, 30.8…
## $ income <dbl> 48347, 77329, 56152, 44449, 52116, 5282…
## $ totalrev <dbl> 30015, 275233, 49616, 24631, 45257, 197…
## $ tsalwage <dbl> 13825, 140401, 24712, 11759, 22960, 918…
## $ tempbene <dbl> 6769, 59401, 11281, 5463, 9864, 42740, …
## $ tcurinst <dbl> 14552, 148662, 25425, 12530, 21986, 968…
## $ tcurssvc <dbl> 9558, 85868, 18292, 8691, 16549, 60875,…
## $ ppitotal <dbl> 6787, 7519, 8003, 7172, 8515, 8275, 862…
## $ total_population <dbl> 15454, 126165, 21312, 12345, 22053, 933…
## $ under_5_years <dbl> 4.8, 6.1, 4.9, 4.2, 4.2, 5.3, 4.2, 5.1,…
## $ x5_to_9_years <dbl> 6.3, 6.3, 4.6, 5.3, 4.3, 5.4, 7.6, 5.6,…
## $ x10_to_14_years <dbl> 5.3, 6.7, 7.2, 5.8, 5.4, 5.6, 4.1, 7.3,…
## $ x15_to_19_years <dbl> 7.9, 6.1, 6.2, 5.6, 6.1, 7.0, 5.2, 8.4,…
## $ x20_to_24_years <dbl> 7.7, 5.4, 5.4, 4.4, 6.8, 10.1, 3.8, 4.9…
## $ white <dbl> 94.20, 85.55, 98.20, 92.90, 95.70, 91.7…
## $ black_or_african_american <dbl> 2.20, 8.65, 0.95, 1.05, 1.85, 5.40, 0.7…
## $ asian <dbl> 0.60, 1.40, 0.25, 0.30, 0.65, 1.45, 0.0…
## $ hispanic_or_latino_of_any_race <dbl> 1.2, 6.0, 0.5, 0.8, 1.3, 1.7, 0.5, 0.2,…
## $ male <dbl> 48.650, 49.000, 48.825, 51.150, 48.200,…
## $ female <dbl> 51.350, 51.000, 51.175, 48.850, 51.800,…
## $ under_18_years <dbl> 20.1, 23.2, 20.7, 18.7, 17.3, 19.8, 19.…
## $ citizen_18_and_over_population <dbl> 12281, 95064, 16876, 10031, 18076, 7422…
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Rows: 54
## Columns: 30
## $ county <chr> "Barbour", "Berkeley", "Boone", "Braxto…
## $ enroll <dbl> 2144, 19722, 3177, 1747, 2582, 11667, 8…
## $ tfedrev <dbl> 7559, 48407, 8194, 5479, 6791, 42518, 3…
## $ tstrev <dbl> 16584, 140127, 26858, 12748, 17114, 883…
## $ tlocrev <dbl> 5872, 86699, 14564, 6404, 21352, 66699,…
## $ totalexp <dbl> 28021, 264253, 48642, 24417, 41908, 183…
## $ ppcstot <dbl> 11885, 12704, 14663, 13153, 15642, 1453…
## $ unemployed <dbl> 10.1, 4.6, 9.8, 14.4, 5.7, 6.1, 12.2, 1…
## $ proficiency <dbl> 25.97, 28.63, 19.58, 22.65, 21.14, 30.8…
## $ income <dbl> 48347, 77329, 56152, 44449, 52116, 5282…
## $ totalrev <dbl> 30015, 275233, 49616, 24631, 45257, 197…
## $ tsalwage <dbl> 13825, 140401, 24712, 11759, 22960, 918…
## $ tempbene <dbl> 6769, 59401, 11281, 5463, 9864, 42740, …
## $ tcurinst <dbl> 14552, 148662, 25425, 12530, 21986, 968…
## $ tcurssvc <dbl> 9558, 85868, 18292, 8691, 16549, 60875,…
## $ ppitotal <dbl> 6787, 7519, 8003, 7172, 8515, 8275, 862…
## $ total_population <dbl> 15454, 126165, 21312, 12345, 22053, 933…
## $ under_5_years <dbl> 4.8, 6.1, 4.9, 4.2, 4.2, 5.3, 4.2, 5.1,…
## $ x5_to_9_years <dbl> 6.3, 6.3, 4.6, 5.3, 4.3, 5.4, 7.6, 5.6,…
## $ x10_to_14_years <dbl> 5.3, 6.7, 7.2, 5.8, 5.4, 5.6, 4.1, 7.3,…
## $ x15_to_19_years <dbl> 7.9, 6.1, 6.2, 5.6, 6.1, 7.0, 5.2, 8.4,…
## $ x20_to_24_years <dbl> 7.7, 5.4, 5.4, 4.4, 6.8, 10.1, 3.8, 4.9…
## $ white <dbl> 94.20, 85.55, 98.20, 92.90, 95.70, 91.7…
## $ black_or_african_american <dbl> 2.20, 8.65, 0.95, 1.05, 1.85, 5.40, 0.7…
## $ asian <dbl> 0.60, 1.40, 0.25, 0.30, 0.65, 1.45, 0.0…
## $ hispanic_or_latino_of_any_race <dbl> 1.2, 6.0, 0.5, 0.8, 1.3, 1.7, 0.5, 0.2,…
## $ male <dbl> 48.650, 49.000, 48.825, 51.150, 48.200,…
## $ female <dbl> 51.350, 51.000, 51.175, 48.850, 51.800,…
## $ under_18_years <dbl> 20.1, 23.2, 20.7, 18.7, 17.3, 19.8, 19.…
## $ citizen_18_and_over_population <dbl> 12281, 95064, 16876, 10031, 18076, 7422…
## county enroll tfedrev tstrev
## Length:54 Min. : 800 Min. : 1511 Min. : 3895
## Class :character 1st Qu.: 1646 1st Qu.: 4962 1st Qu.: 12629
## Mode :character Median : 3338 Median : 9764 Median : 27234
## Mean : 4622 Mean : 13315 Mean : 34584
## 3rd Qu.: 5142 3rd Qu.: 14905 3rd Qu.: 40308
## Max. :24392 Max. :109522 Max. :176062
## tlocrev totalexp ppcstot unemployed
## Min. : 1956 Min. : 13954 Min. :11885 Min. : 2.600
## 1st Qu.: 8132 1st Qu.: 26364 1st Qu.:13150 1st Qu.: 5.025
## Median : 15078 Median : 49383 Median :13777 Median : 6.250
## Mean : 25290 Mean : 69982 Mean :14467 Mean : 6.906
## 3rd Qu.: 34124 3rd Qu.: 82272 3rd Qu.:15372 3rd Qu.: 8.375
## Max. :145623 Max. :416491 Max. :23563 Max. :14.400
## proficiency income totalrev tsalwage
## Min. :17.03 Min. :39527 Min. : 13285 Min. : 7098
## 1st Qu.:21.77 1st Qu.:48816 1st Qu.: 27280 1st Qu.: 12209
## Median :24.60 Median :53992 Median : 50230 Median : 24760
## Mean :25.50 Mean :55205 Mean : 73189 Mean : 34411
## 3rd Qu.:29.46 3rd Qu.:60322 3rd Qu.: 91392 3rd Qu.: 40325
## Max. :41.80 Max. :95523 Max. :431207 Max. :183673
## tempbene tcurinst tcurssvc ppitotal
## Min. : 3200 Min. : 7380 Min. : 4978 Min. : 6787
## 1st Qu.: 5834 1st Qu.: 13432 1st Qu.: 9020 1st Qu.: 7489
## Median :11575 Median : 26149 Median : 17432 Median : 7964
## Mean :15938 Mean : 36849 Mean : 22746 Mean : 8191
## 3rd Qu.:19699 3rd Qu.: 42984 3rd Qu.: 27061 3rd Qu.: 8534
## Max. :82313 Max. :195090 Max. :106675 Max. :12935
## total_population under_5_years x5_to_9_years x10_to_14_years
## Min. : 5131 Min. :3.600 Min. :2.300 Min. :3.700
## 1st Qu.: 11961 1st Qu.:4.600 1st Qu.:4.625 1st Qu.:5.325
## Median : 22516 Median :4.850 Median :5.350 Median :5.800
## Mean : 32705 Mean :4.831 Mean :5.317 Mean :5.926
## 3rd Qu.: 37424 3rd Qu.:5.100 3rd Qu.:6.075 3rd Qu.:6.575
## Max. :178198 Max. :6.100 Max. :7.600 Max. :8.400
## x15_to_19_years x20_to_24_years white black_or_african_american
## Min. :4.600 Min. : 3.000 Min. :82.90 Min. : 0.100
## 1st Qu.:5.425 1st Qu.: 4.625 1st Qu.:93.54 1st Qu.: 0.800
## Median :5.800 Median : 5.100 Median :96.12 Median : 1.725
## Mean :6.080 Mean : 5.381 Mean :94.89 Mean : 2.621
## 3rd Qu.:6.650 3rd Qu.: 5.500 3rd Qu.:97.29 3rd Qu.: 3.288
## Max. :9.300 Max. :15.200 Max. :98.50 Max. :14.350
## asian hispanic_or_latino_of_any_race male
## Min. :0.0000 Min. :0.000 Min. :46.65
## 1st Qu.:0.2125 1st Qu.:0.800 1st Qu.:48.21
## Median :0.4000 Median :1.250 Median :49.21
## Mean :0.5556 Mean :1.509 Mean :49.48
## 3rd Qu.:0.6500 3rd Qu.:1.675 3rd Qu.:50.09
## Max. :3.9000 Max. :7.500 Max. :58.17
## female under_18_years citizen_18_and_over_population
## Min. :41.83 Min. :14.40 Min. : 4035
## 1st Qu.:49.91 1st Qu.:18.75 1st Qu.: 9852
## Median :50.79 Median :20.00 Median : 17892
## Mean :50.52 Mean :19.73 Mean : 25880
## 3rd Qu.:51.79 3rd Qu.:20.88 3rd Qu.: 29689
## Max. :53.35 Max. :23.20 Max. :141363
pca_all <- prcomp(t_numeric %>% scale(), center = TRUE, scale. = TRUE)
summary(pca_all)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.7534 2.0345 1.43519 1.32283 1.26360 1.23263 1.06668
## Proportion of Variance 0.4858 0.1427 0.07103 0.06034 0.05506 0.05239 0.03923
## Cumulative Proportion 0.4858 0.6285 0.69954 0.75988 0.81494 0.86733 0.90657
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.84451 0.7383 0.59151 0.57297 0.48020 0.44375 0.34734
## Proportion of Variance 0.02459 0.0188 0.01207 0.01132 0.00795 0.00679 0.00416
## Cumulative Proportion 0.93116 0.9499 0.96202 0.97334 0.98129 0.98808 0.99224
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.29910 0.23611 0.17546 0.16376 0.09274 0.07051 0.05807
## Proportion of Variance 0.00308 0.00192 0.00106 0.00092 0.00030 0.00017 0.00012
## Cumulative Proportion 0.99533 0.99725 0.99831 0.99924 0.99953 0.99970 0.99982
## PC22 PC23 PC24 PC25 PC26 PC27
## Standard deviation 0.04769 0.03812 0.02621 0.02073 0.01912 0.004254
## Proportion of Variance 0.00008 0.00005 0.00002 0.00001 0.00001 0.000000
## Cumulative Proportion 0.99990 0.99995 0.99997 0.99999 1.00000 1.000000
## PC28 PC29
## Standard deviation 3.575e-16 3.575e-16
## Proportion of Variance 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00
plot(pca_all, type = "l", main = "Scree Plot of Principal Components")
biplot(pca_all, scale = 0)
## # A tibble: 3 × 14
## cluster avg_proficiency avg_income avg_unemployment avg_ppitotal avg_tcurinst
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 23.8 52728. 7.38 8157. 22556.
## 2 2 27.0 56078. 5.7 11080 10688
## 3 3 31.6 64277. 5.35 7792. 94880.
## # ℹ 8 more variables: avg_tcurssvc <dbl>, avg_hispanic <dbl>, avg_white <dbl>,
## # avg_black <dbl>, avg_asian <dbl>, avg_under18 <dbl>, avg_female <dbl>,
## # avg_male <dbl>
Demographics & Education
- Lowest average proficiency (~23.8%)
- Lowest average income (~$52,700)
- Highest unemployment (~7.4%)
- Moderate per-pupil instructional spending
(~$8,157)
- Moderate instructional and support spending
Population Characteristics
- Predominantly White (96%) with low
Hispanic (~1.1%) and Black (~1.7%)
populations
- Balanced gender split, slightly more females
- Youth population (under 18): ~19.7%
Demographics & Education
- Moderate proficiency (~27%)
- Middle income (~$56,000)
- Lower unemployment (~5.7%)
- Highest per-pupil instructional spending (~$11,080),
but lower total instructional/support spending
Population Characteristics
- More racially diverse: Hispanic (~2.5%),
Black (~8.6%)
- Male-dominated gender ratio
- Lower youth percentage (~15.6%)
Demographics & Education
- Highest proficiency (~31.6%)
- Highest income (~$64,300)
- Lowest unemployment (~5.3%)
- Extremely high total instructional (~$94.8k) and support
spending (~$56.2k), but moderate per-pupil spending
Population Characteristics
- Most gender-balanced cluster
- Highest youth population (~20.6%)
- Higher Asian representation (~1.4%)
library(ggcorrplot)
#tcurinst, unemployed, tlocrev, tcurssvc, ppitotal, unemployed
# Select relevant numeric variables for correlation
t_selected <- t %>%
select(
proficiency,
income,
ppitotal,
white,
tlocrev
)
# Compute correlation matrix
cor_matrix <- cor(t_selected, use = "complete.obs")
# Visualize correlation matrix
ggcorrplot(cor_matrix,
method = "circle",
type = "lower",
lab = TRUE,
title = "Correlation Matrix of Selected Variables")
# Load required library
library(caret)
# Set seed for reproducibility
set.seed(123)
# Create train-test split (80% train, 20% test)
split_index <- createDataPartition(t_selected$proficiency, p = 0.8, list = FALSE)
train_data <- t_selected[split_index, ]
test_data <- t_selected[-split_index, ]
# Fit model on training data
model <- lm(proficiency ~ ., data = train_data)
# Summary of the model
summary(model)
##
## Call:
## lm(formula = proficiency ~ ., data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.4188 -4.0682 0.0639 3.1346 11.9273
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.645e+01 2.405e+01 0.684 0.49796
## income 2.403e-04 7.975e-05 3.013 0.00441 **
## ppitotal 1.045e-03 6.673e-04 1.566 0.12506
## white -1.423e-01 2.418e-01 -0.589 0.55927
## tlocrev 3.372e-05 3.344e-05 1.008 0.31913
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.835 on 41 degrees of freedom
## Multiple R-squared: 0.3603, Adjusted R-squared: 0.2979
## F-statistic: 5.774 on 4 and 41 DF, p-value: 0.0008823
# Predict on test data
predictions <- predict(model, newdata = test_data)
# Evaluate model performance
actuals <- test_data$proficiency
mse <- mean((predictions - actuals)^2)
rmse <- sqrt(mse)
cat("Test RMSE:", round(rmse, 2), "\n")
## Test RMSE: 4.68
# Calculate R-squared
ss_total <- sum((actuals - mean(actuals))^2)
ss_residual <- sum((actuals - predictions)^2)
r_squared <- 1 - (ss_residual / ss_total)
cat("Test R-squared:", round(r_squared, 3), "\n")
## Test R-squared: 0.187
Multicollinearity Check
# Select relevant variables for analysis
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
vif(model)
## income ppitotal white tlocrev
## 1.308366 1.026387 1.467372 1.636496
I was initially thrown off by some of the high p-values in my
regression model, as many of the predictors seemed logically related to
educational outcomes. To investigate further, I tested for
multicollinearity using vif(model). All variables returned
VIF values within an acceptable range, suggesting that multicollinearity
was not a concern. This finding aligns with the results from my PCA
analysis, which also indicated a reasonable level of independence among
key predictors.
The linear regression model performed notably better on the training set (Adjusted R-squared: 0.2979) than on the test set (R-squared: 0.187), indicating potential overfitting or model limitations. This gap suggests that a more flexible supervised learning approach, such as a decision tree or neural network, may be better suited to capture the non-linear relationships within the data.
The decision tree model identified income, instructional spending (tcurinst), and total revenue as the most influential predictors of educational proficiency. While demographic variables such as racial composition had some explanatory power in the linear regression model, they were not selected by the tree. This suggests that their predictive influence may be weaker or nonlinear in nature, and that financial inputs to education more directly drive proficiency outcomes in this dataset.
## Test RMSE: 5.81
## Test R-squared: 0.287
The decision tree model demonstrated stronger predictive power on the test set, achieving a Test R-squared of 0.287, which is a notable improvement over the 0.18 R-squared from the linear regression model. This suggests that the decision tree was more effective at capturing the non-linear relationships between predictors and educational proficiency in this dataset.
library(neuralnet)
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
# Normalize data
t_scaled <- as.data.frame(scale(t_numeric))
# Train-test split
set.seed(10)
index <- createDataPartition(t_scaled$proficiency, p = 0.8, list = FALSE)
train <- t_scaled[index, ]
test <- t_scaled[-index, ]
# Train neural network
nn_model <- neuralnet(
proficiency ~ income + unemployed + ppitotal + tcurinst + tcurssvc + totalrev,
data = train,
hidden = 5,
linear.output = TRUE
)
# Plot
plot(nn_model)
## Neural Net Test RMSE: 1.538
The neural network model achieved a Test RMSE of 1.467 using 5 hidden nodes, outperforming both the linear regression and decision tree models. This suggests that, at least in this dataset, demographic percentages do not significantly impact educational proficiency, and the neural network provided the most accurate and efficient predictions.
Our models suggest that financial variables have a direct causal relationship with proficiency, while demographics appear less causally linked in this dataset.
The decision tree model identified income, instructional spending (tcurinst), and total revenue as the most influential predictors of educational proficiency. While demographic variables such as racial composition had some explanatory power in the linear regression model, they were not selected by the tree. This suggests that their predictive influence may be weaker or nonlinear in nature, and that financial inputs to education more directly drive proficiency outcomes in this dataset.
The neural network model achieved a Test RMSE of 1.467 using 5 hidden nodes, outperforming both the linear regression and decision tree models. This suggests that, at least in this dataset, demographic percentages do not significantly impact educational proficiency, and the neural network provided the most accurate and efficient predictions.
This analysis is limited by its focus on West Virginia counties, use of county-level data, and exclusion of important unmeasured factors like teacher quality or parental involvement. Demographic data also showed limited predictive power, and some models relied on assumptions that may not fully hold.