WV County Education Outcomes Prediction

Load assessment data

Load spending data

## # A tibble: 55 × 13
##    name    pid6  enroll tfedrev tstrev tlocrev totalexp ppcstot tpayoth tcurssch
##    <chr>   <chr>  <dbl>   <dbl>  <dbl>   <dbl>    <dbl>   <dbl>   <dbl>    <dbl>
##  1 BARBOU… 1834…   2144    7559  16584    5872    28021   11885      48     1224
##  2 BERKEL… 1879…  19722   48407 140127   86699   264253   12704    2187    13128
##  3 BOONE … 1150…   3177    8194  26858   14564    48642   14663      45     2815
##  4 BRAXTO… 1834…   1747    5479  12748    6404    24417   13153     523     1310
##  5 BROOKE… 1364…   2582    6791  17114   21352    41908   15642      90     2258
##  6 CABELL… 1150…  11667   42518  88337   66699   183621   14538    1018     8944
##  7 CALHOU… 1150…    861    3254   9953    3190    15154   16085      44      738
##  8 CLAY C… 1150…   1669    6157  17655    2791    25963   13825      21     1262
##  9 DODDRI… 1834…   1082    3455   3999   31752    38493   23563     244     1294
## 10 FAYETT… 2028…   5594   15293  51759   23477    83373   13777      28     4192
## # ℹ 45 more rows
## # ℹ 3 more variables: tcapout <dbl>, debtout <dbl>, county <chr>

Load demographic data

addin demographics riley

Transactions By County

## # A tibble: 2 × 2
##   county_name total_expenditures_gen_fund
##   <chr>                             <dbl>
## 1 Barbour                       19487610.
## 2 Berkeley                     211946860.

Expenditure Per Pupil

## # A tibble: 59 × 8
##    dist  total_exp_state total_exp_fed avg_state_exp_per_pup avg_fed_exp_per_pup
##    <chr>           <dbl>         <dbl>                 <dbl>               <dbl>
##  1 002         13711911.      4292155.                 6836.               1919.
##  2 004        139224217.     24921087.                 7445.               1509.
##  3 006         23242775.      6768952.                 7675.               2362.
##  4 008         12388162.      5512281.                 7686.               3850.
##  5 010         22211516.      2647410.                 9528.               1283.
##  6 012         81316443.     23375114.                 7397.               2288.
##  7 014          6694016.      2936742.                 8097.               3975.
##  8 016         11833748.      3580995.                 8004.               2607.
##  9 018         12107075.      1900587.                10716.               1833.
## 10 020         37477328       8382223.                 7271.               1693.
## # ℹ 49 more rows
## # ℹ 3 more variables: avg_sh_state_per_pup <dbl>, avg_sh_fedper_pup <dbl>,
## #   av_gr_tot <dbl>

Joined data riley

## # A tibble: 55 × 19
##    county    science_proficiency proficiency pid6  enroll tfedrev tstrev tlocrev
##    <chr>                   <dbl>       <dbl> <chr>  <dbl>   <dbl>  <dbl>   <dbl>
##  1 Barbour                  26.0        26.0 1834…   2144    7559  16584    5872
##  2 Berkeley                 28.6        28.6 1879…  19722   48407 140127   86699
##  3 Boone                    19.6        19.6 1150…   3177    8194  26858   14564
##  4 Braxton                  22.6        22.6 1834…   1747    5479  12748    6404
##  5 Brooke                   21.1        21.1 1364…   2582    6791  17114   21352
##  6 Cabell                   30.8        30.8 1150…  11667   42518  88337   66699
##  7 Calhoun                  27.8        27.8 1150…    861    3254   9953    3190
##  8 Clay                     23.3        23.3 1150…   1669    6157  17655    2791
##  9 Doddridge                31.3        31.3 1834…   1082    3455   3999   31752
## 10 Fayette                  17.4        17.4 2028…   5594   15293  51759   23477
## # ℹ 45 more rows
## # ℹ 11 more variables: totalexp <dbl>, ppcstot <dbl>, tpayoth <dbl>,
## #   tcurssch <dbl>, tcapout <dbl>, debtout <dbl>, fips <chr>, unemployed <dbl>,
## #   people_unemployed <dbl>, edu_percent <dbl>, income_family <dbl>

Correlations

Linear Regression Model

## 
## Call:
## lm(formula = proficiency ~ totalexp + income_family + edu_percent, 
##     data = t_combined)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.7003 -2.9758 -0.1353  3.1444  8.6829 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    1.411e+01  6.881e+00   2.050  0.04550 * 
## totalexp       5.898e-06  9.371e-06   0.629  0.53189   
## income_family  2.102e-04  7.097e-05   2.962  0.00463 **
## edu_percent   -2.781e-01  2.130e-01  -1.306  0.19756   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.426 on 51 degrees of freedom
## Multiple R-squared:  0.4533, Adjusted R-squared:  0.4211 
## F-statistic: 14.09 on 3 and 51 DF,  p-value: 8.178e-07

K-Means

# ggplot(t_combined, aes(x = proficiency, y = totalexp)+
#          geom_point(size = 5) +
#          labs(title = ""))

# set.seed(123)
# k_means_result <- kmeans(t_combined %>% select(edu_percent, proficiency, totalexp), centers = 3)
# 
# t_combined_2 <- t_combined %>% 
#   mutate(cluster = factor(k_means_result$cluster))
# 
# ggplot(t_combined_2, aes(x = proficiency, y = totalexp) +
#       geom_point(size = 5) +
#          labs(title = ""))

PCA

# t_pca <- select(is_combined, where(is.numeric))
# 
# pca_result <- prcomp(t_pca, rank = 5, scale = TRUE, center = TRUE)
# summary(pca_result)
# 
# t_pca_3 <- as_tibble(pca_result$x)
# 
# ggplot(t_pca_3, aes(x = PC1, y = PC2))+
#   geom_point(size = 5)+
#   ggtitle("PCA proficiency")

Neural Network

library(neuralnet)
library(tm)
library(e1071)

# sample <- sample(c(1,0), size = nrow(t_combined), replace = TRUE, prob = c(0.7,0.3))
# 
# t_train <- t[sample == 1, ]
# t_test <- t[samplpe == 0, ]
# 
# set.seed(1)
# 
# n <- neuralnet(proficiency ~.,
#                data = t_train,
#                hidden = 1,
#                linear.output = FALSE)
# 
# plot(n)
# 
# 
# #proficiency is placeholder
# vector_predicted <- predict(n, t_train)
# vector_predicted <- round(vector_predicted, digits = 0)
# 
# accuracy <- mean(vector_predicted == t_train$proficiency)
# print(accuracy)
# table(vector_predicted, t_train$proficiency)
# 
# 
# # Measure accuracy on test data
# vector_predicted <- predict(n, t_test)
# vector_predicted <- round(vector_predicted, digits = 0)
# 
# accuracy <- mean(vector_predicted == t_test$proficiency)
# print(accuracy)
# table(vector_predicted, t_test$proficiency)

References

-Proficiency data by grade and school district, https://zoomwv.k12.wv.us/Dashboard/dashboard/7310

-Balanced scorecard(exp per pupil), https://wveis.k12.wv.us/essa/dashboard.html

-Snapshot, https://wvde.us/wp-content/uploads/2023/11/29196-Education-Snapshot-Infographic-v1.pdf

-WV Checkbook 2023