Analysis

*Note this describes the work done so far as well as the intentions for the completed version of the project. This document shows correlations between various things to predict proficiency in schools by county. We start by viewing the correlations between the data, shown by the corrplot. The PCA show that each section of things grouped together correlated to create a new variable. The grouped together fields are shown by the heat map. Then there is a linear regression model and a decision tree, showing how the variables affect proficiency. Work to be done past the draft stage: improving the model so it has better accuracy, most likely by using the additional data. The written analysis will also be more in depth after creating a better model.

Load assessment data

Load spending data

Load demographic data

Joined data

# Merge data
t <- t_assess %>% 
  inner_join(t_spending, by = 'county') %>% 
  inner_join(t_demographics, by = 'county')

print(t)
## # A tibble: 55 × 15
##    county    school school_name    population_group subgroup science_proficiency
##    <chr>     <chr>  <chr>          <chr>            <chr>                  <dbl>
##  1 Barbour   999    Barbour Count… Total Population Total                   26.0
##  2 Berkeley  999    Berkeley Coun… Total Population Total                   28.6
##  3 Boone     999    Boone County … Total Population Total                   19.6
##  4 Braxton   999    Braxton Count… Total Population Total                   22.6
##  5 Brooke    999    Brooke County… Total Population Total                   21.1
##  6 Cabell    999    Cabell County… Total Population Total                   30.8
##  7 Calhoun   999    Calhoun Count… Total Population Total                   27.8
##  8 Clay      999    Clay County T… Total Population Total                   23.3
##  9 Doddridge 999    Doddridge Cou… Total Population Total                   31.3
## 10 Fayette   999    Fayette Count… Total Population Total                   17.4
## # ℹ 45 more rows
## # ℹ 9 more variables: proficiency <dbl>, name <chr>, enroll <dbl>,
## #   tfedrev <dbl>, tstrev <dbl>, tlocrev <dbl>, totalexp <dbl>, ppcstot <dbl>,
## #   unemployed <dbl>

Extra county data

t_kanawha <- read_csv('23-Kanawha-39_Updated.csv')
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 874733 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): VENDOR, ADD1, ADD2, CITY, STATE, ZIP, INVDESC, LOC2
## dbl (12): YEAR, COUNTY, CK_AMT, PO_NUM, CK_NUM, FUND, PROJ, PROG, OBJ, LOC, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
t_putnam <- read_csv('23-Putnam-72_Updated.csv')
## Rows: 235153 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): VENDOR, ADD1, ADD2, CITY, STATE, ZIP, INVDESC, LOC2
## dbl (11): YEAR, COUNTY, PO_NUM, CK_NUM, FUND, PROJ, PROG, OBJ, LOC, COST, EXT
## num  (1): CK_AMT
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
t_boone <- read_csv('23-Boone-06_Updated.csv')
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 136440 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): COUNTY, VENDOR, ADD1, ADD2, CITY, STATE, ZIP, INVDESC, LOC2
## dbl (11): YEAR, CK_AMT, PO_NUM, CK_NUM, FUND, PROJ, PROG, OBJ, LOC, COST, EXT
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
t_clay <- read_csv('23-Clay-16_Updated.csv')
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 51931 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): VENDOR, ADD1, ADD2, CITY, STATE, ZIP, INVDESC, LOC2
## dbl (12): YEAR, COUNTY, CK_AMT, PO_NUM, CK_NUM, FUND, PROJ, PROG, OBJ, LOC, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
t_select <- t %>% 
  filter(county == "Kanawha"| 
         county == "Putnam"| 
         county == "Boone"| 
         county == "Clay")

Correlations

## Importance of first k=4 (out of 9) components:
##                           PC1    PC2    PC3     PC4
## Standard deviation     2.2861 1.3908 0.9814 0.84084
## Proportion of Variance 0.5807 0.2149 0.1070 0.07856
## Cumulative Proportion  0.5807 0.7956 0.9026 0.98118
## Standard deviations (1, .., p=9):
## [1] 2.286075e+00 1.390783e+00 9.814091e-01 8.408411e-01 2.955149e-01
## [6] 2.777063e-01 5.576010e-02 4.296449e-02 9.890724e-17
## 
## Rotation (n x k) = (9 x 4):
##                            PC1         PC2          PC3          PC4
## science_proficiency -0.2201909 -0.57310961  0.335755591 -0.054099098
## proficiency         -0.2201909 -0.57310961  0.335755591 -0.054099098
## enroll              -0.4263078  0.13770536  0.005682077 -0.005197766
## tfedrev             -0.3926048  0.23007619 -0.099002279 -0.168585913
## tstrev              -0.4180510  0.16921143  0.040874107  0.004871777
## tlocrev             -0.4112420 -0.02846765 -0.212156443 -0.158272467
## totalexp            -0.4283677  0.12259154 -0.067458733 -0.086673035
## ppcstot              0.1096831 -0.37358646 -0.669458674 -0.570136800
## unemployed           0.1665509  0.29521940  0.515068910 -0.779779619

Linear Regression Model

m <- lm(proficiency ~ tlocrev + totalexp + enroll + tstrev, data = t)

summary(m)
## 
## Call:
## lm(formula = proficiency ~ tlocrev + totalexp + enroll + tstrev, 
##     data = t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5442  -3.9791   0.1476   3.7254   9.9651 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 22.3618667  1.0639500  21.018  < 2e-16 ***
## tlocrev      0.0004532  0.0001393   3.254  0.00204 ** 
## totalexp    -0.0002749  0.0001051  -2.616  0.01172 *  
## enroll      -0.0008531  0.0018859  -0.452  0.65297    
## tstrev       0.0004268  0.0002626   1.625  0.11039    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.035 on 50 degrees of freedom
## Multiple R-squared:  0.3062, Adjusted R-squared:  0.2507 
## F-statistic: 5.518 on 4 and 50 DF,  p-value: 0.0009283
m1 <- rpart(formula = proficiency ~ tlocrev + totalexp + enroll + tstrev,
           data = t,
           minsplit = 5,
           minbucket = 3,
           method = 'class')


library(rpart.plot)
rpart.plot(m1)
## Warning: All boxes will be white (the box.palette argument will be ignored) because
## the number of classes in the response 53 is greater than length(box.palette) 6.
## To silence this warning use box.palette=0 or trace=-1.

predicted <- predict(m1, t, type = 'class')
t <- mutate( t, 
             predicted = predicted, 
             is_correct = predicted == proficiency)
print(mean(t$is_correct))
## [1] 0.2727273