## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Ingest Data

df <- read.csv('states_all.csv')
knitr::kable(df[1:10 ,],"pipe")
PRIMARY_KEY STATE YEAR ENROLL TOTAL_REVENUE FEDERAL_REVENUE STATE_REVENUE LOCAL_REVENUE TOTAL_EXPENDITURE INSTRUCTION_EXPENDITURE SUPPORT_SERVICES_EXPENDITURE OTHER_EXPENDITURE CAPITAL_OUTLAY_EXPENDITURE GRADES_PK_G GRADES_KG_G GRADES_4_G GRADES_8_G GRADES_12_G GRADES_1_8_G GRADES_9_12_G GRADES_ALL_G AVG_MATH_4_SCORE AVG_MATH_8_SCORE AVG_READING_4_SCORE AVG_READING_8_SCORE
1992_ALABAMA ALABAMA 1992 NA 2678885 304177 1659028 715680 2653798 1481703 735036 NA 174053 8224 55460 57948 58025 41167 NA NA 731634 208 252 207 NA
1992_ALASKA ALASKA 1992 NA 1049591 106780 720711 222100 972488 498362 350902 NA 37451 2371 10152 9748 8789 6714 NA NA 122487 NA NA NA NA
1992_ARIZONA ARIZONA 1992 NA 3258079 297888 1369815 1590376 3401580 1435908 1007732 NA 609114 2544 53497 55433 49081 37410 NA NA 673477 215 265 209 NA
1992_ARKANSAS ARKANSAS 1992 NA 1711959 178571 958785 574603 1743022 964323 483488 NA 145212 808 33511 34632 36011 27651 NA NA 441490 210 256 211 NA
1992_CALIFORNIA CALIFORNIA 1992 NA 26260025 2072470 16546514 7641041 27138832 14358922 8520926 NA 2044688 59067 431763 418418 363296 270675 NA NA 5254844 208 261 202 NA
1992_COLORADO COLORADO 1992 NA 3185173 163253 1307986 1713934 3264826 1642466 1035970 NA 364760 7410 47588 50648 45025 34533 NA NA 612635 221 272 217 NA
1992_CONNECTICUT CONNECTICUT 1992 NA 3834302 143542 1342539 2348221 3721338 2148041 1142600 NA 48542 5731 41319 38058 33691 28366 NA NA 488476 227 274 222 NA
1992_DELAWARE DELAWARE 1992 NA 645233 45945 420942 178346 638784 372722 194915 NA 30595 463 8025 8272 8012 6129 NA NA 104321 218 263 213 NA
1992_DISTRICT_OF_COLUMBIA DISTRICT_OF_COLUMBIA 1992 NA 709480 64749 0 644731 742893 329160 316679 NA 47272 4818 6667 5832 5000 3433 NA NA 80937 193 235 188 NA
1992_FLORIDA FLORIDA 1992 NA 11506299 788420 5683949 5033930 11305642 5166374 3410440 NA 1667826 31464 161701 164416 142372 100835 NA NA 1981407 214 260 208 NA

Data Clean

df$AVG_TOTAL_REVENUE_STUDENT <- df$TOTAL_REVENUE / df$ENROLL
df$AVG_STATE_REVENUE_STUDENT <- df$STATE_REVENUE / df$ENROLL
df$AVG_LOCAL_REVENUE_STUDENT <- df$LOCAL_REVENUE / df$ENROLL
df$AVG_FEDERAL_REVENUE_STUDENT <- df$FEDERAL_REVENUE / df$ENROLL
df$AVG_EXPENDITURE_STUDENT <- df$TOTAL_EXPENDITURE / df$ENROLL

Visuals

dff <- df %>% drop_na(AVG_MATH_4_SCORE)

dfmean <- mean(dff$AVG_MATH_4_SCORE)

dff %>%
  group_by(STATE) %>%
  summarise(avg_score = mean(AVG_MATH_4_SCORE) - dfmean) %>%
  ggplot(aes(x=reorder(STATE, avg_score), y=avg_score), col='b', title='Avg Math Score 4 - Mean') + geom_bar(stat="identity") + coord_flip()+labs(title="Avg Math Score 4 - Mean",x ="Avg Math 4 Score", y = "State") +theme(text = element_text(size = 8)) 

dff <- df %>% drop_na(AVG_MATH_8_SCORE)

dfmean <- mean(dff$AVG_MATH_8_SCORE)

dff %>%
  group_by(STATE) %>%
  summarise(avg_score = mean(AVG_MATH_8_SCORE) - dfmean) %>%
  ggplot(aes(x=reorder(STATE, avg_score), y=avg_score), col='b', title='Avg Math Score 8 - Mean') + geom_bar(stat="identity") + coord_flip()+labs(title="Avg Math Score 8 - Mean",x ="Avg Math 8 Score", y = "State") +theme(text = element_text(size = 8)) 

dff <- df %>% drop_na(AVG_READING_4_SCORE)

dfmean <- mean(dff$AVG_READING_4_SCORE)

dff %>%
  group_by(STATE) %>%
  summarise(avg_score = mean(AVG_READING_4_SCORE) - dfmean) %>%
  ggplot(aes(x=reorder(STATE, avg_score), y=avg_score), col='b', title='Avg Reading Score 4 - Mean') + geom_bar(stat="identity") + coord_flip()+labs(title="Avg Reading Score 4 - Mean",x ="Avg Reading Score 4", y = "State") +theme(text = element_text(size = 8)) 

dff <- df %>% drop_na(AVG_READING_8_SCORE)

dfmean <- mean(dff$AVG_READING_8_SCORE)

dff %>%
  group_by(STATE) %>%
  summarise(avg_score = mean(AVG_READING_8_SCORE) - dfmean) %>%
  ggplot(aes(x=reorder(STATE, avg_score), y=avg_score), col='b', title='Avg Reading Score 8 - Mean') + geom_bar(stat="identity") + coord_flip()+labs(title="Avg Reading Score 8 - Mean",x ="Avg Reading Score 8", y = "State") +theme(text = element_text(size = 8)) 

df %>%
  ggplot(aes(x=TOTAL_REVENUE)) + geom_histogram() + labs(title="Total Revenue Histogram",x ="Total Revenue", y = "Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 440 rows containing non-finite values (stat_bin).

df %>%
  ggplot(aes(x=AVG_TOTAL_REVENUE_STUDENT)) + geom_histogram() + labs(title="Average Revenue Per Student",x ="Rev Per Enrolled Student", y = "Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 491 rows containing non-finite values (stat_bin).

rev <- df %>% drop_na(AVG_MATH_4_SCORE) %>% filter(TOTAL_REVENUE > 0)

ggplot(data=rev,aes(x=AVG_MATH_4_SCORE,y=TOTAL_REVENUE)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

rev <- df %>% drop_na(AVG_MATH_4_SCORE) %>% filter(TOTAL_REVENUE > 0)

ggplot(data=rev,aes(x=AVG_MATH_4_SCORE,y=AVG_TOTAL_REVENUE_STUDENT)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 42 rows containing non-finite values (stat_smooth).
## Warning: Removed 42 rows containing missing values (geom_point).

scores <- df %>% 
  dplyr::select(AVG_MATH_4_SCORE, AVG_MATH_8_SCORE, AVG_READING_4_SCORE, AVG_READING_8_SCORE)

df <- transform(df, TARGET_SCORE = rowMeans(scores, na.rm = TRUE))
rev <- df %>% drop_na(TARGET_SCORE) %>% filter(TOTAL_REVENUE > 0)

ggplot(data=rev,aes(x=TARGET_SCORE,y=AVG_TOTAL_REVENUE_STUDENT)) + geom_point() + geom_smooth(method = "lm") + labs(title="Avg Rev Per Student vs Target Score",x ="Target Score", y = "Avg Revenue per Student")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 42 rows containing non-finite values (stat_smooth).
## Warning: Removed 42 rows containing missing values (geom_point).

Model

model <- lm(TARGET_SCORE ~ AVG_TOTAL_REVENUE_STUDENT+AVG_STATE_REVENUE_STUDENT+AVG_LOCAL_REVENUE_STUDENT+AVG_FEDERAL_REVENUE_STUDENT+AVG_EXPENDITURE_STUDENT, df)


summary(model)
## 
## Call:
## lm(formula = TARGET_SCORE ~ AVG_TOTAL_REVENUE_STUDENT + AVG_STATE_REVENUE_STUDENT + 
##     AVG_LOCAL_REVENUE_STUDENT + AVG_FEDERAL_REVENUE_STUDENT + 
##     AVG_EXPENDITURE_STUDENT, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.957  -4.185   2.453   7.536  18.734 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  2.269e+02  1.493e+00 151.916   <2e-16 ***
## AVG_TOTAL_REVENUE_STUDENT    1.745e+06  1.382e+06   1.263   0.2072    
## AVG_STATE_REVENUE_STUDENT   -1.745e+06  1.382e+06  -1.263   0.2072    
## AVG_LOCAL_REVENUE_STUDENT   -1.745e+06  1.382e+06  -1.263   0.2072    
## AVG_FEDERAL_REVENUE_STUDENT -1.745e+06  1.382e+06  -1.263   0.2072    
## AVG_EXPENDITURE_STUDENT      1.653e+00  9.743e-01   1.696   0.0904 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.17 on 516 degrees of freedom
##   (1193 observations deleted due to missingness)
## Multiple R-squared:  0.2755, Adjusted R-squared:  0.2685 
## F-statistic: 39.25 on 5 and 516 DF,  p-value: < 2.2e-16
model <- lm(TARGET_SCORE ~ AVG_TOTAL_REVENUE_STUDENT, df %>% drop_na(TARGET_SCORE))


summary(model)
## 
## Call:
## lm(formula = TARGET_SCORE ~ AVG_TOTAL_REVENUE_STUDENT, data = df %>% 
##     drop_na(TARGET_SCORE))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -64.096  -3.400   2.599   7.520  17.979 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                229.055      1.435  159.65   <2e-16 ***
## AVG_TOTAL_REVENUE_STUDENT    1.540      0.126   12.21   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.53 on 520 degrees of freedom
##   (210 observations deleted due to missingness)
## Multiple R-squared:  0.223,  Adjusted R-squared:  0.2215 
## F-statistic: 149.2 on 1 and 520 DF,  p-value: < 2.2e-16

Residuals