Visuals
# Grade-4 math: each state's mean score minus the overall mean score.
# Rows with a missing score are dropped first so neither mean() returns NA.
dff <- df %>% drop_na(AVG_MATH_4_SCORE)
# Overall mean is taken over all remaining rows, not over the per-state means.
dfmean <- mean(dff$AVG_MATH_4_SCORE)
dff %>%
  group_by(STATE) %>%
  summarise(avg_score = mean(AVG_MATH_4_SCORE) - dfmean) %>%
  ggplot(aes(x = reorder(STATE, avg_score), y = avg_score)) +
  geom_col() +
  coord_flip() +
  # labs() names aesthetics, not screen positions: x is still STATE after
  # coord_flip(), so the state label belongs on x and the score label on y.
  labs(title = "Avg Math Score 4 - Mean", x = "State", y = "Avg Math 4 Score") +
  theme(text = element_text(size = 8))

# Grade-8 math: each state's mean score minus the overall mean score.
# Rows with a missing score are dropped first so neither mean() returns NA.
dff <- df %>% drop_na(AVG_MATH_8_SCORE)
# Overall mean is taken over all remaining rows, not over the per-state means.
dfmean <- mean(dff$AVG_MATH_8_SCORE)
dff %>%
  group_by(STATE) %>%
  summarise(avg_score = mean(AVG_MATH_8_SCORE) - dfmean) %>%
  ggplot(aes(x = reorder(STATE, avg_score), y = avg_score)) +
  geom_col() +
  coord_flip() +
  # labs() names aesthetics, not screen positions: x is still STATE after
  # coord_flip(), so the state label belongs on x and the score label on y.
  labs(title = "Avg Math Score 8 - Mean", x = "State", y = "Avg Math 8 Score") +
  theme(text = element_text(size = 8))

# Grade-4 reading: each state's mean score minus the overall mean score.
# Rows with a missing score are dropped first so neither mean() returns NA.
dff <- df %>% drop_na(AVG_READING_4_SCORE)
# Overall mean is taken over all remaining rows, not over the per-state means.
dfmean <- mean(dff$AVG_READING_4_SCORE)
dff %>%
  group_by(STATE) %>%
  summarise(avg_score = mean(AVG_READING_4_SCORE) - dfmean) %>%
  ggplot(aes(x = reorder(STATE, avg_score), y = avg_score)) +
  geom_col() +
  coord_flip() +
  # labs() names aesthetics, not screen positions: x is still STATE after
  # coord_flip(), so the state label belongs on x and the score label on y.
  labs(
    title = "Avg Reading Score 4 - Mean",
    x = "State",
    y = "Avg Reading Score 4"
  ) +
  theme(text = element_text(size = 8))

# Grade-8 reading: each state's mean score minus the overall mean score.
# Rows with a missing score are dropped first so neither mean() returns NA.
dff <- df %>% drop_na(AVG_READING_8_SCORE)
# Overall mean is taken over all remaining rows, not over the per-state means.
dfmean <- mean(dff$AVG_READING_8_SCORE)
dff %>%
  group_by(STATE) %>%
  summarise(avg_score = mean(AVG_READING_8_SCORE) - dfmean) %>%
  ggplot(aes(x = reorder(STATE, avg_score), y = avg_score)) +
  geom_col() +
  coord_flip() +
  # labs() names aesthetics, not screen positions: x is still STATE after
  # coord_flip(), so the state label belongs on x and the score label on y.
  labs(
    title = "Avg Reading Score 8 - Mean",
    x = "State",
    y = "Avg Reading Score 8"
  ) +
  theme(text = element_text(size = 8))

# Histogram of TOTAL_REVENUE over every row of df.
# geom_histogram() is left on its default binning; rows with a non-finite
# TOTAL_REVENUE are dropped by ggplot2 itself (see the warning echoed below).
ggplot(df, aes(x = TOTAL_REVENUE)) +
  geom_histogram() +
  labs(title = "Total Revenue Histogram", x = "Total Revenue", y = "Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 440 rows containing non-finite values (stat_bin).

# Histogram of AVG_TOTAL_REVENUE_STUDENT over every row of df.
# geom_histogram() is left on its default binning; rows with a non-finite
# value are dropped by ggplot2 itself (see the warning echoed below).
ggplot(df, aes(x = AVG_TOTAL_REVENUE_STUDENT)) +
  geom_histogram() +
  labs(
    title = "Average Revenue Per Student",
    x = "Rev Per Enrolled Student",
    y = "Count"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 491 rows containing non-finite values (stat_bin).

# Scatter of total revenue against the grade-4 math score, with a fitted
# linear trend line. Only rows that have a math score and a strictly positive
# TOTAL_REVENUE are kept (filter() also discards rows where the test is NA).
rev <- df %>%
  drop_na(AVG_MATH_4_SCORE) %>%
  filter(TOTAL_REVENUE > 0)
rev %>%
  ggplot(aes(x = AVG_MATH_4_SCORE, y = TOTAL_REVENUE)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# Scatter of per-student revenue against the grade-4 math score, with a
# fitted linear trend line. The row filter is on TOTAL_REVENUE, not on the
# plotted AVG_TOTAL_REVENUE_STUDENT column, so rows where the latter is
# missing still reach ggplot2 and are dropped there (warnings echoed below).
rev <- df %>%
  drop_na(AVG_MATH_4_SCORE) %>%
  filter(TOTAL_REVENUE > 0)
rev %>%
  ggplot(aes(x = AVG_MATH_4_SCORE, y = AVG_TOTAL_REVENUE_STUDENT)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 42 rows containing non-finite values (stat_smooth).
## Warning: Removed 42 rows containing missing values (geom_point).

# Build TARGET_SCORE as the row-wise mean of the four assessment columns,
# ignoring missing entries within a row, and attach it to df.
scores <- dplyr::select(
  df,
  AVG_MATH_4_SCORE,
  AVG_MATH_8_SCORE,
  AVG_READING_4_SCORE,
  AVG_READING_8_SCORE
)
df <- transform(df, TARGET_SCORE = rowMeans(scores, na.rm = TRUE))

# Per-student revenue against TARGET_SCORE with a fitted linear trend line,
# restricted to rows that have a TARGET_SCORE and positive TOTAL_REVENUE.
rev <- df %>%
  drop_na(TARGET_SCORE) %>%
  filter(TOTAL_REVENUE > 0)
rev %>%
  ggplot(aes(x = TARGET_SCORE, y = AVG_TOTAL_REVENUE_STUDENT)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Avg Rev Per Student vs Target Score",
    x = "Target Score",
    y = "Avg Revenue per Student"
  )
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 42 rows containing non-finite values (stat_smooth).
## Warning: Removed 42 rows containing missing values (geom_point).

Model
# Linear regression of TARGET_SCORE on the per-student revenue components and
# per-student expenditure, fitted on df; lm() drops incomplete rows itself.
# NOTE(review): in the summary echoed below, the four revenue terms share the
# same estimate magnitude (1.745e+06) and the same standard error, which
# suggests AVG_TOTAL_REVENUE_STUDENT may be (nearly) the sum of the state,
# local and federal terms -- confirm, and consider dropping one of them.
model <- lm(
  formula = TARGET_SCORE ~ AVG_TOTAL_REVENUE_STUDENT +
    AVG_STATE_REVENUE_STUDENT +
    AVG_LOCAL_REVENUE_STUDENT +
    AVG_FEDERAL_REVENUE_STUDENT +
    AVG_EXPENDITURE_STUDENT,
  data = df
)
summary(model)
##
## Call:
## lm(formula = TARGET_SCORE ~ AVG_TOTAL_REVENUE_STUDENT + AVG_STATE_REVENUE_STUDENT +
## AVG_LOCAL_REVENUE_STUDENT + AVG_FEDERAL_REVENUE_STUDENT +
## AVG_EXPENDITURE_STUDENT, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.957 -4.185 2.453 7.536 18.734
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.269e+02 1.493e+00 151.916 <2e-16 ***
## AVG_TOTAL_REVENUE_STUDENT 1.745e+06 1.382e+06 1.263 0.2072
## AVG_STATE_REVENUE_STUDENT -1.745e+06 1.382e+06 -1.263 0.2072
## AVG_LOCAL_REVENUE_STUDENT -1.745e+06 1.382e+06 -1.263 0.2072
## AVG_FEDERAL_REVENUE_STUDENT -1.745e+06 1.382e+06 -1.263 0.2072
## AVG_EXPENDITURE_STUDENT 1.653e+00 9.743e-01 1.696 0.0904 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.17 on 516 degrees of freedom
## (1193 observations deleted due to missingness)
## Multiple R-squared: 0.2755, Adjusted R-squared: 0.2685
## F-statistic: 39.25 on 5 and 516 DF, p-value: < 2.2e-16
# Simple linear regression of TARGET_SCORE on per-student total revenue.
# Rows without a TARGET_SCORE are removed up front; rows missing the
# predictor are still dropped by lm() itself. Overwrites the previous `model`.
model <- lm(
  formula = TARGET_SCORE ~ AVG_TOTAL_REVENUE_STUDENT,
  data = df %>% drop_na(TARGET_SCORE)
)
summary(model)
##
## Call:
## lm(formula = TARGET_SCORE ~ AVG_TOTAL_REVENUE_STUDENT, data = df %>%
## drop_na(TARGET_SCORE))
##
## Residuals:
## Min 1Q Median 3Q Max
## -64.096 -3.400 2.599 7.520 17.979
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 229.055 1.435 159.65 <2e-16 ***
## AVG_TOTAL_REVENUE_STUDENT 1.540 0.126 12.21 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.53 on 520 degrees of freedom
## (210 observations deleted due to missingness)
## Multiple R-squared: 0.223, Adjusted R-squared: 0.2215
## F-statistic: 149.2 on 1 and 520 DF, p-value: < 2.2e-16