library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
district_data <- read_excel("district.xls")
library(dplyr)
district_data <- district_data %>% rename(Four_Year_Grad_Rate_Class_2021 = DAGC4X21R )
district_data <- district_data %>% rename (Number_of_Students_Per_Teacher = DPSTKIDR)
district_data <- district_data %>% rename (Percentage_African_American_Students = DPETBLAP)
district_data <- district_data %>% rename (Percentage_White_Students = DPETWHIP)
district_data <- district_data %>% rename (Perecentage_Hispanic_Students = DPETHISP)
district_data <- district_data %>% rename (Spending_Per_Pupil = DPFEAOPFK)
district_data <- district_data %>% rename (Revenue_Per_Pupil = DPFRAALLK)
District_Data_Frame <- district_data %>% dplyr:: select(Four_Year_Grad_Rate_Class_2021, Number_of_Students_Per_Teacher, Percentage_African_American_Students, Percentage_White_Students, Perecentage_Hispanic_Students, Spending_Per_Pupil, Revenue_Per_Pupil)
District_Data_Frame_Clean <- District_Data_Frame |> drop_na()
Graduation_Rate_Model<-lm(Four_Year_Grad_Rate_Class_2021 ~ Spending_Per_Pupil + Number_of_Students_Per_Teacher + Percentage_African_American_Students + Percentage_White_Students + Perecentage_Hispanic_Students + Revenue_Per_Pupil, data = District_Data_Frame_Clean)
# Linearity (plot and raintest)
library(tidyverse)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
plot(Graduation_Rate_Model, which =1)

raintest (Graduation_Rate_Model) #Low p-value so data is nonlinear
##
## Rainbow test
##
## data: Graduation_Rate_Model
## Rain = 1.2889, df1 = 536, df2 = 529, p-value = 0.001738
#Independence of errors (durbin-watson)
library (car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
durbinWatsonTest(Graduation_Rate_Model)
## lag Autocorrelation D-W Statistic p-value
## 1 0.1033774 1.792995 0.008
## Alternative hypothesis: rho != 0
#The p-value is below than .05 which means the errors are not independent of each other
#Homoscedasticity (plot, bptest)
plot(Graduation_Rate_Model, which =3)

bptest(Graduation_Rate_Model) #The p-value is less than .05, so we can reject homoscedasticity and assume the model is heteroscedastic
##
## studentized Breusch-Pagan test
##
## data: Graduation_Rate_Model
## BP = 89.863, df = 6, p-value < 2.2e-16
# Normality of residuals (QQ plot, shapiro test)
plot(Graduation_Rate_Model, which=2)

shapiro.test(Graduation_Rate_Model$residuals)
##
## Shapiro-Wilk normality test
##
## data: Graduation_Rate_Model$residuals
## W = 0.50574, p-value < 2.2e-16
#The p-value is actually below .05 which signifies that the residuals are significantly different from a normal distribution
# No multicolinarity (VIF, cor)
vif(Graduation_Rate_Model)
## Spending_Per_Pupil Number_of_Students_Per_Teacher
## 2.666137 1.676606
## Percentage_African_American_Students Percentage_White_Students
## 8.162878 33.895643
## Perecentage_Hispanic_Students Revenue_Per_Pupil
## 30.585435 2.183049
Graduation_Rate_Model_vars<-District_Data_Frame_Clean %>% dplyr::select(Four_Year_Grad_Rate_Class_2021,Revenue_Per_Pupil, Number_of_Students_Per_Teacher, Percentage_African_American_Students, Percentage_White_Students, Perecentage_Hispanic_Students, Spending_Per_Pupil)
cor(Graduation_Rate_Model_vars)
## Four_Year_Grad_Rate_Class_2021
## Four_Year_Grad_Rate_Class_2021 1.00000000
## Revenue_Per_Pupil -0.07476848
## Number_of_Students_Per_Teacher 0.01047627
## Percentage_African_American_Students -0.13342630
## Percentage_White_Students 0.19791827
## Perecentage_Hispanic_Students -0.14612076
## Spending_Per_Pupil -0.19246550
## Revenue_Per_Pupil
## Four_Year_Grad_Rate_Class_2021 -0.07476848
## Revenue_Per_Pupil 1.00000000
## Number_of_Students_Per_Teacher -0.43468242
## Percentage_African_American_Students -0.12235867
## Percentage_White_Students 0.04852799
## Perecentage_Hispanic_Students 0.03851050
## Spending_Per_Pupil 0.73433239
## Number_of_Students_Per_Teacher
## Four_Year_Grad_Rate_Class_2021 0.01047627
## Revenue_Per_Pupil -0.43468242
## Number_of_Students_Per_Teacher 1.00000000
## Percentage_African_American_Students 0.22250311
## Percentage_White_Students -0.30131306
## Perecentage_Hispanic_Students 0.17657171
## Spending_Per_Pupil -0.55849652
## Percentage_African_American_Students
## Four_Year_Grad_Rate_Class_2021 -0.1334263
## Revenue_Per_Pupil -0.1223587
## Number_of_Students_Per_Teacher 0.2225031
## Percentage_African_American_Students 1.0000000
## Percentage_White_Students -0.3323749
## Perecentage_Hispanic_Students -0.1414734
## Spending_Per_Pupil -0.1108545
## Percentage_White_Students
## Four_Year_Grad_Rate_Class_2021 0.19791827
## Revenue_Per_Pupil 0.04852799
## Number_of_Students_Per_Teacher -0.30131306
## Percentage_African_American_Students -0.33237491
## Percentage_White_Students 1.00000000
## Perecentage_Hispanic_Students -0.87022674
## Spending_Per_Pupil 0.01594008
## Perecentage_Hispanic_Students
## Four_Year_Grad_Rate_Class_2021 -0.1461208
## Revenue_Per_Pupil 0.0385105
## Number_of_Students_Per_Teacher 0.1765717
## Percentage_African_American_Students -0.1414734
## Percentage_White_Students -0.8702267
## Perecentage_Hispanic_Students 1.0000000
## Spending_Per_Pupil 0.0741356
## Spending_Per_Pupil
## Four_Year_Grad_Rate_Class_2021 -0.19246550
## Revenue_Per_Pupil 0.73433239
## Number_of_Students_Per_Teacher -0.55849652
## Percentage_African_American_Students -0.11085449
## Percentage_White_Students 0.01594008
## Perecentage_Hispanic_Students 0.07413560
## Spending_Per_Pupil 1.00000000
#Cleaning district data for negative values or zeros
District_Data_Frame_Clean_2<- District_Data_Frame_Clean |> filter(Number_of_Students_Per_Teacher > 0, )
District_Data_Frame_Clean_2 <- District_Data_Frame_Clean |> filter(if_all(everything(), ~ .x >= 0))
District_Data_Frame_Clean_3 <- District_Data_Frame_Clean_2 |>
filter(Four_Year_Grad_Rate_Class_2021 > 0)
#Linear model log transformation
Graduation_Rate_Model_Log<-lm(log(Four_Year_Grad_Rate_Class_2021) ~ log (Spending_Per_Pupil) + Percentage_African_American_Students + Percentage_White_Students + Perecentage_Hispanic_Students + Number_of_Students_Per_Teacher + Revenue_Per_Pupil, data = District_Data_Frame_Clean_3)
# Mitigate Linearity
plot(Graduation_Rate_Model_Log, which =1)

raintest (Graduation_Rate_Model_Log)
##
## Rainbow test
##
## data: Graduation_Rate_Model_Log
## Rain = 3.0253, df1 = 529, df2 = 522, p-value < 2.2e-16
# Mitigate Independence of errors
Graduation_Rate_Model_Robust<-rlm(Four_Year_Grad_Rate_Class_2021~ Spending_Per_Pupil+ Percentage_African_American_Students + Percentage_White_Students + Perecentage_Hispanic_Students + Number_of_Students_Per_Teacher + Revenue_Per_Pupil,data=District_Data_Frame_Clean_3)
summary(Graduation_Rate_Model)
##
## Call:
## lm(formula = Four_Year_Grad_Rate_Class_2021 ~ Spending_Per_Pupil +
## Number_of_Students_Per_Teacher + Percentage_African_American_Students +
## Percentage_White_Students + Perecentage_Hispanic_Students +
## Revenue_Per_Pupil, data = District_Data_Frame_Clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96.165 -1.361 1.763 4.473 59.102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.038e+02 8.219e+00 12.635 < 2e-16 ***
## Spending_Per_Pupil -1.060e-03 1.562e-04 -6.786 1.91e-11 ***
## Number_of_Students_Per_Teacher -2.304e-01 1.715e-01 -1.343 0.17950
## Percentage_African_American_Students -9.754e-02 9.288e-02 -1.050 0.29389
## Percentage_White_Students 7.767e-02 8.109e-02 0.958 0.33833
## Perecentage_Hispanic_Students 5.365e-03 7.707e-02 0.070 0.94452
## Revenue_Per_Pupil 2.275e-04 8.072e-05 2.818 0.00492 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.21 on 1065 degrees of freedom
## Multiple R-squared: 0.09478, Adjusted R-squared: 0.08968
## F-statistic: 18.58 on 6 and 1065 DF, p-value: < 2.2e-16
summary(Graduation_Rate_Model_Robust)
##
## Call: rlm(formula = Four_Year_Grad_Rate_Class_2021 ~ Spending_Per_Pupil +
## Percentage_African_American_Students + Percentage_White_Students +
## Perecentage_Hispanic_Students + Number_of_Students_Per_Teacher +
## Revenue_Per_Pupil, data = District_Data_Frame_Clean_3)
## Residuals:
## Min 1Q Median 3Q Max
## -78.2809 -2.4352 0.6772 2.2072 8.7657
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 101.5709 2.5595 39.6845
## Spending_Per_Pupil 0.0000 0.0000 -0.9378
## Percentage_African_American_Students -0.0722 0.0279 -2.5919
## Percentage_White_Students 0.0117 0.0246 0.4745
## Perecentage_Hispanic_Students -0.0336 0.0233 -1.4442
## Number_of_Students_Per_Teacher -0.3204 0.0550 -5.8296
## Revenue_Per_Pupil 0.0000 0.0000 1.9292
##
## Residual standard error: 3.315 on 1051 degrees of freedom
# Residual standard error decreased from 12.21 to 3.31 and Spending_per_pupil became not significant after robust regression model
# Mitigating Homoscedasticity
bptest(Graduation_Rate_Model_Log)
##
## studentized Breusch-Pagan test
##
## data: Graduation_Rate_Model_Log
## BP = 35.924, df = 6, p-value = 2.853e-06
#The p-value still reports a number less than . 05, so it continues to fail the assumption of homoscedasticity.
# Mitigate Normality of Residuals
plot(Graduation_Rate_Model_Log, which=2)

shapiro.test(Graduation_Rate_Model_Log$residuals) #The P-value still remains below .05, staying as a non-normal distribution
##
## Shapiro-Wilk normality test
##
## data: Graduation_Rate_Model_Log$residuals
## W = 0.39372, p-value < 2.2e-16
# Mitigating for no multicollinearity
Graduation_Rate_Model_Altered<-lm(Four_Year_Grad_Rate_Class_2021 ~ Revenue_Per_Pupil + Number_of_Students_Per_Teacher + Percentage_African_American_Students + Percentage_White_Students + Spending_Per_Pupil, data = District_Data_Frame_Clean_3)
vif(Graduation_Rate_Model_Altered)
## Revenue_Per_Pupil Number_of_Students_Per_Teacher
## 2.096428 1.759341
## Percentage_African_American_Students Percentage_White_Students
## 1.155636 1.266683
## Spending_Per_Pupil
## 2.591566
#Descriptive Statistics
head(District_Data_Frame_Clean_3)
## # A tibble: 6 × 7
## Four_Year_Grad_Rate_Class_2021 Number_of_Students_Per…¹ Percentage_African_A…²
## <dbl> <dbl> <dbl>
## 1 100 12.3 4.4
## 2 100 11 4
## 3 95.2 10.8 8.5
## 4 95.8 11.3 8.2
## 5 99 12.9 25.1
## 6 97.8 11 19.7
## # ℹ abbreviated names: ¹Number_of_Students_Per_Teacher,
## # ²Percentage_African_American_Students
## # ℹ 4 more variables: Percentage_White_Students <dbl>,
## # Perecentage_Hispanic_Students <dbl>, Spending_Per_Pupil <dbl>,
## # Revenue_Per_Pupil <dbl>
#Descriptive Statistics on dataset
summary(District_Data_Frame_Clean_3)
## Four_Year_Grad_Rate_Class_2021 Number_of_Students_Per_Teacher
## Min. : 13.10 Min. : 3.60
## 1st Qu.: 93.30 1st Qu.:10.80
## Median : 97.00 Median :12.75
## Mean : 94.89 Mean :12.57
## 3rd Qu.:100.00 3rd Qu.:14.30
## Max. :100.00 Max. :37.30
## Percentage_African_American_Students Percentage_White_Students
## Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.700 1st Qu.:22.12
## Median : 2.600 Median :47.00
## Mean : 7.392 Mean :44.87
## 3rd Qu.: 9.800 3rd Qu.:68.38
## Max. :96.500 Max. :97.10
## Perecentage_Hispanic_Students Spending_Per_Pupil Revenue_Per_Pupil
## Min. : 2.70 Min. : 7044 Min. : 8923
## 1st Qu.: 20.93 1st Qu.:10976 1st Qu.:13086
## Median : 37.70 Median :12218 Median :14660
## Mean : 43.09 Mean :12881 Mean :16151
## 3rd Qu.: 60.90 3rd Qu.:13830 3rd Qu.:16986
## Max. :100.00 Max. :82548 Max. :97865
#Avg grad rate is 94.89%, Min 13% and max 100%, 1058 observations or school districts in the data set after being cleaned of zeros, negative values, and na's.
#Avg Spending per pupil is $12, 881, with the minimum at $7,044 and the maximum at $82,548.
#Avg Revenue Per Pupil is $16, 151, with the minimum at $8,923 and the maximum at $97,865
summary (Graduation_Rate_Model)
##
## Call:
## lm(formula = Four_Year_Grad_Rate_Class_2021 ~ Spending_Per_Pupil +
## Number_of_Students_Per_Teacher + Percentage_African_American_Students +
## Percentage_White_Students + Perecentage_Hispanic_Students +
## Revenue_Per_Pupil, data = District_Data_Frame_Clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96.165 -1.361 1.763 4.473 59.102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.038e+02 8.219e+00 12.635 < 2e-16 ***
## Spending_Per_Pupil -1.060e-03 1.562e-04 -6.786 1.91e-11 ***
## Number_of_Students_Per_Teacher -2.304e-01 1.715e-01 -1.343 0.17950
## Percentage_African_American_Students -9.754e-02 9.288e-02 -1.050 0.29389
## Percentage_White_Students 7.767e-02 8.109e-02 0.958 0.33833
## Perecentage_Hispanic_Students 5.365e-03 7.707e-02 0.070 0.94452
## Revenue_Per_Pupil 2.275e-04 8.072e-05 2.818 0.00492 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.21 on 1065 degrees of freedom
## Multiple R-squared: 0.09478, Adjusted R-squared: 0.08968
## F-statistic: 18.58 on 6 and 1065 DF, p-value: < 2.2e-16
# The overall model explains only 9% of the dependent variable, Four year Graduation Rate as indicated by the r-squared of .089.
# P-value is small, the probability of this happening because of chance is 0.
# The significant variables are Spending per pupil, and Revenue per pupil.
# As spending per pupil increases by 1, four year graduation rate decreases by an estimate of 1.060e-03%
# As revenue per pupil increases by 1, Graduation rate increases by an estimate of 2.275e-04% .
#Spending per pupil has a statistically stronger relationship with Graduation Rate than Revenue per pupil, but the impact of Revenue per pupil is positive and more substantial.
#Race and number of students per teacher appear to be statistically insignificant when it comes to predicting graduation rate for the class of 2021.