library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
district_data <- read_excel("district.xls")
library(dplyr)
district_data <- district_data %>% rename(Four_Year_Grad_Rate_Class_2021 = DAGC4X21R )
district_data <- district_data %>% rename (Number_of_Students_Per_Teacher = DPSTKIDR)
district_data <- district_data %>% rename (Percentage_African_American_Students = DPETBLAP)
district_data <- district_data %>% rename (Percentage_White_Students = DPETWHIP)
district_data <- district_data %>% rename (Perecentage_Hispanic_Students = DPETHISP)
district_data <- district_data %>% rename (Spending_Per_Pupil = DPFEAOPFK)
district_data <- district_data %>% rename (Revenue_Per_Pupil = DPFRAALLK)
District_Data_Frame <- district_data %>% dplyr:: select(Four_Year_Grad_Rate_Class_2021, Number_of_Students_Per_Teacher, Percentage_African_American_Students, Percentage_White_Students, Perecentage_Hispanic_Students, Spending_Per_Pupil, Revenue_Per_Pupil)

District_Data_Frame_Clean <- District_Data_Frame |> drop_na()
Graduation_Rate_Model<-lm(Four_Year_Grad_Rate_Class_2021 ~ Spending_Per_Pupil + Number_of_Students_Per_Teacher + Percentage_African_American_Students + Percentage_White_Students + Perecentage_Hispanic_Students + Revenue_Per_Pupil, data = District_Data_Frame_Clean)
# Linearity (plot and raintest)
library(tidyverse)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
plot(Graduation_Rate_Model, which =1)

raintest (Graduation_Rate_Model) #Low p-value so data is nonlinear
## 
##  Rainbow test
## 
## data:  Graduation_Rate_Model
## Rain = 1.2889, df1 = 536, df2 = 529, p-value = 0.001738
#Independence of errors (durbin-watson)
library (car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
durbinWatsonTest(Graduation_Rate_Model)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.1033774      1.792995   0.008
##  Alternative hypothesis: rho != 0
#The p-value is below than .05 which means the errors are not independent of each other
#Homoscedasticity (plot, bptest)
plot(Graduation_Rate_Model, which =3)

bptest(Graduation_Rate_Model) #The p-value is less than .05, so we can reject homoscedasticity and assume the model is heteroscedastic
## 
##  studentized Breusch-Pagan test
## 
## data:  Graduation_Rate_Model
## BP = 89.863, df = 6, p-value < 2.2e-16
# Normality of residuals (QQ plot, shapiro test)
plot(Graduation_Rate_Model, which=2)

shapiro.test(Graduation_Rate_Model$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  Graduation_Rate_Model$residuals
## W = 0.50574, p-value < 2.2e-16
#The p-value is actually below .05 which signifies that the residuals are significantly different from a normal distribution
# No multicolinarity (VIF, cor)
vif(Graduation_Rate_Model)
##                   Spending_Per_Pupil       Number_of_Students_Per_Teacher 
##                             2.666137                             1.676606 
## Percentage_African_American_Students            Percentage_White_Students 
##                             8.162878                            33.895643 
##        Perecentage_Hispanic_Students                    Revenue_Per_Pupil 
##                            30.585435                             2.183049
Graduation_Rate_Model_vars<-District_Data_Frame_Clean %>% dplyr::select(Four_Year_Grad_Rate_Class_2021,Revenue_Per_Pupil, Number_of_Students_Per_Teacher, Percentage_African_American_Students, Percentage_White_Students, Perecentage_Hispanic_Students, Spending_Per_Pupil)
cor(Graduation_Rate_Model_vars)
##                                      Four_Year_Grad_Rate_Class_2021
## Four_Year_Grad_Rate_Class_2021                           1.00000000
## Revenue_Per_Pupil                                       -0.07476848
## Number_of_Students_Per_Teacher                           0.01047627
## Percentage_African_American_Students                    -0.13342630
## Percentage_White_Students                                0.19791827
## Perecentage_Hispanic_Students                           -0.14612076
## Spending_Per_Pupil                                      -0.19246550
##                                      Revenue_Per_Pupil
## Four_Year_Grad_Rate_Class_2021             -0.07476848
## Revenue_Per_Pupil                           1.00000000
## Number_of_Students_Per_Teacher             -0.43468242
## Percentage_African_American_Students       -0.12235867
## Percentage_White_Students                   0.04852799
## Perecentage_Hispanic_Students               0.03851050
## Spending_Per_Pupil                          0.73433239
##                                      Number_of_Students_Per_Teacher
## Four_Year_Grad_Rate_Class_2021                           0.01047627
## Revenue_Per_Pupil                                       -0.43468242
## Number_of_Students_Per_Teacher                           1.00000000
## Percentage_African_American_Students                     0.22250311
## Percentage_White_Students                               -0.30131306
## Perecentage_Hispanic_Students                            0.17657171
## Spending_Per_Pupil                                      -0.55849652
##                                      Percentage_African_American_Students
## Four_Year_Grad_Rate_Class_2021                                 -0.1334263
## Revenue_Per_Pupil                                              -0.1223587
## Number_of_Students_Per_Teacher                                  0.2225031
## Percentage_African_American_Students                            1.0000000
## Percentage_White_Students                                      -0.3323749
## Perecentage_Hispanic_Students                                  -0.1414734
## Spending_Per_Pupil                                             -0.1108545
##                                      Percentage_White_Students
## Four_Year_Grad_Rate_Class_2021                      0.19791827
## Revenue_Per_Pupil                                   0.04852799
## Number_of_Students_Per_Teacher                     -0.30131306
## Percentage_African_American_Students               -0.33237491
## Percentage_White_Students                           1.00000000
## Perecentage_Hispanic_Students                      -0.87022674
## Spending_Per_Pupil                                  0.01594008
##                                      Perecentage_Hispanic_Students
## Four_Year_Grad_Rate_Class_2021                          -0.1461208
## Revenue_Per_Pupil                                        0.0385105
## Number_of_Students_Per_Teacher                           0.1765717
## Percentage_African_American_Students                    -0.1414734
## Percentage_White_Students                               -0.8702267
## Perecentage_Hispanic_Students                            1.0000000
## Spending_Per_Pupil                                       0.0741356
##                                      Spending_Per_Pupil
## Four_Year_Grad_Rate_Class_2021              -0.19246550
## Revenue_Per_Pupil                            0.73433239
## Number_of_Students_Per_Teacher              -0.55849652
## Percentage_African_American_Students        -0.11085449
## Percentage_White_Students                    0.01594008
## Perecentage_Hispanic_Students                0.07413560
## Spending_Per_Pupil                           1.00000000
#Cleaning district data for negative values or zeros
District_Data_Frame_Clean_2<- District_Data_Frame_Clean |> filter(Number_of_Students_Per_Teacher > 0, )
District_Data_Frame_Clean_2 <- District_Data_Frame_Clean |> filter(if_all(everything(), ~ .x >= 0))
District_Data_Frame_Clean_3 <- District_Data_Frame_Clean_2 |>
  filter(Four_Year_Grad_Rate_Class_2021 > 0)
#Linear model log transformation
Graduation_Rate_Model_Log<-lm(log(Four_Year_Grad_Rate_Class_2021) ~ log (Spending_Per_Pupil) + Percentage_African_American_Students + Percentage_White_Students + Perecentage_Hispanic_Students + Number_of_Students_Per_Teacher + Revenue_Per_Pupil, data = District_Data_Frame_Clean_3)
# Mitigate Linearity
plot(Graduation_Rate_Model_Log, which =1)

raintest (Graduation_Rate_Model_Log)
## 
##  Rainbow test
## 
## data:  Graduation_Rate_Model_Log
## Rain = 3.0253, df1 = 529, df2 = 522, p-value < 2.2e-16
# Mitigate Independence of errors 
Graduation_Rate_Model_Robust<-rlm(Four_Year_Grad_Rate_Class_2021~ Spending_Per_Pupil+ Percentage_African_American_Students + Percentage_White_Students + Perecentage_Hispanic_Students + Number_of_Students_Per_Teacher + Revenue_Per_Pupil,data=District_Data_Frame_Clean_3)


summary(Graduation_Rate_Model)
## 
## Call:
## lm(formula = Four_Year_Grad_Rate_Class_2021 ~ Spending_Per_Pupil + 
##     Number_of_Students_Per_Teacher + Percentage_African_American_Students + 
##     Percentage_White_Students + Perecentage_Hispanic_Students + 
##     Revenue_Per_Pupil, data = District_Data_Frame_Clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -96.165  -1.361   1.763   4.473  59.102 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                           1.038e+02  8.219e+00  12.635  < 2e-16 ***
## Spending_Per_Pupil                   -1.060e-03  1.562e-04  -6.786 1.91e-11 ***
## Number_of_Students_Per_Teacher       -2.304e-01  1.715e-01  -1.343  0.17950    
## Percentage_African_American_Students -9.754e-02  9.288e-02  -1.050  0.29389    
## Percentage_White_Students             7.767e-02  8.109e-02   0.958  0.33833    
## Perecentage_Hispanic_Students         5.365e-03  7.707e-02   0.070  0.94452    
## Revenue_Per_Pupil                     2.275e-04  8.072e-05   2.818  0.00492 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.21 on 1065 degrees of freedom
## Multiple R-squared:  0.09478,    Adjusted R-squared:  0.08968 
## F-statistic: 18.58 on 6 and 1065 DF,  p-value: < 2.2e-16
summary(Graduation_Rate_Model_Robust)
## 
## Call: rlm(formula = Four_Year_Grad_Rate_Class_2021 ~ Spending_Per_Pupil + 
##     Percentage_African_American_Students + Percentage_White_Students + 
##     Perecentage_Hispanic_Students + Number_of_Students_Per_Teacher + 
##     Revenue_Per_Pupil, data = District_Data_Frame_Clean_3)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -78.2809  -2.4352   0.6772   2.2072   8.7657 
## 
## Coefficients:
##                                      Value    Std. Error t value 
## (Intercept)                          101.5709   2.5595    39.6845
## Spending_Per_Pupil                     0.0000   0.0000    -0.9378
## Percentage_African_American_Students  -0.0722   0.0279    -2.5919
## Percentage_White_Students              0.0117   0.0246     0.4745
## Perecentage_Hispanic_Students         -0.0336   0.0233    -1.4442
## Number_of_Students_Per_Teacher        -0.3204   0.0550    -5.8296
## Revenue_Per_Pupil                      0.0000   0.0000     1.9292
## 
## Residual standard error: 3.315 on 1051 degrees of freedom
# Residual standard error decreased from 12.21 to 3.31 and Spending_per_pupil became not significant after robust regression model
# Mitigating Homoscedasticity
bptest(Graduation_Rate_Model_Log)
## 
##  studentized Breusch-Pagan test
## 
## data:  Graduation_Rate_Model_Log
## BP = 35.924, df = 6, p-value = 2.853e-06
#The p-value still reports a number less than . 05, so it continues to fail the assumption of homoscedasticity.
# Mitigate Normality of Residuals
plot(Graduation_Rate_Model_Log, which=2)

shapiro.test(Graduation_Rate_Model_Log$residuals) #The P-value still remains below .05, staying as a non-normal distribution
## 
##  Shapiro-Wilk normality test
## 
## data:  Graduation_Rate_Model_Log$residuals
## W = 0.39372, p-value < 2.2e-16
# Mitigating for no multicollinearity 
Graduation_Rate_Model_Altered<-lm(Four_Year_Grad_Rate_Class_2021 ~ Revenue_Per_Pupil + Number_of_Students_Per_Teacher + Percentage_African_American_Students + Percentage_White_Students + Spending_Per_Pupil, data = District_Data_Frame_Clean_3)
vif(Graduation_Rate_Model_Altered)
##                    Revenue_Per_Pupil       Number_of_Students_Per_Teacher 
##                             2.096428                             1.759341 
## Percentage_African_American_Students            Percentage_White_Students 
##                             1.155636                             1.266683 
##                   Spending_Per_Pupil 
##                             2.591566
#Descriptive Statistics
head(District_Data_Frame_Clean_3)
## # A tibble: 6 × 7
##   Four_Year_Grad_Rate_Class_2021 Number_of_Students_Per…¹ Percentage_African_A…²
##                            <dbl>                    <dbl>                  <dbl>
## 1                          100                       12.3                    4.4
## 2                          100                       11                      4  
## 3                           95.2                     10.8                    8.5
## 4                           95.8                     11.3                    8.2
## 5                           99                       12.9                   25.1
## 6                           97.8                     11                     19.7
## # ℹ abbreviated names: ¹​Number_of_Students_Per_Teacher,
## #   ²​Percentage_African_American_Students
## # ℹ 4 more variables: Percentage_White_Students <dbl>,
## #   Perecentage_Hispanic_Students <dbl>, Spending_Per_Pupil <dbl>,
## #   Revenue_Per_Pupil <dbl>
#Descriptive Statistics on dataset
summary(District_Data_Frame_Clean_3)
##  Four_Year_Grad_Rate_Class_2021 Number_of_Students_Per_Teacher
##  Min.   : 13.10                 Min.   : 3.60                 
##  1st Qu.: 93.30                 1st Qu.:10.80                 
##  Median : 97.00                 Median :12.75                 
##  Mean   : 94.89                 Mean   :12.57                 
##  3rd Qu.:100.00                 3rd Qu.:14.30                 
##  Max.   :100.00                 Max.   :37.30                 
##  Percentage_African_American_Students Percentage_White_Students
##  Min.   : 0.000                       Min.   : 0.00            
##  1st Qu.: 0.700                       1st Qu.:22.12            
##  Median : 2.600                       Median :47.00            
##  Mean   : 7.392                       Mean   :44.87            
##  3rd Qu.: 9.800                       3rd Qu.:68.38            
##  Max.   :96.500                       Max.   :97.10            
##  Perecentage_Hispanic_Students Spending_Per_Pupil Revenue_Per_Pupil
##  Min.   :  2.70                Min.   : 7044      Min.   : 8923    
##  1st Qu.: 20.93                1st Qu.:10976      1st Qu.:13086    
##  Median : 37.70                Median :12218      Median :14660    
##  Mean   : 43.09                Mean   :12881      Mean   :16151    
##  3rd Qu.: 60.90                3rd Qu.:13830      3rd Qu.:16986    
##  Max.   :100.00                Max.   :82548      Max.   :97865
#Avg grad rate is 94.89%, Min 13% and max 100%, 1058 observations or school districts in the data set after being cleaned of zeros, negative values, and na's. 

#Avg Spending per pupil is $12, 881, with the minimum at $7,044 and the maximum at $82,548.

#Avg Revenue Per Pupil is $16, 151, with the minimum at $8,923 and the maximum at $97,865
summary (Graduation_Rate_Model)
## 
## Call:
## lm(formula = Four_Year_Grad_Rate_Class_2021 ~ Spending_Per_Pupil + 
##     Number_of_Students_Per_Teacher + Percentage_African_American_Students + 
##     Percentage_White_Students + Perecentage_Hispanic_Students + 
##     Revenue_Per_Pupil, data = District_Data_Frame_Clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -96.165  -1.361   1.763   4.473  59.102 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                           1.038e+02  8.219e+00  12.635  < 2e-16 ***
## Spending_Per_Pupil                   -1.060e-03  1.562e-04  -6.786 1.91e-11 ***
## Number_of_Students_Per_Teacher       -2.304e-01  1.715e-01  -1.343  0.17950    
## Percentage_African_American_Students -9.754e-02  9.288e-02  -1.050  0.29389    
## Percentage_White_Students             7.767e-02  8.109e-02   0.958  0.33833    
## Perecentage_Hispanic_Students         5.365e-03  7.707e-02   0.070  0.94452    
## Revenue_Per_Pupil                     2.275e-04  8.072e-05   2.818  0.00492 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.21 on 1065 degrees of freedom
## Multiple R-squared:  0.09478,    Adjusted R-squared:  0.08968 
## F-statistic: 18.58 on 6 and 1065 DF,  p-value: < 2.2e-16
# The overall model explains only 9% of the dependent variable, Four year Graduation Rate as indicated by the r-squared of .089.

# P-value is small, the probability of this happening because of chance is 0.

# The significant variables are Spending per pupil, and Revenue per pupil.

# As spending per pupil increases by 1, four year graduation rate decreases by an estimate of 1.060e-03%

# As revenue per pupil increases by 1, Graduation rate increases by an estimate of 2.275e-04% . 

#Spending per pupil has a statistically stronger relationship with Graduation Rate than Revenue per pupil, but the impact of Revenue per pupil is positive and more substantial.

#Race and number of students per teacher appear to be statistically insignificant when it comes to predicting graduation rate for the class of 2021.