library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
# Read the 2022-23 Public School Characteristics CSV into a tibble.
Public_School_Characteristics_2022_23 <- read_csv(
  file = "Public_School_Characteristics_2022-23.csv"
)
## Rows: 101390 Columns: 77
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (23): NCESSCH, SURVYEAR, STABR, LEAID, ST_LEAID, LEA_NAME, SCH_NAME, LST...
## dbl (54): X, Y, OBJECTID, STATUS, TOTFRL, FRELCH, REDLCH, DIRECTCERT, PK, KG...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Baseline OLS model of STUTERATIO on TOTFRL, ULOCALE, WH and HI.
psc_model <- lm(
  formula = STUTERATIO ~ TOTFRL + ULOCALE + WH + HI,
  data = Public_School_Characteristics_2022_23
)

# Rainbow test (lmtest) of the linearity assumption for the fitted model.
raintest(formula = psc_model)
## 
##  Rainbow test
## 
## data:  psc_model
## Rain = 0.27396, df1 = 48550, df2 = 48534, p-value = 1
# Default set of diagnostic plots for the fitted lm object.
plot(x = psc_model)

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
# Durbin-Watson test for first-order autocorrelation in the residuals.
# car::durbinWatsonTest() bootstraps its p-value by default (simulate = TRUE),
# so the reported p-value varies from run to run; fixing the RNG seed makes
# the result reproducible.
set.seed(2023)
durbinWatsonTest(model = psc_model)
##  lag Autocorrelation D-W Statistic p-value
##    1      0.05232374      1.895352   0.002
##  Alternative hypothesis: rho != 0
# Studentized Breusch-Pagan test (lmtest) for heteroscedasticity.
bptest(formula = psc_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  psc_model
## BP = 39.854, df = 14, p-value = 0.0002689
# Normal Q-Q plot of the residuals (lm diagnostic plot number 2).
plot(x = psc_model, which = 2)

The Shapiro-Wilk test could not be run because the data are a national data set containing 101,390 observations, which exceeds the 5,000-observation limit of R's `shapiro.test()`.

# (Generalized) variance inflation factors for each term in the model.
vif(mod = psc_model)
##             GVIF Df GVIF^(1/(2*Df))
## TOTFRL  2.212154  1        1.487331
## ULOCALE 1.198458 11        1.008263
## WH      1.137611  1        1.066589
## HI      2.199997  1        1.483239

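Does the model meet the following assumptions? Linearity: the rainbow test p-value of 1 means the null hypothesis of linearity cannot be rejected, so the test gives no evidence that the relationship is non-linear. Independence of errors: the Durbin-Watson p-value of 0.002 means the null hypothesis of no autocorrelation is rejected, so the errors are not independent. Homoscedasticity: the Breusch-Pagan p-value of 0.0002689 means the null hypothesis can be rejected, and the model is heteroscedastic. Multicollinearity: the value of 11 reported for ULOCALE is its degrees of freedom (Df), not its VIF; its GVIF is 1.198 (GVIF^(1/(2*Df)) = 1.008), and no term has a GVIF above about 2.2, so the assumption of no multicollinearity is not violated.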

Mitigation:

Independence of errors:

# Robust linear model (MASS::rlm) using the same formula as the OLS fit.
psc_model_rlm <- rlm(
  STUTERATIO ~ TOTFRL + ULOCALE + WH + HI,
  data = Public_School_Characteristics_2022_23
)
summary(object = psc_model_rlm)
## 
## Call: rlm(formula = STUTERATIO ~ TOTFRL + ULOCALE + WH + HI, data = Public_School_Characteristics_2022_23)
## Residuals:
##       Min        1Q    Median        3Q       Max 
##  -54.4038   -2.5774   -0.1913    2.6152 3587.4892 
## 
## Coefficients:
##                            Value    Std. Error t value 
## (Intercept)                 12.6845   0.0402   315.8870
## TOTFRL                       0.0026   0.0001    35.7382
## ULOCALE12-City: Mid-size     0.5276   0.0678     7.7863
## ULOCALE13-City: Small        0.0099   0.0663     0.1497
## ULOCALE21-Suburb: Large      0.0652   0.0464     1.4075
## ULOCALE22-Suburb: Mid-size   0.2491   0.0857     2.9050
## ULOCALE23-Suburb: Small      0.4990   0.1059     4.7140
## ULOCALE31-Town: Fringe       0.3350   0.0900     3.7245
## ULOCALE32-Town: Distant     -0.0958   0.0684    -1.4012
## ULOCALE33-Town: Remote      -0.2968   0.0792    -3.7461
## ULOCALE41-Rural: Fringe     -0.1111   0.0561    -1.9795
## ULOCALE42-Rural: Distant    -0.8165   0.0585   -13.9505
## ULOCALE43-Rural: Remote     -2.1092   0.0674   -31.3009
## WH                           0.0039   0.0001    69.2779
## HI                           0.0031   0.0001    36.5255
## 
## Residual standard error: 3.846 on 97084 degrees of freedom
##   (4291 observations deleted due to missingness)

Multicollinearity:

# Refit the OLS model without ULOCALE and recompute the VIFs.
psc_model_sans_ULOCALE <- lm(
  formula = STUTERATIO ~ TOTFRL + WH + HI,
  data = Public_School_Characteristics_2022_23
)
vif(mod = psc_model_sans_ULOCALE)
##   TOTFRL       WH       HI 
## 2.180908 1.047477 2.115602