library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
# Read the public school characteristics CSV (2022-23) from the working
# directory into a tibble; readr guesses the column types.
Public_School_Characteristics_2022_23 <- read_csv(
  file = "Public_School_Characteristics_2022-23.csv"
)
## Rows: 101390 Columns: 77
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (23): NCESSCH, SURVYEAR, STABR, LEAID, ST_LEAID, LEA_NAME, SCH_NAME, LST...
## dbl (54): X, Y, OBJECTID, STATUS, TOTFRL, FRELCH, REDLCH, DIRECTCERT, PK, KG...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Build the modelling table: keep only the three variables used by the models
# below and retain rows where every one of them compares greater than zero
# (rows where a comparison is NA are dropped by filter()).
# `dplyr::select` is namespace-qualified because MASS masks `select`.
# NOTE(review): ULOCALE appears to hold text labels (the fitted coefficients
# are named like `ULOCALE12-City: Mid-size`), so `ULOCALE > 0` is presumably a
# string comparison against "0" rather than a numeric one -- confirm that this
# drops exactly the rows intended.
final_PSC_data <- Public_School_Characteristics_2022_23 |>
  dplyr::select(TOTFRL, STUTERATIO, ULOCALE) |>
  filter(
    TOTFRL > 0,
    STUTERATIO > 0,
    ULOCALE > 0
  )
# Convert the locale column to a factor so the models treat it as categorical.
# The factor is kept in its own variable, checked, and then written back over
# the original column.
ULOCALE.factor <- factor(final_PSC_data[["ULOCALE"]])
inherits(ULOCALE.factor, "factor")
## [1] TRUE
final_PSC_data <- mutate(final_PSC_data, ULOCALE = ULOCALE.factor)
# Ordinary least squares: student-teacher ratio explained by locale (factor)
# and TOTFRL.
PSC_lm <- lm(
  formula = STUTERATIO ~ ULOCALE + TOTFRL,
  data = final_PSC_data
)

# Default set of lm diagnostic plots for the fitted model.
plot(PSC_lm)

# Durbin-Watson test for first-order autocorrelation in the OLS residuals.
# NOTE(review): the statistic is computed on residuals in data row order, and
# car's p-value is simulated by default (per the car documentation), so the
# p-value may vary between runs -- confirm both are acceptable here.
car::durbinWatsonTest(PSC_lm)
##  lag Autocorrelation D-W Statistic p-value
##    1      0.09346752      1.813062       0
##  Alternative hypothesis: rho != 0
# Studentized Breusch-Pagan test for heteroskedasticity of the OLS residuals.
lmtest::bptest(PSC_lm)
## 
##  studentized Breusch-Pagan test
## 
## data:  PSC_lm
## BP = 21.246, df = 12, p-value = 0.04689
# Redraw only the second lm diagnostic panel (the normal Q-Q plot of the
# residuals) to inspect normality on its own.
qq_panel <- 2
plot(PSC_lm, which = qq_panel)

# (Generalized) variance inflation factors to check the predictors for
# multicollinearity.
car::vif(PSC_lm)
##             GVIF Df GVIF^(1/(2*Df))
## ULOCALE 1.080477 11        1.003525
## TOTFRL  1.080477  1        1.039460
# Robust linear regression (MASS::rlm) on the same predictors as the OLS fit,
# followed by its coefficient summary.
psc_model_rlm <- rlm(
  STUTERATIO ~ TOTFRL + ULOCALE,
  data = final_PSC_data
)
summary(psc_model_rlm)
## 
## Call: rlm(formula = STUTERATIO ~ TOTFRL + ULOCALE, data = final_PSC_data)
## Residuals:
##      Min       1Q   Median       3Q      Max 
##  -30.762   -2.659   -0.296    2.674 1845.349 
## 
## Coefficients:
##                            Value    Std. Error t value 
## (Intercept)                 14.0086   0.0429   326.5148
## TOTFRL                       0.0037   0.0001    71.6560
## ULOCALE12-City: Mid-size     0.4766   0.0704     6.7668
## ULOCALE13-City: Small        0.2014   0.0688     2.9264
## ULOCALE21-Suburb: Large      0.4783   0.0471    10.1502
## ULOCALE22-Suburb: Mid-size   0.6554   0.0892     7.3447
## ULOCALE23-Suburb: Small      0.7408   0.1081     6.8502
## ULOCALE31-Town: Fringe       0.6280   0.0926     6.7839
## ULOCALE32-Town: Distant      0.0134   0.0707     0.1891
## ULOCALE33-Town: Remote      -0.2772   0.0829    -3.3430
## ULOCALE41-Rural: Fringe      0.2804   0.0569     4.9278
## ULOCALE42-Rural: Distant    -1.2123   0.0601   -20.1608
## ULOCALE43-Rural: Remote     -2.5890   0.0709   -36.5317
## 
## Residual standard error: 3.951 on 85050 degrees of freedom
# Natural log of the student-teacher ratio, stored as a standalone vector.
# NOTE(review): nothing in the code visible here uses this vector (the GLM
# below takes the raw STUTERATIO with a log link) -- confirm whether it is
# still needed.
STUTERATIO_log <- log(final_PSC_data[["STUTERATIO"]])
# Gaussian GLM with a log link: models log(E[STUTERATIO]) as linear in the
# predictors, so coefficients are on the log scale of the mean response.
PSC_glm <- glm(
  formula = STUTERATIO ~ ULOCALE + TOTFRL,
  family = gaussian(link = log),
  data = final_PSC_data
)
summary(PSC_glm)
## 
## Call:
## glm(formula = STUTERATIO ~ ULOCALE + TOTFRL, family = gaussian(link = log), 
##     data = final_PSC_data)
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 2.765e+00  9.531e-03 290.077   <2e-16 ***
## ULOCALE12-City: Mid-size    5.922e-03  1.587e-02   0.373   0.7090    
## ULOCALE13-City: Small       4.621e-04  1.563e-02   0.030   0.9764    
## ULOCALE21-Suburb: Large     9.767e-04  1.067e-02   0.092   0.9270    
## ULOCALE22-Suburb: Mid-size -7.633e-03  2.046e-02  -0.373   0.7091    
## ULOCALE23-Suburb: Small    -3.048e-03  2.478e-02  -0.123   0.9021    
## ULOCALE31-Town: Fringe      6.964e-03  2.110e-02   0.330   0.7414    
## ULOCALE32-Town: Distant    -2.356e-02  1.642e-02  -1.435   0.1514    
## ULOCALE33-Town: Remote      2.740e-02  1.858e-02   1.474   0.1404    
## ULOCALE41-Rural: Fringe    -3.199e-02  1.319e-02  -2.425   0.0153 *  
## ULOCALE42-Rural: Distant   -1.474e-01  1.511e-02  -9.758   <2e-16 ***
## ULOCALE43-Rural: Remote    -2.205e-01  1.924e-02 -11.459   <2e-16 ***
## TOTFRL                      1.201e-04  9.820e-06  12.229   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 252.8254)
## 
##     Null deviance: 21641315  on 85062  degrees of freedom
## Residual deviance: 21502480  on 85050  degrees of freedom
## AIC: 712040
## 
## Number of Fisher Scoring iterations: 8