Loading libraries, setting working directory, and removing NAs.

library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Locate the workbook by explicit path instead of calling setwd(), so that
# reading the data does not change the session's working directory.
data_dir <- path.expand("~/Desktop/UTSA/Quantitative Methods/RStudio")

district <- read_excel(file.path(data_dir, "district.xls"))

# Keep only the three columns used in the analysis and drop every row that
# has an NA in any of them.
# NOTE(review): the summary of the graduation-rate column further down shows a
# minimum of -1.00, which is not a valid rate. Presumably -1 is a
# masked/missing code in the source data that drop_na() does not catch --
# confirm against the data dictionary before interpreting the model.
Clean_District <- district |>
  select(DISTNAME, DPSTURNR, DAGC4X21R) |>
  drop_na()

Renaming variables for ease of use.

# Give the selected columns descriptive names (new_name = old_name).
Clean_District <- rename(
  Clean_District,
  Dist_Name = DISTNAME,
  Turnover = DPSTURNR,
  Grad_Rate_4 = DAGC4X21R
)

Building my regression model using my dependent variable (Grad_Rate_4) and independent variable (Turnover), and printing its summary.

# Simple linear regression: Grad_Rate_4 is the response and Turnover is the
# single predictor, fitted on the cleaned district data.
Gradrate_model <- lm(
  formula = Grad_Rate_4 ~ Turnover,
  data = Clean_District
)

# Print coefficients, residual quantiles, R-squared and the overall F-test.
summary(Gradrate_model)
## 
## Call:
## lm(formula = Grad_Rate_4 ~ Turnover, data = Clean_District)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -96.767  -0.790   2.899   5.094  14.162 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 96.67410    0.95813 100.899  < 2e-16 ***
## Turnover    -0.13545    0.04261  -3.179  0.00152 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.74 on 1069 degrees of freedom
## Multiple R-squared:  0.009366,   Adjusted R-squared:  0.00844 
## F-statistic: 10.11 on 1 and 1069 DF,  p-value: 0.001519

Descriptive Statistics.

# Five-number summary plus mean of the dependent variable.
# NOTE(review): the minimum printed below is -1.00, which is outside the range
# of a rate; presumably a masked/missing code -- confirm.
summary(Clean_District$Grad_Rate_4)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -1.00   93.20   96.90   93.89  100.00  100.00
# Five-number summary plus mean of the independent variable.
summary(Clean_District$Turnover)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   14.80   19.00   20.55   25.20   80.00
# Mean and standard deviation of the dependent variable at full precision.
mean(Clean_District$Grad_Rate_4)
## [1] 93.89094
sd(Clean_District$Grad_Rate_4)
## [1] 12.79782
# Mean and standard deviation of the independent variable at full precision.
mean(Clean_District$Turnover)
## [1] 20.54734
sd(Clean_District$Turnover)
## [1] 9.144077

Assumptions: Linearity of variables

# Diagnostic plot 1 of plot.lm: residuals against fitted values.
plot(Gradrate_model, which = 1)

Assumptions: Normality of Residuals

# Diagnostic plot 2 of plot.lm: normal Q-Q plot of the residuals.
plot(Gradrate_model, which = 2)

# Shapiro-Wilk test on the model residuals; the null hypothesis is normality.
shapiro.test(residuals(Gradrate_model))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(Gradrate_model)
## W = 0.43744, p-value < 2.2e-16

Assumptions: Independence of Errors

# Durbin-Watson test (lmtest) for autocorrelation in the model residuals.
# NOTE(review): the statistic depends on the order of the rows; confirm that
# the row order of Clean_District is meaningful for this check.
dwtest(Gradrate_model)
## 
##  Durbin-Watson test
## 
## data:  Gradrate_model
## DW = 1.754, p-value = 2.752e-05
## alternative hypothesis: true autocorrelation is greater than 0

Assumptions: Homoscedasticity

# Diagnostic plot 3 of plot.lm: scale-location plot.
plot(Gradrate_model, which = 3)

# Breusch-Pagan test (lmtest); the null hypothesis is constant error variance.
bptest(Gradrate_model) 
## 
##  studentized Breusch-Pagan test
## 
## data:  Gradrate_model
## BP = 1.4527, df = 1, p-value = 0.2281