Loading libraries, setting the working directory, importing the data, and removing rows with NAs.
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Point the session at the project folder so the relative file name below
# resolves.
# NOTE(review): this hard-coded, user-specific path makes the script
# non-portable; consider an RStudio project or a path relative to the file.
setwd("~/Desktop/UTSA/Quantitative Methods/RStudio")

# Import the raw district spreadsheet.
district <- read_excel(path = "district.xls")

# Keep only the three columns used in this analysis, then discard every row
# that has a missing value in any of them.
# NOTE(review): drop_na() removes NA only. The Grad_Rate_4 summary printed in
# this report shows a minimum of -1, which is not a valid rate -- possibly a
# masking/sentinel code in the source data. Confirm, and filter if so.
Clean_District <- district |>
  select(DISTNAME, DPSTURNR, DAGC4X21R) |>
  drop_na()
Renaming variables so they are easier to read.
# Replace the coded column names with descriptive ones (new = old).
# Column positions are unaffected by the order of the mappings.
Clean_District <- Clean_District |>
  rename(
    Dist_Name = DISTNAME,
    Turnover = DPSTURNR,
    Grad_Rate_4 = DAGC4X21R
  )
Building my regression model with Grad_Rate_4 as the dependent variable and Turnover as the independent variable, followed by its summary.
# Fit an ordinary least-squares regression of Grad_Rate_4 on Turnover,
# then print the coefficient table and overall fit statistics.
Gradrate_model <- lm(
  formula = Grad_Rate_4 ~ Turnover,
  data = Clean_District
)
Gradrate_model |> summary()
##
## Call:
## lm(formula = Grad_Rate_4 ~ Turnover, data = Clean_District)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96.767 -0.790 2.899 5.094 14.162
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96.67410 0.95813 100.899 < 2e-16 ***
## Turnover -0.13545 0.04261 -3.179 0.00152 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.74 on 1069 degrees of freedom
## Multiple R-squared: 0.009366, Adjusted R-squared: 0.00844
## F-statistic: 10.11 on 1 and 1069 DF, p-value: 0.001519
Descriptive Statistics.
# Summary statistics (min, quartiles, mean, max) of the dependent variable.
Clean_District |> pull(Grad_Rate_4) |> summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.00 93.20 96.90 93.89 100.00 100.00
# Summary statistics (min, quartiles, mean, max) of the independent variable.
Clean_District |> pull(Turnover) |> summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 14.80 19.00 20.55 25.20 80.00
# Mean of Grad_Rate_4 at full precision.
Clean_District |> pull(Grad_Rate_4) |> mean()
## [1] 93.89094
# Sample standard deviation of Grad_Rate_4.
Clean_District |> pull(Grad_Rate_4) |> sd()
## [1] 12.79782
# Mean of Turnover at full precision.
Clean_District |> pull(Turnover) |> mean()
## [1] 20.54734
# Sample standard deviation of Turnover.
Clean_District |> pull(Turnover) |> sd()
## [1] 9.144077
Assumptions: Linearity of variables
# Diagnostic plot 1 of the fitted model: residuals against fitted values,
# used here to judge whether a linear form is adequate.
Gradrate_model |> plot(which = 1)
Assumptions: Normality of Residuals
# Diagnostic plot 2 of the fitted model: normal Q-Q plot of the residuals.
Gradrate_model |> plot(which = 2)

# Shapiro-Wilk normality test applied to the model residuals.
Gradrate_model |> residuals() |> shapiro.test()
##
## Shapiro-Wilk normality test
##
## data: residuals(Gradrate_model)
## W = 0.43744, p-value < 2.2e-16
Assumptions: Independence of Errors
# Durbin-Watson test for autocorrelation in the model residuals.
# NOTE(review): the statistic depends on the order of the rows; confirm that
# the district ordering is meaningful before interpreting the result.
Gradrate_model |> dwtest()
##
## Durbin-Watson test
##
## data: Gradrate_model
## DW = 1.754, p-value = 2.752e-05
## alternative hypothesis: true autocorrelation is greater than 0
Assumptions: Homoscedasticity
# Diagnostic plot 3 of the fitted model: scale-location plot, used to look
# for non-constant residual spread.
Gradrate_model |> plot(which = 3)

# Studentized Breusch-Pagan test for heteroscedasticity.
Gradrate_model |> bptest()
##
## studentized Breusch-Pagan test
##
## data: Gradrate_model
## BP = 1.4527, df = 1, p-value = 0.2281