Loading libraries, setting working directory, and removing NAs.
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# NOTE(review): setwd() ties this script to one machine's folder layout;
# consider an RStudio project or a path built with file.path() instead.
setwd("~/Desktop/UTSA/Quantitative Methods/RStudio")

# Read the raw district spreadsheet from the working directory.
district <- read_excel("district.xls")

# Keep the district name plus the three analysis columns, drop rows with any
# missing value, then drop rows whose DAGC4X21R value is negative.
# DAGC4X21R is the graduation-rate column, and a rate cannot be below zero;
# the -1 found in it appears to be a masked-data code rather than a
# measurement (TODO confirm against the data dictionary). Left in, it is
# modelled as a real rate and distorts every statistic computed from
# Clean_District. Any output rendered before this filter was added must be
# regenerated.
Clean_District <- district |>
  select(DISTNAME, DPSTURNR, DAGC4X21R, DPETECOP) |>
  drop_na() |>
  filter(DAGC4X21R >= 0)
Renaming variables for ease of use.
# Swap the coded source column names for readable ones (new = old).
# rename() leaves column order and row contents untouched.
Clean_District <- rename(
  Clean_District,
  Dist_Name = DISTNAME,
  Turnover = DPSTURNR,
  Grad_Rate_4 = DAGC4X21R,
  Econ_Status = DPETECOP
)
Building my regression model using my dependent and independent variables, then printing its summary.
# Fit a linear model with Grad_Rate_4 as the response and Turnover and
# Econ_Status as predictors, then print the coefficient table and fit
# statistics.
Gradrate_model <- lm(
  formula = Grad_Rate_4 ~ Turnover + Econ_Status,
  data = Clean_District
)
Gradrate_model |> summary()
##
## Call:
## lm(formula = Grad_Rate_4 ~ Turnover + Econ_Status, data = Clean_District)
##
## Residuals:
## Min 1Q Median 3Q Max
## -95.229 -1.329 2.218 4.891 12.638
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 103.77027 1.34060 77.406 < 2e-16 ***
## Turnover -0.06330 0.04271 -1.482 0.139
## Econ_Status -0.14187 0.01921 -7.386 3.04e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.44 on 1068 degrees of freedom
## Multiple R-squared: 0.05751, Adjusted R-squared: 0.05575
## F-statistic: 32.59 on 2 and 1068 DF, p-value: 1.834e-14
Descriptive Statistics.
# Five-number summary plus mean of the response variable.
Clean_District[["Grad_Rate_4"]] |> summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.00 93.20 96.90 93.89 100.00 100.00
# Five-number summary plus mean of the Turnover predictor.
Clean_District[["Turnover"]] |> summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 14.80 19.00 20.55 25.20 80.00
# Five-number summary plus mean of the Econ_Status predictor.
Clean_District[["Econ_Status"]] |> summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.20 48.00 61.50 60.47 75.20 100.00
# Mean of the response variable at full printed precision.
Clean_District[["Grad_Rate_4"]] |> mean()
## [1] 93.89094
# Sample standard deviation of the response variable.
Clean_District[["Grad_Rate_4"]] |> sd()
## [1] 12.79782
# Mean of the Turnover predictor at full printed precision.
Clean_District[["Turnover"]] |> mean()
## [1] 20.54734
# Sample standard deviation of the Turnover predictor.
Clean_District[["Turnover"]] |> sd()
## [1] 9.144077
# Mean of the Econ_Status predictor at full printed precision.
Clean_District[["Econ_Status"]] |> mean()
## [1] 60.46695
# Sample standard deviation of the Econ_Status predictor.
Clean_District[["Econ_Status"]] |> sd()
## [1] 20.33193
Assumptions: Linearity of variables
# Diagnostic plot 1 (residuals vs fitted): a flat trend supports linearity.
Gradrate_model |> plot(which = 1)
Assumptions: Normality of Residuals
# Diagnostic plot 2 (normal Q-Q of the residuals).
Gradrate_model |> plot(which = 2)

# Shapiro-Wilk test on the model residuals; a small p-value indicates
# departure from normality.
Gradrate_model |>
  residuals() |>
  shapiro.test()
##
## Shapiro-Wilk normality test
##
## data: residuals(Gradrate_model)
## W = 0.46804, p-value < 2.2e-16
Assumptions: Independence of Errors
# Durbin-Watson test for first-order autocorrelation in the residuals.
Gradrate_model |> dwtest()
##
## Durbin-Watson test
##
## data: Gradrate_model
## DW = 1.7391, p-value = 9.189e-06
## alternative hypothesis: true autocorrelation is greater than 0
Assumptions: Homoscedasticity
# Diagnostic plot 3 (scale-location): an even spread supports constant
# residual variance.
Gradrate_model |> plot(which = 3)

# Studentized Breusch-Pagan test; a small p-value indicates
# heteroscedasticity.
Gradrate_model |> bptest()
##
## studentized Breusch-Pagan test
##
## data: Gradrate_model
## BP = 17.526, df = 2, p-value = 0.0001564