Setting up Dataset

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(readxl)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
# Load the district-level workbook by explicit path rather than via setwd().
# Changing the working directory is a hidden side effect, and when this file
# is knitted the directory is restored after the chunk anyway, so no later
# step can depend on it.
# NOTE(review): the folder is still specific to one machine; consider keeping
# district.xls next to this file and using a relative path instead.
district <- read_excel(
  file.path(path.expand("~/Desktop/Monday Class"), "district.xls")
)

Understanding the Dataset + Descriptive Statistics

# Dependent variable (DA0912DR21R): quartiles, mean, range and NA count.
summary(district[["DA0912DR21R"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -1.000   0.000   0.400   1.243   1.400  50.500     112
# Independent variable (DPSTURNR): quartiles, mean, range and NA count.
summary(district[["DPSTURNR"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   14.80   19.50   21.51   25.90  100.00       7
# Keep only districts where both model variables are observed: rows with a
# missing dependent (DA0912DR21R) or independent (DPSTURNR) value are dropped
# in a single step.
# NOTE(review): the summary of DA0912DR21R above reports a minimum of -1,
# which cannot be a real rate; presumably a masking/sentinel code. Confirm
# against the data dictionary and decide whether such rows belong in the model.
district <- drop_na(district, DA0912DR21R, DPSTURNR)

# Fit the first (simple linear regression) model.
#   Dependent variable:   Grades 9-12 Dropout Rate (DA0912DR21R)
#   Independent variable: Teacher Turnover Rate    (DPSTURNR)
model_simple <- lm(formula = DA0912DR21R ~ DPSTURNR, data = district)

Assumption 1: Linearity

# Graph 1: scatter plot of the dependent variable against the independent
# variable, as a first visual check for a roughly linear relationship.
ggplot(data = district, mapping = aes(x = DPSTURNR, y = DA0912DR21R)) +
  geom_point()

# Rainbow test on the fitted model as a formal check of linearity.
# A small p-value is evidence against the linear specification.
lmtest::raintest(model_simple)
## 
##  Rainbow test
## 
## data:  model_simple
## Rain = 1.5888, df1 = 546, df2 = 544, p-value = 3.789e-08

#Assumption 2: Independence of Errors

# Durbin-Watson test for first-order autocorrelation in the residuals
# (independence of errors); the result is stored and then printed.
# NOTE(review): car appears to estimate this p-value by simulation by default,
# so it may differ slightly between runs unless a seed is set; confirm.
dwtdistrict <- car::durbinWatsonTest(model_simple)
print(dwtdistrict)
##  lag Autocorrelation D-W Statistic p-value
##    1      0.02109722      1.957647   0.346
##  Alternative hypothesis: rho != 0

#Assumption 3: Homoscedasticity

# Plot to visualize homoscedasticity: Residuals vs Fitted (which = 1).
# Look for a roughly constant vertical spread of residuals across the fitted
# values; a funnel shape suggests heteroscedasticity. The Scale-Location plot
# (which = 3) is an alternative view of the same assumption.
# The previous which = 2 draws the Normal Q-Q plot, which diagnoses normality
# of the residuals rather than constant variance.
plot(model_simple, which = 1)

# Studentized Breusch-Pagan test for heteroscedasticity; the result is stored
# and then printed.
bptestdistrict <- lmtest::bptest(model_simple)
print(bptestdistrict)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_simple
## BP = 0.79742, df = 1, p-value = 0.3719

#Assumption 4: Normality

# Shapiro-Wilk test on the model residuals. The argument expression is kept
# as-is because it is echoed in the test output as the data name.
stats::shapiro.test(model_simple$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  model_simple$residuals
## W = 0.35773, p-value < 2.2e-16

#Regression Results and Model Summary

# Coefficient table, residual quantiles, R-squared and overall F-test for the
# simple regression model.
print(summary(model_simple))
## 
## Call:
## lm(formula = DA0912DR21R ~ DPSTURNR, data = district)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.727 -1.143 -0.797  0.183 49.326 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  0.72294    0.24291   2.976  0.00298 **
## DPSTURNR     0.02521    0.01065   2.366  0.01815 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.318 on 1090 degrees of freedom
## Multiple R-squared:  0.00511,    Adjusted R-squared:  0.004197 
## F-statistic: 5.598 on 1 and 1090 DF,  p-value: 0.01815