library(ggplot2)
library(car)
## Loading required package: carData
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(readxl)
# Load the district-level spreadsheet (path is relative to the working
# directory).
district <- read_excel("district.xls")

# Keep only the rows in which both model variables are observed.
district_clean <- district[
  complete.cases(district[c("DA0CC21R", "DPETECOP")]),
]

# Simple linear regression of DA0CC21R on DPETECOP.
district_lm <- lm(DA0CC21R ~ DPETECOP, data = district_clean)

# Coefficient table, residual quantiles and fit statistics.
summary(district_lm)
## 
## Call:
## lm(formula = DA0CC21R ~ DPETECOP, data = district_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.876  -9.154  -0.484   8.928  72.796 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 56.73187    1.38030   41.10   <2e-16 ***
## DPETECOP    -0.51317    0.02187  -23.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.59 on 1058 degrees of freedom
## Multiple R-squared:  0.3423, Adjusted R-squared:  0.3417 
## F-statistic: 550.6 on 1 and 1058 DF,  p-value: < 2.2e-16
# Scatter plot of DA0CC21R against DPETECOP with the least-squares line and
# its confidence band (se = TRUE) drawn on top.
ggplot(district_clean, aes(x = DPETECOP, y = DA0CC21R)) +
  # Semi-transparent points so overlapping observations remain visible.
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, color = "Green") +
  labs(
    title = "College Readiness vs Socioeconomically Disadvantaged Students",
    x = "Percentage of Economically Disadvantaged Students",
    y = "College Readiness Rate (%)",
    caption = "Source: Texas School Districts"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Refit the simple regression for the assumption checks that follow. The
# NA-filtered `district_clean` is used so the diagnostics run on exactly the
# rows behind `district_lm`, rather than relying on lm()'s implicit NA
# handling of the raw `district` table.
data_multiple <- lm(DA0CC21R ~ DPETECOP, data = district_clean)

# Linearity check: residuals vs fitted values.
plot(data_multiple, which = 1)

# Linearity check: Rainbow test (lmtest).
raintest(data_multiple)
## 
##  Rainbow test
## 
## data:  data_multiple
## Rain = 1.0859, df1 = 530, df2 = 528, p-value = 0.1718
# Independence of errors: Durbin-Watson test (car). Its p-value is estimated
# by bootstrap simulation, so the RNG seed is fixed to make the reported
# value reproducible from run to run.
# NOTE(review): the statistic depends on the order of the rows -- confirm the
# data have a meaningful ordering before interpreting it.
set.seed(123)
durbinWatsonTest(data_multiple)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.1033869      1.791504       0
##  Alternative hypothesis: rho != 0
# Homoscedasticity check, graphical: scale-location plot.
plot(data_multiple, which = 3)

# Homoscedasticity check, formal: studentized Breusch-Pagan test.
lmtest::bptest(data_multiple)
## 
##  studentized Breusch-Pagan test
## 
## data:  data_multiple
## BP = 20.807, df = 1, p-value = 5.079e-06
# Normality of residuals, graphical: normal Q-Q plot.
plot(data_multiple, which = 2)

# Normality of residuals, formal: Shapiro-Wilk test on the model residuals.
stats::shapiro.test(data_multiple$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  data_multiple$residuals
## W = 0.99245, p-value = 3.112e-05