library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Load the district-level spreadsheet from the working directory.
district <- readxl::read_excel("district.xls")

# Keep only the numeric columns, then drop every row that has an NA in any of
# the remaining columns.
# NOTE(review): drop_na() with no column arguments checks every selected
# numeric column, not just the variables used in the model -- confirm this
# listwise deletion is intended, since it can discard rows that are complete
# for the model variables.
numeric_clean_district_data <- district |>
  dplyr::select(where(is.numeric)) |>
  tidyr::drop_na()

# Preview the first six rows of the cleaned data.
head(numeric_clean_district_data, n = 6L)
## # A tibble: 6 × 128
##   DZCAMPUS DPETALLC DPETBLAP DPETHISP DPETWHIP DPETINDP DPETASIP DPETPCIP
##      <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
## 1        6     3360     25.1     42.9     27.3      0.2      0.7      0.1
## 2        5     2799      7.2     27.9     60.6      0.3      1        0.1
## 3       17     7318     28.7     43.1     24        0.1      1.1      0.1
## 4        5     1612      2.4      6.6     87        0.3      0.1      0.2
## 5        4     3005      1.3     44.1     49.6      0.3      2        0.1
## 6        6     3374      0.7     69.6     27.6      0.4      0.5      0.1
## # ℹ 120 more variables: DPETTWOP <dbl>, DPETECOP <dbl>, DPETLEPP <dbl>,
## #   DPETSPEP <dbl>, DPETBILP <dbl>, DPETVOCP <dbl>, DPETGIFP <dbl>,
## #   DA0AT21R <dbl>, DA0912DR21R <dbl>, DAGC4X21R <dbl>, DAGC5X20R <dbl>,
## #   DAGC6X19R <dbl>, DA0GR21N <dbl>, DA0GS21N <dbl>, DDA00A001S22R <dbl>,
## #   DDA00A001222R <dbl>, DDA00A001322R <dbl>, DDA00AR01S22R <dbl>,
## #   DDA00AR01222R <dbl>, DDA00AR01322R <dbl>, DDA00AM01S22R <dbl>,
## #   DDA00AM01222R <dbl>, DDA00AM01322R <dbl>, DDA00AC01S22R <dbl>, …
# Fit a linear model of DDB00A001322R on DPSTBLFP, DPETBLAP, and DPSTKIDR
# using the NA-free numeric data.
districtmodel1 <- lm(
  formula = DDB00A001322R ~ DPSTBLFP + DPETBLAP + DPSTKIDR,
  data = numeric_clean_district_data
)

# Diagnostic plot 1 of plot.lm (residuals vs. fitted values), used here as a
# visual check of the linearity assumption.
plot(districtmodel1, which = 1)

# Rainbow test on the fitted model, used here as a formal check of linearity.
lmtest::raintest(districtmodel1)
## 
##  Rainbow test
## 
## data:  districtmodel1
## Rain = 1.2594, df1 = 162, df2 = 157, p-value = 0.07346

#Question: The residuals-vs-fitted plot doesn’t look that linear, but the Rainbow test p-value (0.073) is above .05, so we fail to reject the null hypothesis of linearity. Is this correct/accurate? Could the plot look like that because the model violates some other assumption?

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
# Durbin-Watson test for lag-1 autocorrelation in the model residuals
# (two-sided alternative: rho != 0).
# NOTE(review): car appears to compute this p-value by bootstrap simulation by
# default, so it may differ slightly between runs unless a seed is set --
# confirm against the car documentation.
car::durbinWatsonTest(districtmodel1)
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.03194048      2.059369   0.614
##  Alternative hypothesis: rho != 0

#The p-value (0.614) is above .05, so we fail to reject the null hypothesis of no autocorrelation; the errors can be treated as independent.

# Diagnostic plot 3 of plot.lm (scale-location), used here as a visual check
# of homoscedasticity.
plot(districtmodel1, which = 3)

#The line in the scale-location plot isn’t very wavy, which suggests the residuals could be homoscedastic, although the points are bunched in one area; the Breusch-Pagan test (bptest) is significant, however, so I’m not sure…

# Studentized Breusch-Pagan test on the fitted model, used here as a formal
# check of homoscedasticity.
lmtest::bptest(districtmodel1)
## 
##  studentized Breusch-Pagan test
## 
## data:  districtmodel1
## BP = 11.062, df = 3, p-value = 0.01139

#The p-value (0.011) is less than .05, so we reject the null hypothesis of homoscedasticity; the model does in fact violate this assumption.

# Diagnostic plot 2 of plot.lm (normal Q-Q plot of the residuals), used here
# as a visual check of normality.
plot(districtmodel1, which = 2)

#The residuals are not normally distributed, as the points deviate from the reference line.

# Shapiro-Wilk test on the model residuals, used here as a formal check of
# normality. The argument expression is kept as written so the "data:" label
# in the printed result stays the same.
stats::shapiro.test(districtmodel1$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  districtmodel1$residuals
## W = 0.92188, p-value = 6.064e-12

#This is a very small p-value, so we reject the null hypothesis of normality; this again shows that the residuals are not normally distributed.

# Variance inflation factor for each predictor in the fitted model, used here
# as a check for multicollinearity.
car::vif(districtmodel1)
## DPSTBLFP DPETBLAP DPSTKIDR 
## 3.566432 3.484464 1.044194

#These VIF values are all under 5, which indicates that the predictors are not problematically correlated with each other, so the absence of multicollinearity can be assumed.

Overall, this model seems to potentially violate homoscedasticity and definitely violates normality. A log transformation of the dependent variable (refitting the model on the transformed outcome) can be used to mitigate these violations, after which the Breusch-Pagan test and the Q-Q plot can be re-run on the new model. If the transformation works, the Breusch-Pagan p-value increases and becomes non-significant, so we are unable to reject the null hypothesis of homoscedasticity. It should also make the residuals fall closer to the line on the Q-Q plot, which indicates a more nearly normal distribution.