library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
district<-read_excel("district.xls")

#a) Linearity

district_model_1<-lm(DAGC4X21R~DPSTWHFP+DPETECOP+DA0912DR21R+DISTSIZE,data = district)
plot(district_model_1,which=1)

raintest(district_model_1)
## 
##  Rainbow test
## 
## data:  district_model_1
## Rain = 0.92163, df1 = 536, df2 = 524, p-value = 0.8262

#the p-value is 0.8262, above 0.05 #the test shows that this model is linear

library(dplyr)

#b) Independence of errors

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
durbinWatsonTest(district_model_1)
##  lag Autocorrelation D-W Statistic p-value
##    1      0.02428045      1.951392   0.326
##  Alternative hypothesis: rho != 0

#Durbin Watson statistic is 1.951392, which is close to 1.6 #p-value is, over 0.05, is independent

c) Homoscedasticity

plot(district_model_1,which=3)

#the dots should be scattered along the line, but it is not. #the majority of the dots are clustered on the higher values (between 80 - 100)with outliers at the top above the 80 - 100 towards 3.0 on the standardized residuals #homoscedasticity does not look met

bptest(district_model_1)
## 
##  studentized Breusch-Pagan test
## 
## data:  district_model_1
## BP = 36.257, df = 11, p-value = 0.0001534

#p value is smaller than 0.05, which means heteroscedasticity

plot(district_model_1,which=2)

d) Normality of residuals

the dots are not perfectly normally distributed, they deviate from the tails of the line.

shapiro.test(district_model_1$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  district_model_1$residuals
## W = 0.39962, p-value < 2.2e-16

#p value is less than 0.05 = not normal

e) No multicolinarity

district<-lm(DAGC4X21R~DPSTWHFP+DPETECOP+DA0912DR21R+DISTSIZE,data = district)
                         

vif(district)   
##                 GVIF Df GVIF^(1/(2*Df))
## DPSTWHFP    1.723003  1        1.312632
## DPETECOP    1.508075  1        1.228037
## DA0912DR21R 1.103600  1        1.050524
## DISTSIZE    1.216933  8        1.012346

#the vifs are not over 10, and also below 5, they are all close to 1, so it shows low multicollinearity.

raintest(district)
## 
##  Rainbow test
## 
## data:  district
## Rain = 0.92163, df1 = 536, df2 = 524, p-value = 0.8262
raintest(district_model_1)
## 
##  Rainbow test
## 
## data:  district_model_1
## Rain = 0.92163, df1 = 536, df2 = 524, p-value = 0.8262
str(district)
## List of 14
##  $ coefficients : Named num [1:12] 1.01e+02 4.45e-04 -5.96e-02 -2.07 -1.45e-01 ...
##   ..- attr(*, "names")= chr [1:12] "(Intercept)" "DPSTWHFP" "DPETECOP" "DA0912DR21R" ...
##  $ residuals    : Named num [1:1072] 0.937 2.129 -2.239 0.903 3.269 ...
##   ..- attr(*, "names")= chr [1:1072] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:1072] -3074.3 -80.6 66.1 -218.4 11.4 ...
##   ..- attr(*, "names")= chr [1:1072] "(Intercept)" "DPSTWHFP" "DPETECOP" "DA0912DR21R" ...
##  $ rank         : int 12
##  $ fitted.values: Named num [1:1072] 99.1 97.9 97.4 94.9 95.7 ...
##   ..- attr(*, "names")= chr [1:1072] "1" "2" "3" "4" ...
##  $ assign       : int [1:12] 0 1 2 3 4 4 4 4 4 4 ...
##  $ qr           :List of 5
##   ..$ qr   : num [1:1072, 1:12] -32.7414 0.0305 0.0305 0.0305 0.0305 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:1072] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:12] "(Intercept)" "DPSTWHFP" "DPETECOP" "DA0912DR21R" ...
##   .. ..- attr(*, "assign")= int [1:12] 0 1 2 3 4 4 4 4 4 4 ...
##   .. ..- attr(*, "contrasts")=List of 1
##   .. .. ..$ DISTSIZE: chr "contr.treatment"
##   ..$ qraux: num [1:12] 1.03 1.02 1 1.01 1.01 ...
##   ..$ pivot: int [1:12] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ tol  : num 1e-07
##   ..$ rank : int 12
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 1060
##  $ na.action    : 'omit' Named int [1:135] 37 39 60 64 70 72 73 74 75 76 ...
##   ..- attr(*, "names")= chr [1:135] "37" "39" "60" "64" ...
##  $ contrasts    :List of 1
##   ..$ DISTSIZE: chr "contr.treatment"
##  $ xlevels      :List of 1
##   ..$ DISTSIZE: chr [1:9] "1,000 to 1,599" "1,600 to 2,999" "10,000 to 24,999" "25,000 to 49,999" ...
##  $ call         : language lm(formula = DAGC4X21R ~ DPSTWHFP + DPETECOP + DA0912DR21R + DISTSIZE,      data = district)
##  $ terms        :Classes 'terms', 'formula'  language DAGC4X21R ~ DPSTWHFP + DPETECOP + DA0912DR21R + DISTSIZE
##   .. ..- attr(*, "variables")= language list(DAGC4X21R, DPSTWHFP, DPETECOP, DA0912DR21R, DISTSIZE)
##   .. ..- attr(*, "factors")= int [1:5, 1:4] 0 1 0 0 0 0 0 1 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:5] "DAGC4X21R" "DPSTWHFP" "DPETECOP" "DA0912DR21R" ...
##   .. .. .. ..$ : chr [1:4] "DPSTWHFP" "DPETECOP" "DA0912DR21R" "DISTSIZE"
##   .. ..- attr(*, "term.labels")= chr [1:4] "DPSTWHFP" "DPETECOP" "DA0912DR21R" "DISTSIZE"
##   .. ..- attr(*, "order")= int [1:4] 1 1 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(DAGC4X21R, DPSTWHFP, DPETECOP, DA0912DR21R, DISTSIZE)
##   .. ..- attr(*, "dataClasses")= Named chr [1:5] "numeric" "numeric" "numeric" "numeric" ...
##   .. .. ..- attr(*, "names")= chr [1:5] "DAGC4X21R" "DPSTWHFP" "DPETECOP" "DA0912DR21R" ...
##  $ model        :'data.frame':   1072 obs. of  5 variables:
##   ..$ DAGC4X21R  : num [1:1072] 100 100 95.2 95.8 99 97.8 100 96.8 100 94.1 ...
##   ..$ DPSTWHFP   : num [1:1072] 91.7 90.5 93.3 93.5 74.6 80.9 100 69 86.7 93.9 ...
##   ..$ DPETECOP   : num [1:1072] 40.8 45.4 54.2 54.1 81.6 74 46.8 49.6 57.8 50.1 ...
##   ..$ DA0912DR21R: num [1:1072] 0 0.3 0.4 0 0 0 0 0.4 0.4 0.7 ...
##   ..$ DISTSIZE   : chr [1:1072] "500 to 999" "1,000 to 1,599" "500 to 999" "Under 500" ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language DAGC4X21R ~ DPSTWHFP + DPETECOP + DA0912DR21R + DISTSIZE
##   .. .. ..- attr(*, "variables")= language list(DAGC4X21R, DPSTWHFP, DPETECOP, DA0912DR21R, DISTSIZE)
##   .. .. ..- attr(*, "factors")= int [1:5, 1:4] 0 1 0 0 0 0 0 1 0 0 ...
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:5] "DAGC4X21R" "DPSTWHFP" "DPETECOP" "DA0912DR21R" ...
##   .. .. .. .. ..$ : chr [1:4] "DPSTWHFP" "DPETECOP" "DA0912DR21R" "DISTSIZE"
##   .. .. ..- attr(*, "term.labels")= chr [1:4] "DPSTWHFP" "DPETECOP" "DA0912DR21R" "DISTSIZE"
##   .. .. ..- attr(*, "order")= int [1:4] 1 1 1 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(DAGC4X21R, DPSTWHFP, DPETECOP, DA0912DR21R, DISTSIZE)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:5] "numeric" "numeric" "numeric" "numeric" ...
##   .. .. .. ..- attr(*, "names")= chr [1:5] "DAGC4X21R" "DPSTWHFP" "DPETECOP" "DA0912DR21R" ...
##   ..- attr(*, "na.action")= 'omit' Named int [1:135] 37 39 60 64 70 72 73 74 75 76 ...
##   .. ..- attr(*, "names")= chr [1:135] "37" "39" "60" "64" ...
##  - attr(*, "class")= chr "lm"
par(mfrow = c(2, 2))
plot(district_model_1)

#4) does your model meet those assumptions?

#Rainbow test has a high p-value #Durbin-Watson test statistic is close to 2 and the p-value is above 0.05 #Breusch-Pagan test has super low p-value = violation #Shapiro-Wilk test has a very low p-value #VIF values are below 2

#5) If your model violates an assumption, which one? #Homoscedasticity because Breusch-Pagan test has a very low p-value

#6) What would you do to mitigate this assumption? Show your work.

#Did a log of the dependent variable (DAGC4X21R) #then redo Breusch Pagan test

```