library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
district<-read_excel("district.xls")
#a) Linearity
district_model_1<-lm(DAGC4X21R~DPSTWHFP+DPETECOP+DA0912DR21R+DISTSIZE,data = district)
plot(district_model_1,which=1)
raintest(district_model_1)
##
## Rainbow test
##
## data: district_model_1
## Rain = 0.92163, df1 = 536, df2 = 524, p-value = 0.8262
#the p-value is 0.8262, above 0.05 #the test shows that this model is linear
library(dplyr)
#b) Independence of errors
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
durbinWatsonTest(district_model_1)
## lag Autocorrelation D-W Statistic p-value
## 1 0.02428045 1.951392 0.326
## Alternative hypothesis: rho != 0
#Durbin Watson statistic is 1.951392, which is close to 1.6 #p-value is, over 0.05, is independent
plot(district_model_1,which=3)
#the dots should be scattered along the line, but it is not. #the
majority of the dots are clustered on the higher values (between 80 -
100)with outliers at the top above the 80 - 100 towards 3.0 on the
standardized residuals #homoscedasticity does not look met
bptest(district_model_1)
##
## studentized Breusch-Pagan test
##
## data: district_model_1
## BP = 36.257, df = 11, p-value = 0.0001534
#p value is smaller than 0.05, which means heteroscedasticity
plot(district_model_1,which=2)
shapiro.test(district_model_1$residuals)
##
## Shapiro-Wilk normality test
##
## data: district_model_1$residuals
## W = 0.39962, p-value < 2.2e-16
#p value is less than 0.05 = not normal
district<-lm(DAGC4X21R~DPSTWHFP+DPETECOP+DA0912DR21R+DISTSIZE,data = district)
vif(district)
## GVIF Df GVIF^(1/(2*Df))
## DPSTWHFP 1.723003 1 1.312632
## DPETECOP 1.508075 1 1.228037
## DA0912DR21R 1.103600 1 1.050524
## DISTSIZE 1.216933 8 1.012346
#the vifs are not over 10, and also below 5, they are all close to 1, so it shows low multicollinearity.
raintest(district)
##
## Rainbow test
##
## data: district
## Rain = 0.92163, df1 = 536, df2 = 524, p-value = 0.8262
raintest(district_model_1)
##
## Rainbow test
##
## data: district_model_1
## Rain = 0.92163, df1 = 536, df2 = 524, p-value = 0.8262
str(district)
## List of 14
## $ coefficients : Named num [1:12] 1.01e+02 4.45e-04 -5.96e-02 -2.07 -1.45e-01 ...
## ..- attr(*, "names")= chr [1:12] "(Intercept)" "DPSTWHFP" "DPETECOP" "DA0912DR21R" ...
## $ residuals : Named num [1:1072] 0.937 2.129 -2.239 0.903 3.269 ...
## ..- attr(*, "names")= chr [1:1072] "1" "2" "3" "4" ...
## $ effects : Named num [1:1072] -3074.3 -80.6 66.1 -218.4 11.4 ...
## ..- attr(*, "names")= chr [1:1072] "(Intercept)" "DPSTWHFP" "DPETECOP" "DA0912DR21R" ...
## $ rank : int 12
## $ fitted.values: Named num [1:1072] 99.1 97.9 97.4 94.9 95.7 ...
## ..- attr(*, "names")= chr [1:1072] "1" "2" "3" "4" ...
## $ assign : int [1:12] 0 1 2 3 4 4 4 4 4 4 ...
## $ qr :List of 5
## ..$ qr : num [1:1072, 1:12] -32.7414 0.0305 0.0305 0.0305 0.0305 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:1072] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:12] "(Intercept)" "DPSTWHFP" "DPETECOP" "DA0912DR21R" ...
## .. ..- attr(*, "assign")= int [1:12] 0 1 2 3 4 4 4 4 4 4 ...
## .. ..- attr(*, "contrasts")=List of 1
## .. .. ..$ DISTSIZE: chr "contr.treatment"
## ..$ qraux: num [1:12] 1.03 1.02 1 1.01 1.01 ...
## ..$ pivot: int [1:12] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ tol : num 1e-07
## ..$ rank : int 12
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 1060
## $ na.action : 'omit' Named int [1:135] 37 39 60 64 70 72 73 74 75 76 ...
## ..- attr(*, "names")= chr [1:135] "37" "39" "60" "64" ...
## $ contrasts :List of 1
## ..$ DISTSIZE: chr "contr.treatment"
## $ xlevels :List of 1
## ..$ DISTSIZE: chr [1:9] "1,000 to 1,599" "1,600 to 2,999" "10,000 to 24,999" "25,000 to 49,999" ...
## $ call : language lm(formula = DAGC4X21R ~ DPSTWHFP + DPETECOP + DA0912DR21R + DISTSIZE, data = district)
## $ terms :Classes 'terms', 'formula' language DAGC4X21R ~ DPSTWHFP + DPETECOP + DA0912DR21R + DISTSIZE
## .. ..- attr(*, "variables")= language list(DAGC4X21R, DPSTWHFP, DPETECOP, DA0912DR21R, DISTSIZE)
## .. ..- attr(*, "factors")= int [1:5, 1:4] 0 1 0 0 0 0 0 1 0 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:5] "DAGC4X21R" "DPSTWHFP" "DPETECOP" "DA0912DR21R" ...
## .. .. .. ..$ : chr [1:4] "DPSTWHFP" "DPETECOP" "DA0912DR21R" "DISTSIZE"
## .. ..- attr(*, "term.labels")= chr [1:4] "DPSTWHFP" "DPETECOP" "DA0912DR21R" "DISTSIZE"
## .. ..- attr(*, "order")= int [1:4] 1 1 1 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(DAGC4X21R, DPSTWHFP, DPETECOP, DA0912DR21R, DISTSIZE)
## .. ..- attr(*, "dataClasses")= Named chr [1:5] "numeric" "numeric" "numeric" "numeric" ...
## .. .. ..- attr(*, "names")= chr [1:5] "DAGC4X21R" "DPSTWHFP" "DPETECOP" "DA0912DR21R" ...
## $ model :'data.frame': 1072 obs. of 5 variables:
## ..$ DAGC4X21R : num [1:1072] 100 100 95.2 95.8 99 97.8 100 96.8 100 94.1 ...
## ..$ DPSTWHFP : num [1:1072] 91.7 90.5 93.3 93.5 74.6 80.9 100 69 86.7 93.9 ...
## ..$ DPETECOP : num [1:1072] 40.8 45.4 54.2 54.1 81.6 74 46.8 49.6 57.8 50.1 ...
## ..$ DA0912DR21R: num [1:1072] 0 0.3 0.4 0 0 0 0 0.4 0.4 0.7 ...
## ..$ DISTSIZE : chr [1:1072] "500 to 999" "1,000 to 1,599" "500 to 999" "Under 500" ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language DAGC4X21R ~ DPSTWHFP + DPETECOP + DA0912DR21R + DISTSIZE
## .. .. ..- attr(*, "variables")= language list(DAGC4X21R, DPSTWHFP, DPETECOP, DA0912DR21R, DISTSIZE)
## .. .. ..- attr(*, "factors")= int [1:5, 1:4] 0 1 0 0 0 0 0 1 0 0 ...
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:5] "DAGC4X21R" "DPSTWHFP" "DPETECOP" "DA0912DR21R" ...
## .. .. .. .. ..$ : chr [1:4] "DPSTWHFP" "DPETECOP" "DA0912DR21R" "DISTSIZE"
## .. .. ..- attr(*, "term.labels")= chr [1:4] "DPSTWHFP" "DPETECOP" "DA0912DR21R" "DISTSIZE"
## .. .. ..- attr(*, "order")= int [1:4] 1 1 1 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(DAGC4X21R, DPSTWHFP, DPETECOP, DA0912DR21R, DISTSIZE)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:5] "numeric" "numeric" "numeric" "numeric" ...
## .. .. .. ..- attr(*, "names")= chr [1:5] "DAGC4X21R" "DPSTWHFP" "DPETECOP" "DA0912DR21R" ...
## ..- attr(*, "na.action")= 'omit' Named int [1:135] 37 39 60 64 70 72 73 74 75 76 ...
## .. ..- attr(*, "names")= chr [1:135] "37" "39" "60" "64" ...
## - attr(*, "class")= chr "lm"
par(mfrow = c(2, 2))
plot(district_model_1)
#4) does your model meet those assumptions?
#Rainbow test has a high p-value #Durbin-Watson test statistic is close to 2 and the p-value is above 0.05 #Breusch-Pagan test has super low p-value = violation #Shapiro-Wilk test has a very low p-value #VIF values are below 2
#5) If your model violates an assumption, which one? #Homoscedasticity because Breusch-Pagan test has a very low p-value
#6) What would you do to mitigate this assumption? Show your work.
#Did a log of the dependent variable (DAGC4X21R) #then redo Breusch Pagan test
```