library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(ggplot2)
# Read the raw district spreadsheet from the current working directory.
districtbasehw7 <- read_xls("district.xls")

# Keep only the eight columns listed below, then drop every row that has a
# missing value in any of them (including the two columns that are retained
# but not used in the model formula).
districtbasehw7cleaned <- districtbasehw7 %>%
  select(
    DISTNAME, DZCAMPUS, DAGC4X21R, DA0AT21R,
    DPSTURNR, DPSTKIDR, DPFEAINSP, DZEXADMP
  ) %>%
  na.omit()
Load your preferred dataset into RStudio.
Create a linear model with lm() from the variables, using a continuous dependent variable as the outcome.
# Fit a linear model of DAGC4X21R on the five predictors using the cleaned
# data. The formula is written inline so the `Call:` shown by summary()
# spells out the full formula.
hw7model2 <- lm(
  DAGC4X21R ~ DA0AT21R + DPSTURNR + DPSTKIDR + DPFEAINSP + DZEXADMP,
  data = districtbasehw7cleaned
)

# Print the coefficient table, residual summary and overall fit statistics.
summary(hw7model2)
##
## Call:
## lm(formula = DAGC4X21R ~ DA0AT21R + DPSTURNR + DPSTKIDR + DPFEAINSP +
## DZEXADMP, data = districtbasehw7cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -95.800 -1.604 1.615 4.456 22.070
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.375e+01 1.341e+01 -2.517 0.012 *
## DA0AT21R 1.683e+00 1.268e-01 13.279 < 2e-16 ***
## DPSTURNR 1.283e-03 4.140e-02 0.031 0.975
## DPSTKIDR 1.414e-04 1.409e-01 0.001 0.999
## DPFEAINSP -4.314e-01 8.586e-02 -5.024 5.92e-07 ***
## DZEXADMP -9.074e-01 1.386e-01 -6.547 9.10e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.6 on 1065 degrees of freedom
## Multiple R-squared: 0.1827, Adjusted R-squared: 0.1788
## F-statistic: 47.61 on 5 and 1065 DF, p-value: < 2.2e-16
# Diagnostic plot 1 for the fitted model: residuals against fitted values.
plot(hw7model2, which = 1)
Rainbow test (raintest)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# zoo is already attached as a dependency when lmtest is loaded; this
# explicit call is harmless and kept as-is.
library(zoo)
# Run the Rainbow test from lmtest on the fitted model and print the result.
raintest(hw7model2)
##
## Rainbow test
##
## data: hw7model2
## Rain = 0.97204, df1 = 536, df2 = 529, p-value = 0.6283
# Durbin-Watson test on the fitted model; the result is stored so it can be
# printed here and reused later if needed.
dw_hw7model <- dwtest(hw7model2)
print(dw_hw7model)
##
## Durbin-Watson test
##
## data: hw7model2
## DW = 1.7985, p-value = 0.0004514
## alternative hypothesis: true autocorrelation is greater than 0
# Diagnostic plot 3 for the fitted model: the scale-location plot, used as a
# visual check of constant residual variance.
plot(hw7model2, which = 3)
Breusch-Pagan test (bptest)
library(lmtest)
# Studentized Breusch-Pagan test on the fitted model; store the result and
# print it.
bp_resulthw7 <- bptest(hw7model2)
print(bp_resulthw7)
##
## studentized Breusch-Pagan test
##
## data: hw7model2
## BP = 55.017, df = 5, p-value = 1.295e-10
# Diagnostic plot 2 for the fitted model: normal Q-Q plot of the residuals.
plot(hw7model2, which = 2)
Shapiro-Wilk test
# Extract the model residuals into their own vector first: shapiro.test()
# reports the name of the object it was given in its `data:` line.
shw7model2 <- residuals(hw7model2)

# Shapiro-Wilk normality test on the residuals.
shap_testhw7 <- shapiro.test(shw7model2)
print(shap_testhw7)
##
## Shapiro-Wilk normality test
##
## data: shw7model2
## W = 0.50561, p-value < 2.2e-16
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# Variance inflation factor for each predictor in the fitted model.
vifhw7 <- vif(hw7model2)
print(vifhw7)
## DA0AT21R DPSTURNR DPSTKIDR DPFEAINSP DZEXADMP
## 1.091423 1.139951 1.222410 1.389638 1.602137
cor
# Pull the five model predictors out of the cleaned data and print them.
# NOTE(review): this section is labelled "cor", but only the raw predictor
# columns are printed; no correlation matrix is computed. If one is wanted,
# cor(predictors) would produce it -- confirm intent.
predictors <- districtbasehw7cleaned[
  ,
  c("DA0AT21R", "DPSTURNR", "DPSTKIDR", "DPFEAINSP", "DZEXADMP")
]
print(predictors)
## # A tibble: 1,071 × 5
## DA0AT21R DPSTURNR DPSTKIDR DPFEAINSP DZEXADMP
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 96.7 19.1 12.3 49.6 9.1
## 2 96 13.9 11 60.3 6.9
## 3 95.4 21.6 10.8 54.2 8.3
## 4 95.8 18.3 11.3 53.7 10.7
## 5 93.7 17.9 12.9 54.6 8.3
## 6 94.5 30.6 11 50.6 8.5
## 7 96.7 14.6 9.3 61.7 6.4
## 8 92.8 11.5 14.4 56.7 7.6
## 9 97.3 17 14.8 58.7 12.8
## 10 95.2 9.5 13.2 58.5 4.6
## # ℹ 1,061 more rows
Of the tests I ran, I think the one that isn't violated is the rainbow test: its p-value (0.6283) is greater than 0.05.
The easiest violation to see is in the visual check of homoscedasticity. The red line clearly curves, and many of the plotted points fall very far from the line. This shows that my data are heteroscedastic.
I would work on the heteroscedasticity by squaring my predictor variables. I think it helps flatten the curve of the red line, but there is more work to be done: the data are still much too clumped together.
# Refit the model with each predictor squared. I() makes `^` act as
# arithmetic squaring inside the formula rather than as a formula operator.
hw7model2_sq <- lm(
  DAGC4X21R ~ I(DA0AT21R^2) + I(DPSTURNR^2) + I(DPSTKIDR^2) +
    I(DPFEAINSP^2) + I(DZEXADMP^2),
  data = districtbasehw7cleaned
)

# With no `which` argument, plot() draws its default set of diagnostic plots
# for the refitted model.
plot(hw7model2_sq)