library(ggplot2)

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(ggplot2)

# Read the district workbook from the current working directory.
districtbasehw7 <- read_xls("district.xls")

# Keep only the eight columns of interest, then drop every row that has a
# missing value in any of them.
districtbasehw7cleaned <- districtbasehw7 %>%
  select(
    DISTNAME, DZCAMPUS, DAGC4X21R, DA0AT21R,
    DPSTURNR, DPSTKIDR, DPFEAINSP, DZEXADMP
  ) %>%
  na.omit()
  1. Load your preferred dataset into R studio

  2. Create a linear model “lm()” from the variables, with a continuous dependent variable as the outcome

# Ordinary least squares fit of DAGC4X21R on the five predictors, using the
# NA-free data set.
hw7model2 <- lm(
  DAGC4X21R ~ DA0AT21R + DPSTURNR + DPSTKIDR + DPFEAINSP + DZEXADMP,
  data = districtbasehw7cleaned
)

# Residual quantiles, coefficient table, and overall fit statistics.
summary(hw7model2)
## 
## Call:
## lm(formula = DAGC4X21R ~ DA0AT21R + DPSTURNR + DPSTKIDR + DPFEAINSP + 
##     DZEXADMP, data = districtbasehw7cleaned)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -95.800  -1.604   1.615   4.456  22.070 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.375e+01  1.341e+01  -2.517    0.012 *  
## DA0AT21R     1.683e+00  1.268e-01  13.279  < 2e-16 ***
## DPSTURNR     1.283e-03  4.140e-02   0.031    0.975    
## DPSTKIDR     1.414e-04  1.409e-01   0.001    0.999    
## DPFEAINSP   -4.314e-01  8.586e-02  -5.024 5.92e-07 ***
## DZEXADMP    -9.074e-01  1.386e-01  -6.547 9.10e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.6 on 1065 degrees of freedom
## Multiple R-squared:  0.1827, Adjusted R-squared:  0.1788 
## F-statistic: 47.61 on 5 and 1065 DF,  p-value: < 2.2e-16
  1. Check the following assumptions:
  1. Linearity (plot and raintest)
# Diagnostic plot 1 of plot.lm (residuals vs fitted) for the linearity check.
plot(hw7model2, which = 1)

raintest

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(zoo)

# Rainbow test on the fitted model (formal check of linearity).
lmtest::raintest(hw7model2)
## 
##  Rainbow test
## 
## data:  hw7model2
## Rain = 0.97204, df1 = 536, df2 = 529, p-value = 0.6283
  1. Independence of errors (durbin-watson)
# Durbin-Watson test for autocorrelation in the model's errors; the result is
# stored so it can be printed (and reused) afterwards.
dw_hw7model <- dwtest(hw7model2)

print(dw_hw7model)
## 
##  Durbin-Watson test
## 
## data:  hw7model2
## DW = 1.7985, p-value = 0.0004514
## alternative hypothesis: true autocorrelation is greater than 0
  1. Homoscedasticity (plot, bptest)
# Diagnostic plot 3 of plot.lm (scale-location) for the constant-variance check.
plot(hw7model2, which = 3)

Bptest

library(lmtest)

# Breusch-Pagan test on the fitted model (formal check of homoscedasticity).
bp_resulthw7 <- bptest(hw7model2)

print(bp_resulthw7)
## 
##  studentized Breusch-Pagan test
## 
## data:  hw7model2
## BP = 55.017, df = 5, p-value = 1.295e-10
  1. Normality of residuals (QQ plot, Shapiro test)
# Diagnostic plot 2 of plot.lm (normal Q-Q) for the residual-normality check.
plot(
  hw7model2,
  which = 2
)

Shapiro test

# Pull the residuals out of the fitted model first so the test output is
# labelled with this vector's name.
shw7model2 <- residuals(hw7model2)

# Shapiro-Wilk normality test on those residuals.
shap_testhw7 <- shapiro.test(shw7model2)

print(shap_testhw7)
## 
##  Shapiro-Wilk normality test
## 
## data:  shw7model2
## W = 0.50561, p-value < 2.2e-16
  1. No multicollinearity (VIF, cor)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
# Variance inflation factor for each predictor in the fitted model.
vifhw7 <- vif(hw7model2)

print(vifhw7)
##  DA0AT21R  DPSTURNR  DPSTKIDR DPFEAINSP  DZEXADMP 
##  1.091423  1.139951  1.222410  1.389638  1.602137

cor

# Subset the cleaned data to the five model predictors.
predictors <- districtbasehw7cleaned[, c("DA0AT21R", "DPSTURNR",
                                     "DPSTKIDR", "DPFEAINSP", "DZEXADMP")]

# Pairwise correlation matrix of the predictors. This is the "cor" half of the
# multicollinearity check; previously the columns were only printed and the
# correlations were never computed. Large absolute off-diagonal values would
# point to collinear predictors.
cor_hw7 <- cor(predictors)
print(round(cor_hw7, 3))

# Preview of the predictor columns themselves.
print(predictors)
## # A tibble: 1,071 × 5
##    DA0AT21R DPSTURNR DPSTKIDR DPFEAINSP DZEXADMP
##       <dbl>    <dbl>    <dbl>     <dbl>    <dbl>
##  1     96.7     19.1     12.3      49.6      9.1
##  2     96       13.9     11        60.3      6.9
##  3     95.4     21.6     10.8      54.2      8.3
##  4     95.8     18.3     11.3      53.7     10.7
##  5     93.7     17.9     12.9      54.6      8.3
##  6     94.5     30.6     11        50.6      8.5
##  7     96.7     14.6      9.3      61.7      6.4
##  8     92.8     11.5     14.4      56.7      7.6
##  9     97.3     17       14.8      58.7     12.8
## 10     95.2      9.5     13.2      58.5      4.6
## # ℹ 1,061 more rows
  1. does your model meet those assumptions? You don’t have to be perfectly right, just make a good case.

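I think that, of the formal tests, the only one that is not violated is the rainbow test: its p-value (0.6283) is greater than 0.05, so the linearity assumption is not rejected. The VIF values (all below 2) also suggest that multicollinearity is not a problem. The Durbin-Watson, Breusch-Pagan, and Shapiro-Wilk tests all have p-values below 0.05, which points to violations of independence, homoscedasticity, and normality of residuals.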

  1. If your model violates an assumption, which one?

The easiest violation to see is homoscedasticity, through its visual representation (the scale-location plot). The red line clearly curves, and many of the plotted points fall very far from the line. This shows that my data are heteroscedastic, which agrees with the Breusch-Pagan test (p-value = 1.295e-10).

  1. What would you do to mitigate this assumption? Show your work.

I would address the heteroscedasticity by squaring my predictor variables. I think it helps flatten the curve of the red line, but there is more work to be done: the data points are still much too clumped together.

# Refit the model with every predictor squared; I() makes ^ act as arithmetic
# exponentiation inside the formula instead of a formula operator.
hw7model2_sq <- lm(
  DAGC4X21R ~ I(DA0AT21R^2) + I(DPSTURNR^2) + I(DPSTKIDR^2) +
    I(DPFEAINSP^2) + I(DZEXADMP^2),
  data = districtbasehw7cleaned
)

# Default diagnostic plots for the squared-predictor model.
plot(hw7model2_sq)