library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(zoo)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
# Read the district-level data without changing the session's working
# directory: setwd() silently redirects every later relative path and ties the
# script to one machine's folder layout.
# NOTE(review): the folder below is still machine-specific; consider a
# project-relative location for district.xls -- confirm where it should live.
district_path <- file.path(
  path.expand("~/Desktop/Monday Class"),
  "district.xls"
)

# Fail early with a readable message rather than a lower-level reader error.
if (!file.exists(district_path)) {
  stop("Data file not found: ", district_path, call. = FALSE)
}

district <- read_excel(district_path)
# Fit an ordinary least squares model with DA0912DR21R as the outcome and
# DA0AT21R and DA0CT21R as predictors, then print the coefficient table and
# overall fit statistics.
DistrictModel <- lm(
  formula = DA0912DR21R ~ DA0AT21R + DA0CT21R,
  data = district
)
summary(DistrictModel)
## 
## Call:
## lm(formula = DA0912DR21R ~ DA0AT21R + DA0CT21R, data = district)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.6637 -0.9424 -0.2303  0.6698 28.0421 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 60.626183   2.043604  29.666   <2e-16 ***
## DA0AT21R    -0.624078   0.021918 -28.473   <2e-16 ***
## DA0CT21R    -0.004277   0.002269  -1.885   0.0597 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.002 on 1078 degrees of freedom
##   (126 observations deleted due to missingness)
## Multiple R-squared:  0.462,  Adjusted R-squared:  0.461 
## F-statistic: 462.9 on 2 and 1078 DF,  p-value: < 2.2e-16
# Linearity (visual check): scatterplot of the outcome DA0912DR21R against the
# predictor DA0AT21R. Rows with a missing value in either column are dropped by
# geom_point() with a warning.
ggplot(data = district, mapping = aes(x = DA0AT21R, y = DA0912DR21R)) +
  geom_point()
## Warning: Removed 112 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Linearity (formal check): rainbow test comparing the fit on the central half
# of the observations (fraction = 0.5 is the lmtest default, spelled out here)
# with the fit on the full sample. A small p-value is evidence against a
# linear specification.
raintest(DistrictModel, fraction = 0.5)
## 
##  Rainbow test
## 
## data:  DistrictModel
## Rain = 1.8528, df1 = 541, df2 = 537, p-value = 6.364e-13
# Independence of errors: Durbin-Watson test for first-order autocorrelation
# in the model residuals.
# car::durbinWatsonTest() estimates its p-value by bootstrap simulation by
# default, so the RNG seed is fixed to make the reported p-value reproducible
# from one run to the next.
set.seed(2021)
dwtdistrict <- durbinWatsonTest(DistrictModel)
dwtdistrict
##  lag Autocorrelation D-W Statistic p-value
##    1      0.05241542      1.894533   0.108
##  Alternative hypothesis: rho != 0
# Homoscedasticity (visual check): scale-location plot of the fitted model.
# A roughly flat trend line suggests constant residual variance.
plot(DistrictModel, which = 3)

# Homoscedasticity (formal check): studentized Breusch-Pagan test
# (studentize = TRUE is the lmtest default, spelled out here). The result is
# stored so it can be reused, then printed.
bptestdistrict <- bptest(DistrictModel, studentize = TRUE)
bptestdistrict
## 
##  studentized Breusch-Pagan test
## 
## data:  DistrictModel
## BP = 170.14, df = 2, p-value < 2.2e-16
# Normality of residuals (visual check): normal Q-Q plot of the fitted model.
plot(DistrictModel, which = 2)

# Normality of residuals (formal check): Shapiro-Wilk test on the residuals
# stored in the fitted model object. A small p-value is evidence against
# normally distributed residuals.
shapiro.test(DistrictModel$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  DistrictModel$residuals
## W = 0.72197, p-value < 2.2e-16
# Multicollinearity: variance inflation factor for each predictor in the
# fitted model. Values close to 1 indicate little linear overlap between the
# predictors.
vif(mod = DistrictModel)
## DA0AT21R DA0CT21R 
## 1.094539 1.094539

Homework 7

  1. Load your preferred dataset into RStudio.
  2. Create a linear model with “lm()” from the variables, using a continuous dependent variable as the outcome.
  3. Check the following assumptions:
     1. Linearity (plot and raintest)
     2. Independence of errors (Durbin-Watson)
     3. Homoscedasticity (plot, bptest)
     4. Normality of residuals (Q-Q plot, Shapiro-Wilk test)
     5. No multicollinearity (VIF, cor)
  4. Does your model meet those assumptions? You don’t have to be perfectly right, just make a good case.
  5. If your model violates an assumption, which one?

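Linearity: the scatterplot does not show a linear relationship, and the rainbow test rejects linearity (Rain = 1.8528, p = 6.364e-13), so this assumption is violated.

Independence of errors: the Durbin-Watson statistic is 1.89 with p = 0.108, so there is no evidence of autocorrelation and this assumption is met.

Homoscedasticity: this assumption is violated. The line in the plot is wavy rather than straight, and the Breusch-Pagan test is significant (BP = 170.14, p < 2.2e-16).

Normality: the residuals are not normally distributed (Shapiro-Wilk W = 0.722, p < 2.2e-16), so this assumption is violated.

Multicollinearity: the VIFs (about 1.09 for both predictors) show that the predictors are not highly correlated, so this assumption is met.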

  6. What would you do to mitigate this violation? Show your work.