library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
# Read the workbook "district.xls" (path is relative to the working
# directory) into TEA_Data using readxl's defaults.
TEA_Data <- read_excel("district.xls")

# Fit an ordinary least-squares model of DZRVLOCP on DPETALLC and COMMTYPE.
# The model only needs to be fitted once; every diagnostic below reuses
# this object.
Distric_model <- lm(DZRVLOCP ~ DPETALLC + COMMTYPE, data = TEA_Data)

# Diagnostic plot 1 of plot.lm: residuals vs fitted values.
plot(Distric_model, which = 1)

#The residuals vs. fitted plot suggests that the relationship is not linear, because the residuals do not follow the red line. I can visibly observe that the majority of observations are stacked vertically.

# Rainbow test for linearity on the fitted model. The package prefix makes
# sure the lmtest implementation is the one that runs, whatever else has
# been attached.
lmtest::raintest(Distric_model)
## 
##  Rainbow test
## 
## data:  Distric_model
## Rain = 1.0247, df1 = 601, df2 = 591, p-value = 0.3832

#The rainbow test (p-value = 0.3832) does not reject the null hypothesis of linearity, so it gives no evidence against a linear relationship.

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
# Durbin-Watson test for autocorrelation in the residuals of Distric_model
# (two-sided: the printed alternative hypothesis is rho != 0).
# NOTE(review): car computes this p-value by simulation by default, so it
# may vary slightly between runs - confirm whether a seed should be set.
car::durbinWatsonTest(Distric_model)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.2530837      1.493002       0
##  Alternative hypothesis: rho != 0

#The Durbin-Watson test shows a p-value of 0 with a D-W statistic of 1.49 and a lag-1 autocorrelation of 0.25, so we reject the null hypothesis of no autocorrelation: the errors appear to be positively correlated.

# Studentized Breusch-Pagan test for heteroscedasticity of the fitted
# model, called through the lmtest namespace explicitly.
lmtest::bptest(Distric_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  Distric_model
## BP = 118.25, df = 9, p-value < 2.2e-16

#The p-value is very small (< 2.2e-16), so we reject the null hypothesis of constant variance. The model is heteroscedastic.

# Diagnostic plot 3 of plot.lm: the scale-location plot, used to judge
# visually whether the residual spread is constant across fitted values.
plot(x = Distric_model, which = 3)

#This model with these variables does not explain the outcome well. By visual inspection, the data we have here is heteroscedastic.

# Diagnostic plot 2 of plot.lm: the normal Q-Q plot of the residuals,
# used to judge visually whether they follow a normal distribution.
plot(x = Distric_model, which = 2)

#The residuals appear to be approximately normally distributed because the majority of observations fall along the dotted line.

# Shapiro-Wilk normality test on the residuals stored in the fitted model.
# The argument expression is kept as-is so the printed "data:" label does
# not change.
stats::shapiro.test(Distric_model$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  Distric_model$residuals
## W = 0.96612, p-value = 3.905e-16

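#The Shapiro-Wilk test does not agree with the visual impression of normality: with W = 0.96612 and a p-value of 3.905e-16 we reject the null hypothesis of normality, so the residuals are not normally distributed.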

# Fit the model under the name kitchen_sink. The formula and data are the
# same as those used for Distric_model earlier in the script.
kitchen_sink <- lm(
  formula = DZRVLOCP ~ DPETALLC + COMMTYPE,
  data = TEA_Data
)

# Variance inflation factors for the model terms. car reports generalized
# VIFs (GVIF) here because COMMTYPE has more than one degree of freedom.
car::vif(kitchen_sink)
##            GVIF Df GVIF^(1/(2*Df))
## DPETALLC 2.3955  1        1.547740
## COMMTYPE 2.3955  8        1.056118

#The VIF shows that the variables are not strongly correlated (both adjusted GVIF values are below 2). I'm starting to see that more variables go into local and state funding than just the community type and population.