1. Load your preferred dataset into R studio
  2. Create a linear model “lm()” from the variables, with a continuous dependent variable as the outcome
  3. Check the following assumptions:
  1. Linearity (plot and raintest)
  2. Independence of errors (durbin-watson)
  3. Homoscedasticity (plot, bptest)
  4. Normality of residuals (QQ plot, shapiro test)
  5. No multicolinarity (VIF, cor)
  1. does your model meet those assumptions? You don’t have to be perfectly right, just make a good case.
  2. If your model violates an assumption, which one?
  3. What would you do to mitigate this assumption? Show your work.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
district_data <- read_excel("district.xls")
library(dplyr)
district_data <- district_data %>% rename(Four_Year_Grad_Rate_Class_2021 = DAGC4X21R )
district_data <- district_data %>% rename (Number_of_Students_Per_Teacher = DPSTKIDR)
district_data <- district_data %>% rename (Percentage_African_American_Students = DPETBLAP)
district_data <- district_data %>% rename (Percentage_White_Students = DPETWHIP)
district_data <- district_data %>% rename (Perecentage_Hispanic_Students = DPETHISP)
district_data <- district_data %>% rename (Spending_Per_Pupil = DPFEAOPFK)
district_data <- district_data %>% rename (Revenue_Per_Pupil = DPFRAALLK)
District_Data_Frame <- district_data %>% dplyr:: select(Four_Year_Grad_Rate_Class_2021, Number_of_Students_Per_Teacher, Percentage_African_American_Students, Percentage_White_Students, Perecentage_Hispanic_Students, Spending_Per_Pupil, Revenue_Per_Pupil)

District_Data_Frame_Clean <- District_Data_Frame |> drop_na()
Revenue_Funding_Model<-lm(Revenue_Per_Pupil ~ Number_of_Students_Per_Teacher + Percentage_African_American_Students + Percentage_White_Students + Perecentage_Hispanic_Students + Spending_Per_Pupil, data = District_Data_Frame_Clean)
# Linearity (plot and raintest)
library(tidyverse)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
plot(Revenue_Funding_Model, which =1)

raintest (Revenue_Funding_Model) #High p-value so data is linear
## 
##  Rainbow test
## 
## data:  Revenue_Funding_Model
## Rain = 0.89788, df1 = 536, df2 = 530, p-value = 0.893
#Independence of errors (durbin-watson)
library (car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
durbinWatsonTest(Revenue_Funding_Model)
##  lag Autocorrelation D-W Statistic p-value
##    1      0.01629246      1.967343   0.394
##  Alternative hypothesis: rho != 0
#The p-value is greater than .05 which means the errors are independent
#Homoscedasticity (plot, bptest)
plot(Revenue_Funding_Model, which =3)

bptest(Revenue_Funding_Model) #The p-value is less than .05, so we can reject homoscedasticity and assume the model is heteroscedastic
## 
##  studentized Breusch-Pagan test
## 
## data:  Revenue_Funding_Model
## BP = 24.816, df = 5, p-value = 0.0001512
# Normality of residuals (QQ plot, shapiro test)
plot(Revenue_Funding_Model, which=2)

shapiro.test(Revenue_Funding_Model$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  Revenue_Funding_Model$residuals
## W = 0.45648, p-value < 2.2e-16
#The p-value is actually below .05 which signifies that the residuals are significantly different from a normal distribution
# No multicolinarity (VIF, cor)
vif(Revenue_Funding_Model)
##       Number_of_Students_Per_Teacher Percentage_African_American_Students 
##                             1.675831                             8.162873 
##            Percentage_White_Students        Perecentage_Hispanic_Students 
##                            33.877401                            30.575220 
##                   Spending_Per_Pupil 
##                             1.542493
Revenue_Funding_Model_vars<-District_Data_Frame_Clean %>% dplyr::select(Revenue_Per_Pupil, Number_of_Students_Per_Teacher, Percentage_African_American_Students, Percentage_White_Students, Perecentage_Hispanic_Students, Spending_Per_Pupil)
cor(Revenue_Funding_Model_vars)
##                                      Revenue_Per_Pupil
## Revenue_Per_Pupil                           1.00000000
## Number_of_Students_Per_Teacher             -0.43468242
## Percentage_African_American_Students       -0.12235867
## Percentage_White_Students                   0.04852799
## Perecentage_Hispanic_Students               0.03851050
## Spending_Per_Pupil                          0.73433239
##                                      Number_of_Students_Per_Teacher
## Revenue_Per_Pupil                                        -0.4346824
## Number_of_Students_Per_Teacher                            1.0000000
## Percentage_African_American_Students                      0.2225031
## Percentage_White_Students                                -0.3013131
## Perecentage_Hispanic_Students                             0.1765717
## Spending_Per_Pupil                                       -0.5584965
##                                      Percentage_African_American_Students
## Revenue_Per_Pupil                                              -0.1223587
## Number_of_Students_Per_Teacher                                  0.2225031
## Percentage_African_American_Students                            1.0000000
## Percentage_White_Students                                      -0.3323749
## Perecentage_Hispanic_Students                                  -0.1414734
## Spending_Per_Pupil                                             -0.1108545
##                                      Percentage_White_Students
## Revenue_Per_Pupil                                   0.04852799
## Number_of_Students_Per_Teacher                     -0.30131306
## Percentage_African_American_Students               -0.33237491
## Percentage_White_Students                           1.00000000
## Perecentage_Hispanic_Students                      -0.87022674
## Spending_Per_Pupil                                  0.01594008
##                                      Perecentage_Hispanic_Students
## Revenue_Per_Pupil                                        0.0385105
## Number_of_Students_Per_Teacher                           0.1765717
## Percentage_African_American_Students                    -0.1414734
## Percentage_White_Students                               -0.8702267
## Perecentage_Hispanic_Students                            1.0000000
## Spending_Per_Pupil                                       0.0741356
##                                      Spending_Per_Pupil
## Revenue_Per_Pupil                            0.73433239
## Number_of_Students_Per_Teacher              -0.55849652
## Percentage_African_American_Students        -0.11085449
## Percentage_White_Students                    0.01594008
## Perecentage_Hispanic_Students                0.07413560
## Spending_Per_Pupil                           1.00000000
#The first two assumptions are met, but the last three are not met. Homoscedasticity can be mitigated with a log-transformation of the dependent variable. Normality of residuals can be mitigated with log-tranformation as well. To resolve the assumption of no multicolinarity, we can remove the strongly correlated variables. 
District_Data_Frame_Clean_2<- District_Data_Frame_Clean |> filter(Number_of_Students_Per_Teacher > 0, )
District_Data_Frame_Clean_2 <- District_Data_Frame_Clean |> filter(if_all(everything(), ~ .x >= 0))
Revenue_Funding_Model_Log<-lm(log(Revenue_Per_Pupil) ~ log (Number_of_Students_Per_Teacher) + Percentage_African_American_Students + Percentage_White_Students + Perecentage_Hispanic_Students + Spending_Per_Pupil, data = District_Data_Frame_Clean_2)
bptest(Revenue_Funding_Model_Log) #The p-value still reports a number less than . 05, so it continues to fail the assumption of homoscedasticity. 
## 
##  studentized Breusch-Pagan test
## 
## data:  Revenue_Funding_Model_Log
## BP = 176.67, df = 5, p-value < 2.2e-16
plot(Revenue_Funding_Model_Log, which=2)

shapiro.test(Revenue_Funding_Model_Log$residuals) #The P-value still remains below .05, staying as a non-normal distribution
## 
##  Shapiro-Wilk normality test
## 
## data:  Revenue_Funding_Model_Log$residuals
## W = 0.77214, p-value < 2.2e-16
Revenue_Funding_Model_Altered<-lm(Revenue_Per_Pupil ~ Number_of_Students_Per_Teacher + Percentage_African_American_Students + Percentage_White_Students + Spending_Per_Pupil, data = District_Data_Frame_Clean_2)
vif(Revenue_Funding_Model_Altered)
##       Number_of_Students_Per_Teacher Percentage_African_American_Students 
##                             1.764729                             1.149972 
##            Percentage_White_Students                   Spending_Per_Pupil 
##                             1.269839                             1.557368
#This assumption was fixed by removing the second highest correlation which was Hispanic students at 30. After removing this variable, the rest of the variables fall below 5.