Example 1 -
In this example, we use sugar intake and water intake to predict weight gain.
# Sugar intake
s <- c(5, 8, 9, 10, 15, 18, 14, 17, 20, 22, 24, 26, 30, 30, 32, 35, 40, 20, 25, 35)
# Water intake
w <- c(1000, 1100, 900, 2000, 1800, 1100, 1400, 2200, 2600, 2400, 3200, 1900, 2050, 2100, 2200, 2200, 1100, 1300, 1900, 1400)
# Weight gained
wg <- c(20, 30, 60, 70, 100, 95, 70, 83, 103, 112, 130, 80, 95, 130, 112, 100, 105, 84, 82, 91)
data1 <- data.frame(s, w, wg)
#data
names(data1)
## [1] "s" "w" "wg"
str(data1)
## 'data.frame': 20 obs. of 3 variables:
## $ s : num 5 8 9 10 15 18 14 17 20 22 ...
## $ w : num 1000 1100 900 2000 1800 1100 1400 2200 2600 2400 ...
## $ wg: num 20 30 60 70 100 95 70 83 103 112 ...
summary(data1)
## s w wg
## Min. : 5.00 Min. : 900 Min. : 20.0
## 1st Qu.:14.75 1st Qu.:1250 1st Qu.: 77.5
## Median :21.00 Median :1900 Median : 93.0
## Mean :21.75 Mean :1792 Mean : 87.6
## 3rd Qu.:30.00 3rd Qu.:2200 3rd Qu.:103.5
## Max. :40.00 Max. :3200 Max. :130.0
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.6.2
## Loading required package: lattice
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.6.2
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
#We check the dataset for correlations between the predictors/IVs and the outcome/DV.
#OPTION 1
#let's first check the relationship between sugar intake and water intake
cor(data1$s, data1$w, use = "complete.obs", method = "pearson")
## [1] 0.3053748
cor.test(data1$s, data1$w, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data1$s and data1$w
## t = 1.3606, df = 18, p-value = 0.1904
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1585750 0.6588606
## sample estimates:
## cor
## 0.3053748
# Results show r = 0.31, a medium correlation, but it is not significant (p = 0.1904)
#let's now check the relationship between water intake and weight gain
cor(data1$w, data1$wg, use = "complete.obs", method = "pearson")
## [1] 0.6637163
cor.test(data1$w, data1$wg, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data1$w and data1$wg
## t = 3.7647, df = 18, p-value = 0.001419
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3131787 0.8550901
## sample estimates:
## cor
## 0.6637163
# Results show r = 0.66, a strong correlation, and it is significant (p = 0.001419)
#OPTION 2 - rather than running separate tests, we can preferably use a correlation matrix
DataMatrix1 <- as.matrix(data1[, c("s", "w", "wg")])
rcorr(DataMatrix1)
## s w wg
## s 1.00 0.31 0.70
## w 0.31 1.00 0.66
## wg 0.70 0.66 1.00
##
## n= 20
##
##
## P
## s w wg
## s 0.1904 0.0005
## w 0.1904 0.0014
## wg 0.0005 0.0014
#Results show a medium, non-significant correlation b/t sugar and water, and strong, significant correlations b/t each predictor and weight gain
#So we can use sugar and water intake as IVs and weight gain as the DV in our multiple regression model
# 1. The type of variables - the data is continuous
# 2. Non-zero variance:
plot(density(data1$wg))
plot(density(data1$w))
plot(density(data1$s))
#Graphs demonstrate there is variability in each variable (non-zero variance)
#Multicollinearity - checked AFTER the model is built: make sure there are no multicollinearity issues (sugar and water are not significantly correlated)
#VIF
#vif(model3)
#Predictors are not correlated with extraneous variables - no third-variable issue
#Heteroscedasticity:
library(car)
## Warning: package 'car' was built under R version 3.6.2
## Loading required package: carData
scatterplot(wg ~ s, data = data1)
scatterplot(wg ~ w, data = data1)
#Independent errors - randomly selected sample so we are good here
#Normally distributed errors: we perform this step after the model is built
library(moments)
#shapiro.test(model$residuals)
#Independence - randomly selected sample so we are good here
#Linearity:
scatterplot(wg ~ s, data = data1)
scatterplot(wg ~ w, data = data1)
After checking all the assumptions, we move on to creating the models.
# Now we create the model:
model1 <- lm(wg ~ s, data1) # IV: sugar, DV: weight
summary(model1)
##
## Call:
## lm(formula = wg ~ s, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.167 -14.556 -1.119 15.887 37.909
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.1869 11.3395 3.897 0.001057 **
## s 1.9960 0.4762 4.191 0.000549 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.64 on 18 degrees of freedom
## Multiple R-squared: 0.4939, Adjusted R-squared: 0.4658
## F-statistic: 17.57 on 1 and 18 DF, p-value: 0.0005488
#F-statistic: 17.57 on 1 and 18 DF, p-value: 0.0005488 - the overall model is significant. The sugar slope is significant and the intercept is also significant, so this CAN be used as a good regression model
# Multiple R-squared: 0.4939 - the model explains about 49% of the variance
AIC(model1) #Akaike Information Criterion
## [1] 181.7333
BIC(model1) #Bayesian Information Criterion
## [1] 184.7205
model2 <- lm(wg ~ w, data1) #IV: water, DV: weight
summary(model2)
##
## Call:
## lm(formula = wg ~ w, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.376 -9.684 -0.395 12.001 38.568
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.808755 15.341490 2.139 0.04643 *
## w 0.030567 0.008119 3.765 0.00142 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.7 on 18 degrees of freedom
## Multiple R-squared: 0.4405, Adjusted R-squared: 0.4094
## F-statistic: 14.17 on 1 and 18 DF, p-value: 0.001419
# F(1,18) = 14.17, p = .001, R2 = .44
AIC(model2) #Akaike Information Criterion
## [1] 183.7389
BIC(model2) #Bayesian Information Criterion
## [1] 186.7261
model3 <- lm(wg ~ s + w, data1) #IV: sugar and water, DV: weight
summary(model3)
##
## Call:
## lm(formula = wg ~ s + w, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.2851 -11.8333 -0.6146 10.2072 29.0703
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.642588 12.286212 1.029 0.317896
## s 1.566433 0.385277 4.066 0.000804 ***
## w 0.022810 0.006247 3.651 0.001977 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.9 on 17 degrees of freedom
## Multiple R-squared: 0.7163, Adjusted R-squared: 0.683
## F-statistic: 21.47 on 2 and 17 DF, p-value: 2.232e-05
# F(2,17) = 21.47, p < .001, R2 = .72
AIC(model3) #Akaike Information Criterion
## [1] 172.1542
BIC(model3) #Bayesian Information Criterion
## [1] 176.1372
#AIC and BIC scores: the smaller, the better.
# To compare BICs we can use exp((BIC larger - BIC smaller)/2) as an approximate Bayes factor, or simply take the difference (larger minus smaller). Differences of 2 - 6 give positive support, 6 - 10 strong support, and > 10 very strong support for the model with the SMALLER BIC.
# In this case model3 (BIC 176.1) beats model1 (BIC 184.7) by about 8.6 and model2 (BIC 186.7) by about 10.6 - strong to very strong support for model 3 with 2 predictors over either single-predictor model.
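A minimal sketch of that arithmetic, reusing the BIC values computed above:
delta_bic <- BIC(model1) - BIC(model3) # about 8.6 - strong support for model3
exp(delta_bic / 2) # approximate Bayes factor in favor of model3, roughly 73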
#Let's check whether there is a significant interaction effect
model4 <- lm(wg ~ s*w, data1) #IVs: sugar, water, and their interaction; DV: weight gain
summary(model4)
##
## Call:
## lm(formula = wg ~ s * w, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.805 -10.720 -2.461 9.066 28.697
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.018e+01 2.657e+01 -0.383 0.7066
## s 2.684e+00 1.216e+00 2.207 0.0422 *
## w 3.940e-02 1.822e-02 2.162 0.0461 *
## s:w -7.659e-04 7.902e-04 -0.969 0.3468
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.93 on 16 degrees of freedom
## Multiple R-squared: 0.7321, Adjusted R-squared: 0.6818
## F-statistic: 14.57 on 3 and 16 DF, p-value: 7.737e-05
# F(3,16) = 14.57, p < .001, R2 = .73 - the interaction term s:w is not significant (p = 0.3468), so there is no significant interaction effect b/w sugar and water on weight gain
AIC(model4) #Akaike Information Criterion
## [1] 173.013
BIC(model4) #Bayesian Information Criterion
## [1] 177.9917
# AIC and BIC are slightly higher than model3's, so the interaction does not improve the fit
#Compare nested models using an F-test - Analysis of Variance Table
anova(model1, model2) #model1 and model2 are not nested (and have equal residual df), so no F-test is produced
## Analysis of Variance Table
##
## Model 1: wg ~ s
## Model 2: wg ~ w
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 18 7665.8
## 2 18 8474.3 0 -808.57
anova(model1, model3) #significant difference b/w model1 and 3 [Pr(>F) or p-value = 0.001977]
## Analysis of Variance Table
##
## Model 1: wg ~ s
## Model 2: wg ~ s + w
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 18 7665.8
## 2 17 4296.5 1 3369.2 13.331 0.001977 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since we have decided that model 3 is better, we can then write out the regression model equation - Y = 12.64 + 1.57(sugar) + 0.023(water)
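To avoid transcription errors, the coefficients can also be pulled straight from the fitted object; a quick sketch (the values match summary(model3) above):
coef(model3) # (Intercept) ~ 12.64, s ~ 1.57, w ~ 0.023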
library(QuantPsyc)
## Warning: package 'QuantPsyc' was built under R version 3.6.3
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:survival':
##
## aml
## The following object is masked from 'package:lattice':
##
## melanoma
## Loading required package: MASS
##
## Attaching package: 'QuantPsyc'
## The following object is masked from 'package:base':
##
## norm
lm.beta(model3) #for reference only
## s w
## 0.5515316 0.4952925
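Equivalently, the standardized betas can be reproduced by refitting the model on z-scored variables - a small sketch for cross-checking lm.beta:
data1_z <- as.data.frame(scale(data1)) # z-score every variable
model3_std <- lm(wg ~ s + w, data = data1_z)
coef(model3_std)[-1] # slopes ~ 0.55 (s) and ~ 0.50 (w), matching lm.beta(model3)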
Now that we have decided on model 3, we still need to check two more assumptions - multicollinearity and normally distributed errors
#Multicollinearity - now that the model is built, make sure there are no multicollinearity issues: sugar and water are not significantly correlated.
#VIF
vif(model3)
## s w
## 1.102844 1.102844
#VIF values < 3 suggest that neither predictor violates the multicollinearity assumption
#Normally distributed errors: we perform this step after the model is built
library(moments) # note: shapiro.test itself comes from base stats; moments provides skewness/kurtosis checks if needed
shapiro.test(model3$residuals) # p = 0.7337: we fail to reject the null hypothesis of normality, so we are good to continue
##
## Shapiro-Wilk normality test
##
## data: model3$residuals
## W = 0.969, p-value = 0.7337
plot(density(model3$residuals)) # the density plot shows a roughly normal distribution of the residuals
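The scatterplots earlier were only a visual check for heteroscedasticity; with the fitted model in hand, the car package (already loaded) offers a formal score test - a minimal sketch:
ncvTest(model3) # non-constant variance score test; a non-significant p supports constant error variance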
#Influential observations and outliers (Cook's distance):
cutoff <- 4/(nrow(data1) - length(model1$coefficients)) # 4/(n - k - 1), where k is the number of predictors
plot(model1, which = 4, cook.levels = cutoff)
outlierTest(model1) # largest |rstudent| has unadjusted p = 0.056 and no Bonferroni p < 0.05, so we fail to reject the null hypothesis - no significant outliers
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 11 2.047996 0.056331 NA
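The influence checks above were run on model1; since model3 is the chosen model, the same diagnostics can be mirrored on it - a sketch:
cutoff3 <- 4/(nrow(data1) - length(model3$coefficients)) # 4/(n - k - 1), with k predictors
plot(model3, which = 4, cook.levels = cutoff3)
outlierTest(model3)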
A multiple linear regression was conducted to predict participants' weight gain from their sugar and water intake. All the regression assumptions were met, and no further adjustments were made. A significant regression equation was found (F(2,17) = 21.47, p < .001), with an R-squared of .72, which suggests that 72% of the variance in participants' weight gain can be explained by the two predictors. Both sugar (t = 4.07, p < .001, b = 1.57) and water (t = 3.65, p = .002, b = .02) were statistically significant predictors. Holding water constant, each additional unit of sugar intake predicts a 1.57-gram increase in weight gain; holding sugar constant, each additional unit of water intake predicts a 0.02-gram increase.
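As a usage sketch, the final model can also generate predictions for new intake values (the intakes below are hypothetical, chosen purely for illustration):
new_obs <- data.frame(s = 20, w = 2000) # hypothetical sugar and water intakes
predict(model3, newdata = new_obs, interval = "prediction")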
Example 2 -
Our goal is to determine whether the number of times the ad is watched and age can significantly predict rating.
library(readxl)
## Warning: package 'readxl' was built under R version 3.6.3
setwd("E:/mikhilesh/HU Sem VI ANLY 510 and 506/ANLY 510 Kao Principals and Applications/Lecture and other materials")
data2 <- read_xlsx("Lecture 8 MultipleRegressionExample.xlsx")
names(data2)
## [1] "Ad" "Age" "Sex" "Rating"
str(data2)
## Classes 'tbl_df', 'tbl' and 'data.frame': 504 obs. of 4 variables:
## $ Ad : num 0 0 1 1 2 2 0 0 1 1 ...
## $ Age : num 55 56 56 54 36 36 75 42 61 41 ...
## $ Sex : chr "male" "female" "male" "female" ...
## $ Rating: num 4.5 19.4 36.4 49.6 75.6 81.6 5.5 28.2 35.9 60.1 ...
summary(data2)
## Ad Age Sex Rating
## Min. :0 Min. :18.00 Length:504 Min. : 0.00
## 1st Qu.:0 1st Qu.:33.75 Class :character 1st Qu.: 23.88
## Median :1 Median :45.00 Mode :character Median : 43.70
## Mean :1 Mean :45.96 Mean : 46.00
## 3rd Qu.:2 3rd Qu.:60.00 3rd Qu.: 68.17
## Max. :2 Max. :75.00 Max. :103.60
library(Hmisc)
#We check the dataset for correlations between the predictors/IVs and the outcome/DV.
#OPTION 1
#let's first check the relationship between Ad and Rating
cor(data2$Rating, data2$Ad, use = "complete.obs", method = "pearson")
## [1] 0.9068833
cor.test(data2$Rating, data2$Ad, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data2$Rating and data2$Ad
## t = 48.22, df = 502, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8900406 0.9212536
## sample estimates:
## cor
## 0.9068833
# Results show r = 0.91, a strong correlation, and it is significant (p < .05)
#let's now check the relationship between Age and Rating
cor(data2$Rating, data2$Age, use = "complete.obs", method = "pearson")
## [1] -0.1761425
cor.test(data2$Rating, data2$Age, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data2$Rating and data2$Age
## t = -4.0092, df = 502, p-value = 7.017e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.25949194 -0.09018839
## sample estimates:
## cor
## -0.1761425
# Results show r = -0.18, a weak correlation, but it is significant (p < .05)
#OPTION 2 - rather than running separate tests, we can preferably use a correlation matrix
DataMatrix2 <- as.matrix(data2[, c("Ad", "Age", "Rating")])
rcorr(DataMatrix2)
## Ad Age Rating
## Ad 1.00 -0.04 0.91
## Age -0.04 1.00 -0.18
## Rating 0.91 -0.18 1.00
##
## n= 504
##
##
## P
## Ad Age Rating
## Ad 0.3561 0.0000
## Age 0.3561 0.0000
## Rating 0.0000 0.0000
#Results show a STRONG, significant correlation b/t Ad and Rating, and a weak but significant correlation b/t Age and Rating
#So we can use Ad and Age as IVs and Rating as the DV in our multiple regression model
# 1. The type of variables - the data is continuous
# 2. Non-zero variance:
plot(density(data2$Rating))
plot(density(data2$Ad))
plot(density(data2$Age))
#Graphs demonstrate there is variability in each variable (non-zero variance)
#Multicollinearity - checked AFTER the model is built: make sure there are no multicollinearity issues (Ad and Age are not significantly correlated)
#VIF
#vif(model3)
#Predictors are not correlated with extraneous variables - no third-variable issue
#Heteroscedasticity:
library(car)
scatterplot(Rating ~ Ad, data = data2)
scatterplot(Rating ~ Age, data = data2)
#Independent errors - randomly selected sample so we are good here
#Normally distributed errors: we perform this step after the model is built
library(moments)
#shapiro.test(model$residuals)
#Independence - randomly selected sample so we are good here
#Linearity:
scatterplot(Rating ~ Ad, data = data2)
scatterplot(Rating ~ Age, data = data2)
After checking all the assumptions, we move on to creating the models.
# Now we create the model:
model5 <- lm(Rating ~ Ad, data2) # IV: Ad, DV: Rating
summary(model5)
##
## Call:
## lm(formula = Rating ~ Ad, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.013 -8.322 -0.951 7.921 28.087
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.4878 0.7901 20.87 <2e-16 ***
## Ad 29.5128 0.6120 48.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.22 on 502 degrees of freedom
## Multiple R-squared: 0.8224, Adjusted R-squared: 0.8221
## F-statistic: 2325 on 1 and 502 DF, p-value: < 2.2e-16
#F-statistic: 2325 on 1 and 502 DF, p-value < 2.2e-16 - the overall model is significant. The Ad slope is significant and the intercept is also significant, so this CAN be used as a good regression model
# Multiple R-squared: 0.8224 - the model explains more than 80% of the variance
AIC(model5) #Akaike Information Criterion
## [1] 3871.232
BIC(model5) #Bayesian Information Criterion
## [1] 3883.9
model6 <- lm(Rating ~ Age, data2) #IV: Age, DV: Rating
summary(model6)
##
## Call:
## lm(formula = Rating ~ Age, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.297 -23.285 -1.919 22.568 54.715
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59.30913 3.51877 16.855 < 2e-16 ***
## Age -0.28957 0.07222 -4.009 7.02e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.21 on 502 degrees of freedom
## Multiple R-squared: 0.03103, Adjusted R-squared: 0.0291
## F-statistic: 16.07 on 1 and 502 DF, p-value: 7.017e-05
# F(1,502) = 16.07, p < .001, R2 = 0.03
AIC(model6) #Akaike Information Criterion
## [1] 4726.477
BIC(model6) #Bayesian Information Criterion
## [1] 4739.145
model7 <- lm(Rating ~ Ad + Age, data2) #IV: Ad + Age, DV: Rating
summary(model7)
##
## Call:
## lm(formula = Rating ~ Ad + Age, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.046 -7.393 -1.536 7.728 25.997
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.17791 1.55844 17.439 < 2e-16 ***
## Ad 29.32643 0.57890 50.659 < 2e-16 ***
## Age -0.22854 0.02924 -7.815 3.26e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.6 on 501 degrees of freedom
## Multiple R-squared: 0.8417, Adjusted R-squared: 0.8411
## F-statistic: 1332 on 2 and 501 DF, p-value: < 2.2e-16
# F(2,501) = 1332, p < .001, R2 = 0.8417
AIC(model7) #Akaike Information Criterion
## [1] 3815.258
BIC(model7) #Bayesian Information Criterion
## [1] 3832.148
#AIC and BIC scores: the smaller, the better.
# To compare BICs we can use exp((BIC larger - BIC smaller)/2) as an approximate Bayes factor, or simply take the difference (larger minus smaller). Differences of 2 - 6 give positive support, 6 - 10 strong support, and > 10 very strong support for the model with the SMALLER BIC.
# In this case model7 (BIC 3832.1) beats model5 (BIC 3883.9) by about 52 and model6 (BIC 4739.1) by about 907 - both differences are far above 10, so very strong support for model 7 with 2 predictors over either single-predictor model.
#Let's check whether there is a significant interaction effect
model8 <- lm(Rating ~ Ad*Age, data2) #IVs: Ad, Age, and their interaction; DV: Rating
summary(model8)
##
## Call:
## lm(formula = Rating ~ Ad * Age, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.043 -7.471 -1.572 7.612 26.061
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.455351 2.221037 12.362 < 2e-16 ***
## Ad 29.032870 1.770343 16.400 < 2e-16 ***
## Age -0.234501 0.044843 -5.229 2.5e-07 ***
## Ad:Age 0.006388 0.036399 0.175 0.861
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.61 on 500 degrees of freedom
## Multiple R-squared: 0.8417, Adjusted R-squared: 0.8408
## F-statistic: 886.5 on 3 and 500 DF, p-value: < 2.2e-16
# F(3,500) = 886.5, p < .001, R2 = 0.8417 - the interaction term Ad:Age is not significant (p = 0.861), so there is no significant interaction effect b/w Ad and Age on Rating
AIC(model8) #Akaike Information Criterion
## [1] 3817.227
BIC(model8) #Bayesian Information Criterion
## [1] 3838.34
# AIC and BIC are slightly higher than model7's, so the interaction does not improve the fit
#Compare nested models using an F-test - Analysis of Variance Table
anova(model5, model6) #model5 and model6 are not nested (and have equal residual df), so no F-test is produced
## Analysis of Variance Table
##
## Model 1: Rating ~ Ad
## Model 2: Rating ~ Age
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 502 63184
## 2 502 344802 0 -281617
anova(model5, model7) #significant difference b/w model5 and 7 [Pr(>F) = 3.258e-14]
## Analysis of Variance Table
##
## Model 1: Rating ~ Ad
## Model 2: Rating ~ Ad + Age
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 502 63184
## 2 501 56319 1 6865.5 61.075 3.258e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since we have decided that model 7 is better, we can then write out the regression model equation - Rating = 27.18 + 29.33(Ad) - 0.23(Age)
library(QuantPsyc)
lm.beta(model7) #for reference only
## Ad Age
## 0.9011564 -0.1390201
Now that we have decided on model 7, we still need to check two more assumptions - multicollinearity and normally distributed errors
#Multicollinearity - now that the model is built, make sure there are no multicollinearity issues: Ad and Age are not significantly correlated.
#VIF
vif(model7)
## Ad Age
## 1.0017 1.0017
#VIF values < 3 suggest that neither predictor violates the multicollinearity assumption
#Normally distributed errors: we perform this step after the model is built
library(moments)
shapiro.test(model7$residuals) # p-value < 0.05: we REJECT the null hypothesis of normality - the residuals deviate from normality (with n = 504 the tests are fairly robust, but the assumption is technically violated)
##
## Shapiro-Wilk normality test
##
## data: model7$residuals
## W = 0.95248, p-value = 1.176e-11
plot(density(model7$residuals)) # inspect the residual distribution visually; consistent with the Shapiro-Wilk result above, it should not be treated as normal
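Because the Shapiro-Wilk test rejects normality here, a Q-Q plot helps judge how severe the deviation is - a sketch using car::qqPlot (car was loaded earlier):
qqPlot(model7) # points outside the confidence envelope flag problematic residuals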
cutoff <- 4/(nrow(data2) - length(model5$coefficients)) # 4/(n - k - 1), where k is the number of predictors
plot(model5, which = 4, cook.levels = cutoff)
outlierTest(model5) # both Bonferroni p-values < 0.05, so we reject the null hypothesis - observations 503 and 504 are significant outliers
## rstudent unadjusted p-value Bonferroni p
## 503 -5.811569 1.1022e-08 5.5549e-06
## 504 -5.576195 4.0252e-08 2.0287e-05
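One possible follow-up - a sketch, not a prescribed fix: refit the chosen model without the two flagged rows and compare the coefficients before deciding whether excluding them is justified.
model7_trim <- lm(Rating ~ Ad + Age, data = data2[-c(503, 504), ])
summary(model7_trim)$coefficients # compare with summary(model7) above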