Multiple Regression

Example 1 -

In this case, we try to use sugar intake and water intake to predict weight gained.

# Sugar intake
s <- c(5, 8, 9, 10, 15, 18, 14, 17, 20, 22, 24, 26, 30, 30, 32, 35, 40, 20, 25, 35)
# Water intake
w <- c(1000, 1100, 900, 2000, 1800, 1100, 1400, 2200, 2600, 2400, 3200, 1900, 2050, 2100, 2200, 2200, 1100, 1300, 1900, 1400)
# Weight gained 
wg <- c(20, 30, 60, 70, 100, 95, 70, 83, 103, 112, 130, 80, 95, 130, 112, 100, 105, 84, 82, 91)

data1 <- data.frame(s, w, wg)
#data
names(data1)
## [1] "s"  "w"  "wg"
str(data1)
## 'data.frame':    20 obs. of  3 variables:
##  $ s : num  5 8 9 10 15 18 14 17 20 22 ...
##  $ w : num  1000 1100 900 2000 1800 1100 1400 2200 2600 2400 ...
##  $ wg: num  20 30 60 70 100 95 70 83 103 112 ...
summary(data1)
##        s               w              wg       
##  Min.   : 5.00   Min.   : 900   Min.   : 20.0  
##  1st Qu.:14.75   1st Qu.:1250   1st Qu.: 77.5  
##  Median :21.00   Median :1900   Median : 93.0  
##  Mean   :21.75   Mean   :1792   Mean   : 87.6  
##  3rd Qu.:30.00   3rd Qu.:2200   3rd Qu.:103.5  
##  Max.   :40.00   Max.   :3200   Max.   :130.0
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.6.2
## Loading required package: lattice
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.6.2
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
#We check our dataset for the correlations between the predictors/IVs and the outcome/DV.

#OPTION 1
#let's first check the relationship between sugar intake and water intake
cor(data1$s, data1$w, use = "complete.obs", method = "pearson")
## [1] 0.3053748
cor.test(data1$s, data1$w, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data1$s and data1$w
## t = 1.3606, df = 18, p-value = 0.1904
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1585750  0.6588606
## sample estimates:
##       cor 
## 0.3053748
# results show r = 0.31, a medium correlation, but not significant (p-value = 0.1904)

#let's now check the relationship between water intake and weight gain
cor(data1$w, data1$wg, use = "complete.obs", method = "pearson")
## [1] 0.6637163
cor.test(data1$w, data1$wg, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data1$w and data1$wg
## t = 3.7647, df = 18, p-value = 0.001419
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3131787 0.8550901
## sample estimates:
##       cor 
## 0.6637163
# results show r = 0.66, a strong correlation, and significant (p-value = 0.001419)

#OPTION 2 - rather than running separate tests, we can preferably build a data matrix and use rcorr()
DataMatrix1 <- as.matrix(data1[, c("s", "w", "wg")])
rcorr(DataMatrix1)
##       s    w   wg
## s  1.00 0.31 0.70
## w  0.31 1.00 0.66
## wg 0.70 0.66 1.00
## 
## n= 20 
## 
## 
## P
##    s      w      wg    
## s         0.1904 0.0005
## w  0.1904        0.0014
## wg 0.0005 0.0014
#Results show a medium correlation b/t sugar and water and STRONG correlations b/t each predictor and weight gain
#So we can use sugar and water intake as IVs and weight gain as the DV in our multiple regression model
# 1. The type of variables - the data are continuous
# 2. Non-zero variance:
plot(density(data1$wg))

plot(density(data1$w))

plot(density(data1$s))

#Graphs demonstrate there is variability in each variable (non-zero variance)
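#A quick numeric companion to the density plots (a sketch): the variances are clearly non-zero
sapply(data1[, c("s", "w", "wg")], var)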
#Multicollinearity - after we finish OUR MODEL, make sure there are no multicollinearity issues; sugar and water are not significantly correlated.
#VIF
#vif(model3)

#Predictors are not correlated with extraneous variables - no third-variable issue

#Heteroscedasticity:
library(car)
## Warning: package 'car' was built under R version 3.6.2
## Loading required package: carData
scatterplot(data1$s, data1$wg)

scatterplot(data1$w, data1$wg)
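#Beyond eyeballing the scatterplots, car also offers ncvTest(), a score test for
#non-constant error variance; like vif, it needs the fitted model, so it would be
#run after building model3 below (a sketch, not run here):
#ncvTest(model3)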

#Independent errors - randomly selected sample so we are good here

#Normally distributed errors: we also perform this step after we finish OUR MODEL
library(moments)
#shapiro.test(model$residuals)

#Independence - randomly selected sample so we are good here

#Linearity: 
scatterplot(data1$s, data1$wg)

scatterplot(data1$w, data1$wg)

After checking all the assumptions, we move on to creating the models.

# Now we create the model:
model1 <- lm(wg ~ s, data1) # IV: sugar, DV: weight
summary(model1)
## 
## Call:
## lm(formula = wg ~ s, data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.167 -14.556  -1.119  15.887  37.909 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  44.1869    11.3395   3.897 0.001057 ** 
## s             1.9960     0.4762   4.191 0.000549 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.64 on 18 degrees of freedom
## Multiple R-squared:  0.4939, Adjusted R-squared:  0.4658 
## F-statistic: 17.57 on 1 and 18 DF,  p-value: 0.0005488
#F-statistic: 17.57 on 1 and 18 DF, p-value: 0.0005488 - overall significant model. Sugar (the slope) is significant and the intercept is also significant, so this CAN be used as a good regression model
# Multiple R-squared:  0.4939 - explains about 50% of variance
AIC(model1) #Akaike Information Criterion
## [1] 181.7333
BIC(model1) #Bayesian Information Criterion
## [1] 184.7205
model2 <- lm(wg ~ w, data1) #IV: water, DV: weight
summary(model2)
## 
## Call:
## lm(formula = wg ~ w, data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.376  -9.684  -0.395  12.001  38.568 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 32.808755  15.341490   2.139  0.04643 * 
## w            0.030567   0.008119   3.765  0.00142 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.7 on 18 degrees of freedom
## Multiple R-squared:  0.4405, Adjusted R-squared:  0.4094 
## F-statistic: 14.17 on 1 and 18 DF,  p-value: 0.001419
# F(1,18) = 14.17, p = .001, R2 = .44
AIC(model2) #Akaike Information Criterion
## [1] 183.7389
BIC(model2) #Bayesian Information Criterion
## [1] 186.7261
model3 <- lm(wg ~ s + w, data1) #IV: sugar and water, DV: weight
summary(model3)
## 
## Call:
## lm(formula = wg ~ s + w, data = data1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.2851 -11.8333  -0.6146  10.2072  29.0703 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.642588  12.286212   1.029 0.317896    
## s            1.566433   0.385277   4.066 0.000804 ***
## w            0.022810   0.006247   3.651 0.001977 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.9 on 17 degrees of freedom
## Multiple R-squared:  0.7163, Adjusted R-squared:  0.683 
## F-statistic: 21.47 on 2 and 17 DF,  p-value: 2.232e-05
# F(2,17) = 21.47, p < .001, R2  = .72 
AIC(model3) #Akaike Information Criterion
## [1] 172.1542
BIC(model3) #Bayesian Information Criterion
## [1] 176.1372
#AIC and BIC score: the smaller, the better.
# To compare BICs we can use exp((BIC larger - BIC smaller)/2) as an approximate Bayes factor in favor of the smaller-BIC model, or simply subtract the smaller BIC from the larger one. Differences of 2 - 6 give positive support, 6 - 10 strong support, and > 10 very strong support for the model with the SMALLER BIC.
# In this case model1 (BIC 184.7) minus model3 (BIC 176.1) is about 8.6 (strong support) and model2 (BIC 186.7) minus model3 is about 10.6 (very strong support), so model 3 with 2 predictors is preferred over either one-predictor model.
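#A quick sketch of that rule using the BICs computed above:
BIC(model1) - BIC(model3) #about 8.6: strong support for model3 over model1
BIC(model2) - BIC(model3) #about 10.6: very strong support for model3 over model2
exp((BIC(model1) - BIC(model3))/2) #approximate Bayes factor favoring model3, roughly 73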
#Let's check whether there is any significant interaction effect
model4 <- lm(wg ~ s*w, data1) #IVs: sugar, water, and their interaction; DV: weight
summary(model4)
## 
## Call:
## lm(formula = wg ~ s * w, data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.805 -10.720  -2.461   9.066  28.697 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -1.018e+01  2.657e+01  -0.383   0.7066  
## s            2.684e+00  1.216e+00   2.207   0.0422 *
## w            3.940e-02  1.822e-02   2.162   0.0461 *
## s:w         -7.659e-04  7.902e-04  -0.969   0.3468  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.93 on 16 degrees of freedom
## Multiple R-squared:  0.7321, Adjusted R-squared:  0.6818 
## F-statistic: 14.57 on 3 and 16 DF,  p-value: 7.737e-05
# F(3,16) = 14.57, p < .001, R2 = .73 - no significant interaction effect b/w sugar and water in terms of how they affect weight gain
AIC(model4) #Akaike Information Criterion
## [1] 173.013
BIC(model4) #Bayesian Information Criterion
## [1] 177.9917
# model4's AIC and BIC are no better than model3's
#Compare nested models with anova() - Analysis of Variance Table (an F test for nested lm models)
anova(model1, model2) #model1 and model2 are not nested, so anova() cannot produce an F test here
## Analysis of Variance Table
## 
## Model 1: wg ~ s
## Model 2: wg ~ w
##   Res.Df    RSS Df Sum of Sq F Pr(>F)
## 1     18 7665.8                      
## 2     18 8474.3  0   -808.57
anova(model1, model3) #significant difference b/w model1 and 3 [Pr(>F) or p-value = 0.001977]
## Analysis of Variance Table
## 
## Model 1: wg ~ s
## Model 2: wg ~ s + w
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)   
## 1     18 7665.8                                
## 2     17 4296.5  1    3369.2 13.331 0.001977 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since we have decided that model 3 is better, we can then generate our regression model equation - wg = 12.64 + 1.57(sugar) + 0.023(water)
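
The same equation can be pulled from the fitted model directly; a minimal sketch (the s = 20, w = 2000 values are made up for illustration):

coef(model3) #intercept and slopes as fitted above
predict(model3, newdata = data.frame(s = 20, w = 2000)) #predicted weight gain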

library(QuantPsyc)
## Warning: package 'QuantPsyc' was built under R version 3.6.3
## Loading required package: boot
## 
## Attaching package: 'boot'
## The following object is masked from 'package:car':
## 
##     logit
## The following object is masked from 'package:survival':
## 
##     aml
## The following object is masked from 'package:lattice':
## 
##     melanoma
## Loading required package: MASS
## 
## Attaching package: 'QuantPsyc'
## The following object is masked from 'package:base':
## 
##     norm
lm.beta(model3) #for reference only
##         s         w 
## 0.5515316 0.4952925
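
As a cross-check on lm.beta, the same standardized slopes can be reproduced by refitting the model on z-scored variables (a sketch):

coef(lm(scale(wg) ~ scale(s) + scale(w), data = data1))[-1] #matches lm.beta(model3)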

Now that we have decided on model 3, we still need to check two more assumptions - multicollinearity and normally distributed errors.

#Multicollinearity - now that OUR MODEL is fitted, make sure there are no multicollinearity issues; sugar and water are not significantly correlated.
#VIF
vif(model3)
##        s        w 
## 1.102844 1.102844
#Results with values < 3 suggest that none of the predictors violate the assumption of multicollinearity

#Normally distributed errors: we also perform this step after we finish OUR MODEL
library(moments)
shapiro.test(model3$residuals) # p-value = 0.7337 fails to reject the null hypothesis of normality, so we are good to continue
## 
##  Shapiro-Wilk normality test
## 
## data:  model3$residuals
## W = 0.969, p-value = 0.7337
plot(density(model3$residuals)) # the density plot shows a roughly normal distribution of the errors/residuals
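
#Optional convenience sketch: base R's four standard lm diagnostic plots in one panel
par(mfrow = c(2, 2))
plot(model3)
par(mfrow = c(1, 1))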

#Influential observations - Cook's distance
cutoff <- 4/(nrow(data1) - length(model1$coefficients) - 1) #rule-of-thumb cutoff
plot(model1, which = 4, cook.levels = cutoff)

outlierTest(model1) # no Bonferroni p < .05 (smallest unadjusted p = 0.056331), so no significant outliers
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##    rstudent unadjusted p-value Bonferroni p
## 11 2.047996           0.056331           NA

Summary Write Up 1 -

A linear regression model was conducted to predict participants’ weight gain based on their sugar and water intake. All the regression assumptions were met, and no further adjustments were made. A significant regression equation was found (F(2,17) = 21.47, p < .001), with an R-squared of .72, which suggests that 72% of the variance in participants’ weight gain can be explained by the two predictors. Both sugar (t = 4.07, p < .001, b = 1.57) and water (t = 3.65, p = .002, b = .02) were statistically significant predictors. The results suggest that for each additional unit of sugar intake there is a 1.57-gram increase in participants’ weight, and for each additional unit of water intake a 0.02-gram increase.
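
If confidence intervals are wanted alongside a write-up like this, they come straight from the fitted model (a sketch using model3):

confint(model3) #95% confidence intervals for the intercept and both slopes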

Example 2 -

Our goal is to determine whether the number of times the ad is watched and age can significantly predict rating.

library(readxl)
## Warning: package 'readxl' was built under R version 3.6.3
setwd("E:/mikhilesh/HU Sem VI ANLY 510 and 506/ANLY 510 Kao Principals and Applications/Lecture and other materials")
data2 <- read_xlsx("Lecture 8 MultipleRegressionExample.xlsx")
names(data2)
## [1] "Ad"     "Age"    "Sex"    "Rating"
str(data2)
## Classes 'tbl_df', 'tbl' and 'data.frame':    504 obs. of  4 variables:
##  $ Ad    : num  0 0 1 1 2 2 0 0 1 1 ...
##  $ Age   : num  55 56 56 54 36 36 75 42 61 41 ...
##  $ Sex   : chr  "male" "female" "male" "female" ...
##  $ Rating: num  4.5 19.4 36.4 49.6 75.6 81.6 5.5 28.2 35.9 60.1 ...
summary(data2)
##        Ad         Age            Sex                Rating      
##  Min.   :0   Min.   :18.00   Length:504         Min.   :  0.00  
##  1st Qu.:0   1st Qu.:33.75   Class :character   1st Qu.: 23.88  
##  Median :1   Median :45.00   Mode  :character   Median : 43.70  
##  Mean   :1   Mean   :45.96                      Mean   : 46.00  
##  3rd Qu.:2   3rd Qu.:60.00                      3rd Qu.: 68.17  
##  Max.   :2   Max.   :75.00                      Max.   :103.60
library(Hmisc)
#We check our dataset for the correlations between the predictors/IVs and the outcome/DV.

#OPTION 1
#let's first check the relationship between Ad and Rating
cor(data2$Rating, data2$Ad, use = "complete.obs", method = "pearson")
## [1] 0.9068833
cor.test(data2$Rating, data2$Ad, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data2$Rating and data2$Ad
## t = 48.22, df = 502, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8900406 0.9212536
## sample estimates:
##       cor 
## 0.9068833
# results show r = 0.91, a strong correlation, and significant (p-value < .05)

#let's now check the relationship between Age and Rating
cor(data2$Rating, data2$Age, use = "complete.obs", method = "pearson")
## [1] -0.1761425
cor.test(data2$Rating, data2$Age, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data2$Rating and data2$Age
## t = -4.0092, df = 502, p-value = 7.017e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.25949194 -0.09018839
## sample estimates:
##        cor 
## -0.1761425
# results show r = -0.18, a weak correlation, but still significant (p-value < .05)

#OPTION 2 - rather than running separate tests, we can preferably build a data matrix and use rcorr()
DataMatrix2 <- as.matrix(data2[, c("Ad", "Age", "Rating")])
rcorr(DataMatrix2)
##           Ad   Age Rating
## Ad      1.00 -0.04   0.91
## Age    -0.04  1.00  -0.18
## Rating  0.91 -0.18   1.00
## 
## n= 504 
## 
## 
## P
##        Ad     Age    Rating
## Ad            0.3561 0.0000
## Age    0.3561        0.0000
## Rating 0.0000 0.0000
#Results show a STRONG correlation b/t Ad and Rating and a weak but significant correlation b/t Age and Rating
#So we can use Ad and Age as IVs and Rating as the DV in our multiple regression model
# 1. The type of variables - the data are continuous
# 2. Non-zero variance:
plot(density(data2$Rating))

plot(density(data2$Ad))

plot(density(data2$Age))

#Graphs demonstrate there is variability in each variable (non-zero variance)
#Multicollinearity - after we finish OUR MODEL, make sure there are no multicollinearity issues; Ad and Age are not significantly correlated.
#VIF
#vif(model7)

#Predictors are not correlated with extraneous variables - no third-variable issue

#Heteroscedasticity:
library(car)
scatterplot(data2$Ad, data2$Rating)

scatterplot(data2$Age, data2$Rating)

#Independent errors - randomly selected sample so we are good here

#Normally distributed errors: we also perform this step after we finish OUR MODEL
library(moments)
#shapiro.test(model$residuals)

#Independence - randomly selected sample so we are good here

#Linearity: 
scatterplot(data2$Ad, data2$Rating)

scatterplot(data2$Age, data2$Rating)

After checking all the assumptions, we move on to creating the models.

# Now we create the model:
model5 <- lm(Rating ~ Ad, data2) # IV: Ad, DV: Rating
summary(model5)
## 
## Call:
## lm(formula = Rating ~ Ad, data = data2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.013  -8.322  -0.951   7.921  28.087 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  16.4878     0.7901   20.87   <2e-16 ***
## Ad           29.5128     0.6120   48.22   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.22 on 502 degrees of freedom
## Multiple R-squared:  0.8224, Adjusted R-squared:  0.8221 
## F-statistic:  2325 on 1 and 502 DF,  p-value: < 2.2e-16
#F-statistic: 2325 on 1 and 502 DF, p-value: < 2.2e-16 - overall significant model. Ad (the slope) is significant and the intercept is also significant, so this CAN be used as a good regression model
# Multiple R-squared:  0.8224 - explains > 80% of variance
AIC(model5) #Akaike Information Criterion
## [1] 3871.232
BIC(model5) #Bayesian Information Criterion
## [1] 3883.9
model6 <- lm(Rating ~ Age, data2) #IV: Age, DV: Rating
summary(model6)
## 
## Call:
## lm(formula = Rating ~ Age, data = data2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.297 -23.285  -1.919  22.568  54.715 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 59.30913    3.51877  16.855  < 2e-16 ***
## Age         -0.28957    0.07222  -4.009 7.02e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.21 on 502 degrees of freedom
## Multiple R-squared:  0.03103,    Adjusted R-squared:  0.0291 
## F-statistic: 16.07 on 1 and 502 DF,  p-value: 7.017e-05
# F(1,502) = 16.07, p < .001, R2 = .03
AIC(model6) #Akaike Information Criterion
## [1] 4726.477
BIC(model6) #Bayesian Information Criterion
## [1] 4739.145
model7 <- lm(Rating ~ Ad + Age, data2) #IV: Ad + Age, DV: Rating
summary(model7)
## 
## Call:
## lm(formula = Rating ~ Ad + Age, data = data2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.046  -7.393  -1.536   7.728  25.997 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27.17791    1.55844  17.439  < 2e-16 ***
## Ad          29.32643    0.57890  50.659  < 2e-16 ***
## Age         -0.22854    0.02924  -7.815 3.26e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.6 on 501 degrees of freedom
## Multiple R-squared:  0.8417, Adjusted R-squared:  0.8411 
## F-statistic:  1332 on 2 and 501 DF,  p-value: < 2.2e-16
# F(2,501) = 1332, p < .001, R2 = .84
AIC(model7) #Akaike Information Criterion
## [1] 3815.258
BIC(model7) #Bayesian Information Criterion
## [1] 3832.148
#AIC and BIC score: the smaller, the better.
# To compare BICs we can use exp((BIC larger - BIC smaller)/2) as an approximate Bayes factor in favor of the smaller-BIC model, or simply subtract the smaller BIC from the larger one. Differences of 2 - 6 give positive support, 6 - 10 strong support, and > 10 very strong support for the model with the SMALLER BIC.
# In this case model5 (BIC 3883.9) minus model7 (BIC 3832.1) is about 51.8 and model6 (BIC 4739.1) minus model7 is about 907 - both well over 10, very strong support for model 7 with 2 predictors over either one-predictor model.
#Let's check whether there is any significant interaction effect
model8 <- lm(Rating ~ Ad*Age, data2) #IVs: Ad, Age, and their interaction; DV: Rating
summary(model8)
## 
## Call:
## lm(formula = Rating ~ Ad * Age, data = data2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.043  -7.471  -1.572   7.612  26.061 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27.455351   2.221037  12.362  < 2e-16 ***
## Ad          29.032870   1.770343  16.400  < 2e-16 ***
## Age         -0.234501   0.044843  -5.229  2.5e-07 ***
## Ad:Age       0.006388   0.036399   0.175    0.861    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.61 on 500 degrees of freedom
## Multiple R-squared:  0.8417, Adjusted R-squared:  0.8408 
## F-statistic: 886.5 on 3 and 500 DF,  p-value: < 2.2e-16
# F(3,500) = 886.5 , p < .001, R2  = 0.8417 - no significant interaction effect b/w Ad and Age in terms of how they affect Ratings
AIC(model8) #Akaike Information Criterion
## [1] 3817.227
BIC(model8) #Bayesian Information Criterion
## [1] 3838.34
# model8's AIC and BIC are no better than model7's
#Compare nested models with anova() - Analysis of Variance Table (an F test for nested lm models)
anova(model5, model6) #model5 and model6 are not nested, so anova() cannot produce an F test here
## Analysis of Variance Table
## 
## Model 1: Rating ~ Ad
## Model 2: Rating ~ Age
##   Res.Df    RSS Df Sum of Sq F Pr(>F)
## 1    502  63184                      
## 2    502 344802  0   -281617
anova(model5, model7) #significant difference b/w model5 and 7 [Pr(>F) or p-value < 0.05]
## Analysis of Variance Table
## 
## Model 1: Rating ~ Ad
## Model 2: Rating ~ Ad + Age
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
## 1    502 63184                                  
## 2    501 56319  1    6865.5 61.075 3.258e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
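
#The interaction can also be tested directly as a nested-model comparison; a sketch
#(output not shown) - expect a non-significant F, matching the Ad:Age t test above:
anova(model7, model8)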

Since we have decided that model 7 is better, we can then generate our regression model equation - Rating = 27.18 + 29.33(Ad) - 0.23(Age)

library(QuantPsyc)
lm.beta(model7) #for reference only
##         Ad        Age 
##  0.9011564 -0.1390201

Now that we have decided on model 7, we still need to check two more assumptions - multicollinearity and normally distributed errors.

#Multicollinearity - now that OUR MODEL is fitted, make sure there are no multicollinearity issues; Ad and Age are not significantly correlated.
#VIF
vif(model7)
##     Ad    Age 
## 1.0017 1.0017
#Results with values < 3 suggest that none of the predictors violate the assumption of multicollinearity

#Normally distributed error: We perform this step also After we finish OUR MODEL
library(moments)
shapiro.test(model7$residuals) # p-value < .05 rejects the null hypothesis of normality - the residuals deviate from normal, so this assumption is violated and should be noted
## 
##  Shapiro-Wilk normality test
## 
## data:  model7$residuals
## W = 0.95248, p-value = 1.176e-11
plot(density(model7$residuals)) # the density plot gives a visual sense of how far the residuals depart from normality
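
#A Q-Q plot (a sketch) makes the departure from normality easier to judge than the density plot:
qqnorm(model7$residuals)
qqline(model7$residuals)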

cutoff <- 4/(nrow(data2) - length(model5$coefficients) - 1) #rule-of-thumb cutoff
plot(model5, which = 4, cook.levels = cutoff)

outlierTest(model5) # both Bonferroni p-values < .05 reject the null hypothesis - observations 503 and 504 are significant outliers
##      rstudent unadjusted p-value Bonferroni p
## 503 -5.811569         1.1022e-08   5.5549e-06
## 504 -5.576195         4.0252e-08   2.0287e-05
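
#Sketch of a sensitivity check (model7_trim is a hypothetical name): refit the chosen
#model without the two flagged rows and see whether the coefficients move much
model7_trim <- lm(Rating ~ Ad + Age, data2[-c(503, 504), ])
summary(model7_trim)$coefficients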

Summary Write Up 2 -