Final LMT

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(dplyr)
library(broom)
wine<-read.csv("https://raw.githubusercontent.com/kitadasmalley/sp21_MATH376LMT/main/data/wine.csv", header=TRUE)

a) Create a pairs plot. Does it appear that there are linear relationships between the response and explanatory variables?

There appears to be some linear relationship between the response and explanatory variables.

b) Fit a multiple linear regression model relating wine quality to the regressors of interest (clarity, aroma, body, flavor, and oakiness). i) You can use the lm() function

mod = lm(quality ~ clarity+ aroma + body + flavor+oakiness)

c) Perform a test the significance of regression (ie. a global F-test).

anova(mod)
## Analysis of Variance Table
## 
## Response: quality
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## clarity    1  0.125   0.125  0.0926 0.7628120    
## aroma      1 77.353  77.353 57.2351 1.286e-08 ***
## body       1  6.414   6.414  4.7461 0.0368417 *  
## flavor     1 19.050  19.050 14.0953 0.0006946 ***
## oakiness   1  8.598   8.598  6.3616 0.0168327 *  
## Residuals 32 43.248   1.352                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Response vect
Y = quality
n = length(quality)
p=5


# Design mat
xMat<-matrix(c(rep(1, length(quality)), 
          clarity, aroma, body, flavor, oakiness), 
          nrow=length(quality))
# Hat 
hMat<-xMat%*%solve(t(xMat)%*%xMat)%*%t(xMat)

#SS(Res)
I<-diag(length(quality))
ss_res<-t(Y)%*%(I-hMat)%*%Y
ss_res
##          [,1]
## [1,] 43.24801
#SS(Tot)
ones<-matrix(rep(1, length(quality)),
             nrow=length(quality))

J<-ones%*%t(ones)
ss_tot<-t(Y)%*%(I-(1/n)*J)%*%Y
ss_tot
##          [,1]
## [1,] 154.7884
### SS(Reg)

ss_reg<-t(Y)%*%(hMat-(1/n)*J)%*%Y
ss_reg
##          [,1]
## [1,] 111.5404
### MS(Res)
ms_res<-ss_res/(n-p-1)
ms_res
##        [,1]
## [1,] 1.3515
### MS(Reg)
ms_reg<-ss_reg/p
ms_reg
##          [,1]
## [1,] 22.30808
### F-stat
f_stat<-ms_reg/ms_res
f_stat
##          [,1]
## [1,] 16.50616
### P-value
pf(f_stat, df1=p, df2=(n-p-1), lower.tail = FALSE)
##             [,1]
## [1,] 4.70304e-08

We reject the null with a p-value of 4.7e-08 and an alpha level of 0.05. Therefore, there is sufficient evidence that the variables of interest are related to the quality of wine.

d) Perform the forward variable selection algorithm to find a final model.

n=dim(wine)[1]

alpha=0.05


p=1
this.df=n-p-1

#F-Cut off
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 4.113165
#Single Variables
#Consider clarity : F-value = 0.03
mod1a<-lm(quality~clarity, data=wine)
anova(mod1a)
## Analysis of Variance Table
## 
## Response: quality
##           Df  Sum Sq Mean Sq F value Pr(>F)
## clarity    1   0.125  0.1252  0.0291 0.8654
## Residuals 36 154.663  4.2962
#Consider body : F-value = 15.508
mod1b<-lm(quality~body, data=wine)
anova(mod1b)
## Analysis of Variance Table
## 
## Response: quality
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## body       1  46.603  46.603  15.508 0.0003612 ***
## Residuals 36 108.186   3.005                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider aroma : F-value = 36.044
mod1c<-lm(quality~aroma, data=wine)
anova(mod1c)
## Analysis of Variance Table
## 
## Response: quality
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## aroma      1 77.442  77.442  36.044 6.871e-07 ***
## Residuals 36 77.347   2.149                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider flavor : F-value = 59.789
mod1d<-lm(quality~flavor, data=wine)
anova(mod1d)
## Analysis of Variance Table
## 
## Response: quality
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## flavor     1 96.615  96.615  59.789 3.683e-09 ***
## Residuals 36 58.173   1.616                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider oakiness : F-value = 0.0798
mod1f<-lm(quality~oakiness, data=wine)
anova(mod1f)
## Analysis of Variance Table
## 
## Response: quality
##           Df  Sum Sq Mean Sq F value Pr(>F)
## oakiness   1   0.343  0.3425  0.0798 0.7791
## Residuals 36 154.446  4.2902
#Flavor enters the model
#Two variables
p=2
this.df=n-p-1

# F_in 
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 4.121338
#Consider body : F-value = 0.2303
mod2a<-lm(quality~flavor+body, data=wine)
anova(mod2a)
## Analysis of Variance Table
## 
## Response: quality
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## flavor     1 96.615  96.615 58.5109 5.662e-09 ***
## body       1  0.380   0.380  0.2303    0.6343    
## Residuals 35 57.793   1.651                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider clarity : F-value = 0.8812
mod2b<-lm(quality~flavor+clarity, data=wine)
anova(mod2b)
## Analysis of Variance Table
## 
## Response: quality
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## flavor     1 96.615  96.615 59.5918 4.616e-09 ***
## clarity    1  1.429   1.429  0.8812    0.3543    
## Residuals 35 56.745   1.621                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider aroma : F-value = 3.5238
mod2c<-lm(quality~flavor+aroma, data=wine)
anova(mod2c)
## Analysis of Variance Table
## 
## Response: quality
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## flavor     1 96.615  96.615 63.9807 2.063e-09 ***
## aroma      1  5.321   5.321  3.5238   0.06885 .  
## Residuals 35 52.852   1.510                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider oakiness : F-value = 3.8148
mod2d<-lm(quality~flavor+oakiness, data=wine)
anova(mod2d)
## Analysis of Variance Table
## 
## Response: quality
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## flavor     1 96.615  96.615 64.4640 1.892e-09 ***
## oakiness   1  5.717   5.717  3.8148   0.05883 .  
## Residuals 35 52.456   1.499                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Oakiness enters the model
p=3
this.df=n-p-1

# F_in 
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 4.130018
#Consider body : F-value = 0.3507
mod3a<-lm(quality~flavor+oakiness+body, data=wine)
anova(mod3a)
## Analysis of Variance Table
## 
## Response: quality
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## flavor     1 96.615  96.615 63.2682 2.882e-09 ***
## oakiness   1  5.717   5.717  3.7441   0.06135 .  
## body       1  0.536   0.536  0.3507   0.55762    
## Residuals 34 51.920   1.527                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider aroma : F-value = 4.8958
mod3b<-lm(quality~flavor+oakiness+aroma, data=wine)
anova(mod3b)
## Analysis of Variance Table
## 
## Response: quality
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## flavor     1 96.615  96.615 71.6394 6.948e-10 ***
## oakiness   1  5.717   5.717  4.2394   0.04722 *  
## aroma      1  6.603   6.603  4.8958   0.03374 *  
## Residuals 34 45.853   1.349                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider clarity : F-value = 2.0199
mod3c<-lm(quality~flavor+oakiness+clarity, data=wine)
anova(mod3c)
## Analysis of Variance Table
## 
## Response: quality
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## flavor     1 96.615  96.615 66.3425 1.686e-09 ***
## oakiness   1  5.717   5.717  3.9260   0.05568 .  
## clarity    1  2.942   2.942  2.0199   0.16435    
## Residuals 34 49.514   1.456                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Aroma enters the model
p=4
this.df=n-p-1

# F_in 
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 4.139252
#Consider body : F-value = 0.1066
mod4a<-lm(quality~flavor+oakiness+aroma+body, data=wine)
anova(mod4a)
## Analysis of Variance Table
## 
## Response: quality
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## flavor     1 96.615  96.615 69.7570 1.198e-09 ***
## oakiness   1  5.717   5.717  4.1281   0.05029 .  
## aroma      1  6.603   6.603  4.7671   0.03622 *  
## body       1  0.148   0.148  0.1066   0.74607    
## Residuals 33 45.706   1.385                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider clarity : F-value = 1.2656
mod4b<-lm(quality~flavor+oakiness+aroma+clarity, data=wine)
anova(mod4b)
## Analysis of Variance Table
## 
## Response: quality
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## flavor     1 96.615  96.615 72.1990 8.089e-10 ***
## oakiness   1  5.717   5.717  4.2726   0.04665 *  
## aroma      1  6.603   6.603  4.9340   0.03331 *  
## clarity    1  1.694   1.694  1.2656   0.26871    
## Residuals 33 44.160   1.338                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#End of selection

The final model for quality of wine is flavor, oakiness, and aroma.

e) Look at the summary() for your final model.

new_mod = lm(quality ~ flavor+oakiness+aroma)

summary(new_mod)
## 
## Call:
## lm(formula = quality ~ flavor + oakiness + aroma)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5707 -0.6256  0.1521  0.6467  1.7741 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.4672     1.3328   4.852 2.67e-05 ***
## flavor        1.1997     0.2749   4.364 0.000113 ***
## oakiness     -0.6023     0.2644  -2.278 0.029127 *  
## aroma         0.5801     0.2622   2.213 0.033740 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.161 on 34 degrees of freedom
## Multiple R-squared:  0.7038, Adjusted R-squared:  0.6776 
## F-statistic: 26.92 on 3 and 34 DF,  p-value: 4.203e-09
confint(new_mod, level = .95)
##                   2.5 %     97.5 %
## (Intercept)  3.75864235  9.1757473
## flavor       0.64106744  1.7583182
## oakiness    -1.13965261 -0.0649967
## aroma        0.04729651  1.1129440
#From scratch
#Flavor
x_0 =  1.2643 
ss_res<-sum((quality-new_mod$fitted)^2)
n<-length(new_mod$residuals)
ms_res<-ss_res/(n-2)  

seConf<-sqrt(ms_res*(1 / n + (x_0 - mean(flavor))^2 / (sum((flavor - mean(flavor))^2))))
seConf 
## [1] 0.6583984
#Oakiness
x_0 =  -0.6589 
ss_res<-sum((quality-new_mod$fitted)^2)
n<-length(new_mod$residuals)
ms_res<-ss_res/(n-2)  

seConf<-sqrt(ms_res*(1 / n + (x_0 - mean(flavor))^2 / (sum((flavor - mean(flavor))^2))))
seConf 
## [1] 0.9964977
#Aroma
x_0 =   0.5300
ss_res<-sum((quality-new_mod$fitted)^2)
n<-length(new_mod$residuals)
ms_res<-ss_res/(n-2)  

seConf<-sqrt(ms_res*(1 / n + (x_0 - mean(flavor))^2 / (sum((flavor - mean(flavor))^2))))
seConf 
## [1] 0.7865635