library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(dplyr)
library(broom)
wine<-read.csv("https://raw.githubusercontent.com/kitadasmalley/sp21_MATH376LMT/main/data/wine.csv", header=TRUE)
There appears to be some linear relationship between the response and explanatory variables.
mod = lm(quality ~ clarity+ aroma + body + flavor+oakiness)
anova(mod)
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## clarity 1 0.125 0.125 0.0926 0.7628120
## aroma 1 77.353 77.353 57.2351 1.286e-08 ***
## body 1 6.414 6.414 4.7461 0.0368417 *
## flavor 1 19.050 19.050 14.0953 0.0006946 ***
## oakiness 1 8.598 8.598 6.3616 0.0168327 *
## Residuals 32 43.248 1.352
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Response vect
Y = quality
n = length(quality)
p=5
# Design mat
xMat<-matrix(c(rep(1, length(quality)),
clarity, aroma, body, flavor, oakiness),
nrow=length(quality))
# Hat
hMat<-xMat%*%solve(t(xMat)%*%xMat)%*%t(xMat)
#SS(Res)
I<-diag(length(quality))
ss_res<-t(Y)%*%(I-hMat)%*%Y
ss_res
## [,1]
## [1,] 43.24801
#SS(Tot)
ones<-matrix(rep(1, length(quality)),
nrow=length(quality))
J<-ones%*%t(ones)
ss_tot<-t(Y)%*%(I-(1/n)*J)%*%Y
ss_tot
## [,1]
## [1,] 154.7884
### SS(Reg)
ss_reg<-t(Y)%*%(hMat-(1/n)*J)%*%Y
ss_reg
## [,1]
## [1,] 111.5404
### MS(Res)
ms_res<-ss_res/(n-p-1)
ms_res
## [,1]
## [1,] 1.3515
### MS(Reg)
ms_reg<-ss_reg/p
ms_reg
## [,1]
## [1,] 22.30808
### F-stat
f_stat<-ms_reg/ms_res
f_stat
## [,1]
## [1,] 16.50616
### P-value
pf(f_stat, df1=p, df2=(n-p-1), lower.tail = FALSE)
## [,1]
## [1,] 4.70304e-08
We reject the null with a p-value of 4.7e-08 and an alpha level of 0.05. Therefore, there is sufficient evidence that the variables of interest are related to the quality of wine.
n=dim(wine)[1]
alpha=0.05
p=1
this.df=n-p-1
#F-Cut off
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 4.113165
#Single Variables
#Consider clarity : F-value = 0.03
mod1a<-lm(quality~clarity, data=wine)
anova(mod1a)
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## clarity 1 0.125 0.1252 0.0291 0.8654
## Residuals 36 154.663 4.2962
#Consider body : F-value = 15.508
mod1b<-lm(quality~body, data=wine)
anova(mod1b)
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## body 1 46.603 46.603 15.508 0.0003612 ***
## Residuals 36 108.186 3.005
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider aroma : F-value = 36.044
mod1c<-lm(quality~aroma, data=wine)
anova(mod1c)
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## aroma 1 77.442 77.442 36.044 6.871e-07 ***
## Residuals 36 77.347 2.149
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider flavor : F-value = 59.789
mod1d<-lm(quality~flavor, data=wine)
anova(mod1d)
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## flavor 1 96.615 96.615 59.789 3.683e-09 ***
## Residuals 36 58.173 1.616
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider oakiness : F-value = 0.0798
mod1f<-lm(quality~oakiness, data=wine)
anova(mod1f)
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## oakiness 1 0.343 0.3425 0.0798 0.7791
## Residuals 36 154.446 4.2902
#Flavor enters the model
#Two variables
p=2
this.df=n-p-1
# F_in
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 4.121338
#Consider body : F-value = 0.2303
mod2a<-lm(quality~flavor+body, data=wine)
anova(mod2a)
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## flavor 1 96.615 96.615 58.5109 5.662e-09 ***
## body 1 0.380 0.380 0.2303 0.6343
## Residuals 35 57.793 1.651
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider clarity : F-value = 0.8812
mod2b<-lm(quality~flavor+clarity, data=wine)
anova(mod2b)
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## flavor 1 96.615 96.615 59.5918 4.616e-09 ***
## clarity 1 1.429 1.429 0.8812 0.3543
## Residuals 35 56.745 1.621
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider aroma : F-value = 3.5238
mod2c<-lm(quality~flavor+aroma, data=wine)
anova(mod2c)
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## flavor 1 96.615 96.615 63.9807 2.063e-09 ***
## aroma 1 5.321 5.321 3.5238 0.06885 .
## Residuals 35 52.852 1.510
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider oakiness : F-value = 3.8148
mod2d<-lm(quality~flavor+oakiness, data=wine)
anova(mod2d)
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## flavor 1 96.615 96.615 64.4640 1.892e-09 ***
## oakiness 1 5.717 5.717 3.8148 0.05883 .
## Residuals 35 52.456 1.499
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Oakiness enters the model
p=3
this.df=n-p-1
# F_in
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 4.130018
#Consider body : F-value = 0.3507
mod3a<-lm(quality~flavor+oakiness+body, data=wine)
anova(mod3a)
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## flavor 1 96.615 96.615 63.2682 2.882e-09 ***
## oakiness 1 5.717 5.717 3.7441 0.06135 .
## body 1 0.536 0.536 0.3507 0.55762
## Residuals 34 51.920 1.527
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider aroma : F-value = 4.8958
mod3b<-lm(quality~flavor+oakiness+aroma, data=wine)
anova(mod3b)
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## flavor 1 96.615 96.615 71.6394 6.948e-10 ***
## oakiness 1 5.717 5.717 4.2394 0.04722 *
## aroma 1 6.603 6.603 4.8958 0.03374 *
## Residuals 34 45.853 1.349
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider clarity : F-value = 2.0199
mod3c<-lm(quality~flavor+oakiness+clarity, data=wine)
anova(mod3c)
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## flavor 1 96.615 96.615 66.3425 1.686e-09 ***
## oakiness 1 5.717 5.717 3.9260 0.05568 .
## clarity 1 2.942 2.942 2.0199 0.16435
## Residuals 34 49.514 1.456
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Aroma enters the model
p=4
this.df=n-p-1
# F_in
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 4.139252
#Consider body : F-value = 0.1066
mod4a<-lm(quality~flavor+oakiness+aroma+body, data=wine)
anova(mod4a)
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## flavor 1 96.615 96.615 69.7570 1.198e-09 ***
## oakiness 1 5.717 5.717 4.1281 0.05029 .
## aroma 1 6.603 6.603 4.7671 0.03622 *
## body 1 0.148 0.148 0.1066 0.74607
## Residuals 33 45.706 1.385
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consider clarity : F-value = 1.2656
mod4b<-lm(quality~flavor+oakiness+aroma+clarity, data=wine)
anova(mod4b)
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## flavor 1 96.615 96.615 72.1990 8.089e-10 ***
## oakiness 1 5.717 5.717 4.2726 0.04665 *
## aroma 1 6.603 6.603 4.9340 0.03331 *
## clarity 1 1.694 1.694 1.2656 0.26871
## Residuals 33 44.160 1.338
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#End of selection
The final model for quality of wine is flavor, oakiness, and aroma.
new_mod = lm(quality ~ flavor+oakiness+aroma)
summary(new_mod)
##
## Call:
## lm(formula = quality ~ flavor + oakiness + aroma)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5707 -0.6256 0.1521 0.6467 1.7741
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.4672 1.3328 4.852 2.67e-05 ***
## flavor 1.1997 0.2749 4.364 0.000113 ***
## oakiness -0.6023 0.2644 -2.278 0.029127 *
## aroma 0.5801 0.2622 2.213 0.033740 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.161 on 34 degrees of freedom
## Multiple R-squared: 0.7038, Adjusted R-squared: 0.6776
## F-statistic: 26.92 on 3 and 34 DF, p-value: 4.203e-09
confint(new_mod, level = .95)
## 2.5 % 97.5 %
## (Intercept) 3.75864235 9.1757473
## flavor 0.64106744 1.7583182
## oakiness -1.13965261 -0.0649967
## aroma 0.04729651 1.1129440
#From scratch
#Flavor
x_0 = 1.2643
ss_res<-sum((quality-new_mod$fitted)^2)
n<-length(new_mod$residuals)
ms_res<-ss_res/(n-2)
seConf<-sqrt(ms_res*(1 / n + (x_0 - mean(flavor))^2 / (sum((flavor - mean(flavor))^2))))
seConf
## [1] 0.6583984
#Oakiness
x_0 = -0.6589
ss_res<-sum((quality-new_mod$fitted)^2)
n<-length(new_mod$residuals)
ms_res<-ss_res/(n-2)
seConf<-sqrt(ms_res*(1 / n + (x_0 - mean(flavor))^2 / (sum((flavor - mean(flavor))^2))))
seConf
## [1] 0.9964977
#Aroma
x_0 = 0.5300
ss_res<-sum((quality-new_mod$fitted)^2)
n<-length(new_mod$residuals)
ms_res<-ss_res/(n-2)
seConf<-sqrt(ms_res*(1 / n + (x_0 - mean(flavor))^2 / (sum((flavor - mean(flavor))^2))))
seConf
## [1] 0.7865635