# Import the data
wine<-read.csv("https://raw.githubusercontent.com/kitadasmalley/sp21_MATH376LMT/main/data/wine.csv", header=TRUE)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.6 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
wine%>%
select(Quality, Aroma, Body, Flavor, Oakiness, Region, Clarity)%>%
pairs()
There appears to be a fairly strong, positive, linear relationship between Quality and Aroma, Body, and Flavor, and a weaker positive, linear relationship with Oakiness. Quality doesn’t see to have a linear relationship with Region or Clarity.
mod<-lm(wine$Quality~wine$Clarity + wine$Aroma + wine$Body + wine$Flavor + wine$Oakiness)
mod
##
## Call:
## lm(formula = wine$Quality ~ wine$Clarity + wine$Aroma + wine$Body +
## wine$Flavor + wine$Oakiness)
##
## Coefficients:
## (Intercept) wine$Clarity wine$Aroma wine$Body wine$Flavor
## 3.9969 2.3395 0.4826 0.2732 1.1683
## wine$Oakiness
## -0.6840
Our multiple linear regression model is \(y=3.9969 +2.3395x_1 +0.4826 x_2+ 0.2732 x_3 + 1.1683 x_4 -0.6840 x_5,\) where \(x_1,x_2,x_3,x_4,x_5\) are the slopes of clarity, aroma, body, flavor, and oakiness (respectivly).
#ANOVA Table
#Start off by setting up our response variable
y<-wine$Quality
#Make our Design Matrix
xMat<-matrix(c(rep(1, dim(wine)[1]), wine$Clarity, wine$Aroma, wine$Body, wine$Flavor, wine$Oakiness), nrow=dim(wine)[1])
#Hat Matrix
hMat<-xMat%*%solve(t(xMat)%*%xMat)%*%t(xMat)
#Identity Matrix
I<-diag(dim(wine)[1])
#Define J
J<-matrix( rep( 1, len=length(y)*length(y)), nrow = length(y))
n<-length(y)
p<-6
#Finding the Sum of Squares
ssReg<-t(y) %*% (hMat-((1/n)*J))%*% y
ssReg
## [,1]
## [1,] 111.5404
ssRes<- t(y) %*% (I-hMat) %*% y
ssRes
## [,1]
## [1,] 43.24801
ssTot<-t(y) %*% (I-(1/n)*J)%*% y
ssTot
## [,1]
## [1,] 154.7884
#Next, our mean squares
ms_res<-sum(mod$residuals^2)/(n-p-1)
ms_res
## [1] 1.395097
ms_reg <- ssReg/p
ms_reg
## [,1]
## [1,] 18.59007
#double check
anova(mod)
Thus, we have constructed our ANOVA Table Summary: Source) Degrees of freedom, sum of squares, mean squares, F value, P value
Regression) df=6, SS=111.5404, MS=18.59007, F val= 13.32529 (solved below), P val= 2.050148e-07 (solved below) Residual) df=31, SS=43.24801, MS=1.395097, x, x Total) df=37, SS=154.7884, x, x, x
#F-Statistic
f_stat<- ms_reg/ms_res
f_stat
## [,1]
## [1,] 13.32529
#P-Value
pf(F, p, n-p-1, lower.tail = FALSE)
## [1] 1
We have an F-statistic of \(13.32529\) which follows an F distribution with degrees of freedom df1=6, and df2=31. We have a P-Value of \(2.050148e-07.\)
We can reject the null hypothesis with a p-value of 2.050148e-07 at the 0.01 significance level. There is convincing evidence to suggest that there is a significant relationship between the quality of a wine and its clarity, aroma, body, flavor, and oakiness.
#Forward Variable Selection
#STEP 1
n=dim(wine)[1]
alpha=0.15
# Look at models with one variable
p=1
this.df=n-p-1
# Find the F for our first cut-off
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE) #Our value is 2.163609
## [1] 2.163609
## A: x1 Clarity
mod1a<-lm(y~Clarity, data=wine)
anova(mod1a)#F-value: 0.0291 oof
## B: x2 Aroma
mod1b<-lm(y~Aroma, data=wine)
anova(mod1b)#F-value: 36.044
## C: x3 Body
mod1c<-lm(y~Body, data=wine)
anova(mod1c)#F-value: 15.508
## D: x4 Flavor
mod1d<-lm(y~Flavor, data=wine)
anova(mod1d)#F-value: 59.789 (BIGGEST)
## E: x4 Oakiness
mod1e<-lm(y~Oakiness, data=wine)
anova(mod1e)#F-value: 0.0798
# STEP 2: NOW LOOK AT TWO VARIABLE MODELS
p=2
this.df=n-p-1
# F_in
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE) #2.166308
## [1] 2.166308
## A: x1 Clarity
mod2a<-lm(y~Flavor +Clarity, data=wine)
anova(mod2a)#F-value: 0.8812 uh oh
## B: x2 Aroma
mod2b<-lm(y~Flavor + Aroma, data=wine)
anova(mod2b)#F-value: 3.5238 wahoo! better than clarity
## C: x3 Body
mod2c<-lm(y~Flavor + Body, data=wine)
anova(mod2c)#F-value: 0.2303
## E: x5 Oakiness
mod2e<-lm(y~Flavor + Oakiness, data=wine)
anova(mod2e)#F-value: 3.8148 (BIGGEST)
# NOW LOOK AT THREE VARIABLE MODELS
p=3
this.df=n-p-1
# F_in
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE) #2.169171
## [1] 2.169171
## A: x1 Clarity
mod3a<-lm(y~Flavor +Oakiness + Clarity, data=wine)
anova(mod3a)#F-value: 2.0199
## B: x2 Aroma
mod3b<-lm(y~Flavor + Oakiness + Aroma, data=wine)
anova(mod3b)#F-value: 4.8958 (BIGGEST)
## C: x3 Body
mod3c<-lm(y~Flavor + Oakiness + Body, data=wine)
anova(mod3c)#F-value: 0.3507
#Step 4: Four variable models
p=4
this.df=n-p-1
# F_in
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE) #2.172213
## [1] 2.172213
## A: x1 Clarity
mod4a<-lm(y~Flavor+ Oakiness+ Aroma +Clarity, data=wine)
anova(mod4a)#F-value: 1.2656
## C: x3 Body
mod4a<-lm(y~Flavor+ Oakiness+ Aroma + Body, data=wine)
anova(mod4a)#F-value: 0.1066
#neither of these values are larger than 2.172213 so we can conclude they are not significant
Our final model includes Flavor, Oakiness, and Aroma.
#Final Model
mod<-lm(y~ wine$Flavor+wine$Oakiness +wine$Aroma)
summary(mod)
##
## Call:
## lm(formula = y ~ wine$Flavor + wine$Oakiness + wine$Aroma)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5707 -0.6256 0.1521 0.6467 1.7741
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.4672 1.3328 4.852 2.67e-05 ***
## wine$Flavor 1.1997 0.2749 4.364 0.000113 ***
## wine$Oakiness -0.6023 0.2644 -2.278 0.029127 *
## wine$Aroma 0.5801 0.2622 2.213 0.033740 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.161 on 34 degrees of freedom
## Multiple R-squared: 0.7038, Adjusted R-squared: 0.6776
## F-statistic: 26.92 on 3 and 34 DF, p-value: 4.203e-09
#95% Confidence intervals
n=dim(wine)[1]
p=3
#Flavor
#Recall from above, this is our point estimate
beta_1<-mod$coefficients[2]
beta_1
## wine$Flavor
## 1.199693
#We first find the critical value for 95% confidence from a t-distribution with n−2 degrees of freedom
qt(0.975, df=n-4)
## [1] 2.032245
#Now we find the standard error
se_b1<-sqrt(ms_res/sum((wine$Quality-mean(wine$Quality))^2))
se_b1
## [1] 0.09493644
#We finally find our confidence interval
beta_1+c(-1,1)*qt(0.975, df=n-4)*se_b1
## [1] 1.006759 1.392627
#Oakiness
# point estimate
beta_2<-mod$coefficients[3]
beta_2
## wine$Oakiness
## -0.6023247
#critical value
qt(0.975, df=n-4)
## [1] 2.032245
#standard error
se_b2<-sqrt(ms_res/sum((wine$Quality-mean(wine$Quality))^2))
se_b2
## [1] 0.09493644
#confidence interval
beta_2+c(-1,1)*qt(0.975, df=n-4)*se_b2
## [1] -0.7952587 -0.4093906
#Aroma
# point estimate
beta_3<-mod$coefficients[4]
beta_3
## wine$Aroma
## 0.5801203
#critical value
qt(0.975, df=n-4)
## [1] 2.032245
#standard error
se_b3<-sqrt(ms_res/sum((wine$Quality-mean(wine$Quality))^2))
se_b3
## [1] 0.09493644
#confidence interval
beta_3+c(-1,1)*qt(0.975, df=n-4)*se_b3
## [1] 0.3871862 0.7730543
#Double check with this guy
confint(mod)
## 2.5 % 97.5 %
## (Intercept) 3.75864235 9.1757473
## wine$Flavor 0.64106744 1.7583182
## wine$Oakiness -1.13965261 -0.0649967
## wine$Aroma 0.04729651 1.1129440
#hmmm something is off but I can't figure out what??
Our final model is \(y=6.4672+ 1.1997 x_4 -0.6023 x_5 + 0.5801x_2.\)
I would tell wine makers to spend less effort importing wine from different regions since it seems to have no effect on the quality. From this data it seems like the most important elements to consider when trying to create a quality wine is the flavor, the oakiness, and the aroma. These are the elements that have the most correlation with the quality of the wine.