# Import the data
wine<-read.csv("https://raw.githubusercontent.com/kitadasmalley/sp21_MATH376LMT/main/data/wine.csv", header=TRUE)
pairs(wine)
# Response Variable is Quality
# Explanatory Variables are everything else (except for region)
It seems that there is a linear relationship between Flavor and Quality, Body and Quality, and Aroma and Quality. Removing Region from the equation, the only two variables that donโt seem to have any linear relationship with Quality are Oakiness and Clarity.
model<-lm("Quality~Clarity+Aroma+Body+Flavor+Oakiness", data = wine)
model
##
## Call:
## lm(formula = "Quality~Clarity+Aroma+Body+Flavor+Oakiness", data = wine)
##
## Coefficients:
## (Intercept) Clarity Aroma Body Flavor Oakiness
## 3.9969 2.3395 0.4826 0.2732 1.1683 -0.6840
response_matrix<-wine$Quality
design_matrix<-matrix(c(rep(1, dim(wine)[1]), wine$Clarity, wine$Aroma, wine$Body, wine$Flavor, wine$Oakiness), nrow=dim(wine)[1])
betaMat<-solve(t(design_matrix)%*%design_matrix)%*%t(design_matrix)%*%response_matrix
hat_matrix<-design_matrix%*%solve(t(design_matrix)%*%design_matrix)%*%t(design_matrix)
y_hat_matrix<-design_matrix%*%betaMat
error_vector<-response_matrix-y_hat_matrix
n<-dim(wine)[1]
p<-5
ms_res<-sum(error_vector^2)/(n-p-1)
covMat<-ms_res*solve(t(design_matrix)%*%design_matrix)
I<-diag(dim(wine)[1])
ss_res<-t(response_matrix)%*%(I-hat_matrix)%*%response_matrix
ones<-matrix(c(rep(1, dim(wine)[1])),
nrow=dim(wine)[1])
J<-ones%*%t(ones)
ss_tot<-t(response_matrix)%*%(I-(J/n))%*%response_matrix
ss_reg<-t(response_matrix)%*%(hat_matrix-(J/n))%*%response_matrix
# Sum of Squares Regression:
ss_reg
## [,1]
## [1,] 111.5404
# Sum of Squares Residual:
ss_res
## [,1]
## [1,] 43.24801
# Sum of Squares Total
ss_tot
## [,1]
## [1,] 154.7884
ms_res<-ss_res/(n-p-1)
ms_reg<-ss_reg/p
f_stat<-ms_reg/ms_res
f_stat
## [,1]
## [1,] 16.50616
# This uses an "F" Distribution with n-p-1 degree of freedom
p_value<-pf(f_stat, p, n-p-1, lower.tail = FALSE)
p_value
## [,1]
## [1,] 4.70304e-08
We (I) reject the null hypothesis in favor of the alternative, where the p-value is 4.70304e-08 at a significance level of 0.05. There is compelling evidence that the clarity, flavor, body, aroma, and oakiness of a wine determine its quality.
# Using Backward Elimination
n<-dim(wine)[1]
p<-5
this.df=n-p-1
alpha<-0.05
# Rejection Region
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 4.149097
back1<-lm(Quality~Clarity+Aroma+Body+Flavor+Oakiness, data = wine)
anova(back1) # Clarity is the smallest, so it gets removed
## Analysis of Variance Table
##
## Response: Quality
## Df Sum Sq Mean Sq F value Pr(>F)
## Clarity 1 0.125 0.125 0.0926 0.7628120
## Aroma 1 77.353 77.353 57.2351 1.286e-08 ***
## Body 1 6.414 6.414 4.7461 0.0368417 *
## Flavor 1 19.050 19.050 14.0953 0.0006946 ***
## Oakiness 1 8.598 8.598 6.3616 0.0168327 *
## Residuals 32 43.248 1.352
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Next Step
p<-4
this.df=n-p-1
# New Rejection Region
reject_rej<-qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
reject_rej
## [1] 4.139252
back2<-lm(Quality~Aroma+Body+Flavor+Oakiness, data = wine)
anova_check<-anova(back2) # Body is the smallest, and its f-value falls just outside the rejection region
check<-anova_check$`F value`[2]
anova_check$`F value`[2] # 4.11759
## [1] 4.11759
# checking because I don't trust my own inspection skills
reject_rej>check
## [1] TRUE
# Next Step
p<-3
this.df=n-p-1
# New Rejection Region
qf(alpha, df1=1, df2=this.df, lower.tail = FALSE)
## [1] 4.130018
back3<-lm(Quality~Aroma+Flavor+Oakiness, data = wine)
anova(back3) # All the f values are greater than the rejection region, so this is our final model
## Analysis of Variance Table
##
## Response: Quality
## Df Sum Sq Mean Sq F value Pr(>F)
## Aroma 1 77.442 77.442 57.4226 8.401e-09 ***
## Flavor 1 24.494 24.494 18.1624 0.000152 ***
## Oakiness 1 6.999 6.999 5.1896 0.029127 *
## Residuals 34 45.853 1.349
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
final_model<-lm(Quality~Aroma+Flavor+Oakiness, data = wine)
summary(final_model)
##
## Call:
## lm(formula = Quality ~ Aroma + Flavor + Oakiness, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5707 -0.6256 0.1521 0.6467 1.7741
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.4672 1.3328 4.852 2.67e-05 ***
## Aroma 0.5801 0.2622 2.213 0.033740 *
## Flavor 1.1997 0.2749 4.364 0.000113 ***
## Oakiness -0.6023 0.2644 -2.278 0.029127 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.161 on 34 degrees of freedom
## Multiple R-squared: 0.7038, Adjusted R-squared: 0.6776
## F-statistic: 26.92 on 3 and 34 DF, p-value: 4.203e-09
The estimated beta coefficients are 6.4672 for the intercept, and 0.5801, 1.1997, and -0.6023 for the Aroma, Flavor, and Oakiness slopes, respectively.
# To find confidence intervals for my new model, I need a new Covariance Matrix:
response_matrix<-wine$Quality
design_matrix<-matrix(c(rep(1, dim(wine)[1]), wine$Aroma, wine$Flavor, wine$Oakiness), nrow=dim(wine)[1])
betaMat<-solve(t(design_matrix)%*%design_matrix)%*%t(design_matrix)%*%response_matrix
y_hat_matrix<-design_matrix%*%betaMat
error_vector<-response_matrix-y_hat_matrix
n<-dim(wine)[1]
p<-3
ms_res<-sum(error_vector^2)/(n-p-1)
covMat<-ms_res*solve(t(design_matrix)%*%design_matrix)
se<-sqrt(diag(covMat))
UB<-betaMat+1*qt(0.05/2, df = n-p-1, lower.tail = FALSE)*se
LB<-betaMat-1*qt(0.05/2, df = n-p-1, lower.tail = FALSE)*se
conf_int<-cbind(LB, UB)
# These are the confidence intervals for the beta estimates
conf_int
## [,1] [,2]
## [1,] 3.75864235 9.1757473
## [2,] 0.04729651 1.1129440
## [3,] 0.64106744 1.7583182
## [4,] -1.13965261 -0.0649967
#checking using confint()
confint(final_model)
## 2.5 % 97.5 %
## (Intercept) 3.75864235 9.1757473
## Aroma 0.04729651 1.1129440
## Flavor 0.64106744 1.7583182
## Oakiness -1.13965261 -0.0649967
In conclusion, in order to make a better quality wine, it would be best for Oregon winemakers to focus less on the Clarity and Body of the wine and instead focus on the Aroma, Flavor, and Oakiness. There is sufficient evidence that these three variables contribute the most and are the most important in depicting the Quality of the wine. However, based on the results, it looks like the higher amount of Oakiness results in a worse Quality wine, so if winemakers put more focus into increasing the Aroma and Flavor of the wine, however that may be, the Quality will increase.