What is wine?
# Load the red and white wine-quality tables. Both files are read as
# semicolon-delimited text with a header row.
# NOTE(review): the paths are absolute and machine-specific -- confirm they
# resolve on the machine running this script.
redwine <- read.csv("\\Users\\Thomas\\Desktop\\StatisticalLearning\\Final\\Wine+Wuality\\winequality-red.csv", sep = ";", header = TRUE)
whitewine <- read.csv("\\Users\\Thomas\\Desktop\\StatisticalLearning\\Final\\Wine+Wuality\\winequality-white.csv", sep = ";", header = TRUE)
# Preview the first rows of the red-wine table.
head(redwine)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality
## 1 5
## 2 5
## 3 5
## 4 6
## 5 5
## 6 5
# Preview the first six rows of the white-wine table.
head(whitewine, n = 6L)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.0 0.27 0.36 20.7 0.045
## 2 6.3 0.30 0.34 1.6 0.049
## 3 8.1 0.28 0.40 6.9 0.050
## 4 7.2 0.23 0.32 8.5 0.058
## 5 7.2 0.23 0.32 8.5 0.058
## 6 8.1 0.28 0.40 6.9 0.050
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 45 170 1.0010 3.00 0.45 8.8
## 2 14 132 0.9940 3.30 0.49 9.5
## 3 30 97 0.9951 3.26 0.44 10.1
## 4 47 186 0.9956 3.19 0.40 9.9
## 5 47 186 0.9956 3.19 0.40 9.9
## 6 30 97 0.9951 3.26 0.44 10.1
## quality
## 1 6
## 2 6
## 3 6
## 4 6
## 5 6
## 6 6
# Collect the column names of both tables and print the ones they share.
wnames <- colnames(whitewine)
rnames <- colnames(redwine)
print("Common Names:")
## [1] "Common Names:"
common_names <- intersect(wnames, rnames)
print(common_names)
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
# For every column except `quality`, draw two boxplots of that column grouped
# by quality score: red wine first, then white wine, side by side.
# The 1x2 layout is set once before the loop (each iteration fills exactly
# both panels), and the previous graphics settings are restored afterwards so
# later plots are not left in a two-panel layout.
old_par <- par(mfrow = c(1, 2))
for (name in setdiff(rnames, "quality")) {
  # Builds e.g. `fixed.acidity ~ quality`; named `plot_formula` so it does not
  # shadow stats::formula().
  plot_formula <- as.formula(paste(name, "~ quality"))
  boxplot(plot_formula, data = redwine)
  boxplot(plot_formula, data = whitewine)
}
par(old_par)











Just for fun: linear regressions of quality on the other measurements
# Least-squares fit of red-wine quality on every other column, then its
# coefficient table and fit statistics.
r.model <- lm(quality ~ ., data = redwine)
summary(r.model)
##
## Call:
## lm(formula = quality ~ ., data = redwine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.68911 -0.36652 -0.04699 0.45202 2.02498
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.197e+01 2.119e+01 1.036 0.3002
## fixed.acidity 2.499e-02 2.595e-02 0.963 0.3357
## volatile.acidity -1.084e+00 1.211e-01 -8.948 < 2e-16 ***
## citric.acid -1.826e-01 1.472e-01 -1.240 0.2150
## residual.sugar 1.633e-02 1.500e-02 1.089 0.2765
## chlorides -1.874e+00 4.193e-01 -4.470 8.37e-06 ***
## free.sulfur.dioxide 4.361e-03 2.171e-03 2.009 0.0447 *
## total.sulfur.dioxide -3.265e-03 7.287e-04 -4.480 8.00e-06 ***
## density -1.788e+01 2.163e+01 -0.827 0.4086
## pH -4.137e-01 1.916e-01 -2.159 0.0310 *
## sulphates 9.163e-01 1.143e-01 8.014 2.13e-15 ***
## alcohol 2.762e-01 2.648e-02 10.429 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.648 on 1587 degrees of freedom
## Multiple R-squared: 0.3606, Adjusted R-squared: 0.3561
## F-statistic: 81.35 on 11 and 1587 DF, p-value: < 2.2e-16
# Linear fit of white-wine quality on a hand-picked subset of the columns
# (citric.acid, chlorides and total.sulfur.dioxide are left out).
summary(
  lm(
    quality ~ fixed.acidity + volatile.acidity + residual.sugar +
      free.sulfur.dioxide + density + pH + sulphates + alcohol,
    data = whitewine
  )
)
##
## Call:
## lm(formula = quality ~ fixed.acidity + volatile.acidity + residual.sugar +
## free.sulfur.dioxide + density + pH + sulphates + alcohol,
## data = whitewine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8246 -0.4938 -0.0396 0.4660 3.1208
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.541e+02 1.810e+01 8.514 < 2e-16 ***
## fixed.acidity 6.810e-02 2.043e-02 3.333 0.000864 ***
## volatile.acidity -1.888e+00 1.095e-01 -17.242 < 2e-16 ***
## residual.sugar 8.285e-02 7.287e-03 11.370 < 2e-16 ***
## free.sulfur.dioxide 3.349e-03 6.766e-04 4.950 7.67e-07 ***
## density -1.543e+02 1.834e+01 -8.411 < 2e-16 ***
## pH 6.942e-01 1.034e-01 6.717 2.07e-11 ***
## sulphates 6.285e-01 9.997e-02 6.287 3.52e-10 ***
## alcohol 1.932e-01 2.408e-02 8.021 1.31e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7512 on 4889 degrees of freedom
## Multiple R-squared: 0.2818, Adjusted R-squared: 0.2806
## F-statistic: 239.7 on 8 and 4889 DF, p-value: < 2.2e-16
# White-wine model that adds polynomial terms via poly(): degree 2 for
# fixed.acidity, density and alcohol, degree 3 for volatile.acidity; the
# remaining predictors enter linearly.
w.model <- lm(
  quality ~ poly(fixed.acidity, 2) + poly(volatile.acidity, 3) +
    residual.sugar + free.sulfur.dioxide + poly(density, 2) + pH +
    sulphates + poly(alcohol, 2),
  data = whitewine
)
summary(w.model)
##
## Call:
## lm(formula = quality ~ poly(fixed.acidity, 2) + poly(volatile.acidity,
## 3) + residual.sugar + free.sulfur.dioxide + poly(density,
## 2) + pH + sulphates + poly(alcohol, 2), data = whitewine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7496 -0.4940 -0.0328 0.4664 3.3045
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.715e+00 3.956e-01 4.336 1.48e-05 ***
## poly(fixed.acidity, 2)1 7.726e+00 1.317e+00 5.866 4.75e-09 ***
## poly(fixed.acidity, 2)2 -3.667e+00 7.615e-01 -4.816 1.51e-06 ***
## poly(volatile.acidity, 3)1 -1.374e+01 7.809e-01 -17.589 < 2e-16 ***
## poly(volatile.acidity, 3)2 2.448e+00 7.734e-01 3.165 0.001561 **
## poly(volatile.acidity, 3)3 -3.023e+00 7.664e-01 -3.945 8.10e-05 ***
## residual.sugar 9.886e-02 8.015e-03 12.334 < 2e-16 ***
## free.sulfur.dioxide 3.388e-03 6.727e-04 5.036 4.92e-07 ***
## poly(density, 2)1 -4.456e+01 4.413e+00 -10.097 < 2e-16 ***
## poly(density, 2)2 4.900e+00 9.440e-01 5.190 2.18e-07 ***
## pH 9.638e-01 1.100e-01 8.760 < 2e-16 ***
## sulphates 6.906e-01 9.993e-02 6.910 5.45e-12 ***
## poly(alcohol, 2)1 8.906e+00 2.482e+00 3.588 0.000336 ***
## poly(alcohol, 2)2 3.971e+00 8.078e-01 4.916 9.13e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7436 on 4884 degrees of freedom
## Multiple R-squared: 0.2969, Adjusted R-squared: 0.295
## F-statistic: 158.7 on 13 and 4884 DF, p-value: < 2.2e-16
# In-sample check of the white-wine polynomial model.
# `pred` holds the model's fitted values for the rows it was trained on;
# taking them from fitted() avoids re-running predict() over the same data.
# NOTE: the model was fit on `whitewine`, so the share computed below is
# training-set accuracy, not an out-of-sample estimate.
pred <- fitted(w.model)
# Observed quality (y) against fitted quality (x).
plot(y = whitewine$quality, x = fitted(w.model))

# Count rows where the fitted value, rounded to the nearest integer, equals
# the observed quality score, then report that count as a share of all rows.
correct <- sum(whitewine$quality == round(pred))
correct / nrow(whitewine)
## [1] 0.5177624
# Signed error of the rounded fit: observed quality minus rounded fitted value.
# The variable keeps the name `diff` because hist() takes its default title
# and axis label from the argument expression.
fitted_class <- round(fitted(w.model))
diff <- whitewine$quality - fitted_class
hist(diff)

# Distribution of the observed quality scores in the white-wine table.
hist(whitewine$quality)

# Distribution of the observed quality scores in the red-wine table.
hist(redwine$quality)
