wine <- read.csv("winequality-red.csv", check.names = TRUE)
names(wine)
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
str(wine)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
summary(wine)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
## Manual calculation simple linear regression slope beta
b <- cov(wine$volatile.acidity, wine$quality) / var(wine$volatile.acidity)
b
## [1] -1.761438
##Intercept alpha
a <- mean(wine$quality) - b * mean(wine$volatile.acidity)
a
## [1] 6.565746
##Correlation analysis
r <- cov(wine$volatile.acidity, wine$quality) /
(sd(wine$volatile.acidity) * sd(wine$quality))
r
## [1] -0.3905578
cor(wine$volatile.acidity, wine$quality)
## [1] -0.3905578
# slope from correlation
r * (sd(wine$quality) / sd(wine$volatile.acidity))
## [1] -1.761438
model_simple <- lm(quality ~ volatile.acidity, data = wine)
summary(model_simple)
##
## Call:
## lm(formula = quality ~ volatile.acidity, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.79071 -0.54411 -0.00687 0.47350 2.93148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.56575 0.05791 113.39 <2e-16 ***
## volatile.acidity -1.76144 0.10389 -16.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7437 on 1597 degrees of freedom
## Multiple R-squared: 0.1525, Adjusted R-squared: 0.152
## F-statistic: 287.4 on 1 and 1597 DF, p-value: < 2.2e-16
##Volatile acidity coefficient is negative: as volatile acidity increases, predicted quality decreases. [winequality-red | Excel] The relationship is typically statistically significant (very small p-value in this dataset). [winequality-red | Excel] R2R^2R2 is usually modest, meaning volatile acidity explains some but not most of the variation in quality (quality depends on multiple chemical characteristics).
reg <- function(y, x) {
x <- as.matrix(x)
x <- cbind(Intercept = 1, x)
b <- solve(t(x) %*% x) %*% t(x) %*% y
colnames(b) <- "estimate"
print(b)
}
## Using next two columns after volatile acidity
reg(y = wine$quality, x = wine[c("volatile.acidity", "citric.acid", "residual.sugar")])
## estimate
## Intercept 6.516649077
## volatile.acidity -1.730127305
## citric.acid 0.052273675
## residual.sugar 0.007249562
model_multi <- lm(quality ~ volatile.acidity + citric.acid + residual.sugar, data = wine)
summary(model_multi)
##
## Call:
## lm(formula = quality ~ volatile.acidity + citric.acid + residual.sugar,
## data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.78822 -0.53729 -0.01137 0.46846 2.94120
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.51665 0.09159 71.154 <2e-16 ***
## volatile.acidity -1.73013 0.12531 -13.807 <2e-16 ***
## citric.acid 0.05227 0.11639 0.449 0.653
## residual.sugar 0.00725 0.01340 0.541 0.589
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.744 on 1595 degrees of freedom
## Multiple R-squared: 0.1529, Adjusted R-squared: 0.1513
## F-statistic: 95.93 on 3 and 1595 DF, p-value: < 2.2e-16
# columns 2:4 are volatile acidity, citric acid, residual sugar
reg(y = wine$quality, x = wine[2:4])
## estimate
## Intercept 6.516649077
## volatile.acidity -1.730127305
## citric.acid 0.052273675
## residual.sugar 0.007249562
model_multi <- lm(quality ~ ., data = wine[, c("quality", names(wine)[2:4])])
summary(model_multi)
##
## Call:
## lm(formula = quality ~ ., data = wine[, c("quality", names(wine)[2:4])])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.78822 -0.53729 -0.01137 0.46846 2.94120
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.51665 0.09159 71.154 <2e-16 ***
## volatile.acidity -1.73013 0.12531 -13.807 <2e-16 ***
## citric.acid 0.05227 0.11639 0.449 0.653
## residual.sugar 0.00725 0.01340 0.541 0.589
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.744 on 1595 degrees of freedom
## Multiple R-squared: 0.1529, Adjusted R-squared: 0.1513
## F-statistic: 95.93 on 3 and 1595 DF, p-value: < 2.2e-16
# Diagnostic plots for the multiple regression
plot(model_multi)