wine <- read.csv("winequality-red.csv", check.names = TRUE)
names(wine)
##  [1] "fixed.acidity"        "volatile.acidity"     "citric.acid"         
##  [4] "residual.sugar"       "chlorides"            "free.sulfur.dioxide" 
##  [7] "total.sulfur.dioxide" "density"              "pH"                  
## [10] "sulphates"            "alcohol"              "quality"
str(wine)
## 'data.frame':    1599 obs. of  12 variables:
##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...
summary(wine)
##  fixed.acidity   volatile.acidity  citric.acid    residual.sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00       1st Qu.:0.9956  
##  Median :0.07900   Median :14.00       Median : 38.00       Median :0.9968  
##  Mean   :0.08747   Mean   :15.87       Mean   : 46.47       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6581   Mean   :10.42   Mean   :5.636  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000
## Manual calculation simple linear regression slope beta
b <- cov(wine$volatile.acidity, wine$quality) / var(wine$volatile.acidity)
b
## [1] -1.761438
##Intercept alpha
a <- mean(wine$quality) - b * mean(wine$volatile.acidity)
a
## [1] 6.565746

##Correlation analysis

r <- cov(wine$volatile.acidity, wine$quality) /
     (sd(wine$volatile.acidity) * sd(wine$quality))
r
## [1] -0.3905578
cor(wine$volatile.acidity, wine$quality)
## [1] -0.3905578
# slope from correlation
r * (sd(wine$quality) / sd(wine$volatile.acidity))
## [1] -1.761438
model_simple <- lm(quality ~ volatile.acidity, data = wine)
summary(model_simple)
## 
## Call:
## lm(formula = quality ~ volatile.acidity, data = wine)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.79071 -0.54411 -0.00687  0.47350  2.93148 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       6.56575    0.05791  113.39   <2e-16 ***
## volatile.acidity -1.76144    0.10389  -16.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7437 on 1597 degrees of freedom
## Multiple R-squared:  0.1525, Adjusted R-squared:  0.152 
## F-statistic: 287.4 on 1 and 1597 DF,  p-value: < 2.2e-16

##Volatile acidity coefficient is negative: as volatile acidity increases, predicted quality decreases. [winequality-red | Excel] The relationship is typically statistically significant (very small p-value in this dataset). [winequality-red | Excel] R2R^2R2 is usually modest, meaning volatile acidity explains some but not most of the variation in quality (quality depends on multiple chemical characteristics).

reg <- function(y, x) {
  x <- as.matrix(x)
  x <- cbind(Intercept = 1, x)
  b <- solve(t(x) %*% x) %*% t(x) %*% y
  colnames(b) <- "estimate"
  print(b)
}
## Using next two columns after volatile acidity
reg(y = wine$quality, x = wine[c("volatile.acidity", "citric.acid", "residual.sugar")])
##                      estimate
## Intercept         6.516649077
## volatile.acidity -1.730127305
## citric.acid       0.052273675
## residual.sugar    0.007249562
model_multi <- lm(quality ~ volatile.acidity + citric.acid + residual.sugar, data = wine)
summary(model_multi)
## 
## Call:
## lm(formula = quality ~ volatile.acidity + citric.acid + residual.sugar, 
##     data = wine)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.78822 -0.53729 -0.01137  0.46846  2.94120 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       6.51665    0.09159  71.154   <2e-16 ***
## volatile.acidity -1.73013    0.12531 -13.807   <2e-16 ***
## citric.acid       0.05227    0.11639   0.449    0.653    
## residual.sugar    0.00725    0.01340   0.541    0.589    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.744 on 1595 degrees of freedom
## Multiple R-squared:  0.1529, Adjusted R-squared:  0.1513 
## F-statistic: 95.93 on 3 and 1595 DF,  p-value: < 2.2e-16
# columns 2:4 are volatile acidity, citric acid, residual sugar
reg(y = wine$quality, x = wine[2:4])
##                      estimate
## Intercept         6.516649077
## volatile.acidity -1.730127305
## citric.acid       0.052273675
## residual.sugar    0.007249562
model_multi <- lm(quality ~ ., data = wine[, c("quality", names(wine)[2:4])])
summary(model_multi)
## 
## Call:
## lm(formula = quality ~ ., data = wine[, c("quality", names(wine)[2:4])])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.78822 -0.53729 -0.01137  0.46846  2.94120 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       6.51665    0.09159  71.154   <2e-16 ***
## volatile.acidity -1.73013    0.12531 -13.807   <2e-16 ***
## citric.acid       0.05227    0.11639   0.449    0.653    
## residual.sugar    0.00725    0.01340   0.541    0.589    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.744 on 1595 degrees of freedom
## Multiple R-squared:  0.1529, Adjusted R-squared:  0.1513 
## F-statistic: 95.93 on 3 and 1595 DF,  p-value: < 2.2e-16
# Diagnostic plots for the multiple regression
plot(model_multi)