# Global knitr chunk option: echo the R source of each chunk in the output.
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
# Apply the classic ggplot2 theme to every plot drawn below.
theme_set(theme_classic())
library(readxl)

# Read the red-wine quality workbook. The raw import stays in `winedata`;
# all later steps work on the `wdata` copy.
# NOTE(review): absolute, machine-specific path -- the script only runs where
# this exact file location exists.
winedata <- read_excel(path = "C:/Users/ADMIN/Desktop/winequalityred.xlsx")
wdata <- winedata

# Preview the first rows.
head(wdata)
## # A tibble: 6 × 12
##   fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
##           <dbl>            <dbl>       <dbl>          <dbl>     <dbl>
## 1           7.4             0.7         0               1.9     0.076
## 2           7.8             0.88        0               2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.7         0               1.9     0.076
## 6           7.4             0.66        0               1.8     0.075
## # ℹ 7 more variables: free_sulfurdioxide <dbl>, total_sulfurdioxide <dbl>,
## #   density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>, quality <dbl>
# Per-column descriptive statistics (min, quartiles, median, mean, max).
summary(wdata)
##  fixed_acidity   volatile_acidity  citric_acid    residual_sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides       free_sulfurdioxide total_sulfurdioxide    density      
##  Min.   :0.01200   Min.   : 1.00      Min.   :  6.00      Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00      1st Qu.: 22.00      1st Qu.:0.9956  
##  Median :0.07900   Median :14.00      Median : 38.00      Median :0.9968  
##  Mean   :0.08747   Mean   :15.87      Mean   : 46.47      Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00      3rd Qu.: 62.00      3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :72.00      Max.   :289.00      Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6581   Mean   :10.42   Mean   :5.636  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000
# Exploratory scatter of the response (quality) against the predictor (pH).
# pH belongs on the x-axis: quality takes only a handful of distinct integer
# scores, so using it as x made the default GAM smoother fail with
# "x has insufficient unique values to support 10 knots". With pH as x the
# smoother has enough unique values, and the axes now match the
# `quality ~ pH` model fitted afterwards.
# The method and formula are spelled out so ggplot2 does not have to guess
# them (and does not print the "using method = ..." message).
ggplot(wdata, aes(pH, quality)) +
  geom_point() +
  stat_smooth(method = "gam", formula = y ~ s(x, bs = "cs"))

Simple Linear Regression Model: Quality vs. pH

# Fit the simple linear regression of quality on pH.
model <- lm(quality ~ pH, data = wdata)

# Print coefficients, residual quantiles and goodness-of-fit statistics.
summary(model)
## 
## Call:
## lm(formula = quality ~ pH, data = wdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6817 -0.6394  0.3032  0.3878  2.4874 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.6359     0.4332  15.320   <2e-16 ***
## pH           -0.3020     0.1307  -2.311    0.021 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8065 on 1597 degrees of freedom
## Multiple R-squared:  0.003333,   Adjusted R-squared:  0.002709 
## F-statistic:  5.34 on 1 and 1597 DF,  p-value: 0.02096
# Fitted values of the simple model, computed on the same data it was fit to.
predictions <- predict(model, newdata = wdata)

# In-sample performance: root mean squared error and R-squared between the
# predictions and the observed quality scores (caret helpers).
data.frame(
  RMSE = RMSE(pred = predictions, obs = wdata$quality),
  R2 = R2(pred = predictions, obs = wdata$quality)
)
##        RMSE          R2
## 1 0.8059704 0.003332914
# Summary of the simple quality ~ pH model.
# NOTE(review): `model` has not been refit since the earlier summary(model)
# call, so this prints the same table a second time.
summary(model)
## 
## Call:
## lm(formula = quality ~ pH, data = wdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6817 -0.6394  0.3032  0.3878  2.4874 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.6359     0.4332  15.320   <2e-16 ***
## pH           -0.3020     0.1307  -2.311    0.021 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8065 on 1597 degrees of freedom
## Multiple R-squared:  0.003333,   Adjusted R-squared:  0.002709 
## F-statistic:  5.34 on 1 and 1597 DF,  p-value: 0.02096
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# t tests of the simple model's coefficients (lmtest::coeftest).
coeftest(model)
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.63592    0.43315 15.3201  < 2e-16 ***
## pH          -0.30198    0.13068 -2.3109  0.02096 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Scatter of quality against pH with the least-squares line and its
# confidence band overlaid.
ggplot(data = wdata, mapping = aes(x = pH, y = quality)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

Multiple regression

  # Multiple linear regression of quality on all eleven physicochemical
  # measurements. The term order is kept as-is because the sequential ANOVA
  # run later depends on it.
  model2 <- lm(
    quality ~ pH + fixed_acidity + volatile_acidity + citric_acid + alcohol +
      sulphates + density + chlorides + residual_sugar + free_sulfurdioxide +
      total_sulfurdioxide,
    data = wdata
  )

# Coefficient table and fit statistics for the multiple regression.
summary(model2)
## 
## Call:
## lm(formula = quality ~ pH + fixed_acidity + volatile_acidity + 
##     citric_acid + alcohol + sulphates + density + chlorides + 
##     residual_sugar + free_sulfurdioxide + total_sulfurdioxide, 
##     data = wdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.68911 -0.36652 -0.04699  0.45202  2.02498 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.197e+01  2.119e+01   1.036   0.3002    
## pH                  -4.137e-01  1.916e-01  -2.159   0.0310 *  
## fixed_acidity        2.499e-02  2.595e-02   0.963   0.3357    
## volatile_acidity    -1.084e+00  1.211e-01  -8.948  < 2e-16 ***
## citric_acid         -1.826e-01  1.472e-01  -1.240   0.2150    
## alcohol              2.762e-01  2.648e-02  10.429  < 2e-16 ***
## sulphates            9.163e-01  1.143e-01   8.014 2.13e-15 ***
## density             -1.788e+01  2.163e+01  -0.827   0.4086    
## chlorides           -1.874e+00  4.193e-01  -4.470 8.37e-06 ***
## residual_sugar       1.633e-02  1.500e-02   1.089   0.2765    
## free_sulfurdioxide   4.361e-03  2.171e-03   2.009   0.0447 *  
## total_sulfurdioxide -3.265e-03  7.287e-04  -4.480 8.00e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.648 on 1587 degrees of freedom
## Multiple R-squared:  0.3606, Adjusted R-squared:  0.3561 
## F-statistic: 81.35 on 11 and 1587 DF,  p-value: < 2.2e-16
# Residuals of the multiple regression. The variable name is kept because
# hist() derives its title and axis label from it.
model_residuals <- residuals(model2)

# Distribution of the residuals.
hist(model_residuals)

# Normal Q-Q plot of the residuals, with the reference line added on top.
qqnorm(model_residuals)
qqline(model_residuals)

library(ggcorrplot)

# Pairwise correlations between all columns, rounded to two decimals.
corr_matrix <- wdata %>%
  cor() %>%
  round(digits = 2)

# Lower-triangle heat map, variables ordered by hierarchical clustering,
# with the correlation value printed in each cell.
ggcorrplot(
  corr_matrix,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE
)

# Residuals-vs-fitted diagnostic for the multiple regression.
# Both quantities must come from the same fit: the earlier version paired
# the residuals of the simple model (`model`) with the fitted values of
# `model2`, which is not a meaningful diagnostic for either model.
# Fitted values go on the x-axis by convention; the dashed line marks zero.
plot(
  fitted(model2), residuals(model2),
  xlab = "Fitted values", ylab = "Residuals"
)
abline(h = 0, lty = 2)

# Analysis-of-variance table for the multiple regression; terms are listed in
# the order they appear in the model formula.
# NOTE(review): anova() on a single lm fit is documented as sequential, so
# each row's sum of squares depends on that term order -- confirm this is the
# intended test.
anova(model2)
## Analysis of Variance Table
## 
## Response: quality
##                       Df Sum Sq Mean Sq  F value    Pr(>F)    
## pH                     1   3.47   3.473   8.2717 0.0040805 ** 
## fixed_acidity          1  13.99  13.987  33.3100 9.432e-09 ***
## volatile_acidity       1 147.07 147.075 350.2463 < 2.2e-16 ***
## citric_acid            1   0.02   0.017   0.0403 0.8408671    
## alcohol                1 175.84 175.845 418.7595 < 2.2e-16 ***
## sulphates              1  19.19  19.190  45.6983 1.932e-11 ***
## density                1   0.65   0.648   1.5420 0.2144982    
## chlorides              1   6.20   6.205  14.7762 0.0001258 ***
## residual_sugar         1   0.20   0.195   0.4654 0.4952198    
## free_sulfurdioxide     1   0.69   0.692   1.6483 0.1993792    
## total_sulfurdioxide    1   8.43   8.427  20.0689 8.005e-06 ***
## Residuals           1587 666.41   0.420                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#library(forecast)
#forecast(model2)%>%
# autoplot()+xlab(
#"Quality"
#)

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.