# Global knitr chunk option: echo the R source of each chunk in the rendered
# report.
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
# Use the classic ggplot2 theme for every plot drawn after this point.
theme_set(theme_classic())
library(readxl)
# Read the red-wine quality workbook.
# NOTE(review): absolute, machine-specific path -- confirm the file location
# before running this on another machine.
winedata <- read_excel(path = "C:/Users/ADMIN/Desktop/winequalityred.xlsx")
# `wdata` is the working copy used by the rest of the analysis.
wdata <- winedata
# Preview the first rows.
head(x = wdata)
## # A tibble: 6 × 12
## fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.4 0.7 0 1.9 0.076
## 2 7.8 0.88 0 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.7 0 1.9 0.076
## 6 7.4 0.66 0 1.8 0.075
## # ℹ 7 more variables: free_sulfurdioxide <dbl>, total_sulfurdioxide <dbl>,
## # density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>, quality <dbl>
# Per-column minimum, quartiles, median, mean and maximum.
wdata %>% summary()
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free_sulfurdioxide total_sulfurdioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
# Exploratory scatter plot of pH against the quality score, with a trend line.
# `quality` takes only a handful of distinct values, so the default smoother
# for a data set of this size (a GAM with 10 knots) cannot be fitted and the
# trend layer is dropped with a "Computation failed" warning. A linear
# smoother needs no knots, always renders, and matches the linear models used
# in the rest of the analysis.
ggplot(wdata, aes(quality, pH)) +
  geom_point() +
  stat_smooth(method = "lm", formula = y ~ x)
# Simple linear regression: wine quality explained by pH alone.
model <- lm(formula = quality ~ pH, data = wdata)
# Print the coefficient table, residual summary and fit statistics.
model %>% summary()
##
## Call:
## lm(formula = quality ~ pH, data = wdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6817 -0.6394 0.3032 0.3878 2.4874
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.6359 0.4332 15.320 <2e-16 ***
## pH -0.3020 0.1307 -2.311 0.021 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8065 on 1597 degrees of freedom
## Multiple R-squared: 0.003333, Adjusted R-squared: 0.002709
## F-statistic: 5.34 on 1 and 1597 DF, p-value: 0.02096
# In-sample predictions: the model is scored on the same rows it was fitted on.
predictions <- predict(model, newdata = wdata)
# Agreement between predicted and observed quality (caret's RMSE and R2).
data.frame(
  RMSE = RMSE(pred = predictions, obs = wdata[["quality"]]),
  R2 = R2(pred = predictions, obs = wdata[["quality"]])
)
## RMSE R2
## 1 0.8059704 0.003332914
# Show the simple-regression summary again (same fit as printed earlier).
model %>% summary()
##
## Call:
## lm(formula = quality ~ pH, data = wdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6817 -0.6394 0.3032 0.3878 2.4874
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.6359 0.4332 15.320 <2e-16 ***
## pH -0.3020 0.1307 -2.311 0.021 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8065 on 1597 degrees of freedom
## Multiple R-squared: 0.003333, Adjusted R-squared: 0.002709
## F-statistic: 5.34 on 1 and 1597 DF, p-value: 0.02096
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Coefficient t-tests for the simple regression, via lmtest's coeftest().
model %>% coeftest()
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.63592 0.43315 15.3201 < 2e-16 ***
## pH -0.30198 0.13068 -2.3109 0.02096 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Scatter plot of quality against pH, overlaid with the least-squares line
# from a simple linear fit.
ggplot(data = wdata, mapping = aes(x = pH, y = quality)) +
  geom_point() +
  stat_smooth(method = "lm", formula = y ~ x)
# Multiple linear regression of quality on all eleven physicochemical
# measurements. The term order is kept exactly as written because the
# sequential ANOVA table computed later in the script depends on it.
model2 <- lm(
  formula = quality ~ pH + fixed_acidity + volatile_acidity +
    citric_acid + alcohol + sulphates + density + chlorides +
    residual_sugar + free_sulfurdioxide + total_sulfurdioxide,
  data = wdata
)
# Print the coefficient table, residual summary and fit statistics.
model2 %>% summary()
##
## Call:
## lm(formula = quality ~ pH + fixed_acidity + volatile_acidity +
## citric_acid + alcohol + sulphates + density + chlorides +
## residual_sugar + free_sulfurdioxide + total_sulfurdioxide,
## data = wdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.68911 -0.36652 -0.04699 0.45202 2.02498
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.197e+01 2.119e+01 1.036 0.3002
## pH -4.137e-01 1.916e-01 -2.159 0.0310 *
## fixed_acidity 2.499e-02 2.595e-02 0.963 0.3357
## volatile_acidity -1.084e+00 1.211e-01 -8.948 < 2e-16 ***
## citric_acid -1.826e-01 1.472e-01 -1.240 0.2150
## alcohol 2.762e-01 2.648e-02 10.429 < 2e-16 ***
## sulphates 9.163e-01 1.143e-01 8.014 2.13e-15 ***
## density -1.788e+01 2.163e+01 -0.827 0.4086
## chlorides -1.874e+00 4.193e-01 -4.470 8.37e-06 ***
## residual_sugar 1.633e-02 1.500e-02 1.089 0.2765
## free_sulfurdioxide 4.361e-03 2.171e-03 2.009 0.0447 *
## total_sulfurdioxide -3.265e-03 7.287e-04 -4.480 8.00e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.648 on 1587 degrees of freedom
## Multiple R-squared: 0.3606, Adjusted R-squared: 0.3561
## F-statistic: 81.35 on 11 and 1587 DF, p-value: < 2.2e-16
# Residuals of the multiple regression (`model2`). The variable name is kept
# because hist() derives its default title from it.
model_residuals <- model2[["residuals"]]
# Histogram: rough check of the shape of the residual distribution.
hist(x = model_residuals)
# Normal Q-Q plot of the residuals, followed by its reference line.
qqnorm(y = model_residuals)
qqline(y = model_residuals)
library(ggcorrplot)
# Pairwise correlations between all columns, rounded to two decimals.
corr_matrix <- round(cor(wdata), digits = 2)
# Lower-triangle heat map with variables reordered by hierarchical clustering
# and each coefficient printed in its cell.
ggcorrplot(
  corr = corr_matrix,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE
)
# Residuals-vs-fitted diagnostic for the multiple regression. Both axes must
# come from the same model (`model2`); pairing the residuals of the
# single-predictor `model` with the fitted values of `model2` produces a plot
# that diagnoses neither fit. Fitted values go on the x-axis and residuals on
# the y-axis, with a dashed zero line as the reference.
plot(
  fitted(model2), resid(model2),
  xlab = "Fitted values", ylab = "Residuals"
)
abline(h = 0, lty = 2)
# Sequential analysis-of-variance table for the multiple regression; each row
# tests a term given the terms listed before it.
model2 %>% anova()
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## pH 1 3.47 3.473 8.2717 0.0040805 **
## fixed_acidity 1 13.99 13.987 33.3100 9.432e-09 ***
## volatile_acidity 1 147.07 147.075 350.2463 < 2.2e-16 ***
## citric_acid 1 0.02 0.017 0.0403 0.8408671
## alcohol 1 175.84 175.845 418.7595 < 2.2e-16 ***
## sulphates 1 19.19 19.190 45.6983 1.932e-11 ***
## density 1 0.65 0.648 1.5420 0.2144982
## chlorides 1 6.20 6.205 14.7762 0.0001258 ***
## residual_sugar 1 0.20 0.195 0.4654 0.4952198
## free_sulfurdioxide 1 0.69 0.692 1.6483 0.1993792
## total_sulfurdioxide 1 8.43 8.427 20.0689 8.005e-06 ***
## Residuals 1587 666.41 0.420
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#library(forecast)
#forecast(model2)%>%
# autoplot()+xlab(
#"Quality"
#)
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.