In this case, we want to see the relationship between wine substantial and quality based on the assessment of wine experts. Moreover, predicting quality of wine so we can deliminate assestement of wine from expert.
Read data
wine <- read.csv("winequality-red.csv")
head(wine,10)
Suspect Data
glimpse(wine)
#> Rows: 1,599
#> Columns: 12
#> $ fixed.acidity <dbl> 7.4, 7.8, 7.8, 11.2, 7.4, 7.4, 7.9, 7.3, 7.8, 7.5…
#> $ volatile.acidity <dbl> 0.700, 0.880, 0.760, 0.280, 0.700, 0.660, 0.600, …
#> $ citric.acid <dbl> 0.00, 0.00, 0.04, 0.56, 0.00, 0.00, 0.06, 0.00, 0…
#> $ residual.sugar <dbl> 1.9, 2.6, 2.3, 1.9, 1.9, 1.8, 1.6, 1.2, 2.0, 6.1,…
#> $ chlorides <dbl> 0.076, 0.098, 0.092, 0.075, 0.076, 0.075, 0.069, …
#> $ free.sulfur.dioxide <dbl> 11, 25, 15, 17, 11, 13, 15, 15, 9, 17, 15, 17, 16…
#> $ total.sulfur.dioxide <dbl> 34, 67, 54, 60, 34, 40, 59, 21, 18, 102, 65, 102,…
#> $ density <dbl> 0.9978, 0.9968, 0.9970, 0.9980, 0.9978, 0.9978, 0…
#> $ pH <dbl> 3.51, 3.20, 3.26, 3.16, 3.51, 3.51, 3.30, 3.39, 3…
#> $ sulphates <dbl> 0.56, 0.68, 0.65, 0.58, 0.56, 0.56, 0.46, 0.47, 0…
#> $ alcohol <dbl> 9.4, 9.8, 9.8, 9.8, 9.4, 9.4, 9.4, 10.0, 9.5, 10.…
#> $ quality <int> 5, 5, 5, 6, 5, 5, 5, 7, 7, 5, 5, 5, 5, 5, 5, 5, 7…
nrow(wine)
#> [1] 1599
colSums(is.na(wine))
#> fixed.acidity volatile.acidity citric.acid
#> 0 0 0
#> residual.sugar chlorides free.sulfur.dioxide
#> 0 0 0
#> total.sulfur.dioxide density pH
#> 0 0 0
#> sulphates alcohol quality
#> 0 0 0
summary(wine)
#> fixed.acidity volatile.acidity citric.acid residual.sugar
#> Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
#> 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
#> Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
#> Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
#> 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
#> Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
#> chlorides free.sulfur.dioxide total.sulfur.dioxide density
#> Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
#> 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
#> Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
#> Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
#> 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
#> Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
#> pH sulphates alcohol quality
#> Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
#> 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
#> Median :3.310 Median :0.6200 Median :10.20 Median :6.000
#> Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
#> 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
#> Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
boxplot(wine)
### Outlier Iteration
#Create outlier function
outl <- function(x, na.rm = FALSE){
qq <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm)
iqr <- diff(qq)
x > qq[1] - 1.5*iqr & x < qq[2] + 1.5*iqr
}
#Iteration Outliers 1
which_out1 <- colwise(outl)(wine)
wine_clean1 <- subset(wine, rowSums(which_out1) == ncol(which_out1))
#Iteration Outliers 2
which_out2 <- colwise(outl)(wine_clean1)
wine_clean2 <- subset(wine_clean1, rowSums(which_out2) == ncol(which_out2))
#Iteration Outliers 3
which_out3 <- colwise(outl)(wine_clean2)
wine_clean3 <- subset(wine_clean2, rowSums(which_out3) == ncol(which_out3))
#Iteration Outliers 4
which_out4 <- colwise(outl)(wine_clean3)
wine_clean4 <- subset(wine_clean3, rowSums(which_out4) == ncol(which_out4))
#Iteration Outliers 5
which_out5 <- colwise(outl)(wine_clean4)
wine_clean5 <- subset(wine_clean4, rowSums(which_out5) == ncol(which_out5))
nrow(wine_clean5)
#> [1] 923
boxplot(wine_clean5)
# Data Processing
ggcorr(data = wine_clean5, label = TRUE)
Linierity
cor.test(wine_clean5$quality,wine_clean5$alcohol)$p.value #linier
#> [1] 0.000000000000000000000000000000000000000000000000000000000378668
cor.test(wine_clean5$quality,wine_clean5$sulphates)$p.value #linier
#> [1] 0.000000000000000000000000000000000000000000092605
cor.test(wine_clean5$quality,wine_clean5$pH)$p.value #linier
#> [1] 0.03328923
cor.test(wine_clean5$quality,wine_clean5$density)$p.value #linier
#> [1] 0.000000000009770698
cor.test(wine_clean5$quality,wine_clean5$total.sulfur.dioxide)$p.value #linier
#> [1] 0.0000007509132
cor.test(wine_clean5$quality,wine_clean5$free.sulfur.dioxide)$p.value #non linier
#> [1] 0.5958058
cor.test(wine_clean5$quality,wine_clean5$chlorides)$p.value #linier
#> [1] 0.00006030588
cor.test(wine_clean5$quality,wine_clean5$residual.sugar)$p.value #no linier
#> [1] 0.1387455
cor.test(wine_clean5$quality,wine_clean5$citric.acid)$p.value #linier
#> [1] 0.00000000001237595
cor.test(wine_clean5$quality,wine_clean5$volatile.acidity)$p.value #linier
#> [1] 0.0000000000000000000000000007967186
cor.test(wine_clean5$quality,wine_clean5$fixed.acidity)$p.value #linier
#> [1] 0.0010324
linierity 1. alcohol 2. sulphates 3. pH 4. density 5. total.sulfur.dioxide 6. chlorides 7. citric.acid 8. volatile.acidity 9. fixed.acidity
inlinierity 1. free.sulfur.dioxide 2. residual.sugar
Take out variable non linier
wine_clean6 <- wine_clean5 %>%
select(-c(free.sulfur.dioxide, residual.sugar))
Correlation
ggcorr(wine_clean6, geom = "blank", label = T, label_size = 3, hjust = 1, size = 3, layout.exp = 2) +
geom_point(size = 8, aes(color = coefficient > 0, alpha = abs(coefficient) >= 0.5)) +
scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
guides(color = "none", alpha = "none")
RNGkind(sample.kind = "Rounding")
set.seed(123)
# train-test splitting
index <- sample(nrow(wine_clean6), nrow(wine_clean6)*0.75)
# sms_dtm = DocumentTermMatrix yang tidak ada labelnya
wine_train <- wine_clean6[index,]
wine_test <- wine_clean6[-index,]
# model tanpa prediktor
model_wine_none <- lm(quality ~ 1, data = wine_train)
# model dengan semua prediktor
model_wine_all <- lm(quality ~ ., data = wine_train)
model_back <- step(object = model_wine_all, direction = "backward", trace = F)
summary(model_back)
#>
#> Call:
#> lm(formula = quality ~ volatile.acidity + citric.acid + pH +
#> sulphates + alcohol, data = wine_train)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -1.94525 -0.38253 -0.05837 0.44910 1.67451
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 3.89418 0.71033 5.482 0.0000000591 ***
#> volatile.acidity -0.95079 0.17737 -5.360 0.0000001136 ***
#> citric.acid -0.50392 0.18349 -2.746 0.00618 **
#> pH -0.63038 0.19806 -3.183 0.00153 **
#> sulphates 2.13242 0.22485 9.484 < 0.0000000000000002 ***
#> alcohol 0.30478 0.02397 12.713 < 0.0000000000000002 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 0.5708 on 686 degrees of freedom
#> Multiple R-squared: 0.3927, Adjusted R-squared: 0.3882
#> F-statistic: 88.7 on 5 and 686 DF, p-value: < 0.00000000000000022
model_forward <- step(object = model_wine_none,
scope = list(lower = model_wine_none,
upper = model_wine_all),
direction = "forward",
trace = T)
#> Start: AIC=-435.07
#> quality ~ 1
#>
#> Df Sum of Sq RSS AIC
#> + alcohol 1 92.400 275.57 -633.17
#> + sulphates 1 71.064 296.90 -581.56
#> + volatile.acidity 1 46.661 321.30 -526.90
#> + citric.acid 1 20.401 347.56 -472.54
#> + density 1 19.206 348.76 -470.16
#> + total.sulfur.dioxide 1 10.846 357.12 -453.77
#> + chlorides 1 6.033 361.93 -444.51
#> + fixed.acidity 1 4.253 363.71 -441.11
#> + pH 1 2.614 365.35 -438.00
#> <none> 367.97 -435.07
#>
#> Step: AIC=-633.17
#> quality ~ alcohol
#>
#> Df Sum of Sq RSS AIC
#> + sulphates 1 38.848 236.72 -736.32
#> + volatile.acidity 1 21.235 254.33 -686.66
#> + citric.acid 1 6.173 269.39 -646.84
#> + pH 1 3.224 272.34 -639.31
#> + fixed.acidity 1 2.676 272.89 -637.92
#> + density 1 1.637 273.93 -635.29
#> <none> 275.56 -633.17
#> + total.sulfur.dioxide 1 0.402 275.16 -632.18
#> + chlorides 1 0.003 275.56 -631.18
#>
#> Step: AIC=-736.32
#> quality ~ alcohol + sulphates
#>
#> Df Sum of Sq RSS AIC
#> + volatile.acidity 1 9.0517 227.66 -761.30
#> + pH 1 3.8196 232.90 -745.58
#> + citric.acid 1 1.1222 235.59 -737.61
#> + fixed.acidity 1 0.7406 235.98 -736.49
#> <none> 236.72 -736.32
#> + total.sulfur.dioxide 1 0.5934 236.12 -736.06
#> + chlorides 1 0.0605 236.66 -734.50
#> + density 1 0.0573 236.66 -734.49
#>
#> Step: AIC=-761.3
#> quality ~ alcohol + sulphates + volatile.acidity
#>
#> Df Sum of Sq RSS AIC
#> + pH 1 1.72520 225.94 -764.57
#> + citric.acid 1 0.88252 226.78 -761.99
#> <none> 227.66 -761.30
#> + total.sulfur.dioxide 1 0.45086 227.21 -760.68
#> + chlorides 1 0.11253 227.55 -759.65
#> + fixed.acidity 1 0.08754 227.58 -759.57
#> + density 1 0.01596 227.65 -759.35
#>
#> Step: AIC=-764.57
#> quality ~ alcohol + sulphates + volatile.acidity + pH
#>
#> Df Sum of Sq RSS AIC
#> + citric.acid 1 2.45723 223.48 -770.14
#> <none> 225.94 -764.57
#> + fixed.acidity 1 0.62262 225.32 -764.48
#> + total.sulfur.dioxide 1 0.35362 225.59 -763.65
#> + density 1 0.11266 225.83 -762.91
#> + chlorides 1 0.03026 225.91 -762.66
#>
#> Step: AIC=-770.14
#> quality ~ alcohol + sulphates + volatile.acidity + pH + citric.acid
#>
#> Df Sum of Sq RSS AIC
#> <none> 223.48 -770.14
#> + density 1 0.226214 223.26 -768.84
#> + total.sulfur.dioxide 1 0.152620 223.33 -768.61
#> + chlorides 1 0.145111 223.34 -768.58
#> + fixed.acidity 1 0.005148 223.48 -768.15
summary(model_forward)
#>
#> Call:
#> lm(formula = quality ~ alcohol + sulphates + volatile.acidity +
#> pH + citric.acid, data = wine_train)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -1.94525 -0.38253 -0.05837 0.44910 1.67451
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 3.89418 0.71033 5.482 0.0000000591 ***
#> alcohol 0.30478 0.02397 12.713 < 0.0000000000000002 ***
#> sulphates 2.13242 0.22485 9.484 < 0.0000000000000002 ***
#> volatile.acidity -0.95079 0.17737 -5.360 0.0000001136 ***
#> pH -0.63038 0.19806 -3.183 0.00153 **
#> citric.acid -0.50392 0.18349 -2.746 0.00618 **
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 0.5708 on 686 degrees of freedom
#> Multiple R-squared: 0.3927, Adjusted R-squared: 0.3882
#> F-statistic: 88.7 on 5 and 686 DF, p-value: < 0.00000000000000022
model_both <- step(object = model_wine_none,
scope = list(lower = model_wine_none,
upper = model_wine_all),
direction = "both",
trace = T)
#> Start: AIC=-435.07
#> quality ~ 1
#>
#> Df Sum of Sq RSS AIC
#> + alcohol 1 92.400 275.57 -633.17
#> + sulphates 1 71.064 296.90 -581.56
#> + volatile.acidity 1 46.661 321.30 -526.90
#> + citric.acid 1 20.401 347.56 -472.54
#> + density 1 19.206 348.76 -470.16
#> + total.sulfur.dioxide 1 10.846 357.12 -453.77
#> + chlorides 1 6.033 361.93 -444.51
#> + fixed.acidity 1 4.253 363.71 -441.11
#> + pH 1 2.614 365.35 -438.00
#> <none> 367.97 -435.07
#>
#> Step: AIC=-633.17
#> quality ~ alcohol
#>
#> Df Sum of Sq RSS AIC
#> + sulphates 1 38.848 236.72 -736.32
#> + volatile.acidity 1 21.235 254.33 -686.66
#> + citric.acid 1 6.173 269.39 -646.84
#> + pH 1 3.224 272.34 -639.31
#> + fixed.acidity 1 2.676 272.89 -637.92
#> + density 1 1.637 273.93 -635.29
#> <none> 275.57 -633.17
#> + total.sulfur.dioxide 1 0.402 275.16 -632.18
#> + chlorides 1 0.003 275.56 -631.18
#> - alcohol 1 92.400 367.97 -435.07
#>
#> Step: AIC=-736.32
#> quality ~ alcohol + sulphates
#>
#> Df Sum of Sq RSS AIC
#> + volatile.acidity 1 9.052 227.66 -761.30
#> + pH 1 3.820 232.90 -745.58
#> + citric.acid 1 1.122 235.59 -737.61
#> + fixed.acidity 1 0.741 235.98 -736.49
#> <none> 236.72 -736.32
#> + total.sulfur.dioxide 1 0.593 236.12 -736.06
#> + chlorides 1 0.060 236.66 -734.50
#> + density 1 0.057 236.66 -734.49
#> - sulphates 1 38.848 275.56 -633.17
#> - alcohol 1 60.184 296.90 -581.56
#>
#> Step: AIC=-761.3
#> quality ~ alcohol + sulphates + volatile.acidity
#>
#> Df Sum of Sq RSS AIC
#> + pH 1 1.725 225.94 -764.57
#> + citric.acid 1 0.883 226.78 -761.99
#> <none> 227.66 -761.30
#> + total.sulfur.dioxide 1 0.451 227.21 -760.68
#> + chlorides 1 0.113 227.55 -759.65
#> + fixed.acidity 1 0.088 227.58 -759.57
#> + density 1 0.016 227.65 -759.35
#> - volatile.acidity 1 9.052 236.72 -736.32
#> - sulphates 1 26.665 254.33 -686.66
#> - alcohol 1 50.181 277.85 -625.46
#>
#> Step: AIC=-764.57
#> quality ~ alcohol + sulphates + volatile.acidity + pH
#>
#> Df Sum of Sq RSS AIC
#> + citric.acid 1 2.457 223.48 -770.14
#> <none> 225.94 -764.57
#> + fixed.acidity 1 0.623 225.32 -764.48
#> + total.sulfur.dioxide 1 0.354 225.59 -763.65
#> + density 1 0.113 225.83 -762.91
#> + chlorides 1 0.030 225.91 -762.66
#> - pH 1 1.725 227.66 -761.30
#> - volatile.acidity 1 6.957 232.90 -745.58
#> - sulphates 1 27.661 253.60 -686.65
#> - alcohol 1 51.035 276.98 -625.64
#>
#> Step: AIC=-770.14
#> quality ~ alcohol + sulphates + volatile.acidity + pH + citric.acid
#>
#> Df Sum of Sq RSS AIC
#> <none> 223.48 -770.14
#> + density 1 0.226 223.26 -768.84
#> + total.sulfur.dioxide 1 0.153 223.33 -768.61
#> + chlorides 1 0.145 223.34 -768.58
#> + fixed.acidity 1 0.005 223.48 -768.15
#> - citric.acid 1 2.457 225.94 -764.57
#> - pH 1 3.300 226.78 -761.99
#> - volatile.acidity 1 9.361 232.84 -743.74
#> - sulphates 1 29.301 252.78 -686.88
#> - alcohol 1 52.655 276.14 -625.73
summary(model_both)
#>
#> Call:
#> lm(formula = quality ~ alcohol + sulphates + volatile.acidity +
#> pH + citric.acid, data = wine_train)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -1.94525 -0.38253 -0.05837 0.44910 1.67451
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 3.89418 0.71033 5.482 0.0000000591 ***
#> alcohol 0.30478 0.02397 12.713 < 0.0000000000000002 ***
#> sulphates 2.13242 0.22485 9.484 < 0.0000000000000002 ***
#> volatile.acidity -0.95079 0.17737 -5.360 0.0000001136 ***
#> pH -0.63038 0.19806 -3.183 0.00153 **
#> citric.acid -0.50392 0.18349 -2.746 0.00618 **
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 0.5708 on 686 degrees of freedom
#> Multiple R-squared: 0.3927, Adjusted R-squared: 0.3882
#> F-statistic: 88.7 on 5 and 686 DF, p-value: < 0.00000000000000022
compare_performance(model_back, model_forward, model_both)
wine_predict <- predict(model_both, newdata = wine_test, level = 0.95)
wine_test$predict <- wine_predict
wine_test %>%
mutate(quality_flag = ifelse(quality > 6, "good", "bad"),
predict_flag = ifelse(predict > 6, "good", "bad")) %>%
select(quality_flag, predict_flag) %>%
table()
#> predict_flag
#> quality_flag bad good
#> bad 181 34
#> good 2 14
hist(model_both$residuals)
plot(density(model_both$residuals))
shapiro.test(model_both$residuals)
#>
#> Shapiro-Wilk normality test
#>
#> data: model_both$residuals
#> W = 0.9925, p-value = 0.00148
Result : normal distribution
plot(model_both$fitted.values, model_both$residuals)
abline(h = 0, col = "red")
bptest(model_both)
#>
#> studentized Breusch-Pagan test
#>
#> data: model_both
#> BP = 1.1424, df = 5, p-value = 0.9503
Result : residual hetero
vif(model_both)
#> alcohol sulphates volatile.acidity pH
#> 1.114592 1.180623 1.809674 1.238258
#> citric.acid
#> 2.043909
Result : no multicolinierity
Based on the model, the adjusted R-squared is still below the standard < 0.4, which is 0.393. The comparison performance stage in each model also produces the same output due to data cleaning which has removed all outliers in each variable (predictor & target) with the final result there are 923 rows which previously had 1599 rows.
The target variable in this case is quality because the output is predicting the quality of wine from the predictor variable. From the exploratory data, there are 9 linear variables from 11 variables with the highest correlation found in the alcohol variable of 0.5. The interpretation of the model is that 40% of the results of the quality of wine can be generated from the relationship with the selected variable, the rest is described by other variables.
According to the prediction model from the 25% test data, we classify the predicted & actual results with a threshold if quality > 6 is “good” wine and vice versa “bad”. This is done because the results of the prediction are numerical continue while the actual is not. The result is that from a total of 231 test data, there are 34 false positive data and 2 false negative data, the rest is in accordance with the actual, if it is a percentage there are 15% of the predictive data that are not appropriate. So it is concluded that the prediction model from the train data has 15% error which by default is only 5% of the data.