library(tidyr)
library(readr)
library(stringr)
library(dplyr)
library(Hmisc)
library(outliers)
library(InformationValue)
library(ggplot2)
library(corrplot)
library(rpart)
library(Metrics)
library(car)
The dataset downloaded from the UCI Machine Learning Repository is related to red variant of the Portuguese “Vinho Verde” wine.
#read data
redwine <- read.csv("/Users/Vidya/Downloads/winequality-red.csv", sep = ";")
describe(redwine)
redwine
12 Variables 1599 Observations
----------------------------------------------------------------------------------------------------------
fixed.acidity
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75
1599 0 96 0.999 8.32 1.893 6.1 6.5 7.1 7.9 9.2
.90 .95
10.7 11.8
lowest : 4.6 4.7 4.9 5.0 5.1, highest: 14.3 15.0 15.5 15.6 15.9
----------------------------------------------------------------------------------------------------------
volatile.acidity
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75
1599 0 143 1 0.5278 0.199 0.270 0.310 0.390 0.520 0.640
.90 .95
0.745 0.840
lowest : 0.120 0.160 0.180 0.190 0.200, highest: 1.180 1.185 1.240 1.330 1.580
----------------------------------------------------------------------------------------------------------
citric.acid
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75
1599 0 80 0.999 0.271 0.2227 0.000 0.010 0.090 0.260 0.420
.90 .95
0.522 0.600
lowest : 0.00 0.01 0.02 0.03 0.04, highest: 0.75 0.76 0.78 0.79 1.00
----------------------------------------------------------------------------------------------------------
residual.sugar
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75
1599 0 91 0.996 2.539 1.078 1.59 1.70 1.90 2.20 2.60
.90 .95
3.60 5.10
lowest : 0.9 1.2 1.3 1.4 1.5, highest: 13.4 13.8 13.9 15.4 15.5
----------------------------------------------------------------------------------------------------------
chlorides
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75
1599 0 153 1 0.08747 0.03217 0.0540 0.0600 0.0700 0.0790 0.0900
.90 .95
0.1090 0.1261
lowest : 0.012 0.034 0.038 0.039 0.041, highest: 0.422 0.464 0.467 0.610 0.611
----------------------------------------------------------------------------------------------------------
free.sulfur.dioxide
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75
1599 0 60 0.998 15.87 11.24 4 5 7 14 21
.90 .95
31 35
lowest : 1 2 3 4 5, highest: 55 57 66 68 72
----------------------------------------------------------------------------------------------------------
total.sulfur.dioxide
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75
1599 0 144 1 46.47 34.63 11.0 14.0 22.0 38.0 62.0
.90 .95
93.2 112.1
lowest : 6 7 8 9 10, highest: 155 160 165 278 289
----------------------------------------------------------------------------------------------------------
density
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75
1599 0 436 1 0.9967 0.002081 0.9936 0.9946 0.9956 0.9968 0.9978
.90 .95
0.9991 1.0000
lowest : 0.99007 0.99020 0.99064 0.99080 0.99084, highest: 1.00260 1.00289 1.00315 1.00320 1.00369
----------------------------------------------------------------------------------------------------------
pH
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75
1599 0 89 1 3.311 0.1716 3.06 3.12 3.21 3.31 3.40
.90 .95
3.51 3.57
lowest : 2.74 2.86 2.87 2.88 2.89, highest: 3.75 3.78 3.85 3.90 4.01
----------------------------------------------------------------------------------------------------------
sulphates
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75
1599 0 96 0.999 0.6581 0.1679 0.47 0.50 0.55 0.62 0.73
.90 .95
0.85 0.93
lowest : 0.33 0.37 0.39 0.40 0.42, highest: 1.61 1.62 1.95 1.98 2.00
----------------------------------------------------------------------------------------------------------
alcohol
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75
1599 0 65 0.998 10.42 1.178 9.2 9.3 9.5 10.2 11.1
.90 .95
12.0 12.5
lowest : 8.40000 8.50000 8.70000 8.80000 9.00000, highest: 13.50000 13.56667 13.60000 14.00000 14.90000
----------------------------------------------------------------------------------------------------------
quality
n missing distinct Info Mean Gmd
1599 0 6 0.857 5.636 0.8431
Value 3 4 5 6 7 8
Frequency 10 53 681 638 199 18
Proportion 0.006 0.033 0.426 0.399 0.124 0.011
----------------------------------------------------------------------------------------------------------
str(redwine)
'data.frame': 1599 obs. of 12 variables:
$ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
$ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
$ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
$ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
$ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
$ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
$ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
$ density : num 0.998 0.997 0.997 0.998 0.998 ...
$ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
$ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
$ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
$ quality : int 5 5 5 6 5 5 5 7 7 5 ...
summary(redwine)
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide
Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900 Min. :0.01200 Min. : 1.00
1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900 1st Qu.:0.07000 1st Qu.: 7.00
Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200 Median :0.07900 Median :14.00
Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539 Mean :0.08747 Mean :15.87
3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600 3rd Qu.:0.09000 3rd Qu.:21.00
Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500 Max. :0.61100 Max. :72.00
total.sulfur.dioxide density pH sulphates alcohol quality
Min. : 6.00 Min. :0.9901 Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
1st Qu.: 22.00 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
Median : 38.00 Median :0.9968 Median :3.310 Median :0.6200 Median :10.20 Median :6.000
Mean : 46.47 Mean :0.9967 Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
3rd Qu.: 62.00 3rd Qu.:0.9978 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
Max. :289.00 Max. :1.0037 Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
table(redwine$quality)
3 4 5 6 7 8
10 53 681 638 199 18
ggplot(data = redwine, aes(redwine$quality)) + geom_histogram(binwidth = 1, bins = 10, color = "black", fill = "Light Blue")
The wines are given a score between 1 and 10, 1 being the lowest and 10 being the highest rating for a wine. Filter quality to see the score distribution for wines with quality rating of 8 or above. Factor was
best_wines <- redwine %>% filter(quality > 7)
There are no null values in the dataset. A correlaton matrix was generated to check whether any significant correlations existed between the features in the dataset. In the generated correlations heatmap, negative correlations are in blue and positive ones in red color.
summary(best_wines)
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide
Min. : 5.000 Min. :0.2600 Min. :0.0300 Min. :1.400 Min. :0.04400 Min. : 3.00
1st Qu.: 7.250 1st Qu.:0.3350 1st Qu.:0.3025 1st Qu.:1.800 1st Qu.:0.06200 1st Qu.: 6.00
Median : 8.250 Median :0.3700 Median :0.4200 Median :2.100 Median :0.07050 Median : 7.50
Mean : 8.567 Mean :0.4233 Mean :0.3911 Mean :2.578 Mean :0.06844 Mean :13.28
3rd Qu.:10.225 3rd Qu.:0.4725 3rd Qu.:0.5300 3rd Qu.:2.600 3rd Qu.:0.07550 3rd Qu.:16.50
Max. :12.600 Max. :0.8500 Max. :0.7200 Max. :6.400 Max. :0.08600 Max. :42.00
total.sulfur.dioxide density pH sulphates alcohol quality
Min. :12.00 Min. :0.9908 Min. :2.880 Min. :0.6300 Min. : 9.80 Min. :8
1st Qu.:16.00 1st Qu.:0.9942 1st Qu.:3.163 1st Qu.:0.6900 1st Qu.:11.32 1st Qu.:8
Median :21.50 Median :0.9949 Median :3.230 Median :0.7400 Median :12.15 Median :8
Mean :33.44 Mean :0.9952 Mean :3.267 Mean :0.7678 Mean :12.09 Mean :8
3rd Qu.:43.00 3rd Qu.:0.9972 3rd Qu.:3.350 3rd Qu.:0.8200 3rd Qu.:12.88 3rd Qu.:8
Max. :88.00 Max. :0.9988 Max. :3.720 Max. :1.1000 Max. :14.00 Max. :8
corr_redwine <- cor(redwine)
corrplot(corr_redwine, method = "number", number.font = 2)
There is moderate correlation between citric acid and volatile acidity, density and alcohol, quality and volatile acidity and quality and alcohol. The boxplot for the volatile acidity shows that red wine with lower volatile acidity have a higher quality rating. The distribution of the percent alcohol content of the wine showed a higher median readings in wines rated 6 or higher. According to the visualisations, red wine with higher alcohol content and lower volatile acidity levels were rated higher in quality.
ggplot(data = redwine, aes(x = density, y = alcohol))+ geom_point(color = "navy blue")+ geom_smooth(method = "lm")
ggplot(data = redwine, aes(x = citric.acid, y = volatile.acidity))+ geom_point(color = "navy blue")+ geom_smooth(method = "lm")
ggplot(redwine, aes (y=volatile.acidity, x=quality, group = quality)) + geom_boxplot( color = "light blue")
ggplot(data = redwine, aes(x = quality, y = alcohol, group = quality)) + geom_boxplot( color = "black", fill ="orange")+ scale_x_continuous(breaks = 0:10)
The wine was labelled low, average or high based on the quality score. Wine scoring 3 or lower than 3 were labeled Low, wines with a score ranging from 4 to 7 were labeled as Average. Wines that scored 8 or above were labeled high.
#redwine <- mutate(redwine, wine_quality = ifelse(quality %in% 1:4, "Low", ifelse(quality %in% 5:7, "Average", "High")))
#table(redwine$wine_quality)
The data was split into test and training sets
set.seed(123)
n <- nrow(redwine)
n_train <- round(0.75*n)
train_indices <- sample(1:n, n_train)
wine_train <- redwine[train_indices ,]
wine_test <- redwine[-train_indices ,]
dim(wine_train)
[1] 1199 12
dim(wine_test)
[1] 400 12
Train a gini-based model
wine_tree <- rpart(formula = quality~.,
data = wine_train,
method = "class",
parms = list(split = "gini"))
printcp(wine_tree)
Classification tree:
rpart(formula = quality ~ ., data = wine_train, method = "class",
parms = list(split = "gini"))
Variables actually used in tree construction:
[1] alcohol sulphates total.sulfur.dioxide volatile.acidity
Root node error: 699/1199 = 0.58299
n= 1199
CP nsplit rel error xerror xstd
1 0.237482 0 1.00000 1.00000 0.024425
2 0.018598 1 0.76252 0.77825 0.024662
3 0.017883 2 0.74392 0.78255 0.024673
4 0.014306 4 0.70815 0.77825 0.024662
5 0.012876 5 0.69385 0.78255 0.024673
6 0.010000 6 0.68097 0.74392 0.024550
Plot the decison tree
wine_tree
n= 1199
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 1199 699 5 (0.0058 0.035 0.42 0.4 0.13 0.011)
2) alcohol< 10.15 589 224 5 (0.0068 0.039 0.62 0.31 0.027 0.0017)
4) volatile.acidity>=0.4175 461 148 5 (0.0087 0.046 0.68 0.25 0.017 0) *
5) volatile.acidity< 0.4175 128 63 6 (0 0.016 0.41 0.51 0.063 0.0078)
10) total.sulfur.dioxide>=83.5 10 0 5 (0 0 1 0 0 0) *
11) total.sulfur.dioxide< 83.5 118 53 6 (0 0.017 0.36 0.55 0.068 0.0085)
22) sulphates< 0.525 18 5 5 (0 0.056 0.72 0.22 0 0) *
23) sulphates>=0.525 100 39 6 (0 0.01 0.29 0.61 0.08 0.01) *
3) alcohol>=10.15 610 309 6 (0.0049 0.031 0.22 0.49 0.23 0.02)
6) alcohol< 11.55 428 211 6 (0.007 0.04 0.29 0.51 0.15 0.0047) *
7) alcohol>=11.55 182 98 6 (0 0.011 0.055 0.46 0.42 0.055)
14) sulphates< 0.685 91 34 6 (0 0.022 0.088 0.63 0.26 0) *
15) sulphates>=0.685 91 39 7 (0 0 0.022 0.3 0.57 0.11) *
rpart.plot(wine_tree)
wine_pred<- predict(object = wine_tree,
newdata = wine_test,
type = "class",
control = rpart.control(cp = 0, maxdepth = 3, minsplit = 5))
ce(actual = wine_test$quality,
predicted = wine_pred)
[1] 0.4125