Wine quality depends on many factors, such as alcohol content, the presence of sulphates, and pH. The taste, smell and potency of a wine are determined by its chemical ingredients and their proportions. A restaurant needs to classify its wines into categories based on their ingredients and label them accordingly for its different categories of customers.
Using the k-means clustering algorithm, group the wines into an optimal number of distinct clusters so that the wines within each cluster have similar properties.
winequality-red.csv consists of 1599 observations of wines, each with 12 variables. Use the variables pH, alcohol, sulphates and total.sulfur.dioxide to segment the dataset using the k-means clustering algorithm.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(cluster)
# Load the red-wine data set; the file is semicolon-delimited.
wine <- read.csv(file = "winequality-red.csv", sep = ";")
# Inspect the structure: 1599 observations and 12 variables.
glimpse(wine)
## Observations: 1,599
## Variables: 12
## $ fixed.acidity <dbl> 7.4, 7.8, 7.8, 11.2, 7.4, 7.4, 7.9, 7.3, ...
## $ volatile.acidity <dbl> 0.700, 0.880, 0.760, 0.280, 0.700, 0.660,...
## $ citric.acid <dbl> 0.00, 0.00, 0.04, 0.56, 0.00, 0.00, 0.06,...
## $ residual.sugar <dbl> 1.9, 2.6, 2.3, 1.9, 1.9, 1.8, 1.6, 1.2, 2...
## $ chlorides <dbl> 0.076, 0.098, 0.092, 0.075, 0.076, 0.075,...
## $ free.sulfur.dioxide <dbl> 11, 25, 15, 17, 11, 13, 15, 15, 9, 17, 15...
## $ total.sulfur.dioxide <dbl> 34, 67, 54, 60, 34, 40, 59, 21, 18, 102, ...
## $ density <dbl> 0.9978, 0.9968, 0.9970, 0.9980, 0.9978, 0...
## $ pH <dbl> 3.51, 3.20, 3.26, 3.16, 3.51, 3.51, 3.30,...
## $ sulphates <dbl> 0.56, 0.68, 0.65, 0.58, 0.56, 0.56, 0.46,...
## $ alcohol <dbl> 9.4, 9.8, 9.8, 9.8, 9.4, 9.4, 9.4, 10.0, ...
## $ quality <int> 5, 5, 5, 6, 5, 5, 5, 7, 7, 5, 5, 5, 5, 5,...
# Keep only the four variables used for clustering, then re-inspect the data.
wine <- select(wine, pH, sulphates, alcohol, total.sulfur.dioxide)
glimpse(wine)
## Observations: 1,599
## Variables: 4
## $ pH <dbl> 3.51, 3.20, 3.26, 3.16, 3.51, 3.51, 3.30,...
## $ sulphates <dbl> 0.56, 0.68, 0.65, 0.58, 0.56, 0.56, 0.46,...
## $ alcohol <dbl> 9.4, 9.8, 9.8, 9.8, 9.4, 9.4, 9.4, 10.0, ...
## $ total.sulfur.dioxide <dbl> 34, 67, 54, 60, 34, 40, 59, 21, 18, 102, ...
# Standardise a numeric vector to z-scores: subtract its mean, then divide
# by its sample standard deviation.
#   x: numeric vector.
# Returns a numeric vector of the same length as x.
md <- function(x) {
  centred <- x - mean(x)
  centred / sd(x)
}
# Build a standardised copy of the data: each clustering variable is replaced
# by its z-score via md(), putting all four on a comparable scale.
wine_std <- mutate(
  wine,
  pH = md(pH),
  sulphates = md(sulphates),
  alcohol = md(alcohol),
  total.sulfur.dioxide = md(total.sulfur.dioxide)
)
# Summary of the original (unscaled) variables, for comparison.
summary(wine)
## pH sulphates alcohol total.sulfur.dioxide
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. : 6.00
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.: 22.00
## Median :3.310 Median :0.6200 Median :10.20 Median : 38.00
## Mean :3.311 Mean :0.6581 Mean :10.42 Mean : 46.47
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.: 62.00
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :289.00
# Summary of the standardised variables.
summary(object = wine_std)
## pH sulphates alcohol
## Min. :-3.69924 Min. :-1.9359 Min. :-1.8983
## 1st Qu.:-0.65494 1st Qu.:-0.6380 1st Qu.:-0.8661
## Median :-0.00721 Median :-0.2251 Median :-0.2092
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.57574 3rd Qu.: 0.4239 3rd Qu.: 0.6353
## Max. : 4.52687 Max. : 7.9162 Max. : 4.2011
## total.sulfur.dioxide
## Min. :-1.2302
## 1st Qu.:-0.7438
## Median :-0.2574
## Mean : 0.0000
## 3rd Qu.: 0.4722
## Max. : 7.3728
# Elbow analysis: total within-cluster sum of squares (WSS) for k = 1..15.
mydata <- wine_std
# Fix the RNG seed so the random k-means starts, and therefore the curve,
# are reproducible between runs.
set.seed(123)
# k = 1: all observations form a single cluster, so WSS is the total sum of
# squares, i.e. (n - 1) times the sum of the column variances.
total_ss <- (nrow(mydata) - 1) * sum(vapply(mydata, var, numeric(1)))
# k = 2..15: fit k-means and record its total within-cluster sum of squares.
# nstart restarts every fit from several random centre sets and keeps the
# best one, so a single poor start cannot put a spurious bump in the curve.
wss <- c(
  total_ss,
  vapply(
    2:15,
    function(k) kmeans(mydata, centers = k, nstart = 25)$tot.withinss,
    numeric(1)
  )
)
# Plot WSS against k; the "elbow" suggests a suitable number of clusters.
plot(seq_along(wss), wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares", col = "mediumseagreen", pch = 12)
#
# Run k-means on mydata with k = 5 and store the result in `fit`.
# The seed makes the random starts -- and with them the arbitrary cluster
# numbering -- reproducible, so cluster labels stay stable across runs.
# nstart = 25 tries several random starts and keeps the solution with the
# lowest total within-cluster sum of squares.
set.seed(123)
fit <- kmeans(mydata, centers = 5, nstart = 25)
fit$cluster gives the cluster to which each observation is assigned. Let's store it in the mydata dataset under the column name cluster.
# Record each observation's cluster label in a `cluster` column.
mydata$cluster <- fit$cluster
# Same labels on wine_std, which is the data frame the plots below draw from.
wine_std$cluster <- fit$cluster

# Pairwise scatter plots of the standardised variables, coloured by cluster.
ggplot(wine_std, aes(x = pH, y = alcohol, color = as.factor(cluster))) +
  geom_point()
ggplot(wine_std, aes(x = pH, y = sulphates, color = as.factor(cluster))) +
  geom_point()
ggplot(
  wine_std,
  aes(x = pH, y = total.sulfur.dioxide, color = as.factor(cluster))
) +
  geom_point()
ggplot(wine_std, aes(x = alcohol, y = sulphates, color = as.factor(cluster))) +
  geom_point()
ggplot(
  wine_std,
  aes(x = alcohol, y = total.sulfur.dioxide, color = as.factor(cluster))
) +
  geom_point()
ggplot(
  wine_std,
  aes(x = sulphates, y = total.sulfur.dioxide, color = as.factor(cluster))
) +
  geom_point()
Cluster 1: low pH, high sulphates, low alcohol
Cluster 2: high pH, low sulphates, high alcohol, low total.sulfur.dioxide
Cluster 3: low alcohol, low sulphates, high total.sulfur.dioxide
Cluster 4: high alcohol, low pH, low total.sulfur.dioxide
Cluster 5: low alcohol, low sulphates, low total.sulfur.dioxide
# Mean of each original (unscaled) variable within each cluster: one row per
# cluster, one column per variable.
apply(
  X = wine,
  MARGIN = 2,
  FUN = function(column) {
    tapply(column, INDEX = wine_std$cluster, FUN = mean)
  }
)
## pH sulphates alcohol total.sulfur.dioxide
## 1 3.211812 0.6001790 9.799254 32.98658
## 2 3.312680 0.7060567 11.828265 31.96392
## 3 3.135263 1.1582895 9.821053 59.72368
## 4 3.260209 0.5931799 9.769805 103.16318
## 5 3.465479 0.6243875 10.279139 40.00000
# Silhouette analysis of the clustering.
# wine_std carries the assigned label in its `cluster` column by this point;
# that column must not enter the dissimilarity computation, otherwise the
# label itself is treated as a numeric feature and the silhouette widths are
# biased in favour of the clustering being evaluated.
feature_cols <- setdiff(names(wine_std), "cluster")
diss <- daisy(wine_std[, feature_cols, drop = FALSE])
# Silhouette widths for each observation given its cluster label.
sk <- silhouette(wine_std$cluster, diss)
plot(sk)