Task Description

The main aim of this task is to find similarities between different types of wine by grouping them through clustering.

Solution

Library

library(scales)
library(NbClust)
library(purrr)
library(ggplot2)
library(dplyr)
library(fpc)
library(cluster)
library(corrplot)
library(cluster.datasets)
library(clustree)
library(cowplot)
library(dendextend)
library(factoextra)

Import data

# Read the wine measurements from the comma-separated file; the first row
# of the file supplies the column names.
wine_data <- read.csv(file = "Wine_data.csv", header = TRUE, sep = ",")

Structure of the data

# Compact overview of the data frame: dimensions, column types and the
# first few values of every column.
str(object = wine_data)
## 'data.frame':    178 obs. of  14 variables:
##  $ TYPE           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ ALCOHOL        : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ MALIC          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ ASH            : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ ALCALINITY     : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ MAGNESIUM      : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ PHENOLS        : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ FLAVANOIDS     : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ NONFLAVANOIDS  : num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ PROANTHOCYANINS: num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ COLOR          : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ HUE            : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ DILUTION       : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ PROLINE        : int  1065 1050 1185 1480 735 NA 1290 1295 1045 1045 ...

Missing value treatment of the data set

# Logical matrix that is TRUE wherever a cell of the data set is NA.
missing_value <- is.na(wine_data)
# Positions of the NA cells within that matrix.
which(missing_value)
## [1]  494  986 1940 2074 2320 2333 2346
### There are missing values in the data set.

Clean the data set

# Keep only the rows that have no missing value in any column.
clean_wine_data <- na.omit(object = wine_data)

Find optimal number of clusters

Elbow method

# Elbow method: total within-cluster sum of squares for k = 1, ..., k.max.
k.max <- 15
data <- clean_wine_data
# kmeans() draws random starting centres, so fix the seed to make the
# curve reproducible from one run to the next.
set.seed(123)
# vapply() guarantees exactly one numeric value per k; sapply() would
# silently change its return type if the function ever returned
# something else.
wss <- vapply(
  seq_len(k.max),
  function(k) {
    kmeans(data, centers = k, nstart = 50, iter.max = 15)$tot.withinss
  },
  numeric(1)
)

Visualization of the optimal number of clusters

# Elbow curve: within-cluster sum of squares against the number of clusters.
plot(
  x = seq_len(k.max),
  y = wss,
  type = "b",
  pch = 19,
  frame = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares"
)

# Same elbow curve drawn by factoextra, with a dashed reference line
# marking k = 3.
fviz_nbclust(
  x = clean_wine_data,
  FUNcluster = kmeans,
  method = "wss"
) +
  geom_vline(xintercept = 3, linetype = 5, colour = "darkred")

Silhouette method

# Let pamk() estimate the number of clusters, then report its choice.
pamk.best2 <- pamk(data = clean_wine_data)
cat(
  "Number of clusters estimated by optimum average silhouette width:",
  pamk.best2[["nc"]],
  "\n"
)
## Number of clusters estimated by optimum average silhouette width: 2
# Partition the data around medoids using the cluster count chosen by
# pamk() and draw the plots for that partition.
plot(
  pam(x = clean_wine_data, k = pamk.best2$nc)
)

# Silhouette criterion for k-means, evaluated for up to 24 clusters.
fviz_nbclust(
  x = clean_wine_data,
  FUNcluster = kmeans,
  method = "silhouette",
  k.max = 24
) +
  theme_minimal() +
  ggtitle("The Silhouette Plot")

Gap statistic method

# Gap statistic for k-means with up to 10 clusters and 100 simulated
# reference sets; the result is printed directly.
clusGap(
  x = clean_wine_data,
  FUNcluster = kmeans,
  K.max = 10,
  B = 100,
  verbose = interactive()
)
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = clean_wine_data, FUNcluster = kmeans, K.max = 10,     B = 100, verbose = interactive())
## B=100 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstSEmax', SE.factor=1): 1
##           logW   E.logW       gap     SE.sim
##  [1,] 9.574150 9.802093 0.2279430 0.03556892
##  [2,] 8.904059 9.115802 0.2117431 0.03264239
##  [3,] 8.508689 8.727099 0.2184099 0.03328336
##  [4,] 8.274464 8.455796 0.1813319 0.03591199
##  [5,] 8.084612 8.259256 0.1746435 0.03568258
##  [6,] 7.859991 8.102017 0.2420259 0.04712583
##  [7,] 7.740815 7.995241 0.2544264 0.05350587
##  [8,] 7.651259 7.884467 0.2332079 0.04647377
##  [9,] 7.625411 7.804726 0.1793152 0.05286162
## [10,] 7.576544 7.734261 0.1577166 0.06132931
# Gap statistic for k-means with up to 24 clusters and 50 simulated
# reference sets. The clustering function is passed under its full
# argument name, FUNcluster, instead of relying on partial matching of
# "FUN"; nstart = 50 is forwarded to kmeans().
gap_stat <- clusGap(
  x = clean_wine_data,
  FUNcluster = kmeans,
  nstart = 50,
  K.max = 24,
  B = 50
)

# Plot the gap statistic against the number of clusters.
fviz_gap_stat(gap_stat = gap_stat) +
  theme_minimal() +
  ggtitle("fviz_gap_stat: Gap Statistic")

Use the result of the silhouette method (k = 2) as the optimal number of clusters

# Final k-means fit with two clusters and 25 random starts.
# NOTE(review): the input still contains the TYPE column and is not
# standardised before clustering -- confirm that this is intended.
k_means <- kmeans(x = clean_wine_data, centers = 2, nstart = 25)
# Show the fitted object: sizes, centres, assignments and sums of squares.
k_means
## K-means clustering with 2 clusters of sizes 58, 113
## 
## Cluster means:
##       TYPE  ALCOHOL    MALIC      ASH ALCALINITY MAGNESIUM  PHENOLS
## 1 1.224138 13.60793 2.028276 2.438103   17.75517       106 2.705862
## 2 2.309735 12.68469 2.504513 2.326195   20.36726        96 2.060265
##   FLAVANOIDS NONFLAVANOIDS PROANTHOCYANINS    COLOR       HUE DILUTION
## 1   2.716379     0.3018966        1.823448 5.561724 1.0307931 3.012586
## 2   1.654956     0.3896460        1.442124 4.727080 0.9115929 2.412655
##     PROLINE
## 1 1097.2931
## 2  550.2655
## 
## Clustering vector:
##   1   2   3   4   5   7   8   9  10  11  12  13  14  15  16  17  18  20 
##   1   1   1   1   2   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  21  22  23  24  25  26  27  28  29  30  31  33  34  35  36  37  38  39 
##   2   2   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57 
##   2   2   1   1   2   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75 
##   1   1   2   2   2   2   2   2   2   2   2   2   2   1   2   2   1   1 
##  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  94  95  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 113 114 115 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 132 133 134 135 136 137 139 140 141 142 143 144 145 146 147 148 149 150 
##   2   2   2   2   2   2   2   2   2   2   2   2   1   1   2   2   2   2 
## 151 152 153 154 155 156 157 158 159 161 162 163 164 165 166 167 168 169 
##   2   2   2   2   2   2   2   1   2   2   2   2   2   2   2   2   2   2 
## 170 171 172 173 174 175 176 177 178 
##   2   2   2   2   2   2   1   1   2 
## 
## Within cluster sum of squares by cluster:
## [1] 1938438 1978295
##  (between_SS / total_SS =  74.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

Determine cluster centers, a matrix holding the mean of each variable within each cluster.

# Per-cluster mean of every variable.
k_means[["centers"]]
##       TYPE  ALCOHOL    MALIC      ASH ALCALINITY MAGNESIUM  PHENOLS
## 1 1.224138 13.60793 2.028276 2.438103   17.75517       106 2.705862
## 2 2.309735 12.68469 2.504513 2.326195   20.36726        96 2.060265
##   FLAVANOIDS NONFLAVANOIDS PROANTHOCYANINS    COLOR       HUE DILUTION
## 1   2.716379     0.3018966        1.823448 5.561724 1.0307931 3.012586
## 2   1.654956     0.3896460        1.442124 4.727080 0.9115929 2.412655
##     PROLINE
## 1 1097.2931
## 2  550.2655

Determine cluster size

# Number of observations assigned to each cluster.
k_means[["size"]]
## [1]  58 113

Visualizing k_means cluster

# Two-dimensional cluster plot of the k-means assignment.
clusplot(
  x = clean_wine_data,
  clus = k_means$cluster,
  main = "2D representation of the Cluster",
  color = TRUE,
  shade = TRUE,
  labels = 2,
  lines = 0
)

# factoextra view of the same k-means result: observations drawn as
# points, with a "norm" ellipse around each cluster and no title.
fviz_cluster(
  object = k_means,
  data = clean_wine_data,
  geom = "point",
  ellipse.type = "norm",
  palette = "jco",
  ggtheme = theme_minimal(),
  main = ""
)

Recommendation

### The three most important variables are proline, flavonoids, and color. Data scientists can focus their research and development resources for improving wine quality on these three variables.