Library
library(scales)
library(NbClust)
library(purrr)
library(ggplot2)
library(dplyr)
library(fpc)
library(cluster)
library(corrplot)
library(cluster.datasets)
library(clustree)
library(cowplot)
library(dendextend)
library(factoextra)
Import data
# Load the wine measurements from a comma-separated file whose first row
# holds the column names.
wine_data <- read.csv(file = "Wine_data.csv", sep = ",", header = TRUE)
Structure of the data
# Print the type and the first few values of every column.
str(object = wine_data)
## 'data.frame': 178 obs. of 14 variables:
## $ TYPE : int 1 1 1 1 1 1 1 1 1 1 ...
## $ ALCOHOL : num 14.2 13.2 13.2 14.4 13.2 ...
## $ MALIC : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ ASH : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ ALCALINITY : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ MAGNESIUM : int 127 100 101 113 118 112 96 121 97 98 ...
## $ PHENOLS : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ FLAVANOIDS : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ NONFLAVANOIDS : num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ PROANTHOCYANINS: num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ COLOR : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ HUE : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ DILUTION : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ PROLINE : int 1065 1050 1185 1480 735 NA 1290 1295 1045 1045 ...
Missing value treatment of the data set
# Logical matrix flagging every missing cell of the data frame.
missing_value <- is.na(wine_data)
# Column-major linear positions of the missing cells (reuses the mask above
# instead of recomputing it).
which(missing_value)
## [1] 494 986 1940 2074 2320 2333 2346
### There are missing values in the data set.
Clean the data set
# Drop every row that has a missing value in any column.
clean_wine_data <- na.omit(object = wine_data)
Find optimal number of clusters
Elbow method
# Elbow method: total within-cluster sum of squares for k = 1..k.max.
k.max <- 15
data <- clean_wine_data
# k-means picks random starting centres; fix the seed so the curve is
# reproducible from run to run.
set.seed(123)
# vapply() guarantees one numeric value per k, whereas sapply() silently
# changes its return type on unexpected input.
wss <- vapply(
  seq_len(k.max),
  function(k) {
    kmeans(data, centers = k, nstart = 50, iter.max = 15)$tot.withinss
  },
  numeric(1)
)
Visualization of the optimal number of clusters
# Elbow curve: within-cluster sum of squares against the number of clusters,
# drawn as connected filled points without a surrounding box.
plot(
  x = seq_len(k.max),
  y = wss,
  type = "b",
  pch = 19,
  frame = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares"
)

# Same elbow diagnostic via factoextra, with a dashed reference line drawn at
# k = 3.
fviz_nbclust(
  x = clean_wine_data,
  FUNcluster = kmeans,
  method = "wss"
) +
  geom_vline(xintercept = 3, linetype = 5, col = "darkred")

Silhouette method
# Let pamk() pick the number of clusters, then report the value it stored
# in `nc`.
pamk.best2 <- pamk(data = clean_wine_data)
cat(
  "Number of clusters estimated by optimum average silhouette width:",
  pamk.best2[["nc"]],
  "\n"
)
## Number of clusters estimated by optimum average silhouette width: 2
# Fit PAM with the number of clusters chosen above and draw its default plots.
plot(pam(x = clean_wine_data, k = pamk.best2$nc))


# Silhouette diagnostic for k-means, considering up to 24 clusters.
fviz_nbclust(
  x = clean_wine_data,
  FUNcluster = kmeans,
  method = "silhouette",
  k.max = 24
) +
  theme_minimal() +
  ggtitle("The Silhouette Plot")

Gap statistic method
# Gap statistic for k-means with up to 10 clusters and 100 simulated
# reference sets; the result is printed directly.
clusGap(
  x = clean_wine_data,
  FUNcluster = kmeans,
  K.max = 10,
  B = 100,
  verbose = interactive()
)
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = clean_wine_data, FUNcluster = kmeans, K.max = 10, B = 100, verbose = interactive())
## B=100 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstSEmax', SE.factor=1): 1
## logW E.logW gap SE.sim
## [1,] 9.574150 9.802093 0.2279430 0.03556892
## [2,] 8.904059 9.115802 0.2117431 0.03264239
## [3,] 8.508689 8.727099 0.2184099 0.03328336
## [4,] 8.274464 8.455796 0.1813319 0.03591199
## [5,] 8.084612 8.259256 0.1746435 0.03568258
## [6,] 7.859991 8.102017 0.2420259 0.04712583
## [7,] 7.740815 7.995241 0.2544264 0.05350587
## [8,] 7.651259 7.884467 0.2332079 0.04647377
## [9,] 7.625411 7.804726 0.1793152 0.05286162
## [10,] 7.576544 7.734261 0.1577166 0.06132931
# Gap statistic for k-means (50 random starts per fit) with up to 24 clusters
# and 50 simulated reference sets, followed by its plot.
# Both the reference sets and the k-means starts are random; fix the seed so
# the plotted curve is reproducible.
set.seed(123)
gap_stat <- clusGap(
  x = clean_wine_data,
  FUNcluster = kmeans, # spelled out; `FUN =` only worked via partial matching
  nstart = 50,
  K.max = 24,
  B = 50
)
fviz_gap_stat(gap_stat) +
  theme_minimal() +
  ggtitle("fviz_gap_stat: Gap Statistic")

Use the number of clusters suggested by the silhouette method (k = 2) for the final k-means fit
# Final k-means fit with two clusters and 25 random starts.
# The starting centres are random, so without a fixed seed the cluster labels
# (1 vs 2) can swap between runs.
set.seed(123)
# NOTE(review): the input still contains the TYPE column and is not
# standardised, so large-magnitude columns such as PROLINE likely dominate the
# distances - consider dropping TYPE and applying scale() before clustering.
k_means <- kmeans(clean_wine_data, centers = 2, nstart = 25)
k_means
## K-means clustering with 2 clusters of sizes 58, 113
##
## Cluster means:
## TYPE ALCOHOL MALIC ASH ALCALINITY MAGNESIUM PHENOLS
## 1 1.224138 13.60793 2.028276 2.438103 17.75517 106 2.705862
## 2 2.309735 12.68469 2.504513 2.326195 20.36726 96 2.060265
## FLAVANOIDS NONFLAVANOIDS PROANTHOCYANINS COLOR HUE DILUTION
## 1 2.716379 0.3018966 1.823448 5.561724 1.0307931 3.012586
## 2 1.654956 0.3896460 1.442124 4.727080 0.9115929 2.412655
## PROLINE
## 1 1097.2931
## 2 550.2655
##
## Clustering vector:
## 1 2 3 4 5 7 8 9 10 11 12 13 14 15 16 17 18 20
## 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## 21 22 23 24 25 26 27 28 29 30 31 33 34 35 36 37 38 39
## 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
## 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 1 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 1
## 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 94 95 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 113 114 115 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 132 133 134 135 136 137 139 140 141 142 143 144 145 146 147 148 149 150
## 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2
## 151 152 153 154 155 156 157 158 159 161 162 163 164 165 166 167 168 169
## 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2
## 170 171 172 173 174 175 176 177 178
## 2 2 2 2 2 2 1 1 2
##
## Within cluster sum of squares by cluster:
## [1] 1938438 1978295
## (between_SS / total_SS = 74.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
Determine the cluster centers: a matrix with one row per cluster giving the mean of each variable within that cluster.
# Print the `centers` component of the k-means fit.
k_means[["centers"]]
## TYPE ALCOHOL MALIC ASH ALCALINITY MAGNESIUM PHENOLS
## 1 1.224138 13.60793 2.028276 2.438103 17.75517 106 2.705862
## 2 2.309735 12.68469 2.504513 2.326195 20.36726 96 2.060265
## FLAVANOIDS NONFLAVANOIDS PROANTHOCYANINS COLOR HUE DILUTION
## 1 2.716379 0.3018966 1.823448 5.561724 1.0307931 3.012586
## 2 1.654956 0.3896460 1.442124 4.727080 0.9115929 2.412655
## PROLINE
## 1 1097.2931
## 2 550.2655
Determine cluster size
# Print the `size` component of the k-means fit.
k_means[["size"]]
## [1] 58 113
Visualizing k_means cluster
# Two-dimensional cluster plot of the k-means assignment, with coloured and
# shaded cluster outlines.
clusplot(
  x = clean_wine_data,
  clus = k_means$cluster,
  main = "2D representation of the Cluster",
  color = TRUE,
  shade = TRUE,
  labels = 2,
  lines = 0
)

# factoextra view of the same k-means fit: points only, "norm" ellipses,
# the "jco" palette and no title.
fviz_cluster(
  object = k_means,
  data = clean_wine_data,
  geom = "point",
  ellipse.type = "norm",
  palette = "jco",
  main = "",
  ggtheme = theme_minimal()
)

Recommendation
### The three most important variables are PROLINE, FLAVANOIDS, and COLOR. Data scientists can focus their research-and-development resources on improving wine quality through these three variables.