First, we load the required libraries and the data set.
library(caret)
library(tidyverse)
library(leaps)
library(ggplot2)
library(reshape2)
library(MASS)
library(ggcorrplot)
library(corrplot)
library(plotmo)
library(keras)
library(kableExtra)
library(modelr)
library(psych)
library(Rmisc)
library(gridExtra)
library(scales)
library(rpart)
library(yardstick)
library(cluster)
library(NbClust)
library(factoextra)
whiteDat <- read_csv("winequality-white.csv")
Then we create a helper function to sort the data frame columns by the absolute value of their correlation with the outcome.
sortByCorr = function(dataset, refColName) {
  # Sort the data frame columns by the absolute value of their correlation with
  # a given column
  #
  # Args:
  #   dataset:    A data frame (or matrix) of numeric columns to sort
  #   refColName: The name of the reference column for the correlation
  #
  # Returns:
  #   The data frame with columns ordered by decreasing absolute correlation
  refColIdx = grep(refColName, colnames(dataset))
  corrTmp = cor(dataset)[, refColIdx]
  dataset[, order(abs(corrTmp), decreasing = TRUE)]
}
dim(whiteDat)
## [1] 4873 12
numericVars <- which(sapply(whiteDat, is.numeric))
numericVarNames <- names(numericVars)
cat("There are", length(numericVars), "numeric variables")
## There are 12 numeric variables
sapply(whiteDat, class)
## fixed acidity volatile acidity citric acid
## "numeric" "numeric" "numeric"
## residual sugar chlorides free sulfur dioxide
## "numeric" "numeric" "numeric"
## total sulfur dioxide density pH
## "numeric" "numeric" "numeric"
## sulphates alcohol quality
## "numeric" "numeric" "numeric"
# We take a look at the data distribution:
head(whiteDat)
## # A tibble: 6 x 12
## `fixed acidity` `volatile acidi~ `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6.2 0.45 0.26 4.4 0.063
## 2 9.8 0.36 0.46 10.5 0.038
## 3 5.5 0.485 0 1.5 0.065
## 4 6.4 0.595 0.14 5.2 0.058
## 5 7.6 0.48 0.37 0.8 0.037
## 6 7.2 0.32 0.47 5.1 0.044
## # ... with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## # dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## # quality <dbl>
# Get some metrics about the variables
summary(whiteDat)
## fixed acidity volatile acidity citric acid residual sugar
## Min. : 3.800 Min. :0.080 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.210 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.260 Median :0.3200 Median : 5.200
## Mean : 6.851 Mean :0.278 Mean :0.3341 Mean : 6.394
## 3rd Qu.: 7.300 3rd Qu.:0.320 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.200 Max. :1.100 Max. :1.6600 Max. :65.800
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.00900 Min. : 2.00 Min. : 9.0 Min. :0.9871
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0 1st Qu.:0.9917
## Median :0.04300 Median : 34.00 Median :134.0 Median :0.9937
## Mean :0.04576 Mean : 35.24 Mean :138.3 Mean :0.9940
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0 3rd Qu.:0.9961
## Max. :0.34600 Max. :138.50 Max. :344.0 Max. :1.0390
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. : 8.00 Min. :4.000
## 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.180 Median :0.4700 Median :10.40 Median :6.000
## Mean :3.188 Mean :0.4899 Mean :10.51 Mean :5.887
## 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40 3rd Qu.:6.000
## Max. :3.820 Max. :1.0800 Max. :14.20 Max. :8.000
The white wine dataset has 4873 observations, 11 predictors and 1 outcome (quality). All of the predictors are numeric; the outcome is an integer score.
The summary statistics show that most variables have a wide range compared to their IQR, which suggests considerable spread and the likely presence of outliers. We investigate further by producing boxplots for each variable:
oldpar = par(mfrow = c(2, 6))
for (i in 1:11) {
  boxplot(whiteDat[[i]])
  mtext(names(whiteDat)[i], cex = 0.8, side = 1, line = 2)
}
par(oldpar)
The boxplots show that all variables except alcohol contain outliers.
We now use a scatterplot matrix to get an idea of where the outliers are located:
pairs(whiteDat[, -grep("quality", colnames(whiteDat))])
We see that the outliers tend to lie at the higher end of each variable's range.
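As a quick numeric check (a minimal sketch, not part of the original analysis), we can count how many observations fall above and below the 1.5 * IQR whiskers for each predictor:
# Count observations beyond the upper and lower whiskers for each predictor
upper_counts <- sapply(whiteDat[, 1:11], function(x) {
  q <- quantile(x, c(0.25, 0.75))
  sum(x > q[2] + 1.5 * IQR(x))
})
lower_counts <- sapply(whiteDat[, 1:11], function(x) {
  q <- quantile(x, c(0.25, 0.75))
  sum(x < q[1] - 1.5 * IQR(x))
})
rbind(above = upper_counts, below = lower_counts)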
Now we look at the distribution of each variable:
oldpar = par(mfrow = c(2, 6))
for (i in 1:12) {
  truehist(whiteDat[[i]], xlab = names(whiteDat)[i], col = 'lightgreen',
           main = paste("Average =", signif(mean(whiteDat[[i]]), 3)))
}
par(oldpar)
We note that all the variables have positively skewed distributions except quality, which is approximately normally distributed. citric acid shows a peak at the lower end.
For each variable, we treat observations that lie more than 1.5 * IQR beyond the quartiles as outliers.
# Collect the row indices that fall outside the boxplot whiskers for any predictor
outliers = c()
for (i in 1:11) {
  stats = boxplot.stats(whiteDat[[i]])$stats  # whisker limits are stats[1] and stats[5]
  bottom_outlier_rows = which(whiteDat[[i]] < stats[1])
  top_outlier_rows = which(whiteDat[[i]] > stats[5])
  outliers = c(outliers, top_outlier_rows[!top_outlier_rows %in% outliers])
  outliers = c(outliers, bottom_outlier_rows[!bottom_outlier_rows %in% outliers])
}
We use Cook's distance to detect influential observations.
mod = lm(quality ~ ., data = whiteDat)
cooksd = cooks.distance(mod)
plot(cooksd, pch = "*", cex = 2, main = "Influential Obs by Cooks distance")
abline(h = 4*mean(cooksd, na.rm = T), col = "red")
head(whiteDat[cooksd > 4 * mean(cooksd, na.rm=T), ])
## # A tibble: 6 x 12
## `fixed acidity` `volatile acidi~ `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9.8 0.36 0.46 10.5 0.038
## 2 7.2 0.32 0.47 5.1 0.044
## 3 5.9 0.21 0.28 4.6 0.053
## 4 7.2 0.31 0.46 5 0.04
## 5 6.7 0.31 0.31 9.9 0.04
## 6 6.8 0.29 0.16 1.4 0.038
## # ... with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## # dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## # quality <dbl>
By looking at each influential row we can see which variable values make it stand out:
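As a rough, illustrative sketch (not part of the original analysis), we can flag, for each influential row, the columns whose values fall outside the 5th-95th percentile range of the full dataset:
# For each influential row, list the columns with unusually low or high values
influential <- whiteDat[cooksd > 4 * mean(cooksd, na.rm = TRUE), ]
lims <- sapply(whiteDat, quantile, probs = c(0.05, 0.95))
extremeCols <- apply(influential, 1, function(row) {
  names(whiteDat)[row < lims[1, ] | row > lims[2, ]]
})
head(extremeCols)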
For the rows shown in the table above, the extreme values involve residual sugar, free sulfur dioxide, and sulphates. We remove all the outliers in our list from the dataset and create a new set of histograms:
coutliers = as.numeric(rownames(whiteDat[cooksd > 4 * mean(cooksd, na.rm=T), ]))
outliers = c(outliers , coutliers[ !coutliers %in% outliers ] )
cleanWhiteDat = whiteDat[-outliers, ]
oldpar = par(mfrow = c(2, 6))
for (i in 1:12) {
  truehist(cleanWhiteDat[[i]], xlab = names(cleanWhiteDat)[i], col = 'lightgreen',
           main = paste("Average =", signif(mean(cleanWhiteDat[[i]]), 3)))
}
par(oldpar)
dim(cleanWhiteDat)
## [1] 3981 12
By removing the outliers, the dataset is reduced to 3981 observations. The variables are now approximately normally distributed, except for residual sugar, which remains unimodal and skewed to the right. This could be an interesting fact for further analysis.
We now use a scatterplot matrix to roughly determine whether there are linear relationships between our variables:
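To back this up with numbers (a quick sketch, assuming the skew() helper from the already-loaded psych package), we can compare the skewness of each variable before and after removing the outliers:
# Skewness per variable, before and after outlier removal
round(rbind(before = sapply(whiteDat, psych::skew),
            after  = sapply(cleanWhiteDat, psych::skew)), 2)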
pairs(cleanWhiteDat, col = cleanWhiteDat$quality, pch = cleanWhiteDat$quality)
pairs(cleanWhiteDat[,c(7, 8, 10, 11)], col = cleanWhiteDat$quality, pch = cleanWhiteDat$quality)
Only the residual sugar/density and density/alcohol pairs seem to have a linear correlation.
We also note a trend with the alcohol variable: the higher the alcohol, the better the quality. Conversely, the lower the density, the better the quality.
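To make this trend concrete (a quick sketch using dplyr, which is already loaded; summarise is called with an explicit namespace because plyr, pulled in by Rmisc, can mask it), we can look at the mean alcohol and density per quality level:
# Mean alcohol and density per quality level
cleanWhiteDat %>%
  group_by(quality) %>%
  dplyr::summarise(mean_alcohol = mean(alcohol),
                   mean_density = mean(density),
                   n = dplyr::n())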
The following correlation matrix confirms the strong correlation between residual sugar/density and density/alcohol. It also confirms that alcohol is the variable most strongly correlated with quality. To a lesser extent, density and chlorides also have a noticeable correlation with quality.
ggcorrplot(cor(cleanWhiteDat), hc.order = TRUE, type = "lower", lab = TRUE, insig = "blank")
colnames(sortByCorr(dataset = cleanWhiteDat, refColName = 'quality'))
## [1] "quality" "alcohol" "density"
## [4] "chlorides" "total sulfur dioxide" "residual sugar"
## [7] "volatile acidity" "pH" "fixed acidity"
## [10] "citric acid" "sulphates" "free sulfur dioxide"
numericVars <- which(sapply(cleanWhiteDat, is.numeric))
all_numVar <- cleanWhiteDat[, numericVars]
cor_numVar <- cor(all_numVar, use = "pairwise.complete.obs")
#Sort on decreasing correlations with alcohol
cor_sorted <- as.matrix(sort(cor_numVar[,"alcohol"], decreasing = TRUE))
#Selecting high correlations
Cor_High <- names(which(apply(cor_sorted, 1, function(x) abs(x) > 0.175)))
cor_numVar <- cor_numVar[Cor_High, Cor_High]
corrplot.mixed(cor_numVar, tl.col = "black", tl.pos = "lt")
Log transformation is widely used to make data conform more closely to normality and/or to reduce variability. Since our cleaned data are now close to normally distributed, we do not need to transform them.
In any case, the following pairwise plots show that a log transformation does not improve the linearity of the relationships between the predictors and the outcome.
pairs(log(cleanWhiteDat))
We have now explored the data and run enough checks to have a clear picture of it, so we can start the clustering analysis.
For the clustering to work well, we need to follow a few steps before running the actual algorithm.
Objectives:
Understanding the data.
Data pre-processing:
Creating dummy variables.
Removing columns with zero variance.
Scaling and centering the data.
Discovering and removing highly correlated variables.
Determining the number of clusters.
Segmentation using k-means.
We have already completed the first step, understanding the data. Now let us build on it for the rest of the analysis.
Data pre-processing involves several operations. We will describe each one as we go; together they help achieve a good clustering. We will also mention steps that are possible in general but not required here, given the data.
Remember that earlier we noted the class of each column; this is where it matters, because the goal is to have every column as a numeric value.
Clustering can only be performed on numerical values.
Based on the earlier summary, all our columns are numeric; quality is an integer-valued score.
For convenience we work with a data set containing just the 12 numeric columns, and to be careful we convert the integer-valued quality column to numeric.
# Making sure all columns are numeric
# Converting the integer-valued quality column to numeric
cleanWhiteDat$quality <- as.numeric(cleanWhiteDat$quality)
# For convenience, keep the 12 numeric columns in a separate data set
wineData_no_color <- cleanWhiteDat[1:12]
# Let us see the structure again
str(wineData_no_color)
## tbl_df [3,981 x 12] (S3: tbl_df/tbl/data.frame)
## $ fixed acidity : num [1:3981] 6.9 6.3 7 6.9 6.9 6.2 8.8 8.5 7.6 8.2 ...
## $ volatile acidity : num [1:3981] 0.32 0.39 0.39 0.25 0.25 0.43 0.35 0.17 0.31 0.29 ...
## $ citric acid : num [1:3981] 0.16 0.24 0.24 0.24 0.24 0.49 0.49 0.49 0.49 0.49 ...
## $ residual sugar : num [1:3981] 1.4 6.9 1 3.6 3.6 6.4 1 8.8 13.4 1 ...
## $ chlorides : num [1:3981] 0.051 0.069 0.048 0.057 0.057 0.045 0.036 0.048 0.062 0.044 ...
## $ free sulfur dioxide : num [1:3981] 15 9 8 13 13 12 14 23 50 29 ...
## $ total sulfur dioxide: num [1:3981] 96 117 119 85 85 115 56 108 191 118 ...
## $ density : num [1:3981] 0.994 0.994 0.992 0.994 0.994 ...
## $ pH : num [1:3981] 3.22 3.15 3 2.99 2.99 3.27 2.96 2.88 3.22 3.24 ...
## $ sulphates : num [1:3981] 0.38 0.35 0.31 0.48 0.48 0.57 0.33 0.34 0.53 0.36 ...
## $ alcohol : num [1:3981] 9.5 10.2 10.1 9.5 9.5 9 10.5 10.5 9 10.9 ...
## $ quality : num [1:3981] 4 4 4 4 4 4 4 4 4 4 ...
In this step we try to find columns that have no significant variation in their values, since their (near-zero) variance makes no difference to the results.
We can remove such columns to make the computation faster.
We will use the nearZeroVar() function from the caret package for this.
# Returns the indices of any columns with near-zero variance
nzv <- nearZeroVar(cleanWhiteDat)
print(paste("---Column number with----", nzv))
## [1] "---Column number with---- "
As we can see, nothing is returned, i.e. we do not have any such column.
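nearZeroVar() looks at the frequency of values rather than the raw variance, but for context (a quick sketch) we can also inspect how different the raw column variances are, which motivates the scaling step below:
# Raw variances differ by several orders of magnitude across columns
sort(sapply(cleanWhiteDat, var), decreasing = TRUE)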
In this step we normalise the column values so that columns with large magnitudes do not dominate columns with smaller ones; such imbalance can distort the clustering.
Let us identify the columns with large values.
There are two common ways of doing this; here we use a technique that maps all values into the range 0 to 1 (min-max normalisation).
#Identifying columns with large data
head(cleanWhiteDat)
## # A tibble: 6 x 12
## `fixed acidity` `volatile acidi~ `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6.9 0.32 0.16 1.4 0.051
## 2 6.3 0.39 0.24 6.9 0.069
## 3 7 0.39 0.24 1 0.048
## 4 6.9 0.25 0.24 3.6 0.057
## 5 6.9 0.25 0.24 3.6 0.057
## 6 6.2 0.43 0.49 6.4 0.045
## # ... with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## # dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## # quality <dbl>
# By looking at the data we can say that columns 1, 4, 6, 7, 9, 11 and 12 can cause issues;
# we will normalize them so that their values lie in the 0 to 1 range.
print("---Normalizing Data----")
## [1] "---Normalizing Data----"
norm_data <- sapply(cleanWhiteDat[,c(1,4,6,7,9,11,12)], function(x) (x - min(x))/(max(x) - min(x)))
print("---Type of returned data")
## [1] "---Type of returned data"
class(norm_data)
## [1] "matrix"
print("---Converting data from matrix to data.frame---")
## [1] "---Converting data from matrix to data.frame---"
norm_data <- data.frame(norm_data) # norm_data is a 'matrix'
print("---Normalised data---")
## [1] "---Normalised data---"
head(norm_data)
## fixed.acidity residual.sugar free.sulfur.dioxide total.sulfur.dioxide
## 1 0.525 0.03738318 0.16666667 0.3205128
## 2 0.375 0.29439252 0.08974359 0.4102564
## 3 0.550 0.01869159 0.07692308 0.4188034
## 4 0.525 0.14018692 0.14102564 0.2735043
## 5 0.525 0.14018692 0.14102564 0.2735043
## 6 0.350 0.27102804 0.12820513 0.4017094
## pH alcohol quality
## 1 0.5405405 0.1896552 0
## 2 0.4459459 0.3103448 0
## 3 0.2432432 0.2931034 0
## 4 0.2297297 0.1896552 0
## 5 0.2297297 0.1896552 0
## 6 0.6081081 0.1034483 0
Brief theory on what we have done here: each data point in a column has the column minimum subtracted from it, and the result is divided by the difference between the column maximum and the column minimum, i.e. (x - min(x)) / (max(x) - min(x)).
The result always lies in the range 0 to 1.
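As a tiny illustration (toy values, not from the dataset), applying the min-max formula to a small vector:
# Min-max rescaling of a toy vector: (x - min) / (max - min)
x <- c(2, 5, 11)
(x - min(x)) / (max(x) - min(x))
# 2 maps to 0, 11 maps to 1, and 5 maps to (5 - 2) / (11 - 2) = 1/3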
Now let us bind the normalised columns back onto the remaining columns and look at the data.
#Binding the normalised data with other data
wineData_norm <- cbind(cleanWhiteDat[,c(2,3,5,8,10)],norm_data)
head(wineData_norm)
## volatile acidity citric acid chlorides density sulphates fixed.acidity
## 1 0.32 0.16 0.051 0.9940 0.38 0.525
## 2 0.39 0.24 0.069 0.9942 0.35 0.375
## 3 0.39 0.24 0.048 0.9923 0.31 0.550
## 4 0.25 0.24 0.057 0.9942 0.48 0.525
## 5 0.25 0.24 0.057 0.9942 0.48 0.525
## 6 0.43 0.49 0.045 0.9963 0.57 0.350
## residual.sugar free.sulfur.dioxide total.sulfur.dioxide pH alcohol
## 1 0.03738318 0.16666667 0.3205128 0.5405405 0.1896552
## 2 0.29439252 0.08974359 0.4102564 0.4459459 0.3103448
## 3 0.01869159 0.07692308 0.4188034 0.2432432 0.2931034
## 4 0.14018692 0.14102564 0.2735043 0.2297297 0.1896552
## 5 0.14018692 0.14102564 0.2735043 0.2297297 0.1896552
## 6 0.27102804 0.12820513 0.4017094 0.6081081 0.1034483
## quality
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
str(wineData_norm)
## 'data.frame': 3981 obs. of 12 variables:
## $ volatile acidity : num 0.32 0.39 0.39 0.25 0.25 0.43 0.35 0.17 0.31 0.29 ...
## $ citric acid : num 0.16 0.24 0.24 0.24 0.24 0.49 0.49 0.49 0.49 0.49 ...
## $ chlorides : num 0.051 0.069 0.048 0.057 0.057 0.045 0.036 0.048 0.062 0.044 ...
## $ density : num 0.994 0.994 0.992 0.994 0.994 ...
## $ sulphates : num 0.38 0.35 0.31 0.48 0.48 0.57 0.33 0.34 0.53 0.36 ...
## $ fixed.acidity : num 0.525 0.375 0.55 0.525 0.525 0.35 1 0.925 0.7 0.85 ...
## $ residual.sugar : num 0.0374 0.2944 0.0187 0.1402 0.1402 ...
## $ free.sulfur.dioxide : num 0.1667 0.0897 0.0769 0.141 0.141 ...
## $ total.sulfur.dioxide: num 0.321 0.41 0.419 0.274 0.274 ...
## $ pH : num 0.541 0.446 0.243 0.23 0.23 ...
## $ alcohol : num 0.19 0.31 0.293 0.19 0.19 ...
## $ quality : num 0 0 0 0 0 0 0 0 0 0 ...
Alternatively, we can use the scale() function, which centers each column and scales it by its standard deviation (z-scores). Let us see how the data look after scaling and compare.
wineData_scaled <- scale(wineData_no_color)
head(wineData_scaled)
## fixed acidity volatile acidity citric acid residual sugar chlorides
## [1,] 0.1206571 0.7372113 -1.9045360 -1.01256142 0.8755540
## [2,] -0.6945076 1.6565851 -0.9759435 0.10257829 2.6903830
## [3,] 0.2565178 1.6565851 -0.9759435 -1.09366248 0.5730825
## [4,] 0.1206571 -0.1821625 -0.9759435 -0.56650553 1.4804970
## [5,] 0.1206571 -0.1821625 -0.9759435 -0.56650553 1.4804970
## [6,] -0.8303684 2.1819416 1.9259081 0.00120195 0.2706111
## free sulfur dioxide total sulfur dioxide density pH
## [1,] -1.327208 -1.0029624 0.04642827 0.2393283
## [2,] -1.728444 -0.4909627 0.11562872 -0.2676547
## [3,] -1.795317 -0.4422008 -0.54177554 -1.3540468
## [4,] -1.460953 -1.2711528 0.11562872 -1.4264729
## [5,] -1.460953 -1.2711528 0.11562872 -1.4264729
## [6,] -1.527826 -0.5397246 0.84223343 0.6014590
## sulphates alcohol quality
## [1,] -1.02065102 -0.8906438 -2.361914
## [2,] -1.32349909 -0.3138576 -2.361914
## [3,] -1.72729653 -0.3962556 -2.361914
## [4,] -0.01115743 -0.8906438 -2.361914
## [5,] -0.01115743 -0.8906438 -2.361914
## [6,] 0.89738680 -1.3026339 -2.361914
class(wineData_scaled)
## [1] "matrix"
#Converting to data.frame
wineData_scaled_df <- as.data.frame(wineData_scaled)
class(wineData_scaled_df)
## [1] "data.frame"
K-means takes the number of clusters as one of its inputs.
The best number of clusters varies from data set to data set, and there are several techniques for determining a suitable value.
We will try a few of them, with graphs to support the visual analysis.
Method 1 - Elbow method
The elbow method uses the sum of squared errors, also called the within-cluster sum of squares (WSS).
This is simply the squared distance of every point from its cluster centre, summed over all points.
Visually, the "elbow" is the point in the graph after which the curve flattens out, i.e. adding more clusters makes practically no further difference.
The technique runs k-means for a range of cluster counts, from 1 up to some maximum, collects the WSS for each, and plots it so we can see the behaviour.
# Initialize total within sum of squares error: wss
wss <- 0
# For 1 to 10 cluster centers
for (i in 1:10) {
  km.out <- kmeans(wineData_norm, centers = i)
  # Save total within sum of squares to wss variable
  wss[i] <- km.out$tot.withinss
}
wss
## [1] 1191.2128 892.8319 784.4902 727.0704 673.3241 646.0413 616.6501
## [8] 591.9018 574.3872 561.2452
# Plot total within sum of squares vs. number of clusters
plot(1:10, wss, type = "b",
xlab = "Number of Clusters",
ylab = "Within groups sum of squares")
Method 2 - Silhouette plot
Here we use the silhouette method via fviz_nbclust() from the factoextra package. After several attempts it turned out that the full data set can cause problems (it is slow and sometimes fails to return), so we work on a sample of the data instead.
For sampling there are a few options, such as sample(), sample_frac() from dplyr, and createDataPartition() from caret.
We will use sample_frac().
wine_train_data <- sample_frac(wineData_norm, 0.65)
head(wine_train_data)
## volatile acidity citric acid chlorides density sulphates fixed.acidity
## 1 0.25 0.38 0.046 0.99560 0.59 0.500
## 2 0.46 0.14 0.042 0.99310 0.51 0.275
## 3 0.29 0.38 0.038 0.99366 0.59 0.250
## 4 0.22 0.28 0.060 0.99350 0.59 0.600
## 5 0.31 0.28 0.037 0.99190 0.51 0.400
## 6 0.39 0.31 0.054 0.99480 0.57 0.475
## residual.sugar free.sulfur.dioxide total.sulfur.dioxide pH alcohol
## 1 0.35046729 0.2820513 0.5726496 0.6891892 0.3103448
## 2 0.09813084 0.3205128 0.5940171 0.8648649 0.3793103
## 3 0.47196262 0.6025641 0.4914530 0.3918919 0.4827586
## 4 0.30841121 0.5000000 0.4743590 0.3513514 0.5000000
## 5 0.04205607 0.1282051 0.4188034 0.6756757 0.3448276
## 6 0.09813084 0.3205128 0.7735043 0.8648649 0.3620690
## quality
## 1 0.50
## 2 0.75
## 3 0.50
## 4 0.50
## 5 0.75
## 6 0.50
str(wine_train_data)
## 'data.frame': 2588 obs. of 12 variables:
## $ volatile acidity : num 0.25 0.46 0.29 0.22 0.31 0.39 0.32 0.2 0.26 0.19 ...
## $ citric acid : num 0.38 0.14 0.38 0.28 0.28 0.31 0.26 0.26 0.4 0.32 ...
## $ chlorides : num 0.046 0.042 0.038 0.06 0.037 0.054 0.03 0.04 0.047 0.049 ...
## $ density : num 0.996 0.993 0.994 0.994 0.992 ...
## $ sulphates : num 0.59 0.51 0.59 0.59 0.51 0.57 0.42 0.44 0.53 0.44 ...
## $ fixed.acidity : num 0.5 0.275 0.25 0.6 0.4 0.475 0.525 0.625 0.6 0.5 ...
## $ residual.sugar : num 0.3505 0.0981 0.472 0.3084 0.0421 ...
## $ free.sulfur.dioxide : num 0.282 0.321 0.603 0.5 0.128 ...
## $ total.sulfur.dioxide: num 0.573 0.594 0.491 0.474 0.419 ...
## $ pH : num 0.689 0.865 0.392 0.351 0.676 ...
## $ alcohol : num 0.31 0.379 0.483 0.5 0.345 ...
## $ quality : num 0.5 0.75 0.5 0.5 0.75 0.5 0.5 0.5 0.5 0.75 ...
nrow(wine_train_data)
## [1] 2588
# Let us plot the silhouette analysis using fviz_nbclust (factoextra)
fviz_nbclust(wine_train_data, kmeans, method = "silhouette")+
labs(subtitle = "Silhouette method")
There are more methods for finding the number of clusters; you can check out the reference link.
Let us now run the clustering with the number of clusters suggested by the analysis above and look at the output.
We use k = 2 clusters for our analysis.
km <- kmeans(wineData_norm, 2, iter.max = 140 , algorithm="Lloyd", nstart=100)
km
## K-means clustering with 2 clusters of sizes 2498, 1483
##
## Cluster means:
## volatile acidity citric acid chlorides density sulphates fixed.acidity
## 1 0.2600560 0.3201241 0.03918215 0.9921150 0.4778263 0.4910679
## 2 0.2702933 0.3307417 0.04759474 0.9968149 0.4866285 0.5225556
## residual.sugar free.sulfur.dioxide total.sulfur.dioxide pH alcohol
## 1 0.1388120 0.3440830 0.4116892 0.5206922 0.4798091
## 2 0.4929922 0.5508628 0.6388543 0.4541014 0.2011897
## quality
## 1 0.5395316
## 2 0.4076197
##
## Clustering vector:
## [1] 1 1 1 1 1 2 1 2 2 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 1 1 1 1 1 1 2 2 1 1
## [38] 2 1 2 1 1 1 2 1 1 1 1 2 1 1 2 2 2 2 2 2 2 1 1 1 2 1 2 1 1 1 1 1 1 2 2 1 2
## [75] 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 1 2 2 2 2 1 1 2 1 2 2 2
## [112] 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 1 2 2 2 1
## [149] 2 2 2 2 2 2 2 2 1 1 1 2 1 1 1 2 1 1 1 2 1 2 2 2 1 1 1 2 2 1 2 2 1 1 1 1 2
## [186] 2 2 2 2 2 2 2 2 2 1 1 2 1 1 1 1 2 2 1 2 1 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2
## [223] 2 2 2 2 1 1 1 1 1 2 2 2 1 2 2 2 2 2 2 2 1 2 1 2 2 2 2 1 1 2 1 2 2 2 2 2 2
## [260] 2 2 2 2 2 2 2 2 2 1 1 1 1 1 2 2 2 2 1 2 2 1 1 1 1 2 2 2 2 1 2 1 1 1 1 1 1
## [297] 1 1 2 1 1 2 1 2 2 1 1 1 1 2 1 1 1 1 2 1 1 1 2 2 2 2 1 2 2 1 1 1 1 2 1 1 2
## [334] 1 1 1 1 2 2 2 2 1 1 1 1 2 1 1 1 1 2 2 2 2 2 1 2 1 1 1 1 2 2 2 2 1 2 1 1 2
## [371] 2 2 2 1 1 1 1 1 1 1 1 2 2 2 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 1
## [408] 1 2 1 1 2 2 1 2 1 2 2 2 2 2 2 2 2 1 1 1 1 2 1 1 2 2 2 2 2 2 2 2 1 1 2 1 1
## [445] 2 1 2 1 1 2 1 2 1 2 2 1 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 2 1 1 1 2 2 2 2 1
## [482] 1 1 2 2 1 2 2 2 2 2 2 2 2 2 1 1 2 2 1 1 1 1 2 2 2 1 1 2 1 1 1 1 2 2 2 2 2
## [519] 1 1 2 2 2 2 1 1 1 1 2 1 1 1 1 2 2 2 2 2 1 2 2 2 1 1 2 2 2 2 1 2 1 2 2 2 1
## [556] 1 2 2 1 2 2 2 2 2 2 2 1 1 1 1 2 2 1 1 2 2 2 1 1 2 1 1 1 1 2 2 2 2 2 2 2 2
## [593] 1 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 1 1 2 2 2 1 2 2 1 2 2 1 2 2 1 1 2
## [630] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 2 1 2 1 1 2 2 2 2 2 1 2 2 1 2 2 2 1 1
## [667] 2 1 2 2 2 1 1 1 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 1 1 2 2 2 2 2 1
## [704] 1 2 1 2 2 2 2 2 2 2 2 2 2 1 1 2 1 1 1 1 2 2 2 2 2 2 2 2 1 1 1 2 2 2 2 2 1
## [741] 1 2 1 1 1 2 2 1 2 1 1 2 2 2 1 2 2 2 2 2 2 2 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1
## [778] 2 2 1 1 2 1 1 1 2 2 2 2 1 2 1 2 1 2 1 2 1 1 1 1 2 1 2 1 1 2 2 2 1 1 1 1 2
## [815] 1 2 1 1 1 1 1 1 1 2 1 2 1 1 2 1 2 1 2 2 2 2 1 1 2 2 1 1 2 2 2 2 1 1 2 2 1
## [852] 2 2 2 1 1 2 1 2 2 2 1 2 2 1 2 2 2 2 2 1 1 2 1 1 2 2 2 2 1 1 1 1 1 2 2 2 2
## [889] 1 2 2 1 1 1 1 2 1 2 1 1 1 1 1 1 1 2 2 2 2 2 1 2 2 2 2 2 2 2 1 1 1 1 1 1 1
## [926] 1 1 2 2 1 2 1 2 2 2 2 2 2 1 2 2 2 2 2 1 2 1 1 2 1 2 1 2 2 1 1 1 1 2 2 2 2
## [963] 1 1 2 1 2 2 1 1 2 2 1 1 1 1 1 2 1 1 2 1 1 1 2 2 2 2 1 2 2 2 2 1 2 2 1 2 1
## [1000] 1 1 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 2 2 1 2 2 2 2 2 2 2 1 2 2
## [1037] 2 1 1 1 1 2 2 2 2 2 1 2 2 2 1 2 1 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 1 2 1 2 1
## [1074] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 1 1 2 2 2 2 2 1 2 2 1 2 1 2 2
## [1111] 2 1 1 2 1 1 2 2 2 2 1 1 1 1 2 1 1 2 1 1 1 1 2 2 1 2 1 2 1 1 1 2 1 2 2 1 1
## [1148] 2 1 2 1 2 2 2 2 2 1 2 2 1 1 2 2 1 1 2 1 2 1 2 1 1 2 2 1 1 2 2 1 1 2 1 1 1
## [1185] 1 2 1 1 1 2 1 1 1 1 2 1 1 2 1 1 1 2 2 1 1 2 1 1 2 1 2 2 2 2 2 2 2 1 1 1 1
## [1222] 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 1 1 2 2 2 2 2 1 2 1 1 1 1 2 2 2 2 2 2 2 2
## [1259] 1 1 1 2 1 2 2 2 2 2 1 1 2 1 1 1 2 2 2 1 1 2 1 1 1 2 2 1 2 1 1 1 2 1 2 1 1
## [1296] 2 2 1 2 2 2 1 2 1 2 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 2 2 2 1
## [1333] 2 2 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 2 1 2 2 1 1 1 1 1 1 1 1 1
## [1370] 1 1 2 2 2 2 2 1 2 2 1 2 1 1 1 2 2 1 2 2 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
## [1407] 1 2 2 1 1 1 1 2 2 2 1 2 1 2 1 2 1 1 2 2 2 2 2 1 1 1 1 2 1 1 1 2 1 2 2 2 1
## [1444] 2 1 1 1 2 2 2 2 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2 2 1 1 2 2 2 2 1 2 1 1 1 1 1
## [1481] 1 1 1 1 1 2 1 1 1 2 1 1 2 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 1 1 1 2
## [1518] 2 1 1 1 2 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 2 1 1 1 1 2 1 2 2 2 2 2 2 2 1
## [1555] 2 2 2 1 1 2 2 1 1 1 2 1 2 2 2 2 1 1 1 1 2 1 1 1 1 1 2 2 1 1 1 2 1 1 1 1 1
## [1592] 1 1 2 1 1 1 1 1 1 2 2 2 2 2 1 1 1 1 2 1 2 1 1 2 2 1 2 2 1 1 1 1 1 1 1 1 2
## [1629] 1 2 2 2 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1
## [1666] 2 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 1 2 2 2 1 2 1 1 1 1 1 1
## [1703] 1 1 1 1 1 1 1 2 1 1 2 2 2 1 2 1 2 2 2 2 2 1 1 1 1 2 1 2 2 1 2 1 1 1 2 1 2
## [1740] 2 1 2 2 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 2 1 2 1 2 1 1
## [1777] 2 2 2 1 2 2 1 2 2 1 1 2 1 2 2 1 1 2 1 2 1 1 2 2 2 2 1 1 1 1 1 1 1 2 2 2 2
## [1814] 1 1 1 2 2 1 2 2 1 1 1 1 2 2 1 2 2 1 1 1 1 1 2 2 2 1 2 1 1 2 2 1 1 2 2 1 2
## [1851] 1 2 1 2 2 2 1 1 1 2 1 1 2 1 2 1 1 2 1 2 1 2 1 1 1 1 2 1 1 1 2 1 2 1 1 2 1
## [1888] 1 2 1 2 2 2 2 1 2 2 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 2 1 1 1 2
## [1925] 1 1 1 1 2 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 2 2 2 1 2 1 2 1 1 2 2 1 1 1 2 1 1
## [1962] 1 1 2 2 1 1 1 2 1 2 1 1 1 2 1 2 2 1 1 1 1 1 1 2 1 1 2 1 1 2 2 2 1 1 1 2 1
## [1999] 2 2 2 1 2 1 1 1 1 2 2 2 2 1 1 1 2 2 2 2 2 2 2 1 1 1 1 2 2 2 2 2 2 1 1 1 1
## [2036] 1 2 2 2 2 1 1 2 2 1 1 1 2 1 1 1 2 1 2 1 1 1 2 1 2 2 1 1 1 1 2 1 2 1 1 1 1
## [2073] 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 2 1 1 2 2 2 2 2 2 1 2 1 2 1 1 2 1 2 2 1 1 1
## [2110] 2 2 1 1 2 1 1 1 1 2 1 1 2 1 1 2 2 2 1 2 1 1 2 1 1 1 2 1 2 1 1 1 2 2 1 1 1
## [2147] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 2 2 1 1 2 1 1
## [2184] 2 2 1 1 2 2 1 1 2 2 2 1 1 1 1 1 1 1 1 2 1 1 2 2 1 2 1 1 1 2 2 1 1 2 2 2 2
## [2221] 1 2 2 2 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 2 1 2 1 1 1 1
## [2258] 1 2 1 1 1 2 1 2 2 2 2 2 2 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1
## [2295] 2 2 1 1 1 2 1 1 2 2 2 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1
## [2332] 2 2 1 1 1 1 1 1 2 2 1 1 2 2 2 1 1 1 1 2 2 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
## [2369] 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 2 2 1 1 1 1 1 1 2 1 2 2 2 2 1 2 1 1 1 2 2 1
## [2406] 1 2 2 1 1 1 2 2 1 1 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 2 2 1 1 1 1 2 1 1 2
## [2443] 2 2 1 1 1 2 2 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 2 2 1 1 2 1 1 1
## [2480] 1 2 1 1 1 1 1 1 2 1 2 1 1 1 2 1 1 1 1 1 2 2 2 2 1 1 2 2 1 1 1 2 2 1 1 2 1
## [2517] 1 1 1 2 2 2 1 1 1 1 2 2 2 2 2 2 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 1
## [2554] 1 1 1 1 1 1 1 1 2 2 2 1 2 1 1 1 2 2 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2591] 1 1 1 1 1 1 1 1 2 1 2 1 1 2 1 1 1 2 2 2 1 2 2 2 1 1 1 2 1 1 2 2 1 2 1 1 1
## [2628] 1 1 2 2 2 1 1 1 1 2 1 1 2 2 2 1 2 1 1 1 2 2 1 1 1 1 1 2 1 1 2 2 1 1 1 1 1
## [2665] 1 2 2 2 2 1 1 1 1 2 2 2 2 2 2 2 1 2 1 1 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2
## [2702] 1 2 1 1 1 2 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 2 1 2 1 1 1 1 1 2 2 2 2 2 2 2
## [2739] 1 1 1 1 1 2 1 1 1 2 1 1 1 2 1 1 2 1 1 2 1 1 2 1 1 1 2 1 1 1 2 1 1 2 1 1 1
## [2776] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1
## [2813] 1 1 1 2 2 1 2 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1
## [2850] 1 1 1 1 2 2 2 2 1 1 2 2 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1
## [2887] 2 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 1 2 2 2 1 1 1 2 1 1 1
## [2924] 1 1 1 2 1 2 2 1 1 1 1 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 2 1 2 2 1
## [2961] 1 1 1 1 1 1 1 1 1 2 1 2 2 1 2 2 2 1 2 2 2 2 1 2 2 2 1 2 1 1 1 1 1 1 2 1 1
## [2998] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 2 2
## [3035] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3072] 1 1 2 2 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 2 2 1 2 1 1 1
## [3109] 1 1 1 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1
## [3146] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 2 1 2 1 2 2 1 1 1 1 1 1 1 1 1
## [3183] 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3220] 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1
## [3257] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 2
## [3294] 1 1 1 1 1 1 1 1 2 1 1 1 1 2 2 2 2 1 1 1 1 1 1 2 2 2 2 2 2 1 1 1 1 1 1 1 1
## [3331] 1 1 1 1 1 2 2 2 1 1 1 1 1 2 1 1 1 1 2 1 1 2 1 1 1 1 2 1 1 1 1 1 2 2 2 2 2
## [3368] 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
## [3405] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 1 1 1 1 2 2 2 1 1 2 1 1 1 1 1 2 1 1 1
## [3442] 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3479] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3516] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3553] 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1
## [3590] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3627] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1
## [3664] 1 1 1 1 1 1 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3701] 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 2 1 1 2 2 2 2
## [3738] 2 2 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3775] 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1
## [3812] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3849] 1 1 2 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3886] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3923] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1
## [3960] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 2 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 589.1324 303.6995
## (between_SS / total_SS = 25.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#Structure of km
str(km)
## List of 9
## $ cluster : int [1:3981] 1 1 1 1 1 2 1 2 2 1 ...
## $ centers : num [1:2, 1:12] 0.2601 0.2703 0.3201 0.3307 0.0392 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "1" "2"
## .. ..$ : chr [1:12] "volatile acidity" "citric acid" "chlorides" "density" ...
## $ totss : num 1191
## $ withinss : num [1:2] 589 304
## $ tot.withinss: num 893
## $ betweenss : num 298
## $ size : int [1:2] 2498 1483
## $ iter : int 14
## $ ifault : NULL
## - attr(*, "class")= chr "kmeans"
Let us now analyse the k-means results visually, using several different plots for a better understanding.
# Cluster plot against the first 2 principal components
clusplot(cleanWhiteDat, km$cluster, color=TRUE, shade=TRUE,
labels=2, lines=0)
fviz_cluster(km, data = wineData_norm,
ellipse.type = "convex",
palette = "jco",
ggtheme = theme_minimal())
fviz_cluster(list(data = wineData_norm, cluster = km$cluster),
ellipse.type = "norm", geom = "point", stand = FALSE,
palette = "jco", ggtheme = theme_classic())
# Partitioning Around Medoids (PAM) with 2 clusters, as an alternative to k-means
pam.res <- pam(wineData_norm, 2)
# Visualize
fviz_cluster(pam.res)
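Finally, as a quick comparison (a sketch), we can cross-tabulate the k-means and PAM cluster assignments to see how much the two partitions agree:
# Cross-tabulation of k-means vs. PAM cluster labels
table(kmeans = km$cluster, pam = pam.res$clustering)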