First, we load the required libraries and the data set.
library(caret)
library(tidyverse)
library(leaps)
library(ggplot2)
library(reshape2)
library(MASS)
library(ggcorrplot)
library(corrplot)
library(plotmo)
library(keras)
library(kableExtra)
library(modelr)
library(psych)
library(Rmisc)
library(gridExtra)
library(scales)
library(rpart)
library(yardstick)
library(cluster)
library(NbClust)
library(factoextra)
whiteDat <- read_csv("winequality-white.csv")
Then we create a helper function to sort the data frame columns by the absolute value of their correlation with the outcome.
sortByCorr = function(dataset, refColName) {
  # Sort the data frame columns by the absolute value of their correlation with
  # a given column
  #
  # Args:
  #   dataset:    A data frame (or matrix) of numeric columns to sort
  #   refColName: The name of the reference column for the correlation
  #
  # Returns:
  #   The data frame with columns ordered by decreasing absolute correlation
  refColIdx = grep(refColName, colnames(dataset))
  corrTmp = cor(dataset)[, refColIdx]
  dataset[, order(abs(corrTmp), decreasing = TRUE)]
}
dim(whiteDat)
## [1] 4873 12
numericVars <- which(sapply(whiteDat, is.numeric))
numericVarNames <- names(numericVars)
cat("There are", length(numericVars), "numeric variables")
## There are 12 numeric variables
sapply(whiteDat, class)
## fixed acidity volatile acidity citric acid
## "numeric" "numeric" "numeric"
## residual sugar chlorides free sulfur dioxide
## "numeric" "numeric" "numeric"
## total sulfur dioxide density pH
## "numeric" "numeric" "numeric"
## sulphates alcohol quality
## "numeric" "numeric" "numeric"
# We take a look at the data distribution:
head(whiteDat)
## # A tibble: 6 x 12
## `fixed acidity` `volatile acidi~ `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6.2 0.45 0.26 4.4 0.063
## 2 9.8 0.36 0.46 10.5 0.038
## 3 5.5 0.485 0 1.5 0.065
## 4 6.4 0.595 0.14 5.2 0.058
## 5 7.6 0.48 0.37 0.8 0.037
## 6 7.2 0.32 0.47 5.1 0.044
## # ... with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## # dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## # quality <dbl>
# Get some metrics about the variables
summary(whiteDat)
## fixed acidity volatile acidity citric acid residual sugar
## Min. : 3.800 Min. :0.080 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.210 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.260 Median :0.3200 Median : 5.200
## Mean : 6.851 Mean :0.278 Mean :0.3341 Mean : 6.394
## 3rd Qu.: 7.300 3rd Qu.:0.320 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.200 Max. :1.100 Max. :1.6600 Max. :65.800
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.00900 Min. : 2.00 Min. : 9.0 Min. :0.9871
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0 1st Qu.:0.9917
## Median :0.04300 Median : 34.00 Median :134.0 Median :0.9937
## Mean :0.04576 Mean : 35.24 Mean :138.3 Mean :0.9940
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0 3rd Qu.:0.9961
## Max. :0.34600 Max. :138.50 Max. :344.0 Max. :1.0390
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. : 8.00 Min. :4.000
## 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.180 Median :0.4700 Median :10.40 Median :6.000
## Mean :3.188 Mean :0.4899 Mean :10.51 Mean :5.887
## 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40 3rd Qu.:6.000
## Max. :3.820 Max. :1.0800 Max. :14.20 Max. :8.000
The white wine dataset has 4873 observations, 11 predictors and 1 outcome (quality). All of the predictors are numeric; the outcome is an integer score.
The summary statistics show that most variables have a wide range compared to their IQR, which suggests considerable spread and the likely presence of outliers. We investigate further by producing boxplots for each variable:
oldpar = par(mfrow = c(2, 6))
for (i in 1:11) {
  boxplot(whiteDat[[i]])
  mtext(names(whiteDat)[i], cex = 0.8, side = 1, line = 2)
}
par(oldpar)
The boxplots show that all variables except alcohol contain outliers.
We now use a scatterplot matrix to get an idea of where the outliers are located:
pairs(whiteDat[, -grep("quality", colnames(whiteDat))])
We see that the outliers tend to lie at the higher end of each variable's range.
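As a quick numeric check (a minimal sketch, not part of the original analysis), we can count how many observations fall above and below the 1.5 * IQR whiskers for each predictor:
# Count observations beyond the upper and lower whiskers for each predictor
upper_counts <- sapply(whiteDat[, 1:11], function(x) {
  q <- quantile(x, c(0.25, 0.75))
  sum(x > q[2] + 1.5 * IQR(x))
})
lower_counts <- sapply(whiteDat[, 1:11], function(x) {
  q <- quantile(x, c(0.25, 0.75))
  sum(x < q[1] - 1.5 * IQR(x))
})
rbind(above = upper_counts, below = lower_counts)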
Now we look at the distribution of each variable:
oldpar = par(mfrow = c(2, 6))
for (i in 1:12) {
  truehist(whiteDat[[i]], xlab = names(whiteDat)[i], col = 'lightgreen',
           main = paste("Average =", signif(mean(whiteDat[[i]]), 3)))
}
par(oldpar)
We note that all the variables have positively skewed distributions except quality, which is approximately normally distributed. citric acid shows a peak at the lower end.
For each variable, we treat observations that lie more than 1.5 * IQR beyond the quartiles as outliers.
# Collect the row indices that fall outside the boxplot whiskers for any predictor
outliers = c()
for (i in 1:11) {
  stats = boxplot.stats(whiteDat[[i]])$stats  # whisker limits are stats[1] and stats[5]
  bottom_outlier_rows = which(whiteDat[[i]] < stats[1])
  top_outlier_rows = which(whiteDat[[i]] > stats[5])
  outliers = c(outliers, top_outlier_rows[!top_outlier_rows %in% outliers])
  outliers = c(outliers, bottom_outlier_rows[!bottom_outlier_rows %in% outliers])
}
We use Cook's distance to detect influential observations.
mod = lm(quality ~ ., data = whiteDat)
cooksd = cooks.distance(mod)
plot(cooksd, pch = "*", cex = 2, main = "Influential Obs by Cooks distance")
abline(h = 4*mean(cooksd, na.rm = T), col = "red")
head(whiteDat[cooksd > 4 * mean(cooksd, na.rm=T), ])
## # A tibble: 6 x 12
## `fixed acidity` `volatile acidi~ `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9.8 0.36 0.46 10.5 0.038
## 2 7.2 0.32 0.47 5.1 0.044
## 3 5.9 0.21 0.28 4.6 0.053
## 4 7.2 0.31 0.46 5 0.04
## 5 6.7 0.31 0.31 9.9 0.04
## 6 6.8 0.29 0.16 1.4 0.038
## # ... with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## # dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## # quality <dbl>
By looking at each influential row we can see which variable values make it stand out:
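As a rough, illustrative sketch (not part of the original analysis), we can flag, for each influential row, the columns whose values fall outside the 5th-95th percentile range of the full dataset:
# For each influential row, list the columns with unusually low or high values
influential <- whiteDat[cooksd > 4 * mean(cooksd, na.rm = TRUE), ]
lims <- sapply(whiteDat, quantile, probs = c(0.05, 0.95))
extremeCols <- apply(influential, 1, function(row) {
  names(whiteDat)[row < lims[1, ] | row > lims[2, ]]
})
head(extremeCols)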
For the rows shown in the table above, the extreme values involve residual sugar, free sulfur dioxide, and sulphates. We remove all the outliers in our list from the dataset and create a new set of histograms:
coutliers = as.numeric(rownames(whiteDat[cooksd > 4 * mean(cooksd, na.rm=T), ]))
outliers = c(outliers , coutliers[ !coutliers %in% outliers ] )
cleanWhiteDat = whiteDat[-outliers, ]
oldpar = par(mfrow = c(2, 6))
for (i in 1:12) {
  truehist(cleanWhiteDat[[i]], xlab = names(cleanWhiteDat)[i], col = 'lightgreen',
           main = paste("Average =", signif(mean(cleanWhiteDat[[i]]), 3)))
}
par(oldpar)
dim(cleanWhiteDat)
## [1] 3981 12
By removing the outliers, the dataset is reduced to 3981 observations. The variables are now approximately normally distributed, except for residual sugar, which remains unimodal and skewed to the right. This could be an interesting fact for further analysis.
We now use a scatterplot matrix to roughly determine whether there are linear relationships between our variables:
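To back this up with numbers (a quick sketch, assuming the skew() helper from the already-loaded psych package), we can compare the skewness of each variable before and after removing the outliers:
# Skewness per variable, before and after outlier removal
round(rbind(before = sapply(whiteDat, psych::skew),
            after  = sapply(cleanWhiteDat, psych::skew)), 2)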
pairs(cleanWhiteDat, col = cleanWhiteDat$quality, pch = cleanWhiteDat$quality)
pairs(cleanWhiteDat[,c(7, 8, 10, 11)], col = cleanWhiteDat$quality, pch = cleanWhiteDat$quality)
Only the residual sugar/density and density/alcohol pairs seem to have a linear correlation.
We also note a trend with the alcohol variable: the higher the alcohol, the better the quality. Conversely, the lower the density, the better the quality.
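To make this trend concrete (a quick sketch using dplyr, which is already loaded; summarise is called with an explicit namespace because plyr, pulled in by Rmisc, can mask it), we can look at the mean alcohol and density per quality level:
# Mean alcohol and density per quality level
cleanWhiteDat %>%
  group_by(quality) %>%
  dplyr::summarise(mean_alcohol = mean(alcohol),
                   mean_density = mean(density),
                   n = dplyr::n())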
The following correlation matrix confirms the strong correlation between residual sugar/density and density/alcohol. It also confirms that alcohol is the variable most strongly correlated with quality. To a lesser extent, density and chlorides also have a noticeable correlation with quality.
ggcorrplot(cor(cleanWhiteDat), hc.order = TRUE, type = "lower", lab = TRUE, insig = "blank")
colnames(sortByCorr(dataset = cleanWhiteDat, refColName = 'quality'))
## [1] "quality" "alcohol" "density"
## [4] "chlorides" "total sulfur dioxide" "residual sugar"
## [7] "volatile acidity" "pH" "fixed acidity"
## [10] "citric acid" "sulphates" "free sulfur dioxide"
numericVars <- which(sapply(cleanWhiteDat, is.numeric))
all_numVar <- cleanWhiteDat[, numericVars]
cor_numVar <- cor(all_numVar, use = "pairwise.complete.obs")
#Sort on decreasing correlations with alcohol
cor_sorted <- as.matrix(sort(cor_numVar[,"alcohol"], decreasing = TRUE))
#Selecting high correlations
Cor_High <- names(which(apply(cor_sorted, 1, function(x) abs(x) > 0.175)))
cor_numVar <- cor_numVar[Cor_High, Cor_High]
corrplot.mixed(cor_numVar, tl.col = "black", tl.pos = "lt")
Log transformation is widely used to make data conform more closely to normality and/or to reduce variability. Since our cleaned data are now close to normally distributed, we do not need to transform them.
In any case, the following pairwise plots show that a log transformation does not improve the linearity of the relationships between the predictors and the outcome.
pairs(log(cleanWhiteDat))
We have now explored the data and run enough checks to have a clear picture of it, so we can start the clustering analysis.
For the clustering to work well, we need to follow a few steps before running the actual algorithm.
Objectives:
Understanding the data.
Data pre-processing:
Creating dummy variables.
Removing columns with zero variance.
Scaling and centering the data.
Discovering and removing highly correlated variables.
Determining the number of clusters.
Segmentation using k-means.
We have already completed the first step, understanding the data. Now let us build on it for the rest of the analysis.
Data pre-processing involves several operations. We will describe each one as we go; together they help achieve a good clustering. We will also mention steps that are possible in general but not required here, given the data.
Remember that earlier we noted the class of each column; this is where it matters, because the goal is to have every column as a numeric value.
Clustering can only be performed on numerical values.
Based on the earlier summary, all our columns are numeric; quality is an integer-valued score.
For convenience we work with a data set containing just the 12 numeric columns, and to be careful we convert the integer-valued quality column to numeric.
# Making sure all columns are numeric
# Converting the integer-valued quality column to numeric
cleanWhiteDat$quality <- as.numeric(cleanWhiteDat$quality)
# For convenience, keep the 12 numeric columns in a separate data set
wineData_no_color <- cleanWhiteDat[1:12]
# Let us see the structure again
str(wineData_no_color)
## tbl_df [3,981 x 12] (S3: tbl_df/tbl/data.frame)
## $ fixed acidity : num [1:3981] 6.9 6.3 7 6.9 6.9 6.2 8.8 8.5 7.6 8.2 ...
## $ volatile acidity : num [1:3981] 0.32 0.39 0.39 0.25 0.25 0.43 0.35 0.17 0.31 0.29 ...
## $ citric acid : num [1:3981] 0.16 0.24 0.24 0.24 0.24 0.49 0.49 0.49 0.49 0.49 ...
## $ residual sugar : num [1:3981] 1.4 6.9 1 3.6 3.6 6.4 1 8.8 13.4 1 ...
## $ chlorides : num [1:3981] 0.051 0.069 0.048 0.057 0.057 0.045 0.036 0.048 0.062 0.044 ...
## $ free sulfur dioxide : num [1:3981] 15 9 8 13 13 12 14 23 50 29 ...
## $ total sulfur dioxide: num [1:3981] 96 117 119 85 85 115 56 108 191 118 ...
## $ density : num [1:3981] 0.994 0.994 0.992 0.994 0.994 ...
## $ pH : num [1:3981] 3.22 3.15 3 2.99 2.99 3.27 2.96 2.88 3.22 3.24 ...
## $ sulphates : num [1:3981] 0.38 0.35 0.31 0.48 0.48 0.57 0.33 0.34 0.53 0.36 ...
## $ alcohol : num [1:3981] 9.5 10.2 10.1 9.5 9.5 9 10.5 10.5 9 10.9 ...
## $ quality : num [1:3981] 4 4 4 4 4 4 4 4 4 4 ...
In this step we try to find columns that have no significant variation in their values, since their (near-zero) variance makes no difference to the results.
We can remove such columns to make the computation faster.
We will use the nearZeroVar() function from the caret package for this.
# Returns the indices of any columns with near-zero variance
nzv <- nearZeroVar(cleanWhiteDat)
print(paste("---Column number with----", nzv))
## [1] "---Column number with---- "
As we can see, nothing is returned, i.e. we do not have any such column.
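nearZeroVar() looks at the frequency of values rather than the raw variance, but for context (a quick sketch) we can also inspect how different the raw column variances are, which motivates the scaling step below:
# Raw variances differ by several orders of magnitude across columns
sort(sapply(cleanWhiteDat, var), decreasing = TRUE)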
In this step we normalise the column values so that columns with large magnitudes do not dominate columns with smaller ones; such imbalance can distort the clustering.
Let us identify the columns with large values.
There are two common ways of doing this; here we use a technique that maps all values into the range 0 to 1 (min-max normalisation).
#Identifying columns with large data
head(cleanWhiteDat)
## # A tibble: 6 x 12
## `fixed acidity` `volatile acidi~ `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6.9 0.32 0.16 1.4 0.051
## 2 6.3 0.39 0.24 6.9 0.069
## 3 7 0.39 0.24 1 0.048
## 4 6.9 0.25 0.24 3.6 0.057
## 5 6.9 0.25 0.24 3.6 0.057
## 6 6.2 0.43 0.49 6.4 0.045
## # ... with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## # dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## # quality <dbl>
# By looking at the data we can say that columns 1, 4, 6, 7, 9, 11 and 12 can cause issues;
# we will normalize them so that their values lie in the 0 to 1 range.
print("---Normalizing Data----")
## [1] "---Normalizing Data----"
norm_data <- sapply(cleanWhiteDat[,c(1,4,6,7,9,11,12)], function(x) (x - min(x))/(max(x) - min(x)))
print("---Type of returned data")
## [1] "---Type of returned data"
class(norm_data)
## [1] "matrix"
print("---Converting data from matrix to data.frame---")
## [1] "---Converting data from matrix to data.frame---"
norm_data <- data.frame(norm_data) # norm_data is a 'matrix'
print("---Normalised data---")
## [1] "---Normalised data---"
head(norm_data)
## fixed.acidity residual.sugar free.sulfur.dioxide total.sulfur.dioxide
## 1 0.525 0.03738318 0.16666667 0.3205128
## 2 0.375 0.29439252 0.08974359 0.4102564
## 3 0.550 0.01869159 0.07692308 0.4188034
## 4 0.525 0.14018692 0.14102564 0.2735043
## 5 0.525 0.14018692 0.14102564 0.2735043
## 6 0.350 0.27102804 0.12820513 0.4017094
## pH alcohol quality
## 1 0.5405405 0.1896552 0
## 2 0.4459459 0.3103448 0
## 3 0.2432432 0.2931034 0
## 4 0.2297297 0.1896552 0
## 5 0.2297297 0.1896552 0
## 6 0.6081081 0.1034483 0
Brief theory on what we have done here: each data point in a column has the column minimum subtracted from it, and the result is divided by the difference between the column maximum and the column minimum, i.e. (x - min(x)) / (max(x) - min(x)).
The result always lies in the range 0 to 1.
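As a tiny illustration (toy values, not from the dataset), applying the min-max formula to a small vector:
# Min-max rescaling of a toy vector: (x - min) / (max - min)
x <- c(2, 5, 11)
(x - min(x)) / (max(x) - min(x))
# 2 maps to 0, 11 maps to 1, and 5 maps to (5 - 2) / (11 - 2) = 1/3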
Now let us bind the normalised columns back onto the remaining columns and look at the data.
#Binding the normalised data with other data
wineData_norm <- cbind(cleanWhiteDat[,c(2,3,5,8,10)],norm_data)
head(wineData_norm)
## volatile acidity citric acid chlorides density sulphates fixed.acidity
## 1 0.32 0.16 0.051 0.9940 0.38 0.525
## 2 0.39 0.24 0.069 0.9942 0.35 0.375
## 3 0.39 0.24 0.048 0.9923 0.31 0.550
## 4 0.25 0.24 0.057 0.9942 0.48 0.525
## 5 0.25 0.24 0.057 0.9942 0.48 0.525
## 6 0.43 0.49 0.045 0.9963 0.57 0.350
## residual.sugar free.sulfur.dioxide total.sulfur.dioxide pH alcohol
## 1 0.03738318 0.16666667 0.3205128 0.5405405 0.1896552
## 2 0.29439252 0.08974359 0.4102564 0.4459459 0.3103448
## 3 0.01869159 0.07692308 0.4188034 0.2432432 0.2931034
## 4 0.14018692 0.14102564 0.2735043 0.2297297 0.1896552
## 5 0.14018692 0.14102564 0.2735043 0.2297297 0.1896552
## 6 0.27102804 0.12820513 0.4017094 0.6081081 0.1034483
## quality
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
str(wineData_norm)
## 'data.frame': 3981 obs. of 12 variables:
## $ volatile acidity : num 0.32 0.39 0.39 0.25 0.25 0.43 0.35 0.17 0.31 0.29 ...
## $ citric acid : num 0.16 0.24 0.24 0.24 0.24 0.49 0.49 0.49 0.49 0.49 ...
## $ chlorides : num 0.051 0.069 0.048 0.057 0.057 0.045 0.036 0.048 0.062 0.044 ...
## $ density : num 0.994 0.994 0.992 0.994 0.994 ...
## $ sulphates : num 0.38 0.35 0.31 0.48 0.48 0.57 0.33 0.34 0.53 0.36 ...
## $ fixed.acidity : num 0.525 0.375 0.55 0.525 0.525 0.35 1 0.925 0.7 0.85 ...
## $ residual.sugar : num 0.0374 0.2944 0.0187 0.1402 0.1402 ...
## $ free.sulfur.dioxide : num 0.1667 0.0897 0.0769 0.141 0.141 ...
## $ total.sulfur.dioxide: num 0.321 0.41 0.419 0.274 0.274 ...
## $ pH : num 0.541 0.446 0.243 0.23 0.23 ...
## $ alcohol : num 0.19 0.31 0.293 0.19 0.19 ...
## $ quality : num 0 0 0 0 0 0 0 0 0 0 ...
Alternatively, we can use the scale() function, which centers each column and scales it by its standard deviation (z-scores). Let us see how the data look after scaling and compare.
wineData_scaled <- scale(wineData_no_color)
head(wineData_scaled)
## fixed acidity volatile acidity citric acid residual sugar chlorides
## [1,] 0.1206571 0.7372113 -1.9045360 -1.01256142 0.8755540
## [2,] -0.6945076 1.6565851 -0.9759435 0.10257829 2.6903830
## [3,] 0.2565178 1.6565851 -0.9759435 -1.09366248 0.5730825
## [4,] 0.1206571 -0.1821625 -0.9759435 -0.56650553 1.4804970
## [5,] 0.1206571 -0.1821625 -0.9759435 -0.56650553 1.4804970
## [6,] -0.8303684 2.1819416 1.9259081 0.00120195 0.2706111
## free sulfur dioxide total sulfur dioxide density pH
## [1,] -1.327208 -1.0029624 0.04642827 0.2393283
## [2,] -1.728444 -0.4909627 0.11562872 -0.2676547
## [3,] -1.795317 -0.4422008 -0.54177554 -1.3540468
## [4,] -1.460953 -1.2711528 0.11562872 -1.4264729
## [5,] -1.460953 -1.2711528 0.11562872 -1.4264729
## [6,] -1.527826 -0.5397246 0.84223343 0.6014590
## sulphates alcohol quality
## [1,] -1.02065102 -0.8906438 -2.361914
## [2,] -1.32349909 -0.3138576 -2.361914
## [3,] -1.72729653 -0.3962556 -2.361914
## [4,] -0.01115743 -0.8906438 -2.361914
## [5,] -0.01115743 -0.8906438 -2.361914
## [6,] 0.89738680 -1.3026339 -2.361914
class(wineData_scaled)
## [1] "matrix"
#Converting to data.frame
wineData_scaled_df <- as.data.frame(wineData_scaled)
class(wineData_scaled_df)
## [1] "data.frame"
K-means takes the number of clusters as one of its inputs.
The best number of clusters varies from data set to data set, and there are several techniques for determining a suitable value.
We will try a few of them, with graphs to support the visual analysis.
Method 1 - Elbow method
The elbow method uses the sum of squared errors, also called the within-cluster sum of squares (WSS).
This is simply the squared distance of every point from its cluster centre, summed over all points.
Visually, the "elbow" is the point in the graph after which the curve flattens out, i.e. adding more clusters makes practically no further difference.
The technique runs k-means for a range of cluster counts, from 1 up to some maximum, collects the WSS for each, and plots it so we can see the behaviour.
# Initialize total within sum of squares error: wss
wss <- 0
# For 1 to 10 cluster centers
for (i in 1:10) {
  km.out <- kmeans(wineData_norm, centers = i)
  # Save total within sum of squares to wss variable
  wss[i] <- km.out$tot.withinss
}
wss
## [1] 1191.2128 892.8319 784.4902 727.0704 673.3241 646.0413 616.6501
## [8] 591.9018 574.3872 561.2452
# Plot total within sum of squares vs. number of clusters
plot(1:10, wss, type = "b",
xlab = "Number of Clusters",
ylab = "Within groups sum of squares")
Method 2 - Silhouette plot
Here we use the silhouette method via fviz_nbclust() from the factoextra package. After several attempts it turned out that the full data set can cause problems (it is slow and sometimes fails to return), so we work on a sample of the data instead.
For sampling there are a few options, such as sample(), sample_frac() from dplyr, and createDataPartition() from caret.
We will use sample_frac().
wine_train_data <- sample_frac(wineData_norm, 0.65)
head(wine_train_data)
## volatile acidity citric acid chlorides density sulphates fixed.acidity
## 1 0.25 0.38 0.046 0.99560 0.59 0.500
## 2 0.46 0.14 0.042 0.99310 0.51 0.275
## 3 0.29 0.38 0.038 0.99366 0.59 0.250
## 4 0.22 0.28 0.060 0.99350 0.59 0.600
## 5 0.31 0.28 0.037 0.99190 0.51 0.400
## 6 0.39 0.31 0.054 0.99480 0.57 0.475
## residual.sugar free.sulfur.dioxide total.sulfur.dioxide pH alcohol
## 1 0.35046729 0.2820513 0.5726496 0.6891892 0.3103448
## 2 0.09813084 0.3205128 0.5940171 0.8648649 0.3793103
## 3 0.47196262 0.6025641 0.4914530 0.3918919 0.4827586
## 4 0.30841121 0.5000000 0.4743590 0.3513514 0.5000000
## 5 0.04205607 0.1282051 0.4188034 0.6756757 0.3448276
## 6 0.09813084 0.3205128 0.7735043 0.8648649 0.3620690
## quality
## 1 0.50
## 2 0.75
## 3 0.50
## 4 0.50
## 5 0.75
## 6 0.50
str(wine_train_data)
## 'data.frame': 2588 obs. of 12 variables:
## $ volatile acidity : num 0.25 0.46 0.29 0.22 0.31 0.39 0.32 0.2 0.26 0.19 ...
## $ citric acid : num 0.38 0.14 0.38 0.28 0.28 0.31 0.26 0.26 0.4 0.32 ...
## $ chlorides : num 0.046 0.042 0.038 0.06 0.037 0.054 0.03 0.04 0.047 0.049 ...
## $ density : num 0.996 0.993 0.994 0.994 0.992 ...
## $ sulphates : num 0.59 0.51 0.59 0.59 0.51 0.57 0.42 0.44 0.53 0.44 ...
## $ fixed.acidity : num 0.5 0.275 0.25 0.6 0.4 0.475 0.525 0.625 0.6 0.5 ...
## $ residual.sugar : num 0.3505 0.0981 0.472 0.3084 0.0421 ...
## $ free.sulfur.dioxide : num 0.282 0.321 0.603 0.5 0.128 ...
## $ total.sulfur.dioxide: num 0.573 0.594 0.491 0.474 0.419 ...
## $ pH : num 0.689 0.865 0.392 0.351 0.676 ...
## $ alcohol : num 0.31 0.379 0.483 0.5 0.345 ...
## $ quality : num 0.5 0.75 0.5 0.5 0.75 0.5 0.5 0.5 0.5 0.75 ...
nrow(wine_train_data)
## [1] 2588
# Let us plot the silhouette analysis using fviz_nbclust (factoextra)
fviz_nbclust(wine_train_data, kmeans, method = "silhouette")+
labs(subtitle = "Silhouette method")
There are more methods for finding the number of clusters; you can check out the reference link.
Let us now run the clustering with the number of clusters suggested by the analysis above and look at the output.
We use k = 2 clusters for our analysis.
km <- kmeans(wineData_norm, 2, iter.max = 140 , algorithm="Lloyd", nstart=100)
km
## K-means clustering with 2 clusters of sizes 2498, 1483
##
## Cluster means:
## volatile acidity citric acid chlorides density sulphates fixed.acidity
## 1 0.2600560 0.3201241 0.03918215 0.9921150 0.4778263 0.4910679
## 2 0.2702933 0.3307417 0.04759474 0.9968149 0.4866285 0.5225556
## residual.sugar free.sulfur.dioxide total.sulfur.dioxide pH alcohol
## 1 0.1388120 0.3440830 0.4116892 0.5206922 0.4798091
## 2 0.4929922 0.5508628 0.6388543 0.4541014 0.2011897
## quality
## 1 0.5395316
## 2 0.4076197
##
## Clustering vector:
## [1] 1 1 1 1 1 2 1 2 2 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 1 1 1 1 1 1 2 2 1 1
## [38] 2 1 2 1 1 1 2 1 1 1 1 2 1 1 2 2 2 2 2 2 2 1 1 1 2 1 2 1 1 1 1 1 1 2 2 1 2
## [75] 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 1 2 2 2 2 1 1 2 1 2 2 2
## [112] 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 1 2 2 2 1
## [149] 2 2 2 2 2 2 2 2 1 1 1 2 1 1 1 2 1 1 1 2 1 2 2 2 1 1 1 2 2 1 2 2 1 1 1 1 2
## [186] 2 2 2 2 2 2 2 2 2 1 1 2 1 1 1 1 2 2 1 2 1 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2
## [223] 2 2 2 2 1 1 1 1 1 2 2 2 1 2 2 2 2 2 2 2 1 2 1 2 2 2 2 1 1 2 1 2 2 2 2 2 2
## [260] 2 2 2 2 2 2 2 2 2 1 1 1 1 1 2 2 2 2 1 2 2 1 1 1 1 2 2 2 2 1 2 1 1 1 1 1 1
## [297] 1 1 2 1 1 2 1 2 2 1 1 1 1 2 1 1 1 1 2 1 1 1 2 2 2 2 1 2 2 1 1 1 1 2 1 1 2
## [334] 1 1 1 1 2 2 2 2 1 1 1 1 2 1 1 1 1 2 2 2 2 2 1 2 1 1 1 1 2 2 2 2 1 2 1 1 2
## [371] 2 2 2 1 1 1 1 1 1 1 1 2 2 2 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 1
## [408] 1 2 1 1 2 2 1 2 1 2 2 2 2 2 2 2 2 1 1 1 1 2 1 1 2 2 2 2 2 2 2 2 1 1 2 1 1
## [445] 2 1 2 1 1 2 1 2 1 2 2 1 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 2 1 1 1 2 2 2 2 1
## [482] 1 1 2 2 1 2 2 2 2 2 2 2 2 2 1 1 2 2 1 1 1 1 2 2 2 1 1 2 1 1 1 1 2 2 2 2 2
## [519] 1 1 2 2 2 2 1 1 1 1 2 1 1 1 1 2 2 2 2 2 1 2 2 2 1 1 2 2 2 2 1 2 1 2 2 2 1
## [556] 1 2 2 1 2 2 2 2 2 2 2 1 1 1 1 2 2 1 1 2 2 2 1 1 2 1 1 1 1 2 2 2 2 2 2 2 2
## [593] 1 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 1 1 2 2 2 1 2 2 1 2 2 1 2 2 1 1 2
## [630] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 2 1 2 1 1 2 2 2 2 2 1 2 2 1 2 2 2 1 1
## [667] 2 1 2 2 2 1 1 1 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 1 1 2 2 2 2 2 1
## [704] 1 2 1 2 2 2 2 2 2 2 2 2 2 1 1 2 1 1 1 1 2 2 2 2 2 2 2 2 1 1 1 2 2 2 2 2 1
## [741] 1 2 1 1 1 2 2 1 2 1 1 2 2 2 1 2 2 2 2 2 2 2 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1
## [778] 2 2 1 1 2 1 1 1 2 2 2 2 1 2 1 2 1 2 1 2 1 1 1 1 2 1 2 1 1 2 2 2 1 1 1 1 2
## [815] 1 2 1 1 1 1 1 1 1 2 1 2 1 1 2 1 2 1 2 2 2 2 1 1 2 2 1 1 2 2 2 2 1 1 2 2 1
## [852] 2 2 2 1 1 2 1 2 2 2 1 2 2 1 2 2 2 2 2 1 1 2 1 1 2 2 2 2 1 1 1 1 1 2 2 2 2
## [889] 1 2 2 1 1 1 1 2 1 2 1 1 1 1 1 1 1 2 2 2 2 2 1 2 2 2 2 2 2 2 1 1 1 1 1 1 1
## [926] 1 1 2 2 1 2 1 2 2 2 2 2 2 1 2 2 2 2 2 1 2 1 1 2 1 2 1 2 2 1 1 1 1 2 2 2 2
## [963] 1 1 2 1 2 2 1 1 2 2 1 1 1 1 1 2 1 1 2 1 1 1 2 2 2 2 1 2 2 2 2 1 2 2 1 2 1
## [1000] 1 1 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 2 2 1 2 2 2 2 2 2 2 1 2 2
## [1037] 2 1 1 1 1 2 2 2 2 2 1 2 2 2 1 2 1 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 1 2 1 2 1
## [1074] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 1 1 2 2 2 2 2 1 2 2 1 2 1 2 2
## [1111] 2 1 1 2 1 1 2 2 2 2 1 1 1 1 2 1 1 2 1 1 1 1 2 2 1 2 1 2 1 1 1 2 1 2 2 1 1
## [1148] 2 1 2 1 2 2 2 2 2 1 2 2 1 1 2 2 1 1 2 1 2 1 2 1 1 2 2 1 1 2 2 1 1 2 1 1 1
## [1185] 1 2 1 1 1 2 1 1 1 1 2 1 1 2 1 1 1 2 2 1 1 2 1 1 2 1 2 2 2 2 2 2 2 1 1 1 1
## [1222] 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 1 1 2 2 2 2 2 1 2 1 1 1 1 2 2 2 2 2 2 2 2
## [1259] 1 1 1 2 1 2 2 2 2 2 1 1 2 1 1 1 2 2 2 1 1 2 1 1 1 2 2 1 2 1 1 1 2 1 2 1 1
## [1296] 2 2 1 2 2 2 1 2 1 2 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 2 2 2 1
## [1333] 2 2 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 2 1 2 2 1 1 1 1 1 1 1 1 1
## [1370] 1 1 2 2 2 2 2 1 2 2 1 2 1 1 1 2 2 1 2 2 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
## [1407] 1 2 2 1 1 1 1 2 2 2 1 2 1 2 1 2 1 1 2 2 2 2 2 1 1 1 1 2 1 1 1 2 1 2 2 2 1
## [1444] 2 1 1 1 2 2 2 2 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2 2 1 1 2 2 2 2 1 2 1 1 1 1 1
## [1481] 1 1 1 1 1 2 1 1 1 2 1 1 2 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 1 1 1 2
## [1518] 2 1 1 1 2 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 2 1 1 1 1 2 1 2 2 2 2 2 2 2 1
## [1555] 2 2 2 1 1 2 2 1 1 1 2 1 2 2 2 2 1 1 1 1 2 1 1 1 1 1 2 2 1 1 1 2 1 1 1 1 1
## [1592] 1 1 2 1 1 1 1 1 1 2 2 2 2 2 1 1 1 1 2 1 2 1 1 2 2 1 2 2 1 1 1 1 1 1 1 1 2
## [1629] 1 2 2 2 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1
## [1666] 2 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 1 2 2 2 1 2 1 1 1 1 1 1
## [1703] 1 1 1 1 1 1 1 2 1 1 2 2 2 1 2 1 2 2 2 2 2 1 1 1 1 2 1 2 2 1 2 1 1 1 2 1 2
## [1740] 2 1 2 2 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 2 1 2 1 2 1 1
## [1777] 2 2 2 1 2 2 1 2 2 1 1 2 1 2 2 1 1 2 1 2 1 1 2 2 2 2 1 1 1 1 1 1 1 2 2 2 2
## [1814] 1 1 1 2 2 1 2 2 1 1 1 1 2 2 1 2 2 1 1 1 1 1 2 2 2 1 2 1 1 2 2 1 1 2 2 1 2
## [1851] 1 2 1 2 2 2 1 1 1 2 1 1 2 1 2 1 1 2 1 2 1 2 1 1 1 1 2 1 1 1 2 1 2 1 1 2 1
## [1888] 1 2 1 2 2 2 2 1 2 2 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 2 1 1 1 2
## [1925] 1 1 1 1 2 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 2 2 2 1 2 1 2 1 1 2 2 1 1 1 2 1 1
## [1962] 1 1 2 2 1 1 1 2 1 2 1 1 1 2 1 2 2 1 1 1 1 1 1 2 1 1 2 1 1 2 2 2 1 1 1 2 1
## [1999] 2 2 2 1 2 1 1 1 1 2 2 2 2 1 1 1 2 2 2 2 2 2 2 1 1 1 1 2 2 2 2 2 2 1 1 1 1
## [2036] 1 2 2 2 2 1 1 2 2 1 1 1 2 1 1 1 2 1 2 1 1 1 2 1 2 2 1 1 1 1 2 1 2 1 1 1 1
## [2073] 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 2 1 1 2 2 2 2 2 2 1 2 1 2 1 1 2 1 2 2 1 1 1
## [2110] 2 2 1 1 2 1 1 1 1 2 1 1 2 1 1 2 2 2 1 2 1 1 2 1 1 1 2 1 2 1 1 1 2 2 1 1 1
## [2147] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 2 2 1 1 2 1 1
## [2184] 2 2 1 1 2 2 1 1 2 2 2 1 1 1 1 1 1 1 1 2 1 1 2 2 1 2 1 1 1 2 2 1 1 2 2 2 2
## [2221] 1 2 2 2 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 2 1 2 1 1 1 1
## [2258] 1 2 1 1 1 2 1 2 2 2 2 2 2 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1
## [2295] 2 2 1 1 1 2 1 1 2 2 2 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1
## [2332] 2 2 1 1 1 1 1 1 2 2 1 1 2 2 2 1 1 1 1 2 2 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
## [2369] 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 2 2 1 1 1 1 1 1 2 1 2 2 2 2 1 2 1 1 1 2 2 1
## [2406] 1 2 2 1 1 1 2 2 1 1 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 2 2 1 1 1 1 2 1 1 2
## [2443] 2 2 1 1 1 2 2 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 2 2 1 1 2 1 1 1
## [2480] 1 2 1 1 1 1 1 1 2 1 2 1 1 1 2 1 1 1 1 1 2 2 2 2 1 1 2 2 1 1 1 2 2 1 1 2 1
## [2517] 1 1 1 2 2 2 1 1 1 1 2 2 2 2 2 2 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 1
## [2554] 1 1 1 1 1 1 1 1 2 2 2 1 2 1 1 1 2 2 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2591] 1 1 1 1 1 1 1 1 2 1 2 1 1 2 1 1 1 2 2 2 1 2 2 2 1 1 1 2 1 1 2 2 1 2 1 1 1
## [2628] 1 1 2 2 2 1 1 1 1 2 1 1 2 2 2 1 2 1 1 1 2 2 1 1 1 1 1 2 1 1 2 2 1 1 1 1 1
## [2665] 1 2 2 2 2 1 1 1 1 2 2 2 2 2 2 2 1 2 1 1 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2
## [2702] 1 2 1 1 1 2 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 2 1 2 1 1 1 1 1 2 2 2 2 2 2 2
## [2739] 1 1 1 1 1 2 1 1 1 2 1 1 1 2 1 1 2 1 1 2 1 1 2 1 1 1 2 1 1 1 2 1 1 2 1 1 1
## [2776] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1
## [2813] 1 1 1 2 2 1 2 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1
## [2850] 1 1 1 1 2 2 2 2 1 1 2 2 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1
## [2887] 2 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 1 2 2 2 1 1 1 2 1 1 1
## [2924] 1 1 1 2 1 2 2 1 1 1 1 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 2 1 2 2 1
## [2961] 1 1 1 1 1 1 1 1 1 2 1 2 2 1 2 2 2 1 2 2 2 2 1 2 2 2 1 2 1 1 1 1 1 1 2 1 1
## [2998] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 2 2
## [3035] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3072] 1 1 2 2 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 2 2 1 2 1 1 1
## [3109] 1 1 1 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1
## [3146] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 2 1 2 1 2 2 1 1 1 1 1 1 1 1 1
## [3183] 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3220] 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1
## [3257] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 2
## [3294] 1 1 1 1 1 1 1 1 2 1 1 1 1 2 2 2 2 1 1 1 1 1 1 2 2 2 2 2 2 1 1 1 1 1 1 1 1
## [3331] 1 1 1 1 1 2 2 2 1 1 1 1 1 2 1 1 1 1 2 1 1 2 1 1 1 1 2 1 1 1 1 1 2 2 2 2 2
## [3368] 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
## [3405] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 1 1 1 1 2 2 2 1 1 2 1 1 1 1 1 2 1 1 1
## [3442] 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3479] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3516] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3553] 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1
## [3590] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3627] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1
## [3664] 1 1 1 1 1 1 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3701] 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 2 1 1 2 2 2 2
## [3738] 2 2 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3775] 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1
## [3812] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3849] 1 1 2 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3886] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3923] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1
## [3960] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 2 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 589.1324 303.6995
## (between_SS / total_SS = 25.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#Structure of km
str(km)
## List of 9
## $ cluster : int [1:3981] 1 1 1 1 1 2 1 2 2 1 ...
## $ centers : num [1:2, 1:12] 0.2601 0.2703 0.3201 0.3307 0.0392 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "1" "2"
## .. ..$ : chr [1:12] "volatile acidity" "citric acid" "chlorides" "density" ...
## $ totss : num 1191
## $ withinss : num [1:2] 589 304
## $ tot.withinss: num 893
## $ betweenss : num 298
## $ size : int [1:2] 2498 1483
## $ iter : int 14
## $ ifault : NULL
## - attr(*, "class")= chr "kmeans"
Let us now analyse the k-means results visually, using several different plots for a better understanding.
# Cluster plot against the first 2 principal components
clusplot(cleanWhiteDat, km$cluster, color=TRUE, shade=TRUE,
labels=2, lines=0)
fviz_cluster(km, data = wineData_norm,
ellipse.type = "convex",
palette = "jco",
ggtheme = theme_minimal())
fviz_cluster(list(data = wineData_norm, cluster = km$cluster),
ellipse.type = "norm", geom = "point", stand = FALSE,
palette = "jco", ggtheme = theme_classic())
# Partitioning Around Medoids (PAM) with 2 clusters, as an alternative to k-means
pam.res <- pam(wineData_norm, 2)
# Visualize
fviz_cluster(pam.res)
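Finally, as a quick comparison (a sketch), we can cross-tabulate the k-means and PAM cluster assignments to see how much the two partitions agree:
# Cross-tabulation of k-means vs. PAM cluster labels
table(kmeans = km$cluster, pam = pam.res$clustering)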