#install.packages('corrplot')
#install.packages("devtools")
#devtools::install_github("kassambara/factoextra")
library(corrplot)
## corrplot 0.84 loaded
library(cluster)
library(factoextra) # for pretty graphs and k means
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(Hmisc) # better EDA (describe())
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(ggplot2) #scatter plot
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("webshot")
#webshot::install_phantomjs()
library(webshot) # html graphs
#Using the https://archive.ics.uci.edu/ml/datasets/Wholesale+customers
cust <- read.csv("/Users/jay/Desktop/CODE/MSDS/MSDS680_ML/Week6/data/Wholesale_customers_data.csv",header=TRUE, dec = ",", check.names = TRUE)
#summary
str(cust) # Summary of the data set
## 'data.frame': 440 obs. of 8 variables:
## $ Channel : int 2 2 2 1 2 2 2 2 1 2 ...
## $ Region : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Fresh : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
## $ Milk : int 9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
## $ Grocery : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
## $ Frozen : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
## $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
## $ Delicassen : int 1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
#any empty values?
sum(complete.cases(cust)) # all 440 rows are complete, but let's take a deeper look for other unknowns.
## [1] 440
#Find NAs and empty values and '?' in entire dataframe
colSums(is.na(cust)|cust == ''|cust == '?') # all looks good
## Channel Region Fresh Milk
## 0 0 0 0
## Grocery Frozen Detergents_Paper Delicassen
## 0 0 0 0
#Check if there is a need to remove nas?
summary(cust)
## Channel Region Fresh Milk
## Min. :1.000 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 3128 1st Qu.: 1533
## Median :1.000 Median :3.000 Median : 8504 Median : 3627
## Mean :1.323 Mean :2.543 Mean : 12000 Mean : 5796
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 16934 3rd Qu.: 7190
## Max. :2.000 Max. :3.000 Max. :112151 Max. :73498
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2153 1st Qu.: 742.2 1st Qu.: 256.8 1st Qu.: 408.2
## Median : 4756 Median : 1526.0 Median : 816.5 Median : 965.5
## Mean : 7951 Mean : 3071.9 Mean : 2881.5 Mean : 1524.9
## 3rd Qu.:10656 3rd Qu.: 3554.2 3rd Qu.: 3922.0 3rd Qu.: 1820.2
## Max. :92780 Max. :60869.0 Max. :40827.0 Max. :47943.0
#No need to drop NA values (cust <- na.omit(cust) is not required). All predictors are on a similar scale except Channel and Region; Channel and Region won't need normalization and can simply be removed.
#I need a more detailed look at the summary information to be sure. The Hmisc package provides the highest and lowest values, the number of distinct values, and the number of missing values.
#library(Hmisc)
describe(cust)
## cust
##
## 8 Variables 440 Observations
## ---------------------------------------------------------------------------
## Channel
## n missing distinct Info Mean Gmd
## 440 0 2 0.656 1.323 0.4381
##
## Value 1 2
## Frequency 298 142
## Proportion 0.677 0.323
## ---------------------------------------------------------------------------
## Region
## n missing distinct Info Mean Gmd
## 440 0 3 0.623 2.543 0.6951
##
## Value 1 2 3
## Frequency 77 47 316
## Proportion 0.175 0.107 0.718
## ---------------------------------------------------------------------------
## Fresh
## n missing distinct Info Mean Gmd .05 .10
## 440 0 433 1 12000 12256 401.9 915.6
## .25 .50 .75 .90 .95
## 3127.8 8504.0 16933.8 27090.5 36818.5
##
## lowest : 3 9 18 23 37, highest: 56083 56159 68951 76237 112151
## ---------------------------------------------------------------------------
## Milk
## n missing distinct Info Mean Gmd .05 .10
## 440 0 421 1 5796 6152 593.8 889.8
## .25 .50 .75 .90 .95
## 1533.0 3627.0 7190.2 12229.9 16843.4
##
## lowest : 55 112 134 201 254, highest: 38369 43950 46197 54259 73498
## ---------------------------------------------------------------------------
## Grocery
## n missing distinct Info Mean Gmd .05 .10
## 440 0 430 1 7951 8365 851.5 1381.9
## .25 .50 .75 .90 .95
## 2153.0 4755.5 10655.8 18910.1 24033.5
##
## lowest : 3 137 218 223 245, highest: 45828 55571 59598 67298 92780
## ---------------------------------------------------------------------------
## Frozen
## n missing distinct Info Mean Gmd .05 .10
## 440 0 426 1 3072 3635 136.8 281.3
## .25 .50 .75 .90 .95
## 742.2 1526.0 3554.2 7545.3 9930.7
##
## lowest : 25 33 36 38 42, highest: 18028 18711 35009 36534 60869
## ---------------------------------------------------------------------------
## Detergents_Paper
## n missing distinct Info Mean Gmd .05 .10
## 440 0 417 1 2881 3927 63.7 99.6
## .25 .50 .75 .90 .95
## 256.8 816.5 3922.0 7438.3 12043.2
##
## lowest : 3 5 7 9 10, highest: 24171 24231 26701 38102 40827
## ---------------------------------------------------------------------------
## Delicassen
## n missing distinct Info Mean Gmd .05 .10
## 440 0 403 1 1525 1677 63.95 180.80
## .25 .50 .75 .90 .95
## 408.25 965.50 1820.25 2945.90 4485.40
##
## Value 0 500 1000 1500 2000 2500 3000 3500 4000 4500
## Frequency 65 121 82 54 42 23 18 8 1 5
## Proportion 0.148 0.275 0.186 0.123 0.095 0.052 0.041 0.018 0.002 0.011
##
## Value 5000 5500 6000 6500 7000 8000 8500 14500 16500 48000
## Frequency 8 1 3 2 1 1 1 2 1 1
## Proportion 0.018 0.002 0.007 0.005 0.002 0.002 0.002 0.005 0.002 0.002
## ---------------------------------------------------------------------------
#All looks good! It doesn't look like there are outliers or missing values that need to be imputed.
#Remove Channel and Region from the data set. These two are not related to spending and there is no need to dummify them since the remaining data is not categorical.
cust_c<- cust[-c(1,2)]
head(cust_c, n=3)
#summary(cust)
#Also create a smaller set with Fresh, Frozen, Grocery and Detergents_Paper. Fresh and Frozen were suggested in the instructions. I chose Detergents_Paper and Grocery since the Detergents_Paper mean is close to the Frozen category and Detergents_Paper and Grocery are highly correlated, which I will show in the next cell.
#library(dplyr)
cust_c4 <- cust_c
cust_c4 <- select(cust_c4, Fresh, Frozen, Grocery, Detergents_Paper)
str(cust_c4)
## 'data.frame': 440 obs. of 4 variables:
## $ Fresh : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
## $ Frozen : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
## $ Grocery : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
## $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
#Now I have a dataset with all observations and only 4 attributes.
#Check the correlation of the predictors in cust_c data set. Also display the smaller cust_c4 dataset correlation values.
#par(mfrow=c(1,2)) #display side by side
corrmatrix <- cor(cust_c)
#corrplot(corrmatrix, method = 'shade') # if there were more categories in the dataset, the 'shade' option would make it easier to spot highly correlated items.
corrplot(corrmatrix, method = 'number')
corrmatrix2 <- cor(cust_c4)
corrplot(corrmatrix2, method = 'number')
#Looks like the Grocery and Detergents_Paper categories are highly correlated. This will be helpful during plotting. Initially these plots were side by side; however, knitting did not produce a readable visualization.
#Before the actual clustering, we need to identify the optimal number of clusters (k) for this wholesale customers data set. I will use the popular ways of determining the number of clusters:
#1. Silhouette Method
#2. Elbow Method
#3. Gap Statistic Method
#Elbow and Silhouette are direct methods and the gap statistic is a statistical method. The calculation/estimation continues while k is less than the total number of observations. The default here is 10 and I kept it at the default except for the Gap Statistic method; 10 was not enough for it to converge, so I had to try different runs.
# 1.Silhouette_score Method
#cust_c data set
silhouette_score <- function(k){
km <- kmeans(cust_c, centers = k, nstart=25)
ss <- silhouette(km$cluster, dist(cust_c))
mean(ss[, 3])
}
k <- 2:10
avg_sil <- sapply(k, silhouette_score)
plot(k, type='b', avg_sil, xlab='Number of clusters', ylab='Average Silhouette Scores', main ='cust_c dataset', frame=FALSE)
#cust_c4 data set
silhouette_score <- function(k){
km <- kmeans(cust_c4, centers = k, nstart=25)
ss <- silhouette(km$cluster, dist(cust_c4))
mean(ss[, 3])
}
k <- 2:10
avg_sil <- sapply(k, silhouette_score)
plot(k, type='b', avg_sil, xlab='Number of clusters', ylab='Average Silhouette Scores', main ='cust_c4 dataset', frame=FALSE)
# When using the Silhouette method, the optimal k is at the highest point in the plot. For cust_c it is 2 and for cust_c4 it is 3.
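#A quick check (sketch): the same optimum can be read off programmatically from the scores computed above. Note that avg_sil currently holds the most recent run (cust_c4).
k[which.max(avg_sil)]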
#Another way to visualize silhouette using factoextra package:
#using the factoextra package - Average silhouette method
fviz_nbclust(cust_c, kmeans, method='silhouette')
fviz_nbclust(cust_c4, kmeans, method='silhouette')
#The factoextra package makes the plot much easier to construct and also draws a line at the optimal k value. I will use the fviz_nbclust function from the factoextra package for the rest of the methods.
#cust_c shows optimum as k=2
#cust_c4 shows the optimum as k=3. The results are the same whether using fviz_nbclust or the hand-rolled kmeans/silhouette function above.
# 2.Elbow Method.
fviz_nbclust(cust_c, kmeans, method='wss') # wss = Within sum of square
fviz_nbclust(cust_c4, kmeans, method='wss')
#cust_c
#the plot shows the optimal number of clusters as 2. In this plot, as k increases, the total within-cluster sum of squares keeps decreasing. This is true for all the charts and is part of the estimation: the more k segments the data, the more points are grouped into smaller and more compact clusters, until there are many clusters with only one or two members. For this reason, I need to find the point where the curve begins to flatten out, and for the cust_c data set that is 2.
#cust_c4
#the biggest drop is from k=1 to k=2, then the curve slowly flattens out. I think I can safely claim that the elbow of the curve is 2 and use this; however, I might also try k=3. After all, the silhouette method shows this dataset's optimal k value as 3.
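#The same elbow curve can be reproduced by hand (a sketch; fviz_nbclust above already computes this): tot.withinss is the total within-cluster sum of squares for each k.
wss <- sapply(1:10, function(k) kmeans(cust_c, centers = k, nstart = 25)$tot.withinss)
plot(1:10, wss, type = 'b', xlab = 'Number of clusters', ylab = 'Total within-cluster SS', main = 'Manual elbow curve - cust_c', frame = FALSE)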
# 3.Gap Statistics Method:
#cust_c
gap_stat <- clusGap(cust_c, kmeans, K.max = 10, B = 50)
fviz_gap_stat(gap_stat)
#cust_c4
gap_stat <- clusGap(cust_c4, kmeans, nstart = 25, K.max = 10, B = 50)
fviz_gap_stat(gap_stat)
#At first, the cust_c dataset did not converge at k.max = 10, so I increased the iteration limit to 15, which did the trick. With that, the gap statistic method showed the optimal k for cust_c as 13, whereas for cust_c4 it was k=1. These are quite different outcomes from the elbow and silhouette methods; to recall, for the cust_c set the k value was 2 and for cust_c4 it was 3. Maybe the issue was not enough iterations to converge, since the limit had to be raised to 15. To test this, I removed nstart = 25: the cust_c4 k value remained at 1, but cust_c dropped from 13 to 11. Next, I lowered the iteration count back to 10, still without nstart = 25, and this gave a different result, k=5. I like k=5 better since that calculation uses parameters similar to the elbow and silhouette runs, so I am keeping it at 5.
# Thoughts: Changing the parameters makes a massive difference in the gap statistic method. Perhaps this method is better suited for fine-tuning the algorithm, or it is simply not suitable for the dataset I am working with. On the other hand, the task is to find the optimal k value, so the parameters that go into the calculation do make a difference.
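#Sketch: the optimum reported by fviz_gap_stat() can also be extracted directly from the clusGap object (here gap_stat holds the cust_c4 run); maxSE() from the cluster package applies the "firstSEmax" rule, which I believe is the factoextra default.
gap_tab <- gap_stat$Tab
maxSE(f = gap_tab[, "gap"], SE.f = gap_tab[, "SE.sim"], method = "firstSEmax")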
#Compute k-means with k=3, full dataset - cust_c.
set.seed(123)
km.final <- kmeans(cust_c, centers = 3, nstart = 25) # clustering into groups!!!
km.final
## K-means clustering with 3 clusters of sizes 330, 50, 60
##
## Cluster means:
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 8253.47 3824.603 5280.455 2572.661 1773.058 1137.497
## 2 8000.04 18511.420 27573.900 1996.680 12407.360 2252.020
## 3 35941.40 6044.450 6288.617 6713.967 1039.667 3049.467
##
## Clustering vector:
## [1] 1 1 1 1 3 1 1 1 1 2 1 1 3 1 3 1 1 1 1 1 1 1 3 2 3 1 1 1 2 3 1 1 1 3 1
## [36] 1 3 1 2 3 3 1 1 2 1 2 2 2 1 2 1 1 3 1 3 1 2 1 1 1 1 2 1 1 1 2 1 1 1 1
## [71] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 2 3 1 3 1 1 2 1 1 1 1 1 1 1 1 1 1 3 1
## [106] 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 3 3 1 1 1 3 1 1 1 1 1 1 1 1 1 1
## [141] 1 3 3 1 1 2 1 1 1 3 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 2 1 2 1
## [176] 1 3 1 1 1 1 3 1 3 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 2 2 3 1 1 2 1 1 1 2
## [211] 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 3 3 3 1 1 1
## [246] 1 1 1 1 1 1 2 1 3 1 3 1 1 3 3 1 1 3 1 1 2 2 1 2 1 1 1 1 3 1 1 3 1 1 1
## [281] 1 1 3 3 3 3 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 2 1 1 2 1 3 2 1 1
## [316] 1 1 1 1 2 1 1 1 1 3 3 1 1 1 1 1 2 1 2 1 3 1 1 1 1 1 1 1 2 1 1 1 3 1 2
## [351] 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 3 1 1 3 1 3 1 2
## [386] 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 3 3 3 1 1 3 2 1 1 1 1 1 1 1 1 1 1 2 1
## [421] 1 1 3 1 1 1 1 3 1 1 1 1 1 1 1 3 3 2 1 1
##
## Within cluster sum of squares by cluster:
## [1] 28184318853 26382784678 25765310312
## (between_SS / total_SS = 49.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
#I am planning to calculate with k values of 3 and 2; I decided to start with 3 first.
#As the final result of k-means clustering is sensitive to the random starting assignments, we specify nstart = 25. This means that R will try 25 different random starting assignments and then select the best result, i.e. the one with the lowest within-cluster variation. The default value of nstart in R is one, but it is strongly recommended to compute k-means clustering with a large value of nstart, such as 25 or 50, in order to have a more stable result.
#To see the impact, I also ran it with the default value of 1 and the cluster means were completely different. It seemed like cluster 3 became cluster 2, with each value lower.
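#Sketch of the comparison described above: with nstart = 1 the algorithm can land in a worse local optimum, visible as a higher tot.withinss. The seed below is arbitrary, purely for illustration.
set.seed(42)
kmeans(cust_c, centers = 3, nstart = 1)$tot.withinss
kmeans(cust_c, centers = 3, nstart = 25)$tot.withinss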
#From the results there are 3 clusters with 330, 50, and 60 members respectively. Next, let's dive into the cluster details.
#Details of the kmeans clusters
## Cluster sizes - Number of observations in each cluster
km.final$size
## [1] 330 50 60
## No of iterations
km.final$iter
## [1] 3
#vector of int, indicating the cluster
head(km.final$cluster, n=10)
## [1] 1 1 1 1 3 1 1 1 1 2
#A matrix of cluster centers (cluster means)
km.final$centers
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 8253.47 3824.603 5280.455 2572.661 1773.058 1137.497
## 2 8000.04 18511.420 27573.900 1996.680 12407.360 2252.020
## 3 35941.40 6044.450 6288.617 6713.967 1039.667 3049.467
#Add the point classifications to the original data (cust_c) and assign it to a new variable called cust_c_clustered3
cust_c_clustered3<- cbind(cust_c, cluster = km.final$cluster)
head(cust_c_clustered3, n=3)
#Calculate the size of each cluster and see if it matches the kmeans model results
print(km.final$size)
## [1] 330 50 60
count(cust_c_clustered3, cluster)
#Yes, it does! Moving on to plots.
#Plot the positions of the Fresh and Frozen categories and color the points by cluster. So 2 categories divided into 3 groups.
x<-cust_c_clustered3$Fresh
y<-cust_c_clustered3$Frozen
ggplot(cust_c_clustered3, aes(x = x, y = y, color = factor(cluster))) +
geom_point() +
xlab("Fresh") +
ylab("Frozen") +
ggtitle("Fresh vs Frozen")
#To get a nice plot I need to lower the dimensions from 6 to 2. Initially I thought of sampling the data since there are quite a few observations, but I think displaying the entire data set works well here.
#Compute the mean of each variable by cluster. Assign to cust3_clustered_mean
cust3_clustered_mean <- aggregate(cust_c, by=list(cluster=km.final$cluster), mean) #using cust_c here - cleaned customer data set.
cust3_clustered_mean
#With this view, it looks like the Fresh category is the bulk of overall sales for all groups, and it dominates cluster 3. In every category except Detergents_Paper, the cluster 3 customer base spent more than cluster 1. Detergents_Paper is the slowest moving category for both of those clusters, and overall Delicassen owns the lowest part of the pie. Finally, the spending distribution of cluster 2 members is quite different from clusters 1 and 3.
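#Sketch: the dominant category per cluster can also be pulled out programmatically from the means table computed above.
apply(cust3_clustered_mean[, -1], 1, function(r) names(r)[which.max(r)])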
#Plot the means of the Fresh and Frozen categories and color them by cluster.
x<-cust3_clustered_mean$Fresh
y<-cust3_clustered_mean$Frozen
ggplot(cust3_clustered_mean, aes(x = x, y = y, color = factor(cluster))) +
geom_point() +
xlab("Fresh") +
ylab("Frozen") +
ggtitle("Fresh vs Frozen")
# The difference between cluster 3 and the other clusters is massive, to the point that perhaps it is best to concentrate on this customer base and drop the other two. After all, a business cannot be everything to everyone and must differentiate in order to grow. Remember this is only for the Fresh and Frozen categories.
#plots using the factoextra package.
#k=3 - Specify the categories - fresh and frozen
fviz_cluster(object = km.final, data = cust_c_clustered3, geom = "point",
choose.vars = c("Fresh", "Frozen"), stand = FALSE,
ellipse.type = "norm") + theme_bw()
#k=3 - without specifying the categories.
fviz_cluster(object = km.final, data = cust_c_clustered3, geom = "point", stand = FALSE,
ellipse.type = "norm") + theme_bw()
# I created two plots here, the first with specific variables; in the second plot I omit the choose.vars parameter. When I do that, the fviz_cluster function transforms the initial set of variables into a new set of variables through principal component analysis (PCA). This dimensionality reduction algorithm operates on all of the data set's variables and outputs two new variables (Dim1 and Dim2) that represent the original variables as a projection of the original data set. Each dimension represents a certain amount of the variation/information contained in the original data set; in this example, Dim1 and Dim2 represent 67.7% and 24.3% respectively. When plotted, this lower-dimensional picture can be difficult to interpret, which is why I used the choose.vars argument to specify the variables in plot 1. Together the Dims account for 92% (67.7% + 24.3%) of the information. The new centers are shown with the respective cluster symbols.
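#Sketch of where the Dim1/Dim2 percentages come from: fviz_cluster runs a PCA on the clustered variables, so prcomp() on cust_c should give approximately the same proportions of variance (stand = FALSE was used above, so the data is not standardized here either).
pca <- prcomp(cust_c, center = TRUE, scale. = FALSE)
summary(pca)$importance["Proportion of Variance", 1:2]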
#k=3 - here is another visualization, but this time in a 2D representation for easier interpretation. The clusplot function also uses PCA to plot the data.
clusplot(cust_c_clustered3, cust_c_clustered3$cluster, color=TRUE, shade = TRUE, label=2, main='2D representation by clusplot')
#Another difference between clusplot and fviz_cluster is the point variability percentage. The 2D plot explains less and is therefore easier to interpret, in my opinion.
set.seed(123)
km.final <- kmeans(cust_c4, centers = 3, nstart = 25)
#add the cluster to data set
cust_c4_clustered<- cbind(cust_c4, cluster = km.final$cluster)
head(cust_c4_clustered, n=3)
#Calculate the size of each cluster and see if it matches the kmeans model results
print(km.final$size)
## [1] 48 328 64
count(cust_c4_clustered, cluster)
#Calculate the means
print(aggregate(cust_c4, by=list(cluster=km.final$cluster), mean))
## cluster Fresh Frozen Grocery Detergents_Paper
## 1 1 7709.083 1788.771 28352.229 12867.812
## 2 2 8109.439 2592.640 5287.341 1778.040
## 3 3 35159.359 6490.672 6303.234 1046.953
#Customers in cluster 3 spend significantly more on fresh food than clusters 1 and 2. For the other categories the difference is not as big; however, for the Detergents_Paper category the spending is larger for cluster 1. In conclusion, both the cust_c and cust_c4 datasets produce similar results. Lowering the number of variables did not make a significant difference, perhaps because I chose variables that are highly correlated with each other.
clusplot(cust_c4_clustered, km.final$cluster, main='2D representation of cust_c4', color=TRUE, shade=TRUE, labels=2, lines=0)
fviz_cluster(object = km.final, data = cust_c4_clustered, geom = "point", stand = FALSE,
ellipse.type = "norm") + theme_bw()
#The 2D analysis explains more than 79% of the information about the multivariate data with components 1 and 2. Since there are observations for only 4 categories in this data set, it results in a more distinct plot. I understand the 2D plot better, but I prefer fviz_cluster since it produces a much nicer plot and also marks the new centers.
#Compute k-means with k=2. Let's assume we want to cluster the customers into 2 groups and potentially separate regular customers from infrequent spenders by category of items, so that we can run related promotions or maybe change our operating hours to 24/7.
set.seed(123)
km.final.2 <- kmeans(cust_c, 2, nstart = 25) # clustering into 2 groups!
km.final.2
## K-means clustering with 2 clusters of sizes 375, 65
##
## Cluster means:
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 7944.112 5151.819 7536.128 2484.131 2872.557 1214.261
## 2 35401.369 9514.231 10346.369 6463.092 2933.046 3316.846
##
## Clustering vector:
## [1] 1 1 1 1 2 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 2 2 2 1 1 1 1 2 1 1 1 2 1
## [36] 1 2 1 1 2 2 1 1 1 1 1 1 2 1 1 1 1 2 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1
## [141] 1 2 2 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 2 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 2 2 1 1 1
## [246] 1 1 1 1 1 1 1 1 2 1 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1
## [281] 1 1 2 2 2 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
## [316] 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1
## [351] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 2 1 2 1 1
## [386] 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## [421] 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 2 2 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 60341401922 52876126599
## (between_SS / total_SS = 28.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# 2 clusters with sizes of 375 and 65.
# Create a new data frame appending the cluster assignment - trying to append cluster column using another way
cust2 <- mutate(cust_c, cluster = km.final.2$cluster)
tail(cust2, n=3)
## Cluster sizes - Number of observations in each cluster
km.final.2$size
## [1] 375 65
## No of iterations
km.final.2$iter
## [1] 1
#vector of int indicating the cluster
head(km.final.2$cluster, n=10)
## [1] 1 1 1 1 2 1 1 1 1 1
#A matrix of cluster centers (cluster means)
km.final.2$centers
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 7944.112 5151.819 7536.128 2484.131 2872.557 1214.261
## 2 35401.369 9514.231 10346.369 6463.092 2933.046 3316.846
#Compute the mean of each variable by cluster. Assign to cust2_mean
cust2_mean <- aggregate(cust_c, by=list(cluster=km.final.2$cluster), mean) #using cust_c here - cleaned customer data set.
cust2_mean
#With this view, it looks like the Fresh category is the bulk of overall sales for all groups, followed by Grocery, and it dominates cluster 2. In every category of items cluster 2 spends more than cluster 1, so cluster 1 can be categorized as the low spenders and cluster 2 as the high spenders. This time, instead of Detergents_Paper, the Delicassen category performs lowest for cluster 1 customers.
#Plot the positions of the Fresh and Frozen categories and color the points by cluster
x<-cust2$Fresh
y<-cust2$Frozen
ggplot(cust2, aes(x = x, y = y, color = factor(cluster))) +
geom_point() +
xlab("Fresh") +
ylab("Frozen") +
ggtitle("Fresh vs Frozen")
#Naturally, this plot is much cleaner compared to the k=3 plot. Since I am trying to divide the customer base according to spending, I have no idea in advance what the value of k should be. I think this is one of the reasons it makes sense to evaluate different values of k before calculating k-means, to find the optimal one. Note that I finally got the label colors working!
#k=2
fviz_cluster(km.final.2, data=cust2)
#Dim 1 and 2 explain ~69% of the information. This is much less than the k=3 results.
#k=2 - 2d representation
clusplot(cust2, cust2$cluster, color=TRUE, shade = TRUE, label=2)
#Since principal component analysis is used for dimensionality reduction, with this plot almost 70% of the information about the multivariate data is explained with components 1 and 2. This plot is very similar to the one produced by the fviz_cluster function. So using k=2 clusplot and fviz_cluster functions perform similarly.
#Compute k-means with k=5: the gap statistic gave the value of k as 5. Here is the run, but this time only for Detergents_Paper and Grocery, since these categories had the highest correlation between them (0.92). I also got 11 and 13 as optimal k values, but I think that would create too many clusters, especially for a smaller dataset like this, and would not produce any meaningful results. This is the reason for picking k=5.
set.seed(123)
km.final.5 <- kmeans(cust_c4[c("Detergents_Paper", "Grocery")], 5, nstart = 25, iter.max=30)
km.final.5
## K-means clustering with 5 clusters of sizes 251, 4, 52, 118, 15
##
## Cluster means:
## Detergents_Paper Grocery
## 1 525.7371 2660.721
## 2 32450.2500 68811.750
## 3 7449.0000 18729.154
## 4 3223.7627 9387.466
## 5 15889.6000 31589.133
##
## Clustering vector:
## [1] 4 4 4 1 4 1 4 4 4 3 4 1 4 3 4 1 4 1 4 4 1 1 1 3 4 4 1 1 3 1 4 1 1 4 1
## [36] 4 1 4 3 1 1 4 3 3 4 3 3 2 4 5 1 4 4 4 1 1 5 4 1 4 4 2 4 3 1 5 4 4 1 1
## [71] 1 3 1 4 4 1 1 5 1 1 1 3 4 1 1 2 5 4 1 1 1 1 5 1 4 1 4 1 1 1 4 3 4 4 1
## [106] 1 4 3 4 3 1 3 1 1 1 1 1 1 1 1 1 1 1 4 1 4 1 4 1 1 1 1 1 1 1 1 4 4 1 1
## [141] 4 1 4 1 1 5 1 1 1 1 1 1 1 1 1 3 3 1 4 3 4 1 1 5 4 3 4 1 1 1 3 3 4 3 1
## [176] 4 4 1 1 1 4 3 4 3 1 1 1 4 4 4 1 1 1 3 1 1 1 4 4 1 3 5 4 1 1 3 1 4 4 3
## [211] 1 5 1 4 4 3 5 1 3 1 1 4 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1 1 1 4
## [246] 3 1 1 1 1 1 5 1 3 1 1 1 1 1 4 1 1 1 4 3 4 3 1 3 1 1 1 4 1 1 1 4 1 1 4
## [281] 1 4 1 1 4 1 1 1 1 1 1 1 1 4 4 1 1 4 4 1 4 3 4 3 3 4 3 1 1 3 1 4 3 1 4
## [316] 4 1 1 1 3 1 1 1 4 1 4 1 1 1 1 1 5 1 2 1 4 1 1 4 1 4 4 4 5 1 4 3 4 1 3
## [351] 1 3 1 3 1 1 1 3 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 4 1 1 4 1 1 4 1 1 4 1 4
## [386] 1 1 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 3 4 4 1 1 4 4 1 4 4 4 3 1
## [421] 4 4 1 1 4 1 3 1 1 1 3 1 1 1 4 4 1 5 1 1
##
## Within cluster sum of squares by cluster:
## [1] 613624481 1040691172 1183933229 1081018307 672853623
## (between_SS / total_SS = 90.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
#Initially this did not converge in 10 iterations, so I added the iter.max = 30 parameter, which allowed enough iterations.
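#Sketch: the number of iterations the final fit actually used can be checked on the returned object.
km.final.5$iter   # iterations used by the final run
km.final.5$ifault # indicator of a possible algorithm problem (0 = none flagged)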
#Bind the cluster to the dataset and assign it to cust5
cust5<- cbind(cust_c4[c("Detergents_Paper", "Grocery")], cluster = km.final.5$cluster)
head(cust5, n=3) #Cluster sizes
#Sizes of the 5 clusters
km.final.5$size
## [1] 251 4 52 118 15
#Compute the mean of each variable by cluster. Assign to cust5_mean
cust5_mean <- aggregate(cust_c4[c("Detergents_Paper", "Grocery")], by=list(cluster=km.final.5$cluster), mean) #using only the Detergents_Paper and Grocery columns of cust_c4
cust5_mean
#When only the Detergents_Paper and Grocery categories are analyzed, cluster 2 customers are the big spenders in both categories, while cluster 1 consists of low spenders. In my opinion, although this result is easier to comprehend, it does not really provide an in-depth analysis, except perhaps when comparing the 2 categories to each other. I think k=3 (3 clusters) provides the optimal understanding of the customer segmentation.
#Plot the positions of the Detergents_Paper and Grocery categories and color the points by cluster
ggplot(cust5, aes(x = Detergents_Paper, y = Grocery, color = factor(cluster))) +
geom_point() +
xlab("Detergents_Paper") +
ylab("Grocery") +
ggtitle("Detergents_Paper vs Grocery")
#It is easier to interpret the results when there are fewer data points. Clusters 4 and 5 are close to each other and cluster 2 contains the most outliers.
fviz_cluster(km.final.5, cust5, palette = "Set2", ggtheme = theme_minimal(), main = "k=5 and only Detergents_Paper, Grocery", xlab = "Detergents_Paper", ylab = "Grocery", ellipse.type = "norm")
#This is an interesting plot. Although the categories are highly correlated, the plot does convey this explicitly, in my opinion.
clusplot(cust5, cust5$cluster, color=TRUE, shade = TRUE, label=2)
#here is the 2D representation.
library(cluster)
suppressPackageStartupMessages(library(dendextend)) # for coloring hc
library(dplyr) #to select()
library(ggplot2) #plot points
library(purrr) #map functionality
#I used suppressPackageStartupMessages because the dendextend package was creating excess messaging that did not add any value to the final report.
#Using the https://archive.ics.uci.edu/ml/datasets/Wholesale+customers
cust <- read.csv("/Users/jay/Desktop/CODE/MSDS/MSDS680_ML/Week6/data/Wholesale_customers_data.csv",header=TRUE, dec = ",", check.names = TRUE)
#summary
str(cust)
## 'data.frame': 440 obs. of 8 variables:
## $ Channel : int 2 2 2 1 2 2 2 2 1 2 ...
## $ Region : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Fresh : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
## $ Milk : int 9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
## $ Grocery : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
## $ Frozen : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
## $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
## $ Delicassen : int 1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
head(cust, n=3)
#load the dataset again since this time I want to work with a smaller subset of the data.
#First, draw a sample of 44 records (10%) from the customer data so that the clustering plot will not be overcrowded. This is assigned to the customer_sample variable. The variables “Region” and “Channel” are removed. After that, I apply hierarchical clustering to the data.
set.seed(123)
idx <- sample(1:dim(cust)[1], 44)
customer_sample <- cust[idx,]
customer_sample$Region <- NULL
customer_sample$Channel <- NULL
#summary(cust)
#Also create a smaller set with Fresh, Frozen and Detergents_Paper. Fresh and Frozen were suggested in the instructions. I chose Detergents_Paper since its mean is close to the Frozen category.
#library(dplyr)
cust_sample3 <- customer_sample
cust_sample3 <- select(cust_sample3, Fresh, Frozen, Detergents_Paper)
str(cust_sample3)
## 'data.frame': 44 obs. of 3 variables:
## $ Fresh : int 5969 11002 21217 7107 11243 243 6990 6758 1869 11210 ...
## $ Frozen : int 5679 1152 3095 806 15348 799 1647 934 950 561 ...
## $ Detergents_Paper: int 1135 120 6707 355 108 3909 319 4538 4762 1682 ...
#now I have a sampled dataset with 44 observations and 3 attributes assigned to cust_sample3 variable.
#use average linkage and build a dendrogram
hc <- hclust(dist(customer_sample), method = "average") # dist() uses Euclidean distance by default
plot(hc)
#HCA iteratively groups points by their average distance. In this graph, 78 and 332 are the closest observations, and that pair is then grouped with 438 since it has the closest average distance to the 78-332 cluster. The height of the dendrogram represents the calculated distance, so the line between 438 and the 78/332 group represents the distance between these two groups. Since this dendrogram is built from average distances, the overall look is more uniform.
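#Sketch: the merge heights the dendrogram is drawn from can be inspected directly on the hclust object; each value (in merge order) is the average inter-cluster distance at which a merge happened.
head(hc$height)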
## Do the same with centroid clustering and *squared* Euclidean distance, but this time cutting the tree into ten clusters and reconstruct the upper part of the tree from the cluster centers.
hc <- hclust(dist(customer_sample)^2, method = "centroid") # distance is *squared* Euclidean distance
memb <- cutree(hc, k = 10) #10 clusters
cent <- NULL
for(k in 1:10){
cent <- rbind(cent, colMeans(customer_sample[memb == k, , drop = FALSE]))
}
hc1 <- hclust(dist(cent)^2, method = "centroid", members = table(memb))
opar <- par(mfrow = c(1, 2))
plot(hc, labels = FALSE, hang = -1, main = "Original Tree") #omitting the labels to have a better view
plot(hc1, labels = FALSE, hang = -1, main = "Re-start from 10 clusters")
par(opar)
#There is a wide range of hierarchical clustering methods; another one is "ward"
d <- dist(customer_sample, method = "euclidean") # distance matrix using euclidean
fit <- hclust(d, method="ward.D2") # ward method is renamed to ward.D2
plot(fit) # display dendogram
#Cut the tree from hclust, into several groups either by specifying the desired number(s) of groups or the cut height(s).
groups <- cutree(fit, k=6) # cut tree into 6 clusters
plot(fit)
rect.hclust(fit, k=6, border="red") # draw dendogram with red borders around the 6 clusters
#The distance method used was Euclidean, the linkage method was ward.D2, and the chosen k was 6. If any one of these parameters changes, the results are impacted - speaking from experience :). Incorporating analysis of the data into the clustering analysis is the key to leveraging HCA. Here too the 6 clusters are shown with red boxes in the dendrogram.
#There is a wide range of hierarchical clustering methods; another one is "complete"
d <- dist(customer_sample, method = "euclidean") # distance matrix using euclidean
hc.customer_sample <- hclust(d, method="complete") # Complete is the default method
plot(hc.customer_sample) # display dendogram
#hc.customer_sample is the hclust object containing the linkage steps and now can be used to extract clusters
#Cut the tree from hclust into several groups, either by specifying the desired number of groups or the cut height(s). Here the desired number is given as the k value; cutree determines which observations belong to which cluster.
cluster_assignments <- cutree(hc.customer_sample, k=6) # cut tree into 6 clusters
#print(cluster_assignments)
plot(hc.customer_sample)
rect.hclust(hc.customer_sample, k=6, border="red") # draw dendogram with red borders around the 6 clusters
#Here, the distance is Euclidean and the linkage method is complete. Since I removed Channel and Region from the cust dataset and the remaining variables are on the same scale, I chose not to rescale the data.
#Based on the code above and the plot below, I can say that for a given branch on the tree, all members of that branch have a Euclidean distance from one another no greater than the height of that branch.
#The clusters in red are cut off at a height of ~20000. This means all members of the resulting clusters have a Euclidean distance from each other no greater than the cut height of 20000. This statement of course depends on the choice of the k value, the distance metric (Euclidean) and the linkage criterion (complete). Just to confirm, I also ran it using h=20000 instead of k=6 and got the same results!
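#Sketch of the check mentioned above: cutting the tree at k = 6 and at h = 20000 should yield identical cluster memberships (20000 being the approximate cut height observed on the plot).
identical(cutree(hc.customer_sample, k = 6), cutree(hc.customer_sample, h = 20000))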
#Next, I would like to visualize the results using ggplot2 to have a different look at how 6 clusters exist.
#First Add the cluster assignments to the data set.
#library(dplyr)
customer_clustered <- mutate(customer_sample, cluster = cluster_assignments) # create a new data frame storing the results
head(customer_clustered, n=3)
unique(cluster_assignments) # k=6 distinct assignments
## [1] 1 2 3 4 5 6
#library(ggplot2)
#Count of each cluster assignment for the sample set of 44. (Based on k=6)
count(customer_clustered, cluster)
#Calculate the mean for each category
customer_clustered %>%
group_by(cluster) %>%
summarise_all(mean)
#Interpreting insights from the table: customers in cluster 5 spend the most on Grocery as well as Milk, but they spend the least on Frozen. Customers in cluster 6 spend the most on Fresh but the least on Detergents_Paper. This could be a valuable insight; perhaps prices for this department can be lowered or bulk discounts can be offered.
#prepare for the ggplot2
#clusters <- as.factor(customer_clustered$cluster) -- this turned out not to be necessary; it was added while debugging an evaluation error ("$ operator is invalid for atomic vectors").
x=customer_clustered$Fresh
y=customer_clustered$Frozen
#Plot the positions of the Fresh and Frozen items and color them by cluster.
ggplot(customer_clustered, aes(x = x, y = y, color= factor(cluster))) +
geom_point() + theme(legend.position="bottom")
#I had some difficulty with ggplot2 for this particular plot. It seems best to prepare the data that goes into ggplot2 rather than plugging it in directly; with that, everything seems to work well.
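#A simpler alternative (sketch): map the data frame columns directly inside aes(), which avoids the external x/y vectors prepared above.
ggplot(customer_clustered, aes(x = Fresh, y = Frozen, color = factor(cluster))) +
geom_point() + theme(legend.position = "bottom")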
#Run the hierarchical cluster with Agglomerative Clustering and calculate the Agglomerative coefficient using complete method.
hc2 <- agnes(d, method = "complete")
hc2$ac # Agglomerative coefficient
## [1] 0.853182
#display dendrogram - agnes with the complete method
pltree(hc2, main = "Dendrogram of agnes (complete)") # pltree() plots the tree of an agnes object
rect.hclust(as.hclust(hc2), k=6, border="red")
#Although I used k=6 here as well, because the linkage changed from ward to complete the clusters are different.
# methods to assess
#library(purrr)
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")
# function to compute coefficient
ac <- function(x) {
agnes(customer_sample, method = x)$ac
}
map_dbl(m, ac)
## average single complete ward
## 0.8103062 0.6487442 0.8531820 0.9290328
#Looks like ward provides the strongest clustering structure of the four methods assessed.
#This time I scale the data before computing distances so that the dendrograms can be placed side by side for inspection. Also using the small set cust_sample3.
# Scale & calculate the distance
scaled_cust <- scale(cust_sample3)
dist_cust <- dist(scaled_cust)
# Generate hclust for each of the complete, single & average linkage methods
hc_complete <- hclust(dist_cust, method = "complete")
hc_single <- hclust(dist_cust, method = "single")
hc_average <- hclust(dist_cust, method = "average")
# Plot & Label the 3 Dendrograms Side-by-Side
par(mfrow = c(1,3))
plot(hc_complete, labels = FALSE, main = 'Complete Linkage')
plot(hc_single, labels = FALSE, main = 'Single Linkage')
plot(hc_average, labels = FALSE, main = 'Average Linkage')
#This is just to show the differences between the dendrograms and how picking a different method results in a different dendrogram. I removed the labels and added plot titles for cosmetics. Next, pruning...
#Cut the heights and color the dendrogram
#library(dendextend)
dend_complete <- as.dendrogram(hc_complete)
dend_single <- as.dendrogram(hc_single)
dend_average <- as.dendrogram(hc_average)
dend_complete_colored <- color_branches(dend_complete, h=2)
dend_single_colored <- color_branches(dend_single, h=2)
dend_average_colored <- color_branches(dend_average, h=2)
par(mfrow = c(1,3))
plot(remove_leaves_nodePar(dend_complete_colored), main = 'Complete Linkage')
plot(dend_single_colored, main = 'Single Linkage')
plot(dend_average_colored, main = 'Average Linkage')
#I specifically chose h=2 to show the differences explicitly. Since h=2, all branches in the single linkage dendrogram fall into the same group, so it has one color, meaning everything belongs to a single cluster. The other dendrograms are also colored by cluster according to the height cut. Looking at the plots, every member of a cluster has a maximum Euclidean distance to all other members of its cluster of less than 2.
#The dendextend library is an alternative to using the cutree function. The cutree function is used to first make the clusters, which can then be used to assign cluster memberships accordingly. One drawback of dendextend is that labels = FALSE does not remove the leaf node information; the only workaround I was able to find was to set the font color to white.
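#Sketch: dendextend also supplies a cutree() method for dendrogram objects, so the memberships implied by the h = 2 coloring above can be pulled straight from the dendrogram.
table(cutree(dend_complete_colored, h = 2))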
#Instead of h=2, what if I used k=4?
dend_complete_colored <- color_branches(dend_complete, k=4)
dend_single_colored <- color_branches(dend_single, k=4)
dend_average_colored <- color_branches(dend_average, k=4)
par(mfrow = c(1,3))
plot(dend_complete_colored, main = 'Complete Linkage')
plot(dend_single_colored, main = 'Single Linkage')
plot(dend_average_colored, main = 'Average Linkage')
# When I use k=4 instead of h=2, each dendrogram is cut into 4 clusters, differentiated by color.
#Final thoughts: After running through the exercises, I think it is important to remember that both HCA and k-means are descriptive rather than prescriptive unsupervised clustering methods. I think they are useful for creating several points of view to understand the dataset. Picking one over the other and setting it up correctly requires trial and error, as well as consideration of the type of question I am trying to answer. I don't think there is a one-size-fits-all with either method, which makes it difficult to create insights right out of the box. In my opinion k=3 (3 clusters) provides the results that are optimal for understanding the customer segmentation; after all, high/medium/low grouping is the de facto standard in prioritization.
#When I compare the methods to each other, I think HCA is more suitable when working with a smaller data set. I can only imagine how memory-taxing the dendrograms can be, so I imagine that with k-means I can run larger datasets. Even during this exercise, running the HCA cells took a bit longer than calculating k-means.
#I also think these clustering methods require a certain amount of subjectivity. The best algorithm and method depend on the data set, my understanding of the data set, the questions I am trying to answer, and my setup for running them. I read a cool quote that sums it all up succinctly: "Generating clusters is a science, but interpreting them is an art."