# Load the store-level data set; the first line of the file holds column names.
df <- read.csv(file = "Fashion_Clustering.csv", header = TRUE)

# Standardise every column (centre on its mean, divide by its standard
# deviation) so that no single variable dominates the distance computation.
df_scaled <- scale(df, center = TRUE, scale = TRUE)

# Preview the first 3 rows of the scaled matrix.
head(df_scaled, 3)
## Store.Area East West North South CRESCENT.NS
## [1,] -0.9928914 2.1806892 -0.5321496 -0.4646845 -0.8609614 -0.4115927
## [2,] -0.3834972 -0.4560919 1.8690132 -0.4646845 -0.8609614 -0.4115927
## [3,] -0.9463081 2.1806892 -0.5321496 -0.4646845 -0.8609614 -0.4115927
## CRESCENT.MIX.N.MATCH.NS CRESCENT.POISE.NS CRESCENT.SET.NS BLINK.NS
## [1,] -0.3255487 -0.2694052 -0.05244054 -0.5013978
## [2,] -0.4421498 -0.2694052 -0.04917769 0.1668502
## [3,] -0.5011641 -0.2694052 0.07212238 -0.2764724
## SAHAR.NS CRESCENT.CD CRESCENT.MIX.N.MATCH.CD CRESCENT.POISE.CD
## [1,] -0.689266 -0.4105919 -0.3871156 -0.2628897
## [2,] -0.689266 -0.4105919 -0.3080246 -0.2628897
## [3,] -0.689266 -0.4105919 -0.7127685 -0.2628897
## CRESCENT.SET.CD BLINK.CD SAHAR.CD CRESCENT.CGS
## [1,] -0.07628098 -0.4900366 -0.6568683 -0.4244854
## [2,] 0.20223977 0.3326759 -0.6568683 -0.4244854
## [3,] -0.15012646 -0.4315284 -0.6568683 -0.4244854
## CRESCENT.MIX.N.MATCH.CGS CRESCENT.POISE.CGS CRESCENT.SET.CGS
## [1,] -0.3562077 -0.2647972 -0.05655066
## [2,] -0.4038287 -0.2647972 0.08541278
## [3,] -0.5869480 -0.2647972 0.03251893
## BLINK.CGS SAHAR.CGS
## [1,] -0.5148522 -0.694181
## [2,] 0.2722038 -0.694181
## [3,] -0.3172879 -0.694181
Loading Packages
library(cluster)
## Warning: package 'cluster' was built under R version 3.6.2
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.2
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
Determining the Optimum Number of Clusters
# Average silhouette width for each candidate number of PAM clusters.
fviz_nbclust(df_scaled, FUNcluster = pam, method = "silhouette") +
  theme_classic()
Average silhouette width suggests that there are 7 clusters. Let's compute PAM with k = 7. Let's also check the elbow plot.
# Elbow plot: total within-cluster sum of squares for each candidate number of
# PAM clusters. The dashed vertical line marks k = 7, the number of clusters
# used for the PAM fit in this analysis, so the plot annotation agrees with
# the model that is actually computed.
fviz_nbclust(df_scaled, pam, method = "wss") +
  geom_vline(xintercept = 7, linetype = 2)
Again we see there are 7 clusters.
# Partition the scaled data around 7 medoids (PAM clustering).
pam.res <- pam(df_scaled, k = 7)

# Show the medoids, the cluster assignment of every row and the objective.
print(pam.res)
## Medoids:
## ID Store.Area East West North South
## [1,] 52 -0.6054031 2.1806892 -0.5321496 -0.4646845 -0.8609614
## [2,] 2 -0.3834972 -0.4560919 1.8690132 -0.4646845 -0.8609614
## [3,] 106 -0.4629006 -0.4560919 -0.5321496 2.1403652 -0.8609614
## [4,] 27 -0.2697917 -0.4560919 -0.5321496 -0.4646845 1.1552140
## [5,] 123 0.2231443 -0.4560919 -0.5321496 -0.4646845 1.1552140
## [6,] 66 0.5191599 -0.4560919 -0.5321496 -0.4646845 1.1552140
## [7,] 173 -0.6153550 -0.4560919 1.8690132 -0.4646845 -0.8609614
## CRESCENT.NS CRESCENT.MIX.N.MATCH.NS CRESCENT.POISE.NS CRESCENT.SET.NS
## [1,] -0.4115927 -0.7180440 -0.2694052 -0.39959417
## [2,] -0.4115927 -0.4421498 -0.2694052 -0.04917769
## [3,] -0.4115927 -1.2078449 -0.0690487 -1.02127773
## [4,] 0.3185246 0.1052274 -0.2694052 0.03916813
## [5,] 2.6695737 2.0426329 -0.2694052 0.90789309
## [6,] -0.4115927 0.9991692 -0.2694052 0.05638250
## [7,] -0.4115927 -1.2249164 4.9234880 -1.07136346
## BLINK.NS SAHAR.NS CRESCENT.CD CRESCENT.MIX.N.MATCH.CD
## [1,] -0.81766397 -0.68926600 -0.4105919 -0.7760795
## [2,] 0.16685024 -0.68926600 -0.4105919 -0.3080246
## [3,] -0.93247236 -0.67619144 -0.4105919 -0.8640611
## [4,] 0.09463189 -0.03234262 0.3210103 -0.1951867
## [5,] 0.34276242 0.14730983 2.6645254 1.9011270
## [6,] 1.55184516 1.70816437 -0.4105919 0.7810233
## [7,] -0.53777458 -0.54121863 -0.4105919 -0.8566780
## CRESCENT.POISE.CD CRESCENT.SET.CD BLINK.CD SAHAR.CD CRESCENT.CGS
## [1,] -0.26288972 -0.4735641 -0.7719338 -0.6568683 -0.4244854
## [2,] -0.26288972 0.2022398 0.3326759 -0.6568683 -0.4244854
## [3,] -0.07712932 -0.9959273 -0.7215851 -0.6422666 -0.4244854
## [4,] -0.26288972 -0.4813504 -0.3994474 -0.6476328 0.4036409
## [5,] -0.26288972 0.9268953 0.2215917 0.1308213 2.6010674
## [6,] -0.26288972 0.1568241 1.2731757 1.5583583 -0.4244854
## [7,] 4.77723664 -0.9561644 0.2988174 -0.3342799 -0.4244854
## CRESCENT.MIX.N.MATCH.CGS CRESCENT.POISE.CGS CRESCENT.SET.CGS
## [1,] -0.76783018 -0.26479725 -0.47946728
## [2,] -0.40382868 -0.26479725 0.08541278
## [3,] -1.12757639 -0.07002298 -1.06638059
## [4,] 0.02187466 -0.26479725 -0.13940363
## [5,] 2.10611704 -0.26479725 0.98012425
## [6,] 0.96452825 -0.26479725 0.09339719
## [7,] -1.15587047 4.79306320 -1.12158078
## BLINK.CGS SAHAR.CGS
## [1,] -0.79034566 -0.6941810
## [2,] 0.27220377 -0.6941810
## [3,] -0.87520812 -0.6803411
## [4,] -0.06385015 -0.2660462
## [5,] 0.36009071 0.1349322
## [6,] 1.50214343 1.6535096
## [7,] -0.30021986 -0.4993470
## Clustering vector:
## [1] 1 2 1 3 3 2 4 2 4 4 5 4 4 3 4 4 4 4 6 4 4 4 4 6 6 7 4 6 6 4 4 6 1 7 3
## [36] 2 2 2 2 7 3 6 7 6 6 2 2 6 3 3 3 1 1 1 1 6 6 6 6 4 6 6 4 6 6 6 6 4 4 4
## [71] 4 4 4 4 4 4 6 4 4 4 4 6 2 2 2 2 2 2 2 6 6 2 6 6 2 6 2 6 2 2 1 4 1 3 3
## [106] 3 3 3 3 5 5 5 5 5 5 4 5 5 5 5 4 5 5 3 3 3 3 3 3 3 3 3 3 1 1 7 1 1 2 6
## [141] 6 1 1 6 3 4 5 4 4 4 1 1 2 1 4 3 5 4 2 2 2 3 4 4 4 1 3 1 3 2 2 1 7 5 1
## [176] 2 2 3 1 1 4 1 3 4 2
## Objective function:
## build swap
## 2.700944 2.633456
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
Binding the cluster assignments to the original (unscaled) data, we have:
# Append each row's PAM cluster label to the original (unscaled) data.
# The component of a pam object that holds the labels is named `clustering`;
# spell it out in full rather than relying on `$` partial matching of
# `cluster`, which would silently break if another component with the same
# prefix were ever present.
dd <- cbind(df, cluster = pam.res$clustering)

# Preview the first 3 rows, now including the new `cluster` column.
head(dd, n = 3)
## Store.Area East West North South CRESCENT.NS CRESCENT.MIX.N.MATCH.NS
## 1 8154 1 0 0 0 0 523482
## 2 11032 0 1 0 0 0 479127
## 3 8374 1 0 0 0 0 456678
## CRESCENT.POISE.NS CRESCENT.SET.NS BLINK.NS SAHAR.NS CRESCENT.CD
## 1 0 224349 88251 0 0
## 2 0 224900 151922 0 0
## 3 0 245384 109682 0 0
## CRESCENT.MIX.N.MATCH.CD CRESCENT.POISE.CD CRESCENT.SET.CD BLINK.CD
## 1 253127 0 118631 47095
## 2 268028 0 138734 85511
## 3 191773 0 113301 49827
## SAHAR.CD CRESCENT.CGS CRESCENT.MIX.N.MATCH.CGS CRESCENT.POISE.CGS
## 1 0 0 306384 0
## 2 0 0 295954 0
## 3 0 0 255847 0
## CRESCENT.SET.CGS BLINK.CGS SAHAR.CGS cluster
## 1 133801 54759 0 1
## 2 145927 99258 0 2
## 3 141409 65929 0 1
Visualising the Clusters
# Draw the PAM clusters, one colour per cluster.
fviz_cluster(
  pam.res,
  # Colour palette: one entry for each of the 7 clusters.
  palette = c(
    "#9400D3", "#4B0082", "#0000FF", "#00FF00",
    "#FFFF00", "#FF7F00", "#FF0000"
  ),
  # Concentration ellipse around each cluster.
  ellipse.type = "t",
  # Keep point labels from overlapping (slow).
  repel = TRUE,
  ggtheme = theme_classic()
)
# Save the data with its cluster labels to disk (row names are written too,
# as per the write.csv default).
write.csv(dd, file = "dd.csv")