PAM_Hierarchical_Clustering

# Read the raw data, then standardise every column with scale()
df <- read.csv(file = "Fashion_Clustering.csv", header = TRUE)
df_scaled <- scale(df)
head(df_scaled, n = 3) # View the first 3 rows of the data

##      Store.Area       East       West      North      South CRESCENT.NS
## [1,] -0.9928914  2.1806892 -0.5321496 -0.4646845 -0.8609614  -0.4115927
## [2,] -0.3834972 -0.4560919  1.8690132 -0.4646845 -0.8609614  -0.4115927
## [3,] -0.9463081  2.1806892 -0.5321496 -0.4646845 -0.8609614  -0.4115927
##      CRESCENT.MIX.N.MATCH.NS CRESCENT.POISE.NS CRESCENT.SET.NS   BLINK.NS
## [1,]              -0.3255487        -0.2694052     -0.05244054 -0.5013978
## [2,]              -0.4421498        -0.2694052     -0.04917769  0.1668502
## [3,]              -0.5011641        -0.2694052      0.07212238 -0.2764724
##       SAHAR.NS CRESCENT.CD CRESCENT.MIX.N.MATCH.CD CRESCENT.POISE.CD
## [1,] -0.689266  -0.4105919              -0.3871156        -0.2628897
## [2,] -0.689266  -0.4105919              -0.3080246        -0.2628897
## [3,] -0.689266  -0.4105919              -0.7127685        -0.2628897
##      CRESCENT.SET.CD   BLINK.CD   SAHAR.CD CRESCENT.CGS
## [1,]     -0.07628098 -0.4900366 -0.6568683   -0.4244854
## [2,]      0.20223977  0.3326759 -0.6568683   -0.4244854
## [3,]     -0.15012646 -0.4315284 -0.6568683   -0.4244854
##      CRESCENT.MIX.N.MATCH.CGS CRESCENT.POISE.CGS CRESCENT.SET.CGS
## [1,]               -0.3562077         -0.2647972      -0.05655066
## [2,]               -0.4038287         -0.2647972       0.08541278
## [3,]               -0.5869480         -0.2647972       0.03251893
##       BLINK.CGS SAHAR.CGS
## [1,] -0.5148522 -0.694181
## [2,]  0.2722038 -0.694181
## [3,] -0.3172879 -0.694181

Loading Packages

library(cluster)

## Warning: package 'cluster' was built under R version 3.6.2

library(factoextra)

## Warning: package 'factoextra' was built under R version 3.6.2

## Loading required package: ggplot2

## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

Determining the Optimum Number of Clusters

# Silhouette-based choice of the number of clusters, using PAM as the
# clustering function
fviz_nbclust(df_scaled, FUNcluster = pam, method = "silhouette") +
  theme_classic()

Average silhouette width suggests that there are 7 clusters. Let's compute PAM with k = 7, but first let's check the elbow plot.

# Elbow plot: within-cluster sum of squares ("wss") against the number of
# clusters, again with PAM as the clustering function.
# NOTE(review): the dashed reference line is drawn at k = 5, while the
# surrounding text and the pam() call use k = 7 -- confirm which is intended.
fviz_nbclust(df_scaled, FUNcluster = pam, method = "wss") +
  geom_vline(xintercept = 5, linetype = 2)

Again, we see that there are 7 clusters.

# Partition the scaled data around 7 medoids and print the fitted object
pam.res <- pam(df_scaled, k = 7)
print(pam.res)

## Medoids:
##       ID Store.Area       East       West      North      South
## [1,]  52 -0.6054031  2.1806892 -0.5321496 -0.4646845 -0.8609614
## [2,]   2 -0.3834972 -0.4560919  1.8690132 -0.4646845 -0.8609614
## [3,] 106 -0.4629006 -0.4560919 -0.5321496  2.1403652 -0.8609614
## [4,]  27 -0.2697917 -0.4560919 -0.5321496 -0.4646845  1.1552140
## [5,] 123  0.2231443 -0.4560919 -0.5321496 -0.4646845  1.1552140
## [6,]  66  0.5191599 -0.4560919 -0.5321496 -0.4646845  1.1552140
## [7,] 173 -0.6153550 -0.4560919  1.8690132 -0.4646845 -0.8609614
##      CRESCENT.NS CRESCENT.MIX.N.MATCH.NS CRESCENT.POISE.NS CRESCENT.SET.NS
## [1,]  -0.4115927              -0.7180440        -0.2694052     -0.39959417
## [2,]  -0.4115927              -0.4421498        -0.2694052     -0.04917769
## [3,]  -0.4115927              -1.2078449        -0.0690487     -1.02127773
## [4,]   0.3185246               0.1052274        -0.2694052      0.03916813
## [5,]   2.6695737               2.0426329        -0.2694052      0.90789309
## [6,]  -0.4115927               0.9991692        -0.2694052      0.05638250
## [7,]  -0.4115927              -1.2249164         4.9234880     -1.07136346
##         BLINK.NS    SAHAR.NS CRESCENT.CD CRESCENT.MIX.N.MATCH.CD
## [1,] -0.81766397 -0.68926600  -0.4105919              -0.7760795
## [2,]  0.16685024 -0.68926600  -0.4105919              -0.3080246
## [3,] -0.93247236 -0.67619144  -0.4105919              -0.8640611
## [4,]  0.09463189 -0.03234262   0.3210103              -0.1951867
## [5,]  0.34276242  0.14730983   2.6645254               1.9011270
## [6,]  1.55184516  1.70816437  -0.4105919               0.7810233
## [7,] -0.53777458 -0.54121863  -0.4105919              -0.8566780
##      CRESCENT.POISE.CD CRESCENT.SET.CD   BLINK.CD   SAHAR.CD CRESCENT.CGS
## [1,]       -0.26288972      -0.4735641 -0.7719338 -0.6568683   -0.4244854
## [2,]       -0.26288972       0.2022398  0.3326759 -0.6568683   -0.4244854
## [3,]       -0.07712932      -0.9959273 -0.7215851 -0.6422666   -0.4244854
## [4,]       -0.26288972      -0.4813504 -0.3994474 -0.6476328    0.4036409
## [5,]       -0.26288972       0.9268953  0.2215917  0.1308213    2.6010674
## [6,]       -0.26288972       0.1568241  1.2731757  1.5583583   -0.4244854
## [7,]        4.77723664      -0.9561644  0.2988174 -0.3342799   -0.4244854
##      CRESCENT.MIX.N.MATCH.CGS CRESCENT.POISE.CGS CRESCENT.SET.CGS
## [1,]              -0.76783018        -0.26479725      -0.47946728
## [2,]              -0.40382868        -0.26479725       0.08541278
## [3,]              -1.12757639        -0.07002298      -1.06638059
## [4,]               0.02187466        -0.26479725      -0.13940363
## [5,]               2.10611704        -0.26479725       0.98012425
## [6,]               0.96452825        -0.26479725       0.09339719
## [7,]              -1.15587047         4.79306320      -1.12158078
##        BLINK.CGS  SAHAR.CGS
## [1,] -0.79034566 -0.6941810
## [2,]  0.27220377 -0.6941810
## [3,] -0.87520812 -0.6803411
## [4,] -0.06385015 -0.2660462
## [5,]  0.36009071  0.1349322
## [6,]  1.50214343  1.6535096
## [7,] -0.30021986 -0.4993470
## Clustering vector:
##   [1] 1 2 1 3 3 2 4 2 4 4 5 4 4 3 4 4 4 4 6 4 4 4 4 6 6 7 4 6 6 4 4 6 1 7 3
##  [36] 2 2 2 2 7 3 6 7 6 6 2 2 6 3 3 3 1 1 1 1 6 6 6 6 4 6 6 4 6 6 6 6 4 4 4
##  [71] 4 4 4 4 4 4 6 4 4 4 4 6 2 2 2 2 2 2 2 6 6 2 6 6 2 6 2 6 2 2 1 4 1 3 3
## [106] 3 3 3 3 5 5 5 5 5 5 4 5 5 5 5 4 5 5 3 3 3 3 3 3 3 3 3 3 1 1 7 1 1 2 6
## [141] 6 1 1 6 3 4 5 4 4 4 1 1 2 1 4 3 5 4 2 2 2 3 4 4 4 1 3 1 3 2 2 1 7 5 1
## [176] 2 2 3 1 1 4 1 3 4 2
## Objective function:
##    build     swap 
## 2.700944 2.633456 
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"

Binding the cluster assignments to the original data, we have:

# Attach each row's PAM cluster label to the original (unscaled) data.
# The fitted object's component is named `clustering`; the previous
# `pam.res$cluster` only resolved through `$` partial matching, so the full
# name is spelled out here. The new column is still called `cluster`.
dd <- cbind(df, cluster = pam.res$clustering)
head(dd, n = 3) # preview the first 3 rows, including the new column

##   Store.Area East West North South CRESCENT.NS CRESCENT.MIX.N.MATCH.NS
## 1       8154    1    0     0     0           0                  523482
## 2      11032    0    1     0     0           0                  479127
## 3       8374    1    0     0     0           0                  456678
##   CRESCENT.POISE.NS CRESCENT.SET.NS BLINK.NS SAHAR.NS CRESCENT.CD
## 1                 0          224349    88251        0           0
## 2                 0          224900   151922        0           0
## 3                 0          245384   109682        0           0
##   CRESCENT.MIX.N.MATCH.CD CRESCENT.POISE.CD CRESCENT.SET.CD BLINK.CD
## 1                  253127                 0          118631    47095
## 2                  268028                 0          138734    85511
## 3                  191773                 0          113301    49827
##   SAHAR.CD CRESCENT.CGS CRESCENT.MIX.N.MATCH.CGS CRESCENT.POISE.CGS
## 1        0            0                   306384                  0
## 2        0            0                   295954                  0
## 3        0            0                   255847                  0
##   CRESCENT.SET.CGS BLINK.CGS SAHAR.CGS cluster
## 1           133801     54759         0       1
## 2           145927     99258         0       2
## 3           141409     65929         0       1

Visualising the Clusters

# Plot the PAM result with one palette colour per cluster (seven in total)
fviz_cluster(
  pam.res,
  palette = c(
    "#9400D3", "#4B0082", "#0000FF", "#00FF00",
    "#FFFF00", "#FF7F00", "#FF0000"
  ),
  ellipse.type = "t",       # concentration ellipse around each cluster
  repel = TRUE,             # avoid label overplotting (slow)
  ggtheme = theme_classic()
)

# Save the data with its cluster column to dd.csv in the working directory
write.csv(dd, file = "dd.csv")

PAM_Hierarchical_Clustering

Priyank Goyal

22/03/2020