Load the data

# Read the raw data; the first line of the CSV holds the column names
df_Original <- read.csv(file = "Fashion_Clustering.csv", header = TRUE)
# Standardize the columns with scale()
df <- scale(df_Original)
# View the first 3 rows of the scaled data
head(df, n = 3)
##      Store.Area       East       West      North      South CRESCENT.NS
## [1,] -0.9928914  2.1806892 -0.5321496 -0.4646845 -0.8609614  -0.4115927
## [2,] -0.3834972 -0.4560919  1.8690132 -0.4646845 -0.8609614  -0.4115927
## [3,] -0.9463081  2.1806892 -0.5321496 -0.4646845 -0.8609614  -0.4115927
##      CRESCENT.MIX.N.MATCH.NS CRESCENT.POISE.NS CRESCENT.SET.NS   BLINK.NS
## [1,]              -0.3255487        -0.2694052     -0.05244054 -0.5013978
## [2,]              -0.4421498        -0.2694052     -0.04917769  0.1668502
## [3,]              -0.5011641        -0.2694052      0.07212238 -0.2764724
##       SAHAR.NS CRESCENT.CD CRESCENT.MIX.N.MATCH.CD CRESCENT.POISE.CD
## [1,] -0.689266  -0.4105919              -0.3871156        -0.2628897
## [2,] -0.689266  -0.4105919              -0.3080246        -0.2628897
## [3,] -0.689266  -0.4105919              -0.7127685        -0.2628897
##      CRESCENT.SET.CD   BLINK.CD   SAHAR.CD CRESCENT.CGS
## [1,]     -0.07628098 -0.4900366 -0.6568683   -0.4244854
## [2,]      0.20223977  0.3326759 -0.6568683   -0.4244854
## [3,]     -0.15012646 -0.4315284 -0.6568683   -0.4244854
##      CRESCENT.MIX.N.MATCH.CGS CRESCENT.POISE.CGS CRESCENT.SET.CGS
## [1,]               -0.3562077         -0.2647972      -0.05655066
## [2,]               -0.4038287         -0.2647972       0.08541278
## [3,]               -0.5869480         -0.2647972       0.03251893
##       BLINK.CGS SAHAR.CGS
## [1,] -0.5148522 -0.694181
## [2,]  0.2722038 -0.694181
## [3,] -0.3172879 -0.694181

Compute the dissimilarity matrix

# Pairwise Euclidean distances between the rows of df
# (df is the standardized data)
res.dist <- dist(x = df, method = "euclidean")

View the first six rows and columns of the distances in matrix form

# Convert the dist object to a full matrix and show its top-left 6 x 6 corner
as.matrix(res.dist)[seq_len(6), seq_len(6)]
##           1        2         3        4        5        6
## 1 0.0000000 3.866818 0.5605804 4.786394 4.319814 5.648856
## 2 3.8668183 0.000000 3.8086091 4.316297 4.521550 3.743348
## 3 0.5605804 3.808609 0.0000000 4.836293 4.261080 5.708326
## 4 4.7863935 4.316297 4.8362929 0.000000 4.021744 7.013343
## 5 4.3198144 4.521550 4.2610797 4.021744 0.000000 6.782814
## 6 5.6488563 3.743348 5.7083261 7.013343 6.782814 0.000000

The base R function hclust() can be used to create the hierarchical tree.

# Build the hierarchical tree from the distances using the "ward.D2" method
res.hc <- hclust(res.dist, method = "ward.D2")

Next, we visualize the dendrogram.

# cex: label size
library("factoextra")
## Warning: package 'factoextra' was built under R version 3.6.2
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Draw the dendrogram; cex sets the label size
fviz_dend(x = res.hc, cex = 0.5)

The height at which two observations are first joined in the tree is called their cophenetic distance. Let's verify that the distances in the tree reflect the original distances accurately.

# Cophenetic distances implied by the tree
res.coph <- cophenetic(res.hc)
# How well do they agree with the original distances?
cor(x = res.dist, y = res.coph)
## [1] 0.5972371

The correlation is about 0.60 (roughly 60%), so the tree preserves the original distances only moderately well. Now we need to cut the tree into different groups.

# Assign every observation to one of 7 groups by cutting the tree
grp <- cutree(tree = res.hc, k = 7)
# Group labels of the first 7 observations
head(grp, 7)
## [1] 1 2 1 2 3 4 5

Number of members in each group

# Count how many observations fall in each cluster
table(grp = grp)
## grp
##  1  2  3  4  5  6  7 
## 25 29 25 38 46 17  5

So there are 25 members in the first group.

# Get the members of cluster 1.
# df has no row names here (rownames(df) is NULL), so indexing rownames(df)
# directly returns NULL instead of the members. Fall back to row indices
# whenever row names are missing; use the names when they exist.
members <- rownames(df)
if (is.null(members)) {
  members <- seq_len(nrow(df))
}
members[grp == 1]
# Redraw the dendrogram cut into 7 groups, one colour per group
fviz_dend(
  x = res.hc,
  k = 7, # number of groups to cut the tree into
  cex = 0.5, # label size
  k_colors = c(
    "#9400D3", "#4B0082", "#0000FF", "#00FF00",
    "#FFFF00", "#FF7F00", "#FF0000"
  ),
  color_labels_by_k = TRUE, # colour labels by group
  rect = TRUE # draw a rectangle around each group
)

We can now draw the cluster plot

# Cluster plot of the scaled data, coloured by the 7 tree-cut groups
fviz_cluster(
  object = list(data = df, cluster = grp),
  palette = c(
    "#9400D3", "#4B0082", "#0000FF", "#00FF00",
    "#FFFF00", "#FF7F00", "#FF0000"
  ),
  ellipse.type = "convex", # outline each cluster with its convex hull
  repel = TRUE, # avoid label overplotting (slow)
  show.clust.cent = FALSE,
  ggtheme = theme_minimal()
)

Append the cluster assignments to the original data

# Attach each observation's cluster label to the unscaled data as column "grp"
ddH <- cbind(df_Original, grp = grp)
# Preview the first 3 rows
head(ddH, 3)
##   Store.Area East West North South CRESCENT.NS CRESCENT.MIX.N.MATCH.NS
## 1       8154    1    0     0     0           0                  523482
## 2      11032    0    1     0     0           0                  479127
## 3       8374    1    0     0     0           0                  456678
##   CRESCENT.POISE.NS CRESCENT.SET.NS BLINK.NS SAHAR.NS CRESCENT.CD
## 1                 0          224349    88251        0           0
## 2                 0          224900   151922        0           0
## 3                 0          245384   109682        0           0
##   CRESCENT.MIX.N.MATCH.CD CRESCENT.POISE.CD CRESCENT.SET.CD BLINK.CD
## 1                  253127                 0          118631    47095
## 2                  268028                 0          138734    85511
## 3                  191773                 0          113301    49827
##   SAHAR.CD CRESCENT.CGS CRESCENT.MIX.N.MATCH.CGS CRESCENT.POISE.CGS
## 1        0            0                   306384                  0
## 2        0            0                   295954                  0
## 3        0            0                   255847                  0
##   CRESCENT.SET.CGS BLINK.CGS SAHAR.CGS grp
## 1           133801     54759         0   1
## 2           145927     99258         0   2
## 3           141409     65929         0   1

Let's write the result to a CSV file for further analysis.

# Save the data with cluster labels to "ddH.csv" in the working directory
write.csv(x = ddH, file = "ddH.csv")