Load the data
# Import the raw data set; the first line of the CSV supplies the column names.
df_Original <- read.csv(file = "Fashion_Clustering.csv", header = TRUE)
# Standardize the columns with scale() (default centering and scaling).
df <- scale(x = df_Original)
# Preview the first 3 rows of the standardized data.
head(x = df, n = 3)
## Store.Area East West North South CRESCENT.NS
## [1,] -0.9928914 2.1806892 -0.5321496 -0.4646845 -0.8609614 -0.4115927
## [2,] -0.3834972 -0.4560919 1.8690132 -0.4646845 -0.8609614 -0.4115927
## [3,] -0.9463081 2.1806892 -0.5321496 -0.4646845 -0.8609614 -0.4115927
## CRESCENT.MIX.N.MATCH.NS CRESCENT.POISE.NS CRESCENT.SET.NS BLINK.NS
## [1,] -0.3255487 -0.2694052 -0.05244054 -0.5013978
## [2,] -0.4421498 -0.2694052 -0.04917769 0.1668502
## [3,] -0.5011641 -0.2694052 0.07212238 -0.2764724
## SAHAR.NS CRESCENT.CD CRESCENT.MIX.N.MATCH.CD CRESCENT.POISE.CD
## [1,] -0.689266 -0.4105919 -0.3871156 -0.2628897
## [2,] -0.689266 -0.4105919 -0.3080246 -0.2628897
## [3,] -0.689266 -0.4105919 -0.7127685 -0.2628897
## CRESCENT.SET.CD BLINK.CD SAHAR.CD CRESCENT.CGS
## [1,] -0.07628098 -0.4900366 -0.6568683 -0.4244854
## [2,] 0.20223977 0.3326759 -0.6568683 -0.4244854
## [3,] -0.15012646 -0.4315284 -0.6568683 -0.4244854
## CRESCENT.MIX.N.MATCH.CGS CRESCENT.POISE.CGS CRESCENT.SET.CGS
## [1,] -0.3562077 -0.2647972 -0.05655066
## [2,] -0.4038287 -0.2647972 0.08541278
## [3,] -0.5869480 -0.2647972 0.03251893
## BLINK.CGS SAHAR.CGS
## [1,] -0.5148522 -0.694181
## [2,] 0.2722038 -0.694181
## [3,] -0.3172879 -0.694181
Compute the dissimilarity matrix
# Pairwise Euclidean distances between the rows of df,
# where df is the standardized data.
res.dist <- dist(x = df, method = "euclidean")
Convert the distance object to a matrix and view its first six rows and columns
# Show the top-left 6 x 6 corner of the full distance matrix.
as.matrix(res.dist)[seq_len(6), seq_len(6)]
## 1 2 3 4 5 6
## 1 0.0000000 3.866818 0.5605804 4.786394 4.319814 5.648856
## 2 3.8668183 0.000000 3.8086091 4.316297 4.521550 3.743348
## 3 0.5605804 3.808609 0.0000000 4.836293 4.261080 5.708326
## 4 4.7863935 4.316297 4.8362929 0.000000 4.021744 7.013343
## 5 4.3198144 4.521550 4.2610797 4.021744 0.000000 6.782814
## 6 5.6488563 3.743348 5.7083261 7.013343 6.782814 0.000000
R base function hclust() can be used to create the hierarchical tree
# Build the hierarchical tree from the distance object with "ward.D2" linkage.
res.hc <- hclust(d = res.dist, method = "ward.D2")
Next, we visualize the dendrogram
# cex: label size
library("factoextra")
## Warning: package 'factoextra' was built under R version 3.6.2
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Draw the dendrogram of the fitted tree; cex sets the label size.
fviz_dend(x = res.hc, cex = 0.5)
The height at which two observations are first joined in the tree is called their cophenetic distance. Let's verify that the distances in the tree reflect the original distances accurately.
# Cophenetic distances implied by the fitted tree.
res.coph <- cophenetic(x = res.hc)
# Correlate the original pairwise distances with the cophenetic distances
# to see how well the tree preserves them.
cor(x = res.dist, y = res.coph)
## [1] 0.5972371
The correlation is about 0.60 (roughly 60%), which is only moderate: the tree preserves the original distances imperfectly. Now we need to cut the tree into different groups.
# Assign every observation to one of 7 clusters by cutting the tree.
grp <- cutree(tree = res.hc, k = 7)
# Cluster labels of the first 7 observations.
head(x = grp, n = 7)
## [1] 1 2 1 2 3 4 5
Number of members in each group
# Count how many observations were assigned to each cluster label in grp.
table(grp)
## grp
## 1 2 3 4 5 6 7
## 25 29 25 38 46 17 5
So there are 25 members in the first group.
# Get the members of cluster 1.
# df is the matrix returned by scale(); as its printed rows ([1,], [2,], ...)
# show, it carries no row names, so rownames(df)[grp == 1] yields NULL.
# The original data frame still has row names, so index those instead.
rownames(df_Original)[grp == 1]
# Redraw the dendrogram cut into 7 groups, one color per group.
fviz_dend(
  res.hc,
  k = 7,                     # number of groups to cut the tree into
  cex = 0.5,                 # label size
  k_colors = c(
    "#9400D3", "#4B0082", "#0000FF", "#00FF00",
    "#FFFF00", "#FF7F00", "#FF0000"
  ),
  color_labels_by_k = TRUE,  # color labels by group
  rect = TRUE                # draw a rectangle around each group
)
We can now draw the cluster plot
# Plot the observations colored by their cluster assignment.
fviz_cluster(
  list(data = df, cluster = grp),
  palette = c(
    "#9400D3", "#4B0082", "#0000FF", "#00FF00",
    "#FFFF00", "#FF7F00", "#FF0000"
  ),
  ellipse.type = "convex",   # outline each cluster with its convex hull
  repel = TRUE,              # avoid label overplotting (slow)
  show.clust.cent = FALSE,   # do not mark the cluster centers
  ggtheme = theme_minimal()
)
Attach the cluster assignments to the original data
# Append each row's cluster label to the unscaled data as a new column, grp.
ddH <- cbind(df_Original, grp)
# Preview the first 3 rows of the combined data.
head(x = ddH, n = 3)
## Store.Area East West North South CRESCENT.NS CRESCENT.MIX.N.MATCH.NS
## 1 8154 1 0 0 0 0 523482
## 2 11032 0 1 0 0 0 479127
## 3 8374 1 0 0 0 0 456678
## CRESCENT.POISE.NS CRESCENT.SET.NS BLINK.NS SAHAR.NS CRESCENT.CD
## 1 0 224349 88251 0 0
## 2 0 224900 151922 0 0
## 3 0 245384 109682 0 0
## CRESCENT.MIX.N.MATCH.CD CRESCENT.POISE.CD CRESCENT.SET.CD BLINK.CD
## 1 253127 0 118631 47095
## 2 268028 0 138734 85511
## 3 191773 0 113301 49827
## SAHAR.CD CRESCENT.CGS CRESCENT.MIX.N.MATCH.CGS CRESCENT.POISE.CGS
## 1 0 0 306384 0
## 2 0 0 295954 0
## 3 0 0 255847 0
## CRESCENT.SET.CGS BLINK.CGS SAHAR.CGS grp
## 1 133801 54759 0 1
## 2 145927 99258 0 2
## 3 141409 65929 0 1
Let's write the result to a CSV file for further analysis
# Save the data together with its cluster labels to disk.
write.csv(ddH, file = "ddH.csv")