# Hierarchical Clustering (Agglomerative Method)

# Read the dataset ----
# NOTE(review): absolute, machine-specific path; point `data_path` at the
# local copy of Mall_Customers.csv before running.
data_path <- "C:\\Users\\samy_\\Desktop\\R_Python_Machine Learning DataSets\\Mall_Customers.csv"
mall_cust <- read.csv(data_path)
str(mall_cust)
## 'data.frame':    200 obs. of  5 variables:
##  $ CustomerID            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Genre                 : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 2 1 ...
##  $ Age                   : int  19 21 20 23 31 22 35 23 64 30 ...
##  $ Annual.Income..k..    : int  15 15 16 16 17 17 18 18 19 19 ...
##  $ Spending.Score..1.100.: int  39 81 6 77 40 76 6 94 3 72 ...
# Sometimes the data needs to be manipulated before clustering

# Normalize the clustering features (numeric columns only) ----
# CustomerID is a row identifier, not a customer attribute, so it is kept out
# of the feature matrix: otherwise it would contribute to every pairwise
# distance. Columns are selected by name so a reordered file cannot silently
# change the feature set.
feature_cols <- c("Age", "Annual.Income..k..", "Spending.Score..1.100.")
normalized_data <- scale(mall_cust[, feature_cols])
colnames(normalized_data)
## [1] "Age"                    "Annual.Income..k.."
## [3] "Spending.Score..1.100."

# Calculate the distance matrix ----
d <- dist(normalized_data, method = "euclidean")

# Hierarchical clustering (agglomerative, complete linkage) ----
fit <- hclust(d, method = "complete")

# Plot the dendrogram ----
plot(fit)

# hang = -1 draws all leaf labels at the same level
plot(fit, hang = -1)

# Cut the tree into groups ----
groups <- cutree(fit, k = 4)  # ask for a fixed number of groups
groups1 <- cutree(fit, h = 4) # cut at a fixed height instead
# To see how the tree has been cut (draw rectangles around the clusters)
# rect.hclust(fit, k = 4, border = "red")

# Attach the cluster membership to the original data ----
membership <- as.matrix(groups)
final <- data.frame(mall_cust, membership)
# Same data with the membership column moved to the front
final1 <- final[, c(ncol(final), seq_len(ncol(final) - 1))]

# Write the data to a csv file ----
# NOTE(review): `final1` (membership first) is built above, but `final` is
# what gets written here -- confirm which column layout the CSV should have.
write.csv(final, file = "final.csv")

# Per-cluster summary statistics ----
# Only the numeric features are averaged; the mean of an identifier column
# carries no information.
aggregate(final[, feature_cols], by = list(cluster = final$membership), FUN = mean)

# Number of records in each cluster ----
# table() counts every cluster label present, independent of the number of
# rows in the data set or the number of clusters requested.
table(final$membership)