# Hierarchical Clustering (Agglomerative Method)
# Read the data set.
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 changed the
# read.csv() default to FALSE; without it Genre is read as character rather
# than the two-level factor shown in the recorded str() output below.
# NOTE(review): the path is an absolute, machine-specific Windows path; the
# script only runs where this exact file location exists.
mall_cust <- read.csv(
  "C:\\Users\\samy_\\Desktop\\R_Python_Machine Learning DataSets\\Mall_Customers.csv",
  stringsAsFactors = TRUE
)
# Show the column types and the first few values of every column
str(mall_cust)
## 'data.frame': 200 obs. of 5 variables:
## $ CustomerID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Genre : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 2 1 ...
## $ Age : int 19 21 20 23 31 22 35 23 64 30 ...
## $ Annual.Income..k.. : int 15 15 16 16 17 17 18 18 19 19 ...
## $ Spending.Score..1.100.: int 39 81 6 77 40 76 6 94 3 72 ...
# Standardise the numeric columns before computing distances; Genre is a
# factor and is therefore left out. scale() centres each column and divides
# it by its standard deviation, so no single column dominates the distance.
# NOTE(review): CustomerID looks like a row identifier, yet it is one of the
# clustering features here -- confirm that this is intended.
normalized_data <- scale(
  mall_cust[, c(
    "CustomerID", "Age", "Annual.Income..k..", "Spending.Score..1.100."
  )]
)
# List the columns that ended up in the scaled matrix
colnames(normalized_data)
## [1] "CustomerID" "Age"
## [3] "Annual.Income..k.." "Spending.Score..1.100."
# Calculate the distance matrix: pairwise Euclidean distances between the
# rows of the scaled data
d <- dist(normalized_data, method = "euclidean")
# Hierarchical clustering (agglomerative) with complete linkage
fit <- hclust(d, method = "complete")
# Plot the dendrogram with the default label placement
plot(fit)

# Plot it again; a negative `hang` makes the labels hang down from 0
plot(fit, hang = -1)

# Cut the dendrogram into flat cluster assignments
groups <- cutree(tree = fit, k = 4)   # by requested number of clusters
groups1 <- cutree(tree = fit, h = 4)  # by cut height on the tree
# Optional: outline the k = 4 clusters on the current dendrogram plot
#rect.hclust(fit, k=4, border = "red")
# Cluster labels as a one-column matrix
membership <- as.matrix(groups)
# Original data with the cluster label appended as the last column
final <- data.frame(mall_cust, membership = membership)
# Same data with the cluster label moved to the front
# NOTE(review): final1 is not used by any statement visible in this script;
# confirm whether it, rather than final, was meant to be exported.
final1 <- final[, c("membership", setdiff(names(final), "membership"))]
# Export the labelled data to the working directory
# NOTE(review): write.csv() also writes the row names as a leading column.
write.csv(x = final, file = "final.csv")
# Per-cluster mean of every numeric column (the Genre factor is excluded).
# The grouping list is deliberately unnamed so the key column stays "Group.1".
# NOTE(review): the CustomerID mean is an average of identifiers and is
# probably not meaningful.
aggregate(
  final[, c(
    "CustomerID", "Age", "Annual.Income..k..", "Spending.Score..1.100."
  )],
  by = list(final[["membership"]]),
  FUN = mean
)
## Group.1 CustomerID Age Annual.Income..k.. Spending.Score..1.100.
## 1 1 55.45161 26.45161 40.70968 57.75806
## 2 2 68.65574 54.42623 47.65574 41.77049
## 3 3 162.00000 32.69231 86.53846 82.12821
## 4 4 162.00000 40.39474 87.00000 18.63158
# Number of records in each cluster.
# sum() over the logical mask counts the matching rows directly, so the
# result does not depend on a hard-coded row count (previously the proportion
# was multiplied by a literal 200) and avoids floating-point arithmetic.
sum(final$membership == 1)
## [1] 62
sum(final$membership == 2)
## [1] 61
sum(final$membership == 3)
## [1] 39
sum(final$membership == 4)
## [1] 38