Customer Segmentation K-Means Clustering
# Load required libraries
library(ggplot2)
install.packages("dplyr")
Error in install.packages : Updating loaded packages
library(dplyr)
library(factoextra)
library(cluster)
library(fpc)
install.packages("flexclust")
Error in install.packages : Updating loaded packages
library(flexclust)
install.packages("mclust")
Error in install.packages : Updating loaded packages
library(mclust)
install.packages("clusterSim")
Error in install.packages : Updating loaded packages
library(cluster)
install.packages("hopkins")
Error in install.packages : Updating loaded packages
library(hopkins)
library(tidytext)
library(scatterplot3d)
library(tidyverse)
# Load the Online Retail dataset
data <- Online_Retail
# Data preprocessing
data <- data %>%
dplyr::filter(!is.na(CustomerID)) %>%
dplyr::select(CustomerID, InvoiceNo, InvoiceDate, UnitPrice, Description)
# Concatenate the transaction descriptions for each customer
rfm_with_descriptions <- data %>%
group_by(CustomerID) %>%
summarise(Descriptions = paste(Description, collapse = ", "))
# Calculate total spending (monetary value) per customer
monetary <- data %>%
group_by(CustomerID) %>%
summarise(monetary = sum(UnitPrice))
# Calculate the recency and frequency variables
# Recency is calculated by the time elapsed since the first day of the dataset
# Frequency is calculated by summing up the distinct invoices of a customer
recency <- data %>%
group_by(CustomerID) %>%
summarise(recency = as.numeric(difftime(max(InvoiceDate), max(data$InvoiceDate), units = "days")),
frequency = n_distinct(InvoiceNo))
# Merge RFM variables with monetary value
rfm <- left_join(recency, monetary, by = "CustomerID")
# Data normalization
rfm$recency_scaled <- scale(rfm$recency)
rfm$frequency_scaled <- scale(rfm$frequency)
rfm$monetary_scaled <- scale(rfm$monetary)
# Prepare data frame to use for clustering
kmeans_data <- rfm[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
# Perform PCA and extract results
pca <- prcomp(kmeans_data, scale. = TRUE)
pca_results <- factoextra::get_pca(pca)
plot(pca)
summary(pca)
Importance of components:
PC1 PC2 PC3
Standard deviation 1.2827 0.9555 0.6646
Proportion of Variance 0.5484 0.3044 0.1472
Cumulative Proportion 0.5484 0.8528 1.0000
biplot(pca, scale. = TRUE)
# Calculate the Hopkins statistic
hopkins_stat <- hopkins::hopkins(X = as.matrix(kmeans_data), m = nrow(kmeans_data) - 1, method = "simple")
cat("Hopkins Statistic:", hopkins_stat, "\n")
Hopkins Statistic: 0.9967286
# Determine and visualise the optimal number of clusters
#fviz_nbclust(kmeans_data, kmeans, method = "wss")
#fviz_nbclust(kmeans_data, kmeans, method = "silhouette")
#fviz_nbclust(kmeans_data, kmeans, method = "gap_stat")
# Prompt the user to choose the number of clusters based on the elbow plot
chosen_k <- readline(prompt = "Enter the optimal number of clusters based on the elbow plot: ")
7
chosen_k <- as.integer(chosen_k)
# Perform K-means clustering with chosen number of clusters
set.seed(123)
kmeans_model <- kmeans(kmeans_data, centers = chosen_k, nstart = 25)
# Add cluster labels to the original dataset
rfm$cluster <- as.factor(kmeans_model$cluster)
# Visualize the clusters
fviz_cluster(kmeans_model, data = kmeans_data)
# Create a 3D scatter plot
scatterplot3d(kmeans_data$recency_scaled, kmeans_data$frequency_scaled, kmeans_data$monetary_scaled,
color = kmeans_model$cluster,
main = "3D Scatter Plot of RFM Clusters",
xlab = "Recency",
ylab = "Frequency",
zlab = "Monetary")
legend("topright", legend = levels(as.factor(kmeans_model$cluster)), col = 1:max(kmeans_model$cluster), pch = 1, cex = 1.2)
# Visualize data points plotted against recency, monetary, and frequency
ggplot(rfm, aes(x = recency, y = monetary, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Monetary", color = "Cluster") +
theme_minimal()
ggplot(rfm, aes(x = recency, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Frequency", color = "Cluster") +
theme_minimal()
ggplot(rfm, aes(x = monetary, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Monetary", y = "Frequency", color = "Cluster") +
theme_minimal()
# Cluster analysis
cluster_analysis <- rfm %>%
group_by(cluster) %>%
summarise(average_recency = mean(recency),
average_frequency = mean(frequency),
average_monetary = mean(monetary),
count_customers = n())
print(cluster_analysis)
# Silhouette analysis
sil <- silhouette(kmeans_model$cluster, dist(kmeans_data))
avg_silhouette <- mean(sil[, "sil_width"])
cat("Average Silhouette Width:", avg_silhouette, "\n")
Average Silhouette Width: 0.5094448
# Calculate clustering indices using cluster.stats
clustering_indices <- cluster.stats(dist(kmeans_data), kmeans_model$cluster)
print(clustering_indices)
$n
[1] 4372
$cluster.number
[1] 7
$cluster.size
[1] 48 617 2462 471 4 3 767
$min.cluster.size
[1] 3
$noisen
[1] 0
$diameter
[1] 13.829273 1.721251 1.977773 4.432850 14.771437 12.673996 3.480057
$average.distance
[1] 3.6092760 0.5556253 0.5007617 0.9763928 10.7583198 10.5712937 0.5532323
$median.distance
[1] 2.5705356 0.4949363 0.4762083 0.8248311 12.4094830 11.8860221 0.4993416
$separation
[1] 0.216424551 0.008527502 0.008876033 0.023725235 9.252061626 10.390085232 0.008527502
$average.toother
[1] 6.035026 2.562908 1.991939 2.035711 25.705291 28.374071 1.546554
$separation.matrix
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,] 0.0000000 3.647130892 2.873742316 0.21642455 9.252062 10.66982 2.953838122
[2,] 3.6471309 0.000000000 1.304904340 1.48149596 18.905945 22.85383 0.008527502
[3,] 2.8737423 1.304904340 0.000000000 0.02372524 17.989815 22.76257 0.008876033
[4,] 0.2164246 1.481495960 0.023725235 0.00000000 16.439481 19.89367 0.346069518
[5,] 9.2520616 18.905945115 17.989814732 16.43948070 0.000000 10.39009 16.260864040
[6,] 10.6698195 22.853832079 22.762571910 19.89366834 10.390085 0.00000 21.945623396
[7,] 2.9538381 0.008527502 0.008876033 0.34606952 16.260864 21.94562 0.000000000
$ave.between.matrix
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,] 0.000000 6.891162 5.956237 4.652317 23.77636 23.31331 6.288220
[2,] 6.891162 0.000000 2.620755 3.326089 25.98129 28.88034 1.412632
[3,] 5.956237 2.620755 0.000000 1.539219 25.75248 28.49905 1.288422
[4,] 4.652317 3.326089 1.539219 0.000000 25.10700 27.14937 2.209086
[5,] 23.776361 25.981290 25.752477 25.107002 0.00000 22.54987 25.832261
[6,] 23.313310 28.880335 28.499050 27.149372 22.54987 0.00000 28.664789
[7,] 6.288220 1.412632 1.288422 2.209086 25.83226 28.66479 0.000000
$average.between
[1] 2.141269
$average.within
[1] 0.619373
$n.between
[1] 5929896
$n.within
[1] 3625110
$max.diameter
[1] 14.77144
$min.separation
[1] 0.008527502
$within.cluster.ss
[1] 1826.977
$clus.avg.silwidths
1 2 3 4 5 6 7
0.1760061 0.5717862 0.5580084 0.3192309 0.4650750 0.4981160 0.4413600
$avg.silwidth
[1] 0.5094448
$g2
NULL
$g3
NULL
$pearsongamma
[1] 0.4099661
$dunn
[1] 0.0005772967
$dunn2
[1] 0.1197605
$entropy
[1] 1.206027
$wb.ratio
[1] 0.2892551
$ch
[1] 4494.081
$cwidegap
[1] 4.3314064 0.3625535 0.5992575 2.8213618 11.8906586 11.8860221 1.4109591
$widestgap
[1] 11.89066
$sindex
[1] 0.09630851
$corrected.rand
NULL
$vi
NULL
# Extract the Dunn index from the clustering indices list
dunn_index <- clustering_indices$dunn
cat("Dunn Index:", dunn_index, "\n")
Dunn Index: 0.0005772967
# Calculate the Davies-Bouldin Index
db_index <- clusterSim::index.DB(kmeans_data, kmeans_model$cluster)
print(db_index)
$DB
[1] 0.7757305
$r
[1] 0.9758716 0.6732245 0.8507658 0.9758716 0.6272708 0.6272708 0.6998387
$R
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,] Inf 0.5729061 0.6528990 0.9758716 0.4403419 0.4157323 0.6286032
[2,] 0.5729061 Inf 0.3318168 0.3970572 0.2918980 0.2350118 0.6732245
[3,] 0.6528990 0.3318168 Inf 0.8507658 0.2915483 0.2356091 0.6998387
[4,] 0.9758716 0.3970572 0.8507658 Inf 0.3161507 0.2633344 0.6111041
[5,] 0.4403419 0.2918980 0.2915483 0.3161507 Inf 0.6272708 0.2932302
[6,] 0.4157323 0.2350118 0.2356091 0.2633344 0.6272708 Inf 0.2365250
[7,] 0.6286032 0.6732245 0.6998387 0.6111041 0.2932302 0.2365250 Inf
$d
1 2 3 4 5 6 7
1 0.000000 6.519704 5.610816 4.189210 23.39613 22.92693 5.930687
2 6.519704 0.000000 2.599600 3.242420 25.69935 28.64009 1.377465
3 5.610816 2.599600 0.000000 1.428765 25.48363 28.26239 1.222369
4 4.189210 3.242420 1.428765 0.000000 24.84430 26.90007 2.095054
5 23.396125 25.699351 25.483629 24.844302 0.00000 21.19958 25.558278
6 22.926931 28.640094 28.262394 26.900068 21.19958 0.00000 28.426716
7 5.930687 1.377465 1.222369 2.095054 25.55828 28.42672 0.000000
$S
[1] 3.2679420 0.4672364 0.3953546 0.8201897 7.0343533 6.2635227 0.4601066
$centers
[,1] [,2] [,3]
[1,] 0.8675599 5.0559406 1.93721894
[2,] -2.0354305 -0.3824309 -0.18487878
[3,] 0.5534493 -0.1655285 -0.09227753
[4,] 0.7556678 1.1479795 0.43229235
[5,] 0.2267805 2.8027850 25.21577909
[6,] 0.9023141 22.3360828 17.00505334
[7,] -0.6622009 -0.2843701 -0.13978747
# Merge the cluster labels with the transaction descriptions
rfm_with_descriptions <- left_join(rfm_with_descriptions, rfm, by = "CustomerID")
# Text Mining on the concatenated transaction descriptions of each cluster
cluster_texts <- rfm_with_descriptions %>%
group_by(cluster) %>%
summarise(Descriptions = paste(Descriptions, collapse = ", ")) %>%
unnest_tokens(output = "word", input = Descriptions) %>%
anti_join(stop_words)
Joining with `by = join_by(word)`
# Calculate the frequency of each word in the entire dataset
word_counts_total <- cluster_texts %>%
count(word, sort = TRUE)
# Get the top 10 words in the entire dataset
top_words_total <- word_counts_total %>%
top_n(n = 10, wt = n) %>%
rename(frequency = n)
# Print the top words in the entire dataset
cat("Top Words in the Entire Dataset:\n")
Top Words in the Entire Dataset:
print(top_words_total)
# Get the top 10 words for each cluster
top_words <- cluster_texts %>%
count(cluster, word, sort = TRUE) %>%
group_by(cluster) %>%
top_n(n = 10, wt = n) %>%
rename(frequency = n)
# Print the data frames for each cluster
cluster_word_counts <- split(top_words, top_words$cluster)
for (i in seq_along(cluster_word_counts)) {
cat("Cluster", i, "Top Words:\n")
print(cluster_word_counts[[i]])
cat("\n")
}
Cluster 1 Top Words:
Cluster 2 Top Words:
Cluster 3 Top Words:
Cluster 4 Top Words:
Cluster 5 Top Words:
Cluster 6 Top Words:
Cluster 7 Top Words:
NA
K-Means Redone after filtering out outliers
# Remove observations in cluster 3
rfm_filtered <- rfm[rfm$cluster != 3, ]
# Prepare filtered data frame for clustering
kmeans_data_filtered <- rfm_filtered[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
# Perform PCA and extract results
pca <- prcomp(kmeans_data_filtered, scale. = TRUE)
pca_results <- factoextra::get_pca(pca)
plot(pca)
summary(pca)
Importance of components:
PC1 PC2 PC3
Standard deviation 1.356 0.9076 0.5807
Proportion of Variance 0.613 0.2746 0.1124
Cumulative Proportion 0.613 0.8876 1.0000
biplot(pca, scale. = TRUE)
# Calculate the Hopkins statistic
hopkins_stat_filtered <- hopkins::hopkins(X = as.matrix(kmeans_data_filtered), m = nrow(kmeans_data_filtered) - 1, method = "simple")
cat("Hopkins Statistic (Filtered):", hopkins_stat_filtered, "\n")
Hopkins Statistic (Filtered): 0.9991266
# Determine and visualise the optimal number of clusters
fviz_nbclust(kmeans_data_filtered, kmeans, method = "wss")
fviz_nbclust(kmeans_data_filtered, kmeans, method = "silhouette")
fviz_nbclust(kmeans_data_filtered, kmeans, method = "gap_stat")
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 100) [one "." per sample]:
....
Warning: Quick-TRANSfer stage steps exceeded maximum (= 218250)
.............................................. 50
............................
Warning: did not converge in 10 iterations
...................... 100
# Perform K-means clustering on filtered data
set.seed(123)
kmeans_model_filtered <- kmeans(kmeans_data_filtered, centers = chosen_k, nstart = 25)
# Add cluster labels to the original dataset
rfm_filtered$cluster <- as.factor(kmeans_model_filtered$cluster)
# Visualize the clusters after filtering
fviz_cluster(kmeans_model_filtered, data = kmeans_data_filtered)
# Create a 3D scatter plot
scatterplot3d(kmeans_data_filtered$recency_scaled, kmeans_data_filtered$frequency_scaled, kmeans_data_filtered$monetary_scaled,
color = kmeans_model_filtered$cluster,
main = "3D Scatter Plot of RFM Clusters",
xlab = "Recency",
ylab = "Frequency",
zlab = "Monetary")
legend("topright", legend = levels(as.factor(kmeans_model_filtered$cluster)), col = 1:max(kmeans_model_filtered$cluster), pch = 1, cex = 1.2)
# Visualize data points plotted against recency, monetary, and frequency after filtering
ggplot(rfm_filtered, aes(x = recency, y = monetary, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Monetary", color = "Cluster") +
theme_minimal()
ggplot(rfm_filtered, aes(x = recency, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Frequency", color = "Cluster") +
theme_minimal()
ggplot(rfm_filtered, aes(x = monetary, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Monetary", y = "Frequency", color = "Cluster") +
theme_minimal()
# Cluster analysis on filtered data
cluster_analysis_filtered <- rfm_filtered %>%
group_by(cluster) %>%
summarise(average_recency = mean(recency),
average_frequency = mean(frequency),
average_monetary = mean(monetary),
count_customers = n())
print(cluster_analysis_filtered)
# Silhouette analysis on filtered data
sil_filtered <- silhouette(kmeans_model_filtered$cluster, dist(kmeans_data_filtered))
avg_silhouette_filtered <- mean(sil_filtered[, "sil_width"])
cat("Average Silhouette Width (filtered data):", avg_silhouette_filtered, "\n")
Average Silhouette Width (filtered data): 0.5383663
# Calculate clustering indices using cluster.stats on filtered data
clustering_indices_filtered <- cluster.stats(dist(kmeans_data_filtered), kmeans_model_filtered$cluster)
print(clustering_indices_filtered)
$n
[1] 4365
$cluster.number
[1] 4
$cluster.size
[1] 2733 527 1050 55
$min.cluster.size
[1] 55
$noisen
[1] 0
$diameter
[1] 2.475988 5.103040 2.463134 13.829273
$average.distance
[1] 0.5799676 0.9411030 0.8045243 3.5230297
$median.distance
[1] 0.5368057 0.7973409 0.7098496 2.5988504
$separation
[1] 0.01229936 0.02016448 0.01229936 0.11192763
$average.toother
[1] 2.004758 1.915521 2.266522 5.715890
$separation.matrix
[,1] [,2] [,3] [,4]
[1,] 0.00000000 0.02016448 0.01229936 2.7028272
[2,] 0.02016448 0.00000000 0.34277648 0.1119276
[3,] 0.01229936 0.34277648 0.00000000 2.4331014
[4,] 2.70282717 0.11192763 2.43310143 0.0000000
$ave.between.matrix
[,1] [,2] [,3] [,4]
[1,] 0.000000 1.496080 2.066557 5.699031
[2,] 1.496080 0.000000 2.873580 4.467709
[3,] 2.066557 2.873580 0.000000 6.386240
[4,] 5.699031 4.467709 6.386240 0.000000
$average.between
[1] 2.162626
$average.within
[1] 0.7146689
$n.between
[1] 5100341
$n.within
[1] 4424089
$max.diameter
[1] 13.82927
$min.separation
[1] 0.01229936
$within.cluster.ss
[1] 1963.399
$clus.avg.silwidths
1 2 3 4
0.5738919 0.3302756 0.5701521 0.1601315
$avg.silwidth
[1] 0.5383663
$g2
NULL
$g3
NULL
$pearsongamma
[1] 0.6268952
$dunn
[1] 0.0008893713
$dunn2
[1] 0.4246573
$entropy
[1] 0.9462661
$wb.ratio
[1] 0.3304634
$ch
[1] 4357.639
$cwidegap
[1] 0.5992575 1.5073112 0.6296994 4.3314064
$widestgap
[1] 4.331406
$sindex
[1] 0.1270792
$corrected.rand
NULL
$vi
NULL
# Extract the Dunn index from the clustering indices list for filtered data
dunn_index_filtered <- clustering_indices_filtered$dunn
cat("Dunn Index (filtered data):", dunn_index_filtered, "\n")
Dunn Index (filtered data): 0.0008893713
# Calculate the Davies-Bouldin Index for filtered data
db_index_filtered <- clusterSim::index.DB(kmeans_data_filtered, kmeans_model_filtered$cluster)
print(db_index_filtered)
$DB
[1] 0.8847213
$r
[1] 0.9225626 0.9874744 0.6413737 0.9874744
$R
[,1] [,2] [,3] [,4]
[1,] Inf 0.9225626 0.5626750 0.6812017
[2,] 0.9225626 Inf 0.5281891 0.9874744
[3,] 0.5626750 0.5281891 Inf 0.6413737
[4,] 0.6812017 0.9874744 0.6413737 Inf
$d
1 2 3 4
1 0.000000 1.360032 2.035747 5.357418
2 1.360032 0.000000 2.774485 4.019821
3 2.035747 2.774485 0.000000 6.018674
4 5.357418 4.019821 6.018674 0.000000
$S
[1] 0.4673633 0.7873519 0.6781006 3.1821184
$centers
[,1] [,2] [,3]
[1,] 0.4455741 -0.1901157 -0.1023983
[2,] 0.7504796 1.0389510 0.3937024
[3,] -1.5825233 -0.3510466 -0.1744255
[4,] 0.8141591 4.7716090 1.8843971
Hierarchical Clustering
# Load required libraries
library(ggplot2)
install.packages("dplyr")
Error in install.packages : Updating loaded packages
library(dplyr)
library(factoextra)
library(cluster)
library(fpc)
library(flexclust)
library(mclust)
library(clusterSim)
library(hopkins)
# Load the Online Retail dataset
data <- Online_Retail
# Data preprocessing
data <- data %>%
dplyr::filter(!is.na(CustomerID)) %>%
dplyr::select(CustomerID, InvoiceNo, InvoiceDate, UnitPrice)
# Calculate total spending (monetary value) per customer
monetary <- data %>%
group_by(CustomerID) %>%
summarise(monetary = sum(UnitPrice))
# Calculate the recency and frequency variables
# Recency is calculated by the time elapsed since the first day of the dataset
# Frequency is calculated by summing up the distinct invoices of a customer
recency <- data %>%
group_by(CustomerID) %>%
summarise(recency = as.numeric(difftime(max(InvoiceDate), max(data$InvoiceDate), units = "days")),
frequency = n_distinct(InvoiceNo))
# Merge RFM variables with monetary value
rfm <- left_join(recency, monetary, by = "CustomerID")
# Calculate the Hopkins statistic
hopkins_stat <- hopkins::hopkins(X = as.matrix(rfm), m = nrow(rfm) - 1, method = "simple")
cat("Hopkins Statistic:", hopkins_stat, "\n")
Hopkins Statistic: 0.993877
# Determine and visualise the optimal number of clusters up to 10 with bootstrapping up to 10
fviz_nbclust(hclust_data, hcut, method = "wss")
fviz_nbclust(hclust_data, hcut, method = "silhouette")
gap_stat_hclust <- clusGap(hclust_data, hcut, K.max = 10, B = 10)
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 10) [one "." per sample]:
.......... 10
fviz_gap_stat(gap_stat_hclust)
# Data normalization
rfm$recency_scaled <- scale(rfm$recency)
rfm$frequency_scaled <- scale(rfm$frequency)
rfm$monetary_scaled <- scale(rfm$monetary)
# Perform hierarchical clustering
hclust_data <- rfm[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
Error in exists(cacheKey, where = .rs.WorkingDataEnv, inherits = FALSE) :
invalid first argument
Error in assign(cacheKey, frame, .rs.CachedDataEnv) :
attempt to use zero-length variable name
hclust_model <- hclust(dist(hclust_data))
# Perform PCA and extract results
pca <- prcomp(hclust_data, scale. = TRUE)
pca_results <- factoextra::get_pca(pca)
plot(pca)
summary(pca)
Importance of components:
PC1 PC2 PC3
Standard deviation 1.2827 0.9555 0.6646
Proportion of Variance 0.5484 0.3044 0.1472
Cumulative Proportion 0.5484 0.8528 1.0000
biplot(pca, scale. = TRUE)
# Specify the desired number of clusters
desired_clusters <- 4
# Determine the cutoff height for the desired number of clusters
cutoff_height <- hclust_model$height[length(hclust_model$height) - (desired_clusters - 1)]
cat("Cut-off height at ", desired_clusters, " clusters:", cutoff_height, "\n")
Cut-off height at 4 clusters: 14.77144
# Plot the dendogram with an indication of the cut-off
plot(hclust_model, labels = FALSE)
abline(h = cutoff_height, col = "red", lty = 2)
# Determine the tree cut for a desired number of clusters
cut <- cutree(hclust_model, k = desired_clusters)
# Add cluster labels to the original dataset
rfm$cluster <- as.factor(cut)
# Visualize data points plotted against recency, monetary, and frequency
ggplot(rfm, aes(x = recency, y = monetary, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Monetary", color = "Cluster") +
theme_minimal()
ggplot(rfm, aes(x = recency, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Frequency", color = "Cluster") +
theme_minimal()
ggplot(rfm, aes(x = monetary, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Monetary", y = "Frequency", color = "Cluster") +
theme_minimal()
# Cluster analysis
cluster_analysis <- rfm %>%
group_by(cluster) %>%
summarise(average_recency = mean(recency),
average_frequency = mean(frequency),
average_monetary = mean(monetary),
count_customers = n())
print(cluster_analysis)
# Silhouette analysis
sil <- silhouette(cut, dist(hclust_data))
avg_silhouette <- mean(sil[, "sil_width"])
cat("Average Silhouette Width:", avg_silhouette, "\n")
Average Silhouette Width: 0.8031784
# Calculate clustering indices using cluster.stats
clustering_indices <- cluster.stats(dist(hclust_data), cut)
print(clustering_indices)
$n
[1] 4372
$cluster.number
[1] 4
$cluster.size
[1] 4340 25 4 3
$min.cluster.size
[1] 3
$noisen
[1] 0
$diameter
[1] 10.94129 10.17666 14.77144 12.67400
$average.distance
[1] 1.378864 3.601903 10.758320 10.571294
$median.distance
[1] 1.110563 3.180654 12.409483 11.886022
$separation
[1] 0.6352564 0.6352564 9.2520616 10.3900852
$average.toother
[1] 11.567897 7.307411 25.705291 28.374071
$separation.matrix
[,1] [,2] [,3] [,4]
[1,] 0.0000000 0.6352564 9.252062 18.40517
[2,] 0.6352564 0.0000000 13.717897 10.66982
[3,] 9.2520616 13.7178974 0.000000 10.39009
[4,] 18.4051659 10.6698195 10.390085 0.00000
$ave.between.matrix
[,1] [,2] [,3] [,4]
[1,] 0.000000 7.282036 25.71830 28.41620
[2,] 7.282036 0.000000 23.82517 21.99288
[3,] 25.718302 23.825174 0.00000 22.54987
[4,] 28.416196 21.992881 22.54987 0.00000
$average.between
[1] 11.58328
$average.within
[1] 1.406465
$n.between
[1] 139067
$n.within
[1] 9415939
$max.diameter
[1] 14.77144
$min.separation
[1] 0.6352564
$within.cluster.ss
[1] 6916.75
$clus.avg.silwidths
1 2 3 4
0.8054802 0.4963800 0.4678211 0.4770577
$avg.silwidth
[1] 0.8031784
$g2
NULL
$g3
NULL
$pearsongamma
[1] 0.6381906
$dunn
[1] 0.04300573
$dunn2
[1] 0.6768748
$entropy
[1] 0.04822162
$wb.ratio
[1] 0.121422
$ch
[1] 1304.332
$cwidegap
[1] 4.331406 3.433129 11.890659 11.886022
$widestgap
[1] 11.89066
$sindex
[1] 2.794022
$corrected.rand
NULL
$vi
NULL
# Extract the Dunn index from the clustering indices list
dunn_index <- clustering_indices$dunn
cat("Dunn Index:", dunn_index, "\n")
Dunn Index: 0.04300573
# Calculate the Davies-Bouldin Index
db_index <- clusterSim::index.DB(hclust_data, cut)
print(db_index)
$DB
[1] 0.6138877
$r
[1] 0.6005045 0.6005045 0.6272708 0.6272708
$R
[,1] [,2] [,3] [,4]
[1,] Inf 0.6005045 0.3243178 0.2654419
[2,] 0.6005045 Inf 0.4295965 0.4297899
[3,] 0.3243178 0.4295965 Inf 0.6272708
[4,] 0.2654419 0.4297899 0.6272708 Inf
$d
1 2 3 4
1 0.000000 7.085531 25.42511 28.16053
2 7.085531 0.000000 23.45872 21.65466
3 25.425107 23.458720 0.00000 21.19958
4 28.160533 21.654660 21.19958 0.00000
$S
[1] 1.211463 3.043431 7.034353 6.263523
$centers
[,1] [,2] [,3]
[1,] -0.005880044 -0.05629346 -0.04699245
[2,] 0.876213115 6.64376839 2.08275854
[3,] 0.226780490 2.80278496 25.21577909
[4,] 0.902314125 22.33608281 17.00505334
Hierarchical Clustering Redone after filtering out outliers
# Filter out observations in clusters other than cluster 1
rfm_filtered <- rfm[rfm$cluster == 1, ]
# Perform hierarchical clustering on the filtered dataset
hclust_data_filtered <- rfm_filtered[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
hclust_model_filtered <- hclust(dist(hclust_data_filtered))
# Calculate the Hopkins statistic
hopkins_stat <- hopkins::hopkins(X = as.matrix(hclust_data_filtered), m = nrow(hclust_data_filtered) - 1, method = "simple")
cat("Hopkins Statistic:", hopkins_stat, "\n")
Hopkins Statistic: 0.998526
# Perform PCA and extract results
pca <- prcomp(hclust_data, scale. = TRUE)
pca_results <- factoextra::get_pca(pca)
plot(pca)
summary(pca)
Importance of components:
PC1 PC2 PC3
Standard deviation 1.2827 0.9555 0.6646
Proportion of Variance 0.5484 0.3044 0.1472
Cumulative Proportion 0.5484 0.8528 1.0000
biplot(pca, scale. = TRUE)
# Determine and visualize the optimal number of clusters up to 10 with bootstrapping up to 10
fviz_nbclust(hclust_data_filtered, hcut, method = "wss")
fviz_nbclust(hclust_data_filtered, hcut, method = "silhouette")
gap_stat_hclust_filtered <- clusGap(hclust_data_filtered, hcut, K.max = 10, B = 10)
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 10) [one "." per sample]:
.......... 10
fviz_gap_stat(gap_stat_hclust_filtered)
# Specify the desired number of clusters
desired_clusters_filtered <- 4
# Determine the cutoff height for the desired number of clusters
cutoff_height_filtered <- hclust_model_filtered$height[length(hclust_model_filtered$height) - (desired_clusters_filtered - 1)]
cat("Cut-off height at", desired_clusters_filtered, "cluster(s):", cutoff_height_filtered, "\n")
Cut-off height at 4 cluster(s): 4.637608
# Plot the dendrogram with an indication of the cut-off
plot(hclust_model_filtered, labels = FALSE)
abline(h = cutoff_height_filtered, col = "red", lty = 2)
# Determine the tree cut for a desired number of clusters
cut_filtered <- cutree(hclust_model_filtered, k = desired_clusters_filtered)
# Add cluster labels to the filtered dataset
rfm_filtered$cluster <- as.factor(cut_filtered)
# Visualize data points plotted against recency, monetary, and frequency
ggplot(rfm_filtered, aes(x = recency, y = monetary, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Monetary", color = "Cluster") +
theme_minimal()
ggplot(rfm_filtered, aes(x = recency, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Frequency", color = "Cluster") +
theme_minimal()
ggplot(rfm_filtered, aes(x = monetary, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Monetary", y = "Frequency", color = "Cluster") +
theme_minimal()
# Cluster analysis
cluster_analysis_filtered <- rfm_filtered %>%
group_by(cluster) %>%
summarise(average_recency = mean(recency),
average_frequency = mean(frequency),
average_monetary = mean(monetary),
count_customers = n())
print(cluster_analysis_filtered)
# Silhouette analysis
sil_filtered <- silhouette(cut_filtered, dist(hclust_data_filtered))
avg_silhouette_filtered <- mean(sil_filtered[, "sil_width"])
cat("Average Silhouette Width:", avg_silhouette_filtered, "\n")
Average Silhouette Width: 0.5618641
# Calculate clustering indices using cluster.stats
clustering_indices_filtered <- cluster.stats(dist(hclust_data_filtered), cut_filtered)
print(clustering_indices_filtered)
$n
[1] 4340
$cluster.number
[1] 4
$cluster.size
[1] 697 3453 187 3
$min.cluster.size
[1] 3
$noisen
[1] 0
$diameter
[1] 1.637197 3.874338 3.821206 4.637608
$average.distance
[1] 0.5996526 0.8388544 1.2124658 3.1414414
$median.distance
[1] 0.5274782 0.7316045 1.0796680 4.3314064
$separation
[1] 0.02074356 0.02074356 0.10850601 3.13231909
$average.toother
[1] 2.427511 2.382063 2.671076 7.659358
$separation.matrix
[,1] [,2] [,3] [,4]
[1,] 0.00000000 0.02074356 1.954498 6.442425
[2,] 0.02074356 0.00000000 0.108506 3.159829
[3,] 1.95449833 0.10850601 0.000000 3.132319
[4,] 6.44242505 3.15982900 3.132319 0.000000
$ave.between.matrix
[,1] [,2] [,3] [,4]
[1,] 0.000000 2.346175 3.836342 8.228365
[2,] 2.346175 0.000000 2.432268 7.590602
[3,] 3.836342 2.432268 0.000000 6.808108
[4,] 8.228365 7.590602 6.808108 0.000000
$average.between
[1] 2.445977
$average.within
[1] 0.8181285
$n.between
[1] 3195802
$n.within
[1] 6219828
$max.diameter
[1] 4.637608
$min.separation
[1] 0.02074356
$within.cluster.ss
[1] 2052.066
$clus.avg.silwidths
1 2 3 4
0.7356960 0.5313029 0.4786358 0.5387765
$avg.silwidth
[1] 0.5618641
$g2
NULL
$g3
NULL
$pearsongamma
[1] 0.751978
$dunn
[1] 0.0044729
$dunn2
[1] 0.7468465
$entropy
[1] 0.6161344
$wb.ratio
[1] 0.3344792
$ch
[1] 3040.947
$cwidegap
[1] 0.3619369 1.4109591 2.8213618 4.3314064
$widestgap
[1] 4.331406
$sindex
[1] 0.1846325
$corrected.rand
NULL
$vi
NULL
# Extract the Dunn index from the clustering indices list
dunn_index_filtered <- clustering_indices_filtered$dunn
cat("Dunn Index:", dunn_index_filtered, "\n")
Dunn Index: 0.0044729
# Calculate the Davies-Bouldin Index
db_index_filtered <- clusterSim::index.DB(hclust_data_filtered, cut_filtered)
print(db_index_filtered)
$DB
[1] 0.6150994
$r
[1] 0.5214179 0.7381963 0.7381963 0.4625872
$R
[,1] [,2] [,3] [,4]
[1,] Inf 0.5214179 0.4004371 0.3234591
[2,] 0.5214179 Inf 0.7381963 0.3764138
[3,] 0.4004371 0.7381963 Inf 0.4625872
[4,] 0.3234591 0.3764138 0.4625872 Inf
$d
1 2 3 4
1 0.000000 2.305232 3.740158 8.118145
2 2.305232 0.000000 2.288395 7.485045
3 3.740158 2.288395 0.000000 6.729939
4 8.118145 7.485045 6.729939 0.000000
$S
[1] 0.5052015 0.6967875 0.9924968 2.1206864
$centers
[,1] [,2] [,3]
[1,] -1.9435668 -0.3838634 -0.18685373
[2,] 0.3412198 -0.1040612 -0.06213785
[3,] 0.7944171 2.0275954 0.63600386
[4,] 0.7862184 1.1341113 7.30636307
DBSCAN
# Load required libraries
library(ggplot2)
library(dplyr)
library(factoextra)
library(cluster)
library(fpc)
library(flexclust)
library(mclust)
library(clusterSim)
library(hopkins)
install.packages("dbscan")
Error in install.packages : Updating loaded packages
library(dbscan)
# Load the Online Retail dataset
data <- Online_Retail
# Data preprocessing
data <- data %>%
dplyr::filter(!is.na(CustomerID)) %>%
dplyr::select(CustomerID, InvoiceNo, InvoiceDate, UnitPrice)
# Calculate total spending (monetary value) per customer
monetary <- data %>%
group_by(CustomerID) %>%
summarise(monetary = sum(UnitPrice))
# Calculate the recency and frequency variables
# Recency is calculated by the time elapsed since the first day of the dataset
# Frequency is calculated by summing up the distinct invoices of a customer
recency <- data %>%
group_by(CustomerID) %>%
summarise(recency = as.numeric(difftime(max(InvoiceDate), max(data$InvoiceDate), units = "days")),
frequency = n_distinct(InvoiceNo))
# Merge RFM variables with monetary value
rfm <- left_join(recency, monetary, by = "CustomerID")
# Data normalization
rfm$recency_scaled <- scale(rfm$recency)
rfm$frequency_scaled <- scale(rfm$frequency)
rfm$monetary_scaled <- scale(rfm$monetary)
# Prepare data frame to use for clustering
dbscan_data <- rfm[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
# Perform PCA and extract results
pca <- prcomp(dbscan_data, scale. = TRUE)
pca_results <- factoextra::get_pca(pca)
plot(pca)
summary(pca)
Importance of components:
PC1 PC2 PC3
Standard deviation 1.2827 0.9555 0.6646
Proportion of Variance 0.5484 0.3044 0.1472
Cumulative Proportion 0.5484 0.8528 1.0000
biplot(pca, scale. = TRUE)
# Perform DBSCAN clustering
dbscan_model <- dbscan(dist(dbscan_data), eps = 0.5, minPts = 1000)
# Add cluster labels to the original dataset
rfm$cluster <- as.factor(as.integer(dbscan_model$cluster) + 1)
# Visualize the clusters
fviz_cluster(dbscan_model, data = dbscan_data)
# Visualize data points plotted against recency, monetary, and frequency
ggplot(rfm, aes(x = recency, y = monetary, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Monetary", color = "Cluster") +
theme_minimal()
ggplot(rfm, aes(x = recency, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Frequency", color = "Cluster") +
theme_minimal()
ggplot(rfm, aes(x = monetary, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Monetary", y = "Frequency", color = "Cluster") +
theme_minimal()
# Cluster analysis
cluster_analysis <- rfm %>%
group_by(cluster) %>%
summarise(average_recency = mean(recency),
average_frequency = mean(frequency),
average_monetary = mean(monetary),
count_customers = n())
print(cluster_analysis)
# Silhouette analysis
sil <- silhouette(as.integer(dbscan_model$cluster) + 1, dist(dbscan_data))
avg_silhouette <- mean(sil[, "sil_width"])
cat("Average Silhouette Width:", avg_silhouette, "\n")
Average Silhouette Width: 0.4640388
# Calculate clustering indices using cluster.stats
clustering_indices <- cluster.stats(dist(dbscan_data), as.integer(dbscan_model$cluster) + 1)
print(clustering_indices)
$n
[1] 4372
$cluster.number
[1] 2
$cluster.size
[1] 1570 2802
$min.cluster.size
[1] 1570
$noisen
[1] 0
$diameter
[1] 36.021083 1.813879
$average.distance
[1] 2.1677237 0.5851593
$median.distance
[1] 1.5659859 0.5479547
$separation
[1] 0.01612688 0.01612688
$average.toother
[1] 2.188771 2.188771
$separation.matrix
[,1] [,2]
[1,] 0.00000000 0.01612688
[2,] 0.01612688 0.00000000
$ave.between.matrix
[,1] [,2]
[1,] 0.000000 2.188771
[2,] 2.188771 0.000000
$average.between
[1] 2.188771
$average.within
[1] 1.153464
$n.between
[1] 4399140
$n.within
[1] 5155866
$max.diameter
[1] 36.02108
$min.separation
[1] 0.01612688
$within.cluster.ss
[1] 11030.5
$clus.avg.silwidths
1 2
-0.004612322 0.726630580
$avg.silwidth
[1] 0.4640388
$g2
NULL
$g3
NULL
$pearsongamma
[1] 0.3189999
$dunn
[1] 0.0004477068
$dunn2
[1] 1.009709
$entropy
[1] 0.6529006
$wb.ratio
[1] 0.5269915
$ch
[1] 825.0337
$cwidegap
[1] 11.890659 0.223929
$widestgap
[1] 11.89066
$sindex
[1] 0.1396465
$corrected.rand
NULL
$vi
NULL
# Extract the Dunn index from the clustering indices list
dunn_index <- clustering_indices$dunn
cat("Dunn Index:", dunn_index, "\n")
Dunn Index: 0.0004477068
# Calculate the Davies-Bouldin Index
db_index <- clusterSim::index.DB(dbscan_data, as.integer(dbscan_model$cluster) + 1)
print(db_index)
$DB
[1] 2.115439
$r
[1] 2.115439 2.115439
$R
[,1] [,2]
[1,] Inf 2.115439
[2,] 2.115439 Inf
$d
1 2
1 0.000000 1.438629
2 1.438629 0.000000
$S
[1] 2.576068 0.467264
$centers
[,1] [,2] [,3]
[1,] -0.8624138 0.2787846 0.16920057
[2,] 0.4832226 -0.1562069 -0.09480546
DBSCAN Redone after filtering out outliers
# Refilter the dataset to remove specific entries that are outliers
rfm_filtered <- rfm[-c(329, 2723, 4043, 331, 1301, 2028, 1301, 1896), ]
# Prepare data frame to use for clustering
dbscan_data_filtered <- rfm_filtered[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
# Perform PCA and extract results
pca <- prcomp(dbscan_data, scale. = TRUE)
pca_results <- factoextra::get_pca(pca)
plot(pca)
summary(pca)
Importance of components:
PC1 PC2 PC3
Standard deviation 1.2827 0.9555 0.6646
Proportion of Variance 0.5484 0.3044 0.1472
Cumulative Proportion 0.5484 0.8528 1.0000
biplot(pca, scale. = TRUE)
# Perform DBSCAN clustering
dbscan_model_filtered <- dbscan(dist(dbscan_data_filtered), eps = 0.5, minPts = 1000)
# Add cluster labels to the original dataset
# Because DBSCAN starts cluster labels from 0
rfm_filtered$cluster <- as.factor(as.integer(dbscan_model_filtered$cluster) + 1)
# Visualize the clusters
fviz_cluster(dbscan_model_filtered, data = dbscan_data_filtered)
# Visualize data points plotted against recency, monetary, and frequency
ggplot(rfm_filtered, aes(x = recency, y = monetary, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Monetary", color = "Cluster") +
theme_minimal()
ggplot(rfm_filtered, aes(x = recency, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Frequency", color = "Cluster") +
theme_minimal()
ggplot(rfm_filtered, aes(x = monetary, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Monetary", y = "Frequency", color = "Cluster") +
theme_minimal()
# Cluster analysis
cluster_analysis_filtered <- rfm_filtered %>%
group_by(cluster) %>%
summarise(average_recency = mean(recency),
average_frequency = mean(frequency),
average_monetary = mean(monetary),
count_customers = n())
print(cluster_analysis_filtered)
# Silhouette analysis
sil_filtered <- silhouette(as.integer(dbscan_model_filtered$cluster) + 1, dist(dbscan_data_filtered))
avg_silhouette_filtered <- mean(sil[, "sil_width"])
cat("Average Silhouette Width (Filtered):", avg_silhouette, "\n")
Average Silhouette Width (Filtered): 0.4640388
# Calculate clustering indices using cluster.stats
clustering_indices_filtered <- cluster.stats(dist(dbscan_data_filtered), as.integer(dbscan_model_filtered$cluster) + 1)
print(clustering_indices_filtered)
$n
[1] 4365
$cluster.number
[1] 2
$cluster.size
[1] 1563 2802
$min.cluster.size
[1] 1563
$noisen
[1] 0
$diameter
[1] 15.353404 1.813879
$average.distance
[1] 1.9474398 0.5851593
$median.distance
[1] 1.5428794 0.5479547
$separation
[1] 0.01612688 0.01612688
$average.toother
[1] 2.07797 2.07797
$separation.matrix
[,1] [,2]
[1,] 0.00000000 0.01612688
[2,] 0.01612688 0.00000000
$ave.between.matrix
[,1] [,2]
[1,] 0.00000 2.07797
[2,] 2.07797 0.00000
$average.between
[1] 2.07797
$average.within
[1] 1.072959
$n.between
[1] 4379526
$n.within
[1] 5144904
$max.diameter
[1] 15.3534
$min.separation
[1] 0.01612688
$within.cluster.ss
[1] 5837.759
$clus.avg.silwidths
1 2
0.04131742 0.71143944
$avg.silwidth
[1] 0.4714851
$g2
NULL
$g3
NULL
$pearsongamma
[1] 0.4750304
$dunn
[1] 0.001050378
$dunn2
[1] 1.067027
$entropy
[1] 0.6523029
$wb.ratio
[1] 0.5163494
$ch
[1] 1503.196
$cwidegap
[1] 4.331406 0.223929
$widestgap
[1] 4.331406
$sindex
[1] 0.1394663
$corrected.rand
NULL
$vi
NULL
# Extract the Dunn index from the clustering indices list
dunn_index_filtered <- clustering_indices_filtered$dunn
cat("Dunn Index (Filtered):", dunn_index, "\n")
Dunn Index (Filtered): 0.0004477068
# Calculate the Davies-Bouldin Index
db_index_filtered <- clusterSim::index.DB(dbscan_data_filtered, as.integer(dbscan_model_filtered$cluster) + 1)
print(db_index)
$DB
[1] 2.115439
$r
[1] 2.115439 2.115439
$R
[,1] [,2]
[1,] Inf 2.115439
[2,] 2.115439 Inf
$d
1 2
1 0.000000 1.438629
2 1.438629 0.000000
$S
[1] 2.576068 0.467264
$centers
[,1] [,2] [,3]
[1,] -0.8624138 0.2787846 0.16920057
[2,] 0.4832226 -0.1562069 -0.09480546