Customer Segmentation K-Means Clustering
# Load required libraries
library(ggplot2)
install.packages("dplyr")
Error in install.packages : Updating loaded packages
library(dplyr)
library(factoextra)
library(cluster)
library(fpc)
install.packages("flexclust")
Error in install.packages : Updating loaded packages
library(flexclust)
install.packages("mclust")
Error in install.packages : Updating loaded packages
library(mclust)
install.packages("clusterSim")
Error in install.packages : Updating loaded packages
library(cluster)
install.packages("hopkins")
Error in install.packages : Updating loaded packages
library(hopkins)
# Load the Online Retail dataset
data <- Online_Retail
# Data preprocessing
data <- data %>%
dplyr::filter(!is.na(CustomerID)) %>%
dplyr::select(CustomerID, InvoiceNo, InvoiceDate, UnitPrice)
# Calculate total spending (monetary value) per customer
monetary <- data %>%
group_by(CustomerID) %>%
summarise(monetary = sum(UnitPrice))
# Calculate the recency and frequency variables
# Recency is calculated by the time elapsed since the last day of the dataset
# Frequency is calculated by summing up the distinct invoices of a customer
recency <- data %>%
group_by(CustomerID) %>%
summarise(recency = as.numeric(difftime(max(data$InvoiceDate), max(InvoiceDate), units = "days")),
frequency = n_distinct(InvoiceNo))
install.packages("hopkins")
Warning in install.packages :
package ‘hopkins’ is in use and will not be installed
install.packages("clusterSim")
Warning in install.packages :
package ‘clusterSim’ is in use and will not be installed
install.packages("flexclust")
Warning in install.packages :
package ‘flexclust’ is in use and will not be installed
install.packages("mclust")
Warning in install.packages :
package ‘mclust’ is in use and will not be installed
install.packages("dplyr")
Error in install.packages : Updating loaded packages
# Merge RFM variables with monetary value
rfm <- left_join(recency, monetary, by = "CustomerID")
# Data normalization
rfm$recency_scaled <- scale(rfm$recency)
rfm$frequency_scaled <- scale(rfm$frequency)
rfm$monetary_scaled <- scale(rfm$monetary)
# Prepare data frame to use for clustering
kmeans_data <- rfm[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
Error in exists(cacheKey, where = .rs.WorkingDataEnv, inherits = FALSE) :
invalid first argument
Error in assign(cacheKey, frame, .rs.CachedDataEnv) :
attempt to use zero-length variable name
# Calculate the Hopkins statistic
hopkins_stat <- hopkins::hopkins(X = as.matrix(kmeans_data), m = nrow(kmeans_data) - 1, method = "simple")
install.packages("dplyr")
Warning in install.packages :
package ‘dplyr’ is in use and will not be installed
cat("Hopkins Statistic:", hopkins_stat, "\n")
Hopkins Statistic: 0.9965328
# Prompt the user to choose the number of clusters based on the elbow plot
chosen_k <- readline(prompt = "Enter the optimal number of clusters based on the elbow plot: ")
6
chosen_k <- as.integer(chosen_k)
# Perform K-means clustering with chosen number of clusters
set.seed(123)
kmeans_model <- kmeans(kmeans_data, centers = chosen_k, nstart = 25)
# Add cluster labels to the original dataset
rfm$cluster <- as.factor(kmeans_model$cluster)
# Visualize the clusters
fviz_cluster(kmeans_model, data = kmeans_data)
# Determine and visualise the optimal number of clusters
fviz_nbclust(kmeans_data, kmeans, method = "wss")
fviz_nbclust(kmeans_data, kmeans, method = "silhouette")
fviz_nbclust(kmeans_data, kmeans, method = "gap_stat")
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 100) [one "." per sample]:
.......
Warning: did not converge in 10 iterations
............
Warning: did not converge in 10 iterations
..........
Warning: did not converge in 10 iterations
.........
Warning: Quick-TRANSfer stage steps exceeded maximum (= 218600)
.....
Warning: did not converge in 10 iterations
....... 50
..................
Warning: did not converge in 10 iterationsWarning: did not converge in 10 iterations
....
Warning: did not converge in 10 iterations
............
Warning: did not converge in 10 iterations
................ 100
# Visualize data points plotted against recency, monetary, and frequency
ggplot(rfm, aes(x = recency, y = monetary, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Monetary", color = "Cluster") +
theme_minimal()
ggplot(rfm, aes(x = recency, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Frequency", color = "Cluster") +
theme_minimal()
ggplot(rfm, aes(x = monetary, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Monetary", y = "Frequency", color = "Cluster") +
theme_minimal()
# Cluster analysis
cluster_analysis <- rfm %>%
group_by(cluster) %>%
summarise(average_recency = mean(recency),
average_frequency = mean(frequency),
average_monetary = mean(monetary),
count_customers = n())
print(cluster_analysis)
# Silhouette analysis
sil <- silhouette(kmeans_model$cluster, dist(kmeans_data))
avg_silhouette <- mean(sil[, "sil_width"])
cat("Average Silhouette Width:", avg_silhouette, "\n")
Average Silhouette Width: 0.5090461
# Calculate clustering indices using cluster.stats
clustering_indices <- cluster.stats(dist(kmeans_data), kmeans_model$cluster)
print(clustering_indices)
$n
[1] 4372
$cluster.number
[1] 6
$cluster.size
[1] 471 617 7 767 48 2462
$min.cluster.size
[1] 7
$noisen
[1] 0
$diameter
[1] 4.432850 1.721251 30.606993 3.480057 13.829273 1.977773
$average.distance
[1] 0.9763928 0.5556253 17.4696282 0.5532323 3.6092760 0.5007617
$median.distance
[1] 0.8248311 0.4949363 17.3298115 0.4993416 2.5705356 0.4762083
$separation
[1] 0.023725235 0.008527502 9.252061626 0.008527502 0.216424551 0.008876033
$average.toother
[1] 2.035711 2.562908 26.852580 1.546554 6.035026 1.991939
$separation.matrix
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 0.00000000 1.481495960 16.439481 0.346069518 0.2164246 0.023725235
[2,] 1.48149596 0.000000000 18.905945 0.008527502 3.6471309 1.304904340
[3,] 16.43948070 18.905945115 0.000000 16.260864040 9.2520616 17.989814732
[4,] 0.34606952 0.008527502 16.260864 0.000000000 2.9538381 0.008876033
[5,] 0.21642455 3.647130892 9.252062 2.953838122 0.0000000 2.873742316
[6,] 0.02372524 1.304904340 17.989815 0.008876033 2.8737423 0.000000000
$ave.between.matrix
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 0.000000 3.326089 25.98230 2.209086 4.652317 1.539219
[2,] 3.326089 0.000000 27.22374 1.412632 6.891162 2.620755
[3,] 25.982303 27.223738 0.00000 27.046201 23.577911 26.929580
[4,] 2.209086 1.412632 27.04620 0.000000 6.288220 1.288422
[5,] 4.652317 6.891162 23.57791 6.288220 0.000000 5.956237
[6,] 1.539219 2.620755 26.92958 1.288422 5.956237 0.000000
$average.between
[1] 2.141228
$average.within
[1] 0.6302468
$n.between
[1] 5929884
$n.within
[1] 3625122
$max.diameter
[1] 30.60699
$min.separation
[1] 0.008527502
$within.cluster.ss
[1] 2597.415
$clus.avg.silwidths
1 2 3 4 5 6
0.3192309 0.5717862 0.2302471 0.4413600 0.1760061 0.5580084
$avg.silwidth
[1] 0.5090461
$g2
NULL
$g3
NULL
$pearsongamma
[1] 0.4099374
$dunn
[1] 0.0002786129
$dunn2
[1] 0.07375209
$entropy
[1] 1.204934
$wb.ratio
[1] 0.294339
$ch
[1] 3535.134
$cwidegap
[1] 2.8213618 0.3625535 11.8906586 1.4109591 4.3314064 0.5992575
$widestgap
[1] 11.89066
$sindex
[1] 0.09630851
$corrected.rand
NULL
$vi
NULL
# Extract the Dunn index from the clustering indices list
dunn_index <- clustering_indices$dunn
cat("Dunn Index:", dunn_index, "\n")
Dunn Index: 0.0002786129
# Calculate the Davies-Bouldin Index
db_index <- clusterSim::index.DB(kmeans_data, kmeans_model$cluster)
print(db_index)
$DB
[1] 0.8226027
$r
[1] 0.9758716 0.6732245 0.7600437 0.6998387 0.9758716 0.8507658
$R
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] Inf 0.3970572 0.5646801 0.6111041 0.9758716 0.8507658
[2,] 0.3970572 Inf 0.5194790 0.6732245 0.5729061 0.3318168
[3,] 0.5646801 0.5194790 Inf 0.5231597 0.7600437 0.5231915
[4,] 0.6111041 0.6732245 0.5231597 Inf 0.6286032 0.6998387
[5,] 0.9758716 0.5729061 0.7600437 0.6286032 Inf 0.6528990
[6,] 0.8507658 0.3318168 0.5231915 0.6998387 0.6528990 Inf
$d
1 2 3 4 5 6
1 0.000000 3.242420 23.51097 2.095054 4.189210 1.428765
2 3.242420 0.000000 24.87728 1.377465 6.519704 2.599600
3 23.510971 24.877280 0.00000 24.688629 20.688193 24.563365
4 2.095054 1.377465 24.68863 0.000000 5.930687 1.222369
5 4.189210 6.519704 20.68819 5.930687 0.000000 5.610816
6 1.428765 2.599600 24.56336 1.222369 5.610816 0.000000
$S
[1] 0.8201897 0.4672364 12.4559884 0.4601066 3.2679420 0.3953546
$centers
[,1] [,2] [,3]
[1,] -0.7556678 1.1479795 0.43229235
[2,] 2.0354305 -0.3824309 -0.18487878
[3,] -0.5162949 11.1741983 21.69689662
[4,] 0.6622009 -0.2843701 -0.13978747
[5,] -0.8675599 5.0559406 1.93721894
[6,] -0.5534493 -0.1655285 -0.09227753
K-Means Redone
# Remove observations in cluster 3
rfm_filtered <- rfm[rfm$cluster != 3, ]
# Prepare filtered data frame for clustering
kmeans_data_filtered <- rfm_filtered[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
# Calculate the Hopkins statistic
hopkins_stat_filtered <- hopkins::hopkins(X = as.matrix(kmeans_data_filtered), m = nrow(kmeans_data_filtered) - 1, method = "simple")
cat("Hopkins Statistic (Filtered):", hopkins_stat_filtered, "\n")
Hopkins Statistic (Filtered): 0.9991269
# Perform K-means clustering on filtered data
set.seed(123)
kmeans_model_filtered <- kmeans(kmeans_data_filtered, centers = chosen_k, nstart = 25)
# Add cluster labels to the original dataset
rfm_filtered$cluster <- as.factor(kmeans_model_filtered$cluster)
# Visualize the clusters after filtering
fviz_cluster(kmeans_model_filtered, data = kmeans_data_filtered)
# Determine and visualise the optimal number of clusters
fviz_nbclust(kmeans_data_filtered, kmeans, method = "wss")
fviz_nbclust(kmeans_data_filtered, kmeans, method = "silhouette")
fviz_nbclust(kmeans_data_filtered, kmeans, method = "gap_stat")
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 100) [one "." per sample]:
....
Warning: Quick-TRANSfer stage steps exceeded maximum (= 218250)
.............................................. 50
............................
Warning: did not converge in 10 iterations
...................... 100
# Visualize data points plotted against recency, monetary, and frequency after filtering
ggplot(rfm_filtered, aes(x = recency, y = monetary, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Monetary", color = "Cluster") +
theme_minimal()
ggplot(rfm_filtered, aes(x = recency, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Frequency", color = "Cluster") +
theme_minimal()
ggplot(rfm_filtered, aes(x = monetary, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Monetary", y = "Frequency", color = "Cluster") +
theme_minimal()
# Cluster analysis on filtered data
cluster_analysis_filtered <- rfm_filtered %>%
group_by(cluster) %>%
summarise(average_recency = mean(recency),
average_frequency = mean(frequency),
average_monetary = mean(monetary),
count_customers = n())
print(cluster_analysis_filtered)
# Silhouette analysis on filtered data
sil_filtered <- silhouette(kmeans_model_filtered$cluster, dist(kmeans_data_filtered))
avg_silhouette_filtered <- mean(sil_filtered[, "sil_width"])
cat("Average Silhouette Width (filtered data):", avg_silhouette_filtered, "\n")
Average Silhouette Width (filtered data): 0.4972092
# Calculate clustering indices using cluster.stats on filtered data
clustering_indices_filtered <- cluster.stats(dist(kmeans_data_filtered), kmeans_model_filtered$cluster)
print(clustering_indices_filtered)
$n
[1] 4365
$cluster.number
[1] 6
$cluster.size
[1] 93 2342 765 540 10 615
$min.cluster.size
[1] 10
$noisen
[1] 0
$diameter
[1] 11.442911 1.916348 3.480057 3.216320 7.651552 1.721251
$average.distance
[1] 1.8702627 0.4699777 0.5535234 0.7690341 4.0383326 0.5545625
$median.distance
[1] 1.4501589 0.4513367 0.4994568 0.6877940 4.1604236 0.4939610
$separation
[1] 0.11952798 0.01922779 0.01123853 0.03034987 1.10815058 0.01123853
$average.toother
[1] 3.870893 1.831538 1.499372 1.748479 9.985118 2.518026
$separation.matrix
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 0.000000 1.90323875 2.01420873 0.11952798 1.108151 2.43310143
[2,] 1.903239 0.00000000 0.01922779 0.03034987 6.760194 1.30211749
[3,] 2.014209 0.01922779 0.00000000 0.14088595 6.380692 0.01123853
[4,] 0.119528 0.03034987 0.14088595 0.00000000 5.043469 1.46407430
[5,] 1.108151 6.76019371 6.38069184 5.04346876 0.000000 6.87209694
[6,] 2.433101 1.30211749 0.01123853 1.46407430 6.872097 0.00000000
$ave.between.matrix
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 0.000000 3.759216 4.144952 2.720648 6.785039 4.917861
[2,] 3.759216 0.000000 1.275870 1.249443 10.065950 2.608448
[3,] 4.144952 1.275870 0.000000 1.952249 10.277443 1.410055
[4,] 2.720648 1.249443 1.952249 0.000000 8.973011 3.130919
[5,] 6.785039 10.065950 10.277443 8.973011 0.000000 10.686268
[6,] 4.917861 2.608448 1.410055 3.130919 10.686268 0.000000
$average.between
[1] 1.966626
$average.within
[1] 0.571543
$n.between
[1] 6152231
$n.within
[1] 3372199
$max.diameter
[1] 11.44291
$min.separation
[1] 0.01123853
$within.cluster.ss
[1] 1203.924
$clus.avg.silwidths
1 2 3 4 5 6
0.2784650 0.5436213 0.4358933 0.3379704 0.3560846 0.5719291
$avg.silwidth
[1] 0.4972092
$g2
NULL
$g3
NULL
$pearsongamma
[1] 0.5728735
$dunn
[1] 0.0009821389
$dunn2
[1] 0.3093957
$entropy
[1] 1.269844
$wb.ratio
[1] 0.2906211
$ch
[1] 4811.955
$cwidegap
[1] 4.3314064 0.5992575 1.4109591 0.7670103 3.4331291 0.3625535
$widestgap
[1] 4.331406
$sindex
[1] 0.09485941
$corrected.rand
NULL
$vi
NULL
# Extract the Dunn index from the clustering indices list for filtered data
dunn_index_filtered <- clustering_indices_filtered$dunn
cat("Dunn Index (filtered data):", dunn_index_filtered, "\n")
Dunn Index (filtered data): 0.0009821389
# Calculate the Davies-Bouldin Index for filtered data
db_index_filtered <- clusterSim::index.DB(kmeans_data_filtered, kmeans_model_filtered$cluster)
print(db_index_filtered)
$DB
[1] 0.8251814
$r
[1] 0.9871801 0.8744544 0.6824004 0.9871801 0.7457325 0.6741406
$R
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] Inf 0.6010589 0.5665070 0.9871801 0.7457325 0.4733325
[2,] 0.6010589 Inf 0.6824004 0.8744544 0.3390987 0.3226072
[3,] 0.5665070 0.6824004 Inf 0.5845842 0.3411828 0.6741406
[4,] 0.9871801 0.8744544 0.5845842 Inf 0.4114658 0.3548898
[5,] 0.7457325 0.3390987 0.3411828 0.4114658 Inf 0.3287088
[6,] 0.4733325 0.3226072 0.6741406 0.3548898 0.3287088 Inf
$d
1 2 3 4 5 6
1 0.000000 3.543605 3.920166 2.415808 6.370734 4.704273
2 3.543605 0.000000 1.216240 1.136707 9.908693 2.590910
3 3.920166 1.216240 0.000000 1.855823 10.114549 1.374687
4 2.415808 1.136707 1.855823 0.000000 8.785534 3.073542
5 6.370734 9.908693 10.114549 8.785534 0.000000 10.516282
6 4.704273 2.590910 1.374687 3.073542 10.516282 0.000000
$S
[1] 1.7603772 0.3695379 0.4604244 0.6244605 2.9904864 0.4663081
$centers
[,1] [,2] [,3]
[1,] -0.8223478 3.1156793 1.1246312
[2,] -0.5453086 -0.1954051 -0.1070763
[3,] 0.6673445 -0.2831328 -0.1389328
[4,] -0.7365941 0.8340871 0.3352567
[5,] -0.8727366 9.0937740 3.3260173
[6,] 2.0376827 -0.3822554 -0.1848967
Hierarchical Clustering
# Load required libraries
library(ggplot2)
install.packages("dplyr")
library(dplyr)
library(factoextra)
library(cluster)
library(fpc)
library(flexclust)
library(mclust)
library(clusterSim)
library(hopkins)
# Load the Online Retail dataset
data <- Online_Retail
# Data preprocessing
data <- data %>%
dplyr::filter(!is.na(CustomerID)) %>%
dplyr::select(CustomerID, InvoiceNo, InvoiceDate, UnitPrice)
# Calculate total spending (monetary value) per customer
monetary <- data %>%
group_by(CustomerID) %>%
summarise(monetary = sum(UnitPrice))
# Calculate the recency and frequency variables
# Recency is calculated by the time elapsed since the last day of the dataset
# Frequency is calculated by summing up the distinct invoices of a customer
recency <- data %>%
group_by(CustomerID) %>%
summarise(recency = as.numeric(difftime(max(data$InvoiceDate), max(InvoiceDate), units = "days")),
frequency = n_distinct(InvoiceNo))
# Merge RFM variables with monetary value
rfm <- left_join(recency, monetary, by = "CustomerID")
# Calculate the Hopkins statistic
hopkins_stat <- hopkins::hopkins(X = as.matrix(rfm), m = nrow(rfm) - 1, method = "simple")
print(hopkins_stat)
# Check the value of the Hopkins statistic
cat("Hopkins Statistic:", hopkins_stat, "\n")
# Data normalization
rfm$recency_scaled <- scale(rfm$recency)
rfm$frequency_scaled <- scale(rfm$frequency)
rfm$monetary_scaled <- scale(rfm$monetary)
# Perform hierarchical clustering
hclust_data <- rfm[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
hclust_model <- hclust(dist(hclust_data))
# Specify the desired number of clusters
desired_clusters <- 4
# Determine the cutoff height for the desired number of clusters
cutoff_height <- hclust_model$height[length(hclust_model$height) - (desired_clusters - 1)]
cat("Cut-off height at ", desired_clusters, " clusters:", cutoff_height, "\n")
# Plot the dendogram with an indication of the cut-off
plot(hclust_model, labels = FALSE)
abline(h = cutoff_height, col = "red", lty = 2)
# Determine and visualise the optimal number of clusters up to 10 with bootstrapping up to 10
fviz_nbclust(hclust_data, hcut, method = "wss")
fviz_nbclust(hclust_data, hcut, method = "silhouette")
gap_stat_hclust <- clusGap(hclust_data, hcut, K.max = 10, B = 10)
fviz_gap_stat(gap_stat_hclust)
# Determine the tree cut for a desired number of clusters
cut <- cutree(hclust_model, k = desired_clusters)
# Add cluster labels to the original dataset
rfm$cluster <- as.factor(cut)
# Visualize data points plotted against recency, monetary, and frequency
ggplot(rfm, aes(x = recency, y = monetary, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Monetary", color = "Cluster") +
theme_minimal()
ggplot(rfm, aes(x = recency, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Frequency", color = "Cluster") +
theme_minimal()
ggplot(rfm, aes(x = monetary, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Monetary", y = "Frequency", color = "Cluster") +
theme_minimal()
# Cluster analysis
cluster_analysis <- rfm %>%
group_by(cluster) %>%
summarise(average_recency = mean(recency),
average_frequency = mean(frequency),
average_monetary = mean(monetary),
count_customers = n())
print(cluster_analysis)
# Silhouette analysis
sil <- silhouette(cut, dist(hclust_data))
avg_silhouette <- mean(sil[, "sil_width"])
cat("Average Silhouette Width:", avg_silhouette, "\n")
# Calculate clustering indices using cluster.stats
clustering_indices <- cluster.stats(dist(hclust_data), cut)
print(clustering_indices)
# Extract the Dunn index from the clustering indices list
dunn_index <- clustering_indices$dunn
cat("Dunn Index:", dunn_index, "\n")
# Calculate the Davies-Bouldin Index
db_index <- clusterSim::index.DB(hclust_data, cut)
print(db_index)
Hierarchical Clustering Redone
# Filter out observations in clusters other than cluster 1
rfm_filtered <- rfm[rfm$cluster == 1, ]
# Perform hierarchical clustering on the filtered dataset
hclust_data_filtered <- rfm_filtered[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
hclust_model_filtered <- hclust(dist(hclust_data_filtered))
# Specify the desired number of clusters
desired_clusters_filtered <- 4
# Determine the cutoff height for the desired number of clusters
cutoff_height_filtered <- hclust_model_filtered$height[length(hclust_model_filtered$height) - (desired_clusters_filtered - 1)]
cat("Cut-off height at", desired_clusters_filtered, "cluster(s):", cutoff_height_filtered, "\n")
# Plot the dendrogram with an indication of the cut-off
plot(hclust_model_filtered, labels = FALSE)
abline(h = cutoff_height_filtered, col = "red", lty = 2)
# Determine and visualize the optimal number of clusters up to 10 with bootstrapping up to 10
fviz_nbclust(hclust_data_filtered, hcut, method = "wss")
fviz_nbclust(hclust_data_filtered, hcut, method = "silhouette")
gap_stat_hclust_filtered <- clusGap(hclust_data_filtered, hcut, K.max = 10, B = 10)
fviz_gap_stat(gap_stat_hclust_filtered)
# Determine the tree cut for a desired number of clusters
cut_filtered <- cutree(hclust_model_filtered, k = desired_clusters_filtered)
# Add cluster labels to the filtered dataset
rfm_filtered$cluster <- as.factor(cut_filtered)
# Visualize data points plotted against recency, monetary, and frequency
ggplot(rfm_filtered, aes(x = recency, y = monetary, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Monetary", color = "Cluster") +
theme_minimal()
ggplot(rfm_filtered, aes(x = recency, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Frequency", color = "Cluster") +
theme_minimal()
ggplot(rfm_filtered, aes(x = monetary, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Monetary", y = "Frequency", color = "Cluster") +
theme_minimal()
# Cluster analysis
cluster_analysis_filtered <- rfm_filtered %>%
group_by(cluster) %>%
summarise(average_recency = mean(recency),
average_frequency = mean(frequency),
average_monetary = mean(monetary),
count_customers = n())
print(cluster_analysis_filtered)
# Silhouette analysis
sil_filtered <- silhouette(cut_filtered, dist(hclust_data_filtered))
avg_silhouette_filtered <- mean(sil_filtered[, "sil_width"])
cat("Average Silhouette Width:", avg_silhouette_filtered, "\n")
# Calculate clustering indices using cluster.stats
clustering_indices_filtered <- cluster.stats(dist(hclust_data_filtered), cut_filtered)
print(clustering_indices_filtered)
# Extract the Dunn index from the clustering indices list
dunn_index_filtered <- clustering_indices_filtered$dunn
cat("Dunn Index:", dunn_index_filtered, "\n")
# Calculate the Davies-Bouldin Index
db_index_filtered <- clusterSim::index.DB(hclust_data_filtered, cut_filtered)
print(db_index_filtered)
DBSCAN
# Load required libraries
library(ggplot2)
library(dplyr)
library(factoextra)
library(cluster)
library(fpc)
library(flexclust)
library(mclust)
# Load the Online Retail dataset
data <- Online_Retail
# Data preprocessing
data <- data %>%
dplyr::filter(!is.na(CustomerID)) %>%
dplyr::select(CustomerID, InvoiceNo, InvoiceDate, UnitPrice)
# Calculate total spending (monetary value) per customer
monetary <- data %>%
group_by(CustomerID) %>%
summarise(monetary = sum(UnitPrice))
# Calculate the recency and frequency variables
# Recency is calculated by the time elapsed since the last day of the dataset
# Frequency is calculated by summing up the distinct invoices of a customer
recency <- data %>%
group_by(CustomerID) %>%
summarise(recency = as.numeric(difftime(max(data$InvoiceDate), max(InvoiceDate), units = "days")),
frequency = n_distinct(InvoiceNo))
# Merge RFM variables with monetary value
rfm <- left_join(recency, monetary, by = "CustomerID")
# Create distance matrix of RFM data frame
# Perform DBSCAN clustering
dbscan_model <- dbscan(dist(rfm), eps = 0.5, MinPts = 5, scale = TRUE, method = "raw", showplot = 1)
# Add cluster labels to the original dataset
rfm$cluster <- as.factor(dbscan_model$cluster)
# Visualize the clusters
fviz_cluster(dbscan_model, data = normalized_data)
# Cluster analysis
cluster_analysis <- rfm %>%
group_by(cluster) %>%
summarise(average_recency = mean(recency),
average_frequency = mean(frequency),
average_monetary = mean(monetary),
count_customers = n())
print(cluster_analysis)
# Silhouette analysis
sil <- silhouette(dbscan_model$cluster, dist(normalized_data))
avg_silhouette <- mean(sil[, "sil_width"])
cat("Average Silhouette Width:", avg_silhouette, "\n")
# Extract cluster labels from the DBSCAN model
dbscan_labels <- dbscan_model$cluster
# Calculate clustering indices using cluster.stats
clustering_indices <- cluster.stats(dist(rfm), dbscan_labels)
print(clustering_indices)
# Extract the Dunn index from the clustering indices list
dunn_index <- clustering_indices$dunn
cat("Dunn Index:", dunn_index, "\n")
# Calculate the Davies-Bouldin Index
db_index <- clusterSim::index.DB(rfm, dbscan_labels)
print(db_index)