Customer Segmentation K-Means Clustering
# Load required libraries
library(ggplot2)
install.packages("dplyr")
Error in install.packages : Updating loaded packages
library(dplyr)
library(factoextra)
library(cluster)
library(fpc)
install.packages("flexclust")
Error in install.packages : Updating loaded packages
library(flexclust)
install.packages("mclust")
Error in install.packages : Updating loaded packages
library(mclust)
install.packages("clusterSim")
Error in install.packages : Updating loaded packages
library(cluster)
install.packages("hopkins")
Error in install.packages : Updating loaded packages
library(hopkins)
# Load the Online Retail dataset
data <- Online_Retail
# Data preprocessing
data <- data %>%
dplyr::filter(!is.na(CustomerID)) %>%
dplyr::select(CustomerID, InvoiceNo, InvoiceDate, UnitPrice)
# Calculate total spending (monetary value) per customer
monetary <- data %>%
group_by(CustomerID) %>%
summarise(monetary = sum(UnitPrice))
# Calculate the recency and frequency variables
# Recency is calculated by the time elapsed since the last day of the dataset
# Frequency is calculated by summing up the distinct invoices of a customer
recency <- data %>%
group_by(CustomerID) %>%
summarise(recency = as.numeric(difftime(max(data$InvoiceDate), max(InvoiceDate), units = "days")),
frequency = n_distinct(InvoiceNo))
install.packages("hopkins")
Warning in install.packages :
package ‘hopkins’ is in use and will not be installed
# Merge RFM variables with monetary value
rfm <- left_join(recency, monetary, by = "CustomerID")
install.packages("clusterSim")
Warning in install.packages :
package ‘clusterSim’ is in use and will not be installed
# Data normalization
install.packages("mclust")
Warning in install.packages :
package ‘mclust’ is in use and will not be installed
install.packages("flexclust")
Warning in install.packages :
package ‘flexclust’ is in use and will not be installed
rfm$recency_scaled <- scale(rfm$recency)
rfm$frequency_scaled <- scale(rfm$frequency)
rfm$monetary_scaled <- scale(rfm$monetary)
# Prepare data frame to use for clustering
kmeans_data <- rfm[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
Error in exists(cacheKey, where = .rs.WorkingDataEnv, inherits = FALSE) :
invalid first argument
Error in assign(cacheKey, frame, .rs.CachedDataEnv) :
attempt to use zero-length variable name
# Calculate the Hopkins statistic
hopkins_stat <- hopkins::hopkins(X = as.matrix(kmeans_data), m = nrow(kmeans_data) - 1, method = "simple")
install.packages("dplyr")
Warning in install.packages :
package ‘dplyr’ is in use and will not be installed
cat("Hopkins Statistic:", hopkins_stat, "\n")
Hopkins Statistic: 0.9965223
# Prompt the user to choose the number of clusters based on the elbow plot
chosen_k <- readline(prompt = "Enter the optimal number of clusters based on the elbow plot: ")
5
chosen_k <- as.integer(chosen_k)
# Perform K-means clustering with chosen number of clusters
set.seed(123)
kmeans_model <- kmeans(kmeans_data, centers = chosen_k, nstart = 25)
# Add cluster labels to the original dataset
rfm$cluster <- as.factor(kmeans_model$cluster)
# Visualize the clusters
fviz_cluster(kmeans_model, data = kmeans_data)
# Determine and visualise the optimal number of clusters
fviz_nbclust(kmeans_data, kmeans, method = "wss")
fviz_nbclust(kmeans_data, kmeans, method = "silhouette")
fviz_nbclust(kmeans_data, kmeans, method = "gap_stat")
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 100) [one "." per sample]:
.......
Warning: did not converge in 10 iterations
............
Warning: did not converge in 10 iterations
..........
Warning: did not converge in 10 iterations
.........
Warning: Quick-TRANSfer stage steps exceeded maximum (= 218600)
.....
Warning: did not converge in 10 iterations
....... 50
..................
Warning: did not converge in 10 iterationsWarning: did not converge in 10 iterations
....
Warning: did not converge in 10 iterations
............
Warning: did not converge in 10 iterations
................ 100
# Visualize data points plotted against recency, monetary, and frequency
ggplot(rfm, aes(x = recency, y = monetary, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Monetary", color = "Cluster") +
theme_minimal()
ggplot(rfm, aes(x = recency, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Frequency", color = "Cluster") +
theme_minimal()
ggplot(rfm, aes(x = monetary, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Monetary", y = "Frequency", color = "Cluster") +
theme_minimal()
# Cluster analysis
cluster_analysis <- rfm %>%
group_by(cluster) %>%
summarise(average_recency = mean(recency),
average_frequency = mean(frequency),
average_monetary = mean(monetary),
count_customers = n())
print(cluster_analysis)
# Silhouette analysis
sil <- silhouette(kmeans_model$cluster, dist(kmeans_data))
avg_silhouette <- mean(sil[, "sil_width"])
cat("Average Silhouette Width:", avg_silhouette, "\n")
Average Silhouette Width: 0.4807923
# Calculate clustering indices using cluster.stats
clustering_indices <- cluster.stats(dist(kmeans_data), kmeans_model$cluster)
print(clustering_indices)
$n
[1] 4372
$cluster.number
[1] 5
$cluster.size
[1] 157 7 2737 633 838
$min.cluster.size
[1] 7
$noisen
[1] 0
$diameter
[1] 13.829273 30.606993 2.743004 1.721251 3.480057
$average.distance
[1] 2.4269641 17.4696282 0.6597140 0.5643137 0.5776147
$median.distance
[1] 1.5497905 17.3298115 0.5625444 0.4994139 0.5228272
$separation
[1] 0.11105938 9.25206163 0.01847143 0.03082044 0.01847143
$average.toother
[1] 3.803182 26.852580 2.163191 2.553593 1.518549
$separation.matrix
[,1] [,2] [,3] [,4] [,5]
[1,] 0.0000000 9.252062 0.11105938 2.43310143 1.25267895
[2,] 9.2520616 0.000000 17.23812325 18.90594512 16.26086404
[3,] 0.1110594 17.238123 0.00000000 1.32586933 0.01847143
[4,] 2.4331014 18.905945 1.32586933 0.00000000 0.03082044
[5,] 1.2526789 16.260864 0.01847143 0.03082044 0.00000000
$ave.between.matrix
[,1] [,2] [,3] [,4] [,5]
[1,] 0.000000 24.82984 3.443142 4.853022 4.010454
[2,] 24.829845 0.00000 26.824289 27.222184 27.044755
[3,] 3.443142 26.82429 0.000000 2.688798 1.320363
[4,] 4.853022 27.22218 2.688798 0.000000 1.475138
[5,] 4.010454 27.04475 1.320363 1.475138 0.000000
$average.between
[1] 2.244611
$average.within
[1] 0.720542
$n.between
[1] 5247792
$n.within
[1] 4307214
$max.diameter
[1] 30.60699
$min.separation
[1] 0.01847143
$within.cluster.ss
[1] 3186.563
$clus.avg.silwidths
1 2 3 4 5
0.2580942 0.2730968 0.4847761 0.5851573 0.4324041
$avg.silwidth
[1] 0.4807923
$g2
NULL
$g3
NULL
$pearsongamma
[1] 0.4133831
$dunn
[1] 0.0006035034
$dunn2
[1] 0.0755805
$entropy
[1] 1.019412
$wb.ratio
[1] 0.3210097
$ch
[1] 3400.902
$cwidegap
[1] 4.3314064 11.8906586 0.5992575 0.3625535 1.4109591
$widestgap
[1] 11.89066
$sindex
[1] 0.09642582
$corrected.rand
NULL
$vi
NULL
# Extract the Dunn index from the clustering indices list
dunn_index <- clustering_indices$dunn
cat("Dunn Index:", dunn_index, "\n")
Dunn Index: 0.0006035034
# Calculate the Davies-Bouldin Index
db_index <- clusterSim::index.DB(kmeans_data, kmeans_model$cluster)
print(db_index)
$DB
[1] 0.8044733
$r
[1] 0.9179110 0.6717660 0.9179110 0.6613997 0.8533788
$R
[,1] [,2] [,3] [,4] [,5]
[1,] Inf 0.6717660 0.9179110 0.6304341 0.7667936
[2,] 0.6717660 Inf 0.5321864 0.5198163 0.5239432
[3,] 0.9179110 0.5321864 Inf 0.3892650 0.8533788
[4,] 0.6304341 0.5198163 0.3892650 Inf 0.6613997
[5,] 0.7667936 0.5239432 0.8533788 0.6613997 Inf
$d
1 2 3 4 5
1 0.000000 22.15844 3.249879 4.606358 3.792171
2 22.158436 0.00000 24.445935 24.875517 24.686849
3 3.249879 24.44593 0.000000 2.642198 1.209690
4 4.606358 24.87552 2.642198 0.000000 1.441229
5 3.792171 24.68685 1.209690 1.441229 0.000000
$S
[1] 2.4292947 12.4559884 0.5538053 0.4747100 0.4785183
$centers
[,1] [,2] [,3]
[1,] -0.8183457 3.01336556 1.09821069
[2,] -0.5162949 11.17419832 21.69689662
[3,] -0.5959877 -0.02533589 -0.03252604
[4,] 2.0175621 -0.38210357 -0.18512314
[5,] 0.5801859 -0.28651777 -0.14091961
K-Means Redone
# Remove observations in cluster 3
rfm_filtered <- rfm[rfm$cluster != 2, ]
# Prepare filtered data frame for clustering
kmeans_data_filtered <- rfm_filtered[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
# Calculate the Hopkins statistic
hopkins_stat_filtered <- hopkins::hopkins(X = as.matrix(kmeans_data_filtered), m = nrow(kmeans_data_filtered) - 1, method = "simple")
cat("Hopkins Statistic (Filtered):", hopkins_stat_filtered, "\n")
Hopkins Statistic (Filtered): 0.9991269
# Perform K-means clustering on filtered data
set.seed(123)
kmeans_model_filtered <- kmeans(kmeans_data_filtered, centers = chosen_k, nstart = 25)
# Add cluster labels to the original dataset
rfm_filtered$cluster <- as.factor(kmeans_model_filtered$cluster)
# Visualize the clusters after filtering
fviz_cluster(kmeans_model_filtered, data = kmeans_data_filtered)
# Determine and visualise the optimal number of clusters
fviz_nbclust(kmeans_data_filtered, kmeans, method = "wss")
fviz_nbclust(kmeans_data_filtered, kmeans, method = "silhouette")
fviz_nbclust(kmeans_data_filtered, kmeans, method = "gap_stat")
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 100) [one "." per sample]:
....
Warning: Quick-TRANSfer stage steps exceeded maximum (= 218250)
.............................................. 50
............................
Warning: did not converge in 10 iterations
...................... 100
# Visualize data points plotted against recency, monetary, and frequency after filtering
ggplot(rfm_filtered, aes(x = recency, y = monetary, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Monetary", color = "Cluster") +
theme_minimal()
ggplot(rfm_filtered, aes(x = recency, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Frequency", color = "Cluster") +
theme_minimal()
ggplot(rfm_filtered, aes(x = monetary, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Monetary", y = "Frequency", color = "Cluster") +
theme_minimal()
# Cluster analysis on filtered data
cluster_analysis_filtered <- rfm_filtered %>%
group_by(cluster) %>%
summarise(average_recency = mean(recency),
average_frequency = mean(frequency),
average_monetary = mean(monetary),
count_customers = n())
print(cluster_analysis_filtered)
# Silhouette analysis on filtered data
sil_filtered <- silhouette(kmeans_model_filtered$cluster, dist(kmeans_data_filtered))
avg_silhouette_filtered <- mean(sil_filtered[, "sil_width"])
cat("Average Silhouette Width (filtered data):", avg_silhouette_filtered, "\n")
Average Silhouette Width (filtered data): 0.5094932
# Calculate clustering indices using cluster.stats on filtered data
clustering_indices_filtered <- cluster.stats(dist(kmeans_data_filtered), kmeans_model_filtered$cluster)
print(clustering_indices_filtered)
$n
[1] 4365
$cluster.number
[1] 5
$cluster.size
[1] 767 2462 617 471 48
$min.cluster.size
[1] 48
$noisen
[1] 0
$diameter
[1] 3.480057 1.977773 1.721251 4.432850 13.829273
$average.distance
[1] 0.5532323 0.5007617 0.5556253 0.9763928 3.6092760
$median.distance
[1] 0.4993416 0.4762083 0.4949363 0.8248311 2.5705356
$separation
[1] 0.008527502 0.008876033 0.008527502 0.023725235 0.216424551
$average.toother
[1] 1.496943 1.900208 2.516850 1.992664 6.006580
$separation.matrix
[,1] [,2] [,3] [,4] [,5]
[1,] 0.000000000 0.008876033 0.008527502 0.34606952 2.9538381
[2,] 0.008876033 0.000000000 1.304904340 0.02372524 2.8737423
[3,] 0.008527502 1.304904340 0.000000000 1.48149596 3.6471309
[4,] 0.346069518 0.023725235 1.481495960 0.00000000 0.2164246
[5,] 2.953838122 2.873742316 3.647130892 0.21642455 0.0000000
$ave.between.matrix
[,1] [,2] [,3] [,4] [,5]
[1,] 0.000000 1.288422 1.412632 2.209086 6.288220
[2,] 1.288422 0.000000 2.620755 1.539219 5.956237
[3,] 1.412632 2.620755 0.000000 3.326089 6.891162
[4,] 2.209086 1.539219 3.326089 0.000000 4.652317
[5,] 6.288220 5.956237 6.891162 4.652317 0.000000
$average.between
[1] 2.013238
$average.within
[1] 0.603242
$n.between
[1] 5899329
$n.within
[1] 3625101
$max.diameter
[1] 13.82927
$min.separation
[1] 0.008527502
$within.cluster.ss
[1] 1511.353
$clus.avg.silwidths
1 2 3 4 5
0.4413600 0.5580084 0.5717862 0.3192309 0.1760061
$avg.silwidth
[1] 0.5094932
$g2
NULL
$g3
NULL
$pearsongamma
[1] 0.5895107
$dunn
[1] 0.0006166269
$dunn2
[1] 0.3569751
$entropy
[1] 1.194941
$wb.ratio
[1] 0.2996378
$ch
[1] 4570.802
$cwidegap
[1] 1.4109591 0.5992575 0.3625535 2.8213618 4.3314064
$widestgap
[1] 4.331406
$sindex
[1] 0.09620177
$corrected.rand
NULL
$vi
NULL
# Extract the Dunn index from the clustering indices list for filtered data
dunn_index_filtered <- clustering_indices_filtered$dunn
cat("Dunn Index (filtered data):", dunn_index_filtered, "\n")
Dunn Index (filtered data): 0.0006166269
# Calculate the Davies-Bouldin Index for filtered data
db_index_filtered <- clusterSim::index.DB(kmeans_data_filtered, kmeans_model_filtered$cluster)
print(db_index_filtered)
$DB
[1] 0.8351144
$r
[1] 0.6998387 0.8507658 0.6732245 0.9758716 0.9758716
$R
[,1] [,2] [,3] [,4] [,5]
[1,] Inf 0.6998387 0.6732245 0.6111041 0.6286032
[2,] 0.6998387 Inf 0.3318168 0.8507658 0.6528990
[3,] 0.6732245 0.3318168 Inf 0.3970572 0.5729061
[4,] 0.6111041 0.8507658 0.3970572 Inf 0.9758716
[5,] 0.6286032 0.6528990 0.5729061 0.9758716 Inf
$d
1 2 3 4 5
1 0.000000 1.222369 1.377465 2.095054 5.930687
2 1.222369 0.000000 2.599600 1.428765 5.610816
3 1.377465 2.599600 0.000000 3.242420 6.519704
4 2.095054 1.428765 3.242420 0.000000 4.189210
5 5.930687 5.610816 6.519704 4.189210 0.000000
$S
[1] 0.4601066 0.3953546 0.4672364 0.8201897 3.2679420
$centers
[,1] [,2] [,3]
[1,] 0.6622009 -0.2843701 -0.13978747
[2,] -0.5534493 -0.1655285 -0.09227753
[3,] 2.0354305 -0.3824309 -0.18487878
[4,] -0.7556678 1.1479795 0.43229235
[5,] -0.8675599 5.0559406 1.93721894
Hierarchical Clustering
# Load required libraries
library(ggplot2)
install.packages("dplyr")
library(dplyr)
library(factoextra)
library(cluster)
library(fpc)
library(flexclust)
library(mclust)
library(clusterSim)
library(hopkins)
# Load the Online Retail dataset
data <- Online_Retail
# Data preprocessing
data <- data %>%
dplyr::filter(!is.na(CustomerID)) %>%
dplyr::select(CustomerID, InvoiceNo, InvoiceDate, UnitPrice)
# Calculate total spending (monetary value) per customer
monetary <- data %>%
group_by(CustomerID) %>%
summarise(monetary = sum(UnitPrice))
# Calculate the recency and frequency variables
# Recency is calculated by the time elapsed since the last day of the dataset
# Frequency is calculated by summing up the distinct invoices of a customer
recency <- data %>%
group_by(CustomerID) %>%
summarise(recency = as.numeric(difftime(max(data$InvoiceDate), max(InvoiceDate), units = "days")),
frequency = n_distinct(InvoiceNo))
# Merge RFM variables with monetary value
rfm <- left_join(recency, monetary, by = "CustomerID")
# Calculate the Hopkins statistic
hopkins_stat <- hopkins::hopkins(X = as.matrix(rfm), m = nrow(rfm) - 1, method = "simple")
print(hopkins_stat)
# Check the value of the Hopkins statistic
cat("Hopkins Statistic:", hopkins_stat, "\n")
# Data normalization
rfm$recency_scaled <- scale(rfm$recency)
rfm$frequency_scaled <- scale(rfm$frequency)
rfm$monetary_scaled <- scale(rfm$monetary)
# Perform hierarchical clustering
hclust_data <- rfm[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
hclust_model <- hclust(dist(hclust_data))
# Specify the desired number of clusters
desired_clusters <- 4
# Determine the cutoff height for the desired number of clusters
cutoff_height <- hclust_model$height[length(hclust_model$height) - (desired_clusters - 1)]
cat("Cut-off height at ", desired_clusters, " clusters:", cutoff_height, "\n")
# Plot the dendogram with an indication of the cut-off
plot(hclust_model, labels = FALSE)
abline(h = cutoff_height, col = "red", lty = 2)
# Determine and visualise the optimal number of clusters up to 10 with bootstrapping up to 10
fviz_nbclust(hclust_data, hcut, method = "wss")
fviz_nbclust(hclust_data, hcut, method = "silhouette")
gap_stat_hclust <- clusGap(hclust_data, hcut, K.max = 10, B = 10)
fviz_gap_stat(gap_stat_hclust)
# Determine the tree cut for a desired number of clusters
cut <- cutree(hclust_model, k = desired_clusters)
# Add cluster labels to the original dataset
rfm$cluster <- as.factor(cut)
# Visualize data points plotted against recency, monetary, and frequency
ggplot(rfm, aes(x = recency, y = monetary, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Monetary", color = "Cluster") +
theme_minimal()
ggplot(rfm, aes(x = recency, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Frequency", color = "Cluster") +
theme_minimal()
ggplot(rfm, aes(x = monetary, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Monetary", y = "Frequency", color = "Cluster") +
theme_minimal()
# Cluster analysis
cluster_analysis <- rfm %>%
group_by(cluster) %>%
summarise(average_recency = mean(recency),
average_frequency = mean(frequency),
average_monetary = mean(monetary),
count_customers = n())
print(cluster_analysis)
# Silhouette analysis
sil <- silhouette(cut, dist(hclust_data))
avg_silhouette <- mean(sil[, "sil_width"])
cat("Average Silhouette Width:", avg_silhouette, "\n")
# Calculate clustering indices using cluster.stats
clustering_indices <- cluster.stats(dist(hclust_data), cut)
print(clustering_indices)
# Extract the Dunn index from the clustering indices list
dunn_index <- clustering_indices$dunn
cat("Dunn Index:", dunn_index, "\n")
# Calculate the Davies-Bouldin Index
db_index <- clusterSim::index.DB(hclust_data, cut)
print(db_index)
Hierarchical Clustering Redone
# Filter out observations in clusters other than cluster 1
rfm_filtered <- rfm[rfm$cluster == 1, ]
# Perform hierarchical clustering on the filtered dataset
hclust_data_filtered <- rfm_filtered[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
hclust_model_filtered <- hclust(dist(hclust_data_filtered))
# Specify the desired number of clusters
desired_clusters_filtered <- 4
# Determine the cutoff height for the desired number of clusters
cutoff_height_filtered <- hclust_model_filtered$height[length(hclust_model_filtered$height) - (desired_clusters_filtered - 1)]
cat("Cut-off height at", desired_clusters_filtered, "cluster(s):", cutoff_height_filtered, "\n")
# Plot the dendrogram with an indication of the cut-off
plot(hclust_model_filtered, labels = FALSE)
abline(h = cutoff_height_filtered, col = "red", lty = 2)
# Determine and visualize the optimal number of clusters up to 10 with bootstrapping up to 10
fviz_nbclust(hclust_data_filtered, hcut, method = "wss")
fviz_nbclust(hclust_data_filtered, hcut, method = "silhouette")
gap_stat_hclust_filtered <- clusGap(hclust_data_filtered, hcut, K.max = 10, B = 10)
fviz_gap_stat(gap_stat_hclust_filtered)
# Determine the tree cut for a desired number of clusters
cut_filtered <- cutree(hclust_model_filtered, k = desired_clusters_filtered)
# Add cluster labels to the filtered dataset
rfm_filtered$cluster <- as.factor(cut_filtered)
# Visualize data points plotted against recency, monetary, and frequency
ggplot(rfm_filtered, aes(x = recency, y = monetary, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Monetary", color = "Cluster") +
theme_minimal()
ggplot(rfm_filtered, aes(x = recency, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Recency", y = "Frequency", color = "Cluster") +
theme_minimal()
ggplot(rfm_filtered, aes(x = monetary, y = frequency, color = cluster)) +
geom_point() +
labs(x = "Monetary", y = "Frequency", color = "Cluster") +
theme_minimal()
# Cluster analysis
cluster_analysis_filtered <- rfm_filtered %>%
group_by(cluster) %>%
summarise(average_recency = mean(recency),
average_frequency = mean(frequency),
average_monetary = mean(monetary),
count_customers = n())
print(cluster_analysis_filtered)
# Silhouette analysis
sil_filtered <- silhouette(cut_filtered, dist(hclust_data_filtered))
avg_silhouette_filtered <- mean(sil_filtered[, "sil_width"])
cat("Average Silhouette Width:", avg_silhouette_filtered, "\n")
# Calculate clustering indices using cluster.stats
clustering_indices_filtered <- cluster.stats(dist(hclust_data_filtered), cut_filtered)
print(clustering_indices_filtered)
# Extract the Dunn index from the clustering indices list
dunn_index_filtered <- clustering_indices_filtered$dunn
cat("Dunn Index:", dunn_index_filtered, "\n")
# Calculate the Davies-Bouldin Index
db_index_filtered <- clusterSim::index.DB(hclust_data_filtered, cut_filtered)
print(db_index_filtered)
DBSCAN
# Load required libraries
library(ggplot2)
library(dplyr)
library(factoextra)
library(cluster)
library(fpc)
library(flexclust)
library(mclust)
# Load the Online Retail dataset
data <- Online_Retail
# Data preprocessing
data <- data %>%
dplyr::filter(!is.na(CustomerID)) %>%
dplyr::select(CustomerID, InvoiceNo, InvoiceDate, UnitPrice)
# Calculate total spending (monetary value) per customer
monetary <- data %>%
group_by(CustomerID) %>%
summarise(monetary = sum(UnitPrice))
# Calculate the recency and frequency variables
# Recency is calculated by the time elapsed since the last day of the dataset
# Frequency is calculated by summing up the distinct invoices of a customer
recency <- data %>%
group_by(CustomerID) %>%
summarise(recency = as.numeric(difftime(max(data$InvoiceDate), max(InvoiceDate), units = "days")),
frequency = n_distinct(InvoiceNo))
# Merge RFM variables with monetary value
rfm <- left_join(recency, monetary, by = "CustomerID")
# Create distance matrix of RFM data frame
# Perform DBSCAN clustering
dbscan_model <- dbscan(dist(rfm), eps = 0.5, MinPts = 5, scale = TRUE, method = "raw", showplot = 1)
# Add cluster labels to the original dataset
rfm$cluster <- as.factor(dbscan_model$cluster)
# Visualize the clusters
fviz_cluster(dbscan_model, data = normalized_data)
# Cluster analysis
cluster_analysis <- rfm %>%
group_by(cluster) %>%
summarise(average_recency = mean(recency),
average_frequency = mean(frequency),
average_monetary = mean(monetary),
count_customers = n())
print(cluster_analysis)
# Silhouette analysis
sil <- silhouette(dbscan_model$cluster, dist(normalized_data))
avg_silhouette <- mean(sil[, "sil_width"])
cat("Average Silhouette Width:", avg_silhouette, "\n")
# Extract cluster labels from the DBSCAN model
dbscan_labels <- dbscan_model$cluster
# Calculate clustering indices using cluster.stats
clustering_indices <- cluster.stats(dist(rfm), dbscan_labels)
print(clustering_indices)
# Extract the Dunn index from the clustering indices list
dunn_index <- clustering_indices$dunn
cat("Dunn Index:", dunn_index, "\n")
# Calculate the Davies-Bouldin Index
db_index <- clusterSim::index.DB(rfm, dbscan_labels)
print(db_index)