Customer Segmentation K-Means Clustering

# Load required libraries
library(ggplot2)
install.packages("dplyr")
Warning in install.packages :
  package ‘dplyr’ is in use and will not be installed
library(dplyr)
library(factoextra)
library(cluster)
library(fpc)
install.packages("flexclust")
Error in install.packages : Updating loaded packages
library(flexclust)
install.packages("mclust")
Error in install.packages : Updating loaded packages
library(mclust)
install.packages("clusterSim")
Error in install.packages : Updating loaded packages
library(cluster)
install.packages("hopkins")
Error in install.packages : Updating loaded packages
library(hopkins)

# Load the Online Retail dataset
data <- Online_Retail

# Data preprocessing
data <- data %>%
  dplyr::filter(!is.na(CustomerID)) %>%
  dplyr::select(CustomerID, InvoiceNo, InvoiceDate, UnitPrice)

# Calculate total spending (monetary value) per customer
monetary <- data %>%
  group_by(CustomerID) %>%
  summarise(monetary = sum(UnitPrice))

# Calculate the recency and frequency variables
# Recency is calculated by the time elapsed since the last day of the dataset
# Frequency is calculated by summing up the distinct invoices of a customer
recency <- data %>%
  group_by(CustomerID) %>%
  summarise(recency = as.numeric(difftime(max(data$InvoiceDate), max(InvoiceDate), units = "days")),
            frequency = n_distinct(InvoiceNo))

# Merge RFM variables with monetary value
rfm <- left_join(recency, monetary, by = "CustomerID")

# Data normalization
rfm$recency_scaled <- scale(rfm$recency)
rfm$frequency_scaled <- scale(rfm$frequency)
rfm$monetary_scaled <- scale(rfm$monetary)

# Prepare data frame to use for clustering
kmeans_data <- rfm[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
Error in exists(cacheKey, where = .rs.WorkingDataEnv, inherits = FALSE) : 
  invalid first argument
Error in assign(cacheKey, frame, .rs.CachedDataEnv) : 
  attempt to use zero-length variable name
# Calculate the Hopkins statistic
hopkins_stat <- hopkins::hopkins(X = as.matrix(kmeans_data), m = nrow(kmeans_data) - 1, method = "simple")
cat("Hopkins Statistic:", hopkins_stat, "\n")
Hopkins Statistic: 0.9967299 
# Prompt the user to choose the number of clusters based on the elbow plot
chosen_k <- readline(prompt = "Enter the optimal number of clusters based on the elbow plot: ")
4
chosen_k <- as.integer(chosen_k)

# Perform K-means clustering with chosen number of clusters
set.seed(123)
kmeans_model <- kmeans(kmeans_data, centers = chosen_k, nstart = 25)

# Add cluster labels to the original dataset
rfm$cluster <- as.factor(kmeans_model$cluster)

# Visualize the clusters
fviz_cluster(kmeans_model, data = kmeans_data)


# Determine and visualise the optimal number of clusters
fviz_nbclust(kmeans_data, kmeans, method = "wss")

fviz_nbclust(kmeans_data, kmeans, method = "silhouette")

fviz_nbclust(kmeans_data, kmeans, method = "gap_stat")
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 100)  [one "." per sample]:
.......
Warning: did not converge in 10 iterations
............
Warning: did not converge in 10 iterations
..........
Warning: did not converge in 10 iterations
.........
Warning: Quick-TRANSfer stage steps exceeded maximum (= 218600)
.....
Warning: did not converge in 10 iterations
....... 50 
..................
Warning: did not converge in 10 iterationsWarning: did not converge in 10 iterations
....
Warning: did not converge in 10 iterations
............
Warning: did not converge in 10 iterations
................ 100 

# Visualize data points plotted against recency, monetary, and frequency
ggplot(rfm, aes(x = recency, y = monetary, color = cluster)) +
  geom_point() +
  labs(x = "Recency", y = "Monetary", color = "Cluster") +
  theme_minimal()


ggplot(rfm, aes(x = recency, y = frequency, color = cluster)) +
  geom_point() +
  labs(x = "Recency", y = "Frequency", color = "Cluster") +
  theme_minimal()


ggplot(rfm, aes(x = monetary, y = frequency, color = cluster)) +
  geom_point() +
  labs(x = "Monetary", y = "Frequency", color = "Cluster") +
  theme_minimal()


# Cluster analysis
cluster_analysis <- rfm %>%
  group_by(cluster) %>%
  summarise(average_recency = mean(recency),
            average_frequency = mean(frequency),
            average_monetary = mean(monetary),
            count_customers = n())

print(cluster_analysis)

# Silhouette analysis
sil <- silhouette(kmeans_model$cluster, dist(kmeans_data))
avg_silhouette <- mean(sil[, "sil_width"])

cat("Average Silhouette Width:", avg_silhouette, "\n")
Average Silhouette Width: 0.6064892 
# Calculate clustering indices using cluster.stats
clustering_indices <- cluster.stats(dist(kmeans_data), kmeans_model$cluster)
print(clustering_indices)
$n
[1] 4372

$cluster.number
[1] 4

$cluster.size
[1]  227 1066    7 3072

$min.cluster.size
[1] 7

$noisen
[1] 0

$diameter
[1] 13.829273  2.512074 30.606993  3.874338

$average.distance
[1]  2.1388173  0.8135454 17.4696282  0.6914000

$median.distance
[1]  1.3177458  0.7170029 17.3298115  0.6229301

$separation
[1] 0.08155842 0.02178571 9.25206163 0.02178571

$average.toother
[1]  3.277427  2.310772 26.852580  2.400118

$separation.matrix
           [,1]        [,2]      [,3]        [,4]
[1,] 0.00000000  0.95490609  9.252062  0.08155842
[2,] 0.95490609  0.00000000 18.739599  0.02178571
[3,] 9.25206163 18.73959866  0.000000 16.26086404
[4,] 0.08155842  0.02178571 16.260864  0.00000000

$ave.between.matrix
          [,1]      [,2]     [,3]      [,4]
[1,]  0.000000  4.085878 25.15694  2.947035
[2,]  4.085878  0.000000 27.16396  2.122972
[3,] 25.156939 27.163963  0.00000 26.869825
[4,]  2.947035  2.122972 26.86983  0.000000

$average.between
[1] 2.548275

$average.within
[1] 0.8231975

$n.between
[1] 4244633

$n.within
[1] 5310373

$max.diameter
[1] 30.60699

$min.separation
[1] 0.02178571

$within.cluster.ss
[1] 3669.908

$clus.avg.silwidths
        1         2         3         4 
0.2301737 0.5796220 0.2833442 0.6443557 

$avg.silwidth
[1] 0.6064892

$g2
NULL

$g3
NULL

$pearsongamma
[1] 0.4766032

$dunn
[1] 0.0007117885

$dunn2
[1] 0.1215236

$entropy
[1] 0.7559626

$wb.ratio
[1] 0.323041

$ch
[1] 3746.454

$cwidegap
[1]  4.3314064  0.6296994 11.8906586  1.4109591

$widestgap
[1] 11.89066

$sindex
[1] 0.1784922

$corrected.rand
NULL

$vi
NULL
# Extract the Dunn index from the clustering indices list
dunn_index <- clustering_indices$dunn
cat("Dunn Index:", dunn_index, "\n")
Dunn Index: 0.0007117885 
# Calculate the Davies-Bouldin Index
db_index <- clusterSim::index.DB(kmeans_data, kmeans_model$cluster)
print(db_index)
$DB
[1] 0.8493978

$r
[1] 0.9988550 0.7495313 0.6503498 0.9988550

$R
          [,1]      [,2]      [,3]      [,4]
[1,]       Inf 0.7495313 0.6503498 0.9988550
[2,] 0.7495313       Inf 0.5297251 0.6043077
[3,] 0.6503498 0.5297251       Inf 0.5315909
[4,] 0.9988550 0.6043077 0.5315909       Inf

$d
          1         2        3         4
1  0.000000  3.852261 22.53791  2.770151
2  3.852261  0.000000 24.80879  2.070622
3 22.537912 24.808791  0.00000 24.495207
4  2.770151  2.070622 24.49521  0.000000

$S
[1]  2.2015383  0.6858520 12.4559884  0.5654409

$centers
           [,1]        [,2]        [,3]
[1,] -0.8005187  2.49251048  0.90014256
[2,]  1.5671071 -0.35021827 -0.17370844
[3,] -0.5162949 11.17419832 21.69689662
[4,] -0.4834649 -0.08811413 -0.05567625

K-Means Redone

# Remove observations in cluster 3
rfm_filtered <- rfm[rfm$cluster != 3, ]

# Prepare filtered data frame for clustering
kmeans_data_filtered <- rfm_filtered[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]

# Calculate the Hopkins statistic
hopkins_stat_filtered <- hopkins::hopkins(X = as.matrix(kmeans_data_filtered), m = nrow(kmeans_data_filtered) - 1, method = "simple")
cat("Hopkins Statistic (Filtered):", hopkins_stat_filtered, "\n")
Hopkins Statistic (Filtered): 0.9991269 
# Perform K-means clustering on filtered data
set.seed(123)
kmeans_model_filtered <- kmeans(kmeans_data_filtered, centers = chosen_k, nstart = 25)

# Add cluster labels to the original dataset
rfm_filtered$cluster <- as.factor(kmeans_model_filtered$cluster)

# Visualize the clusters after filtering
fviz_cluster(kmeans_model_filtered, data = kmeans_data_filtered)


# Determine and visualise the optimal number of clusters
fviz_nbclust(kmeans_data_filtered, kmeans, method = "wss")

fviz_nbclust(kmeans_data_filtered, kmeans, method = "silhouette")

fviz_nbclust(kmeans_data_filtered, kmeans, method = "gap_stat")
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 100)  [one "." per sample]:
....
Warning: Quick-TRANSfer stage steps exceeded maximum (= 218250)
.............................................. 50 
............................
Warning: did not converge in 10 iterations
...................... 100 

# Visualize data points plotted against recency, monetary, and frequency after filtering
ggplot(rfm_filtered, aes(x = recency, y = monetary, color = cluster)) +
  geom_point() +
  labs(x = "Recency", y = "Monetary", color = "Cluster") +
  theme_minimal()


ggplot(rfm_filtered, aes(x = recency, y = frequency, color = cluster)) +
  geom_point() +
  labs(x = "Recency", y = "Frequency", color = "Cluster") +
  theme_minimal()


ggplot(rfm_filtered, aes(x = monetary, y = frequency, color = cluster)) +
  geom_point() +
  labs(x = "Monetary", y = "Frequency", color = "Cluster") +
  theme_minimal()


# Cluster analysis on filtered data
cluster_analysis_filtered <- rfm_filtered %>%
  group_by(cluster) %>%
  summarise(average_recency = mean(recency),
            average_frequency = mean(frequency),
            average_monetary = mean(monetary),
            count_customers = n())

print(cluster_analysis_filtered)

# Silhouette analysis on filtered data
sil_filtered <- silhouette(kmeans_model_filtered$cluster, dist(kmeans_data_filtered))
avg_silhouette_filtered <- mean(sil_filtered[, "sil_width"])

cat("Average Silhouette Width (filtered data):", avg_silhouette_filtered, "\n")
Average Silhouette Width (filtered data): 0.5383663 
# Calculate clustering indices using cluster.stats on filtered data
clustering_indices_filtered <- cluster.stats(dist(kmeans_data_filtered), kmeans_model_filtered$cluster)
print(clustering_indices_filtered)
$n
[1] 4365

$cluster.number
[1] 4

$cluster.size
[1] 2733  527 1050   55

$min.cluster.size
[1] 55

$noisen
[1] 0

$diameter
[1]  2.475988  5.103040  2.463134 13.829273

$average.distance
[1] 0.5799676 0.9411030 0.8045243 3.5230297

$median.distance
[1] 0.5368057 0.7973409 0.7098496 2.5988504

$separation
[1] 0.01229936 0.02016448 0.01229936 0.11192763

$average.toother
[1] 2.004758 1.915521 2.266522 5.715890

$separation.matrix
           [,1]       [,2]       [,3]      [,4]
[1,] 0.00000000 0.02016448 0.01229936 2.7028272
[2,] 0.02016448 0.00000000 0.34277648 0.1119276
[3,] 0.01229936 0.34277648 0.00000000 2.4331014
[4,] 2.70282717 0.11192763 2.43310143 0.0000000

$ave.between.matrix
         [,1]     [,2]     [,3]     [,4]
[1,] 0.000000 1.496080 2.066557 5.699031
[2,] 1.496080 0.000000 2.873580 4.467709
[3,] 2.066557 2.873580 0.000000 6.386240
[4,] 5.699031 4.467709 6.386240 0.000000

$average.between
[1] 2.162626

$average.within
[1] 0.7146689

$n.between
[1] 5100341

$n.within
[1] 4424089

$max.diameter
[1] 13.82927

$min.separation
[1] 0.01229936

$within.cluster.ss
[1] 1963.399

$clus.avg.silwidths
        1         2         3         4 
0.5738919 0.3302756 0.5701521 0.1601315 

$avg.silwidth
[1] 0.5383663

$g2
NULL

$g3
NULL

$pearsongamma
[1] 0.6268952

$dunn
[1] 0.0008893713

$dunn2
[1] 0.4246573

$entropy
[1] 0.9462661

$wb.ratio
[1] 0.3304634

$ch
[1] 4357.639

$cwidegap
[1] 0.5992575 1.5073112 0.6296994 4.3314064

$widestgap
[1] 4.331406

$sindex
[1] 0.1270792

$corrected.rand
NULL

$vi
NULL
# Extract the Dunn index from the clustering indices list for filtered data
dunn_index_filtered <- clustering_indices_filtered$dunn
cat("Dunn Index (filtered data):", dunn_index_filtered, "\n")
Dunn Index (filtered data): 0.0008893713 
# Calculate the Davies-Bouldin Index for filtered data
db_index_filtered <- clusterSim::index.DB(kmeans_data_filtered, kmeans_model_filtered$cluster)
print(db_index_filtered)
$DB
[1] 0.8847213

$r
[1] 0.9225626 0.9874744 0.6413737 0.9874744

$R
          [,1]      [,2]      [,3]      [,4]
[1,]       Inf 0.9225626 0.5626750 0.6812017
[2,] 0.9225626       Inf 0.5281891 0.9874744
[3,] 0.5626750 0.5281891       Inf 0.6413737
[4,] 0.6812017 0.9874744 0.6413737       Inf

$d
         1        2        3        4
1 0.000000 1.360032 2.035747 5.357418
2 1.360032 0.000000 2.774485 4.019821
3 2.035747 2.774485 0.000000 6.018674
4 5.357418 4.019821 6.018674 0.000000

$S
[1] 0.4673633 0.7873519 0.6781006 3.1821184

$centers
           [,1]       [,2]       [,3]
[1,] -0.4455741 -0.1901157 -0.1023983
[2,] -0.7504796  1.0389510  0.3937024
[3,]  1.5825233 -0.3510466 -0.1744255
[4,] -0.8141591  4.7716090  1.8843971

Hierarchical Clustering

# Load required libraries
library(ggplot2)
install.packages("dplyr")
Error in install.packages : Updating loaded packages
library(dplyr)
library(factoextra)
library(cluster)
library(fpc)
library(flexclust)
library(mclust)
library(clusterSim)
library(hopkins)

# Load the Online Retail dataset
data <- Online_Retail

# Data preprocessing
data <- data %>%
  dplyr::filter(!is.na(CustomerID)) %>%
  dplyr::select(CustomerID, InvoiceNo, InvoiceDate, UnitPrice)

# Calculate total spending (monetary value) per customer
monetary <- data %>%
  group_by(CustomerID) %>%
  summarise(monetary = sum(UnitPrice))

# Calculate the recency and frequency variables
# Recency is calculated by the time elapsed since the last day of the dataset
# Frequency is calculated by summing up the distinct invoices of a customer
recency <- data %>%
  group_by(CustomerID) %>%
  summarise(recency = as.numeric(difftime(max(data$InvoiceDate), max(InvoiceDate), units = "days")),
            frequency = n_distinct(InvoiceNo))

# Merge RFM variables with monetary value
rfm <- left_join(recency, monetary, by = "CustomerID")

# Calculate the Hopkins statistic
hopkins_stat <- hopkins::hopkins(X = as.matrix(rfm), m = nrow(rfm) - 1, method = "simple")
install.packages("dplyr")
Warning in install.packages :
  package ‘dplyr’ is in use and will not be installed
print(hopkins_stat)
[1] 0.9937592
# Check the value of the Hopkins statistic
cat("Hopkins Statistic:", hopkins_stat, "\n")
Hopkins Statistic: 0.9937592 
# Data normalization
rfm$recency_scaled <- scale(rfm$recency)
rfm$frequency_scaled <- scale(rfm$frequency)
rfm$monetary_scaled <- scale(rfm$monetary)

# Perform hierarchical clustering
hclust_data <- rfm[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
hclust_model <- hclust(dist(hclust_data))

# Specify the desired number of clusters
desired_clusters <- 4

# Determine the cutoff height for the desired number of clusters
cutoff_height <- hclust_model$height[length(hclust_model$height) - (desired_clusters - 1)]
cat("Cut-off height at ", desired_clusters, " clusters:", cutoff_height, "\n")
Cut-off height at  4  clusters: 14.77144 
# Plot the dendogram with an indication of the cut-off
plot(hclust_model, labels = FALSE)
abline(h = cutoff_height, col = "red", lty = 2)


# Determine and visualise the optimal number of clusters up to 10 with bootstrapping up to 10
fviz_nbclust(hclust_data, hcut, method = "wss")

fviz_nbclust(hclust_data, hcut, method = "silhouette")

gap_stat_hclust <- clusGap(hclust_data, hcut, K.max = 10, B = 10)
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 10)  [one "." per sample]:
.......... 10 
fviz_gap_stat(gap_stat_hclust)


# Determine the tree cut for a desired number of clusters
cut <- cutree(hclust_model, k = desired_clusters)

# Add cluster labels to the original dataset
rfm$cluster <- as.factor(cut)

# Visualize data points plotted against recency, monetary, and frequency
ggplot(rfm, aes(x = recency, y = monetary, color = cluster)) +
  geom_point() +
  labs(x = "Recency", y = "Monetary", color = "Cluster") +
  theme_minimal()


ggplot(rfm, aes(x = recency, y = frequency, color = cluster)) +
  geom_point() +
  labs(x = "Recency", y = "Frequency", color = "Cluster") +
  theme_minimal()


ggplot(rfm, aes(x = monetary, y = frequency, color = cluster)) +
  geom_point() +
  labs(x = "Monetary", y = "Frequency", color = "Cluster") +
  theme_minimal()


# Cluster analysis
cluster_analysis <- rfm %>%
  group_by(cluster) %>%
  summarise(average_recency = mean(recency),
            average_frequency = mean(frequency),
            average_monetary = mean(monetary),
            count_customers = n())

print(cluster_analysis)

# Silhouette analysis
sil <- silhouette(cut, dist(hclust_data))
avg_silhouette <- mean(sil[, "sil_width"])

cat("Average Silhouette Width:", avg_silhouette, "\n")
Average Silhouette Width: 0.8031784 
# Calculate clustering indices using cluster.stats
clustering_indices <- cluster.stats(dist(hclust_data), cut)
print(clustering_indices)
$n
[1] 4372

$cluster.number
[1] 4

$cluster.size
[1] 4340   25    4    3

$min.cluster.size
[1] 3

$noisen
[1] 0

$diameter
[1] 10.94129 10.17666 14.77144 12.67400

$average.distance
[1]  1.378864  3.601903 10.758320 10.571294

$median.distance
[1]  1.110563  3.180654 12.409483 11.886022

$separation
[1]  0.6352564  0.6352564  9.2520616 10.3900852

$average.toother
[1] 11.567897  7.307411 25.705291 28.374071

$separation.matrix
           [,1]       [,2]      [,3]     [,4]
[1,]  0.0000000  0.6352564  9.252062 18.40517
[2,]  0.6352564  0.0000000 13.717897 10.66982
[3,]  9.2520616 13.7178974  0.000000 10.39009
[4,] 18.4051659 10.6698195 10.390085  0.00000

$ave.between.matrix
          [,1]      [,2]     [,3]     [,4]
[1,]  0.000000  7.282036 25.71830 28.41620
[2,]  7.282036  0.000000 23.82517 21.99288
[3,] 25.718302 23.825174  0.00000 22.54987
[4,] 28.416196 21.992881 22.54987  0.00000

$average.between
[1] 11.58328

$average.within
[1] 1.406465

$n.between
[1] 139067

$n.within
[1] 9415939

$max.diameter
[1] 14.77144

$min.separation
[1] 0.6352564

$within.cluster.ss
[1] 6916.75

$clus.avg.silwidths
        1         2         3         4 
0.8054802 0.4963800 0.4678211 0.4770577 

$avg.silwidth
[1] 0.8031784

$g2
NULL

$g3
NULL

$pearsongamma
[1] 0.6381906

$dunn
[1] 0.04300573

$dunn2
[1] 0.6768748

$entropy
[1] 0.04822162

$wb.ratio
[1] 0.121422

$ch
[1] 1304.332

$cwidegap
[1]  4.331406  3.433129 11.890659 11.886022

$widestgap
[1] 11.89066

$sindex
[1] 2.794022

$corrected.rand
NULL

$vi
NULL
# Extract the Dunn index from the clustering indices list
dunn_index <- clustering_indices$dunn
cat("Dunn Index:", dunn_index, "\n")
Dunn Index: 0.04300573 
# Calculate the Davies-Bouldin Index
db_index <- clusterSim::index.DB(hclust_data, cut)
print(db_index)
$DB
[1] 0.6138877

$r
[1] 0.6005045 0.6005045 0.6272708 0.6272708

$R
          [,1]      [,2]      [,3]      [,4]
[1,]       Inf 0.6005045 0.3243178 0.2654419
[2,] 0.6005045       Inf 0.4295965 0.4297899
[3,] 0.3243178 0.4295965       Inf 0.6272708
[4,] 0.2654419 0.4297899 0.6272708       Inf

$d
          1         2        3        4
1  0.000000  7.085531 25.42511 28.16053
2  7.085531  0.000000 23.45872 21.65466
3 25.425107 23.458720  0.00000 21.19958
4 28.160533 21.654660 21.19958  0.00000

$S
[1] 1.211463 3.043431 7.034353 6.263523

$centers
             [,1]        [,2]        [,3]
[1,]  0.005880044 -0.05629346 -0.04699245
[2,] -0.876213115  6.64376839  2.08275854
[3,] -0.226780490  2.80278496 25.21577909
[4,] -0.902314125 22.33608281 17.00505334

Hierarchical Clustering Redone

# Filter out observations in clusters other than cluster 1
rfm_filtered <- rfm[rfm$cluster == 1, ]

# Perform hierarchical clustering on the filtered dataset
hclust_data_filtered <- rfm_filtered[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
hclust_model_filtered <- hclust(dist(hclust_data_filtered))

# Specify the desired number of clusters
desired_clusters_filtered <- 4

# Determine the cutoff height for the desired number of clusters
cutoff_height_filtered <- hclust_model_filtered$height[length(hclust_model_filtered$height) - (desired_clusters_filtered - 1)]
cat("Cut-off height at", desired_clusters_filtered, "cluster(s):", cutoff_height_filtered, "\n")
Cut-off height at 4 cluster(s): 4.637608 
# Plot the dendrogram with an indication of the cut-off
plot(hclust_model_filtered, labels = FALSE)
abline(h = cutoff_height_filtered, col = "red", lty = 2)


# Determine and visualize the optimal number of clusters up to 10 with bootstrapping up to 10
fviz_nbclust(hclust_data_filtered, hcut, method = "wss")

fviz_nbclust(hclust_data_filtered, hcut, method = "silhouette")

gap_stat_hclust_filtered <- clusGap(hclust_data_filtered, hcut, K.max = 10, B = 10)
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 10)  [one "." per sample]:
.......... 10 
fviz_gap_stat(gap_stat_hclust_filtered)


# Determine the tree cut for a desired number of clusters
cut_filtered <- cutree(hclust_model_filtered, k = desired_clusters_filtered)

# Add cluster labels to the filtered dataset
rfm_filtered$cluster <- as.factor(cut_filtered)

# Visualize data points plotted against recency, monetary, and frequency
ggplot(rfm_filtered, aes(x = recency, y = monetary, color = cluster)) +
  geom_point() +
  labs(x = "Recency", y = "Monetary", color = "Cluster") +
  theme_minimal()


ggplot(rfm_filtered, aes(x = recency, y = frequency, color = cluster)) +
  geom_point() +
  labs(x = "Recency", y = "Frequency", color = "Cluster") +
  theme_minimal()


ggplot(rfm_filtered, aes(x = monetary, y = frequency, color = cluster)) +
  geom_point() +
  labs(x = "Monetary", y = "Frequency", color = "Cluster") +
  theme_minimal()


# Cluster analysis
cluster_analysis_filtered <- rfm_filtered %>%
  group_by(cluster) %>%
  summarise(average_recency = mean(recency),
            average_frequency = mean(frequency),
            average_monetary = mean(monetary),
            count_customers = n())

print(cluster_analysis_filtered)

# Silhouette analysis
sil_filtered <- silhouette(cut_filtered, dist(hclust_data_filtered))
avg_silhouette_filtered <- mean(sil_filtered[, "sil_width"])

cat("Average Silhouette Width:", avg_silhouette_filtered, "\n")
Average Silhouette Width: 0.5618641 
# Calculate clustering indices using cluster.stats
clustering_indices_filtered <- cluster.stats(dist(hclust_data_filtered), cut_filtered)
print(clustering_indices_filtered)
$n
[1] 4340

$cluster.number
[1] 4

$cluster.size
[1]  697 3453  187    3

$min.cluster.size
[1] 3

$noisen
[1] 0

$diameter
[1] 1.637197 3.874338 3.821206 4.637608

$average.distance
[1] 0.5996526 0.8388544 1.2124658 3.1414414

$median.distance
[1] 0.5274782 0.7316045 1.0796680 4.3314064

$separation
[1] 0.02074356 0.02074356 0.10850601 3.13231909

$average.toother
[1] 2.427511 2.382063 2.671076 7.659358

$separation.matrix
           [,1]       [,2]     [,3]     [,4]
[1,] 0.00000000 0.02074356 1.954498 6.442425
[2,] 0.02074356 0.00000000 0.108506 3.159829
[3,] 1.95449833 0.10850601 0.000000 3.132319
[4,] 6.44242505 3.15982900 3.132319 0.000000

$ave.between.matrix
         [,1]     [,2]     [,3]     [,4]
[1,] 0.000000 2.346175 3.836342 8.228365
[2,] 2.346175 0.000000 2.432268 7.590602
[3,] 3.836342 2.432268 0.000000 6.808108
[4,] 8.228365 7.590602 6.808108 0.000000

$average.between
[1] 2.445977

$average.within
[1] 0.8181285

$n.between
[1] 3195802

$n.within
[1] 6219828

$max.diameter
[1] 4.637608

$min.separation
[1] 0.02074356

$within.cluster.ss
[1] 2052.066

$clus.avg.silwidths
        1         2         3         4 
0.7356960 0.5313029 0.4786358 0.5387765 

$avg.silwidth
[1] 0.5618641

$g2
NULL

$g3
NULL

$pearsongamma
[1] 0.751978

$dunn
[1] 0.0044729

$dunn2
[1] 0.7468465

$entropy
[1] 0.6161344

$wb.ratio
[1] 0.3344792

$ch
[1] 3040.947

$cwidegap
[1] 0.3619369 1.4109591 2.8213618 4.3314064

$widestgap
[1] 4.331406

$sindex
[1] 0.1846325

$corrected.rand
NULL

$vi
NULL
# Extract the Dunn index from the clustering indices list
dunn_index_filtered <- clustering_indices_filtered$dunn
cat("Dunn Index:", dunn_index_filtered, "\n")
Dunn Index: 0.0044729 
# Calculate the Davies-Bouldin Index
db_index_filtered <- clusterSim::index.DB(hclust_data_filtered, cut_filtered)
print(db_index_filtered)
$DB
[1] 0.6150994

$r
[1] 0.5214179 0.7381963 0.7381963 0.4625872

$R
          [,1]      [,2]      [,3]      [,4]
[1,]       Inf 0.5214179 0.4004371 0.3234591
[2,] 0.5214179       Inf 0.7381963 0.3764138
[3,] 0.4004371 0.7381963       Inf 0.4625872
[4,] 0.3234591 0.3764138 0.4625872       Inf

$d
         1        2        3        4
1 0.000000 2.305232 3.740158 8.118145
2 2.305232 0.000000 2.288395 7.485045
3 3.740158 2.288395 0.000000 6.729939
4 8.118145 7.485045 6.729939 0.000000

$S
[1] 0.5052015 0.6967875 0.9924968 2.1206864

$centers
           [,1]       [,2]        [,3]
[1,]  1.9435668 -0.3838634 -0.18685373
[2,] -0.3412198 -0.1040612 -0.06213785
[3,] -0.7944171  2.0275954  0.63600386
[4,] -0.7862184  1.1341113  7.30636307

DBSCAN

# Load required libraries
library(ggplot2)
library(dplyr)
library(factoextra)
library(cluster)
library(fpc)
library(flexclust)
library(mclust)

# Load the Online Retail dataset
data <- Online_Retail

# Data preprocessing
data <- data %>%
  dplyr::filter(!is.na(CustomerID)) %>%
  dplyr::select(CustomerID, InvoiceNo, InvoiceDate, UnitPrice)

# Calculate total spending (monetary value) per customer
monetary <- data %>%
  group_by(CustomerID) %>%
  summarise(monetary = sum(UnitPrice))

# Calculate the recency and frequency variables
# Recency is calculated by the time elapsed since the last day of the dataset
# Frequency is calculated by summing up the distinct invoices of a customer
recency <- data %>%
  group_by(CustomerID) %>%
  summarise(recency = as.numeric(difftime(max(data$InvoiceDate), max(InvoiceDate), units = "days")),
            frequency = n_distinct(InvoiceNo))

# Merge RFM variables with monetary value
rfm <- left_join(recency, monetary, by = "CustomerID")

# Create distance matrix of RFM data frame 

# Perform DBSCAN clustering
dbscan_model <- dbscan(dist(rfm), eps = 0.5, MinPts = 5, scale = TRUE, method = "raw", showplot = 1)

# Add cluster labels to the original dataset
rfm$cluster <- as.factor(dbscan_model$cluster)

# Visualize the clusters
fviz_cluster(dbscan_model, data = normalized_data)

# Cluster analysis
cluster_analysis <- rfm %>%
  group_by(cluster) %>%
  summarise(average_recency = mean(recency),
            average_frequency = mean(frequency),
            average_monetary = mean(monetary),
            count_customers = n())

print(cluster_analysis)

# Silhouette analysis
sil <- silhouette(dbscan_model$cluster, dist(normalized_data))
avg_silhouette <- mean(sil[, "sil_width"])
cat("Average Silhouette Width:", avg_silhouette, "\n")

# Extract cluster labels from the DBSCAN model
dbscan_labels <- dbscan_model$cluster

# Calculate clustering indices using cluster.stats
clustering_indices <- cluster.stats(dist(rfm), dbscan_labels)
print(clustering_indices)

# Extract the Dunn index from the clustering indices list
dunn_index <- clustering_indices$dunn
cat("Dunn Index:", dunn_index, "\n")

# Calculate the Davies-Bouldin Index
db_index <- clusterSim::index.DB(rfm, dbscan_labels)
print(db_index)
---
title: "RFM Customer Segmentation"
author: "Frederick Hagelstein"
date: "June 16 2023"
output: html_notebook
---

Customer Segmentation
K-Means Clustering
```{r}
# Load required libraries
library(ggplot2)
install.packages("dplyr")
library(dplyr)
library(factoextra)
library(cluster)
library(fpc)
install.packages("flexclust")
library(flexclust)
install.packages("mclust")
library(mclust)
install.packages("clusterSim")
library(cluster)
install.packages("hopkins")
library(hopkins)

# Load the Online Retail dataset
data <- Online_Retail

# Data preprocessing
data <- data %>%
  dplyr::filter(!is.na(CustomerID)) %>%
  dplyr::select(CustomerID, InvoiceNo, InvoiceDate, UnitPrice)

# Calculate total spending (monetary value) per customer
monetary <- data %>%
  group_by(CustomerID) %>%
  summarise(monetary = sum(UnitPrice))

# Calculate the recency and frequency variables
# Recency is calculated by the time elapsed since the last day of the dataset
# Frequency is calculated by summing up the distinct invoices of a customer
recency <- data %>%
  group_by(CustomerID) %>%
  summarise(recency = as.numeric(difftime(max(data$InvoiceDate), max(InvoiceDate), units = "days")),
            frequency = n_distinct(InvoiceNo))

# Merge RFM variables with monetary value
rfm <- left_join(recency, monetary, by = "CustomerID")

# Data normalization
rfm$recency_scaled <- scale(rfm$recency)
rfm$frequency_scaled <- scale(rfm$frequency)
rfm$monetary_scaled <- scale(rfm$monetary)

# Prepare data frame to use for clustering
kmeans_data <- rfm[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]

# Calculate the Hopkins statistic
hopkins_stat <- hopkins::hopkins(X = as.matrix(kmeans_data), m = nrow(kmeans_data) - 1, method = "simple")
cat("Hopkins Statistic:", hopkins_stat, "\n")

# Prompt the user to choose the number of clusters based on the elbow plot
chosen_k <- readline(prompt = "Enter the optimal number of clusters based on the elbow plot: ")
chosen_k <- as.integer(chosen_k)

# Perform K-means clustering with chosen number of clusters
set.seed(123)
kmeans_model <- kmeans(kmeans_data, centers = chosen_k, nstart = 25)

# Add cluster labels to the original dataset
rfm$cluster <- as.factor(kmeans_model$cluster)

# Visualize the clusters
fviz_cluster(kmeans_model, data = kmeans_data)

# Determine and visualise the optimal number of clusters
fviz_nbclust(kmeans_data, kmeans, method = "wss")
fviz_nbclust(kmeans_data, kmeans, method = "silhouette")
fviz_nbclust(kmeans_data, kmeans, method = "gap_stat")

# Visualize data points plotted against recency, monetary, and frequency
ggplot(rfm, aes(x = recency, y = monetary, color = cluster)) +
  geom_point() +
  labs(x = "Recency", y = "Monetary", color = "Cluster") +
  theme_minimal()

ggplot(rfm, aes(x = recency, y = frequency, color = cluster)) +
  geom_point() +
  labs(x = "Recency", y = "Frequency", color = "Cluster") +
  theme_minimal()

ggplot(rfm, aes(x = monetary, y = frequency, color = cluster)) +
  geom_point() +
  labs(x = "Monetary", y = "Frequency", color = "Cluster") +
  theme_minimal()

# Cluster analysis
cluster_analysis <- rfm %>%
  group_by(cluster) %>%
  summarise(average_recency = mean(recency),
            average_frequency = mean(frequency),
            average_monetary = mean(monetary),
            count_customers = n())

print(cluster_analysis)

# Silhouette analysis
sil <- silhouette(kmeans_model$cluster, dist(kmeans_data))
avg_silhouette <- mean(sil[, "sil_width"])

cat("Average Silhouette Width:", avg_silhouette, "\n")

# Calculate clustering indices using cluster.stats
clustering_indices <- cluster.stats(dist(kmeans_data), kmeans_model$cluster)
print(clustering_indices)

# Extract the Dunn index from the clustering indices list
dunn_index <- clustering_indices$dunn
cat("Dunn Index:", dunn_index, "\n")

# Calculate the Davies-Bouldin Index
db_index <- clusterSim::index.DB(kmeans_data, kmeans_model$cluster)
print(db_index)

```

K-Means Redone
```{r}
# Remove observations in cluster 3
rfm_filtered <- rfm[rfm$cluster != 3, ]

# Prepare filtered data frame for clustering
kmeans_data_filtered <- rfm_filtered[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]

# Calculate the Hopkins statistic
hopkins_stat_filtered <- hopkins::hopkins(X = as.matrix(kmeans_data_filtered), m = nrow(kmeans_data_filtered) - 1, method = "simple")
cat("Hopkins Statistic (Filtered):", hopkins_stat_filtered, "\n")

# Perform K-means clustering on filtered data
set.seed(123)
kmeans_model_filtered <- kmeans(kmeans_data_filtered, centers = chosen_k, nstart = 25)

# Add cluster labels to the original dataset
rfm_filtered$cluster <- as.factor(kmeans_model_filtered$cluster)

# Visualize the clusters after filtering
fviz_cluster(kmeans_model_filtered, data = kmeans_data_filtered)

# Determine and visualise the optimal number of clusters
fviz_nbclust(kmeans_data_filtered, kmeans, method = "wss")
fviz_nbclust(kmeans_data_filtered, kmeans, method = "silhouette")
fviz_nbclust(kmeans_data_filtered, kmeans, method = "gap_stat")

# Visualize data points plotted against recency, monetary, and frequency after filtering
ggplot(rfm_filtered, aes(x = recency, y = monetary, color = cluster)) +
  geom_point() +
  labs(x = "Recency", y = "Monetary", color = "Cluster") +
  theme_minimal()

ggplot(rfm_filtered, aes(x = recency, y = frequency, color = cluster)) +
  geom_point() +
  labs(x = "Recency", y = "Frequency", color = "Cluster") +
  theme_minimal()

ggplot(rfm_filtered, aes(x = monetary, y = frequency, color = cluster)) +
  geom_point() +
  labs(x = "Monetary", y = "Frequency", color = "Cluster") +
  theme_minimal()

# Cluster analysis on filtered data
cluster_analysis_filtered <- rfm_filtered %>%
  group_by(cluster) %>%
  summarise(average_recency = mean(recency),
            average_frequency = mean(frequency),
            average_monetary = mean(monetary),
            count_customers = n())

print(cluster_analysis_filtered)

# Silhouette analysis on filtered data
sil_filtered <- silhouette(kmeans_model_filtered$cluster, dist(kmeans_data_filtered))
avg_silhouette_filtered <- mean(sil_filtered[, "sil_width"])

cat("Average Silhouette Width (filtered data):", avg_silhouette_filtered, "\n")

# Calculate clustering indices using cluster.stats on filtered data
clustering_indices_filtered <- cluster.stats(dist(kmeans_data_filtered), kmeans_model_filtered$cluster)
print(clustering_indices_filtered)

# Extract the Dunn index from the clustering indices list for filtered data
dunn_index_filtered <- clustering_indices_filtered$dunn
cat("Dunn Index (filtered data):", dunn_index_filtered, "\n")

# Calculate the Davies-Bouldin Index for filtered data
db_index_filtered <- clusterSim::index.DB(kmeans_data_filtered, kmeans_model_filtered$cluster)
print(db_index_filtered)

```



Hierarchical Clustering
```{r}
# Load required libraries
library(ggplot2)
install.packages("dplyr")
library(dplyr)
library(factoextra)
library(cluster)
library(fpc)
library(flexclust)
library(mclust)
library(clusterSim)
library(hopkins)

# Load the Online Retail dataset
data <- Online_Retail

# Data preprocessing
data <- data %>%
  dplyr::filter(!is.na(CustomerID)) %>%
  dplyr::select(CustomerID, InvoiceNo, InvoiceDate, UnitPrice)

# Calculate total spending (monetary value) per customer
monetary <- data %>%
  group_by(CustomerID) %>%
  summarise(monetary = sum(UnitPrice))

# Calculate the recency and frequency variables
# Recency is calculated by the time elapsed since the last day of the dataset
# Frequency is calculated by summing up the distinct invoices of a customer
recency <- data %>%
  group_by(CustomerID) %>%
  summarise(recency = as.numeric(difftime(max(data$InvoiceDate), max(InvoiceDate), units = "days")),
            frequency = n_distinct(InvoiceNo))

# Merge RFM variables with monetary value
rfm <- left_join(recency, monetary, by = "CustomerID")

# Calculate the Hopkins statistic
hopkins_stat <- hopkins::hopkins(X = as.matrix(rfm), m = nrow(rfm) - 1, method = "simple")
print(hopkins_stat)

# Check the value of the Hopkins statistic
cat("Hopkins Statistic:", hopkins_stat, "\n")

# Data normalization
rfm$recency_scaled <- scale(rfm$recency)
rfm$frequency_scaled <- scale(rfm$frequency)
rfm$monetary_scaled <- scale(rfm$monetary)

# Perform hierarchical clustering
hclust_data <- rfm[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
hclust_model <- hclust(dist(hclust_data))

# Specify the desired number of clusters
desired_clusters <- 4

# Determine the cutoff height for the desired number of clusters
cutoff_height <- hclust_model$height[length(hclust_model$height) - (desired_clusters - 1)]
cat("Cut-off height at ", desired_clusters, " clusters:", cutoff_height, "\n")

# Plot the dendogram with an indication of the cut-off
plot(hclust_model, labels = FALSE)
abline(h = cutoff_height, col = "red", lty = 2)

# Determine and visualise the optimal number of clusters up to 10 with bootstrapping up to 10
fviz_nbclust(hclust_data, hcut, method = "wss")
fviz_nbclust(hclust_data, hcut, method = "silhouette")
gap_stat_hclust <- clusGap(hclust_data, hcut, K.max = 10, B = 10)
fviz_gap_stat(gap_stat_hclust)

# Determine the tree cut for a desired number of clusters
cut <- cutree(hclust_model, k = desired_clusters)

# Add cluster labels to the original dataset
rfm$cluster <- as.factor(cut)

# Visualize data points plotted against recency, monetary, and frequency
ggplot(rfm, aes(x = recency, y = monetary, color = cluster)) +
  geom_point() +
  labs(x = "Recency", y = "Monetary", color = "Cluster") +
  theme_minimal()

ggplot(rfm, aes(x = recency, y = frequency, color = cluster)) +
  geom_point() +
  labs(x = "Recency", y = "Frequency", color = "Cluster") +
  theme_minimal()

ggplot(rfm, aes(x = monetary, y = frequency, color = cluster)) +
  geom_point() +
  labs(x = "Monetary", y = "Frequency", color = "Cluster") +
  theme_minimal()

# Cluster analysis
cluster_analysis <- rfm %>%
  group_by(cluster) %>%
  summarise(average_recency = mean(recency),
            average_frequency = mean(frequency),
            average_monetary = mean(monetary),
            count_customers = n())

print(cluster_analysis)

# Silhouette analysis
sil <- silhouette(cut, dist(hclust_data))
avg_silhouette <- mean(sil[, "sil_width"])

cat("Average Silhouette Width:", avg_silhouette, "\n")

# Calculate clustering indices using cluster.stats
clustering_indices <- cluster.stats(dist(hclust_data), cut)
print(clustering_indices)

# Extract the Dunn index from the clustering indices list
dunn_index <- clustering_indices$dunn
cat("Dunn Index:", dunn_index, "\n")

# Calculate the Davies-Bouldin Index
db_index <- clusterSim::index.DB(hclust_data, cut)
print(db_index)
```

Hierarchical Clustering Redone
```{r}
# Filter out observations in clusters other than cluster 1
rfm_filtered <- rfm[rfm$cluster == 1, ]

# Perform hierarchical clustering on the filtered dataset
hclust_data_filtered <- rfm_filtered[, c("recency_scaled", "frequency_scaled", "monetary_scaled")]
hclust_model_filtered <- hclust(dist(hclust_data_filtered))

# Specify the desired number of clusters
desired_clusters_filtered <- 4

# Determine the cutoff height for the desired number of clusters
cutoff_height_filtered <- hclust_model_filtered$height[length(hclust_model_filtered$height) - (desired_clusters_filtered - 1)]
cat("Cut-off height at", desired_clusters_filtered, "cluster(s):", cutoff_height_filtered, "\n")

# Plot the dendrogram with an indication of the cut-off
plot(hclust_model_filtered, labels = FALSE)
abline(h = cutoff_height_filtered, col = "red", lty = 2)

# Determine and visualize the optimal number of clusters up to 10 with bootstrapping up to 10
fviz_nbclust(hclust_data_filtered, hcut, method = "wss")
fviz_nbclust(hclust_data_filtered, hcut, method = "silhouette")
gap_stat_hclust_filtered <- clusGap(hclust_data_filtered, hcut, K.max = 10, B = 10)
fviz_gap_stat(gap_stat_hclust_filtered)

# Determine the tree cut for a desired number of clusters
cut_filtered <- cutree(hclust_model_filtered, k = desired_clusters_filtered)

# Add cluster labels to the filtered dataset
rfm_filtered$cluster <- as.factor(cut_filtered)

# Visualize data points plotted against recency, monetary, and frequency
ggplot(rfm_filtered, aes(x = recency, y = monetary, color = cluster)) +
  geom_point() +
  labs(x = "Recency", y = "Monetary", color = "Cluster") +
  theme_minimal()

ggplot(rfm_filtered, aes(x = recency, y = frequency, color = cluster)) +
  geom_point() +
  labs(x = "Recency", y = "Frequency", color = "Cluster") +
  theme_minimal()

ggplot(rfm_filtered, aes(x = monetary, y = frequency, color = cluster)) +
  geom_point() +
  labs(x = "Monetary", y = "Frequency", color = "Cluster") +
  theme_minimal()

# Cluster analysis
cluster_analysis_filtered <- rfm_filtered %>%
  group_by(cluster) %>%
  summarise(average_recency = mean(recency),
            average_frequency = mean(frequency),
            average_monetary = mean(monetary),
            count_customers = n())

print(cluster_analysis_filtered)

# Silhouette analysis
sil_filtered <- silhouette(cut_filtered, dist(hclust_data_filtered))
avg_silhouette_filtered <- mean(sil_filtered[, "sil_width"])

cat("Average Silhouette Width:", avg_silhouette_filtered, "\n")

# Calculate clustering indices using cluster.stats
clustering_indices_filtered <- cluster.stats(dist(hclust_data_filtered), cut_filtered)
print(clustering_indices_filtered)

# Extract the Dunn index from the clustering indices list
dunn_index_filtered <- clustering_indices_filtered$dunn
cat("Dunn Index:", dunn_index_filtered, "\n")

# Calculate the Davies-Bouldin Index
db_index_filtered <- clusterSim::index.DB(hclust_data_filtered, cut_filtered)
print(db_index_filtered)
```


DBSCAN
```{r}
# Load required libraries
library(ggplot2)
library(dplyr)
library(factoextra)
library(cluster)
library(fpc)
library(flexclust)
library(mclust)

# Load the Online Retail dataset
data <- Online_Retail

# Data preprocessing
data <- data %>%
  dplyr::filter(!is.na(CustomerID)) %>%
  dplyr::select(CustomerID, InvoiceNo, InvoiceDate, UnitPrice)

# Calculate total spending (monetary value) per customer
monetary <- data %>%
  group_by(CustomerID) %>%
  summarise(monetary = sum(UnitPrice))

# Calculate the recency and frequency variables
# Recency is calculated by the time elapsed since the last day of the dataset
# Frequency is calculated by summing up the distinct invoices of a customer
recency <- data %>%
  group_by(CustomerID) %>%
  summarise(recency = as.numeric(difftime(max(data$InvoiceDate), max(InvoiceDate), units = "days")),
            frequency = n_distinct(InvoiceNo))

# Merge RFM variables with monetary value
rfm <- left_join(recency, monetary, by = "CustomerID")

# Create distance matrix of RFM data frame 

# Perform DBSCAN clustering
dbscan_model <- dbscan(dist(rfm), eps = 0.5, MinPts = 5, scale = TRUE, method = "raw", showplot = 1)

# Add cluster labels to the original dataset
rfm$cluster <- as.factor(dbscan_model$cluster)

# Visualize the clusters
fviz_cluster(dbscan_model, data = normalized_data)

# Cluster analysis
cluster_analysis <- rfm %>%
  group_by(cluster) %>%
  summarise(average_recency = mean(recency),
            average_frequency = mean(frequency),
            average_monetary = mean(monetary),
            count_customers = n())

print(cluster_analysis)

# Silhouette analysis
sil <- silhouette(dbscan_model$cluster, dist(normalized_data))
avg_silhouette <- mean(sil[, "sil_width"])
cat("Average Silhouette Width:", avg_silhouette, "\n")

# Extract cluster labels from the DBSCAN model
dbscan_labels <- dbscan_model$cluster

# Calculate clustering indices using cluster.stats
clustering_indices <- cluster.stats(dist(rfm), dbscan_labels)
print(clustering_indices)

# Extract the Dunn index from the clustering indices list
dunn_index <- clustering_indices$dunn
cat("Dunn Index:", dunn_index, "\n")

# Calculate the Davies-Bouldin Index
db_index <- clusterSim::index.DB(rfm, dbscan_labels)
print(db_index)



```

