# Libraries
library(dbplyr)
library(tidyverse)
library(cluster)
library(ClusterR)
library(factoextra)
library(flexclust)
library(leaflet)
library(maptiles)
library(scales)
library(kableExtra)
library(viridis)
library(Rtsne)
library(fpc)
library(clusterSim)
This document presents an unsupervised clustering study that aims to inform the siting of delivery courier stations by clustering working-population and income characteristics together with geographic location.
The population and income data come from the 2021 Population Census published by the Government of the Hong Kong Special Administrative Region. In that data set, the population is divided into Small Tertiary Planning Unit Groups (STPUGs). The longitude and latitude of each STPUG can be found on the Common Spatial Data Infrastructure website.
The analysis follows three ideas from the literature (see the references at the end).
In the preprocessing step, I join the two data sets and omit 17 rows that do not appear in both: 1 row coded "9999", which is an erroneous record; 9 rows that appear to be places where nobody lives; and 7 rows absent from the geographic data set for unexplained reasons.
I also identify the high-income population as those earning over HKD 30,000 per month (≈ EUR 3,330). According to Hong Kong's Census and Statistics Department (2021), a monthly income of HKD 30,000 roughly corresponds to the top 25% of full-time employees. Therefore, individuals earning ≥ HKD 30,000 are classified as high-income workers in this study.
#1. Population and income variables
hk_raw_data <- read_csv("dataset/raw_data_STPUG.csv", skip = 4, show_col_types = FALSE)
# Keep only rows with a valid STPUG code
hk_filtered <- hk_raw_data[!is.na(hk_raw_data[["stpug"]]), ]
# Drop row 212 (assumed to be the erroneous "9999" record described above)
hk_filtered <- hk_filtered[-212, ]
hk_selected <- hk_filtered %>%
  mutate(
    stpug = substr(stpug, 1, 3),
    # Strip embedded spaces (thousands separators) and convert to numeric
    across(where(is.character),
           ~ as.numeric(stringr::str_remove_all(., " "))),
    across(c(t_pop), as.numeric),
    Total_Working = t_lf
  ) %>%
  mutate(
    # High income = monthly earnings of HKD 30,000 or more (bands 5-7)
    High_Income = rowSums(dplyr::select(., mearn_xfdh_5:mearn_xfdh_7), na.rm = TRUE)
  )
income_demand <- hk_selected[,c("stpug", "Total_Working","High_Income")]
head(income_demand)
## # A tibble: 6 × 3
## stpug Total_Working High_Income
## <dbl> <dbl> <dbl>
## 1 111 34591 12779
## 2 112 29179 10761
## 3 113 13658 6137
## 4 114 4886 2250
## 5 115 1070 212
## 6 116 7210 1816
#2. Introduce geographic statistics
library(sf)
sf_data <- st_read("dataset/SSBG_11C_converted.json", quiet = TRUE)
sf_wgs84 <- st_transform(sf_data, crs = 4326) # HK80:2326 -> WGS84:4326
sf_centroids <- st_centroid(sf_wgs84)
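One caveat: st_centroid() on unprojected longitude-latitude geometries is only approximate, and sf warns about it. A minimal alternative sketch, assuming the source file really is in HK80 (EPSG:2326) as noted above, would take the centroids in the projected CRS first and transform the resulting points afterwards:
# Alternative (not run): centroid in the projected CRS, then transform
# sf_centroids <- st_transform(st_centroid(sf_data), crs = 4326)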
#3. Aggregate the two data sets
coords <- st_coordinates(sf_centroids)
geo_coords <- sf_centroids %>%
  st_drop_geometry() %>%
  mutate(
    SSBG_Code = ssbg,
    Longitude = coords[, 1],
    Latitude = coords[, 2],
    stpug = substr(ssbg, 1, 3)  # the first three digits identify the STPUG
  ) %>%
  dplyr::select(SSBG_Code, Longitude, Latitude, stpug)
# Average SSBG centroids up to the STPUG level
geo_agg <- geo_coords %>%
  group_by(stpug) %>%
  summarise(
    Longitude = mean(Longitude),
    Latitude = mean(Latitude),
    .groups = "drop"
  ) %>%
  mutate(stpug = as.numeric(stpug))
missing_geo <- setdiff(income_demand$stpug, geo_agg$stpug)
missing_geo
## [1] 252 269 286 295 547 548 824
cluster_data <- geo_agg %>%
  inner_join(income_demand, by = "stpug") %>%
  filter(Total_Working > 0) %>%  # drop areas with no working population
  mutate(
    Income_per_worker = High_Income / Total_Working
  ) %>%
  dplyr::select(Longitude, Latitude,
                Total_Working, High_Income,
                Income_per_worker, stpug)
After processing the data, I obtain a data set with 6 columns. The stpug, Longitude, and Latitude columns describe geographic location; stpug is an index column and is not used in the clustering. The Total_Working and High_Income columns are the main features, but by themselves they do not sufficiently capture the structure of each STPUG area. I therefore define an additional column, Income_per_worker, a ratio-based feature (High_Income / Total_Working) that captures the economic intensity of each STPUG. (This follows the feature-engineering idea in the customer-churn paper cited in the references.)
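As a worked example using the first row of the output below: Income_per_worker = 12779 / 34591 ≈ 0.369, i.e. roughly 37% of the working population in that STPUG earns HKD 30,000 or more per month.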
head(cluster_data)
## # A tibble: 6 × 6
## Longitude Latitude Total_Working High_Income Income_per_worker stpug
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 114. 22.3 34591 12779 0.369 111
## 2 114. 22.3 29179 10761 0.369 112
## 3 114. 22.3 13658 6137 0.449 113
## 4 114. 22.3 4886 2250 0.460 114
## 5 114. 22.3 1070 212 0.198 115
## 6 114. 22.3 7210 1816 0.252 116
hk_dict <- read_csv("descripition.csv", show_col_types = FALSE)
hk_dict %>%
kable(caption = "Description of the variables") %>%
kable_styling(font_size = 12)
| Variable | Description |
|---|---|
| stpug | Small Tertiary Planning Unit Group |
| mearn_xfdh_1 | Average monthly income (hereinafter the same): less than HKD 6,000 |
| mearn_xfdh_2 | HKD 6,000 - 9,999 |
| mearn_xfdh_3 | HKD 10,000 - 19,999 |
| mearn_xfdh_4 | HKD 20,000 - 29,999 |
| mearn_xfdh_5 | HKD 30,000 - 39,999 |
| mearn_xfdh_6 | HKD 40,000 - 59,999 |
| mearn_xfdh_7 | HKD 60,000 and over |
| Longitude | Longitude of regional centre point |
| Latitude | Latitude of regional centre point |
| High_Income | Working population earning over HKD 30,000 per month |
| Total_Working | Labour force |
| Income_per_worker | Economic structure ratio (High_Income / Total_Working) |
I use a scatter plot to show the relationship between geographic location, the high-income population, and the total working population. There is an obvious concentration at the bottom right, the most crowded part of Hong Kong, while the wealthier areas lie somewhat farther away, which matches reality.
plot_a <- quote({
library(ggplot2)
ggplot(cluster_data,
aes(x = Longitude,
y = Latitude,
color = High_Income,
size = Total_Working)) +
geom_point(alpha = 0.7) +
scale_size(range = c(2, 5), guide = "legend") +
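# scale_color_cont_hk (and scale_color_disc_hk() used later) are custom
# colour-scale helpers assumed to be defined elsewhere in the project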
scale_color_cont_hk +
labs(
title = "Geographic distribution of working population and income",
x = "Longitude",
y = "Latitude",
color = "High income",
size = "Working population"
) +
guides(
color = guide_colorbar(order = 1),
size = guide_legend(order = 2)
) +
theme_minimal(base_size = 12) +
theme(
plot.title.position = "plot",
plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
axis.title = element_text(size = 12, face = "bold"),
axis.text = element_text(size = 10),
axis.line = element_line(color = "black", linewidth = 0.4),
panel.grid.major = element_line(color = "gray85", linetype = "dotted"),
panel.grid.minor = element_line(color = "gray90", linetype = "dotted"),
legend.position = "right",
legend.background = element_rect(fill = alpha("white", 0.7), color = "grey80"),
legend.key.size = unit(0.3, "cm"),
legend.title = element_text(size = 8, face = "bold"),
legend.text = element_text(size = 8)
)
})
So far I have five feature columns for clustering. The original variables (geographic coordinates, working population, high-income count, and income intensity) operate on very different scales. To make the clustering result clearer, I reduce the data to a small set of components: following the PCA customer-segmentation literature, I standardise all features and apply Principal Component Analysis (PCA) to reduce the dimensionality.
# Standardise numerical features
scaled_data <- cluster_data %>%
  mutate(across(everything(), scale)) %>%
  dplyr::select(Longitude, Latitude, Total_Working, High_Income, Income_per_worker) %>%
  as.data.frame()
# PCA on all five variables
pca_res <- prcomp(scaled_data, center = FALSE, scale. = FALSE)
summary(pca_res)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.3777 1.2689 0.8885 0.7947 0.26605
## Proportion of Variance 0.3796 0.3220 0.1579 0.1263 0.01416
## Cumulative Proportion 0.3796 0.7016 0.8595 0.9858 1.00000
pca_res$rotation
## PC1 PC2 PC3 PC4 PC5
## Longitude 0.10606973 -0.53339410 -0.69296918 -0.47320540 0.01050304
## Latitude -0.03987972 0.61172888 0.06103232 -0.78728403 0.02561817
## Total_Working 0.70535878 0.10627085 -0.01020694 0.06874546 0.69738467
## High_Income 0.68851080 -0.09400494 0.24327672 -0.11078280 -0.66757738
## Income_per_worker -0.12482261 -0.56669929 0.67585578 -0.37317789 0.25928452
From the component summary, PC1-PC3 together explain about 86% of the variance.
From the rotation table:
1. PC1 (38.0%) represents an economic scale effect, with high loadings on Total_Working and High_Income.
2. PC2 (32.2%) represents a spatial-income contrast, with high loadings on Latitude, Longitude, and Income_per_worker.
3. PC3 (15.8%) represents an income-intensity and spatial axis, with high loadings on Longitude, Income_per_worker, and High_Income.
Therefore PC1-PC3 are all retained for the clustering, while PC4 (12.6%) and PC5 (1.4%) are treated as noise.
Based on this loading structure, standardisation and PCA are needed to separate scale, income, and spatial effects before clustering. Without them, distance-based methods would be dominated by Total_Working, and DBSCAN would fail to capture meaningful economic or spatial patterns.
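A quick way to see this scale problem is to compare the standard deviations of the unstandardised features; a minimal sketch on the columns of cluster_data:
cluster_data %>%
  summarise(across(c(Longitude, Latitude, Total_Working,
                     High_Income, Income_per_worker), sd))
# The two count variables vary by thousands of workers, while the coordinates
# and the ratio vary by well under one unit, so unscaled Euclidean distances
# would be driven almost entirely by Total_Working and High_Income.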
# Data for clustering: the first three principal components
pca_scores <- as.data.frame(pca_res$x[, 1:3])
colnames(pca_scores) <- c("PC1", "PC2", "PC3")
# Data for the map visualisation: the first two principal components
map_scores <- as.data.frame(pca_res$x[, 1:2])
colnames(map_scores) <- c("PC1", "PC2")
fviz_pca_biplot(
pca_res,
repel = TRUE,
col.var = "purple",
col.ind = "gray70"
) +
labs(title = "PCA of geographic and income features")
After processing the data, the Hopkins statistic is used to check that the data are actually clusterable. The value of 0.78 is well above the 0.5 expected for random data, so the data show a meaningful cluster tendency, and the block structure along the diagonal of the ordered dissimilarity plot is clearly visible.
get_clust_tendency(pca_scores, 2, graph=TRUE, gradient=list(low="red", mid="white", high="blue"), seed = 123)
## $hopkins_stat
## [1] 0.7821313
##
## $plot
I consider three clustering methods: K-means, Partitioning Around Medoids (PAM), and DBSCAN. K-means and PAM require the number of clusters to be specified in advance, whereas DBSCAN determines the clusters from density.
For K-means and PAM, I use three common internal indices, Silhouette, Davies–Bouldin, and Calinski–Harabasz, to find the optimal number of clusters. I define a function that computes all three so they can be compared and plotted together.
In summary, the number of clusters with the highest Silhouette and CH values and the lowest DB value is the best choice.
For DBSCAN, I tune the parameters in the usual way with a kNN-distance plot.
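That selection rule can also be written directly in code; a minimal sketch, where pick_k is a hypothetical helper not used elsewhere in this document:
pick_k <- function(ind) {
  tibble(
    k_by_sil = ind$k[which.max(ind$sil)], # Silhouette: maximise
    k_by_db  = ind$k[which.min(ind$db)],  # Davies–Bouldin: minimise
    k_by_ch  = ind$k[which.max(ind$ch)]   # Calinski–Harabasz: maximise
  )
}
# e.g. pick_k(indices_kmeans), once the index tables below have been computed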
compute_indices <- function(data, labels) {
  labels <- as.integer(labels)
  # Remove possible noise label 0 (for DBSCAN)
  keep <- labels > 0
  data_use <- data[keep, , drop = FALSE]
  labels_use <- labels[keep]
  dist_mat <- dist(data_use)
  # Silhouette (higher is better)
  sil_obj <- cluster::silhouette(labels_use, dist_mat)
  sil_avg <- mean(sil_obj[, "sil_width"])
  # Davies–Bouldin (lower is better)
  db_val <- clusterSim::index.DB(data_use, labels_use)$DB
  # Calinski–Harabasz (higher is better)
  k_val <- length(unique(labels_use))
  ch_val <- fpc::calinhara(data_use, labels_use, cn = k_val)
  tibble(
    k = k_val,
    sil = sil_avg,
    db = db_val,
    ch = ch_val
  )
}
The K-means algorithm is centroid-based: it minimises the squared distance between data points and their assigned cluster centre. It produces a clear structure but is sensitive to outliers.
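Formally, given cluster centres $\mu_1, \dots, \mu_K$, K-means minimises the within-cluster sum of squares:

$$\min_{C_1, \dots, C_K} \sum_{j=1}^{K} \sum_{x_i \in C_j} \lVert x_i - \mu_j \rVert^2$$

The squared norm is what makes the method sensitive to outliers.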
set.seed(123)
k_grid <- 2:10
indices_kmeans <- map_dfr(k_grid, function(k) {
fit <- kmeans(pca_scores, centers = k, nstart = 25)
compute_indices(pca_scores, fit$cluster) %>%
mutate(method = "kmeans")
})
indices_kmeans
## # A tibble: 9 × 5
## k sil db ch method
## <int> <dbl> <dbl> <dbl> <chr>
## 1 2 0.324 1.44 81.4 kmeans
## 2 3 0.357 1.12 108. kmeans
## 3 4 0.340 1.06 115. kmeans
## 4 5 0.339 1.06 109. kmeans
## 5 6 0.341 1.03 107. kmeans
## 6 7 0.336 0.996 107. kmeans
## 7 8 0.314 1.03 107. kmeans
## 8 9 0.296 1.06 103. kmeans
## 9 10 0.299 1.03 107. kmeans
indices_kmeans_long <- indices_kmeans %>%
pivot_longer(cols = c("sil", "db", "ch"),
names_to = "metric",
values_to = "value") %>%
mutate(metric = factor(
metric,
levels = c("sil", "db", "ch"),
labels = c("Silhouette","Davies–Bouldin","Calinski–Harabasz"))
)
library(gridExtra)
p1<-ggplot(indices_kmeans_long,
aes(x = k, y = value, color = metric)) +
geom_line(linewidth = 1.2) +
geom_point(size = 3) +
scale_color_disc_hk() +
labs(
title = "Internal indices for K-means",
x = "Number of clusters K",
y = "Index value",
color = "Metric"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 15, hjust = 0.5),
panel.grid.major = element_line(color = "gray85", linewidth = 0.4),
panel.grid.minor = element_line(color = "gray90", linewidth = 0.2, linetype = "dotted"),
legend.position = c(0.75, 0.75),
legend.background = element_rect(fill = alpha("white", 0.2), color = NA),
legend.key.size = unit(0.3, "cm"),
legend.text = element_text(size = 10),
legend.title = element_text(size = 10, face = "bold")
)
p2<-fviz_nbclust(pca_scores, kmeans, method = "silhouette")
Comparing these internal validation indices and the cluster quality (approximated by the average silhouette), the optimal number of clusters for K-means is 3.
The average silhouette is 0.34. The plot shows the 3 clusters clearly: clusters 1 and 2 are compact, with distinct boundaries and roughly spherical shapes, while cluster 3 is more dispersed.
set.seed(123)
k_final <- 3
kmeans_final <- kmeans(pca_scores, centers = k_final, nstart = 25)  # nstart as in the grid search above
kmeans_labels <- factor(kmeans_final$cluster)
pca_final <- pca_scores %>%
mutate(cluster = kmeans_labels)
ggplot(pca_final, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(alpha = 0.8, size = 2) +
stat_ellipse(type = "norm", linetype = "dashed") +
scale_color_disc_hk() +
labs(
title = "Final K-means clustering (K = 3) on PCA scores",
color = "Cluster"
)
sil<-silhouette(kmeans_final$cluster, dist(pca_scores))
fviz_silhouette(sil)
## cluster size ave.sil.width
## 1 1 64 0.37
## 2 2 100 0.39
## 3 3 40 0.18
Compared to K-means, PAM is medoid-based: it iteratively swaps medoids with non-medoids to improve the overall quality of the clustering. This makes it more robust to noise and non-spherical cluster shapes.
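Formally, PAM selects $K$ medoids $m_1, \dots, m_K$ from the observations themselves and minimises the total dissimilarity to the nearest medoid:

$$\min_{m_1, \dots, m_K} \sum_{i=1}^{n} \min_{1 \le j \le K} d(x_i, m_j)$$

Because the dissimilarity $d$ is not squared, single outliers pull the solution around far less than in K-means.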
set.seed(123)
indices_pam <- map_dfr(k_grid, function(k) {
fit <- cluster::pam(pca_scores, k = k)
compute_indices(pca_scores, fit$clustering) %>%
mutate(method = "PAM")
})
indices_pam_long <- indices_pam %>%
pivot_longer(cols = c("sil", "db", "ch"),
names_to = "metric",
values_to = "value") %>%
mutate(metric = factor(
metric,
levels = c("sil", "db", "ch"),
labels = c("Silhouette","Davies–Bouldin","Calinski–Harabasz"))
)
p3<-ggplot(indices_pam_long,
aes(x = k, y = value, color = metric)) +
geom_line(linewidth = 1.2) +
geom_point(size = 3) +
scale_color_disc_hk() +
labs(
title = "Internal indices for PAM",
x = "Number of clusters K",
y = "Index value",
color = "Metric"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 15, hjust = 0.5),
panel.grid.major = element_line(color = "gray85", linewidth = 0.4),
panel.grid.minor = element_line(color = "gray90", linewidth = 0.2, linetype = "dotted"),
legend.position = c(0.75, 0.75),
legend.background = element_rect(fill = alpha("white", 0.2), color = NA),
legend.key.size = unit(0.3, "cm"),
legend.text = element_text(size = 10),
legend.title = element_text(size = 10, face = "bold")
)
p4<-fviz_nbclust(pca_scores, FUNcluster=cluster::pam)
# pamk_result <- pamk(pca_scores)
In the PAM indices, the CH peak is still at K = 3, but the overall value is lower (around 100), and the Silhouette value sits around 0.25-0.35 (low to medium). This indicates that the PAM clusters are more evenly sized and robust, but the separation is not as clear as with K-means.
From the PAM clustering diagram, both cluster 2 and cluster 3 overlap substantially with cluster 1, and the cluster structure is unclear. Contrasting the K-means and PAM diagrams, K-means delivers the better clustering quality on this data set.
set.seed(123)
pam1<-pam(pca_scores,3)
fviz_cluster(pam1,data=pca_scores,main = "PAM Clustering Plot",
geom="point",ellipse.type="t",pointsize=1,ggtheme=theme_classic())
fviz_silhouette(pam1)
## cluster size ave.sil.width
## 1 1 58 0.14
## 2 2 93 0.43
## 3 3 53 0.44
DBSCAN identifies high-density clusters and separates out low-density noise points. The workflow starts from the kNN-distance plot, finds a suitable inflection point, fixes the parameters, and then applies the dbscan algorithm.
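Concretely, a point $p$ is a core point when at least minPts observations (including $p$ itself) lie within distance eps of it:

$$\lvert \{\, q : d(p, q) \le \varepsilon \,\} \rvert \ge \text{minPts}$$

Clusters grow outward from core points, and points reachable from no core point are labelled noise.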
To see which eps is most appropriate, I loop over a range of eps values and report the resulting number of clusters and noise points.
set.seed(123)
library(dbscan)
eps_values <- seq(0.3, 0.8, by = 0.05)
for (e in eps_values) {
  fit <- dbscan(pca_scores, eps = e, minPts = 7)
  cat("eps =", e, " clusters =", length(unique(fit$cluster)) - 1,
      " noise =", sum(fit$cluster == 0), "\n")
}
## eps = 0.3 clusters = 2 noise = 189
## eps = 0.35 clusters = 3 noise = 172
## eps = 0.4 clusters = 2 noise = 164
## eps = 0.45 clusters = 3 noise = 142
## eps = 0.5 clusters = 2 noise = 128
## eps = 0.55 clusters = 3 noise = 98
## eps = 0.6 clusters = 3 noise = 84
## eps = 0.65 clusters = 2 noise = 76
## eps = 0.7 clusters = 2 noise = 57
## eps = 0.75 clusters = 1 noise = 48
## eps = 0.8 clusters = 1 noise = 35
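# Convention: the kNN-distance plot is usually drawn with k close to the
# chosen minPts (here minPts = 7); k = 10 gives a slightly smoother curve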
kNNdistplot(pca_scores, k = 10)
abline(h = 0.6, col = "red")
The most appropriate combination for this project is (eps, minPts) = (0.6, 7).
eps_val <- 0.6
min_pts_val <- 7
dbscan_fit <- fpc::dbscan(pca_scores, eps = eps_val, MinPts = min_pts_val)
table(dbscan_fit$cluster)
##
## 0 1 2 3
## 84 87 27 6
indices_dbscan <- compute_indices(pca_scores, dbscan_fit$cluster) %>%
mutate(method = "DBSCAN")
indices_dbscan
## # A tibble: 1 × 5
## k sil db ch method
## <int> <dbl> <dbl> <dbl> <chr>
## 1 3 0.358 0.765 68.9 DBSCAN
pca_dbscan <- map_scores %>%
mutate(cluster = factor(ifelse(dbscan_fit$cluster == 0,
"Noise",
paste0("C", dbscan_fit$cluster))))
ggplot(pca_dbscan, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(alpha = 0.8, size = 2) +
scale_color_disc_hk() +
labs(
title = "DBSCAN clustering in PCA space",
color = "Cluster"
)
Under DBSCAN, the best number of clusters is again 3.
The plot shows three dense clusters (C1-C3), with low-density points classified as Noise, highlighting density-based structure that K-means and PAM may not capture.
Having compared the three clustering methods, K-means gives the best clustering quality, so I use the K-means clusters to model the economic structure of Hong Kong.
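To make this comparison explicit, here is a small sketch that collects the K = 3 row of each index table side by side, reusing the objects computed above:
bind_rows(
  indices_kmeans %>% filter(k == 3),
  indices_pam %>% filter(k == 3),
  indices_dbscan
) %>%
  dplyr::select(method, k, sil, db, ch)
# K-means attains the highest CH at a comparable silhouette; DBSCAN has the
# lowest (best) DB but discards 84 observations as noise.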
clustered_data <- cluster_data %>%
mutate(cluster = kmeans_labels)
cluster_summary <- clustered_data %>%
group_by(cluster) %>%
summarise(
n_areas = n(),
avg_longitude = mean(Longitude),
avg_latitude = mean(Latitude),
avg_working = mean(Total_Working),
avg_high_income = mean(High_Income),
income_per_worker = mean(Income_per_worker),
.groups = "drop"
)
cluster_summary %>%
kable(digits = 2,
caption = "Cluster-level summary statistics") %>%
kable_styling(full_width = FALSE)
| cluster | n_areas | avg_longitude | avg_latitude | avg_working | avg_high_income | income_per_worker |
|---|---|---|---|---|---|---|
| 1 | 64 | 114.08 | 22.42 | 9808.31 | 1877.34 | 0.21 |
| 2 | 100 | 114.19 | 22.31 | 11181.48 | 3629.75 | 0.35 |
| 3 | 40 | 114.15 | 22.36 | 53925.25 | 11695.02 | 0.22 |
The clustering results reveal three groups of areas that vary primarily in their level of economic activity.
These differences are also visible in the scatter plot of High Income versus Total Working Population.
Cluster 1 and Cluster 2 lie close to each other in the lower-left part of the graph, whereas Cluster 3 extends toward the upper-right region, showing that these areas contribute much more to overall economic activity.
This pattern indicates a clear separation between smaller neighbourhoods, mid-sized districts, and major employment centres.
ggplot(clustered_data,
aes(x = Total_Working, y = High_Income,
color = cluster)) +
geom_point(alpha = 0.7) +
scale_color_disc_hk() +
labs(
title = "Working population and high income by cluster",
color = "Cluster")
The aim of this paper is to give recommendations and conclusions regarding the establishment of courier stations in Hong Kong (large stations, similar to distribution centers, without considering offline consumption scenarios).
From a logistics standpoint, areas in Cluster 3 present the strongest potential: their dense employment, higher income levels, and active economic environment can support substantial parcel flows and sustained operational demand.
Cluster 2 can serve as an appropriate layer of secondary sites, especially for expanding coverage to nearby residential or mixed-use districts.
Cluster 1, while important for final delivery, does not exhibit the conditions needed for large sorting facilities due to its smaller workforce and more modest demand.
In summary, the results suggest positioning major distribution centres within Cluster 3, supported by medium-sized facilities in Cluster 2; Cluster 1 is more suitable for smaller courier stations that serve local areas. Together, this structure forms a distribution network that aligns with Hong Kong's spatial economic pattern and supports efficient parcel movement.
ggplot(clustered_data,
aes(x = Longitude, y = Latitude)) +
geom_point(aes(color = cluster), alpha = 0.8, size = 2) +
scale_color_disc_hk() +
labs(
title = "Spatial distribution of K-means clusters",
color = "Cluster"
)
pal_cluster <- colorFactor(
palette = viridis(length(levels(clustered_data$cluster)), option = "C"),
domain = clustered_data$cluster
)
plot_b<-leaflet(clustered_data) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addCircleMarkers(
lng = ~Longitude,
lat = ~Latitude,
radius = 4,
stroke = FALSE,
fillOpacity = 0.8,
color = ~pal_cluster(cluster),
popup = ~paste0(
  "Cluster: ", cluster, "<br>",
  "Working population: ", round(Total_Working), "<br>",
  "High income: ", round(High_Income), "<br>",
  "Income per worker: ", round(Income_per_worker, 2)
)
) %>%
addLegend(
"bottomright",
pal = pal_cluster,
values = ~cluster,
title = "K-means cluster",
opacity = 0.9
)%>%
htmlwidgets::onRender("
function(el, x) {
var legend = el.querySelector('.legend.leaflet-control');
if (legend) {
legend.style.background = 'rgba(255,255,255,0.2)';
legend.style.border = 'none';
legend.style.boxShadow = 'none';
legend.style.width = '110px';
legend.style.padding = '6px 8px';
legend.style.lineHeight = '1.1';
var title = legend.querySelector('.legend-title');
if (title) {
title.style.fontSize = '12px';
title.style.marginBottom = '4px';
}
var items = legend.querySelectorAll('i');
items.forEach(function(i){
i.style.width = '12px';
i.style.height = '12px';
i.style.marginRight = '6px';
});
legend.style.right = '1px';
legend.style.top = '50%';
legend.style.left = 'auto';
legend.style.transform = 'translateY(-50%)';
}
}
")
plot_b
Weatherill, G., & Burton, P. W. (2009). Delineation of shallow seismic source zones using K-means cluster analysis, with application to the Aegean region. Geophysical Journal International, 176(2), 565–588.
Xiahou, X., & Harada, Y. (2022). B2C e-commerce customer churn prediction based on K-means and SVM. Journal of Theoretical and Applied Electronic Commerce Research, 17(2), 458–475.
du Toit, J., Davimes, R., Mohamed, A., Patel, K., & Nye, J. M. (2016). Customer segmentation using unsupervised learning on daily energy load profiles. Journal of Advances in Information Technology, 7(2), 69–75.