# libraries
library(dbplyr)
library(tidyverse)
library(cluster)
library(ClusterR)
library(factoextra)
library(flexclust)
library(leaflet)
library(maptiles)
library(scales)
library(kableExtra)
library(viridis)
library(Rtsne)
library(fpc)
library(clusterSim)

1 Introduction

This document presents an unsupervised clustering study that aims to inform the siting of delivery courier stations by clustering working-population and income characteristics combined with geographic location.

The population and income data come from the 2021 Population Census, published by the Government of the Hong Kong Special Administrative Region.

  • The census divides the population into Small Tertiary Planning Unit Groups (STPUGs).

  • The longitude and latitude of each STPUG are available from the website of the Common Spatial Data Infrastructure.

2 Thesis guidance

The analysis follows three ideas from the literature:

    1. K-means source zoning on spatial point data, similar to the K-means delineation of seismic zones in the Aegean region.1
    2. Customer segmentation with PCA and K-means, as in energy-load and customer-segmentation studies, where dimensionality reduction improves cluster interpretability.2
    3. Customer segmentation in churn models, where K-means is used as a pre-processing step before further modelling; here, we focus on the segmentation part.3

3 Data process

In this step, I merge the two data sets and omit 17 rows that do not appear in both. These comprise: 1 row coded "9999", which is erroneous; 9 rows that appear to be areas where nobody lives; and 7 rows absent from the geographic data set for unexplained reasons.

I also identify the high-income population as those earning over HKD 30,000 per month (≈ EUR 3,330). According to Hong Kong's Census and Statistics Department (2021), a monthly income of HKD 30,000 roughly corresponds to the top 25% of full-time employees.

Therefore, individuals earning ≥ HKD 30,000 are classified as high-income workers in this study.

#1. population and income variables
hk_raw_data <- read_csv("dataset/raw_data_STPUG.csv", skip = 4, show_col_types = FALSE)

hk_filtered <- hk_raw_data[!is.na(hk_raw_data[["stpug"]]), ]
hk_filtered <- hk_filtered[-212, ]   # drop one invalid row (the "9999" record described above)

hk_selected <- hk_filtered %>%
  mutate(
    stpug = substr(stpug, 1, 3),
    across(where(is.character),
           ~ as.numeric(stringr::str_remove_all(., " "))),
    across(c(t_pop), as.numeric),
    Total_Working = t_lf) %>%
  mutate(
    # workers earning HKD 30,000 or more (income bands mearn_xfdh_5 to mearn_xfdh_7)
    High_Income = rowSums(dplyr::select(., mearn_xfdh_5:mearn_xfdh_7), na.rm = TRUE)
  )


income_demand <- hk_selected[,c("stpug", "Total_Working","High_Income")]

head(income_demand)
## # A tibble: 6 × 3
##   stpug Total_Working High_Income
##   <dbl>         <dbl>       <dbl>
## 1   111         34591       12779
## 2   112         29179       10761
## 3   113         13658        6137
## 4   114          4886        2250
## 5   115          1070         212
## 6   116          7210        1816
#2. introduce the geographic data
library(sf)

sf_data  <- st_read("dataset/SSBG_11C_converted.json", quiet = TRUE)
sf_wgs84 <- st_transform(sf_data, crs = 4326)   # HK80:2326 -> WGS84:4326

sf_centroids <- st_centroid(sf_wgs84)
#3. aggregate the two data sets
coords <- st_coordinates(sf_centroids)

geo_coords <- sf_centroids %>%
  st_drop_geometry() %>%   
  mutate(
    SSBG_Code = ssbg,
    Longitude = coords[, 1],
    Latitude  = coords[, 2],
    stpug = substr(ssbg, 1, 3)
  ) %>%
  dplyr::select(SSBG_Code, Longitude, Latitude, stpug)

geo_agg <- geo_coords %>%
  group_by(stpug) %>%
  summarise(
    Longitude = mean(Longitude),
    Latitude  = mean(Latitude),
    .groups   = "drop"
  ) %>%
  mutate(stpug = as.numeric(stpug)) 

missing_geo <- setdiff(income_demand$stpug, geo_agg$stpug)
missing_geo   # STPUGs present in the census data but absent from the geographic data
## [1] 252 269 286 295 547 548 824
cluster_data <- geo_agg %>%
  inner_join(income_demand, by = "stpug") %>%
  filter(Total_Working > 0) %>%
  mutate(
    Income_per_worker = High_Income / Total_Working
  ) %>%
  dplyr::select(Longitude, Latitude,
         Total_Working, High_Income,
         Income_per_worker,stpug)

4 Overview of the dataset

After processing the data, I obtain a data set with 6 columns.

  • The stpug, Longitude, and Latitude columns represent the geographic location; stpug is an index column and is not used in clustering.

  • The Total_Working and High_Income columns are the feature variables. However, these two counts alone do not sufficiently describe the economic structure of each STPUG area.

  • Therefore, I define an additional column, Income_per_worker, a ratio-based feature (defined below) that captures the economic intensity of each STPUG, following the ratio features used in a customer-churn paper.
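
Concretely, the ratio is defined for each STPUG as (matching the computation in the data-processing code above):

Income_per_worker = High_Income / Total_Working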

head(cluster_data)
## # A tibble: 6 × 6
##   Longitude Latitude Total_Working High_Income Income_per_worker stpug
##       <dbl>    <dbl>         <dbl>       <dbl>             <dbl> <dbl>
## 1      114.     22.3         34591       12779             0.369   111
## 2      114.     22.3         29179       10761             0.369   112
## 3      114.     22.3         13658        6137             0.449   113
## 4      114.     22.3          4886        2250             0.460   114
## 5      114.     22.3          1070         212             0.198   115
## 6      114.     22.3          7210        1816             0.252   116
hk_dict <- read_csv("descripition.csv", show_col_types = FALSE)
hk_dict %>%
  kable(caption = "Description of the variables") %>%
  kable_styling(font_size = 12)
Description of the variables
stpug Small Tertiary Planning Unit Group
mearn_xfdh_1 Average monthly income (hereinafter the same): less than HKD 6,000
mearn_xfdh_2 HKD 6,000 - 9,999
mearn_xfdh_3 HKD 10,000 - 19,999
mearn_xfdh_4 HKD 20,000 - 29,999
mearn_xfdh_5 HKD 30,000 - 39,999
mearn_xfdh_6 HKD 40,000 - 59,999
mearn_xfdh_7 HKD 60,000 and over
Longitude Longitude of the regional centre point
Latitude Latitude of the regional centre point
High_Income Working population earning over HKD 30,000 per month
Total_Working Labour force
Income_per_worker Economic structure ratio (High_Income / Total_Working)

I use a scatter plot to show the relationship between geographic location, high-income population, and total working population. The plot reveals an obvious concentration at the bottom right, which is the most crowded part of Hong Kong, while the wealthier areas lie somewhat farther away. That matches reality.

# Plot stored as an unevaluated expression (presumably rendered later via eval(plot_a))
plot_a <- quote({

  library(ggplot2)

  ggplot(cluster_data, 
         aes(x = Longitude, 
             y = Latitude,
             color = High_Income,
             size  = Total_Working)) +

    geom_point(alpha = 0.7) +
    scale_size(range = c(2, 5), guide = "legend") + 

    scale_color_cont_hk +   # custom continuous colour scale defined elsewhere in the project

    labs(
      title = "Geographic distribution of working population and income",
      x = "Longitude",
      y = "Latitude",
      color = "High income",
      size  = "Working population"
    ) +

    guides(
      color = guide_colorbar(order = 1),
      size  = guide_legend(order = 2)
    ) +

    theme_minimal(base_size = 12) +
    theme(
      plot.title.position = "plot",
      plot.title = element_text(size = 14, face = "bold", hjust = 0.5),

      axis.title = element_text(size = 12, face = "bold"),
      axis.text  = element_text(size = 10),
      axis.line  = element_line(color = "black", linewidth = 0.4),

      panel.grid.major = element_line(color = "gray85", linetype = "dotted"),
      panel.grid.minor = element_line(color = "gray90", linetype = "dotted"),

      legend.position = "right",
      legend.background = element_rect(fill = alpha("white", 0.7), color = "grey80"),
      legend.key.size = unit(0.3, "cm"),
      legend.title = element_text(size = 8, face = "bold"),
      legend.text  = element_text(size = 8)
    )

})

5 PCA Preprocessing

So far, I have five feature columns for clustering. The original variables (geographic coordinates, working population, high-income count, and income intensity) operate on different scales. To make the cluster structure clearer, I reduce the data to a small number of components. Following the PCA customer-segmentation literature, I standardize all features and apply Principal Component Analysis (PCA) to reduce the dimensionality.

# Standardise numerical features

scaled_data <- cluster_data %>%
  dplyr::select(Longitude, Latitude, Total_Working, High_Income, Income_per_worker) %>%
  mutate(across(everything(), ~ as.numeric(scale(.)))) %>%
  as.data.frame()

# PCA on all five variables

pca_res <- prcomp(scaled_data, center = FALSE, scale. = FALSE)
summary(pca_res)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5
## Standard deviation     1.3777 1.2689 0.8885 0.7947 0.26605
## Proportion of Variance 0.3796 0.3220 0.1579 0.1263 0.01416
## Cumulative Proportion  0.3796 0.7016 0.8595 0.9858 1.00000
pca_res$rotation
##                           PC1         PC2         PC3         PC4         PC5
## Longitude          0.10606973 -0.53339410 -0.69296918 -0.47320540  0.01050304
## Latitude          -0.03987972  0.61172888  0.06103232 -0.78728403  0.02561817
## Total_Working      0.70535878  0.10627085 -0.01020694  0.06874546  0.69738467
## High_Income        0.68851080 -0.09400494  0.24327672 -0.11078280 -0.66757738
## Income_per_worker -0.12482261 -0.56669929  0.67585578 -0.37317789  0.25928452

From the component summary, PC1–PC3 together explain about 86% of the variance.

From the rotation table:

    1. PC1 (38.0%) represents an economic-scale effect, with high loadings on Total_Working and High_Income.
    2. PC2 (32.2%) represents a spatial–income contrast, with high loadings on Latitude, Longitude, and Income_per_worker.
    3. PC3 (15.8%) mixes income intensity with a spatial axis, with high loadings on Longitude, Income_per_worker, and High_Income.

Therefore PC1, PC2, and PC3 are all retained for clustering, while PC4 (12.6%) and PC5 (1.4%) are treated as noise.

Based on the loading structure, standardization and PCA are needed to separate scale, income, and spatial effects before clustering. Without this preprocessing, distance-based methods would be dominated by the large count variable Total_Working, and DBSCAN would fail to capture meaningful economic or spatial patterns.
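
As a quick sanity check, the variance shares quoted above can be recomputed directly from the fitted PCA object:

# cumulative variance shares from the component standard deviations
var_share <- pca_res$sdev^2 / sum(pca_res$sdev^2)
round(cumsum(var_share), 3)   # PC1-PC3 reach about 0.86, matching the summary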

# PCA scores used for clustering (first three components)
pca_scores <- as.data.frame(pca_res$x[, 1:3])
colnames(pca_scores) <- c("PC1", "PC2", "PC3")

# first two components, kept separately for the map visualisation
map_scores <- as.data.frame(pca_res$x[, 1:2])
colnames(map_scores) <- c("PC1", "PC2")
fviz_pca_biplot(
  pca_res,
  repel   = TRUE,
  col.var = "purple",
  col.ind = "gray70"
) +
  labs(title = "PCA of geographic and income features")

After this preprocessing, the Hopkins statistic is used to check that the data are actually clusterable. The value of 0.78 indicates a meaningful cluster tendency, and the block structure along the diagonal of the ordered dissimilarity plot is clearly visible.
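
For reference, the Hopkins statistic compares nearest-neighbour distances in the real data with those of uniformly simulated points:

H = Σ u_i / (Σ u_i + Σ w_i)

where the u_i are nearest-neighbour distances from simulated points to the data and the w_i are nearest-neighbour distances within the data. Values near 0.5 suggest random data, while values approaching 1 suggest clusterable structure (exact conventions vary slightly between implementations).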

get_clust_tendency(pca_scores, 2, graph = TRUE,
                   gradient = list(low = "red", mid = "white", high = "blue"),
                   seed = 123)
## $hopkins_stat
## [1] 0.7821313
## 
## $plot

6 Clustering

I consider three clustering methods: K-means, Partitioning Around Medoids (PAM), and DBSCAN. K-means and PAM require the number of clusters to be specified in advance, whereas DBSCAN determines the clusters automatically based on density.

For K-means and PAM, I use three common indices, Silhouette, Davies–Bouldin (DB), and Calinski–Harabasz (CH), to find the optimal number of clusters. I define a function that computes these indices and plots them together.

In short, the number of clusters with the highest Silhouette and CH values, and the lowest DB value, is preferred; the sketch below illustrates this selection rule.

For DBSCAN, I tune the parameters in the standard way using the kNN-distance plot.
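
As a minimal sketch of that selection rule (pick_best_k is a hypothetical helper; indices stands for one of the index tables computed below):

# hypothetical helper: best k under each criterion's direction
pick_best_k <- function(indices) {
  tibble(
    metric = c("Silhouette", "Davies-Bouldin", "Calinski-Harabasz"),
    best_k = c(indices$k[which.max(indices$sil)],   # higher is better
               indices$k[which.min(indices$db)],    # lower is better
               indices$k[which.max(indices$ch)])    # higher is better
  )
}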

compute_indices <- function(data, labels) {
  labels <- as.integer(labels)

  # Remove possible noise label 0 (for DBSCAN)
  keep       <- labels > 0
  data_use   <- data[keep, , drop = FALSE]
  labels_use <- labels[keep]

  dist_mat <- dist(data_use)

  # Silhouette
  sil_obj <- cluster::silhouette(labels_use, dist_mat)
  sil_avg <- mean(sil_obj[, "sil_width"])

  # Davies–Bouldin
  db_val <- clusterSim::index.DB(data_use, labels_use)$DB

  # Calinski–Harabasz
  k_val  <- length(unique(labels_use))
  ch_val <- fpc::calinhara(data_use, labels_use, cn = k_val)

  tibble(
    k   = k_val,
    sil = sil_avg,
    db  = db_val,
    ch  = ch_val
  )
}

6.1 Internal Index Comparison

6.1.1 K-means

The K-means algorithm is centroid-based: it minimizes the squared distance between data points and their assigned cluster centre. It produces a clear structure but is sensitive to outliers.
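
Formally, K-means minimises the within-cluster sum of squares

min over C_1, …, C_K of Σ_{k=1}^{K} Σ_{x_i ∈ C_k} ‖ x_i − μ_k ‖²

where μ_k is the mean (centroid) of cluster C_k; the squared Euclidean distances are exactly what makes the method sensitive to outliers.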

set.seed(123)

k_grid <- 2:10

indices_kmeans <- map_dfr(k_grid, function(k) {
  fit <- kmeans(pca_scores, centers = k, nstart = 25)
  compute_indices(pca_scores, fit$cluster) %>%
    mutate(method = "kmeans")
})

indices_kmeans
## # A tibble: 9 × 5
##       k   sil    db    ch method
##   <int> <dbl> <dbl> <dbl> <chr> 
## 1     2 0.324 1.44   81.4 kmeans
## 2     3 0.357 1.12  108.  kmeans
## 3     4 0.340 1.06  115.  kmeans
## 4     5 0.339 1.06  109.  kmeans
## 5     6 0.341 1.03  107.  kmeans
## 6     7 0.336 0.996 107.  kmeans
## 7     8 0.314 1.03  107.  kmeans
## 8     9 0.296 1.06  103.  kmeans
## 9    10 0.299 1.03  107.  kmeans
indices_kmeans_long <- indices_kmeans %>%
  pivot_longer(cols = c("sil", "db", "ch"),
               names_to = "metric",
               values_to = "value") %>%
  mutate(metric = factor(
    metric,
    levels = c("sil", "db", "ch"),
    labels = c("Silhouette", "Davies–Bouldin", "Calinski–Harabasz"))
  )
library(gridExtra)
p1<-ggplot(indices_kmeans_long,
       aes(x = k, y = value, color = metric)) +
  geom_line(linewidth = 1.2) +            
  geom_point(size = 3) +                  
  scale_color_disc_hk() +   # custom discrete palette defined elsewhere in the project
  
  labs(
    title = "Internal indices for K-means",
    x     = "Number of clusters K",
    y     = "Index value",
    color = "Metric"
  ) +
  

  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 15, hjust = 0.5),

    panel.grid.major = element_line(color = "gray85", linewidth = 0.4),
    panel.grid.minor = element_line(color = "gray90", linewidth = 0.2, linetype = "dotted"),

    legend.position = c(0.75, 0.75),
    legend.background = element_rect(fill = alpha("white", 0.2), color = NA),
    legend.key.size = unit(0.3, "cm"),     
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 10, face = "bold")
  )

p2 <- fviz_nbclust(pca_scores, kmeans, method = "silhouette")

Comparing these internal validation indices and the cluster quality (approximated by the average silhouette), the optimal number of clusters for K-means is 3.

The average silhouette is 0.34. The plot shows 3 clusters clearly: clusters 1 and 2 are more concentrated, have distinct boundaries, and are closer to spherical, while cluster 3 is more dispersed.
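
For reference, the average silhouette is built from the per-point width

s(i) = ( b(i) − a(i) ) / max( a(i), b(i) )

where a(i) is the mean distance from point i to the other members of its own cluster and b(i) is the mean distance to the nearest other cluster; values close to 1 indicate well-assigned points, and values near 0 indicate boundary points.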

set.seed(123)
k_final <- 3
kmeans_final <- kmeans(pca_scores, centers = k_final)

kmeans_labels <- factor(kmeans_final$cluster)

pca_final <- pca_scores %>%
  mutate(cluster = kmeans_labels)

ggplot(pca_final, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.8, size = 2) +
  stat_ellipse(type = "norm", linetype = "dashed") +
  scale_color_disc_hk() +
  labs(
    title = "Final K-means clustering (K = 3) on PCA scores",
    color = "Cluster"
  )

sil <- silhouette(kmeans_final$cluster, dist(pca_scores))
fviz_silhouette(sil)
##   cluster size ave.sil.width
## 1       1   64          0.37
## 2       2  100          0.39
## 3       3   40          0.18

6.1.2 PAM

Compared with K-means, PAM is medoid-based: it iteratively swaps medoids with non-medoids to improve the overall quality of the clustering. It is therefore more robust to noise and to non-spherical cluster shapes.
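
Formally, PAM minimises the total dissimilarity to the medoids

Σ_{k=1}^{K} Σ_{x_i ∈ C_k} d(x_i, m_k)

where each medoid m_k must be an actual observation; using unsquared dissimilarities d and real data points as centres is what gives PAM its robustness.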

set.seed(123)

indices_pam <- map_dfr(k_grid, function(k) {
  fit <- cluster::pam(pca_scores, k = k)
  compute_indices(pca_scores, fit$clustering) %>%
    mutate(method = "PAM")
})

indices_pam_long <- indices_pam %>%
  pivot_longer(cols = c("sil", "db", "ch"),
               names_to = "metric",
               values_to = "value") %>%
  mutate(metric = factor(
    metric,
    levels = c("sil", "db", "ch"),
    labels = c("Silhouette", "Davies–Bouldin", "Calinski–Harabasz"))
  )
p3<-ggplot(indices_pam_long,
       aes(x = k, y = value, color = metric)) +
  geom_line(linewidth = 1.2) +            
  geom_point(size = 3) +                  
  scale_color_disc_hk() +
  
  labs(
    title = "Internal indices for PAM",
    x     = "Number of clusters K",
    y     = "Index value",
    color = "Metric"
  ) +
  

  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 15, hjust = 0.5),

    panel.grid.major = element_line(color = "gray85", linewidth = 0.4),
    panel.grid.minor = element_line(color = "gray90", linewidth = 0.2, linetype = "dotted"),

    legend.position = c(0.75, 0.75),
    legend.background = element_rect(fill = alpha("white", 0.2), color = NA),
    legend.key.size = unit(0.3, "cm"),     
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 10, face = "bold")
  )
p4 <- fviz_nbclust(pca_scores, FUNcluster = cluster::pam)
# pamk_result <- pamk(pca_scores)

In the PAM indices, the CH peak is still at K = 3, but the overall values are low (around 100), and the Silhouette values sit around 0.25–0.35 (low to medium). This indicates that the PAM clusters are more evenly sized and robust, but the separation is not as clear as with K-means.

In the PAM clustering diagram, both cluster 2 and cluster 3 overlap substantially with cluster 1, and the cluster structure is unclear. Comparing the K-means and PAM diagrams, K-means achieves better cluster quality on this data set.

set.seed(123)
pam1 <- pam(pca_scores, 3)
fviz_cluster(pam1, data = pca_scores, main = "PAM Clustering Plot",
             geom = "point", ellipse.type = "t", pointsize = 1,
             ggtheme = theme_classic())

fviz_silhouette(pam1)
##   cluster size ave.sil.width
## 1       1   58          0.14
## 2       2   93          0.43
## 3       3   53          0.44

6.1.3 DBSCAN

DBSCAN identifies high-density clusters and separates low-density noise points. The DBSCAN workflow here starts from the kNN-distance plot, finds a suitable inflection point, fixes the parameters accordingly, and then applies the dbscan algorithm.
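
The density criterion behind this: a point x is a core point when its eps-neighbourhood contains at least minPts observations, i.e. |N_eps(x)| ≥ minPts. Clusters grow from core points through density-reachable neighbours, and points reachable from no core point are labelled noise (cluster 0 in the output below).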

I define a loop over a grid of eps values to see clearly which eps is most appropriate, considering both the number of clusters and the amount of noise.

set.seed(123)
library(dbscan)

eps_values <- seq(0.3, 0.8, by = 0.05)

for (e in eps_values) {
  fit <- dbscan(pca_scores, eps = e, minPts = 7)
  cat("eps =", e, " clusters =", length(unique(fit$cluster)) - 1,   # minus the noise label 0
      " noise =", sum(fit$cluster == 0), "\n")
}
## eps = 0.3  clusters = 2  noise = 189 
## eps = 0.35  clusters = 3  noise = 172 
## eps = 0.4  clusters = 2  noise = 164 
## eps = 0.45  clusters = 3  noise = 142 
## eps = 0.5  clusters = 2  noise = 128 
## eps = 0.55  clusters = 3  noise = 98 
## eps = 0.6  clusters = 3  noise = 84 
## eps = 0.65  clusters = 2  noise = 76 
## eps = 0.7  clusters = 2  noise = 57 
## eps = 0.75  clusters = 1  noise = 48 
## eps = 0.8  clusters = 1  noise = 35
kNNdistplot(pca_scores, k = 10)
abline(h = 0.6, col = "red")

The most appropriate combination in this project is (eps, minPts) = (0.6, 7).

eps_val     <- 0.6
min_pts_val <- 7

dbscan_fit <- fpc::dbscan(pca_scores, eps = eps_val, MinPts = min_pts_val)

table(dbscan_fit$cluster)
## 
##  0  1  2  3 
## 84 87 27  6
indices_dbscan <- compute_indices(pca_scores, dbscan_fit$cluster) %>%
  mutate(method = "DBSCAN")

indices_dbscan
## # A tibble: 1 × 5
##       k   sil    db    ch method
##   <int> <dbl> <dbl> <dbl> <chr> 
## 1     3 0.358 0.765  68.9 DBSCAN
pca_dbscan <- map_scores %>%
  mutate(cluster = factor(ifelse(dbscan_fit$cluster == 0,
                                 "Noise",
                                 paste0("C", dbscan_fit$cluster))))

ggplot(pca_dbscan, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.8, size = 2) +
  scale_color_disc_hk() +
  labs(
    title = "DBSCAN clustering in PCA space",
    color = "Cluster"
  )

Under DBSCAN, the best number of clusters is again 3.

The graph shows three dense clusters (C1–C3), with low-density observations classified as Noise, highlighting density-based structure that K-means or PAM may not capture.
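
Before committing to one model, the three methods can be lined up on the same indices (a minimal sketch using the indices_kmeans, indices_pam, and indices_dbscan tables computed above):

# side-by-side comparison at each method's chosen setting
bind_rows(
  indices_kmeans %>% filter(k == 3),   # K-means at K = 3
  indices_pam    %>% filter(k == 3),   # PAM at K = 3
  indices_dbscan                       # DBSCAN with (eps, minPts) = (0.6, 7)
) %>%
  dplyr::select(method, k, sil, db, ch)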

7 Selected Clustering Model and Profiling

After comparing the three clustering methods, K-means delivers the best overall clustering quality. I therefore use the K-means clusters to profile the economic structure of Hong Kong.

clustered_data <- cluster_data %>%
  mutate(cluster = kmeans_labels)

cluster_summary <- clustered_data %>%
  group_by(cluster) %>%
  summarise(
    n_areas           = n(),
    avg_longitude     = mean(Longitude),
    avg_latitude      = mean(Latitude),
    avg_working       = mean(Total_Working),
    avg_high_income   = mean(High_Income),
    income_per_worker = mean(Income_per_worker),
    .groups = "drop"
  )

cluster_summary %>%
  kable(digits = 2,
        caption = "Cluster-level summary statistics") %>%
  kable_styling(full_width = FALSE)
Cluster-level summary statistics
cluster n_areas avg_longitude avg_latitude avg_working avg_high_income income_per_worker
1 64 114.08 22.42 9808.31 1877.34 0.21
2 100 114.19 22.31 11181.48 3629.75 0.35
3 40 114.15 22.36 53925.25 11695.02 0.22

The clustering results reveal three groups of areas that vary primarily in their level of economic activity.

  • Cluster 1 consists of locations with smaller working populations and fewer high-income residents.
  • Cluster 2 includes areas of intermediate size, where both the labour force and the number of high-income residents increase noticeably compared with Cluster 1.
  • Cluster 3 differs from the other two groups: although it covers fewer areas, these districts have very large working populations and substantially higher concentrations of high-income residents.

These differences are also visible in the scatter plot of High Income versus Total Working Population.

Cluster 1 and Cluster 2 lie close to each other in the lower-left part of the graph, whereas Cluster 3 extends toward the upper-right region, showing that these areas contribute much more to overall economic activity.

This pattern indicates a clear separation between smaller neighbourhoods, mid-sized districts, and major employment centres.

ggplot(clustered_data,
       aes(x = Total_Working, y = High_Income,
           color = cluster)) +
  geom_point(alpha = 0.7) +
  scale_color_disc_hk() +
  labs(
    title = "Working population and high income by cluster",
    color = "Cluster")

8 Spatial Visualisation

The aim of this paper is to give recommendations and conclusions regarding the establishment of courier stations in Hong Kong (large stations, similar to distribution centers, without considering offline consumption scenarios).

  1. From a logistics standpoint, areas in Cluster 3 present the strongest potential: their dense employment, higher income levels, and active economic environment can support substantial parcel flows and sustained operational demand.

  2. Cluster 2 can serve as an appropriate layer of secondary sites, especially for expanding coverage to nearby residential or mixed-use districts.

  3. Cluster 1, while important for final delivery, does not exhibit the conditions needed for large sorting facilities due to its smaller workforce and more modest demand.

  4. In summary, the results suggest positioning major distribution centres within Cluster 3, supported by medium-sized facilities in Cluster 2. Cluster 1 is more suitable for smaller courier stations that serve local areas. Together, this structure forms a distribution network that aligns with Hong Kong's economic spatial pattern and supports efficient parcel movement.

ggplot(clustered_data,
       aes(x = Longitude, y = Latitude)) +
  geom_point(aes(color = cluster), alpha = 0.8, size = 2) +
  scale_color_disc_hk() +
  labs(
    title = "Spatial distribution of K-means clusters",
    color = "Cluster"
  )

pal_cluster <- colorFactor(
  palette = viridis(length(levels(clustered_data$cluster)), option = "C"),
  domain  = clustered_data$cluster
)

plot_b <- leaflet(clustered_data) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    lng = ~Longitude,
    lat = ~Latitude,
    radius = 4,
    stroke = FALSE,
    fillOpacity = 0.8,
    color = ~pal_cluster(cluster),
    popup = ~paste0(            # "<br>" tags keep each value on its own line
      "Cluster: ", cluster,
      "<br>Working population: ", round(Total_Working),
      "<br>High income: ", round(High_Income),
      "<br>Income per worker: ", round(Income_per_worker, 2)
    )
  ) %>%
  addLegend(
    "bottomright",
    pal     = pal_cluster,
    values  = ~cluster,
    title   = "K-means cluster",
    opacity = 0.9
  ) %>%


 htmlwidgets::onRender("
    function(el, x) {
      var legend = el.querySelector('.legend.leaflet-control');
      if (legend) {
        legend.style.background = 'rgba(255,255,255,0.2)';
        legend.style.border = 'none';
        legend.style.boxShadow = 'none';
        
        legend.style.width = '110px';       
        legend.style.padding = '6px 8px';   
        legend.style.lineHeight = '1.1';

        var title = legend.querySelector('.legend-title');
        if (title) {
          title.style.fontSize = '12px';
          title.style.marginBottom = '4px';
        }

        var items = legend.querySelectorAll('i');
        items.forEach(function(i){
          i.style.width = '12px';    
          i.style.height = '12px';
          i.style.marginRight = '6px';
        });

        legend.style.right = '1px';
        legend.style.top = '50%';
        legend.style.left = 'auto';
        legend.style.transform = 'translateY(-50%)';
      }
    }
  ")
plot_b  

9 References


  1. Weatherill, G., & Burton, P. W. (2009). Delineation of shallow seismic source zones using K-means cluster analysis, with application to the Aegean region. Geophysical Journal International, 176(2), 565–588.

  2. du Toit, J., Davimes, R., Mohamed, A., Patel, K., & Nye, J. M. (2016). Customer segmentation using unsupervised learning on daily energy load profiles. Journal of Advances in Information Technology, 7(2), 69–75.

  3. Xiahou, X., & Harada, Y. (2022). B2C e-commerce customer churn prediction based on K-means and SVM. Journal of Theoretical and Applied Electronic Commerce Research, 17(2), 458–475.