R Notebook

Geometries are projected to a metric coordinate system (EPSG:2264) to enable distance-based clustering. Spatial coordinates are derived using interior points, and time is transformed into a numeric index relative to the start year.

k-distance plots are used to identify appropriate spatial neighborhood thresholds. Percentile-based summaries provide candidate values for ε (epsilon), supporting a data-driven parameter selection.

MIN_PTS_LIST <- c(10, 15, 20, 25, 30, 40, 50)

coords_xy <- ABT_proj %>%
  st_drop_geometry() %>%
  select(x, y) %>%
  as.matrix()

storage.mode(coords_xy) <- "double"

plots_spatial <- list()
results_spatial <- list()

for (k in MIN_PTS_LIST) {

  nn <- get.knn(coords_xy, k = k)
  k_dist <- sort(nn$nn.dist[, k])

  df_k <- data.frame(index = seq_along(k_dist), dist = k_dist)

  plots_spatial[[as.character(k)]] <- ggplot(df_k, aes(index, dist)) +
    geom_line(color = "steelblue") +
    labs(title = paste("MinPts =", k),
         x = "Sorted observations",
         y = "k-distance (feet)") +
    theme_minimal()

  results_spatial[[as.character(k)]] <- data.frame(
    MinPts = k,
    p85 = quantile(k_dist, 0.85),
    p90 = quantile(k_dist, 0.90),
    p95 = quantile(k_dist, 0.95),
    p96 = quantile(k_dist, 0.96),
    p97 = quantile(k_dist, 0.97),
    p98 = quantile(k_dist, 0.98),
    p99 = quantile(k_dist, 0.99)
  )
}

results_spatial_df <- bind_rows(results_spatial)
results_spatial_df

grid.arrange(grobs = plots_spatial, ncol = 3)

Final clustering parameters are selected based on diagnostic outputs. The spatial threshold is derived from percentile-based k-distance statistics, and the temporal window is defined based on domain knowledge.

MIN_PTS <- 15

eps_spatial <- results_spatial_df %>%
  filter(MinPts == MIN_PTS) %>%
  pull(p96)

TEMPORAL_WINDOW_YEARS <- 2

cat("Selected properties:", "\n")

## Selected properties:

cat("MinPts:", MIN_PTS, "\n")

## MinPts: 15

cat("eps_spatial:", eps_spatial, "feet\n")

## eps_spatial: 5445.264 feet

cat("Temporal window:", TEMPORAL_WINDOW_YEARS, "years\n")

## Temporal window: 2 years

ST-DBSCAN is applied to identify clusters in the combined spatio-temporal space.

Clusters are visualized spatially to assess geographic patterns.

ABT_wgs <- st_transform(ABT_proj, 4326)

# Ensure cluster is factor
ABT_wgs$cluster <- as.factor(ABT_wgs$cluster)

# Identify noise label
noise_label <- "0"

# Define color palette
cluster_levels <- levels(ABT_wgs$cluster)

cluster_colors <- setNames(
  scales::hue_pal()(length(cluster_levels)),
  cluster_levels
)

# Override noise color
if (noise_label %in% cluster_levels) {
  cluster_colors[noise_label] <- "grey80"
}

pal <- leaflet::colorFactor(
  palette = cluster_colors,
  domain = cluster_levels,
  na.color = "grey80"
)

# Leaflet map
leaflet(ABT_wgs) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  
  addPolygons(
    fillColor = ~pal(cluster),
    fillOpacity = 0.6,
    color = ~pal(cluster),
    weight = 0.2,
    smoothFactor = 0.2,
    
    # Hover label
    label = ~paste0(
      "Cluster: ", cluster,
      "<br>Year: ", year
    ) %>% lapply(htmltools::HTML),
    
    # Click popup
    popup = ~paste0(
      "<b>Cluster:</b> ", cluster,
      "<br><b>Year:</b> ", year
    )
  ) %>%
  
  addLegend(
    position = "bottomleft",
    pal = pal,
    values = cluster_levels,
    title = "Clusters",
    opacity = 1
  )

safe_int_criteria <- function(x, labels) {
  labs <- as.integer(as.factor(labels))
  if (length(unique(labs)) < 2) {
    return(list(davies_bouldin = NA_real_, calinski_harabasz = NA_real_))
  }

  out <- clusterCrit::intCriteria(
    traj = as.matrix(x),
    part = labs,
    crit = c("Davies_Bouldin", "Calinski_Harabasz")
  )

  list(
    davies_bouldin = unname(out$davies_bouldin),
    calinski_harabasz = unname(out$calinski_harabasz)
  )
}

safe_silhouette <- function(x, labels) {
  labs <- as.integer(as.factor(labels))
  if (length(unique(labs)) < 2) return(NA_real_)
  sil <- cluster::silhouette(labs, dist(x))
  mean(sil[, 3])
}

safe_dbcv <- function(x, labels) {
  x <- as.matrix(x)
  labs <- as.integer(as.factor(labels))

  if (length(unique(labs)) < 2 || nrow(x) < 50) return(NA_real_)

  out <- tryCatch(dbscan::dbcv(x, labs),
                  error = function(e) NULL,
                  warning = function(w) NULL)

  if (is.numeric(out)) return(out)

  if (is.list(out)) {
    for (nm in c("score", "dbcv", "value")) {
      if (!is.null(out[[nm]]) && is.numeric(out[[nm]])) {
        return(as.numeric(out[[nm]]))
      }
    }
  }

  NA_real_
}

evaluate_partition <- function(data_mat, cluster_vec) {

  noise_idx <- cluster_vec <= 0
  valid_idx <- which(!noise_idx)

  n_clusters <- length(unique(cluster_vec[cluster_vec > 0]))
  noise_ratio <- sum(noise_idx) / length(cluster_vec)

  out <- list(
    n_clusters = n_clusters,
    noise_ratio = noise_ratio,
    silhouette = NA_real_,
    davies_bouldin = NA_real_,
    calinski_harabasz = NA_real_,
    dbcv = NA_real_
  )

  if (length(valid_idx) >= 50 && n_clusters >= 2) {

    x_valid <- data_mat[valid_idx, , drop = FALSE]
    cl_valid <- cluster_vec[valid_idx]

    out$silhouette <- safe_silhouette(x_valid, cl_valid)

    ic <- safe_int_criteria(x_valid, cl_valid)
    out$davies_bouldin <- ic$davies_bouldin
    out$calinski_harabasz <- ic$calinski_harabasz

    if (min(table(cl_valid)) >= 15) {
      out$dbcv <- safe_dbcv(x_valid, cl_valid)
    }
  }

  out
}

run_stdbscan_once <- function(data_mat, eps_spatial, eps_temporal, min_pts) {
  stdbscan::st_dbscan(
    data = data_mat,
    eps_spatial = eps_spatial,
    eps_temporal = eps_temporal,
    min_pts = min_pts,
    search = "kdtree",
    splitRule = "STD"
  )
}

Run Sensitivity Analysis among multiple eps, minimum neighbors and time window

# --- Run Grid Evaluation ---
grid_results <- pmap_dfr(
  param_grid,
  function(eps_mult, min_pts, time_window, eps_spatial_test, eps_temporal_test) {
    
    res <- run_stdbscan_once(
      data_mat,
      eps_spatial = eps_spatial_test,
      eps_temporal = eps_temporal_test,
      min_pts = min_pts
    )
    
    ev <- evaluate_partition(data_mat, res$cluster)
    
    tibble(
      eps_mult = eps_mult,
      min_pts = min_pts,
      time_window = time_window,
      n_clusters = ev$n_clusters,
      noise_ratio = ev$noise_ratio,
      silhouette = ev$silhouette,
      davies_bouldin = ev$davies_bouldin,
      calinski_harabasz = ev$calinski_harabasz,
      dbcv = ev$dbcv
    )
  }
)

# --- Factor + Labels ---
time_labels <- c(
  "1" = "1-Year (strict adjacency)",
  "3" = "3-Year (short persistence)",
  "5" = "5-Year (medium persistence)",
  "7" = "7-Year (long persistence)",
  "9" = "9-Year (very long / chaining risk)"
)

grid_results <- grid_results %>%
  mutate(time_window = factor(time_window, levels = c(1,3,5,7,9)))

# ============================================================
# 1. STRUCTURAL DIAGNOSTICS
# ============================================================

# --- Noise Ratio ---
# Interpretation:
# ↓ lower = less over-constrained
# ↑ high noise = too strict (small ε or small time window)
ggplot(grid_results,
       aes(x = factor(min_pts), y = noise_ratio, fill = factor(eps_mult))) +
  geom_col(position = "dodge") +
  facet_wrap(~ time_window, labeller = labeller(time_window = time_labels)) +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal() +
  labs(title = "Noise Ratio Across Parameter Space",
       subtitle = "Lower = better coverage; high values indicate overly strict clustering",
       x = "MinPts", y = "Noise Ratio", fill = "ε multiplier")

# --- Number of Clusters ---
# Interpretation:
# Stable cluster counts across parameters = robust structure
# Large variation = unstable clustering
ggplot(grid_results,
       aes(x = factor(min_pts), y = n_clusters, fill = factor(eps_mult))) +
  geom_col(position = "dodge") +
  facet_wrap(~ time_window, labeller = labeller(time_window = time_labels)) +
  theme_minimal() +
  labs(title = "Cluster Count Stability",
       subtitle = "Stable values indicate robust clustering structure",
       x = "MinPts", y = "Number of Clusters", fill = "ε multiplier")

# ============================================================
# 2. VALIDITY METRICS (HEATMAPS)
# ============================================================

# NOTE:
# - DBCV is the most reliable for DBSCAN
# - Silhouette / DB / CH are approximate (Euclidean assumption)

grid_long <- grid_results %>%
  pivot_longer(cols = c(silhouette, davies_bouldin, calinski_harabasz, dbcv),
               names_to = "metric", values_to = "value")

grid_long$metric <- recode(grid_long$metric,
  "silhouette" = "Silhouette (approx)",
  "davies_bouldin" = "Davies-Bouldin (↓ better)",
  "calinski_harabasz" = "Calinski-Harabasz (↑ better)",
  "dbcv" = "DBCV (density validity)"
)

plot_metric <- function(metric_name) {
  ggplot(grid_long %>% filter(metric == metric_name),
         aes(x = factor(min_pts), y = factor(eps_mult), fill = value)) +
    geom_tile() +
    facet_wrap(~ time_window, labeller = labeller(time_window = time_labels)) +
    theme_minimal() +
    labs(title = paste(metric_name, "Across Parameter Space"),
         x = "MinPts", y = "ε multiplier", fill = metric_name)
}

# --- Plot all metrics ---
plot_metric("Silhouette (approx)")

plot_metric("Davies-Bouldin (↓ better)")

plot_metric("Calinski-Harabasz (↑ better)")

plot_metric("DBCV (density validity)")

# ============================================================
# 3. COMPOSITE ROBUSTNESS SCORE
# ============================================================

# Interpretation:
# Combines multiple criteria into one surface
# Higher = more stable, balanced clustering

grid_ranked <- grid_results %>%
  mutate(
    dbcv_norm  = scales::rescale(dbcv, to = c(0,1), na.rm = TRUE),
    sil_norm   = scales::rescale(silhouette, to = c(0,1), na.rm = TRUE),
    db_norm    = scales::rescale(-davies_bouldin, to = c(0,1), na.rm = TRUE),
    noise_norm = scales::rescale(-noise_ratio, to = c(0,1), na.rm = TRUE)
  ) %>%
  mutate(composite_score = rowMeans(cbind(dbcv_norm, sil_norm, db_norm, noise_norm), na.rm = TRUE))

ggplot(grid_ranked,
       aes(x = factor(min_pts), y = factor(eps_mult), fill = composite_score)) +
  geom_tile() +
  facet_wrap(~ time_window, labeller = labeller(time_window = time_labels)) +
  theme_minimal() +
  labs(title = "Composite Robustness Surface",
       subtitle = "Higher values indicate stable and well-balanced clustering",
       x = "MinPts", y = "ε multiplier", fill = "Score")