suppressPackageStartupMessages({
library(tidyverse)
library(cluster) # silhouette
library(factoextra) # visualization helpers (optional)
})
path <- "wbc_clustering.csv"
stopifnot(file.exists(path))
raw <- read.csv(path, header = TRUE, stringsAsFactors = FALSE, check.names = FALSE)
label_candidates <- grep("diagnosis|class|target|label|cancer", names(raw), ignore.case = TRUE, value = TRUE)
label_col <- if (length(label_candidates)) label_candidates[1] else NULL
drop_like_id <- grep("id|patient|name|index|time|date", names(raw), ignore.case = TRUE, value = TRUE)
drop_cols <- unique(c(drop_like_id, label_col))
X <- raw %>% select(where(is.numeric), -any_of(drop_cols))
if (ncol(X) == 0) stop("No numeric features found for clustering. Please ensure the dataset has numeric predictors.")
impute_median <- function(v) { v[is.na(v)] <- median(v, na.rm = TRUE); v }
X_imp <- X %>% mutate(across(everything(), impute_median))
X_scaled <- scale(X_imp)
k_grid <- 2:6
sil_scores <- sapply(k_grid, function(k) {
km <- kmeans(X_scaled, centers = k, nstart = 25, iter.max = 100)
si <- silhouette(km$cluster, dist(X_scaled))
mean(si[, 3])
})
best_k <- k_grid[which.max(sil_scores)]
message(sprintf("Silhouette-based best k = %d (scores: %s)", best_k,
paste(sprintf("%d:%.3f", k_grid, sil_scores), collapse = ", ")))
set.seed(42)
km <- kmeans(X_scaled, centers = best_k, nstart = 50, iter.max = 200)
res <- raw %>% mutate(.cluster = km$cluster)
cluster_sizes <- as.data.frame(table(res$.cluster)) %>% arrange(desc(Freq))
print(cluster_sizes)
## Var1 Freq
## 1 2 345
## 2 1 33
centers <- as.data.frame(km$centers)
centers$.cluster <- 1:nrow(centers)
# Pairwise spread: for each feature, max(center) - min(center)
spread <- sort(apply(centers[ , setdiff(names(centers), ".cluster"), drop = FALSE], 2,
function(col) diff(range(col))), decreasing = TRUE)
top_features <- head(spread, 10)
cat("\nTop features separating clusters (centroid max-min spread):\n")
##
## Top features separating clusters (centroid max-min spread):
print(round(top_features, 3))
## X7 X6 X27 X8 X26 X28 X16 y X30 X13
## 2.649 2.453 2.390 2.364 2.350 2.199 2.058 2.051 2.013 1.860
if (!is.null(label_col)) {
# Normalize label to {malignant/positive, benign/negative} where possible
y_raw <- tolower(trimws(as.character(raw[[label_col]])))
y <- case_when(
y_raw %in% c("m","malignant","cancer","yes","1","positive","pos") ~ "positive",
y_raw %in% c("b","benign","no cancer","no","0","negative","neg") ~ "negative",
TRUE ~ y_raw
)
res$.label <- factor(y)
# Majority mapping from cluster -> label
mapping <- res %>%
filter(!is.na(.label)) %>%
count(.cluster, .label) %>%
group_by(.cluster) %>%
slice_max(n, with_ties = FALSE) %>%
ungroup() %>%
select(.cluster, majority_label = .label)
res <- res %>% left_join(mapping, by = ".cluster")}
# Runs only if we have the mapping and the joined columns in `res`
if (exists("mapping") &&
is.data.frame(mapping) && nrow(mapping) > 0 &&
all(c(".label", "majority_label") %in% names(res))) {
eval_df <- res %>% dplyr::filter(!is.na(.label), !is.na(majority_label))
if (nrow(eval_df) == 0) {
cat("\nNo rows have both .label and majority_label; cannot evaluate.\n")
} else {
acc <- mean(eval_df$.label == eval_df$majority_label)
# Extra metrics (if exactly 2 classes recognized)
classes <- na.omit(unique(eval_df$.label))
if (length(classes) == 2) {
positive_name <- "positive"
predicted_pos <- eval_df$majority_label == positive_name
actual_pos <- eval_df$.label == positive_name
TP <- sum(predicted_pos & actual_pos)
TN <- sum(!predicted_pos & !actual_pos)
FP <- sum(predicted_pos & !actual_pos)
FN <- sum(!predicted_pos & actual_pos)
sens <- if ((TP + FN) > 0) TP / (TP + FN) else NA_real_
spec <- if ((TN + FP) > 0) TN / (TN + FP) else NA_real_
cat(sprintf(
"\nEvaluation vs. ground truth (post-hoc):\nAccuracy: %.3f | Sensitivity: %s | Specificity: %s\n",
acc,
ifelse(is.na(sens), "NA", sprintf("%.3f", sens)),
ifelse(is.na(spec), "NA", sprintf("%.3f", spec))
))
} else {
cat(sprintf(
"\nEvaluation (multi-class/unknown labels):\nClustering accuracy (majority mapping): %.3f\n",
acc
))
}
# Show cluster → majority label map (keep this INSIDE the if)
cat("\nCluster → majority label mapping:\n")
print(mapping %>% dplyr::arrange(.cluster))
}
} else {
cat("\nNo label column detected or mapping not built. Proceeded in a fully unsupervised manner.\n")
cat("Interpretation tip: The cluster whose centroid shows higher values on\n")
cat("known malignancy-associated features (e.g., larger radii, texture, concavity)\n")
cat("would be the 'higher-likelihood' cancer group.\n")
}
##
## No label column detected or mapping not built. Proceeded in a fully unsupervised manner.
## Interpretation tip: The cluster whose centroid shows higher values on
## known malignancy-associated features (e.g., larger radii, texture, concavity)
## would be the 'higher-likelihood' cancer group.
if (best_k == 2) {
# Heuristic: the cluster with larger average across top-separating features gets flagged
tf_names <- names(top_features)
cluster_means <- centers %>%
select(all_of(tf_names)) %>%
mutate(.cluster = 1:2) %>%
rowwise() %>%
mutate(avg_top = mean(c_across(all_of(tf_names)))) %>%
ungroup()
likely_pos_cluster <- cluster_means$.cluster[which.max(cluster_means$avg_top)]
cat(sprintf("\nHeuristic (k=2): Cluster %d shows stronger 'risk-feature' intensity; flag as higher-likelihood.\n",
likely_pos_cluster))
}
##
## Heuristic (k=2): Cluster 1 shows stronger 'risk-feature' intensity; flag as higher-likelihood.
# (Helpful eyeball check; safe to comment out if not needed.)
pca <- prcomp(X_scaled, center = TRUE, scale. = FALSE)
plot_df <- as.data.frame(pca$x[, 1:2]) %>%
mutate(cluster = factor(km$cluster))
ggplot(plot_df, aes(PC1, PC2, color = cluster)) +
geom_point(alpha = 0.7, size = 1.6) +
labs(title = sprintf("Patient Clusters (k = %d) via PCA projection", best_k)) +
theme_minimal()
Cluster 1 (red) contains a relatively small group of patients that lie
farther from the main population cloud. Their separation indicates that
their underlying feature patterns differ noticeably from the majority.
These patients may represent those with abnormal or cancer-like
characteristics, as their measurements deviate strongly from the overall
average. Cluster 2 (blue) forms a larger, denser cluster, capturing most
patients whose profiles are more homogeneous and close to the population
center—likely representing non-cancer or lower-risk cases.