Load the data & Identify potential label column

suppressPackageStartupMessages({
  library(tidyverse)
  library(cluster)       # silhouette
  library(factoextra)    # visualization helpers (optional)
})

path <- "wbc_clustering.csv"
stopifnot(file.exists(path))
raw <- read.csv(path, header = TRUE, stringsAsFactors = FALSE, check.names = FALSE)
label_candidates <- grep("diagnosis|class|target|label|cancer", names(raw), ignore.case = TRUE, value = TRUE)
label_col <- if (length(label_candidates)) label_candidates[1] else NULL

Build a numeric-only matrix for clustering & Basic cleaning: median imputation + scaling

drop_like_id <- grep("id|patient|name|index|time|date", names(raw), ignore.case = TRUE, value = TRUE)
drop_cols <- unique(c(drop_like_id, label_col))
X <- raw %>% select(where(is.numeric), -any_of(drop_cols))
if (ncol(X) == 0) stop("No numeric features found for clustering. Please ensure the dataset has numeric predictors.")

impute_median <- function(v) { v[is.na(v)] <- median(v, na.rm = TRUE); v }
X_imp <- X %>% mutate(across(everything(), impute_median))
X_scaled <- scale(X_imp)

Pick k (number of clusters) using silhouette (2..6)

k_grid <- 2:6
sil_scores <- sapply(k_grid, function(k) {
  km <- kmeans(X_scaled, centers = k, nstart = 25, iter.max = 100)
  si <- silhouette(km$cluster, dist(X_scaled))
  mean(si[, 3])
})

best_k <- k_grid[which.max(sil_scores)]
message(sprintf("Silhouette-based best k = %d (scores: %s)", best_k,
                paste(sprintf("%d:%.3f", k_grid, sil_scores), collapse = ", ")))

Final k-means with best_k & Attach cluster to data & summarize

set.seed(42)
km <- kmeans(X_scaled, centers = best_k, nstart = 50, iter.max = 200)

res <- raw %>% mutate(.cluster = km$cluster)

cluster_sizes <- as.data.frame(table(res$.cluster)) %>% arrange(desc(Freq))
print(cluster_sizes)

##   Var1 Freq
## 1    2  345
## 2    1   33

Rank top features that distinguish clusters & Majority mapping from cluster

centers <- as.data.frame(km$centers)
centers$.cluster <- 1:nrow(centers)
# Pairwise spread: for each feature, max(center) - min(center)
spread <- sort(apply(centers[ , setdiff(names(centers), ".cluster"), drop = FALSE], 2,
                     function(col) diff(range(col))), decreasing = TRUE)
top_features <- head(spread, 10)
cat("\nTop features separating clusters (centroid max-min spread):\n")

## 
## Top features separating clusters (centroid max-min spread):

print(round(top_features, 3))

##    X7    X6   X27    X8   X26   X28   X16     y   X30   X13 
## 2.649 2.453 2.390 2.364 2.350 2.199 2.058 2.051 2.013 1.860

if (!is.null(label_col)) {
  # Normalize label to {malignant/positive, benign/negative} where possible
  y_raw <- tolower(trimws(as.character(raw[[label_col]])))
  y <- case_when(
    y_raw %in% c("m","malignant","cancer","yes","1","positive","pos") ~ "positive",
    y_raw %in% c("b","benign","no cancer","no","0","negative","neg")  ~ "negative",
    TRUE ~ y_raw
  )
  res$.label <- factor(y)

   # Majority mapping from cluster -> label
  mapping <- res %>%
    filter(!is.na(.label)) %>%
    count(.cluster, .label) %>%
    group_by(.cluster) %>%
    slice_max(n, with_ties = FALSE) %>%
    ungroup() %>%
    select(.cluster, majority_label = .label)

  res <- res %>% left_join(mapping, by = ".cluster")}

Metrics

 # Runs only if we have the mapping and the joined columns in `res`
if (exists("mapping") &&
    is.data.frame(mapping) && nrow(mapping) > 0 &&
    all(c(".label", "majority_label") %in% names(res))) {

  eval_df <- res %>% dplyr::filter(!is.na(.label), !is.na(majority_label))
  if (nrow(eval_df) == 0) {
    cat("\nNo rows have both .label and majority_label; cannot evaluate.\n")
  } else {
    acc <- mean(eval_df$.label == eval_df$majority_label)

    # Extra metrics (if exactly 2 classes recognized)
    classes <- na.omit(unique(eval_df$.label))
    if (length(classes) == 2) {
      positive_name <- "positive"
      predicted_pos <- eval_df$majority_label == positive_name
      actual_pos    <- eval_df$.label == positive_name

      TP <- sum(predicted_pos &  actual_pos)
      TN <- sum(!predicted_pos & !actual_pos)
      FP <- sum(predicted_pos & !actual_pos)
      FN <- sum(!predicted_pos &  actual_pos)

      sens <- if ((TP + FN) > 0) TP / (TP + FN) else NA_real_
      spec <- if ((TN + FP) > 0) TN / (TN + FP) else NA_real_

      cat(sprintf(
        "\nEvaluation vs. ground truth (post-hoc):\nAccuracy: %.3f | Sensitivity: %s | Specificity: %s\n",
        acc,
        ifelse(is.na(sens), "NA", sprintf("%.3f", sens)),
        ifelse(is.na(spec), "NA", sprintf("%.3f", spec))
      ))
    } else {
      cat(sprintf(
        "\nEvaluation (multi-class/unknown labels):\nClustering accuracy (majority mapping): %.3f\n",
        acc
      ))
    }

    # Show cluster → majority label map (keep this INSIDE the if)
    cat("\nCluster → majority label mapping:\n")
    print(mapping %>% dplyr::arrange(.cluster))
  }

} else {
  cat("\nNo label column detected or mapping not built. Proceeded in a fully unsupervised manner.\n")
  cat("Interpretation tip: The cluster whose centroid shows higher values on\n")
  cat("known malignancy-associated features (e.g., larger radii, texture, concavity)\n")
  cat("would be the 'higher-likelihood' cancer group.\n")
}

## 
## No label column detected or mapping not built. Proceeded in a fully unsupervised manner.
## Interpretation tip: The cluster whose centroid shows higher values on
## known malignancy-associated features (e.g., larger radii, texture, concavity)
## would be the 'higher-likelihood' cancer group.

Identify the likely higher-risk cluster(s) for k=2 case

if (best_k == 2) {
  # Heuristic: the cluster with larger average across top-separating features gets flagged
  tf_names <- names(top_features)
  cluster_means <- centers %>%
    select(all_of(tf_names)) %>%
    mutate(.cluster = 1:2) %>%
    rowwise() %>%
    mutate(avg_top = mean(c_across(all_of(tf_names)))) %>%
    ungroup()
  likely_pos_cluster <- cluster_means$.cluster[which.max(cluster_means$avg_top)]
  cat(sprintf("\nHeuristic (k=2): Cluster %d shows stronger 'risk-feature' intensity; flag as higher-likelihood.\n",
              likely_pos_cluster))
}

## 
## Heuristic (k=2): Cluster 1 shows stronger 'risk-feature' intensity; flag as higher-likelihood.

2D visualization (PCA)

#     (Helpful eyeball check; safe to comment out if not needed.)
pca <- prcomp(X_scaled, center = TRUE, scale. = FALSE)
plot_df <- as.data.frame(pca$x[, 1:2]) %>%
  mutate(cluster = factor(km$cluster))
ggplot(plot_df, aes(PC1, PC2, color = cluster)) +
  geom_point(alpha = 0.7, size = 1.6) +
  labs(title = sprintf("Patient Clusters (k = %d) via PCA projection", best_k)) +
  theme_minimal()

Cluster 1 (red) contains a relatively small group of patients that lie farther from the main population cloud. Their separation indicates that their underlying feature patterns differ noticeably from the majority. These patients may represent those with abnormal or cancer-like characteristics, as their measurements deviate strongly from the overall average. Cluster 2 (blue) forms a larger, denser cluster, capturing most patients whose profiles are more homogeneous and close to the population center—likely representing non-cancer or lower-risk cases.

MBAn556_11.03_Cancer Risk Prediction Excercise

Yuhan Wen, Yash Nagal

2025-11-03