clustering_exercise

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Part 1: Setup - Load Necessary Libraries
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

if (!requireNamespace("breastCancerVDX", quietly = TRUE)) {
  BiocManager::install("breastCancerVDX")
}

# Loading all the libraries used for the analysis.
suppressMessages(library(breastCancerVDX))
suppressMessages(library(Biobase)) # Needed to work with ExpressionSet objects

## Warning: package 'generics' was built under R version 4.5.1

suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
suppressMessages(library(pheatmap))

## Warning: package 'pheatmap' was built under R version 4.5.1

suppressMessages(library(clValid))  # Added for cluster validation

## Warning: package 'clValid' was built under R version 4.5.1

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Part 2: Loading and Preparing the Dataset
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Load the specific dataset object from the library. Its name is 'vdx'.
data(vdx)

# To get the numeric gene expression matrix, we use the exprs() function.
expression_data <- exprs(vdx)
# To get the sample information, we use the pData() function.
sample_info <- pData(vdx)

# For clustering SAMPLES, we need samples in rows and genes in columns.
data_for_clustering <- t(expression_data)

# Best Practice: Checking for missing values.
cat("\nNumber of missing values:", sum(is.na(data_for_clustering)), "\n")

## 
## Number of missing values: 0

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Part 3: Comprehensive Cluster Validation with clValid
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

clvalid_results <- clValid(
  obj = data_for_clustering,                # The numeric data for clustering
  nClust = 2:5,                             # The range of cluster numbers to test
  clMethods = c("hierarchical", "kmeans"),  # The algorithms to compare
  validation = c("internal"), # The types of validation to perform
  
  
)

# Seeing the results with the summary() function.
cat("\n--- Summary of clValid Results ---\n")

## 
## --- Summary of clValid Results ---

print(summary(clvalid_results))

## 
## Clustering Methods:
##  hierarchical kmeans 
## 
## Cluster sizes:
##  2 3 4 5 
## 
## Validation Measures:
##                                   2        3        4        5
##                                                               
## hierarchical Connectivity    5.8579   5.8579   8.7869  11.7159
##              Dunn            0.6998   0.6998   0.6828   0.6828
##              Silhouette      0.1500   0.1281   0.0801   0.0648
## kmeans       Connectivity   78.4262 177.6778 214.7389 282.9452
##              Dunn            0.5218   0.5194   0.5500   0.5659
##              Silhouette      0.0658   0.0397   0.0405   0.0378
## 
## Optimal Scores:
## 
##              Score  Method       Clusters
## Connectivity 5.8579 hierarchical 2       
## Dunn         0.6998 hierarchical 2       
## Silhouette   0.1500 hierarchical 2       
## 
## NULL

# Plotting the results for a quick visual comparison of the scores.
plot(clvalid_results, main = "Cluster Validation Scores")

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Part 4: Performing and Visualizing a Specific Clustering
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Based on the clValid results, we confirmed that k=2 is a great choice.

# Step 1: Performing Hierarchical Clustering and cutting the tree into 2 clusters.
dist_matrix <- dist(data_for_clustering, method = 'euclidean')
h_clusters <- hclust(dist_matrix, method = 'ward.D2')
cluster_assignments <- cutree(h_clusters, k = 2)

# Step 2: Performing PCA to prepare for plotting.
pca_result <- prcomp(data_for_clustering, scale. = TRUE)
pca_data <- as.data.frame(pca_result$x)

# Step 3: Combining PCA results with cluster assignments and true 'er' status.
pca_data <- pca_data %>%
  mutate(
    Cluster = as.factor(cluster_assignments),
    True_ER_Status = as.factor(sample_info$er) # Extracting the true label
  )

# Step 4: Creating the PCA plot.
ggplot(pca_data, aes(x = PC1, y = PC2, color = Cluster, shape = True_ER_Status)) +
  geom_point(size = 4, alpha = 0.7) +
  labs(
    title = "PCA of VDX Breast Cancer Dataset (k=2)",
    subtitle = "Colored by Hierarchical Cluster, Shape by True ER Status",
    x = paste0("PC1 (", round(summary(pca_result)$importance[2,1]*100, 1), "%)"),
    y = paste0("PC2 (", round(summary(pca_result)$importance[2,2]*100, 1), "%)")
  ) +
  theme_minimal() +
  scale_color_manual(values = c("darkorange", "steelblue"))

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Part 5: Creating a Summary Table for Reporting (Corrected for Label Switching)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Creating a contingency table to compare assigned clusters vs. true labels.
confusion_matrix <- table(
  Assigned_Cluster = cluster_assignments, 
  True_ER_Status = sample_info$er
)

# Printing the table to the console.
cat("\n--- Confusion Matrix ---\n")

## 
## --- Confusion Matrix ---

print(confusion_matrix)

##                 True_ER_Status
## Assigned_Cluster   0   1
##                1  42 200
##                2  93   9

# To calculate accuracy correctly, we must account for "label switching".
# The algorithm might group ER-positive samples as "Cluster 2" instead of "Cluster 1".
# We solve this by checking both the main diagonal and the anti-diagonal and picking the max.

# Sum of the main diagonal (e.g., Cluster 1 maps to Label 0, Cluster 2 to Label 1)
diagonal_sum <- sum(diag(confusion_matrix))


# Sum of the anti-diagonal (e.g., Cluster 1 maps to Label 1, Cluster 2 to Label 0)
# We can get this by reversing the matrix columns before taking the diagonal.
anti_diag_sum <- sum(diag(confusion_matrix[, rev(colnames(confusion_matrix))]))

# The number of correctly clustered samples is the maximum of these two possibilities.
correctly_clustered <- max(diagonal_sum, anti_diag_sum)

# Calculate the robust accuracy.
accuracy <- correctly_clustered / sum(confusion_matrix)

cat("\nOverall Clustering Accuracy (corrected for label switching):", round(accuracy * 100, 1), "%\n")

## 
## Overall Clustering Accuracy (corrected for label switching): 85.2 %

clustering_exercise_GSE2034.R

Pugsley

2025-09-28