#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Part 1: Setup - Load Necessary Libraries
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if (!requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
if (!requireNamespace("breastCancerVDX", quietly = TRUE)) {
BiocManager::install("breastCancerVDX")
}
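# The CRAN packages used below (ggplot2, dplyr, pheatmap, clValid) are assumed
# to be installed already. If that may not be the case, a guard in the same
# spirit as the BiocManager check above could be added here:
cran_pkgs <- c("ggplot2", "dplyr", "pheatmap", "clValid")
missing_pkgs <- cran_pkgs[!vapply(cran_pkgs, requireNamespace, logical(1), quietly = TRUE)]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}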
# Loading all the libraries used for the analysis.
suppressMessages(library(breastCancerVDX))
suppressMessages(library(Biobase)) # Needed to work with ExpressionSet objects
suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
suppressMessages(library(pheatmap))
suppressMessages(library(clValid)) # Added for cluster validation
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Part 2: Loading and Preparing the Dataset
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Load the specific dataset object from the library. Its name is 'vdx'.
data(vdx)
# To get the numeric gene expression matrix, we use the exprs() function.
expression_data <- exprs(vdx)
# To get the sample information, we use the pData() function.
sample_info <- pData(vdx)
# For clustering SAMPLES, we need samples in rows and genes in columns.
data_for_clustering <- t(expression_data)
# Best Practice: Checking for missing values.
cat("\nNumber of missing values:", sum(is.na(data_for_clustering)), "\n")
##
## Number of missing values: 0
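# Optional sanity checks (output omitted): confirm the orientation of the
# matrices after the transpose, and look at the distribution of the ER labels
# (coded 0/1 in the VDX annotation) that we use for validation later.
dim(expression_data)                    # probes x samples, as stored in the ExpressionSet
dim(data_for_clustering)                # samples x probes, after t()
table(sample_info$er, useNA = "ifany")  # distribution of the true ER labels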
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Part 3: Comprehensive Cluster Validation with clValid
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
clvalid_results <- clValid(
  obj = data_for_clustering,                 # the numeric data, with samples in rows
  nClust = 2:5,                              # the range of cluster numbers to test
  clMethods = c("hierarchical", "kmeans"),   # the algorithms to compare
  validation = "internal"                    # the type of validation to perform
)
# Seeing the results with the summary() function.
cat("\n--- Summary of clValid Results ---\n")
##
## --- Summary of clValid Results ---
summary(clvalid_results) # summary() prints its report directly; wrapping it in print() would also echo a stray NULL
##
## Clustering Methods:
## hierarchical kmeans
##
## Cluster sizes:
## 2 3 4 5
##
## Validation Measures:
##                                  2        3        4        5
##
## hierarchical Connectivity   5.8579   5.8579   8.7869  11.7159
##              Dunn           0.6998   0.6998   0.6828   0.6828
##              Silhouette     0.1500   0.1281   0.0801   0.0648
## kmeans       Connectivity  78.4262 177.6778 214.7389 282.9452
##              Dunn           0.5218   0.5194   0.5500   0.5659
##              Silhouette     0.0658   0.0397   0.0405   0.0378
##
## Optimal Scores:
##
##              Score  Method       Clusters
## Connectivity 5.8579 hierarchical 2
## Dunn         0.6998 hierarchical 2
## Silhouette   0.1500 hierarchical 2
# Plotting the results for a quick visual comparison of the scores.
plot(clvalid_results, main = "Cluster Validation Scores")
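# The same information can also be pulled out programmatically via the clValid
# accessor methods, e.g. for building a report table. A small sketch (output
# omitted; it reuses the clvalid_results object created above):
optimalScores(clvalid_results)  # best score, method, and number of clusters per measure
measures(clvalid_results)       # full array of scores: measure x number of clusters x method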



#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Part 4: Performing and Visualizing a Specific Clustering
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Based on the clValid results, hierarchical clustering with k = 2 scores best on all three
# internal validation measures, so we use it for the final clustering.
# Step 1: Performing Hierarchical Clustering and cutting the tree into 2 clusters.
dist_matrix <- dist(data_for_clustering, method = 'euclidean')
h_clusters <- hclust(dist_matrix, method = 'ward.D2')
cluster_assignments <- cutree(h_clusters, k = 2)
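# Quick check (output omitted): cutree() also accepts a vector of k values, so the
# cluster sizes for every k tested by clValid can be inspected from the same tree.
table(cluster_assignments)         # sizes of the two clusters used below
head(cutree(h_clusters, k = 2:5))  # assignments for k = 2..5, one column per k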
# Step 2: Performing PCA to prepare for plotting.
pca_result <- prcomp(data_for_clustering, scale. = TRUE)
pca_data <- as.data.frame(pca_result$x)
# Step 3: Combining PCA results with cluster assignments and true 'er' status.
pca_data <- pca_data %>%
  mutate(
    Cluster = as.factor(cluster_assignments),
    True_ER_Status = as.factor(sample_info$er) # Extracting the true label
  )
# Step 4: Creating the PCA plot.
ggplot(pca_data, aes(x = PC1, y = PC2, color = Cluster, shape = True_ER_Status)) +
  geom_point(size = 4, alpha = 0.7) +
  labs(
    title = "PCA of VDX Breast Cancer Dataset (k=2)",
    subtitle = "Colored by Hierarchical Cluster, Shape by True ER Status",
    x = paste0("PC1 (", round(summary(pca_result)$importance[2, 1] * 100, 1), "% of variance)"),
    y = paste0("PC2 (", round(summary(pca_result)$importance[2, 2] * 100, 1), "% of variance)")
  ) +
  theme_minimal() +
  scale_color_manual(values = c("darkorange", "steelblue"))
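# The pheatmap package loaded in Part 1 offers a complementary view of the same
# structure. Below is a minimal sketch (not the figure shown above) of a heatmap
# restricted to the most variable probes, annotated with the cluster assignments
# and the true ER status; the cutoff of 500 probes is an arbitrary choice made
# here for readability.
probe_variances <- apply(expression_data, 1, var)
top_probes <- names(sort(probe_variances, decreasing = TRUE))[1:500]
annotation_df <- data.frame(
  Cluster = factor(cluster_assignments),
  ER_Status = factor(sample_info$er),
  row.names = colnames(expression_data)
)
pheatmap(
  expression_data[top_probes, ],
  annotation_col = annotation_df,
  scale = "row",             # z-score each probe across samples
  show_rownames = FALSE,
  show_colnames = FALSE,
  main = "Top 500 Most Variable Probes (VDX)"
)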

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Part 5: Creating a Summary Table for Reporting (Corrected for Label Switching)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Creating a contingency table to compare assigned clusters vs. true labels.
confusion_matrix <- table(
  Assigned_Cluster = cluster_assignments,
  True_ER_Status = sample_info$er
)
# Printing the table to the console.
cat("\n--- Confusion Matrix ---\n")
##
## --- Confusion Matrix ---
print(confusion_matrix)
##                 True_ER_Status
## Assigned_Cluster   0   1
##                1  42 200
##                2  93   9
# To calculate accuracy correctly, we must account for "label switching":
# cluster numbers are arbitrary, so the algorithm may have placed the ER-positive
# samples in "Cluster 2" rather than "Cluster 1".
# We handle this by summing both the main diagonal and the anti-diagonal and keeping the larger value.
# Sum of the main diagonal (e.g., Cluster 1 maps to Label 0, Cluster 2 to Label 1)
diagonal_sum <- sum(diag(confusion_matrix))
# Sum of the anti-diagonal (e.g., Cluster 1 maps to Label 1, Cluster 2 to Label 0)
# We can get this by reversing the matrix columns before taking the diagonal.
anti_diag_sum <- sum(diag(confusion_matrix[, rev(colnames(confusion_matrix))]))
# The number of correctly clustered samples is the maximum of these two possibilities.
correctly_clustered <- max(diagonal_sum, anti_diag_sum)
# Calculate the robust accuracy.
accuracy <- correctly_clustered / sum(confusion_matrix)
cat("\nOverall Clustering Accuracy (corrected for label switching):", round(accuracy * 100, 1), "%\n")
##
## Overall Clustering Accuracy (corrected for label switching): 85.2 %
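# The max-of-two-diagonals trick above only works for the 2-cluster, 2-label case.
# A more general agreement measure that is inherently invariant to label switching
# is the Adjusted Rand Index; a small sketch using the mclust package (an extra
# dependency, not loaded in Part 1):
if (requireNamespace("mclust", quietly = TRUE)) {
  ari <- mclust::adjustedRandIndex(cluster_assignments, sample_info$er)
  cat("Adjusted Rand Index (clusters vs. ER status):", round(ari, 3), "\n")
}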