library(factoextra) ## for cluster visualization and heatmap
library(dplyr) ## for piping and select()
library(tidyr) ## for gather(), spread(), unite() and separate()
Format data for
clustering
## Obtain data
d <- read.csv("Pathways/data/SNVIndel.csv", header=TRUE) ## data table in csv format
df <- data.frame(d)
names(df)
## [1] "Gene" "SJQuality" "sample" "chr"
## [5] "posi" "class" "aachange" "proteinGI"
## [9] "mRNA_acc" "Mutant_In_Tumor" "Total_In_Tumor" "Mutant_In_Normal"
## [13] "Total_In_Normal" "ReferenceAllele" "MutantAllele" "Flanking"
dim(df)
## [1] 18748 16
length(unique(df$sample)) ## number of samples
## [1] 112
length(unique(df$Gene)) ## number of mutated gene types
## [1] 9434
## Create binary mutation matrix
mut.tibble_tidy <- df %>%
select(sample, Gene, chr, posi, class) %>%
mutate(sample=as.factor(sample),
Gene=as.factor(Gene),
chr=as.factor(chr),
posi=as.integer(posi),
class=as.factor(class),
) %>%
rename(gene=Gene, location=posi, chrom=chr) %>%
group_by(sample, chrom, location, class) %>% # everything but gene distinction
mutate(freq = n()) %>%
spread(key="gene", value="freq", fill=0)
dim(mut.tibble_tidy) ## event vs. gene table
## [1] 18748 9438
colnames(mut.tibble_tidy) <- gsub(" ", "_", colnames(mut.tibble_tidy)) # replace ' ' with '_'
## Collapse rows with the same sample IDs
## https://stackoverflow.com/questions/56014241/
mut.tibble <- mut.tibble_tidy %>%
group_by(sample) %>%
select(!location) %>%
#summarise(across(where(is.numeric), sum) ) ## columnwise sum
summarise(across(where(is.numeric), ~ ifelse(any(.==1),1,0)) ) ## Y/N table
dim(mut.tibble)
## [1] 112 9435
mut.tibble[11:20,c(1,15:20)] ## Display a subset
## # A tibble: 10 × 7
## sample A2ML1 A3GALT2 A4GALT AADAC AADACL3 AADAT
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 MP2PRT-GAAFAW-TTP1-A-1-0-D-A87Z-48 0 0 0 0 0 0
## 2 MP2PRT-GAAFAZ-TTP1-A-1-0-D-A87Z-48 0 0 0 0 0 0
## 3 MP2PRT-GAAFHK-TTP1-A-1-0-D-A85N-48 0 0 0 0 0 0
## 4 MP2PRT-GAAFKA-TTP1-A-1-0-D-A87P-48 0 0 0 0 0 0
## 5 MP2PRT-GAAFYE-TTM1-A-1-0-D-A85N-48 1 0 0 0 0 0
## 6 MP2PRT-GAAGMX-TTP1-A-1-0-D-A87Z-48 0 0 0 0 0 0
## 7 MP2PRT-GAAGRY-TTP1-A-1-0-D-A83Y-48 1 0 0 1 0 0
## 8 MP2PRT-GAAHGH-TTP1-A-1-0-D-A87P-48 0 0 0 0 0 0
## 9 MP2PRT-GAAHNH-TTP1-A-1-0-D-A83Y-48 0 0 0 0 0 0
## 10 MP2PRT-GAAHPD-TTP1-A-1-0-D-A83Y-48 0 0 0 0 0 1
## convert 1st column to rownames
mut.matrix <- data.frame(mut.tibble)
rownames(mut.matrix)<-mut.matrix[,1]
mut.matrix[,1]<-NULL
mut.matrix[11:20,14:19] ## Display a part of the matrix
## A2ML1 A3GALT2 A4GALT AADAC AADACL3 AADAT
## MP2PRT-GAAFAW-TTP1-A-1-0-D-A87Z-48 0 0 0 0 0 0
## MP2PRT-GAAFAZ-TTP1-A-1-0-D-A87Z-48 0 0 0 0 0 0
## MP2PRT-GAAFHK-TTP1-A-1-0-D-A85N-48 0 0 0 0 0 0
## MP2PRT-GAAFKA-TTP1-A-1-0-D-A87P-48 0 0 0 0 0 0
## MP2PRT-GAAFYE-TTM1-A-1-0-D-A85N-48 1 0 0 0 0 0
## MP2PRT-GAAGMX-TTP1-A-1-0-D-A87Z-48 0 0 0 0 0 0
## MP2PRT-GAAGRY-TTP1-A-1-0-D-A83Y-48 1 0 0 1 0 0
## MP2PRT-GAAHGH-TTP1-A-1-0-D-A87P-48 0 0 0 0 0 0
## MP2PRT-GAAHNH-TTP1-A-1-0-D-A83Y-48 0 0 0 0 0 0
## MP2PRT-GAAHPD-TTP1-A-1-0-D-A83Y-48 0 0 0 0 0 1
## Subset data for genes that are affected by >5 samples
dim(mut.matrix)
## [1] 112 9434
mut.matrix <- mut.matrix[which(colSums(mut.matrix)>5)]
dim(mut.matrix)
## [1] 112 252
Distance
evaluation
## Euclidean distance
dist.eucl <- dist(mut.matrix, method = "euclidean")
# Visualize as a heatmap
fviz_dist(dist.eucl)

## Correlation-based distance
dist.corr <- get_dist(mut.matrix, method = "pearson")
## Visualize
fviz_dist(dist.corr)

Unsupervised
Clustering
nc = 10 ## number of clusters
Hierarchical
clustering
hamming <- function(X) {
D <- (1 - X) %*% t(X)
D + t(D)
}
h.dist <- hamming(data.matrix(mut.matrix))
dist <- as.dist(h.dist)
clusters <- hclust(dist)
clustermembership.hclust <- cutree(clusters, k = nc)
## Dendrogram plot
library(ggpubr) ## for get_palette() function
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
mypalette = get_palette(cbPalette,nc)
fviz_dend(clusters,
k = nc, # cut tree into nc groups
cex = 0.5, # label size
k_colors = mypalette,
color_labels_by_k = TRUE, # color labels by groups
ggtheme = theme_gray() # Change theme
)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## Remove clusters that have <=2 members
df.hclust<-data.frame(clustermembership.hclust)
df.hclust$sample <- rownames(df.hclust) ## convert rownames to a column
rownames(df.hclust) <- NULL
dim(df.hclust)
## [1] 112 2
df.hclust <- df.hclust %>%
group_by(clustermembership.hclust) %>%
mutate(freq=n()) %>%
filter(freq>2) ## Remove clusters that have <=2 members
dim(df.hclust)
## [1] 102 3
sslist <- df.hclust$sample
## Subset data for samples that are in sslist
dim(mut.matrix)
## [1] 112 252
mut.matrix <- mut.matrix %>%
filter(rownames(mut.matrix) %in% sslist)
dim(mut.matrix)
## [1] 102 252
K-Means
set.seed(1234)
# Best of 240 iterations
result <- kmeans(x = mut.matrix, centers = nc, nstart = 240)
## Warning: did not converge in 10 iterations
clustermembership.kmeans <- fitted(result, method = "classes")
## 2D visualization
mypalette = get_palette(cbPalette,nc)
fviz_cluster(result, data = mut.matrix,
palette = mypalette,
geom = "point",
ggtheme = theme_bw()
)

mclust
library(mclust)
## Package 'mclust' version 5.4.10
## Type 'citation("mclust")' for citing this R package in publications.
BIC <- mclustBIC(mut.matrix, G = nc)
result <- Mclust(mut.matrix, nc, x = BIC)
clustermembership.mclust <- summary(result, parameters = TRUE)$classification
## 2D visualization
fviz_cluster(result,
palette = mypalette,
geom = "point",
ggtheme = theme_bw()
)
