library(factoextra)  ## for cluster visualization and heatmap
library(dplyr)  ## for piping and select()
library(tidyr)  ## for gather(), spread(), unite() and separate()

1 Format data for clustering

## Obtain data
d <- read.csv("Pathways/data/SNVIndel.csv", header=TRUE)  ## data table in csv format
df <- data.frame(d)
names(df)

##  [1] "Gene"             "SJQuality"        "sample"           "chr"             
##  [5] "posi"             "class"            "aachange"         "proteinGI"       
##  [9] "mRNA_acc"         "Mutant_In_Tumor"  "Total_In_Tumor"   "Mutant_In_Normal"
## [13] "Total_In_Normal"  "ReferenceAllele"  "MutantAllele"     "Flanking"

dim(df)

## [1] 18748    16

length(unique(df$sample))  ## number of samples

## [1] 112

length(unique(df$Gene)) ## number of mutated gene types

## [1] 9434

## Create binary mutation matrix
mut.tibble_tidy <- df %>%
  select(sample, Gene, chr, posi, class) %>%
  mutate(sample=as.factor(sample),
         Gene=as.factor(Gene),
         chr=as.factor(chr),
         posi=as.integer(posi),
         class=as.factor(class),
         ) %>%
  rename(gene=Gene, location=posi, chrom=chr) %>% 
  group_by(sample, chrom, location, class) %>%  # everything but gene distinction
  mutate(freq = n()) %>%
  spread(key="gene", value="freq", fill=0) 

dim(mut.tibble_tidy)  ## event vs. gene table

## [1] 18748  9438

colnames(mut.tibble_tidy) <- gsub(" ", "_", colnames(mut.tibble_tidy))  # replace ' ' with '_'

## Collapse rows with the same sample IDs 
## https://stackoverflow.com/questions/56014241/
mut.tibble <- mut.tibble_tidy %>%
  group_by(sample) %>%
  select(!location) %>%
  #summarise(across(where(is.numeric), sum) ) ## columnwise sum
  summarise(across(where(is.numeric), ~ ifelse(any(.==1),1,0)) )   ## Y/N table
dim(mut.tibble)

## [1]  112 9435

mut.tibble[11:20,c(1,15:20)]  ## Display a subset

## # A tibble: 10 × 7
##    sample                             A2ML1 A3GALT2 A4GALT AADAC AADACL3 AADAT
##    <fct>                              <dbl>   <dbl>  <dbl> <dbl>   <dbl> <dbl>
##  1 MP2PRT-GAAFAW-TTP1-A-1-0-D-A87Z-48     0       0      0     0       0     0
##  2 MP2PRT-GAAFAZ-TTP1-A-1-0-D-A87Z-48     0       0      0     0       0     0
##  3 MP2PRT-GAAFHK-TTP1-A-1-0-D-A85N-48     0       0      0     0       0     0
##  4 MP2PRT-GAAFKA-TTP1-A-1-0-D-A87P-48     0       0      0     0       0     0
##  5 MP2PRT-GAAFYE-TTM1-A-1-0-D-A85N-48     1       0      0     0       0     0
##  6 MP2PRT-GAAGMX-TTP1-A-1-0-D-A87Z-48     0       0      0     0       0     0
##  7 MP2PRT-GAAGRY-TTP1-A-1-0-D-A83Y-48     1       0      0     1       0     0
##  8 MP2PRT-GAAHGH-TTP1-A-1-0-D-A87P-48     0       0      0     0       0     0
##  9 MP2PRT-GAAHNH-TTP1-A-1-0-D-A83Y-48     0       0      0     0       0     0
## 10 MP2PRT-GAAHPD-TTP1-A-1-0-D-A83Y-48     0       0      0     0       0     1

## convert 1st column to rownames
mut.matrix <- data.frame(mut.tibble)
rownames(mut.matrix)<-mut.matrix[,1]
mut.matrix[,1]<-NULL
mut.matrix[11:20,14:19]  ## Display a part of the matrix

##                                    A2ML1 A3GALT2 A4GALT AADAC AADACL3 AADAT
## MP2PRT-GAAFAW-TTP1-A-1-0-D-A87Z-48     0       0      0     0       0     0
## MP2PRT-GAAFAZ-TTP1-A-1-0-D-A87Z-48     0       0      0     0       0     0
## MP2PRT-GAAFHK-TTP1-A-1-0-D-A85N-48     0       0      0     0       0     0
## MP2PRT-GAAFKA-TTP1-A-1-0-D-A87P-48     0       0      0     0       0     0
## MP2PRT-GAAFYE-TTM1-A-1-0-D-A85N-48     1       0      0     0       0     0
## MP2PRT-GAAGMX-TTP1-A-1-0-D-A87Z-48     0       0      0     0       0     0
## MP2PRT-GAAGRY-TTP1-A-1-0-D-A83Y-48     1       0      0     1       0     0
## MP2PRT-GAAHGH-TTP1-A-1-0-D-A87P-48     0       0      0     0       0     0
## MP2PRT-GAAHNH-TTP1-A-1-0-D-A83Y-48     0       0      0     0       0     0
## MP2PRT-GAAHPD-TTP1-A-1-0-D-A83Y-48     0       0      0     0       0     1

## Subset data for genes that are affected by >5 samples
dim(mut.matrix)

## [1]  112 9434

mut.matrix <- mut.matrix[which(colSums(mut.matrix)>5)]
dim(mut.matrix)

## [1] 112 252

2 Distance evaluation

## Euclidean distance
dist.eucl <- dist(mut.matrix, method = "euclidean")
# Visualize as a heatmap
fviz_dist(dist.eucl)

## Correlation-based distance
dist.corr <- get_dist(mut.matrix, method = "pearson")
## Visualize
fviz_dist(dist.corr)

3 Unsupervised Clustering

nc = 10  ## number of clusters

3.1 Hierarchical clustering

hamming <- function(X) {
  D <- (1 - X) %*% t(X)
  D + t(D)
}
h.dist <- hamming(data.matrix(mut.matrix))
dist <- as.dist(h.dist)
clusters <- hclust(dist)
clustermembership.hclust <- cutree(clusters, k = nc)

## Dendrogram plot
library(ggpubr)  ## for get_palette() function
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
mypalette = get_palette(cbPalette,nc)
fviz_dend(clusters, 
          k = nc,                    # cut tree into nc groups
          cex = 0.5,                 # label size
          k_colors = mypalette,
          color_labels_by_k = TRUE,  # color labels by groups
          ggtheme = theme_gray()     # Change theme
)

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

## Remove clusters that have <=2 members
df.hclust<-data.frame(clustermembership.hclust)
df.hclust$sample <- rownames(df.hclust)  ## convert rownames to a column
rownames(df.hclust) <- NULL
dim(df.hclust)

## [1] 112   2

df.hclust <- df.hclust %>%
  group_by(clustermembership.hclust) %>%
  mutate(freq=n()) %>%
  filter(freq>2)  ## Remove clusters that have <=2 members
dim(df.hclust)

## [1] 102   3

sslist <- df.hclust$sample
## Subset data for samples that are in sslist
dim(mut.matrix)

## [1] 112 252

mut.matrix <- mut.matrix %>%
  filter(rownames(mut.matrix) %in% sslist) 
dim(mut.matrix)

## [1] 102 252

3.2 K-Means

set.seed(1234)
# Best of 240 iterations
result <- kmeans(x = mut.matrix, centers = nc, nstart = 240)

## Warning: did not converge in 10 iterations

clustermembership.kmeans <- fitted(result, method = "classes")
## 2D visualization
mypalette = get_palette(cbPalette,nc)
fviz_cluster(result, data = mut.matrix,
             palette = mypalette, 
             geom = "point",
             ggtheme = theme_bw()
)

3.3 mclust

library(mclust)

## Package 'mclust' version 5.4.10
## Type 'citation("mclust")' for citing this R package in publications.

BIC <- mclustBIC(mut.matrix, G = nc)
result <- Mclust(mut.matrix, nc, x = BIC)
clustermembership.mclust <- summary(result, parameters = TRUE)$classification
## 2D visualization
fviz_cluster(result, 
             palette = mypalette, 
             geom = "point",
             ggtheme = theme_bw()
)

Unsupervised Clustering of MP2PRT data

Mitsuko Korobkin

2022-09-02

1 Format data for clustering

2 Distance evaluation

3 Unsupervised Clustering

3.1 Hierarchical clustering

3.2 K-Means

3.3 mclust