Clustering

Clustering is an unsupervised learning technique in which a data set is partitioned into several groups, called clusters, based on similarity. Segmenting the data this way produces several clusters, and the objects within a cluster share common characteristics. In data mining and analysis, clustering is used to find groups of similar observations.

Prerequisites for clustering (these apply to RNA-seq data only):

The data should be normalized
The data should be log transformed

First, we identify the most variable genes in the dataset by computing the variance of each gene across all samples.

# Load the normalized count table. The first column of the file holds the
# gene identifiers, which are moved into the row names.
data <- read.csv("/home/pranali/Documents/cancerGenomics/GSE139505_Normalized_counts.txt", header = TRUE)
rownames(data) <- data[, 1]
# drop = FALSE keeps a data frame even if only one sample column remains.
data <- data[, -1, drop = FALSE]

# Preview the first five genes and samples.
data[1:5, 1:5]

##                         H3       H4       H6       H5       H2
## ENSG00000223972.4 16.33683 16.81758 17.27575 16.79159 15.21029
## ENSG00000227232.4 23.05195 23.36448 23.56528 23.97866 23.56359
## ENSG00000243485.2 15.53700 15.44431  0.00000 14.58642 15.56526
## ENSG00000237613.2 14.62603 16.34666  0.00000 12.77921 14.47541
## ENSG00000268020.2 15.39950  0.00000 13.34056 16.28238  0.00000

# Per-gene variance across samples (one value per row). The result is kept in
# gene_var so that stats::var() is not shadowed by a data object, and the
# data frame is converted to a matrix explicitly rather than implicitly by
# apply().
gene_var <- apply(as.matrix(data), 1, var)
# The variance becomes the first column, named "var".
data <- cbind.data.frame(var = gene_var, data)

# Order the genes from most to least variable.
data <- data[order(data[, 1], decreasing = TRUE), ]
data[1:5, 1:5]

##                         var       H3       H4 H6       H5
## ENSG00000183878.11 196.7960 14.39953  0.00000  0  0.00000
## ENSG00000067646.7  170.3346  0.00000  0.00000  0  0.00000
## ENSG00000012817.11 166.8904  0.00000  0.00000  0  0.00000
## ENSG00000114374.8  153.3661 12.39973 14.28047  0 13.15768
## ENSG00000233864.3  150.6696  0.00000  0.00000  0  0.00000

# Subset the top 2000 most variable genes and drop the helper variance column
# (column 1). head() stops at the number of available rows, so a dataset with
# fewer than 2000 genes is not padded with all-NA rows the way
# data[1:2000, ] would pad it.
mat <- head(data[, -1, drop = FALSE], 2000)
head(mat)

##                          H3       H4 H6       H5       H2 H1       H7    cSCC1
## ENSG00000183878.11 14.39953  0.00000  0  0.00000 28.72964  0  0.00000 27.51570
## ENSG00000067646.7   0.00000  0.00000  0  0.00000 25.75521  0  0.00000 25.48191
## ENSG00000012817.11  0.00000  0.00000  0  0.00000 26.19663  0  0.00000 25.78352
## ENSG00000114374.8  12.39973 14.28047  0 13.15768 27.11178  0  0.00000 26.38798
## ENSG00000233864.3   0.00000  0.00000  0  0.00000 24.04791  0  0.00000 23.71040
## ENSG00000131002.7   0.00000  0.00000  0  0.00000 26.60048  0 11.16736 25.91130
##                       cSCC8    cSCC4    cSCC3    cSCC7    cSCC9  cSCC1.1 cSCC5
## ENSG00000183878.11  0.00000  0.00000 29.31185 27.36728 28.00216 27.35417     0
## ENSG00000067646.7   0.00000  0.00000 26.30786 24.43659 25.19366 25.31976     0
## ENSG00000012817.11  0.00000  0.00000 25.20058 24.87410 25.32313 25.50089     0
## ENSG00000114374.8   0.00000  0.00000 27.25423 25.82276 26.65178 26.26681     0
## ENSG00000233864.3   0.00000  0.00000 24.48189 23.04688 24.04877 23.64001     0
## ENSG00000131002.7  11.75239 11.50795 25.21152 24.70503 24.72932 25.50241     0
##                       cSCC6
## ENSG00000183878.11 28.38960
## ENSG00000067646.7  25.74802
## ENSG00000012817.11 23.46190
## ENSG00000114374.8  27.04300
## ENSG00000233864.3  24.66318
## ENSG00000131002.7  25.82102

# Perform t-SNE on the samples, then k-means on the 2-D embedding.
library(Rtsne)

# Both Rtsne() and kmeans() start from random initial states; fixing the seed
# makes the embedding and the cluster assignment reproducible between runs.
set.seed(42)

# Rtsne() embeds rows, so the gene-by-sample matrix is transposed to put one
# sample on each row.
res <- Rtsne(t(mat), check_duplicates = FALSE, perplexity = 5)
res1 <- res$Y
rownames(res1) <- colnames(mat)
colnames(res1) <- c("tSNE_1", "tSNE_2")

# Partition the embedded samples into two clusters.
clus <- kmeans(res1, centers = 2)
res1 <- cbind(res1, clus$cluster)
colnames(res1) <- c("tSNE_1", "tSNE_2", "clusters")

# Sample labels, in column order: the first 7 columns are labelled "normal"
# and the remaining 9 "skcm".
# NOTE(review): the tumour samples are named cSCC*, yet the label says "skcm";
# confirm the intended label.
label <- c(rep("normal", 7), rep("skcm", 9))
# Fail early if the hard-coded labels do not line up with the samples.
stopifnot(length(label) == nrow(res1))
res1 <- cbind.data.frame(res1, label)

head(res1)

##      tSNE_1   tSNE_2 clusters  label
## H3 78.04466 26.53003        1 normal
## H4 65.03103 25.48136        1 normal
## H6 53.26074 32.94732        1 normal
## H5 62.48978 34.49802        1 normal
## H2 49.91713 41.16131        1 normal
## H1 47.10706 27.37512        1 normal

library(ggplot2)
# Scatter plot of the two t-SNE dimensions, one point per sample, coloured by
# the sample label.
tsne_aes <- aes(x = tSNE_1, y = tSNE_2, color = label)
ggplot(data = res1, mapping = tsne_aes) +
  geom_point()

library(ComplexHeatmap)

## Loading required package: grid

## ========================================
## ComplexHeatmap version 2.8.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================

library(circlize)

## ========================================
## circlize version 0.4.13
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================

# Column annotation for the heatmap: one row per sample, holding its label.
df <- data.frame(labels = res1$label)

library(matrixStats)
# Z-score each gene (row): subtract the row mean and divide by the row
# standard deviation. Working on a numeric matrix lets the length-nrow
# vectors recycle down the columns, so every row is scaled by its own
# mean and SD. It also hands Heatmap() a matrix instead of a data frame.
mat_matrix <- as.matrix(mat)
mat1 <- (mat_matrix - rowMeans(mat_matrix)) / rowSds(mat_matrix)

Next, for hierarchical clustering, we use the `Heatmap()` function from ComplexHeatmap, which clusters both rows and columns by default, to plot the row-scaled data.

# Clustered heatmap of the row-scaled expression values, annotated on top
# with the sample labels. The colour scale runs from -2 (orangered) through
# 0 (white) to 2 (purple). Gene names are hidden with show_row_names rather
# than a zero font size, and the input is passed as a matrix to avoid the
# data-frame conversion warning.
Heatmap(
  as.matrix(mat1),
  top_annotation = HeatmapAnnotation(df = df),
  col = colorRamp2(c(-2, 0, 2), c("orangered", "white", "purple")),
  show_row_names = FALSE
)

## Warning: The input is a data frame, convert it to the matrix.

Clustering

Pranali

25/10/2021