# Load the gene dataset. read.delim() splits fields on tabs by default.
# NOTE(review): the file is named .csv; the preview below shows separate
# columns, so it is presumably tab-delimited -- confirm.
gene <- read.delim(file = "~/Google Drive/CR Rao Course/bodygene.csv")
# Preview the first two genes across the first seven columns.
gene[1:2, 1:7]
## Gene adipose adrenal brain breast colon heart
## 1 1060P11.3 0.762865 0.00000000 0.000000000 0.971663 0.0000000 0
## 2 A1CF 0.000000 0.02799675 0.000288957 0.000000 0.3955925 0
# Transpose so that every original column (Gene, adipose, adrenal, ...)
# becomes a row, then turn the resulting matrix back into a data frame.
# Row 1 of the result holds the gene names; later steps drop it via gene[-1, ].
# NOTE(review): the character `Gene` column is transposed together with the
# numeric ones, so the values are presumably stored as text after this step --
# verify before doing arithmetic on `gene` directly.
gene <- as.data.frame(t(gene))
# Calculate the Euclidean distance between the observations (one per row).
# Row 1 of the transposed data holds the gene names, so it is dropped first.
gene_dist <- dist(gene[-1, ], method = "euclidean")
# Hierarchical clustering on the distance matrix, using complete linkage
# (the hclust() default, written out here for clarity).
gene_clust <- hclust(gene_dist, method = "complete")
# Dendrogram. A negative `hang` makes all labels hang down from height 0.
plot(gene_clust, hang = -1)
# Use kmeans() to generate 3 clusters (the gene-name row is dropped first).
# kmeans() chooses its starting centres at random, so the seed is fixed to
# make this section reproducible; nstart = 25 keeps the best of 25 starts.
set.seed(123)
gene_clust2 <- kmeans(gene[-1, ], centers = 3, nstart = 25)
# Evaluate the clusters generated: one cluster number per observation.
gene_clust2$cluster
# Recorded output from an earlier run. Cluster numbers are arbitrary labels,
# so a rerun may show the same grouping with the numbers permuted.
## adipose adrenal brain breast colon
## 1 1 1 1 1
## heart kidney liver lung lymphnode
## 1 1 2 1 1
## ovary prostate skeletalmuscle testes thyroid
## 1 1 3 1 1
## whiteblood
## 1
# Determine the optimal K: fit k-means for each candidate number of clusters
# and keep only the total within-cluster sum of squares of each fit.
# The candidate values are defined once so the fits and the plot axis
# cannot drift apart.
k_values <- 2:11
# Fix the seed so the random starts (and therefore the curve) are reproducible.
set.seed(123)
# ess[i] is the total within-cluster sum of squares for k_values[i] clusters.
ess <- vapply(
  k_values,
  function(k) kmeans(gene[-1, ], centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)
# Scree plot: look for the "elbow" where adding clusters stops reducing the
# within-cluster sum of squares substantially.
plot(
  k_values, ess,
  xlab = "Number of clusters", ylab = "Within Sum of Squares",
  col = "blue", pch = 10,
  main = "Scree Plot to determine optimal K"
)
# The optimal value of K seems to be 5 or 6 (judging from the scree plot),
# so k-means clustering is run with 5 clusters.
# The seed is fixed because kmeans() starts from randomly chosen centres.
set.seed(123)
gene_clust3 <- kmeans(gene[-1, ], centers = 5, nstart = 25)
# Check the cluster assigned to each observation.
gene_clust3$cluster
# Recorded output from an earlier run. Cluster numbers are arbitrary labels,
# so a rerun may show the same grouping with the numbers permuted.
## adipose adrenal brain breast colon
## 3 5 1 3 5
## heart kidney liver lung lymphnode
## 1 1 2 5 5
## ovary prostate skeletalmuscle testes thyroid
## 1 1 4 1 1
## whiteblood
## 5