# Header is kept False because we don't have any gene type names, so we don't want to keep the first row as the header.
# A False Header willl give another default header (row-column names) to the data
gene_express <- read.csv('D:/MLDS/Datasets/gene_expression.csv', header = FALSE)
# Few top rows of the dataset:
head(gene_express)## V1 V2 V3 V4 V5 V6
## 1 -0.96193340 0.4418028 -0.9750051 1.4175040 0.8188148 0.3162937
## 2 -0.29252570 -1.1392670 0.1958370 -1.2811210 -0.2514393 2.5119970
## 3 0.25878820 -0.9728448 0.5884858 -0.8002581 -1.8203980 -2.0589240
## 4 -1.15213200 -2.2131680 -0.8615249 0.6309253 0.9517719 -1.1657240
## 5 0.19578280 0.5933059 0.2829921 0.2471472 1.9786680 -0.8710180
## 6 0.03012394 -0.6910143 -0.4034258 -0.7298590 -0.3640986 1.1253490
## V7 V8 V9 V10 V11 V12
## 1 -0.02496682 -0.06396600 0.03149702 -0.3503106 -0.7227299 -0.2819547
## 2 -0.92220620 0.05954277 -1.40964500 -0.6567122 -0.1157652 0.8259783
## 3 -0.06476437 1.59212400 -0.17311700 -0.1210874 -0.1875790 -1.5001630
## 4 -0.39155860 1.06361900 -0.35000900 -1.4890580 -0.2432189 -0.4330340
## 5 -0.98971500 -1.03225300 -1.10965400 -0.3851423 1.6509570 -1.7449090
## 6 -1.40404100 -0.80613040 -1.23792400 0.5776018 -0.2720642 2.1765620
## V13 V14 V15 V16 V17 V18
## 1 1.33751500 0.70197980 1.0076160 -0.4653828 0.6385951 0.2867807
## 2 0.34644960 -0.56954860 -0.1315365 0.6902290 -0.9090382 1.3026420
## 3 -1.22873700 0.85598900 1.2498550 -0.8980815 0.8702058 -0.2252529
## 4 -0.03879128 -0.05789677 -1.3977620 -0.1561871 -2.7359820 0.7756169
## 5 -0.37888530 -0.67982610 -2.1315840 -0.2301718 0.4661243 -1.8004490
## 6 1.43640700 -1.02578100 0.2981582 -0.5559659 0.2046529 -1.1916480
## V19 V20 V21 V22 V23 V24
## 1 -0.2270782 -0.22004520 -1.2425730 -0.1085056 -1.8642620 -0.5005122
## 2 -1.6726950 -0.52550400 0.7979700 -0.6897930 0.8995305 0.4285812
## 3 0.4502892 0.55144040 0.1462943 0.1297400 1.3042290 -1.6619080
## 4 0.6141562 2.01919400 1.0811390 -1.0766180 -0.2434181 0.5134822
## 5 0.6262904 -0.09772305 -0.2997108 -0.5295591 -2.0235670 -0.5108402
## 6 0.2350916 0.67096470 0.1307988 1.0689940 1.2309870 1.1344690
## V25 V26 V27 V28 V29 V30
## 1 -1.32500800 1.06341100 -0.2963712 -0.1216457 0.08516605 0.62417640
## 2 -0.67611410 -0.53409490 -1.7325070 -1.6034470 -1.08362000 0.03342185
## 3 -1.63037600 -0.07742528 1.3061820 0.7926002 1.55946500 -0.68851160
## 4 -0.51285780 2.55167600 -2.3143010 -1.2764700 -1.22927100 1.43439600
## 5 0.04600274 1.26803000 -0.7439868 0.2231319 0.85846280 0.27472610
## 6 0.55636800 -0.35876640 1.0798650 -0.2064905 -0.00616453 0.16425470
## V31 V32 V33 V34 V35 V36
## 1 -0.5095915 -0.216725500 -0.05550597 -0.4844491 -0.5215811 1.9491350
## 2 1.7007080 0.007289556 0.09906234 0.5638533 -0.2572752 -0.5817805
## 3 -0.6154720 0.009999363 0.94581000 -0.3185212 -0.1178895 0.6213662
## 4 -0.2842774 0.198945600 -0.09183320 0.3496279 -0.2989097 1.5136960
## 5 -0.6929984 -0.845707200 -0.17749680 -0.1664908 1.4831550 -1.6879460
## 6 1.1567370 0.241774500 0.08863952 0.1829540 0.9426771 -0.2096004
## V37 V38 V39 V40
## 1 1.32433500 0.4681471 1.06110000 1.6559700
## 2 -0.16988710 -0.5423036 0.31293890 -1.2843770
## 3 -0.07076396 0.4016818 -0.01622713 -0.5265532
## 4 0.67118470 0.0108553 -1.04368900 1.6252750
## 5 -0.14142960 0.2007785 -0.67594210 2.2206110
## 6 0.53626210 -1.1852260 -0.42274760 0.6243603
There are several methods of “dissimilarity” (correlation based distance) which can be explored for creation of distance matrix.
The hclust() function implements hierarchical clustering in R.
We now plot the hierarchical clustering dendrogram using complete, single, and average linkage clustering using correlation based distance.
dd=as.dist(1- cor(gene_express))
# To plot the dendrograms obtained using the usual plot() function.
# The numbers at the bottom of the plot identify each observation.
plot(hclust(dd, method ="complete"), main=" Complete Linkage
with Correlation - Based Distance ", xlab="", sub ="")plot(hclust(dd, method ="average"), main=" Average Linkage
with Correlation - Based Distance ", xlab="", sub ="")plot(hclust(dd, method ="single"), main=" Single Linkage
with Correlation - Based Distance ", xlab="", sub ="")Inference:
To see which genes differ the most across the two groups(healthy,diseased), I perform the PCA on the data using the prcomp() function
By default, the prcomp() function centers the variables to have mean zero. By using the option scale=TRUE, we scale the variables to have standard deviation one.
The rotation matrix provides the principal component loadings; each column of pr.out$rotation contains the corresponding principal component loading vector.
In this case, the loadings can be considered to be the weight of each gene in both the groups
pr.out <- prcomp(t(gene_express), scale = TRUE)
head(pr.out$rotation)## PC1 PC2 PC3 PC4 PC5
## [1,] -0.005248643 0.02924652 -0.003735358 -0.016526153 0.006318053
## [2,] -0.002162728 0.04373711 -0.071475379 0.036521467 -0.010739305
## [3,] 0.014444163 0.01361593 -0.011068681 -0.075011306 0.002396073
## [4,] 0.014868417 -0.01975169 -0.038690207 0.018872555 -0.007595757
## [5,] 0.010600362 -0.02949636 0.016983813 0.005730388 -0.019518632
## [6,] 0.030133863 0.03181327 0.016420667 0.016799619 0.037829117
## PC6 PC7 PC8 PC9 PC10
## [1,] -0.049838208 -0.01283006 0.0215726238 0.007640568 -0.042478511
## [2,] 0.044918065 0.02817240 -0.0372224597 -0.030437604 0.077891794
## [3,] -0.073571187 -0.03590103 0.0385994243 0.013067476 0.004386205
## [4,] -0.006314036 0.04078925 0.0032669777 0.013650293 -0.003238640
## [5,] 0.025142879 0.03502170 0.0357811899 0.036519007 -0.078427052
## [6,] 0.076979558 0.01745551 0.0002496599 -0.037883486 0.001399260
## PC11 PC12 PC13 PC14 PC15
## [1,] -0.06799323 -0.066325536 0.049657614 -0.0105490714 0.002388065
## [2,] 0.03504599 0.007576127 -0.013509593 -0.0120626555 0.017620628
## [3,] 0.02675494 -0.004421772 -0.023358127 -0.0002795565 0.005085890
## [4,] -0.04521873 -0.084699633 -0.002718848 -0.0170544115 -0.047018603
## [5,] -0.06127074 -0.004607158 0.021907325 -0.0174513795 -0.014291851
## [6,] 0.06333435 0.001071488 0.031586824 0.0091279070 0.012496721
## PC16 PC17 PC18 PC19 PC20
## [1,] 0.057226217 0.005064164 -0.0368384794 -0.01386445 0.008475267
## [2,] -0.038753365 0.018177190 0.0065293541 0.02616184 -0.008769863
## [3,] -0.008360650 -0.033206319 0.0036294280 -0.02014200 -0.024457400
## [4,] -0.027114939 -0.037841235 0.0090504436 0.03994846 0.014224876
## [5,] -0.010468740 -0.027149195 0.0487582488 0.02195479 0.060265982
## [6,] 0.004923492 -0.003525826 0.0006219705 0.03129817 0.023261190
## PC21 PC22 PC23 PC24 PC25
## [1,] 0.0194249905 0.000140247 -0.02989553 -0.076029645 -0.004324215
## [2,] -0.0078725765 -0.026677304 -0.03100535 -0.002287257 -0.021849136
## [3,] -0.0093182521 0.024043186 0.05819660 0.018947343 0.014286045
## [4,] 0.0554784102 0.059728272 -0.04416882 0.030836222 0.002601610
## [5,] -0.0000414575 -0.039202866 0.01982787 -0.037371354 0.003398134
## [6,] -0.0105442468 0.006799149 -0.06354254 0.019801778 0.007791356
## PC26 PC27 PC28 PC29 PC30
## [1,] -0.005802548 0.053495473 0.03324798 0.039595474 0.0008190378
## [2,] -0.018643636 0.051218992 0.01319940 -0.005557906 -0.0085837033
## [3,] -0.012192009 -0.083097382 0.02438412 0.012882579 -0.0032415950
## [4,] 0.018058462 0.041462682 0.05223420 0.003324611 0.0449676174
## [5,] -0.027906713 0.014694086 -0.01277924 0.001121646 0.0203340545
## [6,] -0.053747211 0.008386746 -0.01655763 -0.005120959 -0.0673679957
## PC31 PC32 PC33 PC34 PC35
## [1,] 0.04451014 0.0001689378 -0.025423202 -0.010178476 0.021477278
## [2,] 0.06072757 -0.0089395354 0.042380402 0.005784563 0.002459284
## [3,] -0.02798607 -0.0406599481 -0.004561497 0.077879450 -0.005489991
## [4,] 0.02768043 -0.0086310349 0.014854858 -0.006997894 -0.023272536
## [5,] -0.06631350 -0.0079579383 0.033064563 -0.017950552 -0.050971271
## [6,] -0.01807539 -0.0222340328 0.016882833 0.005770649 0.067631690
## PC36 PC37 PC38 PC39 PC40
## [1,] -0.019328540 0.027045885 -0.01467986 -0.00732158 0.100069783
## [2,] 0.042550057 0.004536028 0.01595791 0.02267139 0.001950052
## [3,] 0.017625830 0.003143908 -0.02153624 -0.01162287 0.023821341
## [4,] 0.015260530 -0.009337707 -0.03368954 0.01920661 0.111874990
## [5,] -0.038077959 -0.003937376 0.02531928 -0.02512450 -0.045837979
## [6,] 0.008485763 -0.044439490 -0.01276119 -0.01742770 -0.108836513
# apply() function allows us to apply a function-in this case, the sum() function-to each row of the data set
# The second input here denotes whether we wish to compute the sum of the rows, 1, or the columns, 2.
# So, I am summing all the weights of each gene to know what is the total weightage of each gene in this distribution
gene.load <- apply(pr.out$rotation, 1, sum)
# Now, I am arranging the weightage obtained above in a descending order by taking the absolute value of the total loadings for each gene
gene.differ <- order(abs(gene.load), decreasing = TRUE)
# To show the top most different genes across the two groups
gene.differ[1:20]## [1] 889 676 755 960 907 19 475 673 374 174 716 878 327 567 840 822 281
## [18] 370 984 138
set.seed(174004689)
# Generating 20 observations for each of the three classes
y_class <- rep(c(1,2,3),20 )
# Using rnorm() function to randomly create a matrix of 50 cols (variables) * 60 rows (observations):
x_simulatn <- matrix(rnorm(60*50), ncol=50)
# add a mean shift to the observations in each class so that there are three distinct classes:
x_simulatn[y_class==2,]= x_simulatn[y_class==2,] - .7
x_simulatn[y_class==3,]= x_simulatn[y_class==3,] + .7
# Giving Give the matrix column and row names
dimnames(x_simulatn) <- list(rownames(x_simulatn, do.NULL = FALSE, prefix ="row"), colnames(x_simulatn, do.NULL = FALSE, prefix = "col"))
# Below function will be used to assign a color to each of the 3 classes, based on the data to which it corresponds.
#the rainbow() function takes as its argument a positive integer, and returns a vector containing that number of distinct colors
Cols=function(vec ){
cols=rainbow(length(unique(vec)));
return (cols[as.numeric(as.factor(vec))]);
}# PCA using prcomp():
pr.out =prcomp(x_simulatn, scale =TRUE)
# To see if the three classes appear distinctly separated, plotting the first two principal component score vectors:
# 0.7 shift in the data above allows the distinct seperation in the below plot:
plot(pr.out$x[,1:2], col=Cols(y_class), pch =19, xlab ="First principal component", ylab="Second principal component")km.out.3 = kmeans(x_simulatn, 3, nstart =20)
table(km.out.3$cluster, y_class, dnn=c("Clusters","Class Labels"))## Class Labels
## Clusters 1 2 3
## 1 0 20 0
## 2 0 0 19
## 3 20 0 1
km.out.2 =kmeans(x_simulatn, 2, nstart =20)
table(km.out.2$cluster, y_class, dnn=c("Cluster","Class Labels"))## Class Labels
## Cluster 1 2 3
## 1 19 20 0
## 2 1 0 20
km.out.4 =kmeans(x_simulatn, 4, nstart =20)
table(km.out.4$cluster, y_class, dnn=c("Cluster","Class Labels"))## Class Labels
## Cluster 1 2 3
## 1 0 13 0
## 2 0 7 0
## 3 20 0 1
## 4 0 0 19
km.out.pca =kmeans(pr.out$x[,1:2], 3, nstart =20)
table(km.out.pca$cluster, y_class, dnn=c("Cluster","Class Labels"))## Class Labels
## Cluster 1 2 3
## 1 0 20 0
## 2 20 0 0
## 3 0 0 20