
This document is an Explorative Data Analysis of Genome Predictive Analytics Train and Test Data. The exploratory analysis includes Heatmaps, Hierarchical Clustering, Random Forest Machine Learning, Kmeans Clustering, and PCA analysis. The datasets are available at http://bioinf.ucd.ie/people/aedin/R/full_datasets/.
Working Directory
setwd("C:/Users/Administrator/Dropbox/Programming/DataClassification/ClusterAnalysis")
Required Packages
# install.packages("lattice")
library(lattice)
# install.packages("randomForest")
library(randomForest)
Normalization of the Data
load("khan.rda")
skhanTrain <- scale(khan$train)
skhanTest <- scale(khan$test)
head(skhanTrain, 1)
## EWS.T1 EWS.T2 EWS.T3 EWS.T4 EWS.T6 EWS.T7 EWS.T9
## 1 2.551633 0.8310227 2.611437 0.07076376 2.043638 1.213812 1.096357
## EWS.T11 EWS.T12 EWS.T13 EWS.T14 EWS.T15 EWS.T19 EWS.C8
## 1 1.725331 1.255386 0.7386695 2.306785 0.9851019 1.10014 0.146627
## EWS.C3 EWS.C2 EWS.C4 EWS.C6 EWS.C9 EWS.C7 EWS.C1
## 1 0.02825556 0.03341385 2.286969 0.20624 0.21098 0.6404922 1.169167
## EWS.C11 EWS.C10 BL.C5 BL.C6 BL.C7 BL.C8
## 1 -0.5342358 0.2032243 -0.6228589 -0.7473008 -0.7680685 -0.5284596
## BL.C1 BL.C2 BL.C3 BL.C4 NB.C1 NB.C2
## 1 -0.6797929 -0.4702524 -0.5448033 -0.8535249 0.3834779 0.3656222
## NB.C3 NB.C6 NB.C12 NB.C7 NB.C4 NB.C5 NB.C10
## 1 -0.1072794 -0.1842123 0.7688073 0.438864 -0.2915887 -0.330252 0.4074449
## NB.C11 NB.C9 NB.C8 RMS.C4 RMS.C3 RMS.C9 RMS.C2
## 1 0.3840647 0.04309766 -0.3058032 1.536961 1.364684 0.3173317 0.3645133
## RMS.C5 RMS.C6 RMS.C7 RMS.C8 RMS.C10 RMS.C11 RMS.T1
## 1 0.3322257 -0.07246296 0.5202957 0.06736783 0.5565358 1.907763 0.1909644
## RMS.T4 RMS.T2 RMS.T6 RMS.T7 RMS.T8 RMS.T5 RMS.T9
## 1 2.647415 1.294766 1.843351 0.6380244 1.18883 0.8562458 1.508946
## RMS.T3 RMS.T10 RMS.T11
## 1 0.3293574 0.3065548 0.6880562
head(skhanTest, 1)
## TEST.9 TEST.11 TEST.5 TEST.8 TEST.10 TEST.13
## 1 0.6132046 -0.7789118 1.167259 -0.09612295 -0.3907166 -0.1009558
## TEST.3 TEST.1 TEST.2 TEST.4 TEST.7 TEST.12 TEST.24
## 1 0.4502672 0.20196 1.439363 0.9357368 -0.4502526 2.359291 0.3107886
## TEST.6 TEST.21 TEST.20 TEST.17 TEST.18 TEST.22 TEST.16
## 1 0.9538559 0.7387116 1.93437 2.190438 -0.4583831 -0.1338046 -0.07221085
## TEST.23 TEST.14 TEST.25 TEST.15 TEST.19
## 1 -0.2363164 -0.2322534 -0.06872159 -0.469293 -0.131364
skhanTrain <- skhanTrain[,14:53]
plot(density(as.numeric(skhanTrain[,1])), type="n",
main="Density Plot of Train Data")
lines(density(as.numeric(skhanTrain[,1])))
lines(density(as.numeric(skhanTrain[,2])), col="red")
lines(density(as.numeric(skhanTrain[,3])), col="blue")
lines(density(as.numeric(skhanTrain[,4])), col="green")
lines(density(as.numeric(skhanTrain[,14])), col="green")
lines(density(as.numeric(skhanTrain[,15])), col="blue")

plot(density(as.numeric(skhanTest[,1])), type="n",
main="Density Plot of Test Data")
lines(density(as.numeric(skhanTest[,1])))
lines(density(as.numeric(skhanTest[,2])), col="red")
lines(density(as.numeric(skhanTest[,3])), col="blue")
lines(density(as.numeric(skhanTest[,4])), col="green")
lines(density(as.numeric(skhanTest[,14])), col="green")
lines(density(as.numeric(skhanTest[,15])), col="blue")

Principal Component Analysis
PCATrain <- princomp(skhanTrain, scores=TRUE, cor=TRUE)
PCATest <- princomp(skhanTest, scores=TRUE, cor=TRUE)
plot(PCATrain, main="PCA Train - Bar Plot")

plot(PCATrain, type="lines", main="PCA Train - Line Plot")

biplot(PCATrain, main="PCA Train - BiPlot")

# PCATrainTable <- prcomp(t(skhanTrain))
PCATrain <- prcomp(skhanTrain)
plot(x=PCATrain$x[,1],y=PCATrain$x[,2],
main="Correlation of PCA Train$x - Columns 1 & 2")

# Train Data Heatmap
TrainMap <- cor(skhanTrain)
TrainMap <- round(TrainMap, 2)
levelplot(TrainMap, main="Train Data Heat Map")

# Shapiro test
shapiro.test(skhanTrain[1:4000])
##
## Shapiro-Wilk normality test
##
## data: skhanTrain[1:4000]
## W = 0.71259, p-value < 2.2e-16
# mean for each col and plot
colMeans(skhanTrain)
## EWS.C8 EWS.C3 EWS.C2 EWS.C4 EWS.C6
## 2.875973e-17 -3.492221e-17 -1.805375e-17 -4.219820e-17 -2.621928e-17
## EWS.C9 EWS.C7 EWS.C1 EWS.C11 EWS.C10
## -4.137331e-17 -4.881428e-17 1.019263e-17 -4.314260e-17 2.080766e-17
## BL.C5 BL.C6 BL.C7 BL.C8 BL.C1
## -2.394941e-17 -2.118384e-17 5.579526e-17 -1.126518e-17 -2.810883e-17
## BL.C2 BL.C3 BL.C4 NB.C1 NB.C2
## -7.906220e-18 -4.181976e-18 4.667368e-17 4.249922e-17 4.596115e-17
## NB.C3 NB.C6 NB.C12 NB.C7 NB.C4
## -3.941235e-17 -3.235094e-17 -3.054932e-17 -3.398795e-18 3.904631e-18
## NB.C5 NB.C10 NB.C11 NB.C9 NB.C8
## -5.928350e-17 3.418713e-18 3.506051e-17 -2.031611e-18 -4.885486e-18
## RMS.C4 RMS.C3 RMS.C9 RMS.C2 RMS.C5
## 1.194614e-17 4.896911e-17 2.474010e-17 -4.027144e-17 4.855422e-18
## RMS.C6 RMS.C7 RMS.C8 RMS.C10 RMS.C11
## -6.306036e-17 -4.627946e-17 1.042638e-17 9.676270e-18 2.008611e-17
barplot(colMeans(skhanTrain), main="Bar Plot of Train Data Mean Values")

# the distance of each gene
cluster.dist <- prop.table(table(skhanTrain))
plot(cluster.dist[1:500], main="Distance of 1st 500 Train genes")

## Test Data ##
plot(PCATest, main="PCA Test - Bar Plot")

plot(PCATest, type="lines", main="PCA Test - Line Plot")

biplot(PCATest, main="PCA Test - BiPlot") ### read red

# PCATestTable <- prcomp(t(skhanTest))
PCATest <- prcomp(skhanTest)
plot(x=PCATest$x[,1],y=PCATest$x[,2],
main="Correlation of PCA Test$x - Columns 1 & 2")

# Test Data Heatmap
TestMap <- cor(skhanTest)
TestMap <- round(TestMap, 2)
levelplot(TestMap, main="Test Data Heat Map")

# Shapiro test
shapiro.test(skhanTest[1:4000])
##
## Shapiro-Wilk normality test
##
## data: skhanTest[1:4000]
## W = 0.62387, p-value < 2.2e-16
# mean for each column and plot
colMeans(skhanTest)
## TEST.9 TEST.11 TEST.5 TEST.8 TEST.10
## -2.798106e-17 -3.133438e-17 2.593893e-17 1.408523e-17 1.820407e-17
## TEST.13 TEST.3 TEST.1 TEST.2 TEST.4
## -2.456084e-17 -3.373917e-17 -5.567200e-18 -1.521716e-17 -7.336685e-18
## TEST.7 TEST.12 TEST.24 TEST.6 TEST.21
## -2.967520e-17 1.863700e-17 2.136310e-17 8.969001e-18 -8.072327e-19
## TEST.20 TEST.17 TEST.18 TEST.22 TEST.16
## -4.292276e-17 -1.540657e-17 -2.738653e-17 -4.463869e-17 1.576283e-17
## TEST.23 TEST.14 TEST.25 TEST.15 TEST.19
## 3.038397e-18 1.342607e-17 1.467675e-17 8.581921e-18 1.905415e-17
barplot(colMeans(skhanTest), main="Bar Plot of Test Data Mean Values")

# the distance of each gene
cluster.dist <- prop.table(table(skhanTest))
plot(cluster.dist[1:500], main="Distance of 1st 500 Test genes")

Kmeans Clustering
plot(skhanTrain, main="Plot of Train Data")

resultTrain <- kmeans(skhanTrain, 3, nstart = 10)
attributes(resultTrain)
## $names
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
##
## $class
## [1] "kmeans"
resultTrain$centers
## EWS.C8 EWS.C3 EWS.C2 EWS.C4 EWS.C6 EWS.C9
## 1 2.7044053 2.4377883 2.2205622 1.8706174 2.4363145 2.6791860
## 2 0.6365057 0.8118929 0.7986767 0.8398681 0.9459529 0.8562593
## 3 -0.3473828 -0.3800859 -0.3628411 -0.3526265 -0.4176839 -0.4075938
## EWS.C7 EWS.C1 EWS.C11 EWS.C10 BL.C5 BL.C6
## 1 3.1742604 2.4673070 2.5512816 2.8061272 2.6740399 2.7847064
## 2 0.6271084 0.8001787 0.8370475 0.7360663 0.5930660 0.7125463
## 3 -0.3740042 -0.3786311 -0.3942265 -0.3817088 -0.3332789 -0.3737622
## BL.C7 BL.C8 BL.C1 BL.C2 BL.C3 BL.C4
## 1 3.0306035 2.9918459 2.7526280 2.7553968 2.5219200 2.7686850
## 2 0.6238697 0.6279659 0.7557159 0.6803564 0.8493209 0.7124814
## 3 -0.3641465 -0.3628842 -0.3839010 -0.3628868 -0.3958483 -0.3727461
## NB.C1 NB.C2 NB.C3 NB.C6 NB.C12 NB.C7
## 1 2.7595545 2.6388035 3.0239822 3.0437104 3.0313503 2.7502372
## 2 0.7698311 0.7848119 0.5757974 0.7546801 0.7303554 0.7261918
## 3 -0.3883008 -0.3849919 -0.3502190 -0.4017389 -0.3941305 -0.3754517
## NB.C4 NB.C5 NB.C10 NB.C11 NB.C9 NB.C8
## 1 2.7830874 2.858715 2.5429711 3.1971433 3.0350750 3.3219931
## 2 0.8069524 0.725335 0.8608072 0.7389393 0.7327271 0.6382546
## 3 -0.4002028 -0.381967 -0.4003887 -0.4068696 -0.3950292 -0.3863389
## RMS.C4 RMS.C3 RMS.C9 RMS.C2 RMS.C5 RMS.C6
## 1 2.6324189 2.7915877 3.0788228 2.9405381 3.0805789 2.5995453
## 2 0.5824217 0.6285695 0.3736961 0.5077702 0.6218642 0.7417481
## 3 -0.3276941 -0.3505815 -0.2968156 -0.3258967 -0.3666952 -0.3704399
## RMS.C7 RMS.C8 RMS.C10 RMS.C11
## 1 3.2361786 2.0545140 3.2591211 2.3613458
## 2 0.6101807 0.7026625 0.6157820 0.7162954
## 3 -0.3731015 -0.3255058 -0.3761052 -0.3484486
plot(skhanTrain,col= resultTrain$cluster, main="Train Data Cluster Plot")

plot(skhanTrain,col= resultTrain$centers, main="Train Data Centers Plot")

plot(cbind(skhanTrain[,1], skhanTrain[,16]),
col=resultTrain$cluster,
main="Train Data Cluster Plot, (EWS.C8 & BL.C2)")

## Test Data ##
plot(skhanTest, main="Plot of Test Data")

resultTest <- kmeans(skhanTest,3, nstart = 10)
attributes(resultTest)
## $names
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
##
## $class
## [1] "kmeans"
resultTest$centers
## TEST.9 TEST.11 TEST.5 TEST.8 TEST.10 TEST.13
## 1 -0.2110609 -0.3037932 -0.2638622 -0.3213358 -0.2337475 -0.2211017
## 2 2.5542261 2.6728522 2.6275715 3.9353069 3.1934442 2.6048426
## 3 0.6059513 0.9897020 0.8237778 0.9170974 0.6283826 0.6430798
## TEST.3 TEST.1 TEST.2 TEST.4 TEST.7 TEST.12
## 1 -0.2775662 -0.2759212 -0.2606222 -0.2542182 -0.2571212 -0.2366873
## 2 2.6121636 3.6751493 3.6977605 3.9775444 3.7662091 2.9545106
## 3 0.8843456 0.7528202 0.6845694 0.6243473 0.6615422 0.6689669
## TEST.24 TEST.6 TEST.21 TEST.20 TEST.17 TEST.18
## 1 -0.2458945 -0.2153716 -0.2383906 -0.2452876 -0.2485569 -0.2877200
## 2 3.8936780 2.7730732 3.3715276 3.1599841 3.7209208 4.2457038
## 3 0.5984752 0.5988096 0.6274397 0.6817857 0.6301208 0.7366047
## TEST.22 TEST.16 TEST.23 TEST.14 TEST.25 TEST.15
## 1 -0.2929609 -0.3048507 -0.3104841 -0.3136586 -0.2930111 -0.2469694
## 2 3.7569209 3.5866981 4.0377656 3.2748048 3.7033880 4.6008358
## 3 0.8163122 0.8872291 0.8585668 0.9615191 0.8227961 0.5202789
## TEST.19
## 1 -0.2864272
## 2 4.2066277
## 3 0.7356366
plot(skhanTest,col= resultTest$cluster, main="Test Data Cluster Plot")

plot(skhanTest,col= resultTest$centers, main="Test Data Centers Plot")

plot(cbind(skhanTest[,1], skhanTest[,16]),
col=resultTest$cluster,
main="Test Data Cluster Plot, (TEST.9 & TEST.20)")

Hierarchical Clustering
hc.s.Train <- hclust(dist(skhanTrain[1:100], method= "euclidian"), method ="single")
hc.c.Train <- hclust(dist(skhanTrain[1:100], method= "euclidian"), method ="complete")
plot(hc.c.Train, main ="Hierarchical cluster - Dendrogram", xlab="Train Data", cex=0.9)

plot(hc.s.Train, main ="Hierarchical cluster", xlab="Train Data", cex=0.9)

## Test Data ##
hc.s.Test <- hclust(dist(skhanTest[1:100], method= "euclidian"), method ="single")
hc.c.Test <- hclust(dist(skhanTest[1:100], method= "euclidian"), method ="complete")
plot(hc.c.Test, main ="Hierarchical cluster - Dendrogram", xlab="Test Data", cex=0.9)

plot(hc.s.Test, main ="Hierarchical cluster", xlab="Test Data", cex=0.9)

Random Forest Machine Learning
# Fit Correlation Model
TrainFit <- randomForest(EWS.C8~., data=skhanTrain)
# Predict PM25 Levels
predictionsTrain <- predict(TrainFit, skhanTrain[,2:40])
# Compare Accuracy of Predictions
TrainComparison <- data.frame(skhanTrain[1:10,2], skhanTrain[1:10,3], skhanTrain[1:10,4],
predictionsTrain[1:10], skhanTrain[1:10,1])
names(TrainComparison) <- c("EWS.C3", "EWS.C2", "EWS.C4", "Predicted Values", "Actual Values")
TrainComparison
## EWS.C3 EWS.C2 EWS.C4 Predicted Values Actual Values
## 1 0.02825556 0.033413855 2.2869691 0.1934887 0.14662705
## 2 -0.70811859 -0.805938768 -1.1115950 -0.6008316 -0.60456266
## 3 -0.10688313 0.622316916 1.2708469 0.8201949 0.67365686
## 4 -0.84558726 -0.836077103 -1.0005405 0.3858389 0.31851455
## 5 -0.48646683 -0.358585419 -0.6597318 -0.5953600 -0.56141856
## 6 0.08285808 0.165218839 2.2313077 0.1903012 0.21508925
## 7 2.32652499 3.300208420 2.7687420 4.2086937 4.56885495
## 8 0.12631722 -0.004861831 1.8088176 0.1113428 -0.13979471
## 9 0.32466921 0.690630475 -0.2266459 0.1170682 0.04251283
## 10 0.76179322 0.282155578 2.3910492 -0.3592974 -0.52568042
## Test Data ##
# Fit Correlation Model
TestFit <- randomForest(TEST.9~., data=skhanTest)
# Predict PM25 Levels
predictionsTest <- predict(TestFit, skhanTest[,2:25])
# Compare Accuracy of Predictions
TestComparison <- data.frame(skhanTest[1:10,2], skhanTest[1:10,3], skhanTest[1:10,4],
predictionsTest[1:10], skhanTest[1:10,1])
names(TestComparison) <- c("TEST.11", "TEST.5", "TEST.8", "Predicted Values", "Actual Values")
TestComparison
## TEST.11 TEST.5 TEST.8 Predicted Values Actual Values
## 1 -0.7789118 1.1672588 -0.09612295 0.49599329 0.61320457
## 2 -0.8354450 -0.8200757 -0.60201541 -0.35175390 -0.30128536
## 3 1.4648706 1.7958106 0.26617745 -0.04289739 -0.23322136
## 4 -0.8003554 -0.8582915 -0.67756293 -0.43529546 -0.39808274
## 5 -0.5634496 -0.3770942 -0.60322279 -0.26063919 -0.13720632
## 6 1.7501015 2.3703094 0.09300459 1.72474409 2.74352940
## 7 3.1620971 3.8110322 7.85827262 5.16707418 5.67220152
## 8 0.9775156 1.7522010 0.59837959 0.66905792 0.94570110
## 9 0.4359872 0.8089716 -0.14967891 0.01014390 -0.01444931
## 10 1.0737554 1.4980030 1.00362825 2.23694911 1.91132792