# Package setup.
# The original line used typographic quotes and ran six calls together on one
# line, which R cannot parse. Install only the packages that are missing, so
# re-running the script does not reinstall everything.
required_packages <- c(
  "randomForest", "gbm", "rpart", "rattle", "rpart.plot", "RColorBrewer"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Attach the packages whose functions are called unqualified below:
# randomForest(), importance(), varImpPlot() come from randomForest;
# rpart() from rpart; fancyRpartPlot() from rattle.
library(randomForest)
library(rpart)
library(rattle)
# Read the data set from the CSV file in the working directory.
data <- read.csv(file = "./defNASH top 25.csv")

# Show the table's dimensions, then the type and first values of each column.
dim(data)
## [1] 95 27
str(object = data)
## 'data.frame': 95 obs. of 27 variables:
## $ subject : Factor w/ 95 levels "NASH1","NASH10",..: 53 64 75 86 91 92 93 94 95 54 ...
## $ defNASH : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ENSG00000213606: int 0 0 0 3 0 0 10 3 1 4 ...
## $ ENSG00000083857: int 756 492 732 1450 679 1602 1805 541 635 362 ...
## $ ENSG00000198074: int 2 2 1 16 0 8 51 23 10 30 ...
## $ ENSG00000173391: int 4 2 0 1 0 5 4 3 0 0 ...
## $ ENSG00000165912: int 59 84 51 136 222 288 126 124 51 62 ...
## $ ENSG00000115009: int 0 2 0 3 0 0 3 1 1 0 ...
## $ ENSG00000148773: int 7 9 5 42 25 58 43 25 17 7 ...
## $ ENSG00000129474: int 84 128 75 373 326 450 347 241 187 127 ...
## $ ENSG00000138448: int 191 212 338 687 548 977 1050 521 334 270 ...
## $ ENSG00000169429: int 6 0 0 0 2 1 5 1 1 1 ...
## $ ENSG00000156535: int 10 23 24 52 37 85 56 40 24 24 ...
## $ ENSG00000196177: int 2052 2777 2825 4190 9494 10358 6260 5327 1651 2406 ...
## $ ENSG00000060982: int 10 18 10 33 16 44 65 15 18 9 ...
## $ ENSG00000234964: int 2 2 0 6 0 3 0 2 3 2 ...
## $ ENSG00000187498: int 138 200 94 438 391 755 648 264 251 106 ...
## $ ENSG00000231991: int 69 57 85 162 78 201 353 105 121 63 ...
## $ ENSG00000078098: int 1 1 2 5 4 7 6 5 3 2 ...
## $ ENSG00000024526: int 1 1 0 2 1 11 3 0 1 2 ...
## $ ENSG00000079931: int 3 5 2 11 2 12 6 6 11 0 ...
## $ ENSG00000137161: int 57 102 131 187 239 400 230 208 79 130 ...
## $ ENSG00000173744: int 117 158 189 368 357 635 573 250 159 177 ...
## $ ENSG00000118785: int 8 29 4 106 34 92 81 65 48 10 ...
## $ ENSG00000182718: int 122 55 87 152 103 176 271 111 73 64 ...
## $ ENSG00000162366: int 1 1 2 3 3 1 1 2 1 1 ...
## $ ENSG00000168542: int 567 786 631 2166 1599 2575 3674 2085 944 704 ...
# Treat the outcome as a class label, and drop the subject identifier so it is
# not picked up as a predictor by the `defNASH ~ .` formulas below.
data$defNASH <- as.factor(data$defNASH)
data$subject <- NULL

# Seed the RNG *before* drawing the split, so the train/test partition and
# every model fitted after it can be reproduced.
set.seed(123)

# nrow() gives the scalar row count. dim() returns c(rows, cols), which made
# `1:NumbofObs` warn and passed a length-2 size to sample().
NumbofObs <- nrow(data)
SplitofTrainTest <- 0.5
train <- sample(seq_len(NumbofObs), floor(SplitofTrainTest * NumbofObs))
test <- -train # negative indices select every row not drawn for training
TrainSet <- data[train, ]
TestSet <- data[test, ]
Testing.Outcome <- data$defNASH[test]

# Classification tree. defNASH is a factor, so fit with method = "class" and
# ask predict() for class labels; method = "anova" fits a regression tree and
# yields numeric predictions, which cannot form a confusion matrix.
rpart.model <- rpart(defNASH ~ ., data = TrainSet, method = "class")
fancyRpartPlot(rpart.model)
predict.rpart <- predict(rpart.model, newdata = TestSet, type = "class")
rpart.results <- table(observed = Testing.Outcome, predict = predict.rpart)
rpart.results # confusion matrix
# NOTE: the `##` output recorded below (and after later statements in this
# file) was produced before the fixes above, i.e. with an unseeded split and a
# method = "anova" tree. Re-run the script to regenerate it.
## predict
## observed 1.15789473684211 1.71428571428571 1.75 2
## 0 12 4 5 2
## 1 8 3 3 11
summary(rpart.results)
## Number of cases in table: 48
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 7.603, df = 3, p-value = 0.05496
## Chi-squared approximation may be incorrect
# Bagged forest: mtry = 25 makes every split consider all 25 predictors
# (27 columns minus `subject` and the outcome `defNASH`).
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned. The output recorded below is from the original run, so its
# echoed call still shows `importance = T`.
bag.model <- randomForest(
  defNASH ~ .,
  data = TrainSet,
  mtry = 25,
  importance = TRUE
)
bag.model
##
## Call:
## randomForest(formula = defNASH ~ ., data = TrainSet, mtry = 25, importance = T)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 25
##
## OOB estimate of error rate: 31.91%
## Confusion matrix:
## 0 1 class.error
## 0 14 6 0.3000000
## 1 9 18 0.3333333
# Per-variable importance measures (table recorded below).
importance(bag.model)
## 0 1 MeanDecreaseAccuracy
## ENSG00000213606 3.4119081 4.18184753 4.7980805
## ENSG00000083857 -2.3325058 0.60009314 -1.0291516
## ENSG00000198074 4.1787200 3.13906948 4.4217293
## ENSG00000173391 2.1120207 3.28705124 3.6240647
## ENSG00000165912 3.2898648 0.94618109 2.8566631
## ENSG00000115009 2.8016300 2.54650587 3.4404161
## ENSG00000148773 -1.8597390 4.53521840 1.9759286
## ENSG00000129474 -1.1458335 -2.23387731 -2.2374472
## ENSG00000138448 -3.2578551 0.87179878 -1.6856044
## ENSG00000169429 5.8585756 2.77309059 5.8441789
## ENSG00000156535 -1.5287892 1.01351267 -0.3228246
## ENSG00000196177 -2.2274035 -0.74439925 -2.5920140
## ENSG00000060982 -1.0931995 0.98629542 0.2521196
## ENSG00000234964 0.5951718 1.62430958 1.5175364
## ENSG00000187498 0.2418235 -0.07158886 0.3329395
## ENSG00000231991 -0.3282091 -2.49841124 -1.8302299
## ENSG00000078098 1.7620838 5.96691677 4.7845545
## ENSG00000024526 8.9294135 4.14944926 8.5894947
## ENSG00000079931 -2.3194400 -0.10123742 -1.4942253
## ENSG00000137161 -0.4456163 -2.58316569 -2.4158183
## ENSG00000173744 -0.6035168 -2.53060566 -2.1518117
## ENSG00000118785 -1.5273488 -1.05196364 -1.4716860
## ENSG00000182718 1.7458746 0.69978147 2.2397654
## ENSG00000162366 2.4548491 -0.25691526 1.1587612
## ENSG00000168542 -1.2130002 1.32936662 0.1626110
## MeanDecreaseGini
## ENSG00000213606 1.6323679
## ENSG00000083857 0.2417435
## ENSG00000198074 2.6465439
## ENSG00000173391 1.0186995
## ENSG00000165912 1.4826157
## ENSG00000115009 0.5149525
## ENSG00000148773 1.4904747
## ENSG00000129474 0.3834768
## ENSG00000138448 0.2915831
## ENSG00000169429 2.8218948
## ENSG00000156535 0.1000760
## ENSG00000196177 1.2125862
## ENSG00000060982 1.0553121
## ENSG00000234964 0.8672632
## ENSG00000187498 0.2662054
## ENSG00000231991 0.1973183
## ENSG00000078098 1.1042246
## ENSG00000024526 3.1773363
## ENSG00000079931 0.1992244
## ENSG00000137161 0.5471829
## ENSG00000173744 0.2699355
## ENSG00000118785 0.1798585
## ENSG00000182718 0.1316691
## ENSG00000162366 0.3325649
## ENSG00000168542 0.3024648
# Variable-importance plot for the bagged forest.
varImpPlot(bag.model)

# Predict the class of each test-set row.
predict.bag <- predict(object = bag.model, newdata = TestSet)

# Cross-tabulate observed against predicted labels (confusion matrix).
Model.Results <- table(
  observed = Testing.Outcome,
  predict = predict.bag
)
print(Model.Results)
## predict
## observed 0 1
## 0 15 8
## 1 4 21
# Chi-squared test of independence on the confusion matrix.
summary(Model.Results)
## Number of cases in table: 48
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 12.134, df = 1, p-value = 0.0004952
# Random forest trying 6 randomly chosen predictors at each split.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned. The output recorded below is from the original run, so its
# echoed call still shows `importance = T`.
rf1.model <- randomForest(
  defNASH ~ .,
  data = TrainSet,
  mtry = 6,
  importance = TRUE
)
rf1.model
##
## Call:
## randomForest(formula = defNASH ~ ., data = TrainSet, mtry = 6, importance = T)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 31.91%
## Confusion matrix:
## 0 1 class.error
## 0 12 8 0.4000000
## 1 7 20 0.2592593
# Per-variable importance measures (table recorded below).
importance(rf1.model)
## 0 1 MeanDecreaseAccuracy
## ENSG00000213606 6.033528117 3.63088181 6.322180901
## ENSG00000083857 -1.395389109 3.54946467 2.219000413
## ENSG00000198074 3.493873614 0.51943614 2.488378991
## ENSG00000173391 4.366711351 3.44237183 5.024696947
## ENSG00000165912 4.089230617 3.88209930 5.576090497
## ENSG00000115009 4.158813127 2.51495612 3.901401862
## ENSG00000148773 2.858971900 3.51011216 4.559413647
## ENSG00000129474 0.288219511 -2.22444622 -1.533987712
## ENSG00000138448 -1.955460528 -0.64712944 -1.649364122
## ENSG00000169429 6.327500941 4.81684443 6.929213175
## ENSG00000156535 -0.740933082 -0.18103376 -0.579134325
## ENSG00000196177 1.096351441 -0.01542274 0.882641345
## ENSG00000060982 2.450122478 0.52021722 1.998026630
## ENSG00000234964 0.780289311 1.65341342 1.547601854
## ENSG00000187498 0.648058969 -0.72606174 -0.161619645
## ENSG00000231991 -1.912279489 -0.53835276 -1.601487199
## ENSG00000078098 2.101087639 2.44172071 2.794371550
## ENSG00000024526 7.346126737 4.04022548 7.028244304
## ENSG00000079931 0.020896294 -1.03350294 -0.939970024
## ENSG00000137161 1.366492818 -0.84708279 -0.002860047
## ENSG00000173744 -0.004035696 -0.45790973 -0.584667815
## ENSG00000118785 -1.871080210 -0.60661571 -1.644298369
## ENSG00000182718 -1.138400307 -0.98475508 -1.122012163
## ENSG00000162366 2.825345099 0.57980696 1.935222291
## ENSG00000168542 0.493033342 1.89711405 1.691999883
## MeanDecreaseGini
## ENSG00000213606 1.6573088
## ENSG00000083857 0.8170887
## ENSG00000198074 1.4602527
## ENSG00000173391 1.2516230
## ENSG00000165912 1.3269988
## ENSG00000115009 0.6767214
## ENSG00000148773 1.1160003
## ENSG00000129474 0.5303145
## ENSG00000138448 0.5527683
## ENSG00000169429 1.8662159
## ENSG00000156535 0.5393983
## ENSG00000196177 1.0735042
## ENSG00000060982 1.1887764
## ENSG00000234964 0.8780275
## ENSG00000187498 0.5062902
## ENSG00000231991 0.4586070
## ENSG00000078098 0.8978534
## ENSG00000024526 1.9888425
## ENSG00000079931 0.3988987
## ENSG00000137161 0.8531971
## ENSG00000173744 0.4114459
## ENSG00000118785 0.4641015
## ENSG00000182718 0.4591773
## ENSG00000162366 0.4144262
## ENSG00000168542 0.7522467
# Diagnostic plots for the mtry = 6 forest: variable importance, then the
# forest's own plot method.
varImpPlot(rf1.model)
plot(rf1.model)

# Predict the class of each test-set row.
predict.rf <- predict(object = rf1.model, newdata = TestSet)

# Cross-tabulate observed against predicted labels (confusion matrix).
Model.Results <- table(
  observed = Testing.Outcome,
  predict = predict.rf
)
print(Model.Results)
## predict
## observed 0 1
## 0 16 7
## 1 4 21
# Chi-squared test of independence on the confusion matrix.
summary(Model.Results)
## Number of cases in table: 48
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 14.141, df = 1, p-value = 0.0001696
# Random forest trying 5 randomly chosen predictors at each split.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned. The output recorded below is from the original run, so its
# echoed call still shows `importance = T`.
rf2.model <- randomForest(
  defNASH ~ .,
  data = TrainSet,
  mtry = 5,
  importance = TRUE
)
rf2.model
##
## Call:
## randomForest(formula = defNASH ~ ., data = TrainSet, mtry = 5, importance = T)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 27.66%
## Confusion matrix:
## 0 1 class.error
## 0 14 6 0.3000000
## 1 7 20 0.2592593
# Per-variable importance measures (table recorded below).
importance(rf2.model)
## 0 1 MeanDecreaseAccuracy
## ENSG00000213606 3.11590105 2.1905363 3.35549384
## ENSG00000083857 -0.07336268 0.1828988 -0.01993989
## ENSG00000198074 3.61582091 3.2289963 4.21542342
## ENSG00000173391 3.32152677 5.0406765 5.74056630
## ENSG00000165912 3.14229713 4.9359925 5.10535489
## ENSG00000115009 3.61123014 0.8664017 2.91663340
## ENSG00000148773 2.68594003 4.1504435 4.72386266
## ENSG00000129474 -1.11461906 -2.5223644 -2.73387149
## ENSG00000138448 -0.31639910 -0.4755444 -0.38423434
## ENSG00000169429 2.78695598 5.1361860 5.96556448
## ENSG00000156535 -1.43628155 1.0483876 -0.15844836
## ENSG00000196177 -0.10065899 1.7153110 0.82560156
## ENSG00000060982 1.08215285 -0.2462770 0.19316944
## ENSG00000234964 -0.49064921 4.0325338 2.74701812
## ENSG00000187498 -1.55853652 -0.2875719 -1.32511070
## ENSG00000231991 -2.09252388 0.4581523 -1.00552164
## ENSG00000078098 1.24471389 3.6562970 3.60570017
## ENSG00000024526 8.17853046 6.0813947 8.76652826
## ENSG00000079931 -1.25359363 -0.3601234 -1.92414198
## ENSG00000137161 2.08443571 0.3551238 1.69167367
## ENSG00000173744 1.46716617 -2.1105797 -1.03452499
## ENSG00000118785 -1.23827851 -2.3968991 -2.31616698
## ENSG00000182718 0.41001708 -1.5302467 -1.16187978
## ENSG00000162366 3.41923389 1.1138149 2.81887172
## ENSG00000168542 -1.33768219 1.0256917 0.10532312
## MeanDecreaseGini
## ENSG00000213606 1.5302938
## ENSG00000083857 0.5286481
## ENSG00000198074 1.7938538
## ENSG00000173391 1.3737931
## ENSG00000165912 1.1408689
## ENSG00000115009 0.7168474
## ENSG00000148773 1.3763909
## ENSG00000129474 0.4536993
## ENSG00000138448 0.5821236
## ENSG00000169429 1.6329493
## ENSG00000156535 0.4618170
## ENSG00000196177 1.1301533
## ENSG00000060982 1.0935404
## ENSG00000234964 0.8781137
## ENSG00000187498 0.6542392
## ENSG00000231991 0.4395772
## ENSG00000078098 0.9800931
## ENSG00000024526 1.8198968
## ENSG00000079931 0.3637957
## ENSG00000137161 0.8006828
## ENSG00000173744 0.4964026
## ENSG00000118785 0.4656719
## ENSG00000182718 0.5275583
## ENSG00000162366 0.4430622
## ENSG00000168542 0.7642677
# Diagnostic plots for the mtry = 5 forest: variable importance, then the
# forest's own plot method.
varImpPlot(rf2.model)
plot(rf2.model)

# Predict the class of each test-set row.
predict.rf <- predict(object = rf2.model, newdata = TestSet)

# Cross-tabulate observed against predicted labels (confusion matrix).
Model.Results <- table(
  observed = Testing.Outcome,
  predict = predict.rf
)
print(Model.Results)
## predict
## observed 0 1
## 0 16 7
## 1 4 21
# Chi-squared test of independence on the confusion matrix.
summary(Model.Results)
## Number of cases in table: 48
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 14.141, df = 1, p-value = 0.0001696
# Random forest trying 4 randomly chosen predictors at each split.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned. The output recorded below is from the original run, so its
# echoed call still shows `importance = T`.
rf3.model <- randomForest(
  defNASH ~ .,
  data = TrainSet,
  mtry = 4,
  importance = TRUE
)
rf3.model
##
## Call:
## randomForest(formula = defNASH ~ ., data = TrainSet, mtry = 4, importance = T)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 27.66%
## Confusion matrix:
## 0 1 class.error
## 0 14 6 0.3000000
## 1 7 20 0.2592593
# Per-variable importance measures (table recorded below).
importance(rf3.model)
## 0 1 MeanDecreaseAccuracy
## ENSG00000213606 4.2974915 1.83014758 3.77537472
## ENSG00000083857 1.8890784 0.34535037 1.17407833
## ENSG00000198074 5.1506795 3.72160072 5.52440774
## ENSG00000173391 4.2609332 3.19306441 4.78521285
## ENSG00000165912 3.5673935 3.94470713 4.61079211
## ENSG00000115009 5.8363188 1.14035797 4.54711159
## ENSG00000148773 2.3969522 4.77005210 4.80713007
## ENSG00000129474 1.4371025 -1.56118961 -0.18016846
## ENSG00000138448 -0.2601430 0.77694059 0.01787410
## ENSG00000169429 2.6997836 3.49821260 3.87433263
## ENSG00000156535 -1.2424999 -0.56679119 -1.19787455
## ENSG00000196177 3.6609472 -0.58789651 1.93015582
## ENSG00000060982 0.1801544 1.69539674 1.19738180
## ENSG00000234964 3.0239648 4.66385734 5.13385036
## ENSG00000187498 -0.4164196 0.17194524 -0.44201979
## ENSG00000231991 0.6991853 -0.56542911 0.03814908
## ENSG00000078098 1.7177667 2.68874960 2.75976380
## ENSG00000024526 7.3342960 4.69089952 7.53021141
## ENSG00000079931 -1.8742482 -0.03452988 -1.33914984
## ENSG00000137161 1.1926255 0.11764821 0.52439947
## ENSG00000173744 1.2324301 -1.98221583 -0.88660102
## ENSG00000118785 -3.1240164 -1.01420094 -2.86607930
## ENSG00000182718 1.2612993 0.50157671 0.80728144
## ENSG00000162366 3.8771593 -0.28983760 1.85564627
## ENSG00000168542 0.7508060 2.75427484 2.37725744
## MeanDecreaseGini
## ENSG00000213606 1.4120397
## ENSG00000083857 0.7715356
## ENSG00000198074 1.5934935
## ENSG00000173391 1.2211124
## ENSG00000165912 1.1957401
## ENSG00000115009 0.8823206
## ENSG00000148773 1.1589112
## ENSG00000129474 0.5423547
## ENSG00000138448 0.5948034
## ENSG00000169429 1.4190480
## ENSG00000156535 0.6040489
## ENSG00000196177 0.9205694
## ENSG00000060982 1.3284664
## ENSG00000234964 0.9545475
## ENSG00000187498 0.6503239
## ENSG00000231991 0.4768369
## ENSG00000078098 0.8557878
## ENSG00000024526 1.8039866
## ENSG00000079931 0.5191374
## ENSG00000137161 0.6559942
## ENSG00000173744 0.4915720
## ENSG00000118785 0.5764491
## ENSG00000182718 0.4634314
## ENSG00000162366 0.5171184
## ENSG00000168542 0.8953922
# Diagnostic plots for the mtry = 4 forest: variable importance, then the
# forest's own plot method.
varImpPlot(rf3.model)
plot(rf3.model)

# Predict the class of each test-set row.
predict.rf <- predict(object = rf3.model, newdata = TestSet)

# Cross-tabulate observed against predicted labels (confusion matrix).
Model.Results <- table(
  observed = Testing.Outcome,
  predict = predict.rf
)
print(Model.Results)
## predict
## observed 0 1
## 0 16 7
## 1 4 21
# Chi-squared test of independence on the confusion matrix.
summary(Model.Results)
## Number of cases in table: 48
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 14.141, df = 1, p-value = 0.0001696
# Random forest trying 3 randomly chosen predictors at each split.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned. The output recorded below is from the original run, so its
# echoed call still shows `importance = T`.
rf4.model <- randomForest(
  defNASH ~ .,
  data = TrainSet,
  mtry = 3,
  importance = TRUE
)
rf4.model
##
## Call:
## randomForest(formula = defNASH ~ ., data = TrainSet, mtry = 3, importance = T)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 27.66%
## Confusion matrix:
## 0 1 class.error
## 0 15 5 0.2500000
## 1 8 19 0.2962963
# Per-variable importance measures (table recorded below).
importance(rf4.model)
## 0 1 MeanDecreaseAccuracy
## ENSG00000213606 4.5874651 3.4911454 5.402255222
## ENSG00000083857 -0.5308425 2.1364439 1.126240664
## ENSG00000198074 4.6093844 2.6089452 4.885664749
## ENSG00000173391 3.5886437 4.5230023 4.902255894
## ENSG00000165912 2.6974145 4.5386046 4.585519806
## ENSG00000115009 5.1925386 1.8311811 5.162500287
## ENSG00000148773 2.8347537 2.4720637 3.162338071
## ENSG00000129474 -0.7485664 -1.5641834 -1.629675228
## ENSG00000138448 -0.8996921 -0.9198803 -1.651649564
## ENSG00000169429 4.2684566 5.5945065 6.473263270
## ENSG00000156535 -2.8392588 -0.4765459 -2.251803002
## ENSG00000196177 -0.7681466 0.4804241 -0.086506717
## ENSG00000060982 1.5840341 1.6096992 2.014405995
## ENSG00000234964 1.8041471 2.3902237 2.716135738
## ENSG00000187498 -0.6073575 1.0790521 0.621871383
## ENSG00000231991 -1.4838539 0.3024591 -0.822746570
## ENSG00000078098 1.4694394 3.5066547 3.204142030
## ENSG00000024526 6.7702558 4.2838527 6.707241789
## ENSG00000079931 -0.3353972 -0.4639980 -0.581715679
## ENSG00000137161 3.0861283 1.1560082 2.390833474
## ENSG00000173744 0.2688846 -2.0076024 -1.477128669
## ENSG00000118785 -1.1332482 0.5776126 -0.334214542
## ENSG00000182718 0.6045072 -0.2429381 -0.004377996
## ENSG00000162366 2.9016499 1.6072362 3.112457261
## ENSG00000168542 1.0101595 1.0983792 1.232200955
## MeanDecreaseGini
## ENSG00000213606 1.2200044
## ENSG00000083857 0.8599981
## ENSG00000198074 1.6456633
## ENSG00000173391 1.1773349
## ENSG00000165912 1.1380842
## ENSG00000115009 0.7195974
## ENSG00000148773 1.1975523
## ENSG00000129474 0.6567509
## ENSG00000138448 0.5527933
## ENSG00000169429 1.5949500
## ENSG00000156535 0.5672682
## ENSG00000196177 1.1384986
## ENSG00000060982 1.0934111
## ENSG00000234964 0.9786290
## ENSG00000187498 0.6105460
## ENSG00000231991 0.5356567
## ENSG00000078098 0.8888001
## ENSG00000024526 1.5923593
## ENSG00000079931 0.5063392
## ENSG00000137161 0.8576097
## ENSG00000173744 0.4839495
## ENSG00000118785 0.6188079
## ENSG00000182718 0.4765763
## ENSG00000162366 0.5403537
## ENSG00000168542 0.7995296
# Diagnostic plots for the mtry = 3 forest: variable importance, then the
# forest's own plot method.
varImpPlot(rf4.model)
plot(rf4.model)

# Predict the class of each test-set row.
predict.rf <- predict(object = rf4.model, newdata = TestSet)

# Cross-tabulate observed against predicted labels (confusion matrix).
Model.Results <- table(
  observed = Testing.Outcome,
  predict = predict.rf
)
print(Model.Results)
## predict
## observed 0 1
## 0 17 6
## 1 4 21
# Chi-squared test of independence on the confusion matrix.
summary(Model.Results)
## Number of cases in table: 48
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 16.326, df = 1, p-value = 5.333e-05
# Reduced forest using four hand-picked predictors only. mtry is left at the
# package default (the recorded output reports 2 variables tried per split).
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned. The output recorded below is from the original run, so its
# echoed call still shows `importance = T`.
rf5.model <- randomForest(
  defNASH ~ ENSG00000198074 + ENSG00000196177 + ENSG00000115009 +
    ENSG00000078098,
  data = TrainSet,
  importance = TRUE
)
rf5.model
##
## Call:
## randomForest(formula = defNASH ~ ENSG00000198074 + ENSG00000196177 + ENSG00000115009 + ENSG00000078098, data = TrainSet, importance = T)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 27.66%
## Confusion matrix:
## 0 1 class.error
## 0 15 5 0.2500000
## 1 8 19 0.2962963
# Per-variable importance measures (table recorded below).
importance(rf5.model)
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## ENSG00000198074 8.758810 3.7627259 8.286933 7.679080
## ENSG00000196177 4.810424 2.8079125 5.128385 5.541758
## ENSG00000115009 7.919137 4.1087794 8.652059 4.451713
## ENSG00000078098 6.140682 0.8775337 4.495579 4.776457
# Diagnostic plots for the four-predictor forest: variable importance, then
# the forest's own plot method.
varImpPlot(rf5.model)
plot(rf5.model)

# Predict the class of each test-set row.
predict.rf <- predict(object = rf5.model, newdata = TestSet)

# Cross-tabulate observed against predicted labels (confusion matrix).
Model.Results <- table(
  observed = Testing.Outcome,
  predict = predict.rf
)
print(Model.Results)
## predict
## observed 0 1
## 0 16 7
## 1 4 21
# Chi-squared test of independence on the confusion matrix.
summary(Model.Results)
## Number of cases in table: 48
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 14.141, df = 1, p-value = 0.0001696