\(~\)
library("randomForestSRC")
library("tidyverse")
library("survival")
data(peakVO2, package = "randomForestSRC")
dta <- peakVO2
NTree <- 10
# Building a RSF
RF_obj <- rfsrc(Surv(ttodead,died)~., dta, ntree = NTree, membership = TRUE)
# Printing the RF object
print(RF_obj)
## Sample size: 2231
## Number of deaths: 726
## Number of trees: 10
## Forest terminal node size: 15
## Average no. of terminal nodes: 106
## No. of variables tried at each split: 7
## Total no. of variables: 39
## Resampling used to grow trees: swor
## Resample size used to grow trees: 1410
## Analysis: RSF
## Family: surv
## Splitting rule: logrank *random*
## Number of random split points: 10
## (OOB) CRPS: 0.16695617
## (OOB) Requested performance error: 0.33514454
# Extracting the mebership of individual on a node
head(RF_obj$membership,n=30)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 5 20 37 13 69 9 99 8 51 3
## [2,] 9 17 89 55 72 58 14 102 54 58
## [3,] 95 4 26 16 59 68 31 24 15 79
## [4,] 100 26 49 68 77 77 49 47 42 102
## [5,] 93 15 81 89 92 105 75 82 86 99
## [6,] 1 3 65 59 19 26 73 109 19 13
## [7,] 3 52 7 6 60 5 6 22 1 21
## [8,] 90 29 44 61 77 74 38 51 42 104
## [9,] 49 53 88 50 33 31 14 102 20 49
## [10,] 102 37 3 57 49 104 20 72 39 84
## [11,] 48 92 51 31 112 58 34 20 24 66
## [12,] 68 82 29 42 10 19 9 75 68 17
## [13,] 7 41 66 35 33 26 9 75 19 52
## [14,] 99 39 58 88 93 104 84 54 87 94
## [15,] 58 64 95 52 54 46 63 97 69 62
## [16,] 89 100 40 68 103 102 52 77 38 108
## [17,] 102 20 60 85 91 104 84 14 86 94
## [18,] 100 94 55 22 71 77 4 53 54 81
## [19,] 55 55 66 32 27 31 7 18 54 40
## [20,] 13 23 51 40 13 39 52 52 24 31
## [21,] 100 93 103 99 49 68 100 55 38 104
## [22,] 37 26 49 72 104 104 52 78 68 30
## [23,] 36 54 66 40 36 30 13 92 20 13
## [24,] 102 14 16 85 63 96 74 79 86 92
## [25,] 46 69 90 50 45 45 34 102 32 71
## [26,] 36 55 86 36 100 63 96 98 67 45
## [27,] 4 3 3 97 95 24 94 29 88 31
## [28,] 11 40 85 55 72 58 45 86 65 58
## [29,] 40 81 98 29 112 31 14 92 74 49
## [30,] 94 84 27 17 25 71 57 11 13 82
# Data manipulation
RF_member_wide <-as.data.frame(cbind(seq(1:nrow(RF_obj$membership)), RF_obj$membership))
colnames(RF_member_wide) <- c("id",as.character(paste(seq(1:NTree))))
#rownames(RF_member_wide) <- rep("",nrow(dta))
head(RF_member_wide, n=10)
## id 1 2 3 4 5 6 7 8 9 10
## 1 1 5 20 37 13 69 9 99 8 51 3
## 2 2 9 17 89 55 72 58 14 102 54 58
## 3 3 95 4 26 16 59 68 31 24 15 79
## 4 4 100 26 49 68 77 77 49 47 42 102
## 5 5 93 15 81 89 92 105 75 82 86 99
## 6 6 1 3 65 59 19 26 73 109 19 13
## 7 7 3 52 7 6 60 5 6 22 1 21
## 8 8 90 29 44 61 77 74 38 51 42 104
## 9 9 49 53 88 50 33 31 14 102 20 49
## 10 10 102 37 3 57 49 104 20 72 39 84
RF_member_aux <- gather(RF_member_wide, key="Tree", value = "Nodes",2:(NTree+1))
attach(RF_member_aux)
RF_member_aux2 <- RF_member_aux[order(Tree,Nodes),]
head(RF_member_aux2,n=50)
## id Tree Nodes
## 6 6 1 1
## 226 226 1 1
## 333 333 1 1
## 411 411 1 1
## 487 487 1 1
## 1090 1090 1 1
## 1322 1322 1 1
## 1433 1433 1 1
## 2064 2064 1 1
## 88 88 1 2
## 7 7 1 3
## 1044 1044 1 3
## 1257 1257 1 3
## 1293 1293 1 3
## 1355 1355 1 3
## 1541 1541 1 3
## 1937 1937 1 3
## 2150 2150 1 3
## 27 27 1 4
## 34 34 1 4
## 177 177 1 4
## 191 191 1 4
## 236 236 1 4
## 272 272 1 4
## 352 352 1 4
## 404 404 1 4
## 466 466 1 4
## 498 498 1 4
## 1016 1016 1 4
## 1139 1139 1 4
## 1311 1311 1 4
## 1319 1319 1 4
## 1336 1336 1 4
## 1361 1361 1 4
## 1377 1377 1 4
## 1435 1435 1 4
## 1440 1440 1 4
## 1441 1441 1 4
## 1490 1490 1 4
## 1603 1603 1 4
## 1664 1664 1 4
## 1704 1704 1 4
## 1722 1722 1 4
## 1740 1740 1 4
## 1816 1816 1 4
## 1853 1853 1 4
## 1893 1893 1 4
## 1922 1922 1 4
## 1932 1932 1 4
## 1936 1936 1 4
RF_member_long <- RF_member_aux2[ ,c(2,3,1)]
# Final (long) format of the dataframe concerning Trees, nodes and individuals
head(RF_member_long,n=30)
## Tree Nodes id
## 6 1 1 6
## 226 1 1 226
## 333 1 1 333
## 411 1 1 411
## 487 1 1 487
## 1090 1 1 1090
## 1322 1 1 1322
## 1433 1 1 1433
## 2064 1 1 2064
## 88 1 2 88
## 7 1 3 7
## 1044 1 3 1044
## 1257 1 3 1257
## 1293 1 3 1293
## 1355 1 3 1355
## 1541 1 3 1541
## 1937 1 3 1937
## 2150 1 3 2150
## 27 1 4 27
## 34 1 4 34
## 177 1 4 177
## 191 1 4 191
## 236 1 4 236
## 272 1 4 272
## 352 1 4 352
## 404 1 4 404
## 466 1 4 466
## 498 1 4 498
## 1016 1 4 1016
## 1139 1 4 1139
# Prediction for a new data: uses the package function to take advantage of the prediction object
newdata <- data.frame(lapply(1:ncol(RF_obj$xvar),function(i){median(RF_obj$xvar[,i])}))
colnames(newdata) <- RF_obj$xvar.names
newdata
## age betablok dilver nifed acei angioten.II anti.arrhy anti.coag aspirin
## 1 55 1 0 0 1 0 0 0 0
## digoxin nitrates vasodilator diuretic.loop diuretic.thiazide
## 1 1 0 0 1 0
## diuretic.potassium.spar lipidrx.statin insulin surgery.pacemaker surgery.cabg
## 1 0 0 0 0 0
## surgery.pci surgery.aicd.implant resting.systolic.bp resting.hr smknow
## 1 0 0 110 75 0
## q.wave.mi bmi niddm lvef.metabl peak.rer peak.vo2 interval cad bun
## 1 0 27.77673 0 20 1.1 15.7 480 0 23
## sodium hgb glucose male black crcl
## 1 139.8042 13.58482 96.92888 1 0 83.13088
y.pred <- predict(RF_obj,newdata,membership=TRUE)
#names(y.pred)
# Corresponding node of this observation, on each tree of the forest
y.pred$membership
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 42 104 41 61 33 39 45 52 63 13
# Organizing the dataset
y.pred_member_wide <-as.data.frame(cbind(seq(1:nrow(y.pred$membership)), y.pred$membership))
colnames(y.pred_member_wide) <- c("id",as.character(paste(seq(1:NTree))))
#rownames(RF_member_wide) <- rep("",nrow(dta))
#head(RF_member_wide,n=10)
y.pred_member_long= gather(y.pred_member_wide, key="Tree", value = "Nodes",2:(NTree+1))
pred = numeric(nrow(y.pred_member_long))
y.pred_member_long <- cbind(y.pred_member_long,pred)
# Final (long) format of the dataframe concerning Trees, nodes
head(y.pred_member_long, n=30)
## id Tree Nodes pred
## 1 1 1 42 0
## 2 1 2 104 0
## 3 1 3 41 0
## 4 1 4 61 0
## 5 1 5 33 0
## 6 1 6 39 0
## 7 1 7 45 0
## 8 1 8 52 0
## 9 1 9 63 0
## 10 1 10 13 0
# Example of a prediction:
# Tree 1, observation corresponds to node 28:
indexes <- RF_member_long$id[Tree==1&Nodes==28]
# individuos no nó 28
indexes
## [1] 1740 1922 1489 1702 364 691 240 1961 174 1575 126 1954 590 1261 2128
## [16] 1185 40 804 910 950 1917 2118 1288 719 549 1560 2130 745 505 594
## [31] 2012 937 447 347 1375 109 416 564 2073
km_fit <- survfit(Surv(ttodead,died)~1,data=dta[indexes,])
# Tree 2, observation corresponds to node 21:
indexes <- RF_member_long$id[Tree==2&Nodes==21]
# individuos no nó 21
indexes
## [1] 1928 1853 1191 1811 1256 1927 1607 821 147 1381 553 879 325 1308 1522
## [16] 1761 874 1110 261 2226 43
km_fit2 <- survfit(Surv(ttodead,died)~1,data=dta[indexes,])
# Tree 3, observation corresponds to node 28:
indexes <- RF_member_long$id[Tree==3&Nodes==45]
# individuos no nó 45
indexes
## [1] 1249 1737 639 818 689 808 805 1127 1483 1540 2091 1269 484 33 909
## [16] 1148 136 435 441 2206 752 1837 1339
km_fit3 <- survfit(Surv(ttodead,died)~1,data=dta[indexes,])
# Tree 4, observation corresponds to node 60:
indexes <- RF_member_long$id[Tree==4&Nodes==60]
# individuos no nó 60
indexes
## [1] 1950 213 2171
km_fit4 <- survfit(Surv(ttodead,died)~1,data=dta[indexes,])
par(mfrow=c(2,2))
plot(km_fit, xlab="Days", main = 'Kaplan Meyer estimate (Tree 1, Node 28)')
plot(km_fit2, xlab="Days", main = 'Kaplan Meyer estimate (Tree 2, Node 21)')
plot(km_fit3, xlab="Days", main = 'Kaplan Meyer estimate (Tree 3, Node 45)')
plot(km_fit4, xlab="Days", main = 'Kaplan Meyer estimate (Tree 4, Node 60)')