Libraries

library("TH.data")
library("tidyverse")
library("randomForest")
library("survival")
library("randomForestSRC")

Dataset

data("GBSG2")
GBSG2%>%head(.,2)
##   horTh age menostat tsize tgrade pnodes progrec estrec time cens
## 1    no  70     Post    21     II      3      48     66 1814    1
## 2   yes  56     Post    12     II      7      61     77 2018    1

Breast10

Using 10 random variables.

set.seed(2000)
#breast10
n10<- as.data.frame(replicate(10,matrix(runif(10), nrow = 686)))
breast10 <- cbind(GBSG2,n10)
breast10%>%str(.)
## 'data.frame':    686 obs. of  20 variables:
##  $ horTh   : Factor w/ 2 levels "no","yes": 1 2 2 2 1 1 2 1 1 1 ...
##  $ age     : int  70 56 58 59 73 32 59 65 80 66 ...
##  $ menostat: Factor w/ 2 levels "Pre","Post": 2 2 2 2 2 1 2 2 2 2 ...
##  $ tsize   : int  21 12 35 17 35 57 8 16 39 18 ...
##  $ tgrade  : Ord.factor w/ 3 levels "I"<"II"<"III": 2 2 2 2 2 3 2 2 2 2 ...
##  $ pnodes  : int  3 7 9 4 1 24 2 1 30 7 ...
##  $ progrec : int  48 61 52 60 26 0 181 192 0 0 ...
##  $ estrec  : int  66 77 271 29 65 13 0 25 59 3 ...
##  $ time    : int  1814 2018 712 1807 772 448 2172 2161 471 2014 ...
##  $ cens    : int  1 1 1 1 1 1 0 0 1 0 ...
##  $ V1      : num  0.197 0.716 0.362 0.391 0.813 ...
##  $ V2      : num  0.1197 0.1424 0.0263 0.0731 0.9819 ...
##  $ V3      : num  0.971 0.91 0.798 0.718 0.911 ...
##  $ V4      : num  0.861 0.171 0.121 0.915 0.576 ...
##  $ V5      : num  0.6755 0.0697 0.0388 0.2222 0.6726 ...
##  $ V6      : num  0.476 0.6018 0.0711 0.5739 0.0445 ...
##  $ V7      : num  0.147 0.698 0.281 0.141 0.208 ...
##  $ V8      : num  0.346 0.344 0.756 0.726 0.705 ...
##  $ V9      : num  0.498 0.776 0.112 0.997 0.671 ...
##  $ V10     : num  0.806 0.692 0.83 0.664 0.299 ...
#train/ test
n = nrow(breast10)
split = sample(c(TRUE, FALSE), n, replace=TRUE, prob=c(0.75, 0.25))

training = breast10[split, ]
testing = breast10[!split, ]

#Fitting randomForest 
breast10.rf <- randomForest(time ~ ., data=training,keep.forest=FALSE, importance=TRUE)

#fitting cox model
surv.object <- Surv(breast10$time, breast10$cens)
fit.coxph <- coxph(surv.object~ ., data = breast10)

#c-index
sum.surv <- fit.coxph %>% summary(.)
c_index <- sum.surv$concordance %>% print(.)
##            C        se(C) 
## 1.000000e+00 3.783583e-18

Breast 50

set.seed(2000)
n50<- as.data.frame(replicate(50,matrix(runif(50), nrow = 686)))
breast50 <- cbind(GBSG2,n50)

#train/ test
n = nrow(breast50)
split = sample(c(TRUE, FALSE), n, replace=TRUE, prob=c(0.75, 0.25))

training = breast50[split, ]
testing = breast50[!split, ]

#Fitting randomForest
breast50.rf <- randomForest(time ~ ., data=training,keep.forest=FALSE, importance=TRUE)

#fitting cox model
surv.object <- Surv(breast50$time, breast50$cens)
fit.coxph <- coxph(surv.object~ ., data = breast50)

#c-index
sum.surv <- fit.coxph %>% summary(.)
c_index <- sum.surv$concordance %>% print(.)
##            C        se(C) 
## 9.999850e-01 1.497032e-05

Breast 100

set.seed(2000)
n100<- as.data.frame(replicate(100,matrix(runif(100), nrow = 686)))
breast100 <- cbind(GBSG2,n100)

#train/ test
n = nrow(breast100)
split = sample(c(TRUE, FALSE), n, replace=TRUE, prob=c(0.75, 0.25))

training = breast100[split, ]
testing = breast100[!split, ]

#Fitting randomForest 
breast100.rf <- randomForest(time ~ ., data=training,keep.forest=FALSE, importance=TRUE)

#fitting cox model
surv.object <- Surv(breast100$time, breast100$cens)
fit.coxph <- coxph(surv.object~ ., data = breast100)

#c-index
sum.surv <- fit.coxph %>% summary(.)
c_index <- sum.surv$concordance %>% print(.)
##            C        se(C) 
## 9.999850e-01 1.493809e-05

Repeating (1-5) for 100times

for (x in 1:5) {
  n10<- as.data.frame(replicate(10,matrix(runif(10), nrow = 686)))
  breast10 <- cbind(GBSG2,n10)
  n = nrow(breast10)
  split = sample(c(TRUE, FALSE), n, replace=TRUE, prob=c(0.75, 0.25))
  training = breast10[split, ]
  testing = breast10[!split, ]
  breast10.rf <- randomForest(time ~ ., data=training,keep.forest=FALSE, importance=TRUE)
  surv.object <- Surv(breast10$time, breast10$cens)
  fit.coxph <- coxph(surv.object~ ., data = breast10)
  sum.surv <- summary(fit.coxph)
  c_index <- sum.surv$concordance
  c_index %>% print(.)
  
}
##            C        se(C) 
## 1.000000e+00 3.783583e-18 
##            C        se(C) 
## 1.000000e+00 3.783583e-18 
##            C        se(C) 
## 1.000000e+00 3.783583e-18 
##            C        se(C) 
## 1.000000e+00 3.783583e-18 
##            C        se(C) 
## 1.000000e+00 3.783583e-18

Boxplots