knitr::opts_chunk$set(echo = TRUE, warning=F, message=F)

library(caret)
library(tidyverse)
library(ggplot2)

# Using the lung data (survival package) as an example
data(lung, package="survival")

df <- lung

# caret can be fussy unless the outcome is explicitly a factor.
# In survival::lung, status is coded 1 = censored and 2 = dead.
df$status <- as.factor(ifelse(df$status == 2, "dead", "alive"))
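
A quick cross-tabulation against the original coding (a small sanity-check sketch) confirms that the recode maps the survival codes the right way:

# 1 (censored) should map to "alive", 2 (dead) to "dead"
table(original = lung$status, recoded = df$status)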

Partition the data into training and test sets

# Stratified partition on the outcome (status)
inTrain   <- createDataPartition(df$status, p=0.75, list=F)
trainData <- df[inTrain,]
testData  <- df[-inTrain,]

# Check
nrow(df) == nrow(trainData) + nrow(testData)
## [1] TRUE
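
createDataPartition samples within each level of status, so the class proportions should be close to identical in both splits; a quick check (sketch):

# The outcome distribution should be roughly the same in training and test data
prop.table(table(trainData$status))
prop.table(table(testData$status))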

Pre-processing

# Pre-processing
set.seed(1)
preProcValues    <- preProcess(trainData, method = c("center", "scale", "knnImpute"))

# What will caret do?
preProcValues
## Created from 126 samples and 10 variables
## 
## Pre-processing:
##   - centered (9)
##   - ignored (1)
##   - 5 nearest neighbor imputation (9)
##   - scaled (9)
# Do it:
trainTransformed <- predict(preProcValues, trainData)
testTransformed  <- predict(preProcValues, testData)
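
After the transformation the numeric predictors should be imputed and standardised; a small verification sketch:

# knnImpute should have filled in all missing numeric values
colSums(is.na(trainTransformed))
# Centered and scaled predictors should have mean ~0 and sd ~1, e.g. age
c(mean = mean(trainTransformed$age), sd = sd(trainTransformed$age))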

# Cross-validation settings shared by all models
fitControl <- trainControl(## 5-fold CV
                           method = "repeatedcv",
                           number = 5,
                           ## repeated 2 times
                           repeats = 2)

The models

Model 1: XGBoost

xgbFit <- train(status ~ .,
                data = trainTransformed,
                method = "xgbTree",
                metric="Accuracy",
                tuneGrid = expand.grid(nrounds = c(50, 100, 150),
                                       max_depth = c(5, 10, 15),
                                       eta = 0.02,
                                       gamma = 5,
                                       colsample_bytree = 0.75,
                                       min_child_weight = 0,
                                       subsample = 0.5),
                trControl=fitControl,
                na.action = na.omit)
xgbFit
## eXtreme Gradient Boosting 
## 
## 172 samples
##   9 predictor
##   2 classes: 'alive', 'dead' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 138, 138, 138, 137, 137, 137, ... 
## Resampling results across tuning parameters:
## 
##   max_depth  nrounds  Accuracy   Kappa     
##    5          50      0.7239037  0.04030966
##    5         100      0.7386147  0.08739710
##    5         150      0.7414719  0.10790899
##   10          50      0.7384416  0.09380395
##   10         100      0.7443239  0.12064889
##   10         150      0.7444079  0.13559180
##   15          50      0.7297861  0.05392884
##   15         100      0.7327273  0.07548552
##   15         150      0.7355844  0.09865160
## 
## Tuning parameter 'eta' was held constant at a value of 0.02
## 
## Tuning parameter 'gamma' was held constant at a value of 5
## 
## Tuning parameter 'colsample_bytree' was held constant at a value of 0.75
## 
## Tuning parameter 'min_child_weight' was held constant at a value of 0
## 
## Tuning parameter 'subsample' was held constant at a value of 0.5
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 150, max_depth = 10, eta
##  = 0.02, gamma = 5, colsample_bytree = 0.75, min_child_weight = 0 and
##  subsample = 0.5.
plot(xgbFit)
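
The selected hyper-parameters and the corresponding resampled accuracy can be read directly off the fitted object (a sketch using standard caret accessors):

# Hyper-parameter combination chosen by the grid search
xgbFit$bestTune
# Best resampled accuracy over the grid
max(xgbFit$results$Accuracy, na.rm = TRUE)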

Model 2: Random forest

rfFit <- train(status ~ .,
                data = trainTransformed,
                method = "rf",
                metric="Accuracy",
                tuneGrid = expand.grid(mtry=c(2, 4, 6, 8)),
                trControl=fitControl,
                ntree = 1000,
                na.action = na.omit)
plot(rfFit)
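
For later comparison with the conditional measures below, the standard (unconditional) variable importance from the random forest can be inspected via caret's varImp (a sketch; this is the impurity-based measure from randomForest, not Strobl's conditional importance):

# Standard, unconditional variable importance
rfImp <- varImp(rfFit)
rfImp
plot(rfImp)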

Model 3: Logistic regression

logregFit <- train(status ~ .,
                data = trainTransformed,
                method = "glm",
                metric="Accuracy",
                trControl=fitControl,
                na.action = na.omit)
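
Unlike the two tree ensembles, the logistic regression has directly interpretable coefficients, which can be pulled from the underlying glm fit (a sketch; the predictors are centered and scaled, so the odds ratios are per one standard deviation):

# Coefficients of the fitted glm
summary(logregFit$finalModel)
# Odds ratios per standard deviation of each predictor
exp(coef(logregFit$finalModel))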

Resampling and evaluation of models 1, 2, and 3

# collect resamples
results <- resamples(list(LOGISTIC_REGRESSION=logregFit,
                          RANDOM_FOREST = rfFit,
                          XGBOOST = xgbFit))

# summarize the distributions
#summary(results)

# boxplots of results
bwplot(results)

# dot plots of results
dotplot(results)
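
The comparison above is based on resampling within the training data only; the held-out test set has not been used yet. A sketch of how test-set accuracy could be computed for the three models, assuming the objects created above:

# Predict on the held-out, pre-processed test data
xgbPred <- predict(xgbFit,    newdata = testTransformed)
rfPred  <- predict(rfFit,     newdata = testTransformed)
glmPred <- predict(logregFit, newdata = testTransformed)

# Test-set accuracy per model
confusionMatrix(xgbPred, testTransformed$status)$overall["Accuracy"]
confusionMatrix(rfPred,  testTransformed$status)$overall["Accuracy"]
confusionMatrix(glmPred, testTransformed$status)$overall["Accuracy"]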

Conditional variable importance following Strobl

None of the importance measures above is conditional variable importance (CVI) in the sense of Strobl et al.:

https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-307

CVI method 1: the standard party package

library(party)

ntrees <- 100
mtrys  <- 7

bst <- cforest(status ~ .,
               data = df,
               # cforest_unbiased gives unbiased variable importance estimates
               control = cforest_unbiased(ntree = ntrees, mtry = mtrys))

# Build the data for the plot
# party::varimp() with conditional = TRUE gives Strobl's conditional importance
res2            <- data.frame(Overall = party::varimp(bst, conditional = TRUE))
res2$impProcent <- round(res2$Overall / sum(res2$Overall) * 100)  # each variable's importance as a percent of the total
res2$var        <- rownames(res2)                                  # variable names as a column

res2 %>% ggplot(aes(x=impProcent, y=reorder(var, impProcent))) +
  geom_bar(stat="identity")

CVI method 2: the moreparty package builds on party but is faster

library(moreparty)
library(doParallel)

# Fit the forest in parallel
registerDoParallel(cores = 4)
cfFit <- fastcforest(status ~ ., data = df, parallel = TRUE)
stopImplicitCluster()

# Permutation variable importance (accuracy-based), also in parallel
registerDoParallel(cores = 4)
vi <- fastvarImp(cfFit, measure = 'ACC', parallel = TRUE)
stopImplicitCluster()

varimp     <- data.frame(imp = rev(sort(vi)))  # sorted importances, highest first
varimp$var <- rownames(varimp)                 # variable names as a column
varimp %>% ggplot(aes(x=imp, y=reorder(var, imp))) +
  geom_bar(stat="identity")
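
As a final check (a sketch assuming the res2 and varimp objects created above), the two importance tables can be merged and their rankings compared; the two runs use different settings, so the rankings need not agree exactly:

# Merge the two importance tables by variable name
comparison <- merge(res2[, c("var", "Overall")],
                    varimp[, c("var", "imp")],
                    by = "var")
comparison[order(-comparison$Overall), ]
# Rank correlation between the two importance measures
cor(comparison$Overall, comparison$imp, method = "spearman")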