knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(caret)
library(tidyverse)  # loads ggplot2
# Using the lung dataset from the survival package as an example
data(lung, package="survival")
df <- lung
# caret can be fussy if the outcome is not explicitly a factor.
# In survival::lung, status is coded 1 = censored, 2 = dead.
df$status <- as.factor(ifelse(df$status == 2, "dead", "alive"))
# Stratified partitioning on the outcome (sampling is done within each status class)
inTrain <- createDataPartition(df$status, p = 0.75, list = FALSE)
trainData <- df[inTrain,]
testData <- df[-inTrain,]
# Check
nrow(df) == nrow(trainData) + nrow(testData)
## [1] TRUE
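A quick sanity check on the stratification (a minimal sketch): since createDataPartition samples within each class, the class proportions should be nearly identical in the two sets.
# Class proportions should be ~equal in train and test
prop.table(table(trainData$status))
prop.table(table(testData$status))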
# Pre-processing
set.seed(1)
preProcValues <- preProcess(trainData, method = c("center", "scale", "knnImpute"))
# What will caret do?
preProcValues
## Created from 126 samples and 10 variables
##
## Pre-processing:
## - centered (9)
## - ignored (1)
## - 5 nearest neighbor imputation (9)
## - scaled (9)
# Do it:
trainTransformed <- predict(preProcValues, trainData)
testTransformed <- predict(preProcValues, testData)
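To verify the transform did what we expect (a minimal sketch), the numeric predictors in the training set should now have mean ~0 and SD ~1, and the knn imputation should have removed the missing values.
# Means ~0, SDs ~1, and no NAs left after knnImpute
num_cols <- sapply(trainTransformed, is.numeric)
round(colMeans(trainTransformed[, num_cols]), 2)
round(apply(trainTransformed[, num_cols], 2, sd), 2)
sum(is.na(trainTransformed))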
# Cross-validation settings shared by all models
fitControl <- trainControl(method = "repeatedcv",  # 5-fold CV
                           number = 5,
                           repeats = 2)            # repeated 2 times
xgbFit <- train(status ~ .,
                data = trainTransformed,
                method = "xgbTree",
                metric = "Accuracy",
                # a supplied tuneGrid overrides tuneLength, so only this grid is searched
                tuneGrid = expand.grid(nrounds = c(50, 100, 150),
                                       max_depth = c(5, 10, 15),
                                       eta = 0.02,
                                       gamma = 5,
                                       colsample_bytree = 0.75,
                                       min_child_weight = 0,
                                       subsample = 0.5),
                trControl = fitControl,
                na.action = na.omit)
xgbFit
## eXtreme Gradient Boosting
##
## 172 samples
## 9 predictor
## 2 classes: 'alive', 'dead'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 138, 138, 138, 137, 137, 137, ...
## Resampling results across tuning parameters:
##
## max_depth nrounds Accuracy Kappa
## 5 50 0.7239037 0.04030966
## 5 100 0.7386147 0.08739710
## 5 150 0.7414719 0.10790899
## 10 50 0.7384416 0.09380395
## 10 100 0.7443239 0.12064889
## 10 150 0.7444079 0.13559180
## 15 50 0.7297861 0.05392884
## 15 100 0.7327273 0.07548552
## 15 150 0.7355844 0.09865160
##
## Tuning parameter 'eta' was held constant at a value of 0.02
## Tuning parameter 'gamma' was held constant at a value of 5
## Tuning parameter 'colsample_bytree' was held constant at a value of 0.75
## Tuning parameter 'min_child_weight' was held constant at a value of 0
## Tuning parameter 'subsample' was held constant at a value of 0.5
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 150, max_depth = 10, eta
## = 0.02, gamma = 5, colsample_bytree = 0.75, min_child_weight = 0 and
## subsample = 0.5.
plot(xgbFit)
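The held-out test set has not been used yet. A minimal sketch of an out-of-sample check for the tuned xgboost model, assuming the imputed testTransformed frame from above has no remaining NAs:
# Out-of-sample performance on the held-out test set
xgbPred <- predict(xgbFit, newdata = testTransformed)
confusionMatrix(xgbPred, testTransformed$status)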
rfFit <- train(status ~ .,
               data = trainTransformed,
               method = "rf",
               metric = "Accuracy",
               tuneGrid = expand.grid(mtry = c(2, 4, 6, 8)),
               trControl = fitControl,
               ntree = 1000,  # randomForest's argument is 'ntree'; 'ntrees' would be silently ignored
               na.action = na.omit)
plot(rfFit)
## Model 3: logistic regression
logregFit <- train(status ~ .,
data = trainTransformed,
method = "glm",
metric="Accuracy",
trControl=fitControl,
na.action = na.omit)
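Since method = "glm" has no tuning parameters, the fitted model can be inspected directly; caret stores the underlying glm object in finalModel.
# Coefficients of the underlying logistic regression
summary(logregFit$finalModel)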
# Collect resamples from the three models
results <- resamples(list(LOGISTIC_REGRESSION = logregFit,
                          RANDOM_FOREST = rfFit,
                          XGBOOST = xgbFit))
# summarize the distributions
#summary(results)
# boxplots of results
bwplot(results)
# dot plots of results
dotplot(results)
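The resampling distributions can also be compared formally; caret's diff method on a resamples object computes pairwise differences between models across the 10 resamples (a minimal sketch):
# Pairwise model differences in Accuracy and Kappa
difValues <- diff(results)
summary(difValues)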
None of the importance measures above is conditional variable importance (CVI); see Strobl et al. (2008):
https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-307
library(party)
ntrees <- 100
mtrys <- 7
bst <- cforest(status ~ .,
               data = df,
               # cforest_unbiased gives unbiased variable importance estimates
               control = cforest_unbiased(ntree = ntrees, mtry = mtrys))
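Note that the default permutation importance is still unconditional; CVI in the sense of Strobl et al. requires conditional = TRUE in party::varimp. A minimal sketch (this is markedly slower, even at ntree = 100):
# Conditional permutation importance (CVI)
cvi <- party::varimp(bst, conditional = TRUE)
rev(sort(cvi))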
# Build the importance plot
res2 <- data.frame(varImp(bst))  # extract variable importance (unconditional by default)
res2$impProcent <- round(res2$Overall / sum(res2$Overall) * 100)  # each variable's importance as percent of the total
res2$var <- rownames(res2)  # copy the row names into a column
res2 %>% ggplot(aes(x=impProcent, y=reorder(var, impProcent))) +
geom_bar(stat="identity")
library(moreparty)
library(doParallel)
registerDoParallel(cores = 4)
cfFit <- fastcforest(status ~ ., data = df, parallel = TRUE)
vi <- fastvarImp(cfFit, measure = 'ACC', parallel = TRUE)
stopImplicitCluster()
varimp <- data.frame(imp = rev(sort(vi)))
varimp$var <- rownames(varimp)
varimp %>% ggplot(aes(x = imp, y = reorder(var, imp))) +
  geom_bar(stat = "identity")