There is a good document available for model based boosting in R. If you are interested in mboost package and detailed application on body fat data set A Click here.
But following example code is using tree method on quantitative variable.
#to get help on package mboost, use package?mboost
library(mboost)
## Loading required package: parallel
## Loading required package: survival
## Loading required package: splines
## This is mboost 2.2-3. See 'package?mboost' and the NEWS file
## for a complete list of changes.
## Note: The default for the computation of the degrees of freedom has changed.
## For details see section 'Global Options' of '?bols'.
str(bodyfat)
## 'data.frame': 71 obs. of 10 variables:
## $ age : num 57 65 59 58 60 61 56 60 58 62 ...
## $ DEXfat : num 41.7 43.3 35.4 22.8 36.4 ...
## $ waistcirc : num 100 99.5 96 72 89.5 83.5 81 89 80 79 ...
## $ hipcirc : num 112 116.5 108.5 96.5 100.5 ...
## $ elbowbreadth: num 7.1 6.5 6.2 6.1 7.1 6.5 6.9 6.2 6.4 7 ...
## $ kneebreadth : num 9.4 8.9 8.9 9.2 10 8.8 8.9 8.5 8.8 8.8 ...
## $ anthro3a : num 4.42 4.63 4.12 4.03 4.24 3.55 4.14 4.04 3.91 3.66 ...
## $ anthro3b : num 4.95 5.01 4.74 4.48 4.68 4.06 4.52 4.7 4.32 4.21 ...
## $ anthro3c : num 4.5 4.48 4.6 3.91 4.15 3.64 4.31 4.47 3.47 3.6 ...
## $ anthro4 : num 6.13 6.37 5.82 5.66 5.91 5.14 5.69 5.7 5.49 5.25 ...
head(bodyfat)
## age DEXfat waistcirc hipcirc elbowbreadth kneebreadth anthro3a anthro3b
## 47 57 41.68 100.0 112.0 7.1 9.4 4.42 4.95
## 48 65 43.29 99.5 116.5 6.5 8.9 4.63 5.01
## 49 59 35.41 96.0 108.5 6.2 8.9 4.12 4.74
## 50 58 22.79 72.0 96.5 6.1 9.2 4.03 4.48
## 51 60 36.42 89.5 100.5 7.1 10.0 4.24 4.68
## 52 61 24.13 83.5 97.0 6.5 8.8 3.55 4.06
## anthro3c anthro4
## 47 4.50 6.13
## 48 4.48 6.37
## 49 4.60 5.82
## 50 3.91 5.66
## 51 4.15 5.91
## 52 3.64 5.14
Preparing training set and test set
set.seed(3456) #To get reproducible result
ind <- sample(2,nrow(bodyfat),replace=TRUE,prob=c(0.7,0.3))
trainingSet <- bodyfat[ind==1,]
testSet <- bodyfat[ind==2,]
Train a decision tree
library(rpart)
formula <- DEXfat ~ age + waistcirc + hipcirc + elbowbreadth + kneebreadth
bodyfat_rpart <- rpart(formula,data=trainingSet,control=rpart.control(minsplit=10))
attributes(bodyfat_rpart)
## $names
## [1] "frame" "where" "call"
## [4] "terms" "cptable" "method"
## [7] "parms" "control" "functions"
## [10] "numresp" "splits" "variable.importance"
## [13] "y" "ordered"
##
## $xlevels
## named list()
##
## $class
## [1] "rpart"
print(bodyfat_rpart$cptable)
## CP nsplit rel error xerror xstd
## 1 0.64556 0 1.00000 1.0327 0.22197
## 2 0.12808 1 0.35444 0.5619 0.12904
## 3 0.10034 2 0.22637 0.5836 0.11064
## 4 0.01976 3 0.12602 0.3742 0.07321
## 5 0.01446 4 0.10627 0.3341 0.06619
## 6 0.01276 5 0.09180 0.3225 0.06761
## 7 0.01000 6 0.07904 0.3194 0.06771
print(bodyfat_rpart)
## n= 44
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 44 5333.00 32.20
## 2) waistcirc< 90 29 1028.00 25.84
## 4) waistcirc< 80.75 17 189.10 21.76
## 8) age< 31.5 3 22.91 17.16 *
## 9) age>=31.5 14 89.09 22.75 *
## 5) waistcirc>=80.75 12 155.90 31.62
## 10) waistcirc< 85.25 4 5.33 28.25 *
## 11) waistcirc>=85.25 8 82.48 33.30 *
## 3) waistcirc>=90 15 862.20 44.50
## 6) kneebreadth< 11.15 12 180.80 41.51
## 12) waistcirc< 102.1 5 56.37 38.01 *
## 13) waistcirc>=102.1 7 19.06 44.02 *
## 7) kneebreadth>=11.15 3 146.30 56.45 *
Next, plotting rpart tree
plot(bodyfat_rpart)
#Ass text on the plot to get more details
text(bodyfat_rpart,use.n=T)
Here is some method for tree pruning
opt <- which.min(bodyfat_rpart$cptable[,"xerror"])
cp <- bodyfat_rpart$cptable[opt,"CP"]
bodyfat_prune <- prune(bodyfat_rpart,cp=cp)
print(bodyfat_prune)
## n= 44
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 44 5333.00 32.20
## 2) waistcirc< 90 29 1028.00 25.84
## 4) waistcirc< 80.75 17 189.10 21.76
## 8) age< 31.5 3 22.91 17.16 *
## 9) age>=31.5 14 89.09 22.75 *
## 5) waistcirc>=80.75 12 155.90 31.62
## 10) waistcirc< 85.25 4 5.33 28.25 *
## 11) waistcirc>=85.25 8 82.48 33.30 *
## 3) waistcirc>=90 15 862.20 44.50
## 6) kneebreadth< 11.15 12 180.80 41.51
## 12) waistcirc< 102.1 5 56.37 38.01 *
## 13) waistcirc>=102.1 7 19.06 44.02 *
## 7) kneebreadth>=11.15 3 146.30 56.45 *
plot(bodyfat_prune)
text(bodyfat_prune)
#Fancy plot
#fancyRpartPlot(bodyfat_prune)
Validating model using test set data
bodyfat_predict <- predict(bodyfat_prune,newdata=testSet)
xlim <- range(bodyfat$DEXfat)
plot(bodyfat_predict ~ DEXfat, data=testSet,xlab="Observed",ylab="Predicted",ylim=xlim,xlim=xlim)
abline(a=0,b=1)