require(xgboost)
## Loading required package: xgboost
## Warning: package 'xgboost' was built under R version 3.4.3
# Fix the RNG seed so the randomised steps below are repeatable
# (the literal 0623 in earlier revisions is simply the number 623).
set.seed(623)
# Load both agaricus example datasets in a single call; each one is a list
# whose `data` and `label` elements are used further down.
data(list = c("agaricus.train", "agaricus.test"))
train <- agaricus.train
test <- agaricus.test
# Booster settings used by the cross-validation run:
# binary classification scored by logloss, learning rate 0.4, trees of depth 2.
param <- list(
  objective = "binary:logistic",
  eval_metric = "logloss",
  eta = 0.4,
  max.depth = 2
)
# Cross-validation: 10 folds, 20 boosting rounds.
## Watch the "test-logloss" column of the output: keep enlarging nrounds until
## "test-logloss" stops decreasing and starts to increase (this usually takes
## more than 200 rounds).
bst.cv <- xgb.cv(
  params = param,
  data = as.matrix(train$data),
  label = train$label,
  nrounds = 20,
  nfold = 10
)
## [1] train-logloss:0.428018+0.001055 test-logloss:0.428053+0.005663
## [2] train-logloss:0.296569+0.001026 test-logloss:0.298132+0.006891
## [3] train-logloss:0.218695+0.001026 test-logloss:0.218566+0.007334
## [4] train-logloss:0.159530+0.000814 test-logloss:0.159424+0.006636
## [5] train-logloss:0.127319+0.002352 test-logloss:0.128026+0.007416
## [6] train-logloss:0.105626+0.001151 test-logloss:0.105759+0.006576
## [7] train-logloss:0.085071+0.000836 test-logloss:0.085173+0.005306
## [8] train-logloss:0.073516+0.000973 test-logloss:0.073882+0.004798
## [9] train-logloss:0.063583+0.000842 test-logloss:0.063772+0.004682
## [10] train-logloss:0.054924+0.001004 test-logloss:0.055157+0.003888
## [11] train-logloss:0.044613+0.001016 test-logloss:0.044996+0.004492
## [12] train-logloss:0.038552+0.000644 test-logloss:0.038580+0.003533
## [13] train-logloss:0.033377+0.000600 test-logloss:0.033699+0.003222
## [14] train-logloss:0.029660+0.000491 test-logloss:0.029929+0.002657
## [15] train-logloss:0.026624+0.000766 test-logloss:0.027126+0.002683
## [16] train-logloss:0.023987+0.001071 test-logloss:0.024526+0.002629
## [17] train-logloss:0.021818+0.000946 test-logloss:0.022357+0.002678
## [18] train-logloss:0.019594+0.001025 test-logloss:0.019727+0.002481
## [19] train-logloss:0.017575+0.000926 test-logloss:0.017811+0.002446
## [20] train-logloss:0.015766+0.000838 test-logloss:0.016235+0.002633
## Quick line plot of the cross-validation result above: the log of the mean
## held-out logloss (`test_logloss_mean` column of the CV evaluation log),
## drawn in row order, i.e. one point per boosting round.
## NOTE: no axis labels are supplied, so plot() labels the y axis with the
## deparsed expression; keep the expression as-is if the figure must not change.
plot(log(bst.cv$evaluation_log$test_logloss_mean), type = "l")

# Xgboost: fit the final booster on the full training set.
## The "train-error" printed per round is measured on the training data itself,
## so it is of limited interest; rely on the cross-validation above instead.
## NOTE(review): eta (0.5) and nrounds (5) here differ from the CV run
## (eta 0.4, 20 rounds) -- confirm this is intentional.
bst <- xgboost(
  data = as.matrix(train$data),
  label = train$label,
  objective = "binary:logistic",
  max.depth = 2,
  eta = 0.5,
  nrounds = 5
)
## [1] train-error:0.046522
## [2] train-error:0.043605
## [3] train-error:0.023338
## [4] train-error:0.028251
## [5] train-error:0.011823
## Predict on the held-out test set.
## The booster was fitted on a dense matrix (as.matrix(train$data)), so the test
## features are converted the same way. xgboost treats the implicit zeros of a
## sparse matrix as missing values but the zeros of a dense matrix as real
## values; feeding a different representation at predict time can route rows
## down a different branch than the one learned during training.
pred <- predict(bst, as.matrix(test$data))
## Dump the fitted trees as a table (one row per node), with feature names
## taken from the column names of the training matrix, then print it.
trees <- xgb.model.dt.tree(feature_names = colnames(train$data), model = bst)
trees
## Tree Node ID Feature Split Yes No Missing
## 1: 0 0 0-0 odor=none 0.5 0-1 0-2 0-1
## 2: 0 1 0-1 stalk-root=club 0.5 0-3 0-4 0-3
## 3: 0 2 0-2 spore-print-color=green 0.5 0-5 0-6 0-5
## 4: 0 3 0-3 Leaf NA NA NA NA
## 5: 0 4 0-4 Leaf NA NA NA NA
## 6: 0 5 0-5 Leaf NA NA NA NA
## 7: 0 6 0-6 Leaf NA NA NA NA
## 8: 1 0 1-0 odor=none 0.5 1-1 1-2 1-1
## 9: 1 1 1-1 bruises?=bruises 0.5 1-3 1-4 1-3
## 10: 1 2 1-2 stalk-surface-below-ring=scaly 0.5 1-5 1-6 1-5
## 11: 1 3 1-3 Leaf NA NA NA NA
## 12: 1 4 1-4 Leaf NA NA NA NA
## 13: 1 5 1-5 Leaf NA NA NA NA
## 14: 1 6 1-6 Leaf NA NA NA NA
## 15: 2 0 2-0 odor=none 0.5 2-1 2-2 2-1
## 16: 2 1 2-1 odor=almond 0.5 2-3 2-4 2-3
## 17: 2 2 2-2 spore-print-color=green 0.5 2-5 2-6 2-5
## 18: 2 3 2-3 Leaf NA NA NA NA
## 19: 2 4 2-4 Leaf NA NA NA NA
## 20: 2 5 2-5 Leaf NA NA NA NA
## 21: 2 6 2-6 Leaf NA NA NA NA
## 22: 3 0 3-0 odor=foul 0.5 3-1 3-2 3-1
## 23: 3 1 3-1 gill-size=broad 0.5 3-3 3-4 3-3
## 24: 3 2 3-2 bruises?=bruises 0.5 3-5 3-6 3-5
## 25: 3 3 3-3 Leaf NA NA NA NA
## 26: 3 4 3-4 Leaf NA NA NA NA
## 27: 3 5 3-5 Leaf NA NA NA NA
## 28: 3 6 3-6 Leaf NA NA NA NA
## 29: 4 0 4-0 odor=anise 0.5 4-1 4-2 4-1
## 30: 4 1 4-1 odor=none 0.5 4-3 4-4 4-3
## 31: 4 2 4-2 stalk-root=club 0.5 4-5 4-6 4-5
## 32: 4 3 4-3 Leaf NA NA NA NA
## 33: 4 4 4-4 Leaf NA NA NA NA
## 34: 4 5 4-5 Leaf NA NA NA NA
## 35: 4 6 4-6 Leaf NA NA NA NA
## Tree Node ID Feature Split Yes No Missing
## Quality Cover
## 1: 4000.530000 1628.25000
## 2: 1158.210000 924.50000
## 3: 198.174000 703.75000
## 4: 0.856089 812.00000
## 5: -0.850220 112.50000
## 6: -0.970354 690.50000
## 7: 0.929825 13.25000
## 8: 1398.450000 1335.25000
## 9: 739.400000 774.06400
## 10: 101.665000 561.19100
## 11: 0.722832 547.43900
## 12: -0.349625 226.62600
## 13: -0.654067 552.62100
## 14: 1.002310 8.56944
## 15: 653.938000 1001.97000
## 16: 460.588000 604.34500
## 17: 101.090000 397.62200
## 18: 0.466722 538.35200
## 19: -0.923932 65.99350
## 20: -0.558035 384.62100
## 21: 0.816683 13.00130
## 22: 413.394000 739.47100
## 23: 467.862000 538.68600
## 24: 0.479467 200.78400
## 25: 0.395493 184.61100
## 26: -0.584344 354.07500
## 27: 0.561015 154.40900
## 28: 0.674397 46.37520
## 29: 253.710000 527.49900
## 30: 261.060000 469.14800
## 31: 39.105400 58.35120
## 32: 0.404699 285.17500
## 33: -0.357531 183.97200
## 34: -1.470560 24.36120
## 35: -0.615790 33.99000
## Quality Cover
## Feature selection
## `names` holds the feature (column) names of the training matrix; it is kept
## under this name because the tree plot further down reads it.
names <- dimnames(train$data)[[2]]
## Importance table: one row per feature actually used by the booster.
imprt_matrix <- xgb.importance(feature_names = names, model = bst)
## Plot at most the 10 top-ranked features; prefer variables with higher values.
## head() is used instead of imprt_matrix[1:10] because indexing a data.table
## past its last row pads the result with all-NA rows, and a shallow model like
## this one uses fewer than 10 distinct features.
xgb.plot.importance(head(imprt_matrix, 10))

require(DiagrammeR)
## Loading required package: DiagrammeR
## Warning: package 'DiagrammeR' was built under R version 3.4.3
# Render the first two boosted trees as a diagram, labelling splits with the
# feature names collected in `names`.
xgb.plot.tree(model = bst, feature_names = names, n_first_tree = 2)
## Warning: package 'bindrcpp' was built under R version 3.4.3