Load the Library
xgboost <- xgb.train is an advanced interface for training an xgboost model. The xgboost function is a simpler wrapper for xgb.train.
magrittr <- The operators pipe their left-hand side values forward into expressions that appear on the right-hand side, i.e. one can replace f(x) with x %>% f, where %>% is the (main) pipe-operator.
dplyr <- %>%
Matrix <- Construct a Matrix of a class that inherits from Matrix
Warning: package ‘xgboost’ was built under R version 4.1.3
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: ‘xgboost’
The following object is masked from ‘package:dplyr’:
slice
Making model Train and Test
Splitting data 80/20
set.seed(123) # same observation
ind <- sample(2, nrow(data), replace = T, prob = c(0.8, 0.2))
train <- data[ind==1,]
test <- data[ind==2,]
train model
train_label <- train[,"admit"]
train_matrix <- xgb.DMatrix(data = as.matrix(trainm), label = train_label)
test model
# test model
testm <- sparse.model.matrix(admit~.-1, data = test)
test_label <- test[,"admit"]
test_matrix <- xgb.DMatrix(data = as.matrix(testm), label= test_label)
xgb_params <- list("objective" = "multi:softprob",
"eval_metric" = "mlogloss",
"num_class" = nc)
watchlist <- list(train = train_matrix, test = test_matrix)
# eXtreme Gradian Boosting Model
bst_model <- xgb.train(params = xgb_params,
data = train_matrix,
nrounds = 107,
watchlist = watchlist,
eta = 0.001, # more robust to overfitting
max.depth = 3,
gamma = 0,
subsample = 1,
colsample_bytree = 1,
missing = NA,
seed = 333)
# Training and test error plot
e <- data.frame(bst_model$evaluation_log)
plot(e$iter, e$train_mlogloss, col = 'blue')
lines(e$iter,e$test_mlogloss, col = 'red')
min(e$test_mlogloss)
# not the best model because of the red line
# e[e$test_mlogloss == 0.6014728,]
imp <- xgb.importance(colnames(train_matrix), model = bst_model)
print(imp)
xgb.plot.importance(imp)
# Prediction and confusin matrix - test data
p <- predict(bst_model, newdata = test_matrix)
pred <- matrix(p, nrow = nc, ncol = length(p)/nc) %>%
t() %>%
data.frame() %>%
mutate(label = test_label, max_prob = max.col(.,"last")-1)
head(pred)
Confussion Matrix Result (test)
59 out of 75 prediction is correct from data test 47+3 78% accurate
table(Prediction = pred$max_prob, Actual = pred$label)
59/75
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IA0KIGh0bWxfbm90ZWJvb2s6DQogICB0b2M6IHRydWUNCiAgIHRvY19mbG9hdDogdHJ1ZQ0KICAgaGlnaGxpZ2h0OiB6ZW5idXJuDQogICBkZl9wcmludDogcGFnZWQNCiAgIHRoZW1lOiBmbGF0bHkNCi0tLQ0KDQojIExvYWQgdGhlIExpYnJhcnkgDQoNCnhnYm9vc3QgPC0geGdiLnRyYWluIGlzIGFuIGFkdmFuY2VkIGludGVyZmFjZSBmb3IgdHJhaW5pbmcgYW4geGdib29zdCBtb2RlbC4gVGhlIHhnYm9vc3QgZnVuY3Rpb24gaXMgYSBzaW1wbGVyIHdyYXBwZXIgZm9yIHhnYi50cmFpbi4NCg0KbWFncml0dHIgPC0gVGhlIG9wZXJhdG9ycyBwaXBlIHRoZWlyIGxlZnQtaGFuZCBzaWRlIHZhbHVlcyBmb3J3YXJkIGludG8gZXhwcmVzc2lvbnMgdGhhdCBhcHBlYXIgb24gdGhlIHJpZ2h0LWhhbmQgc2lkZSwgaS5lLiBvbmUgY2FuIHJlcGxhY2UgZih4KSB3aXRoIHggJT4lIGYsIHdoZXJlICU+JSBpcyB0aGUgKG1haW4pIHBpcGUtb3BlcmF0b3IuDQoNCmRwbHlyIDwtICU+JSANCg0KTWF0cml4IDwtIENvbnN0cnVjdCBhIE1hdHJpeCBvZiBhIGNsYXNzIHRoYXQgaW5oZXJpdHMgZnJvbSBNYXRyaXgNCg0KYGBge3J9DQpsaWJyYXJ5KHhnYm9vc3QpDQpsaWJyYXJ5KG1hZ3JpdHRyKQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoTWF0cml4KQ0KYGBgDQoNCg0KIyByZWFkIHRoZSBkYXRhIA0KDQpzb3VyY2U6IGh0dHBzOi8vcmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbS9ia3JhaS9Ub3AtMTAtTWFjaGluZS1MZWFybmluZy1NZXRob2RzLVdpdGgtUi9tYXN0ZXIvYmluYXJ5LmNzdg0KDQpgYGB7cn0NCmRhdGEgPC0gcmVhZC5jc3YoImh0dHBzOi8vcmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbS9ia3JhaS9Ub3AtMTAtTWFjaGluZS1MZWFybmluZy1NZXRob2RzLVdpdGgtUi9tYXN0ZXIvYmluYXJ5LmNzdiIpDQpoZWFkKGRhdGEpDQpgYGANCg0KDQp3ZSBuZWVkIHRvIGNoYW5nZSB0aGUgcmFuayBpbnRvIGZhY3RvciBub3QgImludGVnZXIiDQoNCmBgYHtyfQ0KIyBjaGVjayBkYXRhIHZhbHVlIA0KY2xhc3MoZGF0YSRyYW5rKQ0KYGBgDQoNCg0KYGBge3J9DQpkYXRhJHJhbmsgPC0gYXMuZmFjdG9yKGRhdGEkcmFuaykNCmBgYA0KDQp3ZSBjYW4gdXNlIHN0cigpIHRvIGNoZWNrIGRhdGEgdHlwZSANCg0KYGBge3J9DQpzdHIoZGF0YSRyYW5rKQ0KYGBgDQoNCiMgTWFraW5nIG1vZGVsIFRyYWluIGFuZCBUZXN0DQoNCg0KIyMgU3BsaXR0aW5nIGRhdGEgODAvMjANCg0KYGBge3J9DQojIHBhcnRpdGlvbiBkYXRhDQpzZXQuc2VlZCgxMjMpICMgc2FtZSBvYnNlcnZhdGlvbiANCmluZCA8LSBzYW1wbGUoMiwgbnJvdyhkYXRhKSwgcmVwbGFjZSA9IFQsIHByb2IgPSBjKDAuOCwgMC4yKSkNCnRyYWluIDwtIGRhdGFbaW5kPT0xLF0NCnRlc3QgPC0gZGF0YVtpbmQ9PTIsXQ0KYGBgDQoNCiMjIHRyYWluIG1vZGVsDQoNCmBgYHtyfQ0KdHJhaW5tIDwtIHNwYXJzZS5tb2RlbC5tYXRyaXgoYWRtaXR+LiAtMSwgZGF0YSA9IHRyYWluKQ0KaGVhZCh0cmFpbm0pDQp0cmFpbl9sYWJlbCA8LSB0cmFpblssImFkbWl0Il0NCnRyYWluX21hdHJpeCA8LSB4Z2IuRE1hdHJpeChkYXRhID0gYXMubWF0cml4KHRyYWlubSksIGxhYmVsID0gdHJhaW5fbGFiZWwpDQpgYGANCg0KIyMgdGVzdCBtb2RlbA0KDQpgYGB7cn0NCnRlc3RtIDwtIHNwYXJzZS5tb2RlbC5tYXRyaXgoYWRtaXR+Li0xLCBkYXRhID0gdGVzdCkNCnRlc3RfbGFiZWwgPC0gdGVzdFssImFkbWl0Il0NCnRlc3RfbWF0cml4IDwtIHhnYi5ETWF0cml4KGRhdGEgPSBhcy5tYXRyaXgodGVzdG0pLCBsYWJlbD0gdGVzdF9sYWJlbCkNCm5jIDwtIGxlbmd0aCh1bmlxdWUodHJhaW5fbGFiZWwpKSAjIDIgY2xhc3NlcyAwLzENCmBgYA0KDQoNCmBgYHtyfQ0KeGdiX3BhcmFtcyA8LSBsaXN0KCJvYmplY3RpdmUiID0gIm11bHRpOnNvZnRwcm9iIiwNCiAgICAgICAgICAgICAgICAgICAiZXZhbF9tZXRyaWMiID0gIm1sb2dsb3NzIiwNCiAgICAgICAgICAgICAgICAgICAibnVtX2NsYXNzIiA9IG5jKQ0KYGBgDQoNCg0KYGBge3J9DQp3YXRjaGxpc3QgPC0gbGlzdCh0cmFpbiA9IHRyYWluX21hdHJpeCwgdGVzdCA9IHRlc3RfbWF0cml4KQ0KYGBgDQoNCg0KYGBge3J9DQojIGVYdHJlbWUgR3JhZGlhbiBCb29zdGluZyBNb2RlbA0KYnN0X21vZGVsIDwtIHhnYi50cmFpbihwYXJhbXMgPSB4Z2JfcGFyYW1zLA0KICAgICAgICAgICAgICAgICAgICAgICBkYXRhID0gdHJhaW5fbWF0cml4LA0KICAgICAgICAgICAgICAgICAgICAgICBucm91bmRzID0gMTA3LA0KICAgICAgICAgICAgICAgICAgICAgICB3YXRjaGxpc3QgPSB3YXRjaGxpc3QsDQogICAgICAgICAgICAgICAgICAgICAgIGV0YSA9IDAuMDAxLCAjIG1vcmUgcm9idXN0IHRvIG92ZXJmaXR0aW5nDQogICAgICAgICAgICAgICAgICAgICAgIG1heC5kZXB0aCA9IDMsDQogICAgICAgICAgICAgICAgICAgICAgIGdhbW1hID0gMCwNCiAgICAgICAgICAgICAgICAgICAgICAgc3Vic2FtcGxlID0gMSwNCiAgICAgICAgICAgICAgICAgICAgICAgY29sc2FtcGxlX2J5dHJlZSA9IDEsDQogICAgICAgICAgICAgICAgICAgICAgIG1pc3NpbmcgPSBOQSwNCiAgICAgICAgICAgICAgICAgICAgICAgc2VlZCA9IDMzMykNCmBgYA0KDQoNCmBgYHtyfQ0KIyBUcmFpbmluZyBhbmQgdGVzdCBlcnJvciBwbG90DQplIDwtIGRhdGEuZnJhbWUoYnN0X21vZGVsJGV2YWx1YXRpb25fbG9nKQ0KcGxvdChlJGl0ZXIsIGUkdHJhaW5fbWxvZ2xvc3MsIGNvbCA9ICdibHVlJykNCmxpbmVzKGUkaXRlcixlJHRlc3RfbWxvZ2xvc3MsIGNvbCA9ICdyZWQnKQ0KbWluKGUkdGVzdF9tbG9nbG9zcykNCiMgbm90IHRoZSBiZXN0IG1vZGVsIGJlY2F1c2Ugb2YgdGhlIHJlZCBsaW5lDQpgYGANCg0KDQpgYGB7cn0NCiMgZVtlJHRlc3RfbWxvZ2xvc3MgPT0gMC42MDE0NzI4LF0NCmBgYA0KDQoNCmBgYHtyfQ0KaW1wIDwtIHhnYi5pbXBvcnRhbmNlKGNvbG5hbWVzKHRyYWluX21hdHJpeCksIG1vZGVsID0gYnN0X21vZGVsKQ0KcHJpbnQoaW1wKQ0KeGdiLnBsb3QuaW1wb3J0YW5jZShpbXApDQpgYGANCg0KDQpgYGB7cn0NCiMgUHJlZGljdGlvbiBhbmQgY29uZnVzaW4gbWF0cml4IC0gdGVzdCBkYXRhDQpwIDwtIHByZWRpY3QoYnN0X21vZGVsLCBuZXdkYXRhID0gdGVzdF9tYXRyaXgpDQpgYGANCg0KYGBge3J9DQpwcmVkIDwtIG1hdHJpeChwLCBucm93ID0gbmMsIG5jb2wgPSBsZW5ndGgocCkvbmMpICU+JSANCiAgdCgpICU+JSANCiAgZGF0YS5mcmFtZSgpICU+JSANCiAgbXV0YXRlKGxhYmVsID0gdGVzdF9sYWJlbCwgbWF4X3Byb2IgPSBtYXguY29sKC4sImxhc3QiKS0xKQ0KaGVhZChwcmVkKQ0KYGBgDQoNCiMgQ29uZnVzc2lvbiBNYXRyaXggUmVzdWx0ICh0ZXN0KSANCjU5IG91dCBvZiA3NSBwcmVkaWN0aW9uIGlzIGNvcnJlY3QgZnJvbSBkYXRhIHRlc3QgNDcrMw0KNzglIGFjY3VyYXRlDQoNCmBgYHtyfQ0KdGFibGUoUHJlZGljdGlvbiA9IHByZWQkbWF4X3Byb2IsIEFjdHVhbCA9IHByZWQkbGFiZWwpDQo1OS83NQ0KYGBgDQoNCg0KYGBge3J9DQpgYGANCg0KDQpgYGB7cn0NCmBgYA0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQo=