Load Dependencies
Rows: 200
Columns: 6
$ Age [3m[38;5;246m<int>[39m[23m 23, 47, 47, 28, 61, 22, 49, 41, 60, 43, 47, 34, 43, 74, 50, 16, 69, 43, 23, 32, 57, 63, 47, 48, 33, 28, 31, 49, ~
$ Sex [3m[38;5;246m<fct>[39m[23m F, M, M, F, F, F, F, M, M, M, F, F, M, F, F, F, M, M, M, F, M, M, M, F, F, F, M, F, F, M, F, M, M, F, M, M, M, M~
$ BP [3m[38;5;246m<fct>[39m[23m HIGH, LOW, LOW, NORMAL, LOW, NORMAL, NORMAL, LOW, NORMAL, LOW, LOW, HIGH, LOW, LOW, NORMAL, HIGH, LOW, HIGH, LOW~
$ Cholesterol [3m[38;5;246m<fct>[39m[23m HIGH, HIGH, HIGH, HIGH, HIGH, HIGH, HIGH, HIGH, HIGH, NORMAL, HIGH, NORMAL, HIGH, HIGH, HIGH, NORMAL, NORMAL, HI~
$ Na_to_K [3m[38;5;246m<dbl>[39m[23m 25.355, 13.093, 10.114, 7.798, 18.043, 8.607, 16.275, 11.037, 15.171, 19.368, 11.767, 19.199, 15.376, 20.942, 12~
$ Drug [3m[38;5;246m<fct>[39m[23m DrugY, drugC, drugC, drugX, DrugY, drugX, DrugY, drugC, DrugY, DrugY, drugC, DrugY, DrugY, DrugY, drugX, DrugY, ~
Data Inspection and Cleaning
all(is.na(drug_df)) ## No missing value
[1] FALSE
null_vars <- (sapply(drug_df, function(x) sum(is.na(x))))
t(data.frame(null_vars))
Age Sex BP Cholesterol Na_to_K Drug
null_vars 0 0 0 0 0 0
## strip the "Drug" Prefix
df <- drug_df %>%
mutate(Sex = if_else(Sex == "M", 1, 0)) %>%
mutate(BP = if_else(BP == "LOW", 0,
if_else(BP == "NORMAL", 1, 2))) %>%
mutate(Cholesterol = if_else(Cholesterol == "LOW", 0,
if_else(Cholesterol == "NORMAL", 1, 2))) %>%
transform(Drug = str_replace(Drug,"Drug","")) %>%
transform(Drug = str_replace(Drug,"drug","")) %>%
mutate(y = case_when(
Drug == 'A' ~ 1,
Drug == 'B' ~ 2,
Drug == 'C' ~ 3,
Drug == 'X' ~ 4,
TRUE ~ 5
)) %>%
select(-Drug)
Exploratory Data Analysis
Model Fitting
## Split Data
## Train-Test
n_split <- round(0.8 * nrow(df))
train_indices <- sample(1:nrow(df), n_split)
train_set <- df[train_indices, ]
test_set <- df[-train_indices, ]
# train_y <- train_set$y
# train_set <- train_set %>%
# select(-y)
## To-Matrix for XGBoost
trainMatrix <- train_set %>% as.matrix
testMatrix <- test_set %>% as.matrix
library(xgboost)
numberOfClasses <- max(train_set$y) + 1
param <- list("objective" = "multi:softprob",
"eval_metric" = "mlogloss",
"num_class" = numberOfClasses)
cv.nround <- 5
cv.nfold <- 3
bst.cv = xgb.cv(param=param, data = trainMatrix, label = train_set$y,
nfold = cv.nfold, nrounds = cv.nround)
[1] train-mlogloss:1.077649+0.000739 test-mlogloss:1.089493+0.009163
[2] train-mlogloss:0.745274+0.000545 test-mlogloss:0.763276+0.016646
[3] train-mlogloss:0.537023+0.000426 test-mlogloss:0.559466+0.022398
[4] train-mlogloss:0.395605+0.000347 test-mlogloss:0.421520+0.027038
[5] train-mlogloss:0.295891+0.000292 test-mlogloss:0.324650+0.030933
nround = 50
bst = xgboost(param=param, data = trainMatrix, label = train_set$y, nrounds=nround)
[1] train-mlogloss:1.052531
[2] train-mlogloss:0.722018
[3] train-mlogloss:0.516496
[4] train-mlogloss:0.377545
[5] train-mlogloss:0.279939
[6] train-mlogloss:0.209908
[7] train-mlogloss:0.158994
[8] train-mlogloss:0.121637
[9] train-mlogloss:0.094026
[10] train-mlogloss:0.073677
[11] train-mlogloss:0.058536
[12] train-mlogloss:0.047498
[13] train-mlogloss:0.039821
[14] train-mlogloss:0.033347
[15] train-mlogloss:0.029046
[16] train-mlogloss:0.025628
[17] train-mlogloss:0.022862
[18] train-mlogloss:0.020906
[19] train-mlogloss:0.019785
[20] train-mlogloss:0.018986
[21] train-mlogloss:0.018379
[22] train-mlogloss:0.017860
[23] train-mlogloss:0.017417
[24] train-mlogloss:0.017205
[25] train-mlogloss:0.017023
[26] train-mlogloss:0.016865
[27] train-mlogloss:0.016727
[28] train-mlogloss:0.016606
[29] train-mlogloss:0.016498
[30] train-mlogloss:0.016403
[31] train-mlogloss:0.016317
[32] train-mlogloss:0.016239
[33] train-mlogloss:0.016167
[34] train-mlogloss:0.016102
[35] train-mlogloss:0.016043
[36] train-mlogloss:0.015988
[37] train-mlogloss:0.015937
[38] train-mlogloss:0.015891
[39] train-mlogloss:0.015848
[40] train-mlogloss:0.015807
[41] train-mlogloss:0.015770
[42] train-mlogloss:0.015734
[43] train-mlogloss:0.015701
[44] train-mlogloss:0.015670
[45] train-mlogloss:0.015641
[46] train-mlogloss:0.015613
[47] train-mlogloss:0.015587
[48] train-mlogloss:0.015562
[49] train-mlogloss:0.015538
[50] train-mlogloss:0.015516
Model Evaluation
model <- xgb.dump(bst, with.stats = T)
'with.stats' is deprecated.
Use 'with_stats' instead.
See help("Deprecated") and help("xgboost-deprecated").
model[1:10]
[1] "booster[0]"
[2] "0:leaf=-0.176039129,cover=44.4444466"
[3] "booster[1]"
[4] "0:[f5<1.5] yes=1,no=2,missing=1,gain=52.3891869,cover=44.4444466"
[5] "1:leaf=0.756637156,cover=5.27777815"
[6] "2:leaf=-0.175518662,cover=39.1666679"
[7] "booster[2]"
[8] "0:[f5<2.5] yes=1,no=2,missing=1,gain=12.6746588,cover=44.4444466"
[9] "1:[f0<51.5] yes=3,no=4,missing=3,gain=19.8158855,cover=8.61111164"
[10] "3:leaf=-0.151327431,cover=5.27777815"
# Get the feature real names
names <- dimnames(trainMatrix)[[2]]
# Compute feature importance matrix
importance_matrix <- xgb.importance(names, model = bst)
# Nice graph
xgb.plot.importance(importance_matrix[1:10,])
xgb.plot.tree(feature_names = names, model = bst, n_first_tree = 2)
'n_first_tree' is deprecated.
Use 'trees' instead.
See help("Deprecated") and help("xgboost-deprecated").Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
LS0tDQp0aXRsZTogIkRydWcgQ2xhc3NpZmljYXRpb24gdXNpbmcgWEdCb29zdCINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCg0KIyMgTG9hZCBEZXBlbmRlbmNpZXMNCg0KYGBge3IgZWNobyA9IEZBTFNFLCBtZXNzYWdlPUZBTFNFfQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpsaWJyYXJ5KGRhdGEudGFibGUpDQoNCg0KZHJ1Z19kZiA8LSByZWFkLmNzdigiQzovVXNlcnMvUEMvRG9jdW1lbnRzL1JfNERTL2RydWcyMDAuY3N2IikNCmdsaW1wc2UoZHJ1Z19kZikNCmBgYA0KDQoNCiMjIERhdGEgSW5zcGVjdGlvbiBhbmQgQ2xlYW5pbmcNCg0KYGBge3J9DQphbGwoaXMubmEoZHJ1Z19kZikpICMjIE5vIG1pc3NpbmcgdmFsdWUNCg0KbnVsbF92YXJzIDwtIChzYXBwbHkoZHJ1Z19kZiwgZnVuY3Rpb24oeCkgc3VtKGlzLm5hKHgpKSkpDQp0KGRhdGEuZnJhbWUobnVsbF92YXJzKSkNCmBgYA0KDQpgYGB7cn0NCiMjIHN0cmlwIHRoZSAiRHJ1ZyIgUHJlZml4DQpkZiA8LSBkcnVnX2RmICU+JSANCiAgbXV0YXRlKFNleCA9IGlmX2Vsc2UoU2V4ID09ICJNIiwgMSwgMCkpICU+JSANCiAgbXV0YXRlKEJQID0gaWZfZWxzZShCUCA9PSAiTE9XIiwgMCwgDQogICAgICAgICAgICAgICAgICAgICAgaWZfZWxzZShCUCA9PSAiTk9STUFMIiwgMSwgMikpKSAlPiUgDQogIG11dGF0ZShDaG9sZXN0ZXJvbCA9IGlmX2Vsc2UoQ2hvbGVzdGVyb2wgPT0gIkxPVyIsIDAsIA0KICAgICAgICAgICAgICAgICAgICAgIGlmX2Vsc2UoQ2hvbGVzdGVyb2wgPT0gIk5PUk1BTCIsIDEsIDIpKSkgJT4lIA0KICB0cmFuc2Zvcm0oRHJ1ZyA9IHN0cl9yZXBsYWNlKERydWcsIkRydWciLCIiKSkgJT4lIA0KICB0cmFuc2Zvcm0oRHJ1ZyA9IHN0cl9yZXBsYWNlKERydWcsImRydWciLCIiKSkgJT4lIA0KICBtdXRhdGUoeSA9IGNhc2Vfd2hlbigNCiAgICBEcnVnID09ICdBJyB+IDEsDQogICAgRHJ1ZyA9PSAnQicgfiAyLA0KICAgIERydWcgPT0gJ0MnIH4gMywNCiAgICBEcnVnID09ICdYJyB+IDQsDQogICAgVFJVRSB+IDUNCiAgKSkgJT4lIA0KICBzZWxlY3QoLURydWcpDQpgYGANCg0KDQojIyBFeHBsb3JhdG9yeSBEYXRhIEFuYWx5c2lzDQoNCg0KIyMgTW9kZWwgRml0dGluZw0KDQpgYGB7cn0NCiMjIFNwbGl0IERhdGENCiMjIFRyYWluLVRlc3QNCm5fc3BsaXQgPC0gcm91bmQoMC44ICogbnJvdyhkZikpDQp0cmFpbl9pbmRpY2VzIDwtIHNhbXBsZSgxOm5yb3coZGYpLCBuX3NwbGl0KQ0KdHJhaW5fc2V0IDwtIGRmW3RyYWluX2luZGljZXMsIF0NCnRlc3Rfc2V0IDwtIGRmWy10cmFpbl9pbmRpY2VzLCBdDQoNCiMgdHJhaW5feSA8LSB0cmFpbl9zZXQkeQ0KIyB0cmFpbl9zZXQgPC0gdHJhaW5fc2V0ICU+JSANCiMgICBzZWxlY3QoLXkpDQoNCiMjIFRvLU1hdHJpeCBmb3IgWEdCb29zdA0KdHJhaW5NYXRyaXggPC0gdHJhaW5fc2V0ICU+JSBhcy5tYXRyaXgNCnRlc3RNYXRyaXggPC0gdGVzdF9zZXQgJT4lIGFzLm1hdHJpeA0KYGBgDQoNCg0KYGBge3J9DQpsaWJyYXJ5KHhnYm9vc3QpDQoNCm51bWJlck9mQ2xhc3NlcyA8LSBtYXgodHJhaW5fc2V0JHkpICsgMQ0KDQpwYXJhbSA8LSBsaXN0KCJvYmplY3RpdmUiID0gIm11bHRpOnNvZnRwcm9iIiwNCiAgICAgICAgICAgICAgImV2YWxfbWV0cmljIiA9ICJtbG9nbG9zcyIsDQogICAgICAgICAgICAgICJudW1fY2xhc3MiID0gbnVtYmVyT2ZDbGFzc2VzKQ0KDQpjdi5ucm91bmQgPC0gNQ0KY3YubmZvbGQgPC0gMw0KDQpic3QuY3YgPSB4Z2IuY3YocGFyYW09cGFyYW0sIGRhdGEgPSB0cmFpbk1hdHJpeCwgbGFiZWwgPSB0cmFpbl9zZXQkeSwgDQogICAgICAgICAgICAgICAgbmZvbGQgPSBjdi5uZm9sZCwgbnJvdW5kcyA9IGN2Lm5yb3VuZCkNCmBgYA0KDQpgYGB7ciBNb2RlbCBUcmFpbmluZ30NCm5yb3VuZCA9IDUwDQpic3QgPSB4Z2Jvb3N0KHBhcmFtPXBhcmFtLCBkYXRhID0gdHJhaW5NYXRyaXgsIGxhYmVsID0gdHJhaW5fc2V0JHksIG5yb3VuZHM9bnJvdW5kKQ0KYGBgDQoNCg0KIyMgTW9kZWwgRXZhbHVhdGlvbg0KYGBge3J9DQptb2RlbCA8LSB4Z2IuZHVtcChic3QsIHdpdGguc3RhdHMgPSBUKQ0KbW9kZWxbMToxMF0NCmBgYA0KDQoNCg0KYGBge3IgaW1wb3J0YW5jZUZlYXR1cmUsIGZpZy5hbGlnbj0nY2VudGVyJywgZmlnLmhlaWdodD01LCBmaWcud2lkdGg9MTB9DQojIEdldCB0aGUgZmVhdHVyZSByZWFsIG5hbWVzDQpuYW1lcyA8LSBkaW1uYW1lcyh0cmFpbk1hdHJpeClbWzJdXQ0KDQojIENvbXB1dGUgZmVhdHVyZSBpbXBvcnRhbmNlIG1hdHJpeA0KaW1wb3J0YW5jZV9tYXRyaXggPC0geGdiLmltcG9ydGFuY2UobmFtZXMsIG1vZGVsID0gYnN0KQ0KDQojIE5pY2UgZ3JhcGgNCnhnYi5wbG90LmltcG9ydGFuY2UoaW1wb3J0YW5jZV9tYXRyaXhbMToxMCxdKQ0KYGBgDQoNCg0KYGBge3IgdHJlZUdyYXBoLCBkcGk9MTUwMCwgZmlnLmFsaWduPSdsZWZ0J30NCnhnYi5wbG90LnRyZWUoZmVhdHVyZV9uYW1lcyA9IG5hbWVzLCBtb2RlbCA9IGJzdCwgbl9maXJzdF90cmVlID0gMikNCmBgYA==