#Packages
library(xgboost)
## Warning: package 'xgboost' was built under R version 3.5.3
library(magrittr)
## Warning: package 'magrittr' was built under R version 3.5.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:xgboost':
##
## slice
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(Matrix)
data <- read.csv('F:/Machine Learning/Data Science/Machine Learning/Gradient Boosting Machine/binary.csv')
str(data)
## 'data.frame': 400 obs. of 4 variables:
## $ admit: int 0 1 1 1 0 1 1 0 1 0 ...
## $ gre : int 380 660 800 640 520 760 560 400 540 700 ...
## $ gpa : num 3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
## $ rank : int 3 3 1 4 4 2 1 2 3 2 ...
#convert rank to a factor variable
data$rank <- as.factor(data$rank)
str(data)
## 'data.frame': 400 obs. of 4 variables:
## $ admit: int 0 1 1 1 0 1 1 0 1 0 ...
## $ gre : int 380 660 800 640 520 760 560 400 540 700 ...
## $ gpa : num 3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
## $ rank : Factor w/ 4 levels "1","2","3","4": 3 3 1 4 4 2 1 2 3 2 ...
#data partition
set.seed(1234)
ind <- sample(2, nrow(data), replace = T, prob = c(.8,.2))
train <- data[ind==1,]
test <- data[ind==2,]
dim(train)
## [1] 325 4
dim(test)
## [1] 75 4
#Create matrix and one-hot encoding for factor variables
trainm <- sparse.model.matrix(admit ~ .-1, data=train)
head(trainm)
## 6 x 6 sparse Matrix of class "dgCMatrix"
## gre gpa rank1 rank2 rank3 rank4
## 1 380 3.61 . . 1 .
## 2 660 3.67 . . 1 .
## 3 800 4.00 1 . . .
## 4 640 3.19 . . . 1
## 6 760 3.00 . 1 . .
## 7 560 2.98 1 . . .
train_label <- train[ , "admit"]
#now using trainm and train_label, we can create train matrix
train_matrix <- xgb.DMatrix(data = as.matrix(trainm), label=train_label)
#converting test data into matrix
testm <- sparse.model.matrix(admit ~ .-1, data=test)
test_label <- test[ ,"admit"]
test_matrix <- xgb.DMatrix(data=as.matrix(testm), label=test_label)
#Parameters
nc <- length(unique(train_label))
xgb_params <- list("objective" = "multi:softprob",
"eval_metric"="mlogloss",
"num_class" = nc)
watchlist <- list(train = train_matrix, test= test_matrix)
#extreme gradient boosting on train
best.model <- xgb.train(params = xgb_params,
data = train_matrix,
nrounds = 100,
watchlist = watchlist)
## [1] train-mlogloss:0.594324 test-mlogloss:0.651085
## [2] train-mlogloss:0.534790 test-mlogloss:0.612848
## [3] train-mlogloss:0.483394 test-mlogloss:0.595096
## [4] train-mlogloss:0.454567 test-mlogloss:0.597930
## [5] train-mlogloss:0.423043 test-mlogloss:0.599238
## [6] train-mlogloss:0.385208 test-mlogloss:0.595708
## [7] train-mlogloss:0.372651 test-mlogloss:0.614298
## [8] train-mlogloss:0.355396 test-mlogloss:0.612562
## [9] train-mlogloss:0.345466 test-mlogloss:0.632218
## [10] train-mlogloss:0.337584 test-mlogloss:0.649025
## [11] train-mlogloss:0.321141 test-mlogloss:0.649074
## [12] train-mlogloss:0.312773 test-mlogloss:0.664441
## [13] train-mlogloss:0.309723 test-mlogloss:0.677517
## [14] train-mlogloss:0.296634 test-mlogloss:0.677277
## [15] train-mlogloss:0.284527 test-mlogloss:0.689391
## [16] train-mlogloss:0.277117 test-mlogloss:0.684779
## [17] train-mlogloss:0.270126 test-mlogloss:0.688089
## [18] train-mlogloss:0.265546 test-mlogloss:0.701466
## [19] train-mlogloss:0.260600 test-mlogloss:0.700825
## [20] train-mlogloss:0.256453 test-mlogloss:0.717978
## [21] train-mlogloss:0.250929 test-mlogloss:0.718609
## [22] train-mlogloss:0.242707 test-mlogloss:0.724423
## [23] train-mlogloss:0.238471 test-mlogloss:0.731681
## [24] train-mlogloss:0.233459 test-mlogloss:0.730155
## [25] train-mlogloss:0.227987 test-mlogloss:0.728484
## [26] train-mlogloss:0.219835 test-mlogloss:0.726116
## [27] train-mlogloss:0.213605 test-mlogloss:0.732111
## [28] train-mlogloss:0.210453 test-mlogloss:0.736513
## [29] train-mlogloss:0.205479 test-mlogloss:0.741593
## [30] train-mlogloss:0.199184 test-mlogloss:0.733172
## [31] train-mlogloss:0.191293 test-mlogloss:0.731860
## [32] train-mlogloss:0.188662 test-mlogloss:0.733568
## [33] train-mlogloss:0.184796 test-mlogloss:0.736170
## [34] train-mlogloss:0.179961 test-mlogloss:0.734115
## [35] train-mlogloss:0.177305 test-mlogloss:0.725511
## [36] train-mlogloss:0.175682 test-mlogloss:0.735683
## [37] train-mlogloss:0.170574 test-mlogloss:0.742145
## [38] train-mlogloss:0.165731 test-mlogloss:0.737996
## [39] train-mlogloss:0.162826 test-mlogloss:0.738484
## [40] train-mlogloss:0.157599 test-mlogloss:0.737389
## [41] train-mlogloss:0.155526 test-mlogloss:0.737402
## [42] train-mlogloss:0.151330 test-mlogloss:0.735363
## [43] train-mlogloss:0.149537 test-mlogloss:0.747170
## [44] train-mlogloss:0.146031 test-mlogloss:0.743095
## [45] train-mlogloss:0.144352 test-mlogloss:0.746088
## [46] train-mlogloss:0.143068 test-mlogloss:0.747172
## [47] train-mlogloss:0.142036 test-mlogloss:0.744163
## [48] train-mlogloss:0.140835 test-mlogloss:0.750826
## [49] train-mlogloss:0.139641 test-mlogloss:0.752630
## [50] train-mlogloss:0.136947 test-mlogloss:0.757487
## [51] train-mlogloss:0.135305 test-mlogloss:0.760034
## [52] train-mlogloss:0.133389 test-mlogloss:0.757275
## [53] train-mlogloss:0.132064 test-mlogloss:0.754275
## [54] train-mlogloss:0.129843 test-mlogloss:0.756860
## [55] train-mlogloss:0.128410 test-mlogloss:0.757140
## [56] train-mlogloss:0.127972 test-mlogloss:0.761527
## [57] train-mlogloss:0.126455 test-mlogloss:0.762565
## [58] train-mlogloss:0.124876 test-mlogloss:0.765913
## [59] train-mlogloss:0.124022 test-mlogloss:0.771464
## [60] train-mlogloss:0.123537 test-mlogloss:0.778794
## [61] train-mlogloss:0.122750 test-mlogloss:0.775988
## [62] train-mlogloss:0.120722 test-mlogloss:0.777184
## [63] train-mlogloss:0.119031 test-mlogloss:0.773307
## [64] train-mlogloss:0.118473 test-mlogloss:0.775605
## [65] train-mlogloss:0.116922 test-mlogloss:0.772655
## [66] train-mlogloss:0.114859 test-mlogloss:0.774613
## [67] train-mlogloss:0.114263 test-mlogloss:0.774168
## [68] train-mlogloss:0.113307 test-mlogloss:0.771853
## [69] train-mlogloss:0.111874 test-mlogloss:0.776193
## [70] train-mlogloss:0.111036 test-mlogloss:0.777659
## [71] train-mlogloss:0.110099 test-mlogloss:0.771273
## [72] train-mlogloss:0.108421 test-mlogloss:0.771452
## [73] train-mlogloss:0.107680 test-mlogloss:0.774453
## [74] train-mlogloss:0.107223 test-mlogloss:0.774905
## [75] train-mlogloss:0.105983 test-mlogloss:0.780966
## [76] train-mlogloss:0.105145 test-mlogloss:0.786020
## [77] train-mlogloss:0.104694 test-mlogloss:0.784021
## [78] train-mlogloss:0.103925 test-mlogloss:0.786791
## [79] train-mlogloss:0.102773 test-mlogloss:0.786294
## [80] train-mlogloss:0.102311 test-mlogloss:0.791616
## [81] train-mlogloss:0.099847 test-mlogloss:0.797639
## [82] train-mlogloss:0.098553 test-mlogloss:0.796077
## [83] train-mlogloss:0.097660 test-mlogloss:0.793205
## [84] train-mlogloss:0.096524 test-mlogloss:0.800134
## [85] train-mlogloss:0.095765 test-mlogloss:0.802955
## [86] train-mlogloss:0.094912 test-mlogloss:0.808380
## [87] train-mlogloss:0.094318 test-mlogloss:0.807422
## [88] train-mlogloss:0.093423 test-mlogloss:0.808523
## [89] train-mlogloss:0.092886 test-mlogloss:0.808366
## [90] train-mlogloss:0.092598 test-mlogloss:0.814068
## [91] train-mlogloss:0.091697 test-mlogloss:0.813045
## [92] train-mlogloss:0.090543 test-mlogloss:0.815959
## [93] train-mlogloss:0.089979 test-mlogloss:0.814781
## [94] train-mlogloss:0.088689 test-mlogloss:0.822758
## [95] train-mlogloss:0.087909 test-mlogloss:0.826897
## [96] train-mlogloss:0.087498 test-mlogloss:0.830181
## [97] train-mlogloss:0.085682 test-mlogloss:0.832040
## [98] train-mlogloss:0.084519 test-mlogloss:0.832928
## [99] train-mlogloss:0.083779 test-mlogloss:0.833188
## [100] train-mlogloss:0.082495 test-mlogloss:0.835784
This model runs 100 iterations, and after every iteration it prints the error on the train data and the test data. At the 1st iteration the train error in terms of mlogloss is 0.594 and the test error is 0.651, so the test error is already higher than the train error.
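Instead of reading the whole log, xgb.train can also stop automatically once the test error has not improved for a given number of rounds. The sketch below shows that option (early_stopping_rounds); it is not used in the rest of this post.
#sketch: stop once test-mlogloss has not improved for 10 consecutive rounds
stop.model <- xgb.train(params = xgb_params,
                        data = train_matrix,
                        nrounds = 100,
                        watchlist = watchlist,
                        early_stopping_rounds = 10)
stop.model$best_iteration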
best.model
## ##### xgb.Booster
## raw: 275.6 Kb
## call:
## xgb.train(params = xgb_params, data = train_matrix, nrounds = 100,
## watchlist = watchlist)
## params (as set within xgb.train):
## objective = "multi:softprob", eval_metric = "mlogloss", num_class = "2", silent = "1"
## xgb.attributes:
## niter
## callbacks:
## cb.print.evaluation(period = print_every_n)
## cb.evaluation.log()
## # of features: 6
## niter: 100
## nfeatures : 6
## evaluation_log:
## iter train_mlogloss test_mlogloss
## 1 0.594324 0.651085
## 2 0.534790 0.612848
## ---
## 99 0.083779 0.833188
## 100 0.082495 0.835784
#train data error plot
e <- data.frame(best.model$evaluation_log)
plot(e$iter, e$train_mlogloss, col = 'blue')
This plot is based on the train error: the error is quite high at the beginning and comes down as the iterations increase.
#add test data error
e <- data.frame(best.model$evaluation_log)
plot(e$iter, e$train_mlogloss, col = 'blue')
lines(e$iter, e$test_mlogloss, col = "red")
In this plot the blue points show the train error and the red line shows the test error. The test error initially comes down, but very quickly it starts increasing again, which is not good and needs a lot of improvement.
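One way to make the two curves easier to tell apart is to draw both as lines and add a legend (a small sketch using base graphics):
plot(e$iter, e$train_mlogloss, col = 'blue', type = 'l',
     xlab = 'iteration', ylab = 'mlogloss')
lines(e$iter, e$test_mlogloss, col = 'red')
legend('topright', legend = c('train', 'test'), col = c('blue', 'red'), lty = 1)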
#find minimum error from test data
min(e$test_mlogloss)
## [1] 0.595096
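The iteration at which this minimum occurs can be looked up directly; the same idea is used with a logical filter further below.
#row of the evaluation log with the lowest test error
e[which.min(e$test_mlogloss), ]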
#more parameters
best.model <- xgb.train(params = xgb_params,
data = train_matrix,
nrounds = 100,
watchlist = watchlist,
eta=0.05)
## [1] train-mlogloss:0.674595 test-mlogloss:0.684052
## [2] train-mlogloss:0.658466 test-mlogloss:0.676982
## [3] train-mlogloss:0.642685 test-mlogloss:0.667532
## [4] train-mlogloss:0.628284 test-mlogloss:0.660133
## [5] train-mlogloss:0.615050 test-mlogloss:0.653341
## [6] train-mlogloss:0.604056 test-mlogloss:0.645568
## [7] train-mlogloss:0.592582 test-mlogloss:0.640064
## [8] train-mlogloss:0.582170 test-mlogloss:0.637070
## [9] train-mlogloss:0.571289 test-mlogloss:0.634656
## [10] train-mlogloss:0.561741 test-mlogloss:0.630252
## [11] train-mlogloss:0.551731 test-mlogloss:0.628331
## [12] train-mlogloss:0.542504 test-mlogloss:0.622866
## [13] train-mlogloss:0.533755 test-mlogloss:0.618194
## [14] train-mlogloss:0.525246 test-mlogloss:0.617723
## [15] train-mlogloss:0.517447 test-mlogloss:0.613506
## [16] train-mlogloss:0.509825 test-mlogloss:0.613574
## [17] train-mlogloss:0.502859 test-mlogloss:0.609476
## [18] train-mlogloss:0.496269 test-mlogloss:0.606218
## [19] train-mlogloss:0.489355 test-mlogloss:0.606840
## [20] train-mlogloss:0.483546 test-mlogloss:0.604335
## [21] train-mlogloss:0.477940 test-mlogloss:0.601589
## [22] train-mlogloss:0.472660 test-mlogloss:0.603554
## [23] train-mlogloss:0.467709 test-mlogloss:0.601737
## [24] train-mlogloss:0.462791 test-mlogloss:0.603217
## [25] train-mlogloss:0.457759 test-mlogloss:0.602431
## [26] train-mlogloss:0.452025 test-mlogloss:0.601008
## [27] train-mlogloss:0.447286 test-mlogloss:0.599531
## [28] train-mlogloss:0.442713 test-mlogloss:0.601384
## [29] train-mlogloss:0.436665 test-mlogloss:0.601275
## [30] train-mlogloss:0.430421 test-mlogloss:0.600096
## [31] train-mlogloss:0.424855 test-mlogloss:0.601189
## [32] train-mlogloss:0.419260 test-mlogloss:0.600852
## [33] train-mlogloss:0.414203 test-mlogloss:0.599578
## [34] train-mlogloss:0.409391 test-mlogloss:0.598732
## [35] train-mlogloss:0.404360 test-mlogloss:0.597642
## [36] train-mlogloss:0.401515 test-mlogloss:0.600937
## [37] train-mlogloss:0.397695 test-mlogloss:0.601390
## [38] train-mlogloss:0.394715 test-mlogloss:0.605061
## [39] train-mlogloss:0.391696 test-mlogloss:0.607762
## [40] train-mlogloss:0.388934 test-mlogloss:0.610551
## [41] train-mlogloss:0.386231 test-mlogloss:0.613116
## [42] train-mlogloss:0.383636 test-mlogloss:0.615719
## [43] train-mlogloss:0.381193 test-mlogloss:0.618251
## [44] train-mlogloss:0.378891 test-mlogloss:0.621451
## [45] train-mlogloss:0.376704 test-mlogloss:0.624204
## [46] train-mlogloss:0.374638 test-mlogloss:0.627592
## [47] train-mlogloss:0.372433 test-mlogloss:0.631762
## [48] train-mlogloss:0.370552 test-mlogloss:0.634909
## [49] train-mlogloss:0.368771 test-mlogloss:0.638242
## [50] train-mlogloss:0.366839 test-mlogloss:0.642697
## [51] train-mlogloss:0.365023 test-mlogloss:0.645836
## [52] train-mlogloss:0.363385 test-mlogloss:0.648524
## [53] train-mlogloss:0.361754 test-mlogloss:0.651874
## [54] train-mlogloss:0.360215 test-mlogloss:0.654807
## [55] train-mlogloss:0.358758 test-mlogloss:0.658177
## [56] train-mlogloss:0.357375 test-mlogloss:0.660911
## [57] train-mlogloss:0.356061 test-mlogloss:0.664054
## [58] train-mlogloss:0.355085 test-mlogloss:0.666163
## [59] train-mlogloss:0.353856 test-mlogloss:0.669077
## [60] train-mlogloss:0.352128 test-mlogloss:0.669121
## [61] train-mlogloss:0.350970 test-mlogloss:0.672129
## [62] train-mlogloss:0.350100 test-mlogloss:0.674397
## [63] train-mlogloss:0.347835 test-mlogloss:0.674108
## [64] train-mlogloss:0.346265 test-mlogloss:0.674245
## [65] train-mlogloss:0.345036 test-mlogloss:0.675887
## [66] train-mlogloss:0.343361 test-mlogloss:0.678195
## [67] train-mlogloss:0.342477 test-mlogloss:0.681462
## [68] train-mlogloss:0.339530 test-mlogloss:0.681365
## [69] train-mlogloss:0.338692 test-mlogloss:0.684718
## [70] train-mlogloss:0.335926 test-mlogloss:0.684679
## [71] train-mlogloss:0.333322 test-mlogloss:0.684691
## [72] train-mlogloss:0.332514 test-mlogloss:0.686307
## [73] train-mlogloss:0.330068 test-mlogloss:0.686369
## [74] train-mlogloss:0.327887 test-mlogloss:0.686098
## [75] train-mlogloss:0.327181 test-mlogloss:0.689232
## [76] train-mlogloss:0.325257 test-mlogloss:0.689029
## [77] train-mlogloss:0.324533 test-mlogloss:0.691260
## [78] train-mlogloss:0.322059 test-mlogloss:0.692792
## [79] train-mlogloss:0.321511 test-mlogloss:0.694793
## [80] train-mlogloss:0.320585 test-mlogloss:0.695442
## [81] train-mlogloss:0.318265 test-mlogloss:0.696007
## [82] train-mlogloss:0.316331 test-mlogloss:0.696236
## [83] train-mlogloss:0.314384 test-mlogloss:0.697028
## [84] train-mlogloss:0.313358 test-mlogloss:0.698705
## [85] train-mlogloss:0.311368 test-mlogloss:0.697788
## [86] train-mlogloss:0.309793 test-mlogloss:0.698183
## [87] train-mlogloss:0.307065 test-mlogloss:0.696647
## [88] train-mlogloss:0.305800 test-mlogloss:0.698444
## [89] train-mlogloss:0.304122 test-mlogloss:0.699178
## [90] train-mlogloss:0.302536 test-mlogloss:0.699942
## [91] train-mlogloss:0.300930 test-mlogloss:0.700201
## [92] train-mlogloss:0.299742 test-mlogloss:0.702003
## [93] train-mlogloss:0.298106 test-mlogloss:0.702526
## [94] train-mlogloss:0.296199 test-mlogloss:0.701763
## [95] train-mlogloss:0.295262 test-mlogloss:0.703287
## [96] train-mlogloss:0.293755 test-mlogloss:0.704557
## [97] train-mlogloss:0.291396 test-mlogloss:0.705007
## [98] train-mlogloss:0.289136 test-mlogloss:0.705150
## [99] train-mlogloss:0.287452 test-mlogloss:0.704563
## [100] train-mlogloss:0.285235 test-mlogloss:0.704321
#Error plot based on above optimization
e <- data.frame(best.model$evaluation_log)
plot(e$iter, e$train_mlogloss, col = 'blue')
lines(e$iter, e$test_mlogloss, col = "red")
Now we can see some improvement: with eta lowered to 0.05 the red test-error curve keeps decreasing for many more iterations before it starts rising, so the gap to the train error grows more slowly than in the previous plot.
#find minimum error iteration from test data
bestn <- e[e$test_mlogloss==min(e$test_mlogloss),]
best.model <- xgb.train(params = xgb_params,
data = train_matrix,
nrounds = bestn$iter,
watchlist = watchlist,
eta=0.05)
## [1] train-mlogloss:0.674595 test-mlogloss:0.684052
## [2] train-mlogloss:0.658466 test-mlogloss:0.676982
## [3] train-mlogloss:0.642685 test-mlogloss:0.667532
## [4] train-mlogloss:0.628284 test-mlogloss:0.660133
## [5] train-mlogloss:0.615050 test-mlogloss:0.653341
## [6] train-mlogloss:0.604056 test-mlogloss:0.645568
## [7] train-mlogloss:0.592582 test-mlogloss:0.640064
## [8] train-mlogloss:0.582170 test-mlogloss:0.637070
## [9] train-mlogloss:0.571289 test-mlogloss:0.634656
## [10] train-mlogloss:0.561741 test-mlogloss:0.630252
## [11] train-mlogloss:0.551731 test-mlogloss:0.628331
## [12] train-mlogloss:0.542504 test-mlogloss:0.622866
## [13] train-mlogloss:0.533755 test-mlogloss:0.618194
## [14] train-mlogloss:0.525246 test-mlogloss:0.617723
## [15] train-mlogloss:0.517447 test-mlogloss:0.613506
## [16] train-mlogloss:0.509825 test-mlogloss:0.613574
## [17] train-mlogloss:0.502859 test-mlogloss:0.609476
## [18] train-mlogloss:0.496269 test-mlogloss:0.606218
## [19] train-mlogloss:0.489355 test-mlogloss:0.606840
## [20] train-mlogloss:0.483546 test-mlogloss:0.604335
## [21] train-mlogloss:0.477940 test-mlogloss:0.601589
## [22] train-mlogloss:0.472660 test-mlogloss:0.603554
## [23] train-mlogloss:0.467709 test-mlogloss:0.601737
## [24] train-mlogloss:0.462791 test-mlogloss:0.603217
## [25] train-mlogloss:0.457759 test-mlogloss:0.602431
## [26] train-mlogloss:0.452025 test-mlogloss:0.601008
## [27] train-mlogloss:0.447286 test-mlogloss:0.599531
## [28] train-mlogloss:0.442713 test-mlogloss:0.601384
## [29] train-mlogloss:0.436665 test-mlogloss:0.601275
## [30] train-mlogloss:0.430421 test-mlogloss:0.600096
## [31] train-mlogloss:0.424855 test-mlogloss:0.601189
## [32] train-mlogloss:0.419260 test-mlogloss:0.600852
## [33] train-mlogloss:0.414203 test-mlogloss:0.599578
## [34] train-mlogloss:0.409391 test-mlogloss:0.598732
## [35] train-mlogloss:0.404360 test-mlogloss:0.597642
#Feature importance
imp <- xgb.importance(colnames(train_matrix), best.model)
print(imp)
## Feature Gain Cover Frequency
## 1: gpa 0.56073244 0.49328263 0.48390342
## 2: gre 0.28995177 0.31556724 0.36921529
## 3: rank1 0.09361724 0.09675462 0.05432596
## 4: rank2 0.02864025 0.07481324 0.04728370
## 5: rank4 0.02705830 0.01958227 0.04527163
xgb.plot.importance(imp)
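Gain measures how much each feature contributes to reducing the loss across all splits, Cover the relative number of observations affected by those splits, and Frequency how often the feature is used in a split. The single most important feature can be pulled out directly:
imp[which.max(imp$Gain), ]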
#prediction test data
p <- predict(best.model, newdata = test_matrix)
head(p)
## [1] 0.8964934 0.1035066 0.7355127 0.2644872 0.7821574 0.2178426
These values are the probabilities that a student is not admitted ('0') or admitted ('1'): predict() returns one flat vector with the two class probabilities for each student placed next to each other. The 1st probability is high, 0.8964934, which tells us the first applicant is predicted not to be admitted.
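The reshaping done with the pipe below can also be sketched in one line, just to show how the flat vector maps to one row per student:
#one row per student: first column P(admit = 0), second column P(admit = 1)
head(matrix(p, ncol = nc, byrow = TRUE))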
pred <- matrix(p, nrow = nc, ncol = length(p)/nc) %>%
t() %>%
data.frame() %>%
mutate(label = test_label, max_prob = max.col(.,"last")-1 )
head(pred)
## X1 X2 label max_prob
## 1 0.8964934 0.1035066 0 0
## 2 0.7355127 0.2644872 0 0
## 3 0.7821574 0.2178426 0 0
## 4 0.2949583 0.7050417 1 1
## 5 0.5168998 0.4831002 1 0
## 6 0.3385653 0.6614348 1 1
#create confusion matrix on test data
(tab <- table(Prediction = pred$max_prob, Actual=pred$label))
## Actual
## Prediction 0 1
## 0 43 18
## 1 7 7
#accuracy
sum(diag(tab))/sum(tab)
## [1] 0.6666667
#misclassification
1-sum(diag(tab))/sum(tab)
## [1] 0.3333333
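Because accuracy and misclassification are computed again further below, a small helper function (hypothetical, not part of the original script) avoids repeating the arithmetic:
#hypothetical helper: accuracy and misclassification from a confusion table
perf <- function(tab) {
  acc <- sum(diag(tab)) / sum(tab)
  c(accuracy = acc, misclassification = 1 - acc)
}
perf(tab)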
#more xgboost parameters
best.model <- xgb.train(params = xgb_params,
data = train_matrix,
nrounds = 200,
watchlist = watchlist,
eta=0.01,
max.depth = 3,
gamma = 0,
subsample = 1,
colsample_bytree = 1,
missing = NA,
seed = 333)
## [1] train-mlogloss:0.690576 test-mlogloss:0.691426
## [2] train-mlogloss:0.688062 test-mlogloss:0.689451
## [3] train-mlogloss:0.685597 test-mlogloss:0.687521
## [4] train-mlogloss:0.683178 test-mlogloss:0.685633
## [5] train-mlogloss:0.680805 test-mlogloss:0.683788
## [6] train-mlogloss:0.678476 test-mlogloss:0.681985
## [7] train-mlogloss:0.676192 test-mlogloss:0.680221
## [8] train-mlogloss:0.673950 test-mlogloss:0.678497
## [9] train-mlogloss:0.671750 test-mlogloss:0.676812
## [10] train-mlogloss:0.669590 test-mlogloss:0.675196
## [11] train-mlogloss:0.667471 test-mlogloss:0.673466
## [12] train-mlogloss:0.665389 test-mlogloss:0.671922
## [13] train-mlogloss:0.663345 test-mlogloss:0.670264
## [14] train-mlogloss:0.661337 test-mlogloss:0.668787
## [15] train-mlogloss:0.659366 test-mlogloss:0.667199
## [16] train-mlogloss:0.657430 test-mlogloss:0.665788
## [17] train-mlogloss:0.655528 test-mlogloss:0.664264
## [18] train-mlogloss:0.653661 test-mlogloss:0.662893
## [19] train-mlogloss:0.651826 test-mlogloss:0.661433
## [20] train-mlogloss:0.650024 test-mlogloss:0.660144
## [21] train-mlogloss:0.648252 test-mlogloss:0.658776
## [22] train-mlogloss:0.646512 test-mlogloss:0.657460
## [23] train-mlogloss:0.644801 test-mlogloss:0.656124
## [24] train-mlogloss:0.643120 test-mlogloss:0.654863
## [25] train-mlogloss:0.641468 test-mlogloss:0.653688
## [26] train-mlogloss:0.639845 test-mlogloss:0.652479
## [27] train-mlogloss:0.638159 test-mlogloss:0.651368
## [28] train-mlogloss:0.636502 test-mlogloss:0.650390
## [29] train-mlogloss:0.634871 test-mlogloss:0.649426
## [30] train-mlogloss:0.633269 test-mlogloss:0.648488
## [31] train-mlogloss:0.631537 test-mlogloss:0.648137
## [32] train-mlogloss:0.629943 test-mlogloss:0.646987
## [33] train-mlogloss:0.628265 test-mlogloss:0.646726
## [34] train-mlogloss:0.626761 test-mlogloss:0.645596
## [35] train-mlogloss:0.625134 test-mlogloss:0.645322
## [36] train-mlogloss:0.623677 test-mlogloss:0.644186
## [37] train-mlogloss:0.622099 test-mlogloss:0.643950
## [38] train-mlogloss:0.620687 test-mlogloss:0.643216
## [39] train-mlogloss:0.619157 test-mlogloss:0.643014
## [40] train-mlogloss:0.617653 test-mlogloss:0.642831
## [41] train-mlogloss:0.616305 test-mlogloss:0.641790
## [42] train-mlogloss:0.614846 test-mlogloss:0.641692
## [43] train-mlogloss:0.613539 test-mlogloss:0.640735
## [44] train-mlogloss:0.612123 test-mlogloss:0.640619
## [45] train-mlogloss:0.610856 test-mlogloss:0.639999
## [46] train-mlogloss:0.609481 test-mlogloss:0.639770
## [47] train-mlogloss:0.608206 test-mlogloss:0.638873
## [48] train-mlogloss:0.606872 test-mlogloss:0.638675
## [49] train-mlogloss:0.605679 test-mlogloss:0.637817
## [50] train-mlogloss:0.604383 test-mlogloss:0.637840
## [51] train-mlogloss:0.603108 test-mlogloss:0.637645
## [52] train-mlogloss:0.601967 test-mlogloss:0.636834
## [53] train-mlogloss:0.600669 test-mlogloss:0.636113
## [54] train-mlogloss:0.599446 test-mlogloss:0.636054
## [55] train-mlogloss:0.598184 test-mlogloss:0.635360
## [56] train-mlogloss:0.596941 test-mlogloss:0.634681
## [57] train-mlogloss:0.595717 test-mlogloss:0.634016
## [58] train-mlogloss:0.594512 test-mlogloss:0.633365
## [59] train-mlogloss:0.593326 test-mlogloss:0.632728
## [60] train-mlogloss:0.592157 test-mlogloss:0.632483
## [61] train-mlogloss:0.591042 test-mlogloss:0.632414
## [62] train-mlogloss:0.589903 test-mlogloss:0.631943
## [63] train-mlogloss:0.588782 test-mlogloss:0.631736
## [64] train-mlogloss:0.587676 test-mlogloss:0.631157
## [65] train-mlogloss:0.586586 test-mlogloss:0.630692
## [66] train-mlogloss:0.585512 test-mlogloss:0.630519
## [67] train-mlogloss:0.584480 test-mlogloss:0.630608
## [68] train-mlogloss:0.583432 test-mlogloss:0.630071
## [69] train-mlogloss:0.582400 test-mlogloss:0.629649
## [70] train-mlogloss:0.581383 test-mlogloss:0.629259
## [71] train-mlogloss:0.580379 test-mlogloss:0.629140
## [72] train-mlogloss:0.579389 test-mlogloss:0.628643
## [73] train-mlogloss:0.578432 test-mlogloss:0.628653
## [74] train-mlogloss:0.577467 test-mlogloss:0.628278
## [75] train-mlogloss:0.576515 test-mlogloss:0.627912
## [76] train-mlogloss:0.575576 test-mlogloss:0.627843
## [77] train-mlogloss:0.574651 test-mlogloss:0.627516
## [78] train-mlogloss:0.573737 test-mlogloss:0.627279
## [79] train-mlogloss:0.572847 test-mlogloss:0.627389
## [80] train-mlogloss:0.571955 test-mlogloss:0.626962
## [81] train-mlogloss:0.571075 test-mlogloss:0.627040
## [82] train-mlogloss:0.570207 test-mlogloss:0.626734
## [83] train-mlogloss:0.569351 test-mlogloss:0.626727
## [84] train-mlogloss:0.568513 test-mlogloss:0.626967
## [85] train-mlogloss:0.567679 test-mlogloss:0.626787
## [86] train-mlogloss:0.566853 test-mlogloss:0.626511
## [87] train-mlogloss:0.566041 test-mlogloss:0.626245
## [88] train-mlogloss:0.565237 test-mlogloss:0.626379
## [89] train-mlogloss:0.564448 test-mlogloss:0.626569
## [90] train-mlogloss:0.563666 test-mlogloss:0.626324
## [91] train-mlogloss:0.562890 test-mlogloss:0.626083
## [92] train-mlogloss:0.562128 test-mlogloss:0.625952
## [93] train-mlogloss:0.561372 test-mlogloss:0.625725
## [94] train-mlogloss:0.560627 test-mlogloss:0.626032
## [95] train-mlogloss:0.559956 test-mlogloss:0.625897
## [96] train-mlogloss:0.559231 test-mlogloss:0.625690
## [97] train-mlogloss:0.558509 test-mlogloss:0.625788
## [98] train-mlogloss:0.557865 test-mlogloss:0.625671
## [99] train-mlogloss:0.557162 test-mlogloss:0.625927
## [100] train-mlogloss:0.556472 test-mlogloss:0.625744
## [101] train-mlogloss:0.555781 test-mlogloss:0.625563
## [102] train-mlogloss:0.555169 test-mlogloss:0.625469
## [103] train-mlogloss:0.554495 test-mlogloss:0.625300
## [104] train-mlogloss:0.553900 test-mlogloss:0.625217
## [105] train-mlogloss:0.553241 test-mlogloss:0.625588
## [106] train-mlogloss:0.552661 test-mlogloss:0.625418
## [107] train-mlogloss:0.552026 test-mlogloss:0.625272
## [108] train-mlogloss:0.551322 test-mlogloss:0.625603
## [109] train-mlogloss:0.550691 test-mlogloss:0.625994
## [110] train-mlogloss:0.550070 test-mlogloss:0.626262
## [111] train-mlogloss:0.549396 test-mlogloss:0.626170
## [112] train-mlogloss:0.548788 test-mlogloss:0.626597
## [113] train-mlogloss:0.548189 test-mlogloss:0.627007
## [114] train-mlogloss:0.547600 test-mlogloss:0.627594
## [115] train-mlogloss:0.546947 test-mlogloss:0.627513
## [116] train-mlogloss:0.546370 test-mlogloss:0.627855
## [117] train-mlogloss:0.545801 test-mlogloss:0.628282
## [118] train-mlogloss:0.545164 test-mlogloss:0.628211
## [119] train-mlogloss:0.544608 test-mlogloss:0.628813
## [120] train-mlogloss:0.544060 test-mlogloss:0.629251
## [121] train-mlogloss:0.543439 test-mlogloss:0.629187
## [122] train-mlogloss:0.542871 test-mlogloss:0.629289
## [123] train-mlogloss:0.542312 test-mlogloss:0.629394
## [124] train-mlogloss:0.541709 test-mlogloss:0.629340
## [125] train-mlogloss:0.541164 test-mlogloss:0.629453
## [126] train-mlogloss:0.540503 test-mlogloss:0.629907
## [127] train-mlogloss:0.539986 test-mlogloss:0.630368
## [128] train-mlogloss:0.539341 test-mlogloss:0.630831
## [129] train-mlogloss:0.538761 test-mlogloss:0.630792
## [130] train-mlogloss:0.538128 test-mlogloss:0.631263
## [131] train-mlogloss:0.537634 test-mlogloss:0.631657
## [132] train-mlogloss:0.537016 test-mlogloss:0.632138
## [133] train-mlogloss:0.536424 test-mlogloss:0.632220
## [134] train-mlogloss:0.535714 test-mlogloss:0.632147
## [135] train-mlogloss:0.535110 test-mlogloss:0.632639
## [136] train-mlogloss:0.534413 test-mlogloss:0.632575
## [137] train-mlogloss:0.533821 test-mlogloss:0.632988
## [138] train-mlogloss:0.533137 test-mlogloss:0.632933
## [139] train-mlogloss:0.532463 test-mlogloss:0.632749
## [140] train-mlogloss:0.531883 test-mlogloss:0.633260
## [141] train-mlogloss:0.531221 test-mlogloss:0.633220
## [142] train-mlogloss:0.530652 test-mlogloss:0.633740
## [143] train-mlogloss:0.530000 test-mlogloss:0.633573
## [144] train-mlogloss:0.529441 test-mlogloss:0.634010
## [145] train-mlogloss:0.528801 test-mlogloss:0.633851
## [146] train-mlogloss:0.528251 test-mlogloss:0.634384
## [147] train-mlogloss:0.527644 test-mlogloss:0.634857
## [148] train-mlogloss:0.527105 test-mlogloss:0.635306
## [149] train-mlogloss:0.526484 test-mlogloss:0.635163
## [150] train-mlogloss:0.525955 test-mlogloss:0.635619
## [151] train-mlogloss:0.525368 test-mlogloss:0.636102
## [152] train-mlogloss:0.524905 test-mlogloss:0.635979
## [153] train-mlogloss:0.524302 test-mlogloss:0.635848
## [154] train-mlogloss:0.523787 test-mlogloss:0.636313
## [155] train-mlogloss:0.523333 test-mlogloss:0.636197
## [156] train-mlogloss:0.522767 test-mlogloss:0.636689
## [157] train-mlogloss:0.522264 test-mlogloss:0.637159
## [158] train-mlogloss:0.521680 test-mlogloss:0.637043
## [159] train-mlogloss:0.521186 test-mlogloss:0.637610
## [160] train-mlogloss:0.520746 test-mlogloss:0.637502
## [161] train-mlogloss:0.520183 test-mlogloss:0.637418
## [162] train-mlogloss:0.519651 test-mlogloss:0.637947
## [163] train-mlogloss:0.519170 test-mlogloss:0.638429
## [164] train-mlogloss:0.518741 test-mlogloss:0.638327
## [165] train-mlogloss:0.518193 test-mlogloss:0.638253
## [166] train-mlogloss:0.517722 test-mlogloss:0.638833
## [167] train-mlogloss:0.517189 test-mlogloss:0.638822
## [168] train-mlogloss:0.516733 test-mlogloss:0.639269
## [169] train-mlogloss:0.516228 test-mlogloss:0.639810
## [170] train-mlogloss:0.515771 test-mlogloss:0.640303
## [171] train-mlogloss:0.515254 test-mlogloss:0.640304
## [172] train-mlogloss:0.514805 test-mlogloss:0.640801
## [173] train-mlogloss:0.514292 test-mlogloss:0.640833
## [174] train-mlogloss:0.513789 test-mlogloss:0.640696
## [175] train-mlogloss:0.513350 test-mlogloss:0.641200
## [176] train-mlogloss:0.512852 test-mlogloss:0.641213
## [177] train-mlogloss:0.512408 test-mlogloss:0.641684
## [178] train-mlogloss:0.511914 test-mlogloss:0.641729
## [179] train-mlogloss:0.511487 test-mlogloss:0.642286
## [180] train-mlogloss:0.511004 test-mlogloss:0.642164
## [181] train-mlogloss:0.510525 test-mlogloss:0.642190
## [182] train-mlogloss:0.510105 test-mlogloss:0.642707
## [183] train-mlogloss:0.509631 test-mlogloss:0.642763
## [184] train-mlogloss:0.509166 test-mlogloss:0.642653
## [185] train-mlogloss:0.508740 test-mlogloss:0.643144
## [186] train-mlogloss:0.508332 test-mlogloss:0.643617
## [187] train-mlogloss:0.507872 test-mlogloss:0.643683
## [188] train-mlogloss:0.507420 test-mlogloss:0.643584
## [189] train-mlogloss:0.507005 test-mlogloss:0.644083
## [190] train-mlogloss:0.506553 test-mlogloss:0.644194
## [191] train-mlogloss:0.506155 test-mlogloss:0.644726
## [192] train-mlogloss:0.505671 test-mlogloss:0.644861
## [193] train-mlogloss:0.505195 test-mlogloss:0.644830
## [194] train-mlogloss:0.504792 test-mlogloss:0.645340
## [195] train-mlogloss:0.504315 test-mlogloss:0.645514
## [196] train-mlogloss:0.503919 test-mlogloss:0.646028
## [197] train-mlogloss:0.503536 test-mlogloss:0.646517
## [198] train-mlogloss:0.503110 test-mlogloss:0.646605
## [199] train-mlogloss:0.502651 test-mlogloss:0.646585
## [200] train-mlogloss:0.502198 test-mlogloss:0.646730
#Error plot based on the above optimization
e <- data.frame(best.model$evaluation_log)
plot(e$iter, e$train_mlogloss, col = 'blue')
lines(e$iter, e$test_mlogloss, col = "red")
But we can still see that overfitting is present: the test error reaches its minimum around iteration 104 and then slowly rises again.
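The number of rounds could also be chosen with k-fold cross-validation instead of with this single test split; a minimal xgb.cv sketch, assuming 5 folds (not used in the rest of this post):
cv <- xgb.cv(params = xgb_params, data = train_matrix,
             nrounds = 200, nfold = 5, eta = 0.01, max.depth = 3, verbose = 0)
cv$evaluation_log[which.min(cv$evaluation_log$test_mlogloss_mean), ]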
bestn <- e[e$test_mlogloss==min(e$test_mlogloss),]
#model tuning
best.model <- xgb.train(params = xgb_params,
data = train_matrix,
nrounds = bestn$iter,
watchlist = watchlist,
eta=0.01,
max.depth = 3,
gamma = 0,
subsample = 1,
colsample_bytree = 1,
missing = NA,
seed = 333)
## [1] train-mlogloss:0.690576 test-mlogloss:0.691426
## [2] train-mlogloss:0.688062 test-mlogloss:0.689451
## [3] train-mlogloss:0.685597 test-mlogloss:0.687521
## [4] train-mlogloss:0.683178 test-mlogloss:0.685633
## [5] train-mlogloss:0.680805 test-mlogloss:0.683788
## [6] train-mlogloss:0.678476 test-mlogloss:0.681985
## [7] train-mlogloss:0.676192 test-mlogloss:0.680221
## [8] train-mlogloss:0.673950 test-mlogloss:0.678497
## [9] train-mlogloss:0.671750 test-mlogloss:0.676812
## [10] train-mlogloss:0.669590 test-mlogloss:0.675196
## [11] train-mlogloss:0.667471 test-mlogloss:0.673466
## [12] train-mlogloss:0.665389 test-mlogloss:0.671922
## [13] train-mlogloss:0.663345 test-mlogloss:0.670264
## [14] train-mlogloss:0.661337 test-mlogloss:0.668787
## [15] train-mlogloss:0.659366 test-mlogloss:0.667199
## [16] train-mlogloss:0.657430 test-mlogloss:0.665788
## [17] train-mlogloss:0.655528 test-mlogloss:0.664264
## [18] train-mlogloss:0.653661 test-mlogloss:0.662893
## [19] train-mlogloss:0.651826 test-mlogloss:0.661433
## [20] train-mlogloss:0.650024 test-mlogloss:0.660144
## [21] train-mlogloss:0.648252 test-mlogloss:0.658776
## [22] train-mlogloss:0.646512 test-mlogloss:0.657460
## [23] train-mlogloss:0.644801 test-mlogloss:0.656124
## [24] train-mlogloss:0.643120 test-mlogloss:0.654863
## [25] train-mlogloss:0.641468 test-mlogloss:0.653688
## [26] train-mlogloss:0.639845 test-mlogloss:0.652479
## [27] train-mlogloss:0.638159 test-mlogloss:0.651368
## [28] train-mlogloss:0.636502 test-mlogloss:0.650390
## [29] train-mlogloss:0.634871 test-mlogloss:0.649426
## [30] train-mlogloss:0.633269 test-mlogloss:0.648488
## [31] train-mlogloss:0.631537 test-mlogloss:0.648137
## [32] train-mlogloss:0.629943 test-mlogloss:0.646987
## [33] train-mlogloss:0.628265 test-mlogloss:0.646726
## [34] train-mlogloss:0.626761 test-mlogloss:0.645596
## [35] train-mlogloss:0.625134 test-mlogloss:0.645322
## [36] train-mlogloss:0.623677 test-mlogloss:0.644186
## [37] train-mlogloss:0.622099 test-mlogloss:0.643950
## [38] train-mlogloss:0.620687 test-mlogloss:0.643216
## [39] train-mlogloss:0.619157 test-mlogloss:0.643014
## [40] train-mlogloss:0.617653 test-mlogloss:0.642831
## [41] train-mlogloss:0.616305 test-mlogloss:0.641790
## [42] train-mlogloss:0.614846 test-mlogloss:0.641692
## [43] train-mlogloss:0.613539 test-mlogloss:0.640735
## [44] train-mlogloss:0.612123 test-mlogloss:0.640619
## [45] train-mlogloss:0.610856 test-mlogloss:0.639999
## [46] train-mlogloss:0.609481 test-mlogloss:0.639770
## [47] train-mlogloss:0.608206 test-mlogloss:0.638873
## [48] train-mlogloss:0.606872 test-mlogloss:0.638675
## [49] train-mlogloss:0.605679 test-mlogloss:0.637817
## [50] train-mlogloss:0.604383 test-mlogloss:0.637840
## [51] train-mlogloss:0.603108 test-mlogloss:0.637645
## [52] train-mlogloss:0.601967 test-mlogloss:0.636834
## [53] train-mlogloss:0.600669 test-mlogloss:0.636113
## [54] train-mlogloss:0.599446 test-mlogloss:0.636054
## [55] train-mlogloss:0.598184 test-mlogloss:0.635360
## [56] train-mlogloss:0.596941 test-mlogloss:0.634681
## [57] train-mlogloss:0.595717 test-mlogloss:0.634016
## [58] train-mlogloss:0.594512 test-mlogloss:0.633365
## [59] train-mlogloss:0.593326 test-mlogloss:0.632728
## [60] train-mlogloss:0.592157 test-mlogloss:0.632483
## [61] train-mlogloss:0.591042 test-mlogloss:0.632414
## [62] train-mlogloss:0.589903 test-mlogloss:0.631943
## [63] train-mlogloss:0.588782 test-mlogloss:0.631736
## [64] train-mlogloss:0.587676 test-mlogloss:0.631157
## [65] train-mlogloss:0.586586 test-mlogloss:0.630692
## [66] train-mlogloss:0.585512 test-mlogloss:0.630519
## [67] train-mlogloss:0.584480 test-mlogloss:0.630608
## [68] train-mlogloss:0.583432 test-mlogloss:0.630071
## [69] train-mlogloss:0.582400 test-mlogloss:0.629649
## [70] train-mlogloss:0.581383 test-mlogloss:0.629259
## [71] train-mlogloss:0.580379 test-mlogloss:0.629140
## [72] train-mlogloss:0.579389 test-mlogloss:0.628643
## [73] train-mlogloss:0.578432 test-mlogloss:0.628653
## [74] train-mlogloss:0.577467 test-mlogloss:0.628278
## [75] train-mlogloss:0.576515 test-mlogloss:0.627912
## [76] train-mlogloss:0.575576 test-mlogloss:0.627843
## [77] train-mlogloss:0.574651 test-mlogloss:0.627516
## [78] train-mlogloss:0.573737 test-mlogloss:0.627279
## [79] train-mlogloss:0.572847 test-mlogloss:0.627389
## [80] train-mlogloss:0.571955 test-mlogloss:0.626962
## [81] train-mlogloss:0.571075 test-mlogloss:0.627040
## [82] train-mlogloss:0.570207 test-mlogloss:0.626734
## [83] train-mlogloss:0.569351 test-mlogloss:0.626727
## [84] train-mlogloss:0.568513 test-mlogloss:0.626967
## [85] train-mlogloss:0.567679 test-mlogloss:0.626787
## [86] train-mlogloss:0.566853 test-mlogloss:0.626511
## [87] train-mlogloss:0.566041 test-mlogloss:0.626245
## [88] train-mlogloss:0.565237 test-mlogloss:0.626379
## [89] train-mlogloss:0.564448 test-mlogloss:0.626569
## [90] train-mlogloss:0.563666 test-mlogloss:0.626324
## [91] train-mlogloss:0.562890 test-mlogloss:0.626083
## [92] train-mlogloss:0.562128 test-mlogloss:0.625952
## [93] train-mlogloss:0.561372 test-mlogloss:0.625725
## [94] train-mlogloss:0.560627 test-mlogloss:0.626032
## [95] train-mlogloss:0.559956 test-mlogloss:0.625897
## [96] train-mlogloss:0.559231 test-mlogloss:0.625690
## [97] train-mlogloss:0.558509 test-mlogloss:0.625788
## [98] train-mlogloss:0.557865 test-mlogloss:0.625671
## [99] train-mlogloss:0.557162 test-mlogloss:0.625927
## [100] train-mlogloss:0.556472 test-mlogloss:0.625744
## [101] train-mlogloss:0.555781 test-mlogloss:0.625563
## [102] train-mlogloss:0.555169 test-mlogloss:0.625469
## [103] train-mlogloss:0.554495 test-mlogloss:0.625300
## [104] train-mlogloss:0.553900 test-mlogloss:0.625217
e <- data.frame(best.model$evaluation_log)
plot(e$iter, e$train_mlogloss, col = 'blue')
lines(e$iter, e$test_mlogloss, col = "red")
#Prediction and confusion matrix
p <- predict(best.model, newdata = test_matrix)
pred <- matrix(p, nrow = nc, ncol = length(p)/nc) %>%
t() %>%
data.frame() %>%
mutate(label = test_label, max_prob = max.col(.,"last")-1 )
(tab <- table(Prediction = pred$max_prob, Actual=pred$label))
## Actual
## Prediction 0 1
## 0 49 21
## 1 1 4
#accuracy
sum(diag(tab))/sum(tab)
## [1] 0.7066667
#misclassification
1-sum(diag(tab))/sum(tab)
## [1] 0.2933333
This gives us a lower error and higher accuracy than before, but there is still scope for improvement.
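One possible next step is a small grid search over the learning rate and tree depth; the sketch below uses hypothetical parameter values and keeps the lowest test mlogloss reached by each combination.
#minimal grid-search sketch (hypothetical parameter values)
grid <- expand.grid(eta = c(0.01, 0.05, 0.1), max_depth = c(2, 3, 4))
results <- apply(grid, 1, function(g) {
  m <- xgb.train(params = xgb_params, data = train_matrix,
                 nrounds = 100, watchlist = watchlist,
                 eta = g["eta"], max.depth = g["max_depth"], verbose = 0)
  min(m$evaluation_log$test_mlogloss)
})
cbind(grid, best_test_mlogloss = results)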
Thanks!!!