Setting Up the Environment and Preparing the Data

Load libraries

library(conflicted)
library(xgboost)
library(dplyr)
library(tidyr)
#library(neuralnet)
library(ggplot2)
library(DiagrammeR)
library(DiagrammeRsvg)
library(caret)
## Loading required package: lattice
library(gridExtra)

Load the CSV files

train_data <- read.csv("sensor_prepare_train_data.csv", stringsAsFactors = FALSE)
test_data <- read.csv("sensor_prepare_test_data.csv", stringsAsFactors = FALSE)

Set Features and Target Variable

features_column <- c("Seismic_Max","Seismic_Min","Seismic_Mean","Seismic_Range","Gyro_Gx",
                     "Gyro_Gy","Gyro_Gz")
target_column <- c("Event_Actual")

Split out the feature (predictor) variables for training

Train data features

train_data_features <- train_data[,features_column]
# Summary train data features
summary(train_data_features)
##   Seismic_Max       Seismic_Min        Seismic_Mean     Seismic_Range   
##  Min.   :0.02802   Min.   :-3.59999   Min.   :-0.3224   Min.   :0.7593  
##  1st Qu.:2.75628   1st Qu.:-3.43122   1st Qu.: 0.6112   1st Qu.:5.5405  
##  Median :3.20628   Median :-3.17808   Median : 0.7429   Median :6.1985  
##  Mean   :3.06429   Mean   :-2.88286   Mean   : 0.7419   Mean   :5.9472  
##  3rd Qu.:3.45941   3rd Qu.:-2.64375   3rd Qu.: 0.8763   3rd Qu.:6.6656  
##  Max.   :3.59989   Max.   : 0.00924   Max.   : 1.5837   Max.   :7.1999  
##     Gyro_Gx           Gyro_Gy            Gyro_Gz       
##  Min.   :-9.7710   Min.   :-10.4733   Min.   :  7.809  
##  1st Qu.:-7.6641   1st Qu.: -8.9160   1st Qu.:125.008  
##  Median :-2.7634   Median : -2.3206   Median :125.557  
##  Mean   :-2.9722   Mean   : -2.6277   Mean   :125.496  
##  3rd Qu.: 0.7634   3rd Qu.:  0.8855   3rd Qu.:126.107  
##  Max.   : 8.7328   Max.   : 13.0076   Max.   :128.489

Define the target (dependent) variable for training

Train data target

train_data_target <- train_data[,target_column]
#Summary train data target
summary(train_data_target)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   2.000   2.594   4.000   7.000
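The summary shows the target takes integer values from 0 to 7, i.e. eight event classes. Before fitting a multi-class model it is worth checking how balanced those classes are; a minimal check:

# Count observations per event class (0-7) in the training target
table(train_data_target)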

Define the feature variables for testing

Test data features

test_data_features <- test_data[,features_column]
summary(test_data_features)
##   Seismic_Max      Seismic_Min        Seismic_Mean      Seismic_Range  
##  Min.   :0.5062   Min.   :-3.60000   Min.   :-0.07793   Min.   :1.266  
##  1st Qu.:2.7844   1st Qu.:-3.43122   1st Qu.: 0.62230   1st Qu.:5.513  
##  Median :3.2063   Median :-3.17809   Median : 0.74292   Median :6.188  
##  Mean   :3.0661   Mean   :-2.84109   Mean   : 0.74299   Mean   :5.907  
##  3rd Qu.:3.4594   3rd Qu.:-2.67184   3rd Qu.: 0.87471   3rd Qu.:6.638  
##  Max.   :3.5999   Max.   :-0.02807   Max.   : 1.38095   Max.   :7.172  
##     Gyro_Gx           Gyro_Gy            Gyro_Gz     
##  Min.   :-9.2824   Min.   :-10.5038   Min.   :123.3  
##  1st Qu.:-7.6336   1st Qu.: -9.0687   1st Qu.:125.1  
##  Median :-3.8473   Median : -2.5038   Median :125.6  
##  Mean   :-3.0997   Mean   : -2.8223   Mean   :125.7  
##  3rd Qu.: 0.7634   3rd Qu.:  0.7634   3rd Qu.:126.2  
##  Max.   : 8.3359   Max.   : 12.7634   Max.   :128.2

Define the target variable for testing

Test data target

test_data_target <- test_data[,target_column]
summary(test_data_target)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   2.000   2.431   4.000   7.000

Set Up XGBoost Parameters

Convert the data into xgb.DMatrix objects

xgb_train <- xgb.DMatrix(data = as.matrix(train_data_features), label = train_data_target)
xgb_test <- xgb.DMatrix(data = as.matrix(test_data_features), label = test_data_target)
# watchlist can be passed to xgb.train to monitor both sets during training
watchlist <- list(train = xgb_train, test = xgb_test)

Set Parameters

numberOfClasses <- 8
xgb_params <- list("objective" = "multi:softprob",
                   "eval_metric" = "mlogloss",
                   "num_class" = numberOfClasses)
nround <- 50   # number of boosting rounds
cv.nfold <- 5  # number of cross-validation folds
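"multi:softprob" makes predict() return one probability per class for every observation, which is why the prediction vector is reshaped into an n x num_class matrix further down. If only the predicted class label is needed, "multi:softmax" returns it directly; a hedged alternative parameter list (same settings otherwise):

# Alternative objective: multi:softmax returns the predicted class (0 .. num_class-1)
# directly, so no reshaping of the prediction vector is needed afterwards.
xgb_params_softmax <- list("objective" = "multi:softmax",
                           "eval_metric" = "mlogloss",
                           "num_class" = numberOfClasses)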

# Fit cv.nfold x nround XGBoost models and save the out-of-fold predictions
cv_model <- xgb.cv(params = xgb_params,
                   data = xgb_train,
                   nrounds = nround,
                   nfold = cv.nfold,
                   verbose = TRUE,
                   prediction = TRUE)
## [1]  train-mlogloss:1.319805+0.004075    test-mlogloss:1.358115+0.008540 
## [2]  train-mlogloss:1.029494+0.005227    test-mlogloss:1.092179+0.010931 
## [3]  train-mlogloss:0.844991+0.005279    test-mlogloss:0.927039+0.012804 
## [4]  train-mlogloss:0.714495+0.006615    test-mlogloss:0.812838+0.014576 
## [5]  train-mlogloss:0.619041+0.005925    test-mlogloss:0.730699+0.015956 
## [6]  train-mlogloss:0.547137+0.006397    test-mlogloss:0.671434+0.015803 
## [7]  train-mlogloss:0.492395+0.006781    test-mlogloss:0.627690+0.015533 
## [8]  train-mlogloss:0.448155+0.006666    test-mlogloss:0.593997+0.014408 
## [9]  train-mlogloss:0.413868+0.005808    test-mlogloss:0.569870+0.014959 
## [10] train-mlogloss:0.386275+0.005420    test-mlogloss:0.549870+0.015348 
## [11] train-mlogloss:0.364219+0.005412    test-mlogloss:0.535161+0.015661 
## [12] train-mlogloss:0.345796+0.005775    test-mlogloss:0.523927+0.015824 
## [13] train-mlogloss:0.329429+0.006115    test-mlogloss:0.515459+0.016175 
## [14] train-mlogloss:0.315362+0.006499    test-mlogloss:0.507455+0.016829 
## [15] train-mlogloss:0.303470+0.005907    test-mlogloss:0.500942+0.016268 
## [16] train-mlogloss:0.292128+0.006159    test-mlogloss:0.494985+0.017127 
## [17] train-mlogloss:0.281429+0.005931    test-mlogloss:0.491269+0.017220 
## [18] train-mlogloss:0.271050+0.006898    test-mlogloss:0.488895+0.017101 
## [19] train-mlogloss:0.262075+0.006672    test-mlogloss:0.485587+0.017047 
## [20] train-mlogloss:0.253898+0.007176    test-mlogloss:0.483378+0.017813 
## [21] train-mlogloss:0.244365+0.006303    test-mlogloss:0.482003+0.018849 
## [22] train-mlogloss:0.236098+0.005891    test-mlogloss:0.480513+0.017993 
## [23] train-mlogloss:0.228372+0.005258    test-mlogloss:0.479409+0.018352 
## [24] train-mlogloss:0.220965+0.005235    test-mlogloss:0.479195+0.018384 
## [25] train-mlogloss:0.213438+0.003685    test-mlogloss:0.478226+0.018997 
## [26] train-mlogloss:0.207318+0.003923    test-mlogloss:0.477943+0.019448 
## [27] train-mlogloss:0.201562+0.003822    test-mlogloss:0.477209+0.019870 
## [28] train-mlogloss:0.195435+0.004678    test-mlogloss:0.476335+0.019798 
## [29] train-mlogloss:0.189622+0.004539    test-mlogloss:0.475720+0.019945 
## [30] train-mlogloss:0.183982+0.003995    test-mlogloss:0.475191+0.020642 
## [31] train-mlogloss:0.179060+0.004468    test-mlogloss:0.475004+0.021404 
## [32] train-mlogloss:0.172530+0.004752    test-mlogloss:0.475041+0.022372 
## [33] train-mlogloss:0.167428+0.005088    test-mlogloss:0.474813+0.022213 
## [34] train-mlogloss:0.162674+0.005123    test-mlogloss:0.476026+0.022537 
## [35] train-mlogloss:0.158572+0.005416    test-mlogloss:0.476190+0.022502 
## [36] train-mlogloss:0.153760+0.005645    test-mlogloss:0.477072+0.022025 
## [37] train-mlogloss:0.149150+0.005638    test-mlogloss:0.476620+0.021345 
## [38] train-mlogloss:0.145249+0.004970    test-mlogloss:0.477113+0.021252 
## [39] train-mlogloss:0.140930+0.004443    test-mlogloss:0.477483+0.020920 
## [40] train-mlogloss:0.136871+0.003999    test-mlogloss:0.478410+0.021347 
## [41] train-mlogloss:0.133298+0.004499    test-mlogloss:0.478983+0.021485 
## [42] train-mlogloss:0.129151+0.004084    test-mlogloss:0.480155+0.022098 
## [43] train-mlogloss:0.124921+0.003372    test-mlogloss:0.479985+0.022240 
## [44] train-mlogloss:0.121168+0.003538    test-mlogloss:0.480608+0.022558 
## [45] train-mlogloss:0.117668+0.002858    test-mlogloss:0.480944+0.023269 
## [46] train-mlogloss:0.114460+0.002359    test-mlogloss:0.481381+0.023418 
## [47] train-mlogloss:0.111164+0.002797    test-mlogloss:0.482151+0.023971 
## [48] train-mlogloss:0.108022+0.002695    test-mlogloss:0.483023+0.024746 
## [49] train-mlogloss:0.105163+0.002764    test-mlogloss:0.484129+0.025220 
## [50] train-mlogloss:0.102158+0.001789    test-mlogloss:0.485882+0.025082
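The cross-validation log shows mean test mlogloss bottoming out around iteration 33 and then creeping back up while train mlogloss keeps falling, so training for the full 50 rounds slightly overfits. A minimal sketch to extract the best round from the evaluation_log that xgb.cv returns (column names assume the default mlogloss metric used above):

# Round with the lowest mean test mlogloss across the CV folds
best_round <- which.min(cv_model$evaluation_log$test_mlogloss_mean)
best_round
cv_model$evaluation_log[best_round, ]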
# Out-of-fold predictions: max_prob = 1-based predicted class, label = 1-based actual class
prediction <- data.frame(cv_model$pred) %>%
  mutate(max_prob = max.col(., ties.method = "last"),
         label = train_data_target + 1)
head(prediction, 8)

Confusion Matrix

confusionMatrix(factor(prediction$max_prob),
                factor(prediction$label),
                mode = "everything")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2    3    4    5    6    7    8
##          1 1302    0    0    0    0   27   83  125
##          2    0  629    0    2   20    0    1    0
##          3    0    0  521   69    0    1    0    0
##          4    0    3   23  512    8   53    0    0
##          5    0   18    0   14  518    0    0    0
##          6   14    0    0   90    0  400   55   34
##          7   38    0    0    0    0   41  156   50
##          8   45    0    0    0    0   31   33  130
## 
## Overall Statistics
##                                           
##                Accuracy : 0.826           
##                  95% CI : (0.8153, 0.8364)
##     No Information Rate : 0.2772          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7925          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity            0.9307   0.9677   0.9577   0.7453   0.9487  0.72333
## Specificity            0.9356   0.9948   0.9845   0.9800   0.9929  0.95704
## Pos Pred Value         0.8471   0.9647   0.8816   0.8548   0.9418  0.67454
## Neg Pred Value         0.9724   0.9952   0.9948   0.9606   0.9938  0.96564
## Precision              0.8471   0.9647   0.8816   0.8548   0.9418  0.67454
## Recall                 0.9307   0.9677   0.9577   0.7453   0.9487  0.72333
## F1                     0.8869   0.9662   0.9181   0.7963   0.9453  0.69808
## Prevalence             0.2772   0.1288   0.1078   0.1361   0.1082  0.10959
## Detection Rate         0.2580   0.1247   0.1033   0.1015   0.1027  0.07927
## Detection Prevalence   0.3046   0.1292   0.1171   0.1187   0.1090  0.11752
## Balanced Accuracy      0.9331   0.9812   0.9711   0.8627   0.9708  0.84019
##                      Class: 7 Class: 8
## Sensitivity           0.47561  0.38348
## Specificity           0.97266  0.97684
## Pos Pred Value        0.54737  0.54393
## Neg Pred Value        0.96387  0.95652
## Precision             0.54737  0.54393
## Recall                0.47561  0.38348
## F1                    0.50897  0.44983
## Prevalence            0.06500  0.06718
## Detection Rate        0.03092  0.02576
## Detection Prevalence  0.05648  0.04736
## Balanced Accuracy     0.72413  0.68016

Train Model with Hyperparameters

GS_T0 <- Sys.time()
xgboost_model <- xgb.train(data = xgb_train,
                           params = xgb_params,
                           nrounds = nround)
GS_T1 <- Sys.time()
training_time <- GS_T1 - GS_T0
# Training time
training_time
## Time difference of 1.523358 secs
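Note that the watchlist built earlier is never passed to xgb.train above and no early stopping is used, so the model trains for the full 50 rounds even though cross-validation suggested stopping earlier. A hedged sketch of the same call with per-round monitoring and early stopping (early_stopping_rounds = 10 is an illustrative value, not a tuned one):

# Sketch: evaluate on both sets each round and stop once test mlogloss has not
# improved for 10 consecutive rounds (10 is an arbitrary example value)
xgboost_model_es <- xgb.train(data = xgb_train,
                              params = xgb_params,
                              nrounds = nround,
                              watchlist = watchlist,
                              early_stopping_rounds = 10,
                              verbose = 0)
xgboost_model_es$best_iteration  # populated when early stopping is used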

Save XGBoost model

# NULL because the model above was trained without early stopping
xgboost_model$best_iteration
## NULL
filename <- 'xgboost_model.rds'
saveRDS(xgboost_model,file=filename)
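The saved model can be read back with readRDS(filename). The xgboost package also offers xgb.save()/xgb.load(), a binary format that tends to be more portable across package versions; a small sketch (the .bin filename is an arbitrary choice):

# Reload the model saved above
xgboost_model_reloaded <- readRDS(filename)

# Alternative: xgboost's native binary format
xgb.save(xgboost_model, "xgboost_model.bin")
# xgboost_model_bin <- xgb.load("xgboost_model.bin")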

Create a Confusion Matrix for Evaluation

Predictions

xgb_predictions <- predict(xgboost_model, as.matrix(test_data_features))
# Reshape the flat probability vector into an n x numberOfClasses matrix,
# then take the most probable class for each row
xgb_test_prediction <- matrix(xgb_predictions, nrow = numberOfClasses,
                              ncol = length(xgb_predictions) / numberOfClasses) %>%
  t() %>%
  data.frame() %>%
  mutate(label = test_data_target + 1,
         max_prob = max.col(., "last"))
# confusion matrix of test set
confusionMatrix(factor(xgb_test_prediction$max_prob),
                factor(xgb_test_prediction$label),
                mode = "everything")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   2   3   4   5   6   7   8
##          1 379   0   0   0   0   9  27  50
##          2   0 136   0   2   5   0   0   0
##          3   0   0 138  17   0   0   0   0
##          4   0   1   1 131   0  13   0   0
##          5   0   6   0   1 130   0   0   0
##          6   8   0   0  20   0  76  10  15
##          7   6   0   0   0   0  16  22   6
##          8   7   0   0   0   0   4   6  15
## 
## Overall Statistics
##                                          
##                Accuracy : 0.817          
##                  95% CI : (0.7945, 0.838)
##     No Information Rate : 0.3182         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.7754         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity            0.9475   0.9510   0.9928   0.7661   0.9630  0.64407
## Specificity            0.8996   0.9937   0.9848   0.9862   0.9938  0.95347
## Pos Pred Value         0.8151   0.9510   0.8903   0.8973   0.9489  0.58915
## Neg Pred Value         0.9735   0.9937   0.9991   0.9640   0.9955  0.96277
## Precision              0.8151   0.9510   0.8903   0.8973   0.9489  0.58915
## Recall                 0.9475   0.9510   0.9928   0.7661   0.9630  0.64407
## F1                     0.8763   0.9510   0.9388   0.8265   0.9559  0.61538
## Prevalence             0.3182   0.1138   0.1106   0.1360   0.1074  0.09387
## Detection Rate         0.3015   0.1082   0.1098   0.1042   0.1034  0.06046
## Detection Prevalence   0.3699   0.1138   0.1233   0.1161   0.1090  0.10263
## Balanced Accuracy      0.9236   0.9724   0.9888   0.8761   0.9784  0.79877
##                      Class: 7 Class: 8
## Sensitivity           0.33846  0.17442
## Specificity           0.97651  0.98548
## Pos Pred Value        0.44000  0.46875
## Neg Pred Value        0.96437  0.94204
## Precision             0.44000  0.46875
## Recall                0.33846  0.17442
## F1                    0.38261  0.25424
## Prevalence            0.05171  0.06842
## Detection Rate        0.01750  0.01193
## Detection Prevalence  0.03978  0.02546
## Balanced Accuracy     0.65749  0.57995
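Overall accuracy on the test set is about 0.82, but classes 7 and 8 are predicted poorly (F1 around 0.38 and 0.25), so a class-balanced summary is informative. A minimal sketch of a macro-averaged F1 using the byClass table that confusionMatrix returns (cm_test is a name introduced here for illustration):

# Macro-averaged F1: unweighted mean of the per-class F1 scores,
# so rare classes weigh as much as frequent ones
cm_test <- confusionMatrix(factor(xgb_test_prediction$max_prob),
                           factor(xgb_test_prediction$label),
                           mode = "everything")
mean(cm_test$byClass[, "F1"], na.rm = TRUE)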

Plot XGBoost model

names <- colnames(train_data[, -1]) # note: masks base::names() and is not used below
#xgboost_model = xgboost(data=xgb_train, max.depth = 3, nrounds=1355,eta = 0.01,lambda = 0.01,alpha=0.5,verbose =0)
summary(xgboost_model)
##               Length Class              Mode       
## handle             1 xgb.Booster.handle externalptr
## raw           767180 -none-             raw        
## niter              1 -none-             numeric    
## call               4 -none-             call       
## params             4 -none-             list       
## callbacks          1 -none-             list       
## feature_names      7 -none-             character  
## nfeatures          1 -none-             numeric
xgb.plot.tree(model = xgboost_model, trees = 1)
tree_plot <- xgb.plot.tree(model = xgboost_model, trees = 1, plot_width = 1000, 
                           plot_height = 1000, render = FALSE)
export_graph(tree_plot, "xgboost_tree_plot1.pdf",width = 1000, height = 1000)
#xgb.dump(xgboost_model, with_stats = TRUE)
xgb.plot.deepness(
  model = xgboost_model,
  which = c("2x1", "max.depth","med.depth","med.weight")
)

p <- xgb.plot.multi.trees(model = xgboost_model, 
                          features_keep = 5, 
                          feature_names = NULL,
                          fill=TRUE)
## Column 2 ['No'] of item 2 is missing in item 1. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names. use.names='check' (default from v1.12.2) emits this message and proceeds as if use.names=FALSE for  backwards compatibility. See news item 5 in v1.12.2 for options to control this message.
p
importance_matrix <- xgb.importance(colnames(train_data_features), model = xgboost_model)
head(importance_matrix)
importance_matrix
xgb.plot.importance(importance_matrix)

gp <- xgb.ggplot.importance(importance_matrix)
print(gp)
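The importance table ranks features by Gain (the average improvement in the loss from splits on that feature), with Cover and Frequency as secondary measures. A small sketch that keeps only the highest-Gain features before plotting (the cutoff of 5 is arbitrary):

# Keep the five features with the highest Gain and plot only those
top_features <- head(importance_matrix[order(-importance_matrix$Gain), ], 5)
xgb.plot.importance(top_features)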