Setting Up the Environment and Preparing the Data

Load libraries

library(conflicted)
library(xgboost)
library(dplyr)
library(tidyr)
#library(neuralnet)
library(ggplot2)
library(DiagrammeR)
library(DiagrammeRsvg)
library(caret)
## Loading required package: lattice
library(gridExtra)

Load the CSV files

train_data <- read.csv("sensor_prepare_train_data.csv", stringsAsFactors = FALSE)
test_data <- read.csv("sensor_prepare_test_data.csv", stringsAsFactors = FALSE)

Set Features and Target Variable

features_column <- c("Seismic_Max","Seismic_Min","Seismic_Mean","Seismic_Range","Gyro_Gx",
                     "Gyro_Gy","Gyro_Gz")
target_column <- c("Event_Actual")

Split out the feature (predictor) variables for training

Train data features

train_data_features <- train_data[,features_column]
# Summary train data features
summary(train_data_features)
##   Seismic_Max       Seismic_Min        Seismic_Mean     Seismic_Range   
##  Min.   :0.02802   Min.   :-3.59999   Min.   :-0.3224   Min.   :0.7593  
##  1st Qu.:2.75628   1st Qu.:-3.43122   1st Qu.: 0.6112   1st Qu.:5.5405  
##  Median :3.20628   Median :-3.17808   Median : 0.7429   Median :6.1985  
##  Mean   :3.06429   Mean   :-2.88286   Mean   : 0.7419   Mean   :5.9472  
##  3rd Qu.:3.45941   3rd Qu.:-2.64375   3rd Qu.: 0.8763   3rd Qu.:6.6656  
##  Max.   :3.59989   Max.   : 0.00924   Max.   : 1.5837   Max.   :7.1999  
##     Gyro_Gx           Gyro_Gy            Gyro_Gz       
##  Min.   :-9.7710   Min.   :-10.4733   Min.   :  7.809  
##  1st Qu.:-7.6641   1st Qu.: -8.9160   1st Qu.:125.008  
##  Median :-2.7634   Median : -2.3206   Median :125.557  
##  Mean   :-2.9722   Mean   : -2.6277   Mean   :125.496  
##  3rd Qu.: 0.7634   3rd Qu.:  0.8855   3rd Qu.:126.107  
##  Max.   : 8.7328   Max.   : 13.0076   Max.   :128.489

Define the target (dependent) variable for training

Train data target

train_data_target <- train_data[,target_column]
#Summary train data target
summary(train_data_target)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   2.000   2.594   4.000   7.000
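The summary shows the target takes integer values from 0 to 7, i.e. eight event classes. Before fitting a multi-class model it is worth checking how balanced those classes are; a minimal check:

# Count observations per event class (0-7) in the training target
table(train_data_target)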

Define the feature variables for testing

Test data features

test_data_features <- test_data[,features_column]
summary(test_data_features)
##   Seismic_Max      Seismic_Min        Seismic_Mean      Seismic_Range  
##  Min.   :0.5062   Min.   :-3.60000   Min.   :-0.07793   Min.   :1.266  
##  1st Qu.:2.7844   1st Qu.:-3.43122   1st Qu.: 0.62230   1st Qu.:5.513  
##  Median :3.2063   Median :-3.17809   Median : 0.74292   Median :6.188  
##  Mean   :3.0661   Mean   :-2.84109   Mean   : 0.74299   Mean   :5.907  
##  3rd Qu.:3.4594   3rd Qu.:-2.67184   3rd Qu.: 0.87471   3rd Qu.:6.638  
##  Max.   :3.5999   Max.   :-0.02807   Max.   : 1.38095   Max.   :7.172  
##     Gyro_Gx           Gyro_Gy            Gyro_Gz     
##  Min.   :-9.2824   Min.   :-10.5038   Min.   :123.3  
##  1st Qu.:-7.6336   1st Qu.: -9.0687   1st Qu.:125.1  
##  Median :-3.8473   Median : -2.5038   Median :125.6  
##  Mean   :-3.0997   Mean   : -2.8223   Mean   :125.7  
##  3rd Qu.: 0.7634   3rd Qu.:  0.7634   3rd Qu.:126.2  
##  Max.   : 8.3359   Max.   : 12.7634   Max.   :128.2

Define the target variable for testing

Test data target

test_data_target <- test_data[,target_column]
summary(test_data_target)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   2.000   2.431   4.000   7.000

Set Up XGBoost Parameters

Convert the data into xgb.DMatrix objects

xgb_train <- xgb.DMatrix(data = as.matrix(train_data_features), label = train_data_target)
xgb_test <- xgb.DMatrix(data = as.matrix(test_data_features), label = test_data_target)
# watchlist can be passed to xgb.train to monitor both sets during training
watchlist <- list(train = xgb_train, test = xgb_test)

Set Parameters

numberOfClasses <- 8
xgb_params <- list("objective" = "multi:softprob",
                   "eval_metric" = "mlogloss",
                   "num_class" = numberOfClasses)
nround <- 50   # number of boosting rounds
cv.nfold <- 5  # number of cross-validation folds
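"multi:softprob" makes predict() return one probability per class for every observation, which is why the prediction vector is reshaped into an n x num_class matrix further down. If only the predicted class label is needed, "multi:softmax" returns it directly; a hedged alternative parameter list (same settings otherwise):

# Alternative objective: multi:softmax returns the predicted class (0 .. num_class-1)
# directly, so no reshaping of the prediction vector is needed afterwards.
xgb_params_softmax <- list("objective" = "multi:softmax",
                           "eval_metric" = "mlogloss",
                           "num_class" = numberOfClasses)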

# Fit cv.nfold x nround XGBoost models and save the out-of-fold predictions
cv_model <- xgb.cv(params = xgb_params,
                   data = xgb_train,
                   nrounds = nround,
                   nfold = cv.nfold,
                   verbose = TRUE,
                   prediction = TRUE)
## [1]  train-mlogloss:1.319805+0.004075    test-mlogloss:1.358115+0.008540 
## [2]  train-mlogloss:1.029494+0.005227    test-mlogloss:1.092179+0.010931 
## [3]  train-mlogloss:0.844991+0.005279    test-mlogloss:0.927039+0.012804 
## [4]  train-mlogloss:0.714495+0.006615    test-mlogloss:0.812838+0.014576 
## [5]  train-mlogloss:0.619041+0.005925    test-mlogloss:0.730699+0.015956 
## [6]  train-mlogloss:0.547137+0.006397    test-mlogloss:0.671434+0.015803 
## [7]  train-mlogloss:0.492395+0.006781    test-mlogloss:0.627690+0.015533 
## [8]  train-mlogloss:0.448155+0.006666    test-mlogloss:0.593997+0.014408 
## [9]  train-mlogloss:0.413868+0.005808    test-mlogloss:0.569870+0.014959 
## [10] train-mlogloss:0.386275+0.005420    test-mlogloss:0.549870+0.015348 
## [11] train-mlogloss:0.364219+0.005412    test-mlogloss:0.535161+0.015661 
## [12] train-mlogloss:0.345796+0.005775    test-mlogloss:0.523927+0.015824 
## [13] train-mlogloss:0.329429+0.006115    test-mlogloss:0.515459+0.016175 
## [14] train-mlogloss:0.315362+0.006499    test-mlogloss:0.507455+0.016829 
## [15] train-mlogloss:0.303470+0.005907    test-mlogloss:0.500942+0.016268 
## [16] train-mlogloss:0.292128+0.006159    test-mlogloss:0.494985+0.017127 
## [17] train-mlogloss:0.281429+0.005931    test-mlogloss:0.491269+0.017220 
## [18] train-mlogloss:0.271050+0.006898    test-mlogloss:0.488895+0.017101 
## [19] train-mlogloss:0.262075+0.006672    test-mlogloss:0.485587+0.017047 
## [20] train-mlogloss:0.253898+0.007176    test-mlogloss:0.483378+0.017813 
## [21] train-mlogloss:0.244365+0.006303    test-mlogloss:0.482003+0.018849 
## [22] train-mlogloss:0.236098+0.005891    test-mlogloss:0.480513+0.017993 
## [23] train-mlogloss:0.228372+0.005258    test-mlogloss:0.479409+0.018352 
## [24] train-mlogloss:0.220965+0.005235    test-mlogloss:0.479195+0.018384 
## [25] train-mlogloss:0.213438+0.003685    test-mlogloss:0.478226+0.018997 
## [26] train-mlogloss:0.207318+0.003923    test-mlogloss:0.477943+0.019448 
## [27] train-mlogloss:0.201562+0.003822    test-mlogloss:0.477209+0.019870 
## [28] train-mlogloss:0.195435+0.004678    test-mlogloss:0.476335+0.019798 
## [29] train-mlogloss:0.189622+0.004539    test-mlogloss:0.475720+0.019945 
## [30] train-mlogloss:0.183982+0.003995    test-mlogloss:0.475191+0.020642 
## [31] train-mlogloss:0.179060+0.004468    test-mlogloss:0.475004+0.021404 
## [32] train-mlogloss:0.172530+0.004752    test-mlogloss:0.475041+0.022372 
## [33] train-mlogloss:0.167428+0.005088    test-mlogloss:0.474813+0.022213 
## [34] train-mlogloss:0.162674+0.005123    test-mlogloss:0.476026+0.022537 
## [35] train-mlogloss:0.158572+0.005416    test-mlogloss:0.476190+0.022502 
## [36] train-mlogloss:0.153760+0.005645    test-mlogloss:0.477072+0.022025 
## [37] train-mlogloss:0.149150+0.005638    test-mlogloss:0.476620+0.021345 
## [38] train-mlogloss:0.145249+0.004970    test-mlogloss:0.477113+0.021252 
## [39] train-mlogloss:0.140930+0.004443    test-mlogloss:0.477483+0.020920 
## [40] train-mlogloss:0.136871+0.003999    test-mlogloss:0.478410+0.021347 
## [41] train-mlogloss:0.133298+0.004499    test-mlogloss:0.478983+0.021485 
## [42] train-mlogloss:0.129151+0.004084    test-mlogloss:0.480155+0.022098 
## [43] train-mlogloss:0.124921+0.003372    test-mlogloss:0.479985+0.022240 
## [44] train-mlogloss:0.121168+0.003538    test-mlogloss:0.480608+0.022558 
## [45] train-mlogloss:0.117668+0.002858    test-mlogloss:0.480944+0.023269 
## [46] train-mlogloss:0.114460+0.002359    test-mlogloss:0.481381+0.023418 
## [47] train-mlogloss:0.111164+0.002797    test-mlogloss:0.482151+0.023971 
## [48] train-mlogloss:0.108022+0.002695    test-mlogloss:0.483023+0.024746 
## [49] train-mlogloss:0.105163+0.002764    test-mlogloss:0.484129+0.025220 
## [50] train-mlogloss:0.102158+0.001789    test-mlogloss:0.485882+0.025082
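The cross-validation log shows mean test mlogloss bottoming out around iteration 33 and then creeping back up while train mlogloss keeps falling, so training for the full 50 rounds slightly overfits. A minimal sketch to extract the best round from the evaluation_log that xgb.cv returns (column names assume the default mlogloss metric used above):

# Round with the lowest mean test mlogloss across the CV folds
best_round <- which.min(cv_model$evaluation_log$test_mlogloss_mean)
best_round
cv_model$evaluation_log[best_round, ]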
# Out-of-fold predictions: max_prob = 1-based predicted class, label = 1-based actual class
prediction <- data.frame(cv_model$pred) %>%
  mutate(max_prob = max.col(., ties.method = "last"),
         label = train_data_target + 1)
head(prediction, 8)

Confusion Matrix

confusionMatrix(factor(prediction$max_prob),
                factor(prediction$label),
                mode = "everything")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2    3    4    5    6    7    8
##          1 1302    0    0    0    0   27   83  125
##          2    0  629    0    2   20    0    1    0
##          3    0    0  521   69    0    1    0    0
##          4    0    3   23  512    8   53    0    0
##          5    0   18    0   14  518    0    0    0
##          6   14    0    0   90    0  400   55   34
##          7   38    0    0    0    0   41  156   50
##          8   45    0    0    0    0   31   33  130
## 
## Overall Statistics
##                                           
##                Accuracy : 0.826           
##                  95% CI : (0.8153, 0.8364)
##     No Information Rate : 0.2772          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7925          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity            0.9307   0.9677   0.9577   0.7453   0.9487  0.72333
## Specificity            0.9356   0.9948   0.9845   0.9800   0.9929  0.95704
## Pos Pred Value         0.8471   0.9647   0.8816   0.8548   0.9418  0.67454
## Neg Pred Value         0.9724   0.9952   0.9948   0.9606   0.9938  0.96564
## Precision              0.8471   0.9647   0.8816   0.8548   0.9418  0.67454
## Recall                 0.9307   0.9677   0.9577   0.7453   0.9487  0.72333
## F1                     0.8869   0.9662   0.9181   0.7963   0.9453  0.69808
## Prevalence             0.2772   0.1288   0.1078   0.1361   0.1082  0.10959
## Detection Rate         0.2580   0.1247   0.1033   0.1015   0.1027  0.07927
## Detection Prevalence   0.3046   0.1292   0.1171   0.1187   0.1090  0.11752
## Balanced Accuracy      0.9331   0.9812   0.9711   0.8627   0.9708  0.84019
##                      Class: 7 Class: 8
## Sensitivity           0.47561  0.38348
## Specificity           0.97266  0.97684
## Pos Pred Value        0.54737  0.54393
## Neg Pred Value        0.96387  0.95652
## Precision             0.54737  0.54393
## Recall                0.47561  0.38348
## F1                    0.50897  0.44983
## Prevalence            0.06500  0.06718
## Detection Rate        0.03092  0.02576
## Detection Prevalence  0.05648  0.04736
## Balanced Accuracy     0.72413  0.68016

Train Model with Hyperparameters

GS_T0 <- Sys.time()
xgboost_model <- xgb.train(data = xgb_train,
                           params = xgb_params,
                           nrounds = nround)
GS_T1 <- Sys.time()
training_time <- GS_T1 - GS_T0
# Training time
training_time
## Time difference of 1.523358 secs
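Note that the watchlist built earlier is never passed to xgb.train above and no early stopping is used, so the model trains for the full 50 rounds even though cross-validation suggested stopping earlier. A hedged sketch of the same call with per-round monitoring and early stopping (early_stopping_rounds = 10 is an illustrative value, not a tuned one):

# Sketch: evaluate on both sets each round and stop once test mlogloss has not
# improved for 10 consecutive rounds (10 is an arbitrary example value)
xgboost_model_es <- xgb.train(data = xgb_train,
                              params = xgb_params,
                              nrounds = nround,
                              watchlist = watchlist,
                              early_stopping_rounds = 10,
                              verbose = 0)
xgboost_model_es$best_iteration  # populated when early stopping is used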

Save XGBoost model

# NULL because the model above was trained without early stopping
xgboost_model$best_iteration
## NULL
filename <- 'xgboost_model.rds'
saveRDS(xgboost_model,file=filename)
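The saved model can be read back with readRDS(filename). The xgboost package also offers xgb.save()/xgb.load(), a binary format that tends to be more portable across package versions; a small sketch (the .bin filename is an arbitrary choice):

# Reload the model saved above
xgboost_model_reloaded <- readRDS(filename)

# Alternative: xgboost's native binary format
xgb.save(xgboost_model, "xgboost_model.bin")
# xgboost_model_bin <- xgb.load("xgboost_model.bin")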

Create a Confusion Matrix for Evaluation

Predictions

xgb_predictions <- predict(xgboost_model, as.matrix(test_data_features))
# Reshape the flat probability vector into an n x numberOfClasses matrix,
# then take the most probable class for each row
xgb_test_prediction <- matrix(xgb_predictions, nrow = numberOfClasses,
                              ncol = length(xgb_predictions) / numberOfClasses) %>%
  t() %>%
  data.frame() %>%
  mutate(label = test_data_target + 1,
         max_prob = max.col(., "last"))
# confusion matrix of test set
confusionMatrix(factor(xgb_test_prediction$max_prob),
                factor(xgb_test_prediction$label),
                mode = "everything")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   2   3   4   5   6   7   8
##          1 379   0   0   0   0   9  27  50
##          2   0 136   0   2   5   0   0   0
##          3   0   0 138  17   0   0   0   0
##          4   0   1   1 131   0  13   0   0
##          5   0   6   0   1 130   0   0   0
##          6   8   0   0  20   0  76  10  15
##          7   6   0   0   0   0  16  22   6
##          8   7   0   0   0   0   4   6  15
## 
## Overall Statistics
##                                          
##                Accuracy : 0.817          
##                  95% CI : (0.7945, 0.838)
##     No Information Rate : 0.3182         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.7754         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity            0.9475   0.9510   0.9928   0.7661   0.9630  0.64407
## Specificity            0.8996   0.9937   0.9848   0.9862   0.9938  0.95347
## Pos Pred Value         0.8151   0.9510   0.8903   0.8973   0.9489  0.58915
## Neg Pred Value         0.9735   0.9937   0.9991   0.9640   0.9955  0.96277
## Precision              0.8151   0.9510   0.8903   0.8973   0.9489  0.58915
## Recall                 0.9475   0.9510   0.9928   0.7661   0.9630  0.64407
## F1                     0.8763   0.9510   0.9388   0.8265   0.9559  0.61538
## Prevalence             0.3182   0.1138   0.1106   0.1360   0.1074  0.09387
## Detection Rate         0.3015   0.1082   0.1098   0.1042   0.1034  0.06046
## Detection Prevalence   0.3699   0.1138   0.1233   0.1161   0.1090  0.10263
## Balanced Accuracy      0.9236   0.9724   0.9888   0.8761   0.9784  0.79877
##                      Class: 7 Class: 8
## Sensitivity           0.33846  0.17442
## Specificity           0.97651  0.98548
## Pos Pred Value        0.44000  0.46875
## Neg Pred Value        0.96437  0.94204
## Precision             0.44000  0.46875
## Recall                0.33846  0.17442
## F1                    0.38261  0.25424
## Prevalence            0.05171  0.06842
## Detection Rate        0.01750  0.01193
## Detection Prevalence  0.03978  0.02546
## Balanced Accuracy     0.65749  0.57995
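Overall accuracy on the test set is about 0.82, but classes 7 and 8 are predicted poorly (F1 around 0.38 and 0.25), so a class-balanced summary is informative. A minimal sketch of a macro-averaged F1 using the byClass table that confusionMatrix returns (cm_test is a name introduced here for illustration):

# Macro-averaged F1: unweighted mean of the per-class F1 scores,
# so rare classes weigh as much as frequent ones
cm_test <- confusionMatrix(factor(xgb_test_prediction$max_prob),
                           factor(xgb_test_prediction$label),
                           mode = "everything")
mean(cm_test$byClass[, "F1"], na.rm = TRUE)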

Plot XGBoost model

names <- colnames(train_data[, -1]) # note: masks base::names() and is not used below
#xgboost_model = xgboost(data=xgb_train, max.depth = 3, nrounds=1355,eta = 0.01,lambda = 0.01,alpha=0.5,verbose =0)
summary(xgboost_model)
##               Length Class              Mode       
## handle             1 xgb.Booster.handle externalptr
## raw           767180 -none-             raw        
## niter              1 -none-             numeric    
## call               4 -none-             call       
## params             4 -none-             list       
## callbacks          1 -none-             list       
## feature_names      7 -none-             character  
## nfeatures          1 -none-             numeric
xgb.plot.tree(model = xgboost_model, trees = 1)
tree_plot <- xgb.plot.tree(model = xgboost_model, trees = 1, plot_width = 1000, 
                           plot_height = 1000, render = FALSE)
export_graph(tree_plot, "xgboost_tree_plot1.pdf",width = 1000, height = 1000)
#xgb.dump(xgboost_model, with_stats = TRUE)
xgb.plot.deepness(
  model = xgboost_model,
  which = c("2x1", "max.depth","med.depth","med.weight")
)

p <- xgb.plot.multi.trees(model = xgboost_model, 
                          features_keep = 5, 
                          feature_names = NULL,
                          fill=TRUE)
## Column 2 ['No'] of item 2 is missing in item 1. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names. use.names='check' (default from v1.12.2) emits this message and proceeds as if use.names=FALSE for  backwards compatibility. See news item 5 in v1.12.2 for options to control this message.
p
importance_matrix <- xgb.importance(colnames(train_data_features), model = xgboost_model)
head(importance_matrix)
importance_matrix
xgb.plot.importance(importance_matrix)

gp <- xgb.ggplot.importance(importance_matrix)
print(gp)
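The importance table ranks features by Gain (the average improvement in the loss from splits on that feature), with Cover and Frequency as secondary measures. A small sketch that keeps only the highest-Gain features before plotting (the cutoff of 5 is arbitrary):

# Keep the five features with the highest Gain and plot only those
top_features <- head(importance_matrix[order(-importance_matrix$Gain), ], 5)
xgb.plot.importance(top_features)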