Set Up the Environment and Prepare the Data

Load libraries

library(conflicted)
library(dplyr)
library(tidyr)
library(ggplot2)
library(DiagrammeR)
library(DiagrammeRsvg)
library(caret)
## Loading required package: lattice
library(tree)

Load CSV files

# Read the prepared training and test sets (paths are relative to the working
# directory). Strings are kept as character vectors, not converted to factors.
train_data <- read.csv(
  file = "sensor_prepare_train_data.csv",
  stringsAsFactors = FALSE
)
test_data <- read.csv(
  file = "sensor_prepare_test_data.csv",
  stringsAsFactors = FALSE
)

Set Features and Target Variable

# Names of the predictor columns used by the model.
features_column <- c(
  "Seismic_Max", "Seismic_Min", "Seismic_Mean", "Seismic_Range",
  "Gyro_Gx", "Gyro_Gy", "Gyro_Gz"
)
# Name of the response column.
target_column <- "Event_Actual"

แบ่งข้อมูลตัวแปรต้นสำหรับการฝึกสอน

Train data features

# Keep only the predictor columns of the training set, then print a
# per-column summary of them.
train_data_features <- train_data[features_column]
summary(train_data_features)
##   Seismic_Max       Seismic_Min        Seismic_Mean     Seismic_Range   
##  Min.   :0.02802   Min.   :-3.59999   Min.   :-0.3224   Min.   :0.7593  
##  1st Qu.:2.75628   1st Qu.:-3.43122   1st Qu.: 0.6112   1st Qu.:5.5405  
##  Median :3.20628   Median :-3.17808   Median : 0.7429   Median :6.1985  
##  Mean   :3.06429   Mean   :-2.88286   Mean   : 0.7419   Mean   :5.9472  
##  3rd Qu.:3.45941   3rd Qu.:-2.64375   3rd Qu.: 0.8763   3rd Qu.:6.6656  
##  Max.   :3.59989   Max.   : 0.00924   Max.   : 1.5837   Max.   :7.1999  
##     Gyro_Gx           Gyro_Gy            Gyro_Gz       
##  Min.   :-9.7710   Min.   :-10.4733   Min.   :  7.809  
##  1st Qu.:-7.6641   1st Qu.: -8.9160   1st Qu.:125.008  
##  Median :-2.7634   Median : -2.3206   Median :125.557  
##  Mean   :-2.9722   Mean   : -2.6277   Mean   :125.496  
##  3rd Qu.: 0.7634   3rd Qu.:  0.8855   3rd Qu.:126.107  
##  Max.   : 8.7328   Max.   : 13.0076   Max.   :128.489

กำหนดตัวแปรตามสำหรับการฝึกสอน

Train data target

# Pull the response column out of the training set as a plain vector, then
# print its summary.
train_data_target <- train_data[[target_column]]
summary(train_data_target)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   2.000   2.594   4.000   7.000

แบ่งข้อมูลตัวแปรต้นสำหรับการทดสอบ

Test data features

# Keep only the predictor columns of the test set, then print a per-column
# summary of them.
test_data_features <- test_data[features_column]
summary(test_data_features)
##   Seismic_Max      Seismic_Min        Seismic_Mean      Seismic_Range  
##  Min.   :0.5062   Min.   :-3.60000   Min.   :-0.07793   Min.   :1.266  
##  1st Qu.:2.7844   1st Qu.:-3.43122   1st Qu.: 0.62230   1st Qu.:5.513  
##  Median :3.2063   Median :-3.17809   Median : 0.74292   Median :6.188  
##  Mean   :3.0661   Mean   :-2.84109   Mean   : 0.74299   Mean   :5.907  
##  3rd Qu.:3.4594   3rd Qu.:-2.67184   3rd Qu.: 0.87471   3rd Qu.:6.638  
##  Max.   :3.5999   Max.   :-0.02807   Max.   : 1.38095   Max.   :7.172  
##     Gyro_Gx           Gyro_Gy            Gyro_Gz     
##  Min.   :-9.2824   Min.   :-10.5038   Min.   :123.3  
##  1st Qu.:-7.6336   1st Qu.: -9.0687   1st Qu.:125.1  
##  Median :-3.8473   Median : -2.5038   Median :125.6  
##  Mean   :-3.0997   Mean   : -2.8223   Mean   :125.7  
##  3rd Qu.: 0.7634   3rd Qu.:  0.7634   3rd Qu.:126.2  
##  Max.   : 8.3359   Max.   : 12.7634   Max.   :128.2

กำหนดตัวแปรตามสำหรับการทดสอบ

Test data target

# Pull the response column out of the test set as a plain vector, then print
# its summary.
test_data_target <- test_data[[target_column]]
summary(test_data_target)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   2.000   2.431   4.000   7.000

Convert the target variable to a factor

# Recode the training labels as a factor in place, keep a standalone copy of
# them, and print the number of rows per class.
train_data[["Event_Actual"]] <- as.factor(train_data[["Event_Actual"]])
train_data_target <- train_data[["Event_Actual"]]
table(train_data_target)
## train_data_target
##    0    1    2    3    4    5    6    7 
## 1399  650  544  687  546  553  328  339
# Recode the test labels as a factor that shares the training label levels.
# Deriving the levels from the test file alone would drop any class that has
# no test rows, leaving the reference factor with fewer levels than the
# model's predictions when the confusion matrix is built.
# Labels that occur only in the test file are appended rather than lost.
event_levels <- union(
  levels(train_data$Event_Actual),
  levels(as.factor(test_data$Event_Actual))
)
test_data$Event_Actual <- factor(test_data$Event_Actual, levels = event_levels)
test_data_target <- test_data$Event_Actual
# Number of test rows per class.
table(test_data_target)
## test_data_target
##   0   1   2   3   4   5   6   7 
## 400 143 139 171 135 118  65  86

Train Decision Tree

# Start the wall-clock timer.
GS_T0 <- Sys.time()
# Fit a classification tree on the seven sensor features. `method = "class"`
# is stated explicitly instead of letting rpart guess it from the response
# type, because the later `predict(..., type = "class")` calls require a
# classification tree.
dt_model <- rpart::rpart(
  Event_Actual ~ Seismic_Max + Seismic_Min + Seismic_Mean + Seismic_Range +
    Gyro_Gx + Gyro_Gy + Gyro_Gz,
  data = train_data,
  method = "class"
)
# Show the fitted splits.
print(dt_model)
## n= 5046 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 5046 3647 0 (0.28 0.13 0.11 0.14 0.11 0.11 0.065 0.067)  
##     2) Gyro_Gy< -8.412214 1990  622 0 (0.69 0 0 0 0 0.058 0.11 0.14) *
##     3) Gyro_Gy>=-8.412214 3056 2369 3 (0.01 0.21 0.18 0.22 0.18 0.14 0.035 0.017)  
##       6) Gyro_Gy>=5.694656 932  425 3 (0 0 0.46 0.54 0 0 0 0)  
##        12) Gyro_Gy< 8.122137 496   77 2 (0 0 0.84 0.16 0 0 0 0) *
##        13) Gyro_Gy>=8.122137 436    6 3 (0 0 0.014 0.99 0 0 0 0) *
##       7) Gyro_Gy< 5.694656 2124 1474 1 (0.015 0.31 0.056 0.085 0.26 0.21 0.051 0.025)  
##        14) Gyro_Gy>=-1.358779 1350  700 1 (0 0.48 0.088 0.03 0.39 0.011 0.00074 0)  
##          28) Gyro_Gy< 2.610687 1052  402 1 (0 0.62 0.11 0.039 0.22 0.013 0.00095 0)  
##            56) Gyro_Gx< -0.6259542 435    1 1 (0 1 0 0 0 0 0.0023 0) *
##            57) Gyro_Gx>=-0.6259542 617  390 4 (0 0.35 0.19 0.066 0.37 0.023 0 0)  
##             114) Gyro_Gx< 3.022901 135   16 2 (0 0 0.88 0.015 0 0.1 0 0) *
##             115) Gyro_Gx>=3.022901 482  255 4 (0 0.45 0 0.081 0.47 0 0 0)  
##               230) Gyro_Gy>=-0.1984733 245   44 1 (0 0.82 0 0.094 0.086 0 0 0) *
##               231) Gyro_Gy< -0.1984733 237   31 4 (0 0.063 0 0.068 0.87 0 0 0) *
##          29) Gyro_Gy>=2.610687 298    1 4 (0 0 0 0 1 0.0034 0 0) *
##        15) Gyro_Gy< -1.358779 774  352 5 (0.04 0 0 0.18 0.028 0.55 0.14 0.068) *
# Stop the timer. This stamp is taken after print(dt_model), so the interval
# below covers printing the tree as well as fitting it.
GS_T1 <- Sys.time()
training_time <- GS_T1 - GS_T0
# Time spent on training
training_time
## Time difference of 0.09727001 secs

Save decision tree model

# Write the fitted tree to an RDS file so it can be reloaded with readRDS().
filename <- "dt_model.rds"
saveRDS(object = dt_model, file = filename)

Create confusion matrices for evaluation

Predictions on the training data (sanity check)

# Predict class labels for the training rows, then cross-tabulate them against
# the true labels. The outer parentheses print the table as it is assigned.
dt_predictions <- predict(
  object = dt_model,
  newdata = train_data_features,
  type = "class"
)
(tab1 <- table(dt_predictions, train_data_target))
##               train_data_target
## dt_predictions    0    1    2    3    4    5    6    7
##              0 1368    0    0    0    0  116  220  286
##              1    0  635    0   23   21    0    1    0
##              2    0    0  538   79    0   14    0    0
##              3    0    0    6  430    0    0    0    0
##              4    0   15    0   16  503    1    0    0
##              5   31    0    0  139   22  422  107   53
##              6    0    0    0    0    0    0    0    0
##              7    0    0    0    0    0    0    0    0
# Overall and per-class metrics on the training data. The labels have eight
# levels ("0" to "7") and none of them is "y", so no `positive` class is
# passed; that argument is only meaningful for two-class problems.
confusionMatrix(data = dt_predictions, reference = train_data_target)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1    2    3    4    5    6    7
##          0 1368    0    0    0    0  116  220  286
##          1    0  635    0   23   21    0    1    0
##          2    0    0  538   79    0   14    0    0
##          3    0    0    6  430    0    0    0    0
##          4    0   15    0   16  503    1    0    0
##          5   31    0    0  139   22  422  107   53
##          6    0    0    0    0    0    0    0    0
##          7    0    0    0    0    0    0    0    0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7721          
##                  95% CI : (0.7603, 0.7836)
##     No Information Rate : 0.2772          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.722           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity            0.9778   0.9769   0.9890  0.62591  0.92125  0.76311
## Specificity            0.8294   0.9898   0.9793  0.99862  0.99289  0.92166
## Pos Pred Value         0.6874   0.9338   0.8526  0.98624  0.94019  0.54522
## Neg Pred Value         0.9899   0.9966   0.9986  0.94425  0.99047  0.96934
## Prevalence             0.2772   0.1288   0.1078  0.13615  0.10820  0.10959
## Detection Rate         0.2711   0.1258   0.1066  0.08522  0.09968  0.08363
## Detection Prevalence   0.3944   0.1348   0.1250  0.08641  0.10602  0.15339
## Balanced Accuracy      0.9036   0.9833   0.9842  0.81227  0.95707  0.84238
##                      Class: 6 Class: 7
## Sensitivity             0.000  0.00000
## Specificity             1.000  1.00000
## Pos Pred Value            NaN      NaN
## Neg Pred Value          0.935  0.93282
## Prevalence              0.065  0.06718
## Detection Rate          0.000  0.00000
## Detection Prevalence    0.000  0.00000
## Balanced Accuracy       0.500  0.50000

Predictions on the test data (for assessment)

# Predict class labels for the held-out test rows. This overwrites the
# training-set predictions stored under the same name.
dt_predictions <- predict(dt_model, test_data_features, type = "class")

# Overall and per-class metrics on the test data. The labels have eight
# levels ("0" to "7") and none of them is "y", so no `positive` class is
# passed; that argument is only meaningful for two-class problems.
confusionMatrix(data = dt_predictions, reference = test_data_target)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1   2   3   4   5   6   7
##          0 392   0   0   0   0  24  43  74
##          1   0 138   0   8   4   1   0   0
##          2   0   0 139  17   0   0   0   0
##          3   0   0   0 108   0   0   0   0
##          4   0   5   0   0 127   0   0   0
##          5   8   0   0  38   4  93  22  12
##          6   0   0   0   0   0   0   0   0
##          7   0   0   0   0   0   0   0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7932          
##                  95% CI : (0.7697, 0.8152)
##     No Information Rate : 0.3182          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7419          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity            0.9800   0.9650   1.0000  0.63158   0.9407  0.78814
## Specificity            0.8355   0.9883   0.9848  1.00000   0.9955  0.92625
## Pos Pred Value         0.7355   0.9139   0.8910  1.00000   0.9621  0.52542
## Neg Pred Value         0.9890   0.9955   1.0000  0.94517   0.9929  0.97685
## Prevalence             0.3182   0.1138   0.1106  0.13604   0.1074  0.09387
## Detection Rate         0.3119   0.1098   0.1106  0.08592   0.1010  0.07399
## Detection Prevalence   0.4240   0.1201   0.1241  0.08592   0.1050  0.14081
## Balanced Accuracy      0.9077   0.9767   0.9924  0.81579   0.9681  0.85719
##                      Class: 6 Class: 7
## Sensitivity           0.00000  0.00000
## Specificity           1.00000  1.00000
## Pos Pred Value            NaN      NaN
## Neg Pred Value        0.94829  0.93158
## Prevalence            0.05171  0.06842
## Detection Rate        0.00000  0.00000
## Detection Prevalence  0.00000  0.00000
## Balanced Accuracy     0.50000  0.50000
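# 1 - sum(diag(tab2)) / sum(tab2)  # Test error rate; NOTE: tab2 is never created above (only tab1 is), so build it with table() before enabling this line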

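Test data accuracy is 79.32%. Note that the tree never predicts classes 6 and 7 (their sensitivity is 0 on both the training and test data), so the overall accuracy hides a complete miss on those two classes.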