Set up the environment and prepare the data

Load libraries

library(conflicted)
library(dplyr)
library(tidyr)
library(ggplot2)
library(DiagrammeR)
library(DiagrammeRsvg)
library(caret)

## Loading required package: lattice

library(tree)

Load csv file

# Read the training and test CSV files; text columns are kept as character
# vectors rather than being converted to factors.
train_data <- read.csv(
  file = "sensor_prepare_train_data.csv",
  stringsAsFactors = FALSE
)
test_data <- read.csv(
  file = "sensor_prepare_test_data.csv",
  stringsAsFactors = FALSE
)

Set Features and Target Variable

# Names of the seven predictor columns and of the single response column.
features_column <- c(
  "Seismic_Max", "Seismic_Min", "Seismic_Mean", "Seismic_Range",
  "Gyro_Gx", "Gyro_Gy", "Gyro_Gz"
)
target_column <- "Event_Actual"

แบ่งข้อมูลตัวแปรต้นสำหรับการฝึกสอน

Train data features

# Keep only the predictor columns of the training set (still a data frame).
train_data_features <- train_data[features_column]
# Per-column summary statistics of the training predictors.
summary(train_data_features)

##   Seismic_Max       Seismic_Min        Seismic_Mean     Seismic_Range   
##  Min.   :0.02802   Min.   :-3.59999   Min.   :-0.3224   Min.   :0.7593  
##  1st Qu.:2.75628   1st Qu.:-3.43122   1st Qu.: 0.6112   1st Qu.:5.5405  
##  Median :3.20628   Median :-3.17808   Median : 0.7429   Median :6.1985  
##  Mean   :3.06429   Mean   :-2.88286   Mean   : 0.7419   Mean   :5.9472  
##  3rd Qu.:3.45941   3rd Qu.:-2.64375   3rd Qu.: 0.8763   3rd Qu.:6.6656  
##  Max.   :3.59989   Max.   : 0.00924   Max.   : 1.5837   Max.   :7.1999  
##     Gyro_Gx           Gyro_Gy            Gyro_Gz       
##  Min.   :-9.7710   Min.   :-10.4733   Min.   :  7.809  
##  1st Qu.:-7.6641   1st Qu.: -8.9160   1st Qu.:125.008  
##  Median :-2.7634   Median : -2.3206   Median :125.557  
##  Mean   :-2.9722   Mean   : -2.6277   Mean   :125.496  
##  3rd Qu.: 0.7634   3rd Qu.:  0.8855   3rd Qu.:126.107  
##  Max.   : 8.7328   Max.   : 13.0076   Max.   :128.489

กำหนดตัวแปรตามสำหรับการฝึกสอน

Train data target

# Extract the response column of the training set as a plain vector.
train_data_target <- train_data[[target_column]]
# Summary statistics of the training response.
summary(train_data_target)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   2.000   2.594   4.000   7.000

แบ่งข้อมูลตัวแปรต้นสำหรับการทดสอบ

Test data features

# Keep only the predictor columns of the test set (still a data frame).
test_data_features <- test_data[features_column]
# Per-column summary statistics of the test predictors.
summary(test_data_features)

##   Seismic_Max      Seismic_Min        Seismic_Mean      Seismic_Range  
##  Min.   :0.5062   Min.   :-3.60000   Min.   :-0.07793   Min.   :1.266  
##  1st Qu.:2.7844   1st Qu.:-3.43122   1st Qu.: 0.62230   1st Qu.:5.513  
##  Median :3.2063   Median :-3.17809   Median : 0.74292   Median :6.188  
##  Mean   :3.0661   Mean   :-2.84109   Mean   : 0.74299   Mean   :5.907  
##  3rd Qu.:3.4594   3rd Qu.:-2.67184   3rd Qu.: 0.87471   3rd Qu.:6.638  
##  Max.   :3.5999   Max.   :-0.02807   Max.   : 1.38095   Max.   :7.172  
##     Gyro_Gx           Gyro_Gy            Gyro_Gz     
##  Min.   :-9.2824   Min.   :-10.5038   Min.   :123.3  
##  1st Qu.:-7.6336   1st Qu.: -9.0687   1st Qu.:125.1  
##  Median :-3.8473   Median : -2.5038   Median :125.6  
##  Mean   :-3.0997   Mean   : -2.8223   Mean   :125.7  
##  3rd Qu.: 0.7634   3rd Qu.:  0.7634   3rd Qu.:126.2  
##  Max.   : 8.3359   Max.   : 12.7634   Max.   :128.2

กำหนดตัวแปรตามสำหรับการทดสอบ

Test data target

# Extract the response column of the test set as a plain vector.
test_data_target <- test_data[[target_column]]
# Summary statistics of the test response.
summary(test_data_target)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   2.000   2.431   4.000   7.000

Convert the target variable to a factor

# Recode the training response as a factor in place; the stand-alone target
# vector is then that same factor column.
train_data$Event_Actual <- as.factor(train_data$Event_Actual)
train_data_target <- train_data$Event_Actual
# Number of training rows per event class.
table(train_data_target)

## train_data_target
##    0    1    2    3    4    5    6    7 
## 1399  650  544  687  546  553  328  339

# Recode the test response as a factor in place; the stand-alone target
# vector is then that same factor column.
test_data$Event_Actual <- as.factor(test_data$Event_Actual)
test_data_target <- test_data$Event_Actual
# Number of test rows per event class.
table(test_data_target)

## test_data_target
##   0   1   2   3   4   5   6   7 
## 400 143 139 171 135 118  65  86

Train Decision Tree

# Record the start time, fit an rpart tree of Event_Actual on the seven
# sensor features using the training data, then print the fitted tree.
# NOTE: GS_T1 is only captured after print(), so the elapsed time reported
# later covers printing the tree as well as fitting it.
GS_T0 <- Sys.time()
dt_model <- rpart::rpart(
  Event_Actual ~ Seismic_Max + Seismic_Min + Seismic_Mean + Seismic_Range +
    Gyro_Gx + Gyro_Gy + Gyro_Gz,
  data = train_data
)
print(dt_model)

## n= 5046 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 5046 3647 0 (0.28 0.13 0.11 0.14 0.11 0.11 0.065 0.067)  
##     2) Gyro_Gy< -8.412214 1990  622 0 (0.69 0 0 0 0 0.058 0.11 0.14) *
##     3) Gyro_Gy>=-8.412214 3056 2369 3 (0.01 0.21 0.18 0.22 0.18 0.14 0.035 0.017)  
##       6) Gyro_Gy>=5.694656 932  425 3 (0 0 0.46 0.54 0 0 0 0)  
##        12) Gyro_Gy< 8.122137 496   77 2 (0 0 0.84 0.16 0 0 0 0) *
##        13) Gyro_Gy>=8.122137 436    6 3 (0 0 0.014 0.99 0 0 0 0) *
##       7) Gyro_Gy< 5.694656 2124 1474 1 (0.015 0.31 0.056 0.085 0.26 0.21 0.051 0.025)  
##        14) Gyro_Gy>=-1.358779 1350  700 1 (0 0.48 0.088 0.03 0.39 0.011 0.00074 0)  
##          28) Gyro_Gy< 2.610687 1052  402 1 (0 0.62 0.11 0.039 0.22 0.013 0.00095 0)  
##            56) Gyro_Gx< -0.6259542 435    1 1 (0 1 0 0 0 0 0.0023 0) *
##            57) Gyro_Gx>=-0.6259542 617  390 4 (0 0.35 0.19 0.066 0.37 0.023 0 0)  
##             114) Gyro_Gx< 3.022901 135   16 2 (0 0 0.88 0.015 0 0.1 0 0) *
##             115) Gyro_Gx>=3.022901 482  255 4 (0 0.45 0 0.081 0.47 0 0 0)  
##               230) Gyro_Gy>=-0.1984733 245   44 1 (0 0.82 0 0.094 0.086 0 0 0) *
##               231) Gyro_Gy< -0.1984733 237   31 4 (0 0.063 0 0.068 0.87 0 0 0) *
##          29) Gyro_Gy>=2.610687 298    1 4 (0 0 0 0 1 0.0034 0 0) *
##        15) Gyro_Gy< -1.358779 774  352 5 (0.04 0 0 0.18 0.028 0.55 0.14 0.068) *

# Capture the end time and compute the elapsed time since GS_T0.
GS_T1 <- Sys.time()
training_time <- GS_T1 - GS_T0
# Time spent on training
training_time

## Time difference of 0.09727001 secs

Save decision tree model

# Serialise the fitted tree to an RDS file so it can be reloaded with readRDS().
filename <- "dt_model.rds"
saveRDS(object = dt_model, file = filename)

Create a confusion matrix for evaluation

Predictions of train data (for re-check)

# Predict a class label for every training row, then cross-tabulate the
# predictions (rows) against the actual training classes (columns).
dt_predictions <- predict(
  dt_model,
  newdata = train_data_features,
  type = "class"
)
tab1 <- table(dt_predictions, train_data_target)
tab1

##               train_data_target
## dt_predictions    0    1    2    3    4    5    6    7
##              0 1368    0    0    0    0  116  220  286
##              1    0  635    0   23   21    0    1    0
##              2    0    0  538   79    0   14    0    0
##              3    0    0    6  430    0    0    0    0
##              4    0   15    0   16  503    1    0    0
##              5   31    0    0  139   22  422  107   53
##              6    0    0    0    0    0    0    0    0
##              7    0    0    0    0    0    0    0    0

# Confusion matrix and overall/per-class statistics for the training
# predictions (first argument) against the actual training classes.
# `positive` is not supplied: the response has eight classes (0-7), none of
# which is "y", and caret only uses `positive` for two-class problems.
confusionMatrix(data = dt_predictions, reference = train_data_target)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1    2    3    4    5    6    7
##          0 1368    0    0    0    0  116  220  286
##          1    0  635    0   23   21    0    1    0
##          2    0    0  538   79    0   14    0    0
##          3    0    0    6  430    0    0    0    0
##          4    0   15    0   16  503    1    0    0
##          5   31    0    0  139   22  422  107   53
##          6    0    0    0    0    0    0    0    0
##          7    0    0    0    0    0    0    0    0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7721          
##                  95% CI : (0.7603, 0.7836)
##     No Information Rate : 0.2772          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.722           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity            0.9778   0.9769   0.9890  0.62591  0.92125  0.76311
## Specificity            0.8294   0.9898   0.9793  0.99862  0.99289  0.92166
## Pos Pred Value         0.6874   0.9338   0.8526  0.98624  0.94019  0.54522
## Neg Pred Value         0.9899   0.9966   0.9986  0.94425  0.99047  0.96934
## Prevalence             0.2772   0.1288   0.1078  0.13615  0.10820  0.10959
## Detection Rate         0.2711   0.1258   0.1066  0.08522  0.09968  0.08363
## Detection Prevalence   0.3944   0.1348   0.1250  0.08641  0.10602  0.15339
## Balanced Accuracy      0.9036   0.9833   0.9842  0.81227  0.95707  0.84238
##                      Class: 6 Class: 7
## Sensitivity             0.000  0.00000
## Specificity             1.000  1.00000
## Pos Pred Value            NaN      NaN
## Neg Pred Value          0.935  0.93282
## Prevalence              0.065  0.06718
## Detection Rate          0.000  0.00000
## Detection Prevalence    0.000  0.00000
## Balanced Accuracy       0.500  0.50000

Predictions of test data (for assessment)

# Predict a class label for every test row; this overwrites the training
# predictions previously stored in dt_predictions.
dt_predictions <- predict(
  dt_model,
  newdata = test_data_features,
  type = "class"
)

# Confusion matrix and overall/per-class statistics on the test set.
# `positive` is not supplied: the response has eight classes (0-7), none of
# which is "y", and caret only uses `positive` for two-class problems.
confusionMatrix(data = dt_predictions, reference = test_data_target)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1   2   3   4   5   6   7
##          0 392   0   0   0   0  24  43  74
##          1   0 138   0   8   4   1   0   0
##          2   0   0 139  17   0   0   0   0
##          3   0   0   0 108   0   0   0   0
##          4   0   5   0   0 127   0   0   0
##          5   8   0   0  38   4  93  22  12
##          6   0   0   0   0   0   0   0   0
##          7   0   0   0   0   0   0   0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7932          
##                  95% CI : (0.7697, 0.8152)
##     No Information Rate : 0.3182          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7419          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity            0.9800   0.9650   1.0000  0.63158   0.9407  0.78814
## Specificity            0.8355   0.9883   0.9848  1.00000   0.9955  0.92625
## Pos Pred Value         0.7355   0.9139   0.8910  1.00000   0.9621  0.52542
## Neg Pred Value         0.9890   0.9955   1.0000  0.94517   0.9929  0.97685
## Prevalence             0.3182   0.1138   0.1106  0.13604   0.1074  0.09387
## Detection Rate         0.3119   0.1098   0.1106  0.08592   0.1010  0.07399
## Detection Prevalence   0.4240   0.1201   0.1241  0.08592   0.1050  0.14081
## Balanced Accuracy      0.9077   0.9767   0.9924  0.81579   0.9681  0.85719
##                      Class: 6 Class: 7
## Sensitivity           0.00000  0.00000
## Specificity           1.00000  1.00000
## Pos Pred Value            NaN      NaN
## Neg Pred Value        0.94829  0.93158
## Prevalence            0.05171  0.06842
## Detection Rate        0.00000  0.00000
## Detection Prevalence  0.00000  0.00000
## Balanced Accuracy     0.50000  0.50000

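# Misclassification error rate on the test data. `tab2` is not defined in the
# code shown; create it first, e.g. tab2 <- table(dt_predictions, test_data_target).
# 1 - sum(diag(tab2)) / sum(tab2)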

Test data accuracy is 79.32%

Machine Learning (Decision Tree) For Smart Alert Sensor

Surasak Popwandee

2024-01-27

Set up the environment and prepare the data

Load libraries

Load csv file

Set Features and Target Variable

แบ่งข้อมูลตัวแปรต้นสำหรับการฝึกสอน

Train data features

กำหนดตัวแปรตามสำหรับการฝึกสอน

Train data target

แบ่งข้อมูลตัวแปรต้นสำหรับการทดสอบ

Test data features

กำหนดตัวแปรตามสำหรับการทดสอบ

Test data target

Convert the target variable to a factor

Train Decision Tree

Save decision tree model

Create a confusion matrix for evaluation

Predictions of train data (for re-check)

Predictions of test data (for assessment)

Test data accuracy is 79.32%