Setting Up the Environment and Preparing the Data
Load libraries
library(conflicted)
library(dplyr)
library(tidyr)
library(ggplot2)
library(DiagrammeR)
library(DiagrammeRsvg)
library(caret)
## Loading required package: lattice
library(tree)
Load CSV files
# Read the prepared training and test sets; text columns stay as character
# vectors instead of being converted to factors.
train_data <- read.csv(
  file = "sensor_prepare_train_data.csv",
  stringsAsFactors = FALSE
)
test_data <- read.csv(
  file = "sensor_prepare_test_data.csv",
  stringsAsFactors = FALSE
)
Set Features and Target Variable
# Names of the predictor columns and of the target column used by the
# subsetting and modelling steps.
features_column <- c(
  "Seismic_Max",
  "Seismic_Min",
  "Seismic_Mean",
  "Seismic_Range",
  "Gyro_Gx",
  "Gyro_Gy",
  "Gyro_Gz"
)
target_column <- "Event_Actual"
แบ่งข้อมูลตัวแปรต้นสำหรับการฝึกสอน
Train data features
# Keep only the predictor columns of the training set (still a data frame).
train_data_features <- train_data[features_column]
# Per-column descriptive statistics of the training predictors
summary(train_data_features)
## Seismic_Max Seismic_Min Seismic_Mean Seismic_Range
## Min. :0.02802 Min. :-3.59999 Min. :-0.3224 Min. :0.7593
## 1st Qu.:2.75628 1st Qu.:-3.43122 1st Qu.: 0.6112 1st Qu.:5.5405
## Median :3.20628 Median :-3.17808 Median : 0.7429 Median :6.1985
## Mean :3.06429 Mean :-2.88286 Mean : 0.7419 Mean :5.9472
## 3rd Qu.:3.45941 3rd Qu.:-2.64375 3rd Qu.: 0.8763 3rd Qu.:6.6656
## Max. :3.59989 Max. : 0.00924 Max. : 1.5837 Max. :7.1999
## Gyro_Gx Gyro_Gy Gyro_Gz
## Min. :-9.7710 Min. :-10.4733 Min. : 7.809
## 1st Qu.:-7.6641 1st Qu.: -8.9160 1st Qu.:125.008
## Median :-2.7634 Median : -2.3206 Median :125.557
## Mean :-2.9722 Mean : -2.6277 Mean :125.496
## 3rd Qu.: 0.7634 3rd Qu.: 0.8855 3rd Qu.:126.107
## Max. : 8.7328 Max. : 13.0076 Max. :128.489
กำหนดตัวแปรตามสำหรับการฝึกสอน
Train data target
# Pull the target column out of the training set as a plain vector.
train_data_target <- train_data[[target_column]]
# Descriptive statistics of the training target
summary(train_data_target)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 2.000 2.594 4.000 7.000
แบ่งข้อมูลตัวแปรต้นสำหรับการทดสอบ
Test data features
# Keep only the predictor columns of the test set (still a data frame).
test_data_features <- test_data[features_column]
# Per-column descriptive statistics of the test predictors
summary(test_data_features)
## Seismic_Max Seismic_Min Seismic_Mean Seismic_Range
## Min. :0.5062 Min. :-3.60000 Min. :-0.07793 Min. :1.266
## 1st Qu.:2.7844 1st Qu.:-3.43122 1st Qu.: 0.62230 1st Qu.:5.513
## Median :3.2063 Median :-3.17809 Median : 0.74292 Median :6.188
## Mean :3.0661 Mean :-2.84109 Mean : 0.74299 Mean :5.907
## 3rd Qu.:3.4594 3rd Qu.:-2.67184 3rd Qu.: 0.87471 3rd Qu.:6.638
## Max. :3.5999 Max. :-0.02807 Max. : 1.38095 Max. :7.172
## Gyro_Gx Gyro_Gy Gyro_Gz
## Min. :-9.2824 Min. :-10.5038 Min. :123.3
## 1st Qu.:-7.6336 1st Qu.: -9.0687 1st Qu.:125.1
## Median :-3.8473 Median : -2.5038 Median :125.6
## Mean :-3.0997 Mean : -2.8223 Mean :125.7
## 3rd Qu.: 0.7634 3rd Qu.: 0.7634 3rd Qu.:126.2
## Max. : 8.3359 Max. : 12.7634 Max. :128.2
กำหนดตัวแปรตามสำหรับการทดสอบ
Test data target
# Pull the target column out of the test set as a plain vector.
test_data_target <- test_data[[target_column]]
# Descriptive statistics of the test target
summary(test_data_target)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 2.000 2.431 4.000 7.000
Convert the target variable to a factor
# Treat the event label as categorical, both in the standalone target vector
# and in the training data frame that is passed to the model.
train_data_target <- as.factor(train_data$Event_Actual)
train_data$Event_Actual <- train_data_target
# Number of training rows per event class
table(train_data_target)
## train_data_target
## 0 1 2 3 4 5 6 7
## 1399 650 544 687 546 553 328 339
# Convert the test target to a factor whose levels start with the training
# classes, so the test reference and the model's predictions share one level
# set even when a class is absent from the test data. Labels that occur only
# in the test data are appended as extra levels rather than dropped.
train_levels <- levels(as.factor(train_data$Event_Actual))
test_labels <- as.character(test_data$Event_Actual)
event_levels <- union(train_levels, sort(unique(test_labels)))
test_data$Event_Actual <- factor(test_labels, levels = event_levels)
test_data_target <- test_data$Event_Actual
# Number of test rows per event class
table(test_data_target)
## test_data_target
## 0 1 2 3 4 5 6 7
## 400 143 139 171 135 118 65 86
Train Decision Tree
# Fit a classification tree on the seven sensor features and time the fit.
# The clock is stopped before the model is printed so that console output is
# not counted as training time.
GS_T0 <- Sys.time()
dt_model <- rpart::rpart(
  Event_Actual ~ Seismic_Max + Seismic_Min + Seismic_Mean + Seismic_Range +
    Gyro_Gx + Gyro_Gy + Gyro_Gz,
  data = train_data
)
GS_T1 <- Sys.time()
training_time <- GS_T1 - GS_T0
print(dt_model)
## n= 5046
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5046 3647 0 (0.28 0.13 0.11 0.14 0.11 0.11 0.065 0.067)
## 2) Gyro_Gy< -8.412214 1990 622 0 (0.69 0 0 0 0 0.058 0.11 0.14) *
## 3) Gyro_Gy>=-8.412214 3056 2369 3 (0.01 0.21 0.18 0.22 0.18 0.14 0.035 0.017)
## 6) Gyro_Gy>=5.694656 932 425 3 (0 0 0.46 0.54 0 0 0 0)
## 12) Gyro_Gy< 8.122137 496 77 2 (0 0 0.84 0.16 0 0 0 0) *
## 13) Gyro_Gy>=8.122137 436 6 3 (0 0 0.014 0.99 0 0 0 0) *
## 7) Gyro_Gy< 5.694656 2124 1474 1 (0.015 0.31 0.056 0.085 0.26 0.21 0.051 0.025)
## 14) Gyro_Gy>=-1.358779 1350 700 1 (0 0.48 0.088 0.03 0.39 0.011 0.00074 0)
## 28) Gyro_Gy< 2.610687 1052 402 1 (0 0.62 0.11 0.039 0.22 0.013 0.00095 0)
## 56) Gyro_Gx< -0.6259542 435 1 1 (0 1 0 0 0 0 0.0023 0) *
## 57) Gyro_Gx>=-0.6259542 617 390 4 (0 0.35 0.19 0.066 0.37 0.023 0 0)
## 114) Gyro_Gx< 3.022901 135 16 2 (0 0 0.88 0.015 0 0.1 0 0) *
## 115) Gyro_Gx>=3.022901 482 255 4 (0 0.45 0 0.081 0.47 0 0 0)
## 230) Gyro_Gy>=-0.1984733 245 44 1 (0 0.82 0 0.094 0.086 0 0 0) *
## 231) Gyro_Gy< -0.1984733 237 31 4 (0 0.063 0 0.068 0.87 0 0 0) *
## 29) Gyro_Gy>=2.610687 298 1 4 (0 0 0 0 1 0.0034 0 0) *
## 15) Gyro_Gy< -1.358779 774 352 5 (0.04 0 0 0.18 0.028 0.55 0.14 0.068) *
# Time spent on training
training_time
## Time difference of 0.09727001 secs
Save decision tree model
# Serialise the fitted tree to an RDS file in the working directory.
filename <- "dt_model.rds"
saveRDS(object = dt_model, file = filename)
Create confusion matrices for evaluation
Predictions of train data (for re-check)
# Predict a class for every training row, then cross-tabulate the predictions
# (rows) against the true labels (columns).
dt_predictions <- predict(
  object = dt_model,
  newdata = train_data_features,
  type = "class"
)
tab1 <- table(dt_predictions, train_data_target)
print(tab1)
## train_data_target
## dt_predictions 0 1 2 3 4 5 6 7
## 0 1368 0 0 0 0 116 220 286
## 1 0 635 0 23 21 0 1 0
## 2 0 0 538 79 0 14 0 0
## 3 0 0 6 430 0 0 0 0
## 4 0 15 0 16 503 1 0 0
## 5 31 0 0 139 22 422 107 53
## 6 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0
# Overall and per-class statistics for the training predictions. `positive` is
# not passed: it only applies to two-class problems, and "y" is not one of the
# event levels.
confusionMatrix(data = dt_predictions, reference = train_data_target)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7
## 0 1368 0 0 0 0 116 220 286
## 1 0 635 0 23 21 0 1 0
## 2 0 0 538 79 0 14 0 0
## 3 0 0 6 430 0 0 0 0
## 4 0 15 0 16 503 1 0 0
## 5 31 0 0 139 22 422 107 53
## 6 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.7721
## 95% CI : (0.7603, 0.7836)
## No Information Rate : 0.2772
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.722
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.9778 0.9769 0.9890 0.62591 0.92125 0.76311
## Specificity 0.8294 0.9898 0.9793 0.99862 0.99289 0.92166
## Pos Pred Value 0.6874 0.9338 0.8526 0.98624 0.94019 0.54522
## Neg Pred Value 0.9899 0.9966 0.9986 0.94425 0.99047 0.96934
## Prevalence 0.2772 0.1288 0.1078 0.13615 0.10820 0.10959
## Detection Rate 0.2711 0.1258 0.1066 0.08522 0.09968 0.08363
## Detection Prevalence 0.3944 0.1348 0.1250 0.08641 0.10602 0.15339
## Balanced Accuracy 0.9036 0.9833 0.9842 0.81227 0.95707 0.84238
## Class: 6 Class: 7
## Sensitivity 0.000 0.00000
## Specificity 1.000 1.00000
## Pos Pred Value NaN NaN
## Neg Pred Value 0.935 0.93282
## Prevalence 0.065 0.06718
## Detection Rate 0.000 0.00000
## Detection Prevalence 0.000 0.00000
## Balanced Accuracy 0.500 0.50000
Predictions of test data (for assessment)
# Predict a class for every test row and report overall and per-class
# statistics against the true test labels. `positive` is not passed: it only
# applies to two-class problems, and "y" is not one of the event levels.
dt_predictions <- predict(
  dt_model,
  newdata = test_data_features,
  type = "class"
)
confusionMatrix(data = dt_predictions, reference = test_data_target)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7
## 0 392 0 0 0 0 24 43 74
## 1 0 138 0 8 4 1 0 0
## 2 0 0 139 17 0 0 0 0
## 3 0 0 0 108 0 0 0 0
## 4 0 5 0 0 127 0 0 0
## 5 8 0 0 38 4 93 22 12
## 6 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.7932
## 95% CI : (0.7697, 0.8152)
## No Information Rate : 0.3182
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7419
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.9800 0.9650 1.0000 0.63158 0.9407 0.78814
## Specificity 0.8355 0.9883 0.9848 1.00000 0.9955 0.92625
## Pos Pred Value 0.7355 0.9139 0.8910 1.00000 0.9621 0.52542
## Neg Pred Value 0.9890 0.9955 1.0000 0.94517 0.9929 0.97685
## Prevalence 0.3182 0.1138 0.1106 0.13604 0.1074 0.09387
## Detection Rate 0.3119 0.1098 0.1106 0.08592 0.1010 0.07399
## Detection Prevalence 0.4240 0.1201 0.1241 0.08592 0.1050 0.14081
## Balanced Accuracy 0.9077 0.9767 0.9924 0.81579 0.9681 0.85719
## Class: 6 Class: 7
## Sensitivity 0.00000 0.00000
## Specificity 1.00000 1.00000
## Pos Pred Value NaN NaN
## Neg Pred Value 0.94829 0.93158
## Prevalence 0.05171 0.06842
## Detection Rate 0.00000 0.00000
## Detection Prevalence 0.00000 0.00000
## Balanced Accuracy 0.50000 0.50000
# Error rate: 1 - sum(diag(tab2)) / sum(tab2), where tab2 <- table(dt_predictions, test_data_target)
Test data accuracy is 79.32%.