Setting up environment and Prepare Data

Load library

library(conflicted)
library(dplyr)
library(tidyr)
#library(neuralnet)
library(ggplot2)
library(DiagrammeR)
library(DiagrammeRsvg)
library(caret)
## Loading required package: lattice
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.

Load csv file

# Read the training and test CSV files; text columns stay character
# (no automatic conversion to factor).
train_data <- read.csv(
  file = "sensor_prepare_train_data.csv",
  stringsAsFactors = FALSE
)
test_data <- read.csv(
  file = "sensor_prepare_test_data.csv",
  stringsAsFactors = FALSE
)

Set Features and Target Variable

# Names of the predictor columns: four Seismic_* columns followed by
# three Gyro_* columns.
features_column <- c(
  "Seismic_Max", "Seismic_Min", "Seismic_Mean", "Seismic_Range",
  "Gyro_Gx", "Gyro_Gy", "Gyro_Gz"
)
# Name of the response column.
target_column <- "Event_Actual"

แบ่งข้อมูลตัวแปรต้นสำหรับการฝึกสอน

Train data features

# Keep only the predictor columns of the training set.
train_data_features <- train_data[features_column]
# Per-column summary statistics of the training predictors.
summary(train_data_features)
##   Seismic_Max       Seismic_Min        Seismic_Mean     Seismic_Range   
##  Min.   :0.02802   Min.   :-3.59999   Min.   :-0.3224   Min.   :0.7593  
##  1st Qu.:2.75628   1st Qu.:-3.43122   1st Qu.: 0.6112   1st Qu.:5.5405  
##  Median :3.20628   Median :-3.17808   Median : 0.7429   Median :6.1985  
##  Mean   :3.06429   Mean   :-2.88286   Mean   : 0.7419   Mean   :5.9472  
##  3rd Qu.:3.45941   3rd Qu.:-2.64375   3rd Qu.: 0.8763   3rd Qu.:6.6656  
##  Max.   :3.59989   Max.   : 0.00924   Max.   : 1.5837   Max.   :7.1999  
##     Gyro_Gx           Gyro_Gy            Gyro_Gz       
##  Min.   :-9.7710   Min.   :-10.4733   Min.   :  7.809  
##  1st Qu.:-7.6641   1st Qu.: -8.9160   1st Qu.:125.008  
##  Median :-2.7634   Median : -2.3206   Median :125.557  
##  Mean   :-2.9722   Mean   : -2.6277   Mean   :125.496  
##  3rd Qu.: 0.7634   3rd Qu.:  0.8855   3rd Qu.:126.107  
##  Max.   : 8.7328   Max.   : 13.0076   Max.   :128.489

กำหนดตัวแปรตามสำหรับการฝึกสอน

Train data target

# Extract the response column of the training set as a plain vector.
train_data_target <- train_data[[target_column]]
# Summary of the training response.
summary(train_data_target)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   2.000   2.594   4.000   7.000

กำหนดตัวแปรตามสำหรับการทดสอบ

Convert the target variable to a factor (for classification)

# Recode the training response as a factor, then count rows per class.
train_data[["Event_Actual"]] <- as.factor(train_data[["Event_Actual"]])
table(train_data[["Event_Actual"]])
## 
##    0    1    2    3    4    5    6    7 
## 1399  650  544  687  546  553  328  339
# Recode the test response as a factor, then count rows per class.
test_data[["Event_Actual"]] <- as.factor(test_data[["Event_Actual"]])
table(test_data[["Event_Actual"]])
## 
##   0   1   2   3   4   5   6   7 
## 400 143 139 171 135 118  65  86

Setup RandomForest parameter

Tune rf model

# Search for the mtry value with the lowest out-of-bag (OOB) error.
#
# train_data_target was extracted while Event_Actual was still numeric, so
# passing it unchanged makes tuneRF() tune a *regression* forest (its OOB
# error is then a mean squared error, which is why values above 1 were
# reported). The model fitted afterwards is a classifier, so the response is
# converted to a factor here to make the search optimise the classification
# OOB error rate instead.
set.seed(123)  # tuneRF() grows random forests; fix the seed so reruns agree
tuned <- tuneRF(
  x = train_data_features,
  y = as.factor(train_data_target),
  stepFactor = 0.5,
  plot = TRUE,
  ntreeTry = 150,
  trace = TRUE,
  improve = 0.05
)
## mtry = 2  OOB error = 1.839452 
## Searching left ...
## mtry = 4     OOB error = 1.801475 
## 0.02064588 0.05 
## Searching right ...
## mtry = 1     OOB error = 2.029885 
## -0.1035271 0.05

# Train Random Forest

# Fit the classification forest on the training set and print its summary.
#
# randomForest() draws bootstrap samples and random split candidates, so the
# seed is fixed first; otherwise the OOB figures and the saved model change on
# every run.
set.seed(123)
GS_T0 <- Sys.time()  # start of the timed section
rf_model <- randomForest(
  Event_Actual ~ Seismic_Max + Seismic_Min + Seismic_Mean + Seismic_Range +
    Gyro_Gx + Gyro_Gy + Gyro_Gz,
  data = train_data,
  mtry = 4,
  proximity = TRUE  # keep the proximity matrix; MDSplot() needs it
)
print(rf_model)
## 
## Call:
##  randomForest(formula = Event_Actual ~ Seismic_Max + Seismic_Min +      Seismic_Mean + Seismic_Range + Gyro_Gx + Gyro_Gy + Gyro_Gz,      data = train_data, mtry = 4, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 15.56%
## Confusion matrix:
##      0   1   2   3   4   5   6   7 class.error
## 0 1326   0   0   0   0   6  37  30  0.05218013
## 1    0 634   0   3  13   0   0   0  0.02461538
## 2    0   0 532  12   0   0   0   0  0.02205882
## 3    0   3  72 504  14  94   0   0  0.26637555
## 4    0  15   0   9 522   0   0   0  0.04395604
## 5   19   0   1  34   1 426  43  29  0.22965642
## 6   62   0   0   0   0  44 176  46  0.46341463
## 7  116   0   0   0   0  38  44 141  0.58407080
GS_T1 <- Sys.time()
# Time spent on training: elapsed wall-clock time between GS_T0 and GS_T1.
training_time <- difftime(GS_T1, GS_T0)
training_time
## Time difference of 14.45895 secs

The out-of-bag (OOB) error is 15.56%, so the estimated accuracy of the model on the training data set is around 84.44%. The number of trees is 500 and mtry is 4.

Save RF model

# Write the fitted forest to disk in RDS format.
filename <- "rf_model.rds"
saveRDS(object = rf_model, file = filename)

Create a confusion matrix of Evaluation

Predictions of train data (for re-check)

# Re-predict the rows the forest was trained on (sanity check only).
rf_predictions <- predict(rf_model, newdata = train_data)

# Confusion matrix of the TRAINING set (predicted vs. actual class).
# NOTE: these rows were used to grow the trees, so this accuracy is optimistic;
# the OOB estimate printed with the model is the fairer training-side figure.
confusionMatrix(data = rf_predictions, reference = train_data$Event_Actual)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1    2    3    4    5    6    7
##          0 1399    0    0    0    0    0    0    0
##          1    0  650    0    0    0    0    0    0
##          2    0    0  544    0    0    0    0    0
##          3    0    0    0  687    0    0    0    0
##          4    0    0    0    0  546    0    0    0
##          5    0    0    0    0    0  553    0    0
##          6    0    0    0    0    0    0  328    0
##          7    0    0    0    0    0    0    0  339
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9993, 1)
##     No Information Rate : 0.2772     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity            1.0000   1.0000   1.0000   1.0000   1.0000   1.0000
## Specificity            1.0000   1.0000   1.0000   1.0000   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000   1.0000   1.0000   1.0000   1.0000
## Prevalence             0.2772   0.1288   0.1078   0.1361   0.1082   0.1096
## Detection Rate         0.2772   0.1288   0.1078   0.1361   0.1082   0.1096
## Detection Prevalence   0.2772   0.1288   0.1078   0.1361   0.1082   0.1096
## Balanced Accuracy      1.0000   1.0000   1.0000   1.0000   1.0000   1.0000
##                      Class: 6 Class: 7
## Sensitivity             1.000  1.00000
## Specificity             1.000  1.00000
## Pos Pred Value          1.000  1.00000
## Neg Pred Value          1.000  1.00000
## Prevalence              0.065  0.06718
## Detection Rate          0.065  0.06718
## Detection Prevalence    0.065  0.06718
## Balanced Accuracy       1.000  1.00000

Train data accuracy is 100%, which indicates that all training samples were classified correctly.

### Predictions of test data (for assessment)

# Predict the class of every row in the held-out test set.
rf_predictions <- predict(rf_model, newdata = test_data)

# Confusion matrix of the test set (predicted vs. actual class).
confusionMatrix(data = rf_predictions, reference = test_data$Event_Actual)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1   2   3   4   5   6   7
##          0 372   0   0   0   0   7  25  42
##          1   0 136   0   3   4   1   0   0
##          2   0   0 138  16   0   0   0   0
##          3   0   1   1 124   0   9   0   0
##          4   0   6   0   0 131   0   0   0
##          5   9   0   0  28   0  75  13  17
##          6   5   0   0   0   0  17  19  10
##          7  14   0   0   0   0   9   8  17
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8051          
##                  95% CI : (0.7821, 0.8267)
##     No Information Rate : 0.3182          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.762           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity            0.9300   0.9510   0.9928  0.72515   0.9704  0.63559
## Specificity            0.9137   0.9928   0.9857  0.98987   0.9947  0.94118
## Pos Pred Value         0.8341   0.9444   0.8961  0.91852   0.9562  0.52817
## Neg Pred Value         0.9655   0.9937   0.9991  0.95811   0.9964  0.96143
## Prevalence             0.3182   0.1138   0.1106  0.13604   0.1074  0.09387
## Detection Rate         0.2959   0.1082   0.1098  0.09865   0.1042  0.05967
## Detection Prevalence   0.3548   0.1146   0.1225  0.10740   0.1090  0.11297
## Balanced Accuracy      0.9218   0.9719   0.9892  0.85751   0.9825  0.78838
##                      Class: 6 Class: 7
## Sensitivity           0.29231  0.19767
## Specificity           0.97315  0.97353
## Pos Pred Value        0.37255  0.35417
## Neg Pred Value        0.96186  0.94293
## Prevalence            0.05171  0.06842
## Detection Rate        0.01512  0.01352
## Detection Prevalence  0.04057  0.03819
## Balanced Accuracy     0.63273  0.58560

Test data accuracy is 80.51%.

## Plot rf model

# Draw the default plot for the fitted forest (per the randomForest docs,
# plot() on a randomForest object shows error rates against number of trees).
plot(rf_model)

## Plot rf model

### Histogram

# Histogram of the per-tree sizes reported by treesize() for the forest.
# treesize() is kept inline so the default axis label is unchanged.
hist(
  treesize(rf_model),
  col = "green",
  main = "No. of Nodes for the Trees"
)

### Variable Importance

# Dot chart of variable importance for the fitted forest, sorted, showing
# the 7 most important variables (the model has 7 predictors).
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
varImpPlot(
  rf_model,
  sort = TRUE,
  n.var = 7,
  main = "Top 7 - Variable Importance"
)

### Importance

# Print the variable-importance table of the fitted forest.
importance(rf_model)
##               MeanDecreaseGini
## Seismic_Max           209.1636
## Seismic_Min           179.2694
## Seismic_Mean          216.3609
## Seismic_Range         172.7976
## Gyro_Gx              1248.7964
## Gyro_Gy              1956.0386
## Gyro_Gz               275.4437

MDSplot

# Multi-dimensional scaling plot of the forest's proximity matrix, with
# points grouped by the actual event class. The model must have been fitted
# with proximity = TRUE.
MDSplot(rf_model, fac = train_data$Event_Actual)

### Partial plot

# Partial dependence of each event class on Gyro_Gy, one plot per class.
# The classes are taken from the factor levels rather than copy-pasting one
# call per class, and each plot is titled with its class so the otherwise
# identical-looking figures can be told apart.
for (event_class in levels(train_data$Event_Actual)) {
  partialPlot(
    rf_model,
    train_data,
    Gyro_Gy,  # must stay a bare column name: partialPlot() captures it unevaluated
    event_class,
    main = paste0("Partial Dependence on Gyro_Gy (class ", event_class, ")")
  )
}