library(conflicted)
library(dplyr)
library(tidyr)
#library(neuralnet)
library(ggplot2)
library(DiagrammeR)
library(DiagrammeRsvg)
library(caret)
## Loading required package: lattice
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
# Read the prepared training and test splits from the working directory.
# Text columns are kept as character rather than converted to factors.
train_data <- read.csv(
  file = "sensor_prepare_train_data.csv",
  stringsAsFactors = FALSE
)
test_data <- read.csv(
  file = "sensor_prepare_test_data.csv",
  stringsAsFactors = FALSE
)
# Names of the predictor columns and of the response column.
features_column <- c(
  "Seismic_Max", "Seismic_Min", "Seismic_Mean", "Seismic_Range",
  "Gyro_Gx", "Gyro_Gy", "Gyro_Gz"
)
target_column <- "Event_Actual"
# Keep only the predictor columns of the training data ...
train_data_features <- train_data[features_column]
# ... and print a per-column summary of them.
summary(train_data_features)
## Seismic_Max Seismic_Min Seismic_Mean Seismic_Range
## Min. :0.02802 Min. :-3.59999 Min. :-0.3224 Min. :0.7593
## 1st Qu.:2.75628 1st Qu.:-3.43122 1st Qu.: 0.6112 1st Qu.:5.5405
## Median :3.20628 Median :-3.17808 Median : 0.7429 Median :6.1985
## Mean :3.06429 Mean :-2.88286 Mean : 0.7419 Mean :5.9472
## 3rd Qu.:3.45941 3rd Qu.:-2.64375 3rd Qu.: 0.8763 3rd Qu.:6.6656
## Max. :3.59989 Max. : 0.00924 Max. : 1.5837 Max. :7.1999
## Gyro_Gx Gyro_Gy Gyro_Gz
## Min. :-9.7710 Min. :-10.4733 Min. : 7.809
## 1st Qu.:-7.6641 1st Qu.: -8.9160 1st Qu.:125.008
## Median :-2.7634 Median : -2.3206 Median :125.557
## Mean :-2.9722 Mean : -2.6277 Mean :125.496
## 3rd Qu.: 0.7634 3rd Qu.: 0.8855 3rd Qu.:126.107
## Max. : 8.7328 Max. : 13.0076 Max. :128.489
# Extract the response column as a plain vector and summarise it.
# Event_Actual has not been converted to a factor yet at this point, so
# train_data_target holds the raw numeric codes (the recorded summary is a
# numeric five-number summary).
train_data_target <- train_data[[target_column]]
summary(train_data_target)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 2.000 2.594 4.000 7.000
# Treat the training response as categorical, then show the class counts.
train_data$Event_Actual <- as.factor(train_data[["Event_Actual"]])
table(train_data[["Event_Actual"]])
##
## 0 1 2 3 4 5 6 7
## 1399 650 544 687 546 553 328 339
# Treat the test response as categorical, sharing the training level set.
# Deriving the levels from the test file alone yields a different level set
# whenever a class is absent from the test split; predictions from the forest
# always carry the training levels, and caret::confusionMatrix() rejects
# predictions that have more levels than the reference.
stopifnot(is.factor(train_data$Event_Actual))
event_levels <- levels(train_data$Event_Actual)
# Labels that never occurred in training cannot be predicted; make the
# resulting NA recoding visible instead of silent.
unseen_events <- setdiff(
  as.character(unique(test_data$Event_Actual)),
  c(event_levels, NA)
)
if (length(unseen_events) > 0) {
  warning(
    "Test labels absent from the training data are recoded as NA: ",
    paste(unseen_events, collapse = ", "),
    call. = FALSE
  )
}
test_data$Event_Actual <- factor(test_data$Event_Actual, levels = event_levels)
# Class counts in the test split.
table(test_data$Event_Actual)
##
## 0 1 2 3 4 5 6 7
## 400 143 139 171 135 118 65 86
# Search for the mtry value with the lowest out-of-bag (OOB) error.
# tuneRF() picks regression or classification from the type of `y`.
# `train_data_target` was extracted before Event_Actual was converted to a
# factor, so passing it as-is tunes a *regression* forest (OOB MSE), whereas
# the model fitted afterwards is a classifier. Coerce the response to a factor
# so the search optimises the classification OOB error rate.
# NOTE(review): the console output recorded after this call (OOB error > 1)
# came from the numeric/regression run; re-run and re-check the mtry value
# passed to randomForest().
tuned <- tuneRF(
  x = train_data_features,
  y = as.factor(train_data_target),
  stepFactor = 0.5,   # multiplicative step applied to mtry between trials
  plot = TRUE,        # draw OOB error against mtry
  ntreeTry = 150,     # trees grown per candidate mtry
  trace = TRUE,       # print progress of the search
  improve = 0.05      # minimum relative OOB improvement to keep searching
)
## mtry = 2 OOB error = 1.839452
## Searching left ...
## mtry = 4 OOB error = 1.801475
## 0.02064588 0.05
## Searching right ...
## mtry = 1 OOB error = 2.029885
## -0.1035271 0.05
# Fit the random forest classifier on the seven sensor features.
# The start time is captured first so the elapsed training time can be
# computed once the timer is stopped further down.
GS_T0 <- Sys.time()
rf_model <- randomForest(
  Event_Actual ~ Seismic_Max + Seismic_Min + Seismic_Mean + Seismic_Range +
    Gyro_Gx + Gyro_Gy + Gyro_Gz,
  data = train_data,
  mtry = 4,          # variables tried at each split
  proximity = TRUE   # keep the proximity matrix (used by MDSplot())
)
# Show the OOB error estimate and the OOB confusion matrix.
print(rf_model)
##
## Call:
## randomForest(formula = Event_Actual ~ Seismic_Max + Seismic_Min + Seismic_Mean + Seismic_Range + Gyro_Gx + Gyro_Gy + Gyro_Gz, data = train_data, mtry = 4, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 15.56%
## Confusion matrix:
## 0 1 2 3 4 5 6 7 class.error
## 0 1326 0 0 0 0 6 37 30 0.05218013
## 1 0 634 0 3 13 0 0 0 0.02461538
## 2 0 0 532 12 0 0 0 0 0.02205882
## 3 0 3 72 504 14 94 0 0 0.26637555
## 4 0 15 0 9 522 0 0 0 0.04395604
## 5 19 0 1 34 1 426 43 29 0.22965642
## 6 62 0 0 0 0 44 176 46 0.46341463
## 7 116 0 0 0 0 38 44 141 0.58407080
# Stop the timer and compute how long training took.
GS_T1 <- Sys.time()
training_time <- difftime(GS_T1, GS_T0)
# Time spent on training
training_time
## Time difference of 14.45895 secs
The out-of-bag (OOB) error is 15.56%, so the estimated accuracy of the model on the training data is about 84.44%. The number of trees is 500 and mtry is 4.
# Serialise the fitted forest to disk (reload it later with readRDS()).
filename <- "rf_model.rds"
saveRDS(object = rf_model, file = filename)
# Predict the classes of the training rows with the fitted forest.
rf_predictions <- predict(object = rf_model, newdata = train_data)
# Confusion matrix of the training set (predictions vs. actual events).
# These are predictions on the same rows the forest was fitted on.
confusionMatrix(
  data = rf_predictions,
  reference = train_data$Event_Actual
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7
## 0 1399 0 0 0 0 0 0 0
## 1 0 650 0 0 0 0 0 0
## 2 0 0 544 0 0 0 0 0
## 3 0 0 0 687 0 0 0 0
## 4 0 0 0 0 546 0 0 0
## 5 0 0 0 0 0 553 0 0
## 6 0 0 0 0 0 0 328 0
## 7 0 0 0 0 0 0 0 339
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9993, 1)
## No Information Rate : 0.2772
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
## Prevalence 0.2772 0.1288 0.1078 0.1361 0.1082 0.1096
## Detection Rate 0.2772 0.1288 0.1078 0.1361 0.1082 0.1096
## Detection Prevalence 0.2772 0.1288 0.1078 0.1361 0.1082 0.1096
## Balanced Accuracy 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
## Class: 6 Class: 7
## Sensitivity 1.000 1.00000
## Specificity 1.000 1.00000
## Pos Pred Value 1.000 1.00000
## Neg Pred Value 1.000 1.00000
## Prevalence 0.065 0.06718
## Detection Rate 0.065 0.06718
## Detection Prevalence 0.065 0.06718
## Balanced Accuracy 1.000 1.00000
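Training-data accuracy is 100%, i.e. every training observation is classified correctly. Because these predictions are made on the same data the forest was trained on, the OOB error reported above (15.56%) is the more realistic estimate of performance.
### Predictions of test data (for assessment)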
# Predict the classes of the held-out test rows (overwrites the training
# predictions stored in rf_predictions).
rf_predictions <- predict(object = rf_model, newdata = test_data)
# Confusion matrix of the test set (predictions vs. actual events).
confusionMatrix(
  data = rf_predictions,
  reference = test_data$Event_Actual
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7
## 0 372 0 0 0 0 7 25 42
## 1 0 136 0 3 4 1 0 0
## 2 0 0 138 16 0 0 0 0
## 3 0 1 1 124 0 9 0 0
## 4 0 6 0 0 131 0 0 0
## 5 9 0 0 28 0 75 13 17
## 6 5 0 0 0 0 17 19 10
## 7 14 0 0 0 0 9 8 17
##
## Overall Statistics
##
## Accuracy : 0.8051
## 95% CI : (0.7821, 0.8267)
## No Information Rate : 0.3182
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.762
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.9300 0.9510 0.9928 0.72515 0.9704 0.63559
## Specificity 0.9137 0.9928 0.9857 0.98987 0.9947 0.94118
## Pos Pred Value 0.8341 0.9444 0.8961 0.91852 0.9562 0.52817
## Neg Pred Value 0.9655 0.9937 0.9991 0.95811 0.9964 0.96143
## Prevalence 0.3182 0.1138 0.1106 0.13604 0.1074 0.09387
## Detection Rate 0.2959 0.1082 0.1098 0.09865 0.1042 0.05967
## Detection Prevalence 0.3548 0.1146 0.1225 0.10740 0.1090 0.11297
## Balanced Accuracy 0.9218 0.9719 0.9892 0.85751 0.9825 0.78838
## Class: 6 Class: 7
## Sensitivity 0.29231 0.19767
## Specificity 0.97315 0.97353
## Pos Pred Value 0.37255 0.35417
## Neg Pred Value 0.96186 0.94293
## Prevalence 0.05171 0.06842
## Detection Rate 0.01512 0.01352
## Detection Prevalence 0.04057 0.03819
## Balanced Accuracy 0.63273 0.58560
Test data accuracy is 80.51%.
## Plot rf model
# Plot the fitted forest object.
plot(rf_model)
## Plot rf model
### Histogram
# Histogram of the per-tree sizes returned by treesize().
hist(
  x = treesize(rf_model),
  main = "No. of Nodes for the Trees",
  col = "green"
)
### Variable Importance
# Dot chart of the importance measures, most important variable first.
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned,
# which would silently change the `sort` argument.
varImpPlot(
  rf_model,
  sort = TRUE,
  n.var = 7,   # number of variables shown (all seven predictors)
  main = "Top 7 - Variable Importance"
)
### Importance
# Print the importance table of the fitted forest.
importance(rf_model)
## MeanDecreaseGini
## Seismic_Max 209.1636
## Seismic_Min 179.2694
## Seismic_Mean 216.3609
## Seismic_Range 172.7976
## Gyro_Gx 1248.7964
## Gyro_Gy 1956.0386
## Gyro_Gz 275.4437
# Multi-dimensional scaling plot of the forest's proximity matrix, with
# points grouped by the actual event class.
MDSplot(rf = rf_model, fac = train_data$Event_Actual)
### Partial plot
# Partial dependence on Gyro_Gy, drawn once per event class ("0" to "7").
# Gyro_Gy is passed as a bare name so the default axis label and title match
# those of a direct call.
for (event_class in as.character(0:7)) {
  partialPlot(rf_model, train_data, Gyro_Gy, event_class)
}