library(dplyr)
library(caret)
library(rattle)
library(readr)
library(rpart.plot)
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.2
## Warning: package 'rattle' was built under R version 3.3.2
## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## Warning: package 'readr' was built under R version 3.3.3
## Warning: package 'rpart.plot' was built under R version 3.3.3
turbine <- read_csv("C:/Lloyds Register/UCI CBM Dataset/data.csv")
## Parsed with column specification:
## cols(
## Lever_position = col_double(),
## Ship_speed = col_double(),
## Gas_turbine_shaft_torque = col_double(),
## Turb_rev_rate = col_double(),
## Gen_rev_rate = col_double(),
## Starboard_Propeller_torque = col_double(),
## Port_propeller_torque = col_double(),
## Turb_exit_temp_HP = col_double(),
## Gas_turbine_compressor_inlet_air_temp_T1 = col_double(),
## Gas_turbine_compressor_outlet_air_temp_T2 = col_double(),
## High_pressure_turbine_exit_pressure = col_double(),
## Gas_turbine_compressor_inlet_air_temp_P1 = col_double(),
## Gas_turbine_compressor_outlet_air_temp_P2 = col_double(),
## Tur_exh_gas_press = col_double(),
## Turb_inj_cntrl = col_double(),
## Fuel_flow = col_double(),
## Compressor_decay = col_double(),
## Turbine_decay = col_double()
## )
turbine <- dplyr::select(turbine, -(19:20))
Hmisc::describe(turbine)
## turbine
##
## 18 Variables 11934 Observations
## ---------------------------------------------------------------------------
## Lever_position
## n missing distinct Info Mean Gmd
## 11934 0 9 0.988 5.168 3.015
##
## Value 1.14 2.09 3.14 4.16 5.14 6.18 7.15 8.21 9.30
## Frequency 1326 1326 1326 1326 1326 1326 1326 1326 1326
## Proportion 0.111 0.111 0.111 0.111 0.111 0.111 0.111 0.111 0.111
## ---------------------------------------------------------------------------
## Ship_speed
## n missing distinct Info Mean Gmd
## 11934 0 9 0.988 15 8.89
##
## Value 3 6 9 12 15 18 21 24 27
## Frequency 1326 1326 1326 1326 1326 1326 1326 1326 1326
## Proportion 0.111 0.111 0.111 0.111 0.111 0.111 0.111 0.111 0.111
## ---------------------------------------------------------------------------
## Gas_turbine_shaft_torque
## n missing distinct Info Mean Gmd .05 .10
## 11934 0 789 0.991 27249 24438 2796 3633
## .25 .50 .75 .90 .95
## 8380 21600 39000 72800 72800
##
## Value 0 1000 2000 3000 4000 5000 6000 7000 8000 15000
## Frequency 72 274 200 574 466 412 318 336 1326 1326
## Proportion 0.006 0.023 0.017 0.048 0.039 0.035 0.027 0.028 0.111 0.111
##
## Value 22000 30000 39000 51000 73000
## Frequency 1326 1326 1326 1326 1326
## Proportion 0.111 0.111 0.111 0.111 0.111
## ---------------------------------------------------------------------------
## Turb_rev_rate
## n missing distinct Info Mean Gmd .05 .10
## 11934 0 18 0.988 2137 855 1350 1360
## .25 .50 .75 .90 .95
## 1390 1920 2680 3560 3560
##
## Value 1310 1320 1330 1340 1350 1360 1370 1380 1390 1400
## Frequency 83 92 99 251 408 305 273 339 1817 108
## Proportion 0.007 0.008 0.008 0.021 0.034 0.026 0.023 0.028 0.152 0.009
##
## Value 1410 1420 1550 1920 2310 2680 3090 3560
## Frequency 148 55 1326 1326 1326 1326 1326 1326
## Proportion 0.012 0.005 0.111 0.111 0.111 0.111 0.111 0.111
## ---------------------------------------------------------------------------
## Gen_rev_rate
## n missing distinct Info Mean Gmd .05 .10
## 11934 0 101 1 8201 1239 6660 6720
## .25 .50 .75 .90 .95
## 7060 8480 9130 9740 9760
##
## lowest : 6590 6600 6610 6620 6630, highest: 9760 9770 9780 9790 9800
## ---------------------------------------------------------------------------
## Starboard_Propeller_torque
## n missing distinct Info Mean Gmd .05 .10
## 11934 0 535 0.991 227.3 220.1 8.216 11.100
## .25 .50 .75 .90 .95
## 60.300 175.000 332.000 645.000 645.000
##
## Value 5 10 20 25 30 60 115 175 245 330
## Frequency 447 879 118 622 586 1326 1326 1326 1326 1268
## Proportion 0.037 0.074 0.010 0.052 0.049 0.111 0.111 0.111 0.111 0.106
##
## Value 335 440 645
## Frequency 58 1326 1326
## Proportion 0.005 0.111 0.111
## ---------------------------------------------------------------------------
## Port_propeller_torque
## n missing distinct Info Mean Gmd .05 .10
## 11934 0 535 0.991 227.3 220.1 8.216 11.100
## .25 .50 .75 .90 .95
## 60.300 175.000 332.000 645.000 645.000
##
## Value 5 10 20 25 30 60 115 175 245 330
## Frequency 447 879 118 622 586 1326 1326 1326 1326 1268
## Proportion 0.037 0.074 0.010 0.052 0.049 0.111 0.111 0.111 0.111 0.106
##
## Value 335 440 645
## Frequency 58 1326 1326
## Proportion 0.005 0.111 0.111
## ---------------------------------------------------------------------------
## Turb_exit_temp_HP
## n missing distinct Info Mean Gmd .05 .10
## 11934 0 457 1 735.5 195.7 504 535
## .25 .50 .75 .90 .95
## 590 706 834 1050 1080
##
## lowest : 442 445 446 447 448, highest: 1080 1090 1100 1110 1120
## ---------------------------------------------------------------------------
## Gas_turbine_compressor_inlet_air_temp_T1
## n missing distinct Info Mean Gmd
## 11934 0 1 0 288 0
##
## Value 288
## Frequency 11934
## Proportion 1
## ---------------------------------------------------------------------------
## Gas_turbine_compressor_outlet_air_temp_T2
## n missing distinct Info Mean Gmd .05 .10
## 11934 0 165 1 646.2 82.56 556 562
## .25 .50 .75 .90 .95
## 578 637 694 771 779
##
## lowest : 540 541 542 543 544, highest: 785 786 787 788 789
## ---------------------------------------------------------------------------
## High_pressure_turbine_exit_pressure
## n missing distinct Info Mean Gmd .05 .10
## 11934 0 60 0.997 2.353 1.199 1.20 1.23
## .25 .50 .75 .90 .95
## 1.39 2.08 2.98 4.49 4.52
##
## lowest : 1.09 1.10 1.11 1.12 1.13, highest: 4.52 4.53 4.54 4.55 4.56
## ---------------------------------------------------------------------------
## Gas_turbine_compressor_inlet_air_temp_P1
## n missing distinct Info Mean Gmd
## 11934 0 1 0 0.998 0
##
## Value 0.998
## Frequency 11934
## Proportion 1
## ---------------------------------------------------------------------------
## Gas_turbine_compressor_outlet_air_temp_P2
## n missing distinct Info Mean Gmd .05 .10
## 11934 0 234 1 12.3 5.948 6.41 6.61
## .25 .50 .75 .90 .95
## 7.45 11.10 15.70 22.30 22.70
##
## lowest : 5.83 5.84 5.85 5.86 5.87, highest: 22.70 22.80 22.90 23.00 23.10
## ---------------------------------------------------------------------------
## Tur_exh_gas_press
## n missing distinct Info Mean Gmd
## 11934 0 4 0.889 1.03 0.01136
##
## Value 1.02 1.03 1.04 1.05
## Frequency 5304 2652 2652 1326
## Proportion 0.444 0.222 0.222 0.111
## ---------------------------------------------------------------------------
## Turb_inj_cntrl
## n missing distinct Info Mean Gmd .05 .10
## 11934 0 908 1 33.64 28.18 0.0 5.6
## .25 .50 .75 .90 .95
## 13.7 25.3 44.6 87.4 89.5
##
## lowest : 0.000 0.013 0.019 0.029 0.033, highest: 92.200 92.300 92.400 92.500 92.600
## ---------------------------------------------------------------------------
## Fuel_flow
## n missing distinct Info Mean Gmd .05 .10
## 11934 0 503 1 0.6626 0.5456 0.134 0.174
## .25 .50 .75 .90 .95
## 0.246 0.496 0.882 1.730 1.770
##
## lowest : 0.068 0.069 0.070 0.071 0.072, highest: 1.790 1.800 1.810 1.820 1.830
## ---------------------------------------------------------------------------
## Compressor_decay
## n missing distinct Info Mean Gmd .05 .10
## 11934 0 51 1 0.975 0.01699 0.952 0.955
## .25 .50 .75 .90 .95
## 0.962 0.975 0.988 0.995 0.998
##
## lowest : 0.950 0.951 0.952 0.953 0.954, highest: 0.996 0.997 0.998 0.999 1.000
## ---------------------------------------------------------------------------
## Turbine_decay
## n missing distinct Info Mean Gmd .05 .10
## 11934 0 26 0.999 0.9875 0.008655 0.9760 0.9770
## .25 .50 .75 .90 .95
## 0.9810 0.9875 0.9940 0.9980 0.9990
##
## lowest : 0.975 0.976 0.977 0.978 0.979, highest: 0.996 0.997 0.998 0.999 1.000
## ---------------------------------------------------------------------------
# Create class labels from the decay coefficients, relative to their means:
# "_E" marks rows with a coefficient at or above the mean, "_Non" rows below it.
turbine <- dplyr::mutate(turbine,
V19 = ifelse(Compressor_decay >= mean(Compressor_decay), "Compressor_E", "Compressor_Non"),
V20 = ifelse(Turbine_decay >= mean(Turbine_decay), "Turbine_E", "Turbine_Non"))
turbine$V19 <- as.factor(turbine$V19)
turbine$V20 <- as.factor(turbine$V20)
turbine <- dplyr::select(turbine, -Turbine_decay, -Compressor_decay, -Lever_position)
turbine <- turbine[, colSums(is.na(turbine)) == 0] # drop any columns containing NAs
nzv <- nearZeroVar(turbine, saveMetrics=TRUE)
nzv
## freqRatio percentUnique zeroVar
## Ship_speed 1.000000 0.07541478 FALSE
## Gas_turbine_shaft_torque 1.000000 6.61136249 FALSE
## Turb_rev_rate 1.370287 0.15082956 FALSE
## Gen_rev_rate 1.096842 0.84632143 FALSE
## Starboard_Propeller_torque 1.000000 4.48298978 FALSE
## Port_propeller_torque 1.000000 4.48298978 FALSE
## Turb_exit_temp_HP 1.030303 3.82939501 FALSE
## Gas_turbine_compressor_inlet_air_temp_T1 0.000000 0.00837942 TRUE
## Gas_turbine_compressor_outlet_air_temp_T2 1.023669 1.38260432 FALSE
## High_pressure_turbine_exit_pressure 1.179715 0.50276521 FALSE
## Gas_turbine_compressor_inlet_air_temp_P1 0.000000 0.00837942 TRUE
## Gas_turbine_compressor_outlet_air_temp_P2 1.073810 1.96078431 FALSE
## Tur_exh_gas_press 2.000000 0.03351768 FALSE
## Turb_inj_cntrl 5.006579 7.60851349 FALSE
## Fuel_flow 1.047619 4.21484833 FALSE
## V19 1.040000 0.01675884 FALSE
## V20 1.000000 0.01675884 FALSE
## nzv
## Ship_speed FALSE
## Gas_turbine_shaft_torque FALSE
## Turb_rev_rate FALSE
## Gen_rev_rate FALSE
## Starboard_Propeller_torque FALSE
## Port_propeller_torque FALSE
## Turb_exit_temp_HP FALSE
## Gas_turbine_compressor_inlet_air_temp_T1 TRUE
## Gas_turbine_compressor_outlet_air_temp_T2 FALSE
## High_pressure_turbine_exit_pressure FALSE
## Gas_turbine_compressor_inlet_air_temp_P1 TRUE
## Gas_turbine_compressor_outlet_air_temp_P2 FALSE
## Tur_exh_gas_press FALSE
## Turb_inj_cntrl FALSE
## Fuel_flow FALSE
## V19 FALSE
## V20 FALSE
turbine <- turbine[, nzv$nzv==FALSE]
set.seed(1000)
# 70/30 split, stratified on the compressor label V19
inTrain <- createDataPartition(turbine$V19, p=0.7, list=FALSE)
training <- turbine[inTrain,]
testing <- turbine[-inTrain,]
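# createDataPartition() samples within each level of V19, so the 70/30 split
# should preserve the class balance. A quick sanity check (a sketch added here,
# not part of the original run):
prop.table(table(turbine$V19))
prop.table(table(training$V19))
prop.table(table(testing$V19))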
modFit_C <- train(V19~., method="rpart", data=training)
modFit_C
## CART
##
## 8354 samples
## 14 predictor
## 2 classes: 'Compressor_E', 'Compressor_Non'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 8354, 8354, 8354, 8354, 8354, 8354, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.08424908 0.6622369 0.3161117
## 0.08876679 0.5751976 0.1362984
## 0.09450549 0.5356647 0.0541274
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.08424908.
print(modFit_C$finalModel)
## n= 8354
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 8354 4095 Compressor_E (0.509815657 0.490184343)
## 2) Gas_turbine_compressor_outlet_air_temp_T2< 779.5 7961 3705 Compressor_E (0.534606205 0.465393795)
## 4) High_pressure_turbine_exit_pressure>=3.585 983 117 Compressor_E (0.880976602 0.119023398) *
## 5) High_pressure_turbine_exit_pressure< 3.585 6978 3390 Compressor_Non (0.485812554 0.514187446)
## 10) Gas_turbine_compressor_outlet_air_temp_T2< 690.5 6085 2778 Compressor_E (0.543467543 0.456532457) *
## 11) Gas_turbine_compressor_outlet_air_temp_T2>=690.5 893 83 Compressor_Non (0.092945129 0.907054871) *
## 3) Gas_turbine_compressor_outlet_air_temp_T2>=779.5 393 3 Compressor_Non (0.007633588 0.992366412) *
predictions_C <- predict(modFit_C, newdata=testing)
confusionMatrix(predictions_C, testing$V19)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Compressor_E Compressor_Non
## Compressor_E 1771 1237
## Compressor_Non 54 518
##
## Accuracy : 0.6394
## 95% CI : (0.6234, 0.6551)
## No Information Rate : 0.5098
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.269
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9704
## Specificity : 0.2952
## Pos Pred Value : 0.5888
## Neg Pred Value : 0.9056
## Prevalence : 0.5098
## Detection Rate : 0.4947
## Detection Prevalence : 0.8402
## Balanced Accuracy : 0.6328
##
## 'Positive' Class : Compressor_E
##
cart <- rpart(V19~., data=training, method="class")
prp(cart)
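# Note that modFit_C$finalModel is itself an rpart object, so
# prp(modFit_C$finalModel) would plot the caret-selected tree directly.
# Since rattle is already loaded, its fancyRpartPlot() gives a more annotated
# rendering of the same tree (an optional alternative, not in the original run):
fancyRpartPlot(cart)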
Varimpcompr <- varImp(modFit_C, scale=TRUE)
plot(Varimpcompr, main="Critical variables for the Compressor Decay")
#### Random Forests ####
# Predictions are based on growing many classification trees and aggregating their votes.
set.seed(42)
model_rf <- caret::train(V19~.,
data = training,
method = "rf",
preProcess = c("scale", "center"),
trControl = trainControl(method = "repeatedcv",
number = 3,
repeats = 3,
savePredictions = TRUE,
verboseIter = FALSE))
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 3.3.2
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
model_rf$finalModel$confusion
## Compressor_E Compressor_Non class.error
## Compressor_E 4142 117 0.02747124
## Compressor_Non 112 3983 0.02735043
imp <- model_rf$finalModel$importance
imp[order(imp, decreasing = TRUE), ]
## Gas_turbine_compressor_outlet_air_temp_T2
## 1197.08402
## Gen_rev_rate
## 847.13702
## High_pressure_turbine_exit_pressure
## 661.73455
## Gas_turbine_compressor_outlet_air_temp_P2
## 400.68219
## Turb_exit_temp_HP
## 271.11699
## Turb_inj_cntrl
## 169.38045
## Fuel_flow
## 132.69759
## Port_propeller_torque
## 120.04169
## Starboard_Propeller_torque
## 117.35956
## Gas_turbine_shaft_torque
## 80.93833
## Turb_rev_rate
## 63.72909
## V20Turbine_Non
## 45.54016
## Ship_speed
## 24.34596
## Tur_exh_gas_press
## 5.75573
importance <- varImp(model_rf, scale = TRUE)
plot(importance)
##### Predicting test data
confusionMatrix(predict(model_rf, testing), testing$V19)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Compressor_E Compressor_Non
## Compressor_E 1773 41
## Compressor_Non 52 1714
##
## Accuracy : 0.974
## 95% CI : (0.9683, 0.979)
## No Information Rate : 0.5098
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.948
## Mcnemar's Test P-Value : 0.2998
##
## Sensitivity : 0.9715
## Specificity : 0.9766
## Pos Pred Value : 0.9774
## Neg Pred Value : 0.9706
## Prevalence : 0.5098
## Detection Rate : 0.4953
## Detection Prevalence : 0.5067
## Balanced Accuracy : 0.9741
##
## 'Positive' Class : Compressor_E
##
set.seed(27)
model_glmnet <- caret::train(V19~.,
data = training,
method = "glmnet",
preProcess = NULL,
trControl = trainControl(method = "repeatedcv", number = 3, repeats = 3, verboseIter = FALSE))
## Loading required package: glmnet
## Warning: package 'glmnet' was built under R version 3.3.2
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-5
## Warning: from glmnet Fortran code (error code -94); Convergence for 94th
## lambda value not reached after maxit=100000 iterations; solutions for
## larger lambdas returned
## (similar convergence warnings, differing only in the lambda index, were
## repeated for each of the remaining cross-validation resamples)
model_glmnet
## glmnet
##
## 8354 samples
## 14 predictor
## 2 classes: 'Compressor_E', 'Compressor_Non'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 3 times)
## Summary of sample sizes: 5569, 5570, 5569, 5569, 5569, 5570, ...
## Resampling results across tuning parameters:
##
## alpha lambda Accuracy Kappa
## 0.10 4.148582e-05 0.9336846 0.8672930
## 0.10 4.148582e-04 0.8832101 0.7661081
## 0.10 4.148582e-03 0.7821408 0.5635993
## 0.55 4.148582e-05 0.9363980 0.8727435
## 0.55 4.148582e-04 0.9018835 0.8035368
## 0.55 4.148582e-03 0.7921556 0.5837073
## 1.00 4.148582e-05 0.9435003 0.8869633
## 1.00 4.148582e-04 0.9365574 0.8730733
## 1.00 4.148582e-03 0.8609450 0.7215522
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 1 and lambda
## = 4.148582e-05.
confusionMatrix(predict(model_glmnet, testing), testing$V19)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Compressor_E Compressor_Non
## Compressor_E 1718 102
## Compressor_Non 107 1653
##
## Accuracy : 0.9416
## 95% CI : (0.9334, 0.9491)
## No Information Rate : 0.5098
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8832
## Mcnemar's Test P-Value : 0.782
##
## Sensitivity : 0.9414
## Specificity : 0.9419
## Pos Pred Value : 0.9440
## Neg Pred Value : 0.9392
## Prevalence : 0.5098
## Detection Rate : 0.4799
## Detection Prevalence : 0.5084
## Balanced Accuracy : 0.9416
##
## 'Positive' Class : Compressor_E
##
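# The convergence warnings above suggest glmnet struggles with the very
# differently scaled predictors (torques in the tens of thousands next to
# pressures near 1). A sketch worth trying (not run here): centre and scale
# inside train(), as was done for the random forest; the same preprocessing
# would also suit the distance-based kknn model fitted next.
set.seed(27)
model_glmnet_scaled <- caret::train(V19~.,
data = training,
method = "glmnet",
preProcess = c("center", "scale"),
trControl = trainControl(method = "repeatedcv", number = 3, repeats = 3, verboseIter = FALSE))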
set.seed(27)
model_kknn <- caret::train(V19~.,
data = training,
method = "kknn",
preProcess = NULL,
trControl = trainControl(method = "repeatedcv", number = 3, repeats = 3, verboseIter = FALSE))
## Loading required package: kknn
## Warning: package 'kknn' was built under R version 3.3.2
##
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
##
## contr.dummy
model_kknn
## k-Nearest Neighbors
##
## 8354 samples
## 14 predictor
## 2 classes: 'Compressor_E', 'Compressor_Non'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 3 times)
## Summary of sample sizes: 5569, 5570, 5569, 5569, 5569, 5570, ...
## Resampling results across tuning parameters:
##
## kmax Accuracy Kappa
## 5 0.9654460 0.9308673
## 7 0.9658848 0.9317447
## 9 0.9659646 0.9319039
##
## Tuning parameter 'distance' was held constant at a value of 2
##
## Tuning parameter 'kernel' was held constant at a value of optimal
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were kmax = 9, distance = 2 and
## kernel = optimal.
confusionMatrix(predict(model_kknn, testing), testing$V19)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Compressor_E Compressor_Non
## Compressor_E 1746 41
## Compressor_Non 79 1714
##
## Accuracy : 0.9665
## 95% CI : (0.9601, 0.9721)
## No Information Rate : 0.5098
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.933
## Mcnemar's Test P-Value : 0.0007312
##
## Sensitivity : 0.9567
## Specificity : 0.9766
## Pos Pred Value : 0.9771
## Neg Pred Value : 0.9559
## Prevalence : 0.5098
## Detection Rate : 0.4877
## Detection Prevalence : 0.4992
## Balanced Accuracy : 0.9667
##
## 'Positive' Class : Compressor_E
##
set.seed(27)
model_pda <- caret::train(V19~.,
data = training,
method = "pda",
preProcess = NULL,
trControl = trainControl(method = "repeatedcv", number = 3, repeats = 3, verboseIter = FALSE))
## Loading required package: mda
## Warning: package 'mda' was built under R version 3.3.2
## Loading required package: class
## Loaded mda 0.4-9
## Warning: predictions failed for Fold1.Rep1: lambda=0e+00 Error in mindist[l] <- ndist[l] :
## NAs are not allowed in subscripted assignments
## (the same prediction failure at lambda=0 was repeated for every other fold
## and repeat)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Warning in train.default(x, y, weights = w, ...): missing values found in
## aggregated results
model_pda
## Penalized Discriminant Analysis
##
## 8354 samples
## 14 predictor
## 2 classes: 'Compressor_E', 'Compressor_Non'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 3 times)
## Summary of sample sizes: 5569, 5570, 5569, 5569, 5569, 5570, ...
## Resampling results across tuning parameters:
##
## lambda Accuracy Kappa
## 0e+00 NaN NaN
## 1e-04 0.9306122 0.8611721
## 1e-01 0.9114202 0.8228023
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was lambda = 1e-04.
confusionMatrix(predict(model_pda, testing), testing$V19)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Compressor_E Compressor_Non
## Compressor_E 1699 112
## Compressor_Non 126 1643
##
## Accuracy : 0.9335
## 95% CI : (0.9249, 0.9415)
## No Information Rate : 0.5098
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.867
## Mcnemar's Test P-Value : 0.3994
##
## Sensitivity : 0.9310
## Specificity : 0.9362
## Pos Pred Value : 0.9382
## Neg Pred Value : 0.9288
## Prevalence : 0.5098
## Detection Rate : 0.4746
## Detection Prevalence : 0.5059
## Balanced Accuracy : 0.9336
##
## 'Positive' Class : Compressor_E
##
set.seed(27)
model_C5.0Tree <- caret::train(V19~.,
data = training,
method = "C5.0Tree",
preProcess = NULL,
trControl = trainControl(method = "repeatedcv", number = 3, repeats = 3, verboseIter = FALSE))
## Loading required package: C50
## Warning: package 'C50' was built under R version 3.3.2
model_C5.0Tree
## Single C5.0 Tree
##
## 8354 samples
## 14 predictor
## 2 classes: 'Compressor_E', 'Compressor_Non'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 3 times)
## Summary of sample sizes: 5569, 5570, 5569, 5569, 5569, 5570, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9656055 0.9311863
confusionMatrix(predict(model_C5.0Tree, testing), testing$V19)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Compressor_E Compressor_Non
## Compressor_E 1764 51
## Compressor_Non 61 1704
##
## Accuracy : 0.9687
## 95% CI : (0.9625, 0.9742)
## No Information Rate : 0.5098
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9374
## Mcnemar's Test P-Value : 0.3951
##
## Sensitivity : 0.9666
## Specificity : 0.9709
## Pos Pred Value : 0.9719
## Neg Pred Value : 0.9654
## Prevalence : 0.5098
## Detection Rate : 0.4927
## Detection Prevalence : 0.5070
## Balanced Accuracy : 0.9688
##
## 'Positive' Class : Compressor_E
##
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
set.seed(1951) # set the seed
# Set up to do parallel processing
doParallel::registerDoParallel(4) # Register a parallel backend for train
getDoParWorkers()
## [1] 4
# Set up training control
ctrl <- trainControl(method = "repeatedcv", # repeated k-fold cross-validation
number = 5, # 5 folds (repeats defaults to 1)
summaryFunction=twoClassSummary, # Use AUC to pick the best model
classProbs=TRUE,
allowParallel = TRUE)
# Use the expand.grid to specify the search space
# Note that the default search grid selects multiple values of each tuning parameter
grid <- expand.grid(interaction.depth=c(1,2), # Depth of variable interactions
n.trees=c(10,20), # Num trees to fit
shrinkage=c(0.01,0.1), # Try 2 values for learning rate
n.minobsinnode = 20)
gbm.tune <- train(V19~.,
data = training,
method = "gbm",
metric = "ROC",
trControl = ctrl,
tuneGrid = grid,
verbose = FALSE)
## Loading required package: gbm
## Warning: package 'gbm' was built under R version 3.3.3
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.3.3
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loaded gbm 2.1.3
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
# Look at the tuning results
# Note that ROC was the performance criterion used to select the optimal model.
gbm.tune$bestTune
## n.trees interaction.depth shrinkage n.minobsinnode
## 8 20 2 0.1 20
plot(gbm.tune) # Plot the performance of the training models
res <- gbm.tune$results
res
## shrinkage interaction.depth n.minobsinnode n.trees ROC Sens
## 1 0.01 1 20 10 0.5662687 0.9967133
## 5 0.10 1 20 10 0.6325383 0.9976523
## 3 0.01 2 20 10 0.6263945 0.9976523
## 7 0.10 2 20 10 0.8188783 0.6689642
## 2 0.01 1 20 20 0.5910135 0.9964786
## 6 0.10 1 20 20 0.7065819 0.5750010
## 4 0.01 2 20 20 0.6349838 0.9976523
## 8 0.10 2 20 20 0.8688931 0.7365665
## Spec ROCSD SensSD SpecSD
## 1 0.0991453 0.023505890 0.003559831 0.003802787
## 5 0.1035409 0.021582325 0.003319628 0.003802787
## 3 0.0971917 0.004068504 0.003319628 0.004104453
## 7 0.8346764 0.025488444 0.088502386 0.062436038
## 2 0.1001221 0.004068492 0.003421684 0.002590135
## 6 0.7057387 0.013293666 0.229079075 0.294244828
## 4 0.0971917 0.012086597 0.003319628 0.004104453
## 8 0.8390720 0.009686768 0.062524099 0.047166062
### GBM Model Predictions and Performance
# Make predictions using the test data set
gbm.pred <- predict(gbm.tune,testing)
#Look at the confusion matrix
confusionMatrix(gbm.pred,testing$V19)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Compressor_E Compressor_Non
## Compressor_E 1364 431
## Compressor_Non 461 1324
##
## Accuracy : 0.7508
## 95% CI : (0.7363, 0.7649)
## No Information Rate : 0.5098
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.5016
## Mcnemar's Test P-Value : 0.3316
##
## Sensitivity : 0.7474
## Specificity : 0.7544
## Pos Pred Value : 0.7599
## Neg Pred Value : 0.7417
## Prevalence : 0.5098
## Detection Rate : 0.3810
## Detection Prevalence : 0.5014
## Balanced Accuracy : 0.7509
##
## 'Positive' Class : Compressor_E
##
#Draw the ROC curve
gbm.probs <- predict(gbm.tune,testing,type="prob")
head(gbm.probs)
## Compressor_E Compressor_Non
## 1 0.4876732 0.5123268
## 2 0.5037738 0.4962262
## 3 0.4876732 0.5123268
## 4 0.4876732 0.5123268
## 5 0.4920193 0.5079807
## 6 0.3921560 0.6078440
library(pROC)
## Warning: package 'pROC' was built under R version 3.3.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:glmnet':
##
## auc
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
gbm.ROC <- roc(predictor=gbm.probs$Compressor_E,
response=testing$V19,
levels=rev(levels(testing$V19)))
gbm.ROC$auc
## Area under the curve: 0.8239
plot(gbm.ROC,main="GBM ROC")
#### XGBoost ####
# Extreme Gradient Boosting model
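# A minimal sketch of the XGBoost fit via caret's "xgbTree" method (requires
# the xgboost package). The grid below is an illustrative assumption, not a
# tuned configuration; it reuses the ctrl object defined for the GBM run.
set.seed(1951)
xgb.grid <- expand.grid(nrounds = c(50, 100), # number of boosting rounds
max_depth = c(2, 4), # maximum tree depth
eta = c(0.01, 0.1), # learning rate
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1)
xgb.tune <- train(V19~.,
data = training,
method = "xgbTree",
metric = "ROC",
trControl = ctrl,
tuneGrid = xgb.grid)
xgb.pred <- predict(xgb.tune, testing)
confusionMatrix(xgb.pred, testing$V19)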
#### Comparing accuracy of models #####
# Create a list of models
models <- list(rf = model_rf, glmnet = model_glmnet, kknn = model_kknn, pda = model_pda,
C5.0Tree = model_C5.0Tree)
# Resample the models
resample_results <- resamples(models)
# Generate a summary
summary(resample_results, metric = c("Kappa", "Accuracy"))
##
## Call:
## summary.resamples(object = resample_results, metric =
## c("Kappa", "Accuracy"))
##
## Models: rf, glmnet, kknn, pda, C5.0Tree
## Number of resamples: 9
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rf 0.9411 0.9418 0.9418 0.9440 0.9454 0.9533 0
## glmnet 0.8750 0.8829 0.8887 0.8870 0.8922 0.8951 0
## kknn 0.9239 0.9289 0.9303 0.9319 0.9353 0.9418 0
## pda 0.8484 0.8556 0.8606 0.8612 0.8635 0.8736 0
## C5.0Tree 0.9239 0.9267 0.9289 0.9312 0.9325 0.9432 0
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rf 0.9705 0.9709 0.9709 0.9720 0.9727 0.9767 0
## glmnet 0.9375 0.9415 0.9443 0.9435 0.9461 0.9476 0
## kknn 0.9619 0.9645 0.9652 0.9660 0.9677 0.9709 0
## pda 0.9242 0.9278 0.9303 0.9306 0.9318 0.9368 0
## C5.0Tree 0.9619 0.9634 0.9645 0.9656 0.9662 0.9716 0
bwplot(resample_results , metric = c("Kappa","Accuracy"))
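# caret can also test whether the resampled differences between models are
# statistically meaningful: diff() on a resamples object computes pairwise
# differences and summary() reports t-tests on them (a follow-up sketch using
# only functions already loaded).
difValues <- diff(resample_results)
summary(difValues)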
#### Deep Learning with the H2O framework ####
require(h2o)
## Loading required package: h2o
## Warning: package 'h2o' was built under R version 3.3.3
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following object is masked from 'package:pROC':
##
## var
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
h2o.no_progress()
localH2O <- h2o.init() # Initialise
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\OLEG~1.BAY\AppData\Local\Temp\Rtmp08KrWL/h2o_oleg_baydakov_started_from_r.out
## C:\Users\OLEG~1.BAY\AppData\Local\Temp\Rtmp08KrWL/h2o_oleg_baydakov_started_from_r.err
##
##
## Starting H2O JVM and connecting: . Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 3 seconds 414 milliseconds
## H2O cluster version: 3.10.5.2
## H2O cluster version age: 7 days, 17 hours and 30 minutes
## H2O cluster name: H2O_started_from_R_oleg.baydakov_jim163
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.76 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 3.3.1 (2016-06-21)
h2o.train <- as.h2o(training)
h2o.test <- as.h2o(testing)
## Train deep learning neural net with 5 hidden layers,
## ReLU with dropout, 10000 epochs, then predict on held-out test set:
model <- h2o.deeplearning(
x = setdiff(colnames(h2o.train),
c("V19","V20")),
y = "V19",
training_frame = h2o.train,
activation = "RectifierWithDropout",
hidden = c(10, 10, 10, 10, 10),
epochs = 10000)
predictions <- h2o.predict(model, h2o.test) # returns predicted class plus per-class probabilities
## Calculate AUC:
suppressMessages(require(ROCR, quietly = TRUE))
## Warning: package 'ROCR' was built under R version 3.3.2
## Warning: package 'gplots' was built under R version 3.3.2
preds <- as.data.frame(predictions$Compressor_Non) # probability of the Compressor_Non class
labels <- as.data.frame(h2o.test[,c("V19")])
p <- prediction(preds[,1], labels)
auc.perf <- performance(p, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.9881252
## Plot ROC curve:
plot(performance(p, measure = "tpr", x.measure = "fpr"), col = "red")
abline(a = 0, b = 1, lty = 2)
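# As a cross-check on the ROCR numbers above, H2O can compute the same metrics
# natively; h2o.performance() on the held-out frame and h2o.auc() should agree
# with the AUC reported above (a sketch using the standard h2o R API):
perf <- h2o.performance(model, newdata = h2o.test)
h2o.auc(perf)
plot(perf, type = "roc")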