Forest classification using spectral data: 4 forest classes (d, h, o, s) and 27 spectral variables. These data have been taken from the UCI Machine Learning repository (http://archive.ics.uci.edu/ml/datasets/Forest+type+mapping). The objective is to build a model that classifies the four forest types from the 27 spectral predictor variables. For this we have been provided with a training dataset of 198 observations. The test dataset has 325 observations, and its first column holds the actual forest class labels. The idea is to build the model using the training data and apply it to the predictor variables provided in the test dataset. We will then compare our predictions with the actual class labels in the test dataset to obtain the overall accuracy.
# Load the training set and preview its first rows.
# The project folder below is specific to the original author's machine, so
# only switch to it when it exists; otherwise the CSV files are read from the
# current working directory.
data_dir <- "C:\\Users\\Minerva\\Dropbox\\Desk2015\\Rwork-Datascience\\ForestTypes"
if (dir.exists(data_dir)) {
  setwd(data_dir)
}
# stringsAsFactors = TRUE keeps `class` a factor on R >= 4.0, where read.csv()
# no longer converts strings to factors by default.
train <- read.csv("training.csv", stringsAsFactors = TRUE)
# attach(train) was removed: every model call in this script passes
# `data = train` explicitly, and an attached copy would go stale as soon as
# `train` is reassigned (e.g. by na.omit()).
head(train)
## class b1 b2 b3 b4 b5 b6 b7 b8 b9 pred_minus_obs_H_b1
## 1 d 39 36 57 91 59 101 93 27 60 75.70
## 2 h 84 30 57 112 51 98 92 26 62 30.58
## 3 s 53 25 49 99 51 93 84 26 58 63.20
## 4 s 59 26 49 103 47 92 82 25 56 55.54
## 5 d 57 49 66 103 64 106 114 28 59 59.44
## 6 h 85 28 56 120 52 98 101 27 65 35.14
## pred_minus_obs_H_b2 pred_minus_obs_H_b3 pred_minus_obs_H_b4
## 1 14.86 40.35 7.97
## 2 20.42 39.83 -16.74
## 3 26.70 49.28 3.25
## 4 24.50 47.90 -6.20
## 5 2.62 32.02 -1.33
## 6 23.43 42.29 -16.58
## pred_minus_obs_H_b5 pred_minus_obs_H_b6 pred_minus_obs_H_b7
## 1 -32.92 -38.92 -14.94
## 2 -24.92 -36.33 -15.67
## 3 -24.89 -30.38 -3.60
## 4 -20.98 -30.28 -5.03
## 5 -37.99 -43.57 -34.25
## 6 -25.43 -34.14 -17.45
## pred_minus_obs_H_b8 pred_minus_obs_H_b9 pred_minus_obs_S_b1
## 1 4.47 -2.36 -18.41
## 2 8.16 -2.26 -16.27
## 3 4.15 -1.46 -15.92
## 4 7.77 2.68 -13.77
## 5 1.83 -2.94 -21.74
## 6 1.58 -10.28 -26.18
## pred_minus_obs_S_b2 pred_minus_obs_S_b3 pred_minus_obs_S_b4
## 1 -1.88 -6.43 -21.03
## 2 -1.95 -6.25 -18.79
## 3 -1.79 -4.64 -17.73
## 4 -2.53 -6.34 -22.03
## 5 -1.64 -4.62 -23.74
## 6 -1.89 -5.89 -34.92
## pred_minus_obs_S_b5 pred_minus_obs_S_b6 pred_minus_obs_S_b7
## 1 -1.60 -6.18 -22.50
## 2 -1.99 -6.18 -23.41
## 3 -0.48 -4.69 -19.97
## 4 -2.34 -6.60 -27.10
## 5 -0.85 -5.50 -22.83
## 6 -1.89 -8.05 -29.72
## pred_minus_obs_S_b8 pred_minus_obs_S_b9
## 1 -5.20 -7.86
## 2 -8.87 -10.83
## 3 -4.10 -7.07
## 4 -7.99 -10.81
## 5 -2.74 -5.84
## 6 -1.94 -4.94
# Load the test set; its first column (`class`) holds the true forest type.
# stringsAsFactors = TRUE keeps `class` a factor on R >= 4.0, which the
# confusionMatrix() calls later in the script need for the reference labels.
test <- read.csv("testing.csv", stringsAsFactors = TRUE)
head(test)
## class b1 b2 b3 b4 b5 b6 b7 b8 b9 pred_minus_obs_H_b1
## 1 d 67 51 68 115 69 111 136 31 67 47.70
## 2 s 67 28 51 99 50 97 82 26 59 47.93
## 3 s 63 26 50 95 49 91 81 26 57 53.09
## 4 d 63 42 63 97 66 108 111 28 59 52.41
## 5 s 46 27 50 83 51 90 76 26 56 68.54
## 6 d 59 59 84 93 70 104 92 29 58 55.56
## pred_minus_obs_H_b2 pred_minus_obs_H_b3 pred_minus_obs_H_b4
## 1 -0.27 29.16 -16.32
## 2 23.77 47.98 1.76
## 3 25.72 48.33 7.16
## 4 9.76 35.69 4.44
## 5 24.27 48.21 16.37
## 6 -8.33 13.12 4.98
## pred_minus_obs_H_b5 pred_minus_obs_H_b6 pred_minus_obs_H_b7
## 1 -42.93 -49.01 -58.09
## 2 -23.88 -34.41 -2.89
## 3 -22.89 -28.40 -0.69
## 4 -39.89 -45.42 -31.33
## 5 -24.89 -27.66 2.19
## 6 -43.96 -42.10 -14.52
## pred_minus_obs_H_b8 pred_minus_obs_H_b9 pred_minus_obs_S_b1
## 1 0.71 -9.17 -18.27
## 2 4.32 -2.25 -20.13
## 3 4.16 -0.44 -17.64
## 4 2.24 -2.34 -20.20
## 5 4.93 1.25 -18.62
## 6 3.15 0.23 -17.16
## pred_minus_obs_S_b2 pred_minus_obs_S_b3 pred_minus_obs_S_b4
## 1 -1.80 -6.32 -20.88
## 2 -2.11 -6.35 -21.94
## 3 -1.81 -4.70 -19.39
## 4 -1.89 -5.47 -21.65
## 5 -2.17 -7.11 -21.12
## 6 -1.98 -6.48 -20.86
## pred_minus_obs_S_b5 pred_minus_obs_S_b6 pred_minus_obs_S_b7
## 1 -1.63 -6.13 -22.56
## 2 -1.22 -6.13 -22.20
## 3 -0.65 -5.01 -20.89
## 4 -0.99 -5.71 -22.19
## 5 -1.56 -6.35 -22.19
## 6 -1.79 -6.25 -23.44
## pred_minus_obs_S_b8 pred_minus_obs_S_b9
## 1 -5.53 -8.11
## 2 -3.41 -6.57
## 3 -3.96 -6.85
## 4 -3.41 -6.52
## 5 -4.45 -7.32
## 6 -6.50 -8.93
# Predictor-only view of the test set: everything except the first column,
# which is the class label.
x2 <- test[-1]
# Per-column summary of the training data (class counts and quartiles).
summary(train)
## class b1 b2 b3
## d :54 Min. : 34.00 Min. : 25.00 Min. : 47.00
## h :48 1st Qu.: 54.00 1st Qu.: 28.00 1st Qu.: 52.00
## o :37 Median : 60.00 Median : 31.50 Median : 57.00
## s :59 Mean : 62.95 Mean : 41.02 Mean : 63.68
## 3rd Qu.: 70.75 3rd Qu.: 50.75 3rd Qu.: 69.00
## Max. :105.00 Max. :160.00 Max. :196.00
## b4 b5 b6 b7
## Min. : 54.00 Min. :44.00 Min. : 84.0 Min. : 54.0
## 1st Qu.: 92.25 1st Qu.:49.00 1st Qu.: 92.0 1st Qu.: 80.0
## Median : 99.50 Median :55.00 Median : 98.0 Median : 91.0
## Mean :101.41 Mean :58.73 Mean :100.7 Mean : 90.6
## 3rd Qu.:111.75 3rd Qu.:65.00 3rd Qu.:107.0 3rd Qu.:101.0
## Max. :172.00 Max. :98.00 Max. :136.0 Max. :139.0
## b8 b9 pred_minus_obs_H_b1 pred_minus_obs_H_b2
## Min. :21.00 Min. : 50.00 Min. : 7.66 Min. :-112.6000
## 1st Qu.:24.00 1st Qu.: 55.00 1st Qu.:40.67 1st Qu.: 0.2725
## Median :25.00 Median : 58.00 Median :53.03 Median : 18.8050
## Mean :28.69 Mean : 61.12 Mean :50.82 Mean : 9.8083
## 3rd Qu.:27.00 3rd Qu.: 63.00 3rd Qu.:59.92 3rd Qu.: 22.2575
## Max. :82.00 Max. :109.00 Max. :83.32 Max. : 29.7900
## pred_minus_obs_H_b3 pred_minus_obs_H_b4 pred_minus_obs_H_b5
## Min. :-106.12 Min. :-77.010 Min. :-73.29
## 1st Qu.: 27.20 1st Qu.:-15.922 1st Qu.:-39.77
## Median : 37.61 Median : -2.180 Median :-29.16
## Mean : 32.54 Mean : -3.899 Mean :-33.42
## 3rd Qu.: 43.33 3rd Qu.: 6.657 3rd Qu.:-23.89
## Max. : 55.97 Max. : 40.820 Max. :-19.49
## pred_minus_obs_H_b6 pred_minus_obs_H_b7 pred_minus_obs_H_b8
## Min. :-76.09 Min. :-62.740 Min. :-52.000
## 1st Qu.:-46.16 1st Qu.:-23.585 1st Qu.: 1.978
## Median :-37.51 Median :-14.835 Median : 4.140
## Mean :-40.45 Mean :-13.912 Mean : 1.005
## 3rd Qu.:-32.94 3rd Qu.: -3.248 3rd Qu.: 5.500
## Max. :-25.68 Max. : 24.330 Max. : 10.830
## pred_minus_obs_H_b9 pred_minus_obs_S_b1 pred_minus_obs_S_b2
## Min. :-53.5300 Min. :-32.95 Min. :-8.8000
## 1st Qu.: -6.6275 1st Qu.:-23.32 1st Qu.:-1.8600
## Median : -2.2550 Median :-20.02 Median :-0.9700
## Mean : -5.5941 Mean :-20.04 Mean :-1.0071
## 3rd Qu.: 0.2475 3rd Qu.:-17.79 3rd Qu.:-0.0425
## Max. : 5.7400 Max. : 5.13 Max. :12.4600
## pred_minus_obs_S_b3 pred_minus_obs_S_b4 pred_minus_obs_S_b5
## Min. :-11.210 Min. :-40.37 Min. :-3.2700
## 1st Qu.: -5.790 1st Qu.:-24.09 1st Qu.:-1.2900
## Median : -4.350 Median :-20.46 Median :-0.9450
## Mean : -4.356 Mean :-21.00 Mean :-0.9737
## 3rd Qu.: -2.882 3rd Qu.:-17.95 3rd Qu.:-0.6425
## Max. : 7.370 Max. : 1.88 Max. : 3.4400
## pred_minus_obs_S_b6 pred_minus_obs_S_b7 pred_minus_obs_S_b8
## Min. :-8.730 Min. :-34.14 Min. :-8.870
## 1st Qu.:-5.747 1st Qu.:-22.24 1st Qu.:-2.370
## Median :-4.540 Median :-19.20 Median :-1.420
## Mean :-4.598 Mean :-18.84 Mean :-1.571
## 3rd Qu.:-3.618 3rd Qu.:-16.23 3rd Qu.:-0.655
## Max. : 3.940 Max. : 3.67 Max. : 8.840
## pred_minus_obs_S_b9
## Min. :-10.830
## 1st Qu.: -5.122
## Median : -4.125
## Mean : -4.156
## 3rd Qu.: -3.105
## Max. : 7.790
# Drop any training rows that contain missing values.
train <- na.omit(train)
# Show the structure (column names and types) of the test set.
str(test)
## 'data.frame': 325 obs. of 28 variables:
## $ class : Factor w/ 4 levels "d ","h ","o ",..: 1 4 4 1 4 1 2 3 4 1 ...
## $ b1 : int 67 67 63 63 46 59 83 63 77 57 ...
## $ b2 : int 51 28 26 42 27 59 28 37 29 44 ...
## $ b3 : int 68 51 50 63 50 84 54 58 52 65 ...
## $ b4 : int 115 99 95 97 83 93 117 95 103 107 ...
## $ b5 : int 69 50 49 66 51 70 51 58 51 59 ...
## $ b6 : int 111 97 91 108 90 104 96 101 93 104 ...
## $ b7 : int 136 82 81 111 76 92 105 89 87 98 ...
## $ b8 : int 31 26 26 28 26 29 27 27 27 26 ...
## $ b9 : int 67 59 57 59 56 58 64 62 59 59 ...
## $ pred_minus_obs_H_b1: num 47.7 47.9 53.1 52.4 68.5 ...
## $ pred_minus_obs_H_b2: num -0.27 23.77 25.72 9.76 24.27 ...
## $ pred_minus_obs_H_b3: num 29.2 48 48.3 35.7 48.2 ...
## $ pred_minus_obs_H_b4: num -16.32 1.76 7.16 4.44 16.37 ...
## $ pred_minus_obs_H_b5: num -42.9 -23.9 -22.9 -39.9 -24.9 ...
## $ pred_minus_obs_H_b6: num -49 -34.4 -28.4 -45.4 -27.7 ...
## $ pred_minus_obs_H_b7: num -58.09 -2.89 -0.69 -31.33 2.19 ...
## $ pred_minus_obs_H_b8: num 0.71 4.32 4.16 2.24 4.93 3.15 1.66 3.14 4.8 4.11 ...
## $ pred_minus_obs_H_b9: num -9.17 -2.25 -0.44 -2.34 1.25 0.23 -9.18 -5.46 -1.07 -2.38 ...
## $ pred_minus_obs_S_b1: num -18.3 -20.1 -17.6 -20.2 -18.6 ...
## $ pred_minus_obs_S_b2: num -1.8 -2.11 -1.81 -1.89 -2.17 -1.98 -1.87 -1.74 -2.31 -2.18 ...
## $ pred_minus_obs_S_b3: num -6.32 -6.35 -4.7 -5.47 -7.11 -6.48 -5.87 -4.98 -6.72 -6.74 ...
## $ pred_minus_obs_S_b4: num -20.9 -21.9 -19.4 -21.6 -21.1 ...
## $ pred_minus_obs_S_b5: num -1.63 -1.22 -0.65 -0.99 -1.56 -1.79 -1.83 -0.93 -1.77 -1.21 ...
## $ pred_minus_obs_S_b6: num -6.13 -6.13 -5.01 -5.71 -6.35 -6.25 -7.97 -5.59 -6.29 -6.24 ...
## $ pred_minus_obs_S_b7: num -22.6 -22.2 -20.9 -22.2 -22.2 ...
## $ pred_minus_obs_S_b8: num -5.53 -3.41 -3.96 -3.41 -4.45 -6.5 -2 -3.26 -6.11 -3.06 ...
## $ pred_minus_obs_S_b9: num -8.11 -6.57 -6.85 -6.52 -7.32 -8.93 -5.03 -6.37 -8.57 -6.32 ...
Pre-process
#10 fold cross validation
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Resampling scheme shared by every model below:
# 10-fold cross-validation, repeated 10 times.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
ML models: GBM, RF and SVM (radial basis function kernel) can be used for forest classification. We compare all three to see which performs better in training.
# Gradient boosting machine on all predictors, resampled with fitControl.
# The seed is set first so the fit can be reproduced.
set.seed(825)
gbmFit1 <- train(
  class ~ .,
  data = train,
  method = "gbm",
  trControl = fitControl,
  # Not a train() argument: it is passed through to gbm().
  verbose = FALSE
)
## Loading required package: gbm
## Loading required package: survival
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
##
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
## Loading required package: plyr
Random Forests (RF)
# Random forest on all predictors, resampled with fitControl.
# Use the same seed as the GBM fit so both models are resampled on identical
# folds; resamples() later compares the models resample by resample.
# The `verbose = FALSE` argument (a gbm() option copied from the GBM call)
# was dropped because it has no meaning for the random forest.
set.seed(825)
rfFit1 <- train(
  class ~ .,
  data = train,
  method = "rf",
  trControl = fitControl
)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
rfFit1
## Random Forest
##
## 198 samples
## 27 predictor
## 4 classes: 'd ', 'h ', 'o ', 's '
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 178, 178, 178, 179, 179, 178, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 0.9553634 0.9399024 0.04927229 0.06627086
## 14 0.9598237 0.9458730 0.04678634 0.06293982
## 27 0.9518638 0.9351078 0.04868627 0.06552391
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 14.
SVM with RBF
# Support vector machine with a radial basis function kernel on all
# predictors, resampled with fitControl.
# Use the same seed as the GBM fit so both models are resampled on identical
# folds; resamples() later compares the models resample by resample.
# The `verbose = FALSE` argument (a gbm() option copied from the GBM call)
# was dropped because it has no meaning for the SVM.
set.seed(825)
svmFit1 <- train(
  class ~ .,
  data = train,
  method = "svmRadial",
  trControl = fitControl
)
## Loading required package: kernlab
svmFit1
## Support Vector Machines with Radial Basis Function Kernel
##
## 198 samples
## 27 predictor
## 4 classes: 'd ', 'h ', 'o ', 's '
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 178, 177, 179, 179, 179, 179, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa Accuracy SD Kappa SD
## 0.25 0.9411859 0.9205275 0.04731999 0.06397366
## 0.50 0.9491483 0.9314965 0.04652552 0.06250731
## 1.00 0.9621980 0.9491877 0.03967537 0.05322154
##
## Tuning parameter 'sigma' was held constant at a value of 0.03685315
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.03685315 and C = 1.
All these models have a cross-validated accuracy greater than 90% on the training data. We compare them and select the model with the highest accuracy on the training data.
# Gather the cross-validation resamples of the three fitted models and
# summarise the distribution of their Accuracy and Kappa values.
results <- resamples(
  list(
    svm = svmFit1,
    GBM = gbmFit1,
    rf = rfFit1
  )
)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: svm, GBM, rf
## Number of resamples: 100
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## svm 0.8333 0.9474 0.9524 0.9622 1 1 0
## GBM 0.8333 0.9474 0.9512 0.9624 1 1 0
## rf 0.7500 0.9474 0.9524 0.9598 1 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## svm 0.7796 0.9288 0.9361 0.9492 1 1 0
## GBM 0.7750 0.9288 0.9345 0.9494 1 1 0
## rf 0.6656 0.9288 0.9362 0.9459 1 1 0
# Box-and-whisker plot of the resampled metrics for each model in `results`.
bwplot(results)
# Reference for this model-comparison approach:
# http://machinelearningmastery.com/compare-models-and-select-the-best-using-the-caret-r-package/
Their performance is virtually identical on the training data, so we will select GBM as our classification model. The most important thing is to examine how well our model (built on the training data) predicts the test data (stored here in the variable `test`).
# The first column of the test data holds the actual class. It is left out of
# the predictors; the model's predictions on the remaining columns are then
# compared with those actual labels.
x2 <- test[-1]
tp <- predict(gbmFit1, newdata = x2)
t <- test[[1]]
# Overall accuracy and kappa on the test set.
postResample(pred = tp, obs = t)
## Accuracy Kappa
## 0.8184615 0.7346608
# Per-class breakdown: predictions as `data`, actual labels as `reference`.
confusionMatrix(data = tp, reference = t)
## Confusion Matrix and Statistics
##
## Reference
## Prediction d h o s
## d 85 0 10 10
## h 3 27 0 7
## o 9 0 35 0
## s 8 11 1 119
##
## Overall Statistics
##
## Accuracy : 0.8185
## 95% CI : (0.7722, 0.8588)
## No Information Rate : 0.4185
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7347
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: d Class: h Class: o Class: s
## Sensitivity 0.8095 0.71053 0.7609 0.8750
## Specificity 0.9091 0.96516 0.9677 0.8942
## Pos Pred Value 0.8095 0.72973 0.7955 0.8561
## Neg Pred Value 0.9091 0.96181 0.9609 0.9086
## Prevalence 0.3231 0.11692 0.1415 0.4185
## Detection Rate 0.2615 0.08308 0.1077 0.3662
## Detection Prevalence 0.3231 0.11385 0.1354 0.4277
## Balanced Accuracy 0.8593 0.83784 0.8643 0.8846
In some cases, using all predictor variables for model building produces models that fit the training data very well but not unseen data. This is known as overfitting. We demonstrate a simple way of doing feature engineering (here, feature selection): we keep only the variables whose scaled importance in our model exceeds 50, as reported by varImp from the caret package. Then we build our model using these variables.
# Rank the predictors by their importance in the fitted GBM; in the output
# below the scores are scaled so that the top variable is 100.
varImp(gbmFit1)
## gbm variable importance
##
## only 20 most important variables shown (out of 27)
##
## Overall
## b3 100.000
## pred_minus_obs_H_b1 98.408
## pred_minus_obs_H_b8 79.059
## b8 78.585
## b2 55.731
## pred_minus_obs_S_b9 16.200
## b9 13.691
## pred_minus_obs_H_b9 10.724
## pred_minus_obs_H_b2 9.793
## pred_minus_obs_H_b7 6.878
## pred_minus_obs_H_b6 5.527
## b1 4.960
## pred_minus_obs_H_b4 4.050
## pred_minus_obs_S_b8 2.586
## pred_minus_obs_H_b3 2.079
## b7 1.709
## pred_minus_obs_S_b5 1.707
## b6 1.609
## b5 1.559
## pred_minus_obs_S_b2 1.555
# Refit the GBM using only the five predictors whose scaled importance in
# gbmFit1 exceeds 50: b3, pred_minus_obs_H_b1, pred_minus_obs_H_b8, b2, b8.
# Seed as for gbmFit1 so this stochastic fit is reproducible and is resampled
# on the same folds as the full model.
set.seed(825)
gbmFit2 <- train(
  class ~ b3 + pred_minus_obs_H_b1 + pred_minus_obs_H_b8 + b2 + b8,
  data = train,
  method = "gbm",
  trControl = fitControl,
  # Not a train() argument: it is passed through to gbm().
  verbose = FALSE
)
gbmFit2
## Stochastic Gradient Boosting
##
## 198 samples
## 27 predictor
## 4 classes: 'd ', 'h ', 'o ', 's '
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 179, 178, 179, 179, 177, 178, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa Accuracy SD
## 1 50 0.9429613 0.9229496 0.05299440
## 1 100 0.9430495 0.9231403 0.05167768
## 1 150 0.9451021 0.9259443 0.05175558
## 2 50 0.9516385 0.9347453 0.04569954
## 2 100 0.9556723 0.9402833 0.04672476
## 2 150 0.9571510 0.9422479 0.04748126
## 3 50 0.9549956 0.9391768 0.05091394
## 3 100 0.9581510 0.9435442 0.04556957
## 3 150 0.9571723 0.9422330 0.04760272
## Kappa SD
## 0.07179091
## 0.07005852
## 0.07010052
## 0.06184412
## 0.06302088
## 0.06408950
## 0.06913293
## 0.06161771
## 0.06443209
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 100,
## interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
# Evaluate the reduced-variable model (gbmFit2) on the test set.
# Bug fix: this step previously predicted with gbmFit1, so it only repeated
# the full-model evaluation and never tested the reduced model.
# NOTE: the output recorded in this document was produced with gbmFit1 (it is
# identical to the earlier evaluation) and has to be regenerated.
x2 <- test[, -1]
tp <- predict(gbmFit2, newdata = x2) # predict on the explanatory variables of the test data
t <- test[, 1] # actual class labels
postResample(pred = tp, obs = t)
## Accuracy Kappa
## 0.8184615 0.7346608
confusionMatrix(data = tp, reference = t)
## Confusion Matrix and Statistics
##
## Reference
## Prediction d h o s
## d 85 0 10 10
## h 3 27 0 7
## o 9 0 35 0
## s 8 11 1 119
##
## Overall Statistics
##
## Accuracy : 0.8185
## 95% CI : (0.7722, 0.8588)
## No Information Rate : 0.4185
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7347
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: d Class: h Class: o Class: s
## Sensitivity 0.8095 0.71053 0.7609 0.8750
## Specificity 0.9091 0.96516 0.9677 0.8942
## Pos Pred Value 0.8095 0.72973 0.7955 0.8561
## Neg Pred Value 0.9091 0.96181 0.9609 0.9086
## Prevalence 0.3231 0.11692 0.1415 0.4185
## Detection Rate 0.2615 0.08308 0.1077 0.3662
## Detection Prevalence 0.3231 0.11385 0.1354 0.4277
## Balanced Accuracy 0.8593 0.83784 0.8643 0.8846
The GBM-based model predicts the actual forest class labels with an overall accuracy of 81.85%.
Ensemble modelling
# Stack the three base models: their test-set predictions become the
# predictors of a random-forest meta-model.
rfresult <- predict(rfFit1, x2)
gbmresult <- predict(gbmFit1, x2)
svmresult <- predict(svmFit1, x2)
combined.data <- data.frame(rfresult, gbmresult, svmresult, class = test$class)
# NOTE(review): the meta-model is trained on the test-set labels and is then
# scored on the same rows, so its accuracy is an optimistic estimate. An
# unbiased figure needs a stacker trained on out-of-fold predictions from the
# training data, or a separate hold-out set.
set.seed(825) # make the meta-model fit reproducible
combined.model <- train(class ~ ., data = combined.data, method = "rf") # stack results of all 3 models using RF
# Bug fix: predict from combined.data. The previous newdata (x2) contains none
# of the meta-model's predictors (rfresult, gbmresult, svmresult); the call
# presumably only worked because those names also exist as global variables.
combined.result <- predict(combined.model, newdata = combined.data)
# Bug fix: predictions go in `data` and the true labels in `reference`; the
# two were swapped, which transposes the table and the per-class statistics.
confusionMatrix(data = rfresult, reference = test$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction d h o s
## d 76 0 10 19
## h 0 29 0 9
## o 11 0 33 2
## s 5 7 0 124
##
## Overall Statistics
##
## Accuracy : 0.8062
## 95% CI : (0.7589, 0.8477)
## No Information Rate : 0.4738
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7143
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: d Class: h Class: o Class: s
## Sensitivity 0.8261 0.80556 0.7674 0.8052
## Specificity 0.8755 0.96886 0.9539 0.9298
## Pos Pred Value 0.7238 0.76316 0.7174 0.9118
## Neg Pred Value 0.9273 0.97561 0.9642 0.8413
## Prevalence 0.2831 0.11077 0.1323 0.4738
## Detection Rate 0.2338 0.08923 0.1015 0.3815
## Detection Prevalence 0.3231 0.11692 0.1415 0.4185
## Balanced Accuracy 0.8508 0.88721 0.8607 0.8675
# Bug fix: predictions go in `data` and the true labels in `reference`; the
# two were swapped, which transposes the table and the per-class statistics
# (overall accuracy is unaffected).
confusionMatrix(data = combined.result, reference = test$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction d h o s
## d 87 1 7 10
## h 0 34 0 4
## o 10 0 35 1
## s 5 9 0 122
##
## Overall Statistics
##
## Accuracy : 0.8554
## 95% CI : (0.8124, 0.8918)
## No Information Rate : 0.4215
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7898
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: d Class: h Class: o Class: s
## Sensitivity 0.8529 0.7727 0.8333 0.8905
## Specificity 0.9193 0.9858 0.9611 0.9255
## Pos Pred Value 0.8286 0.8947 0.7609 0.8971
## Neg Pred Value 0.9318 0.9652 0.9749 0.9206
## Prevalence 0.3138 0.1354 0.1292 0.4215
## Detection Rate 0.2677 0.1046 0.1077 0.3754
## Detection Prevalence 0.3231 0.1169 0.1415 0.4185
## Balanced Accuracy 0.8861 0.8792 0.8972 0.9080
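The stacked ensemble reaches about 85% prediction accuracy on the test set. Note that the stacking model was trained on these same test-set predictions and labels, so this figure is optimistic.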