# Required Packages
# library() stops immediately when a package is missing, whereas require()
# only warns and returns FALSE, deferring the failure to the first model call.
library(e1071)         # naiveBayes(), svm()
library(caret)         # confusionMatrix()
library(randomForest)  # tuneRF(), randomForest()
library(C50)           # C5.0()
# Read Data
# Training and test files share one layout: five (suit, rank) card pairs
# followed by the hand label, so the column names are defined once.
# NOTE(review): read.csv() defaults to header = TRUE, so the first line of each
# file is treated as a header before being renamed here -- confirm that both
# files really start with a header row, otherwise one observation is lost.
poker_cols <- c('suit_1', 'rank_1', 'suit_2', 'rank_2', 'suit_3', 'rank_3',
                'suit_4', 'rank_4', 'suit_5', 'rank_5', 'poker_hand')
xtr <- read.csv('training.csv')
colnames(xtr) <- poker_cols
xte <- read.csv('testing.csv')
colnames(xte) <- poker_cols
# Overview of the raw test set; the NA counts it reports are handled in the
# cleaning step. (The copy `xtt` is taken after cleaning, not here.)
summary(xte)
## suit_1 rank_1 suit_2 rank_2
## Min. :1.0 Min. : 1.000 Min. :1.000 Min. : 1.000
## 1st Qu.:2.0 1st Qu.: 4.000 1st Qu.:2.000 1st Qu.: 4.000
## Median :2.0 Median : 7.000 Median :2.000 Median : 7.000
## Mean :2.5 Mean : 6.987 Mean :2.501 Mean : 7.019
## 3rd Qu.:3.0 3rd Qu.:10.000 3rd Qu.:4.000 3rd Qu.:10.000
## Max. :4.0 Max. :13.000 Max. :4.000 Max. :13.000
##
## suit_3 rank_3 suit_4 rank_4
## Min. :1.000 Min. : 1.000 Min. :1.000 Min. : 1.000
## 1st Qu.:2.000 1st Qu.: 4.000 1st Qu.:1.000 1st Qu.: 4.000
## Median :3.000 Median : 7.000 Median :2.000 Median : 7.000
## Mean :2.505 Mean : 7.002 Mean :2.498 Mean : 6.992
## 3rd Qu.:4.000 3rd Qu.:10.000 3rd Qu.:3.000 3rd Qu.:10.000
## Max. :4.000 Max. :13.000 Max. :4.000 Max. :13.000
## NA's :1
## suit_5 rank_5 poker_hand
## Min. :1.000 Min. : 1.000 Min. :0.000
## 1st Qu.:1.000 1st Qu.: 4.000 1st Qu.:0.000
## Median :3.000 Median : 7.000 Median :0.000
## Mean :2.502 Mean : 6.992 Mean :0.617
## 3rd Qu.:4.000 3rd Qu.:10.000 3rd Qu.:1.000
## Max. :4.000 Max. :13.000 Max. :9.000
## NA's :1 NA's :1 NA's :1
# Clean Data
# Count missing cells in the test set before cleaning.
table(is.na(xte))
##
## FALSE TRUE
## 1175137 4
# Drop every row that has a missing value in any column. Filtering on rank_4
# alone is only correct while all NAs happen to share a row with a missing
# rank_4; the summary reports 4 missing cells spread over several columns.
xte <- xte[complete.cases(xte), ]
# Verify that no missing cells remain.
table(is.na(xte))
##
## FALSE
## 1175130
# Second copy of the cleaned test set; its poker_hand column serves as the
# ground truth when the models are scored.
xtt <- xte
# Format Data
# Suits, ranks and the hand label are stored as integer codes; convert every
# one of them to a factor so the classifiers treat them as categorical.
# The same eleven columns are converted in the training set (xtr), the test
# set (xte) and the ground-truth copy of the test set (xtt).
factor_cols <- c(paste0('suit_', 1:5), paste0('rank_', 1:5), 'poker_hand')
xtr[factor_cols] <- lapply(xtr[factor_cols], as.factor)
xte[factor_cols] <- lapply(xte[factor_cols], as.factor)
xtt[factor_cols] <- lapply(xtt[factor_cols], as.factor)
# NaiveBayes
# Fit on the training set with poker_hand as the response and all remaining
# columns as predictors. The response is named by column (not as
# xtr$poker_hand) so the formula is resolved entirely inside `data`.
nb <- naiveBayes(poker_hand ~ ., data = xtr)
predict_bn <- predict(nb, xte)
# confusionMatrix() takes the predicted classes as `data` and the true classes
# as `reference`. Passing them the other way round leaves accuracy unchanged
# but swaps the table axes and computes the No Information Rate, its P-value,
# prevalence and the per-class statistics from the predictions instead of the
# true labels.
cm_nb <- confusionMatrix(data = predict_bn, reference = xtt$poker_hand)
# NOTE: the output recorded below was captured with the two arguments swapped
# (rows labelled "Prediction" are the true hands, columns labelled "Reference"
# are the model's predictions); re-run to refresh it.
cm_nb
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 49775 3748 0 0 0 0 0 3 0 0
## 1 42110 2985 0 0 0 0 0 4 0 6
## 2 4899 293 0 0 0 0 0 0 0 0
## 3 2089 120 0 0 0 0 0 0 0 2
## 4 376 27 0 0 0 0 0 0 0 1
## 5 186 19 0 0 0 0 0 0 0 0
## 6 159 7 0 0 0 0 0 0 0 0
## 7 19 0 0 0 0 0 0 0 0 0
## 8 1 0 0 0 0 0 0 0 0 0
## 9 1 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.4939
## 95% CI : (0.4909, 0.4969)
## No Information Rate : 0.9325
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0035
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.49967 0.41464 NA NA NA NA
## Specificity 0.48011 0.57724 0.9514 0.9793 0.996218 0.998081
## Pos Pred Value 0.92992 0.06618 NA NA NA NA
## Neg Pred Value 0.06499 0.93173 NA NA NA NA
## Prevalence 0.93246 0.06739 0.0000 0.0000 0.000000 0.000000
## Detection Rate 0.46593 0.02794 0.0000 0.0000 0.000000 0.000000
## Detection Prevalence 0.50104 0.42221 0.0486 0.0207 0.003782 0.001919
## Balanced Accuracy 0.48989 0.49594 NA NA NA NA
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity NA 0.000e+00 NA 0.000e+00
## Specificity 0.998446 9.998e-01 1.000e+00 1.000e+00
## Pos Pred Value NA 0.000e+00 NA 0.000e+00
## Neg Pred Value NA 9.999e-01 NA 9.999e-01
## Prevalence 0.000000 6.552e-05 0.000e+00 8.425e-05
## Detection Rate 0.000000 0.000e+00 0.000e+00 0.000e+00
## Detection Prevalence 0.001554 1.779e-04 9.361e-06 9.361e-06
## Balanced Accuracy NA 4.999e-01 NA 5.000e-01
# Support Vector Machines
# Fit with e1071's default settings. The fitted model is stored as `svm_fit`
# rather than `svm` so that it does not shadow the e1071::svm() function.
svm_fit <- svm(poker_hand ~ ., data = xtr)
predict_svm <- predict(svm_fit, xte)
# confusionMatrix() takes the predicted classes as `data` and the true classes
# as `reference`; the reverse order leaves accuracy unchanged but derives the
# No Information Rate, its P-value and the per-class statistics from the
# predictions instead of the true labels.
cm_svm <- confusionMatrix(data = predict_svm, reference = xtt$poker_hand)
# NOTE: the output recorded below was captured with the two arguments swapped
# (rows labelled "Prediction" are the true hands, columns labelled "Reference"
# are the model's predictions); re-run to refresh it.
cm_svm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 53526 0 0 0 0 0 0 0 0 0
## 1 45105 0 0 0 0 0 0 0 0 0
## 2 5192 0 0 0 0 0 0 0 0 0
## 3 2211 0 0 0 0 0 0 0 0 0
## 4 404 0 0 0 0 0 0 0 0 0
## 5 205 0 0 0 0 0 0 0 0 0
## 6 166 0 0 0 0 0 0 0 0 0
## 7 19 0 0 0 0 0 0 0 0 0
## 8 1 0 0 0 0 0 0 0 0 0
## 9 1 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.501
## 95% CI : (0.498, 0.504)
## No Information Rate : 1
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.501 NA NA NA NA NA
## Specificity NA 0.5778 0.9514 0.9793 0.996218 0.998081
## Pos Pred Value NA NA NA NA NA NA
## Neg Pred Value NA NA NA NA NA NA
## Prevalence 1.000 0.0000 0.0000 0.0000 0.000000 0.000000
## Detection Rate 0.501 0.0000 0.0000 0.0000 0.000000 0.000000
## Detection Prevalence 0.501 0.4222 0.0486 0.0207 0.003782 0.001919
## Balanced Accuracy NA NA NA NA NA NA
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity NA NA NA NA
## Specificity 0.998446 0.9998221 1.000e+00 1.000e+00
## Pos Pred Value NA NA NA NA
## Neg Pred Value NA NA NA NA
## Prevalence 0.000000 0.0000000 0.000e+00 0.000e+00
## Detection Rate 0.000000 0.0000000 0.000e+00 0.000e+00
## Detection Prevalence 0.001554 0.0001779 9.361e-06 9.361e-06
## Balanced Accuracy NA NA NA NA
# Random Forest
# Fix the RNG state so the mtry search and the forest are reproducible.
set.seed(12)
# Search for the mtry with the lowest out-of-bag error. Predictors and response
# are selected by column name instead of the positional index 11.
bestTry <- tuneRF(xtr[, names(xtr) != 'poker_hand'], xtr$poker_hand, stepFactor = 1.5)
## mtry = 3 OOB error = 41.46%
## Searching left ...
## mtry = 2 OOB error = 44.16%
## -0.06520062 0.05
## Searching right ...
## mtry = 4 OOB error = 40.01%
## 0.03481867 0.05

# Use the tuned value instead of a hard-coded constant: tuneRF() returns a
# matrix with the mtry values tried in column 1 and their OOB errors in
# column 2. For the search recorded above this selects mtry = 4.
best_mtry <- unname(bestTry[which.min(bestTry[, 2]), 1])
rf <- randomForest(poker_hand ~ ., data = xtr, mtry = best_mtry, ntree = 25)
predict_rf <- predict(rf, xte)
# confusionMatrix() takes the predicted classes as `data` and the true classes
# as `reference`; the reverse order leaves accuracy unchanged but derives the
# No Information Rate, its P-value and the per-class statistics from the
# predictions instead of the true labels.
cm_rf <- confusionMatrix(data = predict_rf, reference = xtt$poker_hand)
# NOTE: the output recorded below was captured with the two arguments swapped
# (rows labelled "Prediction" are the true hands, columns labelled "Reference"
# are the model's predictions); re-run to refresh it.
cm_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 42477 11049 0 0 0 0 0 0 0 0
## 1 22128 22969 8 0 0 0 0 0 0 0
## 2 1172 3991 29 0 0 0 0 0 0 0
## 3 320 1885 2 4 0 0 0 0 0 0
## 4 321 82 0 0 1 0 0 0 0 0
## 5 171 34 0 0 0 0 0 0 0 0
## 6 6 158 1 0 0 0 1 0 0 0
## 7 0 19 0 0 0 0 0 0 0 0
## 8 1 0 0 0 0 0 0 0 0 0
## 9 1 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.6129
## 95% CI : (0.61, 0.6159)
## No Information Rate : 0.6234
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2681
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.6378 0.5716 0.7250000 1.000e+00 1.000e+00
## Specificity 0.7254 0.6678 0.9516528 9.793e-01 9.962e-01
## Pos Pred Value 0.7936 0.5092 0.0055855 1.809e-03 2.475e-03
## Neg Pred Value 0.5475 0.7211 0.9998918 1.000e+00 1.000e+00
## Prevalence 0.6234 0.3762 0.0003744 3.744e-05 9.361e-06
## Detection Rate 0.3976 0.2150 0.0002715 3.744e-05 9.361e-06
## Detection Prevalence 0.5010 0.4222 0.0486006 2.070e-02 3.782e-03
## Balanced Accuracy 0.6816 0.6197 0.8383264 9.897e-01 9.981e-01
## Class: 5 Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity NA 1.000e+00 NA NA NA
## Specificity 0.998081 9.985e-01 0.9998221 1.000e+00 1.000e+00
## Pos Pred Value NA 6.024e-03 NA NA NA
## Neg Pred Value NA 1.000e+00 NA NA NA
## Prevalence 0.000000 9.361e-06 0.0000000 0.000e+00 0.000e+00
## Detection Rate 0.000000 9.361e-06 0.0000000 0.000e+00 0.000e+00
## Detection Prevalence 0.001919 1.554e-03 0.0001779 9.361e-06 9.361e-06
## Balanced Accuracy NA 9.992e-01 NA NA NA
# Decision Tree (C5.0) -- highest accuracy among the recorded runs (0.7355).
# NOTE(review): the earlier remark that this is the only model with an
# acceptable P-Value [Acc > NIR] rested on output produced with the
# confusionMatrix() arguments swapped, which makes caret take the No
# Information Rate from the predictions rather than from the true labels.
# Re-check significance with predictions passed as `data` and the true labels
# as `reference` before drawing that conclusion.
# The fitted model is stored as `c50_fit` rather than `c` so that it does not
# shadow base::c().
c50_fit <- C5.0(poker_hand ~ ., data = xtr)
predict_c <- predict(c50_fit, xte)
cm_c <- confusionMatrix(data = predict_c, reference = xtt$poker_hand)
# NOTE: the output recorded below was captured with the two arguments swapped
# (rows labelled "Prediction" are the true hands, columns labelled "Reference"
# are the model's predictions); re-run to refresh it.
cm_c
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 50735 2732 13 14 16 8 0 0 0 8
## 1 17823 26838 295 135 3 5 0 0 0 6
## 2 401 4741 36 14 0 0 0 0 0 0
## 3 301 936 5 961 0 0 8 0 0 0
## 4 372 32 0 0 0 0 0 0 0 0
## 5 193 11 0 1 0 0 0 0 0 0
## 6 0 98 0 67 0 0 1 0 0 0
## 7 0 4 0 14 0 0 1 0 0 0
## 8 1 0 0 0 0 0 0 0 0 0
## 9 1 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.7355
## 95% CI : (0.7328, 0.7381)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.503
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.7266 0.7583 0.103152 0.796849 0.0000000
## Specificity 0.9246 0.7443 0.951578 0.988166 0.9962176
## Pos Pred Value 0.9479 0.5950 0.006934 0.434645 0.0000000
## Neg Pred Value 0.6418 0.8614 0.996920 0.997658 0.9998215
## Prevalence 0.6536 0.3313 0.003267 0.011289 0.0001779
## Detection Rate 0.4749 0.2512 0.000337 0.008996 0.0000000
## Detection Prevalence 0.5010 0.4222 0.048601 0.020696 0.0037817
## Balanced Accuracy 0.8256 0.7513 0.527365 0.892507 0.4981088
## Class: 5 Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.0000000 1.000e-01 NA NA 0.000e+00
## Specificity 0.9980808 9.985e-01 0.9998221 1.000e+00 1.000e+00
## Pos Pred Value 0.0000000 6.024e-03 NA NA 0.000e+00
## Neg Pred Value 0.9998781 9.999e-01 NA NA 9.999e-01
## Prevalence 0.0001217 9.361e-05 0.0000000 0.000e+00 1.310e-04
## Detection Rate 0.0000000 9.361e-06 0.0000000 0.000e+00 0.000e+00
## Detection Prevalence 0.0019189 1.554e-03 0.0001779 9.361e-06 9.361e-06
## Balanced Accuracy 0.4990404 5.492e-01 NA NA 5.000e-01