# Required Packages
# library() stops with an error as soon as a package is missing, whereas
# require() only warns and returns FALSE, which would defer the failure to
# the first call that needs the package.
library(e1071)         # naiveBayes(), svm()
library(caret)         # confusionMatrix()
library(randomForest)  # tuneRF(), randomForest()
library(C50)           # C5.0()
# Read Data
# Both files get the same 11 column names: five (suit, rank) pairs followed
# by the hand label. The names are assigned after reading, so whatever header
# read.csv() derives from each file is replaced.
poker_cols <- c(
  "suit_1", "rank_1", "suit_2", "rank_2", "suit_3", "rank_3",
  "suit_4", "rank_4", "suit_5", "rank_5", "poker_hand"
)
xtr <- read.csv("training.csv")
colnames(xtr) <- poker_cols
xte <- read.csv("testing.csv")
colnames(xte) <- poker_cols
# xtt is not created here: it is assigned from the cleaned test set further
# down, before its first use, so an earlier copy would be a dead store.
# Per-column summary of the raw test set (also reports the NA counts).
summary(xte)
##      suit_1        rank_1           suit_2          rank_2      
##  Min.   :1.0   Min.   : 1.000   Min.   :1.000   Min.   : 1.000  
##  1st Qu.:2.0   1st Qu.: 4.000   1st Qu.:2.000   1st Qu.: 4.000  
##  Median :2.0   Median : 7.000   Median :2.000   Median : 7.000  
##  Mean   :2.5   Mean   : 6.987   Mean   :2.501   Mean   : 7.019  
##  3rd Qu.:3.0   3rd Qu.:10.000   3rd Qu.:4.000   3rd Qu.:10.000  
##  Max.   :4.0   Max.   :13.000   Max.   :4.000   Max.   :13.000  
##                                                                 
##      suit_3          rank_3           suit_4          rank_4      
##  Min.   :1.000   Min.   : 1.000   Min.   :1.000   Min.   : 1.000  
##  1st Qu.:2.000   1st Qu.: 4.000   1st Qu.:1.000   1st Qu.: 4.000  
##  Median :3.000   Median : 7.000   Median :2.000   Median : 7.000  
##  Mean   :2.505   Mean   : 7.002   Mean   :2.498   Mean   : 6.992  
##  3rd Qu.:4.000   3rd Qu.:10.000   3rd Qu.:3.000   3rd Qu.:10.000  
##  Max.   :4.000   Max.   :13.000   Max.   :4.000   Max.   :13.000  
##                                                   NA's   :1       
##      suit_5          rank_5         poker_hand   
##  Min.   :1.000   Min.   : 1.000   Min.   :0.000  
##  1st Qu.:1.000   1st Qu.: 4.000   1st Qu.:0.000  
##  Median :3.000   Median : 7.000   Median :0.000  
##  Mean   :2.502   Mean   : 6.992   Mean   :0.617  
##  3rd Qu.:4.000   3rd Qu.:10.000   3rd Qu.:1.000  
##  Max.   :4.000   Max.   :13.000   Max.   :9.000  
##  NA's   :1       NA's   :1        NA's   :1
# Clean Data
# Count missing (TRUE) versus present (FALSE) cells in the test set.
table(is.na(xte))
## 
##   FALSE    TRUE 
## 1175137       4
# Drop every row that has a missing value in ANY column. Filtering on rank_4
# alone would leave NAs behind whenever another column is missing in a row
# whose rank_4 is present.
xte <- xte[complete.cases(xte), ]
# Confirm that no missing cells remain.
table(is.na(xte))
## 
##   FALSE 
## 1175130
# Copy of the cleaned test set; its poker_hand column supplies the labels
# passed to confusionMatrix() for every model below.
xtt <- xte
# Format Data
# Convert the suit_*, rank_* and poker_hand columns to factors in the
# training set (xtr), the test set (xte) and the test-set copy (xtt).
factor_cols <- c(paste0("suit_", 1:5), paste0("rank_", 1:5), "poker_hand")
xtr[factor_cols] <- lapply(xtr[factor_cols], as.factor)
xte[factor_cols] <- lapply(xte[factor_cols], as.factor)
xtt[factor_cols] <- lapply(xtt[factor_cols], as.factor)
# NaiveBayes
# Fit on the training set with poker_hand as the response and every other
# column as a predictor, then classify the cleaned test set.
nb <- naiveBayes(poker_hand ~ ., data = xtr)
predict_bn <- predict(nb, xte)
# confusionMatrix() takes the predicted classes first (`data`) and the true
# classes second (`reference`).
# NOTE: the recorded output below was produced with the two swapped, so its
# rows are the true labels, its columns the predictions, and its No
# Information Rate and per-class statistics are computed against the
# predictions. Re-run this section to refresh it.
cm_nb <- confusionMatrix(data = predict_bn, reference = xtt$poker_hand)
cm_nb
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1     2     3     4     5     6     7     8     9
##          0 49775  3748     0     0     0     0     0     3     0     0
##          1 42110  2985     0     0     0     0     0     4     0     6
##          2  4899   293     0     0     0     0     0     0     0     0
##          3  2089   120     0     0     0     0     0     0     0     2
##          4   376    27     0     0     0     0     0     0     0     1
##          5   186    19     0     0     0     0     0     0     0     0
##          6   159     7     0     0     0     0     0     0     0     0
##          7    19     0     0     0     0     0     0     0     0     0
##          8     1     0     0     0     0     0     0     0     0     0
##          9     1     0     0     0     0     0     0     0     0     0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4939          
##                  95% CI : (0.4909, 0.4969)
##     No Information Rate : 0.9325          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.0035         
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.49967  0.41464       NA       NA       NA       NA
## Specificity           0.48011  0.57724   0.9514   0.9793 0.996218 0.998081
## Pos Pred Value        0.92992  0.06618       NA       NA       NA       NA
## Neg Pred Value        0.06499  0.93173       NA       NA       NA       NA
## Prevalence            0.93246  0.06739   0.0000   0.0000 0.000000 0.000000
## Detection Rate        0.46593  0.02794   0.0000   0.0000 0.000000 0.000000
## Detection Prevalence  0.50104  0.42221   0.0486   0.0207 0.003782 0.001919
## Balanced Accuracy     0.48989  0.49594       NA       NA       NA       NA
##                      Class: 6  Class: 7  Class: 8  Class: 9
## Sensitivity                NA 0.000e+00        NA 0.000e+00
## Specificity          0.998446 9.998e-01 1.000e+00 1.000e+00
## Pos Pred Value             NA 0.000e+00        NA 0.000e+00
## Neg Pred Value             NA 9.999e-01        NA 9.999e-01
## Prevalence           0.000000 6.552e-05 0.000e+00 8.425e-05
## Detection Rate       0.000000 0.000e+00 0.000e+00 0.000e+00
## Detection Prevalence 0.001554 1.779e-04 9.361e-06 9.361e-06
## Balanced Accuracy          NA 4.999e-01        NA 5.000e-01
# Support Vector Machines
# Fit an SVM with default settings on the training set (poker_hand as the
# response, every other column as a predictor) and classify the test set.
# NOTE(review): the fitted object is named `svm`, shadowing the e1071
# function of the same name as a variable; the name is kept so any later
# code that refers to it keeps working.
svm <- svm(poker_hand ~ ., data = xtr)
predict_svm <- predict(svm, xte)
# confusionMatrix() takes the predicted classes first (`data`) and the true
# classes second (`reference`).
# NOTE: the recorded output below was produced with the two swapped, so its
# rows are the true labels, its columns the predictions, and its No
# Information Rate and per-class statistics are computed against the
# predictions. Re-run this section to refresh it.
cm_svm <- confusionMatrix(data = predict_svm, reference = xtt$poker_hand)
cm_svm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1     2     3     4     5     6     7     8     9
##          0 53526     0     0     0     0     0     0     0     0     0
##          1 45105     0     0     0     0     0     0     0     0     0
##          2  5192     0     0     0     0     0     0     0     0     0
##          3  2211     0     0     0     0     0     0     0     0     0
##          4   404     0     0     0     0     0     0     0     0     0
##          5   205     0     0     0     0     0     0     0     0     0
##          6   166     0     0     0     0     0     0     0     0     0
##          7    19     0     0     0     0     0     0     0     0     0
##          8     1     0     0     0     0     0     0     0     0     0
##          9     1     0     0     0     0     0     0     0     0     0
## 
## Overall Statistics
##                                         
##                Accuracy : 0.501         
##                  95% CI : (0.498, 0.504)
##     No Information Rate : 1             
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0             
##  Mcnemar's Test P-Value : NA            
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity             0.501       NA       NA       NA       NA       NA
## Specificity                NA   0.5778   0.9514   0.9793 0.996218 0.998081
## Pos Pred Value             NA       NA       NA       NA       NA       NA
## Neg Pred Value             NA       NA       NA       NA       NA       NA
## Prevalence              1.000   0.0000   0.0000   0.0000 0.000000 0.000000
## Detection Rate          0.501   0.0000   0.0000   0.0000 0.000000 0.000000
## Detection Prevalence    0.501   0.4222   0.0486   0.0207 0.003782 0.001919
## Balanced Accuracy          NA       NA       NA       NA       NA       NA
##                      Class: 6  Class: 7  Class: 8  Class: 9
## Sensitivity                NA        NA        NA        NA
## Specificity          0.998446 0.9998221 1.000e+00 1.000e+00
## Pos Pred Value             NA        NA        NA        NA
## Neg Pred Value             NA        NA        NA        NA
## Prevalence           0.000000 0.0000000 0.000e+00 0.000e+00
## Detection Rate       0.000000 0.0000000 0.000e+00 0.000e+00
## Detection Prevalence 0.001554 0.0001779 9.361e-06 9.361e-06
## Balanced Accuracy          NA        NA        NA        NA
# Random Forest
# Fix the RNG seed, then search for an mtry value with tuneRF(): predictors
# are every column except poker_hand, the response is poker_hand, and each
# search step scales mtry by a factor of 1.5.
set.seed(12)
rf_predictors <- xtr[, names(xtr) != "poker_hand"]
bestTry <- tuneRF(x = rf_predictors, y = xtr$poker_hand, stepFactor = 1.5)
## mtry = 3  OOB error = 41.46% 
## Searching left ...
## mtry = 2     OOB error = 44.16% 
## -0.06520062 0.05 
## Searching right ...
## mtry = 4     OOB error = 40.01% 
## 0.03481867 0.05

# Use the mtry value with the lowest OOB error found by tuneRF() instead of a
# hard-coded constant. bestTry is a matrix whose first column holds the mtry
# values tried and whose second column holds the matching OOB errors; for the
# recorded search above this selects mtry = 4.
best_mtry <- unname(bestTry[which.min(bestTry[, 2]), 1])
rf <- randomForest(poker_hand ~ ., data = xtr, mtry = best_mtry, ntree = 25)
predict_rf <- predict(rf, xte)
# confusionMatrix() takes the predicted classes first (`data`) and the true
# classes second (`reference`).
# NOTE: the recorded output below was produced with the two swapped, so its
# rows are the true labels, its columns the predictions, and its No
# Information Rate and per-class statistics are computed against the
# predictions. Re-run this section to refresh it.
cm_rf <- confusionMatrix(data = predict_rf, reference = xtt$poker_hand)
cm_rf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1     2     3     4     5     6     7     8     9
##          0 42477 11049     0     0     0     0     0     0     0     0
##          1 22128 22969     8     0     0     0     0     0     0     0
##          2  1172  3991    29     0     0     0     0     0     0     0
##          3   320  1885     2     4     0     0     0     0     0     0
##          4   321    82     0     0     1     0     0     0     0     0
##          5   171    34     0     0     0     0     0     0     0     0
##          6     6   158     1     0     0     0     1     0     0     0
##          7     0    19     0     0     0     0     0     0     0     0
##          8     1     0     0     0     0     0     0     0     0     0
##          9     1     0     0     0     0     0     0     0     0     0
## 
## Overall Statistics
##                                         
##                Accuracy : 0.6129        
##                  95% CI : (0.61, 0.6159)
##     No Information Rate : 0.6234        
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.2681        
##  Mcnemar's Test P-Value : NA            
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1  Class: 2  Class: 3  Class: 4
## Sensitivity            0.6378   0.5716 0.7250000 1.000e+00 1.000e+00
## Specificity            0.7254   0.6678 0.9516528 9.793e-01 9.962e-01
## Pos Pred Value         0.7936   0.5092 0.0055855 1.809e-03 2.475e-03
## Neg Pred Value         0.5475   0.7211 0.9998918 1.000e+00 1.000e+00
## Prevalence             0.6234   0.3762 0.0003744 3.744e-05 9.361e-06
## Detection Rate         0.3976   0.2150 0.0002715 3.744e-05 9.361e-06
## Detection Prevalence   0.5010   0.4222 0.0486006 2.070e-02 3.782e-03
## Balanced Accuracy      0.6816   0.6197 0.8383264 9.897e-01 9.981e-01
##                      Class: 5  Class: 6  Class: 7  Class: 8  Class: 9
## Sensitivity                NA 1.000e+00        NA        NA        NA
## Specificity          0.998081 9.985e-01 0.9998221 1.000e+00 1.000e+00
## Pos Pred Value             NA 6.024e-03        NA        NA        NA
## Neg Pred Value             NA 1.000e+00        NA        NA        NA
## Prevalence           0.000000 9.361e-06 0.0000000 0.000e+00 0.000e+00
## Detection Rate       0.000000 9.361e-06 0.0000000 0.000e+00 0.000e+00
## Detection Prevalence 0.001919 1.554e-03 0.0001779 9.361e-06 9.361e-06
## Balanced Accuracy          NA 9.992e-01        NA        NA        NA
# Decision Tree (C5.0): highest accuracy of the four models in the recorded
# output (0.7355).
# NOTE: the earlier remark that this is the only model with an acceptable
# p-value was read from output generated with the confusionMatrix()
# arguments swapped, which computes the No Information Rate from the
# predictions rather than the true labels. Re-run all four models before
# relying on that comparison.
# NOTE(review): the fitted object is named `c`, shadowing base::c as a
# variable; the name is kept so any later code that refers to it keeps
# working.
c <- C5.0(poker_hand ~ ., data = xtr)
predict_c <- predict(c, xte)
# confusionMatrix() takes the predicted classes first (`data`) and the true
# classes second (`reference`); the recorded output below has rows = true
# labels and columns = predictions because it predates this fix.
cm_c <- confusionMatrix(data = predict_c, reference = xtt$poker_hand)
cm_c
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1     2     3     4     5     6     7     8     9
##          0 50735  2732    13    14    16     8     0     0     0     8
##          1 17823 26838   295   135     3     5     0     0     0     6
##          2   401  4741    36    14     0     0     0     0     0     0
##          3   301   936     5   961     0     0     8     0     0     0
##          4   372    32     0     0     0     0     0     0     0     0
##          5   193    11     0     1     0     0     0     0     0     0
##          6     0    98     0    67     0     0     1     0     0     0
##          7     0     4     0    14     0     0     1     0     0     0
##          8     1     0     0     0     0     0     0     0     0     0
##          9     1     0     0     0     0     0     0     0     0     0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7355          
##                  95% CI : (0.7328, 0.7381)
##     No Information Rate : 0.6536          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.503           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3  Class: 4
## Sensitivity            0.7266   0.7583 0.103152 0.796849 0.0000000
## Specificity            0.9246   0.7443 0.951578 0.988166 0.9962176
## Pos Pred Value         0.9479   0.5950 0.006934 0.434645 0.0000000
## Neg Pred Value         0.6418   0.8614 0.996920 0.997658 0.9998215
## Prevalence             0.6536   0.3313 0.003267 0.011289 0.0001779
## Detection Rate         0.4749   0.2512 0.000337 0.008996 0.0000000
## Detection Prevalence   0.5010   0.4222 0.048601 0.020696 0.0037817
## Balanced Accuracy      0.8256   0.7513 0.527365 0.892507 0.4981088
##                       Class: 5  Class: 6  Class: 7  Class: 8  Class: 9
## Sensitivity          0.0000000 1.000e-01        NA        NA 0.000e+00
## Specificity          0.9980808 9.985e-01 0.9998221 1.000e+00 1.000e+00
## Pos Pred Value       0.0000000 6.024e-03        NA        NA 0.000e+00
## Neg Pred Value       0.9998781 9.999e-01        NA        NA 9.999e-01
## Prevalence           0.0001217 9.361e-05 0.0000000 0.000e+00 1.310e-04
## Detection Rate       0.0000000 9.361e-06 0.0000000 0.000e+00 0.000e+00
## Detection Prevalence 0.0019189 1.554e-03 0.0001779 9.361e-06 9.361e-06
## Balanced Accuracy    0.4990404 5.492e-01        NA        NA 5.000e-01