# Directory that holds training.csv and testing.csv. Paths are built from it
# with file.path() instead of calling setwd(), so the session's working
# directory is left untouched.
data_dir <- "~/Downloads/Data Mining Test2"

# Load the modelling packages. library() stops immediately when a package is
# missing, whereas require() only returns FALSE and lets the script fail later.
library(readr)
library(caret)
library(mlbench)
library(e1071)
library(randomForest)
library(C50)
library(pROC)


# Read the train and test sets.
# training.csv has no header row: with the default header = TRUE the first
# record was consumed as column names (X1, X10, X1.1, X11, ...), silently
# dropping one hand. header = FALSE keeps that record as data.
# NOTE(review): testing.csv is assumed to share the same header-less layout,
# since both files get the same column names assigned afterwards - confirm.
train <- read.csv(file.path(data_dir, "training.csv"), header = FALSE)
test <- read.csv(file.path(data_dir, "testing.csv"), header = FALSE)
str(train)
## 'data.frame':    25009 obs. of  11 variables:
##  $ X1  : int  2 3 4 4 1 1 2 3 4 1 ...
##  $ X10 : int  11 12 10 1 2 9 1 5 1 1 ...
##  $ X1.1: int  2 3 4 4 1 1 2 3 4 2 ...
##  $ X11 : int  13 11 11 13 4 12 2 6 4 1 ...
##  $ X1.2: int  2 3 4 4 1 1 2 3 4 3 ...
##  $ X13 : int  10 13 1 12 5 10 3 9 2 9 ...
##  $ X1.3: int  2 3 4 4 1 1 2 3 4 1 ...
##  $ X12 : int  12 10 13 11 3 11 4 7 3 5 ...
##  $ X1.4: int  2 3 4 4 1 1 2 3 4 2 ...
##  $ X1.5: int  1 1 12 10 6 13 5 8 5 3 ...
##  $ X9  : int  9 9 9 9 8 8 8 8 8 1 ...
# Name the eleven columns identically in both data sets. The S* columns take
# 4 distinct values and the C* columns 13 (presumably suit and rank of each of
# the five cards - confirm); Poker_Hand is the class label to predict.
hand_cols <- c("S1", "C1", "S2", "C2", "S3", "C3", "S4", "C4", "S5", "C5",
               "Poker_Hand")
colnames(train) <- hand_cols
colnames(test) <- hand_cols

# Change data type from integer to factor: every column is a categorical code.
train[hand_cols] <- lapply(train[hand_cols], as.factor)

# Build each test factor with exactly the levels of its training counterpart.
# Converting the two sets independently can yield different level sets, which
# breaks predict() for some models and confusionMatrix() afterwards.
# A test value never seen in training becomes NA rather than a new level.
test[hand_cols] <- Map(
  function(values, trained) factor(values, levels = levels(trained)),
  test[hand_cols],
  train[hand_cols]
)

str(train)
## 'data.frame':    25009 obs. of  11 variables:
##  $ S1        : Factor w/ 4 levels "1","2","3","4": 2 3 4 4 1 1 2 3 4 1 ...
##  $ C1        : Factor w/ 13 levels "1","2","3","4",..: 11 12 10 1 2 9 1 5 1 1 ...
##  $ S2        : Factor w/ 4 levels "1","2","3","4": 2 3 4 4 1 1 2 3 4 2 ...
##  $ C2        : Factor w/ 13 levels "1","2","3","4",..: 13 11 11 13 4 12 2 6 4 1 ...
##  $ S3        : Factor w/ 4 levels "1","2","3","4": 2 3 4 4 1 1 2 3 4 3 ...
##  $ C3        : Factor w/ 13 levels "1","2","3","4",..: 10 13 1 12 5 10 3 9 2 9 ...
##  $ S4        : Factor w/ 4 levels "1","2","3","4": 2 3 4 4 1 1 2 3 4 1 ...
##  $ C4        : Factor w/ 13 levels "1","2","3","4",..: 12 10 13 11 3 11 4 7 3 5 ...
##  $ S5        : Factor w/ 4 levels "1","2","3","4": 2 3 4 4 1 1 2 3 4 2 ...
##  $ C5        : Factor w/ 13 levels "1","2","3","4",..: 1 1 12 10 6 13 5 8 5 3 ...
##  $ Poker_Hand: Factor w/ 10 levels "0","1","2","3",..: 10 10 10 10 9 9 9 9 9 2 ...
# Preview the first rows of the renamed, factor-typed training data.
head(train)
##   S1 C1 S2 C2 S3 C3 S4 C4 S5 C5 Poker_Hand
## 1  2 11  2 13  2 10  2 12  2  1          9
## 2  3 12  3 11  3 13  3 10  3  1          9
## 3  4 10  4 11  4  1  4 13  4 12          9
## 4  4  1  4 13  4 12  4 11  4 10          9
## 5  1  2  1  4  1  5  1  3  1  6          8
## 6  1  9  1 12  1 10  1 11  1 13          8
# Count missing values per column of the test set.
colSums(is.na(test))
##         S1         C1         S2         C2         S3         C3 
##          0          0          0          0          0          0 
##         S4         C4         S5         C5 Poker_Hand 
##          0          0          0          0          0
# Count missing values per column of the training set.
colSums(is.na(train))
##         S1         C1         S2         C2         S3         C3 
##          0          0          0          0          0          0 
##         S4         C4         S5         C5 Poker_Hand 
##          0          0          0          0          0
# Naive Bayes
# Fit on all ten card columns, then classify the test set.
model1 <- naiveBayes(Poker_Hand ~ ., data = train)
summary(model1)
##         Length Class  Mode     
## apriori 10     table  numeric  
## tables  10     -none- list     
## levels  10     -none- character
## call     4     -none- call
prediction1 <- predict(model1, test)
# caret's signature is confusionMatrix(data = <predictions>, reference = <truth>).
# The original call passed them the other way round, so the per-class
# statistics, prevalence and No Information Rate were computed against the
# predictions. Overall accuracy is unaffected by the swap.
# The output recorded below predates this fix: its rows are the true classes
# and its columns the predictions.
confusionMatrix(data = prediction1, reference = test[, 11]) # Accuracy 49.39 %
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1     2     3     4     5     6     7     8     9
##          0 49775  3748     0     0     0     0     0     3     0     0
##          1 42110  2985     0     0     0     0     0     4     0     6
##          2  4899   293     0     0     0     0     0     0     0     0
##          3  2089   120     0     0     0     0     0     0     0     2
##          4   376    27     0     0     0     0     0     0     0     1
##          5   186    19     0     0     0     0     0     0     0     0
##          6   159     7     0     0     0     0     0     0     0     0
##          7    19     0     0     0     0     0     0     0     0     0
##          8     1     0     0     0     0     0     0     0     0     0
##          9     1     0     0     0     0     0     0     0     0     0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4939          
##                  95% CI : (0.4909, 0.4969)
##     No Information Rate : 0.9325          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.0035         
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.49967  0.41464       NA       NA       NA       NA
## Specificity           0.48011  0.57724   0.9514   0.9793 0.996218 0.998081
## Pos Pred Value        0.92992  0.06618       NA       NA       NA       NA
## Neg Pred Value        0.06499  0.93173       NA       NA       NA       NA
## Prevalence            0.93246  0.06739   0.0000   0.0000 0.000000 0.000000
## Detection Rate        0.46593  0.02794   0.0000   0.0000 0.000000 0.000000
## Detection Prevalence  0.50104  0.42221   0.0486   0.0207 0.003782 0.001919
## Balanced Accuracy     0.48989  0.49594       NA       NA       NA       NA
##                      Class: 6  Class: 7  Class: 8  Class: 9
## Sensitivity                NA 0.000e+00        NA 0.000e+00
## Specificity          0.998446 9.998e-01 1.000e+00 1.000e+00
## Pos Pred Value             NA 0.000e+00        NA 0.000e+00
## Neg Pred Value             NA 9.999e-01        NA 9.999e-01
## Prevalence           0.000000 6.552e-05 0.000e+00 8.425e-05
## Detection Rate       0.000000 0.000e+00 0.000e+00 0.000e+00
## Detection Prevalence 0.001554 1.779e-04 9.361e-06 9.361e-06
## Balanced Accuracy          NA 4.999e-01        NA 5.000e-01
# Support vector machine
# Fit e1071's svm() with its default settings, then classify the test set.
set.seed(123)
model2 <- svm(Poker_Hand ~ ., data = train)
prediction2 <- predict(model2, test)
# caret's signature is confusionMatrix(data = <predictions>, reference = <truth>).
# The original call passed them the other way round, so the per-class
# statistics, prevalence and No Information Rate were computed against the
# predictions. Overall accuracy is unaffected by the swap.
# The output recorded below predates this fix: its rows are the true classes
# and its columns the predictions.
confusionMatrix(data = prediction2, reference = test[, 11]) # Accuracy 50.1 %
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1     2     3     4     5     6     7     8     9
##          0 53526     0     0     0     0     0     0     0     0     0
##          1 45105     0     0     0     0     0     0     0     0     0
##          2  5192     0     0     0     0     0     0     0     0     0
##          3  2211     0     0     0     0     0     0     0     0     0
##          4   404     0     0     0     0     0     0     0     0     0
##          5   205     0     0     0     0     0     0     0     0     0
##          6   166     0     0     0     0     0     0     0     0     0
##          7    19     0     0     0     0     0     0     0     0     0
##          8     1     0     0     0     0     0     0     0     0     0
##          9     1     0     0     0     0     0     0     0     0     0
## 
## Overall Statistics
##                                         
##                Accuracy : 0.501         
##                  95% CI : (0.498, 0.504)
##     No Information Rate : 1             
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0             
##  Mcnemar's Test P-Value : NA            
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity             0.501       NA       NA       NA       NA       NA
## Specificity                NA   0.5778   0.9514   0.9793 0.996218 0.998081
## Pos Pred Value             NA       NA       NA       NA       NA       NA
## Neg Pred Value             NA       NA       NA       NA       NA       NA
## Prevalence              1.000   0.0000   0.0000   0.0000 0.000000 0.000000
## Detection Rate          0.501   0.0000   0.0000   0.0000 0.000000 0.000000
## Detection Prevalence    0.501   0.4222   0.0486   0.0207 0.003782 0.001919
## Balanced Accuracy          NA       NA       NA       NA       NA       NA
##                      Class: 6  Class: 7  Class: 8  Class: 9
## Sensitivity                NA        NA        NA        NA
## Specificity          0.998446 0.9998221 1.000e+00 1.000e+00
## Pos Pred Value             NA        NA        NA        NA
## Neg Pred Value             NA        NA        NA        NA
## Prevalence           0.000000 0.0000000 0.000e+00 0.000e+00
## Detection Rate       0.000000 0.0000000 0.000e+00 0.000e+00
## Detection Prevalence 0.001554 0.0001779 9.361e-06 9.361e-06
## Balanced Accuracy          NA        NA        NA        NA
# C5.0 algorithm
# Fit a C5.0 model on all card columns, then classify the test set.
model3 <- C5.0(Poker_Hand ~ ., data = train)
prediction3 <- predict(model3, test)
# caret's signature is confusionMatrix(data = <predictions>, reference = <truth>).
# The original call passed them the other way round, so the per-class
# statistics, prevalence and No Information Rate were computed against the
# predictions. Overall accuracy is unaffected by the swap.
# The output recorded below predates this fix: its rows are the true classes
# and its columns the predictions.
confusionMatrix(data = prediction3, reference = test[, 11]) # Accuracy 73.55 %
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1     2     3     4     5     6     7     8     9
##          0 50735  2732    13    14    16     8     0     0     0     8
##          1 17823 26838   295   135     3     5     0     0     0     6
##          2   401  4741    36    14     0     0     0     0     0     0
##          3   301   936     5   961     0     0     8     0     0     0
##          4   372    32     0     0     0     0     0     0     0     0
##          5   193    11     0     1     0     0     0     0     0     0
##          6     0    98     0    67     0     0     1     0     0     0
##          7     0     4     0    14     0     0     1     0     0     0
##          8     1     0     0     0     0     0     0     0     0     0
##          9     1     0     0     0     0     0     0     0     0     0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7355          
##                  95% CI : (0.7328, 0.7381)
##     No Information Rate : 0.6536          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.503           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3  Class: 4
## Sensitivity            0.7266   0.7583 0.103152 0.796849 0.0000000
## Specificity            0.9246   0.7443 0.951578 0.988166 0.9962176
## Pos Pred Value         0.9479   0.5950 0.006934 0.434645 0.0000000
## Neg Pred Value         0.6418   0.8614 0.996920 0.997658 0.9998215
## Prevalence             0.6536   0.3313 0.003267 0.011289 0.0001779
## Detection Rate         0.4749   0.2512 0.000337 0.008996 0.0000000
## Detection Prevalence   0.5010   0.4222 0.048601 0.020696 0.0037817
## Balanced Accuracy      0.8256   0.7513 0.527365 0.892507 0.4981088
##                       Class: 5  Class: 6  Class: 7  Class: 8  Class: 9
## Sensitivity          0.0000000 1.000e-01        NA        NA 0.000e+00
## Specificity          0.9980808 9.985e-01 0.9998221 1.000e+00 1.000e+00
## Pos Pred Value       0.0000000 6.024e-03        NA        NA 0.000e+00
## Neg Pred Value       0.9998781 9.999e-01        NA        NA 9.999e-01
## Prevalence           0.0001217 9.361e-05 0.0000000 0.000e+00 1.310e-04
## Detection Rate       0.0000000 9.361e-06 0.0000000 0.000e+00 0.000e+00
## Detection Prevalence 0.0019189 1.554e-03 0.0001779 9.361e-06 9.361e-06
## Balanced Accuracy    0.4990404 5.492e-01        NA        NA 5.000e-01
# Random forest
# Search for a good mtry first. In the run recorded below, tuneRF found the
# lowest OOB error at mtry = 10. The search result is kept so the forest can
# use it; print() reproduces the table the bare call used to auto-print.
set.seed(121)
mtry_search <- tuneRF(train[, -11], train[, 11], stepFactor = 2)
print(mtry_search)
## mtry = 3  OOB error = 41.58% 
## Searching left ...
## mtry = 2     OOB error = 43.73% 
## -0.05173574 0.05 
## Searching right ...
## mtry = 6     OOB error = 38.51% 
## 0.07385326 0.05 
## mtry = 10    OOB error = 38.02% 
## 0.01266743 0.05

##        mtry  OOBError
## 2.OOB     2 0.4373226
## 3.OOB     3 0.4158103
## 6.OOB     6 0.3851014
## 10.OOB   10 0.3802231
# Pick the mtry with the lowest OOB error from the search table.
best_mtry <- mtry_search[which.min(mtry_search[, "OOBError"]), "mtry"]

# Fit the forest with the tuned mtry. The original call omitted mtry, so the
# tuning result above was never applied.
set.seed(111)
model4 <- randomForest(Poker_Hand ~ ., data = train, mtry = best_mtry)
prediction4 <- predict(model4, test)
# caret's signature is confusionMatrix(data = <predictions>, reference = <truth>).
# The original call passed them the other way round, which distorts the
# per-class statistics, prevalence and No Information Rate.
# The output recorded below (accuracy 68.27 %) predates both fixes: it came
# from the default-mtry forest, with true classes as rows and predictions as
# columns.
confusionMatrix(data = prediction4, reference = test[, 11])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1     2     3     4     5     6     7     8     9
##          0 51188  2338     0     0     0     0     0     0     0     0
##          1 23373 21732     0     0     0     0     0     0     0     0
##          2   282  4903     7     0     0     0     0     0     0     0
##          3     7  2203     0     1     0     0     0     0     0     0
##          4   378    25     0     0     1     0     0     0     0     0
##          5   204     1     0     0     0     0     0     0     0     0
##          6     0   166     0     0     0     0     0     0     0     0
##          7     0    19     0     0     0     0     0     0     0     0
##          8     0     1     0     0     0     0     0     0     0     0
##          9     1     0     0     0     0     0     0     0     0     0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6827          
##                  95% CI : (0.6799, 0.6855)
##     No Information Rate : 0.7061          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3923          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1  Class: 2  Class: 3  Class: 4
## Sensitivity            0.6786   0.6924 1.000e+00 1.000e+00 1.000e+00
## Specificity            0.9255   0.6902 9.515e-01 9.793e-01 9.962e-01
## Pos Pred Value         0.9563   0.4818 1.348e-03 4.523e-04 2.475e-03
## Neg Pred Value         0.5452   0.8436 1.000e+00 1.000e+00 1.000e+00
## Prevalence             0.7061   0.2938 6.552e-05 9.361e-06 9.361e-06
## Detection Rate         0.4792   0.2034 6.552e-05 9.361e-06 9.361e-06
## Detection Prevalence   0.5010   0.4222 4.860e-02 2.070e-02 3.782e-03
## Balanced Accuracy      0.8021   0.6913 9.757e-01 9.897e-01 9.981e-01
##                      Class: 5 Class: 6  Class: 7  Class: 8  Class: 9
## Sensitivity                NA       NA        NA        NA        NA
## Specificity          0.998081 0.998446 0.9998221 1.000e+00 1.000e+00
## Pos Pred Value             NA       NA        NA        NA        NA
## Neg Pred Value             NA       NA        NA        NA        NA
## Prevalence           0.000000 0.000000 0.0000000 0.000e+00 0.000e+00
## Detection Rate       0.000000 0.000000 0.0000000 0.000e+00 0.000e+00
## Detection Prevalence 0.001919 0.001554 0.0001779 9.361e-06 9.361e-06
## Balanced Accuracy          NA       NA        NA        NA        NA
# Even though the decision tree (C5.0) gives better accuracy than the other methods, it overfits.
# Random forest avoids overfitting, so it is the best method here, with an accuracy of 68.27 %.