# Project setup ----
# NOTE(review): the working directory is hard-coded to one machine; adjust it
# (or run the script from the data folder) before executing elsewhere.
setwd("~/Downloads/Data Mining Test2")

# Attach packages with library() so that a missing package stops the script
# here, instead of require() returning FALSE and the failure surfacing later
# as "could not find function".
library(readr)
library(caret)
library(mlbench)
library(e1071)
library(randomForest)
library(C50)
library(pROC)

# Load the train and test datasets ----
# training.csv has no header row: with the default header = TRUE the first
# hand (1,10,1,11,1,13,1,12,1,1,9) was consumed as the column names
# (X1, X10, X1.1, ...) and silently dropped from the data. header = FALSE
# keeps that row; the columns are renamed right after loading anyway.
# NOTE(review): testing.csv is assumed to share the same headerless layout --
# confirm before relying on the test metrics.
train <- read.csv("training.csv", header = FALSE)
test <- read.csv("testing.csv", header = FALSE)

# The str() output recorded below was captured before this fix (25009 rows,
# X-prefixed column names); re-run to refresh it.
str(train)
## 'data.frame': 25009 obs. of 11 variables:
## $ X1 : int 2 3 4 4 1 1 2 3 4 1 ...
## $ X10 : int 11 12 10 1 2 9 1 5 1 1 ...
## $ X1.1: int 2 3 4 4 1 1 2 3 4 2 ...
## $ X11 : int 13 11 11 13 4 12 2 6 4 1 ...
## $ X1.2: int 2 3 4 4 1 1 2 3 4 3 ...
## $ X13 : int 10 13 1 12 5 10 3 9 2 9 ...
## $ X1.3: int 2 3 4 4 1 1 2 3 4 1 ...
## $ X12 : int 12 10 13 11 3 11 4 7 3 5 ...
## $ X1.4: int 2 3 4 4 1 1 2 3 4 2 ...
## $ X1.5: int 1 1 12 10 6 13 5 8 5 3 ...
## $ X9 : int 9 9 9 9 8 8 8 8 8 1 ...
# Give both data sets the same column names: five S*/C* pairs (one pair per
# card) followed by the Poker_Hand class label.
names(train) <- names(test) <- c(
  "S1", "C1", "S2", "C2", "S3", "C3", "S4", "C4", "S5", "C5", "Poker_Hand"
)
# Change the data type of every column (the ten S*/C* card columns and the
# Poker_Hand label) from integer to factor, in both data sets, so the
# classifiers treat the values as categories rather than numbers.
train[] <- lapply(train, as.factor)
test[] <- lapply(test, as.factor)
# Re-inspect the structure to confirm that every column is now a factor.
str(train)
## 'data.frame': 25009 obs. of 11 variables:
## $ S1 : Factor w/ 4 levels "1","2","3","4": 2 3 4 4 1 1 2 3 4 1 ...
## $ C1 : Factor w/ 13 levels "1","2","3","4",..: 11 12 10 1 2 9 1 5 1 1 ...
## $ S2 : Factor w/ 4 levels "1","2","3","4": 2 3 4 4 1 1 2 3 4 2 ...
## $ C2 : Factor w/ 13 levels "1","2","3","4",..: 13 11 11 13 4 12 2 6 4 1 ...
## $ S3 : Factor w/ 4 levels "1","2","3","4": 2 3 4 4 1 1 2 3 4 3 ...
## $ C3 : Factor w/ 13 levels "1","2","3","4",..: 10 13 1 12 5 10 3 9 2 9 ...
## $ S4 : Factor w/ 4 levels "1","2","3","4": 2 3 4 4 1 1 2 3 4 1 ...
## $ C4 : Factor w/ 13 levels "1","2","3","4",..: 12 10 13 11 3 11 4 7 3 5 ...
## $ S5 : Factor w/ 4 levels "1","2","3","4": 2 3 4 4 1 1 2 3 4 2 ...
## $ C5 : Factor w/ 13 levels "1","2","3","4",..: 1 1 12 10 6 13 5 8 5 3 ...
## $ Poker_Hand: Factor w/ 10 levels "0","1","2","3",..: 10 10 10 10 9 9 9 9 9 2 ...
# Preview the first rows of the training data.
head(train)
## S1 C1 S2 C2 S3 C3 S4 C4 S5 C5 Poker_Hand
## 1 2 11 2 13 2 10 2 12 2 1 9
## 2 3 12 3 11 3 13 3 10 3 1 9
## 3 4 10 4 11 4 1 4 13 4 12 9
## 4 4 1 4 13 4 12 4 11 4 10 9
## 5 1 2 1 4 1 5 1 3 1 6 8
## 6 1 9 1 12 1 10 1 11 1 13 8
# Count the missing values per column of the test set
# (the recorded output shows zero for every column).
colSums(is.na(test))
## S1 C1 S2 C2 S3 C3
## 0 0 0 0 0 0
## S4 C4 S5 C5 Poker_Hand
## 0 0 0 0 0
# Same missing-value check for the training set.
colSums(is.na(train))
## S1 C1 S2 C2 S3 C3
## 0 0 0 0 0 0
## S4 C4 S5 C5 Poker_Hand
## 0 0 0 0 0
# Naive Bayes ----
# Fit a naive Bayes classifier predicting Poker_Hand from all other columns.
model1 <- naiveBayes(Poker_Hand ~ ., data = train)
summary(model1)
## Length Class Mode
## apriori 10 table numeric
## tables 10 -none- list
## levels 10 -none- character
## call 4 -none- call
prediction1 <- predict(model1, test)
# confusionMatrix() takes the predictions as `data` and the true labels as
# `reference`. The matrix recorded below was produced with the two arguments
# swapped (rows = actual hands, columns = predictions): its overall accuracy
# still holds, but its per-class statistics and No Information Rate were
# computed against the predictions. Re-run to refresh it.
# Accuracy 49.39 %
confusionMatrix(data = prediction1, reference = test$Poker_Hand)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 49775 3748 0 0 0 0 0 3 0 0
## 1 42110 2985 0 0 0 0 0 4 0 6
## 2 4899 293 0 0 0 0 0 0 0 0
## 3 2089 120 0 0 0 0 0 0 0 2
## 4 376 27 0 0 0 0 0 0 0 1
## 5 186 19 0 0 0 0 0 0 0 0
## 6 159 7 0 0 0 0 0 0 0 0
## 7 19 0 0 0 0 0 0 0 0 0
## 8 1 0 0 0 0 0 0 0 0 0
## 9 1 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.4939
## 95% CI : (0.4909, 0.4969)
## No Information Rate : 0.9325
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0035
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.49967 0.41464 NA NA NA NA
## Specificity 0.48011 0.57724 0.9514 0.9793 0.996218 0.998081
## Pos Pred Value 0.92992 0.06618 NA NA NA NA
## Neg Pred Value 0.06499 0.93173 NA NA NA NA
## Prevalence 0.93246 0.06739 0.0000 0.0000 0.000000 0.000000
## Detection Rate 0.46593 0.02794 0.0000 0.0000 0.000000 0.000000
## Detection Prevalence 0.50104 0.42221 0.0486 0.0207 0.003782 0.001919
## Balanced Accuracy 0.48989 0.49594 NA NA NA NA
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity NA 0.000e+00 NA 0.000e+00
## Specificity 0.998446 9.998e-01 1.000e+00 1.000e+00
## Pos Pred Value NA 0.000e+00 NA 0.000e+00
## Neg Pred Value NA 9.999e-01 NA 9.999e-01
## Prevalence 0.000000 6.552e-05 0.000e+00 8.425e-05
## Detection Rate 0.000000 0.000e+00 0.000e+00 0.000e+00
## Detection Prevalence 0.001554 1.779e-04 9.361e-06 9.361e-06
## Balanced Accuracy NA 4.999e-01 NA 5.000e-01
# Support vector machine ----
# Fit svm() with its default settings on all predictors.
set.seed(123)
model2 <- svm(Poker_Hand ~ ., data = train)
prediction2 <- predict(model2, test)
# confusionMatrix() takes the predictions as `data` and the true labels as
# `reference`. The matrix recorded below was produced with the two arguments
# swapped (rows = actual hands, columns = predictions; every prediction was
# class 0), which is why it reports a No Information Rate of 1. Overall
# accuracy still holds; re-run to refresh the per-class statistics.
# Accuracy 50.1 %
confusionMatrix(data = prediction2, reference = test$Poker_Hand)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 53526 0 0 0 0 0 0 0 0 0
## 1 45105 0 0 0 0 0 0 0 0 0
## 2 5192 0 0 0 0 0 0 0 0 0
## 3 2211 0 0 0 0 0 0 0 0 0
## 4 404 0 0 0 0 0 0 0 0 0
## 5 205 0 0 0 0 0 0 0 0 0
## 6 166 0 0 0 0 0 0 0 0 0
## 7 19 0 0 0 0 0 0 0 0 0
## 8 1 0 0 0 0 0 0 0 0 0
## 9 1 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.501
## 95% CI : (0.498, 0.504)
## No Information Rate : 1
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.501 NA NA NA NA NA
## Specificity NA 0.5778 0.9514 0.9793 0.996218 0.998081
## Pos Pred Value NA NA NA NA NA NA
## Neg Pred Value NA NA NA NA NA NA
## Prevalence 1.000 0.0000 0.0000 0.0000 0.000000 0.000000
## Detection Rate 0.501 0.0000 0.0000 0.0000 0.000000 0.000000
## Detection Prevalence 0.501 0.4222 0.0486 0.0207 0.003782 0.001919
## Balanced Accuracy NA NA NA NA NA NA
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity NA NA NA NA
## Specificity 0.998446 0.9998221 1.000e+00 1.000e+00
## Pos Pred Value NA NA NA NA
## Neg Pred Value NA NA NA NA
## Prevalence 0.000000 0.0000000 0.000e+00 0.000e+00
## Detection Rate 0.000000 0.0000000 0.000e+00 0.000e+00
## Detection Prevalence 0.001554 0.0001779 9.361e-06 9.361e-06
## Balanced Accuracy NA NA NA NA
# C5.0 decision tree ----
# Fit a C5.0 model with its default settings on all predictors.
model3 <- C5.0(Poker_Hand ~ ., data = train)
prediction3 <- predict(model3, test)
# confusionMatrix() takes the predictions as `data` and the true labels as
# `reference`. The matrix recorded below was produced with the two arguments
# swapped (rows = actual hands, columns = predictions): its overall accuracy
# still holds, but sensitivity, specificity and the other per-class
# statistics were computed against the predictions. Re-run to refresh it.
# Accuracy 73.55 %
confusionMatrix(data = prediction3, reference = test$Poker_Hand)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 50735 2732 13 14 16 8 0 0 0 8
## 1 17823 26838 295 135 3 5 0 0 0 6
## 2 401 4741 36 14 0 0 0 0 0 0
## 3 301 936 5 961 0 0 8 0 0 0
## 4 372 32 0 0 0 0 0 0 0 0
## 5 193 11 0 1 0 0 0 0 0 0
## 6 0 98 0 67 0 0 1 0 0 0
## 7 0 4 0 14 0 0 1 0 0 0
## 8 1 0 0 0 0 0 0 0 0 0
## 9 1 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.7355
## 95% CI : (0.7328, 0.7381)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.503
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.7266 0.7583 0.103152 0.796849 0.0000000
## Specificity 0.9246 0.7443 0.951578 0.988166 0.9962176
## Pos Pred Value 0.9479 0.5950 0.006934 0.434645 0.0000000
## Neg Pred Value 0.6418 0.8614 0.996920 0.997658 0.9998215
## Prevalence 0.6536 0.3313 0.003267 0.011289 0.0001779
## Detection Rate 0.4749 0.2512 0.000337 0.008996 0.0000000
## Detection Prevalence 0.5010 0.4222 0.048601 0.020696 0.0037817
## Balanced Accuracy 0.8256 0.7513 0.527365 0.892507 0.4981088
## Class: 5 Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.0000000 1.000e-01 NA NA 0.000e+00
## Specificity 0.9980808 9.985e-01 0.9998221 1.000e+00 1.000e+00
## Pos Pred Value 0.0000000 6.024e-03 NA NA 0.000e+00
## Neg Pred Value 0.9998781 9.999e-01 NA NA 9.999e-01
## Prevalence 0.0001217 9.361e-05 0.0000000 0.000e+00 1.310e-04
## Detection Rate 0.0000000 9.361e-06 0.0000000 0.000e+00 0.000e+00
## Detection Prevalence 0.0019189 1.554e-03 0.0001779 9.361e-06 9.361e-06
## Balanced Accuracy 0.4990404 5.492e-01 NA NA 5.000e-01
# Random forest: tune mtry ----
# tuneRF() searches mtry values around the default and returns a matrix with
# the columns `mtry` and `OOBError`. The result is stored so the best value
# can be passed to the model below; in the recorded run the lowest OOB error
# was at mtry = 10.
set.seed(121)
rf_tuning <- tuneRF(train[, -11], train[, 11], stepFactor = 2)
## mtry = 3 OOB error = 41.58%
## Searching left ...
## mtry = 2 OOB error = 43.73%
## -0.05173574 0.05
## Searching right ...
## mtry = 6 OOB error = 38.51%
## 0.07385326 0.05
## mtry = 10 OOB error = 38.02%
## 0.01266743 0.05

# Print the search results, as the original unassigned call did.
print(rf_tuning)
## mtry OOBError
## 2.OOB 2 0.4373226
## 3.OOB 3 0.4158103
## 6.OOB 6 0.3851014
## 10.OOB 10 0.3802231

# Pick the mtry with the lowest out-of-bag error instead of hard-coding it.
best_mtry <- rf_tuning[which.min(rf_tuning[, "OOBError"]), "mtry"]

# Random forest: fit and evaluate ----
# The tuned mtry is now actually used; previously the forest was fitted with
# the default mtry even though the tuning step pointed to 10.
set.seed(111)
model4 <- randomForest(Poker_Hand ~ ., data = train, mtry = best_mtry)
prediction4 <- predict(model4, test)
# confusionMatrix() takes the predictions as `data` and the true labels as
# `reference`. The matrix recorded below (accuracy 68.27 %) was produced with
# the default mtry and with the two arguments swapped (rows = actual hands,
# columns = predictions); re-run to refresh it.
confusionMatrix(data = prediction4, reference = test$Poker_Hand)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 51188 2338 0 0 0 0 0 0 0 0
## 1 23373 21732 0 0 0 0 0 0 0 0
## 2 282 4903 7 0 0 0 0 0 0 0
## 3 7 2203 0 1 0 0 0 0 0 0
## 4 378 25 0 0 1 0 0 0 0 0
## 5 204 1 0 0 0 0 0 0 0 0
## 6 0 166 0 0 0 0 0 0 0 0
## 7 0 19 0 0 0 0 0 0 0 0
## 8 0 1 0 0 0 0 0 0 0 0
## 9 1 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.6827
## 95% CI : (0.6799, 0.6855)
## No Information Rate : 0.7061
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3923
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.6786 0.6924 1.000e+00 1.000e+00 1.000e+00
## Specificity 0.9255 0.6902 9.515e-01 9.793e-01 9.962e-01
## Pos Pred Value 0.9563 0.4818 1.348e-03 4.523e-04 2.475e-03
## Neg Pred Value 0.5452 0.8436 1.000e+00 1.000e+00 1.000e+00
## Prevalence 0.7061 0.2938 6.552e-05 9.361e-06 9.361e-06
## Detection Rate 0.4792 0.2034 6.552e-05 9.361e-06 9.361e-06
## Detection Prevalence 0.5010 0.4222 4.860e-02 2.070e-02 3.782e-03
## Balanced Accuracy 0.8021 0.6913 9.757e-01 9.897e-01 9.981e-01
## Class: 5 Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity NA NA NA NA NA
## Specificity 0.998081 0.998446 0.9998221 1.000e+00 1.000e+00
## Pos Pred Value NA NA NA NA NA
## Neg Pred Value NA NA NA NA NA
## Prevalence 0.000000 0.000000 0.0000000 0.000e+00 0.000e+00
## Detection Rate 0.000000 0.000000 0.0000000 0.000e+00 0.000e+00
## Detection Prevalence 0.001919 0.001554 0.0001779 9.361e-06 9.361e-06
## Balanced Accuracy NA NA NA NA NA
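# Even though the C5.0 decision tree gives better test accuracy (73.55 %) than
# the other methods, a single decision tree is prone to overfitting.
# Random forest is more robust to overfitting, so it is chosen as the best
# method here, with the recorded accuracy of 68.27 %.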