Classification Using Different Algorithms

##     X Y label
## 1   5 a  BLUE
## 2   5 b BLACK
## 3   5 c  BLUE
## 4   5 d BLACK
## 5   5 e BLACK
## 6   5 f BLACK
## 7  19 a  BLUE
## 8  19 b  BLUE
## 9  19 c  BLUE
## 10 19 d  BLUE
## 11 19 e BLACK
## 12 19 f  BLUE
## 13 35 a BLACK
## 14 35 b BLACK
## 15 35 c  BLUE
## 16 35 d BLACK
## 17 35 e BLACK
## 18 35 f BLACK
## 19 51 a BLACK
## 20 51 b BLACK
## 21 51 c  BLUE
## 22 51 d BLACK
## 23 51 e BLACK
## 24 51 f BLACK
## 25 55 a BLACK
## 26 55 b BLACK
## 27 55 c BLACK
## 28 55 d BLACK
## 29 55 e BLACK
## 30 55 f BLACK
## 31 63 a BLACK
## 32 63 b  BLUE
## 33 63 c  BLUE
## 34 63 d  BLUE
## 35 63 e  BLUE
## 36 63 f  BLUE
# Data preparation and training/test split
data_trans <- data
# Recode the factor levels to numeric codes; with the default
# alphabetical level ordering this maps label BLACK -> 1, BLUE -> 2
# and Y a-f -> 1-6
levels(data_trans$label) <- c(1, 2)
levels(data_trans$Y) <- c(1, 2, 3, 4, 5, 6)

set.seed(123)
trainidx <- sample(1:nrow(data_trans), size = round(0.7 * nrow(data_trans)), replace = FALSE)
data_train <- data_trans[trainidx,]
data_test <- data_trans[-trainidx,]
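
With only 36 rows, a 70/30 split can easily leave one class under-represented; a quick sanity check (my addition, not part of the original analysis) confirms both labels appear on each side of the split:

# Check the class balance of the train/test split
table(data_train$label)
table(data_test$label)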

# KNN

library(class)
# Train on the feature columns only; passing the whole data frame would
# include the label itself as a predictor and leak the answer
KNN <- knn(data_train[, c("X", "Y")], data_test[, c("X", "Y")], data_train$label, k=3)

cm_KNN <-  table(KNN, data_test$label)

acc_KNN <- sum(diag(cm_KNN))/sum(cm_KNN)
tpr_KNN <- cm_KNN[1,1]/sum(cm_KNN[1,1], cm_KNN[2,1])
fpr_KNN <- cm_KNN[1,2]/sum(cm_KNN[1,2], cm_KNN[2,2])
tnr_KNN <- 1 - fpr_KNN

library(pROC)
roc_KNN <- roc(data_test$label, as.numeric(KNN))
auc_KNN <- roc_KNN$auc
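
Note that this AUC is computed from hard class predictions, so the ROC curve has a single operating point. As a sketch (my addition), class::knn can also return the vote share of the winning class via prob = TRUE, which yields a graded score for class "2":

# Sketch: vote-share-based score for a smoother ROC curve; attr(., "prob")
# is the vote proportion for the *winning* class, so flip it where the
# predicted class is "1" to obtain a score for class "2"
KNN_v <- knn(data_train[, c("X", "Y")], data_test[, c("X", "Y")],
             data_train$label, k = 3, prob = TRUE)
p_win <- attr(KNN_v, "prob")
p_two <- ifelse(KNN_v == "2", p_win, 1 - p_win)
roc(data_test$label, p_two)$auc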

KNNrow <- c("KNN",round(acc_KNN,2),round(tpr_KNN,2),round(fpr_KNN,2), round(tnr_KNN,2), round(auc_KNN,2))
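
The accuracy/TPR/FPR/TNR arithmetic above is repeated verbatim for each of the five models below; a small helper (a sketch with the hypothetical name cm_metrics, assuming a 2x2 table with predictions in rows, truth in columns, and the positive class first, exactly as the tables are built here) would make that bookkeeping reusable:

# Sketch: confusion-matrix metrics for a 2x2 table (rows = predicted,
# columns = actual, positive class in row/column 1)
cm_metrics <- function(cm) {
  acc <- sum(diag(cm)) / sum(cm)
  tpr <- cm[1, 1] / sum(cm[, 1])  # TP / (TP + FN)
  fpr <- cm[1, 2] / sum(cm[, 2])  # FP / (FP + TN)
  c(ACC = acc, TPR = tpr, FPR = fpr, TNR = 1 - fpr)
}
# e.g. round(cm_metrics(cm_KNN), 2)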


# Logistic Regression

LOGR <- glm(label ~ ., data = data_train, family = "binomial")
LOGR_pred <- predict(LOGR, data_test, type = 'response')
# glm's binomial family models the probability of the *second* factor
# level ("2"), so a fitted probability above 0.5 means class 2
LOGR_pred_labels <- ifelse(LOGR_pred > 0.5, 2, 1)
cm_LOGR <-  table(LOGR_pred_labels, data_test$label)
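
To double-check which level glm treats as the "success" (my addition, not in the original analysis), the factor's contrast matrix shows it directly:

# The column of 0/1 values marks the level whose probability glm models
contrasts(data_train$label)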


acc_LOGR <- sum(diag(cm_LOGR)) / sum(cm_LOGR)
tpr_LOGR <- cm_LOGR[1,1]/sum(cm_LOGR[1,1], cm_LOGR[2,1])
fpr_LOGR <- cm_LOGR[1,2]/sum(cm_LOGR[1,2], cm_LOGR[2,2])
tnr_LOGR <- 1 - fpr_LOGR

library(ROCR)
ROCRpred <- prediction(LOGR_pred, data_test$label)
ROCRperf <- performance(ROCRpred, 'tpr', 'fpr')
#plot(ROCRperf, col="blue")
ROCRperfauc <- performance(ROCRpred, 'auc')
auc_LOGR <- ROCRperfauc@y.values[[1]]


LOGRrow <- c("LOGR",round(acc_LOGR,2),round(tpr_LOGR,2),round(fpr_LOGR,2), round(tnr_LOGR,2), round(auc_LOGR,2))

# SVM

library(e1071)
SVM <- svm(label ~ ., data = data_train, kernel="radial")
SVM_pred <- predict(SVM, data_test)

cm_SVM <-  table(SVM_pred, data_test$label)

acc_SVM <- sum(diag(cm_SVM))/sum(cm_SVM)
tpr_SVM <- cm_SVM[1,1]/sum(cm_SVM[1,1], cm_SVM[2,1])
fpr_SVM <- cm_SVM[1,2]/sum(cm_SVM[1,2], cm_SVM[2,2])
tnr_SVM <- 1 - fpr_SVM

library(pROC)
roc_SVM <- roc(data_test$label, as.numeric(SVM_pred))
auc_SVM <- roc_SVM$auc

SVMrow <- c("SVM",round(acc_SVM,2),round(tpr_SVM,2),round(fpr_SVM,2), round(tnr_SVM,2), round(auc_SVM,2))
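
Here too the AUC comes from hard labels. As a sketch (my addition), e1071 can emit class probabilities when the model is fitted with probability = TRUE, allowing a threshold-free AUC:

# Sketch: refit with probability estimates enabled and score class "2"
SVM_p <- svm(label ~ ., data = data_train, kernel = "radial",
             probability = TRUE)
SVM_prob <- attr(predict(SVM_p, data_test, probability = TRUE),
                 "probabilities")
roc(data_test$label, SVM_prob[, "2"])$auc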

# Naive Bayes

library(e1071)
NAIVEB <- naiveBayes(label ~ ., data = data_train)
NAIVEB_pred <- predict(NAIVEB, data_test)
NAIVEB_pred_raw <- predict(NAIVEB, data_test, type = "raw")

cm_NAIVEB <-  table(NAIVEB_pred, data_test$label)

acc_NAIVEB <- sum(diag(cm_NAIVEB))/sum(cm_NAIVEB)
tpr_NAIVEB <- cm_NAIVEB[1,1]/sum(cm_NAIVEB[1,1], cm_NAIVEB[2,1])
fpr_NAIVEB <- cm_NAIVEB[1,2]/sum(cm_NAIVEB[1,2], cm_NAIVEB[2,2])
tnr_NAIVEB<- 1 - fpr_NAIVEB

library(pROC)
roc_NAIVEB <- roc(data_test$label, as.numeric(NAIVEB_pred))
auc_NAIVEB <- roc_NAIVEB$auc

NAIVEBrow <- c("NAIVEB",round(acc_NAIVEB,2),round(tpr_NAIVEB,2),round(fpr_NAIVEB,2), round(tnr_NAIVEB,2), round(auc_NAIVEB,2))
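
NAIVEB_pred_raw is computed above but never used; the raw posteriors are exactly what a probability-based ROC needs. A sketch (my addition, assuming the "1"/"2" level coding from the data preparation step):

# Sketch: AUC from posterior probabilities instead of hard labels;
# NAIVEB_pred_raw[, "2"] is the posterior P(label = "2")
roc(data_test$label, NAIVEB_pred_raw[, "2"])$auc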

# Linear Discriminant Analysis

library(MASS)
LDA <- lda(label ~ ., data = data_train)
LDA_pred <- predict(LDA, data_test)

cm_LDA <-  table(LDA_pred$class, data_test$label)

acc_LDA <- sum(diag(cm_LDA))/sum(cm_LDA)
tpr_LDA <- cm_LDA[1,1]/sum(cm_LDA[1,1], cm_LDA[2,1])
fpr_LDA <- cm_LDA[1,2]/sum(cm_LDA[1,2], cm_LDA[2,2])
tnr_LDA<- 1 - fpr_LDA

library(pROC)
roc_LDA <- roc(data_test$label, as.numeric(LDA_pred$class))
auc_LDA <- roc_LDA$auc

LDArow <- c("LDA",round(acc_LDA,2),round(tpr_LDA,2),round(fpr_LDA,2), round(tnr_LDA,2), round(auc_LDA,2))
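
predict.lda also returns posterior probabilities alongside the hard classes, so the same probability-based AUC is available here as a sketch (my addition):

# Sketch: LDA_pred$posterior has one column per class level
roc(data_test$label, LDA_pred$posterior[, "2"])$auc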

# Decision Trees

library(rpart)
DT <- rpart(label ~ ., data = data_train, method = "class")
DT_pred <- predict(DT, data_test, type = "class")

cm_DT <-  table(DT_pred, data_test$label)

acc_DT <- sum(diag(cm_DT))/sum(cm_DT)
tpr_DT <- cm_DT[1,1]/sum(cm_DT[1,1], cm_DT[2,1])
fpr_DT <- cm_DT[1,2]/sum(cm_DT[1,2], cm_DT[2,2])
tnr_DT<- 1 - fpr_DT

library(pROC)
roc_DT <- roc(data_test$label, as.numeric(DT_pred))
auc_DT <- roc_DT$auc

DTrow <- c("DT",round(acc_DT,2),round(tpr_DT,2),round(fpr_DT,2), round(tnr_DT,2), round(auc_DT,2))
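
rpart can likewise return class probabilities, sketched here (my addition) for a probability-based AUC:

# Sketch: per-class probabilities from the fitted tree
DT_prob <- predict(DT, data_test, type = "prob")
roc(data_test$label, DT_prob[, "2"])$auc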

# Final Results

results <- data.frame(rbind(KNNrow, LOGRrow, SVMrow, NAIVEBrow, LDArow, DTrow),
                      row.names = NULL, stringsAsFactors = FALSE)
colnames(results) <- c("ALGO", "ACC", "TPR (Sensitivity)", "FPR", "TNR (Specificity)", "AUC")
library(knitr)
kable(results[order(-as.numeric(results$ACC)),])
| ALGO   | ACC  | TPR (Sensitivity) | FPR | TNR (Specificity) | AUC  |
|--------|------|-------------------|-----|-------------------|------|
| KNN    | 0.91 | 1                 | 0.2 | 0.8               | 0.9  |
| NAIVEB | 0.64 | 1                 | 0.8 | 0.2               | 0.6  |
| LDA    | 0.64 | 0.83              | 0.6 | 0.4               | 0.62 |
| SVM    | 0.55 | 1                 | 1   | 0                 | 0.5  |
| DT     | 0.55 | 0.83              | 0.8 | 0.2               | 0.52 |
| LOGR   | 0.36 | 0.17              | 0.4 | 0.6               | 0.53 |

Conclusion

The KNN model delivered the highest performance among all of the algorithms tested on this dataset. Almost all of the models had high sensitivity (TPR), the exception being logistic regression (0.17); the best were KNN, Naive Bayes, and SVM. In terms of specificity (TNR = 1 - FPR), KNN and logistic regression showed the best performance (0.8 and 0.6, respectively). Finally, in terms of AUC, KNN and LDA posted the best numbers. It was interesting to see that, specificity aside, logistic regression was the worst performer across the board.