##     X Y label
##  1  5 a  BLUE
##  2  5 b BLACK
##  3  5 c  BLUE
##  4  5 d BLACK
##  5  5 e BLACK
##  6  5 f BLACK
##  7 19 a  BLUE
##  8 19 b  BLUE
##  9 19 c  BLUE
## 10 19 d  BLUE
## 11 19 e BLACK
## 12 19 f  BLUE
## 13 35 a BLACK
## 14 35 b BLACK
## 15 35 c  BLUE
## 16 35 d BLACK
## 17 35 e BLACK
## 18 35 f BLACK
## 19 51 a BLACK
## 20 51 b BLACK
## 21 51 c  BLUE
## 22 51 d BLACK
## 23 51 e BLACK
## 24 51 f BLACK
## 25 55 a BLACK
## 26 55 b BLACK
## 27 55 c BLACK
## 28 55 d BLACK
## 29 55 e BLACK
## 30 55 f BLACK
## 31 63 a BLACK
## 32 63 b  BLUE
## 33 63 c  BLUE
## 34 63 d  BLUE
## 35 63 e  BLUE
## 36 63 f  BLUE
# Data Preparation and Training/Test Dataset Split
data_trans <- data
levels(data_trans$label) <- c(1, 2)           # relabel in level order: BLACK -> 1, BLUE -> 2
levels(data_trans$Y) <- c(1, 2, 3, 4, 5, 6)   # relabel a..f -> 1..6
set.seed(123)
trainidx <- sample(1:nrow(data_trans), size = round(0.7 * nrow(data_trans)), replace = FALSE)
data_train <- data_trans[trainidx, ]
data_test <- data_trans[-trainidx, ]
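# Quick sanity check (a sketch): sample() above is not stratified, so it is
# worth confirming that both classes appear in similar proportions in the
# two partitions.
prop.table(table(data_train$label))
prop.table(table(data_test$label))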
# KNN
library(class)
# Exclude the target column from the feature matrices so the label is not
# leaked into the distance computation; data.matrix() coerces factor
# columns to their integer codes, since knn() requires numeric input
feat <- setdiff(names(data_train), "label")
KNN <- knn(data.matrix(data_train[, feat]), data.matrix(data_test[, feat]), data_train$label, k = 3)
cm_KNN <- table(KNN, data_test$label)
acc_KNN <- sum(diag(cm_KNN))/sum(cm_KNN)
tpr_KNN <- cm_KNN[1,1]/sum(cm_KNN[1,1], cm_KNN[2,1])  # sensitivity, treating class "1" as positive
fpr_KNN <- cm_KNN[1,2]/sum(cm_KNN[1,2], cm_KNN[2,2])  # false positive rate (same convention below)
tnr_KNN <- 1 - fpr_KNN                                # specificity
library(pROC)
roc_KNN <- roc(data_test$label, as.numeric(KNN))
auc_KNN <- roc_KNN$auc
KNNrow <- c("KNN",round(acc_KNN,2),round(tpr_KNN,2),round(fpr_KNN,2), round(tnr_KNN,2), round(auc_KNN,2))
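# Optional sketch: k = 3 is assumed rather than tuned; leave-one-out
# cross-validation with knn.cv() (also from the class package) is one way
# to pick k using the training set alone.
ks <- 1:15
cv_acc <- sapply(ks, function(k) {
  pred <- knn.cv(data.matrix(data_train[, feat]), data_train$label, k = k)
  mean(pred == data_train$label)
})
ks[which.max(cv_acc)]  # k with the highest leave-one-out accuracy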
# Logistic Regression
LOGR <- glm(label ~ ., data = data_train, family = "binomial")
LOGR_pred <- predict(LOGR, data_test, type = 'response')
# predict(type = "response") returns P(label == "2") (the second factor
# level), so probabilities above 0.5 map to class 2, not class 1
LOGR_pred_labels <- ifelse(LOGR_pred > 0.5, 2, 1)
cm_LOGR <- table(LOGR_pred_labels, data_test$label)
acc_LOGR <- sum(diag(cm_LOGR)) / sum(cm_LOGR)
tpr_LOGR <- cm_LOGR[1,1]/sum(cm_LOGR[1,1], cm_LOGR[2,1])
fpr_LOGR <- cm_LOGR[1,2]/sum(cm_LOGR[1,2], cm_LOGR[2,2])
tnr_LOGR <- 1 - fpr_LOGR
library(ROCR)
ROCRpred <- prediction(LOGR_pred, data_test$label)
ROCRperf <- performance(ROCRpred, 'tpr', 'fpr')
#plot(ROCRperf, col="blue")
ROCRperfauc <- performance(ROCRpred, 'auc')
auc_LOGR <- ROCRperfauc@y.values[[1]]
LOGRrow <- c("LOGR",round(acc_LOGR,2),round(tpr_LOGR,2),round(fpr_LOGR,2), round(tnr_LOGR,2), round(auc_LOGR,2))
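# Optional sketch: ROCR can also trace accuracy as a function of the cutoff,
# which shows whether the fixed 0.5 threshold is reasonable for this model.
acc_perf <- performance(ROCRpred, "acc")
best_cut <- which.max(acc_perf@y.values[[1]])
acc_perf@x.values[[1]][best_cut]  # cutoff maximizing test-set accuracy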
# SVM
library(e1071)
SVM <- svm(label ~ ., data = data_train, kernel="radial")
SVM_pred <- predict(SVM, data_test)
cm_SVM <- table(SVM_pred, data_test$label)
acc_SVM <- sum(diag(cm_SVM))/sum(cm_SVM)
tpr_SVM <- cm_SVM[1,1]/sum(cm_SVM[1,1], cm_SVM[2,1])
fpr_SVM <- cm_SVM[1,2]/sum(cm_SVM[1,2], cm_SVM[2,2])
tnr_SVM <- 1 - fpr_SVM
library(pROC)
roc_SVM <- roc(data_test$label, as.numeric(SVM_pred))
auc_SVM <- roc_SVM$auc
SVMrow <- c("SVM",round(acc_SVM,2),round(tpr_SVM,2),round(fpr_SVM,2), round(tnr_SVM,2), round(auc_SVM,2))
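# Optional sketch: the radial kernel's cost and gamma are left at their
# defaults above; e1071::tune.svm() can grid-search them (the grid below is
# illustrative only).
set.seed(123)
SVM_tune <- tune.svm(label ~ ., data = data_train,
                     gamma = 10^(-2:1), cost = 10^(0:2))
SVM_tune$best.parameters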
# Naive Bayes
library(e1071)
NAIVEB <- naiveBayes(label ~ ., data = data_train)
NAIVEB_pred <- predict(NAIVEB, data_test)
NAIVEB_pred_raw <- predict(NAIVEB, data_test, type = "raw")
cm_NAIVEB <- table(NAIVEB_pred, data_test$label)
acc_NAIVEB <- sum(diag(cm_NAIVEB))/sum(cm_NAIVEB)
tpr_NAIVEB <- cm_NAIVEB[1,1]/sum(cm_NAIVEB[1,1], cm_NAIVEB[2,1])
fpr_NAIVEB <- cm_NAIVEB[1,2]/sum(cm_NAIVEB[1,2], cm_NAIVEB[2,2])
tnr_NAIVEB<- 1 - fpr_NAIVEB
library(pROC)
roc_NAIVEB <- roc(data_test$label, as.numeric(NAIVEB_pred))
auc_NAIVEB <- roc_NAIVEB$auc
NAIVEBrow <- c("NAIVEB",round(acc_NAIVEB,2),round(tpr_NAIVEB,2),round(fpr_NAIVEB,2), round(tnr_NAIVEB,2), round(auc_NAIVEB,2))
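# Optional sketch: NAIVEB_pred_raw, computed above but otherwise unused,
# holds the class posteriors; passing a posterior column to roc() yields a
# smoother curve than the hard labels (column 2 is assumed to correspond to
# the second level, "2").
roc_NAIVEB_raw <- roc(data_test$label, NAIVEB_pred_raw[, 2])
roc_NAIVEB_raw$auc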
# Linear Discriminant Analysis
library(MASS)
LDA <- lda(label ~ ., data = data_train)
LDA_pred <- predict(LDA, data_test)
cm_LDA <- table(LDA_pred$class, data_test$label)
acc_LDA <- sum(diag(cm_LDA))/sum(cm_LDA)
tpr_LDA <- cm_LDA[1,1]/sum(cm_LDA[1,1], cm_LDA[2,1])
fpr_LDA <- cm_LDA[1,2]/sum(cm_LDA[1,2], cm_LDA[2,2])
tnr_LDA<- 1 - fpr_LDA
library(pROC)
roc_LDA <- roc(data_test$label, as.numeric(LDA_pred$class))
auc_LDA <- roc_LDA$auc
LDArow <- c("LDA",round(acc_LDA,2),round(tpr_LDA,2),round(fpr_LDA,2), round(tnr_LDA,2), round(auc_LDA,2))
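# Optional sketch: predict() on an lda fit also returns posterior
# probabilities, which give a finer-grained ROC than the predicted classes.
roc_LDA_post <- roc(data_test$label, LDA_pred$posterior[, 2])
roc_LDA_post$auc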
# Decision Trees
library(rpart)
DT <- rpart(label ~ ., data = data_train, method = "class")
DT_pred <- predict(DT, data_test, type = "class")
cm_DT <- table(DT_pred, data_test$label)
acc_DT <- sum(diag(cm_DT))/sum(cm_DT)
tpr_DT <- cm_DT[1,1]/sum(cm_DT[1,1], cm_DT[2,1])
fpr_DT <- cm_DT[1,2]/sum(cm_DT[1,2], cm_DT[2,2])
tnr_DT<- 1 - fpr_DT
library(pROC)
roc_DT <- roc(data_test$label, as.numeric(DT_pred))
auc_DT <- roc_DT$auc
DTrow <- c("DT",round(acc_DT,2),round(tpr_DT,2),round(fpr_DT,2), round(tnr_DT,2), round(auc_DT,2))
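# Optional sketch: rpart grows the tree with built-in cross-validation;
# pruning at the cp value with the lowest cross-validated error is the
# usual follow-up.
printcp(DT)  # cross-validated error by complexity parameter
best_cp <- DT$cptable[which.min(DT$cptable[, "xerror"]), "CP"]
DT_pruned <- prune(DT, cp = best_cp)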
# Final Results
results <- data.frame(rbind(KNNrow, LOGRrow, SVMrow, NAIVEBrow, LDArow, DTrow), row.names = NULL)
colnames(results) <- c("ALGO", "ACC", "TPR (Sensitivity)", "FPR", "TNR (Specificity)", "AUC")
library(knitr)
kable(results[order(-as.numeric(results$ACC)),])
|   | ALGO   | ACC  | TPR (Sensitivity) | FPR | TNR (Specificity) | AUC  |
|---|--------|------|-------------------|-----|-------------------|------|
| 1 | KNN    | 0.91 | 1                 | 0.2 | 0.8               | 0.9  |
| 4 | NAIVEB | 0.64 | 1                 | 0.8 | 0.2               | 0.6  |
| 5 | LDA    | 0.64 | 0.83              | 0.6 | 0.4               | 0.62 |
| 3 | SVM    | 0.55 | 1                 | 1   | 0                 | 0.5  |
| 6 | DT     | 0.55 | 0.83              | 0.8 | 0.2               | 0.52 |
| 2 | LOGR   | 0.36 | 0.17              | 0.4 | 0.6               | 0.53 |
The KNN model delivered the highest performance among all the algorithms tested on this dataset. Almost all the models had high sensitivity (TPR), the exception being Logistic Regression (0.17); the best were KNN, Naive Bayes, and Support Vector Machines (all at 1). In terms of specificity (TNR = 1 - FPR), KNN and Logistic Regression performed best (0.8 and 0.6 respectively). Finally, in terms of AUC, KNN and LDA posted the best figures (0.9 and 0.62). Notably, Logistic Regression was the worst performer across the board.
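As a side note, the per-model metric blocks above repeat the same arithmetic six times. A small helper along the following lines (a sketch, assuming a 2x2 confusion matrix with predictions in rows, true labels in columns, and class "1" treated as positive) would remove the duplication:

# ACC/TPR/FPR/TNR from a 2x2 confusion matrix
cm_metrics <- function(cm) {
  fpr <- cm[1, 2] / sum(cm[, 2])
  c(ACC = sum(diag(cm)) / sum(cm),
    TPR = cm[1, 1] / sum(cm[, 1]),
    FPR = fpr,
    TNR = 1 - fpr)
}
round(cm_metrics(cm_KNN), 2)  # reproduces the KNN row above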