Load the data

Load the data and encode the categorical columns numerically: the six categories of Y are mapped to the integers 1 through 6, and the label is mapped to a 0/1 indicator.

require(stringr)
## Loading required package: stringr
df <- read.csv("C:\\Users\\Charls\\Documents\\CunyMSDS\\Data622\\Assignments\\HW2\\dataset.csv")

#df$Y_1 <- as.integer(as.factor(df$Y))

# map the six letter categories of Y to integer codes 1 through 6
df$Y_n <- sapply(df$Y, function(x) {
  switch(as.character(x), "a" = 1, "b" = 2, "c" = 3, "d" = 4, "e" = 5, "f" = 6)
})

df$label_n <- ifelse(as.character(df$label) == "BLACK", 1, 0)  # binary label: BLACK = 1, otherwise 0
table(df[,5])  # class balance of the encoded label (label_n)
## 
##  0  1 
## 14 22
library(caTools)

set.seed(123)
split = sample.split(df$label_n, SplitRatio = 0.8)  # stratified 80/20 train/test split on the label
training_set = subset(df, split == TRUE)
test_set = subset(df, split == FALSE)

# empty frame to collect AUC, TPR, FPR and accuracy for each classifier
ClassiferMetrics <- data.frame(matrix(ncol = 5, nrow = 0), stringsAsFactors = FALSE)
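
As a side note, since the Y categories are just the letters a through f, the same integer codes can be produced more compactly with match(); the sketch below is an assumed-equivalent alternative (Y_n_alt is a hypothetical column used only for the comparison).

# Assumed equivalent to the switch() mapping above: the position of each letter
# in "a".."f" gives the same 1..6 code (NA for anything outside that set).
df$Y_n_alt <- match(as.character(df$Y), c("a", "b", "c", "d", "e", "f"))

# quick check that both encodings agree
all(df$Y_n_alt == df$Y_n, na.rm = TRUE)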

KNN Model

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(class)  #knn
library(caret)   # confusion metric
## Loading required package: lattice
## Loading required package: ggplot2
trainData1 = training_set[,c(1,4)]  # predictors: X (column 1) and the encoded Y_n (column 4)
testData1 = test_set[,c(1,4)]

train_lbls <- training_set$label_n
test_lbls <- test_set$label_n

knn_model <- knn(train = trainData1, test = testData1, cl = train_lbls, k = 3, prob = TRUE)  # 3-NN; keep the winning-class vote proportions

roc <- roc(test_set$label_n, attributes(knn_model)$prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.8333
plot.roc(roc,
         main = "KNN Classifier | ROC Curve", percent = TRUE, of = "thresholds",
         thresholds = "best",   # select the best threshold
         print.auc = TRUE,
         print.thres = "best")

predict.knn_result <- confusionMatrix(knn_model, as.factor(test_set$label_n), positive = "1")
predict.knn_result$table
##           Reference
## Prediction 0 1
##          0 2 0
##          1 1 4
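
The sensitivity, false-positive rate, and accuracy extracted below can be read directly off this confusion matrix (4 true positives, 0 false negatives, 1 false positive, 2 true negatives); a quick worked check:

tp <- 4; fn <- 0; fp <- 1; tn <- 2   # counts from the confusion matrix above
tp / (tp + fn)                       # TPR (sensitivity) = 1
fp / (fp + tn)                       # FPR = 1 - specificity ~ 0.33
(tp + tn) / (tp + tn + fp + fn)      # accuracy ~ 0.86
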
knn_auc <- round(as.numeric(roc$auc),2)
knn_tpr <- round(as.numeric(predict.knn_result$byClass['Sensitivity']),2)
knn_fpr <- round(as.numeric(1- predict.knn_result$byClass['Specificity']),2)
knn_acc <- round(as.numeric(predict.knn_result$overall['Accuracy']),2)

paste("AUC" , knn_auc, sep = ": ")
## [1] "AUC: 0.83"
paste("TPR" , knn_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , knn_fpr, sep = ": ")
## [1] "FPR: 0.33"
paste("Accuracy" , knn_acc, sep = ": ")
## [1] "Accuracy: 0.86"
ClassiferMetrics <- rbind(ClassiferMetrics, c("KNN", knn_auc , knn_tpr , knn_fpr , knn_acc), stringsAsFactors = FALSE)
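
One caveat: the "prob" attribute returned by class::knn() is the vote share of the winning class, not the probability of the positive class, so using it directly in roc() can distort the curve. A minimal sketch of the conversion, assuming the same objects as above (knn_prob_pos and roc_knn are illustrative names):

knn_prob <- attributes(knn_model)$prob                             # vote share of the predicted class
knn_prob_pos <- ifelse(knn_model == "1", knn_prob, 1 - knn_prob)   # flip to P(class = "1")
roc_knn <- roc(test_set$label_n, knn_prob_pos)
roc_knn$auc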

Naive Bayes

library(e1071)

# Gaussian naive Bayes on the two numeric predictors
nb_model = naiveBayes(as.factor(label_n) ~ X + Y_n, data = training_set)

nm_pred_prob = predict(nb_model, newdata = test_set, type = "raw")     # posterior class probabilities
nm_pred_class = predict(nb_model, newdata = test_set, type = "class")  # hard class predictions

print(nm_pred_prob[,2])
## [1] 0.5999947 0.5825399 0.6438226 0.7253454 0.6095118 0.6396973 0.6198102
roc <- roc(test_set$label_n, nm_pred_prob[,2])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.5
plot.roc(roc,
main="Naive Bayes | ROC Curve", percent=TRUE, of="thresholds", # compute AUC (of threshold)
thresholds="best", # select the (best) threshold
print.auc = TRUE, 
print.thres="best") 

predict.nb_result <- confusionMatrix(nm_pred_class, as.factor(test_set$label_n), positive = "1")
predict.nb_result$table
##           Reference
## Prediction 0 1
##          0 0 0
##          1 3 4
nb_auc <- round(as.numeric(roc$auc),2)
nb_tpr <- round(as.numeric(predict.nb_result$byClass['Sensitivity']),2)
nb_fpr <- round(as.numeric(1- predict.nb_result$byClass['Specificity']),2)
nb_acc <- round(as.numeric(predict.nb_result$overall['Accuracy']),2)

paste("AUC" , nb_auc, sep = ": ")
## [1] "AUC: 0.5"
paste("TPR" , nb_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , nb_fpr, sep = ": ")
## [1] "FPR: 1"
paste("Accuracy" , nb_acc, sep = ": ")
## [1] "Accuracy: 0.57"
ClassiferMetrics <- rbind(ClassiferMetrics, c("Naive Bayes", nb_auc , nb_tpr , nb_fpr , nb_acc), stringsAsFactors = FALSE)
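
For numeric predictors, e1071's naiveBayes() fits a per-class Gaussian to each feature; as a quick sanity check, the fitted class-conditional means and standard deviations can be inspected from the model object (no new data assumed):

# each element of nb_model$tables is a matrix with one row per class and
# columns for the conditional mean and standard deviation of that predictor
nb_model$tables$X
nb_model$tables$Y_n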

Logistic Regression

log_classifer <- glm(label_n ~ X + Y_n, data=training_set, family = "binomial")
lr_pred_prob = predict(log_classifer, newdata = test_set, type = "response")
# classify with a probability threshold of 0.5 for the logistic regression classifier
lr_pred_class <- ifelse(lr_pred_prob > 0.5, 1, 0)

roc <- roc(test_set$label_n, lr_pred_class)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.5
plot.roc(roc,
main="Logistic Regression | ROC Curve", percent=TRUE, of="thresholds", # compute AUC (of threshold)
thresholds="best", # select the (best) threshold
print.auc = TRUE, 
print.thres="best") 

predict.lr_result <- confusionMatrix(as.factor(lr_pred_class), as.factor(test_set$label_n), positive = "1")
## Warning in confusionMatrix.default(as.factor(lr_pred_class),
## as.factor(test_set$label_n), : Levels are not in the same order for reference
## and data. Refactoring data to match.
predict.lr_result$table
##           Reference
## Prediction 0 1
##          0 0 0
##          1 3 4
lr_auc <- round(as.numeric(roc$auc),2)
lr_tpr <- round(as.numeric(predict.lr_result$byClass['Sensitivity']),2)
lr_fpr <- round(as.numeric(1- predict.lr_result$byClass['Specificity']),2)
lr_acc <- round(as.numeric(predict.lr_result$overall['Accuracy']),2)

paste("AUC" , lr_auc, sep = ": ")
## [1] "AUC: 0.5"
paste("TPR" , lr_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , lr_fpr, sep = ": ")
## [1] "FPR: 1"
paste("Accuracy" , lr_acc, sep = ": ")
## [1] "Accuracy: 0.57"
ClassiferMetrics <- rbind(ClassiferMetrics, c("Logistic Regression", lr_auc , lr_tpr , lr_fpr , lr_acc), stringsAsFactors = FALSE)
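
Note that the ROC above is built from the hard 0/1 predictions, which reduces it to a single operating point; a sketch of the same curve computed from the fitted probabilities instead (roc_lr_prob is just an illustrative name):

# score the ROC with the predicted probabilities so the curve is traced
# over all thresholds rather than only the 0.5 cutoff
roc_lr_prob <- roc(test_set$label_n, lr_pred_prob)
roc_lr_prob$auc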

LDA

library("MASS")
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
lda_classifer <- lda(label_n ~ X + Y_n, data=training_set)
lda_pred_class=predict(lda_classifer,newdata = test_set)

roc <- roc(test_set$label_n, as.numeric(lda_pred_class$class))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.5
plot.roc(roc,
main="LDA | ROC Curve", percent=TRUE, of="thresholds", # compute AUC (of threshold)
thresholds="best", # select the (best) threshold
print.auc = TRUE, 
print.thres="best") 

predict.lda_result <- confusionMatrix(lda_pred_class$class, as.factor(test_set$label_n), positive = "1")
predict.lda_result$table
##           Reference
## Prediction 0 1
##          0 0 0
##          1 3 4
lda_auc <- round(as.numeric(roc$auc),2)
lda_tpr <- round(as.numeric(predict.lda_result$byClass['Sensitivity']),2)
lda_fpr <- round(as.numeric(1- predict.lda_result$byClass['Specificity']),2)
lda_acc <- round(as.numeric(predict.lda_result$overall['Accuracy']),2)

paste("AUC" , lda_auc, sep = ": ")
## [1] "AUC: 0.5"
paste("TPR" , lda_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , lda_fpr, sep = ": ")
## [1] "FPR: 1"
paste("Accuracy" , lda_acc, sep = ": ")
## [1] "Accuracy: 0.57"
ClassiferMetrics <- rbind(ClassiferMetrics, c("LDA", lda_auc , lda_tpr , lda_fpr , lda_acc), stringsAsFactors = FALSE)
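
Similarly, predict() on an lda fit also returns per-class posterior probabilities, which give a finer-grained ROC than the hard class labels used above; a minimal sketch (lda_post and roc_lda_post are illustrative names):

lda_post <- lda_pred_class$posterior[, "1"]   # posterior probability of class "1"
roc_lda_post <- roc(test_set$label_n, lda_post)
roc_lda_post$auc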

Decision Tree

library(rpart)
tree <- rpart(label_n ~ X + Y_n, data=training_set)  # numeric target, so rpart fits a regression tree by default
tree_predicted_prob <- predict(tree, newdata = test_set)
tree_predicted_prob
##         9        13        15        18        26        27        33 
## 0.4545455 0.7222222 0.7222222 0.7222222 0.7222222 0.7222222 0.7222222
tree_predicted_class <- round(tree_predicted_prob)   # rounding is equivalent to a 0.5 threshold
tree_predicted_class
##  9 13 15 18 26 27 33 
##  0  1  1  1  1  1  1
roc <- roc(test_set$label_n, as.numeric(tree_predicted_class))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.6667
plot.roc(roc,
main="Decision Tree | ROC Curve", percent=TRUE, of="thresholds", # compute AUC (of threshold)
thresholds="best", # select the (best) threshold
print.auc = TRUE, 
print.thres="best") 

predict.tree_result <- confusionMatrix(as.factor(tree_predicted_class), as.factor(test_set$label_n), positive = "1")
predict.tree_result$table
##           Reference
## Prediction 0 1
##          0 1 0
##          1 2 4
tree_auc <- round(as.numeric(roc$auc),2)
tree_tpr <- round(as.numeric(predict.tree_result$byClass['Sensitivity']),2)
tree_fpr <- round(as.numeric(1- predict.tree_result$byClass['Specificity']),2)
tree_acc <- round(as.numeric(predict.tree_result$overall['Accuracy']),2)

paste("AUC" , tree_auc, sep = ": ")
## [1] "AUC: 0.67"
paste("TPR" , tree_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , tree_fpr, sep = ": ")
## [1] "FPR: 0.67"
paste("Accuracy" , tree_acc, sep = ": ")
## [1] "Accuracy: 0.71"
ClassiferMetrics <- rbind(ClassiferMetrics, c("Decision Tree", tree_auc , tree_tpr , tree_fpr , tree_acc), stringsAsFactors = FALSE)
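
Because label_n is numeric, the rpart() call above fits a regression tree; an explicitly classification-flavored alternative is sketched below (tree_cls and tree_cls_prob are illustrative names), which returns per-class probabilities directly:

# fit a classification tree on the factor label so rpart models class
# membership and can return per-class probabilities
tree_cls <- rpart(factor(label_n) ~ X + Y_n, data = training_set, method = "class")
tree_cls_prob <- predict(tree_cls, newdata = test_set, type = "prob")[, "1"]
roc(test_set$label_n, tree_cls_prob)$auc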

SVM

# RBF-kernel SVM on the two numeric predictors (cost = 5, no internal scaling)
data.svm <- svm(factor(label_n) ~ X + Y_n, data = training_set, scale = FALSE, kernel = "radial", cost = 5)
svm_predicted_class<- predict(data.svm,newdata = test_set)


roc <- roc(test_set$label_n, as.numeric(svm_predicted_class))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.8333
plot.roc(roc,
         main = "SVM - RBF Kernel | ROC Curve", percent = TRUE, of = "thresholds",
         thresholds = "best",   # select the best threshold
         print.auc = TRUE,
         print.thres = "best")

predict.svm_result <- confusionMatrix(as.factor(svm_predicted_class), as.factor(test_set$label_n), positive = "1")
predict.svm_result$table
##           Reference
## Prediction 0 1
##          0 2 0
##          1 1 4
svm_auc <- round(as.numeric(roc$auc),2)
svm_tpr <- round(as.numeric(predict.svm_result$byClass['Sensitivity']),2)
svm_fpr <- round(as.numeric(1- predict.svm_result$byClass['Specificity']),2)
svm_acc <- round(as.numeric(predict.svm_result$overall['Accuracy']),2)

paste("AUC" , svm_auc, sep = ": ")
## [1] "AUC: 0.83"
paste("TPR" , svm_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , svm_fpr, sep = ": ")
## [1] "FPR: 0.33"
paste("Accuracy" , svm_acc, sep = ": ")
## [1] "Accuracy: 0.86"
ClassiferMetrics <- rbind(ClassiferMetrics, c("SVM", svm_auc , svm_tpr , svm_fpr , svm_acc), stringsAsFactors = FALSE)
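
As with the other classifiers, the SVM ROC above scores the hard class predictions; e1071 can also return the signed decision values, which trace a smoother ROC. A minimal sketch (svm_pred_dv and svm_dec are illustrative names):

# request decision values alongside the class predictions and use them as
# the ROC score (pROC's direction setting handles the sign/orientation)
svm_pred_dv <- predict(data.svm, newdata = test_set, decision.values = TRUE)
svm_dec <- as.numeric(attr(svm_pred_dv, "decision.values"))
roc(test_set$label_n, svm_dec)$auc
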
library(knitr)

metrics <- c("Classifier Algo", "AUC", "TPR", "FPR", "Accuracy")
colnames(ClassiferMetrics) <- metrics

kable(ClassiferMetrics)
Classifier Algo       AUC    TPR   FPR    Accuracy
KNN                   0.83   1     0.33   0.86
Naive Bayes           0.5    1     1      0.57
Logistic Regression   0.5    1     1      0.57
LDA                   0.5    1     1      0.57
Decision Tree         0.67   1     0.67   0.71
SVM                   0.83   1     0.33   0.86