Load the data

Load the data and encode the categorical columns numerically: the six categories of Y are mapped to the integers 1 through 6, and the label is mapped to a 0/1 indicator.

require(stringr)
## Loading required package: stringr
df <- read.csv("C:\\Users\\Charls\\Documents\\CunyMSDS\\Data622\\Assignments\\HW2\\dataset.csv")

#df$Y_1 <- as.integer(as.factor(df$Y))

# map the six letter categories of Y to integer codes 1 through 6
df$Y_n <- sapply(df$Y, function(x) {
  switch(as.character(x), "a" = 1, "b" = 2, "c" = 3, "d" = 4, "e" = 5, "f" = 6)
})

df$label_n <- ifelse(as.character(df$label) == "BLACK", 1, 0)  # binary label: BLACK = 1, otherwise 0
table(df[,5])  # class balance of the encoded label (label_n)
## 
##  0  1 
## 14 22
library(caTools)

set.seed(123)
split = sample.split(df$label_n, SplitRatio = 0.8)  # stratified 80/20 train/test split on the label
training_set = subset(df, split == TRUE)
test_set = subset(df, split == FALSE)

# empty frame to collect AUC, TPR, FPR and accuracy for each classifier
ClassiferMetrics <- data.frame(matrix(ncol = 5, nrow = 0), stringsAsFactors = FALSE)
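
As a side note, since the Y categories are just the letters a through f, the same integer codes can be produced more compactly with match(); the sketch below is an assumed-equivalent alternative (Y_n_alt is a hypothetical column used only for the comparison).

# Assumed equivalent to the switch() mapping above: the position of each letter
# in "a".."f" gives the same 1..6 code (NA for anything outside that set).
df$Y_n_alt <- match(as.character(df$Y), c("a", "b", "c", "d", "e", "f"))

# quick check that both encodings agree
all(df$Y_n_alt == df$Y_n, na.rm = TRUE)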

KNN Model

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(class)  #knn
library(caret)   # confusion metric
## Loading required package: lattice
## Loading required package: ggplot2
trainData1 = training_set[,c(1,4)]  # predictors: X (column 1) and the encoded Y_n (column 4)
testData1 = test_set[,c(1,4)]

train_lbls <- training_set$label_n
test_lbls <- test_set$label_n

knn_model <- knn(train = trainData1, test = testData1, cl = train_lbls, k = 3, prob = TRUE)  # 3-NN; keep the winning-class vote proportions

roc <- roc(test_set$label_n, attributes(knn_model)$prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.8333
plot.roc(roc,
         main = "KNN Classifier | ROC Curve", percent = TRUE, of = "thresholds",
         thresholds = "best",   # select the best threshold
         print.auc = TRUE,
         print.thres = "best")

predict.knn_result <- confusionMatrix(knn_model, as.factor(test_set$label_n), positive = "1")
predict.knn_result$table
##           Reference
## Prediction 0 1
##          0 2 0
##          1 1 4
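
The sensitivity, false-positive rate, and accuracy extracted below can be read directly off this confusion matrix (4 true positives, 0 false negatives, 1 false positive, 2 true negatives); a quick worked check:

tp <- 4; fn <- 0; fp <- 1; tn <- 2   # counts from the confusion matrix above
tp / (tp + fn)                       # TPR (sensitivity) = 1
fp / (fp + tn)                       # FPR = 1 - specificity ~ 0.33
(tp + tn) / (tp + tn + fp + fn)      # accuracy ~ 0.86
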
knn_auc <- round(as.numeric(roc$auc),2)
knn_tpr <- round(as.numeric(predict.knn_result$byClass['Sensitivity']),2)
knn_fpr <- round(as.numeric(1- predict.knn_result$byClass['Specificity']),2)
knn_acc <- round(as.numeric(predict.knn_result$overall['Accuracy']),2)

paste("AUC" , knn_auc, sep = ": ")
## [1] "AUC: 0.83"
paste("TPR" , knn_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , knn_fpr, sep = ": ")
## [1] "FPR: 0.33"
paste("Accuracy" , knn_acc, sep = ": ")
## [1] "Accuracy: 0.86"
ClassiferMetrics <- rbind(ClassiferMetrics, c("KNN", knn_auc , knn_tpr , knn_fpr , knn_acc), stringsAsFactors = FALSE)
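
One caveat: the "prob" attribute returned by class::knn() is the vote share of the winning class, not the probability of the positive class, so using it directly in roc() can distort the curve. A minimal sketch of the conversion, assuming the same objects as above (knn_prob_pos and roc_knn are illustrative names):

knn_prob <- attributes(knn_model)$prob                             # vote share of the predicted class
knn_prob_pos <- ifelse(knn_model == "1", knn_prob, 1 - knn_prob)   # flip to P(class = "1")
roc_knn <- roc(test_set$label_n, knn_prob_pos)
roc_knn$auc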

Naive Bayes

library(e1071)

# Gaussian naive Bayes on the two numeric predictors
nb_model = naiveBayes(as.factor(label_n) ~ X + Y_n, data = training_set)

nm_pred_prob = predict(nb_model, newdata = test_set, type = "raw")     # posterior class probabilities
nm_pred_class = predict(nb_model, newdata = test_set, type = "class")  # hard class predictions

print(nm_pred_prob[,2])
## [1] 0.5999947 0.5825399 0.6438226 0.7253454 0.6095118 0.6396973 0.6198102
roc <- roc(test_set$label_n, nm_pred_prob[,2])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.5
plot.roc(roc,
main="Naive Bayes | ROC Curve", percent=TRUE, of="thresholds", # compute AUC (of threshold)
thresholds="best", # select the (best) threshold
print.auc = TRUE, 
print.thres="best") 

predict.nb_result <- confusionMatrix(nm_pred_class, as.factor(test_set$label_n), positive = "1")
predict.nb_result$table
##           Reference
## Prediction 0 1
##          0 0 0
##          1 3 4
nb_auc <- round(as.numeric(roc$auc),2)
nb_tpr <- round(as.numeric(predict.nb_result$byClass['Sensitivity']),2)
nb_fpr <- round(as.numeric(1- predict.nb_result$byClass['Specificity']),2)
nb_acc <- round(as.numeric(predict.nb_result$overall['Accuracy']),2)

paste("AUC" , nb_auc, sep = ": ")
## [1] "AUC: 0.5"
paste("TPR" , nb_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , nb_fpr, sep = ": ")
## [1] "FPR: 1"
paste("Accuracy" , nb_acc, sep = ": ")
## [1] "Accuracy: 0.57"
ClassiferMetrics <- rbind(ClassiferMetrics, c("Naive Bayes", nb_auc , nb_tpr , nb_fpr , nb_acc), stringsAsFactors = FALSE)
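
For numeric predictors, e1071's naiveBayes() fits a per-class Gaussian to each feature; as a quick sanity check, the fitted class-conditional means and standard deviations can be inspected from the model object (no new data assumed):

# each element of nb_model$tables is a matrix with one row per class and
# columns for the conditional mean and standard deviation of that predictor
nb_model$tables$X
nb_model$tables$Y_n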

Logistic Regression

log_classifer <- glm(label_n ~ X + Y_n, data=training_set, family = "binomial")
lr_pred_prob = predict(log_classifer, newdata = test_set, type = "response")
# classify with a probability threshold of 0.5 for the logistic regression classifier
lr_pred_class <- ifelse(lr_pred_prob > 0.5, 1, 0)

roc <- roc(test_set$label_n, lr_pred_class)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.5
plot.roc(roc,
main="Logistic Regression | ROC Curve", percent=TRUE, of="thresholds", # compute AUC (of threshold)
thresholds="best", # select the (best) threshold
print.auc = TRUE, 
print.thres="best") 

predict.lr_result <- confusionMatrix(as.factor(lr_pred_class), as.factor(test_set$label_n), positive = "1")
## Warning in confusionMatrix.default(as.factor(lr_pred_class),
## as.factor(test_set$label_n), : Levels are not in the same order for reference
## and data. Refactoring data to match.
predict.lr_result$table
##           Reference
## Prediction 0 1
##          0 0 0
##          1 3 4
lr_auc <- round(as.numeric(roc$auc),2)
lr_tpr <- round(as.numeric(predict.lr_result$byClass['Sensitivity']),2)
lr_fpr <- round(as.numeric(1- predict.lr_result$byClass['Specificity']),2)
lr_acc <- round(as.numeric(predict.lr_result$overall['Accuracy']),2)

paste("AUC" , lr_auc, sep = ": ")
## [1] "AUC: 0.5"
paste("TPR" , lr_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , lr_fpr, sep = ": ")
## [1] "FPR: 1"
paste("Accuracy" , lr_acc, sep = ": ")
## [1] "Accuracy: 0.57"
ClassiferMetrics <- rbind(ClassiferMetrics, c("Logistic Regression", lr_auc , lr_tpr , lr_fpr , lr_acc), stringsAsFactors = FALSE)
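
Note that the ROC above is built from the hard 0/1 predictions, which reduces it to a single operating point; a sketch of the same curve computed from the fitted probabilities instead (roc_lr_prob is just an illustrative name):

# score the ROC with the predicted probabilities so the curve is traced
# over all thresholds rather than only the 0.5 cutoff
roc_lr_prob <- roc(test_set$label_n, lr_pred_prob)
roc_lr_prob$auc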

LDA

library("MASS")
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
lda_classifer <- lda(label_n ~ X + Y_n, data=training_set)
lda_pred_class=predict(lda_classifer,newdata = test_set)

roc <- roc(test_set$label_n, as.numeric(lda_pred_class$class))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.5
plot.roc(roc,
main="LDA | ROC Curve", percent=TRUE, of="thresholds", # compute AUC (of threshold)
thresholds="best", # select the (best) threshold
print.auc = TRUE, 
print.thres="best") 

predict.lda_result <- confusionMatrix(lda_pred_class$class, as.factor(test_set$label_n), positive = "1")
predict.lda_result$table
##           Reference
## Prediction 0 1
##          0 0 0
##          1 3 4
lda_auc <- round(as.numeric(roc$auc),2)
lda_tpr <- round(as.numeric(predict.lda_result$byClass['Sensitivity']),2)
lda_fpr <- round(as.numeric(1- predict.lda_result$byClass['Specificity']),2)
lda_acc <- round(as.numeric(predict.lda_result$overall['Accuracy']),2)

paste("AUC" , lda_auc, sep = ": ")
## [1] "AUC: 0.5"
paste("TPR" , lda_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , lda_fpr, sep = ": ")
## [1] "FPR: 1"
paste("Accuracy" , lda_acc, sep = ": ")
## [1] "Accuracy: 0.57"
ClassiferMetrics <- rbind(ClassiferMetrics, c("LDA", lda_auc , lda_tpr , lda_fpr , lda_acc), stringsAsFactors = FALSE)
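
Similarly, predict() on an lda fit also returns per-class posterior probabilities, which give a finer-grained ROC than the hard class labels used above; a minimal sketch (lda_post and roc_lda_post are illustrative names):

lda_post <- lda_pred_class$posterior[, "1"]   # posterior probability of class "1"
roc_lda_post <- roc(test_set$label_n, lda_post)
roc_lda_post$auc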

Decision Tree

library(rpart)
tree <- rpart(label_n ~ X + Y_n, data=training_set)  # numeric target, so rpart fits a regression tree by default
tree_predicted_prob <- predict(tree, newdata = test_set)
tree_predicted_prob
##         9        13        15        18        26        27        33 
## 0.4545455 0.7222222 0.7222222 0.7222222 0.7222222 0.7222222 0.7222222
tree_predicted_class <- round(tree_predicted_prob)   # rounding is equivalent to a 0.5 threshold
tree_predicted_class
##  9 13 15 18 26 27 33 
##  0  1  1  1  1  1  1
roc <- roc(test_set$label_n, as.numeric(tree_predicted_class))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.6667
plot.roc(roc,
main="Decision Tree | ROC Curve", percent=TRUE, of="thresholds", # compute AUC (of threshold)
thresholds="best", # select the (best) threshold
print.auc = TRUE, 
print.thres="best") 

predict.tree_result <- confusionMatrix(as.factor(tree_predicted_class), as.factor(test_set$label_n), positive = "1")
predict.tree_result$table
##           Reference
## Prediction 0 1
##          0 1 0
##          1 2 4
tree_auc <- round(as.numeric(roc$auc),2)
tree_tpr <- round(as.numeric(predict.tree_result$byClass['Sensitivity']),2)
tree_fpr <- round(as.numeric(1- predict.tree_result$byClass['Specificity']),2)
tree_acc <- round(as.numeric(predict.tree_result$overall['Accuracy']),2)

paste("AUC" , tree_auc, sep = ": ")
## [1] "AUC: 0.67"
paste("TPR" , tree_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , tree_fpr, sep = ": ")
## [1] "FPR: 0.67"
paste("Accuracy" , tree_acc, sep = ": ")
## [1] "Accuracy: 0.71"
ClassiferMetrics <- rbind(ClassiferMetrics, c("Decision Tree", tree_auc , tree_tpr , tree_fpr , tree_acc), stringsAsFactors = FALSE)
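
Because label_n is numeric, the rpart() call above fits a regression tree; an explicitly classification-flavored alternative is sketched below (tree_cls and tree_cls_prob are illustrative names), which returns per-class probabilities directly:

# fit a classification tree on the factor label so rpart models class
# membership and can return per-class probabilities
tree_cls <- rpart(factor(label_n) ~ X + Y_n, data = training_set, method = "class")
tree_cls_prob <- predict(tree_cls, newdata = test_set, type = "prob")[, "1"]
roc(test_set$label_n, tree_cls_prob)$auc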

SVM

# RBF-kernel SVM on the two numeric predictors (cost = 5, no internal scaling)
data.svm <- svm(factor(label_n) ~ X + Y_n, data = training_set, scale = FALSE, kernel = "radial", cost = 5)
svm_predicted_class<- predict(data.svm,newdata = test_set)


roc <- roc(test_set$label_n, as.numeric(svm_predicted_class))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.8333
plot.roc(roc,
         main = "SVM - RBF Kernel | ROC Curve", percent = TRUE, of = "thresholds",
         thresholds = "best",   # select the best threshold
         print.auc = TRUE,
         print.thres = "best")

predict.svm_result <- confusionMatrix(as.factor(svm_predicted_class), as.factor(test_set$label_n), positive = "1")
predict.svm_result$table
##           Reference
## Prediction 0 1
##          0 2 0
##          1 1 4
svm_auc <- round(as.numeric(roc$auc),2)
svm_tpr <- round(as.numeric(predict.svm_result$byClass['Sensitivity']),2)
svm_fpr <- round(as.numeric(1- predict.svm_result$byClass['Specificity']),2)
svm_acc <- round(as.numeric(predict.svm_result$overall['Accuracy']),2)

paste("AUC" , svm_auc, sep = ": ")
## [1] "AUC: 0.83"
paste("TPR" , svm_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , svm_fpr, sep = ": ")
## [1] "FPR: 0.33"
paste("Accuracy" , svm_acc, sep = ": ")
## [1] "Accuracy: 0.86"
ClassiferMetrics <- rbind(ClassiferMetrics, c("SVM", svm_auc , svm_tpr , svm_fpr , svm_acc), stringsAsFactors = FALSE)
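
As with the other classifiers, the SVM ROC above scores the hard class predictions; e1071 can also return the signed decision values, which trace a smoother ROC. A minimal sketch (svm_pred_dv and svm_dec are illustrative names):

# request decision values alongside the class predictions and use them as
# the ROC score (pROC's direction setting handles the sign/orientation)
svm_pred_dv <- predict(data.svm, newdata = test_set, decision.values = TRUE)
svm_dec <- as.numeric(attr(svm_pred_dv, "decision.values"))
roc(test_set$label_n, svm_dec)$auc
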
library(knitr)

metrics <- c("Classifier Algo", "AUC", "TPR", "FPR", "Accuracy")
colnames(ClassiferMetrics) <- metrics

kable(ClassiferMetrics)
Classifier Algo       AUC    TPR   FPR    Accuracy
KNN                   0.83   1     0.33   0.86
Naive Bayes           0.5    1     1      0.57
Logistic Regression   0.5    1     1      0.57
LDA                   0.5    1     1      0.57
Decision Tree         0.67   1     0.67   0.71
SVM                   0.83   1     0.33   0.86