Load the data and recode the categorical variables as numeric values
require(stringr)
## Loading required package: stringr
df <- read.csv("C:\\Users\\Charls\\Documents\\CunyMSDS\\Data622\\Assignments\\HW2\\dataset.csv")
#df$Y_1 <- as.integer(as.factor(df$Y))
df$Y_n <- sapply(df$Y, function(x) {
  switch(as.character(x), "a" = 1, "b" = 2, "c" = 3, "d" = 4, "e" = 5, "f" = 6)
})
df$label_n <- ifelse(as.character(df$label) == "BLACK", 1, 0)
table(df$label_n) # class balance of the binary label
##
## 0 1
## 14 22
library(caTools)
set.seed(123)
split = sample.split(df$label_n, SplitRatio = 0.8)
training_set = subset(df, split == TRUE)
test_set = subset(df, split == FALSE)
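As a quick sanity check (not part of the original workflow; baseline_acc is just an illustrative name), the majority-class accuracy on the held-out rows gives a reference point for the accuracies reported below: the 7-row test set holds 3 rows of class 0 and 4 of class 1, so always predicting the majority class already scores about 0.57.
# Baseline: accuracy of always predicting the test set's majority class
baseline_acc <- max(table(test_set$label_n)) / nrow(test_set)
baseline_acc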
ClassifierMetrics <- data.frame(matrix(ncol = 5, nrow = 0), stringsAsFactors = FALSE)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(class) # knn
library(caret) # confusionMatrix
## Loading required package: lattice
## Loading required package: ggplot2
trainData1 = training_set[, c(1, 4)] # feature columns (X and Y_n)
testData1 = test_set[, c(1, 4)]
train_lbls <- training_set$label_n
test_lbls <- test_set$label_n
knn_model <- knn(train = trainData1, test = testData1, cl = train_lbls, k = 3, prob = TRUE)
roc <- roc(test_set$label_n, attributes(knn_model)$prob) # 'prob' holds the vote share for each point's predicted class
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.8333
plot.roc(roc,
main="KNN Classifer | ROC Curve", percent=TRUE, of="thresholds", # compute AUC (of threshold)
thresholds="best", # select the (best) threshold
print.auc = TRUE,
print.thres="best")
predict.knn_result <- confusionMatrix(knn_model, as.factor(test_set$label_n), positive = "1")
predict.knn_result$table
## Reference
## Prediction 0 1
## 0 2 0
## 1 1 4
knn_auc <- round(as.numeric(roc$auc),2)
knn_tpr <- round(as.numeric(predict.knn_result$byClass['Sensitivity']),2)
knn_fpr <- round(as.numeric(1- predict.knn_result$byClass['Specificity']),2)
knn_acc <- round(as.numeric(predict.knn_result$overall['Accuracy']),2)
paste("AUC" , knn_auc, sep = ": ")
## [1] "AUC: 0.83"
paste("TPR" , knn_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , knn_fpr, sep = ": ")
## [1] "FPR: 0.33"
paste("Accuracy" , knn_acc, sep = ": ")
## [1] "Accuracy: 0.86"
ClassifierMetrics <- rbind(ClassifierMetrics, c("KNN", knn_auc, knn_tpr, knn_fpr, knn_acc), stringsAsFactors = FALSE)
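One caveat on the KNN ROC above: with class::knn(..., prob = TRUE), the prob attribute is the proportion of neighbour votes for whichever class was predicted, not the probability of class 1, so rows predicted as 0 enter the ROC with the wrong orientation. A minimal sketch of the conversion, reusing the objects above (knn_p1 and roc_knn are just illustrative names):
knn_votes <- attr(knn_model, "prob") # vote share for the predicted class
knn_p1 <- ifelse(knn_model == "1", knn_votes, 1 - knn_votes) # flip when the predicted class is 0
roc_knn <- roc(test_set$label_n, knn_p1) # ROC on P(class = 1)
roc_knn$auc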
library(e1071)
nb_model <- naiveBayes(as.factor(label_n) ~ X + Y_n, data = training_set)
nb_pred_prob <- predict(nb_model, newdata = test_set, type = "raw")
nb_pred_class <- predict(nb_model, newdata = test_set, type = "class")
print(nb_pred_prob[, 2])
## [1] 0.5999947 0.5825399 0.6438226 0.7253454 0.6095118 0.6396973 0.6198102
roc <- roc(test_set$label_n, nb_pred_prob[, 2])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.5
plot.roc(roc,
main="Naive Bayes | ROC Curve", percent=TRUE, of="thresholds", # compute AUC (of threshold)
thresholds="best", # select the (best) threshold
print.auc = TRUE,
print.thres="best")
predict.nb_result <- confusionMatrix(nb_pred_class, as.factor(test_set$label_n), positive = "1")
predict.nb_result$table
## Reference
## Prediction 0 1
## 0 0 0
## 1 3 4
nb_auc <- round(as.numeric(roc$auc),2)
nb_tpr <- round(as.numeric(predict.nb_result$byClass['Sensitivity']),2)
nb_fpr <- round(as.numeric(1- predict.nb_result$byClass['Specificity']),2)
nb_acc <- round(as.numeric(predict.nb_result$overall['Accuracy']),2)
paste("AUC" , nb_auc, sep = ": ")
## [1] "AUC: 0.5"
paste("TPR" , nb_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , nb_fpr, sep = ": ")
## [1] "FPR: 1"
paste("Accuracy" , nb_acc, sep = ": ")
## [1] "Accuracy: 0.57"
ClassifierMetrics <- rbind(ClassifierMetrics, c("Naive Bayes", nb_auc, nb_tpr, nb_fpr, nb_acc), stringsAsFactors = FALSE)
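The AUC of 0.5 means the Naive Bayes probabilities rank the test rows no better than chance, even though every class-1 posterior printed above exceeds 0.5 (which is why all seven rows fall in the predicted-1 row of the confusion matrix). One quick way to see this, reusing the objects above, is to compare the average predicted probability within each true class:
# With real separation, the true-1 rows would show a clearly higher mean probability
tapply(nb_pred_prob[, 2], test_set$label_n, mean)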
log_classifier <- glm(label_n ~ X + Y_n, data = training_set, family = "binomial")
lr_pred_prob <- predict(log_classifier, newdata = test_set, type = "response")
# use 0.5 as the classification threshold for the logistic regression classifier
lr_pred_class <- ifelse(lr_pred_prob > 0.5, 1, 0)
roc <- roc(test_set$label_n, lr_pred_class)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.5
plot.roc(roc,
main="Logistic Regression | ROC Curve", percent=TRUE, of="thresholds", # compute AUC (of threshold)
thresholds="best", # select the (best) threshold
print.auc = TRUE,
print.thres="best")
predict.lr_result <- confusionMatrix(as.factor(lr_pred_class), as.factor(test_set$label_n), positive = "1")
## Warning in confusionMatrix.default(as.factor(lr_pred_class),
## as.factor(test_set$label_n), : Levels are not in the same order for reference
## and data. Refactoring data to match.
predict.lr_result$table
## Reference
## Prediction 0 1
## 0 0 0
## 1 3 4
lr_auc <- round(as.numeric(roc$auc),2)
lr_tpr <- round(as.numeric(predict.lr_result$byClass['Sensitivity']),2)
lr_fpr <- round(as.numeric(1- predict.lr_result$byClass['Specificity']),2)
lr_acc <- round(as.numeric(predict.lr_result$overall['Accuracy']),2)
paste("AUC" , lr_auc, sep = ": ")
## [1] "AUC: 0.5"
paste("TPR" , lr_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , lr_fpr, sep = ": ")
## [1] "FPR: 1"
paste("Accuracy" , lr_acc, sep = ": ")
## [1] "Accuracy: 0.57"
ClassifierMetrics <- rbind(ClassifierMetrics, c("Logistic Regression", lr_auc, lr_tpr, lr_fpr, lr_acc), stringsAsFactors = FALSE)
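The logistic regression ROC above is built from the thresholded 0/1 predictions rather than the fitted probabilities, which collapses the curve to a single operating point. A minimal sketch using lr_pred_prob instead (roc_lr is just an illustrative name):
roc_lr <- roc(test_set$label_n, lr_pred_prob) # ROC from probabilities, not thresholded classes
roc_lr$auc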
library("MASS")
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
lda_classifier <- lda(label_n ~ X + Y_n, data = training_set)
lda_pred_class <- predict(lda_classifier, newdata = test_set)
roc <- roc(test_set$label_n, as.numeric(lda_pred_class$class))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.5
plot.roc(roc,
main="LDA | ROC Curve", percent=TRUE, of="thresholds", # compute AUC (of threshold)
thresholds="best", # select the (best) threshold
print.auc = TRUE,
print.thres="best")
predict.lda_result <- confusionMatrix(lda_pred_class$class, as.factor(test_set$label_n), positive = "1")
predict.lda_result$table
## Reference
## Prediction 0 1
## 0 0 0
## 1 3 4
lda_auc <- round(as.numeric(roc$auc),2)
lda_tpr <- round(as.numeric(predict.lda_result$byClass['Sensitivity']),2)
lda_fpr <- round(as.numeric(1- predict.lda_result$byClass['Specificity']),2)
lda_acc <- round(as.numeric(predict.lda_result$overall['Accuracy']),2)
paste("AUC" , lda_auc, sep = ": ")
## [1] "AUC: 0.5"
paste("TPR" , lda_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , lda_fpr, sep = ": ")
## [1] "FPR: 1"
paste("Accuracy" , lda_acc, sep = ": ")
## [1] "Accuracy: 0.57"
ClassifierMetrics <- rbind(ClassifierMetrics, c("LDA", lda_auc, lda_tpr, lda_fpr, lda_acc), stringsAsFactors = FALSE)
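The LDA ROC is likewise computed from the predicted classes, but predict() on an lda fit also returns posterior probabilities. A minimal sketch using them, assuming the posterior column for class 1 is labelled "1" (the usual MASS::lda naming for a 0/1 grouping; lda_p1 and roc_lda are illustrative names):
lda_p1 <- lda_pred_class$posterior[, "1"] # posterior probability of class 1
roc_lda <- roc(test_set$label_n, lda_p1)
roc_lda$auc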
library(rpart)
tree <- rpart(label_n ~ X + Y_n, data = training_set) # numeric response, so rpart fits a regression tree
tree_predicted_prob <- predict(tree, newdata = test_set)
tree_predicted_prob
## 9 13 15 18 26 27 33
## 0.4545455 0.7222222 0.7222222 0.7222222 0.7222222 0.7222222 0.7222222
tree_predicted_class<-round(tree_predicted_prob)
tree_predicted_class
## 9 13 15 18 26 27 33
## 0 1 1 1 1 1 1
roc <- roc(test_set$label_n, as.numeric(tree_predicted_class))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.6667
plot.roc(roc,
main="Decision Tree | ROC Curve", percent=TRUE, of="thresholds", # compute AUC (of threshold)
thresholds="best", # select the (best) threshold
print.auc = TRUE,
print.thres="best")
predict.tree_result <- confusionMatrix(as.factor(tree_predicted_class), as.factor(test_set$label_n), positive = "1")
predict.tree_result$table
## Reference
## Prediction 0 1
## 0 1 0
## 1 2 4
tree_auc <- round(as.numeric(roc$auc),2)
tree_tpr <- round(as.numeric(predict.tree_result$byClass['Sensitivity']),2)
tree_fpr <- round(as.numeric(1- predict.tree_result$byClass['Specificity']),2)
tree_acc <- round(as.numeric(predict.tree_result$overall['Accuracy']),2)
paste("AUC" , tree_auc, sep = ": ")
## [1] "AUC: 0.67"
paste("TPR" , tree_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , tree_fpr, sep = ": ")
## [1] "FPR: 0.67"
paste("Accuracy" , tree_acc, sep = ": ")
## [1] "Accuracy: 0.71"
ClassifierMetrics <- rbind(ClassifierMetrics, c("Decision Tree", tree_auc, tree_tpr, tree_fpr, tree_acc), stringsAsFactors = FALSE)
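Since label_n is numeric, the rpart fit above is a regression tree and tree_predicted_prob already holds continuous fitted values in [0, 1]; passing those to roc() directly, instead of the rounded classes, keeps a threshold-free view of the tree. A minimal sketch (roc_tree is an illustrative name):
roc_tree <- roc(test_set$label_n, tree_predicted_prob) # use the fitted values as scores
roc_tree$auc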
data.svm <- svm(factor(label_n) ~ X + Y_n, data = training_set, scale = FALSE, kernel = "radial", cost = 5)
svm_predicted_class <- predict(data.svm, newdata = test_set)
roc <- roc(test_set$label_n, as.numeric(svm_predicted_class))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc$auc
## Area under the curve: 0.8333
plot.roc(roc,
main="SVM - RBS Kernal | ROC Curve", percent=TRUE, of="thresholds", # compute AUC (of threshold)
thresholds="best", # select the (best) threshold
print.auc = TRUE,
print.thres="best")
predict.svm_result <- confusionMatrix(as.factor(svm_predicted_class), as.factor(test_set$label_n), positive = "1")
predict.svm_result$table
## Reference
## Prediction 0 1
## 0 2 0
## 1 1 4
svm_auc <- round(as.numeric(roc$auc),2)
svm_tpr <- round(as.numeric(predict.svm_result$byClass['Sensitivity']),2)
svm_fpr <- round(as.numeric(1- predict.svm_result$byClass['Specificity']),2)
svm_acc <- round(as.numeric(predict.svm_result$overall['Accuracy']),2)
paste("AUC" , svm_auc, sep = ": ")
## [1] "AUC: 0.83"
paste("TPR" , svm_tpr, sep = ": ")
## [1] "TPR: 1"
paste("FPR" , svm_fpr, sep = ": ")
## [1] "FPR: 0.33"
paste("Accuracy" , svm_acc, sep = ": ")
## [1] "Accuracy: 0.86"
ClassifierMetrics <- rbind(ClassifierMetrics, c("SVM", svm_auc, svm_tpr, svm_fpr, svm_acc), stringsAsFactors = FALSE)
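e1071::svm returns only class labels unless it is fitted with probability = TRUE, so the SVM ROC above is again built from hard predictions. A hedged sketch of the probability-based alternative, refitting with the same formula and cost and reading the class-1 column of the probabilities attribute (svm_prob_model, svm_p1 and roc_svm are illustrative names):
svm_prob_model <- svm(factor(label_n) ~ X + Y_n, data = training_set,
                      scale = FALSE, kernel = "radial", cost = 5, probability = TRUE)
svm_pred <- predict(svm_prob_model, newdata = test_set, probability = TRUE)
svm_p1 <- attr(svm_pred, "probabilities")[, "1"] # estimated P(class = 1)
roc_svm <- roc(test_set$label_n, svm_p1)
roc_svm$auc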
library(knitr)
metrics <- c("Classifier Algo", "AUC", "TPR", "FPR", "Accuracy")
colnames(ClassifierMetrics) <- metrics
kable(ClassifierMetrics)
| Classifier Algo | AUC | TPR | FPR | Accuracy |
|---|---|---|---|---|
| KNN | 0.83 | 1 | 0.33 | 0.86 |
| Naive Bayes | 0.5 | 1 | 1 | 0.57 |
| Logistic Regression | 0.5 | 1 | 1 | 0.57 |
| LDA | 0.5 | 1 | 1 | 0.57 |
| Decision Tree | 0.67 | 1 | 0.67 | 0.71 |
| SVM | 0.83 | 1 | 0.33 | 0.86 |