library(kableExtra)
library(ggplot2)

library(e1071)
library(MASS)
library(caret)
library(naivebayes) 
library(C50)

library(partykit)
library(pROC)

Import Data

data<-read.csv("dataset.csv")
data %>% kable() %>% kable_styling() %>% scroll_box(width = "800px", height = "400px")
X Y label
5 a BLUE
5 b BLACK
5 c BLUE
5 d BLACK
5 e BLACK
5 f BLACK
19 a BLUE
19 b BLUE
19 c BLUE
19 d BLUE
19 e BLACK
19 f BLUE
35 a BLACK
35 b BLACK
35 c BLUE
35 d BLACK
35 e BLACK
35 f BLACK
51 a BLACK
51 b BLACK
51 c BLUE
51 d BLACK
51 e BLACK
51 f BLACK
55 a BLACK
55 b BLACK
55 c BLACK
55 d BLACK
55 e BLACK
55 f BLACK
63 a BLACK
63 b BLUE
63 c BLUE
63 d BLUE
63 e BLUE
63 f BLUE
ggplot(data,aes(y=Y,x=X,color=as.factor(label))) + geom_point()

Split data

set.seed(300)
#Splitting the data into training and test sets using createDataPartition() from caret
indxTrain <- createDataPartition(y = data$label,p = 0.75,list = FALSE)
data_train <- data[indxTrain,]
data_test <- data[-indxTrain,]

#Checking the class distribution in the training partition
prop.table(table(data_train$label)) * 100
## 
##    BLACK     BLUE 
## 60.71429 39.28571
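To double-check the stratified split, the same proportions can be computed for the full dataset and the test partition; a minimal sketch (output not shown in the original run):

prop.table(table(data$label)) * 100
prop.table(table(data_test$label)) * 100
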
ggplot(data_train,aes(y=Y,x=X,color=as.factor(label))) + geom_point()

ggplot(data_test,aes(y=Y,x=X,color=as.factor(label))) + geom_point()

Run Models

kNN

To determine k we use a rule of thumb: k is the square root of the number of training samples.

# sqrt(28) rounds to k = 5
k=round(sqrt(nrow(data_train)),0)

We use the class library to fit the model. class::knn() does not accept non-numeric features, so we transform feature Y into integer character codes.

data_train_knn<-data_train
data_train_knn$Y<-apply(as.data.frame(data_train_knn$Y),1,utf8ToInt)
data_test_knn<-data_test
data_test_knn$Y<-apply(as.data.frame(data_test_knn$Y),1,utf8ToInt)
knn_model <- class::knn(cl = data_train$label,
                  test = data_test_knn[,1:2],
                  train = data_train_knn[,1:2],
                  k = k,  # rule-of-thumb value computed above (k = 5)
                  prob = TRUE)
ggplot(data_test,aes(y=Y,x=X,color=as.factor(knn_model))) + geom_point()

Confusion Matrix

print("BLACK are positives")
## [1] "BLACK are positives"
(confusion_matrix<-table(knn_model,data_test[,3]))
##          
## knn_model BLACK BLUE
##     BLACK     5    0
##     BLUE      0    3

AUC

(knn_auc<-roc(data_test$label, attributes(knn_model)$prob)$auc)
## Setting levels: control = BLACK, case = BLUE
## Setting direction: controls > cases
## Area under the curve: 0.7667
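
Note that class::knn() reports the vote share of the winning class, not of a fixed class. To score a fixed class (here BLACK) before computing the ROC, the attribute can be converted; a minimal sketch, not part of the original run:

# attr "prob" is the winning-class vote share; flip it where BLUE won
prob <- attr(knn_model, "prob")
prob_black <- ifelse(knn_model == "BLACK", prob, 1 - prob)
# roc(data_test$label, prob_black)$auc   # output not shown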

ACC

(knn_acc<-sum(diag(confusion_matrix))/sum(confusion_matrix))
## [1] 1

TPR

knn_tp<-confusion_matrix[1,1]
print(paste("Number of true positives=",knn_tp))
## [1] "Number of true positives= 5"
knn_tpr<-confusion_matrix[1,1]/sum(confusion_matrix[,1])
print(paste("True positive ratio=",knn_tpr))
## [1] "True positive ratio= 1"

FPR

# False positives are BLUE samples predicted as BLACK: column 2 of the matrix
knn_fp<-confusion_matrix[1,2]
print(paste("Number of false positives=",knn_fp))
## [1] "Number of false positives= 0"
knn_fpr<-confusion_matrix[1,2]/sum(confusion_matrix[,2])
print(paste("False positive ratio=",knn_fpr))
## [1] "False positive ratio= 0"

Best K

We can use the caret library to check our choice of k. caret can evaluate several values of k and select the best-performing one.

knn_model<-train(label~.,data=data_train,method="knn",tuneLength=10)
knn_model
## k-Nearest Neighbors 
## 
## 28 samples
##  2 predictor
##  2 classes: 'BLACK', 'BLUE' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 28, 28, 28, 28, 28, 28, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa       
##    5  0.6292378   0.232282563
##    7  0.5731430   0.132921615
##    9  0.5322416   0.048896058
##   11  0.5229425   0.022845761
##   13  0.5515890   0.035252248
##   15  0.5603566   0.020510936
##   17  0.5619728  -0.010601504
##   19  0.5466395  -0.022469636
##   21  0.5139122  -0.005430702
##   23  0.5062758   0.002689412
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
plot(knn_model)

We see that caret selects the same value of k we chose initially.
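
caret's default here is bootstrap resampling, which is noisy with only 28 training samples; repeated cross-validation is a hedged alternative (a sketch, not part of the original run):

ctrl <- trainControl(method = "repeatedcv", number = 5, repeats = 10)
knn_cv <- train(label ~ ., data = data_train, method = "knn",
                tuneLength = 10, trControl = ctrl)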

We can plot the test predictions and confirm that this library yields the same results.

pred_test<-predict(knn_model,newdata = data_test)
ggplot(data_test,aes(y=Y,x=X,color=as.factor(pred_test))) + geom_point()

kNN Results

results<-data.frame(c(knn_auc,knn_acc,knn_tpr,knn_fpr))
rownames(results)<-c('AUC','ACC','TPR','FPR')
colnames(results)<-c('kNN')

Tree

tree_model<-C5.0(label~.,data=data_train)
tree_model
## 
## Call:
## C5.0.formula(formula = label ~ ., data = data_train)
## 
## Classification Tree
## Number of samples: 28 
## Number of predictors: 2 
## 
## Tree size: 1 
## 
## Non-standard options: attempt to group attributes
plot(as.party(tree_model),main = "tree model")
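
The fitted tree has a single leaf (Tree size: 1), so it always predicts the majority class BLACK; this is also why all its predicted probabilities below are identical. One hedged probe is lowering minCases, the minimum number of samples a split must keep in its branches; a sketch, not run in the original analysis:

tree_model_min <- C5.0(label ~ ., data = data_train,
                       control = C5.0Control(minCases = 1))
summary(tree_model_min)   # output not shown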

pred_test<-predict(tree_model,newdata = data_test)
ggplot(data_test,aes(y=Y,x=X,color=as.factor(pred_test))) + geom_point()

AUC

pred_test<-predict(tree_model,newdata = data_test,type="prob")
as.data.frame(pred_test)$BLACK
## [1] 0.6071429 0.6071429 0.6071429 0.6071429 0.6071429 0.6071429 0.6071429
## [8] 0.6071429
(tree_auc<-roc(data_test$label,as.data.frame(pred_test)$BLACK)$auc)
## Setting levels: control = BLACK, case = BLUE
## Setting direction: controls < cases
## Area under the curve: 0.5

ACC

pred_test<-predict(tree_model,newdata = data_test)
(tree_acc<-mean(pred_test == data_test$label))
## [1] 0.625

Confusion Matrix

pred_test<-predict(tree_model,newdata = data_test)
print("BLACK are positives")
## [1] "BLACK are positives"
(confusion_matrix<-table(pred_test,data_test$label))
##          
## pred_test BLACK BLUE
##     BLACK     5    3
##     BLUE      0    0

TPR

print(paste("Number of true positives=",confusion_matrix[1,1]))
## [1] "Number of true positives= 5"
tree_tpr<-confusion_matrix[1,1]/sum(confusion_matrix[,1])
print(paste("True positive ratio=",tree_tpr))
## [1] "True positive ratio= 1"

FPR

print(paste("Number of false positives=",confusion_matrix[1,2]))
## [1] "Number of false positives= 3"
tree_fpr<-confusion_matrix[1,2]/sum(confusion_matrix[,2])
print(paste("False positive ratio=",tree_fpr))
## [1] "False positive ratio= 1"

Tree Results

results<-cbind(results,data.frame(c(tree_auc,tree_acc,tree_tpr,tree_fpr)))
colnames(results)<-c('kNN','Tree')

Naive Bayes

nb_model<-naive_bayes(label~.,data=data_train)
nb_model
## 
## ================================== Naive Bayes ================================== 
##  
##  Call: 
## naive_bayes.formula(formula = label ~ ., data = data_train)
## 
## --------------------------------------------------------------------------------- 
##  
## Laplace smoothing: 0
## 
## --------------------------------------------------------------------------------- 
##  
##  A priori probabilities: 
## 
##     BLACK      BLUE 
## 0.6071429 0.3928571 
## 
## --------------------------------------------------------------------------------- 
##  
##  Tables: 
## 
## --------------------------------------------------------------------------------- 
##  ::: X (Gaussian) 
## --------------------------------------------------------------------------------- 
##       
## X         BLACK     BLUE
##   mean 38.64706 36.81818
##   sd   19.35504 24.33852
## 
## --------------------------------------------------------------------------------- 
##  ::: Y (Categorical) 
## --------------------------------------------------------------------------------- 
##    
## Y        BLACK       BLUE
##   a 0.23529412 0.18181818
##   b 0.17647059 0.09090909
##   c 0.05882353 0.36363636
##   d 0.17647059 0.09090909
##   e 0.23529412 0.09090909
##   f 0.11764706 0.18181818
## 
## ---------------------------------------------------------------------------------
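
The model uses no Laplace smoothing, and with only 28 training samples some categorical counts for Y are very small. Adding smoothing is a hedged tweak worth trying; a sketch, not part of the original run:

nb_smooth <- naive_bayes(label ~ ., data = data_train, laplace = 1)
# predict(nb_smooth, data_test[,1:2])   # output not shown
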
# Drop the label column so predict() only sees the model's features (avoids a warning)
pred_test<-predict(nb_model,data_test[,1:2])
ggplot(data_test,aes(y=Y,x=X,color=as.factor(pred_test))) + geom_point()

AUC

pred_test<-predict(nb_model,data_test[,1:2],type = "prob")
as.data.frame(pred_test)$BLACK
## [1] 0.3947491 0.7465890 0.1971181 0.7879742 0.7772706 0.8231029 0.5377328
## [8] 0.7530158
(nb_auc<-roc(data_test$label,as.data.frame(pred_test)$BLACK)$auc)
## Setting levels: control = BLACK, case = BLUE
## Setting direction: controls > cases
## Area under the curve: 0.7333

ACC

pred_test<-predict(nb_model,data_test[,1:2])
(nb_acc<-mean(pred_test == data_test$label))
## [1] 0.625

Confusion Matrix

print("BLACK are positives")
## [1] "BLACK are positives"
(confusion_matrix<-table(pred_test,data_test$label))
##          
## pred_test BLACK BLUE
##     BLACK     4    2
##     BLUE      1    1

TPR

print(paste("Number of true positives=",confusion_matrix[1,1]))
## [1] "Number of true positives= 4"
nb_tpr<-confusion_matrix[1,1]/sum(confusion_matrix[,1])
print(paste("True positive ratio=",nb_tpr))
## [1] "True positive ratio= 0.8"

FPR

print(paste("Number of false positives=",confusion_matrix[1,2]))
## [1] "Number of false positives= 2"
nb_fpr<-confusion_matrix[1,2]/sum(confusion_matrix[,2])
print(paste("False positive ratio=",nb_fpr))
## [1] "False positive ratio= 0.666666666666667"

Naive Bayes Results

results<-cbind(results,data.frame(c(nb_auc,nb_acc,nb_tpr,nb_fpr)))
colnames(results)<-c('kNN','Tree','NB')

LDA

lda_model<-lda(label~.,data=data_train)
lda_model
## Call:
## lda(label ~ ., data = data_train)
## 
## Prior probabilities of groups:
##     BLACK      BLUE 
## 0.6071429 0.3928571 
## 
## Group means:
##              X         Yb         Yc         Yd         Ye        Yf
## BLACK 38.64706 0.17647059 0.05882353 0.17647059 0.23529412 0.1176471
## BLUE  36.81818 0.09090909 0.36363636 0.09090909 0.09090909 0.1818182
## 
## Coefficients of linear discriminants:
##            LD1
## X  -0.01187782
## Yb -0.40793331
## Yc  2.36282013
## Yd -0.47920022
## Ye -0.70258013
## Yf  0.87525571
pred_test<-predict(lda_model,data_test)$class
ggplot(data_test,aes(y=Y,x=X,color=as.factor(pred_test))) + geom_point()

AUC

pred_test<-predict(lda_model,data_test)  # predict.lda() has no type argument
df<-as.data.frame(pred_test$posterior[,1])  # first column is P(BLACK)
colnames(df)<-c('black')
(lda_auc<-roc(data_test$label,df$black)$auc)
## Setting levels: control = BLACK, case = BLUE
## Setting direction: controls > cases
## Area under the curve: 0.6

ACC

pred_test<-predict(lda_model,data_test)$class
(lda_acc<-mean(pred_test == data_test$label))
## [1] 0.625

Confusion Matrix

print("BLACK are positives")
## [1] "BLACK are positives"
(confusion_matrix<-table(pred_test,data_test$label))
##          
## pred_test BLACK BLUE
##     BLACK     4    2
##     BLUE      1    1

TPR

print(paste("Number of true positives=",confusion_matrix[1,1]))
## [1] "Number of true positives= 4"
lda_tpr<-confusion_matrix[1,1]/sum(confusion_matrix[,1])
print(paste("True positive ratio=",lda_tpr))
## [1] "True positive ratio= 0.8"

FPR

print(paste("Number of false positives=",confusion_matrix[1,2]))
## [1] "Number of false positives= 2"
lda_fpr<-confusion_matrix[1,2]/sum(confusion_matrix[,2])
print(paste("False positive ratio=",lda_fpr))
## [1] "False positive ratio= 0.666666666666667"

LDA Results

results<-cbind(results,data.frame(c(lda_auc,lda_acc,lda_tpr,lda_fpr)))
colnames(results)<-c('kNN','Tree','NB','LDA')

Logistic Regression

lr_model<-glm(label~.,data=data_train,family=binomial(link="logit"))
lr_model
## 
## Call:  glm(formula = label ~ ., family = binomial(link = "logit"), data = data_train)
## 
## Coefficients:
## (Intercept)            X           Yb           Yc           Yd           Ye  
##    -0.22366     -0.01264     -0.40684      2.15833     -0.48190     -0.74622  
##          Yf  
##     0.75470  
## 
## Degrees of Freedom: 27 Total (i.e. Null);  21 Residual
## Null Deviance:       37.52 
## Residual Deviance: 31.84     AIC: 45.84
# Fitted probabilities are P(BLUE), the second level of the label factor
pred_training<-predict(lr_model,type="response")
pred_training<-ifelse(pred_training<0.5,"BLACK","BLUE")

pred_test<-predict(lr_model,type="response",newdata = data_test)
pred_test<-ifelse(pred_test<0.5,"BLACK","BLUE")

ggplot(data_test,aes(y=Y,x=X,color=as.factor(pred_test))) + geom_point()

AUC

# predict() returns P(BLUE); roc() detects the direction automatically
pred_test<-predict(lr_model,type="response",newdata = data_test)
df<-as.data.frame(pred_test)
colnames(df)<-c('blue')
(lr_auc<-roc(data_test$label,df$blue)$auc)
## Setting levels: control = BLACK, case = BLUE
## Setting direction: controls < cases
## Area under the curve: 0.6

Confusion Matrix

pred_test<-predict(lr_model,type="response",newdata = data_test)
pred_test<-ifelse(pred_test<0.5,"BLACK","BLUE")

print("BLACK are positives")
## [1] "BLACK are positives"
(confusion_matrix<-table(pred_test,data_test$label))
##          
## pred_test BLACK BLUE
##     BLACK     4    2
##     BLUE      1    1

ACC

(lr_acc<-sum(diag(confusion_matrix))/sum(confusion_matrix))
## [1] 0.625

TPR

print(paste("Number of true positives=",confusion_matrix[1,1]))
## [1] "Number of true positives= 4"
lr_tpr<-confusion_matrix[1,1]/sum(confusion_matrix[,1])
print(paste("True positive ratio=",lr_tpr))
## [1] "True positive ratio= 0.8"

FPR

print(paste("Number of false positives=",confusion_matrix[1,2]))
## [1] "Number of false positives= 2"
lr_fpr<-confusion_matrix[1,2]/sum(confusion_matrix[,2])
print(paste("False positive ratio=",lr_fpr))
## [1] "False positive ratio= 0.666666666666667"

LR Results

results<-cbind(results,data.frame(c(lr_auc,lr_acc,lr_tpr,lr_fpr)))
colnames(results)<-c('kNN','Tree','NB','LDA','LR')

SVM with RBF Kernel

svm_model<-svm(label ~ ., data = data_train,cost=5, cross=10,type="C-classification",kernel="radial",na.action=na.omit)
svm_model
## 
## Call:
## svm(formula = label ~ ., data = data_train, cost = 5, cross = 10, 
##     type = "C-classification", kernel = "radial", na.action = na.omit)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  5 
## 
## Number of Support Vectors:  21
pred_test<- predict(svm_model, data_test)
ggplot(data_test,aes(y=Y,x=X,color=as.factor(pred_test))) + geom_point()

AUC

# The model was fit without probability estimates, so the ROC below is
# computed from the hard class predictions
test<-table(pred = pred_test, true = data_test[,3])

confusionMatrix(test)
## Confusion Matrix and Statistics
## 
##        true
## pred    BLACK BLUE
##   BLACK     5    2
##   BLUE      0    1
##                                           
##                Accuracy : 0.75            
##                  95% CI : (0.3491, 0.9681)
##     No Information Rate : 0.625           
##     P-Value [Acc > NIR] : 0.3697          
##                                           
##                   Kappa : 0.3846          
##                                           
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.3333          
##          Pos Pred Value : 0.7143          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.6250          
##          Detection Rate : 0.6250          
##    Detection Prevalence : 0.8750          
##       Balanced Accuracy : 0.6667          
##                                           
##        'Positive' Class : BLACK           
## 
roc_svm_test <- roc(response = data_test$label, predictor =as.numeric(pred_test))
## Setting levels: control = BLACK, case = BLUE
## Setting direction: controls < cases
(svm_auc<-roc_svm_test$auc)
## Area under the curve: 0.6667
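
The ROC above is computed from hard class labels because the SVM was fit without probability estimates. Refitting with probability = TRUE allows a probability-based ROC; a sketch, not part of the original run:

svm_prob_model <- svm(label ~ ., data = data_train, cost = 5,
                      type = "C-classification", kernel = "radial",
                      probability = TRUE)
svm_probs <- attr(predict(svm_prob_model, data_test, probability = TRUE),
                  "probabilities")
# roc(data_test$label, svm_probs[, "BLACK"])$auc   # output not shown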

ACC

(svm_acc<-mean(pred_test == data_test$label))
## [1] 0.75

Confusion Matrix

print("BLACK are positives")
## [1] "BLACK are positives"
(confusion_matrix<-table(pred_test,data_test$label))
##          
## pred_test BLACK BLUE
##     BLACK     5    2
##     BLUE      0    1

TPR

print(paste("Number of true positives=",confusion_matrix[1,1]))
## [1] "Number of true positives= 5"
svm_tpr<-confusion_matrix[1,1]/sum(confusion_matrix[,1])
print(paste("True positive ratio=",svm_tpr))
## [1] "True positive ratio= 1"

FPR

print(paste("Number of false positives=",confusion_matrix[1,2]))
## [1] "Number of false positives= 2"
svm_fpr<-confusion_matrix[1,2]/sum(confusion_matrix[,2])
print(paste("False positive ratio=",svm_fpr))
## [1] "False positive ratio= 0.666666666666667"

SVM Results

results<-cbind(results,data.frame(c(svm_auc,svm_acc,svm_tpr,svm_fpr)))
colnames(results)<-c('kNN','Tree','NB','LDA','LR','SVM')

Summary classifier performance

Summarize and provide explanatory commentary on the observed performance of these classifiers.

results %>% kable() %>% kable_styling() %>% scroll_box(width = "800px")
     kNN       Tree  NB        LDA       LR        SVM
AUC  0.7666667 0.500 0.7333333 0.6000000 0.6000000 0.6666667
ACC  1.0000000 0.625 0.6250000 0.6250000 0.6250000 0.7500000
TPR  1.0000000 1.000 0.8000000 0.8000000 0.8000000 1.0000000
FPR  0.0000000 1.000 0.6666667 0.6666667 0.6666667 0.6666667

At first glance it may seem that all classifiers perform similarly, but a second look reveals more detail. First, keep in mind that the metrics were computed against a rather small sample: the test set contains only 8 samples. With a set this small, minor changes in the predictions shift the metrics substantially.

On most classifiers we see an accuracy around 0.625, which is not very good: because 5 of the 8 test samples are BLACK, a classifier that assigned every sample to the majority class would also score 0.625. The Tree classifier makes this explicit: it predicts BLACK for every test sample, so its accuracy equals the baseline while its false positive ratio is 1. So although its accuracy looks similar to the others, the true and false positives reveal its poor performance.
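
The no-information baseline can be computed directly from the test labels; a minimal sketch:

# accuracy from always predicting the test set's majority class
max(prop.table(table(data_test$label)))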

With this in mind, the best-performing classifiers are kNN and SVM. kNN classifies the entire test set correctly, although with only 8 samples this result should be read cautiously. Among the remaining classifiers SVM does best: even though its accuracy is only somewhat higher than LDA and LR, it recovers all positives (TPR = 1) at the same false positive ratio.

Classifier Performance differences

What aspects of the data and/or of the algorithms explain these performance differences?

A quick look at the scatter plots for the entire dataset and for the training and test sets reveals that it is not easy to draw linear boundaries that separate the classes: many samples of different classes are intertwined. This makes it harder for several of these classifiers to assign classes properly.

kNN, for example, needs to find groups of samples of the same class, which is hard when many samples are isolated.

At the other extreme, classifiers such as SVM with a non-linear kernel do better on this kind of data, mainly because the kernel lets the classifier work in a higher-dimensional space in which the data become separable.
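
One hedged way to see this is to predict over a grid of the feature space and plot the resulting decision regions; a sketch reusing the svm_model fitted above (not part of the original analysis):

grid <- expand.grid(X = seq(min(data$X), max(data$X), by = 1),
                    Y = unique(data$Y))
grid$pred <- predict(svm_model, grid)
ggplot(grid, aes(x = X, y = Y, color = pred)) + geom_point()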

What is particularly interesting is that logistic regression seems to perform well, even though it is not a classifier suited to data that cannot be separated linearly. Its better-than-expected performance could be due to the small test set and its class distribution.