library(kableExtra)
library(ggplot2)
library(e1071)
library(MASS)
library(caret)
library(naivebayes)
library(C50)
library(partykit)
library(pROC)
data<-read.csv("dataset.csv")
data %>% kable() %>% kable_styling() %>% scroll_box(width = "800px", height = "400px")
| X | Y | label |
|---|---|---|
| 5 | a | BLUE |
| 5 | b | BLACK |
| 5 | c | BLUE |
| 5 | d | BLACK |
| 5 | e | BLACK |
| 5 | f | BLACK |
| 19 | a | BLUE |
| 19 | b | BLUE |
| 19 | c | BLUE |
| 19 | d | BLUE |
| 19 | e | BLACK |
| 19 | f | BLUE |
| 35 | a | BLACK |
| 35 | b | BLACK |
| 35 | c | BLUE |
| 35 | d | BLACK |
| 35 | e | BLACK |
| 35 | f | BLACK |
| 51 | a | BLACK |
| 51 | b | BLACK |
| 51 | c | BLUE |
| 51 | d | BLACK |
| 51 | e | BLACK |
| 51 | f | BLACK |
| 55 | a | BLACK |
| 55 | b | BLACK |
| 55 | c | BLACK |
| 55 | d | BLACK |
| 55 | e | BLACK |
| 55 | f | BLACK |
| 63 | a | BLACK |
| 63 | b | BLUE |
| 63 | c | BLUE |
| 63 | d | BLUE |
| 63 | e | BLUE |
| 63 | f | BLUE |
ggplot(data,aes(y=Y,x=X,color=as.factor(label))) + geom_point()
set.seed(300)
#Splitting the data into training and test sets using createDataPartition() from caret
indxTrain <- createDataPartition(y = data$label,p = 0.75,list = FALSE)
data_train <- data[indxTrain,]
data_test <- data[-indxTrain,]
#Checking the class distribution in the partitioned training data
prop.table(table(data_train$label)) * 100
##
## BLACK BLUE
## 60.71429 39.28571
ggplot(data_train,aes(y=Y,x=X,color=as.factor(label))) + geom_point()
ggplot(data_test,aes(y=Y,x=X,color=as.factor(label))) + geom_point()
kNN
To determine k we use a rule of thumb: k is the square root of the number of training samples.
k=round(sqrt(nrow(data_train)),0)
We use the class library to obtain the model. class::knn() does not accept non-numeric features, so we transform feature Y into integers.
#encode the categorical feature Y as integer character codes (utf8ToInt("a") = 97, ...)
data_train_knn<-data_train
data_train_knn$Y<-apply(as.data.frame(data_train_knn$Y),1,utf8ToInt)
data_test_knn<-data_test
data_test_knn$Y<-apply(as.data.frame(data_test_knn$Y),1,utf8ToInt)
knn_model <- class::knn(cl = data_train$label,
                        test = data_test_knn[,1:2],
                        train = data_train_knn[,1:2],
                        k = k, #the rule-of-thumb value computed above (k = 5)
                        prob = TRUE)
ggplot(data_test,aes(y=Y,x=X,color=as.factor(knn_model))) + geom_point()
Confusion Matrix
print("BLACK are positives")
## [1] "BLACK are positives"
(confusion_matrix<-table(knn_model,data_test[,3]))
##
## knn_model BLACK BLUE
## BLACK 5 0
## BLUE 0 3
AUC
Note that attributes(knn_model)$prob holds the vote share of the winning class for each test sample, not the probability of one fixed class, so this AUC (lower than the perfect confusion matrix would suggest) should be read with caution.
(knn_auc<-roc(data_test$label, attributes(knn_model)$prob)$auc)
## Setting levels: control = BLACK, case = BLUE
## Setting direction: controls > cases
## Area under the curve: 0.7667
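As a sketch (not part of the original analysis), the vote share can be converted into a score for the BLACK class before computing the ROC curve:
#convert the winning-class vote share into a probability-like score for BLACK; output not shown
prob <- attributes(knn_model)$prob
prob_black <- ifelse(knn_model == "BLACK", prob, 1 - prob)
roc(data_test$label, prob_black)$auc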
ACC
(knn_acc<-sum(diag(confusion_matrix))/sum(confusion_matrix))
## [1] 1
TPR
knn_tp<-confusion_matrix[1,1]
print(paste("Number of true positives=",knn_tp))
## [1] "Number of true positives= 5"
knn_tpr<-confusion_matrix[1,1]/sum(confusion_matrix[,1])
print(paste("True positive ratio=",knn_tpr))
## [1] "True positive ratio= 1"
FPR
knn_fp<-confusion_matrix[1,2]
print(paste("Number of false positives=",knn_fp))
## [1] "Number of false positives= 0"
knn_fpr<-confusion_matrix[1,2]/sum(confusion_matrix[,2])
print(paste("False positive ratio=",knn_fpr))
## [1] "False positive ratio= 0"
Best K
We can use the caret library to check our selection of k. caret can run several values of k and select the best performing one.
knn_model<-train(label~.,data=data_train,method="knn",tuneLength=10)
knn_model
## k-Nearest Neighbors
##
## 28 samples
## 2 predictor
## 2 classes: 'BLACK', 'BLUE'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 28, 28, 28, 28, 28, 28, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.6292378 0.232282563
## 7 0.5731430 0.132921615
## 9 0.5322416 0.048896058
## 11 0.5229425 0.022845761
## 13 0.5515890 0.035252248
## 15 0.5603566 0.020510936
## 17 0.5619728 -0.010601504
## 19 0.5466395 -0.022469636
## 21 0.5139122 -0.005430702
## 23 0.5062758 0.002689412
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
plot(knn_model)
We see that we obtain the same value of k as first selected.
We can plot the test predictions and confirm that we obtain the same results with this library.
pred_test<-predict(knn_model,newdata = data_test)
ggplot(data_test,aes(y=Y,x=X,color=as.factor(pred_test))) + geom_point()
kNN Results
results<-data.frame(c(knn_auc,knn_acc,knn_tpr,knn_fpr))
rownames(results)<-c('AUC','ACC','TPR','FPR')
colnames(results)<-c('kNN')
Decision Tree
tree_model<-C5.0(label~.,data=data_train)
tree_model
##
## Call:
## C5.0.formula(formula = label ~ ., data = data_train)
##
## Classification Tree
## Number of samples: 28
## Number of predictors: 2
##
## Tree size: 1
##
## Non-standard options: attempt to group attributes
plot(as.party(tree_model),main = "tree model")
pred_test<-predict(tree_model,newdata = data_test)
ggplot(data_test,aes(y=Y,x=X,color=as.factor(pred_test))) + geom_point()
AUC
Since the tree has a single node, every test sample receives the same predicted probability, so the AUC is 0.5, no better than chance.
pred_test<-predict(tree_model,newdata = data_test,type="prob")
as.data.frame(pred_test)$BLACK
## [1] 0.6071429 0.6071429 0.6071429 0.6071429 0.6071429 0.6071429 0.6071429
## [8] 0.6071429
(tree_auc<-roc(data_test$label,as.data.frame(pred_test)$BLACK)$auc)
## Setting levels: control = BLACK, case = BLUE
## Setting direction: controls < cases
## Area under the curve: 0.5
ACC
pred_test<-predict(tree_model,newdata = data_test)
(tree_acc<-mean(pred_test == data_test$label))
## [1] 0.625
Confusion Matrix
pred_test<-predict(tree_model,newdata = data_test)
print("BLACK are positives")
## [1] "BLACK are positives"
(confusion_matrix<-table(pred_test,data_test$label))
##
## pred_test BLACK BLUE
## BLACK 5 3
## BLUE 0 0
TPR
print(paste("Number of true positives=",confusion_matrix[1,1]))
## [1] "Number of true positives= 5"
tree_tpr<-confusion_matrix[1,1]/sum(confusion_matrix[,1])
print(paste("True positive ratio=",tree_tpr))
## [1] "True positive ratio= 1"
FPR
print(paste("Number of false positives=",confusion_matrix[2,1]))
## [1] "Number of false positives= 0"
tree_fpr<-confusion_matrix[1,2]/sum(confusion_matrix[,2])
print(paste("False positive ratio=",tree_fpr))
## [1] "False positive ratio= 1"
Tree Results
results<-cbind(results,data.frame(c(tree_auc,tree_acc,tree_tpr,tree_fpr)))
colnames(results)<-c('kNN','Tree')
Naive Bayes
nb_model<-naive_bayes(label~.,data=data_train)
nb_model
##
## ================================== Naive Bayes ==================================
##
## Call:
## naive_bayes.formula(formula = label ~ ., data = data_train)
##
## ---------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ---------------------------------------------------------------------------------
##
## A priori probabilities:
##
## BLACK BLUE
## 0.6071429 0.3928571
##
## ---------------------------------------------------------------------------------
##
## Tables:
##
## ---------------------------------------------------------------------------------
## ::: X (Gaussian)
## ---------------------------------------------------------------------------------
##
## X BLACK BLUE
## mean 38.64706 36.81818
## sd 19.35504 24.33852
##
## ---------------------------------------------------------------------------------
## ::: Y (Categorical)
## ---------------------------------------------------------------------------------
##
## Y BLACK BLUE
## a 0.23529412 0.18181818
## b 0.17647059 0.09090909
## c 0.05882353 0.36363636
## d 0.17647059 0.09090909
## e 0.23529412 0.09090909
## f 0.11764706 0.18181818
##
## ---------------------------------------------------------------------------------
pred_test<-predict(nb_model,data_test[,1:2]) #pass only the predictor columns so predict() does not warn about extra features
ggplot(data_test,aes(y=Y,x=X,color=as.factor(pred_test))) + geom_point()
AUC
pred_test<-predict(nb_model,data_test[,1:2],type = "prob")
as.data.frame(pred_test)$BLACK
## [1] 0.3947491 0.7465890 0.1971181 0.7879742 0.7772706 0.8231029 0.5377328
## [8] 0.7530158
(nb_auc<-roc(data_test$label,as.data.frame(pred_test)$BLACK)$auc)
## Setting levels: control = BLACK, case = BLUE
## Setting direction: controls > cases
## Area under the curve: 0.7333
ACC
pred_test<-predict(nb_model,data_test[,1:2])
(nb_acc<-mean(pred_test == data_test$label))
## [1] 0.625
Confusion Matrix
print("BLACK are positives")
## [1] "BLACK are positives"
(confusion_matrix<-table(pred_test,data_test$label))
##
## pred_test BLACK BLUE
## BLACK 4 2
## BLUE 1 1
TPR
print(paste("Number of true positives=",confusion_matrix[1,1]))
## [1] "Number of true positives= 4"
nb_tpr<-confusion_matrix[1,1]/sum(confusion_matrix[,1])
print(paste("True positive ratio=",nb_tpr))
## [1] "True positive ratio= 0.8"
FPR
print(paste("Number of false positives=",confusion_matrix[2,1]))
## [1] "Number of false positives= 1"
nb_fpr<-confusion_matrix[2,1]/sum(confusion_matrix[,1])
print(paste("False positive ratio=",nb_fpr))
## [1] "False positive ratio= 0.2"
Naive Bayes Results
results<-cbind(results,data.frame(c(nb_auc,nb_acc,nb_tpr,nb_fpr)))
colnames(results)<-c('kNN','Tree','NB')
LDA
lda_model<-lda(label~.,data=data_train)
lda_model
## Call:
## lda(label ~ ., data = data_train)
##
## Prior probabilities of groups:
## BLACK BLUE
## 0.6071429 0.3928571
##
## Group means:
## X Yb Yc Yd Ye Yf
## BLACK 38.64706 0.17647059 0.05882353 0.17647059 0.23529412 0.1176471
## BLUE 36.81818 0.09090909 0.36363636 0.09090909 0.09090909 0.1818182
##
## Coefficients of linear discriminants:
## LD1
## X -0.01187782
## Yb -0.40793331
## Yc 2.36282013
## Yd -0.47920022
## Ye -0.70258013
## Yf 0.87525571
pred_test<-predict(lda_model,data_test)$class
ggplot(data_test,aes(y=Y,x=X,color=as.factor(pred_test))) + geom_point()
AUC
pred_test<-predict(lda_model,data_test) #predict.lda ignores a type argument; posterior probabilities are always returned
df<-as.data.frame(pred_test$posterior[,1])
colnames(df)<-c('black')
(lda_auc<-roc(data_test$label,df$black)$auc)
## Setting levels: control = BLACK, case = BLUE
## Setting direction: controls > cases
## Area under the curve: 0.6
ACC
pred_test<-predict(lda_model,data_test)$class
(lda_acc<-mean(pred_test == data_test$label))
## [1] 0.625
Confusion Matrix
print("BLACK are positives")
## [1] "BLACK are positives"
(confusion_matrix<-table(pred_test,data_test$label))
##
## pred_test BLACK BLUE
## BLACK 4 2
## BLUE 1 1
TPR
print(paste("Number of true positives=",confusion_matrix[1,1]))
## [1] "Number of true positives= 4"
lda_tpr<-confusion_matrix[1,1]/sum(confusion_matrix[,1])
print(paste("True positive ratio=",lda_tpr))
## [1] "True positive ratio= 0.8"
FPR
print(paste("Number of false positives=",confusion_matrix[2,1]))
## [1] "Number of false positives= 1"
lda_fpr<-confusion_matrix[2,1]/sum(confusion_matrix[,1])
print(paste("False positive ratio=",lda_fpr))
## [1] "False positive ratio= 0.2"
LDA Results
results<-cbind(results,data.frame(c(lda_auc,lda_acc,lda_tpr,lda_fpr)))
colnames(results)<-c('kNN','Tree','NB','LDA')
Logistic Regression
lr_model<-glm(label~.,data=data_train,family=binomial(link="logit"))
lr_model
##
## Call: glm(formula = label ~ ., family = binomial(link = "logit"), data = data_train)
##
## Coefficients:
## (Intercept) X Yb Yc Yd Ye
## -0.22366 -0.01264 -0.40684 2.15833 -0.48190 -0.74622
## Yf
## 0.75470
##
## Degrees of Freedom: 27 Total (i.e. Null); 21 Residual
## Null Deviance: 37.52
## Residual Deviance: 31.84 AIC: 45.84
#glm codes the response by factor level: BLACK (first level) = 0, BLUE = 1,
#so the response-scale predictions are P(label == "BLUE")
pred_training<-predict(lr_model,type="response")
pred_training[pred_training<0.5]<-0
pred_training[pred_training>=0.5]<-1
pred_training[pred_training==0]<-"BLACK"
pred_training[pred_training==1]<-"BLUE"
pred_test<-predict(lr_model,type="response",newdata = data_test)
pred_test[pred_test<0.5]<-0
pred_test[pred_test>=0.5]<-1
pred_test[pred_test==0]<-"BLACK"
pred_test[pred_test==1]<-"BLUE"
ggplot(data_test,aes(y=Y,x=X,color=as.factor(pred_test))) + geom_point()
AUC
pred_test<-predict(lr_model,type="response",newdata = data_test)
df<-as.data.frame(pred_test)
colnames(df)<-c('blue') #these predictions are P(BLUE), the second factor level, not P(BLACK)
(lr_auc<-roc(data_test$label,df$blue)$auc)
## Setting levels: control = BLACK, case = BLUE
## Setting direction: controls < cases
## Area under the curve: 0.6
Confusion Matrix
pred_test<-predict(lr_model,type="response",newdata = data_test)
pred_test[pred_test<0.5]<-0
pred_test[pred_test>=0.5]<-1
pred_test[pred_test==0]<-"BLACK"
pred_test[pred_test==1]<-"BLUE"
print("BLACK are positives")
## [1] "BLACK are positives"
(confusion_matrix<-table(pred_test,data_test$label))
##
## pred_test BLACK BLUE
## BLACK 4 2
## BLUE 1 1
ACC
(lr_acc<-sum(diag(confusion_matrix))/sum(confusion_matrix))
## [1] 0.625
TPR
print(paste("Number of true positives=",confusion_matrix[1,1]))
## [1] "Number of true positives= 4"
lr_tpr<-confusion_matrix[1,1]/sum(confusion_matrix[,1])
print(paste("True positive ratio=",lr_tpr))
## [1] "True positive ratio= 0.8"
FPR
print(paste("Number of false positives=",confusion_matrix[2,1]))
## [1] "Number of false positives= 1"
lr_fpr<-confusion_matrix[2,1]/sum(confusion_matrix[,1])
print(paste("False positive ratio=",lr_fpr))
## [1] "False positive ratio= 0.2"
LR Results
results<-cbind(results,data.frame(c(lr_auc,lr_acc,lr_tpr,lr_fpr)))
colnames(results)<-c('kNN','Tree','NB','LDA','LR')
SVM
svm_model<-svm(label ~ ., data = data_train,cost=5, cross=10,type="C-classification",kernel="radial",na.action=na.omit)
svm_model
##
## Call:
## svm(formula = label ~ ., data = data_train, cost = 5, cross = 10,
## type = "C-classification", kernel = "radial", na.action = na.omit)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 5
##
## Number of Support Vectors: 21
pred_test<- predict(svm_model, data_test)
ggplot(data_test,aes(y=Y,x=X,color=as.factor(pred_test))) + geom_point()
Confusion Matrix and Statistics
test<-table(pred = pred_test, true = data_test[,3])
confusionMatrix(test)
## Confusion Matrix and Statistics
##
## true
## pred BLACK BLUE
## BLACK 5 2
## BLUE 0 1
##
## Accuracy : 0.75
## 95% CI : (0.3491, 0.9681)
## No Information Rate : 0.625
## P-Value [Acc > NIR] : 0.3697
##
## Kappa : 0.3846
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 1.0000
## Specificity : 0.3333
## Pos Pred Value : 0.7143
## Neg Pred Value : 1.0000
## Prevalence : 0.6250
## Detection Rate : 0.6250
## Detection Prevalence : 0.8750
## Balanced Accuracy : 0.6667
##
## 'Positive' Class : BLACK
##
AUC
The ROC curve here is built from the hard class predictions coerced to numeric, so it has only a single operating point.
roc_svm_test <- roc(response = data_test$label, predictor = as.numeric(pred_test))
## Setting levels: control = BLACK, case = BLUE
## Setting direction: controls < cases
(svm_auc<-roc_svm_test$auc)
## Area under the curve: 0.6667
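As a sketch (not part of the original run), a more informative ROC could be computed from the SVM decision values rather than the hard labels:
#refit requesting decision values and feed them to roc(); output not shown
svm_dv <- svm(label ~ ., data = data_train, cost = 5, type = "C-classification", kernel = "radial")
pred_dv <- predict(svm_dv, data_test, decision.values = TRUE)
roc(data_test$label, as.numeric(attr(pred_dv, "decision.values")))$auc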
ACC
(svm_acc<-mean(pred_test == data_test$label))
## [1] 0.75
Confusion Matrix
print("BLACK are positives")
## [1] "BLACK are positives"
(confusion_matrix<-table(pred_test,data_test$label))
##
## pred_test BLACK BLUE
## BLACK 5 2
## BLUE 0 1
TPR
print(paste("Number of true positives=",confusion_matrix[1,1]))
## [1] "Number of true positives= 5"
svm_tpr<-confusion_matrix[1,1]/sum(confusion_matrix[,1])
print(paste("True positive ratio=",svm_tpr))
## [1] "True positive ratio= 1"
FPR
print(paste("Number of false positives=",confusion_matrix[2,1]))
## [1] "Number of false positives= 0"
svm_fpr<-confusion_matrix[2,1]/sum(confusion_matrix[,1])
print(paste("False positive ratio=",svm_fpr))
## [1] "False positive ratio= 0"
SVM Results
results<-cbind(results,data.frame(c(svm_auc,svm_acc,svm_tpr,svm_fpr)))
colnames(results)<-c('kNN','Tree','NB','LDA','LR','SVM')
Summarize and provide an explanatory commentary on the observed performance of these classifiers
results %>% kable() %>% kable_styling() %>% scroll_box(width = "800px")
| | kNN | Tree | NB | LDA | LR | SVM |
|---|---|---|---|---|---|---|
| AUC | 0.7666667 | 0.500 | 0.7333333 | 0.600 | 0.600 | 0.6666667 |
| ACC | 1.0000000 | 0.625 | 0.6250000 | 0.625 | 0.625 | 0.7500000 |
| TPR | 1.0000000 | 1.000 | 0.8000000 | 0.800 | 0.800 | 1.0000000 |
| FPR | 0.0000000 | 1.000 | 0.6666667 | 0.667 | 0.667 | 0.6666667 |
At first glance it may seem that all classifiers have similar performance, but a second look shows more details. First, keep in mind that the metrics were computed against a rather small sample, the test set, with only 8 samples. With a set this small, a single misclassified sample shifts accuracy by 12.5%, so small changes in performance affect the metrics substantially.
For most classifiers we see accuracy of 0.625, which isn't very good. Looking at the test samples, even if we ignored the features and assigned every sample to the majority class, accuracy would still be 0.625, because 5 of the 8 test samples are BLACK. This no-information rate is exactly what the Tree classifier achieves: it predicts BLACK for everything, so although its accuracy matches the others, its false positive ratio of 1 reveals its poor performance. A quick check of this baseline is sketched below.
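A minimal check of that baseline, computed directly from the test labels:
#no-information rate: accuracy of always predicting the majority class
max(prop.table(table(data_test$label)))
#each test sample moves accuracy by 1/8 = 0.125
1/nrow(data_test)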
With this in mind, the best performing classifiers are kNN and SVM. kNN classifies the test set perfectly, although with only 8 test samples that result should not be over-interpreted. SVM comes next: its accuracy is only somewhat higher than that of NB, LDA and LR, but its perfect true positive ratio shows better performance.
What aspects of the data and/or of the algorithms explain these performance differences?
A quick look at the scatter plots of the entire dataset and of the training and test sets reveals that it is not easy to draw linear boundaries separating the classes: many samples of different classes are intertwined with each other. This makes it harder for several of these classifiers to properly assign classes.
kNN, for example, needs to find groups of samples of the same class, which is hard when many samples are isolated; the decision-region sketch below illustrates this.
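As an illustrative sketch (not part of the original analysis), we can classify a grid over the two encoded features to visualize the kNN decision regions:
#grid over X and the integer-encoded Y, classified with the same k = 5
grid <- expand.grid(X = seq(5, 63, by = 1), Y = utf8ToInt("a"):utf8ToInt("f"))
grid$pred <- class::knn(train = data_train_knn[,1:2], test = grid, cl = data_train$label, k = 5)
ggplot(grid, aes(x = X, y = Y, fill = pred)) + geom_tile(alpha = 0.4) +
  geom_point(data = data_test_knn, aes(x = X, y = Y, color = label), inherit.aes = FALSE)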
At the other extreme, classifiers such as SVM with a non-linear kernel do better at classifying this kind of data, mainly because the kernel lets the classifier work in a higher-dimensional space in which the data become separable; a kernel comparison is sketched below.
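A minimal sketch, assuming the same split and cost as above, comparing a linear and a radial kernel (the exact accuracies depend on the seed and split; output not shown):
#same data and cost, only the kernel changes
svm_linear <- svm(label ~ ., data = data_train, cost = 5, type = "C-classification", kernel = "linear")
svm_radial <- svm(label ~ ., data = data_train, cost = 5, type = "C-classification", kernel = "radial")
mean(predict(svm_linear, data_test) == data_test$label)
mean(predict(svm_radial, data_test) == data_test$label)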
What is particularly interesting is that logistic regression seems to show reasonable performance, even though it is not a classifier suited to data that cannot be separated linearly. Its better-than-expected performance could be due to the reduced test set and the class distribution within it.