Data Summary

Summary of data shows that we have a dataframe with 2 factor columns and 1 integer column.

##        X            Y             label   
##  Min.   : 5        a :5         BLACK:22  
##  1st Qu.:19        b :5         BLUE :14  
##  Median :43        c :5                   
##  Mean   :38        d :5                   
##  3rd Qu.:55        e :5                   
##  Max.   :63        f :5                   
##               (Other):6
## Observations: 36
## Variables: 3
## $ X     <int> 5, 5, 5, 5, 5, 5, 19, 19, 19, 19, 19, 19, 35, 35, 35, 35, 35,...
## $ Y     <fct>       a,       b,       c,       d,       e,       f,      a,...
## $ label <fct>       BLUE,       BLACK,       BLUE,       BLACK,       BLACK...

Run Models

Run kNN, Tree, NB, LDA and LR, SVM with RBS Kernel (60%)

KNN

##  [1]       BLACK       BLACK       BLUE        BLACK       BLACK       BLACK
##  [7]       BLACK       BLACK       BLUE        BLACK       BLACK       BLACK
## [13]       BLACK       BLACK       BLUE        BLACK       BLACK       BLACK
## Levels:       BLACK       BLUE

The knn model has a confusion matrix below with a accuracy of 0.72 , TPR of 0.38 and FPR of 0.

##              Actual
## Predicted           BLACK       BLUE
##         BLACK          10          5
##         BLUE            0          3

TREE

## 
## Call:
## C5.0.formula(formula = label ~ ., data = traind)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sun Apr 12 16:27:45 2020
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 26 cases (3 attributes) from undefined.data
## 
## Decision tree:
##  BLACK (26/10)
## 
## 
## Evaluation on training data (26 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       1   10(38.5%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##      16          (a): class BLACK
##      10          (b): class BLUE
## 
## 
## Time: 0.0 secs

The Tree model has a confusion matrix below with a accuracy of 0.6 , TPR of 0 and FPR of 0.

##              Actual
## Predicted           BLACK       BLUE
##         BLACK           6          4
##         BLUE            0          0

NB

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace, type = "class", 
##     useKernal = TRUE)
## 
## A-priori probabilities:
## Y
##       BLACK        BLUE 
##   0.6153846   0.3846154 
## 
## Conditional probabilities:
##              X
## Y             [,1]     [,2]
##         BLACK 34.5 20.36337
##         BLUE  42.8 24.11915
## 
##              Y
## Y                   a       b       c       d       e       f      a      b
##         BLACK  0.0000  0.0625  0.0000  0.0625  0.0625  0.0625 0.1875 0.1250
##         BLUE   0.0000  0.0000  0.1000  0.0000  0.0000  0.0000 0.0000 0.1000
##              Y
## Y                  c      d      e      f
##         BLACK 0.0625 0.0625 0.2500 0.0625
##         BLUE  0.3000 0.2000 0.1000 0.2000

The Naive Bayes model has a confusion matrix below with a accuracy of 0.3 , TPR of 0.25 and FPR of 1.

##              Actual
## Predicted           BLACK       BLUE
##         BLACK           2          3
##         BLUE            4          1

LDA

## Call:
## lda(label ~ ., data = traind)
## 
## Prior probabilities of groups:
##       BLACK        BLUE 
##   0.6153846   0.3846154 
## 
## Group means:
##                X Y      b Y      c Y      d Y      e Y      f Y     a Y     b
##       BLACK 34.5   0.0625      0.0   0.0625   0.0625   0.0625  0.1875   0.125
##       BLUE  42.8   0.0000      0.1   0.0000   0.0000   0.0000  0.0000   0.100
##             Y     c Y     d Y     e Y     f
##       BLACK  0.0625  0.0625    0.25  0.0625
##       BLUE   0.3000  0.2000    0.10  0.2000
## 
## Coefficients of linear discriminants:
##                  LD1
## X         0.01447635
## Y      b -1.04603812
## Y      c  2.93852420
## Y      d -1.04603812
## Y      e -1.04603812
## Y      f -1.04603812
## Y     a  -1.65404474
## Y     b  -0.36446089
## Y     c   1.33437701
## Y     d   1.02163194
## Y     e  -0.82238904
## Y     f   1.11814093

The LDA model has a confusion matrix below with a accuracy of 0.3 , TPR of 0.25 and FPR of 1.

##              Actual
## Predicted           BLACK       BLUE
##         BLACK           2          3
##         BLUE            4          1

LR

## 
## Call:
## glm(formula = label ~ ., family = binomial, data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6932  -0.8992  -0.2606   0.7306   2.0317  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)  1.870e+01  6.523e+03   0.003    0.998
## X           -2.661e-02  2.650e-02  -1.004    0.315
## Y      b    -3.713e+01  9.224e+03  -0.004    0.997
## Y      c    -3.231e-08  9.224e+03   0.000    1.000
## Y      d    -3.713e+01  9.224e+03  -0.004    0.997
## Y      e    -3.713e+01  9.224e+03  -0.004    0.997
## Y      f    -3.713e+01  9.224e+03  -0.004    0.997
## Y     a     -1.895e+01  6.523e+03  -0.003    0.998
## Y     b     -1.793e+01  6.523e+03  -0.003    0.998
## Y     c     -1.607e+01  6.523e+03  -0.002    0.998
## Y     d     -1.793e+01  6.523e+03  -0.003    0.998
## Y     e     -1.895e+01  6.523e+03  -0.003    0.998
## Y     f     -1.793e+01  6.523e+03  -0.003    0.998
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 48.114  on 35  degrees of freedom
## Residual deviance: 34.166  on 23  degrees of freedom
## AIC: 60.166
## 
## Number of Fisher Scoring iterations: 17

The Linear model has a confusion matrix below with a accuracy of 0.81 , TPR of 0.6 and FPR of 0.1.

##          Actual
## Predicted       BLACK       BLUE
##         0          15          4
##         1           1          6

SVM

## Support Vector Machines with Linear Kernel 
## 
## 26 samples
##  2 predictor
##  2 classes: '      BLACK', '      BLUE' 
## 
## Pre-processing: centered (12), scaled (12) 
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 23, 24, 23, 23, 24, 23, ... 
## Resampling results:
## 
##   Accuracy   Kappa
##   0.6333333  0.15 
## 
## Tuning parameter 'C' was held constant at a value of 1

The SVM model has a confusion matrix below with a accuracy of 0.58 , TPR of 0.81 FPR of 0.8:

## Confusion Matrix and Statistics
## 
##              
## test_pred           BLACK       BLUE
##         BLACK          13          2
##         BLUE            3          8
##                                           
##                Accuracy : 0.8077          
##                  95% CI : (0.6065, 0.9345)
##     No Information Rate : 0.6154          
##     P-Value [Acc > NIR] : 0.03075         
##                                           
##                   Kappa : 0.6012          
##                                           
##  Mcnemar's Test P-Value : 1.00000         
##                                           
##             Sensitivity : 0.8125          
##             Specificity : 0.8000          
##          Pos Pred Value : 0.8667          
##          Neg Pred Value : 0.7273          
##              Prevalence : 0.6154          
##          Detection Rate : 0.5000          
##    Detection Prevalence : 0.5769          
##       Balanced Accuracy : 0.8063          
##                                           
##        'Positive' Class :       BLACK     
## 

Performance Table

Determine the AUC, ACCURACY, TPR,FPR for each algorithm, create a table as shown below ALGO AUC,ACC,TPR,FPR LR LDA NB SVM kNN TREE

finaldf<-data.frame(ALGO=c('LR','LDA','NB','SVM','kNN','TREE'),
                    AUC=c(lr.auc,lda.auc,nb.auc,svm.auc,knn.auc,tr.auc),
                    ACC=c(lr.acc,lda.acc,nb.acc,svm.acc,knn.acc,tr.acc),
                    TPR=c(lr.tpr,lda.tpr,nb.tpr,svm.tpr,knn.tpr,tr.tpr),
                    FPR=c(lr.fpr,lda.fpr,nb.fpr,svm.fpr,knn.fpr,tr.fpr))

write.csv(finaldf, "finaldf.csv")

Commentary

Summarize and provide a explanatory commentary on the observed performance of these classifiers. What aspects of the data and or aspects of the algorithms, explain these performance differences.

  • TPR - True positive rate (sensitivity) tries to find the percentage of true positives where predictions were correctly identified

  • FPR - False positives rate (1 - specificity(True Negative)) tries to find percentage of true positive where prediction is incorrectly identified (Type 1 Errors)

    (Specificity and sensitivity are inversely proportional, while TPR and FPR are not.)

  • Accuracy - Finds the percentage of true positive and true negatives where predictions were correctly identified.

  • AUC - Area under the curve is the area under the ROC curve when comparing TPR to FPR for several models. The closer the curve is to the upper left corner the better the model classification.

References:

https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5 https://towardsdatascience.com/understanding-confusion-matrix-a9ad42dcfd62 https://towardsdatascience.com/understanding-the-roc-and-auc-curves-a05b68550b69

ALGO AUC ACC TPR FPR
LR 0.79 0.81 0.6 0.1
LDA 0.75 0.3 0.25 1
NB 0.71 0.3 0.25 1
SVM 0.98 0.58 0.81 0.8
kNN 0.5 0.72 0.38 0
TREE 0.5 0.6 0 0

LR - The GLM linear regression model shows that the accuracy of TP and TN is the highest at 0.81 while the AUC is the 2nd highest of all models. This tells us the model did an above average job at classfication.

LDA - The LDA model accuracy is 0.3 with an AUC of 0.75. These performance stats are average.

NB - The NB model has the exact performance stats as LDA which also catergorizes it as an average model.

SVM - The SVM model accuracy is the 3rd highest at 0.58, It also has the highest AUC value of 0.98 and above average TPR and FPR values at 0.81 and 0.8 respectively.

KNN - The KNN model accuracy is the 2nd highest at 0.72, but the AUC is average and TPR is below average.

TREE - The Tree model accuracy is above average , but AUC is average and TPR is 0.

Conclusion

The performance data tells us LR and SVM are the best models for this data. While SVM has the higher AUC, LR has the higher accuracy. While SVM has a higher TPR it also has a higher FPR. I would lean toward using the LR model since it has a lower FPR while SVM has a higher FPR, since the above average FPR on SVM can have a higher impact on the performance of the model.

Appendix

Code used in analysis

knitr::opts_chunk$set(
    echo = FALSE,
    message = FALSE,
    warning = FALSE
)
#knitr::opts_chunk$set(echo = TRUE)
require(knitr)
library(ggplot2)
library(tidyr)
library(MASS)
library(psych)
library(kableExtra)
library(dplyr)
library(faraway)
library(gridExtra)
library(reshape2)
library(leaps)
library(pROC)
library(caret)
library(naniar)
library(pander)
library(pROC)
library(mlbench)
library(e1071)
library(fpp2)
library(mlr)
library(GGally)
library(neuralnet)
#x<-read.clipboard(header=TRUE, sep=',')
#summary(x)
#write.csv((x),"x.csv")

d<-read.csv("y.csv")
d<-d[,2:4]
summary(d)
glimpse(d)

#Split train/test
set.seed(1)
intrain<- createDataPartition(y = d$label, p=.7, list=FALSE)
traind <- d[intrain,]
test<- d[-intrain, ]

require("class")
attach(d)
Xlog<-cbind(X,Y)
trainm<-X<51
knn.pred<-knn(Xlog[trainm,],Xlog[!trainm,],label[trainm],k=1, prob=FALSE)
knn.pred
confusionMatrix<-table(knn.pred,label[trainm],dnn=c('Predicted','Actual'))


knn.acc<-round(sum(diag(confusionMatrix))/sum(confusionMatrix),2)
knn.miss<-round((confusionMatrix[2,1]+confusionMatrix[1,2])/sum(confusionMatrix),2)
knn.tpr<-round(sum(confusionMatrix[2,2])/sum(confusionMatrix[,2]),2)
knn.fpr<-round(sum(confusionMatrix[2,1])/sum(confusionMatrix[,2]),2)

knnn<-roc(data=trainm,label, X)
knn.auc<-round(knnn$auc[1],2)
confusionMatrix
require(C50)
tr.d<-C5.0(label~.,data=traind)
summary(tr.d)

tr.prob<-predict(tr.d, newdata=test)
confusionMatrix<-table(tr.prob,test[,3],dnn=c('Predicted','Actual'))

tr.acc<-round(sum(diag(confusionMatrix))/sum(confusionMatrix),2)
tr.miss<-round((confusionMatrix[2,1]+confusionMatrix[1,2])/sum(confusionMatrix),2)
tr.tpr<-round(sum(confusionMatrix[2,2])/sum(confusionMatrix[,2]),2)
tr.fpr<-round(sum(confusionMatrix[2,1])/sum(confusionMatrix[,2]),2)

targethat<-predict(tr.d, newdata=test, type="prob")
tr.auc<-round((auc(test$label, targethat[,1])+auc(test$label, targethat[,2]))/2,2)

confusionMatrix
require(e1071)
d.nb<-naiveBayes(label~., data=traind, type="class",useKernal=TRUE)
d.nb
n.prob<-predict(d.nb, newdata=test, type="class")
confusionMatrix<-table(n.prob,test[,3],dnn=c('Predicted','Actual'))

nb.acc<-round(sum(diag(confusionMatrix))/sum(confusionMatrix),2)
nb.miss<-round((confusionMatrix[2,1]+confusionMatrix[1,2])/sum(confusionMatrix),2)
nb.tpr<-round(sum(confusionMatrix[2,2])/sum(confusionMatrix[,2]),2)
nb.fpr<-round(sum(confusionMatrix[2,1])/sum(confusionMatrix[,2]),2)

targethat<-predict(d.nb,type="raw", newdata=test)
n.prob<-predict(d.nb, newdata=test, type="raw")
nb.auc<-round((auc(test$label, n.prob[,1])+auc(test$label, n.prob[,2]))/2,2)

confusionMatrix
require(MASS)
require(C50)
d.5<-d[!trainm,]
lda.fit<-lda(label~., data=traind)
lda.fit
lda.pred<-predict(lda.fit, newdata=test)
lda.class<-lda.pred$class
confusionMatrix<-table(lda.class, test[,3],dnn=c('Predicted','Actual'))

lda.acc<-round(sum(diag(confusionMatrix))/sum(confusionMatrix),2)
lda.miss<-round((confusionMatrix[2,1]+confusionMatrix[1,2])/sum(confusionMatrix),2)
lda.tpr<-round(sum(confusionMatrix[2,2])/sum(confusionMatrix[,2]),2)
lda.fpr<-round(sum(confusionMatrix[2,1])/sum(confusionMatrix[,2]),2)

targethat<-predict(lda.fit,type="response", newdata=test)
lda.auc<-round((auc(test$label, targethat$posterior[,1])+auc(test$label, targethat$posterior[,2]))/2,2)

confusionMatrix
require(stats)
glm.fit<-glm(label~.,data=d,family=binomial)
summary(glm.fit)
glm.prob<-predict(glm.fit,newdata=traind, type="response")
glm.pred<-ifelse(glm.prob>.5,1,0)
confusionMatrix<-table(glm.pred,traind$label,dnn=c('Predicted','Actual'))

lr.acc<-round(sum(diag(confusionMatrix))/sum(confusionMatrix),2)
lr.miss<-round((confusionMatrix[2,1]+confusionMatrix[1,2])/sum(confusionMatrix),2)
lr.tpr<-round(sum(confusionMatrix[2,2])/sum(confusionMatrix[,2]),2)
lr.fpr<-round(sum(confusionMatrix[2,1])/sum(confusionMatrix[,2]),2)

targethat<-predict(glm.fit,type="response")
lrr<-roc(label~targethat,data=d)
lr.auc<-round(lrr$auc[1],2)
confusionMatrix

set.seed(1)
tune.out<-tune(svm,label~.,data=traind, kernel="linear",
               ranges=list(cost=c(.00001,.0001, .001, .01, 1,10), 
                           gamma=c(1,2,3,4,5))) #get best model with tune
bestmod<-tune.out$best.model

trctrl<- trainControl(method="repeatedcv", number=10,repeats=1)
svm_lin<- caret::train(label~., data=traind, method="svmLinear", 
                trControl=trctrl, preProcess=c("center","scale"),
                tuneLength =10, cost=bestmod$cost,gamma=bestmod$gamma)

svm_lin

test_pred<- predict.train(svm_lin, newdata=traind)
confusionMatrix<-confusionMatrix(table(test_pred,traind$label))

svm.acc<-round(confusionMatrix$byClass[10],2)
svm.miss<-round(1-confusionMatrix$byClass[10],2)
svm.tpr<-round(confusionMatrix$byClass[1],2)
svm.fpr<-round(confusionMatrix$byClass[2],2)

svmfit<-svm(label~., data=traind, kernel="radial", gamma=2, cost=2,decision.values=T)
fitted<-attributes(predict(svmfit,traind,decision.values=T))$decision.values
svm.auc<-round(auc(traind$label,fitted),2)

confusionMatrix
finaldf<-data.frame(ALGO=c('LR','LDA','NB','SVM','kNN','TREE'),
                    AUC=c(lr.auc,lda.auc,nb.auc,svm.auc,knn.auc,tr.auc),
                    ACC=c(lr.acc,lda.acc,nb.acc,svm.acc,knn.acc,tr.acc),
                    TPR=c(lr.tpr,lda.tpr,nb.tpr,svm.tpr,knn.tpr,tr.tpr),
                    FPR=c(lr.fpr,lda.fpr,nb.fpr,svm.fpr,knn.fpr,tr.fpr))

write.csv(finaldf, "finaldf.csv")