MSDS 5213 Lab 1

Part 1: KNN Exercise

Import data

library(class)
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

library(caTools)

# Read the headerless banknote CSV (columns are auto-named V1, V2, ...) and
# turn the label column V5 into a factor so it can be used as a class.
bank <- read.csv(file = "banknote.csv", header = FALSE)
bank[["V5"]] <- factor(bank[["V5"]])

Split the data into training and test sets with an 80/20 split

# Fix the RNG seed so the random train/test partition is reproducible.
set.seed(123)
split_size <- 0.8
train_size <- floor(nrow(bank) * split_size)
# seq_len() is the safe form of 1:n (it yields an empty vector when n is 0).
train_indices <- sample(seq_len(nrow(bank)), train_size)

# Class labels (V5) of the training rows, passed to knn() as `cl`.
cl <- bank$V5[train_indices]

# Predictor-only data frames: every column except the label V5.
train <- subset(bank[train_indices, ], select = -V5)
test <- subset(bank[-train_indices, ], select = -V5)

Run the KNN model with k = 3 and examine the confusion matrix

# Classify the held-out rows with 3 nearest neighbours, then compare the
# predictions against the true V5 labels of those same rows.
pred <- knn(train = train, test = test, cl = cl, k = 3)
confusionMatrix(data = pred, reference = bank[-train_indices, "V5"])

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 148   0
##          1   1 126
##                                           
##                Accuracy : 0.9964          
##                  95% CI : (0.9799, 0.9999)
##     No Information Rate : 0.5418          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9927          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9933          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9921          
##              Prevalence : 0.5418          
##          Detection Rate : 0.5382          
##    Detection Prevalence : 0.5382          
##       Balanced Accuracy : 0.9966          
##                                           
##        'Positive' Class : 0               
##

Use the caret package to find the optimal value of k

# Tune k over 1..20 with cross-validation and let caret keep the most
# accurate model. `train` below names both the predictor data frame and
# caret's train(); R skips non-function objects when resolving a call.
cv_control <- trainControl(method = "cv")
k_grid <- data.frame(.k = 1:20)
ktune <- train(x = train, y = cl, method = "knn",
               trControl = cv_control, tuneGrid = k_grid)
ktune

## k-Nearest Neighbors 
## 
## 1097 samples
##    4 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 987, 987, 987, 987, 987, 988, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    1  1.0000000  1.0000000
##    2  1.0000000  1.0000000
##    3  1.0000000  1.0000000
##    4  1.0000000  1.0000000
##    5  1.0000000  1.0000000
##    6  1.0000000  1.0000000
##    7  1.0000000  1.0000000
##    8  0.9990826  0.9981428
##    9  1.0000000  1.0000000
##   10  0.9990826  0.9981428
##   11  0.9963470  0.9926054
##   12  0.9954379  0.9907690
##   13  0.9945288  0.9889252
##   14  0.9945288  0.9889252
##   15  0.9945288  0.9889252
##   16  0.9945288  0.9889252
##   17  0.9945288  0.9889252
##   18  0.9945288  0.9889252
##   19  0.9945288  0.9889252
##   20  0.9945288  0.9889252
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.

Run a new prediction with the tuned model and compute the new confusion matrix

# Predict the held-out rows with the tuned caret model and score the
# predictions against the true V5 labels of those rows.
pred2 <- predict(ktune, newdata = test)

confusionMatrix(data = pred2, reference = bank[-train_indices, "V5"])

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 149   0
##          1   0 126
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9867, 1)
##     No Information Rate : 0.5418     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.5418     
##          Detection Rate : 0.5418     
##    Detection Prevalence : 0.5418     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
##

Part 2: Naive-Bayes Exercise

Import data

library(e1071)
library(lattice)
library(ggplot2)

# Read the voting data (first row is the header) and make the `party`
# label a factor so it can serve as the class to predict.
vote <- read.csv(file = "vote2.csv")
vote[["party"]] <- factor(vote[["party"]])

Split the data into training and test sets with an 80/20 split

# Fix the RNG seed so the random train/test partition is reproducible.
set.seed(124)
split_size2 <- 0.8
train_size2 <- floor(nrow(vote) * split_size2)
# seq_len() is the safe form of 1:n (it yields an empty vector when n is 0).
train_indices2 <- sample(seq_len(nrow(vote)), train_size2)

# True party labels of the held-out rows (note: unlike `cl` in Part 1,
# these are test labels); used as the reference in the confusion matrix.
cl2 <- vote$party[-train_indices2]

# Training rows keep `party` because the model is fit with a formula;
# the test set drops it so only predictors are passed to predict().
train2 <- vote[train_indices2, , drop = FALSE]
test2 <- subset(vote[-train_indices2, ], select = -party)

Run Naive-Bayes model

# Fit a naive Bayes classifier predicting `party` from all other columns
# of the training set (laplace = 0 is the e1071 default: no smoothing).
NBmodel <- naiveBayes(party ~ ., data = train2, laplace = 0)

Run predictions and compute the confusion matrix

# Predict the party of each held-out row, then compare the predictions
# with the true test labels stored in cl2.
predNB <- predict(NBmodel, newdata = test2)

confusionMatrix(data = factor(predNB), reference = factor(cl2))

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   democrat republican
##   democrat         57          4
##   republican        3         23
##                                          
##                Accuracy : 0.9195         
##                  95% CI : (0.8412, 0.967)
##     No Information Rate : 0.6897         
##     P-Value [Acc > NIR] : 2.451e-07      
##                                          
##                   Kappa : 0.8101         
##                                          
##  Mcnemar's Test P-Value : 1              
##                                          
##             Sensitivity : 0.9500         
##             Specificity : 0.8519         
##          Pos Pred Value : 0.9344         
##          Neg Pred Value : 0.8846         
##              Prevalence : 0.6897         
##          Detection Rate : 0.6552         
##    Detection Prevalence : 0.7011         
##       Balanced Accuracy : 0.9009         
##                                          
##        'Positive' Class : democrat       
##