set.seed(12345)
library(caret)
library(ISLR)
hitters = Hitters
#Removing the factor variables and all records in which Salary data is missing
hitters2 = hitters[!is.na(hitters$Salary), ]
hitters2$League <- NULL
hitters2$Division <- NULL
hitters2$NewLeague <- NULL
#Replacing Salary with log(Salary) and calling it logSalary
hitters2$Salary = log(hitters2$Salary)
names(hitters2)[names(hitters2) == "Salary"] = "logSalary"
#Partitioning the data into training and testing sets
trainingIndices <- createDataPartition(hitters2$logSalary, p = 0.7, list = FALSE)
training <- hitters2[trainingIndices, ]
testing <- hitters2[-trainingIndices, ]
#Building LM1, a linear model, for logSalary and finding its r-squared value
LM1 <- train(logSalary~., data=training, method="lm")
predictLM1 <- predict(LM1, newdata = testing)
cor(predictLM1,testing$logSalary)^2
## [1] 0.4251033
When we run LM1 on the testing data, we get an R-squared value of 0.4251033.
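As a quick cross-check (a sketch assuming the objects above are still in memory), caret's postResample() reports RMSE, R-squared, and MAE for a set of predictions in a single call:
#Alternative way to compute the test-set performance metrics
postResample(pred = predictLM1, obs = testing$logSalary)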
#Building RIDGE1, a ridge regression model, for logSalary and finding its r-squared value
library(elasticnet)
RIDGE1 <- train(logSalary~., data=training, method="ridge", trControl=trainControl(method = "cv", number = 10), preProcess = c("center","scale"))
predictRIDGE1 = predict(RIDGE1, newdata = testing)
cor(predictRIDGE1,testing$logSalary)^2
## [1] 0.4149291
When we run RIDGE1 on the testing data, we get an R-squared value of 0.4149291.
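To see which weight-decay value the 10-fold cross-validation selected for the ridge model, we can inspect the tuning results (a quick check assuming the RIDGE1 object above):
#Inspecting the cross-validated tuning choice for the ridge model
RIDGE1$bestTune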
#Building LASSO1, a LASSO model, for logSalary and finding its r-squared value
LASSO1 <- train(logSalary~., data = training, method="lasso", trControl=trainControl(method = "cv", number = 10), preProcess = c("center","scale"))
predictLASSO1=predict(LASSO1, newdata = testing)
cor(predictLASSO1,testing$logSalary)^2
## [1] 0.4219527
When we run LASSO1 on the testing data, we get an R-squared value of 0.4219527.
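The predict.enet() call below relies on the shrinkage fraction that cross-validation selected, so it is worth confirming that value first (a quick check assuming the LASSO1 object above):
#Checking which shrinkage fraction cross-validation chose
LASSO1$bestTune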
#Using the following code to determine which variables the LASSO removes with the model
predict.enet(LASSO1$finalModel, type="coef", s=LASSO1$bestTune$fraction, mode="fraction")
## $s
## [1] 0.5
##
## $fraction
## 0
## 0.5
##
## $mode
## [1] "fraction"
##
## $coefficients
## AtBat Hits HmRun Runs RBI Walks
## -0.145554543 0.360034394 0.084101979 0.000000000 0.000000000 0.168863355
## Years CAtBat CHits CHmRun CRuns CRBI
## 0.254701624 0.000000000 0.122613367 -0.125801613 0.295830203 -0.001881440
## CWalks PutOuts Assists Errors
## -0.112281097 0.058313929 0.004313357 -0.044853352
The LASSO shrinks the coefficients of Runs, RBI, and CAtBat to zero, removing those predictors from the model.
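Rather than reading the zeros off the printout, the dropped predictors can be extracted programmatically (a sketch reusing the predict.enet() call above):
#Extracting the names of the predictors whose coefficients shrank to zero
lassoCoefs = predict.enet(LASSO1$finalModel, type="coef", s=LASSO1$bestTune$fraction, mode="fraction")$coefficients
names(lassoCoefs)[lassoCoefs == 0]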
library(AppliedPredictiveModeling)
library(caret)
data("permeability")
#Combining the predictors and response into one data frame
prints = cbind.data.frame(permeability, fingerprints)
#Using nearZeroVar to remove predictors with near-zero variance
zeroVarIndices <- nearZeroVar(prints)
prints = prints[, -zeroVarIndices]
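nearZeroVar() can also return the diagnostics behind its decisions; rerunning it with saveMetrics = TRUE on the reduced data frame verifies that none of the retained predictors still trips the filter (a sketch using the prints object above):
#Verifying that the retained predictors pass the near-zero-variance filter
nzvMetrics = nearZeroVar(prints, saveMetrics = TRUE)
sum(nzvMetrics$nzv) #Should be 0 after the removal above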
set.seed(12345)
#Partitioning the data into training and testing sets
trainingIndices = createDataPartition(prints$permeability, p=0.7, list = FALSE)
training2 = prints[trainingIndices, ]
testing2 = prints[-trainingIndices, ]
#Building LM2, a linear model, and finding its r-squared value
LM2 = train(permeability~., data = training2, method = "lm", preProcess = c("center","scale"))
predictedLM2 = predict(LM2, newdata = testing2)
cor(predictedLM2,testing2$permeability)^2
## [1] 0.01698163
When we run LM2 on the testing set, we get an R-squared value of only 0.01698163; with far more predictors than training observations, the ordinary linear model overfits badly and generalizes poorly.
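Checking the dimensions makes the problem concrete, since the data set has more predictors than training observations, which leaves ordinary least squares underdetermined (a quick check using the objects above):
#Comparing the number of training observations to the number of predictors
dim(training2)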
#Building LM2pca, a linear model with pca, and finding its r-squared value
LM2pca = train(permeability~., data = training2, method = "lm", preProcess = c("center", "scale", "pca"))
predicted2pca = predict(LM2pca, newdata = testing2)
cor(predicted2pca, testing2$permeability)^2
## [1] 0.4644431
When we run LM2pca on the testing data, we get an R-squared value of 0.4644431, a large improvement over the unregularized fit.
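Printing the preProcess object shows how many principal components caret retained (by default, enough to capture 95% of the variance); a quick check assuming the LM2pca object above:
#Seeing how many principal components the PCA preprocessing kept
LM2pca$preProcess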
library(data.table)
#Reading the training and testing sets from the website into R
xtest = fread("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/14cancer.xtest")
xtrain = fread("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/14cancer.xtrain")
ytest = fread("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/14cancer.ytest")
ytrain = fread("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/14cancer.ytrain")
#Making data frames for the training and testing data that combine the predictors and response, and converting the response to a factor variable
training <- transpose(xtrain)
testing <- transpose(xtest)
training$response = as.factor(t(ytrain))
testing$response = as.factor(t(ytest))
#Setting the seed, using parallelization while building a linear discriminant analysis model (LDA3), and making a confusion matrix
set.seed(12345)
library(parallel)
library(doParallel)
cluster <- makeCluster(detectCores() - 1)
registerDoParallel(cluster)
system.time(LDA3 <- train(response~., data=training,
                          method="lda",
                          trControl=trainControl(method="cv", number=10, allowParallel=TRUE),
                          preProcess=c("center", "scale")))
##    user  system elapsed
##  820.66   15.87  839.05
stopCluster(cluster)
registerDoSEQ()
predicted <- predict(LDA3, newdata = testing)
confusionMatrix(predicted, testing$response)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1 1 0 0 1 0 2 0 0 0 0 2 1 1 0
## 2 0 2 1 0 0 0 0 0 0 0 0 0 0 0
## 3 1 0 2 0 1 0 1 0 0 1 0 0 0 0
## 4 0 0 0 3 0 0 0 0 0 0 0 0 0 0
## 5 0 1 0 0 4 0 0 0 2 0 0 0 0 0
## 6 1 0 0 0 0 1 0 0 0 0 0 2 0 0
## 7 0 1 0 0 0 0 1 0 0 0 0 0 0 0
## 8 1 0 1 0 0 0 0 2 0 2 0 0 0 0
## 9 0 0 0 0 0 0 0 0 4 0 0 0 0 0
## 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 12 0 1 0 0 1 0 0 0 0 0 0 0 0 0
## 13 0 1 0 0 0 0 0 0 0 0 0 1 2 0
## 14 0 0 0 0 0 0 0 0 0 0 0 0 0 4
##
## Overall Statistics
##
## Accuracy : 0.5
## 95% CI : (0.3608, 0.6392)
## No Information Rate : 0.1111
## P-Value [Acc > NIR] : 1.581e-12
##
## Kappa : 0.4594
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.25000 0.33333 0.50000 0.75000 0.66667 0.33333
## Specificity 0.86000 0.97917 0.92000 1.00000 0.93750 0.94118
## Pos Pred Value 0.12500 0.66667 0.33333 1.00000 0.57143 0.25000
## Neg Pred Value 0.93478 0.92157 0.95833 0.98039 0.95745 0.96000
## Prevalence 0.07407 0.11111 0.07407 0.07407 0.11111 0.05556
## Detection Rate 0.01852 0.03704 0.03704 0.05556 0.07407 0.01852
## Detection Prevalence 0.14815 0.05556 0.11111 0.05556 0.12963 0.07407
## Balanced Accuracy 0.55500 0.65625 0.71000 0.87500 0.80208 0.63725
## Class: 7 Class: 8 Class: 9 Class: 10 Class: 11 Class: 12
## Sensitivity 0.50000 1.00000 0.66667 0.00000 0.33333 0.00000
## Specificity 0.98077 0.92308 1.00000 1.00000 1.00000 0.96000
## Pos Pred Value 0.50000 0.33333 1.00000 NaN 1.00000 0.00000
## Neg Pred Value 0.98077 1.00000 0.96000 0.94444 0.96226 0.92308
## Prevalence 0.03704 0.03704 0.11111 0.05556 0.05556 0.07407
## Detection Rate 0.01852 0.03704 0.07407 0.00000 0.01852 0.00000
## Detection Prevalence 0.03704 0.11111 0.07407 0.00000 0.01852 0.03704
## Balanced Accuracy 0.74038 0.96154 0.83333 0.50000 0.66667 0.48000
## Class: 13 Class: 14
## Sensitivity 0.66667 1.00000
## Specificity 0.96078 1.00000
## Pos Pred Value 0.50000 1.00000
## Neg Pred Value 0.98000 1.00000
## Prevalence 0.05556 0.07407
## Detection Rate 0.03704 0.07407
## Detection Prevalence 0.07407 0.07407
## Balanced Accuracy 0.81373 1.00000
#Using parallelization while building a k-nearest neighbors model (KNN3), and making a confusion matrix
cluster2 <- makeCluster(detectCores() - 1)
registerDoParallel(cluster2)
system.time(KNN3 <- train(response~., data=training,
                          method="knn",
                          trControl=trainControl(method="cv", number=10, allowParallel=TRUE),
                          preProcess=c("center","scale")))
##    user  system elapsed
##   46.61    1.08   47.81
stopCluster(cluster2)
registerDoSEQ()
predicted2 <- predict(KNN3, newdata = testing)
confusionMatrix(predicted2, testing$response)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0
## 2 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 3 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 4 0 0 2 3 1 0 0 0 0 0 0 0 0 0
## 5 0 2 0 0 4 0 0 0 0 0 0 0 0 0
## 6 3 1 0 0 0 1 0 0 0 0 1 2 0 0
## 7 0 0 0 0 0 0 1 0 1 0 0 0 0 0
## 8 1 1 0 0 0 0 0 0 1 2 1 0 0 0
## 9 0 0 0 0 0 0 0 0 4 0 0 0 0 0
## 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11 0 0 0 1 0 1 0 1 0 1 0 1 1 0
## 12 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 13 0 2 2 0 0 1 0 0 0 0 0 1 2 1
## 14 0 0 0 0 0 0 0 0 0 0 0 0 0 3
##
## Overall Statistics
##
## Accuracy : 0.3333
## 95% CI : (0.2109, 0.4747)
## No Information Rate : 0.1111
## P-Value [Acc > NIR] : 1.211e-05
##
## Kappa : 0.2845
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.00000 0.00000 0.00000 0.75000 0.66667 0.33333
## Specificity 0.96000 0.97917 0.98000 0.94000 0.95833 0.86275
## Pos Pred Value 0.00000 0.00000 0.00000 0.50000 0.66667 0.12500
## Neg Pred Value 0.92308 0.88679 0.92453 0.97917 0.95833 0.95652
## Prevalence 0.07407 0.11111 0.07407 0.07407 0.11111 0.05556
## Detection Rate 0.00000 0.00000 0.00000 0.05556 0.07407 0.01852
## Detection Prevalence 0.03704 0.01852 0.01852 0.11111 0.11111 0.14815
## Balanced Accuracy 0.48000 0.48958 0.49000 0.84500 0.81250 0.59804
## Class: 7 Class: 8 Class: 9 Class: 10 Class: 11 Class: 12
## Sensitivity 0.50000 0.00000 0.66667 0.00000 0.00000 0.00000
## Specificity 0.98077 0.88462 1.00000 1.00000 0.88235 1.00000
## Pos Pred Value 0.50000 0.00000 1.00000 NaN 0.00000 NaN
## Neg Pred Value 0.98077 0.95833 0.96000 0.94444 0.93750 0.92593
## Prevalence 0.03704 0.03704 0.11111 0.05556 0.05556 0.07407
## Detection Rate 0.01852 0.00000 0.07407 0.00000 0.00000 0.00000
## Detection Prevalence 0.03704 0.11111 0.07407 0.00000 0.11111 0.00000
## Balanced Accuracy 0.74038 0.44231 0.83333 0.50000 0.44118 0.50000
## Class: 13 Class: 14
## Sensitivity 0.66667 0.75000
## Specificity 0.86275 1.00000
## Pos Pred Value 0.22222 1.00000
## Neg Pred Value 0.97778 0.98039
## Prevalence 0.05556 0.07407
## Detection Rate 0.03704 0.05556
## Detection Prevalence 0.16667 0.05556
## Balanced Accuracy 0.76471 0.87500
The accuracy of the linear discriminant analysis classifier is 0.5, and the accuracy of the k-nearest neighbors classifier is 0.3333. Neither classifier is very effective, since even the better model (LDA3) is right only half the time. Still, LDA3 clearly outperforms KNN3, so it would be the more useful of the two.
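Because both models were tuned with 10-fold cross-validation, caret's resamples() offers a more direct comparison of their accuracy distributions than a single test split (a sketch assuming LDA3 and KNN3 are still in memory):
#Comparing the cross-validated accuracy distributions of the two classifiers
cancerResamples <- resamples(list(LDA = LDA3, KNN = KNN3))
summary(cancerResamples)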
setwd("C:/Users/joshr/Documents/Machine Learning R")
#Reading the data files into R
MNistTraining <- read.csv(file = 'mnist_train (2).csv', header = FALSE)
MNistTesting <- read.csv(file = 'mnist_test (2).csv', header = FALSE)
colnames(MNistTraining)[1] <- "ResponseVar"
colnames(MNistTesting)[1] <- "ResponseVar"
#Renaming "0" as "c0", "1" as "c1", etc in the training and testing data
MNistTraining$ResponseVar <- sub("0", "c0", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("1", "c1", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("2", "c2", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("3", "c3", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("4", "c4", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("5", "c5", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("6", "c6", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("7", "c7", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("8", "c8", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("9", "c9", MNistTraining$ResponseVar)
MNistTesting$ResponseVar <- sub("0", "c0", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("1", "c1", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("2", "c2", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("3", "c3", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("4", "c4", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("5", "c5", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("6", "c6", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("7", "c7", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("8", "c8", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("9", "c9", MNistTesting$ResponseVar)
#Changing the response variable to a factor variable
MNistTraining$ResponseVar <- as.factor(MNistTraining$ResponseVar)
MNistTesting$ResponseVar <- as.factor(MNistTesting$ResponseVar)
#Removing the predictors with low variance in the training and testing data
library(caret)
LowVarPredictors <- nearZeroVar(MNistTraining)
MNistTraining <- MNistTraining[, -LowVarPredictors]
MNistTesting <- MNistTesting[, -LowVarPredictors]
#Using parallelization while building a k-nearest neighbors model (KNN4), and making a confusion matrix
library(parallel)
library(doParallel)
cluster3 <- makeCluster(detectCores() - 1)
registerDoParallel(cluster3)
system.time(KNN4 <- train(ResponseVar~.,
                          data = MNistTraining,
                          method = "knn",
                          trControl=trainControl(method="cv", number=10, allowParallel=TRUE),
                          preProcess=c("center","scale")))
##    user  system elapsed
## 8248.96   30.25 9188.42
stopCluster(cluster3)
registerDoSEQ()
KNN4predicted <- predict(KNN4, newdata = MNistTesting)
confusionMatrix(KNN4predicted, MNistTesting$ResponseVar)
## Confusion Matrix and Statistics
##
## Reference
## Prediction c0 c1 c2 c3 c4 c5 c6 c7 c8 c9
## c0 972 0 7 0 0 3 6 0 2 3
## c1 2 1129 5 1 6 0 2 20 0 6
## c2 1 3 991 3 0 0 0 5 6 4
## c3 0 2 5 975 0 11 1 0 13 4
## c4 0 0 2 1 939 0 3 1 3 7
## c5 0 0 0 15 0 864 1 0 15 6
## c6 4 0 4 1 5 6 945 0 8 1
## c7 1 0 17 10 3 3 0 992 10 10
## c8 0 0 1 2 0 2 0 0 910 1
## c9 0 1 0 2 29 3 0 10 7 967
##
## Overall Statistics
##
## Accuracy : 0.9684
## 95% CI : (0.9648, 0.9717)
## No Information Rate : 0.1135
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9649
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: c0 Class: c1 Class: c2 Class: c3 Class: c4
## Sensitivity 0.9918 0.9947 0.9603 0.9653 0.9562
## Specificity 0.9977 0.9953 0.9975 0.9960 0.9981
## Pos Pred Value 0.9789 0.9641 0.9783 0.9644 0.9822
## Neg Pred Value 0.9991 0.9993 0.9954 0.9961 0.9952
## Prevalence 0.0980 0.1135 0.1032 0.1010 0.0982
## Detection Rate 0.0972 0.1129 0.0991 0.0975 0.0939
## Detection Prevalence 0.0993 0.1171 0.1013 0.1011 0.0956
## Balanced Accuracy 0.9948 0.9950 0.9789 0.9807 0.9772
## Class: c5 Class: c6 Class: c7 Class: c8 Class: c9
## Sensitivity 0.9686 0.9864 0.9650 0.9343 0.9584
## Specificity 0.9959 0.9968 0.9940 0.9993 0.9942
## Pos Pred Value 0.9589 0.9702 0.9484 0.9934 0.9490
## Neg Pred Value 0.9969 0.9986 0.9960 0.9930 0.9953
## Prevalence 0.0892 0.0958 0.1028 0.0974 0.1009
## Detection Rate 0.0864 0.0945 0.0992 0.0910 0.0967
## Detection Prevalence 0.0901 0.0974 0.1046 0.0916 0.1019
## Balanced Accuracy 0.9823 0.9916 0.9795 0.9668 0.9763
#Using parallelization while building a linear discriminant analysis model (LDA4), and making a confusion matrix
cluster4 <- makeCluster(detectCores() - 1)
registerDoParallel(cluster4)
system.time(LDA4 <- train(ResponseVar~., data = MNistTraining,
                          method = "lda",
                          trControl = trainControl(method = "cv", number = 10, allowParallel = TRUE),
                          preProcess = c("center", "scale")))
##    user  system elapsed
##  295.96   11.25  363.62
stopCluster(cluster4)
registerDoSEQ()
LDA4predicted <- predict(LDA4, newdata = MNistTesting)
confusionMatrix(LDA4predicted, MNistTesting$ResponseVar)
## Confusion Matrix and Statistics
##
## Reference
## Prediction c0 c1 c2 c3 c4 c5 c6 c7 c8 c9
## c0 914 0 13 7 1 11 11 4 5 7
## c1 1 1077 43 8 12 11 7 33 27 8
## c2 5 6 818 30 5 9 7 21 11 7
## c3 2 3 18 850 0 51 1 7 33 9
## c4 0 1 25 2 862 11 25 19 13 61
## c5 42 3 11 48 1 719 34 4 35 13
## c6 10 4 14 9 16 21 866 0 19 3
## c7 2 3 22 23 1 14 1 863 11 26
## c8 4 38 50 20 7 36 6 6 798 10
## c9 0 0 18 13 77 9 0 71 22 865
##
## Overall Statistics
##
## Accuracy : 0.8632
## 95% CI : (0.8563, 0.8699)
## No Information Rate : 0.1135
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8479
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: c0 Class: c1 Class: c2 Class: c3 Class: c4
## Sensitivity 0.9327 0.9489 0.7926 0.8416 0.8778
## Specificity 0.9935 0.9831 0.9887 0.9862 0.9826
## Pos Pred Value 0.9394 0.8778 0.8901 0.8727 0.8459
## Neg Pred Value 0.9927 0.9934 0.9764 0.9823 0.9866
## Prevalence 0.0980 0.1135 0.1032 0.1010 0.0982
## Detection Rate 0.0914 0.1077 0.0818 0.0850 0.0862
## Detection Prevalence 0.0973 0.1227 0.0919 0.0974 0.1019
## Balanced Accuracy 0.9631 0.9660 0.8907 0.9139 0.9302
## Class: c5 Class: c6 Class: c7 Class: c8 Class: c9
## Sensitivity 0.8061 0.9040 0.8395 0.8193 0.8573
## Specificity 0.9790 0.9894 0.9885 0.9804 0.9766
## Pos Pred Value 0.7901 0.9002 0.8934 0.8185 0.8047
## Neg Pred Value 0.9810 0.9898 0.9817 0.9805 0.9839
## Prevalence 0.0892 0.0958 0.1028 0.0974 0.1009
## Detection Rate 0.0719 0.0866 0.0863 0.0798 0.0865
## Detection Prevalence 0.0910 0.0962 0.0966 0.0975 0.1075
## Balanced Accuracy 0.8925 0.9467 0.9140 0.8998 0.9170
The accuracy for KNN4 is 0.9684 and the accuracy for LDA4 is 0.8632. Both classifiers predict the handwritten digits well, but KNN4 is clearly the stronger of the two on this task.
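For a compact side-by-side view, the overall accuracies can be pulled directly out of the confusion-matrix objects (a sketch assuming the predictions above):
#Extracting the two test-set accuracies programmatically
c(KNN4 = confusionMatrix(KNN4predicted, MNistTesting$ResponseVar)$overall["Accuracy"],
  LDA4 = confusionMatrix(LDA4predicted, MNistTesting$ResponseVar)$overall["Accuracy"])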
library(caret)
library(AppliedPredictiveModeling)
library(kernlab)
data("spam")
#Setting the seed and partitioning the data into training and testing sets
set.seed(12345)
trainingIndex = createDataPartition(spam$type, p =0.7, list = FALSE)
spamtraining= spam[trainingIndex, ]
spamtesting = spam[-trainingIndex, ]
#Building NB5, a naive Bayes classifier, and making a confusion matrix to assess the predictive capacity
NB5 = train(type~., data=spamtraining, method = "nb", trControl = trainControl(method = "cv", number =10))
predictedNB = predict(NB5, newdata = spamtesting)
confusionMatrix(predictedNB, spamtesting$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 499 31
## spam 337 512
##
## Accuracy : 0.7331
## 95% CI : (0.709, 0.7563)
## No Information Rate : 0.6062
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4913
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5969
## Specificity : 0.9429
## Pos Pred Value : 0.9415
## Neg Pred Value : 0.6031
## Prevalence : 0.6062
## Detection Rate : 0.3619
## Detection Prevalence : 0.3843
## Balanced Accuracy : 0.7699
##
## 'Positive' Class : nonspam
##
#Building NB5pca, a naive Bayes classifier with preprocessing and pca, and making a confusion matrix to assess the predictive capacity
NB5pca = train(type~., data=spamtraining, method = "nb", trControl = trainControl(method = "cv", number =10), preProcess = c("center", "scale", "pca"))
predictedNBpca = predict(NB5pca, newdata = spamtesting)
confusionMatrix(predictedNBpca, spamtesting$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 499 31
## spam 337 512
##
## Accuracy : 0.7331
## 95% CI : (0.709, 0.7563)
## No Information Rate : 0.6062
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4913
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5969
## Specificity : 0.9429
## Pos Pred Value : 0.9415
## Neg Pred Value : 0.6031
## Prevalence : 0.6062
## Detection Rate : 0.3619
## Detection Prevalence : 0.3843
## Balanced Accuracy : 0.7699
##
## 'Positive' Class : nonspam
##
Without preprocessing (NB5): Accuracy: 0.7331; Specificity: 0.9429; Sensitivity: 0.5969; Type I error rate: 337/(499+337) = 0.40311; Type II error rate: 31/(31+512) = 0.05709.
With preprocessing (NB5pca): Accuracy: 0.7331; Specificity: 0.9429; Sensitivity: 0.5969; Type I error rate: 337/(499+337) = 0.40311; Type II error rate: 31/(31+512) = 0.05709.
As we can see, the accuracy, specificity, sensitivity, Type I error rate, and Type II error rate are identical with and without preprocessing. NB5 would nevertheless be slightly preferable in practice, since its code is simpler and it gives up nothing on any of these five measures.
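These error rates can also be computed directly from the confusionMatrix object: with nonspam as the positive class, the Type I error rate is one minus sensitivity and the Type II error rate is one minus specificity (a sketch assuming the predictedNB object above):
#Computing the Type I and Type II error rates from the confusion-matrix statistics
cmNB = confusionMatrix(predictedNB, spamtesting$type)
1 - cmNB$byClass["Sensitivity"] #Type I error rate: nonspam messages classified as spam
1 - cmNB$byClass["Specificity"] #Type II error rate: spam messages classified as nonspam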