knitr::opts_chunk$set(cache=TRUE)

Problem 1

library(caret)
library(ggplot2)
library(ISLR)
library(lattice)
# Fix the RNG seed so the train/test partition below is reproducible.
set.seed(12345)

# Drop the three listed columns, then keep only rows without missing values.
Hitters2 <- Hitters[, !(names(Hitters) %in% c("League", "Division", "NewLeague"))]
Hitters3 <- Hitters2[complete.cases(Hitters2), ]

# Replace Salary by its base-10 logarithm and rename the column to match.
logSalary <- log10(Hitters3[["Salary"]])
Hitters3[["Salary"]] <- logSalary
names(Hitters3)[names(Hitters3) == "Salary"] <- "logSalary"

# Partition on the response with p = 0.7; the returned row indices define the
# training set and the remaining rows form the test set.
TrainingData <- createDataPartition(
  y = Hitters3[["logSalary"]],
  p = 0.7,
  list = FALSE
)
Training <- Hitters3[TrainingData, ]
Testing <- Hitters3[-TrainingData, ]

Part A

library(ISLR)
library(caret)
# Linear regression of logSalary on every other column of Training, with
# R-squared as the metric caret maximizes.
LM1 <- train(
  logSalary ~ .,
  data = Training,
  method = "lm",
  maximize = TRUE,
  metric = "Rsquared"
)
summary(LM1)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.55219 -0.19488  0.00698  0.17778  1.21563 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.031e+00  8.509e-02  23.872   <2e-16 ***
## AtBat       -1.707e-03  6.613e-04  -2.582   0.0107 *  
## Hits         6.274e-03  2.525e-03   2.485   0.0139 *  
## HmRun        4.085e-03  6.426e-03   0.636   0.5258    
## Runs        -1.680e-03  3.034e-03  -0.554   0.5805    
## RBI          1.517e-03  2.807e-03   0.540   0.5896    
## Walks        4.629e-03  1.951e-03   2.373   0.0188 *  
## Years        1.814e-02  1.359e-02   1.335   0.1836    
## CAtBat      -1.219e-05  1.433e-04  -0.085   0.9323    
## CHits        4.766e-04  7.577e-04   0.629   0.5302    
## CHmRun       2.202e-04  1.699e-03   0.130   0.8970    
## CRuns        3.652e-04  7.663e-04   0.477   0.6343    
## CRBI        -5.492e-04  7.912e-04  -0.694   0.4886    
## CWalks      -4.340e-04  3.602e-04  -1.205   0.2299    
## PutOuts      1.804e-04  7.663e-05   2.355   0.0197 *  
## Assists      4.570e-04  2.228e-04   2.051   0.0418 *  
## Errors      -7.152e-03  4.418e-03  -1.619   0.1074    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2687 on 168 degrees of freedom
## Multiple R-squared:  0.5713, Adjusted R-squared:  0.5305 
## F-statistic: 13.99 on 16 and 168 DF,  p-value: < 2.2e-16
# Test-set R-squared: squared correlation between the predictions and the
# observed logSalary values.
Predicted_LM1 <- predict(LM1, newdata = Testing)
Rsquared_LM1 <- cor(Predicted_LM1, Testing[["logSalary"]])^2
Rsquared_LM1
## [1] 0.4518025

The Rsquared value for LM1 is .4518025, which means it is not a great predictor.

Part B

library(ISLR)
library(caret)
library(elasticnet)
library(lars)
# Ridge regression tuned by 10-fold cross-validation on R-squared; predictors
# are centered and scaled before fitting.
RIDGE1 <- train(
  logSalary ~ .,
  data = Training,
  method = "ridge",
  maximize = TRUE,
  metric = "Rsquared",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center", "scale")
)
summary(RIDGE1)
##             Length Class      Mode     
## call          4    -none-     call     
## actions      17    -none-     list     
## allset       16    -none-     numeric  
## beta.pure   272    -none-     numeric  
## vn           16    -none-     character
## mu            1    -none-     numeric  
## normx        16    -none-     numeric  
## meanx        16    -none-     numeric  
## lambda        1    -none-     numeric  
## L1norm       17    -none-     numeric  
## penalty      17    -none-     numeric  
## df           17    -none-     numeric  
## Cp           17    -none-     numeric  
## sigma2        1    -none-     numeric  
## xNames       16    -none-     character
## problemType   1    -none-     character
## tuneValue     1    data.frame list     
## obsLevels     1    -none-     logical  
## param         0    -none-     list
# Test-set R-squared for the ridge model (squared correlation of predicted
# and observed logSalary).
Predicted_RIDGE1 <- predict(RIDGE1, newdata = Testing)
Rsquared_RIDGE1 <- cor(Predicted_RIDGE1, Testing[["logSalary"]])^2
Rsquared_RIDGE1
## [1] 0.4812084

The Rsquared value for RIDGE1 is .4812084. This is slightly better than LM1, but still does not provide a strong predictor.

Part C

library(ISLR)
library(caret)
library(lars)
# Lasso regression tuned by 10-fold cross-validation on R-squared; predictors
# are centered and scaled before fitting.
LASSO1 <- train(
  logSalary ~ .,
  data = Training,
  method = "lasso",
  maximize = TRUE,
  metric = "Rsquared",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center", "scale")
)
summary(LASSO1)
##             Length Class      Mode     
## call          4    -none-     call     
## actions      19    -none-     list     
## allset       16    -none-     numeric  
## beta.pure   304    -none-     numeric  
## vn           16    -none-     character
## mu            1    -none-     numeric  
## normx        16    -none-     numeric  
## meanx        16    -none-     numeric  
## lambda        1    -none-     numeric  
## L1norm       19    -none-     numeric  
## penalty      19    -none-     numeric  
## df           19    -none-     numeric  
## Cp           19    -none-     numeric  
## sigma2        1    -none-     numeric  
## xNames       16    -none-     character
## problemType   1    -none-     character
## tuneValue     1    data.frame list     
## obsLevels     1    -none-     logical  
## param         0    -none-     list
# Test-set R-squared for the lasso model (squared correlation of predicted
# and observed logSalary).
Predicted_LASSO1 <- predict(LASSO1, newdata = Testing)
Rsquared_LASSO1 <- cor(Predicted_LASSO1, Testing[["logSalary"]])^2
Rsquared_LASSO1
## [1] 0.4686052

The Rsquared value for LASSO1 is .4686052, which is higher than LM1, but lower than RIDGE1.

Part D

library(ISLR)
library(caret)
library(elasticnet)
library(lars)
# Coefficients of the final lasso fit, evaluated at the fraction selected
# during tuning.
predict.enet(
  LASSO1$finalModel,
  type = "coef",
  s = LASSO1$bestTune$fraction,
  mode = "fraction"
)
## $s
## [1] 0.5
## 
## $fraction
##   0 
## 0.5 
## 
## $mode
## [1] "fraction"
## 
## $coefficients
##       AtBat        Hits       HmRun        Runs         RBI       Walks 
## -0.11603735  0.17393873  0.02151554  0.00000000  0.01334575  0.05446797 
##       Years      CAtBat       CHits      CHmRun       CRuns        CRBI 
##  0.05868098  0.00000000  0.19940872 -0.04250998  0.01951787  0.00000000 
##      CWalks     PutOuts     Assists      Errors 
## -0.03332302  0.04827554  0.04337863 -0.03965230

The variables removed from the LASSO1 model were Runs, CAtBat, and CRBI. All of these variables had coefficients of 0.

Problem 2

Part A

library(caret)
library(AppliedPredictiveModeling)
# Load the permeability data; the code below uses the `fingerprints` predictors
# and the `permeability` response that this makes available.
data("permeability")
combined_data <- cbind.data.frame(fingerprints, permeability)

# Screen only the predictors for near-zero variance so the response column can
# never be removed. The fingerprint columns come first in combined_data, so the
# returned positions index both objects identically.
low_variance_cols <- nearZeroVar(fingerprints, saveMetrics = FALSE)

# Negating an empty index vector would select zero columns, so only subset
# when at least one predictor was flagged.
updated_data <- if (length(low_variance_cols) > 0) {
  combined_data[, -low_variance_cols, drop = FALSE]
} else {
  combined_data
}

Part B

# Seed the RNG, then partition on the permeability response with p = 0.7.
set.seed(12345)
TrainingData_Permeability <- createDataPartition(
  y = updated_data[["permeability"]],
  p = 0.7,
  list = FALSE
)
Training_Permeability <- updated_data[TrainingData_Permeability, ]
Testing_Permeability <- updated_data[-TrainingData_Permeability, ]

Part C

# Linear regression on the centered and scaled fingerprint predictors.
LM2 <- train(
  permeability ~ .,
  data = Training_Permeability,
  method = "lm",
  maximize = TRUE,
  metric = "Rsquared",
  preProcess = c("center", "scale")
)

# Test-set R-squared (squared correlation of predicted and observed values).
Predicted_LM2 <- predict(LM2, newdata = Testing_Permeability)
Rsquared_LM2 <- cor(Predicted_LM2, Testing_Permeability[["permeability"]])^2
Rsquared_LM2
## [1] 0.2323023

Part D

# Same linear model as LM2, with a PCA step added to the pre-processing after
# centering and scaling.
LM2pca <- train(
  permeability ~ .,
  data = Training_Permeability,
  method = "lm",
  maximize = TRUE,
  metric = "Rsquared",
  preProcess = c("center", "scale", "pca")
)

# Test-set R-squared (squared correlation of predicted and observed values).
Predicted_LM2pca <- predict(LM2pca, newdata = Testing_Permeability)
Rsquared_LM2pca <- cor(
  Predicted_LM2pca,
  Testing_Permeability[["permeability"]]
)^2
Rsquared_LM2pca
## [1] 0.3688938

The Rsquared value for LM2 is 0.2323023. The Rsquared value for LM2pca is 0.3688938. Although this is an improvement from LM2, it is still low.

Problem 3

Part A

# Remove every object from the current environment, then run the garbage
# collector (whose memory-usage table is printed below).
# NOTE(review): rm(list = ls()) is discouraged in scripts because it silently
# discards earlier results; confirm nothing from Problems 1-2 is needed later.
rm(list=ls())
gc()
##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 2026637 108.3    4039297 215.8  4039297 215.8
## Vcells 3496321  26.7   10146329  77.5 10146329  77.5
library(data.table)
set.seed(12345)

# Download the 14-cancer predictors and responses (separate train/test files).
training_predictors <- fread('https://web.stanford.edu/~hastie/ElemStatLearn/datasets/14cancer.xtrain')
training_response <- fread('https://web.stanford.edu/~hastie/ElemStatLearn/datasets/14cancer.ytrain')
testing_predictors <- fread('https://web.stanford.edu/~hastie/ElemStatLearn/datasets/14cancer.xtest')
testing_response <- fread('https://web.stanford.edu/~hastie/ElemStatLearn/datasets/14cancer.ytest')

# Transpose the predictor tables so that their columns become rows.
trainingC_transposed <- as.data.frame(t(training_predictors))
testingC_transposed <- as.data.frame(t(testing_predictors))

# Transpose the responses as well and name the resulting column "Response".
training_transposed <- as.data.frame(t(training_response))
names(training_transposed)[names(training_transposed) == "V1"] <- "Response"
testing_transposed <- as.data.frame(t(testing_response))
names(testing_transposed)[names(testing_transposed) == "V1"] <- "Response"

Cancer_training_combined <- cbind.data.frame(trainingC_transposed, training_transposed)
Cancer_testing_combined <- cbind.data.frame(testingC_transposed, testing_transposed)

# Build both response factors from one shared set of levels. Converting each
# set independently would give mismatched levels whenever a class is absent
# from one of them, which breaks comparing predictions with the reference.
response_levels <- sort(unique(c(
  Cancer_training_combined$Response,
  Cancer_testing_combined$Response
)))
Cancer_training_combined$Response <- factor(
  Cancer_training_combined$Response,
  levels = response_levels
)
Cancer_testing_combined$Response <- factor(
  Cancer_testing_combined$Response,
  levels = response_levels
)

Part B

library(caret)
library(kernlab)
library(ggplot2)
# Linear discriminant analysis on centered and scaled predictors, resampled
# with 10-fold cross-validation. The resampling method is spelled "cv", the
# documented value and the spelling used by every other model in this report,
# and the data frame is passed by name.
LDA3 <- train(
  Response ~ .,
  data = Cancer_training_combined,
  method = "lda",
  maximize = TRUE,
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center", "scale")
)

# Evaluate on the held-out test set.
Predicted_LDA3 <- predict(LDA3, newdata = Cancer_testing_combined)
confusionMatrix(Predicted_LDA3, Cancer_testing_combined$Response)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 1 2 3 4 5 6 7 8 9 10 11 12 13 14
##         1  1 0 0 1 0 2 0 0 0  0  2  1  1  0
##         2  0 2 1 0 0 0 0 0 0  0  0  0  0  0
##         3  1 0 2 0 1 0 1 0 0  1  0  0  0  0
##         4  0 0 0 3 0 0 0 0 0  0  0  0  0  0
##         5  0 1 0 0 4 0 0 0 2  0  0  0  0  0
##         6  1 0 0 0 0 1 0 0 0  0  0  2  0  0
##         7  0 1 0 0 0 0 1 0 0  0  0  0  0  0
##         8  1 0 1 0 0 0 0 2 0  2  0  0  0  0
##         9  0 0 0 0 0 0 0 0 4  0  0  0  0  0
##         10 0 0 0 0 0 0 0 0 0  0  0  0  0  0
##         11 0 0 0 0 0 0 0 0 0  0  1  0  0  0
##         12 0 1 0 0 1 0 0 0 0  0  0  0  0  0
##         13 0 1 0 0 0 0 0 0 0  0  0  1  2  0
##         14 0 0 0 0 0 0 0 0 0  0  0  0  0  4
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5             
##                  95% CI : (0.3608, 0.6392)
##     No Information Rate : 0.1111          
##     P-Value [Acc > NIR] : 1.581e-12       
##                                           
##                   Kappa : 0.4594          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity           0.25000  0.33333  0.50000  0.75000  0.66667  0.33333
## Specificity           0.86000  0.97917  0.92000  1.00000  0.93750  0.94118
## Pos Pred Value        0.12500  0.66667  0.33333  1.00000  0.57143  0.25000
## Neg Pred Value        0.93478  0.92157  0.95833  0.98039  0.95745  0.96000
## Prevalence            0.07407  0.11111  0.07407  0.07407  0.11111  0.05556
## Detection Rate        0.01852  0.03704  0.03704  0.05556  0.07407  0.01852
## Detection Prevalence  0.14815  0.05556  0.11111  0.05556  0.12963  0.07407
## Balanced Accuracy     0.55500  0.65625  0.71000  0.87500  0.80208  0.63725
##                      Class: 7 Class: 8 Class: 9 Class: 10 Class: 11
## Sensitivity           0.50000  1.00000  0.66667   0.00000   0.33333
## Specificity           0.98077  0.92308  1.00000   1.00000   1.00000
## Pos Pred Value        0.50000  0.33333  1.00000       NaN   1.00000
## Neg Pred Value        0.98077  1.00000  0.96000   0.94444   0.96226
## Prevalence            0.03704  0.03704  0.11111   0.05556   0.05556
## Detection Rate        0.01852  0.03704  0.07407   0.00000   0.01852
## Detection Prevalence  0.03704  0.11111  0.07407   0.00000   0.01852
## Balanced Accuracy     0.74038  0.96154  0.83333   0.50000   0.66667
##                      Class: 12 Class: 13 Class: 14
## Sensitivity            0.00000   0.66667   1.00000
## Specificity            0.96000   0.96078   1.00000
## Pos Pred Value         0.00000   0.50000   1.00000
## Neg Pred Value         0.92308   0.98000   1.00000
## Prevalence             0.07407   0.05556   0.07407
## Detection Rate         0.00000   0.03704   0.07407
## Detection Prevalence   0.03704   0.07407   0.07407
## Balanced Accuracy      0.48000   0.81373   1.00000

Part C

library(caret)
library(kernlab)
library(ggplot2)
# k-nearest neighbours on centered and scaled predictors, tuned for accuracy
# with 10-fold cross-validation.
kNN3 <- train(
  Response ~ .,
  data = Cancer_training_combined,
  method = "knn",
  maximize = TRUE,
  metric = "Accuracy",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center", "scale")
)
summary(kNN3)
##             Length Class      Mode     
## learn           2  -none-     list     
## k               1  -none-     numeric  
## theDots         0  -none-     list     
## xNames      16063  -none-     character
## problemType     1  -none-     character
## tuneValue       1  data.frame list     
## obsLevels      14  -none-     character
## param           0  -none-     list
# Evaluate the kNN classifier on the held-out test set.
Predicted_kNN3 <- predict(kNN3, newdata = Cancer_testing_combined)
confusionMatrix(Predicted_kNN3, Cancer_testing_combined[["Response"]])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 1 2 3 4 5 6 7 8 9 10 11 12 13 14
##         1  1 0 0 0 0 0 0 1 0  1  1  0  0  0
##         2  0 0 0 0 0 0 0 0 0  0  0  0  0  0
##         3  0 0 0 0 0 0 0 0 0  0  0  0  0  0
##         4  0 0 2 2 1 0 0 0 0  0  0  0  0  0
##         5  0 2 0 0 4 0 0 0 0  0  0  0  0  0
##         6  2 1 0 0 0 3 0 0 0  0  1  2  0  0
##         7  0 0 0 0 0 0 2 0 0  0  0  0  0  0
##         8  1 1 0 1 1 0 0 0 1  2  1  0  0  0
##         9  0 0 0 0 0 0 0 0 4  0  0  0  0  0
##         10 0 0 0 0 0 0 0 0 0  0  0  0  0  0
##         11 0 0 0 1 0 0 0 1 0  0  0  1  1  1
##         12 0 0 0 0 0 0 0 0 0  0  0  0  0  0
##         13 0 2 2 0 0 0 0 0 1  0  0  1  2  0
##         14 0 0 0 0 0 0 0 0 0  0  0  0  0  3
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3889          
##                  95% CI : (0.2592, 0.5312)
##     No Information Rate : 0.1111          
##     P-Value [Acc > NIR] : 1.196e-07       
##                                           
##                   Kappa : 0.3453          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity           0.25000   0.0000  0.00000  0.50000  0.66667  1.00000
## Specificity           0.94000   1.0000  1.00000  0.94000  0.95833  0.88235
## Pos Pred Value        0.25000      NaN      NaN  0.40000  0.66667  0.33333
## Neg Pred Value        0.94000   0.8889  0.92593  0.95918  0.95833  1.00000
## Prevalence            0.07407   0.1111  0.07407  0.07407  0.11111  0.05556
## Detection Rate        0.01852   0.0000  0.00000  0.03704  0.07407  0.05556
## Detection Prevalence  0.07407   0.0000  0.00000  0.09259  0.11111  0.16667
## Balanced Accuracy     0.59500   0.5000  0.50000  0.72000  0.81250  0.94118
##                      Class: 7 Class: 8 Class: 9 Class: 10 Class: 11
## Sensitivity           1.00000  0.00000  0.66667   0.00000   0.00000
## Specificity           1.00000  0.84615  1.00000   1.00000   0.90196
## Pos Pred Value        1.00000  0.00000  1.00000       NaN   0.00000
## Neg Pred Value        1.00000  0.95652  0.96000   0.94444   0.93878
## Prevalence            0.03704  0.03704  0.11111   0.05556   0.05556
## Detection Rate        0.03704  0.00000  0.07407   0.00000   0.00000
## Detection Prevalence  0.03704  0.14815  0.07407   0.00000   0.09259
## Balanced Accuracy     1.00000  0.42308  0.83333   0.50000   0.45098
##                      Class: 12 Class: 13 Class: 14
## Sensitivity            0.00000   0.66667   0.75000
## Specificity            1.00000   0.88235   1.00000
## Pos Pred Value             NaN   0.25000   1.00000
## Neg Pred Value         0.92593   0.97826   0.98039
## Prevalence             0.07407   0.05556   0.07407
## Detection Rate         0.00000   0.03704   0.05556
## Detection Prevalence   0.00000   0.14815   0.05556
## Balanced Accuracy      0.50000   0.77451   0.87500

Part D

While both classifiers have low accuracy, LDA3 has the higher accuracy at 0.5, while kNN3 has an accuracy of 0.3889. LDA3 would be more likely to produce accurate results.

Problem 4

Part A

library(caret)
# Read the MNIST train/test files. They are read without a header row, so the
# first column (V1, the digit label) is renamed to "Response".
training4 <- read.csv("U:/Public/R/mnist_train.csv", header = FALSE)
testing4 <- read.csv("U:/Public/R/mnist_test.csv", header = FALSE)
colnames(training4)[1] <- "Response"
colnames(testing4)[1] <- "Response"

# Relabel the digits 0-9 as c0-c9 in one vectorized step and store them as
# factors with identical levels in both sets, so predictions and reference
# labels are always comparable.
digit_levels <- paste0("c", 0:9)
training4$Response <- factor(
  paste0("c", training4$Response),
  levels = digit_levels
)
testing4$Response <- factor(
  paste0("c", testing4$Response),
  levels = digit_levels
)

# Flag near-zero-variance predictors using the training data only. The
# response (column 1) is excluded from the screen so it can never be dropped;
# adding 1 converts the positions back to columns of the full data frame.
lowvar <- caret::nearZeroVar(training4[, -1]) + 1L

# Negating an empty index vector would drop every column, so only subset when
# at least one predictor was flagged.
if (length(lowvar) > 0) {
  training4 <- training4[, -lowvar]
  testing4 <- testing4[, -lowvar]
}

Part B

# Chunk options: {r 4B, warning=FALSE, message=FALSE}
library(ggplot2)
library(lattice)

# k-nearest neighbours on centered and scaled pixels, resampled with 10-fold
# cross-validation.
KNN4 <- caret::train(
  Response ~ .,
  data = training4,
  method = "knn",
  trControl = caret::trainControl(
    method = "cv",
    number = 10,
    allowParallel = TRUE
  ),
  preProcess = c("center", "scale")
)

# Evaluate on the test set. The confusion matrix is built from a
# cross-tabulation, as in Part C, so it does not depend on the reference
# labels already being stored as a factor.
predict_knn4 <- predict(KNN4, newdata = testing4)
confusionMatrix(table(predict_knn4, testing4$Response))

Part C

# Linear discriminant analysis on the digit data, resampled with 10-fold
# cross-validation.
LDA4 <- caret::train(
  Response ~ .,
  data = training4,
  method = "lda",
  trControl = caret::trainControl(method = "cv", number = 10)
)

# Evaluate on the test set via a cross-tabulation of predicted vs. true labels.
predict_lda4 <- predict(LDA4, newdata = testing4)
confusionMatrix(table(predict_lda4, testing4[["Response"]]))
## Confusion Matrix and Statistics
## 
##             
## predict_lda4   c0   c1   c2   c3   c4   c5   c6   c7   c8   c9
##           c0  914    0   13    7    1   11   11    4    5    7
##           c1    1 1077   43    8   12   11    7   33   27    8
##           c2    5    6  818   30    5    9    7   21   11    7
##           c3    2    3   18  850    0   51    1    7   33    9
##           c4    0    1   25    2  862   11   25   19   13   61
##           c5   42    3   11   48    1  719   34    4   35   13
##           c6   10    4   14    9   16   21  866    0   19    3
##           c7    2    3   22   23    1   14    1  863   11   26
##           c8    4   38   50   20    7   36    6    6  798   10
##           c9    0    0   18   13   77    9    0   71   22  865
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8632          
##                  95% CI : (0.8563, 0.8699)
##     No Information Rate : 0.1135          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8479          
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: c0 Class: c1 Class: c2 Class: c3 Class: c4
## Sensitivity             0.9327    0.9489    0.7926    0.8416    0.8778
## Specificity             0.9935    0.9831    0.9887    0.9862    0.9826
## Pos Pred Value          0.9394    0.8778    0.8901    0.8727    0.8459
## Neg Pred Value          0.9927    0.9934    0.9764    0.9823    0.9866
## Prevalence              0.0980    0.1135    0.1032    0.1010    0.0982
## Detection Rate          0.0914    0.1077    0.0818    0.0850    0.0862
## Detection Prevalence    0.0973    0.1227    0.0919    0.0974    0.1019
## Balanced Accuracy       0.9631    0.9660    0.8907    0.9139    0.9302
##                      Class: c5 Class: c6 Class: c7 Class: c8 Class: c9
## Sensitivity             0.8061    0.9040    0.8395    0.8193    0.8573
## Specificity             0.9790    0.9894    0.9885    0.9804    0.9766
## Pos Pred Value          0.7901    0.9002    0.8934    0.8185    0.8047
## Neg Pred Value          0.9810    0.9898    0.9817    0.9805    0.9839
## Prevalence              0.0892    0.0958    0.1028    0.0974    0.1009
## Detection Rate          0.0719    0.0866    0.0863    0.0798    0.0865
## Detection Prevalence    0.0910    0.0962    0.0966    0.0975    0.1075
## Balanced Accuracy       0.8925    0.9467    0.9140    0.8998    0.9170

Part D

The kNN accuracy is 0.9683 and the LDA accuracy is 0.8632. Both are fairly accurate, but the kNN is obviously more accurate.

Problem 5

Part A

library(caret)
library(kernlab)
# Seed the RNG, load the spam data, and partition on the class label (`type`)
# with p = 0.7.
set.seed(12345)
data("spam")
TrainingData_Spam <- createDataPartition(
  y = spam[["type"]],
  p = 0.7,
  list = FALSE
)
Training_Spam <- spam[TrainingData_Spam, ]
Testing_Spam <- spam[-TrainingData_Spam, ]

Part B

library(caret)
library(kernlab)
library(klaR)
library(MASS)
# Naive Bayes classifier on the raw predictors, resampled with 10-fold
# cross-validation.
NB5 <- train(
  type ~ .,
  data = Training_Spam,
  method = "nb",
  trControl = trainControl(method = "cv", number = 10)
)

# Evaluate on the held-out test set.
Predicted_NB5 <- predict(NB5, Testing_Spam)
confusionMatrix(Predicted_NB5, Testing_Spam[["type"]])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction nonspam spam
##    nonspam     476   31
##    spam        360  512
##                                           
##                Accuracy : 0.7165          
##                  95% CI : (0.6919, 0.7401)
##     No Information Rate : 0.6062          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4631          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.5694          
##             Specificity : 0.9429          
##          Pos Pred Value : 0.9389          
##          Neg Pred Value : 0.5872          
##              Prevalence : 0.6062          
##          Detection Rate : 0.3452          
##    Detection Prevalence : 0.3677          
##       Balanced Accuracy : 0.7561          
##                                           
##        'Positive' Class : nonspam         
## 

Part C

library(caret)
library(kernlab)
# Naive Bayes classifier with centering, scaling and a PCA step applied to the
# predictors, resampled with 10-fold cross-validation.
NB5pca <- train(
  type ~ .,
  data = Training_Spam,
  method = "nb",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center", "scale", "pca")
)

# Evaluate on the held-out test set.
Predicted_NB5pca <- predict(NB5pca, Testing_Spam)
confusionMatrix(Predicted_NB5pca, Testing_Spam[["type"]])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction nonspam spam
##    nonspam     734   62
##    spam        102  481
##                                           
##                Accuracy : 0.8811          
##                  95% CI : (0.8628, 0.8977)
##     No Information Rate : 0.6062          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7541          
##  Mcnemar's Test P-Value : 0.002324        
##                                           
##             Sensitivity : 0.8780          
##             Specificity : 0.8858          
##          Pos Pred Value : 0.9221          
##          Neg Pred Value : 0.8250          
##              Prevalence : 0.6062          
##          Detection Rate : 0.5323          
##    Detection Prevalence : 0.5772          
##       Balanced Accuracy : 0.8819          
##                                           
##        'Positive' Class : nonspam         
## 

Part D

For NB5, the accuracy is .7165, the sensitivity is .5694, and the specificity is .9429. For NB5pca, the accuracy is .8811, the sensitivity is .8780, and the specificity is .8858. The NB5pca model would be much better as an actual email filter since the model would send a lower percentage of nonspam emails into the spam category.