# Cache chunk results between knits.
knitr::opts_chunk$set(cache = TRUE)

library(caret)
library(ggplot2)
library(ISLR)
library(lattice)

set.seed(12345)

# Keep every Hitters column except League, Division and NewLeague, then
# discard rows that contain a missing value.
Hitters2 <- Hitters[, setdiff(names(Hitters), c("League", "Division", "NewLeague"))]
Hitters3 <- Hitters2[complete.cases(Hitters2), ]

# Replace Salary with its base-10 logarithm and rename the column to match.
logSalary <- log10(Hitters3[["Salary"]])
Hitters3[["Salary"]] <- logSalary
names(Hitters3)[names(Hitters3) == "Salary"] <- "logSalary"

# 70% of the rows go to training, the remainder to testing.
TrainingData <- createDataPartition(
  y = Hitters3$logSalary,
  p = 0.7,
  list = FALSE
)
Training <- Hitters3[TrainingData, ]
Testing <- Hitters3[-TrainingData, ]
library(ISLR)
library(caret)

# Linear regression of logSalary on every other column of Training,
# with R-squared as the performance metric reported by caret.
LM1 <- train(
  logSalary ~ .,
  data = Training,
  method = "lm",
  metric = "Rsquared",
  maximize = TRUE
)
summary(LM1)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.55219 -0.19488 0.00698 0.17778 1.21563
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.031e+00 8.509e-02 23.872 <2e-16 ***
## AtBat -1.707e-03 6.613e-04 -2.582 0.0107 *
## Hits 6.274e-03 2.525e-03 2.485 0.0139 *
## HmRun 4.085e-03 6.426e-03 0.636 0.5258
## Runs -1.680e-03 3.034e-03 -0.554 0.5805
## RBI 1.517e-03 2.807e-03 0.540 0.5896
## Walks 4.629e-03 1.951e-03 2.373 0.0188 *
## Years 1.814e-02 1.359e-02 1.335 0.1836
## CAtBat -1.219e-05 1.433e-04 -0.085 0.9323
## CHits 4.766e-04 7.577e-04 0.629 0.5302
## CHmRun 2.202e-04 1.699e-03 0.130 0.8970
## CRuns 3.652e-04 7.663e-04 0.477 0.6343
## CRBI -5.492e-04 7.912e-04 -0.694 0.4886
## CWalks -4.340e-04 3.602e-04 -1.205 0.2299
## PutOuts 1.804e-04 7.663e-05 2.355 0.0197 *
## Assists 4.570e-04 2.228e-04 2.051 0.0418 *
## Errors -7.152e-03 4.418e-03 -1.619 0.1074
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2687 on 168 degrees of freedom
## Multiple R-squared: 0.5713, Adjusted R-squared: 0.5305
## F-statistic: 13.99 on 16 and 168 DF, p-value: < 2.2e-16
# Test-set R-squared: squared correlation between predicted and observed
# logSalary on the held-out rows.
Predicted_LM1 <- predict(LM1, newdata = Testing)
Rsquared_LM1 <- cor(Predicted_LM1, Testing[["logSalary"]])^2
Rsquared_LM1
## [1] 0.4518025
The R-squared value for LM1 on the test set is 0.4518025, which means the model is not a great predictor.

## Part B
library(ISLR)
library(caret)
library(elasticnet)
library(lars)

# Ridge regression tuned by 10-fold cross-validation on R-squared;
# predictors are centered and scaled before fitting.
RIDGE1 <- train(
  logSalary ~ .,
  data = Training,
  method = "ridge",
  metric = "Rsquared",
  maximize = TRUE,
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "cv", number = 10)
)
summary(RIDGE1)
## Length Class Mode
## call 4 -none- call
## actions 17 -none- list
## allset 16 -none- numeric
## beta.pure 272 -none- numeric
## vn 16 -none- character
## mu 1 -none- numeric
## normx 16 -none- numeric
## meanx 16 -none- numeric
## lambda 1 -none- numeric
## L1norm 17 -none- numeric
## penalty 17 -none- numeric
## df 17 -none- numeric
## Cp 17 -none- numeric
## sigma2 1 -none- numeric
## xNames 16 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 1 -none- logical
## param 0 -none- list
# Test-set R-squared for the ridge model (squared correlation of predicted
# vs. observed logSalary).
Predicted_RIDGE1 <- predict(RIDGE1, newdata = Testing)
Rsquared_RIDGE1 <- cor(Predicted_RIDGE1, Testing[["logSalary"]])^2
Rsquared_RIDGE1
## [1] 0.4812084
The R-squared value for RIDGE1 on the test set is 0.4812084. This is slightly better than LM1, but the model is still not a strong predictor.

## Part C
library(ISLR)
library(caret)
library(lars)

# Lasso regression tuned by 10-fold cross-validation on R-squared;
# predictors are centered and scaled before fitting.
LASSO1 <- train(
  logSalary ~ .,
  data = Training,
  method = "lasso",
  metric = "Rsquared",
  maximize = TRUE,
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "cv", number = 10)
)
summary(LASSO1)
## Length Class Mode
## call 4 -none- call
## actions 19 -none- list
## allset 16 -none- numeric
## beta.pure 304 -none- numeric
## vn 16 -none- character
## mu 1 -none- numeric
## normx 16 -none- numeric
## meanx 16 -none- numeric
## lambda 1 -none- numeric
## L1norm 19 -none- numeric
## penalty 19 -none- numeric
## df 19 -none- numeric
## Cp 19 -none- numeric
## sigma2 1 -none- numeric
## xNames 16 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 1 -none- logical
## param 0 -none- list
# Test-set R-squared for the lasso model (squared correlation of predicted
# vs. observed logSalary).
Predicted_LASSO1 <- predict(LASSO1, newdata = Testing)
Rsquared_LASSO1 <- cor(Predicted_LASSO1, Testing[["logSalary"]])^2
Rsquared_LASSO1
## [1] 0.4686052
The R-squared value for LASSO1 on the test set is 0.4686052, which is higher than LM1 but lower than RIDGE1.

## Part D
library(ISLR)
library(caret)
library(elasticnet)
library(lars)

# Coefficients of the final lasso fit at the tuned fraction; a coefficient
# of exactly zero marks a predictor the lasso removed.
predict.enet(
  LASSO1$finalModel,
  s = LASSO1$bestTune$fraction,
  type = "coef",
  mode = "fraction"
)
## $s
## [1] 0.5
##
## $fraction
## 0
## 0.5
##
## $mode
## [1] "fraction"
##
## $coefficients
## AtBat Hits HmRun Runs RBI Walks
## -0.11603735 0.17393873 0.02151554 0.00000000 0.01334575 0.05446797
## Years CAtBat CHits CHmRun CRuns CRBI
## 0.05868098 0.00000000 0.19940872 -0.04250998 0.01951787 0.00000000
## CWalks PutOuts Assists Errors
## -0.03332302 0.04827554 0.04337863 -0.03965230
The variables removed from the LASSO1 model were Runs, CAtBat, and CRBI. All of these variables had coefficients of 0.
library(caret)
library(AppliedPredictiveModeling)

# Loads the `fingerprints` predictors and the `permeability` response.
data("permeability")
combined_data <- cbind.data.frame(fingerprints, permeability)

# Drop near-zero-variance columns. Two guards:
#  * the response column is never eligible for removal;
#  * when nothing is flagged, keep the data as is -- indexing with
#    -integer(0) would otherwise select zero columns.
nzv_cols <- setdiff(
  nearZeroVar(combined_data, saveMetrics = FALSE),
  which(names(combined_data) == "permeability")
)
updated_data <- if (length(nzv_cols) > 0) {
  combined_data[, -nzv_cols, drop = FALSE]
} else {
  combined_data
}

# 70/30 train/test split on the response.
set.seed(12345)
TrainingData_Permeability <- createDataPartition(
  y = updated_data$permeability,
  p = 0.7,
  list = FALSE
)
Training_Permeability <- updated_data[TrainingData_Permeability, ]
Testing_Permeability <- updated_data[-TrainingData_Permeability, ]

# Linear model on centered and scaled predictors.
LM2 <- train(
  permeability ~ .,
  data = Training_Permeability,
  method = "lm",
  metric = "Rsquared",
  maximize = TRUE,
  preProcess = c("center", "scale")
)

# Test-set R-squared: squared correlation of predicted vs. observed values.
Predicted_LM2 <- predict(LM2, newdata = Testing_Permeability)
Rsquared_LM2 <- cor(Predicted_LM2, Testing_Permeability$permeability)^2
Rsquared_LM2
## [1] 0.2323023
# Same linear model, but the centered and scaled predictors are replaced by
# their principal components before fitting.
LM2pca <- train(
  permeability ~ .,
  data = Training_Permeability,
  method = "lm",
  metric = "Rsquared",
  maximize = TRUE,
  preProcess = c("center", "scale", "pca")
)

# Test-set R-squared for the PCA-based model.
Predicted_LM2pca <- predict(LM2pca, newdata = Testing_Permeability)
Rsquared_LM2pca <- cor(Predicted_LM2pca, Testing_Permeability[["permeability"]])^2
Rsquared_LM2pca
## [1] 0.3688938
The R-squared value for LM2 is 0.2323023. The R-squared value for LM2pca is 0.3688938. Although this is an improvement over LM2, it is still low.
# NOTE(review): rm(list = ls()) wipes the entire workspace, not only the
# objects created above; presumably done to free memory before loading the
# large data set below -- consider removing specific objects instead.
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2026637 108.3 4039297 215.8 4039297 215.8
## Vcells 3496321 26.7 10146329 77.5 10146329 77.5
library(data.table)
set.seed(12345)

# Download the 14-cancer predictors and responses (train and test parts).
cancer_url <- "https://web.stanford.edu/~hastie/ElemStatLearn/datasets/"
training_predictors <- fread(paste0(cancer_url, "14cancer.xtrain"))
training_response <- fread(paste0(cancer_url, "14cancer.ytrain"))
testing_predictors <- fread(paste0(cancer_url, "14cancer.xtest"))
testing_response <- fread(paste0(cancer_url, "14cancer.ytest"))

# Transpose every table so that the columns of the downloaded files become
# rows of the modelling data frames.
trainingC_transposed <- as.data.frame(t(training_predictors))
testingC_transposed <- as.data.frame(t(testing_predictors))
training_transposed <- as.data.frame(t(training_response))
names(training_transposed)[names(training_transposed) == "V1"] <- "Response"
testing_transposed <- as.data.frame(t(testing_response))
names(testing_transposed)[names(testing_transposed) == "V1"] <- "Response"

# Append the response as the last column of each predictor table.
Cancer_training_combined <- cbind.data.frame(trainingC_transposed, training_transposed)
Cancer_testing_combined <- cbind.data.frame(testingC_transposed, testing_transposed)

# Encode the response as a factor. The test factor reuses the training
# levels (in the same order) so predictions and reference labels stay
# comparable even if a class is absent from the test set; labels seen only
# in the test set are appended rather than silently turned into NA.
Cancer_training_combined$Response <- as.factor(Cancer_training_combined$Response)
response_levels <- union(
  levels(Cancer_training_combined$Response),
  as.character(sort(unique(Cancer_testing_combined$Response)))
)
Cancer_testing_combined$Response <- factor(
  Cancer_testing_combined$Response,
  levels = response_levels
)
library(caret)
library(kernlab)
library(ggplot2)

# Linear discriminant analysis on centered and scaled predictors, evaluated
# with 10-fold cross-validation. The resampling method is spelled "cv" (the
# documented value) and `data`/`metric` are given explicitly so this call
# reads the same way as the kNN3 fit.
LDA3 <- train(
  Response ~ .,
  data = Cancer_training_combined,
  method = "lda",
  metric = "Accuracy",
  maximize = TRUE,
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center", "scale")
)

# Predict the held-out samples and tabulate predictions against the truth.
Predicted_LDA3 <- predict(LDA3, newdata = Cancer_testing_combined)
confusionMatrix(Predicted_LDA3, Cancer_testing_combined$Response)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1 1 0 0 1 0 2 0 0 0 0 2 1 1 0
## 2 0 2 1 0 0 0 0 0 0 0 0 0 0 0
## 3 1 0 2 0 1 0 1 0 0 1 0 0 0 0
## 4 0 0 0 3 0 0 0 0 0 0 0 0 0 0
## 5 0 1 0 0 4 0 0 0 2 0 0 0 0 0
## 6 1 0 0 0 0 1 0 0 0 0 0 2 0 0
## 7 0 1 0 0 0 0 1 0 0 0 0 0 0 0
## 8 1 0 1 0 0 0 0 2 0 2 0 0 0 0
## 9 0 0 0 0 0 0 0 0 4 0 0 0 0 0
## 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 12 0 1 0 0 1 0 0 0 0 0 0 0 0 0
## 13 0 1 0 0 0 0 0 0 0 0 0 1 2 0
## 14 0 0 0 0 0 0 0 0 0 0 0 0 0 4
##
## Overall Statistics
##
## Accuracy : 0.5
## 95% CI : (0.3608, 0.6392)
## No Information Rate : 0.1111
## P-Value [Acc > NIR] : 1.581e-12
##
## Kappa : 0.4594
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.25000 0.33333 0.50000 0.75000 0.66667 0.33333
## Specificity 0.86000 0.97917 0.92000 1.00000 0.93750 0.94118
## Pos Pred Value 0.12500 0.66667 0.33333 1.00000 0.57143 0.25000
## Neg Pred Value 0.93478 0.92157 0.95833 0.98039 0.95745 0.96000
## Prevalence 0.07407 0.11111 0.07407 0.07407 0.11111 0.05556
## Detection Rate 0.01852 0.03704 0.03704 0.05556 0.07407 0.01852
## Detection Prevalence 0.14815 0.05556 0.11111 0.05556 0.12963 0.07407
## Balanced Accuracy 0.55500 0.65625 0.71000 0.87500 0.80208 0.63725
## Class: 7 Class: 8 Class: 9 Class: 10 Class: 11
## Sensitivity 0.50000 1.00000 0.66667 0.00000 0.33333
## Specificity 0.98077 0.92308 1.00000 1.00000 1.00000
## Pos Pred Value 0.50000 0.33333 1.00000 NaN 1.00000
## Neg Pred Value 0.98077 1.00000 0.96000 0.94444 0.96226
## Prevalence 0.03704 0.03704 0.11111 0.05556 0.05556
## Detection Rate 0.01852 0.03704 0.07407 0.00000 0.01852
## Detection Prevalence 0.03704 0.11111 0.07407 0.00000 0.01852
## Balanced Accuracy 0.74038 0.96154 0.83333 0.50000 0.66667
## Class: 12 Class: 13 Class: 14
## Sensitivity 0.00000 0.66667 1.00000
## Specificity 0.96000 0.96078 1.00000
## Pos Pred Value 0.00000 0.50000 1.00000
## Neg Pred Value 0.92308 0.98000 1.00000
## Prevalence 0.07407 0.05556 0.07407
## Detection Rate 0.00000 0.03704 0.07407
## Detection Prevalence 0.03704 0.07407 0.07407
## Balanced Accuracy 0.48000 0.81373 1.00000
library(caret)
library(kernlab)
library(ggplot2)

# k-nearest-neighbours classifier on centered and scaled predictors, tuned
# for accuracy with 10-fold cross-validation.
kNN3 <- train(
  Response ~ .,
  data = Cancer_training_combined,
  method = "knn",
  metric = "Accuracy",
  maximize = TRUE,
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "cv", number = 10)
)
summary(kNN3)
## Length Class Mode
## learn 2 -none- list
## k 1 -none- numeric
## theDots 0 -none- list
## xNames 16063 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 14 -none- character
## param 0 -none- list
# Classify the held-out samples with the tuned kNN model and compare the
# predictions with the true labels.
Predicted_kNN3 <- predict(kNN3, newdata = Cancer_testing_combined)
confusionMatrix(
  data = Predicted_kNN3,
  reference = Cancer_testing_combined$Response
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1 1 0 0 0 0 0 0 1 0 1 1 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 2 2 1 0 0 0 0 0 0 0 0 0
## 5 0 2 0 0 4 0 0 0 0 0 0 0 0 0
## 6 2 1 0 0 0 3 0 0 0 0 1 2 0 0
## 7 0 0 0 0 0 0 2 0 0 0 0 0 0 0
## 8 1 1 0 1 1 0 0 0 1 2 1 0 0 0
## 9 0 0 0 0 0 0 0 0 4 0 0 0 0 0
## 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11 0 0 0 1 0 0 0 1 0 0 0 1 1 1
## 12 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 13 0 2 2 0 0 0 0 0 1 0 0 1 2 0
## 14 0 0 0 0 0 0 0 0 0 0 0 0 0 3
##
## Overall Statistics
##
## Accuracy : 0.3889
## 95% CI : (0.2592, 0.5312)
## No Information Rate : 0.1111
## P-Value [Acc > NIR] : 1.196e-07
##
## Kappa : 0.3453
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.25000 0.0000 0.00000 0.50000 0.66667 1.00000
## Specificity 0.94000 1.0000 1.00000 0.94000 0.95833 0.88235
## Pos Pred Value 0.25000 NaN NaN 0.40000 0.66667 0.33333
## Neg Pred Value 0.94000 0.8889 0.92593 0.95918 0.95833 1.00000
## Prevalence 0.07407 0.1111 0.07407 0.07407 0.11111 0.05556
## Detection Rate 0.01852 0.0000 0.00000 0.03704 0.07407 0.05556
## Detection Prevalence 0.07407 0.0000 0.00000 0.09259 0.11111 0.16667
## Balanced Accuracy 0.59500 0.5000 0.50000 0.72000 0.81250 0.94118
## Class: 7 Class: 8 Class: 9 Class: 10 Class: 11
## Sensitivity 1.00000 0.00000 0.66667 0.00000 0.00000
## Specificity 1.00000 0.84615 1.00000 1.00000 0.90196
## Pos Pred Value 1.00000 0.00000 1.00000 NaN 0.00000
## Neg Pred Value 1.00000 0.95652 0.96000 0.94444 0.93878
## Prevalence 0.03704 0.03704 0.11111 0.05556 0.05556
## Detection Rate 0.03704 0.00000 0.07407 0.00000 0.00000
## Detection Prevalence 0.03704 0.14815 0.07407 0.00000 0.09259
## Balanced Accuracy 1.00000 0.42308 0.83333 0.50000 0.45098
## Class: 12 Class: 13 Class: 14
## Sensitivity 0.00000 0.66667 0.75000
## Specificity 1.00000 0.88235 1.00000
## Pos Pred Value NaN 0.25000 1.00000
## Neg Pred Value 0.92593 0.97826 0.98039
## Prevalence 0.07407 0.05556 0.07407
## Detection Rate 0.00000 0.03704 0.05556
## Detection Prevalence 0.00000 0.14815 0.05556
## Balanced Accuracy 0.50000 0.77451 0.87500
While both classifiers have low accuracy, LDA3 has the higher accuracy at 0.5, while kNN3 has an accuracy of 0.3889. LDA3 would therefore be more likely to produce accurate results.
library(caret)

# Read the MNIST train and test files. They have no header row; the first
# column holds the digit label and the remaining columns are the predictors.
training4 <- read.csv("U:/Public/R/mnist_train.csv", header = FALSE)
testing4 <- read.csv("U:/Public/R/mnist_test.csv", header = FALSE)
colnames(training4)[1] <- "Response"
colnames(testing4)[1] <- "Response"

# Relabel the digits 0-9 as c0-c9 and store them as factors. Both sets share
# one explicit level vector, so the class order is identical in train and
# test and the response stays a factor.
digit_levels <- paste0("c", 0:9)
training4$Response <- factor(paste0("c", training4$Response), levels = digit_levels)
testing4$Response <- factor(paste0("c", testing4$Response), levels = digit_levels)

# Remove near-zero-variance predictors, as identified on the training set,
# from both sets. The response (column 1) is never removed, and nothing is
# dropped when no column is flagged -- indexing with -integer(0) would
# otherwise select zero columns.
lowvar <- setdiff(caret::nearZeroVar(training4), 1L)
if (length(lowvar) > 0) {
  training4 <- training4[, -lowvar]
  testing4 <- testing4[, -lowvar]
}
# Chunk 4B (knitr options: warning=FALSE, message=FALSE)
library(ggplot2)
library(lattice)

# k-nearest-neighbours classifier on centered and scaled predictors,
# evaluated with 10-fold cross-validation.
KNN4 <- caret::train(
  Response ~ .,
  data = training4,
  method = "knn",
  trControl = caret::trainControl(method = "cv", number = 10, allowParallel = TRUE),
  preProcess = c("center", "scale")
)

# Predict the test digits. The reference labels are coerced to a factor with
# the same levels as the predictions, which confusionMatrix() requires.
predict_knn4 <- predict(KNN4, newdata = testing4)
confusionMatrix(
  predict_knn4,
  factor(testing4$Response, levels = levels(predict_knn4))
)
# Linear discriminant analysis on the digit data, evaluated with 10-fold
# cross-validation.
LDA4 <- caret::train(
  Response ~ .,
  data = training4,
  method = "lda",
  trControl = caret::trainControl(method = "cv", number = 10)
)

# Cross-tabulate predicted against true test labels and summarise.
predict_lda4 <- predict(LDA4, newdata = testing4)
confusionMatrix(table(predict_lda4, testing4$Response))
## Confusion Matrix and Statistics
##
##
## predict_lda4 c0 c1 c2 c3 c4 c5 c6 c7 c8 c9
## c0 914 0 13 7 1 11 11 4 5 7
## c1 1 1077 43 8 12 11 7 33 27 8
## c2 5 6 818 30 5 9 7 21 11 7
## c3 2 3 18 850 0 51 1 7 33 9
## c4 0 1 25 2 862 11 25 19 13 61
## c5 42 3 11 48 1 719 34 4 35 13
## c6 10 4 14 9 16 21 866 0 19 3
## c7 2 3 22 23 1 14 1 863 11 26
## c8 4 38 50 20 7 36 6 6 798 10
## c9 0 0 18 13 77 9 0 71 22 865
##
## Overall Statistics
##
## Accuracy : 0.8632
## 95% CI : (0.8563, 0.8699)
## No Information Rate : 0.1135
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8479
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: c0 Class: c1 Class: c2 Class: c3 Class: c4
## Sensitivity 0.9327 0.9489 0.7926 0.8416 0.8778
## Specificity 0.9935 0.9831 0.9887 0.9862 0.9826
## Pos Pred Value 0.9394 0.8778 0.8901 0.8727 0.8459
## Neg Pred Value 0.9927 0.9934 0.9764 0.9823 0.9866
## Prevalence 0.0980 0.1135 0.1032 0.1010 0.0982
## Detection Rate 0.0914 0.1077 0.0818 0.0850 0.0862
## Detection Prevalence 0.0973 0.1227 0.0919 0.0974 0.1019
## Balanced Accuracy 0.9631 0.9660 0.8907 0.9139 0.9302
## Class: c5 Class: c6 Class: c7 Class: c8 Class: c9
## Sensitivity 0.8061 0.9040 0.8395 0.8193 0.8573
## Specificity 0.9790 0.9894 0.9885 0.9804 0.9766
## Pos Pred Value 0.7901 0.9002 0.8934 0.8185 0.8047
## Neg Pred Value 0.9810 0.9898 0.9817 0.9805 0.9839
## Prevalence 0.0892 0.0958 0.1028 0.0974 0.1009
## Detection Rate 0.0719 0.0866 0.0863 0.0798 0.0865
## Detection Prevalence 0.0910 0.0962 0.0966 0.0975 0.1075
## Balanced Accuracy 0.8925 0.9467 0.9140 0.8998 0.9170
The kNN accuracy is 0.9683 and the LDA accuracy is 0.8632. Both models are fairly accurate, but kNN is clearly the more accurate of the two.
library(caret)
library(kernlab)

set.seed(12345)
data("spam")

# 70/30 train/test split on the `type` column.
TrainingData_Spam <- createDataPartition(
  y = spam$type,
  p = 0.7,
  list = FALSE
)
Training_Spam <- spam[TrainingData_Spam, ]
Testing_Spam <- spam[-TrainingData_Spam, ]
library(caret)
library(kernlab)
library(klaR)
library(MASS)

# Naive Bayes classifier on the raw predictors, evaluated with 10-fold
# cross-validation.
NB5 <- train(
  type ~ .,
  data = Training_Spam,
  method = "nb",
  trControl = trainControl(method = "cv", number = 10)
)

# Classify the held-out e-mails and compare with the true labels.
Predicted_NB5 <- predict(NB5, newdata = Testing_Spam)
confusionMatrix(Predicted_NB5, Testing_Spam$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 476 31
## spam 360 512
##
## Accuracy : 0.7165
## 95% CI : (0.6919, 0.7401)
## No Information Rate : 0.6062
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4631
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5694
## Specificity : 0.9429
## Pos Pred Value : 0.9389
## Neg Pred Value : 0.5872
## Prevalence : 0.6062
## Detection Rate : 0.3452
## Detection Prevalence : 0.3677
## Balanced Accuracy : 0.7561
##
## 'Positive' Class : nonspam
##
library(caret)
library(kernlab)

# Naive Bayes classifier on principal components of the centered and scaled
# predictors, evaluated with 10-fold cross-validation.
NB5pca <- train(
  type ~ .,
  data = Training_Spam,
  method = "nb",
  preProcess = c("center", "scale", "pca"),
  trControl = trainControl(method = "cv", number = 10)
)

# Classify the held-out e-mails and compare with the true labels.
Predicted_NB5pca <- predict(NB5pca, newdata = Testing_Spam)
confusionMatrix(Predicted_NB5pca, Testing_Spam$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 734 62
## spam 102 481
##
## Accuracy : 0.8811
## 95% CI : (0.8628, 0.8977)
## No Information Rate : 0.6062
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7541
## Mcnemar's Test P-Value : 0.002324
##
## Sensitivity : 0.8780
## Specificity : 0.8858
## Pos Pred Value : 0.9221
## Neg Pred Value : 0.8250
## Prevalence : 0.6062
## Detection Rate : 0.5323
## Detection Prevalence : 0.5772
## Balanced Accuracy : 0.8819
##
## 'Positive' Class : nonspam
##
For NB5, the accuracy is .7165, the sensitivity is .5694, and the specificity is .9429. For NB5pca, the accuracy is .8811, the sensitivity is .8780, and the specificity is .8858. The NB5pca model would be much better as an actual email filter since the model would send a lower percentage of nonspam emails into the spam category.