set.seed(12345)
library(caret)
library(ISLR)
hitters = Hitters
#Removing the factor variables and all records in which Salary data is missing
hitters2 = hitters[!is.na(hitters$Salary), ]
hitters2$League <- NULL
hitters2$Division <- NULL
hitters2$NewLeague <- NULL
#Replacing Salary with log(Salary) and calling it logSalary
hitters2$Salary = log(hitters2$Salary)
names(hitters2)[names(hitters2) == "Salary"] = "logSalary"
#Partitioning the data into training and testing sets
trainingIndices <- createDataPartition(hitters2$logSalary, p = 0.7, list = FALSE)
training <- hitters2[trainingIndices, ]
testing <- hitters2[-trainingIndices, ]
#Building LM1, a linear model, for logSalary and finding its r-squared value
LM1 <- train(logSalary~., data=training, method="lm")
predictLM1 <- predict(LM1, newdata = testing)
cor(predictLM1,testing$logSalary)^2
## [1] 0.4251033
When we run LM1 on the testing data, we get an R-squared value of 0.4251033.
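As a quick cross-check (a sketch assuming the objects above are still in memory), caret's postResample() reports RMSE, R-squared, and MAE for a set of predictions in a single call:
#Alternative way to compute the test-set performance metrics
postResample(pred = predictLM1, obs = testing$logSalary)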
#Building RIDGE1, a ridge regression model, for logSalary and finding its r-squared value
library(elasticnet)
RIDGE1 <- train(logSalary~., data=training, method="ridge", trControl=trainControl(method = "cv", number = 10), preProcess = c("center","scale"))
predictRIDGE1 = predict(RIDGE1, newdata = testing)
cor(predictRIDGE1,testing$logSalary)^2
## [1] 0.4149291
When we run RIDGE1 on the testing data, we get an R-squared value of 0.4149291.
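To see which weight-decay value the 10-fold cross-validation selected for the ridge model, we can inspect the tuning results (a quick check assuming the RIDGE1 object above):
#Inspecting the cross-validated tuning choice for the ridge model
RIDGE1$bestTune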
#Building LASSO1, a LASSO model, for logSalary and finding its r-squared value
LASSO1 <- train(logSalary~., data = training, method="lasso", trControl=trainControl(method = "cv", number = 10), preProcess = c("center","scale"))
predictLASSO1=predict(LASSO1, newdata = testing)
cor(predictLASSO1,testing$logSalary)^2
## [1] 0.4219527
When we run LASSO1 on the testing data, we get an R-squared value of 0.4219527.
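The predict.enet() call below relies on the shrinkage fraction that cross-validation selected, so it is worth confirming that value first (a quick check assuming the LASSO1 object above):
#Checking which shrinkage fraction cross-validation chose
LASSO1$bestTune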
#Using the following code to determine which variables the LASSO removes with the model
predict.enet(LASSO1$finalModel, type="coef", s=LASSO1$bestTune$fraction, mode="fraction")
## $s
## [1] 0.5
##
## $fraction
## 0
## 0.5
##
## $mode
## [1] "fraction"
##
## $coefficients
## AtBat Hits HmRun Runs RBI Walks
## -0.145554543 0.360034394 0.084101979 0.000000000 0.000000000 0.168863355
## Years CAtBat CHits CHmRun CRuns CRBI
## 0.254701624 0.000000000 0.122613367 -0.125801613 0.295830203 -0.001881440
## CWalks PutOuts Assists Errors
## -0.112281097 0.058313929 0.004313357 -0.044853352
The LASSO shrinks the coefficients of Runs, RBI, and CAtBat to zero, removing those predictors from the model.
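Rather than reading the zeros off the printout, the dropped predictors can be extracted programmatically (a sketch reusing the predict.enet() call above):
#Extracting the names of the predictors whose coefficients shrank to zero
lassoCoefs = predict.enet(LASSO1$finalModel, type="coef", s=LASSO1$bestTune$fraction, mode="fraction")$coefficients
names(lassoCoefs)[lassoCoefs == 0]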
library(AppliedPredictiveModeling)
library(caret)
data("permeability")
#Combining the predictors and response into one data frame
prints = cbind.data.frame(permeability, fingerprints)
#Using nearZeroVar to remove predictors with near-zero variance
zeroVarIndices <- nearZeroVar(prints)
prints = prints[, -zeroVarIndices]
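nearZeroVar() can also return the diagnostics behind its decisions; rerunning it with saveMetrics = TRUE on the reduced data frame verifies that none of the retained predictors still trips the filter (a sketch using the prints object above):
#Verifying that the retained predictors pass the near-zero-variance filter
nzvMetrics = nearZeroVar(prints, saveMetrics = TRUE)
sum(nzvMetrics$nzv) #Should be 0 after the removal above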
set.seed(12345)
#Partitioning the data into training and testing sets
trainingIndices = createDataPartition(prints$permeability, p=0.7, list = FALSE)
training2 = prints[trainingIndices, ]
testing2 = prints[-trainingIndices, ]
#Building LM2, a linear model, and finding its r-squared value
LM2 = train(permeability~., data = training2, method = "lm", preProcess = c("center","scale"))
predictedLM2 = predict(LM2, newdata = testing2)
cor(predictedLM2,testing2$permeability)^2
## [1] 0.01698163
When we run LM2 on the testing set, we get an R-squared value of only 0.01698163; with far more predictors than training observations, the ordinary linear model overfits badly and generalizes poorly.
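Checking the dimensions makes the problem concrete, since the data set has more predictors than training observations, which leaves ordinary least squares underdetermined (a quick check using the objects above):
#Comparing the number of training observations to the number of predictors
dim(training2)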
#Building LM2pca, a linear model with pca, and finding its r-squared value
LM2pca = train(permeability~., data = training2, method = "lm", preProcess = c("center", "scale", "pca"))
predicted2pca = predict(LM2pca, newdata = testing2)
cor(predicted2pca, testing2$permeability)^2
## [1] 0.4644431
When we run LM2pca on the testing data, we get an R-squared value of 0.4644431, a large improvement over the unregularized fit.
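Printing the preProcess object shows how many principal components caret retained (by default, enough to capture 95% of the variance); a quick check assuming the LM2pca object above:
#Seeing how many principal components the PCA preprocessing kept
LM2pca$preProcess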
library(data.table)
#Reading the training and testing sets from the website into R
xtest = fread("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/14cancer.xtest")
xtrain = fread("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/14cancer.xtrain")
ytest = fread("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/14cancer.ytest")
ytrain = fread("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/14cancer.ytrain")
#Making data frames for the training and testing data that combine the predictors and response, and converting the response to a factor variable
training <- transpose(xtrain)
testing <- transpose(xtest)
training$response = as.factor(t(ytrain))
testing$response = as.factor(t(ytest))
#Setting the seed, using parallelization while building a linear discriminant analysis model (LDA3), and making a confusion matrix
set.seed(12345)
library(parallel)
library(doParallel)
cluster <- makeCluster(detectCores() - 1)
registerDoParallel(cluster)
system.time(LDA3 <- train(response~., data=training,
                          method="lda",
                          trControl=trainControl(method="cv", number=10, allowParallel=TRUE),
                          preProcess=c("center", "scale")))
##    user  system elapsed
##  820.66   15.87  839.05
stopCluster(cluster)
registerDoSEQ()
predicted <- predict(LDA3, newdata = testing)
confusionMatrix(predicted, testing$response)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1 1 0 0 1 0 2 0 0 0 0 2 1 1 0
## 2 0 2 1 0 0 0 0 0 0 0 0 0 0 0
## 3 1 0 2 0 1 0 1 0 0 1 0 0 0 0
## 4 0 0 0 3 0 0 0 0 0 0 0 0 0 0
## 5 0 1 0 0 4 0 0 0 2 0 0 0 0 0
## 6 1 0 0 0 0 1 0 0 0 0 0 2 0 0
## 7 0 1 0 0 0 0 1 0 0 0 0 0 0 0
## 8 1 0 1 0 0 0 0 2 0 2 0 0 0 0
## 9 0 0 0 0 0 0 0 0 4 0 0 0 0 0
## 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 12 0 1 0 0 1 0 0 0 0 0 0 0 0 0
## 13 0 1 0 0 0 0 0 0 0 0 0 1 2 0
## 14 0 0 0 0 0 0 0 0 0 0 0 0 0 4
##
## Overall Statistics
##
## Accuracy : 0.5
## 95% CI : (0.3608, 0.6392)
## No Information Rate : 0.1111
## P-Value [Acc > NIR] : 1.581e-12
##
## Kappa : 0.4594
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.25000 0.33333 0.50000 0.75000 0.66667 0.33333
## Specificity 0.86000 0.97917 0.92000 1.00000 0.93750 0.94118
## Pos Pred Value 0.12500 0.66667 0.33333 1.00000 0.57143 0.25000
## Neg Pred Value 0.93478 0.92157 0.95833 0.98039 0.95745 0.96000
## Prevalence 0.07407 0.11111 0.07407 0.07407 0.11111 0.05556
## Detection Rate 0.01852 0.03704 0.03704 0.05556 0.07407 0.01852
## Detection Prevalence 0.14815 0.05556 0.11111 0.05556 0.12963 0.07407
## Balanced Accuracy 0.55500 0.65625 0.71000 0.87500 0.80208 0.63725
## Class: 7 Class: 8 Class: 9 Class: 10 Class: 11 Class: 12
## Sensitivity 0.50000 1.00000 0.66667 0.00000 0.33333 0.00000
## Specificity 0.98077 0.92308 1.00000 1.00000 1.00000 0.96000
## Pos Pred Value 0.50000 0.33333 1.00000 NaN 1.00000 0.00000
## Neg Pred Value 0.98077 1.00000 0.96000 0.94444 0.96226 0.92308
## Prevalence 0.03704 0.03704 0.11111 0.05556 0.05556 0.07407
## Detection Rate 0.01852 0.03704 0.07407 0.00000 0.01852 0.00000
## Detection Prevalence 0.03704 0.11111 0.07407 0.00000 0.01852 0.03704
## Balanced Accuracy 0.74038 0.96154 0.83333 0.50000 0.66667 0.48000
## Class: 13 Class: 14
## Sensitivity 0.66667 1.00000
## Specificity 0.96078 1.00000
## Pos Pred Value 0.50000 1.00000
## Neg Pred Value 0.98000 1.00000
## Prevalence 0.05556 0.07407
## Detection Rate 0.03704 0.07407
## Detection Prevalence 0.07407 0.07407
## Balanced Accuracy 0.81373 1.00000
#Using parallelization while building a k-nearest neighbors model (KNN3), and making a confusion matrix
cluster2 <- makeCluster(detectCores() - 1)
registerDoParallel(cluster2)
system.time(KNN3 <- train(response~., data=training,
                          method="knn",
                          trControl=trainControl(method="cv", number=10, allowParallel=TRUE),
                          preProcess=c("center","scale")))
##    user  system elapsed
##   46.61    1.08   47.81
stopCluster(cluster2)
registerDoSEQ()
predicted2 <- predict(KNN3, newdata = testing)
confusionMatrix(predicted2, testing$response)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0
## 2 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 3 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 4 0 0 2 3 1 0 0 0 0 0 0 0 0 0
## 5 0 2 0 0 4 0 0 0 0 0 0 0 0 0
## 6 3 1 0 0 0 1 0 0 0 0 1 2 0 0
## 7 0 0 0 0 0 0 1 0 1 0 0 0 0 0
## 8 1 1 0 0 0 0 0 0 1 2 1 0 0 0
## 9 0 0 0 0 0 0 0 0 4 0 0 0 0 0
## 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11 0 0 0 1 0 1 0 1 0 1 0 1 1 0
## 12 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 13 0 2 2 0 0 1 0 0 0 0 0 1 2 1
## 14 0 0 0 0 0 0 0 0 0 0 0 0 0 3
##
## Overall Statistics
##
## Accuracy : 0.3333
## 95% CI : (0.2109, 0.4747)
## No Information Rate : 0.1111
## P-Value [Acc > NIR] : 1.211e-05
##
## Kappa : 0.2845
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.00000 0.00000 0.00000 0.75000 0.66667 0.33333
## Specificity 0.96000 0.97917 0.98000 0.94000 0.95833 0.86275
## Pos Pred Value 0.00000 0.00000 0.00000 0.50000 0.66667 0.12500
## Neg Pred Value 0.92308 0.88679 0.92453 0.97917 0.95833 0.95652
## Prevalence 0.07407 0.11111 0.07407 0.07407 0.11111 0.05556
## Detection Rate 0.00000 0.00000 0.00000 0.05556 0.07407 0.01852
## Detection Prevalence 0.03704 0.01852 0.01852 0.11111 0.11111 0.14815
## Balanced Accuracy 0.48000 0.48958 0.49000 0.84500 0.81250 0.59804
## Class: 7 Class: 8 Class: 9 Class: 10 Class: 11 Class: 12
## Sensitivity 0.50000 0.00000 0.66667 0.00000 0.00000 0.00000
## Specificity 0.98077 0.88462 1.00000 1.00000 0.88235 1.00000
## Pos Pred Value 0.50000 0.00000 1.00000 NaN 0.00000 NaN
## Neg Pred Value 0.98077 0.95833 0.96000 0.94444 0.93750 0.92593
## Prevalence 0.03704 0.03704 0.11111 0.05556 0.05556 0.07407
## Detection Rate 0.01852 0.00000 0.07407 0.00000 0.00000 0.00000
## Detection Prevalence 0.03704 0.11111 0.07407 0.00000 0.11111 0.00000
## Balanced Accuracy 0.74038 0.44231 0.83333 0.50000 0.44118 0.50000
## Class: 13 Class: 14
## Sensitivity 0.66667 0.75000
## Specificity 0.86275 1.00000
## Pos Pred Value 0.22222 1.00000
## Neg Pred Value 0.97778 0.98039
## Prevalence 0.05556 0.07407
## Detection Rate 0.03704 0.05556
## Detection Prevalence 0.16667 0.05556
## Balanced Accuracy 0.76471 0.87500
The accuracy of the linear discriminant analysis classifier is 0.5, and the accuracy of the k-nearest neighbors classifier is 0.3333. Neither classifier is very effective, since even the better model (LDA3) is right only half the time. Still, LDA3 clearly outperforms KNN3, so it would be the more useful of the two.
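Because both models were tuned with 10-fold cross-validation, caret's resamples() offers a more direct comparison of their accuracy distributions than a single test split (a sketch assuming LDA3 and KNN3 are still in memory):
#Comparing the cross-validated accuracy distributions of the two classifiers
cancerResamples <- resamples(list(LDA = LDA3, KNN = KNN3))
summary(cancerResamples)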
setwd("C:/Users/joshr/Documents/Machine Learning R")
#Reading the data files into R
MNistTraining <- read.csv(file = 'mnist_train (2).csv', header = FALSE)
MNistTesting <- read.csv(file = 'mnist_test (2).csv', header = FALSE)
colnames(MNistTraining)[1] <- "ResponseVar"
colnames(MNistTesting)[1] <- "ResponseVar"
#Renaming "0" as "c0", "1" as "c1", etc in the training and testing data
MNistTraining$ResponseVar <- sub("0", "c0", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("1", "c1", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("2", "c2", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("3", "c3", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("4", "c4", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("5", "c5", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("6", "c6", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("7", "c7", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("8", "c8", MNistTraining$ResponseVar)
MNistTraining$ResponseVar <- sub("9", "c9", MNistTraining$ResponseVar)
MNistTesting$ResponseVar <- sub("0", "c0", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("1", "c1", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("2", "c2", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("3", "c3", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("4", "c4", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("5", "c5", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("6", "c6", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("7", "c7", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("8", "c8", MNistTesting$ResponseVar)
MNistTesting$ResponseVar <- sub("9", "c9", MNistTesting$ResponseVar)
#Changing the response variable to a factor variable
MNistTraining$ResponseVar <- as.factor(MNistTraining$ResponseVar)
MNistTesting$ResponseVar <- as.factor(MNistTesting$ResponseVar)
#Removing the predictors with low variance in the training and testing data
library(caret)
LowVarPredictors <- nearZeroVar(MNistTraining)
MNistTraining <- MNistTraining[, -LowVarPredictors]
MNistTesting <- MNistTesting[, -LowVarPredictors]
#Using parallelization while building a k-nearest neighbors model (KNN4), and making a confusion matrix
library(parallel)
library(doParallel)
cluster3 <- makeCluster(detectCores() - 1)
registerDoParallel(cluster3)
system.time(KNN4 <- train(ResponseVar~.,
                          data = MNistTraining,
                          method = "knn",
                          trControl=trainControl(method="cv", number=10, allowParallel=TRUE),
                          preProcess=c("center","scale")))
##    user  system elapsed
## 8248.96   30.25 9188.42
stopCluster(cluster3)
registerDoSEQ()
KNN4predicted <- predict(KNN4, newdata = MNistTesting)
confusionMatrix(KNN4predicted, MNistTesting$ResponseVar)
## Confusion Matrix and Statistics
##
## Reference
## Prediction c0 c1 c2 c3 c4 c5 c6 c7 c8 c9
## c0 972 0 7 0 0 3 6 0 2 3
## c1 2 1129 5 1 6 0 2 20 0 6
## c2 1 3 991 3 0 0 0 5 6 4
## c3 0 2 5 975 0 11 1 0 13 4
## c4 0 0 2 1 939 0 3 1 3 7
## c5 0 0 0 15 0 864 1 0 15 6
## c6 4 0 4 1 5 6 945 0 8 1
## c7 1 0 17 10 3 3 0 992 10 10
## c8 0 0 1 2 0 2 0 0 910 1
## c9 0 1 0 2 29 3 0 10 7 967
##
## Overall Statistics
##
## Accuracy : 0.9684
## 95% CI : (0.9648, 0.9717)
## No Information Rate : 0.1135
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9649
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: c0 Class: c1 Class: c2 Class: c3 Class: c4
## Sensitivity 0.9918 0.9947 0.9603 0.9653 0.9562
## Specificity 0.9977 0.9953 0.9975 0.9960 0.9981
## Pos Pred Value 0.9789 0.9641 0.9783 0.9644 0.9822
## Neg Pred Value 0.9991 0.9993 0.9954 0.9961 0.9952
## Prevalence 0.0980 0.1135 0.1032 0.1010 0.0982
## Detection Rate 0.0972 0.1129 0.0991 0.0975 0.0939
## Detection Prevalence 0.0993 0.1171 0.1013 0.1011 0.0956
## Balanced Accuracy 0.9948 0.9950 0.9789 0.9807 0.9772
## Class: c5 Class: c6 Class: c7 Class: c8 Class: c9
## Sensitivity 0.9686 0.9864 0.9650 0.9343 0.9584
## Specificity 0.9959 0.9968 0.9940 0.9993 0.9942
## Pos Pred Value 0.9589 0.9702 0.9484 0.9934 0.9490
## Neg Pred Value 0.9969 0.9986 0.9960 0.9930 0.9953
## Prevalence 0.0892 0.0958 0.1028 0.0974 0.1009
## Detection Rate 0.0864 0.0945 0.0992 0.0910 0.0967
## Detection Prevalence 0.0901 0.0974 0.1046 0.0916 0.1019
## Balanced Accuracy 0.9823 0.9916 0.9795 0.9668 0.9763
#Using parallelization while building a linear discriminant analysis model (LDA4), and making a confusion matrix
cluster4 <- makeCluster(detectCores() - 1)
registerDoParallel(cluster4)
system.time(LDA4 <- train(ResponseVar~., data = MNistTraining,
                          method = "lda",
                          trControl = trainControl(method = "cv", number = 10, allowParallel = TRUE),
                          preProcess = c("center", "scale")))
##    user  system elapsed
##  295.96   11.25  363.62
stopCluster(cluster4)
registerDoSEQ()
LDA4predicted <- predict(LDA4, newdata = MNistTesting)
confusionMatrix(LDA4predicted, MNistTesting$ResponseVar)
## Confusion Matrix and Statistics
##
## Reference
## Prediction c0 c1 c2 c3 c4 c5 c6 c7 c8 c9
## c0 914 0 13 7 1 11 11 4 5 7
## c1 1 1077 43 8 12 11 7 33 27 8
## c2 5 6 818 30 5 9 7 21 11 7
## c3 2 3 18 850 0 51 1 7 33 9
## c4 0 1 25 2 862 11 25 19 13 61
## c5 42 3 11 48 1 719 34 4 35 13
## c6 10 4 14 9 16 21 866 0 19 3
## c7 2 3 22 23 1 14 1 863 11 26
## c8 4 38 50 20 7 36 6 6 798 10
## c9 0 0 18 13 77 9 0 71 22 865
##
## Overall Statistics
##
## Accuracy : 0.8632
## 95% CI : (0.8563, 0.8699)
## No Information Rate : 0.1135
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8479
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: c0 Class: c1 Class: c2 Class: c3 Class: c4
## Sensitivity 0.9327 0.9489 0.7926 0.8416 0.8778
## Specificity 0.9935 0.9831 0.9887 0.9862 0.9826
## Pos Pred Value 0.9394 0.8778 0.8901 0.8727 0.8459
## Neg Pred Value 0.9927 0.9934 0.9764 0.9823 0.9866
## Prevalence 0.0980 0.1135 0.1032 0.1010 0.0982
## Detection Rate 0.0914 0.1077 0.0818 0.0850 0.0862
## Detection Prevalence 0.0973 0.1227 0.0919 0.0974 0.1019
## Balanced Accuracy 0.9631 0.9660 0.8907 0.9139 0.9302
## Class: c5 Class: c6 Class: c7 Class: c8 Class: c9
## Sensitivity 0.8061 0.9040 0.8395 0.8193 0.8573
## Specificity 0.9790 0.9894 0.9885 0.9804 0.9766
## Pos Pred Value 0.7901 0.9002 0.8934 0.8185 0.8047
## Neg Pred Value 0.9810 0.9898 0.9817 0.9805 0.9839
## Prevalence 0.0892 0.0958 0.1028 0.0974 0.1009
## Detection Rate 0.0719 0.0866 0.0863 0.0798 0.0865
## Detection Prevalence 0.0910 0.0962 0.0966 0.0975 0.1075
## Balanced Accuracy 0.8925 0.9467 0.9140 0.8998 0.9170
The accuracy for KNN4 is 0.9684 and the accuracy for LDA4 is 0.8632. Both classifiers predict the handwritten digits well, but KNN4 is clearly the stronger of the two on this task.
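For a compact side-by-side view, the overall accuracies can be pulled directly out of the confusion-matrix objects (a sketch assuming the predictions above):
#Extracting the two test-set accuracies programmatically
c(KNN4 = confusionMatrix(KNN4predicted, MNistTesting$ResponseVar)$overall["Accuracy"],
  LDA4 = confusionMatrix(LDA4predicted, MNistTesting$ResponseVar)$overall["Accuracy"])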
library(caret)
library(AppliedPredictiveModeling)
library(kernlab)
data("spam")
#Setting the seed and partitioning the data into training and testing sets
set.seed(12345)
trainingIndex = createDataPartition(spam$type, p =0.7, list = FALSE)
spamtraining= spam[trainingIndex, ]
spamtesting = spam[-trainingIndex, ]
#Building NB5, a naive Bayes classifier, and making a confusion matrix to assess the predictive capacity
NB5 = train(type~., data=spamtraining, method = "nb", trControl = trainControl(method = "cv", number =10))
predictedNB = predict(NB5, newdata = spamtesting)
confusionMatrix(predictedNB, spamtesting$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 499 31
## spam 337 512
##
## Accuracy : 0.7331
## 95% CI : (0.709, 0.7563)
## No Information Rate : 0.6062
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4913
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5969
## Specificity : 0.9429
## Pos Pred Value : 0.9415
## Neg Pred Value : 0.6031
## Prevalence : 0.6062
## Detection Rate : 0.3619
## Detection Prevalence : 0.3843
## Balanced Accuracy : 0.7699
##
## 'Positive' Class : nonspam
##
#Building NB5pca, a naive Bayes classifier with preprocessing and pca, and making a confusion matrix to assess the predictive capacity
NB5pca = train(type~., data=spamtraining, method = "nb", trControl = trainControl(method = "cv", number =10), preProcess = c("center", "scale", "pca"))
predictedNBpca = predict(NB5pca, newdata = spamtesting)
confusionMatrix(predictedNBpca, spamtesting$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 499 31
## spam 337 512
##
## Accuracy : 0.7331
## 95% CI : (0.709, 0.7563)
## No Information Rate : 0.6062
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4913
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5969
## Specificity : 0.9429
## Pos Pred Value : 0.9415
## Neg Pred Value : 0.6031
## Prevalence : 0.6062
## Detection Rate : 0.3619
## Detection Prevalence : 0.3843
## Balanced Accuracy : 0.7699
##
## 'Positive' Class : nonspam
##
Without preprocessing (NB5): Accuracy: 0.7331; Specificity: 0.9429; Sensitivity: 0.5969; Type I error rate: 337/(499+337) = 0.40311; Type II error rate: 31/(31+512) = 0.05709.
With preprocessing (NB5pca): Accuracy: 0.7331; Specificity: 0.9429; Sensitivity: 0.5969; Type I error rate: 337/(499+337) = 0.40311; Type II error rate: 31/(31+512) = 0.05709.
As we can see, the accuracy, specificity, sensitivity, Type I error rate, and Type II error rate are identical with and without preprocessing. NB5 would nevertheless be slightly preferable in practice, since its code is simpler and it gives up nothing on any of these five measures.
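These error rates can also be computed directly from the confusionMatrix object: with nonspam as the positive class, the Type I error rate is one minus sensitivity and the Type II error rate is one minus specificity (a sketch assuming the predictedNB object above):
#Computing the Type I and Type II error rates from the confusion-matrix statistics
cmNB = confusionMatrix(predictedNB, spamtesting$type)
1 - cmNB$byClass["Sensitivity"] #Type I error rate: nonspam messages classified as spam
1 - cmNB$byClass["Specificity"] #Type II error rate: spam messages classified as nonspam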