This week’s project involves training a predictive model to distinguish between spam and ham emails. The general steps can be broken down as follows: 1) cleanse the raw emails and build a labeled corpus with the tm package, 2) convert the corpus into a document-term matrix, 3) train and evaluate several models with the RTextTools package, and 4) train and evaluate additional models with the caret package.
library(tm)
library(stringr)
library(XML)
library(RTextTools)
library(ROCR)
library(dplyr)
library(ggplot2)
setwd("~/Google Drive/CUNY/git/DATA607/Week10")
#setwd("C:/Users/bhao/Google Drive/CUNY/git/DATA607/Week10")
set.seed(123)
I wrote two functions to separate the email cleansing and corpus cleansing processes.
# create function to cleanse emails by removing words that contain non-ASCII characters
cleanse_emails = function(dat) {
  dat = str_c(dat, collapse = " ")
  dat2 = unlist(strsplit(dat, split = " "))
  # iconv substitutes the marker string "dat2" wherever a non-ASCII character appears,
  # so grep returns the indices of the words containing non-ASCII characters
  dat3 = grep("dat2", iconv(dat2, "latin1", "ASCII", sub = "dat2"))
  if (length(dat3) == 0) {
    dat4 = dat2
  } else {
    dat4 = dat2[-dat3]
  }
  paste(dat4, collapse = " ")
}
cleanse_corpus = function(x) {
  x = tm_map(x, PlainTextDocument) # to make the tm package play nice
  x = tm_map(x, removeNumbers)
  x = tm_map(x, removePunctuation)
  x = tm_map(x, stripWhitespace)
  x = tm_map(x, stemDocument)
  x = tm_map(x, content_transformer(tolower)) # content_transformer required because tolower is not a tm function
  x = tm_map(x, removeWords, stopwords('en'))
  x
}
Using the tm library, I created an email corpus of documents, each of which contained the cleansed email content and its spam/ham label as metadata.
spam_path = '20021010_spam/'
# add spam to email corpus
# create initial corpus
email = readLines(str_c(spam_path, list.files(spam_path)[1]))
email = cleanse_emails(email)
email_corpus = Corpus(VectorSource(email))
email_corpus = cleanse_corpus(email_corpus)
meta(email_corpus[[1]], 'spam') = 1
for (i in 2:length(list.files(spam_path))) {
  email = readLines(str_c(spam_path, list.files(spam_path)[i]))
  email = cleanse_emails(email)
  # add metadata to record the spam classification
  if (length(email) != 0) {
    tmp_corpus = Corpus(VectorSource(email))
    tmp_corpus = cleanse_corpus(tmp_corpus)
    meta(tmp_corpus[[1]], 'spam') = 1
    email_corpus = c(email_corpus, tmp_corpus)
  }
}
easy_ham_path = '20021010_easy_ham/'
# add ham to email corpus
for (i in 1:length(list.files(easy_ham_path))) {
  email = readLines(str_c(easy_ham_path, list.files(easy_ham_path)[i]))
  email = cleanse_emails(email)
  # add metadata to record the spam classification
  if (length(email) != 0) {
    tmp_corpus = Corpus(VectorSource(email))
    tmp_corpus = cleanse_corpus(tmp_corpus)
    meta(tmp_corpus[[1]], 'spam') = 0
    email_corpus = c(email_corpus, tmp_corpus)
  }
}
hard_ham_path = '20021010_hard_ham/'
# add ham to email corpus
for (i in 1:length(list.files(hard_ham_path))) {
  email = readLines(str_c(hard_ham_path, list.files(hard_ham_path)[i]))
  email = cleanse_emails(email)
  # add metadata to record the spam classification
  if (length(email) != 0) {
    tmp_corpus = Corpus(VectorSource(email))
    tmp_corpus = cleanse_corpus(tmp_corpus)
    meta(tmp_corpus[[1]], 'spam') = 0
    email_corpus = c(email_corpus, tmp_corpus)
  }
}
incomplete final line found on '20021010_hard_ham/0231.7c6cc716ce3f3bfad7130dd3c8d7b072'
incomplete final line found on '20021010_hard_ham/0250.7c6cc716ce3f3bfad7130dd3c8d7b072'
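The three directory loops above repeat the same logic. As an aside, here is a hedged sketch of how they could be collapsed into a single helper (the function name build_corpus is mine; it reuses the cleanse_emails and cleanse_corpus functions defined earlier and was not part of the original run):
# hypothetical helper: build a labeled sub-corpus from every file in a directory (sketch, not run)
build_corpus = function(path, spam_flag) {
  corpus = NULL
  for (f in list.files(path, full.names = TRUE)) {
    email = cleanse_emails(readLines(f))
    if (length(email) != 0) {
      tmp_corpus = cleanse_corpus(Corpus(VectorSource(email)))
      meta(tmp_corpus[[1]], 'spam') = spam_flag # 1 = spam, 0 = ham
      corpus = if (is.null(corpus)) tmp_corpus else c(corpus, tmp_corpus)
    }
  }
  corpus
}
# equivalent to the three loops above:
# email_corpus = c(build_corpus(spam_path, 1),
#                  build_corpus(easy_ham_path, 0),
#                  build_corpus(hard_ham_path, 0))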
Because 1) the emails were read in the order spam, easy ham, hard ham and 2) the RTextTools container object expects documents 1 through X to serve as the training data and documents X+1 through N to serve as the test data, I had to randomize the order of the emails within the corpus before creating the container object.
# randomize corpus order
N = length(email_corpus)
rand_index = sample(1:N)
email_corpus = email_corpus[rand_index]
tm_filter(email_corpus[1:100], FUN = function(x) meta(x)[['spam']] == 1) # check that spam and ham have been randomized
<<VCorpus>>
Metadata: corpus specific: 0, document level (indexed): 0
Content: documents: 17
# create document term matrix
dtm = DocumentTermMatrix(email_corpus)
# drop sparse terms: keep only terms that appear in roughly 10 or more documents
dtm = removeSparseTerms(dtm, 1-(10/length(email_corpus)))
#inspect(dtm[100:120, 100:110])
spam_labels = unlist(meta(email_corpus, 'spam'))
With the data randomized, I created an RTextTools container, which in turn would be used to train the various models and analyze their respective results. I trained three algorithms: 1) support vector machine (SVM), 2) classification tree (TREE) and 3) maximum entropy (MAXENT). As illustrated in the confusion matrices and summary analytics below, all three models are highly accurate: each misclassified only 7 or 8 of the 660 emails in the test set.
# create container
container = create_container(dtm, labels = spam_labels,
                             trainSize = 1:round(N*0.8,0), testSize = (round(N*0.8,0)+1):N, # use 80% of data for training
                             virgin = FALSE)
# estimation procedure
svm_model = train_model(container, 'SVM')
#cross_validate(container, nfold = 5, algorithm = 'SVM')
tree_model = train_model(container, 'TREE')
#cross_validate(container, nfold = 5, algorithm = 'TREE')
maxent_model = train_model(container, 'MAXENT')
#cross_validate(container, nfold = 5, algorithm = 'MAXENT')
svm_out = classify_model(container, svm_model)
tree_out = classify_model(container, tree_model)
maxent_out = classify_model(container, maxent_model)
# evaluation
labels_out = data.frame(
  correct_label = spam_labels[(round(N*0.8,0)+1):N],
  svm = as.character(svm_out[,1]),
  tree = as.character(tree_out[,1]),
  maxent = as.character(maxent_out[,1]),
  stringsAsFactors = FALSE
)
# svm confusion table
table(actual = labels_out[,1], predicted = labels_out[,2])
      predicted
actual   0   1
     0 553   3
     1   5  99
# tree confusion table
table(actual = labels_out[,1], predicted = labels_out[,3])
      predicted
actual   0   1
     0 549   7
     1   0 104
# maxent confusion table
table(actual = labels_out[,1], predicted = labels_out[,4])
      predicted
actual   0   1
     0 554   2
     1   6  98
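To tie these tables back to the error counts cited above, per-model accuracy can be computed directly from labels_out; a minimal sketch (not part of the original output):
# per-model test-set accuracy (sketch)
sapply(labels_out[, c('svm', 'tree', 'maxent')],
       function(pred) mean(pred == labels_out$correct_label))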
# analytics
svm_analytics = create_analytics(container, svm_out)
summary(svm_analytics)
ENSEMBLE SUMMARY
n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
n >= 1 1 0.99
ALGORITHM PERFORMANCE
SVM_PRECISION SVM_RECALL SVM_FSCORE
0.980 0.970 0.975
tree_analytics = create_analytics(container, tree_out)
summary(tree_analytics)
ENSEMBLE SUMMARY
n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
n >= 1 1 0.99
ALGORITHM PERFORMANCE
TREE_PRECISION TREE_RECALL TREE_FSCORE
0.970 0.995 0.980
maxent_analytics = create_analytics(container, maxent_out)
summary(maxent_analytics)
ENSEMBLE SUMMARY
n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
n >= 1 1 0.99
ALGORITHM PERFORMANCE
MAXENTROPY_PRECISION MAXENTROPY_RECALL MAXENTROPY_FSCORE
0.985 0.970 0.975
The RTextTools models worked well: they were both fast and accurate. However, the package does not allow for much customization or fine-tuning. Initially, I did not realize that the probabilities in the classify_model objects were label-specific until Judd Andermann pointed that out to me. With his pointers, I was finally able to produce ROC curves for the RTextTools models, an example of which appears below.
That said, I had already moved on to the caret models (as outlined below), which offer more flexibility for tuning model parameters.
# convert the label-specific probabilities into the probability of spam
svm_out2 = svm_out %>% mutate(SVM_PROB2 = ifelse(SVM_LABEL == 0, 1 - SVM_PROB, SVM_PROB))
tree_out2 = tree_out %>% mutate(TREE_PROB2 = ifelse(TREE_LABEL == 0, 1 - TREE_PROB, TREE_PROB))
maxent_out2 = maxent_out %>% mutate(MAXENTROPY_PROB2 = ifelse(MAXENTROPY_LABEL == 0, 1 - MAXENTROPY_PROB, MAXENTROPY_PROB))
# create ROCR prediction object
svm_pred1 = prediction(svm_out2$SVM_PROB2, labels_out$correct_label)
tree_pred1 = prediction(tree_out2$TREE_PROB2, labels_out$correct_label)
maxent_pred1 = prediction(maxent_out2$MAXENTROPY_PROB2, labels_out$correct_label)
# create ROCR performance object
svm_perf1 = performance(svm_pred1, measure="tpr", x.measure="fpr")
tree_perf1 = performance(tree_pred1, measure="tpr", x.measure="fpr")
maxent_perf1 = performance(maxent_pred1, measure="tpr", x.measure="fpr")
plot(svm_perf1)
lines(tree_perf1@x.values[[1]], tree_perf1@y.values[[1]], col = 2)
lines(maxent_perf1@x.values[[1]], maxent_perf1@y.values[[1]], col = 3)
legend('bottomright', legend = c('svm', 'tree', 'maxent'),
       lty = 1, col = c('black', 'red', 'green'), bty = 'n', cex = .75)
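Since the ROCR prediction objects are already in hand, the area under each curve can also be extracted; a minimal sketch using ROCR's 'auc' measure (not part of the original run):
# extract the AUC for each RTextTools model (sketch)
svm_auc    = performance(svm_pred1, measure = "auc")@y.values[[1]]
tree_auc   = performance(tree_pred1, measure = "auc")@y.values[[1]]
maxent_auc = performance(maxent_pred1, measure = "auc")@y.values[[1]]
c(svm = svm_auc, tree = tree_auc, maxent = maxent_auc)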
Whereas the RTextTools package is specific to text mining, the caret package is a much more general-purpose predictive modeling package. In this particular case, training models with caret was considerably slower and more prone to error. For example, the random forest model had not finished training after more than 30 minutes, at which point I stopped it, so I am unsure whether it would have completed correctly. Training the Naive Bayes and neural net models produced errors for which I was unable to find solutions.
That said, I was able to train linear/logistic regression and support vector machine models. The beauty of the caret package is that it allows for user-friendly, automatic and user-defined model tuning during the training process:
- The glmnet model has two tuning parameters: 1) alpha (0 to 1, i.e. 100% ridge to 100% lasso) and 2) lambda (0 to Inf, the size of the penalty). The svmRadial model also has two tuning parameters: 1) sigma (0 to Inf, governing model complexity and the bias-variance trade-off) and 2) C, the cost/regularization parameter.
- By default, the caret train function selects the tuning parameters that produce the best out-of-sample results based on a user-specified metric (e.g. accuracy, ROC) and bootstrapped resampling.
- The caret trainControl object then allows the user to define the resampling method, the summary function for two-class classification, and more.
- The caret tuneLength and tuneGrid arguments allow the user to further control the set of parameter values over which the models are tuned.
# attempt modeling using caret package
library(caret)
Loading required package: lattice
#library(parallel)
#library(doParallel) # for OSX use library(doMC)
library(caTools)
# convert dtm to matrix
full_mat = as.matrix(dtm)
train_mat = full_mat[1:round(N*0.8,0),]
test_mat = full_mat[(round(N*0.8,0)+1):N,]
# caret twoClassSummary requires non-numeric classes
spam_labels = factor(spam_labels, levels = c(0, 1), labels = c('ham', 'spam'))
train_y = spam_labels[1:round(N*0.8,0)]
test_y = spam_labels[(round(N*0.8,0)+1):N]
#use_cores = detectCores()-1
#cl = makeCluster(use_cores)
#registerDoParallel(cl) # for OSX use registerDoMC(cl)
# summaryFunction = twoClassSummary allows train() function to use AUC metric to rank models;
# classProbs = TRUE allows summaryFunction to work properly
myControl = trainControl(method = 'cv', number = 5, summaryFunction = twoClassSummary, classProbs = T, verboseIter = T)
# glmnet = a generalized linear model combining lasso regression (an L1 penalty on the absolute magnitude of the
# coefficients, which shrinks some coefficients to exactly zero) and ridge regression (an L2 penalty on the
# squared magnitude of the coefficients);
# glmnet has two tuning parameters: 1) alpha (0 to 1, i.e. 100% ridge to 100% lasso) and 2) lambda (0 to Inf,
# the size of the penalty)
glm = train(x = train_mat, y = train_y, method = 'glmnet', metric = 'ROC', family = 'binomial', trControl = myControl)
Loading required package: glmnet
Loading required package: Matrix
Loading required package: foreach
foreach: simple, scalable parallel programming from Revolution Analytics
Use Revolution R for scalability, fault tolerance and more.
http://www.revolutionanalytics.com
Loaded glmnet 2.0-5
+ Fold1: alpha=0.10, lambda=0.1517
- Fold1: alpha=0.10, lambda=0.1517
+ Fold1: alpha=0.55, lambda=0.1517
- Fold1: alpha=0.55, lambda=0.1517
+ Fold1: alpha=1.00, lambda=0.1517
- Fold1: alpha=1.00, lambda=0.1517
+ Fold2: alpha=0.10, lambda=0.1517
- Fold2: alpha=0.10, lambda=0.1517
+ Fold2: alpha=0.55, lambda=0.1517
- Fold2: alpha=0.55, lambda=0.1517
+ Fold2: alpha=1.00, lambda=0.1517
- Fold2: alpha=1.00, lambda=0.1517
+ Fold3: alpha=0.10, lambda=0.1517
- Fold3: alpha=0.10, lambda=0.1517
+ Fold3: alpha=0.55, lambda=0.1517
- Fold3: alpha=0.55, lambda=0.1517
+ Fold3: alpha=1.00, lambda=0.1517
- Fold3: alpha=1.00, lambda=0.1517
+ Fold4: alpha=0.10, lambda=0.1517
- Fold4: alpha=0.10, lambda=0.1517
+ Fold4: alpha=0.55, lambda=0.1517
- Fold4: alpha=0.55, lambda=0.1517
+ Fold4: alpha=1.00, lambda=0.1517
- Fold4: alpha=1.00, lambda=0.1517
+ Fold5: alpha=0.10, lambda=0.1517
- Fold5: alpha=0.10, lambda=0.1517
+ Fold5: alpha=0.55, lambda=0.1517
- Fold5: alpha=0.55, lambda=0.1517
+ Fold5: alpha=1.00, lambda=0.1517
- Fold5: alpha=1.00, lambda=0.1517
Aggregating results
Selecting tuning parameters
Fitting alpha = 0.1, lambda = 0.048 on full training set
# tuneGrid = expand.grid(alpha = seq(0, 1, 0.1), lambda = seq(0, 0.1, 0.01)))
glm
glmnet
2642 samples
5718 predictors
2 classes: 'ham', 'spam'
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 2114, 2114, 2113, 2113, 2114
Resampling results across tuning parameters:
alpha lambda ROC Sens Spec
0.10 0.01517099 0.9996734 0.9986637 0.9874051
0.10 0.04797487 0.9997074 0.9986637 0.9798418
0.10 0.15170987 0.9991682 0.9991091 0.9547152
0.55 0.01517099 0.9986537 0.9986637 0.9748101
0.55 0.04797487 0.9969496 0.9995546 0.9546835
0.55 0.15170987 0.9868267 1.0000000 0.9244620
1.00 0.01517099 0.9985528 0.9991091 0.9723101
1.00 0.04797487 0.9977380 1.0000000 0.9320253
1.00 0.15170987 0.9575661 1.0000000 0.0025000
ROC was used to select the optimal model using the largest value.
The final values used for the model were alpha = 0.1 and lambda = 0.04797487.
plot(glm)
glm_pred = predict(glm, newdata = test_mat, type = 'raw')
colAUC(as.numeric(glm_pred), test_y, plotROC = T)
[,1]
ham vs. spam 0.9807692
confusionMatrix(glm_pred, test_y)
Confusion Matrix and Statistics
          Reference
Prediction ham spam
      ham  556    4
      spam   0  100
Accuracy : 0.9939
95% CI : (0.9846, 0.9983)
No Information Rate : 0.8424
P-Value [Acc > NIR] : <2e-16
Kappa : 0.9768
Mcnemar's Test P-Value : 0.1336
Sensitivity : 1.0000
Specificity : 0.9615
Pos Pred Value : 0.9929
Neg Pred Value : 1.0000
Prevalence : 0.8424
Detection Rate : 0.8424
Detection Prevalence : 0.8485
Balanced Accuracy : 0.9808
'Positive' Class : ham
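The commented tuneGrid line above hints at further tuning. Here is a hedged sketch of what a finer, user-defined grid search for glmnet might look like, assuming the same train_mat, train_y and myControl objects (the glm_grid and glm_tuned names are mine; this was not run as part of the analysis):
# hypothetical finer grid search over alpha and lambda (sketch, not run)
glm_grid = expand.grid(alpha = seq(0, 1, 0.1), lambda = seq(0, 0.1, 0.01))
glm_tuned = train(x = train_mat, y = train_y, method = 'glmnet', metric = 'ROC',
                  family = 'binomial', trControl = myControl, tuneGrid = glm_grid)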
# support vector machine
svm = train(x = train_mat, y = train_y, method = 'svmRadial', metric = 'ROC', family = 'binomial', trControl = myControl)
Loading required package: kernlab
Attaching package: ‘kernlab’
The following object is masked from ‘package:ggplot2’:
alpha
+ Fold1: sigma=0.0002201, C=0.25
- Fold1: sigma=0.0002201, C=0.25
+ Fold1: sigma=0.0002201, C=0.50
- Fold1: sigma=0.0002201, C=0.50
+ Fold1: sigma=0.0002201, C=1.00
- Fold1: sigma=0.0002201, C=1.00
+ Fold2: sigma=0.0002201, C=0.25
- Fold2: sigma=0.0002201, C=0.25
+ Fold2: sigma=0.0002201, C=0.50
- Fold2: sigma=0.0002201, C=0.50
+ Fold2: sigma=0.0002201, C=1.00
- Fold2: sigma=0.0002201, C=1.00
+ Fold3: sigma=0.0002201, C=0.25
- Fold3: sigma=0.0002201, C=0.25
+ Fold3: sigma=0.0002201, C=0.50
- Fold3: sigma=0.0002201, C=0.50
+ Fold3: sigma=0.0002201, C=1.00
- Fold3: sigma=0.0002201, C=1.00
+ Fold4: sigma=0.0002201, C=0.25
- Fold4: sigma=0.0002201, C=0.25
+ Fold4: sigma=0.0002201, C=0.50
- Fold4: sigma=0.0002201, C=0.50
+ Fold4: sigma=0.0002201, C=1.00
- Fold4: sigma=0.0002201, C=1.00
+ Fold5: sigma=0.0002201, C=0.25
- Fold5: sigma=0.0002201, C=0.25
+ Fold5: sigma=0.0002201, C=0.50
- Fold5: sigma=0.0002201, C=0.50
+ Fold5: sigma=0.0002201, C=1.00
- Fold5: sigma=0.0002201, C=1.00
Aggregating results
Selecting tuning parameters
Fitting sigma = 0.00022, C = 1 on full training set
svm
Support Vector Machines with Radial Basis Function Kernel
2642 samples
5718 predictors
2 classes: 'ham', 'spam'
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 2113, 2114, 2114, 2113, 2114
Resampling results across tuning parameters:
C ROC Sens Spec
0.25 0.9943622 0.9861915 0.8967089
0.50 0.9959278 0.9946548 0.8816139
1.00 0.9977465 0.9968820 0.8892089
Tuning parameter 'sigma' was held constant at a value of 0.0002200635
ROC was used to select the optimal model using the largest value.
The final values used for the model were sigma = 0.0002200635 and C = 1.
plot(svm)
svm_pred = predict(svm, newdata = test_mat, type = 'raw')
colAUC(as.numeric(svm_pred), test_y, plotROC = T)
[,1]
ham vs. spam 0.9348022
confusionMatrix(svm_pred, test_y)
Confusion Matrix and Statistics
          Reference
Prediction ham spam
      ham  553   13
      spam   3   91
Accuracy : 0.9758
95% CI : (0.9609, 0.9861)
No Information Rate : 0.8424
P-Value [Acc > NIR] : < 2e-16
Kappa : 0.905
Mcnemar's Test P-Value : 0.02445
Sensitivity : 0.9946
Specificity : 0.8750
Pos Pred Value : 0.9770
Neg Pred Value : 0.9681
Prevalence : 0.8424
Detection Rate : 0.8379
Detection Prevalence : 0.8576
Balanced Accuracy : 0.9348
'Positive' Class : ham
# naive bayes -- FAILED
# nb = train(x = train_mat, y = train_y, method = 'nb', metric = 'ROC', family = 'binomial', trControl = myControl)
# plot(nb)
# nb_pred = predict(nb, newdata = test_mat, type = 'raw')
# colAUC(as.numeric(nb_pred), test_y, plotROC = T)
# random forest -- TOO SLOW
#rf = train(x = train_mat, y = train_y, method = 'rf', metric = 'ROC', family = 'binomial', trControl = myControl)
# ranger = random forest model which has one tuning parameter mtry (2 to 100) or the number of variables to consider
# at each split; this can be controlled by adjusting the tuneLength parameter or by defining a custom tuneGrid
# -- FAILED
#rf = train(x = train_mat, y = make.names(train_y), method = 'ranger', metric = 'ROC', trControl = myControl)
# neural network -- FAILED
#nnet = train(x = train_mat, y = train_y, method = 'nnet', metric = 'ROC', family = 'binomial', trControl = myControl)
#stopCluster(cl)
#registerDoSEQ()
Using the glmnet ROC curve as an example, the model achieves an extremely high true positive rate (TPR > 95%) without incurring any false positives (FPR = 0%). Practically speaking, this means one could select a probability threshold that never misclassifies ham as spam while letting a small amount of spam through. This might be an appropriate setting for a business, where misclassifying ham as spam could cause real business damage. Conversely, for personal email, one may find it acceptable to miss a real email or two in exchange for keeping nearly all spam out of one's inbox. Either way, the ROC curve can help the modeler decide on the probability threshold appropriate for the given situation.
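As an illustration of that threshold choice, here is a hedged sketch that pulls class probabilities from the fitted glmnet model (predict with type = 'prob' relies on classProbs = TRUE, which was set in myControl; the variable names below are mine):
# sketch: list probability cutoffs that keep the false positive rate at zero
glm_prob = predict(glm, newdata = test_mat, type = 'prob') # class probabilities for ham and spam
glm_rocr_pred = prediction(glm_prob$spam, test_y) # ROCR prediction object on the spam probability
glm_rocr_perf = performance(glm_rocr_pred, measure = 'tpr', x.measure = 'fpr')
cutoffs = data.frame(cutoff = glm_rocr_perf@alpha.values[[1]],
                     fpr = glm_rocr_perf@x.values[[1]],
                     tpr = glm_rocr_perf@y.values[[1]])
head(cutoffs[cutoffs$fpr == 0, ]) # candidate thresholds that flag no ham as spam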
The caretEnsemble package then allows us to compare model results. The dotplot function plots the ROC estimate and its 95% confidence interval for each model. As illustrated below, the glmnet model outperforms the svmRadial model by a clear margin.
library(caretEnsemble)
Attaching package: ‘caretEnsemble’
The following object is masked from ‘package:ggplot2’:
autoplot
# compare models using resample
model_list = list(glm = glm, svm = svm)
resamps = resamples(model_list)
summary(resamps)
Call:
summary.resamples(object = resamps)
Models: glm, svm
Number of resamples: 5
ROC
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
glm 0.9994 0.9995 0.9998 0.9997 0.9999 1.0000 0
svm 0.9958 0.9978 0.9978 0.9977 0.9982 0.9992 0
Sens
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
glm 0.9955 0.9978 1.0000 0.9987 1 1 0
svm 0.9933 0.9933 0.9978 0.9969 1 1 0
Spec
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
glm 0.9620 0.975 0.9873 0.9798 0.9873 0.9875 0
svm 0.8354 0.875 0.8875 0.8892 0.9241 0.9241 0
dotplot(resamps, metric = 'ROC')
The glmnet model outperformed all of the other models trained with the RTextTools and caret packages: it produced only 4 errors and had the highest AUC value. It also has several other factors in its favor: