Load the data:
# Read the raw e-mails, keeping the message text as character data.
emails <- read.csv("emails.csv", stringsAsFactors = FALSE)
# Number of e-mails in the data set.
print(nrow(emails))
## [1] 5728
# Sum of the spam column.
print(sum(emails[["spam"]]))
## [1] 1368
# Length, in characters, of the longest message.
print(max(nchar(emails[["text"]])))
## [1] 43952
# Row index of the shortest message.
which.min(nchar(emails[["text"]]))
## [1] 1992
Prepare DTM and sparse DTM
library(tm)
## Warning: package 'tm' was built under R version 3.1.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.1.3
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.1.3
# Build a corpus with one document per e-mail body.
corpus <- Corpus(VectorSource(emails$text))
# Lower-case through content_transformer() so every element stays a tm text
# document. Applying bare tolower() and then repairing the corpus with a
# PlainTextDocument pass is not reliable across tm versions.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stemDocument)
# Document-term matrix of stem counts (one row per e-mail).
dtm <- DocumentTermMatrix(corpus)
# Drop terms whose sparsity exceeds 95%, i.e. terms present in fewer than
# roughly 5% of the documents.
spdtm <- removeSparseTerms(dtm, 1 - 0.05)
Create data frame emailsSparse from spdtm
# Densify the sparse matrix and expose each term as a data-frame column.
emailsSparse <- as.data.frame(as.matrix(spdtm))
# Sanitise the term names into syntactically valid R names.
names(emailsSparse) <- make.names(names(emailsSparse))
Which word stem appears most frequently across all emails?
# Column (term) with the largest total count over all e-mails.
which.max(colSums(as.matrix(emailsSparse)))
## enron
## 92
Add spam variable
# Carry the outcome label over from the raw data, row for row.
emailsSparse[["spam"]] <- emails[["spam"]]
How many word stems appear at least 5000 times in ham, and how many appear at least 1000 times in spam?
# Totals are taken over word-stem columns only. Dropping the spam label
# column up front means the result never depends on whether that column's
# own sum happens to cross the threshold.
(sum(colSums(subset(emailsSparse, spam == 0, select = -spam)) >= 5000))
## [1] 6
(sum(colSums(subset(emailsSparse, spam == 1, select = -spam)) >= 1000))
## [1] 3
Split into test and training sets
# The outcome is converted to a factor before the data are split.
emailsSparse[["spam"]] <- as.factor(emailsSparse[["spam"]])
library(caTools)
## Warning: package 'caTools' was built under R version 3.1.3
# Fixed seed so the split is reproducible.
set.seed(123)
spl <- sample.split(emailsSparse[["spam"]], SplitRatio = 0.7)
# Rows flagged TRUE form the training set; the rest form the test set.
train <- emailsSparse[spl, ]
test <- emailsSparse[!spl, ]
Train 3 different models: logistic regression, CART, and random forest:
# Logistic regression of spam on every other column.
spamLog <- glm(formula = spam ~ ., family = binomial(), data = train)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
library(rpart)
## Warning: package 'rpart' was built under R version 3.1.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.1.3
# Classification tree on the same predictors.
spamCART <- rpart(formula = spam ~ ., data = train, method = "class")
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.1.3
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# Seed set immediately before the forest is grown.
set.seed(123)
spamRF <- randomForest(spam ~ ., data = train)
Get predicted spam probabilities for the training set for each model:
# Predicted probabilities with no newdata supplied.
# NOTE(review): for the random forest this yields out-of-bag predictions
# rather than plain in-sample fits -- confirm that is what is wanted.
trainLog <- predict(spamLog, type = "response")
# Column 2 is the probability of the second class level.
trainCART <- predict(spamCART, type = "prob")[, 2L]
trainRF <- predict(spamRF, type = "prob")[, 2L]
How many of the training set predicted probabilities from spamLog are less than 0.00001?
# Count of training probabilities below the lower cut-off.
a <- sum(trainLog < 1e-5)
print(a)
## [1] 3046
How many of the training set predicted probabilities from spamLog are more than 0.99999?
# Count of training probabilities above the upper cut-off.
b <- sum(trainLog > 0.99999)
print(b)
## [1] 954
How many of the training set predicted probabilities from spamLog are between 0.00001 and 0.99999?
# Whatever is left after removing both tails lies between the cut-offs.
nrow(train) - (a + b)
## [1] 10
How many of the word stems “enron”, “hou”, “vinc”, and “kaminski” appear in the CART tree?
# Draw the fitted tree so its split variables can be read off the plot.
prp(spamCART)
What is the training set accuracy of spamLog, using a threshold of 0.5 for predictions?
# Cross-tabulate the true label (rows) against the 0.5-threshold prediction.
confmat <- table(train$spam, trainLog > 0.5)
print(confmat)
##
## FALSE TRUE
## 0 3052 0
## 1 4 954
# Accuracy: diagonal (agreeing) cells over the number of training rows.
sum(diag(confmat)) / nrow(train)
## [1] 0.9990025
What is the training set AUC of spamLog?
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.1.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.1.3
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
# Pair the logistic-regression probabilities with the true labels.
pred <- prediction(predictions = trainLog, labels = train$spam)
# ROC coordinates (true-positive rate against false-positive rate).
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
# Area under the ROC curve as a plain number.
as.numeric(slot(performance(pred, measure = "auc"), "y.values"))
## [1] 0.9999959
What is the training set accuracy of spamCART, using a threshold of 0.5 for predictions?
# Cross-tabulate the true label (rows) against the 0.5-threshold prediction.
confmat <- table(train$spam, trainCART > 0.5)
print(confmat)
##
## FALSE TRUE
## 0 2885 167
## 1 64 894
# Accuracy: diagonal (agreeing) cells over the number of training rows.
sum(diag(confmat)) / nrow(train)
## [1] 0.942394
What is the training set AUC of spamCART?
# Pair the CART probabilities with the true labels.
pred <- prediction(predictions = trainCART, labels = train$spam)
# ROC coordinates (true-positive rate against false-positive rate).
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
# Area under the ROC curve as a plain number.
as.numeric(slot(performance(pred, measure = "auc"), "y.values"))
## [1] 0.9696044
What is the training set accuracy of spamRF, using a threshold of 0.5 for predictions?
# Cross-tabulate the true label (rows) against the 0.5-threshold prediction.
confmat <- table(train$spam, trainRF > 0.5)
print(confmat)
##
## FALSE TRUE
## 0 3013 39
## 1 44 914
# Accuracy: diagonal (agreeing) cells over the number of training rows.
sum(diag(confmat)) / nrow(train)
## [1] 0.9793017
What is the training set AUC of spamRF?
# Pair the random-forest probabilities with the true labels.
pred <- prediction(predictions = trainRF, labels = train$spam)
# ROC coordinates (true-positive rate against false-positive rate).
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
# Area under the ROC curve as a plain number.
as.numeric(slot(performance(pred, measure = "auc"), "y.values"))
## [1] 0.9979116
# Spam probabilities for the held-out test rows, one vector per model.
testLog <- predict(spamLog, newdata = test, type = "response")
# Column 2 is the probability of the second class level.
testCART <- predict(spamCART, newdata = test, type = "prob")[, 2L]
testRF <- predict(spamRF, newdata = test, type = "prob")[, 2L]
#' Compute accuracy and AUC for a vector of predicted spam probabilities
#'
#' @param p Numeric vector of predicted spam probabilities, one per row of
#'   `test`.
#' @param test Data frame with a `spam` column holding the true labels
#'   (coded 0 / 1, as in the rest of this script).
#' @return Unnamed numeric vector `c(accuracy, auc)`. Accuracy uses a 0.5
#'   probability threshold; AUC is taken from ROCR's `performance()`.
computeAccuracyAuc <- function(p, test) {
  # Compare predictions with labels element-wise instead of taking diag() of
  # table(): when every prediction falls on one side of the threshold the
  # table has a single column and diag() picks the wrong cells.
  predicted_spam <- p > 0.5
  actual_spam <- as.character(test$spam) == "1"
  # Undefined comparisons (NA) count as misses, as they did with table().
  accuracy <- sum(predicted_spam == actual_spam, na.rm = TRUE) / nrow(test)
  pred <- prediction(p, test$spam)
  auc <- as.numeric(performance(pred, "auc")@y.values)
  c(accuracy, auc)
}
# Test-set accuracy and AUC for each model, in the order Log, CART, RF.
print(computeAccuracyAuc(testLog, test))
## [1] 0.9505239 0.9627517
print(computeAccuracyAuc(testCART, test))
## [1] 0.9394645 0.9631760
print(computeAccuracyAuc(testRF, test))
## [1] 0.9749709 0.9975656