The Analytics Edge - Text

Loading the dataset

Load the data:

# Read the raw emails; text is kept as character rather than converted to factor.
emails <- read.csv(file = "emails.csv", stringsAsFactors = FALSE)
# Number of rows (emails) in the dataset.
nrow(emails)

## [1] 5728

# Sum of the spam column.
sum(emails$spam)

## [1] 1368

# Character count of every email text, reused for both questions below.
text_lengths <- nchar(emails$text)
# Length of the longest email.
max(text_lengths)

## [1] 43952

# Row index of the shortest email.
which.min(text_lengths)

## [1] 1992

Preparing Corpus

Prepare DTM and sparse DTM

library(tm)

## Warning: package 'tm' was built under R version 3.1.3

## Loading required package: NLP

## Warning: package 'NLP' was built under R version 3.1.3

library(SnowballC)

## Warning: package 'SnowballC' was built under R version 3.1.3

# Build the corpus explicitly as a VCorpus so that every tm_map() step below
# receives and returns text documents.
corpus <- VCorpus(VectorSource(emails$text))
# tolower() is a plain character function; content_transformer() wraps it so
# the result stays a document instead of decaying to a bare character vector.
# This replaces the tolower + PlainTextDocument two-step.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stemDocument)
# Document-term matrix built from the cleaned, stemmed corpus.
dtm <- DocumentTermMatrix(corpus)
# Remove terms whose sparsity exceeds the allowed maximum of 0.95.
spdtm <- removeSparseTerms(dtm, sparse = 1 - 0.05)

Create data frame emailsSparse from spdtm

# Densify the sparse document-term matrix, then wrap it in a data frame.
spdtm_matrix <- as.matrix(spdtm)
emailsSparse <- as.data.frame(spdtm_matrix)
# Rewrite the column names as syntactically valid R names.
names(emailsSparse) <- make.names(names(emailsSparse))

Which word stem appears most frequently across all emails?

# Column totals of emailsSparse; report the column with the largest total.
term_totals <- colSums(emailsSparse)
which.max(term_totals)

## enron 
##    92

Add spam variable

# Copy the spam column from the raw emails onto the term-count data frame.
emailsSparse[["spam"]] <- emails$spam

How many word stems appear at least 5000 times in the ham emails, and how many appear at least 1000 times in the spam emails?

# Select the term columns only. The spam label is not a word stem, so it is
# excluded explicitly instead of subtracting 1 from the count afterwards; that
# correction is only valid when the label column's own sum reaches the threshold.
is_term <- names(emailsSparse) != "spam"

# Stems occurring at least 5000 times in ham (spam == 0) emails.
# which() drops NA labels, matching what subset() does.
ham_rows <- which(emailsSparse$spam == 0)
sum(colSums(emailsSparse[ham_rows, is_term, drop = FALSE]) >= 5000)

## [1] 6

# Stems occurring at least 1000 times in spam (spam == 1) emails.
spam_rows <- which(emailsSparse$spam == 1)
sum(colSums(emailsSparse[spam_rows, is_term, drop = FALSE]) >= 1000)

## [1] 3

Building Machine Learning Models

Split into test and training sets

# Convert the label column to a factor before splitting and model fitting.
emailsSparse[["spam"]] <- as.factor(emailsSparse[["spam"]])

library(caTools)

## Warning: package 'caTools' was built under R version 3.1.3

# Fix the RNG seed so the split is reproducible.
set.seed(123)
# Logical vector from caTools::sample.split with SplitRatio 0.7;
# TRUE marks training rows.
spl <- sample.split(emailsSparse$spam, SplitRatio = 0.7)
train <- emailsSparse[spl, ]
test <- emailsSparse[!spl, ]

Train 3 different models: logistic regression, CART, and random forest:

# Logistic regression of spam on every other column of the training set.
spamLog <- glm(spam ~ ., data = train, family = binomial)

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

library(rpart)

## Warning: package 'rpart' was built under R version 3.1.3

library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 3.1.3

# Classification tree of spam on every other column of the training set.
spamCART <- rpart(spam ~ ., data = train, method = "class")

library(randomForest)

## Warning: package 'randomForest' was built under R version 3.1.3

## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.

# Random forests are randomized, so seed the RNG immediately before fitting.
set.seed(123)
spamRF <- randomForest(spam ~ ., data = train)

Get predicted spam probabilities for the training set for each model:

# No newdata is supplied, so each predict() call uses the data stored with the
# fitted model. For CART and the random forest, keep the probability column
# for class "1" (the second column).
trainLog <- predict(spamLog, type = "response")
trainCART <- predict(spamCART)[, "1"]
trainRF <- predict(spamRF, type = "prob")[, "1"]

How many of the training set predicted probabilities from spamLog are less than 0.00001?

# Count of training predictions below 0.00001.
a <- sum(trainLog < 1e-5)
a

## [1] 3046

How many of the training set predicted probabilities from spamLog are more than 0.99999?

# Count of training predictions above 0.99999.
b <- sum(trainLog > 0.99999)
b

## [1] 954

How many of the training set predicted probabilities from spamLog are between 0.00001 and 0.99999?

# Whatever is neither in the low tail (a) nor in the high tail (b).
nrow(train) - (a + b)

## [1] 10

How many of the word stems “enron”, “hou”, “vinc”, and “kaminski” appear in the CART tree?

# Plot the fitted CART tree so its split variables can be read off the figure.
prp(spamCART)

What is the training set accuracy of spamLog, using a threshold of 0.5 for predictions?

# Actual labels (rows) against predictions thresholded at 0.5 (columns).
confmat <- table(train$spam, trainLog > 0.5)
confmat

##    
##     FALSE TRUE
##   0  3052    0
##   1     4  954

# Accuracy: diagonal (correctly classified) count over the training-set size.
n_correct <- sum(diag(confmat))
n_correct / nrow(train)

## [1] 0.9990025

What is the training set AUC of spamLog?

library(ROCR)

## Warning: package 'ROCR' was built under R version 3.1.3

## Loading required package: gplots

## Warning: package 'gplots' was built under R version 3.1.3

## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess

# ROCR prediction object pairing the predicted probabilities with the labels.
pred <- prediction(predictions = trainLog, labels = train$spam)
# True-positive rate against false-positive rate (the ROC curve).
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
# The AUC is stored in the y.values slot of the "auc" performance object.
as.numeric(slot(performance(pred, measure = "auc"), "y.values"))

## [1] 0.9999959

What is the training set accuracy of spamCART, using a threshold of 0.5 for predictions?

# Actual labels (rows) against CART predictions thresholded at 0.5 (columns).
confmat <- table(train$spam, trainCART > 0.5)
confmat

##    
##     FALSE TRUE
##   0  2885  167
##   1    64  894

# Accuracy: diagonal (correctly classified) count over the training-set size.
n_correct <- sum(diag(confmat))
n_correct / nrow(train)

## [1] 0.942394

What is the training set AUC of spamCART?

# ROCR prediction object pairing the CART probabilities with the labels.
pred <- prediction(predictions = trainCART, labels = train$spam)
# True-positive rate against false-positive rate (the ROC curve).
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
# The AUC is stored in the y.values slot of the "auc" performance object.
as.numeric(slot(performance(pred, measure = "auc"), "y.values"))

## [1] 0.9696044

What is the training set accuracy of spamRF, using a threshold of 0.5 for predictions?

# Actual labels (rows) against random-forest predictions thresholded at 0.5.
confmat <- table(train$spam, trainRF > 0.5)
confmat

##    
##     FALSE TRUE
##   0  3013   39
##   1    44  914

# Accuracy: diagonal (correctly classified) count over the training-set size.
n_correct <- sum(diag(confmat))
n_correct / nrow(train)

## [1] 0.9793017

What is the training set AUC of spamRF?

# ROCR prediction object pairing the random-forest probabilities with the labels.
pred <- prediction(predictions = trainRF, labels = train$spam)
# True-positive rate against false-positive rate (the ROC curve).
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
# The AUC is stored in the y.values slot of the "auc" performance object.
as.numeric(slot(performance(pred, measure = "auc"), "y.values"))

## [1] 0.9979116

Evaluating on the test set:

# Predicted probabilities on the held-out test rows from each fitted model.
# For CART and the random forest, keep the probability column for class "1"
# (the second column).
testLog <- predict(spamLog, newdata = test, type = "response")
testCART <- predict(spamCART, newdata = test)[, "1"]
testRF <- predict(spamRF, newdata = test, type = "prob")[, "1"]

# Accuracy (at a 0.5 threshold) and AUC of predicted probabilities.
#
# Args:
#   p:    numeric vector of predicted probabilities, one per row of `test`.
#   test: data frame with a `spam` column holding the true labels.
#         Assumes both classes are present in `spam` (e.g. a two-level
#         factor) -- TODO confirm for other callers.
#
# Returns:
#   Numeric vector of length 2: c(accuracy, auc).
computeAccuracyAuc <- function(p, test) {
  # Pin both prediction levels so the confusion matrix always has a FALSE and
  # a TRUE column. Without this, when every prediction falls on one side of
  # 0.5, table() returns a single column and diag() reads the wrong cell.
  predicted <- factor(p > 0.5, levels = c(FALSE, TRUE))
  confmat <- table(test$spam, predicted)
  accuracy <- sum(diag(confmat)) / nrow(test)

  # ROCR: the AUC is stored in the y.values slot of the "auc" performance.
  pred <- prediction(p, test$spam)
  auc <- as.numeric(performance(pred, "auc")@y.values)

  c(accuracy, auc)
}

# Test-set accuracy and AUC for the logistic regression model.
computeAccuracyAuc(p = testLog, test = test)

## [1] 0.9505239 0.9627517

# Test-set accuracy and AUC for the CART model.
computeAccuracyAuc(p = testCART, test = test)

## [1] 0.9394645 0.9631760

# Test-set accuracy and AUC for the random forest model.
computeAccuracyAuc(p = testRF, test = test)

## [1] 0.9749709 0.9975656

The Analytics Edge - Text - Spam Emails

Andy

Friday, July 03, 2015

Loading the dataset

Preparing Corpus

Building Machine Learning Models

Evaluating on the test set: