Intro

This .Rmd and all related input data can be found on my GitHub. This document can also be found on RPubs.

Clear the Workspace

rm(list=ls())

Load the Required Packages

library(tm)
library(RTextTools)
library(stringr)
library(SnowballC)
library(ggplot2)
library(knitr)
library(tidyr)

Load the Data

We are going to look at spam data from http://spamassassin.apache.org/old/publiccorpus/. Given that the sample we’re using isn’t all that big, we’ll try using a VCorpus. We’ll load the “spam” and “ham” samples into variables and then add metadata denoting which is which.

Set the Working Directory

basePath <- "C:/Users/Paul/OneDrive - CUNY School of Professional Studies/CUNY/DATA 607/DATA_607_Project 4/"

setwd(basePath)

The data can be downloaded directly from R as follows and then extracted using the untar command (or external software). For simplicity, I worked with a local copy of the data for this project, stored under the base directory above.

download.file("https://github.com/plb2018/DATA607/raw/master/DATA_607_Project4/20021010_easy_ham.tar", destfile = "20021010_easy_ham.tar")

download.file("https://github.com/plb2018/DATA607/raw/master/DATA_607_Project4/20021010_spam.tar", destfile = "20021010_spam.tar")

#untar("20021010_easy_ham.tar")
#untar("20021010_spam.tar")

We load the data and add the spam tags:

spam <- VCorpus(DirSource(paste(basePath,"spam/",sep=""),encoding="UTF-8"),
               readerControl = list(reader=readPlain))

ham <- VCorpus(DirSource(paste(basePath,"easy_ham/",sep=""),encoding="UTF-8"),
              readerControl = list(reader=readPlain))

meta(spam, tag = "is.Spam") <- "spam"
meta(ham, tag = "is.Spam") <- "not_spam"
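As a quick sanity check (a sketch), the tag should now show up in the document-level metadata of each corpus:

# peek at the first few metadata entries to confirm the tag was set
head(meta(spam, "is.Spam"), 3)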

Prepare the Data

Next we’ll combine the data into a single VCorpus and clean it up a bit in preparation for conversion to a DocumentTermMatrix. At this stage, I’m not going to perform any other cleaning/prep of the data because I’d like to assess the impact of that step once we’re further along.

# combine spam and ham into a single corpus, then shuffle by sampling 3,000 documents
ham.nSpam <- c(spam, ham, recursive = TRUE)

ham.nSpam <- sample(ham.nSpam, 3000)

# coerce content to valid UTF-8, substituting any bad bytes
ham.nSpam <- tm_map(ham.nSpam, content_transformer(function(x) iconv(enc2utf8(x), sub = "byte")))

Now we’ll create the DTM. Originally I had intended to run with the data as raw as possible for this initial attempt; however, some cleaning (as above) was required in order for the DTM to build properly. As a side note, it appears that one can use a DTM or a TDM with similar results, so long as you make sure the dimensions are correct when creating the “container” in the next step (see the sanity check below).

spam.dtm <- DocumentTermMatrix(ham.nSpam)
spam.dtm
## <<DocumentTermMatrix (documents: 3000, terms: 128955)>>
## Non-/sparse entries: 690002/386174998
## Sparsity           : 100%
## Maximal term length: 515
## Weighting          : term frequency (tf)
#this works too, but you need to watch the dims when creating the container below.
#spam.tdm <- TermDocumentMatrix(ham.nSpam)
#spam.tdm
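As a quick sanity check on that note (a sketch using the corpus from above): a TDM is simply the transpose of a DTM, so the dimensions swap. The DTM is used directly here because it already has documents on the rows.

# a TDM holds the same counts as the DTM with the dimensions swapped
spam.tdm <- TermDocumentMatrix(ham.nSpam)
dim(spam.dtm)  # documents x terms
dim(spam.tdm)  # terms x documents
identical(dim(spam.dtm), rev(dim(spam.tdm)))  # should be TRUE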

We can see above that the maximal term length is 515 characters, which seems unrealistically long for a genuine word; these are likely long URLs or encoded attachment content from the raw emails. A quick way to peek at them is sketched below; we’ll revisit this when we clean the data. For now, we’ll create a vector with the true labels for training purposes.
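For the curious, here is a sketch of one way to inspect the longest “terms” using tm’s Terms() accessor:

# show the three longest terms in the raw DTM
longTerms <- Terms(spam.dtm)
head(longTerms[order(nchar(longTerms), decreasing = TRUE)], 3)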

spam.label <- unlist(meta(ham.nSpam, "is.Spam")[,1])
head(spam.label,5)
## [1] "not_spam" "not_spam" "not_spam" "not_spam" "not_spam"

Train the Model(s)

Now we’ll create the container that will hold our training data and parameters. With pct set to 0.25, the first 25% of the documents are used for training and the remaining 75% for testing.

N <- length(spam.label)
pct <- 0.25  # fraction of the sample used for training
r <- round(N * pct, 0)


container <- create_container(
  spam.dtm,
  labels = spam.label,
  trainSize = 1:r,
  testSize = (r+1):N,
  virgin=FALSE)

Next, we’ll train on the training set and then classify the test set. I’m going to start with just an SVM, as the point here is to demonstrate the capability; we’ll look at a few more algorithms afterwards.

svm.model <- train_model(container, "SVM")
svm.out <- classify_model(container, svm.model)

kable(head(svm.out,10))
SVM_LABEL  SVM_PROB
spam       0.9856443
not_spam   0.9967487
not_spam   0.9959193
not_spam   0.9954415
not_spam   0.9991381
not_spam   0.9973793
not_spam   0.9984269
spam       1.0000000
not_spam   0.9990861
not_spam   0.9999313

And finally, we check to see how accurate the model’s predictions were:

true.labels <- as.numeric(as.factor(spam.label[(r+1):N]))
predicted.labels <- svm.out[,"SVM_LABEL"]
recall_accuracy(true.labels,predicted.labels)
## [1] 0.9888889

Not bad at all!
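A single accuracy number can hide asymmetric errors, so it’s worth checking how the misses split between false positives (ham flagged as spam) and false negatives. A quick sketch, reusing the objects from above:

# cross-tabulate true vs. predicted labels on the test set
table(true = spam.label[(r+1):N],
      predicted = svm.out[,"SVM_LABEL"])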

Cleaning Up the Data

We’ll now take a look at what happens if we run the same analysis on data that we’ve cleaned. Normally I would be tempted to determine the sensitivity of our results to each element of the cleaning, but I think that’s beyond the scope of this project, so I’m going to clean it all in one shot. We’ll make the following changes: convert to lowercase, remove punctuation, numbers, and English stopwords, strip extra whitespace, and stem the documents.

clean.spam <- ham.nSpam
clean.spam <- tm_map(clean.spam,content_transformer(tolower),lazy=TRUE)
clean.spam <- tm_map(clean.spam,removePunctuation)
clean.spam <- tm_map(clean.spam,removeNumbers)
clean.spam <- tm_map(clean.spam,removeWords,words=stopwords("en"))
clean.spam <- tm_map(clean.spam,stripWhitespace,lazy=TRUE)
clean.spam <- tm_map(clean.spam,stemDocument,lazy=TRUE)

clean.spam <- tm_map(clean.spam, content_transformer(function(x) iconv(enc2utf8(x), sub = "byte")))

clean.dtm <- DocumentTermMatrix(clean.spam)
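# drop terms missing from more than 95% of documents (i.e., keep terms appearing in at least ~5%)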
clean.dtm <- removeSparseTerms(clean.dtm,0.95)
clean.dtm
## <<DocumentTermMatrix (documents: 3000, terms: 459)>>
## Non-/sparse entries: 230779/1146221
## Sparsity           : 83%
## Maximal term length: 49
## Weighting          : term frequency (tf)

Testing With the Clean Data

We’ve cleaned the data and reduced the sparsity of the DTM a little bit, and the maximal term length is down by about 90% from the uncleaned data. Now we’ll check which version produces better results. We’ll write a little function that trains the model and assesses the output on a random sample of the data, and then run a few iterations:

modelTest <- function(sampleSize, train_prop, corpus){
  # train_prop is the fraction used for training; the remainder is tested
  r <- round(sampleSize * train_prop)

  data <- sample(corpus, sampleSize)
  labels <- unlist(meta(data, "is.Spam")[,1])
  dtm <- DocumentTermMatrix(data)

  container <- create_container(
    dtm,
    labels = labels,
    trainSize = 1:r,
    testSize = (r+1):sampleSize,
    virgin = FALSE)

  svm.model <- train_model(container, "SVM")
  svm.out <- classify_model(container, svm.model)

  true.labels <- as.numeric(as.factor(labels[(r+1):sampleSize]))
  predicted.labels <- svm.out[,"SVM_LABEL"]
  recall_accuracy(true.labels, predicted.labels)
}

We’ll wrap the function in a loop and see whether we get better results for the “original” or the “clean” data.

itr <- 25
orig <- rep(0, itr)
clean <- rep(0, itr)

for (i in 1:itr){
  orig[i] <- modelTest(100, 0.25, ham.nSpam)
  clean[i] <- modelTest(100, 0.25, clean.spam)
}

The above is painfully slow, and I assume there is a better way to accomplish this task. In any case, we’ve gotten the desired data.
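A somewhat tidier alternative (a sketch; the model fitting dominates the runtime, so don’t expect a speedup) is to let replicate() handle the bookkeeping:

# replicate() evaluates the expression itr times and collects the results into a vector
orig  <- replicate(itr, modelTest(100, 0.25, ham.nSpam))
clean <- replicate(itr, modelTest(100, 0.25, clean.spam))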

data <- data.frame(orig=orig,clean=clean)
summary(data)
##       orig            clean       
##  Min.   :0.8000   Min.   :0.7333  
##  1st Qu.:0.8533   1st Qu.:0.8400  
##  Median :0.8933   Median :0.8533  
##  Mean   :0.8869   Mean   :0.8640  
##  3rd Qu.:0.9200   3rd Qu.:0.9067  
##  Max.   :1.0000   Max.   :0.9600
tidy.data <- gather(data)
ggplot(tidy.data,aes(x=value,fill=key)) + geom_histogram(alpha=0.75,bins=10)

For this particular task, the original (uncleaned) data doesn’t appear to produce results that are significantly different from the “clean” data, at least with an SVM model.
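To put a rough number on that, a paired t-test on the per-iteration accuracies is one quick check (a sketch; the random samples overlap across iterations, so treat the p-value as indicative only):

# paired t-test: is the mean per-iteration accuracy difference zero?
t.test(data$orig, data$clean, paired = TRUE)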

Multi-Model

Finally, we’ll look at a few different methods for training our models. I’ve chosen three algorithms and down-sampled the data a bit, as this step otherwise takes forever to run.

#downsample the data
clean.spam <- sample(clean.spam,500)
clean.dtm <- DocumentTermMatrix(clean.spam)
clean.dtm <- removeSparseTerms(clean.dtm,0.95)
spam.label <- unlist(meta(clean.spam, "is.Spam")[,1])

N <- length(spam.label)
pct <- 0.25
r <- round(N * pct, 0)

container <- create_container(
  clean.dtm,  # use the down-sampled clean.dtm here, not the full spam.dtm
  labels = spam.label,
  trainSize = 1:r,
  testSize = (r+1):N,
  virgin = FALSE)

We’ll train and run using 3 different algorithms:

multi.model <- train_models(container, algorithm=c("SVM","RF","MAXENT"))
multi.out <- classify_models(container, multi.model)
kable(head(multi.out,10))
SVM_LABEL  SVM_PROB   FORESTS_LABEL  FORESTS_PROB  MAXENTROPY_LABEL  MAXENTROPY_PROB
not_spam   0.8332421  not_spam       0.985         not_spam          0.9999999
not_spam   0.8421963  not_spam       0.945         not_spam          0.9988572
not_spam   0.8355613  not_spam       0.905         not_spam          1.0000000
not_spam   0.8626719  not_spam       0.890         not_spam          1.0000000
not_spam   0.8614069  not_spam       0.850         not_spam          0.9999977
not_spam   0.8238387  not_spam       0.990         not_spam          1.0000000
not_spam   0.8381204  not_spam       0.975         not_spam          0.9582952
not_spam   0.8277362  not_spam       0.940         not_spam          0.9999979
not_spam   0.8482152  not_spam       0.935         not_spam          1.0000000
not_spam   0.8327199  not_spam       0.990         not_spam          0.9700526
true.labels <- as.numeric(as.factor(spam.label[(r+1):N]))

predicted.labels <- multi.out[,"SVM_LABEL"]
svm <- recall_accuracy(true.labels,predicted.labels)

predicted.labels <- multi.out[,"FORESTS_LABEL"]
rf <- recall_accuracy(true.labels,predicted.labels)

predicted.labels <- multi.out[,"MAXENTROPY_LABEL"]
maxent <- recall_accuracy(true.labels,predicted.labels)

results <- data.frame(svm=svm,rf=rf,maxent=maxent)

tidy.results <- gather(results)

ggplot(tidy.results,aes(x=key, y=value,fill=key)) + 
  geom_bar(stat="identity") +
  coord_cartesian(ylim = c(min(results)*0.95, max(results)*1.05))+
  xlab("Model")+
  ylab("Success Rate")+
  ggtitle("Prediction Success Rate for Various Methods")+
  theme(plot.title = element_text(hjust = 0.5))
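As a possible extension, RTextTools can also summarize performance across all of the trained algorithms at once; create_analytics() reports precision, recall, and F-scores per algorithm along with ensemble agreement. A sketch using the container and results from above:

# per-algorithm precision/recall/F-score plus an ensemble summary
analytics <- create_analytics(container, multi.out)
summary(analytics)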