For this assignment we use the SpamAssassin public corpus, available at https://spamassassin.apache.org/publiccorpus. The archives were downloaded and uncompressed to a local drive.
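The download step can also be scripted. The sketch below is illustrative: the archive name is one example from the corpus listing, and the destination matches the base folder used later.
# Illustrative download-and-extract step; the archive name is one example
# from the corpus listing, so adjust it to the files actually needed
url <- "https://spamassassin.apache.org/publiccorpus/20021010_spam.tar.bz2"
dest <- "C:/data/is607/spam/20021010_spam.tar.bz2"
download.file(url, dest, mode = "wb") # binary mode matters on Windows
untar(dest, exdir = "C:/data/is607/spam") # untar() auto-detects bzip2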
library(tm)
## Loading required package: NLP
library(SnowballC)
library(stringr)
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
##
## The following object is masked from 'package:base':
##
## backsolve
##
##
## Attaching package: 'RTextTools'
##
## The following objects are masked from 'package:SnowballC':
##
## getStemLanguages, wordStem
preprocessing <- function(basedir, subdir)
{
  fulldir <- paste0(basedir, subdir)
  #Reading every file in the directory tree recursively
  alldir <- DirSource(fulldir, encoding = "UTF-8", recursive = TRUE)
  #Creating the corpus from the directory files
  the_corpus <- Corpus(alldir, readerControl = list(reader = readPlain, language = "en"))
  #Lower-casing first so that capitalized stopwords are matched below
  the_corpus <- tm_map(the_corpus, content_transformer(tolower))
  #Removing punctuation
  the_corpus <- tm_map(the_corpus, removePunctuation)
  #Removing numbers
  the_corpus <- tm_map(the_corpus, removeNumbers)
  #Removing words that usually have no analytic value
  the_corpus <- tm_map(the_corpus, removeWords, stopwords("english"))
  #Stemming after stopword removal so the stopword list still matches
  the_corpus <- tm_map(the_corpus, stemDocument)
  #Removing extra white space left behind by the removals
  the_corpus <- tm_map(the_corpus, stripWhitespace)
  the_corpus <- tm_map(the_corpus, PlainTextDocument)
  the_corpus
}
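To see what these transformations do, here is a small illustrative example on a one-document toy corpus (the exact output depends on the tm and SnowballC versions):
toy <- Corpus(VectorSource("The price is 100 Dollars!!"))
toy <- tm_map(toy, content_transformer(tolower))
toy <- tm_map(toy, removePunctuation)
toy <- tm_map(toy, removeNumbers)
toy <- tm_map(toy, removeWords, stopwords("english"))
toy <- tm_map(toy, stemDocument)
toy <- tm_map(toy, stripWhitespace)
as.character(toy[[1]]) #roughly "price dollar"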
#Data locations
basefolder <- "C:/data/is607/spam"
folder4spam <- "/spam"
folder4spam_2 <- "/spam_2"
folder4easy_ham <- "/easy_ham"
folder4easy_ham_2 <- "/easy_ham_2"
folder4hard_ham <- "/hard_ham"
#Data preprocessing
corpus4spam <- preprocessing(basefolder, folder4spam)
corpus4spam_2 <- preprocessing(basefolder, folder4spam_2)
corpus4easy_ham <- preprocessing(basefolder, folder4easy_ham)
corpus4easy_ham_2 <- preprocessing(basefolder, folder4easy_ham_2)
corpus4hard_ham <- preprocessing(basefolder, folder4hard_ham)
#Coercing each corpus to plain text documents (preprocessing() already does this; repeated here as a defensive step before labeling)
corpus4spam <- tm_map(corpus4spam, PlainTextDocument)
corpus4spam_2 <- tm_map(corpus4spam_2, PlainTextDocument)
corpus4easy_ham <- tm_map(corpus4easy_ham, PlainTextDocument)
corpus4easy_ham_2 <- tm_map(corpus4easy_ham_2, PlainTextDocument)
corpus4hard_ham <- tm_map(corpus4hard_ham, PlainTextDocument)
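As a quick sanity check (illustrative; output not shown), the corpus sizes and a sample document can be inspected:
length(corpus4spam) #number of spam documents read
length(corpus4easy_ham) #number of easy-ham documents read
head(as.character(corpus4spam[[1]])) #first lines of the first processed spam message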
#Adding meta labels
meta(corpus4spam, tag = "type") <- "spam"
meta(corpus4easy_ham, tag = "type") <- "ham"
meta(corpus4hard_ham, tag = "type") <- "hardham"
meta(corpus4spam_2, tag = "type") <- "spam"
meta(corpus4easy_ham_2, tag = "type") <- "ham"
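To verify that the labels were attached, meta() can be queried directly; in this version of tm it returns a per-document data frame, which is what the unlist() call below relies on:
head(meta(corpus4spam, "type")) #every row should read "spam"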
#Combining the datasets
training_set <- c(corpus4spam, corpus4easy_ham, corpus4hard_ham, corpus4spam_2, corpus4easy_ham_2, recursive = TRUE)
#Randomizing the document order (seeding the RNG first so the shuffle is reproducible)
set.seed(2000)
training_set <- sample(training_set)
#creating a document term matrix
dtm_email <- DocumentTermMatrix(training_set)
# Removing sparse terms
dtms_email <- removeSparseTerms(dtm_email, 0.08) # Keep only terms with at most 8% sparsity, i.e. terms that appear in at least 92% of documents
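Removing sparse terms shrinks the vocabulary considerably; comparing the dimensions before and after makes the effect visible (illustrative; the counts depend on the corpus snapshot):
dim(dtm_email) #documents x full vocabulary
dim(dtms_email) #documents x terms kept after the sparsity filter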
emailtype <- unlist(meta(training_set, "type")[,1])
head(emailtype,5)
## [1] "spam" "ham" "ham" "ham" "ham"
set.seed(2000) #for reproducible results
#Preparing the container
n <- length(emailtype)
train_n <- floor(0.8 * n) #integer cutoff for the 80/20 split
container <- create_container(
  dtms_email,
  labels = emailtype,
  trainSize = 1:train_n,
  testSize = (train_n + 1):n,
  virgin = FALSE
)
#Training three classifiers on the training slice
maxent_model <- train_model(container, "MAXENT")
svm_model <- train_model(container, "SVM")
glmnet_model <- train_model(container, "GLMNET")
#Classifying the held-out test slice
svm_out <- classify_model(container, svm_model)
maxent_out <- classify_model(container, maxent_model)
glmnet_out <- classify_model(container, glmnet_model)
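RTextTools can also run several algorithms in one pass via train_models() and classify_models(); the three pairs of calls above are equivalent to this sketch:
models <- train_models(container, algorithms = c("MAXENT", "SVM", "GLMNET"))
results <- classify_models(container, models)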
head(svm_out)
## SVM_LABEL SVM_PROB
## 1 ham 0.8761390
## 2 spam 0.9437758
## 3 spam 0.8021331
## 4 spam 0.8025331
## 5 spam 0.8074352
## 6 ham 0.9458826
head(maxent_out)
## MAXENTROPY_LABEL MAXENTROPY_PROB
## 1 ham 0.8199408
## 2 ham 0.7327399
## 3 spam 0.6597427
## 4 ham 0.6713027
## 5 ham 0.5144437
## 6 ham 0.8105396
head(glmnet_out)
## GLMNET_LABEL GLMNET_PROB
## 1 ham 0.7977189
## 2 ham 0.6782402
## 3 spam 0.6645503
## 4 ham 0.6419461
## 5 spam 0.4856525
## 6 ham 0.7902051
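Finally, the predicted labels can be compared against the true labels with RTextTools' create_analytics(), which reports precision, recall, and F-scores per algorithm (a sketch; output not shown):
analytics <- create_analytics(container, cbind(svm_out, maxent_out, glmnet_out))
summary(analytics) #precision/recall/F-score for each algorithm
head(analytics@document_summary) #per-document predictions alongside the true label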