For this assignment we use the SpamAssassin public corpus, available at https://spamassassin.apache.org/publiccorpus. The archives were downloaded and uncompressed to a local drive.
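The download step can also be scripted. The sketch below is illustrative: the archive name is one example from the corpus listing, and the destination matches the base folder used later.
# Illustrative download-and-extract step; the archive name is one example
# from the corpus listing, so adjust it to the files actually needed
url <- "https://spamassassin.apache.org/publiccorpus/20021010_spam.tar.bz2"
dest <- "C:/data/is607/spam/20021010_spam.tar.bz2"
download.file(url, dest, mode = "wb") # binary mode matters on Windows
untar(dest, exdir = "C:/data/is607/spam") # untar() auto-detects bzip2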
library(tm)
## Loading required package: NLP
library(SnowballC)
library(stringr)
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
##
## The following object is masked from 'package:base':
##
## backsolve
##
##
## Attaching package: 'RTextTools'
##
## The following objects are masked from 'package:SnowballC':
##
## getStemLanguages, wordStem
preprocessing <- function(basedir, subdir)
{
  fulldir <- paste0(basedir, subdir)
  #Reading every file in the directory tree recursively
  alldir <- DirSource(fulldir, encoding = "UTF-8", recursive = TRUE)
  #Creating the corpus from the directory files
  the_corpus <- Corpus(alldir, readerControl = list(reader = readPlain, language = "en"))
  #Lower-casing first so that capitalized stopwords are matched below
  the_corpus <- tm_map(the_corpus, content_transformer(tolower))
  #Removing punctuation
  the_corpus <- tm_map(the_corpus, removePunctuation)
  #Removing numbers
  the_corpus <- tm_map(the_corpus, removeNumbers)
  #Removing words that usually have no analytic value
  the_corpus <- tm_map(the_corpus, removeWords, stopwords("english"))
  #Stemming after stopword removal so the stopword list still matches
  the_corpus <- tm_map(the_corpus, stemDocument)
  #Removing extra white space left behind by the removals
  the_corpus <- tm_map(the_corpus, stripWhitespace)
  the_corpus <- tm_map(the_corpus, PlainTextDocument)
  the_corpus
}
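To see what these transformations do, here is a small illustrative example on a one-document toy corpus (the exact output depends on the tm and SnowballC versions):
toy <- Corpus(VectorSource("The price is 100 Dollars!!"))
toy <- tm_map(toy, content_transformer(tolower))
toy <- tm_map(toy, removePunctuation)
toy <- tm_map(toy, removeNumbers)
toy <- tm_map(toy, removeWords, stopwords("english"))
toy <- tm_map(toy, stemDocument)
toy <- tm_map(toy, stripWhitespace)
as.character(toy[[1]]) #roughly "price dollar"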
#Data locations
basefolder <- "C:/data/is607/spam"
folder4spam <- "/spam"
folder4spam_2 <- "/spam_2"
folder4easy_ham <- "/easy_ham"
folder4easy_ham_2 <- "/easy_ham_2"
folder4hard_ham <- "/hard_ham"
#Data preprocessing
corpus4spam <- preprocessing(basefolder, folder4spam)
corpus4spam_2 <- preprocessing(basefolder, folder4spam_2)
corpus4easy_ham <- preprocessing(basefolder, folder4easy_ham)
corpus4easy_ham_2 <- preprocessing(basefolder, folder4easy_ham_2)
corpus4hard_ham <- preprocessing(basefolder, folder4hard_ham)
#Coercing each corpus to plain text documents (preprocessing() already does this; repeated here as a defensive step before labeling)
corpus4spam <- tm_map(corpus4spam, PlainTextDocument)
corpus4spam_2 <- tm_map(corpus4spam_2, PlainTextDocument)
corpus4easy_ham <- tm_map(corpus4easy_ham, PlainTextDocument)
corpus4easy_ham_2 <- tm_map(corpus4easy_ham_2, PlainTextDocument)
corpus4hard_ham <- tm_map(corpus4hard_ham, PlainTextDocument)
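As a quick sanity check (illustrative; output not shown), the corpus sizes and a sample document can be inspected:
length(corpus4spam) #number of spam documents read
length(corpus4easy_ham) #number of easy-ham documents read
head(as.character(corpus4spam[[1]])) #first lines of the first processed spam message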
#Adding meta labels
meta(corpus4spam, tag = "type") <- "spam"
meta(corpus4easy_ham, tag = "type") <- "ham"
meta(corpus4hard_ham, tag = "type") <- "hardham"
meta(corpus4spam_2, tag = "type") <- "spam"
meta(corpus4easy_ham_2, tag = "type") <- "ham"
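To verify that the labels were attached, meta() can be queried directly; in this version of tm it returns a per-document data frame, which is what the unlist() call below relies on:
head(meta(corpus4spam, "type")) #every row should read "spam"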
#Combining the datasets
training_set <- c(corpus4spam, corpus4easy_ham, corpus4hard_ham, corpus4spam_2, corpus4easy_ham_2, recursive = TRUE)
#Randomizing the document order (seeding the RNG first so the shuffle is reproducible)
set.seed(2000)
training_set <- sample(training_set)
#creating a document term matrix
dtm_email <- DocumentTermMatrix(training_set)
# Removing sparse terms
dtms_email <- removeSparseTerms(dtm_email, 0.08) # Keep only terms with at most 8% sparsity, i.e. terms that appear in at least 92% of documents
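Removing sparse terms shrinks the vocabulary considerably; comparing the dimensions before and after makes the effect visible (illustrative; the counts depend on the corpus snapshot):
dim(dtm_email) #documents x full vocabulary
dim(dtms_email) #documents x terms kept after the sparsity filter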
emailtype <- unlist(meta(training_set, "type")[,1])
head(emailtype,5)
## [1] "spam" "ham" "ham" "ham" "ham"
set.seed(2000) #for reproducible results
#Preparing the container
n <- length(emailtype)
train_n <- floor(0.8 * n) #integer cutoff for the 80/20 split
container <- create_container(
  dtms_email,
  labels = emailtype,
  trainSize = 1:train_n,
  testSize = (train_n + 1):n,
  virgin = FALSE
)
#Training three classifiers on the training slice
maxent_model <- train_model(container, "MAXENT")
svm_model <- train_model(container, "SVM")
glmnet_model <- train_model(container, "GLMNET")
#Classifying the held-out test slice
svm_out <- classify_model(container, svm_model)
maxent_out <- classify_model(container, maxent_model)
glmnet_out <- classify_model(container, glmnet_model)
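RTextTools can also run several algorithms in one pass via train_models() and classify_models(); the three pairs of calls above are equivalent to this sketch:
models <- train_models(container, algorithms = c("MAXENT", "SVM", "GLMNET"))
results <- classify_models(container, models)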
head(svm_out)
## SVM_LABEL SVM_PROB
## 1 ham 0.8761390
## 2 spam 0.9437758
## 3 spam 0.8021331
## 4 spam 0.8025331
## 5 spam 0.8074352
## 6 ham 0.9458826
head(maxent_out)
## MAXENTROPY_LABEL MAXENTROPY_PROB
## 1 ham 0.8199408
## 2 ham 0.7327399
## 3 spam 0.6597427
## 4 ham 0.6713027
## 5 ham 0.5144437
## 6 ham 0.8105396
head(glmnet_out)
## GLMNET_LABEL GLMNET_PROB
## 1 ham 0.7977189
## 2 ham 0.6782402
## 3 spam 0.6645503
## 4 ham 0.6419461
## 5 spam 0.4856525
## 6 ham 0.7902051
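Finally, the predicted labels can be compared against the true labels with RTextTools' create_analytics(), which reports precision, recall, and F-scores per algorithm (a sketch; output not shown):
analytics <- create_analytics(container, cbind(svm_out, maxent_out, glmnet_out))
summary(analytics) #precision/recall/F-score for each algorithm
head(analytics@document_summary) #per-document predictions alongside the true label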