
It is often useful to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/


# Install any missing packages, then load them all
if (!require(stringr)) install.packages('stringr')
if (!require(tm)) install.packages('tm')
if (!require(SnowballC)) install.packages('SnowballC')
if (!require(RTextTools)) install.packages('RTextTools') # archived on CRAN; newer R may need the CRAN archive
if (!require(R.utils)) install.packages('R.utils')
library(stringr); library(tm); library(SnowballC); library(RTextTools); library(R.utils)
# utils is part of base R and never needs installing

# Exploratory parsing of a single raw message, kept for reference:
#tmp <- readLines("607ass10/easy_ham/00001.7c53336b37003a9286aba55d2945844c")
#tmp <- str_c(tmp, collapse = "")
#tmp <- str_replace_all(tmp, pattern="<.*?>", replacement = " ") # strip HTML tags
#tmp <- str_replace_all(tmp, pattern="\\=", replacement = "")    # drop stray '=' characters

# download and unzip the spam archive from the SpamAssassin public corpus
download.file('http://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2', destfile="spam_zip.tar.bz2")
bunzip2("spam_zip.tar.bz2", remove = F, overwrite = T)
untar("spam_zip.tar") # creates the spam_2 folder

# download and unzip the ham archive from the SpamAssassin public corpus
download.file('http://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2', destfile="ham_zip.tar.bz2")
bunzip2("ham_zip.tar.bz2", remove = F, overwrite = T)
untar("ham_zip.tar") # creates the easy_ham_2 folder
# identify cmds file and delete
removeham <- list.files(path="easy_ham_2/", full.names=T, recursive=FALSE, pattern="cmds")
file.remove(removeham)
## [1] TRUE
# identify cmds file and delete
removespam <- list.files(path="spam_2/", full.names=T, recursive=FALSE, pattern="cmds")
file.remove(removespam)
## [1] TRUE
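
As an optional sanity check, we can count the files left in each folder after deleting the cmds index (exact counts depend on the archive version):

length(list.files("spam_2/"))
length(list.files("easy_ham_2/"))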

Combine spam and ham files

# build a VCorpus from every file in each input directory
spamfiles <- VCorpus(DirSource(directory = "spam_2/", encoding = "UTF-8"))
hamfiles <- VCorpus(DirSource(directory = "easy_ham_2/", encoding = "UTF-8"))

# Sample 250 messages from the spam corpus and 250 from the ham corpus
set.seed(123) # arbitrary seed so the sample is reproducible
spamfiles <- sample(spamfiles, 250)
hamfiles <- sample(hamfiles, 250)

#Add meta labels
meta(spamfiles, tag = "type") <- "spam"
meta(hamfiles, tag = "type") <- "ham"

# Combine the corpus objects
combinedspamham <- c(spamfiles, hamfiles, recursive = TRUE)

#Create randomized corpus
spamham <- sample(combinedspamham)
spamham
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 1
## Content:  documents: 500
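
As a quick check that the type labels survived the merge, we can tabulate them with the same accessor used later in the modelling section (the 250/250 split should come back):

table(unlist(meta(spamham, "type")[,1]))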

Checking the metadata

meta(spamham[[2]])
##   author       : character(0)
##   datetimestamp: 2017-04-17 07:51:33
##   description  : character(0)
##   heading      : character(0)
##   id           : 00176.644d65f0ab0d19f706a493bd5c3dc5df
##   language     : en
##   origin       : character(0)
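
Note that the per-document header above does not show the custom tag: type lives in the corpus-level metadata index rather than in the document header. It can still be read directly, e.g. for document 2 (a small sketch using the same accessor as in the modelling section):

meta(spamham, "type")[2,1]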

Transforming

# remove white space
spamham <- tm_map(spamham,stripWhitespace)

# remove numbers
spamham <- tm_map(spamham,removeNumbers)

# remove punctuation
spamham <- tm_map(spamham,removePunctuation)

Transforming (continued)

# transform all words in corpus to lower case
spamham_mod <- tm_map(spamham, content_transformer(tolower))

# remove all stop words 
spamham_mod <- tm_map(spamham_mod,removeWords, words = stopwords("en"))

# stem words: cut certain terms down to word root
spamham_mod <- tm_map(spamham_mod, stemDocument)
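
To see what stemming does, SnowballC's wordStem() (which stemDocument uses under the hood) can be applied to a few words directly, e.g. "running" and "runs" both reduce to "run":

wordStem(c("running", "runs", "runner"), language = "english")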

DocumentTermMatrix

dtm <- DocumentTermMatrix(spamham_mod)
dtm
## <<DocumentTermMatrix (documents: 500, terms: 22645)>>
## Non-/sparse entries: 89123/11233377
## Sparsity           : 99%
## Maximal term length: 127
## Weighting          : term frequency (tf)
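
Before pruning, tm's inspect() can print a small corner of the matrix to show just how sparse it is (a sketch; the term columns will differ from run to run):

inspect(dtm[1:5, 1:8])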

Remove any terms that appear in fewer than 10 documents

dtm <- removeSparseTerms(dtm,1-(10/length(spamham_mod)))
dtm
## <<DocumentTermMatrix (documents: 500, terms: 1297)>>
## Non-/sparse entries: 55279/593221
## Sparsity           : 91%
## Maximal term length: 65
## Weighting          : term frequency (tf)
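
findFreqTerms() gives a quick look at which stems survive the sparsity cut; here, terms appearing at least 100 times in total (a threshold chosen arbitrarily for illustration):

findFreqTerms(dtm, lowfreq = 100)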

Classifier Models

#Collect meta labels
spamtype <- unlist(meta(spamham_mod, "type")[,1])
head(spamtype,5)
## [1] "spam" "spam" "ham"  "spam" "ham"

Set up the container

# set up the model container; 90/10 split between training and test data
N <- length(spamtype) # number of sampled messages
container <- create_container(
  dtm,
  labels = spamtype,
  trainSize = 1:floor(0.9*N),
  testSize = (floor(0.9*N)+1):N, # was (0.8*N+1):N, which overlapped the training range
  virgin = FALSE
)

slotNames(container)
## [1] "training_matrix"       "classification_matrix" "training_codes"       
## [4] "testing_codes"         "column_names"          "virgin"
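
Because the container is an S4 object, the training labels can be inspected directly to confirm that both classes are well represented in the training split (a small sketch using the slots listed above):

table(as.character(container@training_codes))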

Modelling and analysis

# Train the models
svm_model <- train_model(container, "SVM")
rf_model <- train_model(container, "RF")
maxent_model <- train_model(container, "MAXENT")
tree_model <- train_model(container, "TREE")

# Classify the held-out test documents with each model
svm_out <- classify_model(container, svm_model)
rf_out <- classify_model(container, rf_model)
maxent_out <- classify_model(container, maxent_model)
tree_out <- classify_model(container, tree_model)

head(svm_out)
##   SVM_LABEL  SVM_PROB
## 1      spam 0.9801299
## 2      spam 0.9993885
## 3       ham 0.9788325
## 4      spam 0.9980559
## 5       ham 0.9999991
## 6      spam 0.9801162
head(rf_out)
##   FORESTS_LABEL FORESTS_PROB
## 1          spam        0.975
## 2          spam        0.990
## 3           ham        0.800
## 4          spam        0.995
## 5           ham        1.000
## 6          spam        0.940
head(maxent_out)
##   MAXENTROPY_LABEL MAXENTROPY_PROB
## 1             spam       0.9999996
## 2             spam       1.0000000
## 3              ham       0.9999991
## 4             spam       1.0000000
## 5              ham       1.0000000
## 6             spam       0.9999920
head(tree_out)
##   TREE_LABEL TREE_PROB
## 1       spam 1.0000000
## 2       spam 1.0000000
## 3        ham 1.0000000
## 4       spam 1.0000000
## 5        ham 1.0000000
## 6       spam 0.9285714
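
With virgin = FALSE the true labels of the held-out documents are known, so each model can be scored against them. A minimal sketch of that comparison, assuming the corrected 90/10 split above; RTextTools' create_analytics() also produces fuller precision/recall summaries:

# true labels of the held-out documents
truth <- spamtype[(floor(0.9*N)+1):N]

# simple accuracy for each classifier
mean(as.character(svm_out$SVM_LABEL) == truth)
mean(as.character(rf_out$FORESTS_LABEL) == truth)
mean(as.character(maxent_out$MAXENTROPY_LABEL) == truth)
mean(as.character(tree_out$TREE_LABEL) == truth)

# combined precision/recall/F-score report across all four models
analytics <- create_analytics(container, cbind(svm_out, rf_out, maxent_out, tree_out))
summary(analytics)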