It is often useful to classify new “test” documents using already-classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/

library(tm)         # corpus handling and text cleaning
library(knitr)      # kable tables
library(plyr)       # ddply aggregation
library(wordcloud)  # word clouds

Set up the data storage directory

#getwd()
setwd("../data/")

Load the data directly from the URLs

spam_url <- "https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2"
spam2_url <- "https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"
hard_ham_url <- "https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2"
easy_ham_url <- "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2"

#Download tarballs (the source files are bzip2-compressed, so name them accordingly)
download.file(spam_url, destfile="spam.tar.bz2")
download.file(spam2_url, destfile="spam2.tar.bz2")
download.file(hard_ham_url, destfile="hardham.tar.bz2")
download.file(easy_ham_url, destfile="easyham.tar.bz2")

#Extract tarballs
untar("spam.tar.bz2")
untar("spam2.tar.bz2")
untar("hardham.tar.bz2")
untar("easyham.tar.bz2")

#Create spam and ham corpus
spam <- Corpus(DirSource("spam"))
spam2 <- Corpus(DirSource("spam_2"))
easy_ham <- Corpus(DirSource("easy_ham"))
hard_ham <- Corpus(DirSource("hard_ham"))
# length(spam)
# length(spam2)
# length(easy_ham)

Cleaning and filtering the text files

tdm_dtm_opts <- list(removePunctuation = TRUE, removeNumbers = TRUE, stripWhitespace = TRUE,
                     tolower = TRUE, stopwords = TRUE, minWordLength = 2)
# removePunctuation = TRUE already strips ASCII punctuation (including quotes, pipes, and backticks),
# so no extra rules are needed; non-ASCII "curly" quotes are not covered, which is why a few such
# terms survive in the ham table below
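
The same cleaning could instead be applied to a corpus directly with tm_map before building the matrices; a rough equivalent of the control options above (clean_corpus is just an illustrative helper):

clean_corpus <- function(corpus) {
  corpus <- tm_map(corpus, content_transformer(tolower))       # lower-case all text
  corpus <- tm_map(corpus, removePunctuation)                  # drop punctuation
  corpus <- tm_map(corpus, removeNumbers)                      # drop digits
  corpus <- tm_map(corpus, removeWords, stopwords("english"))  # drop common English stopwords
  tm_map(corpus, stripWhitespace)                              # collapse runs of whitespace
}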

#Remove cmds file
if (file.exists("easy_ham/cmds")) file.remove("easy_ham/cmds")
## [1] TRUE
if (file.exists("hard_ham/cmds")) file.remove("hard_ham/cmds")
## [1] TRUE
if (file.exists("spam/cmds")) file.remove("spam/cmds")
## [1] TRUE
#Add meta labels
meta(spam, tag = "type") <- "spam"
meta(easy_ham, tag = "type") <- "easy_ham"
meta(hard_ham, tag = "type") <- "hard_ham"

Creating the term-document matrix

# Reload the spam corpus now that the cmds index file has been removed
spam <- Corpus(DirSource("spam"))


#Combine corpus objects.

spam_corpus <- c(spam, recursive=T)
ham_corpus <- c(easy_ham, recursive=T)
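
Note that spam2 and hard_ham are downloaded and loaded above but not used below; if desired, they could be folded into the combined corpora the same way. An optional sketch:

# Optional: include the second spam set and the harder ham set as well
# spam_corpus <- c(spam, spam2, recursive = TRUE)
# ham_corpus  <- c(easy_ham, hard_ham, recursive = TRUE)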


#Create reduced and randomized corpus

spam_corpus_sample <- sample(spam_corpus, 500)
ham_corpus_sample <- sample(ham_corpus, 500)
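
sample() draws a different 500 documents on every knit, so the tables below change from run to run. Setting a seed first (the value is arbitrary) makes the samples reproducible; a sketch:

set.seed(1234)  # arbitrary fixed seed so the 500-document samples are repeatable
spam_corpus_sample <- sample(spam_corpus, 500)
ham_corpus_sample <- sample(ham_corpus, 500)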


#Build term-document matrices. 

# spam_tdm <- DocumentTermMatrix(spam_corpus_sample)
# ham_tdm <- DocumentTermMatrix(ham_corpus_sample)


spam_tdm <- TermDocumentMatrix(spam_corpus_sample, control = tdm_dtm_opts)
ham_tdm <- TermDocumentMatrix(ham_corpus_sample, control = tdm_dtm_opts)
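
A quick sanity check on the matrices helps confirm the cleaning worked; for example, tm's findFreqTerms() lists terms above a frequency threshold (the 200 cutoff here is arbitrary):

dim(spam_tdm)                           # terms x documents
findFreqTerms(spam_tdm, lowfreq = 200)  # terms appearing at least 200 times across the spam sample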

Create Spam and Ham Data Frames:

spam_df <- as.data.frame(as.table(spam_tdm))
spam_df$spam_ham <- "SPAM"
colnames(spam_df) <- c('TERM', 'SPAM_DOCS', 'SPAM_FREQ', 'TYPE_SPAM')
spam_df <- subset(spam_df, select = -c(2) )
spam_df$SPAM_FREQ[is.na(spam_df$SPAM_FREQ)] <- '0'
spam_df <- ddply(spam_df, .(TERM, TYPE_SPAM), summarize, SPAM_FREQ = sum(as.numeric(SPAM_FREQ)))
NUM_TABLE_ROWS <-10
kable(head(spam_df, n = NUM_TABLE_ROWS))
TERM                      TYPE_SPAM    SPAM_FREQ
------------------------  ----------  ----------
bb                        SPAM                 3
balbvipmail               SPAM                 3
bckkonshbptbwlshyb        SPAM                 3
bckonshkfclrdhbbcbb       SPAM                 3
bdbvipmailbkbgo           SPAM                 3
benpitmwjjgddruklkjonrb   SPAM                 3
benpitmwjjonrb            SPAM                 3
bfmanaknib                SPAM                 6
bgawjb                    SPAM                 6
bgnpqhkkmb                SPAM                 3
spam_count <- nrow(spam_df)
ham_df <- as.data.frame(as.table(ham_tdm))
ham_df$spam_ham <- "HAM"
colnames(ham_df) <- c('TERM', 'HAM_DOCS', 'HAM_FREQ', 'TYPE_HAM')
ham_df <- subset(ham_df, select = -c(2) )
ham_df$HAM_FREQ[is.na(ham_df$HAM_FREQ)] <- '0'
ham_df <- ddply(ham_df, .(TERM, TYPE_HAM), summarize, HAM_FREQ = sum(as.numeric(HAM_FREQ)))
kable(head(ham_df, n = NUM_TABLE_ROWS))
TERM             TYPE_HAM    HAM_FREQ
---------------  ---------  ---------
“big             HAM                1
“doggy”          HAM                1
“during          HAM                1
“lynx            HAM                1
aaa              HAM                1
aaaaaccbef       HAM                1
aaaaaccfbbdaeb   HAM                1
aaaaaddeecb      HAM                1
aaaacecacae      HAM                1
aaaadbcdc        HAM                1
ham_count <- nrow(ham_df)
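
As an aside, the same per-term totals could also be read straight off each term-document matrix without the ddply step, e.g.:

spam_totals <- rowSums(as.matrix(spam_tdm))      # total frequency of each term across the spam sample
head(sort(spam_totals, decreasing = TRUE), 10)   # most frequent spam terms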

Combine both data frames on the common TERM column

# Merge the two data frames on TERM (a full outer join)
all_df <- merge(x = ham_df, y = spam_df, by="TERM", all = TRUE)
# The outer join leaves NAs for terms seen in only one corpus; fill them with zeros and default labels
all_df$SPAM_FREQ[is.na(all_df$SPAM_FREQ)] <- '0'
all_df$TYPE_SPAM[is.na(all_df$TYPE_SPAM)] <- 'SPAM'
all_df$HAM_FREQ[is.na(all_df$HAM_FREQ)] <- '0'
all_df$TYPE_HAM[is.na(all_df$TYPE_HAM)] <- 'HAM'
all_df[is.na(all_df)] <- '0'
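
With both frequency columns in one frame, a smoothed per-term spam ratio is one simple way to express how spam-leaning each term is. A sketch with add-one smoothing (SPAM_RATIO is not part of the original analysis):

# Values near 1 suggest a term appears mostly in spam, near 0 mostly in ham
all_df$SPAM_RATIO <- (as.numeric(all_df$SPAM_FREQ) + 1) /
  (as.numeric(all_df$SPAM_FREQ) + as.numeric(all_df$HAM_FREQ) + 2)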

Show the combined data frame (top terms by ham frequency, then by spam frequency)

all_df$SPAM_WEIGHT <- as.numeric(all_df$SPAM_FREQ) - as.numeric(all_df$HAM_FREQ)
kable(head(all_df[order(-as.numeric(all_df$HAM_FREQ)), ], n=NUM_TABLE_ROWS))
         TERM            TYPE_HAM    HAM_FREQ  TYPE_SPAM    SPAM_FREQ   SPAM_WEIGHT
-------  --------------  ---------  ---------  ----------  ----------  ------------
16364    received        HAM             2772  SPAM              2490          -282
17304    sep             HAM             1797  SPAM              2128           331
8275     esmtp           HAM             1667  SPAM              1118          -549
13501    localhost       HAM             1489  SPAM              1178          -311
14907    oct             HAM             1156  SPAM                56         -1100
1619     aug             HAM              933  SPAM              1173           240
15743    postfix         HAM              912  SPAM               595          -317
12692    ist             HAM              848  SPAM               740          -108
14296    mon             HAM              838  SPAM               674          -164
12812    jmlocalhost     HAM              818  SPAM                22          -796
kable(head(all_df[order(-as.numeric(all_df$SPAM_FREQ)), ], n=NUM_TABLE_ROWS))
         TERM             TYPE_HAM    HAM_FREQ  TYPE_SPAM    SPAM_FREQ   SPAM_WEIGHT
-------  ---------------  ---------  ---------  ----------  ----------  ------------
16364    received         HAM             2772  SPAM              2490          -282
17304    sep              HAM             1797  SPAM              2128           331
46903    widthd           HAM                0  SPAM              1523          1523
18401    table            HAM                5  SPAM              1196          1191
13501    localhost        HAM             1489  SPAM              1178          -311
1619     aug              HAM              933  SPAM              1173           240
19782    width            HAM                1  SPAM              1157          1156
8275     esmtp            HAM             1667  SPAM              1118          -549
20273    zzzzlocalhost    HAM              122  SPAM               908           786
9715     font             HAM                1  SPAM               899           898
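
These per-term weights are enough to sketch a naive scorer for a new message: sum the SPAM_WEIGHT of the terms it contains and treat a positive total as spam. This is only an illustration of the idea, not a fitted classifier, and score_message is a hypothetical helper:

score_message <- function(text, weights = all_df) {
  # split the message into lower-case word tokens, roughly matching the cleaning above
  tokens <- unlist(strsplit(tolower(text), "[^a-z]+"))
  tokens <- tokens[nchar(tokens) > 1]
  # look up each token's SPAM_WEIGHT; unseen tokens contribute nothing
  w <- weights$SPAM_WEIGHT[match(tokens, weights$TERM)]
  sum(w, na.rm = TRUE)
}
# score_message("click here for a free font table width offer")  # positive score suggests spam
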
### HAM Word Cloud
wordcloud(ham_corpus, max.words = 200, random.order = FALSE, colors=c('blue'))
## Warning in wordcloud(ham_corpus, max.words = 200, random.order = FALSE, :
## <mailto:fork-request@xent.com?subject=subscribe> could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(ham_corpus, max.words = 200, random.order = FALSE, :
## <mailto:fork-request@xent.com?subject=unsubscribe> could not be fit on
## page. It will not be plotted.
## Warning in wordcloud(ham_corpus, max.words = 200, random.order = FALSE, :
## <http://lists.freshrpms.net/mailman/listinfo/rpm-zzzlist>, could not be fit
## on page. It will not be plotted.

### SPAM Word Cloud

wordcloud(spam_corpus, max.words = 200, random.order = FALSE, colors=c('red'))
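
The two clouds can also be drawn as a single comparison cloud from the merged frequencies, which makes the ham-vs-spam contrast easier to see. A sketch using wordcloud's comparison.cloud(); the frequencies come from the 500-document samples above:

freq_matrix <- cbind(HAM = as.numeric(all_df$HAM_FREQ),
                     SPAM = as.numeric(all_df$SPAM_FREQ))
rownames(freq_matrix) <- as.character(all_df$TERM)
comparison.cloud(freq_matrix, max.words = 100, colors = c("blue", "red"))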

summary(all_df)
##          TERM         TYPE_HAM           HAM_FREQ        
##  <U+0093>big      :    1   Length:48847       Length:48847      
##  <U+0093>doggy<U+0094>   :    1   Class :character   Class :character  
##  <U+0093>during   :    1   Mode  :character   Mode  :character  
##  <U+0093>lynx     :    1                                        
##  aaa       :    1                                        
##  aaaaaccbef:    1                                        
##  (Other)   :48841                                        
##   TYPE_SPAM          SPAM_FREQ          SPAM_WEIGHT       
##  Length:48847       Length:48847       Min.   :-1100.000  
##  Class :character   Class :character   1st Qu.:   -1.000  
##  Mode  :character   Mode  :character   Median :    1.000  
##                                        Mean   :    1.462  
##                                        3rd Qu.:    1.000  
##                                        Max.   : 1523.000  
##