It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/
library(tm)
library(knitr)
library(plyr)
library(wordcloud)
#getwd()
setwd("../data/")
spam_url <- "https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2"
spam2_url <- "https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"
hard_ham_url <- "https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2"
easy_ham_url <- "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2"
#Download tarballs
download.file(spam_url, destfile="spam.tar.gz")
download.file(spam2_url, destfile="spam2.tar.gz")
download.file(hard_ham_url, destfile="hardham.tar.gz")
download.file(easy_ham_url, destfile="easyham.tar.gz")
#Extract tarballs
untar("spam.tar.gz")
untar("spam2.tar.gz")
untar("hardham.tar.gz")
untar("easyham.tar.gz")
#Create spam and ham corpus
spam <- Corpus(DirSource("spam"))
spam2 <- Corpus(DirSource("spam_2"))
easy_ham <- Corpus(DirSource("easy_ham"))
hard_ham <- Corpus(DirSource("hard_ham"))
# length(spam)
# length(spam2)
# length(easy_ham)
tdm_dtm_opts <- list(removePunctuation=TRUE, removeNumbers=TRUE, stripWhitespace=TRUE, tolower=TRUE, stopwords=TRUE, minWordLength = 2,removePunctuation("'"),removePunctuation("|"),removePunctuation("`"))
#Remove cmds file
if (file.exists("easy_ham/cmds")) file.remove("easy_ham/cmds")
## [1] TRUE
if (file.exists("hard_ham/cmds")) file.remove("hard_ham/cmds")
## [1] TRUE
if (file.exists("spam/cmds")) file.remove("spam/cmds")
## [1] TRUE
#Add meta labels
meta(spam, tag = "type") <- "spam"
meta(easy_ham, tag = "type") <- "easy_ham"
meta(hard_ham, tag = "type") <- "hard_ham"
spam <- Corpus(DirSource("spam"))
#Combine corpus objects.
spam_corpus <- c(spam, recursive=T)
ham_corpus <- c(easy_ham, recursive=T)
#Create reduced and randomized corpus
spam_corpus_sample <- sample(spam_corpus, 500)
ham_corpus_sapmle <- sample(ham_corpus, 500)
#Build document-term matrix.
# spam_tdm <- DocumentTermMatrix(spam_corpus_sample)
# ham_tdm <- DocumentTermMatrix(ham_corpus_sapmle)
spam_tdm <- TermDocumentMatrix(spam_corpus_sample,control=tdm_dtm_opts)
ham_tdm <- TermDocumentMatrix(ham_corpus_sapmle,control=tdm_dtm_opts)
Create Spam and Ham Data Frames:
spam_df <- as.data.frame(as.table(spam_tdm))
spam_df$spam_ham <- "SPAM"
colnames(spam_df) <- c('TERM', 'SPAM_DOCS', 'SPAM_FREQ', 'TYPE_SPAM')
spam_df <- subset(spam_df, select = -c(2) )
spam_df$SPAM_FREQ[is.na(spam_df$SPAM_FREQ)] <- '0'
spam_df <- ddply(spam_df, .(TERM, TYPE_SPAM), summarize, SPAM_FREQ = sum(as.numeric(SPAM_FREQ)))
NUM_TABLE_ROWS <-10
kable(head(spam_df, n = NUM_TABLE_ROWS))
TERM | TYPE_SPAM | SPAM_FREQ |
---|---|---|
bb | SPAM | 3 |
balbvipmail | SPAM | 3 |
bckkonshbptbwlshyb | SPAM | 3 |
bckonshkfclrdhbbcbb | SPAM | 3 |
bdbvipmailbkbgo | SPAM | 3 |
benpitmwjjgddruklkjonrb | SPAM | 3 |
benpitmwjjonrb | SPAM | 3 |
bfmanaknib | SPAM | 6 |
bgawjb | SPAM | 6 |
bgnpqhkkmb | SPAM | 3 |
spam_count <- nrow(spam_df)
ham_df <- as.data.frame(as.table(ham_tdm))
ham_df$spam_ham <- "HAM"
colnames(ham_df) <- c('TERM', 'HAM_DOCS', 'HAM_FREQ', 'TYPE_HAM')
ham_df <- subset(ham_df, select = -c(2) )
ham_df$HAM_FREQ[is.na(ham_df$HAM_FREQ)] <- '0'
ham_df <- ddply(ham_df, .(TERM, TYPE_HAM), summarize, HAM_FREQ = sum(as.numeric(HAM_FREQ)))
kable(head(ham_df, n = NUM_TABLE_ROWS))
TERM | TYPE_HAM | HAM_FREQ |
---|---|---|
“big | HAM | 1 |
“doggy” | HAM | 1 |
“during | HAM | 1 |
“lynx | HAM | 1 |
aaa | HAM | 1 |
aaaaaccbef | HAM | 1 |
aaaaaccfbbdaeb | HAM | 1 |
aaaaaddeecb | HAM | 1 |
aaaacecacae | HAM | 1 |
aaaadbcdc | HAM | 1 |
ham_count <- nrow(ham_df)
# now hopefully merge them with no memory issues..
all_df <- merge(x = ham_df, y = spam_df, by="TERM", all = TRUE)
# since this is like an outer join, fill the nulls with Zeros...
all_df$SPAM_FREQ[is.na(all_df$SPAM_FREQ)] <- '0'
all_df$TYPE_SPAM[is.na(all_df$TYPE_SPAM)] <- 'SPAM'
all_df$HAM_FREQ[is.na(all_df$HAM_FREQ)] <- '0'
all_df$TYPE_HAM[is.na(all_df$TYPE_HAM)] <- 'HAM'
all_df[is.na(all_df)] <- '0'
all_df$SPAM_WEIGHT <- as.numeric(all_df$SPAM_FREQ) - as.numeric(all_df$HAM_FREQ)
kable(head(all_df[order(-as.numeric(all_df$HAM_FREQ)), ], n=NUM_TABLE_ROWS))
TERM | TYPE_HAM | HAM_FREQ | TYPE_SPAM | SPAM_FREQ | SPAM_WEIGHT | |
---|---|---|---|---|---|---|
16364 | received | HAM | 2772 | SPAM | 2490 | -282 |
17304 | sep | HAM | 1797 | SPAM | 2128 | 331 |
8275 | esmtp | HAM | 1667 | SPAM | 1118 | -549 |
13501 | localhost | HAM | 1489 | SPAM | 1178 | -311 |
14907 | oct | HAM | 1156 | SPAM | 56 | -1100 |
1619 | aug | HAM | 933 | SPAM | 1173 | 240 |
15743 | postfix | HAM | 912 | SPAM | 595 | -317 |
12692 | ist | HAM | 848 | SPAM | 740 | -108 |
14296 | mon | HAM | 838 | SPAM | 674 | -164 |
12812 | jmlocalhost | HAM | 818 | SPAM | 22 | -796 |
kable(head(all_df[order(-as.numeric(all_df$SPAM_FREQ)), ], n=NUM_TABLE_ROWS))
TERM | TYPE_HAM | HAM_FREQ | TYPE_SPAM | SPAM_FREQ | SPAM_WEIGHT | |
---|---|---|---|---|---|---|
16364 | received | HAM | 2772 | SPAM | 2490 | -282 |
17304 | sep | HAM | 1797 | SPAM | 2128 | 331 |
46903 | widthd | HAM | 0 | SPAM | 1523 | 1523 |
18401 | table | HAM | 5 | SPAM | 1196 | 1191 |
13501 | localhost | HAM | 1489 | SPAM | 1178 | -311 |
1619 | aug | HAM | 933 | SPAM | 1173 | 240 |
19782 | width | HAM | 1 | SPAM | 1157 | 1156 |
8275 | esmtp | HAM | 1667 | SPAM | 1118 | -549 |
20273 | zzzzlocalhost | HAM | 122 | SPAM | 908 | 786 |
9715 | font | HAM | 1 | SPAM | 899 | 898 |
###HAM C | loude |
wordcloud(ham_corpus, max.words = 200, random.order = FALSE, colors=c('blue'))
## Warning in wordcloud(ham_corpus, max.words = 200, random.order = FALSE, :
## <mailto:fork-request@xent.com?subject=subscribe> could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(ham_corpus, max.words = 200, random.order = FALSE, :
## <mailto:fork-request@xent.com?subject=unsubscribe> could not be fit on
## page. It will not be plotted.
## Warning in wordcloud(ham_corpus, max.words = 200, random.order = FALSE, :
## <http://lists.freshrpms.net/mailman/listinfo/rpm-zzzlist>, could not be fit
## on page. It will not be plotted.
###SPAM Cloude
wordcloud(spam_corpus, max.words = 200, random.order = FALSE, colors=c('red'))
summary(all_df)
## TERM TYPE_HAM HAM_FREQ
## <U+0093>big : 1 Length:48847 Length:48847
## <U+0093>doggy<U+0094> : 1 Class :character Class :character
## <U+0093>during : 1 Mode :character Mode :character
## <U+0093>lynx : 1
## aaa : 1
## aaaaaccbef: 1
## (Other) :48841
## TYPE_SPAM SPAM_FREQ SPAM_WEIGHT
## Length:48847 Length:48847 Min. :-1100.000
## Class :character Class :character 1st Qu.: -1.000
## Mode :character Mode :character Median : 1.000
## Mean : 1.462
## 3rd Qu.: 1.000
## Max. : 1523.000
##