It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/
# Source archives from the SpamAssassin public corpus.
url_spam <- "https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"
url_ham <- "https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2"

# Download and set up the datasets
# The local archive names are derived from the URLs so that each name is
# written only once (the spam name used to be a repeated literal containing a
# stray space before ".tar.bz2").
spam_archive <- basename(url_spam)
ham_archive <- basename(url_ham)

# mode = "wb" makes the binary transfer explicit instead of relying on the
# platform default.
download.file(url_spam, destfile = spam_archive, mode = "wb")
# list = TRUE only returns the member names of the archive; nothing is
# extracted by this call.
spam_file <- untar(spam_archive, list = TRUE)
length(spam_file)
## [1] 1398

download.file(url_ham, destfile = ham_archive, mode = "wb")
ham_file <- untar(ham_archive, list = TRUE)
length(ham_file)
## [1] 252
Get the files from the local drive
# Both classes live under the same local folder, one sub-directory each.
# NOTE(review): this is an absolute, machine-specific path - confirm it exists
# (and holds the extracted e-mails) before running.
emails_dir <- "/Users/karimh/Documents/Google Drive/607 - Data Acquisition and Management/607 - Project4/emails"
file_spam <- DirSource(file.path(emails_dir, "spam"))
file_ham <- DirSource(file.path(emails_dir, "ham"))
# Create the Corpus for both spam and ham
# NOTE(review): the code that builds spam_corpus1 / ham_corpus1 from these
# sources is not shown in this listing; only its printed lengths follow.
## [1] 1397
## [1] 251
Spam Corpus Cleaning
# Normalise the spam documents one transformation at a time: lower-case,
# strip digits, strip punctuation, collapse whitespace, then stem.
spam_corpus1 <- tm_map(spam_corpus1, content_transformer(tolower))
spam_corpus1 <- tm_map(spam_corpus1, removeNumbers)
spam_corpus1 <- tm_map(spam_corpus1, removePunctuation)
spam_corpus1 <- tm_map(spam_corpus1, stripWhitespace)
spam_corpus1 <- tm_map(spam_corpus1, stemDocument)
# NOTE(review): tolower() is applied to the corpus object itself here, not via
# tm_map; the result is then re-wrapped as a fresh corpus on the next line.
# Confirm this coercion is intended.
spam_corpus1 <- tolower(spam_corpus1)
spam_corpus1 <- Corpus(VectorSource(spam_corpus1))
# NOTE(review): stop words are removed only after stemming and punctuation
# removal, so some stop words may no longer match - verify.
spam_corpus1 <- tm_map(spam_corpus1, removeWords, stopwords())
## Warning in tm_map.SimpleCorpus(spam_corpus1, removeWords, stopwords()):
## transformation drops documents

# Term frequencies across all spam documents, most frequent first.
x1 <- TermDocumentMatrix(spam_corpus1)
x2 <- as.matrix(x1)
x3 <- sort(rowSums(x2), decreasing = TRUE)
x <- data.frame(word = names(x3), frequency = x3)
head(x, 10)
##          word frequency
## receiv receiv 7196
## size size 4595
## jul jul 4382
## font font 3664
## widthd widthd 3547
## email email 3260
## esmtp esmtp 3139
## tabl tabl 3117
## width width 2862
## will will 2617
Ham Corpus Cleaning
# Normalise the ham documents: lower-case, strip digits, strip punctuation,
# collapse whitespace, then stem.
# NOTE(review): the trailing tolower() is applied to the corpus object itself
# (not via tm_map); the result is re-wrapped as a corpus right after. Confirm
# this coercion is intended.
ham_corpus1 <- ham_corpus1 %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument) %>%
  tolower()
ham_corpus1 <- Corpus(VectorSource(ham_corpus1))
ham_corpus1 <- tm_map(ham_corpus1, removeWords, stopwords())
## Warning in tm_map.SimpleCorpus(ham_corpus1, removeWords, stopwords()):
## transformation drops documents

# Term frequencies across all ham documents, most frequent first.
y1 <- TermDocumentMatrix(ham_corpus1)
# Fixed: this used to convert x1 (the spam term-document matrix), which made
# the "ham" frequency table a copy of the spam one.
y2 <- as.matrix(y1)
y3 <- sort(rowSums(y2), decreasing = TRUE)
y <- data.frame(word = names(y3), frequency = y3)
# Keep the 100 most frequent ham terms for plotting.
y4 <- head(y, 100)
# Word Cloud for Spam
# Draw the spam word cloud from the frequency table `x`.
# `freq` is now passed explicitly. Without it, wordcloud() treats `words` as
# raw text, rebuilds its own term counts through tm (which is what emitted the
# tm_map "transformation drops documents" warnings) and never sees
# x$frequency.
# as.character() guards against `word` being a factor column.
# `colors` is spelled out in full instead of relying on partial matching of
# `color`.
wordcloud(
  words = as.character(x$word),
  freq = x$frequency,
  max.words = 100,
  min.freq = 100,
  scale = c(1, .1),
  random.order = FALSE,
  rot.per = .5,
  colors = brewer.pal(8, "Dark2")
)
Word Cloud for Ham
# Draw the ham word cloud from `y4`, the 100 most frequent ham terms.
# `freq` is now passed explicitly. Without it, wordcloud() treats `words` as
# raw text, rebuilds its own term counts through tm (which is what emitted the
# tm_map "transformation drops documents" warnings) and never sees
# y4$frequency.
# as.character() guards against `word` being a factor column.
# `colors` is spelled out in full instead of relying on partial matching of
# `color`.
wordcloud(
  words = as.character(y4$word),
  freq = y4$frequency,
  max.words = 100,
  min.freq = 100,
  scale = c(1, .1),
  random.order = FALSE,
  rot.per = .5,
  colors = brewer.pal(8, "Dark2")
)