It is often useful to classify new “test” documents using a corpus of already classified “training” documents. A common example is using labeled spam and ham (non-spam) e-mails to predict whether a new message is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training set or drawn from another source, such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/
library(stringr)
library(tm)
library(RTextTools)
library(tidyverse)
library(SnowballC)
library(knitr)
library(tidytext)
library(wordcloud)
library(caret)
library(gbm)
library(e1071)
library(naivebayes)
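Note: RTextTools has since been archived on CRAN, so it may need to be installed from the CRAN archive; none of the code in this section depends on it.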
"https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"
url_spam <-
"https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2" url_ham <-
Download and set up the datasets
download.file(url_spam, destfile = "20050311_spam_2.tar.bz2")
spam_file <- untar("20050311_spam_2.tar.bz2", list = TRUE)
length(spam_file)
## [1] 1398
download.file(url_ham, destfile = "20030228_hard_ham.tar.bz2")
ham_file <- untar("20030228_hard_ham.tar.bz2", list = TRUE)
length(ham_file)
## [1] 252
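Note that untar() with list = TRUE only returns the archive's file listing; the archives still have to be extracted before the messages can be read from disk. A minimal sketch (the exdir path is an assumption; the extracted spam_2 and hard_ham folders were presumably moved or renamed to the emails/spam and emails/ham folders used below):
# Extract both archives (paths are illustrative)
untar("20050311_spam_2.tar.bz2", exdir = "emails")
untar("20030228_hard_ham.tar.bz2", exdir = "emails")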
Get the files from the local drive
DirSource("/Users/karimh/Documents/Google Drive/607 - Data Acquisition and Management/607 - Project4/emails/spam")
file_spam <-
DirSource("/Users/karimh/Documents/Google Drive/607 - Data Acquisition and Management/607 - Project4/emails/ham") file_ham <-
Create the Corpus for both spam and ham
spam_corpus1 <- Corpus(file_spam, readerControl = list(reader = readPlain))
length(spam_corpus1)
## [1] 1397
ham_corpus1 <- Corpus(file_ham, readerControl = list(reader = readPlain))
length(ham_corpus1)
## [1] 251
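Each corpus holds one fewer document than the archive listing (1397 vs. 1398 and 251 vs. 252), presumably because each tarball also contains a non-e-mail cmds file that was not kept. As a quick sanity check (not in the original code), the first lines of a document can be printed to confirm the corpus read correctly:
# Peek at the first spam message (illustrative check)
writeLines(head(content(spam_corpus1[[1]])))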
Spam Corpus Cleaning
# Lower-case, strip numbers, punctuation, and extra whitespace, and stem each document
spam_corpus1 <- spam_corpus1 %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument) %>%
  tolower()
# tolower() returns plain character data, so rebuild the corpus before removing stopwords
spam_corpus1 <- Corpus(VectorSource(spam_corpus1))
spam_corpus1 <- tm_map(spam_corpus1, removeWords, stopwords())
## Warning in tm_map.SimpleCorpus(spam_corpus1, removeWords, stopwords()):
## transformation drops documents
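This “transformation drops documents” warning is a known quirk of tm_map() on a SimpleCorpus: the transformation is applied, but per-document metadata is not carried along. For this workflow it is generally safe to ignore (the same warning appears again for the ham corpus below).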
# Term frequencies for the spam corpus
x1 <- TermDocumentMatrix(spam_corpus1)
x2 <- as.matrix(x1)
x3 <- sort(rowSums(x2), decreasing = TRUE)
x <- data.frame(word = names(x3), frequency = x3)
head(x, 10)
## word frequency
## receiv receiv 7196
## size size 4595
## jul jul 4382
## font font 3664
## widthd widthd 3547
## email email 3260
## esmtp esmtp 3139
## tabl tabl 3117
## width width 2862
## will will 2617
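Many of the top spam “words” (receiv, esmtp, jul, font, widthd) are e-mail header and HTML artifacts rather than message content; stripping headers and markup before tokenizing would likely yield more meaningful features.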
Ham Corpus Cleaning
# Apply the same cleaning steps to the ham corpus
ham_corpus1 <- ham_corpus1 %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument) %>%
  tolower()
ham_corpus1 <- Corpus(VectorSource(ham_corpus1))
ham_corpus1 <- tm_map(ham_corpus1, removeWords, stopwords())
## Warning in tm_map.SimpleCorpus(ham_corpus1, removeWords, stopwords()):
## transformation drops documents
# Term frequencies for the ham corpus (note: as.matrix(y1), not as.matrix(x1))
y1 <- TermDocumentMatrix(ham_corpus1)
y2 <- as.matrix(y1)
y3 <- sort(rowSums(y2), decreasing = TRUE)
y <- data.frame(word = names(y3), frequency = y3)
y4 <- head(y, 100)
Word Cloud for Spam
# Pass the frequencies explicitly; without them wordcloud() re-tokenizes the words
# through tm and warns. (The argument name is colors, not color.)
wordcloud(x$word, x$frequency, max.words = 100, min.freq = 100, scale = c(1, .1),
          random.order = FALSE, rot.per = .5, colors = brewer.pal(8, "Dark2"))
Word Cloud for Ham
wordcloud(y4$word, y4$frequency, max.words = 100, min.freq = 100, scale = c(1, .1),
          random.order = FALSE, rot.per = .5, colors = brewer.pal(8, "Dark2"))
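With both corpora cleaned, the remaining step suggested in the introduction is to train a classifier and score held-out documents. This section stops at the word clouds, so the following is only a minimal sketch using naiveBayes() from the already loaded e1071 package; the variable names (all_corpus, labels, train_idx), the 80/20 split, and the sparsity cutoff are assumptions, not part of the original code.
# Combine the cleaned corpora into one labeled dataset (illustrative)
spam_text <- sapply(spam_corpus1, as.character)
ham_text <- sapply(ham_corpus1, as.character)
all_corpus <- Corpus(VectorSource(c(spam_text, ham_text)))
labels <- factor(c(rep("spam", length(spam_text)), rep("ham", length(ham_text))))
# Document-term matrix, keeping only reasonably common terms
dtm <- removeSparseTerms(DocumentTermMatrix(all_corpus), 0.95)
# Naive Bayes handles word presence/absence better than raw Gaussian counts
binarize <- function(v) factor(ifelse(v > 0, "Yes", "No"), levels = c("No", "Yes"))
m <- as.data.frame(as.matrix(dtm))
m[] <- lapply(m, binarize)
# 80/20 train/test split, then fit and evaluate
set.seed(607)
train_idx <- sample(seq_len(nrow(m)), size = floor(0.8 * nrow(m)))
model <- naiveBayes(m[train_idx, ], labels[train_idx])
preds <- predict(model, m[-train_idx, ])
table(predicted = preds, actual = labels[-train_idx])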