library(RTextTools)
library(tm)
library(SnowballC)
library(plyr)
library(stringr)
library(readr)
library(wordcloud)
library(knitr)
Perform text processing to figure out if an email is spam or ham.
Read in known spam and known ham
# Load spam and ham into corpora
spam_corpus <- Corpus(DirSource("spam/spam"))
#Downloaded from: http://spamassassin.apache.org/publiccorpus/20021010_spam.tar.bz2
ham_corpus <- Corpus(DirSource("easy_ham/easy_ham"))
#Downloaded from: http://spamassassin.apache.org/publiccorpus/20021010_easy_ham.tar.bz2
meta(spam_corpus, tag = "type") <- "spam"
meta(ham_corpus, tag = "type") <- "ham"
# Combine the corpora into a training corpus, shuffled so spam and ham are interleaved
training_corpus <- sample(c(ham_corpus, spam_corpus, recursive = TRUE))
is_spam <- ifelse(unlist(meta(training_corpus, "type")[, 1]) == "spam", 1, 0)
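A quick sanity check (optional, not part of the original workflow) confirms both classes survived the shuffle:
table(is_spam)  # 0 = ham, 1 = spam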
Clean the training corpus
# Wrap base/stringr functions in content_transformer() so tm_map returns proper tm documents;
# lowercase before removing stopwords so capitalized stopwords are caught too
training_corpus <- tm_map(training_corpus, content_transformer(str_replace_all), pattern = "[[:punct:]]", replacement = " ")
training_corpus <- tm_map(training_corpus, content_transformer(tolower))
training_corpus <- tm_map(training_corpus, removeWords, words = stopwords("en"))
training_corpus <- tm_map(training_corpus, stemDocument)
training_tdm <- DocumentTermMatrix(training_corpus)
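The same cleaning steps are repeated verbatim for the unknown corpus below; a small helper (a sketch, with the hypothetical name clean_corpus) would keep the two pipelines in sync:
clean_corpus <- function(corpus) {
  # Identical steps to the cleaning above, factored out for reuse
  corpus <- tm_map(corpus, content_transformer(str_replace_all), pattern = "[[:punct:]]", replacement = " ")
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, words = stopwords("en"))
  tm_map(corpus, stemDocument)
}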
Train the SVM algorithm on the training portion of the corpus
t_corpus_length <- length(is_spam)
training_container <- create_container(
  training_tdm,
  labels = is_spam,
  trainSize = 1:floor(t_corpus_length * .2),                # train on the first 20% of documents
  testSize = ceiling(t_corpus_length * .8):t_corpus_length, # test on the last 20%; the middle 60% is unused here
  virgin = FALSE
)
svm_model <- train_model(training_container, "SVM")
Score the results
svm_out <- classify_model(training_container, svm_model)
svm_summary <- create_analytics(training_container, svm_out)
svm_summary@algorithm_summary
## SVM_PRECISION SVM_RECALL SVM_FSCORE
## 0 1.00 0.99 0.99
## 1 0.95 0.98 0.96
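For a more granular view, a confusion table can be built directly from the classifier output (a sketch, assuming the same test indices passed to the container; SVM_LABEL comes back as a factor):
test_idx <- ceiling(t_corpus_length * .8):t_corpus_length
table(true = is_spam[test_idx], predicted = svm_out$SVM_LABEL)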
Note: this next corpus is 100% spam.
Load the unknown corpus
unknown_corpus <- Corpus(DirSource("spam_2/spam_2")) #Downloaded from: https://spamassassin.apache.org/publiccorpus/20050311_spam_2.tar.bz2
meta(unknown_corpus, tag = "type") <- "unknown"
Clean the unknown corpus (same steps as the training corpus)
unknown_corpus <- tm_map(unknown_corpus, content_transformer(str_replace_all), pattern = "[[:punct:]]", replacement = " ")
unknown_corpus <- tm_map(unknown_corpus, content_transformer(tolower))
unknown_corpus <- tm_map(unknown_corpus, removeWords, words = stopwords("en"))
unknown_corpus <- tm_map(unknown_corpus, stemDocument)
Classify the unknown corpus
# Restrict the unknown TDM to the vocabulary seen in training so the feature space matches the model
unknown_tdm <- DocumentTermMatrix(unknown_corpus, list(dictionary = findFreqTerms(training_tdm)))
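As a quick sanity check (a sketch using tm's Terms() accessor), every term in the restricted matrix should already appear in the training vocabulary:
all(Terms(unknown_tdm) %in% Terms(training_tdm)) # should be TRUE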
unknown_container <- create_container(
  unknown_tdm,
  labels = vector(), # no labels are known; virgin = TRUE marks the data as unclassified
  testSize = 1:nrow(unknown_tdm),
  virgin = TRUE
)
unknown_results <- classify_model(unknown_container, svm_model)
Get a summary of the classification results
kable(count(unknown_results, 'SVM_LABEL'))
| SVM_LABEL | freq |
|---|---|
| 0 | 259 |
| 1 | 1138 |
Note: when reading the results, 1 = spam and 0 = ham.
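Since every message in this corpus is known to be spam, the share labeled 1 is effectively the model's recall on unseen data, roughly 81% here (a quick check on the prediction frame from above):
prop.table(table(unknown_results$SVM_LABEL))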
Word Cloud of Unknown Corpus
wordcloud(unknown_corpus, max.words = 100)
Word Cloud of Known Spam Corpus
wordcloud(spam_corpus, max.words = 100)
Word Cloud of Known Ham Corpus
wordcloud(ham_corpus, max.words = 100)
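A comparison cloud would contrast the spam and ham vocabularies in a single plot (a sketch using wordcloud's comparison.cloud(); the collapsing step is an assumption, not part of the original workflow):
# Collapse each corpus to one document, then build a term matrix with one column per class
spam_text <- paste(unlist(lapply(spam_corpus, as.character)), collapse = " ")
ham_text <- paste(unlist(lapply(ham_corpus, as.character)), collapse = " ")
class_m <- as.matrix(TermDocumentMatrix(Corpus(VectorSource(c(spam_text, ham_text)))))
colnames(class_m) <- c("spam", "ham")
comparison.cloud(class_m, max.words = 100)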
An SVM is pretty good at classifying emails as spam or ham, but I probably should have scored other models for comparison. Unfortunately, my computer is apparently too slow for that much text processing.
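If the hardware allowed, RTextTools makes it cheap to score several algorithms against the same container (a sketch; TREE and MAXENT are example choices, and a larger trainSize would make for a fairer comparison):
models <- train_models(training_container, algorithms = c("SVM", "TREE", "MAXENT"))
results <- classify_models(training_container, models)
summary(create_analytics(training_container, results))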