For this project we start with a spam/ham dataset and then predict the class of new documents, either withheld from the training set or drawn from another source such as your own spam folder. One example corpus: http://spamassassin.apache.org/old/publiccorpus/. Here we classify emails as spam (unwanted) or ham (wanted).
We download the spam and ham emails from http://spamassassin.apache.org/old/publiccorpus/: the files 20050311_spam_2.tar.bz2 and 20030228_easy_ham.tar.bz2.
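The download and extraction can also be scripted from base R; a minimal sketch, assuming the archives are unpacked into the pr4 folder used below (base_url and archives are just illustrative names):
base_url <- "http://spamassassin.apache.org/old/publiccorpus/"
archives <- c("20050311_spam_2.tar.bz2", "20030228_easy_ham.tar.bz2")
for (f in archives) {
  download.file(paste0(base_url, f), destfile = f, mode = "wb")
  untar(f, exdir = "pr4")  # creates pr4/spam_2 and pr4/easy_ham
}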
library("tm")
## Loading required package: NLP
library("RTextTools")
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library("tidyverse")
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.6
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("stringr")
library("SnowballC")
##
## Attaching package: 'SnowballC'
## The following objects are masked from 'package:RTextTools':
##
## getStemLanguages, wordStem
library("wordcloud")
## Loading required package: RColorBrewer
spam_dir <- 'E:\\github\\Rtesting\\pr4\\spam_2\\'
ham_dir <- 'E:\\github\\Rtesting\\pr4\\easy_ham\\'
spam <- spam_dir %>% DirSource() %>% VCorpus()
ham <- ham_dir %>% DirSource() %>% VCorpus()
meta(spam[[1]])
## author : character(0)
## datetimestamp: 2018-11-05 03:52:55
## description : character(0)
## heading : character(0)
## id : 00001.317e78fa8ee2f54cd4890fdc09ba8176
## language : en
## origin : character(0)
meta(ham[[1]])
## author : character(0)
## datetimestamp: 2018-11-05 03:52:57
## description : character(0)
## heading : character(0)
## id : 00001.7c53336b37003a9286aba55d2945844c
## language : en
## origin : character(0)
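Before cleaning, it helps to glance at the raw text of one message; tm's content() returns the document body as a character vector of lines:
spam[[1]] %>% content() %>% head()  # first few lines of the first spam message (headers)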
spam <- spam %>% tm_map(content_transformer(PlainTextDocument))
spam <- spam %>% tm_map(content_transformer(removePunctuation))
spam <- spam %>% tm_map(content_transformer(tolower))
spam <- spam %>% tm_map(content_transformer(removeNumbers))
spam <- spam %>% tm_map(content_transformer(stemDocument), language = 'english')
# Remove 'receiv' for better accuracy
spam <- spam %>% tm_map(removeWords, c('receiv', stopwords('english')))
ham <- ham %>% tm_map(content_transformer(PlainTextDocument))
ham <- ham %>% tm_map(content_transformer(removePunctuation))
ham <- ham %>% tm_map(content_transformer(tolower))
ham <- ham %>% tm_map(content_transformer(removeNumbers))
ham <- ham %>% tm_map(content_transformer(stemDocument), language = 'english')
# Remove 'receiv', 'spamassassin' for better accuracy
ham <- ham %>% tm_map(removeWords, c('receiv', 'spamassassin', stopwords('english')))
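Since both corpora go through the same pipeline, the steps could be wrapped in a helper to avoid duplication; a sketch (the name clean_corpus is ours, and extra_words holds anything removed beyond the standard stopwords):
clean_corpus <- function(corpus, extra_words = character(0)) {
  corpus %>%
    tm_map(content_transformer(PlainTextDocument)) %>%
    tm_map(content_transformer(removePunctuation)) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(content_transformer(removeNumbers)) %>%
    tm_map(content_transformer(stemDocument), language = 'english') %>%
    tm_map(removeWords, c(extra_words, stopwords('english')))
}
# e.g. spam <- clean_corpus(spam, 'receiv')
#      ham  <- clean_corpus(ham, c('receiv', 'spamassassin'))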
ham_spam <- c(ham, spam) # c() concatenates the two corpora back to back
for (i in 1:length(ham)) {
  meta(ham_spam[[i]], "classification") <- "Ham"
}
for (i in (length(ham) + 1):(length(spam) + length(ham))) {
  meta(ham_spam[[i]], "classification") <- "Spam"
}
for (i in 1:5) {
  ham_spam <- sample(ham_spam)
} # This scrambles the corpus so it is not all ham followed by all spam
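A single seeded shuffle does the same job reproducibly; a sketch of the alternative (not what was run here, and the seed value is arbitrary):
set.seed(1234)  # assumed seed; fixes the document order across knits
ham_spam <- sample(ham_spam)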
meta(ham_spam[[127]])
## author : character(0)
## datetimestamp : 2018-11-05 03:52:57
## description : character(0)
## heading : character(0)
## id : 01746.06fb7b96d18dae121abb94e8a7624f4b
## language : en
## origin : character(0)
## classification: Ham
spam_dtm <- spam %>% DocumentTermMatrix()
spam_dtm <- spam_dtm %>% removeSparseTerms(1-(10/length(spam))) # keep only terms appearing in at least ~10 documents
spam_dtm
## <<DocumentTermMatrix (documents: 1397, terms: 2821)>>
## Non-/sparse entries: 182445/3758492
## Sparsity : 95%
## Maximal term length: 73
## Weighting : term frequency (tf)
ham_dtm <- ham %>% DocumentTermMatrix()
ham_dtm <- ham_dtm %>% removeSparseTerms(1-(10/length(ham)))
ham_dtm
## <<DocumentTermMatrix (documents: 2501, terms: 3507)>>
## Non-/sparse entries: 290643/8480364
## Sparsity : 97%
## Maximal term length: 68
## Weighting : term frequency (tf)
ham_spam_dtm <- ham_spam %>% DocumentTermMatrix()
ham_spam_dtm <- ham_spam_dtm %>% removeSparseTerms(1-(10/length(ham_spam)))
ham_spam_dtm
## <<DocumentTermMatrix (documents: 3898, terms: 5606)>>
## Non-/sparse entries: 490350/21361838
## Sparsity : 98%
## Maximal term length: 73
## Weighting : term frequency (tf)
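tm's findFreqTerms() is a quick way to see which terms dominate after the sparsity filter; the cutoff of 500 occurrences here is an arbitrary, illustrative choice:
ham_spam_dtm %>% findFreqTerms(lowfreq = 500) %>% head(20)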
#For Spam
spam_freq <- spam_dtm %>% as.matrix() %>% colSums()
length(spam_freq)
## [1] 2821
spam_freq_ord <- spam_freq %>% order(decreasing = TRUE)
par(las=1)
barplot(spam_freq[spam_freq_ord[1:10]], horiz = TRUE)
#For Ham
ham_freq <- ham_dtm %>% as.matrix() %>% colSums()
length(ham_freq) #Should be the same as term count, not document count.
## [1] 3507
ham_freq_ord <- ham_freq %>% order(decreasing = TRUE)
par(las=1)
barplot(ham_freq[ham_freq_ord[1:10]], horiz = TRUE)
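The wordcloud package loaded earlier can show the same frequencies at a glance; a sketch for the spam terms (max.words and the palette are illustrative choices):
set.seed(42)  # word placement is random; a seed makes the cloud reproducible
wordcloud(names(spam_freq), spam_freq, max.words = 50,
          colors = brewer.pal(8, "Dark2"))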
# The code below is adapted from an R textbook
lbls <- as.vector(unlist(meta(ham_spam, type="local", tag = "classification")))
head(lbls)
## [1] "Ham" "Ham" "Spam" "Spam" "Ham" "Ham"
N <- length(lbls)
container <- create_container(ham_spam_dtm, labels = lbls, trainSize = 1:501, testSize = 502:N, virgin = TRUE) # virgin = TRUE treats the test documents as unlabeled
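Because the corpus was shuffled, both classes should be represented on each side of the split; a quick sanity check:
table(lbls[1:501])   # class balance in the training slice
table(lbls[502:N])   # class balance in the test slice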
Use a Support Vector Machine (SVM), a supervised learning model, to classify the emails in the test set as ham or spam.
svm_model <- train_model(container, "SVM")
svm_result <- classify_model(container,svm_model)
head(svm_result)
## SVM_LABEL SVM_PROB
## 1 Ham 0.9543802
## 2 Ham 0.9869500
## 3 Ham 0.9997587
## 4 Ham 0.9854293
## 5 Ham 0.9879737
## 6 Spam 0.9984496
prop.table(table(svm_result[,1] == lbls[502:N]))
##
## FALSE TRUE
## 0.01118634 0.98881366
This gave approximately 98.9% accuracy.
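A full confusion matrix separates the two error types, i.e. ham flagged as spam (false positives) versus spam that slipped through (false negatives):
table(predicted = svm_result$SVM_LABEL, actual = lbls[502:N])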
Next, train a decision tree on the training set. (Note that RTextTools' "TREE" algorithm fits a single classification tree, not a random forest; a random forest would be the "RF" algorithm.)
tree_model <- train_model(container, "TREE")
tree_result <- classify_model(container, tree_model)
head(tree_result)
## TREE_LABEL TREE_PROB
## 1 Spam 0.625
## 2 Ham 1.000
## 3 Ham 1.000
## 4 Ham 1.000
## 5 Ham 1.000
## 6 Spam 1.000
prop.table(table(tree_result[,1] == lbls[502:N]))
##
## FALSE TRUE
## 0.02090079 0.97909921
This gave approximately 97.9% accuracy.
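RTextTools can also summarize per-class precision, recall, and F-scores for both models via create_analytics(). A sketch; note that it only scores predictions against the true labels when the container is built with virgin = FALSE, so we rebuild the container here:
container2 <- create_container(ham_spam_dtm, labels = lbls, trainSize = 1:501, testSize = 502:N, virgin = FALSE)
analytics <- create_analytics(container2,
                              cbind(classify_model(container2, svm_model),
                                    classify_model(container2, tree_model)))
summary(analytics)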
Comparing the two models, the SVM achieved about 98.9% accuracy and the classification tree about 97.9%, so the SVM performed slightly better on this split.
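Finally, n-fold cross-validation would give an accuracy estimate that depends less on this particular 501/3397 split; a sketch using RTextTools' cross_validate() with an arbitrary choice of 5 folds:
svm_cv  <- cross_validate(container, 5, algorithm = "SVM")   # reports per-fold accuracy
tree_cv <- cross_validate(container, 5, algorithm = "TREE")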