Load the needed libraries

library("tm")
## Loading required package: NLP
library("RTextTools")
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library("tidyverse")
## -- Attaching packages ------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.6
## v tidyr   0.8.1     v stringr 1.3.1
## v readr   1.1.1     v forcats 0.3.0
## -- Conflicts ---------------------------------------------------------------- tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x dplyr::filter()     masks stats::filter()
## x dplyr::lag()        masks stats::lag()
library("stringr")
library("SnowballC")
## 
## Attaching package: 'SnowballC'
## The following objects are masked from 'package:RTextTools':
## 
##     getStemLanguages, wordStem
library("wordcloud")
## Loading required package: RColorBrewer

Set the location where the files were extracted. The files come from the SpamAssassin public corpus: https://spamassassin.apache.org/old/publiccorpus/

spam_dir<-"C:\\Users\\hangr\\Documents\\Acquisition and data management\\spam"
ham_dir<-"C:\\Users\\hangr\\Documents\\Acquisition and data management\\hard_ham"

We need to read each collection of files into its own corpus, one for spam and one for ham. We will use VCorpus from the tm package to do so.

spam<-spam_dir %>% DirSource() %>% VCorpus()
ham<-ham_dir %>% DirSource() %>% VCorpus()
meta(ham[[1]])
##   author       : character(0)
##   datetimestamp: 2018-11-04 19:23:26
##   description  : character(0)
##   heading      : character(0)
##   id           : 00001.7c7d6921e671bbe18ebb5f893cd9bb35
##   language     : en
##   origin       : character(0)
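
To sanity-check the import, we can count the files in each corpus and peek at the raw text of a document (an added check, not part of the original pipeline):

length(spam)
length(ham)
writeLines(head(content(ham[[1]]), 5)) #First lines of the first ham message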

Cleaning and tidying both corpora

#spam dataset
spam<- spam %>% tm_map(content_transformer(PlainTextDocument)) #Transform spam to plain text
spam <- spam %>% tm_map(content_transformer(removePunctuation)) #Remove punctuation from the text
spam <- spam %>% tm_map(content_transformer(tolower)) #Put the text in lower case
spam <- spam %>% tm_map(content_transformer(removeNumbers)) #Remove numbers if any
spam <- spam %>% tm_map(content_transformer(stemDocument), language="english") #Stem words to their root form
spam <- spam %>% tm_map(removeWords, c('receiv', stopwords('english'))) #Remove stop words and the stem 'receiv' (ubiquitous in mail headers)
#ham dataset
ham<- ham %>% tm_map(content_transformer(PlainTextDocument)) #Transform ham to plain text
ham <- ham %>% tm_map(content_transformer(removePunctuation)) #Remove punctuation from the text
ham<- ham %>% tm_map(content_transformer(tolower)) #Put the text in lower case
ham <- ham %>% tm_map(content_transformer(removeNumbers)) #Remove numbers if any
ham <- ham %>% tm_map(content_transformer(stemDocument), language="english") #Stem words to their root form
ham <- ham %>% tm_map(removeWords, c('receiv', stopwords('english'))) #Remove stop words and the stem 'receiv'
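
The two pipelines above are identical step for step, so they could be wrapped in a small helper; clean_corpus below is a name introduced here for illustration:

clean_corpus <- function(corp) {
  corp %>%
    tm_map(content_transformer(PlainTextDocument)) %>%
    tm_map(content_transformer(removePunctuation)) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(content_transformer(removeNumbers)) %>%
    tm_map(content_transformer(stemDocument), language = "english") %>%
    tm_map(removeWords, c('receiv', stopwords('english')))
}
#spam <- clean_corpus(spam); ham <- clean_corpus(ham)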

Joining both corpora together

ham_spam <- c(spam, ham)
meta(ham_spam[[1]])
##   author       : character(0)
##   datetimestamp: 2018-11-04 19:23:25
##   description  : character(0)
##   heading      : character(0)
##   id           : 0000.7b1b73cf36cf9dbc3d64e3f2ee2b91f1
##   language     : en
##   origin       : character(0)
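
As a quick check (added here), the combined corpus should contain every document from both sources, with the spam messages first:

length(ham_spam) == length(spam) + length(ham)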

In the following step we tag each document's metadata with its class: 1 for spam and 0 for ham. Since ham_spam was built as c(spam, ham), the first length(spam) documents are spam and the remaining ones are ham.

#ham_spam was built as c(spam, ham), so the spam documents come first
for (i in 1:length(spam)){
  meta(ham_spam[[i]], "classification") <- 1
}

for (i in (length(spam)+1):(length(spam) + length(ham))){
  meta(ham_spam[[i]], "classification") <- 0
}
#Shuffle the corpus so the two classes are interleaved
ham_spam <- sample(ham_spam)
meta(ham_spam[[11]])
##   author        : character(0)
##   datetimestamp : 2018-11-04 19:23:25
##   description   : character(0)
##   heading       : character(0)
##   id            : 0109.601a9cd8272f22236b27e95dbe2fa22d
##   language      : en
##   origin        : character(0)
##   classification: 1
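
As another added sanity check, tabulating the stored tags should reproduce the sizes of the spam and ham corpora:

table(unlist(meta(ham_spam, type = "local", tag = "classification")))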

Tokenize the combined corpus using DocumentTermMatrix

ham_spam_dtm<-DocumentTermMatrix(ham_spam)
ham_spam_dtm
## <<DocumentTermMatrix (documents: 752, terms: 63577)>>
## Non-/sparse entries: 243955/47565949
## Sparsity           : 99%
## Maximal term length: 298
## Weighting          : term frequency (tf)
#Remove sparse terms: drop any term absent from more than the threshold fraction of documents
ham_spam_dtm<-ham_spam_dtm %>% removeSparseTerms(1-(10/length(ham_spam)))
ham_spam_dtm
## <<DocumentTermMatrix (documents: 752, terms: 3104)>>
## Non-/sparse entries: 149646/2184562
## Sparsity           : 94%
## Maximal term length: 95
## Weighting          : term frequency (tf)
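
The sparsity threshold is worth unpacking (a quick worked check, added here): removeSparseTerms drops any term that is absent from more than the given fraction of documents, so 1-(10/length(ham_spam)) keeps only terms that appear in roughly 10 or more of the 752 documents.

threshold <- 1 - (10/length(ham_spam)) #~0.9867 with 752 documents
(1 - threshold) * length(ham_spam)     #minimum document count a term needs to survive, ~10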

Statistics about spam_dtm and ham_dtm

spam_dtm<-spam %>% DocumentTermMatrix()
spam_f<-spam_dtm %>% as.matrix %>% colSums()
length(spam_f)
## [1] 30450
ham_dtm<-ham %>% DocumentTermMatrix()
ham_f<-ham_dtm %>% as.matrix %>% colSums
length(ham_f)
## [1] 38026
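
Beyond the vocabulary sizes, the same frequency vectors reveal the most common stems in each class (an added inspection step):

head(sort(spam_f, decreasing = TRUE), 10) #Ten most frequent spam stems
head(sort(ham_f, decreasing = TRUE), 10)  #Ten most frequent ham stems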

Wordcloud

#spam wordcloud
wordcloud(spam, max.words=15, random.order = FALSE, random.color=TRUE, colors = palette())

#ham wordcloud
wordcloud(ham, max.words=15, random.order = FALSE, random.color=TRUE, colors=palette())
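
wordcloud() also accepts explicit word and frequency vectors, so equivalent clouds could be built from the spam_f and ham_f vectors computed above; a sketch:

wordcloud(names(spam_f), spam_f, max.words = 15, random.order = FALSE, colors = palette())
wordcloud(names(ham_f), ham_f, max.words = 15, random.order = FALSE, colors = palette())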

Building the classification container

lbl_ham_spam<-as.vector(unlist(meta(ham_spam, type ="local", tag = "classification")))
head(lbl_ham_spam)
## [1] 1 1 1 0 0 1
N<-length(lbl_ham_spam)
#virgin = FALSE because the true labels of the test documents are known
ham_spam_cont<-create_container(ham_spam_dtm, labels = lbl_ham_spam, trainSize = 1:520, testSize = 521:N, virgin = FALSE)

Decision tree

A decision tree is a classification model that recursively splits the training set on the most informative terms and then assigns each document in the test set the class of the leaf it falls into. Note that the "TREE" algorithm in RTextTools fits a single tree; its "RF" algorithm is the actual random forest, which aggregates many such trees (see the sketch after the results below).

tree_dec_ham_spam<-train_model(ham_spam_cont, "TREE")
tree_res_ham_spam<-classify_model(ham_spam_cont, tree_dec_ham_spam)
head(tree_res_ham_spam)
#Proportion of test documents classified correctly
prop.table(table(tree_res_ham_spam[,1]== lbl_ham_spam[521:N]))
## 
##      FALSE       TRUE 
## 0.04310345 0.95689655

With the decision tree model, about 96% of the test documents are classified correctly.
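
RTextTools also implements an actual random forest under the algorithm name "RF", so the single tree can be compared against it. A sketch, using the same container (results will vary with the shuffle):

rf_ham_spam <- train_model(ham_spam_cont, "RF")
rf_res_ham_spam <- classify_model(ham_spam_cont, rf_ham_spam)
prop.table(table(rf_res_ham_spam[,1] == lbl_ham_spam[521:N]))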