library("tm")
## Loading required package: NLP
library("RTextTools")
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library("tidyverse")
## -- Attaching packages ------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.6
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts ---------------------------------------------------------------- tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("stringr")
library("SnowballC")
##
## Attaching package: 'SnowballC'
## The following objects are masked from 'package:RTextTools':
##
## getStemLanguages, wordStem
library("wordcloud")
## Loading required package: RColorBrewer
Location where the files were extracted. The files come from the SpamAssassin public corpus (https://spamassassin.apache.org/old/publiccorpus/); they can also be downloaded and unpacked directly from R, as sketched below.
spam_dir<-"C:\\Users\\hangr\\Documents\\Acquisition and data management\\spam"
ham_dir<-"C:\\Users\\hangr\\Documents\\Acquisition and data management\\hard_ham"
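A minimal sketch of fetching and extracting the archives from R. The archive names below are assumptions; check the index page for the exact files you want.
# Hypothetical example: fetch and extract two archives from the SpamAssassin corpus page
base_url <- "https://spamassassin.apache.org/old/publiccorpus/"
archives <- c("20030228_spam.tar.bz2", "20030228_hard_ham.tar.bz2") # assumed file names
for (f in archives){
download.file(paste0(base_url, f), destfile = f, mode = "wb")
untar(f) # should create the spam/ and hard_ham/ folders used above
}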
We need to classify each individual file as spam or ham. We first read each directory into a corpus with VCorpus() from the tm package.
spam<-spam_dir %>% DirSource() %>% VCorpus()
ham<-ham_dir %>% DirSource() %>% VCorpus()
meta(ham[[1]])
## author : character(0)
## datetimestamp: 2018-11-04 19:23:26
## description : character(0)
## heading : character(0)
## id : 00001.7c7d6921e671bbe18ebb5f893cd9bb35
## language : en
## origin : character(0)
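Before any cleaning it is worth glancing at the raw content of one message; content() returns the lines of the e-mail, headers included.
# Peek at the first lines of the first ham message and check the corpus sizes
head(content(ham[[1]]), 5)
length(spam)
length(ham)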
#spam dataset
spam<- spam %>% tm_map(content_transformer(PlainTextDocument)) #Transform spam to plain text
spam <- spam %>% tm_map(content_transformer(removePunctuation)) #Remove punctuation from the text
spam <- spam %>% tm_map(content_transformer(tolower)) #Put the text in lower case
spam <- spam %>% tm_map(content_transformer(removeNumbers)) #Removing numbers if any
spam <- spam %>% tm_map(content_transformer(stemDocument), language="english") #Stem each word to its root
spam <- spam %>% tm_map(removeWords, c('receiv', stopwords('english'))) #Remove English stopwords and the stemmed header term 'receiv'
#ham dataset
ham<- ham %>% tm_map(content_transformer(PlainTextDocument)) #Transform ham to plain text
ham <- ham %>% tm_map(content_transformer(removePunctuation)) #Remove punctuation from the text
ham<- ham %>% tm_map(content_transformer(tolower)) #Put the text in lower case
ham <- ham %>% tm_map(content_transformer(removeNumbers)) #Removing numbers if any
ham <- ham %>% tm_map(content_transformer(stemDocument), language="english") #Stem each word to its root
ham <- ham %>% tm_map(removeWords, c('receiv', stopwords('english'))) #Remove English stopwords and the stemmed header term 'receiv'
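Both corpora go through exactly the same sequence of transformations, so the pipeline could be wrapped in a small helper to avoid the duplication. This is just a sketch; clean_corpus is an illustrative name, not a function used elsewhere in this document.
# Illustrative helper: apply the cleaning pipeline above to any tm corpus
clean_corpus <- function(corpus){
corpus %>%
tm_map(content_transformer(PlainTextDocument)) %>%
tm_map(content_transformer(removePunctuation)) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(content_transformer(removeNumbers)) %>%
tm_map(content_transformer(stemDocument), language = "english") %>%
tm_map(removeWords, c('receiv', stopwords('english')))
}
#spam <- clean_corpus(spam); ham <- clean_corpus(ham)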
Joining both corpora into a single corpus; the spam documents come first, followed by the ham documents.
ham_spam <- c(spam, ham)
meta(ham_spam[[1]])
## author : character(0)
## datetimestamp: 2018-11-04 19:23:25
## description : character(0)
## heading : character(0)
## id : 0000.7b1b73cf36cf9dbc3d64e3f2ee2b91f1
## language : en
## origin : character(0)
In the following step we label each document: 1 for spam and 0 for ham. Since the combined corpus was built as c(spam, ham), the first length(spam) documents are spam and the remaining length(ham) documents are ham. The corpus is then shuffled so the two classes are mixed before the training/test split.
for (i in 1:length(spam)){
meta(ham_spam[[i]],"classification")<- 1
}
for(i in (length(spam)+1):(length(spam) + length(ham))){
meta(ham_spam[[i]],"classification")<- 0
}
#Shuffle the combined corpus so spam and ham are interleaved
for (i in 1:10){
ham_spam<-sample(ham_spam)
}
meta(ham_spam[[11]])
## author : character(0)
## datetimestamp : 2018-11-04 19:23:25
## description : character(0)
## heading : character(0)
## id : 0109.601a9cd8272f22236b27e95dbe2fa22d
## language : en
## origin : character(0)
## classification: 0
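As a sanity check, the classification tag can be tabulated across the shuffled corpus; the counts should equal length(spam) (label 1) and length(ham) (label 0).
# Count how many documents carry each label after shuffling
table(unlist(meta(ham_spam, type = "local", tag = "classification")))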
ham_spam_dtm<-DocumentTermMatrix(ham_spam)
ham_spam_dtm
## <<DocumentTermMatrix (documents: 752, terms: 63577)>>
## Non-/sparse entries: 243955/47565949
## Sparsity : 99%
## Maximal term length: 298
## Weighting : term frequency (tf)
#Remove sparse terms
ham_spam_dtm<-ham_spam_dtm %>% removeSparseTerms(1-(10/length(ham_spam)))
ham_spam_dtm
## <<DocumentTermMatrix (documents: 752, terms: 3104)>>
## Non-/sparse entries: 149646/2184562
## Sparsity : 94%
## Maximal term length: 95
## Weighting : term frequency (tf)
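The sparsity argument 1-(10/length(ham_spam)) keeps a term only if it appears in at least about 10 of the documents; the implied cut-off can be computed explicitly.
# Sparsity threshold and the minimum document frequency it implies
sparse_cut <- 1 - (10/length(ham_spam))
sparse_cut
ceiling(length(ham_spam) * (1 - sparse_cut)) # minimum number of documents a kept term must appear in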
spam_dtm<-spam %>% DocumentTermMatrix()
spam_f<-spam_dtm %>% as.matrix %>% colSums()
length(spam_f)
## [1] 30450
ham_dtm<-ham %>% DocumentTermMatrix()
ham_f<-ham_dtm %>% as.matrix %>% colSums
length(ham_f)
## [1] 38026
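Sorting these frequency vectors shows which stemmed terms will dominate the word clouds below.
# Ten most frequent terms in each class
head(sort(spam_f, decreasing = TRUE), 10)
head(sort(ham_f, decreasing = TRUE), 10)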
#spam wordcloud
wordcloud(spam, max.words=15, random.order = FALSE, random.color=TRUE, colors = palette())
#ham wordcloud
wordcloud(ham, max.words=15, random.order = FALSE, random.color=TRUE, colors=palette())
lbl_ham_spam<-as.vector(unlist(meta(ham_spam, type ="local", tag = "classification")))
head(lbl_ham_spam)
## [1] 1 1 1 0 0 1
N<-length(lbl_ham_spam)
ham_spam_cont<-create_container(ham_spam_dtm,labels = lbl_ham_spam, trainSize = 1:520, testSize = 521:N, virgin = TRUE)
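Because the corpus was shuffled, both the training slice (documents 1 to 520) and the test slice (521 to N) should contain a mix of spam and ham; a quick check:
# Class balance in the training and test portions of the shuffled corpus
table(lbl_ham_spam[1:520])
table(lbl_ham_spam[521:N])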
The "TREE" algorithm in RTextTools fits a single classification (decision) tree on the training documents and then assigns each document in the test set the class predicted by that tree.
tree_dec_ham_spam<-train_model(ham_spam_cont, "TREE")
tree_res_ham_spam<-classify_model(ham_spam_cont, tree_dec_ham_spam)
head(tree_res_ham_spam)
#Accuracy: proportion of test documents classified correctly
prop.table(table(tree_res_ham_spam[,1]== lbl_ham_spam[521:N]))
##
## FALSE TRUE
## 0.04310345 0.95689655
With the decision tree model, about 96% of the test documents are classified correctly (accuracy of 0.957).
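Beyond overall accuracy, a confusion matrix shows how the errors split between spam classified as ham and ham classified as spam. This is a sketch using the objects above; the first column of the classify_model() result holds the predicted label.
# Rows: predicted label, columns: true label (assumes both classes appear in the test set)
conf_mat <- table(predicted = tree_res_ham_spam[,1], actual = lbl_ham_spam[521:N])
conf_mat
sum(diag(conf_mat)) / sum(conf_mat) # overall accuracy, matching the proportion above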
Sources:
+ Automated Data Collection, Chapter 10
+ https://www3.nd.edu/~steve/computing_with_data/20_text_mining/text_mining_example.html#/
+ https://www.tidytextmining.com/tidytext.html