For this project we start with a spam/ham dataset and then predict the class of new documents, either withheld from the training set or drawn from another source such as your own spam folder. One example corpus: http://spamassassin.apache.org/old/publiccorpus/. Here we classify emails as spam (unwanted) or ham (wanted).
We download the spam and ham emails from http://spamassassin.apache.org/old/publiccorpus/: the files 20050311_spam_2.tar.bz2 and 20030228_easy_ham.tar.bz2.
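The download and extraction can also be scripted from base R; a minimal sketch, assuming the archives are unpacked into the pr4 folder used below (base_url and archives are just illustrative names):
base_url <- "http://spamassassin.apache.org/old/publiccorpus/"
archives <- c("20050311_spam_2.tar.bz2", "20030228_easy_ham.tar.bz2")
for (f in archives) {
  download.file(paste0(base_url, f), destfile = f, mode = "wb")
  untar(f, exdir = "pr4")  # creates pr4/spam_2 and pr4/easy_ham
}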
library("tm")
## Loading required package: NLP
library("RTextTools")
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library("tidyverse")
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.6
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("stringr")
library("SnowballC")
##
## Attaching package: 'SnowballC'
## The following objects are masked from 'package:RTextTools':
##
## getStemLanguages, wordStem
library("wordcloud")
## Loading required package: RColorBrewer
spam_dir <- 'E:\\github\\Rtesting\\pr4\\spam_2\\'
ham_dir <- 'E:\\github\\Rtesting\\pr4\\easy_ham\\'
spam <- spam_dir %>% DirSource() %>% VCorpus()
ham <- ham_dir %>% DirSource() %>% VCorpus()
meta(spam[[1]])
## author : character(0)
## datetimestamp: 2018-11-05 03:52:55
## description : character(0)
## heading : character(0)
## id : 00001.317e78fa8ee2f54cd4890fdc09ba8176
## language : en
## origin : character(0)
meta(ham[[1]])
## author : character(0)
## datetimestamp: 2018-11-05 03:52:57
## description : character(0)
## heading : character(0)
## id : 00001.7c53336b37003a9286aba55d2945844c
## language : en
## origin : character(0)
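Before cleaning, it helps to glance at the raw text of one message; tm's content() returns the document body as a character vector of lines:
spam[[1]] %>% content() %>% head()  # first few lines of the first spam message (headers)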
spam <- spam %>% tm_map(content_transformer(PlainTextDocument))
spam <- spam %>% tm_map(content_transformer(removePunctuation))
spam <- spam %>% tm_map(content_transformer(tolower))
spam <- spam %>% tm_map(content_transformer(removeNumbers))
spam <- spam %>% tm_map(content_transformer(stemDocument), language = 'english')
# Remove 'receiv' for better accuracy
spam <- spam %>% tm_map(removeWords, c('receiv', stopwords('english')))
ham <- ham %>% tm_map(content_transformer(PlainTextDocument))
ham <- ham %>% tm_map(content_transformer(removePunctuation))
ham <- ham %>% tm_map(content_transformer(tolower))
ham <- ham %>% tm_map(content_transformer(removeNumbers))
ham <- ham %>% tm_map(content_transformer(stemDocument), language = 'english')
# Remove 'receiv', 'spamassassin' for better accuracy
ham <- ham %>% tm_map(removeWords, c('receiv', 'spamassassin', stopwords('english')))
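Since both corpora go through the same pipeline, the steps could be wrapped in a helper to avoid duplication; a sketch (the name clean_corpus is ours, and extra_words holds anything removed beyond the standard stopwords):
clean_corpus <- function(corpus, extra_words = character(0)) {
  corpus %>%
    tm_map(content_transformer(PlainTextDocument)) %>%
    tm_map(content_transformer(removePunctuation)) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(content_transformer(removeNumbers)) %>%
    tm_map(content_transformer(stemDocument), language = 'english') %>%
    tm_map(removeWords, c(extra_words, stopwords('english')))
}
# e.g. spam <- clean_corpus(spam, 'receiv')
#      ham  <- clean_corpus(ham, c('receiv', 'spamassassin'))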
ham_spam <- c(ham, spam) # c() concatenates the two corpora back to back
for (i in 1:length(ham)) {
  meta(ham_spam[[i]], "classification") <- "Ham"
}
for (i in (length(ham) + 1):(length(spam) + length(ham))) {
  meta(ham_spam[[i]], "classification") <- "Spam"
}
for (i in 1:5) {
  ham_spam <- sample(ham_spam)
} # This scrambles the corpus so it is not all ham followed by all spam
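A single seeded shuffle does the same job reproducibly; a sketch of the alternative (not what was run here, and the seed value is arbitrary):
set.seed(1234)  # assumed seed; fixes the document order across knits
ham_spam <- sample(ham_spam)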
meta(ham_spam[[127]])
## author : character(0)
## datetimestamp : 2018-11-05 03:52:57
## description : character(0)
## heading : character(0)
## id : 01746.06fb7b96d18dae121abb94e8a7624f4b
## language : en
## origin : character(0)
## classification: Ham
spam_dtm <- spam %>% DocumentTermMatrix()
spam_dtm <- spam_dtm %>% removeSparseTerms(1-(10/length(spam))) # keep only terms appearing in at least ~10 documents
spam_dtm
## <<DocumentTermMatrix (documents: 1397, terms: 2821)>>
## Non-/sparse entries: 182445/3758492
## Sparsity : 95%
## Maximal term length: 73
## Weighting : term frequency (tf)
ham_dtm <- ham %>% DocumentTermMatrix()
ham_dtm <- ham_dtm %>% removeSparseTerms(1-(10/length(ham)))
ham_dtm
## <<DocumentTermMatrix (documents: 2501, terms: 3507)>>
## Non-/sparse entries: 290643/8480364
## Sparsity : 97%
## Maximal term length: 68
## Weighting : term frequency (tf)
ham_spam_dtm <- ham_spam %>% DocumentTermMatrix()
ham_spam_dtm <- ham_spam_dtm %>% removeSparseTerms(1-(10/length(ham_spam)))
ham_spam_dtm
## <<DocumentTermMatrix (documents: 3898, terms: 5606)>>
## Non-/sparse entries: 490350/21361838
## Sparsity : 98%
## Maximal term length: 73
## Weighting : term frequency (tf)
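tm's findFreqTerms() is a quick way to see which terms dominate after the sparsity filter; the cutoff of 500 occurrences here is an arbitrary, illustrative choice:
ham_spam_dtm %>% findFreqTerms(lowfreq = 500) %>% head(20)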
#For Spam
spam_freq <- spam_dtm %>% as.matrix() %>% colSums()
length(spam_freq)
## [1] 2821
spam_freq_ord <- spam_freq %>% order(decreasing = TRUE)
par(las=1)
barplot(spam_freq[spam_freq_ord[1:10]], horiz = TRUE)
#For Ham
ham_freq <- ham_dtm %>% as.matrix() %>% colSums()
length(ham_freq) #Should be the same as term count, not document count.
## [1] 3507
ham_freq_ord <- ham_freq %>% order(decreasing = TRUE)
par(las=1)
barplot(ham_freq[ham_freq_ord[1:10]], horiz = TRUE)
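The wordcloud package loaded earlier can show the same frequencies at a glance; a sketch for the spam terms (max.words and the palette are illustrative choices):
set.seed(42)  # word placement is random; a seed makes the cloud reproducible
wordcloud(names(spam_freq), spam_freq, max.words = 50,
          colors = brewer.pal(8, "Dark2"))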
# The code below is adapted from an R textbook
lbls <- as.vector(unlist(meta(ham_spam, type="local", tag = "classification")))
head(lbls)
## [1] "Ham" "Ham" "Spam" "Spam" "Ham" "Ham"
N <- length(lbls)
container <- create_container(ham_spam_dtm, labels = lbls, trainSize = 1:501, testSize = 502:N, virgin = TRUE) # virgin = TRUE treats the test documents as unlabeled
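Because the corpus was shuffled, both classes should be represented on each side of the split; a quick sanity check:
table(lbls[1:501])   # class balance in the training slice
table(lbls[502:N])   # class balance in the test slice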
Use a Support Vector Machine (SVM), a supervised learning model, to classify the emails in the test set as ham or spam.
svm_model <- train_model(container, "SVM")
svm_result <- classify_model(container,svm_model)
head(svm_result)
## SVM_LABEL SVM_PROB
## 1 Ham 0.9543802
## 2 Ham 0.9869500
## 3 Ham 0.9997587
## 4 Ham 0.9854293
## 5 Ham 0.9879737
## 6 Spam 0.9984496
prop.table(table(svm_result[,1] == lbls[502:N]))
##
## FALSE TRUE
## 0.01118634 0.98881366
This gave approximately 98.9% accuracy.
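A full confusion matrix separates the two error types, i.e. ham flagged as spam (false positives) versus spam that slipped through (false negatives):
table(predicted = svm_result$SVM_LABEL, actual = lbls[502:N])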
Next, train a decision tree on the training set. (Note that RTextTools' "TREE" algorithm fits a single classification tree, not a random forest; a random forest would be the "RF" algorithm.)
tree_model <- train_model(container, "TREE")
tree_result <- classify_model(container, tree_model)
head(tree_result)
## TREE_LABEL TREE_PROB
## 1 Spam 0.625
## 2 Ham 1.000
## 3 Ham 1.000
## 4 Ham 1.000
## 5 Ham 1.000
## 6 Spam 1.000
prop.table(table(tree_result[,1] == lbls[502:N]))
##
## FALSE TRUE
## 0.02090079 0.97909921
This gave approximately 97.9% accuracy.
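RTextTools can also summarize per-class precision, recall, and F-scores for both models via create_analytics(). A sketch; note that it only scores predictions against the true labels when the container is built with virgin = FALSE, so we rebuild the container here:
container2 <- create_container(ham_spam_dtm, labels = lbls, trainSize = 1:501, testSize = 502:N, virgin = FALSE)
analytics <- create_analytics(container2,
                              cbind(classify_model(container2, svm_model),
                                    classify_model(container2, tree_model)))
summary(analytics)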
Comparing the two models, the SVM achieved about 98.9% accuracy and the classification tree about 97.9%, so the SVM performed slightly better on this split.
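Finally, n-fold cross-validation would give an accuracy estimate that depends less on this particular 501/3397 split; a sketch using RTextTools' cross_validate() with an arbitrary choice of 5 folds:
svm_cv  <- cross_validate(container, 5, algorithm = "SVM")   # reports per-fold accuracy
tree_cv <- cross_validate(container, 5, algorithm = "TREE")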