Project 4

Nnaemezue Obi-Eyisi & Nkasi Nedd

2017-04-16


Introduction

This project explores the use of R to classify whether an email message is spam or not. The data are taken from SpamAssassin's Public Corpus, found here.

The approach is to apply a number of classification algorithms and compare their results in order to identify the most accurate classification technique for this data.

To replicate this project, download the data in the spam and easy_ham folders (found in this project's repository on GitHub) and place them in your working directory.

Setup environment

knitr::opts_chunk$set(echo = TRUE)
if("tidytext" %in% rownames(installed.packages()) == FALSE) {install.packages("tidytext")}
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.3.3
if("rpart" %in% rownames(installed.packages()) == FALSE) {install.packages("rpart")}
library(rpart)
if("tidyr" %in% rownames(installed.packages()) == FALSE) {install.packages("tidyr")}
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.3.2
if("dplyr" %in% rownames(installed.packages()) == FALSE) {install.packages("dplyr")}
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
if("tm" %in% rownames(installed.packages()) == FALSE) {install.packages("tm")}
library(tm)
## Warning: package 'tm' was built under R version 3.3.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.3.2
if("rpart.plot" %in% rownames(installed.packages()) == FALSE) {install.packages("rpart.plot")}
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.3.3
if("e1071" %in% rownames(installed.packages()) == FALSE) {install.packages("e1071")}
library(e1071)
## Warning: package 'e1071' was built under R version 3.3.3
if("class" %in% rownames(installed.packages()) == FALSE) {install.packages("class")}
library(class)
if("stringr" %in% rownames(installed.packages()) == FALSE) {install.packages("stringr")}
library(stringr)
## Warning: package 'stringr' was built under R version 3.3.3
if("nnet" %in% rownames(installed.packages()) == FALSE) {install.packages("nnet")}
library(nnet)
if("wordcloud" %in% rownames(installed.packages()) == FALSE) {install.packages("wordcloud")}
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.3.3
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 3.3.2

Setup paths and functions

Function taken from Machine Learning for Hackers, Chapter 3 - https://github.com/johnmyleswhite/ML_for_Hackers/tree/master/03-Classification

# Relative paths, for data placed in the working directory (see Introduction):
#spam_path <- "spam/"
#ham_path <- "easy_ham/"
# Absolute paths used when this report was knitted:
spam_path <- "C:/Users/Mezue/Documents/Data607/SpamHAM/20030228_spam/spam/"
ham_path <- "C:/Users/Mezue/Documents/Data607/SpamHAM/20021010_easy_ham/easy_ham/"
get.msg <- function(path)
{
  # Return a single element vector of just the email body
  # This is a very simple approach, as we are only using
  # words as features
  con <- file(path, open = "rt")
  text <- readLines(con)
  # The message always begins after the first full line break
  msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
  close(con)
  return(paste(msg, collapse = "\n"))
}
spam_docs <- dir(spam_path)
spam_docs <- spam_docs[which(spam_docs!="cmds")]
all_spam <- sapply(spam_docs, function (p) get.msg(paste(spam_path,p,sep="")))
## Warning in readLines(con): incomplete final line found on 'C:/
## Users/Mezue/Documents/Data607/SpamHAM/20030228_spam/spam/
## 00136.faa39d8e816c70f23b4bb8758d8a74f0'
ham_docs <- dir(ham_path)
ham_docs <- ham_docs[which(ham_docs!="cmds")]
all_ham <- sapply(ham_docs, function (p) get.msg(paste(ham_path,p,sep="")))
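# Options passed to the term-matrix constructors below: drop stop words,
# punctuation and numbers, and require a minimum document frequency of 2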
control <- list(stopwords=TRUE, removePunctuation=TRUE,removeNumbers=TRUE, minDocFreq=2)

Create Spam and Ham Corpus

This includes word lists in the form of term-document and document-term matrices; sparse terms are then removed.

spam_corpus <- Corpus(VectorSource(all_spam))
spam_tdm <- TermDocumentMatrix(spam_corpus,control)
spam_dtm <- DocumentTermMatrix(spam_corpus, control)
#remove sparse items
spam_tdm2<-removeSparseTerms(spam_tdm,0.8)

ham_corpus <- Corpus(VectorSource(all_ham))
ham_tdm <- TermDocumentMatrix(ham_corpus,control)
ham_dtm <- DocumentTermMatrix(ham_corpus, control)
#remove sparse items
ham_tdm2<-removeSparseTerms(ham_tdm,0.8)
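
A quick way to see the effect of removeSparseTerms (an optional check, not part of the original output) is to compare the matrix dimensions before and after, and to list the most frequent surviving terms:

# Terms x documents, before and after dropping sparse terms
dim(spam_tdm); dim(spam_tdm2)
dim(ham_tdm);  dim(ham_tdm2)
# Terms whose total count across all ham documents is at least 100
findFreqTerms(ham_tdm2, lowfreq = 100)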

Visualise the Ham Corpus

wordcloud(ham_corpus, min.freq=600)

Visualise the Spam Corpus

wordcloud(spam_corpus, min.freq=400)
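
The clouds above are built directly from the corpora. An equivalent approach (an illustrative alternative, not used for the figures here) builds the cloud from the term frequencies of the reduced term-document matrix:

# Word cloud from term frequencies in the reduced spam TDM
spam_freq <- sort(rowSums(as.matrix(spam_tdm2)), decreasing = TRUE)
wordcloud(names(spam_freq), spam_freq, max.words = 50)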

Convert corpus to tidy format

tidy_ham <- tidy(ham_tdm2)
head(tidy_ham)
## # A tibble: 6 × 3
##    term                              document count
##   <chr>                                 <chr> <dbl>
## 1   can 0001.ea7e79d3153e7469e7a9c3e0af6a357e     1
## 2   com 0001.ea7e79d3153e7469e7a9c3e0af6a357e     4
## 3  date 0001.ea7e79d3153e7469e7a9c3e0af6a357e     1
## 4   get 0001.ea7e79d3153e7469e7a9c3e0af6a357e     1
## 5  like 0001.ea7e79d3153e7469e7a9c3e0af6a357e     1
## 6  list 0001.ea7e79d3153e7469e7a9c3e0af6a357e     5
tidy_spam<- tidy(spam_tdm2)
head(tidy_spam)
## # A tibble: 6 × 3
##          term                               document count
##         <chr>                                  <chr> <dbl>
## 1       align 00001.7848dde101aa985090474a91ec93fcf0     8
## 2       arial 00001.7848dde101aa985090474a91ec93fcf0     2
## 3        body 00001.7848dde101aa985090474a91ec93fcf0     2
## 4      border 00001.7848dde101aa985090474a91ec93fcf0     3
## 5         can 00001.7848dde101aa985090474a91ec93fcf0     1
## 6 cellpadding 00001.7848dde101aa985090474a91ec93fcf0     3

Change to Wide Format

wide_ham <- spread(tidy_ham, term, count, fill = 0)
head(wide_ham)
## # A tibble: 6 × 24
##                                document   can   com  date   don   get
##                                   <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0001.ea7e79d3153e7469e7a9c3e0af6a357e     1     4     1     0     1
## 2 0002.b3120c4bcbf3101e661161ee7efcb8bf     0     3     0     0     0
## 3 0003.acfc5ad94bbd27118a0d8685d18c89dd     0     3     0     0     1
## 4 0004.e8d5727378ddde5c3be181df593f1712     1     1     0     0     0
## 5 0005.8c3b9e9c0f3f183ddaf7592a11b99957     1     2     0     0     2
## 6 0006.ee8b0dba12856155222be180ba122058     0     3     0     0     0
## # ... with 18 more variables: http <dbl>, just <dbl>, like <dbl>,
## #   list <dbl>, listinfo <dbl>, lists <dbl>, mailing <dbl>, mailman <dbl>,
## #   net <dbl>, new <dbl>, now <dbl>, one <dbl>, time <dbl>, url <dbl>,
## #   use <dbl>, will <dbl>, wrote <dbl>, www <dbl>
wide_spam <- spread(tidy_spam, term, count, fill = 0)
head(wide_spam)
## # A tibble: 6 × 75
##                                 document address align  also arial bgcolor
##                                    <chr>   <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 00001.7848dde101aa985090474a91ec93fcf0       0     8     0     2       0
## 2 00002.d94f1b97e48ed3b553b3508d116e6a09       0     0     0     0       0
## 3 00003.2ee33bc6eacdb11f38d052c44819ba6c       0     0     0     0       0
## 4 00004.eac8de8d759b7e74154f142194282724       0     0     0     0       0
## 5 00005.57696a39d7d84318ce497886896bf90d       0     0     0     0       0
## 6 00006.5ab5620d3d7c6c0db76234556a16f6c1       1     0     0     0       0
## # ... with 69 more variables: body <dbl>, border <dbl>, business <dbl>,
## #   can <dbl>, cellpadding <dbl>, cellspacing <dbl>, center <dbl>,
## #   charset <dbl>, click <dbl>, color <dbl>, com <dbl>, company <dbl>,
## #   content <dbl>, don <dbl>, email <dbl>, equiv <dbl>, face <dbl>,
## #   ffffff <dbl>, font <dbl>, form <dbl>, free <dbl>, get <dbl>,
## #   head <dbl>, height <dbl>, href <dbl>, html <dbl>, http <dbl>,
## #   img <dbl>, information <dbl>, iso <dbl>, just <dbl>, like <dbl>,
## #   link <dbl>, list <dbl>, mail <dbl>, may <dbl>, message <dbl>,
## #   meta <dbl>, money <dbl>, name <dbl>, nbsp <dbl>, net <dbl>, new <dbl>,
## #   now <dbl>, offer <dbl>, one <dbl>, please <dbl>, receive <dbl>,
## #   remove <dbl>, removed <dbl>, reply <dbl>, right <dbl>, send <dbl>,
## #   size <dbl>, src <dbl>, table <dbl>, text <dbl>, time <dbl>,
## #   title <dbl>, today <dbl>, top <dbl>, transfer <dbl>, type <dbl>,
## #   use <dbl>, want <dbl>, width <dbl>, will <dbl>, wish <dbl>, www <dbl>
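
spread() works here, but in tidyr 1.0 and later it has been superseded by pivot_wider(). An equivalent call under that newer API (shown for reference only, assuming tidyr >= 1.1) would be:

# Equivalent to the spread() calls above under tidyr >= 1.1
wide_ham  <- pivot_wider(tidy_ham,  names_from = term, values_from = count, values_fill = 0)
wide_spam <- pivot_wider(tidy_spam, names_from = term, values_from = count, values_fill = 0)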

Insert column to mark class of document

ham_class <- wide_ham %>% 
    mutate(class = "ham")

spam_class <- wide_spam %>% 
    mutate(class = "spam")

Merge Ham and Spam data

#Merge
all_data <- bind_rows(ham_class, spam_class)
#Rearrange columns
all_data <- select(all_data, class, everything())
#Set N/A to 0
all_data[is.na(all_data)] <- 0
#Make class variable a factor variable
all_data$class <- as.factor(all_data$class)
#Remove the document names
all_data <- subset(all_data, select = -c(document))
head(all_data)
## # A tibble: 6 × 82
##    class   can   com  date   don   get  http  just  like  list listinfo
##   <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>
## 1    ham     1     4     1     0     1     0     0     1     5        1
## 2    ham     0     3     0     0     0     2     0     0     0        0
## 3    ham     0     3     0     0     1     2     0     0     0        0
## 4    ham     1     1     0     0     0     2     0     0     1        1
## 5    ham     1     2     0     0     2     0     0     0     1        1
## 6    ham     0     3     0     0     0     3     1     0     0        0
## # ... with 71 more variables: lists <dbl>, mailing <dbl>, mailman <dbl>,
## #   net <dbl>, new <dbl>, now <dbl>, one <dbl>, time <dbl>, url <dbl>,
## #   use <dbl>, will <dbl>, wrote <dbl>, www <dbl>, address <dbl>,
## #   align <dbl>, also <dbl>, arial <dbl>, bgcolor <dbl>, body <dbl>,
## #   border <dbl>, business <dbl>, cellpadding <dbl>, cellspacing <dbl>,
## #   center <dbl>, charset <dbl>, click <dbl>, color <dbl>, company <dbl>,
## #   content <dbl>, email <dbl>, equiv <dbl>, face <dbl>, ffffff <dbl>,
## #   font <dbl>, form <dbl>, free <dbl>, head <dbl>, height <dbl>,
## #   href <dbl>, html <dbl>, img <dbl>, information <dbl>, iso <dbl>,
## #   link <dbl>, mail <dbl>, may <dbl>, message <dbl>, meta <dbl>,
## #   money <dbl>, name <dbl>, nbsp <dbl>, offer <dbl>, please <dbl>,
## #   receive <dbl>, remove <dbl>, removed <dbl>, reply <dbl>, right <dbl>,
## #   send <dbl>, size <dbl>, src <dbl>, table <dbl>, text <dbl>,
## #   title <dbl>, today <dbl>, top <dbl>, transfer <dbl>, type <dbl>,
## #   want <dbl>, width <dbl>, wish <dbl>

Split into Training and Test Sets

set.seed(1800)
index <- 1:nrow(all_data)
#Perform a 60/40 split
trainIndex <- sample(index, trunc(length(index) * 0.6))
train_data <- all_data[trainIndex,]
test_data <- all_data[-trainIndex,]
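
Note that sample() draws rows without regard to class, so the ham/spam mix in the training and test sets can drift slightly from the overall mix. A stratified split (an illustrative alternative, not the split used for the results below) samples 60% within each class:

# Illustrative stratified 60/40 split: sample within each class separately
ham_idx   <- which(all_data$class == "ham")
spam_idx  <- which(all_data$class == "spam")
strat_idx <- c(sample(ham_idx,  trunc(length(ham_idx)  * 0.6)),
               sample(spam_idx, trunc(length(spam_idx) * 0.6)))
strat_train <- all_data[strat_idx, ]
strat_test  <- all_data[-strat_idx, ]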

Visualise the Training and Test sets

table(train_data$class)
## 
##  ham spam 
## 1522  293
plot(train_data$class, main = "Training Data Set")

table(test_data$class)
## 
##  ham spam 
## 1006  205
plot(test_data$class, main = "Test Data Set")
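
The tables and bar plots show a clear class imbalance; viewed as proportions (an optional check, not in the original output), both sets are roughly 84% ham and 16% spam:

prop.table(table(train_data$class))
prop.table(table(test_data$class))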

Classification - Naive Bayes

The Naive Bayes classifier is a probabilistic classifier that applies Bayes' theorem under the naive assumption that the features (here, word counts) are conditionally independent given the class.

pc <- proc.time()
#Create a Naive Bayes classifier object
naivebayes_model <- naiveBayes(train_data, factor(train_data$class))

proc.time() - pc
##    user  system elapsed 
##    0.13    0.00    0.12
summary(naivebayes_model)
##         Length Class  Mode     
## apriori  2     table  numeric  
## tables  82     -none- list     
## levels   2     -none- character
## call     3     -none- call
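
Note that the call above passes the entire training data frame, including the class column itself, as predictors, and predict() will likewise see the class column in test_data. A leakage-free alternative (a sketch using the same e1071 package, not the fit reported below) is the formula interface, which keeps class on the left-hand side only:

# Illustrative alternative: fit on the word-count columns only
naivebayes_model2 <- naiveBayes(class ~ ., data = train_data)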

Evaluation of Naive Bayes

#Evaluate the performance on the test data
naivebayes_predict <- predict(naivebayes_model, newdata=test_data)

#Check the predictions against reality
table(`Actual Class` = test_data$class, `Predicted Class` = naivebayes_predict)
##             Predicted Class
## Actual Class  ham spam
##         ham  1006    0
##         spam  166   39

Accuracy of Naive Bayes

naivebayes_error <- sum(test_data$class != naivebayes_predict)/nrow(test_data)
print(paste0("Accuracy: ", 1 - naivebayes_error))
## [1] "Accuracy: 0.862923203963666"

Classification - Recursive Partitioning and Regression Trees Algorithm

This algorithm creates a decision tree for classification

pc <- proc.time()
tree_model <- rpart(class ~ ., data = train_data, method = "class")
proc.time() - pc
##    user  system elapsed 
##    0.92    0.00    0.92
printcp(tree_model)
## 
## Classification tree:
## rpart(formula = class ~ ., data = train_data, method = "class")
## 
## Variables actually used in tree construction:
## [1] content email   html    mail    please 
## 
## Root node error: 293/1815 = 0.16143
## 
## n= 1815 
## 
##         CP nsplit rel error   xerror     xstd
## 1 0.549488      0  1.000000 1.000000 0.053498
## 2 0.252560      1  0.450512 0.450512 0.037759
## 3 0.078498      2  0.197952 0.197952 0.025574
## 4 0.054608      3  0.119454 0.119454 0.019996
## 5 0.030717      4  0.064846 0.064846 0.014799
## 6 0.010000      5  0.034130 0.047782 0.012721
# plot(tree_model, uniform = TRUE, main = "Classification (RPART).  Classification Tree for SPAM/HAM")
# text(tree_model, all=TRUE, cex = 0.9)

prp(tree_model,faclen=0,cex=0.75,extra=1)
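
The cptable printed by printcp() can also be used to prune the tree at the complexity parameter with the lowest cross-validated error. A common idiom (an illustrative sketch; the unpruned tree is what is evaluated below) is:

# Prune at the CP value with the smallest cross-validated error (xerror)
best_cp <- tree_model$cptable[which.min(tree_model$cptable[, "xerror"]), "CP"]
pruned_tree <- prune(tree_model, cp = best_cp)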

Evaluation of Decision Tree

tree_predict <- predict(tree_model, newdata = test_data, type = "class")
table('Actual Class' = test_data$class, 'Predicted Class' = tree_predict)
##             Predicted Class
## Actual Class  ham spam
##         ham  1006    0
##         spam   14  191

Accuracy of Decision Tree

tree_error <- sum(test_data$class != tree_predict)/nrow(test_data)
print(paste0("Accuracy: ", 1 - tree_error))
## [1] "Accuracy: 0.988439306358382"

Classification - Support Vector Machine (SVM)

An SVM is a non-probabilistic classifier that separates the classes with a maximum-margin hyperplane; with a non-linear kernel (here the default radial basis kernel) the decision boundary can be non-linear in the original feature space.

pc <- proc.time()
svm_model <- svm(class ~ ., method = "class", data = train_data)
proc.time() - pc
##    user  system elapsed 
##    1.23    0.03    1.28
summary(svm_model)
## 
## Call:
## svm(formula = class ~ ., data = train_data, method = "class")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.01234568 
## 
## Number of Support Vectors:  222
## 
##  ( 81 141 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  ham spam
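
The cost and gamma values shown are e1071's defaults. A cross-validated grid search over these hyperparameters could be run with e1071's tune() (an illustrative sketch; it is not used for the results that follow):

# Grid search over cost and gamma (tune() defaults to 10-fold cross-validation)
svm_tuned <- tune(svm, class ~ ., data = train_data,
                  ranges = list(cost = c(0.1, 1, 10), gamma = c(0.001, 0.01, 0.1)))
summary(svm_tuned)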

Evaluation of SVM

svm_predict <- predict(svm_model, newdata = test_data, type = "class")
table(`Actual Class` = test_data$class, `Predicted Class` = svm_predict)
##             Predicted Class
## Actual Class  ham spam
##         ham  1000    6
##         spam   20  185

Accuracy of SVM

svm_error <- sum(test_data$class != svm_predict)/nrow(test_data)
print(paste0("Accuracy: ", 1 - svm_error))
## [1] "Accuracy: 0.978530140379851"

Classification - K-Nearest Neighbours (KNN)

K-Nearest Neighbours is a non-parametric classifier: each test case is assigned the majority class among its k closest training cases in feature space.

#change factor to numeric
train_data_knn <- train_data
train_data_knn$class <- str_replace_all(train_data_knn$class, "ham", "1")
train_data_knn$class <- str_replace_all(train_data_knn$class, "spam", "0")
train_data_knn$class <- as.numeric(as.character(train_data_knn$class))

test_data_knn <- test_data
test_data_knn$class <- str_replace_all(test_data_knn$class, "ham", "1")
test_data_knn$class <- str_replace_all(test_data_knn$class, "spam", "0")
test_data_knn$class <- as.numeric(as.character(test_data_knn$class))


pc <- proc.time()
knn_model <- knn(train = train_data_knn, test = test_data_knn, cl = train_data_knn$class, k = 5)

proc.time() - pc
##    user  system elapsed 
##       1       0       1
summary(knn_model)
##    0    1 
##  164 1047
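
As with the Naive Bayes model, the class column itself is included in the feature matrices passed to knn(). A leakage-free alternative (a hypothetical sketch, not the model evaluated below) drops that column first:

# Drop the class column from the feature matrices before calling knn()
features   <- names(train_data_knn) != "class"
knn_model2 <- knn(train = train_data_knn[, features],
                  test  = test_data_knn[, features],
                  cl    = factor(train_data_knn$class), k = 5)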

Evaluation of KNN

table(`Actual Class` = test_data_knn$class, `Predicted Class` = knn_model)
##             Predicted Class
## Actual Class    0    1
##            0  163   42
##            1    1 1005

Accuracy of KNN

knn_error <- sum(test_data_knn$class != knn_model)/nrow(test_data)
print(paste0("Accuary (Precision): ", 1 - knn_error))
## [1] "Accuary (Precision): 0.9644921552436"

Classification - Artificial Neural Network (ANN)

A classifier loosely modelled on the human brain and nervous system; here nnet fits a single-hidden-layer network with 10 hidden units and a weight decay of 0.1.

pc <- proc.time()
ann_model<-nnet(class~.,data=train_data,size=10,decay=0.1)
## # weights:  831
## initial  value 1369.028855 
## iter  10 value 104.928601
## iter  20 value 52.508418
## iter  30 value 32.016722
## iter  40 value 26.694688
## iter  50 value 24.833268
## iter  60 value 24.005009
## iter  70 value 23.627994
## iter  80 value 23.353777
## iter  90 value 23.195792
## iter 100 value 23.157750
## final  value 23.157750 
## stopped after 100 iterations
proc.time() - pc
##    user  system elapsed 
##    6.61    0.00    6.62
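
The trace shows the optimiser stopped at nnet's default limit of 100 iterations. If the objective has not yet converged, the limit can be raised (an illustrative re-fit, not the model used for the results below):

# Allow more iterations than nnet's default of maxit = 100
ann_model2 <- nnet(class ~ ., data = train_data, size = 10, decay = 0.1, maxit = 300)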

Evaluation of ANN

ann_predict <- predict(ann_model, newdata = test_data, type="class")
table(`Actual Class` = test_data$class, `Predicted Class` = ann_predict)
##             Predicted Class
## Actual Class  ham spam
##         ham  1006    0
##         spam   11  194

Accuracy of ANN

ann_error <- sum(test_data$class != ann_predict)/nrow(test_data)
print(paste0("Accuracy: ", 1 - ann_error))
## [1] "Accuracy: 0.990916597853014"

Conclusion

From the accuracies calculated, the best performing classifiers for this task were the Decision Tree (98.8%) and the Artificial Neural Network (99.1%), while the worst performers were Naive Bayes (86.3%) and K-Nearest Neighbours (96.4%).