Project 4

Nnaemezue Obi-Eyisi & Nkasi Nedd

2017-04-16


Introduction

This project explores the use of R to classify whether an email message is spam or not. The data is taken from Spam Assain’s Public Corpus found here.

The approach is taken to utilise a number of classification algorithms in order to ascertain the most accurate classification technique for this data.

To replicate this project the data in the folders spam and easy_ham (found in this project’s repository on github) should be downloaded and placed in your working folder.

Setup environment

knitr::opts_chunk$set(echo = TRUE)

if("tidytext" %in% rownames(installed.packages()) == FALSE) {install.packages("tidytext")}
library(tidytext)
if("rpart" %in% rownames(installed.packages()) == FALSE) {install.packages("rpart")}
library(rpart)
if("tidyr" %in% rownames(installed.packages()) == FALSE) {install.packages("tidyr")}
library(tidyr)
if("dplyr" %in% rownames(installed.packages()) == FALSE) {install.packages("dplyr")}
library(dplyr)
if("tm" %in% rownames(installed.packages()) == FALSE) {install.packages("tm")}
library(tm)
if("rpart.plot" %in% rownames(installed.packages()) == FALSE) {install.packages("rpart.plot")}
library(rpart.plot)
if("e1071" %in% rownames(installed.packages()) == FALSE) {install.packages("e1071")}
library(e1071)
if("class" %in% rownames(installed.packages()) == FALSE) {install.packages("class")}
library(class)
if("stringr" %in% rownames(installed.packages()) == FALSE) {install.packages("stringr")}
library(stringr)
if("nnet" %in% rownames(installed.packages()) == FALSE) {install.packages("nnet")}
library(nnet)
if("wordcloud" %in% rownames(installed.packages()) == FALSE) {install.packages("wordcloud")}
library(wordcloud)

setup paths and functions

Function taken from ML for hackers Chapter 3 - https://github.com/johnmyleswhite/ML_for_Hackers/tree/master/03-Classification

spam_path <- "spam/"
ham_path <- "easy_ham/"
get.msg <- function(path)
{
# Return a single element vector of just the email body
# This is a very simple approach, as we are only using 
# words as features
  con <- file(path, open = "rt")
  text <- readLines(con)
  # The message always begins after the first full line break
  msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
  close(con)
  return(paste(msg, collapse = "\n"))
}
spam_docs <- dir(spam_path)
spam_docs <- spam_docs[which(spam_docs!="cmds")]
all_spam <- sapply(spam_docs, function (p) get.msg(paste(spam_path,p,sep="")))
## Warning in readLines(con): incomplete final line found on 'spam/
## 00136.faa39d8e816c70f23b4bb8758d8a74f0'
ham_docs <- dir(ham_path)
ham_docs <- ham_docs[which(ham_docs!="cmds")]
all_ham <- sapply(ham_docs, function (p) get.msg(paste(ham_path,p,sep="")))
control <- list(stopwords=TRUE, removePunctuation=TRUE,removeNumbers=TRUE, minDocFreq=2)

Create Spam and Ham Corpus

Including WordLists in the form of Term Document and Document Term matrices

spam_corpus <- Corpus(VectorSource(all_spam))
spam_tdm <- TermDocumentMatrix(spam_corpus,control)
spam_dtm <- DocumentTermMatrix(spam_corpus, control)
#remove sparse items
spam_tdm2<-removeSparseTerms(spam_tdm,0.8)

ham_corpus <- Corpus(VectorSource(all_ham))
ham_tdm <- TermDocumentMatrix(ham_corpus,control)
ham_dtm <- DocumentTermMatrix(ham_corpus, control)
#remove sparse items
ham_tdm2<-removeSparseTerms(ham_tdm,0.8)

Visualise the Ham Corpus

wordcloud(ham_corpus, min.freq=600)

Visualise the Spam Corpus

wordcloud(spam_corpus, min.freq=400)

Convert corpus to tidy format

tidy_ham <- tidy(ham_tdm2)
head(tidy_ham)
## # A tibble: 6 × 3
##    term                               document count
##   <chr>                                  <chr> <dbl>
## 1   can 00001.7c53336b37003a9286aba55d2945844c     1
## 2   com 00001.7c53336b37003a9286aba55d2945844c     4
## 3  date 00001.7c53336b37003a9286aba55d2945844c     1
## 4   get 00001.7c53336b37003a9286aba55d2945844c     1
## 5  like 00001.7c53336b37003a9286aba55d2945844c     1
## 6  list 00001.7c53336b37003a9286aba55d2945844c     5
tidy_spam<- tidy(spam_tdm2)
head(tidy_spam)
## # A tibble: 6 × 3
##          term                               document count
##         <chr>                                  <chr> <dbl>
## 1       align 00001.7848dde101aa985090474a91ec93fcf0     8
## 2       arial 00001.7848dde101aa985090474a91ec93fcf0     2
## 3        body 00001.7848dde101aa985090474a91ec93fcf0     2
## 4      border 00001.7848dde101aa985090474a91ec93fcf0     3
## 5         can 00001.7848dde101aa985090474a91ec93fcf0     1
## 6 cellpadding 00001.7848dde101aa985090474a91ec93fcf0     3

Change to Wide Format

wide_ham <- spread(tidy_ham, term, count, fill = 0)
head(wide_ham)
## # A tibble: 6 × 25
##                                 document   can   com  date   don email
##                                    <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 00001.7c53336b37003a9286aba55d2945844c     1     4     1     0     0
## 2 00002.9c4069e25e1ef370c078db7ee85ff9ac     0     3     0     0     1
## 3 00003.860e3c3cee1b42ead714c5c874fe25f7     0     3     0     0     1
## 4 00004.864220c5b6930b209cc287c361c99af1     1     1     0     0     0
## 5 00005.bf27cdeaf0b8c4647ecd61b1d09da613     0     3     0     0     1
## 6 00006.253ea2f9a9cc36fa0b1129b04b806608     0     3     0     0     1
## # ... with 19 more variables: get <dbl>, http <dbl>, just <dbl>,
## #   like <dbl>, list <dbl>, listinfo <dbl>, lists <dbl>, mailing <dbl>,
## #   mailman <dbl>, net <dbl>, new <dbl>, now <dbl>, one <dbl>, time <dbl>,
## #   url <dbl>, use <dbl>, will <dbl>, wrote <dbl>, www <dbl>
wide_spam <- spread(tidy_spam, term, count, fill = 0)
head(wide_spam)
## # A tibble: 6 × 75
##                                 document address align  also arial bgcolor
##                                    <chr>   <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 00001.7848dde101aa985090474a91ec93fcf0       0     8     0     2       0
## 2 00002.d94f1b97e48ed3b553b3508d116e6a09       0     0     0     0       0
## 3 00003.2ee33bc6eacdb11f38d052c44819ba6c       0     0     0     0       0
## 4 00004.eac8de8d759b7e74154f142194282724       0     0     0     0       0
## 5 00005.57696a39d7d84318ce497886896bf90d       0     0     0     0       0
## 6 00006.5ab5620d3d7c6c0db76234556a16f6c1       1     0     0     0       0
## # ... with 69 more variables: body <dbl>, border <dbl>, business <dbl>,
## #   can <dbl>, cellpadding <dbl>, cellspacing <dbl>, center <dbl>,
## #   charset <dbl>, click <dbl>, color <dbl>, com <dbl>, company <dbl>,
## #   content <dbl>, don <dbl>, email <dbl>, equiv <dbl>, face <dbl>,
## #   ffffff <dbl>, font <dbl>, form <dbl>, free <dbl>, get <dbl>,
## #   head <dbl>, height <dbl>, href <dbl>, html <dbl>, http <dbl>,
## #   img <dbl>, information <dbl>, iso <dbl>, just <dbl>, like <dbl>,
## #   link <dbl>, list <dbl>, mail <dbl>, may <dbl>, message <dbl>,
## #   meta <dbl>, money <dbl>, name <dbl>, nbsp <dbl>, net <dbl>, new <dbl>,
## #   now <dbl>, offer <dbl>, one <dbl>, please <dbl>, receive <dbl>,
## #   remove <dbl>, removed <dbl>, reply <dbl>, right <dbl>, send <dbl>,
## #   size <dbl>, src <dbl>, table <dbl>, text <dbl>, time <dbl>,
## #   title <dbl>, today <dbl>, top <dbl>, transfer <dbl>, type <dbl>,
## #   use <dbl>, want <dbl>, width <dbl>, will <dbl>, wish <dbl>, www <dbl>

Insert column to mark class of document

ham_class <- wide_ham %>% 
    mutate(class = "ham")

spam_class <- wide_spam %>% 
    mutate(class = "spam")

Merge Ham and Spam data

#Merge
all_data <- bind_rows(ham_class, spam_class)
#Rearrance columns
all_data <- select(all_data, class, everything())
#Set N/A to 0
all_data[is.na(all_data)] <- 0
#Make class variable a factor variable
all_data$class <- as.factor(all_data$class)
#Remove the document names
all_data <- subset(all_data, select = -c(document))
head(all_data)
## # A tibble: 6 × 82
##    class   can   com  date   don email   get  http  just  like  list
##   <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1    ham     1     4     1     0     0     1     0     0     1     5
## 2    ham     0     3     0     0     1     0     2     0     0     0
## 3    ham     0     3     0     0     1     1     2     0     0     0
## 4    ham     1     1     0     0     0     0     2     0     0     1
## 5    ham     0     3     0     0     1     0     3     1     0     0
## 6    ham     0     3     0     0     1     0     2     2     0     0
## # ... with 71 more variables: listinfo <dbl>, lists <dbl>, mailing <dbl>,
## #   mailman <dbl>, net <dbl>, new <dbl>, now <dbl>, one <dbl>, time <dbl>,
## #   url <dbl>, use <dbl>, will <dbl>, wrote <dbl>, www <dbl>,
## #   address <dbl>, align <dbl>, also <dbl>, arial <dbl>, bgcolor <dbl>,
## #   body <dbl>, border <dbl>, business <dbl>, cellpadding <dbl>,
## #   cellspacing <dbl>, center <dbl>, charset <dbl>, click <dbl>,
## #   color <dbl>, company <dbl>, content <dbl>, equiv <dbl>, face <dbl>,
## #   ffffff <dbl>, font <dbl>, form <dbl>, free <dbl>, head <dbl>,
## #   height <dbl>, href <dbl>, html <dbl>, img <dbl>, information <dbl>,
## #   iso <dbl>, link <dbl>, mail <dbl>, may <dbl>, message <dbl>,
## #   meta <dbl>, money <dbl>, name <dbl>, nbsp <dbl>, offer <dbl>,
## #   please <dbl>, receive <dbl>, remove <dbl>, removed <dbl>, reply <dbl>,
## #   right <dbl>, send <dbl>, size <dbl>, src <dbl>, table <dbl>,
## #   text <dbl>, title <dbl>, today <dbl>, top <dbl>, transfer <dbl>,
## #   type <dbl>, want <dbl>, width <dbl>, wish <dbl>

Split into Training and Test Sets

set.seed(1800)
index <- 1:nrow(all_data)
#Perform a 60/40 split
trainIndex <- sample(index, trunc(length(index) * 0.6))
train_data <- all_data[trainIndex,]
test_data <- all_data[-trainIndex,]

Visualise the Training and Test sets

table(train_data$class)
## 
##  ham spam 
## 1491  294
plot(train_data$class, main = "Training Data Set")

table(test_data$class)
## 
##  ham spam 
##  987  204
plot(test_data$class, main = "Test Data Set")

Classification - Naive Bayes

The Naive Bayes Classifier is a probabilistic classifier the uses Bayes Theorm

pc <- proc.time()
#Create a Naive Bayes classifier object
naivebayes_model <- naiveBayes(train_data, factor(train_data$class))

proc.time() - pc
##    user  system elapsed 
##    0.01    0.00    0.01
summary(naivebayes_model)
##         Length Class  Mode     
## apriori  2     table  numeric  
## tables  82     -none- list     
## levels   2     -none- character
## call     3     -none- call

Evaluation of Naive Bayes

#Evaluate the performance on the test data
naivebayes_predict <- predict(naivebayes_model, newdata=test_data)

#Check the predictions against reality
table(`Actual Class` = test_data$class, `Predicted Class` = naivebayes_predict)
##             Predicted Class
## Actual Class ham spam
##         ham  987    0
##         spam 163   41

Accuracy of Naive Bayes

naivebayes_error <- sum(test_data$class != naivebayes_predict)/nrow(test_data)
print(paste0("Accuary (Precision): ", 1 - naivebayes_error))
## [1] "Accuary (Precision): 0.863140218303946"

Classification - Recursive Partitioning and Regression Trees Algorithm

This algorithm creates a decision tree for classification

pc <- proc.time()
tree_model <- rpart(class ~ ., data = train_data, method = "class")
proc.time() - pc
##    user  system elapsed 
##    0.24    0.00    0.25
printcp(tree_model)
## 
## Classification tree:
## rpart(formula = class ~ ., data = train_data, method = "class")
## 
## Variables actually used in tree construction:
## [1] click   content free    html    mail    please 
## 
## Root node error: 294/1785 = 0.16471
## 
## n= 1785 
## 
##         CP nsplit rel error   xerror     xstd
## 1 0.568027      0  1.000000 1.000000 0.053302
## 2 0.241497      1  0.431973 0.431973 0.036943
## 3 0.064626      2  0.190476 0.190476 0.025051
## 4 0.057823      3  0.125850 0.149660 0.022282
## 5 0.027211      4  0.068027 0.081633 0.016551
## 6 0.010000      6  0.013605 0.037415 0.011246
# plot(tree_model, uniform = TRUE, main = "Classification (RPART).  Classification Tree for SPAM/HAM")
# text(tree_model, all=TRUE, cex = 0.9)

prp(tree_model,faclen=0,cex=0.75,extra=1)

Evaluation of Regression Tree

tree_predict <- predict(tree_model, newdata = test_data, type = "class")
table('Actual Class' = test_data$class, 'Predicted Class' = tree_predict)
##             Predicted Class
## Actual Class ham spam
##         ham  987    0
##         spam   7  197

Accuracy of Regression Tree

tree_error <- sum(test_data$class != tree_predict)/nrow(test_data)
print(paste0("Accuary (Precision): ", 1 - tree_error))
## [1] "Accuary (Precision): 0.994122586062133"

Classification - Support Vector Machine (SVM)

SVMs is a non-probabilistic linear classifier

pc <- proc.time()
svm_model <- svm(class ~ ., method = "class", data = train_data)
proc.time() - pc
##    user  system elapsed 
##    0.58    0.02    0.60
summary(svm_model)
## 
## Call:
## svm(formula = class ~ ., data = train_data, method = "class")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.01234568 
## 
## Number of Support Vectors:  235
## 
##  ( 85 150 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  ham spam

Evaluation of SVM

svm_predict <- predict(svm_model, newdata = test_data, type = "class")
table(`Actual Class` = test_data$class, `Predicted Class` = svm_predict)
##             Predicted Class
## Actual Class ham spam
##         ham  981    6
##         spam  19  185

Accuracy of SVM

svm_error <- sum(test_data$class != svm_predict)/nrow(test_data)
print(paste0("Accuary (Precision): ", 1 - svm_error))
## [1] "Accuary (Precision): 0.979009235936188"

Classification - K-Nearest Neighbours (KNN)

This is a non-parametric classifier

#change factor to numeric
train_data_knn <- train_data
train_data_knn$class <- str_replace_all(train_data_knn$class, "ham", "1")
train_data_knn$class <- str_replace_all(train_data_knn$class, "spam", "0")
train_data_knn$class <- as.numeric(as.character(train_data_knn$class))

test_data_knn <- test_data
test_data_knn$class <- str_replace_all(test_data_knn$class, "ham", "1")
test_data_knn$class <- str_replace_all(test_data_knn$class, "spam", "0")
test_data_knn$class <- as.numeric(as.character(test_data_knn$class))


pc <- proc.time()
knn_model <- knn(train=train_data_knn,test=test_data_knn ,cl = train_data_knn$class,k = 5)

proc.time() - pc
##    user  system elapsed 
##    0.33    0.01    0.34
summary(knn_model)
##    0    1 
##  164 1027

Evaluation of KNN

table(`Actual Class` = test_data_knn$class, `Predicted Class` = knn_model)
##             Predicted Class
## Actual Class   0   1
##            0 162  42
##            1   2 985

Accuracy of KNN

knn_error <- sum(test_data_knn$class != knn_model)/nrow(test_data)
print(paste0("Accuary (Precision): ", 1 - knn_error))
## [1] "Accuary (Precision): 0.963056255247691"

Classification - Artificial Neural Network (ANN)

A classifier modelled on the human brain and nervous system

pc <- proc.time()
ann_model<-nnet(class~.,data=train_data,size=10,decay=0.1)
## # weights:  831
## initial  value 1149.450383 
## iter  10 value 128.737079
## iter  20 value 53.160856
## iter  30 value 24.253617
## iter  40 value 19.910851
## iter  50 value 18.439859
## iter  60 value 17.868993
## iter  70 value 17.739144
## iter  80 value 17.637764
## iter  90 value 17.567545
## iter 100 value 17.537969
## final  value 17.537969 
## stopped after 100 iterations
proc.time() - pc
##    user  system elapsed 
##    1.97    0.00    1.98

Evaluation of ANN

ann_predict <- predict(ann_model, newdata = test_data, type="class")
table(`Actual Class` = test_data$class, `Predicted Class` = ann_predict)
##             Predicted Class
## Actual Class ham spam
##         ham  986    1
##         spam   7  197

Accuracy of ANN

knn_error <- sum(test_data$class != ann_predict)/nrow(test_data)
print(paste0("Accuary (Precision): ", 1 - knn_error))
## [1] "Accuary (Precision): 0.99328295549958"

Conclusion

From the accuracies calculated it can be concluded that the best performing classifiers for this task are the Decision Tree and the Neural Networks. The worst performers were Naive Bayes and K-Nearest Neighbours.