This project explores the use of R to classify whether an email message is spam or not. The data is taken from the SpamAssassin Public Corpus found here.
The approach taken is to apply a number of classification algorithms and compare their accuracy in order to identify the most effective technique for this data.
To replicate this project, the data in the folders spam and easy_ham (found in this project's repository on GitHub) should be downloaded and placed in your working folder.
knitr::opts_chunk$set(echo = TRUE)
if("tidytext" %in% rownames(installed.packages()) == FALSE) {install.packages("tidytext")}
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.3.3
if("rpart" %in% rownames(installed.packages()) == FALSE) {install.packages("rpart")}
library(rpart)
if("tidyr" %in% rownames(installed.packages()) == FALSE) {install.packages("tidyr")}
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.3.2
if("dplyr" %in% rownames(installed.packages()) == FALSE) {install.packages("dplyr")}
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
if("tm" %in% rownames(installed.packages()) == FALSE) {install.packages("tm")}
library(tm)
## Warning: package 'tm' was built under R version 3.3.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.3.2
if("rpart.plot" %in% rownames(installed.packages()) == FALSE) {install.packages("rpart.plot")}
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.3.3
if("e1071" %in% rownames(installed.packages()) == FALSE) {install.packages("e1071")}
library(e1071)
## Warning: package 'e1071' was built under R version 3.3.3
if("class" %in% rownames(installed.packages()) == FALSE) {install.packages("class")}
library(class)
if("stringr" %in% rownames(installed.packages()) == FALSE) {install.packages("stringr")}
library(stringr)
## Warning: package 'stringr' was built under R version 3.3.3
if("nnet" %in% rownames(installed.packages()) == FALSE) {install.packages("nnet")}
library(nnet)
if("wordcloud" %in% rownames(installed.packages()) == FALSE) {install.packages("wordcloud")}
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.3.3
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 3.3.2
Function taken from Machine Learning for Hackers, Chapter 3 - https://github.com/johnmyleswhite/ML_for_Hackers/tree/master/03-Classification
#Relative paths (use these if the spam and easy_ham folders are placed in your working folder, as described above):
#spam_path <- "spam/"
#ham_path <- "easy_ham/"
#Absolute paths used when this report was knitted:
spam_path <- "C:/Users/Mezue/Documents/Data607/SpamHAM/20030228_spam/spam/"
ham_path <- "C:/Users/Mezue/Documents/Data607/SpamHAM/20021010_easy_ham/easy_ham/"
get.msg <- function(path)
{
  # Return a single element vector of just the email body.
  # This is a very simple approach, as we are only using
  # words as features.
  con <- file(path, open = "rt")
  text <- readLines(con)
  # The message body always begins after the first blank line
  msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
  close(con)
  return(paste(msg, collapse = "\n"))
}
spam_docs <- dir(spam_path)
spam_docs <- spam_docs[which(spam_docs!="cmds")]
all_spam <- sapply(spam_docs, function (p) get.msg(paste(spam_path,p,sep="")))
## Warning in readLines(con): incomplete final line found on 'C:/
## Users/Mezue/Documents/Data607/SpamHAM/20030228_spam/spam/
## 00136.faa39d8e816c70f23b4bb8758d8a74f0'
ham_docs <- dir(ham_path)
ham_docs <- ham_docs[which(ham_docs!="cmds")]
all_ham <- sapply(ham_docs, function (p) get.msg(paste(ham_path,p,sep="")))
control <- list(stopwords=TRUE, removePunctuation=TRUE,removeNumbers=TRUE, minDocFreq=2)
spam_corpus <- Corpus(VectorSource(all_spam))
spam_tdm <- TermDocumentMatrix(spam_corpus,control)
spam_dtm <- DocumentTermMatrix(spam_corpus, control)
#remove sparse items
spam_tdm2<-removeSparseTerms(spam_tdm,0.8)
ham_corpus <- Corpus(VectorSource(all_ham))
ham_tdm <- TermDocumentMatrix(ham_corpus,control)
ham_dtm <- DocumentTermMatrix(ham_corpus, control)
#remove sparse items
ham_tdm2<-removeSparseTerms(ham_tdm,0.8)
wordcloud(ham_corpus, min.freq=600)
wordcloud(spam_corpus, min.freq=400)
tidy_ham <- tidy(ham_tdm2)
head(tidy_ham)
## # A tibble: 6 × 3
## term document count
## <chr> <chr> <dbl>
## 1 can 0001.ea7e79d3153e7469e7a9c3e0af6a357e 1
## 2 com 0001.ea7e79d3153e7469e7a9c3e0af6a357e 4
## 3 date 0001.ea7e79d3153e7469e7a9c3e0af6a357e 1
## 4 get 0001.ea7e79d3153e7469e7a9c3e0af6a357e 1
## 5 like 0001.ea7e79d3153e7469e7a9c3e0af6a357e 1
## 6 list 0001.ea7e79d3153e7469e7a9c3e0af6a357e 5
tidy_spam<- tidy(spam_tdm2)
head(tidy_spam)
## # A tibble: 6 × 3
## term document count
## <chr> <chr> <dbl>
## 1 align 00001.7848dde101aa985090474a91ec93fcf0 8
## 2 arial 00001.7848dde101aa985090474a91ec93fcf0 2
## 3 body 00001.7848dde101aa985090474a91ec93fcf0 2
## 4 border 00001.7848dde101aa985090474a91ec93fcf0 3
## 5 can 00001.7848dde101aa985090474a91ec93fcf0 1
## 6 cellpadding 00001.7848dde101aa985090474a91ec93fcf0 3
wide_ham <- spread(tidy_ham, term, count, fill = 0)
head(wide_ham)
## # A tibble: 6 × 24
## document can com date don get
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0001.ea7e79d3153e7469e7a9c3e0af6a357e 1 4 1 0 1
## 2 0002.b3120c4bcbf3101e661161ee7efcb8bf 0 3 0 0 0
## 3 0003.acfc5ad94bbd27118a0d8685d18c89dd 0 3 0 0 1
## 4 0004.e8d5727378ddde5c3be181df593f1712 1 1 0 0 0
## 5 0005.8c3b9e9c0f3f183ddaf7592a11b99957 1 2 0 0 2
## 6 0006.ee8b0dba12856155222be180ba122058 0 3 0 0 0
## # ... with 18 more variables: http <dbl>, just <dbl>, like <dbl>,
## # list <dbl>, listinfo <dbl>, lists <dbl>, mailing <dbl>, mailman <dbl>,
## # net <dbl>, new <dbl>, now <dbl>, one <dbl>, time <dbl>, url <dbl>,
## # use <dbl>, will <dbl>, wrote <dbl>, www <dbl>
wide_spam <- spread(tidy_spam, term, count, fill = 0)
head(wide_spam)
## # A tibble: 6 × 75
## document address align also arial bgcolor
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 00001.7848dde101aa985090474a91ec93fcf0 0 8 0 2 0
## 2 00002.d94f1b97e48ed3b553b3508d116e6a09 0 0 0 0 0
## 3 00003.2ee33bc6eacdb11f38d052c44819ba6c 0 0 0 0 0
## 4 00004.eac8de8d759b7e74154f142194282724 0 0 0 0 0
## 5 00005.57696a39d7d84318ce497886896bf90d 0 0 0 0 0
## 6 00006.5ab5620d3d7c6c0db76234556a16f6c1 1 0 0 0 0
## # ... with 69 more variables: body <dbl>, border <dbl>, business <dbl>,
## # can <dbl>, cellpadding <dbl>, cellspacing <dbl>, center <dbl>,
## # charset <dbl>, click <dbl>, color <dbl>, com <dbl>, company <dbl>,
## # content <dbl>, don <dbl>, email <dbl>, equiv <dbl>, face <dbl>,
## # ffffff <dbl>, font <dbl>, form <dbl>, free <dbl>, get <dbl>,
## # head <dbl>, height <dbl>, href <dbl>, html <dbl>, http <dbl>,
## # img <dbl>, information <dbl>, iso <dbl>, just <dbl>, like <dbl>,
## # link <dbl>, list <dbl>, mail <dbl>, may <dbl>, message <dbl>,
## # meta <dbl>, money <dbl>, name <dbl>, nbsp <dbl>, net <dbl>, new <dbl>,
## # now <dbl>, offer <dbl>, one <dbl>, please <dbl>, receive <dbl>,
## # remove <dbl>, removed <dbl>, reply <dbl>, right <dbl>, send <dbl>,
## # size <dbl>, src <dbl>, table <dbl>, text <dbl>, time <dbl>,
## # title <dbl>, today <dbl>, top <dbl>, transfer <dbl>, type <dbl>,
## # use <dbl>, want <dbl>, width <dbl>, will <dbl>, wish <dbl>, www <dbl>
ham_class <- wide_ham %>%
mutate(class = "ham")
spam_class <- wide_spam %>%
mutate(class = "spam")
#Merge
all_data <- bind_rows(ham_class, spam_class)
#Rearrange columns
all_data <- select(all_data, class, everything())
#Set N/A to 0
all_data[is.na(all_data)] <- 0
#Make class variable a factor variable
all_data$class <- as.factor(all_data$class)
#Remove the document names
all_data <- subset(all_data, select = -c(document))
head(all_data)
## # A tibble: 6 × 82
## class can com date don get http just like list listinfo
## <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ham 1 4 1 0 1 0 0 1 5 1
## 2 ham 0 3 0 0 0 2 0 0 0 0
## 3 ham 0 3 0 0 1 2 0 0 0 0
## 4 ham 1 1 0 0 0 2 0 0 1 1
## 5 ham 1 2 0 0 2 0 0 0 1 1
## 6 ham 0 3 0 0 0 3 1 0 0 0
## # ... with 71 more variables: lists <dbl>, mailing <dbl>, mailman <dbl>,
## # net <dbl>, new <dbl>, now <dbl>, one <dbl>, time <dbl>, url <dbl>,
## # use <dbl>, will <dbl>, wrote <dbl>, www <dbl>, address <dbl>,
## # align <dbl>, also <dbl>, arial <dbl>, bgcolor <dbl>, body <dbl>,
## # border <dbl>, business <dbl>, cellpadding <dbl>, cellspacing <dbl>,
## # center <dbl>, charset <dbl>, click <dbl>, color <dbl>, company <dbl>,
## # content <dbl>, email <dbl>, equiv <dbl>, face <dbl>, ffffff <dbl>,
## # font <dbl>, form <dbl>, free <dbl>, head <dbl>, height <dbl>,
## # href <dbl>, html <dbl>, img <dbl>, information <dbl>, iso <dbl>,
## # link <dbl>, mail <dbl>, may <dbl>, message <dbl>, meta <dbl>,
## # money <dbl>, name <dbl>, nbsp <dbl>, offer <dbl>, please <dbl>,
## # receive <dbl>, remove <dbl>, removed <dbl>, reply <dbl>, right <dbl>,
## # send <dbl>, size <dbl>, src <dbl>, table <dbl>, text <dbl>,
## # title <dbl>, today <dbl>, top <dbl>, transfer <dbl>, type <dbl>,
## # want <dbl>, width <dbl>, wish <dbl>
set.seed(1800)
index <- 1:nrow(all_data)
#Perform a 60/40 split
trainIndex <- sample(index, trunc(length(index) * 0.6))
train_data <- all_data[trainIndex,]
test_data <- all_data[-trainIndex,]
table(train_data$class)
##
## ham spam
## 1522 293
plot(train_data$class, main = "Training Data Set")
table(test_data$class)
##
## ham spam
## 1006 205
plot(test_data$class, main = "Test Data Set")
The Naive Bayes classifier is a probabilistic classifier that uses Bayes' Theorem.
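As a quick illustration of the theorem, the snippet below works through a single made-up word: the spam prior of roughly 16% matches the training split used later, but the word likelihoods are hypothetical and are not estimated from this corpus.
#Illustration only: Bayes' theorem for one hypothetical word
p_spam   <- 0.16   #prior P(spam), roughly the proportion in the training set
p_w_spam <- 0.25   #hypothetical P(word | spam)
p_w_ham  <- 0.01   #hypothetical P(word | ham)
p_spam * p_w_spam / (p_spam * p_w_spam + (1 - p_spam) * p_w_ham)   #posterior P(spam | word), ~0.83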
pc <- proc.time()
#Create a Naive Bayes classifier object
naivebayes_model <- naiveBayes(train_data, factor(train_data$class))
proc.time() - pc
## user system elapsed
## 0.13 0.00 0.12
summary(naivebayes_model)
## Length Class Mode
## apriori 2 table numeric
## tables 82 -none- list
## levels 2 -none- character
## call 3 -none- call
#Evaluate the performance on the test data
naivebayes_predict <- predict(naivebayes_model, newdata=test_data)
#Check the predictions against reality
table(`Actual Class` = test_data$class, `Predicted Class` = naivebayes_predict)
## Predicted Class
## Actual Class ham spam
## ham 1006 0
## spam 166 39
naivebayes_error <- sum(test_data$class != naivebayes_predict)/nrow(test_data)
print(paste0("Accuary (Precision): ", 1 - naivebayes_error))
## [1] "Accuary (Precision): 0.862923203963666"
The rpart algorithm builds a decision tree for classification.
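rpart chooses its classification splits using Gini impurity by default. The sketch below shows that calculation for one hypothetical split; only the totals (1522 ham, 293 spam, 1815 messages) come from the training set above, while the child-node counts are invented.
#Illustration only: Gini impurity for a hypothetical split of the training set
gini <- function(p) 1 - sum(p^2)
gini(c(1522, 293) / 1815)                       #impurity of the root node
left  <- c(ham = 290,  spam = 10)  / 300        #hypothetical left child proportions
right <- c(ham = 1232, spam = 283) / 1515       #hypothetical right child proportions
(300 * gini(left) + 1515 * gini(right)) / 1815  #weighted impurity after the split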
pc <- proc.time()
tree_model <- rpart(class ~ ., data = train_data, method = "class")
proc.time() - pc
## user system elapsed
## 0.92 0.00 0.92
printcp(tree_model)
##
## Classification tree:
## rpart(formula = class ~ ., data = train_data, method = "class")
##
## Variables actually used in tree construction:
## [1] content email html mail please
##
## Root node error: 293/1815 = 0.16143
##
## n= 1815
##
## CP nsplit rel error xerror xstd
## 1 0.549488 0 1.000000 1.000000 0.053498
## 2 0.252560 1 0.450512 0.450512 0.037759
## 3 0.078498 2 0.197952 0.197952 0.025574
## 4 0.054608 3 0.119454 0.119454 0.019996
## 5 0.030717 4 0.064846 0.064846 0.014799
## 6 0.010000 5 0.034130 0.047782 0.012721
# plot(tree_model, uniform = TRUE, main = "Classification (RPART). Classification Tree for SPAM/HAM")
# text(tree_model, all=TRUE, cex = 0.9)
prp(tree_model,faclen=0,cex=0.75,extra=1)
tree_predict <- predict(tree_model, newdata = test_data, type = "class")
table('Actual Class' = test_data$class, 'Predicted Class' = tree_predict)
## Predicted Class
## Actual Class ham spam
## ham 1006 0
## spam 14 191
tree_error <- sum(test_data$class != tree_predict)/nrow(test_data)
print(paste0("Accuary (Precision): ", 1 - tree_error))
## [1] "Accuary (Precision): 0.988439306358382"
A Support Vector Machine (SVM) is a non-probabilistic classifier that separates the classes with a maximum-margin decision boundary; here e1071 fits it with a radial kernel.
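By default, e1071's svm uses a radial (RBF) kernel, K(x, y) = exp(-gamma * ||x - y||^2), to score the similarity of two word-count vectors. A minimal sketch of that calculation with made-up vectors (gamma defaults to 1 divided by the number of features):
#Illustration only: the radial kernel applied to two toy word-count vectors
rbf_kernel <- function(x, y, gamma) exp(-gamma * sum((x - y)^2))
rbf_kernel(c(1, 0, 2), c(0, 0, 3), gamma = 1/3)   #values near 1 mean the vectors are very alike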
pc <- proc.time()
svm_model <- svm(class ~ ., method = "class", data = train_data)
proc.time() - pc
## user system elapsed
## 1.23 0.03 1.28
summary(svm_model)
##
## Call:
## svm(formula = class ~ ., data = train_data, method = "class")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.01234568
##
## Number of Support Vectors: 222
##
## ( 81 141 )
##
##
## Number of Classes: 2
##
## Levels:
## ham spam
svm_predict <- predict(svm_model, newdata = test_data, type = "class")
table(`Actual Class` = test_data$class, `Predicted Class` = svm_predict)
## Predicted Class
## Actual Class ham spam
## ham 1000 6
## spam 20 185
svm_error <- sum(test_data$class != svm_predict)/nrow(test_data)
print(paste0("Accuary (Precision): ", 1 - svm_error))
## [1] "Accuary (Precision): 0.978530140379851"
K-Nearest Neighbours (KNN) is a non-parametric classifier.
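As a rough sketch of what class::knn does for a single test message (train_x and train_y are hypothetical names for a numeric matrix of word counts and its labels, not objects created elsewhere in this report):
#Illustration only: classify one test row by majority vote of its k nearest neighbours
predict_one_knn <- function(test_row, train_x, train_y, k = 5) {
  d <- sqrt(rowSums(sweep(train_x, 2, test_row)^2))   #Euclidean distance to every training row
  nearest <- train_y[order(d)[1:k]]                   #labels of the k closest rows
  names(sort(table(nearest), decreasing = TRUE))[1]   #most common label wins
}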
#change factor to numeric
train_data_knn <- train_data
train_data_knn$class <- str_replace_all(train_data_knn$class, "ham", "1")
train_data_knn$class <- str_replace_all(train_data_knn$class, "spam", "0")
train_data_knn$class <- as.numeric(as.character(train_data_knn$class))
test_data_knn <- test_data
test_data_knn$class <- str_replace_all(test_data_knn$class, "ham", "1")
test_data_knn$class <- str_replace_all(test_data_knn$class, "spam", "0")
test_data_knn$class <- as.numeric(as.character(test_data_knn$class))
pc <- proc.time()
knn_model <- knn(train=train_data_knn,test=test_data_knn ,cl = train_data_knn$class,k = 5)
proc.time() - pc
## user system elapsed
## 1 0 1
summary(knn_model)
## 0 1
## 164 1047
table(`Actual Class` = test_data_knn$class, `Predicted Class` = knn_model)
## Predicted Class
## Actual Class 0 1
## 0 163 42
## 1 1 1005
knn_error <- sum(test_data_knn$class != knn_model)/nrow(test_data)
print(paste0("Accuary (Precision): ", 1 - knn_error))
## [1] "Accuary (Precision): 0.9644921552436"
An artificial neural network is a classifier loosely modelled on the human brain and nervous system.
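nnet fits a network with one hidden layer whose units apply a logistic (sigmoid) activation to a weighted sum of the inputs. A one-unit sketch with made-up word counts and weights:
#Illustration only: the activation of a single hidden unit
sigmoid <- function(z) 1 / (1 + exp(-z))
x <- c(html = 3, free = 1, wrote = 0)            #hypothetical word counts
w <- c(html = 0.8, free = 0.5, wrote = -0.9)     #hypothetical weights
b <- -0.2                                        #hypothetical bias
sigmoid(sum(w * x) + b)                          #output between 0 and 1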
pc <- proc.time()
ann_model<-nnet(class~.,data=train_data,size=10,decay=0.1)
## # weights: 831
## initial value 1369.028855
## iter 10 value 104.928601
## iter 20 value 52.508418
## iter 30 value 32.016722
## iter 40 value 26.694688
## iter 50 value 24.833268
## iter 60 value 24.005009
## iter 70 value 23.627994
## iter 80 value 23.353777
## iter 90 value 23.195792
## iter 100 value 23.157750
## final value 23.157750
## stopped after 100 iterations
proc.time() - pc
## user system elapsed
## 6.61 0.00 6.62
ann_predict <- predict(ann_model, newdata = test_data, type="class")
table(`Actual Class` = test_data$class, `Predicted Class` = ann_predict)
## Predicted Class
## Actual Class ham spam
## ham 1006 0
## spam 11 194
ann_error <- sum(test_data$class != ann_predict)/nrow(test_data)
print(paste0("Accuracy: ", 1 - ann_error))
## [1] "Accuracy: 0.990916597853014"
From the accuracies calculated, it can be concluded that the best performing classifiers for this task are the Decision Tree and the Neural Network. The worst performers were Naive Bayes and K-Nearest Neighbours.
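A small side-by-side summary can be assembled from the error variables computed above (assuming they are all still in the workspace, with the neural network error stored as ann_error):
#Compare test-set accuracy across the five classifiers
data.frame(
  classifier = c("Naive Bayes", "Decision Tree", "SVM", "KNN", "Neural Network"),
  accuracy   = 1 - c(naivebayes_error, tree_error, svm_error, knn_error, ann_error)
)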
The following sources were used to complete this assignment: