This project explores the use of R to classify whether an email message is spam or not. The data is taken from Spam Assain’s Public Corpus found here.
The approach is taken to utilise a number of classification algorithms in order to ascertain the most accurate classification technique for this data.
To replicate this project the data in the folders spam and easy_ham (found in this project’s repository on github) should be downloaded and placed in your working folder.
knitr::opts_chunk$set(echo = TRUE)
if("tidytext" %in% rownames(installed.packages()) == FALSE) {install.packages("tidytext")}
library(tidytext)
if("rpart" %in% rownames(installed.packages()) == FALSE) {install.packages("rpart")}
library(rpart)
if("tidyr" %in% rownames(installed.packages()) == FALSE) {install.packages("tidyr")}
library(tidyr)
if("dplyr" %in% rownames(installed.packages()) == FALSE) {install.packages("dplyr")}
library(dplyr)
if("tm" %in% rownames(installed.packages()) == FALSE) {install.packages("tm")}
library(tm)
if("rpart.plot" %in% rownames(installed.packages()) == FALSE) {install.packages("rpart.plot")}
library(rpart.plot)
if("e1071" %in% rownames(installed.packages()) == FALSE) {install.packages("e1071")}
library(e1071)
if("class" %in% rownames(installed.packages()) == FALSE) {install.packages("class")}
library(class)
if("stringr" %in% rownames(installed.packages()) == FALSE) {install.packages("stringr")}
library(stringr)
if("nnet" %in% rownames(installed.packages()) == FALSE) {install.packages("nnet")}
library(nnet)
if("wordcloud" %in% rownames(installed.packages()) == FALSE) {install.packages("wordcloud")}
library(wordcloud)
Function taken from ML for hackers Chapter 3 - https://github.com/johnmyleswhite/ML_for_Hackers/tree/master/03-Classification
spam_path <- "spam/"
ham_path <- "easy_ham/"
get.msg <- function(path)
{
# Return a single element vector of just the email body
# This is a very simple approach, as we are only using
# words as features
con <- file(path, open = "rt")
text <- readLines(con)
# The message always begins after the first full line break
msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
close(con)
return(paste(msg, collapse = "\n"))
}
spam_docs <- dir(spam_path)
spam_docs <- spam_docs[which(spam_docs!="cmds")]
all_spam <- sapply(spam_docs, function (p) get.msg(paste(spam_path,p,sep="")))
## Warning in readLines(con): incomplete final line found on 'spam/
## 00136.faa39d8e816c70f23b4bb8758d8a74f0'
ham_docs <- dir(ham_path)
ham_docs <- ham_docs[which(ham_docs!="cmds")]
all_ham <- sapply(ham_docs, function (p) get.msg(paste(ham_path,p,sep="")))
control <- list(stopwords=TRUE, removePunctuation=TRUE,removeNumbers=TRUE, minDocFreq=2)
spam_corpus <- Corpus(VectorSource(all_spam))
spam_tdm <- TermDocumentMatrix(spam_corpus,control)
spam_dtm <- DocumentTermMatrix(spam_corpus, control)
#remove sparse items
spam_tdm2<-removeSparseTerms(spam_tdm,0.8)
ham_corpus <- Corpus(VectorSource(all_ham))
ham_tdm <- TermDocumentMatrix(ham_corpus,control)
ham_dtm <- DocumentTermMatrix(ham_corpus, control)
#remove sparse items
ham_tdm2<-removeSparseTerms(ham_tdm,0.8)
wordcloud(ham_corpus, min.freq=600)
wordcloud(spam_corpus, min.freq=400)
tidy_ham <- tidy(ham_tdm2)
head(tidy_ham)
## # A tibble: 6 × 3
## term document count
## <chr> <chr> <dbl>
## 1 can 00001.7c53336b37003a9286aba55d2945844c 1
## 2 com 00001.7c53336b37003a9286aba55d2945844c 4
## 3 date 00001.7c53336b37003a9286aba55d2945844c 1
## 4 get 00001.7c53336b37003a9286aba55d2945844c 1
## 5 like 00001.7c53336b37003a9286aba55d2945844c 1
## 6 list 00001.7c53336b37003a9286aba55d2945844c 5
tidy_spam<- tidy(spam_tdm2)
head(tidy_spam)
## # A tibble: 6 × 3
## term document count
## <chr> <chr> <dbl>
## 1 align 00001.7848dde101aa985090474a91ec93fcf0 8
## 2 arial 00001.7848dde101aa985090474a91ec93fcf0 2
## 3 body 00001.7848dde101aa985090474a91ec93fcf0 2
## 4 border 00001.7848dde101aa985090474a91ec93fcf0 3
## 5 can 00001.7848dde101aa985090474a91ec93fcf0 1
## 6 cellpadding 00001.7848dde101aa985090474a91ec93fcf0 3
wide_ham <- spread(tidy_ham, term, count, fill = 0)
head(wide_ham)
## # A tibble: 6 × 25
## document can com date don email
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 00001.7c53336b37003a9286aba55d2945844c 1 4 1 0 0
## 2 00002.9c4069e25e1ef370c078db7ee85ff9ac 0 3 0 0 1
## 3 00003.860e3c3cee1b42ead714c5c874fe25f7 0 3 0 0 1
## 4 00004.864220c5b6930b209cc287c361c99af1 1 1 0 0 0
## 5 00005.bf27cdeaf0b8c4647ecd61b1d09da613 0 3 0 0 1
## 6 00006.253ea2f9a9cc36fa0b1129b04b806608 0 3 0 0 1
## # ... with 19 more variables: get <dbl>, http <dbl>, just <dbl>,
## # like <dbl>, list <dbl>, listinfo <dbl>, lists <dbl>, mailing <dbl>,
## # mailman <dbl>, net <dbl>, new <dbl>, now <dbl>, one <dbl>, time <dbl>,
## # url <dbl>, use <dbl>, will <dbl>, wrote <dbl>, www <dbl>
wide_spam <- spread(tidy_spam, term, count, fill = 0)
head(wide_spam)
## # A tibble: 6 × 75
## document address align also arial bgcolor
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 00001.7848dde101aa985090474a91ec93fcf0 0 8 0 2 0
## 2 00002.d94f1b97e48ed3b553b3508d116e6a09 0 0 0 0 0
## 3 00003.2ee33bc6eacdb11f38d052c44819ba6c 0 0 0 0 0
## 4 00004.eac8de8d759b7e74154f142194282724 0 0 0 0 0
## 5 00005.57696a39d7d84318ce497886896bf90d 0 0 0 0 0
## 6 00006.5ab5620d3d7c6c0db76234556a16f6c1 1 0 0 0 0
## # ... with 69 more variables: body <dbl>, border <dbl>, business <dbl>,
## # can <dbl>, cellpadding <dbl>, cellspacing <dbl>, center <dbl>,
## # charset <dbl>, click <dbl>, color <dbl>, com <dbl>, company <dbl>,
## # content <dbl>, don <dbl>, email <dbl>, equiv <dbl>, face <dbl>,
## # ffffff <dbl>, font <dbl>, form <dbl>, free <dbl>, get <dbl>,
## # head <dbl>, height <dbl>, href <dbl>, html <dbl>, http <dbl>,
## # img <dbl>, information <dbl>, iso <dbl>, just <dbl>, like <dbl>,
## # link <dbl>, list <dbl>, mail <dbl>, may <dbl>, message <dbl>,
## # meta <dbl>, money <dbl>, name <dbl>, nbsp <dbl>, net <dbl>, new <dbl>,
## # now <dbl>, offer <dbl>, one <dbl>, please <dbl>, receive <dbl>,
## # remove <dbl>, removed <dbl>, reply <dbl>, right <dbl>, send <dbl>,
## # size <dbl>, src <dbl>, table <dbl>, text <dbl>, time <dbl>,
## # title <dbl>, today <dbl>, top <dbl>, transfer <dbl>, type <dbl>,
## # use <dbl>, want <dbl>, width <dbl>, will <dbl>, wish <dbl>, www <dbl>
ham_class <- wide_ham %>%
mutate(class = "ham")
spam_class <- wide_spam %>%
mutate(class = "spam")
#Merge
all_data <- bind_rows(ham_class, spam_class)
#Rearrance columns
all_data <- select(all_data, class, everything())
#Set N/A to 0
all_data[is.na(all_data)] <- 0
#Make class variable a factor variable
all_data$class <- as.factor(all_data$class)
#Remove the document names
all_data <- subset(all_data, select = -c(document))
head(all_data)
## # A tibble: 6 × 82
## class can com date don email get http just like list
## <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ham 1 4 1 0 0 1 0 0 1 5
## 2 ham 0 3 0 0 1 0 2 0 0 0
## 3 ham 0 3 0 0 1 1 2 0 0 0
## 4 ham 1 1 0 0 0 0 2 0 0 1
## 5 ham 0 3 0 0 1 0 3 1 0 0
## 6 ham 0 3 0 0 1 0 2 2 0 0
## # ... with 71 more variables: listinfo <dbl>, lists <dbl>, mailing <dbl>,
## # mailman <dbl>, net <dbl>, new <dbl>, now <dbl>, one <dbl>, time <dbl>,
## # url <dbl>, use <dbl>, will <dbl>, wrote <dbl>, www <dbl>,
## # address <dbl>, align <dbl>, also <dbl>, arial <dbl>, bgcolor <dbl>,
## # body <dbl>, border <dbl>, business <dbl>, cellpadding <dbl>,
## # cellspacing <dbl>, center <dbl>, charset <dbl>, click <dbl>,
## # color <dbl>, company <dbl>, content <dbl>, equiv <dbl>, face <dbl>,
## # ffffff <dbl>, font <dbl>, form <dbl>, free <dbl>, head <dbl>,
## # height <dbl>, href <dbl>, html <dbl>, img <dbl>, information <dbl>,
## # iso <dbl>, link <dbl>, mail <dbl>, may <dbl>, message <dbl>,
## # meta <dbl>, money <dbl>, name <dbl>, nbsp <dbl>, offer <dbl>,
## # please <dbl>, receive <dbl>, remove <dbl>, removed <dbl>, reply <dbl>,
## # right <dbl>, send <dbl>, size <dbl>, src <dbl>, table <dbl>,
## # text <dbl>, title <dbl>, today <dbl>, top <dbl>, transfer <dbl>,
## # type <dbl>, want <dbl>, width <dbl>, wish <dbl>
set.seed(1800)
index <- 1:nrow(all_data)
#Perform a 60/40 split
trainIndex <- sample(index, trunc(length(index) * 0.6))
train_data <- all_data[trainIndex,]
test_data <- all_data[-trainIndex,]
table(train_data$class)
##
## ham spam
## 1491 294
plot(train_data$class, main = "Training Data Set")
table(test_data$class)
##
## ham spam
## 987 204
plot(test_data$class, main = "Test Data Set")
The Naive Bayes Classifier is a probabilistic classifier the uses Bayes Theorm
pc <- proc.time()
#Create a Naive Bayes classifier object
naivebayes_model <- naiveBayes(train_data, factor(train_data$class))
proc.time() - pc
## user system elapsed
## 0.01 0.00 0.01
summary(naivebayes_model)
## Length Class Mode
## apriori 2 table numeric
## tables 82 -none- list
## levels 2 -none- character
## call 3 -none- call
#Evaluate the performance on the test data
naivebayes_predict <- predict(naivebayes_model, newdata=test_data)
#Check the predictions against reality
table(`Actual Class` = test_data$class, `Predicted Class` = naivebayes_predict)
## Predicted Class
## Actual Class ham spam
## ham 987 0
## spam 163 41
naivebayes_error <- sum(test_data$class != naivebayes_predict)/nrow(test_data)
print(paste0("Accuary (Precision): ", 1 - naivebayes_error))
## [1] "Accuary (Precision): 0.863140218303946"
This algorithm creates a decision tree for classification
pc <- proc.time()
tree_model <- rpart(class ~ ., data = train_data, method = "class")
proc.time() - pc
## user system elapsed
## 0.24 0.00 0.25
printcp(tree_model)
##
## Classification tree:
## rpart(formula = class ~ ., data = train_data, method = "class")
##
## Variables actually used in tree construction:
## [1] click content free html mail please
##
## Root node error: 294/1785 = 0.16471
##
## n= 1785
##
## CP nsplit rel error xerror xstd
## 1 0.568027 0 1.000000 1.000000 0.053302
## 2 0.241497 1 0.431973 0.431973 0.036943
## 3 0.064626 2 0.190476 0.190476 0.025051
## 4 0.057823 3 0.125850 0.149660 0.022282
## 5 0.027211 4 0.068027 0.081633 0.016551
## 6 0.010000 6 0.013605 0.037415 0.011246
# plot(tree_model, uniform = TRUE, main = "Classification (RPART). Classification Tree for SPAM/HAM")
# text(tree_model, all=TRUE, cex = 0.9)
prp(tree_model,faclen=0,cex=0.75,extra=1)
tree_predict <- predict(tree_model, newdata = test_data, type = "class")
table('Actual Class' = test_data$class, 'Predicted Class' = tree_predict)
## Predicted Class
## Actual Class ham spam
## ham 987 0
## spam 7 197
tree_error <- sum(test_data$class != tree_predict)/nrow(test_data)
print(paste0("Accuary (Precision): ", 1 - tree_error))
## [1] "Accuary (Precision): 0.994122586062133"
SVMs is a non-probabilistic linear classifier
pc <- proc.time()
svm_model <- svm(class ~ ., method = "class", data = train_data)
proc.time() - pc
## user system elapsed
## 0.58 0.02 0.60
summary(svm_model)
##
## Call:
## svm(formula = class ~ ., data = train_data, method = "class")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.01234568
##
## Number of Support Vectors: 235
##
## ( 85 150 )
##
##
## Number of Classes: 2
##
## Levels:
## ham spam
svm_predict <- predict(svm_model, newdata = test_data, type = "class")
table(`Actual Class` = test_data$class, `Predicted Class` = svm_predict)
## Predicted Class
## Actual Class ham spam
## ham 981 6
## spam 19 185
svm_error <- sum(test_data$class != svm_predict)/nrow(test_data)
print(paste0("Accuary (Precision): ", 1 - svm_error))
## [1] "Accuary (Precision): 0.979009235936188"
This is a non-parametric classifier
#change factor to numeric
train_data_knn <- train_data
train_data_knn$class <- str_replace_all(train_data_knn$class, "ham", "1")
train_data_knn$class <- str_replace_all(train_data_knn$class, "spam", "0")
train_data_knn$class <- as.numeric(as.character(train_data_knn$class))
test_data_knn <- test_data
test_data_knn$class <- str_replace_all(test_data_knn$class, "ham", "1")
test_data_knn$class <- str_replace_all(test_data_knn$class, "spam", "0")
test_data_knn$class <- as.numeric(as.character(test_data_knn$class))
pc <- proc.time()
knn_model <- knn(train=train_data_knn,test=test_data_knn ,cl = train_data_knn$class,k = 5)
proc.time() - pc
## user system elapsed
## 0.33 0.01 0.34
summary(knn_model)
## 0 1
## 164 1027
table(`Actual Class` = test_data_knn$class, `Predicted Class` = knn_model)
## Predicted Class
## Actual Class 0 1
## 0 162 42
## 1 2 985
knn_error <- sum(test_data_knn$class != knn_model)/nrow(test_data)
print(paste0("Accuary (Precision): ", 1 - knn_error))
## [1] "Accuary (Precision): 0.963056255247691"
A classifier modelled on the human brain and nervous system
pc <- proc.time()
ann_model<-nnet(class~.,data=train_data,size=10,decay=0.1)
## # weights: 831
## initial value 1149.450383
## iter 10 value 128.737079
## iter 20 value 53.160856
## iter 30 value 24.253617
## iter 40 value 19.910851
## iter 50 value 18.439859
## iter 60 value 17.868993
## iter 70 value 17.739144
## iter 80 value 17.637764
## iter 90 value 17.567545
## iter 100 value 17.537969
## final value 17.537969
## stopped after 100 iterations
proc.time() - pc
## user system elapsed
## 1.97 0.00 1.98
ann_predict <- predict(ann_model, newdata = test_data, type="class")
table(`Actual Class` = test_data$class, `Predicted Class` = ann_predict)
## Predicted Class
## Actual Class ham spam
## ham 986 1
## spam 7 197
knn_error <- sum(test_data$class != ann_predict)/nrow(test_data)
print(paste0("Accuary (Precision): ", 1 - knn_error))
## [1] "Accuary (Precision): 0.99328295549958"
From the accuracies calculated it can be concluded that the best performing classifiers for this task are the Decision Tree and the Neural Networks. The worst performers were Naive Bayes and K-Nearest Neighbours.
The following sources were used to complete this assignment: