Goal

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/

Introduction

This R Markdown file and all related input data can be found on my GitHub.

Load the Required Packages

library(tm)
library(stringr)
library(SnowballC)
library(ggplot2)
library(knitr)
library(tidyr)
library(tidytext)
library(wordcloud)
library(caret)
library(gbm)
library(e1071)

Load the Data

# Download and unpack the SpamAssassin "easy ham" and "spam_2" archives into
# the local "emails" directory, then list the extracted files.
# Directory paths are built with file.path() so they resolve on every OS;
# hard-coded "\\" separators only work on Windows.
download.file(
  url = "http://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2",
  destfile = "20021010_easy_ham.tar.bz2",
  mode = "wb"  # the archive is binary data
)
untar("20021010_easy_ham.tar.bz2", exdir = "emails", compressed = "bzip2")
ham.dir <- file.path("emails", "easy_ham")
ham_files <- list.files(path = ham.dir, full.names = TRUE)

download.file(
  url = "http://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2",
  destfile = "20050311_spam_2.tar.bz2",
  mode = "wb"  # the archive is binary data
)
untar("20050311_spam_2.tar.bz2", exdir = "emails", compressed = "bzip2")
spam.dir <- file.path("emails", "spam_2")
spam_files <- list.files(path = spam.dir, full.names = TRUE)

SPAM

# Count the entries in the spam directory and report the total.
# all.files / full.names are logical flags, so pass logicals rather than the
# strings "FALSE" / "TRUE" (which only work through implicit coercion).
number_spam <- length(list.files(spam.dir, all.files = FALSE, full.names = TRUE))
print(paste("There is a total of", number_spam, "spam emails"))
## [1] "There is a total of 1397 spam emails"

HAM

# Count the entries in the ham directory and report the total.
# all.files / full.names are logical flags, so pass logicals rather than the
# strings "FALSE" / "TRUE" (which only work through implicit coercion).
number_ham <- length(list.files(ham.dir, all.files = FALSE, full.names = TRUE))
print(paste("There is a total of", number_ham, "ham emails"))
## [1] "There is a total of 2551 ham emails"

Cleaning the data

I applied the following steps to clean the spam_2 and easy_ham files:

1-Create a vector of file paths

2- Read the text in each file

3- Turn into VectorSource

4- Remove numbers

5- Remove punctuation symbols

6- Remove stopwords

7- Remove white spaces

# Remove any "cmds" file from the file listings.
# The vectors were built with full.names = TRUE, so each entry is a full path;
# compare on the file name only, otherwise the filter never matches anything.
spam_files <- spam_files[basename(spam_files) != "cmds"]
ham_files <- ham_files[basename(ham_files) != "cmds"]
#' Read every e-mail file in a directory into a tm volatile corpus.
#'
#' @param file_path Path of a directory holding one file per e-mail.
#' @return A `VCorpus` with one document per file; each document's content is
#'   the character vector of lines returned by `readLines()`.
toVCorpus <- function(file_path) {
  email_paths <- list.files(file_path, full.names = TRUE)
  # A "cmds" file in the folder is not an e-mail, so it must not become a
  # document in the corpus.
  email_paths <- email_paths[basename(email_paths) != "cmds"]
  email_lines <- lapply(email_paths, readLines)
  VCorpus(VectorSource(email_lines))
}
#' Normalise the text of every document in a corpus.
#'
#' Steps, in order: remove numbers, remove punctuation, lower-case, remove
#' English stop words, collapse whitespace, stem.
#'
#' @param corpus A tm corpus.
#' @return The transformed corpus.
docClean <- function(corpus) {
  corpus %>%
    tm_map(removeNumbers) %>%
    tm_map(removePunctuation) %>%
    # tolower() is a plain character function; content_transformer() makes it
    # act on the document content, so no PlainTextDocument re-wrap (which
    # discards the documents' metadata) is needed afterwards.
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, stopwords("en")) %>%
    tm_map(stripWhitespace) %>%
    tm_map(stemDocument)
}
#' Set one metadata tag to the same value on every document of a corpus.
#'
#' @param corpus A corpus (or list of documents) to tag.
#' @param tag Name of the metadata field to set.
#' @param value Value stored under `tag` for each document.
#' @return The corpus with the tag applied to every document.
addTag <- function(corpus, tag, value) {
  # seq_along() yields an empty sequence for an empty corpus, whereas
  # 1:length(corpus) would iterate over c(1, 0) and fail.
  for (i in seq_along(corpus)) {
    meta(corpus[[i]], tag) <- value
  }
  corpus
}
# Ham corpus: read the files, clean the text, then tag each document as "ham"
hamCorp <- addTag(
  docClean(toVCorpus(ham.dir)),
  tag = "emails",
  value = "ham"
)
# Spam corpus: same steps, tagged as "spam"
spamCorp <- addTag(
  docClean(toVCorpus(spam.dir)),
  tag = "emails",
  value = "spam"
)

Data and Visualization

Now I combine the two corpora into a single corpus and begin to build a classifier that will accurately tell the difference between ham and spam.

First, we combine the corpora into a single one:

# Build a two-column data frame (text, type) with ONE ROW PER E-MAIL.
# unlist() on a corpus flattens every line and every metadata field into its
# own element, so rows would not correspond to e-mails; instead each
# document's lines are collapsed into a single string.
corpusToDF <- function(corpus, type) {
  text <- vapply(
    corpus,
    function(doc) paste(as.character(doc), collapse = " "),
    character(1)
  )
  data.frame(
    text = unname(text),
    type = rep(type, length(text)),
    stringsAsFactors = FALSE
  )
}

# Ham
hamDF <- corpusToDF(hamCorp, "ham")

# Spam
spamDF <- corpusToDF(spamCorp, "spam")

# Keep (at most) the first 1000 e-mails of each class
spam_ham_df <- rbind(head(hamDF, 1000L), head(spamDF, 1000L))

# Combine both corpora
clean_corpus <- c(spamCorp, hamCorp)

Word cloud of the most frequent terms in the combined corpus

# Word cloud of the combined corpus: at most 70 terms, each occurring at least
# 1000 times, laid out by frequency rather than in random order.
wordcloud(
  clean_corpus,
  min.freq = 1000,
  max.words = 70,
  random.order = FALSE
)

Splitting data using caret

Here I split the entire data set into a training set (70%) and a test set (30%).

# Class labels and document-term matrix for the full combined corpus
corpus_labels <- unlist(meta(clean_corpus, "emails"))
corpus_dtm <- DocumentTermMatrix(clean_corpus)

# Empty texts are replaced by the placeholder string "NaN"
spam_ham_df$text[spam_ham_df$text == ""] <- "NaN"

# Seeded 70/30 partition on the class label
set.seed(123)
train_index <- createDataPartition(spam_ham_df[["type"]], p = 0.70, list = FALSE)
email_train <- spam_ham_df[train_index, ]
email_test <- spam_ham_df[-train_index, ]

# Apply the same cleaning steps to a corpus: remove numbers, punctuation and
# stop words, then collapse whitespace.
cleanEmailCorpus <- function(corpus) {
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords())
  tm_map(corpus, stripWhitespace)
}

# Create corpus for training and test data
train_email_corpus <- Corpus(VectorSource(email_train$text))
test_email_corpus <- Corpus(VectorSource(email_test$text))

train_clean_corpus <- cleanEmailCorpus(train_email_corpus)
test_clean_corpus <- cleanEmailCorpus(test_email_corpus)

train_email_dtm <- DocumentTermMatrix(train_clean_corpus)
# Tabulate the test documents against the TRAINING vocabulary so both matrices
# have the same columns; a term the model knows but that is absent from a test
# e-mail is then counted as 0 instead of being missing from the matrix.
test_email_dtm <- DocumentTermMatrix(
  test_clean_corpus,
  control = list(dictionary = Terms(train_email_dtm))
)

# Turn term counts into a presence/absence factor:
# counts greater than 0 become level "1", all other counts become level "0".
convert_count <- function(x) {
  present <- x > 0
  flags <- ifelse(present, 1, 0)
  factor(flags, levels = c(0, 1), labels = c(0, 1))
}

# Apply convert_count() to every column (term) of each document-term matrix
train_sms <- apply(X = train_email_dtm, MARGIN = 2, FUN = convert_count)
test_sms <- apply(X = test_email_dtm, MARGIN = 2, FUN = convert_count)

NaiveBayes Model

# Fit naive Bayes on the training indicators with the e-mail type as the class
train_labels <- factor(email_train$type)
classifier <- naiveBayes(x = train_sms, y = train_labels)
# Predict the class of every test e-mail
test_pred <- predict(classifier, newdata = test_sms)

Model Summary

- True positives (TP): we predicted spam, and the e-mails are actually spam.

- True negatives (TN): we predicted ham, and the e-mails are actually ham.

- False positives (FP): we predicted spam, but the e-mails are actually ham. (Also known as a “Type I error.”)

- False negatives (FN): we predicted ham, but the e-mails are actually spam. (Also known as a “Type II error.”)

# Cross-tabulate predicted class (rows) against the actual class (columns)
table(test_pred, email_test[["type"]])
##          
## test_pred ham spam
##      ham  200   23
##      spam 100  277

Accuracy rate = (True Positives + True Negatives) / Total

Accuracy rate = (277 + 200) / 600 = 477 / 600 ≈ 0.80