Spam or Ham

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/

# Record the working directory, then download and unpack both corpora

# Remember where the script was started; the reading steps further down
# build absolute paths to the extracted folders from this value.
wdir <- getwd()

# Download `archive_file` from `archive_url` and unpack it into the current
# directory, but only when `folder` does not exist yet. In either case return
# the full paths of the files found in `folder`.
fetch_corpus <- function(folder, archive_url, archive_file) {
  if (!dir.exists(folder)) {
    download.file(url = archive_url, destfile = archive_file)
    untar(archive_file, compressed = "bzip2")
  }
  list.files(path = folder, full.names = TRUE)
}

# Ham (non-spam) messages
ham.files <- fetch_corpus(
  folder = "easy_ham",
  archive_url = "http://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2",
  archive_file = "20021010_easy_ham.tar.bz2"
)

# Spam messages
spam.files <- fetch_corpus(
  folder = "spam_2",
  archive_url = "http://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2",
  archive_file = "20050311_spam_2.tar.bz2"
)
# Read every file of the extracted "easy_ham" folder into `ham.df`, a data
# frame with one row per message and three character columns:
#   body     - the whole file, lines joined with "\n"
#   filename - the file's name inside the folder
#   type     - the constant label "ham"
dir <- file.path(wdir, "easy_ham")
ham.File.Names <- list.files(dir)
# file.path() returns character(0) for an empty folder, so no bogus
# "<dir>/" path is produced when there are no files to read.
ham.File.Path <- file.path(dir, ham.File.Names)

# One list element per message. lapply() allocates the result once instead of
# copying the accumulated list on every iteration, as repeated c() would.
ham.body.df <- lapply(ham.File.Path, function(path) {
  paste(readLines(path), collapse = "\n")
})

# Build all columns in one call. as.character() keeps `body` a zero-length
# character vector when the folder is empty, and stringsAsFactors = FALSE
# keeps the text columns as character on every R version.
ham.df <- data.frame(
  body = as.character(unlist(ham.body.df)),
  filename = ham.File.Names,
  type = rep("ham", length(ham.File.Names)),
  stringsAsFactors = FALSE
)

Read the Ham and Spam E-mails into Data Frames

# Read every file of the extracted "spam_2" folder into `spam.df`, a data
# frame with one row per message and three character columns:
#   body     - the whole file, lines joined with "\n"
#   filename - the file's name inside the folder
#   type     - the constant label "spam"
dir <- file.path(wdir, "spam_2")
spam.File.Names <- list.files(dir)
# file.path() returns character(0) for an empty folder, so no bogus
# "<dir>/" path is produced when there are no files to read.
spam.File.Path <- file.path(dir, spam.File.Names)

# One list element per message. lapply() allocates the result once instead of
# copying the accumulated list on every iteration, as repeated c() would.
spam.body.df <- lapply(spam.File.Path, function(path) {
  paste(readLines(path), collapse = "\n")
})

# Build all columns in one call. as.character() keeps `body` a zero-length
# character vector when the folder is empty, and stringsAsFactors = FALSE
# keeps the text columns as character on every R version.
spam.df <- data.frame(
  body = as.character(unlist(spam.body.df)),
  filename = spam.File.Names,
  type = rep("spam", length(spam.File.Names)),
  stringsAsFactors = FALSE
)

# Merge the ham and spam data frames (spam rows first, then ham rows)
spam.ham.df <- rbind(spam.df, ham.df)

# Preprocessing options handed to the term-matrix constructors below.
# NOTE(review): recent tm releases document `bounds = list(global = c(2, Inf))`
# for a minimum document frequency - confirm that `minDocFreq` is still
# honoured by the installed tm version.
control <- list(
  stopwords = TRUE,
  removePunctuation = TRUE,
  removeNumbers = TRUE,
  minDocFreq = 2
)

Create Spam and Ham Corpus

Including word lists in the form of term-document and document-term matrices.

# Build one corpus per class with ONE DOCUMENT PER MESSAGE.
# VectorSource() treats each element of its argument as a document; handing it
# the whole data frame would yield one "document" per column (body, filename,
# type), so the message text column is passed explicitly.
# NOTE(review): presumably some raw e-mails contain bytes that are not valid
# UTF-8, which can make lower-casing fail; iconv() with sub = "" drops such
# bytes - confirm this clean-up suits the corpus.
spam_text <- iconv(as.character(spam.df$body), from = "UTF-8", to = "UTF-8", sub = "")
spam_corpus <- Corpus(VectorSource(spam_text))
spam_tdm <- TermDocumentMatrix(spam_corpus, control = control)
spam_dtm <- DocumentTermMatrix(spam_corpus, control = control)
# Remove sparse terms (sparsity threshold 0.8)
spam_tdm2 <- removeSparseTerms(spam_tdm, 0.8)

ham_text <- iconv(as.character(ham.df$body), from = "UTF-8", to = "UTF-8", sub = "")
ham_corpus <- Corpus(VectorSource(ham_text))
ham_tdm <- TermDocumentMatrix(ham_corpus, control = control)
ham_dtm <- DocumentTermMatrix(ham_corpus, control = control)
# Remove sparse terms (sparsity threshold 0.8)
ham_tdm2 <- removeSparseTerms(ham_tdm, 0.8)

Visualize the Spam and Ham Corpuses

# Word cloud of the ham corpus; words with a frequency below 600 are not drawn.
wordcloud(words = ham_corpus, min.freq = 600)

# Same plot, same frequency cut-off, for the spam corpus.
wordcloud(words = spam_corpus, min.freq = 600)

Split into Training and Test Sets

# Fix the RNG seed so the random split below is reproducible.
set.seed(123)

# 70% of all messages (rounded down) go into the training set.
train.size <- floor(nrow(spam.ham.df) * 0.70)
train.size
## [1] 2763

# Draw the training row numbers without replacement.
train.Index <- sample.int(nrow(spam.ham.df), size = train.size)

# Selected rows form the training set; all remaining rows form the test set.
train.Spam.Ham <- spam.ham.df[train.Index, ]
test.Spam.Ham <- spam.ham.df[-train.Index, ]

# Training rows separated by label
spam <- subset(train.Spam.Ham, type == "spam")
ham <- subset(train.Spam.Ham, type == "ham")
# Start a timer so the training time can be reported below.
pc <- proc.time()

# Create a Naive Bayes classifier object.
# The label column `type` is removed from the predictors: passing the whole
# data frame as `x` hands the model the answer it is supposed to predict.
predictor.cols <- setdiff(names(train.Spam.Ham), "type")
naivebayes_model <- naive_bayes(
  x = train.Spam.Ham[, predictor.cols, drop = FALSE],
  y = factor(train.Spam.Ham$type)
)
# NOTE(review): the remaining predictors (`body`, `filename`) are raw strings;
# presumably term features derived from the document-term matrices should be
# used instead - confirm before relying on this model.

proc.time() - pc
##    user  system elapsed 
##   0.020   0.001   0.021
summary(naivebayes_model)
##           Length Class  Mode     
## data      2      -none- list     
## levels    2      -none- character
## laplace   1      -none- numeric  
## tables    2      -none- list     
## prior     2      table  numeric  
## usekernel 1      -none- logical  
## call      3      -none- call

Evaluation of Naive Bayes

#Evaluate the performance on the test data
#naivebayes_predict <- predict(naivebayes_model, newdata=test.Spam.Ham)

#Check the predictions against reality (the label column is named `type`)
#table(`Actual Class` = test.Spam.Ham$type, `Predicted Class` = naivebayes_predict)

Unfortunately, my code did not work when I tried to evaluate the Naive Bayes model, so the evaluation steps are left commented out.

# Disabled: share of misclassified test messages and the resulting accuracy.
# NOTE(review): `test_data` and a `class` column are not defined in the code
# shown in this file, and `naivebayes_predict` is only assigned in
# commented-out code - confirm the intended names before enabling.
#naivebayes_error <- sum(test_data$class != naivebayes_predict)/nrow(test_data)
#print(paste0("Accuary (Precision): ", 1 - naivebayes_error))

# Number of messages per label in the training set
table(train.Spam.Ham[["type"]])
## 
##  ham spam 
## 1780  983

# Number of messages per label in the test set
table(test.Spam.Ham[["type"]])
## 
##  ham spam 
##  771  414