DATA 607 Project 4 - Document Classification

INSTRUCTIONS

It can be useful to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether a new document is spam.

For this project, I will use spam/ham datasets found here: https://spamassassin.apache.org/old/publiccorpus/.

I will use the following datasets, 20030228_easy_ham.tar.bz2 and 20050311_spam_2.tar.bz2, to predict the class of new documents.

LOAD PACKAGES

library(tm)
library(caret)
library(dplyr)
library(httr)
library(tidytext)
library(tidyverse)
library(caTools)
library(e1071)
library(wordcloud)
library(wordcloud2)

DATA

DOWNLOAD THE DATA AND PREPARE THE FILES

First, we need to download and extract the datasets to a local machine.

# URLs for the ham and spam datasets
ham_url <- "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2"
spam_url <- "https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"

# Download and Extract the ham dataset
ham_filename <- basename(ham_url)
if (!file.exists(ham_filename)) {
  download.file(ham_url, ham_filename) 
  untar(ham_filename, exdir="spamham_dir", compressed = "bzip2")
}

# Download and Extract the spam dataset
spam_filename <- basename(spam_url)
if (!file.exists(spam_filename)) {
  download.file(spam_url, spam_filename) 
  untar(spam_filename, exdir="spamham_dir", compressed = "bzip2")
}
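
# Optional sanity check (assuming the working directory used above): confirm that
# both archives were extracted into spamham_dir before reading the messages
list.dirs("spamham_dir", recursive = FALSE)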

#Directories holding the extracted ham and spam messages
ham_dir <- "./spamham_dir/easy_ham/"
spam_dir <- "./spamham_dir/spam_2/"

hs_df <- function(path, tag){
  # List every message file under the directory and read each one into a single string
  hs_files <- list.files(path = path, full.names = TRUE, recursive = TRUE)
  hs_email <- unlist(lapply(hs_files, read_file))
  # Return a data frame with one row per message and its class label
  hs_data <- data.frame(hs_email = hs_email, tag = tag, stringsAsFactors = FALSE)
  return(hs_data)
}

ham_files <- hs_df(ham_dir, tag="ham") 
spam_files <- hs_df(spam_dir, tag="spam") 

#Join the two datasets together and scramble the rows
spamham_df <- rbind(ham_files, spam_files)
table(spamham_df$tag)
## 
##  ham spam 
## 2501 1397
#Scramble the rows of the dataset
spamham_df <- spamham_df[sample(nrow(spamham_df)), ]

PREPARE THE DATA

Next, we need to preprocess the data. This involves removing unnecessary content from the email column as well as cleaning and transforming the data into a format that can be used by the machine learning algorithm. The tm package is used to preprocess the text data, and the corpus is then transformed into a document-term matrix.
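
For intuition, here is a tiny toy example (two made-up messages, not part of the project data) of what a document-term matrix looks like: one row per document, one column per term, and a count of how often each term appears in each document.

#Toy corpus for illustration only; the two messages are hypothetical
toy_corpus <- VCorpus(VectorSource(c("free money free prize", "meeting agenda for monday")))
inspect(DocumentTermMatrix(toy_corpus))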

#kable(head(spamham_df))

#Remove unnecessary data from the email column
spamham_df <- spamham_df %>%
  mutate(hs_email = str_remove_all(hs_email, pattern = "<.*?>")) %>%
  mutate(hs_email = str_remove_all(hs_email, pattern = "[:digit:]")) %>%
  mutate(hs_email = str_remove_all(hs_email, pattern = "[:punct:]")) %>%
  mutate(hs_email = str_remove_all(hs_email, pattern = "[\\r\\n\\t]+")) %>%
  mutate(hs_email = str_to_lower(hs_email)) %>%
  unnest_tokens(output = text, input = hs_email, token = "paragraphs", format = "text") %>%
  anti_join(stop_words, by = c("text" = "word"))

#Clean the corpus: remove numbers, punctuation, extra whitespace, and stop words, then stem and lowercase
clean_corpus <- VCorpus(VectorSource(spamham_df$text))
clean_corpus <- tm_map(clean_corpus, removeNumbers)
clean_corpus <- tm_map(clean_corpus, removePunctuation)
clean_corpus <- tm_map(clean_corpus, stripWhitespace)
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords("english")) 
clean_corpus <- tm_map(clean_corpus, stemDocument)
clean_corpus <- tm_map(clean_corpus, content_transformer(stringi::stri_trans_tolower))

# The rows were already scrambled above, so keep the corpus in the same order
# as spamham_df so each document stays aligned with its tag
spamham_corpus <- clean_corpus


#Transform the corpus into a document-term matrix and drop sparse terms
#(the threshold 1 - 10/N keeps only terms appearing in roughly 10 or more documents)
spamham_tm <- removeSparseTerms(DocumentTermMatrix(spamham_corpus, control = list(stemming = TRUE)),
                                1 - (10 / length(spamham_corpus)))

# Helper to convert term counts to presence/absence factors (0/1);
# defined here but not applied in the model fit below, which uses the raw counts
cnt <- function(x) {
  y <- ifelse(x > 0, 1, 0)
  factor(y, levels = c(0, 1), labels = c(0, 1))
}

dim(spamham_tm)
## [1] 3898 5069
# Convert the document-term matrix to a data frame and attach the class labels
hamspam_dtm <- spamham_tm %>%
  as.matrix() %>%
  as.data.frame() %>%
  sapply(., as.numeric) %>%
  as.data.frame() %>%
  mutate(class = spamham_df$tag ) %>%
  select(class, everything())

hamspam_dtm$class <- as.factor(hamspam_dtm$class)
str(hamspam_dtm$class)
##  Factor w/ 2 levels "ham","spam": 1 1 1 1 1 1 1 1 1 1 ...

EVALUATE THE DATA

After preprocessing the data, we need to split the dataset into training and testing datasets.

#Create the training (80%) and testing (20%) datasets
training_testing <- floor(0.8 * nrow(hamspam_dtm))

set.seed(081282)
tt_i <- sample(seq_len(nrow(hamspam_dtm)), size = training_testing)

training_dtm <- hamspam_dtm[tt_i, ]
testing_dtm <-  hamspam_dtm[-tt_i, ]

#Class labels for the training and testing datasets
training_cnt <- training_dtm$class
testing_cnt <- testing_dtm$class

#Class proportions in the training dataset
prop.table(table(training_cnt))
## training_cnt
##       ham      spam 
## 0.6449647 0.3550353
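
The proportions above come from a simple random split of the rows. Since caret is already loaded, createDataPartition() offers a stratified alternative that preserves the ham/spam ratio in both subsets; a minimal sketch (the variable names are illustrative, and this is not the split used in the run above):

#Stratified 80/20 split on the class label
set.seed(081282)
strat_idx <- createDataPartition(hamspam_dtm$class, p = 0.8, list = FALSE)
strat_train <- hamspam_dtm[strat_idx, ]
strat_test <- hamspam_dtm[-strat_idx, ]
prop.table(table(strat_train$class))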

This section trains the Naive Bayes classifier on the training set, uses it to predict the class of the test set, and evaluates the accuracy of the classifier. The accuracy variable will contain the accuracy of the classifier as a decimal between 0 and 1.

#Train the Naive Bayes classifier on the term features only (the class column is excluded from the predictors)
model <- naiveBayes(training_dtm[, -1], training_cnt)
head(model$tables,3)
## $class
##             class
## training_cnt ham spam
##         ham    1    0
##         spam   0    1
## 
## $aaa
##             aaa
## training_cnt        [,1]       [,2]
##         ham  0.007458976 0.10194164
##         spam 0.009033424 0.09465683
## 
## $aaffor
##             aaffor
## training_cnt        [,1]       [,2]
##         ham  0.002486325 0.04981342
##         spam 0.006323397 0.07930380
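
The per-term tables above (such as $aaa and $aaffor) show that naiveBayes() models the numeric term counts as Gaussian features, with a mean and standard deviation per class. An alternative worth sketching, under the assumption that a term's presence or absence is more informative than its raw count, is to recode the predictors with the cnt() helper defined earlier and add Laplace smoothing; the names below are illustrative, and this variant is not the model evaluated in the rest of this section.

#Recode each term column as a 0/1 factor and fit a Bernoulli-style Naive Bayes (sketch only)
bin_train <- as.data.frame(lapply(training_dtm[, -1], cnt))
bin_model <- naiveBayes(bin_train, training_cnt, laplace = 1)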
# Predict the class of the test dataset
pred <- predict(model, testing_dtm)

table(pred, actual=testing_dtm$class)
##       actual
## pred   ham spam
##   ham  201  116
##   spam 289  174
# Evaluate the accuracy of the classifier
accuracy <- sum(pred == testing_dtm$class) / length(testing_dtm$class)
confusionMatrix(pred, testing_cnt, positive = "spam", 
                dnn = c("Prediction","Actual"))
## Confusion Matrix and Statistics
## 
##           Actual
## Prediction ham spam
##       ham  201  116
##       spam 289  174
##                                           
##                Accuracy : 0.4808          
##                  95% CI : (0.4452, 0.5165)
##     No Information Rate : 0.6282          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0091          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6000          
##             Specificity : 0.4102          
##          Pos Pred Value : 0.3758          
##          Neg Pred Value : 0.6341          
##              Prevalence : 0.3718          
##          Detection Rate : 0.2231          
##    Detection Prevalence : 0.5936          
##       Balanced Accuracy : 0.5051          
##                                           
##        'Positive' Class : spam            
## 
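
Rather than recomputing accuracy by hand, the object returned by confusionMatrix() already stores these statistics, so individual metrics can be extracted programmatically; a short sketch:

#Save the confusion matrix object and pull selected statistics from it
cm <- confusionMatrix(pred, testing_cnt, positive = "spam", dnn = c("Prediction", "Actual"))
cm$overall["Accuracy"]
cm$byClass[c("Sensitivity", "Specificity")]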

VISUALIZE THE DATA

#Corpus Word Cloud
suppressWarnings(wordcloud(spamham_corpus,max.words = 50, random.order = FALSE, min.freq=1000, colors=brewer.pal(8, "Dark2")))

#Spam Word Cloud
spam_cloud <- which(hamspam_dtm$class == "spam")
suppressWarnings(wordcloud(clean_corpus[spam_cloud],max.words = 50, random.order = FALSE, min.freq=1000, colors=brewer.pal(8, "Dark2")))

#Ham Word Cloud
ham_cloud <- which(hamspam_dtm$class == "ham")
suppressWarnings(wordcloud(clean_corpus[ham_cloud], max.words = 50, random.order = FALSE, min.freq = 1000, colors = brewer.pal(8, "Dark2")))
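
As a numeric complement to the word clouds, the most frequent terms in each class can also be read directly off the document-term matrix; a minimal sketch for the spam class (the ham class works the same way with class == "ham"):

#Sum term counts over all spam documents and list the ten most frequent terms
spam_term_totals <- colSums(hamspam_dtm[hamspam_dtm$class == "spam", -1])
head(sort(spam_term_totals, decreasing = TRUE), 10)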

CONCLUSION

The model prediction was performed using the Naive Bayes classifier, with 80% of the data used for training and the remaining 20% held out for testing. On the run shown above, the classifier reached an accuracy of about 48% with a Kappa of about 0.01 on the test set, below the no-information rate of 63% and essentially chance-level performance. Results this close to chance typically point to the documents and their class labels falling out of alignment during preprocessing, or to the Gaussian treatment that naiveBayes applies to raw term counts, rather than to the Naive Bayes method itself, so the preprocessing and feature representation are the first places to revisit before drawing conclusions about the classifier.