Data 607 - Project 4: Document Classification

Introduction

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/

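Library preparation

The code in this document relies on the following packages, which must be installed and loaded before any of it is run: tm and SnowballC (corpus handling and stemming), magrittr or dplyr (the %>% pipe), wordcloud and RColorBrewer (word clouds), knitr (kable), caret (createDataPartition) and e1071 (naiveBayes).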

Download files

# Fetch the SpamAssassin public-corpus archives into the working directory.
# HTTPS is used so the archives cannot be altered in transit, and mode = "wb"
# requests a binary transfer so the compressed files stay byte-exact on Windows.
invisible(lapply(
  c("20021010_easy_ham.tar.bz2", "20050311_spam_2.tar.bz2"),
  function(archive) {
    download.file(
      url = paste0(
        "https://spamassassin.apache.org/old/publiccorpus/", archive
      ),
      destfile = archive,
      mode = "wb"
    )
  }
))

Unzip files

# Unpack both bzip2-compressed archives into the "project_4" directory.
invisible(lapply(
  c("20021010_easy_ham.tar.bz2", "20050311_spam_2.tar.bz2"),
  untar,
  exdir = "project_4",
  compressed = "bzip2"
))

Store files

# Directories holding the extracted messages, relative to the working
# directory. Plain directory names (no trailing "\\") keep the paths valid on
# every platform rather than on Windows only.
# NOTE(review): the archives are unpacked into "project_4", so these relative
# paths only resolve when the working directory is that folder -- confirm.
ham.dir <- "easy_ham"
spam.dir <- "spam_2"

# tm directory sources for the two classes, built from the same relative
# directories instead of an absolute path that exists on one machine only.
Ham_path <- DirSource(ham.dir)
Spam_path <- DirSource(spam.dir)

# Full paths of every message file in each class.
ham_files <- list.files(path = ham.dir, full.names = TRUE)
spam_files <- list.files(path = spam.dir, full.names = TRUE)

# Number of entries listed in the spam directory.
length(spam_files)

## [1] 1397

# Number of entries listed in the ham directory.
length(ham_files)

## [1] 2551

Create Corpus

#' Build a tm corpus from every file in a directory
#'
#' Each file becomes one document whose content is the character vector of
#' that file's lines.
#'
#' @param file_path Path of the directory that holds the message files.
#' @return A `VCorpus` with one document per file, in `list.files()` order.
m_corpus <- function(file_path) {
  # full.names = TRUE lets list.files() assemble the paths itself, so the
  # result is valid whether or not file_path ends in a path separator.
  message_files <- list.files(file_path, full.names = TRUE)

  # Pasting the directory onto an empty listing would yield the bogus path
  # "<dir>/" and an obscure readLines() failure; fail with a clear message.
  if (length(message_files) == 0L) {
    stop("No files found in directory: ", file_path, call. = FALSE)
  }

  VCorpus(VectorSource(lapply(message_files, readLines)))
}

Clean “Message” data

#' Normalise the text of every document in a corpus
#'
#' Applies, in this order: digit removal, punctuation removal, lower-casing,
#' re-wrapping as plain-text documents, English stop-word removal, whitespace
#' collapsing and stemming.
#'
#' @param corpus A tm corpus.
#' @return The transformed corpus.
Clean_data <- function(corpus) {
  no_digits <- tm_map(corpus, removeNumbers)
  no_punct <- tm_map(no_digits, removePunctuation)
  lowered <- tm_map(no_punct, tolower)
  # NOTE(review): presumably this restores the document class after the bare
  # tolower step above -- confirm against the tm version in use.
  rewrapped <- tm_map(lowered, PlainTextDocument)
  no_stopwords <- tm_map(rewrapped, removeWords, stopwords("en"))
  squeezed <- tm_map(no_stopwords, stripWhitespace)
  tm_map(squeezed, stemDocument)
}
#' Attach the same metadata tag to every document in a corpus
#'
#' @param corpus A tm corpus.
#' @param tag Name of the metadata field to set.
#' @param value Value stored under `tag` on each document.
#' @return `corpus` with `tag` set to `value` on every document.
addTag <- function(corpus, tag, value) {
  # seq_along() yields no iterations for an empty corpus, whereas
  # 1:length(corpus) would count 1, 0 and index past the end.
  for (i in seq_along(corpus)) {
    meta(corpus[[i]], tag) <- value
  }
  corpus
}

Create HAM corpora and DF

# Read, clean and label the ham messages.
hamCorp <- addTag(
  Clean_data(m_corpus(ham.dir)),
  tag = "emails",
  value = "ham"
)

# Flatten the corpus into a data frame of text entries plus a class label.
hamDF <- as.data.frame(unlist(hamCorp), stringsAsFactors = FALSE)
hamDF[["Type"]] <- "HAM"
names(hamDF) <- c("Message", "Type")

Create HAM wordcloud

# Word cloud of the cleaned ham corpus, limited to 80 words.
wordcloud(
  hamCorp,
  scale = c(3, 0.5),
  max.words = 80,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Dark2")
)
title("Ham Wordcloud", col.main = "grey14")

Create SPAM corpora and DF

# Read, clean and label the spam messages.
spamCorp <- addTag(
  Clean_data(m_corpus(spam.dir)),
  tag = "emails",
  value = "spam"
)

# Flatten the corpus into a data frame of text entries plus a class label.
spamDF <- as.data.frame(unlist(spamCorp), stringsAsFactors = FALSE)
spamDF[["Type"]] <- "SPAM"
names(spamDF) <- c("Message", "Type")

Create SPAM wordcloud

# Word cloud of the cleaned spam corpus, limited to 80 words.
wordcloud(
  spamCorp,
  scale = c(3, 0.5),
  max.words = 80,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = brewer.pal(8, "Dark2")
)
title("Spam Wordcloud", col.main = "grey14")

Combine Ham/Spam dataframes and Corpora

# Stack the leading 1000 rows of each class into one labelled data frame.
# head() never pads with all-NA rows, unlike indexing with 1:1000, should
# either frame hold fewer than 1000 rows.
joinedDF <- rbind(head(hamDF, 1000), head(spamDF, 1000))
# Combined corpus of both classes (spam documents first).
clean_corpus <- c(spamCorp, hamCorp)

# Preview the first rows of the combined data frame.
kable(head(joinedDF))

Message	Type
exmhworkersadminredhatcom thu aug	HAM
returnpath exmhworkersadminexamplecom	HAM
deliveredto zzzzlocalhostnetnoteinccom	HAM
receiv localhost localhost	HAM
phoboslabsnetnoteinccom postfix esmtp id dec	HAM
zzzzlocalhost thu aug edt	HAM

# Value of the "emails" metadata tag (set by addTag()) for the combined corpus,
# flattened into a single vector.
# NOTE(review): neither corpus_labels nor corpus_dtm is referenced again in the
# visible code -- confirm whether they are still needed.
corpus_labels <- unlist(meta(clean_corpus, "emails"))
# Document-term matrix of the combined corpus.
corpus_dtm <-DocumentTermMatrix(clean_corpus)

Randomize data and split it into training (70%) and test (30%) sets

# Seed the RNG so the train/test partition is reproducible.
set.seed(1010)

# Replace empty message strings with the placeholder text "NaN".
joinedDF$Message[joinedDF$Message %in% ""] <- "NaN"

# 70/30 split on the class label: the sampled row numbers form the training
# set and every remaining row goes to the test set.
train_index <- createDataPartition(joinedDF[["Type"]], p = 0.7, list = FALSE)
email_train <- joinedDF[train_index, ]
email_test <- joinedDF[-train_index, ]

Create corpus for Train data

train_email_corpus <- Corpus(VectorSource(email_train$Message))

# Clean the training text: drop digits, punctuation and English stop words,
# then collapse repeated whitespace. Warnings raised by tm_map are silenced.
train_clean_corpus <- suppressWarnings(
  train_email_corpus %>%
    tm_map(removeNumbers) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeWords, words = stopwords("en")) %>%
    tm_map(stripWhitespace)
)

Create corpus for Test data

test_email_corpus <- Corpus(VectorSource(email_test$Message))

# Clean the test text with the same steps as the training text: drop digits,
# punctuation and English stop words, then collapse repeated whitespace.
# Warnings raised by tm_map are silenced.
test_clean_corpus <- suppressWarnings(
  test_email_corpus %>%
    tm_map(removeNumbers) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeWords, words = stopwords("en")) %>%
    tm_map(stripWhitespace)
)

Create Document-Term Matrix

# Document-term matrix (term-frequency weighting) of the training corpus.
train_dtm <- DocumentTermMatrix(train_clean_corpus)
train_dtm

## <<DocumentTermMatrix (documents: 1400, terms: 1254)>>
## Non-/sparse entries: 3624/1751976
## Sparsity           : 100%
## Maximal term length: 75
## Weighting          : term frequency (tf)

# Document-term matrix of the test corpus, built independently of train_dtm.
# NOTE(review): the two matrices therefore have different term sets (1254 vs
# 719 terms in the printed output). Consider building the test matrix with
# the training vocabulary, e.g. control = list(dictionary = Terms(train_dtm)),
# so both share the same columns -- confirm the effect on the results below.
test_dtm <- DocumentTermMatrix(test_clean_corpus)
test_dtm

## <<DocumentTermMatrix (documents: 600, terms: 719)>>
## Non-/sparse entries: 1450/429950
## Sparsity           : 100%
## Maximal term length: 75
## Weighting          : term frequency (tf)

Convert term counts to binary presence/absence indicators (0 = term absent, 1 = term present)

#' Recode term counts as a presence/absence factor
#'
#' @param x Numeric vector of term counts.
#' @return A factor with levels "0" and "1"; "1" marks counts above zero.
convert_count <- function(x) {
  # Multiplying the logical test by 1 gives numeric 0/1 and keeps NA as NA.
  presence <- (x > 0) * 1
  factor(presence, levels = c(0, 1), labels = c(0, 1))
}

# Apply convert_count() to every column (term) of each document-term matrix,
# turning raw counts into 0/1 presence indicators.
train_x <- apply(train_dtm, 2, convert_count)
test_x <- apply(test_dtm, 2, convert_count)

Prediction Using Naive Bayes Classifier

# Fit a Naive Bayes classifier on the binary term indicators, using the
# training labels (HAM / SPAM) as the response.
NBclassifier <- naiveBayes(train_x, factor(email_train$Type))
# Show the fitted per-class tables of the first three terms.
head(NBclassifier$tables,3)

## $aug
##                         aug
## factor(email_train$Type)          0          1
##                     HAM  0.88714286 0.11285714
##                     SPAM 0.98428571 0.01571429
## 
## $exmhworkersadminredhatcom
##                         exmhworkersadminredhatcom
## factor(email_train$Type)           0           1
##                     HAM  0.998571429 0.001428571
##                     SPAM 1.000000000 0.000000000
## 
## $thu
##                         thu
## factor(email_train$Type)           0           1
##                     HAM  0.927142857 0.072857143
##                     SPAM 0.994285714 0.005714286

# Predict the class of every row of the test indicators.
Pred <- predict(NBclassifier, newdata=test_x)

Summarise the model

# Confusion matrix: rows are the predicted classes, columns the actual ones.
table(Pred, actual=email_test$Type)

##       actual
## Pred   HAM SPAM
##   HAM  191   23
##   SPAM 109  277

Check accuracy rate

# Percentage of test rows whose predicted class equals the actual class.
100 * mean(Pred == email_test$Type)

## [1] 78