For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/
library("easypackages")
library("stringr")
library("stringi")
library("readtext")
library("tm")
library("SnowballC")
library("tidyr")
library(wordcloud)
library(R.utils)
library(e1071)
#libraries("knitr", "downloader", "R.utils", "tm", "wordcloud","topicmodels","SnowballC", "e1071", "data.table", "RMySQL","tidyverse", "tidyr", "dplyr", "stats", "quanteda","plyr", "class")
# Download the spam and ham archives from the SpamAssassin public corpus
url_spam <- "http://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"
file_spam <- "20050311_spam_2.tar.bz2"
file_spam2 <- "20050311_spam_2.tar"
url_ham <- "http://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2"
file_ham <- "20030228_easy_ham.tar.bz2"
file_ham2 <- "20030228_easy_ham.tar"
download.file(url_spam, destfile = file_spam)
download.file(url_ham, destfile = file_ham)
# Decompress the .bz2 archives and extract both tarballs into a common directory
bunzip2(file_spam)
bunzip2(file_ham)
untar(file_ham2, exdir = "spamham")
untar(file_spam2, exdir = "spamham")
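Re-knitting this document repeats the download and extraction every time. A small guard can skip that work when the files are already on disk; this is a sketch using a hypothetical helper name and the same file-name variables as above.
# Hypothetical helper: only download/decompress when the archive is missing
fetch_corpus <- function(url, bz2_file, tar_file, exdir = "spamham") {
  if (!file.exists(bz2_file) && !file.exists(tar_file)) {
    download.file(url, destfile = bz2_file)
  }
  if (!file.exists(tar_file)) {
    bunzip2(bz2_file)   # R.utils::bunzip2 drops the .bz2 extension by default
  }
  untar(tar_file, exdir = exdir)
}
# fetch_corpus(url_spam, file_spam, file_spam2)
# fetch_corpus(url_ham, file_ham, file_ham2)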
# Paths to the extracted messages (forward slashes work on every platform)
spam_dir <- "spamham/spam_2/"
ham_dir <- "spamham/easy_ham/"
spam_docs <- list.files(spam_dir)
ham_docs <- list.files(ham_dir)
# Remove the cmds index file that ships with each archive
spam_docs <- spam_docs[which(spam_docs != "cmds")]
ham_docs <- ham_docs[which(ham_docs != "cmds")]
# Extract the message body: an email's headers are separated from its body by the
# first blank line, so everything after the first "\n\n" is kept.
get_email_body <- function(emailContent){
  msge <- str_split(emailContent, "\n\n") %>% unlist()
  body <- paste(msge[2:length(msge)], collapse = ' ')
  return(body)
}
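A quick sanity check on a toy message (my own example, not from the corpus) shows the header block being dropped and the remaining paragraphs re-joined:
# Two header lines, a blank line, then a two-paragraph body
toy_email <- "From: a@example.com\nSubject: hi\n\nFirst paragraph.\n\nSecond paragraph."
get_email_body(toy_email)
## [1] "First paragraph. Second paragraph."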
# Read each spam file, strip the headers and any HTML tags, and collect the bodies
msgContent <- NA
for(i in 1:length(spam_docs))
{
  filepath <- paste0(spam_dir, spam_docs[i])
  emailContent <- suppressWarnings(readtext(filepath))$text
  msg <- get_email_body(emailContent)
  msg <- gsub("<.*?>", " ", msg)   # drop HTML tags
  eachMsg <- list(paste(msg, collapse = "\n"))
  msgContent <- c(msgContent, eachMsg)
}
spam<-as.data.frame(unlist(msgContent),stringsAsFactors = FALSE)
spam$class<-"spam"
colnames(spam)<-c("msg","class")
# Repeat for the ham files
msgContent <- NA
for(i in 1:length(ham_docs))
{
  filepath <- paste0(ham_dir, ham_docs[i])
  emailContent <- suppressWarnings(readtext(filepath))$text
  msg <- get_email_body(emailContent)
  msg <- gsub("<.*?>", " ", msg)   # drop HTML tags
  eachMsg <- list(paste(msg, collapse = "\n"))
  msgContent <- c(msgContent, eachMsg)
}
ham<-as.data.frame(unlist(msgContent),stringsAsFactors = FALSE)
ham$class<-"ham"
colnames(ham)<-c("msg","class")
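The two loops above are nearly identical, so an equivalent, more compact version could read a whole directory at once. This is a sketch with a hypothetical helper name; note it does not carry the initial NA placeholder, so each row count reported below would be one lower.
# Hypothetical helper: read every message in a directory into a data frame
read_messages <- function(dir, docs, class_label) {
  bodies <- vapply(docs, function(f) {
    txt <- suppressWarnings(readtext(paste0(dir, f)))$text
    gsub("<.*?>", " ", get_email_body(txt))
  }, character(1))
  data.frame(msg = unname(bodies), class = class_label, stringsAsFactors = FALSE)
}
# spam <- read_messages(spam_dir, spam_docs, "spam")
# ham  <- read_messages(ham_dir,  ham_docs,  "ham")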
spam_ham<-rbind(spam,ham)
# Let's verify the counts (each includes the initial NA placeholder row from msgContent)
nrow(spam)
## [1] 1397
nrow(ham)
## [1] 2501
nrow(spam_ham)
## [1] 3898
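A quick class-balance check is also worthwhile; from the counts above, the corpus is roughly 36% spam and 64% ham (a sketch, output not shown):
round(prop.table(table(spam_ham$class)), 2)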
corpus <- VCorpus(VectorSource(spam_ham$msg))
# Convert the content to lower case
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove numbers from the message body
corpus <- tm_map(corpus, removeNumbers)
# Remove punctuation
corpus <- tm_map(corpus, removePunctuation)
# Remove stop words
corpus <- tm_map(corpus, removeWords, stopwords())
# Stem each word to its root form
corpus <- tm_map(corpus, stemDocument)
# Remove extra white space
corpus <- tm_map(corpus, stripWhitespace)
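A quick spot-check confirms the cleaning worked as intended (a sketch; index 2 is an arbitrary document, chosen to skip the NA placeholder in row 1):
substr(spam_ham$msg[2], 1, 200)            # original text
substr(as.character(corpus[[2]]), 1, 200)  # after lower-casing, stemming, etc.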
# Build the document-term matrix, then drop terms missing from more than 98% of documents
mtx <- DocumentTermMatrix(corpus)
mtx <- removeSparseTerms(mtx, 0.98)
bagOfWords <- data.frame(as.matrix(mtx))
bagOfWords$outputType <- spam_ham$class
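To see which terms survive the sparsity cut, tm's findFreqTerms() lists the common ones (a sketch; the threshold of 200 is an arbitrary choice):
# Terms appearing at least 200 times across the corpus
findFreqTerms(mtx, lowfreq = 200)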
# Word clouds of frequent terms in the spam and ham messages
spam_indices <- which(spam_ham$class == "spam")
suppressWarnings(wordcloud(corpus[spam_indices], min.freq = 40))
ham_indices <- which(spam_ham$class == "ham")
suppressWarnings(wordcloud(corpus[ham_indices], min.freq = 40))
I will use 70% of the data for training and 30% for testing.
sample_size <- floor(0.70 * nrow(spam_ham))
set.seed(123)
train_index <- sample(seq_len(nrow(spam_ham)), size = sample_size)
train_spam_ham <- spam_ham[train_index, ]
test_spam_ham <- spam_ham[-train_index, ]
spam_set<-subset(train_spam_ham,train_spam_ham$class == "spam")
ham_set<-subset(train_spam_ham,train_spam_ham$class == "ham")
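It is worth confirming that the random split roughly preserves the spam/ham proportions in both partitions (a sketch, output not shown):
# Each should be close to the overall 36% spam / 64% ham mix
prop.table(table(train_spam_ham$class))
prop.table(table(test_spam_ham$class))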
train_email_corpus <- Corpus(VectorSource(train_spam_ham$msg))
test_email_corpus <- Corpus(VectorSource(test_spam_ham$msg))
# Convert the content to lower case
train_email_corpus <- tm_map(train_email_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(train_email_corpus,
## content_transformer(tolower)): transformation drops documents
test_email_corpus <- tm_map(test_email_corpus, content_transformer(tolower))
Because Corpus() builds a SimpleCorpus, every tm_map() call in this block raises the same "transformation drops documents" warning; it is harmless here, and the repeated copies are omitted below.
# Remove numbers from the message body
train_email_corpus <- tm_map(train_email_corpus, removeNumbers)
test_email_corpus <- tm_map(test_email_corpus, removeNumbers)
# Remove punctuation
train_email_corpus <- tm_map(train_email_corpus, removePunctuation)
test_email_corpus <- tm_map(test_email_corpus, removePunctuation)
# Remove stop words
train_email_corpus <- tm_map(train_email_corpus, removeWords, stopwords())
test_email_corpus <- tm_map(test_email_corpus, removeWords, stopwords())
# Stem each word to its root form
train_email_corpus <- tm_map(train_email_corpus, stemDocument)
test_email_corpus <- tm_map(test_email_corpus, stemDocument)
# Remove extra white space
train_email_corpus <- tm_map(train_email_corpus, stripWhitespace)
test_email_corpus <- tm_map(test_email_corpus, stripWhitespace)
#Create the DTM
train_email_dtm <- DocumentTermMatrix(train_email_corpus)
test_email_dtm <- DocumentTermMatrix(test_email_corpus)
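The test DTM above is built independently of the training DTM, so the two vocabularies differ; e1071's predict() matches features by column name and simply skips unseen terms, so this still works. A common refinement (not used in this run) is to restrict the test matrix to the training vocabulary:
# Sketch: score the test corpus only on terms seen during training
test_email_dtm <- DocumentTermMatrix(test_email_corpus,
                                     control = list(dictionary = Terms(train_email_dtm)))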
# Convert raw term counts into a presence/absence factor ("No"/"Yes")
convert_count <- function(x) {
  y <- ifelse(x > 0, 1, 0)
  y <- factor(y, levels = c(0, 1), labels = c("No", "Yes"))
  y
}
train_data <- apply(train_email_dtm, 2, convert_count)
test_data <- apply(test_email_dtm, 2, convert_count)
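On a toy vector (my own example), convert_count() maps every non-zero count to "Yes":
convert_count(c(0, 3, 1, 0))
## [1] No  Yes Yes No
## Levels: No Yes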
# Train a naive Bayes classifier and evaluate it on the held-out test set
classifier <- naiveBayes(train_data, factor(train_spam_ham$class))
test_pred <- predict(classifier, newdata = test_data)
table(test_pred, test_spam_ham$class)
##
## test_pred ham spam
## ham 743 57
## spam 14 356
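The model classifies 743 + 356 of the 1170 test messages correctly, roughly 94% accuracy, with 57 spam messages slipping through as ham and 14 ham messages flagged as spam. The same figure can be computed directly from the confusion matrix (a sketch):
conf_mtx <- table(test_pred, test_spam_ham$class)
sum(diag(conf_mtx)) / sum(conf_mtx)   # overall accuracy, about 0.94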