The following libraries are used in this assignment:
RTextTools
tm
wordcloud
RTextTools
e1071
Read the files in the spam and ham folders into R data frames, then combine them into a single data frame.
# Create Ham Dataframe
# Every file found in ham_dir is read in full and collapsed into a single
# newline-separated string, so each email becomes one document.
ham_dir <- "D:/Data/easy_ham"
hamFileNames <- list.files(ham_dir)
# List of docs: one element per file name. Iterating over the names themselves
# guarantees that each file is read exactly once, and that an empty directory
# simply produces an empty list.
ham_docs_list <- lapply(hamFileNames, function(file_name) {
  filepath <- file.path(ham_dir, file_name)
  paste(readLines(filepath), collapse = "\n")
})
# ham data frame
# Built directly from the documents so that it contains no placeholder NA row;
# the label column is sized from the text column so zero documents give a
# zero-row data frame rather than an error.
ham_text <- as.character(unlist(ham_docs_list))
hamDF <- data.frame(
  text = ham_text,
  type = rep("ham", length(ham_text)),
  stringsAsFactors = FALSE
)
# Create Spam Dataframe
# Every file found in spam_dir is read in full and collapsed into a single
# newline-separated string, so each email becomes one document.
spam_dir <- "D:/Data/spam"
spamFileNames <- list.files(spam_dir)
# One element per file name. Iterating over the names themselves guarantees
# that each file is read exactly once, and that an empty directory simply
# produces an empty list.
spam_docs_list <- lapply(spamFileNames, function(file_name) {
  filepath <- file.path(spam_dir, file_name)
  paste(readLines(filepath), collapse = "\n")
})
# spam data frame
# Built directly from the documents so that it contains no placeholder NA row;
# the label column is sized from the text column so zero documents give a
# zero-row data frame rather than an error.
spam_text <- as.character(unlist(spam_docs_list))
spamDF <- data.frame(
  text = spam_text,
  type = rep("spam", length(spam_text)),
  stringsAsFactors = FALSE
)
# Stack the ham rows on top of the spam rows to form the combined,
# labelled data frame used by the rest of the analysis.
spam_ham_df <- do.call(rbind, list(hamDF, spamDF))
Create an email corpus from the data frame and clean it up.
The clean-up process involves the following activities:
Create Corpus dataset
Removing numbers
Removing punctuation
Removing stopwords - remove common non-content words such as "to", "and", "the", etc. These are known as stop words. The function stopwords() returns a list of about 175 such words.
Removing excess white space
# Create email Corpus from email text Vector (tm package)
emailCorpus <- Corpus(VectorSource(spam_ham_df$text))
# Clean-up steps, applied in this order: numbers, punctuation, case folding,
# stopwords, surplus white space.
cleanCorpus <- tm_map(emailCorpus, removeNumbers)
cleanCorpus <- tm_map(cleanCorpus, removePunctuation)
# Fold to lower case before removing stopwords: the stopwords() list is lower
# case and removeWords matches case-sensitively, so capitalised stopwords
# ("The", "And", ...) would otherwise be left in the corpus.
cleanCorpus <- tm_map(cleanCorpus, content_transformer(tolower))
cleanCorpus <- tm_map(cleanCorpus, removeWords, stopwords())
# Removing words and characters leaves runs of blanks; collapse them last.
cleanCorpus <- tm_map(cleanCorpus, stripWhitespace)
Create a document-term matrix for the spam and ham emails (tokenize the corpus). Also create a word cloud for each of the two datasets.
# Build the document-term matrix for the combined ham and spam emails.
# A document-term matrix describes how frequently each term occurs in each
# document of a collection.
email_dtm <- DocumentTermMatrix(cleanCorpus)

# Word cloud of the spam emails (terms occurring at least 40 times)
spam_indices <- which(spam_ham_df$type %in% "spam")
suppressWarnings(
  wordcloud(words = cleanCorpus[spam_indices], min.freq = 40)
)

# Word cloud of the ham emails (terms occurring at least 50 times)
ham_indices <- which(spam_ham_df$type %in% "ham")
suppressWarnings(
  wordcloud(words = cleanCorpus[ham_indices], min.freq = 50)
)
Create the training and test data sets. Then use a naive Bayes classifier to build a spam filter based on the words in the emails.
# Model to assess spam and ham
# Randomly partition the emails: 70% of the rows are used for training and
# the remaining 30% are held out for prediction.
sample_size <- floor(nrow(spam_ham_df) * 0.70)

# Fix the seed so that the partition is reproducible.
set.seed(123)
train_ind <- sample(seq_len(nrow(spam_ham_df)), size = sample_size)

# Rows drawn by the sample form the training set; all others the test set.
train_spam_ham <- spam_ham_df[train_ind, ]
test_spam_ham <- spam_ham_df[-train_ind, ]

# Spam and ham subsets of the training data (e.g. for inspecting class sizes)
spam <- train_spam_ham[which(train_spam_ham$type == "spam"), , drop = FALSE]
ham <- train_spam_ham[which(train_spam_ham$type == "ham"), , drop = FALSE]
# Create corpus for training and test data

# Apply the clean-up steps to a corpus and return the cleaned corpus.
# Both data sets go through this single helper so they are guaranteed to be
# pre-processed identically.
clean_email_corpus <- function(corpus) {
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  # Fold to lower case before removing stopwords: the stopwords() list is
  # lower case and removeWords matches case-sensitively, so capitalised
  # stopwords would otherwise be left in the text.
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, stopwords())
  tm_map(corpus, stripWhitespace)
}

train_email_corpus <- Corpus(VectorSource(train_spam_ham$text))
test_email_corpus <- Corpus(VectorSource(test_spam_ham$text))

train_clean_corpus <- clean_email_corpus(train_email_corpus)
test_clean_corpus <- clean_email_corpus(test_email_corpus)

train_email_dtm <- DocumentTermMatrix(train_clean_corpus)
# Restrict the test matrix to the training vocabulary so that its columns are
# exactly the terms the model is trained on: training terms that never occur
# in the test emails become all-zero columns, and terms unseen during training
# are left out.
test_email_dtm <- DocumentTermMatrix(
  test_clean_corpus,
  control = list(dictionary = Terms(train_email_dtm))
)
# count function
# Recode a vector of term counts as a presence indicator: values greater
# than zero become "Yes", all other non-missing values become "No".
# Returns a factor with levels c("No", "Yes"); NA inputs stay NA.
convert_count <- function(x) {
  factor(x > 0, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
}
# Recode every term column (MARGIN = 2) of both matrices from counts to
# "No"/"Yes" presence indicators.
train_sms <- apply(X = train_email_dtm, MARGIN = 2, FUN = convert_count)
test_sms <- apply(X = test_email_dtm, MARGIN = 2, FUN = convert_count)

# Classification of email: fit naive Bayes on the training indicators with
# the ham/spam label as the class, then predict the held-out emails.
classifier <- naiveBayes(train_sms, factor(train_spam_ham$type))
test_pred <- predict(classifier, newdata = test_sms)

# Confusion matrix: predicted class (rows) against actual class (columns)
table(test_pred, test_spam_ham$type)
##
## test_pred ham spam
## ham 759 1
## spam 0 142