library(tm)
## Loading required package: NLP
library(knitr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(wordcloud)
## Loading required package: RColorBrewer
library(naivebayes)
## naivebayes 0.9.7 loaded
library(e1071)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Introduction

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: [https://spamassassin.apache.org/old/publiccorpus/](https://spamassassin.apache.org/old/publiccorpus/)
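The archives can also be fetched directly from R. Here is a minimal sketch, assuming the 20030228_spam_2.tar.bz2 bundle listed on that page and illustrative local paths:

# Download and unpack one archive from the SpamAssassin public corpus
# (file name and destination are illustrative; pick any bundle from the page)
corpus_url <- "https://spamassassin.apache.org/old/publiccorpus/20030228_spam_2.tar.bz2"
download.file(corpus_url, destfile = "spam_2.tar.bz2")
untar("spam_2.tar.bz2", exdir = ".") # extracts a spam_2/ directory of raw emails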

Loading Files

There are too many files to upload to GitHub, so I downloaded them to my desktop and assigned the local paths to variables for loading into R.

pathName_spam <- "/Users/karmagyatso/Documents/cunySps/data607/project4/spam_2"
file_names_spam <- list.files(pathName_spam)

head(file_names_spam)
## [1] "00001.317e78fa8ee2f54cd4890fdc09ba8176"
## [2] "00002.9438920e9a55591b18e60d1ed37d992b"
## [3] "00003.590eff932f8704d8b0fcbe69d023b54d"
## [4] "00004.bdcc075fa4beb5157b5dd6cd41d8887b"
## [5] "00005.ed0aba4d386c5e62bc737cf3f0ed9589"
## [6] "00006.3ca1f399ccda5d897fecb8c57669a283"
length_spam <- length(file_names_spam)
length_spam
## [1] 1397
pathName_ham <- "/Users/karmagyatso/Documents/cunySps/data607/project4/easy_ham_2"
file_names_ham <- list.files(pathName_ham)

head(file_names_ham)
## [1] "00001.1a31cc283af0060967a233d26548a6ce"
## [2] "00002.5a587ae61666c5aa097c8e866aedcc59"
## [3] "00003.19be8acd739ad589cd00d8425bac7115"
## [4] "00004.b2ed6c3c62bbdfab7683d60e214d1445"
## [5] "00005.07b9d4aa9e6c596440295a5170111392"
## [6] "00006.654c4ec7c059531accf388a807064363"
length_ham <- length(file_names_ham)
length_ham
## [1] 1401
# Drop the "cmds" helper file that ships with the SpamAssassin archives
file_names_spam <- file_names_spam[which(file_names_spam != "cmds")]
file_names_ham <- file_names_ham[which(file_names_ham != "cmds")]

## Corpus Creation - Processing Text Data

easy_ham_corpus <- pathName_ham %>%
  paste(., list.files(.), sep = "/") %>%
  lapply(readLines) %>%
  VectorSource() %>%
  VCorpus()

easy_ham_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1401
spam_corpus <- pathName_spam %>%
  paste(., list.files(.), sep = "/") %>%
  lapply(readLines) %>%
  VectorSource() %>%
  VCorpus()

spam_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1397

Cleaning the Corpus

The raw documents contain noise such as numbers, punctuation, and extra whitespace, so we first strip out everything we do not need.

Here we remove numbers and punctuation, drop stop words such as "to", "from", and "the", strip extra whitespace, and reduce terms to their stems.

Sys.setlocale("LC_ALL", "C")
## [1] "C/C/C/C/C/en_US.UTF-8"
# easy ham emails
easy_ham_corpus <- easy_ham_corpus %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument)
#spam emails
spam_corpus <- spam_corpus %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument)

spam_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1397
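As a quick sanity check, we can peek at the first few lines of one cleaned document; the exact lines shown depend on the file, but numbers, punctuation, and stop words should be gone:

# Inspect the first three lines of the first cleaned spam document
head(content(spam_corpus[[1]]), 3)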

We have 1401 documents in easy_ham and 1397 documents in spam. Next we combine the two corpora.

ham_or_spam_corpus <- c(easy_ham_corpus, spam_corpus)
ham_or_spam_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 2798

## Building a Document-Term Matrix

tdm <- DocumentTermMatrix(ham_or_spam_corpus)
tdm
## <<DocumentTermMatrix (documents: 2798, terms: 84774)>>
## Non-/sparse entries: 528459/236669193
## Sparsity           : 100%
## Maximal term length: 949
## Weighting          : term frequency (tf)
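The matrix is almost entirely zeros, which is why sparsity is reported as 100%. If memory becomes a concern, tm's removeSparseTerms() can shrink it; the threshold below is illustrative and keeps only terms present in at least roughly 1% of documents:

# Drop terms absent from more than 99% of documents (threshold is illustrative)
tdm_small <- removeSparseTerms(tdm, 0.99)
tdm_small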

## Creating a Word Cloud

wordcloud(ham_or_spam_corpus, max.words = 100, random.order = FALSE, rot.per=0.15, min.freq=5, colors = brewer.pal(8, "Dark2"))

Creating Data Frames

Here we unlist each cleaned corpus into a data frame (df_ham and df_spam), attach the class label, and combine the two into a single data frame. Note that unlist() flattens a corpus so that each row is a single line of an email rather than a whole message, which is why the row counts below are much larger than the number of emails. Later we train a Naive Bayes classifier, which uses the words present in each class to predict whether a message is spam or ham.

df_ham <- as.data.frame(unlist(easy_ham_corpus), stringsAsFactors = FALSE)
df_ham$type <- "ham"
colnames(df_ham) <- c("text", "email") # the "email" column holds the class label

df_spam <- as.data.frame(unlist(spam_corpus), stringsAsFactors = FALSE)
df_spam$type <- "spam"
colnames(df_spam) <- c("text", "email") # the "email" column holds the class label

df_ham_or_spam <- rbind(df_ham, df_spam)

kable(head(df_ham_or_spam))
text                                              email
------------------------------------------------  -----
ReturnPath exmhworkersadminspamassassintaintorg   ham
DeliveredTo yyyylocalhostnetnoteinccom            ham
Receiv localhost localhost                        ham
phoboslabsnetnoteinccom Postfix ESMTP id C        ham
jmlocalhost Wed Aug EDT                           ham
Receiv phobo                                      ham
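As mentioned above, Naive Bayes scores a document by how strongly its words are associated with each class. A toy illustration with hypothetical, hand-made data (using the already loaded e1071 package):

# Tiny made-up training set: "free" tends to appear in spam, "meet" in ham
toy <- data.frame(
  free  = factor(c("yes", "yes", "no", "no")),
  meet  = factor(c("no", "no", "yes", "yes")),
  class = factor(c("spam", "spam", "ham", "ham"))
)
toy_model <- naiveBayes(class ~ ., data = toy, laplace = 1)
# A message containing "free" but not "meet" is pulled toward the spam class
predict(toy_model, data.frame(free = "yes", meet = "no"))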

Splitting the data into 80% training and 20% test sets.

sample_size <- floor(0.80 * nrow(df_ham_or_spam)) # selecting sample size of 80% of the data for training. 

set.seed(123)
train <- sample(seq_len(nrow(df_ham_or_spam)), size = sample_size)

train_ham_or_spam <- df_ham_or_spam[train, ]
test_ham_or_spam <- df_ham_or_spam[-train, ]

kable(head(train_ham_or_spam))
         text                                                                 email
-------  -------------------------------------------------------------------  -----
188942   252                                                                   spam
134058   ListId SpamAssassin Develop spamassassindevelexamplesourceforgenet    ham
124022                                                                         ham
160997   Attention Internet Domain Registr                                     spam
226318   zelignetnitconet jimnetnogginscom fdjnetnited zosonetnitconet         spam
124507                                                                         ham
kable(head(test_ham_or_spam))
     text                                     email
---  ---------------------------------------  -----
2    DeliveredTo yyyylocalhostnetnoteinccom   ham
28   Wed Aug                                  ham
30   ratreepsuacth ESMTP id gLCUIl            ham
40   Refer TMDAdeepeddyvirciocom              ham
42   ContentTyp textplain charsetusascii      ham
44   XLoop exmhworkersspamassassintaintorg    ham

Corpus

A corpus is simply a collection of text documents. The tm package used here represents a corpus with Corpus/VCorpus objects and provides transformations for normalizing text (removing numbers, punctuation, stop words, and extra whitespace) along with functions for building term-frequency matrices.

Next we create and clean corpora for the training and test data and build a document-term matrix for each.

# corpus creation
train_corpus <- Corpus(VectorSource(train_ham_or_spam$text)) # corpus from training data
test_corpus <- Corpus(VectorSource(test_ham_or_spam$text))   # corpus from test data

# corpus cleaning
train_corpus <- train_corpus %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords()): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
test_corpus <- test_corpus %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords()): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
train_tdm <- DocumentTermMatrix(train_corpus)
test_tdm <- DocumentTermMatrix(test_corpus)

train_tdm
## <<DocumentTermMatrix (documents: 289992, terms: 74021)>>
## Non-/sparse entries: 705756/21464792076
## Sparsity           : 100%
## Maximal term length: 868
## Weighting          : term frequency (tf)
test_tdm
## <<DocumentTermMatrix (documents: 72499, terms: 29891)>>
## Non-/sparse entries: 175382/2166892227
## Sparsity           : 100%
## Maximal term length: 358
## Weighting          : term frequency (tf)
train_corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 289992
test_corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 72499

Next we separate the training data by class.

spam <- subset(train_ham_or_spam, email == "spam")
ham <- subset(train_ham_or_spam, email == "ham")
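These subsets are handy for inspecting class differences, for example with a class-specific word cloud. This is an optional, illustrative use of the same wordcloud() settings as before (it may take a moment on this many rows):

# Word cloud of the spam training text only (optional visual check)
wordcloud(spam$text, max.words = 50, random.order = FALSE, colors = brewer.pal(8, "Dark2"))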

Limiting the vocabulary to terms that appear at least 60 times in the training data.

sixty_times_words<- findFreqTerms(train_tdm, 60)
length(sixty_times_words)
## [1] 1589
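To get a feel for what survives the cut-off, we can peek at a few of the retained terms (the exact terms depend on the random split):

# Sample of the frequent-term dictionary
head(sixty_times_words, 10)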

Restricting the document-term matrices to this dictionary of frequent terms:

train_tdm_2 <- DocumentTermMatrix(train_corpus, control = list(dictionary = sixty_times_words))

test_tdm_2 <- DocumentTermMatrix(test_corpus, control = list(dictionary = sixty_times_words))
train_tdm_3 <- as.matrix(train_tdm_2)
train_tdm_3 <- as.data.frame(train_tdm_3)
class(train_tdm_3)
## [1] "data.frame"
test_tdm_3 <- as.matrix(test_tdm_2)
test_tdm_3 <- as.data.frame(test_tdm_3)
class(test_tdm_3)
## [1] "data.frame"

Training the Naive Bayes classifier.

classifier <- naiveBayes(train_tdm_3, factor(train_ham_or_spam$email))
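One caveat: naiveBayes() models numeric columns as Gaussian, which is a poor fit for raw word counts. A common refinement, sketched here under the assumption that the dictionary-restricted matrices above fit in memory (this is not what was run below), is to recode counts as categorical presence/absence before training:

# Recode term counts as "Yes"/"No" presence indicators (Bernoulli-style features)
convert_counts <- function(x) ifelse(x > 0, "Yes", "No")
train_nb <- apply(as.matrix(train_tdm_2), 2, convert_counts)
test_nb  <- apply(as.matrix(test_tdm_2), 2, convert_counts)
# laplace = 1 avoids zero probabilities for unseen term/class combinations
classifier_bernoulli <- naiveBayes(as.data.frame(train_nb, stringsAsFactors = TRUE),
                                   factor(train_ham_or_spam$email), laplace = 1)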

## Testing the Model

test_pred <- predict(classifier, newdata = test_tdm_3)
# Tabulate predictions against the true class labels from the test split
table(predicted = test_pred, actual = test_ham_or_spam$email)
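A single overall accuracy figure complements the confusion matrix:

# Proportion of test rows whose predicted class matches the true label
mean(test_pred == test_ham_or_spam$email)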
# Numeric score for the ROC curve: 1 = predicted spam, 0 = predicted ham
prednum <- ifelse(test_pred == "spam", 1, 0)

auc <- roc(response = factor(test_ham_or_spam$email), predictor = prednum)
plot(auc)

auc$auc

The ROC curve plots sensitivity (the true positive rate: positive classes classified correctly) against specificity (the true negative rate: negative classes classified correctly), and the area under the curve summarizes the trade-off, with 0.5 corresponding to chance and 1.0 to perfect separation. In this analysis the area under the curve came out to only about 0.55, which is not a good score: the model separates spam from ham at barely better than chance. One likely contributor is that, as noted earlier, each row of our data is a single line of an email rather than a whole message, so most rows carry very little signal.

In the confusion matrix, the diagonal entries are the correctly classified examples, while the off-diagonal entries are the incorrectly classified ones.

## Conclusion

This was a simple exercise in classifying text as ham or spam using basic natural language processing and a naive Bayes text classifier. I encourage readers to apply the ideas from this article to build their own text classifiers and to tackle other problems in text processing and NLP. Of course, there are various other packages for text processing and for building such models.