library(tm)
## Loading required package: NLP
library(knitr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(wordcloud)
## Loading required package: RColorBrewer
library(naivebayes)
## naivebayes 0.9.7 loaded
library(e1071)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: [https://spamassassin.apache.org/old/publiccorpus/](https://spamassassin.apache.org/old/publiccorpus/)
There are too many files to upload to GitHub, so I downloaded them to my desktop and assigned the local paths to variables for loading into R.
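Alternatively, the archives can be fetched and unpacked directly from R. A minimal sketch, assuming the archive names on that index page are still 20030228_easy_ham_2.tar.bz2 and 20030228_spam_2.tar.bz2 (verify before running):
base_url <- "https://spamassassin.apache.org/old/publiccorpus"
for (archive in c("20030228_easy_ham_2.tar.bz2", "20030228_spam_2.tar.bz2")) {
  dest <- file.path(tempdir(), archive) # download to a temporary location
  download.file(paste(base_url, archive, sep = "/"), dest, mode = "wb")
  untar(dest, exdir = "project4")       # unpacks easy_ham_2/ and spam_2/
}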
pathName_spam <- "/Users/karmagyatso/Documents/cunySps/data607/project4/spam_2"
file_names_spam <- list.files(pathName_spam)
head(file_names_spam)
## [1] "00001.317e78fa8ee2f54cd4890fdc09ba8176"
## [2] "00002.9438920e9a55591b18e60d1ed37d992b"
## [3] "00003.590eff932f8704d8b0fcbe69d023b54d"
## [4] "00004.bdcc075fa4beb5157b5dd6cd41d8887b"
## [5] "00005.ed0aba4d386c5e62bc737cf3f0ed9589"
## [6] "00006.3ca1f399ccda5d897fecb8c57669a283"
length_spam <- length(file_names_spam)
length_spam
## [1] 1397
pathName_ham <- "/Users/karmagyatso/Documents/cunySps/data607/project4/easy_ham_2"
file_names_ham <- list.files(pathName_ham)
head(file_names_ham)
## [1] "00001.1a31cc283af0060967a233d26548a6ce"
## [2] "00002.5a587ae61666c5aa097c8e866aedcc59"
## [3] "00003.19be8acd739ad589cd00d8425bac7115"
## [4] "00004.b2ed6c3c62bbdfab7683d60e214d1445"
## [5] "00005.07b9d4aa9e6c596440295a5170111392"
## [6] "00006.654c4ec7c059531accf388a807064363"
length_ham <- length(file_names_ham)
length_ham
## [1] 1401
file_names_spam <- file_names_spam[which(file_names_spam!="cmds")]
file_names_ham <- file_names_ham[which(file_names_ham!="cmds")]
## corpus creation - processing text data
easy_ham_corpus <- pathName_ham %>%
paste(., list.files(.), sep = "/") %>%
lapply(readLines) %>%
VectorSource() %>%
VCorpus()
easy_ham_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1401
spam_corpus <- pathName_spam %>%
paste(., list.files(.), sep = "/") %>%
lapply(readLines) %>%
VectorSource() %>%
VCorpus()
spam_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1397
The data contains noise such as numbers, punctuation, and extra whitespace, so we first remove all of the unnecessary content.
Here we remove numbers, punctuation, and extra whitespace, drop stop words such as "to", "from", and "the", and reduce terms to their stems.
Sys.setlocale("LC_ALL", "C")
## [1] "C/C/C/C/C/en_US.UTF-8"
# easy ham emails
easy_ham_corpus <- easy_ham_corpus %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(removeWords, stopwords()) %>%
tm_map(stripWhitespace) %>%
tm_map(stemDocument)
#spam emails
spam_corpus <- spam_corpus %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(removeWords, stopwords()) %>%
tm_map(stripWhitespace) %>%
tm_map(stemDocument)
spam_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1397
We have 1401 documents in easy_ham and 1397 documents in spam. Next we combine these two corpora.
ham_or_spam_corpus <- c(easy_ham_corpus, spam_corpus)
ham_or_spam_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 2798
## Building a Document-Term Matrix
tdm <- DocumentTermMatrix(ham_or_spam_corpus)
tdm
## <<DocumentTermMatrix (documents: 2798, terms: 84774)>>
## Non-/sparse entries: 528459/236669193
## Sparsity : 100%
## Maximal term length: 949
## Weighting : term frequency (tf)
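With 84,774 terms the matrix is almost entirely empty cells. One way to shrink it, sketched here with tm's removeSparseTerms(), is to drop terms that appear in almost no documents (the 0.99 threshold is an assumption to tune):
tdm_small <- removeSparseTerms(tdm, 0.99) # keep terms present in at least ~1% of documents
tdm_small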
## Creating a word cloud
wordcloud(ham_or_spam_corpus, max.words = 100, random.order = FALSE, rot.per=0.15, min.freq=5, colors = brewer.pal(8, "Dark2"))
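The cloud above mixes both classes. Drawing one cloud per class makes the vocabulary contrast visible; a sketch reusing the settings above:
wordcloud(easy_ham_corpus, max.words = 50, random.order = FALSE,
          colors = brewer.pal(8, "Dark2")) # ham vocabulary
wordcloud(spam_corpus, max.words = 50, random.order = FALSE,
          colors = brewer.pal(8, "Dark2")) # spam vocabulary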
Here we create new data frames, unlisting easy_ham_corpus into df_ham and spam_corpus into df_spam, and combine them into one data frame. A Naive Bayes classifier can then use the key words present in each class to predict whether an email is spam or ham.
df_ham <- as.data.frame(unlist(easy_ham_corpus), stringsAsFactors = FALSE)
df_ham$type <- "ham"
colnames(df_ham) <- c("text", "email") # the "email" column holds the class label
df_spam <- as.data.frame(unlist(spam_corpus), stringsAsFactors = FALSE)
df_spam$type <- "spam"
colnames(df_spam) <- c("text", "email")
df_ham_or_spam <- rbind(df_ham, df_spam)
kable(head(df_ham_or_spam))
| text | email |
|---|---|
| ReturnPath exmhworkersadminspamassassintaintorg | ham |
| DeliveredTo yyyylocalhostnetnoteinccom | ham |
| Receiv localhost localhost | ham |
| phoboslabsnetnoteinccom Postfix ESMTP id C | ham |
| jmlocalhost Wed Aug EDT | ham |
| Receiv phobo | ham |
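Before fitting the model, we can sanity-check the Naive Bayes intuition: a term that occurs much more often in one class should pull the prediction toward that class. A sketch comparing per-class document frequency for a hypothetical term "click" (any stemmed term would do):
term <- "click" # hypothetical example term
has_term <- grepl(term, df_ham_or_spam$text, fixed = TRUE)
tapply(has_term, df_ham_or_spam$email, mean) # share of ham vs. spam lines containing it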
Splitting the data into 80% training and 20% test sets.
sample_size <- floor(0.80 * nrow(df_ham_or_spam)) # selecting sample size of 80% of the data for training.
set.seed(123)
train <- sample(seq_len(nrow(df_ham_or_spam)), size = sample_size)
train_ham_or_spam <- df_ham_or_spam[train, ]
test_ham_or_spam <- df_ham_or_spam[-train, ]
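Note that sample() ignores the class labels, so the spam/ham ratio can drift slightly between the two sets; with classes this balanced that is harmless, but a stratified split keeps the ratio exact. A base-R sketch:
set.seed(123)
idx_by_class <- split(seq_len(nrow(df_ham_or_spam)), df_ham_or_spam$email) # row indices per class
train_idx <- unlist(lapply(idx_by_class, function(i) sample(i, floor(0.80 * length(i)))))
train_strat <- df_ham_or_spam[train_idx, ]
test_strat <- df_ham_or_spam[-train_idx, ]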
kable(head(train_ham_or_spam))
|   | text | email |
|---|---|---|
| 188942 | 252 | spam |
| 134058 | ListId SpamAssassin Develop spamassassindevelexamplesourceforgenet | ham |
| 124022 |  | ham |
| 160997 | Attention Internet Domain Registr | spam |
| 226318 | zelignetnitconet jimnetnogginscom fdjnetnited zosonetnitconet | spam |
| 124507 |  | ham |
kable(head(test_ham_or_spam))
|   | text | email |
|---|---|---|
| 2 | DeliveredTo yyyylocalhostnetnoteinccom | ham |
| 28 | Wed Aug | ham |
| 30 | ratreepsuacth ESMTP id gLCUIl | ham |
| 40 | Refer TMDAdeepeddyvirciocom | ham |
| 42 | ContentTyp textplain charsetusascii | ham |
| 44 | XLoop exmhworkersspamassassintaintorg | ham |
A corpus is a container for a collection of text documents; the tm package provides functions for normalizing and tokenizing text, for searching for term occurrences, and for computing term occurrence frequencies.
Create and clean a corpus, and create a document-term matrix, for both the training and test data.
# corpus creation
train_corpus <- Corpus(VectorSource(train_ham_or_spam$text)) # corpus for training data
test_corpus <- Corpus(VectorSource(test_ham_or_spam$text)) # corpus for test data
# corpus cleaning
train_corpus <- train_corpus %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(removeWords, stopwords()) %>%
tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords()): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
test_corpus <- test_corpus %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(removeWords, stopwords()) %>%
tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords()): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
train_tdm <- DocumentTermMatrix(train_corpus)
test_tdm <- DocumentTermMatrix(test_corpus)
train_tdm
## <<DocumentTermMatrix (documents: 289992, terms: 74021)>>
## Non-/sparse entries: 705756/21464792076
## Sparsity : 100%
## Maximal term length: 868
## Weighting : term frequency (tf)
test_tdm
## <<DocumentTermMatrix (documents: 72499, terms: 29891)>>
## Non-/sparse entries: 175382/2166892227
## Sparsity : 100%
## Maximal term length: 358
## Weighting : term frequency (tf)
train_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 289992
test_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 72499
Next we split the training data into its spam and ham subsets.
spam <- subset(train_ham_or_spam, email == "spam")
ham <- subset(train_ham_or_spam, email == "ham")
Limiting the model to terms that appear at least 60 times in the training data, for now.
sixty_times_words<- findFreqTerms(train_tdm, 60)
length(sixty_times_words)
## [1] 1589
train_tdm_2<- DocumentTermMatrix(train_corpus, control=list(dictionary = sixty_times_words))
test_tdm_2<- DocumentTermMatrix(test_corpus, control=list(dictionary = sixty_times_words))
train_tdm_3 <- as.matrix(train_tdm_2)
train_tdm_3 <- as.data.frame(train_tdm_3)
class(train_tdm_3)
## [1] "data.frame"
test_tdm_3 <- as.matrix(test_tdm_2)
test_tdm_3 <- as.data.frame(test_tdm_3)
class(test_tdm_3)
## [1] "data.frame"
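One caveat: naiveBayes() models numeric columns as Gaussian variables, which fits sparse term counts poorly. A common alternative, shown as a sketch rather than as what is run below, is to binarize the counts into presence/absence factors:
binarize <- function(x) factor(x > 0, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
train_bin <- as.data.frame(lapply(train_tdm_3, binarize), check.names = FALSE) # is term present?
test_bin <- as.data.frame(lapply(test_tdm_3, binarize), check.names = FALSE)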
Training the Naive Bayes classifier:
classifier <- naiveBayes(train_tdm_3, factor(train_ham_or_spam$email))
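If a term never occurs in one class of the training data, its class-conditional probability is estimated as zero and a single occurrence of it can veto the whole prediction. e1071's laplace argument guards against this; it only affects categorical features, so a sketch using the binarized data from above:
classifier_smooth <- naiveBayes(train_bin, factor(train_ham_or_spam$email), laplace = 1) # add-one smoothing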
## testing the model
test_pred <- predict(classifier, newdata=test_tdm_3)
cm <- table(predicted = test_pred, actual = test_ham_or_spam$email) # evaluate against the true labels
cm
prednum <- ifelse(test_pred == "spam", 1, 0) # numeric score: 1 = predicted spam
auc <- roc(as.factor(test_ham_or_spam$email), prednum)
plot(auc)
auc$auc
The ROC curve plots sensitivity (the true positive rate: positive classes classified correctly) against specificity (the true negative rate: negative classes classified correctly). An area under the curve near 0.5 means the model separates spam from ham no better than chance, while values close to 1 indicate strong separation.
In the confusion matrix, the diagonal cells are the correctly classified examples while the off-diagonals are the incorrectly classified ones.
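Those cells roll up into the usual summary metrics; a sketch using the confusion matrix cm computed above:
accuracy <- sum(diag(cm)) / sum(cm) # share of all predictions that are correct
precision <- cm["spam", "spam"] / sum(cm["spam", ]) # of predicted spam, how much is truly spam
recall <- cm["spam", "spam"] / sum(cm[, "spam"]) # of true spam, how much was caught
c(accuracy = accuracy, precision = precision, recall = recall)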
## Conclusion
This was a simple exercise in classifying text messages as ham or spam using some basic natural language processing and a naive Bayes text classifier. I encourage readers to apply what this article covers to build their own text classifiers and to solve other problems in text processing and NLP. Of course, there are various other packages for text processing and for building such models.