library(tm)
## Loading required package: NLP
library(knitr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(wordcloud)
## Loading required package: RColorBrewer
library(naivebayes)
## naivebayes 0.9.7 loaded
library(e1071)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Introduction

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: [https://spamassassin.apache.org/old/publiccorpus/](https://spamassassin.apache.org/old/publiccorpus/)
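The archives can also be fetched directly from R. Here is a minimal sketch, assuming the 20030228_spam_2.tar.bz2 bundle listed on that page and illustrative local paths:

# Download and unpack one archive from the SpamAssassin public corpus
# (file name and destination are illustrative; pick any bundle from the page)
corpus_url <- "https://spamassassin.apache.org/old/publiccorpus/20030228_spam_2.tar.bz2"
download.file(corpus_url, destfile = "spam_2.tar.bz2")
untar("spam_2.tar.bz2", exdir = ".") # extracts a spam_2/ directory of raw emails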

Loading Files

There are too many files to upload to GitHub, so I downloaded them to my desktop and assigned the local paths to variables for loading into R.

pathName_spam <- "/Users/karmagyatso/Documents/cunySps/data607/project4/spam_2"
file_names_spam <- list.files(pathName_spam)

head(file_names_spam)
## [1] "00001.317e78fa8ee2f54cd4890fdc09ba8176"
## [2] "00002.9438920e9a55591b18e60d1ed37d992b"
## [3] "00003.590eff932f8704d8b0fcbe69d023b54d"
## [4] "00004.bdcc075fa4beb5157b5dd6cd41d8887b"
## [5] "00005.ed0aba4d386c5e62bc737cf3f0ed9589"
## [6] "00006.3ca1f399ccda5d897fecb8c57669a283"
length_spam <- length(file_names_spam)
length_spam
## [1] 1397
pathName_ham <- "/Users/karmagyatso/Documents/cunySps/data607/project4/easy_ham_2"
file_names_ham <- list.files(pathName_ham)

head(file_names_ham)
## [1] "00001.1a31cc283af0060967a233d26548a6ce"
## [2] "00002.5a587ae61666c5aa097c8e866aedcc59"
## [3] "00003.19be8acd739ad589cd00d8425bac7115"
## [4] "00004.b2ed6c3c62bbdfab7683d60e214d1445"
## [5] "00005.07b9d4aa9e6c596440295a5170111392"
## [6] "00006.654c4ec7c059531accf388a807064363"
length_ham <- length(file_names_ham)
length_ham
## [1] 1401
# Drop the "cmds" helper file that ships with the SpamAssassin archives
file_names_spam <- file_names_spam[which(file_names_spam != "cmds")]
file_names_ham <- file_names_ham[which(file_names_ham != "cmds")]

## Corpus Creation - Processing Text Data

easy_ham_corpus <- pathName_ham %>%
  paste(., list.files(.), sep = "/") %>%
  lapply(readLines) %>%
  VectorSource() %>%
  VCorpus()

easy_ham_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1401
spam_corpus <- pathName_spam %>%
  paste(., list.files(.), sep = "/") %>%
  lapply(readLines) %>%
  VectorSource() %>%
  VCorpus()

spam_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1397

Cleaning the Corpus

The raw documents contain noise such as numbers, punctuation, and extra whitespace, so we first strip out everything we do not need.

Here we remove numbers and punctuation, drop stop words such as "to", "from", and "the", strip extra whitespace, and reduce terms to their stems.

Sys.setlocale("LC_ALL", "C")
## [1] "C/C/C/C/C/en_US.UTF-8"
# easy ham emails
easy_ham_corpus <- easy_ham_corpus %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument)
#spam emails
spam_corpus <- spam_corpus %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument)

spam_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1397
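As a quick sanity check, we can peek at the first few lines of one cleaned document; the exact lines shown depend on the file, but numbers, punctuation, and stop words should be gone:

# Inspect the first three lines of the first cleaned spam document
head(content(spam_corpus[[1]]), 3)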

We have 1401 documents in easy_ham and 1397 documents in spam. Next we combine the two corpora.

ham_or_spam_corpus <- c(easy_ham_corpus, spam_corpus)
ham_or_spam_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 2798

## Building a Document-Term Matrix

tdm <- DocumentTermMatrix(ham_or_spam_corpus)
tdm
## <<DocumentTermMatrix (documents: 2798, terms: 84774)>>
## Non-/sparse entries: 528459/236669193
## Sparsity           : 100%
## Maximal term length: 949
## Weighting          : term frequency (tf)
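The matrix is almost entirely zeros, which is why sparsity is reported as 100%. If memory becomes a concern, tm's removeSparseTerms() can shrink it; the threshold below is illustrative and keeps only terms present in at least roughly 1% of documents:

# Drop terms absent from more than 99% of documents (threshold is illustrative)
tdm_small <- removeSparseTerms(tdm, 0.99)
tdm_small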

## Creating a Word Cloud

wordcloud(ham_or_spam_corpus, max.words = 100, random.order = FALSE, rot.per=0.15, min.freq=5, colors = brewer.pal(8, "Dark2"))

Creating Data Frames

Here we unlist each cleaned corpus into a data frame (df_ham and df_spam), attach the class label, and combine the two into a single data frame. Note that unlist() flattens a corpus so that each row is a single line of an email rather than a whole message, which is why the row counts below are much larger than the number of emails. Later we train a Naive Bayes classifier, which uses the words present in each class to predict whether a message is spam or ham.

df_ham <- as.data.frame(unlist(easy_ham_corpus), stringsAsFactors = FALSE)
df_ham$type <- "ham"
colnames(df_ham) <- c("text", "email") # the "email" column holds the class label

df_spam <- as.data.frame(unlist(spam_corpus), stringsAsFactors = FALSE)
df_spam$type <- "spam"
colnames(df_spam) <- c("text", "email") # the "email" column holds the class label

df_ham_or_spam <- rbind(df_ham, df_spam)

kable(head(df_ham_or_spam))
text                                              email
------------------------------------------------  -----
ReturnPath exmhworkersadminspamassassintaintorg   ham
DeliveredTo yyyylocalhostnetnoteinccom            ham
Receiv localhost localhost                        ham
phoboslabsnetnoteinccom Postfix ESMTP id C        ham
jmlocalhost Wed Aug EDT                           ham
Receiv phobo                                      ham
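As mentioned above, Naive Bayes scores a document by how strongly its words are associated with each class. A toy illustration with hypothetical, hand-made data (using the already loaded e1071 package):

# Tiny made-up training set: "free" tends to appear in spam, "meet" in ham
toy <- data.frame(
  free  = factor(c("yes", "yes", "no", "no")),
  meet  = factor(c("no", "no", "yes", "yes")),
  class = factor(c("spam", "spam", "ham", "ham"))
)
toy_model <- naiveBayes(class ~ ., data = toy, laplace = 1)
# A message containing "free" but not "meet" is pulled toward the spam class
predict(toy_model, data.frame(free = "yes", meet = "no"))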

Splitting the data into 80% training and 20% test sets.

sample_size <- floor(0.80 * nrow(df_ham_or_spam)) # selecting sample size of 80% of the data for training. 

set.seed(123)
train <- sample(seq_len(nrow(df_ham_or_spam)), size = sample_size)

train_ham_or_spam <- df_ham_or_spam[train, ]
test_ham_or_spam <- df_ham_or_spam[-train, ]

kable(head(train_ham_or_spam))
         text                                                                 email
-------  -------------------------------------------------------------------  -----
188942   252                                                                   spam
134058   ListId SpamAssassin Develop spamassassindevelexamplesourceforgenet    ham
124022                                                                         ham
160997   Attention Internet Domain Registr                                     spam
226318   zelignetnitconet jimnetnogginscom fdjnetnited zosonetnitconet         spam
124507                                                                         ham
kable(head(test_ham_or_spam))
     text                                     email
---  ---------------------------------------  -----
2    DeliveredTo yyyylocalhostnetnoteinccom   ham
28   Wed Aug                                  ham
30   ratreepsuacth ESMTP id gLCUIl            ham
40   Refer TMDAdeepeddyvirciocom              ham
42   ContentTyp textplain charsetusascii      ham
44   XLoop exmhworkersspamassassintaintorg    ham

Corpus

A corpus is simply a collection of text documents. The tm package used here represents a corpus with Corpus/VCorpus objects and provides transformations for normalizing text (removing numbers, punctuation, stop words, and extra whitespace) along with functions for building term-frequency matrices.

Next we create and clean corpora for the training and test data and build a document-term matrix for each.

# corpus creation
train_corpus <- Corpus(VectorSource(train_ham_or_spam$text)) # corpus from training data
test_corpus <- Corpus(VectorSource(test_ham_or_spam$text))   # corpus from test data

# corpus cleaning
train_corpus <- train_corpus %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords()): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
test_corpus <- test_corpus %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords()): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
train_tdm <- DocumentTermMatrix(train_corpus)
test_tdm <- DocumentTermMatrix(test_corpus)

train_tdm
## <<DocumentTermMatrix (documents: 289992, terms: 74021)>>
## Non-/sparse entries: 705756/21464792076
## Sparsity           : 100%
## Maximal term length: 868
## Weighting          : term frequency (tf)
test_tdm
## <<DocumentTermMatrix (documents: 72499, terms: 29891)>>
## Non-/sparse entries: 175382/2166892227
## Sparsity           : 100%
## Maximal term length: 358
## Weighting          : term frequency (tf)
train_corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 289992
test_corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 72499

Next we separate the training data by class.

spam <- subset(train_ham_or_spam, email == "spam")
ham <- subset(train_ham_or_spam, email == "ham")
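These subsets are handy for inspecting class differences, for example with a class-specific word cloud. This is an optional, illustrative use of the same wordcloud() settings as before (it may take a moment on this many rows):

# Word cloud of the spam training text only (optional visual check)
wordcloud(spam$text, max.words = 50, random.order = FALSE, colors = brewer.pal(8, "Dark2"))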

Limiting the vocabulary to terms that appear at least 60 times in the training data.

sixty_times_words<- findFreqTerms(train_tdm, 60)
length(sixty_times_words)
## [1] 1589
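To get a feel for what survives the cut-off, we can peek at a few of the retained terms (the exact terms depend on the random split):

# Sample of the frequent-term dictionary
head(sixty_times_words, 10)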

Restricting the document-term matrices to this dictionary of frequent terms:

train_tdm_2 <- DocumentTermMatrix(train_corpus, control = list(dictionary = sixty_times_words))

test_tdm_2 <- DocumentTermMatrix(test_corpus, control = list(dictionary = sixty_times_words))
train_tdm_3 <- as.matrix(train_tdm_2)
train_tdm_3 <- as.data.frame(train_tdm_3)
class(train_tdm_3)
## [1] "data.frame"
test_tdm_3 <- as.matrix(test_tdm_2)
test_tdm_3 <- as.data.frame(test_tdm_3)
class(test_tdm_3)
## [1] "data.frame"

Training the Naive Bayes classifier.

classifier <- naiveBayes(train_tdm_3, factor(train_ham_or_spam$email))
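One caveat: naiveBayes() models numeric columns as Gaussian, which is a poor fit for raw word counts. A common refinement, sketched here under the assumption that the dictionary-restricted matrices above fit in memory (this is not what was run below), is to recode counts as categorical presence/absence before training:

# Recode term counts as "Yes"/"No" presence indicators (Bernoulli-style features)
convert_counts <- function(x) ifelse(x > 0, "Yes", "No")
train_nb <- apply(as.matrix(train_tdm_2), 2, convert_counts)
test_nb  <- apply(as.matrix(test_tdm_2), 2, convert_counts)
# laplace = 1 avoids zero probabilities for unseen term/class combinations
classifier_bernoulli <- naiveBayes(as.data.frame(train_nb, stringsAsFactors = TRUE),
                                   factor(train_ham_or_spam$email), laplace = 1)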

## Testing the Model

test_pred <- predict(classifier, newdata = test_tdm_3)
# Tabulate predictions against the true class labels from the test split
table(predicted = test_pred, actual = test_ham_or_spam$email)
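A single overall accuracy figure complements the confusion matrix:

# Proportion of test rows whose predicted class matches the true label
mean(test_pred == test_ham_or_spam$email)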
# Numeric score for the ROC curve: 1 = predicted spam, 0 = predicted ham
prednum <- ifelse(test_pred == "spam", 1, 0)

auc <- roc(response = factor(test_ham_or_spam$email), predictor = prednum)
plot(auc)

auc$auc

The ROC curve plots sensitivity (the true positive rate: positive classes classified correctly) against specificity (the true negative rate: negative classes classified correctly), and the area under the curve summarizes the trade-off, with 0.5 corresponding to chance and 1.0 to perfect separation. In this analysis the area under the curve came out to only about 0.55, which is not a good score: the model separates spam from ham at barely better than chance. One likely contributor is that, as noted earlier, each row of our data is a single line of an email rather than a whole message, so most rows carry very little signal.

In the confusion matrix, the diagonal entries are the correctly classified examples, while the off-diagonal entries are the incorrectly classified ones.

## Conclusion

This was a simple exercise in classifying text as ham or spam using basic natural language processing and a naive Bayes text classifier. I encourage readers to apply the ideas from this article to build their own text classifiers and to tackle other problems in text processing and NLP. Of course, there are various other packages for text processing and for building such models.