PROJECT 4: Document Classification

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/
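If you want to pull the corpus directly rather than browsing the page, here is a minimal sketch in base R. The archive name below is an assumption taken from the directory listing and should be verified before running.

# Sketch: download and unpack one SpamAssassin archive.
# The archive name is an assumed example -- check it against the listing first.
base_url <- "https://spamassassin.apache.org/old/publiccorpus/"
archive <- "20030228_easy_ham_2.tar.bz2"
download.file(paste0(base_url, archive), destfile = archive, mode = "wb")
untar(archive)  # unpacks into a folder of individual message files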

require(tm)
require(SnowballC)
require(wordcloud)
require(e1071)

Read in the spam and ham files from the local machine

ham_dir <- "C:/Users/MARYL/OneDrive/Desktop/easy_ham_2"
hamFileNames <- list.files(ham_dir)


# Documents spam and ham

ham_docs <- list()
for(i in 1:length(hamFileNames))
{
  filepath <- paste0(ham_dir, "/", hamFileNames[i])  # index with i, not 1
  text <- readLines(filepath, warn = FALSE)
  ham_docs <- c(ham_docs, list(paste(text, collapse="\n")))
}

ham_df <- as.data.frame(unlist(ham_docs), stringsAsFactors = FALSE)
ham_df$type <- "ham"
colnames(ham_df) <- c("text", "type")


spam_dir <- "C:/Users/MARYL/OneDrive/Desktop/spam_2"
spamFileNames <- list.files(spam_dir)

spam_docs <- list()
for(i in 1:length(spamFileNames))
{
  filepath <- paste0(spam_dir, "/", spamFileNames[i])  # index with i, not 1
  text <- readLines(filepath, warn = FALSE)
  spam_docs <- c(spam_docs, list(paste(text, collapse="\n")))
}

spam_df <- as.data.frame(unlist(spam_docs), stringsAsFactors = FALSE)
spam_df$type <- "spam"
colnames(spam_df) <- c("text", "type")
spam_ham_df <- rbind(ham_df, spam_df)
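
As an aside, the two reading loops can be collapsed into one small helper; a sketch (read_folder is a name introduced here purely for illustration):

# Read every file in a directory into a character vector, one element per message
read_folder <- function(dir) {
  vapply(list.files(dir, full.names = TRUE),
         function(f) paste(readLines(f, warn = FALSE), collapse = "\n"),
         character(1), USE.NAMES = FALSE)
}
# e.g. ham_texts <- read_folder(ham_dir) gives the same result as the ham loop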


# Write out a CSV so that the combined data can be read in from anywhere
write.csv(spam_ham_df, file = "spam_ham.csv")
str(spam_ham_df)
## 'data.frame':    2804 obs. of  2 variables:
##  $ text: chr  NA "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ ...
##  $ type: chr  "ham" "ham" "ham" "ham" ...

Read in the CSV from GitHub

spam_ham_csv<-read.csv("https://raw.githubusercontent.com/Luz917/spam_ham/master/spam_ham.csv",stringsAsFactors = FALSE)
str(spam_ham_csv)
## 'data.frame':    2801 obs. of  3 variables:
##  $ X   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ text: chr  NA "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ ...
##  $ type: chr  "ham" "ham" "ham" "ham" ...

Randomize the Table

random_spam_ham <- spam_ham_csv[sample(nrow(spam_ham_csv)),]
str(random_spam_ham)
## 'data.frame':    2801 obs. of  3 variables:
##  $ X   : int  1864 241 2251 1065 1958 1317 2285 2498 1671 1473 ...
##  $ text: chr  "Return-Path: <exmh-workers-admin@spamassassin.taint.org>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived"| __truncated__ "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "Return-Path: <exmh-workers-admin@spamassassin.taint.org>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived"| __truncated__ "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ ...
##  $ type: chr  "spam" "ham" "spam" "ham" ...
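
If you want the shuffle (and therefore the train/test split further down) to be reproducible, seed the random number generator first; a one-line sketch, with an arbitrarily chosen seed value:

set.seed(123)  # any constant works; 123 is an arbitrary choice
random_spam_ham <- spam_ham_csv[sample(nrow(spam_ham_csv)), ]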

Clean up the Text in the Corpus

sms_corpus <- Corpus(VectorSource(random_spam_ham$text))
print(sms_corpus)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 2801
# translate all letters to lower case
clean_corpus <- tm_map(sms_corpus, content_transformer(tolower))
# remove URLs before punctuation is stripped, while "http..." is still intact
removeURL <- function(x) gsub("http\\S*", "", x)
clean_corpus <- tm_map(clean_corpus, content_transformer(removeURL))
# replace remaining non-word characters with spaces
clean_corpus <- tm_map(clean_corpus, content_transformer(gsub), pattern="\\W", replacement=" ")
# remove numbers
clean_corpus <- tm_map(clean_corpus, removeNumbers)
# remove punctuation
clean_corpus <- tm_map(clean_corpus, removePunctuation)
# remove stop words
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords())
# strip extra whitespace
clean_corpus <- tm_map(clean_corpus, stripWhitespace)
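
A quick sanity check, as a sketch: compare the first few characters of one message before and after cleaning to confirm the transformations did what you expect.

# First 80 characters of document 1, before and after cleaning
substr(as.character(sms_corpus[[1]]), 1, 80)
substr(as.character(clean_corpus[[1]]), 1, 80)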

Create the Bag of Words

sms_dtm <- DocumentTermMatrix(clean_corpus)
inspect(sms_dtm)
## <<DocumentTermMatrix (documents: 2801, terms: 556)>>
## Non-/sparse entries: 887448/669908
## Sparsity           : 43%
## Maximal term length: 15
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs com exmh invoked line list org spamassassin taint within workers
##   1   18   33      25   23   17  24           22    22     25      22
##   10  18   33      25   23   17  24           22    22     25      22
##   12  18   33      25   23   17  24           22    22     25      22
##   13  18   33      25   23   17  24           22    22     25      22
##   14  18   33      25   23   17  24           22    22     25      22
##   3   18   33      25   23   17  24           22    22     25      22
##   5   18   33      25   23   17  24           22    22     25      22
##   7   18   33      25   23   17  24           22    22     25      22
##   8   18   33      25   23   17  24           22    22     25      22
##   9   18   33      25   23   17  24           22    22     25      22
sms_dtm <- removeSparseTerms(sms_dtm, 0.10)  # keep only terms present in at least 90% of documents
inspect(sms_dtm)
## <<DocumentTermMatrix (documents: 2801, terms: 78)>>
## Non-/sparse entries: 218322/156
## Sparsity           : 0%
## Maximal term length: 10
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs aug com esmtp list localhost message one org received within
##   1   12  18     6   17         9      10   7  24       10     25
##   10  12  18     6   17         9      10   7  24       10     25
##   12  12  18     6   17         9      10   7  24       10     25
##   13  12  18     6   17         9      10   7  24       10     25
##   14  12  18     6   17         9      10   7  24       10     25
##   3   12  18     6   17         9      10   7  24       10     25
##   5   12  18     6   17         9      10   7  24       10     25
##   7   12  18     6   17         9      10   7  24       10     25
##   8   12  18     6   17         9      10   7  24       10     25
##   9   12  18     6   17         9      10   7  24       10     25
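
The second argument of removeSparseTerms is the maximum sparsity a term may have, so 0.10 keeps only terms that appear in at least 90% of documents, which is why just 78 terms survive. A looser threshold keeps many more, rarer terms; a sketch for comparison:

# Rebuild the full matrix and apply a much looser sparsity filter
sms_dtm_loose <- removeSparseTerms(DocumentTermMatrix(clean_corpus), 0.95)
dim(sms_dtm_loose)  # documents x terms; expect far more than 78 terms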

Separate the Spam and the Ham

Spam

just_spam <- which(random_spam_ham$type == "spam")
just_spam[1:3]
## [1] 1 3 5

Ham

just_ham <- which(random_spam_ham$type == "ham")
just_ham[1:3]
## [1] 2 4 6

Create WordClouds to visualize Spam and Ham

Ham

wordcloud(clean_corpus[just_ham], min.freq=50, max.words=100, random.order=FALSE, rot.per=0.60, 
          colors=c(1:4),random.color=TRUE)

Spam

wordcloud(clean_corpus[just_spam], min.freq=50,max.words=100, random.order=FALSE, rot.per=0.60, 
          colors=c(1:4),random.color=TRUE)

  • The word clouds suggest the corpus contains roughly equal amounts of spam and ham; a quick numeric check follows below.
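
A one-line check of the class balance, as a sketch:

# Counts of each class in the shuffled table
table(random_spam_ham$type)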

Building the Spam Filter

Divide Corpus into training and test data

Using 60% of the data for training and 40% of the data for testing

sms_raw_train <- random_spam_ham[1:1680, ]## 60% for training 
sms_raw_test <- random_spam_ham[1681:2801,]## 40% for test
  • Split the document-term matrix and the cleaned corpus with the same row indices
sms_dtm_train <- sms_dtm[1:1680, ]
sms_dtm_test <- sms_dtm[1681:2801,]
sms_corpus_train <- clean_corpus[1:1680]
sms_corpus_test <- clean_corpus[1681:2801]
  • Separate the spam and the ham within the training data
spam <- subset(sms_raw_train, type == "spam")
ham <- subset(sms_raw_train, type == "ham")
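
Because the rows were shuffled before splitting, both halves should show a similar class mix; a quick check, as a sketch:

# Class proportions in the training and test splits
prop.table(table(sms_raw_train$type))
prop.table(table(sms_raw_test$type))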

Create Document-Term Matrices for Train and Test

sms_train <- DocumentTermMatrix(sms_corpus_train)

sms_test <- DocumentTermMatrix(sms_corpus_test)

Create the function to convert count information to “Yes” or “No”

For Naive Bayes classification, each word feature should simply record whether the word is present or absent in a message, rather than how many times it appears.

This function converts the raw counts in the document-term matrices to "No"/"Yes" factors:
convert_count <- function(x) {
  # any positive count becomes "Yes"; zero stays "No"
  y <- ifelse(x > 0, 1, 0)
  y <- factor(y, levels=c(0,1), labels=c("No", "Yes"))
  y
}
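
A one-line check of the conversion, as a sketch; the expected result is noted in the comment:

convert_count(c(0, 2, 5))  # expected: No Yes Yes (a factor with levels No, Yes)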

Convert the Document-Term Matrices

sms_train <- apply(sms_train, 2, convert_count)
sms_test <- apply(sms_test, 2, convert_count)

The Naive Bayes Function

sms_classifier <- naiveBayes(sms_train, factor(sms_raw_train$type))
class(sms_classifier)
## [1] "naiveBayes"
sms_test_pred <- predict(sms_classifier, newdata=sms_test)
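
e1071's naiveBayes also accepts a laplace argument for add-one smoothing, which guards against zero probabilities for words seen in only one class; a variant sketch (not run here):

# Same model with Laplace smoothing
sms_classifier_lap <- naiveBayes(sms_train, factor(sms_raw_train$type), laplace = 1)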

Here we check the predictions against reality

table(sms_test_pred, sms_raw_test$type)
##              
## sms_test_pred ham spam
##          ham  547    1
##          spam   0  573
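
Overall accuracy can be read straight off the confusion matrix; a sketch:

# Overall accuracy: correct predictions over all predictions
conf_mat <- table(sms_test_pred, sms_raw_test$type)
sum(diag(conf_mat)) / sum(conf_mat)  # (547 + 573) / 1121, about 0.9991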

Ham Accuracy

547/547
## [1] 1

Spam Accuracy

573/574
## [1] 0.9982578

Conclusion

Naive Bayes is one of the most effective simple methods for spam filtering. Based on the confusion matrix above, it correctly classified 100% of the ham (547 of 547) and 99.83% of the spam (573 of 574). Strangely enough, though, when I built the model from only the most frequent terms it classified almost everything incorrectly. That comparison is run at the bottom.

Sources

1. Text mining example: spam filtering (used as a guideline): https://www3.nd.edu/~steve/computing_with_data/20_text_mining/text_mining_example.html#/

For comparison

mostfreq <- findFreqTerms(sms_dtm_train, 5)
length(mostfreq)
## [1] 78
mostfreq[1:5]
## [1] "admin"     "also"      "ascii"     "aug"       "beenthere"
sms_train_freq <- DocumentTermMatrix(sms_corpus_train, control=list(dictionary = mostfreq))

sms_test_freq <- DocumentTermMatrix(sms_corpus_test, control=list(dictionary = mostfreq))

sms_train_freq <- apply(sms_train_freq, 2, convert_count)
sms_test_freq <- apply(sms_test_freq, 2, convert_count)

sms_classifier1 <- naiveBayes(sms_train_freq, factor(sms_raw_train$type))

sms_test_pred1 <- predict(sms_classifier1, newdata=sms_test)

table(sms_test_pred1, sms_raw_test$type)
##               
## sms_test_pred1 ham spam
##           ham    0    1
##           spam 547  573
  • As you can see, when the model is restricted to the most frequent terms it classifies nearly every message incorrectly. A likely cause is a feature mismatch: the classifier was trained on sms_train_freq, which was built with the mostfreq dictionary, but the prediction above was run on sms_test, which was built from the full vocabulary, so the feature columns no longer line up with the model. This is why I decided not to go by most frequent terms; a possible fix is sketched below.
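
A hedged fix, not run here: predict on the matrix that was built with the same mostfreq dictionary as the training data, so that the feature columns match the model.

# Predict with the dictionary-matched test matrix instead of sms_test
sms_test_pred2 <- predict(sms_classifier1, newdata = sms_test_freq)
table(sms_test_pred2, sms_raw_test$type)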