PROJECT 4: Document Classification

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/
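If you want to pull the corpus directly rather than browsing the page, here is a minimal sketch in base R. The archive name below is an assumption taken from the directory listing and should be verified before running.

# Sketch: download and unpack one SpamAssassin archive.
# The archive name is an assumed example -- check it against the listing first.
base_url <- "https://spamassassin.apache.org/old/publiccorpus/"
archive <- "20030228_easy_ham_2.tar.bz2"
download.file(paste0(base_url, archive), destfile = archive, mode = "wb")
untar(archive)  # unpacks into a folder of individual message files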

require(tm)
require(SnowballC)
require(wordcloud)
require(e1071)

Read in the spam and ham files from the local machine

ham_dir <- "C:/Users/MARYL/OneDrive/Desktop/easy_ham_2"
hamFileNames <- list.files(ham_dir)


# Documents spam and ham

ham_docs <- list()
for(i in 1:length(hamFileNames))
{
  filepath <- paste0(ham_dir, "/", hamFileNames[i])  # index with i, not 1
  text <- readLines(filepath, warn = FALSE)
  ham_docs <- c(ham_docs, list(paste(text, collapse="\n")))
}

ham_df <- as.data.frame(unlist(ham_docs), stringsAsFactors = FALSE)
ham_df$type <- "ham"
colnames(ham_df) <- c("text", "type")


spam_dir <- "C:/Users/MARYL/OneDrive/Desktop/spam_2"
spamFileNames <- list.files(spam_dir)

spam_docs <- list()
for(i in 1:length(spamFileNames))
{
  filepath <- paste0(spam_dir, "/", spamFileNames[i])  # index with i, not 1
  text <- readLines(filepath, warn = FALSE)
  spam_docs <- c(spam_docs, list(paste(text, collapse="\n")))
}

spam_df <- as.data.frame(unlist(spam_docs), stringsAsFactors = FALSE)
spam_df$type <- "spam"
colnames(spam_df) <- c("text", "type")
spam_ham_df <- rbind(ham_df, spam_df)
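
As an aside, the two reading loops can be collapsed into one small helper; a sketch (read_folder is a name introduced here purely for illustration):

# Read every file in a directory into a character vector, one element per message
read_folder <- function(dir) {
  vapply(list.files(dir, full.names = TRUE),
         function(f) paste(readLines(f, warn = FALSE), collapse = "\n"),
         character(1), USE.NAMES = FALSE)
}
# e.g. ham_texts <- read_folder(ham_dir) gives the same result as the ham loop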


# Write out a CSV so that the combined data can be read in from anywhere
write.csv(spam_ham_df, file = "spam_ham.csv")
str(spam_ham_df)
## 'data.frame':    2804 obs. of  2 variables:
##  $ text: chr  NA "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ ...
##  $ type: chr  "ham" "ham" "ham" "ham" ...

Read in the CSV from GitHub

spam_ham_csv<-read.csv("https://raw.githubusercontent.com/Luz917/spam_ham/master/spam_ham.csv",stringsAsFactors = FALSE)
str(spam_ham_csv)
## 'data.frame':    2801 obs. of  3 variables:
##  $ X   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ text: chr  NA "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ ...
##  $ type: chr  "ham" "ham" "ham" "ham" ...

Randomize the Table

random_spam_ham <- spam_ham_csv[sample(nrow(spam_ham_csv)),]
str(random_spam_ham)
## 'data.frame':    2801 obs. of  3 variables:
##  $ X   : int  1864 241 2251 1065 1958 1317 2285 2498 1671 1473 ...
##  $ text: chr  "Return-Path: <exmh-workers-admin@spamassassin.taint.org>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived"| __truncated__ "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "Return-Path: <exmh-workers-admin@spamassassin.taint.org>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived"| __truncated__ "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ ...
##  $ type: chr  "spam" "ham" "spam" "ham" ...
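
If you want the shuffle (and therefore the train/test split further down) to be reproducible, seed the random number generator first; a one-line sketch, with an arbitrarily chosen seed value:

set.seed(123)  # any constant works; 123 is an arbitrary choice
random_spam_ham <- spam_ham_csv[sample(nrow(spam_ham_csv)), ]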

Clean up the Text in the Corpus

sms_corpus <- Corpus(VectorSource(random_spam_ham$text))
print(sms_corpus)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 2801
# translate all letters to lower case
clean_corpus <- tm_map(sms_corpus, content_transformer(tolower))
# remove URLs before punctuation is stripped, while "http..." is still intact
removeURL <- function(x) gsub("http\\S*", "", x)
clean_corpus <- tm_map(clean_corpus, content_transformer(removeURL))
# replace remaining non-word characters with spaces
clean_corpus <- tm_map(clean_corpus, content_transformer(gsub), pattern="\\W", replacement=" ")
# remove numbers
clean_corpus <- tm_map(clean_corpus, removeNumbers)
# remove punctuation
clean_corpus <- tm_map(clean_corpus, removePunctuation)
# remove stop words
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords())
# strip extra whitespace
clean_corpus <- tm_map(clean_corpus, stripWhitespace)
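
A quick sanity check, as a sketch: compare the first few characters of one message before and after cleaning to confirm the transformations did what you expect.

# First 80 characters of document 1, before and after cleaning
substr(as.character(sms_corpus[[1]]), 1, 80)
substr(as.character(clean_corpus[[1]]), 1, 80)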

Create the Bag of Words

sms_dtm <- DocumentTermMatrix(clean_corpus)
inspect(sms_dtm)
## <<DocumentTermMatrix (documents: 2801, terms: 556)>>
## Non-/sparse entries: 887448/669908
## Sparsity           : 43%
## Maximal term length: 15
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs com exmh invoked line list org spamassassin taint within workers
##   1   18   33      25   23   17  24           22    22     25      22
##   10  18   33      25   23   17  24           22    22     25      22
##   12  18   33      25   23   17  24           22    22     25      22
##   13  18   33      25   23   17  24           22    22     25      22
##   14  18   33      25   23   17  24           22    22     25      22
##   3   18   33      25   23   17  24           22    22     25      22
##   5   18   33      25   23   17  24           22    22     25      22
##   7   18   33      25   23   17  24           22    22     25      22
##   8   18   33      25   23   17  24           22    22     25      22
##   9   18   33      25   23   17  24           22    22     25      22
sms_dtm <- removeSparseTerms(sms_dtm, 0.10)  # keep only terms present in at least 90% of documents
inspect(sms_dtm)
## <<DocumentTermMatrix (documents: 2801, terms: 78)>>
## Non-/sparse entries: 218322/156
## Sparsity           : 0%
## Maximal term length: 10
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs aug com esmtp list localhost message one org received within
##   1   12  18     6   17         9      10   7  24       10     25
##   10  12  18     6   17         9      10   7  24       10     25
##   12  12  18     6   17         9      10   7  24       10     25
##   13  12  18     6   17         9      10   7  24       10     25
##   14  12  18     6   17         9      10   7  24       10     25
##   3   12  18     6   17         9      10   7  24       10     25
##   5   12  18     6   17         9      10   7  24       10     25
##   7   12  18     6   17         9      10   7  24       10     25
##   8   12  18     6   17         9      10   7  24       10     25
##   9   12  18     6   17         9      10   7  24       10     25
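
The second argument of removeSparseTerms is the maximum sparsity a term may have, so 0.10 keeps only terms that appear in at least 90% of documents, which is why just 78 terms survive. A looser threshold keeps many more, rarer terms; a sketch for comparison:

# Rebuild the full matrix and apply a much looser sparsity filter
sms_dtm_loose <- removeSparseTerms(DocumentTermMatrix(clean_corpus), 0.95)
dim(sms_dtm_loose)  # documents x terms; expect far more than 78 terms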

Separate the Spam and the Ham

Spam

just_spam <- which(random_spam_ham$type == "spam")
just_spam[1:3]
## [1] 1 3 5

Ham

just_ham <- which(random_spam_ham$type == "ham")
just_ham[1:3]
## [1] 2 4 6

Create WordClouds to visualize Spam and Ham

Ham

wordcloud(clean_corpus[just_ham], min.freq=50, max.words=100, random.order=FALSE, rot.per=0.60, 
          colors=c(1:4),random.color=TRUE)

Spam

wordcloud(clean_corpus[just_spam], min.freq=50,max.words=100, random.order=FALSE, rot.per=0.60, 
          colors=c(1:4),random.color=TRUE)

  • The word clouds suggest the corpus contains roughly equal amounts of spam and ham; a quick numeric check follows below.
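
A one-line check of the class balance, as a sketch:

# Counts of each class in the shuffled table
table(random_spam_ham$type)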

Building the Spam Filter

Divide Corpus into training and test data

Using 60% of the data for training and 40% of the data for testing

sms_raw_train <- random_spam_ham[1:1680, ]## 60% for training 
sms_raw_test <- random_spam_ham[1681:2801,]## 40% for test
  • Split the document-term matrix and the cleaned corpus with the same row indices
sms_dtm_train <- sms_dtm[1:1680, ]
sms_dtm_test <- sms_dtm[1681:2801,]
sms_corpus_train <- clean_corpus[1:1680]
sms_corpus_test <- clean_corpus[1681:2801]
  • Separate the spam and the ham within the training data
spam <- subset(sms_raw_train, type == "spam")
ham <- subset(sms_raw_train, type == "ham")
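
Because the rows were shuffled before splitting, both halves should show a similar class mix; a quick check, as a sketch:

# Class proportions in the training and test splits
prop.table(table(sms_raw_train$type))
prop.table(table(sms_raw_test$type))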

Create Document-Term Matrices for Train and Test

sms_train <- DocumentTermMatrix(sms_corpus_train)

sms_test <- DocumentTermMatrix(sms_corpus_test)

Create the function to convert count information to “Yes” or “No”

For Naive Bayes classification, each word feature should simply record whether the word is present or absent in a message, rather than how many times it appears.

This function converts the raw counts in the document-term matrices to "No"/"Yes" factors:
convert_count <- function(x) {
  # any positive count becomes "Yes"; zero stays "No"
  y <- ifelse(x > 0, 1, 0)
  y <- factor(y, levels=c(0,1), labels=c("No", "Yes"))
  y
}
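
A one-line check of the conversion, as a sketch; the expected result is noted in the comment:

convert_count(c(0, 2, 5))  # expected: No Yes Yes (a factor with levels No, Yes)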

Convert the Document-Term Matrices

sms_train <- apply(sms_train, 2, convert_count)
sms_test <- apply(sms_test, 2, convert_count)

The Naive Bayes Function

sms_classifier <- naiveBayes(sms_train, factor(sms_raw_train$type))
class(sms_classifier)
## [1] "naiveBayes"
sms_test_pred <- predict(sms_classifier, newdata=sms_test)
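
e1071's naiveBayes also accepts a laplace argument for add-one smoothing, which guards against zero probabilities for words seen in only one class; a variant sketch (not run here):

# Same model with Laplace smoothing
sms_classifier_lap <- naiveBayes(sms_train, factor(sms_raw_train$type), laplace = 1)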

Here we check the predictions against reality

table(sms_test_pred, sms_raw_test$type)
##              
## sms_test_pred ham spam
##          ham  547    1
##          spam   0  573
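
Overall accuracy can be read straight off the confusion matrix; a sketch:

# Overall accuracy: correct predictions over all predictions
conf_mat <- table(sms_test_pred, sms_raw_test$type)
sum(diag(conf_mat)) / sum(conf_mat)  # (547 + 573) / 1121, about 0.9991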

Ham Accuracy

547/547
## [1] 1

Spam Accuracy

573/574
## [1] 0.9982578

Conclusion

Naive Bayes is one of the most effective simple methods for spam filtering. Based on the confusion matrix above, it correctly classified 100% of the ham (547 of 547) and 99.83% of the spam (573 of 574). Strangely enough, though, when I built the model from only the most frequent terms it classified almost everything incorrectly. That comparison is run at the bottom.

Sources

1. Text mining example: spam filtering (used as a guideline): https://www3.nd.edu/~steve/computing_with_data/20_text_mining/text_mining_example.html#/

For comparison

mostfreq <- findFreqTerms(sms_dtm_train, 5)
length(mostfreq)
## [1] 78
mostfreq[1:5]
## [1] "admin"     "also"      "ascii"     "aug"       "beenthere"
sms_train_freq <- DocumentTermMatrix(sms_corpus_train, control=list(dictionary = mostfreq))

sms_test_freq <- DocumentTermMatrix(sms_corpus_test, control=list(dictionary = mostfreq))

sms_train_freq <- apply(sms_train_freq, 2, convert_count)
sms_test_freq <- apply(sms_test_freq, 2, convert_count)

sms_classifier1 <- naiveBayes(sms_train_freq, factor(sms_raw_train$type))

sms_test_pred1 <- predict(sms_classifier1, newdata=sms_test)

table(sms_test_pred1, sms_raw_test$type)
##               
## sms_test_pred1 ham spam
##           ham    0    1
##           spam 547  573
  • As you can see, when the model is restricted to the most frequent terms it classifies nearly every message incorrectly. A likely cause is a feature mismatch: the classifier was trained on sms_train_freq, which was built with the mostfreq dictionary, but the prediction above was run on sms_test, which was built from the full vocabulary, so the feature columns no longer line up with the model. This is why I decided not to go by most frequent terms; a possible fix is sketched below.
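
A hedged fix, not run here: predict on the matrix that was built with the same mostfreq dictionary as the training data, so that the feature columns match the model.

# Predict with the dictionary-matched test matrix instead of sms_test
sms_test_pred2 <- predict(sms_classifier1, newdata = sms_test_freq)
table(sms_test_pred2, sms_raw_test$type)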