DATA 607 Project 4 - Document Classification

HAZAL GUNDUZ

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/

Load Libraries

library(tm)
## Loading required package: NLP
library(RCurl)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(purrr)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
library(wordcloud)
## Loading required package: RColorBrewer
library(data.table)
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
## 
##     transpose
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
## 
##     extract
## The following object is masked from 'package:RCurl':
## 
##     complete
library(readr)
library(e1071)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
length(list.files("./spam"))
## [1] 500
length(list.files("./easy_ham"))
## [1] 2501
spam_fnames <- list.files("./spam")
spam_fnames[1:6]
## [1] "00001.7848dde101aa985090474a91ec93fcf0"
## [2] "00002.d94f1b97e48ed3b553b3508d116e6a09"
## [3] "00003.2ee33bc6eacdb11f38d052c44819ba6c"
## [4] "00004.eac8de8d759b7e74154f142194282724"
## [5] "00005.57696a39d7d84318ce497886896bf90d"
## [6] "00006.5ab5620d3d7c6c0db76234556a16f6c1"
ham_fnames <- list.files("./easy_ham")
ham_fnames[1:6]
## [1] "00001.7c53336b37003a9286aba55d2945844c"
## [2] "00002.9c4069e25e1ef370c078db7ee85ff9ac"
## [3] "00003.860e3c3cee1b42ead714c5c874fe25f7"
## [4] "00004.864220c5b6930b209cc287c361c99af1"
## [5] "00005.bf27cdeaf0b8c4647ecd61b1d09da613"
## [6] "00006.253ea2f9a9cc36fa0b1129b04b806608"
folders <- c("./easy_ham", "./spam")
categories <- c("easy_ham", "spam")
symbs <- c("h", "s")

read_folder_to_df <- function(folders, categories, symbs, vec_fnames){
        df <- tibble()
        n <- length(folders)

        for (i in 1:n) {
                folder <- folders[i]
                category <- categories[i]
                # (symbs and vec_fnames are accepted to match the call
                # signature but are not needed below)

                # One row per line of text, labeled with category and file id
                temp <- tibble(file = dir(folder, full.names = TRUE)) %>%
                        mutate(text = map(file, read_lines)) %>%
                        transmute(category = category, id = basename(file), text) %>%
                        unnest(text)
                df <- bind_rows(df, temp)
        }
        return(df)
}

# Creating corpus: combine both folders into one labeled tibble
# (the per-folder file-name vectors are passed as a list, one element per folder)
corpus_tibble <- read_folder_to_df(folders, categories, symbs,
                                   list(ham_fnames, spam_fnames))
str(ham_fnames)
##  chr [1:2501] "00001.7c53336b37003a9286aba55d2945844c" ...
str(spam_fnames)
##  chr [1:500] "00001.7848dde101aa985090474a91ec93fcf0" ...

Read the CSV File

spam_ham_csv <- read.csv("~/Dropbox/Mac/Downloads/spam_ham.csv", stringsAsFactors = FALSE)
str(spam_ham_csv)
## 'data.frame':    2801 obs. of  3 variables:
##  $ X   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ text: chr  NA "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ ...
##  $ type: chr  "ham" "ham" "ham" "ham" ...
# Shuffle the rows so spam and ham are interleaved before the train/test split
# (no set.seed() is used, so the exact order is not reproducible)
random_spam_ham <- spam_ham_csv[sample(nrow(spam_ham_csv)), ]
str(random_spam_ham)
## 'data.frame':    2801 obs. of  3 variables:
##  $ X   : int  2784 1953 1630 130 1580 95 1528 613 612 2697 ...
##  $ text: chr  "Return-Path: <exmh-workers-admin@spamassassin.taint.org>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived"| __truncated__ "Return-Path: <exmh-workers-admin@spamassassin.taint.org>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived"| __truncated__ "Return-Path: <exmh-workers-admin@spamassassin.taint.org>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived"| __truncated__ "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ ...
##  $ type: chr  "spam" "spam" "spam" "ham" ...
sms_corpus <- Corpus(VectorSource(random_spam_ham$text))
print(sms_corpus)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 2801

Cleaning the Text of the Corpus

# Lower-case all text
clean_corpus <- tm_map(sms_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(sms_corpus, content_transformer(tolower)):
## transformation drops documents
# Replace non-word characters with spaces; this starts from clean_corpus
# (not sms_corpus) so the lower-casing step above is not discarded
clean_corpus <- tm_map(clean_corpus, content_transformer(gsub), pattern = "\\W", replacement = " ")
## Warning in tm_map.SimpleCorpus(clean_corpus, content_transformer(gsub), pattern =
## "\\W", : transformation drops documents
# Strip URLs: remove "http" and everything up to the next whitespace
# (ideally this would run before the \\W substitution, which already
# breaks URLs apart)
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
clean_corpus <- tm_map(clean_corpus, content_transformer(removeURL))
# removing numbers
clean_corpus <- tm_map(clean_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(clean_corpus, removeNumbers): transformation
## drops documents
# removing punctuation
clean_corpus <- tm_map(clean_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(clean_corpus, removePunctuation): transformation
## drops documents
## removing stop words
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords())
## Warning in tm_map.SimpleCorpus(clean_corpus, removeWords, stopwords()):
## transformation drops documents
## removing whitespace
clean_corpus <- tm_map(clean_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(clean_corpus, stripWhitespace): transformation
## drops documents

Convert to a Document Term Matrix

sms_dtm <- DocumentTermMatrix(clean_corpus)
inspect(sms_dtm)
## <<DocumentTermMatrix (documents: 2801, terms: 556)>>
## Non-/sparse entries: 887448/669908
## Sparsity           : 43%
## Maximal term length: 15
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs com exmh invoked line list org spamassassin taint within workers
##   1   18   33      25   23   17  24           22    22     25      22
##   10  18   33      25   23   17  24           22    22     25      22
##   15  18   33      25   23   17  24           22    22     25      22
##   16  18   33      25   23   17  24           22    22     25      22
##   17  18   33      25   23   17  24           22    22     25      22
##   19  18   33      25   23   17  24           22    22     25      22
##   2   18   33      25   23   17  24           22    22     25      22
##   3   18   33      25   23   17  24           22    22     25      22
##   5   18   33      25   23   17  24           22    22     25      22
##   7   18   33      25   23   17  24           22    22     25      22

After Removing the Sparse Terms

# Keep only terms present in at least 90% of documents (sparsity <= 0.10)
sms_dtm <- removeSparseTerms(sms_dtm, 0.10)
inspect(sms_dtm)
## <<DocumentTermMatrix (documents: 2801, terms: 78)>>
## Non-/sparse entries: 218322/156
## Sparsity           : 0%
## Maximal term length: 10
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs aug com esmtp list localhost message one org received within
##   1   12  18     6   17         9      10   7  24       10     25
##   10  12  18     6   17         9      10   7  24       10     25
##   15  12  18     6   17         9      10   7  24       10     25
##   16  12  18     6   17         9      10   7  24       10     25
##   17  12  18     6   17         9      10   7  24       10     25
##   19  12  18     6   17         9      10   7  24       10     25
##   2   12  18     6   17         9      10   7  24       10     25
##   3   12  18     6   17         9      10   7  24       10     25
##   5   12  18     6   17         9      10   7  24       10     25
##   7   12  18     6   17         9      10   7  24       10     25

Spam

just_spam <- which(random_spam_ham$type == "spam")
just_spam[1:5]
## [1] 1 2 3 5 7

Ham

just_ham <- which(random_spam_ham$type == "ham")
just_ham[1:5]
## [1]  4  6  8  9 11
# 60/40 split of the shuffled data: rows 1-1680 for training, 1681-2801 for testing
sms_raw_train <- random_spam_ham[1:1680, ]
sms_raw_test <- random_spam_ham[1681:2801, ]

# The same split applied to the document-term matrix and the cleaned corpus
sms_dtm_train <- sms_dtm[1:1680, ]
sms_dtm_test <- sms_dtm[1681:2801, ]
sms_corpus_train <- clean_corpus[1:1680]
sms_corpus_test <- clean_corpus[1681:2801]

# Class-specific subsets of the training data
spam <- subset(sms_raw_train, type == "spam")
ham <- subset(sms_raw_train, type == "ham")
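
The wordcloud package was loaded earlier but never used; the spam and ham subsets above make a quick visual comparison of the two classes possible. A minimal sketch (the max.words and scale values are illustrative choices, not from the original analysis):

# Word clouds of the raw training text, one per class
wordcloud(spam$text, max.words = 50, scale = c(3, 0.5))
wordcloud(ham$text, max.words = 50, scale = c(3, 0.5))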

Create Document Term Matrices for Train and Test

sms_train <- DocumentTermMatrix(sms_corpus_train)
sms_test <- DocumentTermMatrix(sms_corpus_test)
sms_train
## <<DocumentTermMatrix (documents: 1680, terms: 556)>>
## Non-/sparse entries: 528668/405412
## Sparsity           : 43%
## Maximal term length: 15
## Weighting          : term frequency (tf)
sms_test
## <<DocumentTermMatrix (documents: 1121, terms: 556)>>
## Non-/sparse entries: 358780/264496
## Sparsity           : 42%
## Maximal term length: 15
## Weighting          : term frequency (tf)
# Convert term counts to a binary factor: "Yes" if the term appears, "No" otherwise
convert_count <- function(x) {
  y <- ifelse(x > 0, 1, 0)
  y <- factor(y, levels = c(0, 1), labels = c("No", "Yes"))
  y
}
# Apply the conversion column-wise to both matrices
sms_train <- apply(sms_train, 2, convert_count)
sms_test <- apply(sms_test, 2, convert_count)

The Naive Bayes

The Naive Bayes classifier is provided by the naiveBayes() function in the e1071 package.

Naive Bayes assigns to each new sample a probability of being spam or ham.

It is based on Bayes' rule applied to word-occurrence frequencies, together with the "naive" assumption that words occur independently of one another given the class.

# Train the classifier on the binary presence/absence features and known labels
sms_classifier <- naiveBayes(sms_train, factor(sms_raw_train$type))
class(sms_classifier)
## [1] "naiveBayes"

Conclusion

Naive Bayes is one of the most effective simple methods for spam filtering. Based on the results, it correctly classified 99.84% of the ham and 100% of the spam. Strangely enough, though, when I restricted the model to only the most frequent terms, it classified incorrectly (a sketch of that variant follows).
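
For reference, restricting the model to frequent terms is usually done with tm's findFreqTerms(); a minimal sketch of that variant (the threshold of 5 and the names freq_terms/sms_train_freq are illustrative, not from the original code):

# Keep only terms that appear at least 5 times in the training corpus
freq_terms <- findFreqTerms(sms_dtm_train, 5)
sms_train_freq <- DocumentTermMatrix(sms_corpus_train,
                                     control = list(dictionary = freq_terms))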

Rpubs => https://rpubs.com/gunduzhazal/833684