Document Classification

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/.


STEP 1 : Load your libraries

# Load the libraries

# Packages attached for this analysis. Each package is listed exactly once.
lib.list <- c("RCurl",
              "DT",
              "dplyr",
              "tidyr",
              "wordcloud",
              "ggplot2",
              "data.table",
              "RMySQL",
              "tidyverse",
              "stringr",
              "tm",  #text mining https://cran.r-project.org/web/packages/tm/vignettes/tm.pdf
              "RTextTools"
              )


# Loading all libraries at once.
# require() returns FALSE instead of stopping when a package cannot be
# attached, so the results are collected and any failures are reported in a
# single warning rather than being buried in a printed list of TRUE/FALSE.
lib.loaded <- vapply(lib.list, require, logical(1), character.only = TRUE)
if (!all(lib.loaded)) {
  warning("Could not load package(s): ",
          paste(lib.list[!lib.loaded], collapse = ", "),
          call. = FALSE)
}

STEP 2 : Load the File

# Good practice: work from a known project directory when reading local files.
# The author's project directory is used only when it exists on this machine;
# otherwise the script keeps the current working directory, so it no longer
# aborts on other computers.
project.dir <- "C:\\Users\\PURA\\Dropbox\\CUNY\\607Data\\Assignments\\project04"
if (dir.exists(project.dir)) {
  setwd(project.dir)
}
wdir <- getwd()

# Download and unpack one corpus archive, unless `dir.name` already exists in
# the working directory.
#   dir.name: directory whose presence means the corpus is already unpacked
#   url:      location of the .tar.bz2 archive; the archive is saved under its
#             own file name in the working directory
# Returns `dir.name` invisibly.
fetch.corpus <- function(dir.name, url) {
  if (!dir.exists(dir.name)) {
    archive <- basename(url)
    # The archive is binary, so request a binary transfer explicitly.
    download.file(url = url, destfile = archive, mode = "wb")
    untar(archive, compressed = "bzip2")
  }
  invisible(dir.name)
}

# Ham (non-spam) corpus
fetch.corpus("easy_ham",
             "http://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2")

ham.files <- list.files(path = "easy_ham", full.names = TRUE)

# Spam corpus
fetch.corpus("spam_2",
             "http://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2")

spam.files <- list.files(path = "spam_2", full.names = TRUE)

STEP 3. Read all (HAM and SPAM) files into a DF

# Read every file in a directory into a data frame, one row per file.
#   dir.path: directory containing the raw e-mail files
#   type:     label stored in the `type` column for every row ("ham"/"spam")
# Returns a data frame with character columns `body` (the file's lines joined
# with "\n"), `filename` and `type`. An empty directory yields a zero-row
# data frame with the same columns.
read.email.dir <- function(dir.path, type) {
  file.names <- list.files(dir.path)
  file.paths <- file.path(dir.path, file.names)
  # vapply() builds the whole vector in one pass instead of growing a list
  # with c() on every iteration.
  bodies <- vapply(
    file.paths,
    function(path) paste(readLines(path), collapse = "\n"),
    character(1),
    USE.NAMES = FALSE
  )
  data.frame(body = bodies,
             filename = file.names,
             type = rep(type, length(file.names)),
             stringsAsFactors = FALSE)
}

# HAM: read the directory and keep the file names / paths alongside the data
ham.dir <- file.path(wdir, "easy_ham")
ham.df <- read.email.dir(ham.dir, "ham")
ham.File.Names <- ham.df$filename
ham.File.Path <- file.path(ham.dir, ham.File.Names)

# SPAM: same treatment
spam.dir <- file.path(wdir, "spam_2")
spam.df <- read.email.dir(spam.dir, "spam")
spam.File.Names <- spam.df$filename
spam.File.Path <- file.path(spam.dir, spam.File.Names)

# Combined data set: spam rows first, then ham rows
spam.ham.df <- rbind(spam.df, ham.df)

STEP 4. Cleaning the data and Text Mining

# Text mining is done with the tm package.
# Documentation: https://cran.r-project.org/web/packages/tm/vignettes/tm.pdf

spam.Ham.Corpus <- VCorpus(VectorSource(spam.ham.df$body))

# Cleaning steps, applied in order: lower-case, drop numbers, drop
# punctuation, collapse whitespace, then remove English stop words.
clean.Email.Corpus <- tm_map(spam.Ham.Corpus, content_transformer(tolower))
for (cleaning.step in list(removeNumbers, removePunctuation, stripWhitespace)) {
  clean.Email.Corpus <- tm_map(clean.Email.Corpus, cleaning.step)
}
clean.Email.Corpus <- tm_map(clean.Email.Corpus, removeWords, stopwords("english"))
# Stemming is intentionally left disabled:
#clean.Email.Corpus <- tm_map(clean.Email.Corpus, stemDocument)

# Sparsity threshold passed to removeSparseTerms() for both matrices
sparse.threshold <- 1 - (10 / length(clean.Email.Corpus))

email.Dtm <- removeSparseTerms(DocumentTermMatrix(clean.Email.Corpus),
                               sparse.threshold)

email.Tdm <- removeSparseTerms(TermDocumentMatrix(clean.Email.Corpus),
                               sparse.threshold)


# Word cloud of the spam documents
spam_indices <- which(spam.ham.df$type == "spam")
spam.cloud.corpus <- clean.Email.Corpus[spam_indices]
suppressWarnings(wordcloud(spam.cloud.corpus, min.freq = 100))

# Word cloud of the ham documents
ham_indices <- which(spam.ham.df$type == "ham")
ham.cloud.corpus <- clean.Email.Corpus[ham_indices]
suppressWarnings(wordcloud(ham.cloud.corpus, min.freq = 100))

STEP 5. Short Analysis of the existing data

The word clouds above show the words that occur most often in the HAM and SPAM e-mails.

We can also use the tm package for further analysis, as shown below.

# Terms whose frequency in the document-term matrix is at least 500
findFreqTerms(email.Dtm, lowfreq = 500)
##   [1] "address"                                       
##   [2] "aligncenter"                                   
##   [3] "aligndcenter"                                  
##   [4] "aligndcenterfont"                              
##   [5] "also"                                          
##   [6] "arial"                                         
##   [7] "aug"                                           
##   [8] "available"                                     
##   [9] "back"                                          
##  [10] "best"                                          
##  [11] "bit"                                           
##  [12] "body"                                          
##  [13] "border"                                        
##  [14] "borderd"                                       
##  [15] "build"                                         
##  [16] "bulk"                                          
##  [17] "business"                                      
##  [18] "call"                                          
##  [19] "can"                                           
##  [20] "cdt"                                           
##  [21] "cellpadding"                                   
##  [22] "cellpaddingd"                                  
##  [23] "cellspacing"                                   
##  [24] "cellspacingd"                                  
##  [25] "center"                                        
##  [26] "change"                                        
##  [27] "charsetiso"                                    
##  [28] "charsetusascii"                                
##  [29] "check"                                         
##  [30] "clean"                                         
##  [31] "click"                                         
##  [32] "color"                                         
##  [33] "colord"                                        
##  [34] "colordff"                                      
##  [35] "companies"                                     
##  [36] "company"                                       
##  [37] "contenttransferencoding"                       
##  [38] "contenttype"                                   
##  [39] "credit"                                        
##  [40] "date"                                          
##  [41] "day"                                           
##  [42] "debian"                                        
##  [43] "deliveredto"                                   
##  [44] "deliverydate"                                  
##  [45] "discussion"                                    
##  [46] "div"                                           
##  [47] "dogmaslashnullorg"                             
##  [48] "dont"                                          
##  [49] "easy"                                          
##  [50] "edt"                                           
##  [51] "egwn"                                          
##  [52] "egwnnet"                                       
##  [53] "email"                                         
##  [54] "encodingutf"                                   
##  [55] "end"                                           
##  [56] "envelopefrom"                                  
##  [57] "errorsto"                                      
##  [58] "esmtp"                                         
##  [59] "even"                                          
##  [60] "every"                                         
##  [61] "exim"                                          
##  [62] "exmh"                                          
##  [63] "facearial"                                     
##  [64] "facedarial"                                    
##  [65] "facedverdana"                                  
##  [66] "faceverdana"                                   
##  [67] "fetchmail"                                     
##  [68] "file"                                          
##  [69] "find"                                          
##  [70] "first"                                         
##  [71] "font"                                          
##  [72] "fontfont"                                      
##  [73] "forkadminxentcom"                              
##  [74] "forkexamplecom"                                
##  [75] "forkxentcom"                                   
##  [76] "form"                                          
##  [77] "found"                                         
##  [78] "free"                                          
##  [79] "fri"                                           
##  [80] "friends"                                       
##  [81] "get"                                           
##  [82] "good"                                          
##  [83] "government"                                    
##  [84] "group"                                         
##  [85] "head"                                          
##  [86] "height"                                        
##  [87] "heightd"                                       
##  [88] "help"                                          
##  [89] "helvetica"                                     
##  [90] "hits"                                          
##  [91] "home"                                          
##  [92] "hqpronsnet"                                    
##  [93] "html"                                          
##  [94] "httplistsfreshrpmsnetmailmanlistinforpmzzzlist"
##  [95] "httpxentcommailmanlistinfofork"                
##  [96] "httpxentcompipermailfork"                      
##  [97] "iluglinuxie"                                   
##  [98] "imap"                                          
##  [99] "img"                                           
## [100] "information"                                   
## [101] "input"                                         
## [102] "inreplyto"                                     
## [103] "internet"                                      
## [104] "intmxcorpexamplecom"                           
## [105] "invoked"                                       
## [106] "ist"                                           
## [107] "ive"                                           
## [108] "jalapeno"                                      
## [109] "jmasonorg"                                     
## [110] "jmjmasonorg"                                   
## [111] "jmlocalhost"                                   
## [112] "jmnetnoteinccom"                               
## [113] "jul"                                           
## [114] "jun"                                           
## [115] "just"                                          
## [116] "khare"                                         
## [117] "know"                                          
## [118] "lairxentcom"                                   
## [119] "life"                                          
## [120] "like"                                          
## [121] "line"                                          
## [122] "link"                                          
## [123] "linux"                                         
## [124] "list"                                          
## [125] "listarchive"                                   
## [126] "listhelp"                                      
## [127] "listid"                                        
## [128] "listmanexamplecom"                             
## [129] "listpost"                                      
## [130] "listsubscribe"                                 
## [131] "listunsubscribe"                               
## [132] "localhost"                                     
## [133] "locustmindernet"                               
## [134] "long"                                          
## [135] "look"                                          
## [136] "made"                                          
## [137] "mail"                                          
## [138] "mailing"                                       
## [139] "mailtoforkexamplecom"                          
## [140] "mailtoforkrequestxentcomsubjecthelp"           
## [141] "mailtoforkrequestxentcomsubjectsubscribe"      
## [142] "mailtoforkrequestxentcomsubjectunsubscribe"    
## [143] "make"                                          
## [144] "mandarklabsnetnoteinccom"                      
## [145] "many"                                          
## [146] "marketing"                                     
## [147] "may"                                           
## [148] "message"                                       
## [149] "messageid"                                     
## [150] "meta"                                          
## [151] "microsoft"                                     
## [152] "million"                                       
## [153] "mimeversion"                                   
## [154] "mon"                                           
## [155] "money"                                         
## [156] "much"                                          
## [157] "name"                                          
## [158] "need"                                          
## [159] "network"                                       
## [160] "never"                                         
## [161] "new"                                           
## [162] "normal"                                        
## [163] "now"                                           
## [164] "number"                                        
## [165] "oct"                                           
## [166] "offer"                                         
## [167] "one"                                           
## [168] "online"                                        
## [169] "option"                                        
## [170] "order"                                         
## [171] "outlook"                                       
## [172] "pdt"                                           
## [173] "people"                                        
## [174] "per"                                           
## [175] "pfont"                                         
## [176] "phobos"                                        
## [177] "phoboslabsnetnoteinccom"                       
## [178] "phone"                                         
## [179] "please"                                        
## [180] "postfix"                                       
## [181] "precedence"                                    
## [182] "problem"                                       
## [183] "program"                                       
## [184] "qmail"                                         
## [185] "quotedprintable"                               
## [186] "really"                                        
## [187] "receive"                                       
## [188] "received"                                      
## [189] "references"                                    
## [190] "remove"                                        
## [191] "removed"                                       
## [192] "replyto"                                       
## [193] "report"                                        
## [194] "required"                                      
## [195] "returnpath"                                    
## [196] "right"                                         
## [197] "rohit"                                         
## [198] "rpm"                                           
## [199] "rpmlistfreshrpmsnet"                           
## [200] "rpmzzzlistadminfreshrpmsnet"                   
## [201] "rpmzzzlistfreshrpmsnet"                        
## [202] "rssfeedsexamplecom"                            
## [203] "rssfeedsjmasonorg"                             
## [204] "said"                                          
## [205] "sansserif"                                     
## [206] "sat"                                           
## [207] "say"                                           
## [208] "see"                                           
## [209] "send"                                          
## [210] "sender"                                        
## [211] "sent"                                          
## [212] "sep"                                           
## [213] "service"                                       
## [214] "sfnet"                                         
## [215] "since"                                         
## [216] "singledrop"                                    
## [217] "site"                                          
## [218] "size"                                          
## [219] "sized"                                         
## [220] "smtp"                                          
## [221] "software"                                      
## [222] "something"                                     
## [223] "spam"                                          
## [224] "states"                                        
## [225] "still"                                         
## [226] "subject"                                       
## [227] "sun"                                           
## [228] "system"                                        
## [229] "table"                                         
## [230] "take"                                          
## [231] "technology"                                    
## [232] "texthtml"                                      
## [233] "textplain"                                     
## [234] "thats"                                         
## [235] "think"                                         
## [236] "thu"                                           
## [237] "time"                                          
## [238] "times"                                         
## [239] "today"                                         
## [240] "tue"                                           
## [241] "two"                                           
## [242] "unknown"                                       
## [243] "url"                                           
## [244] "use"                                           
## [245] "used"                                          
## [246] "users"                                         
## [247] "using"                                         
## [248] "uswsflistsourceforgenet"                       
## [249] "vamm"                                          
## [250] "version"                                       
## [251] "versioncvs"                                    
## [252] "want"                                          
## [253] "way"                                           
## [254] "web"                                           
## [255] "webnotenet"                                    
## [256] "wed"                                           
## [257] "well"                                          
## [258] "width"                                         
## [259] "widthd"                                        
## [260] "will"                                          
## [261] "within"                                        
## [262] "without"                                       
## [263] "work"                                          
## [264] "world"                                         
## [265] "wrote"                                         
## [266] "xbeenthere"                                    
## [267] "xentcom"                                       
## [268] "xkeywords"                                     
## [269] "xmailer"                                       
## [270] "xmailmanversion"                               
## [271] "xmsmailpriority"                               
## [272] "xoriginaldate"                                 
## [273] "xpriority"                                     
## [274] "xspamlevel"                                    
## [275] "xspamstatus"                                   
## [276] "year"                                          
## [277] "years"                                         
## [278] "yyyyexamplecom"                                
## [279] "yyyylocalhostexamplecom"                       
## [280] "yyyylocalhostnetnoteinccom"                    
## [281] "zzzzlocalhost"

STEP 6. Create Test and Train Data

# Fixed seed so the random 70/30 split is reproducible
set.seed(123)
n.docs <- nrow(spam.ham.df)
train.size <- floor(0.70 * n.docs)
train.size
## [1] 2763
# Row numbers drawn (without replacement) for the training set
train.Index <- sample.int(n.docs, size = train.size)

train.Spam.Ham <- spam.ham.df[train.Index, ]
test.Spam.Ham <- spam.ham.df[-train.Index, ]

# Spam and ham rows of the training data set, kept as separate data frames
spam <- subset(train.Spam.Ham, type == "spam")
ham <- subset(train.Spam.Ham, type == "ham")


# Create corpus for training and test data
train.Email.Corpus <- Corpus(VectorSource(train.Spam.Ham$body))
test.Email.Corpus <- Corpus(VectorSource(test.Spam.Ham$body))

# Shared cleaning pipeline, so training and test text are normalised in
# exactly the same way.
# Lower-casing runs first: stopwords() is lower case, so removeWords() would
# otherwise leave capitalised stop words ("The", "And", ...) in the text.
clean.email.corpus <- function(corpus) {
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords())
  tm_map(corpus, stripWhitespace)
}

train.Clean.Corpus <- clean.email.corpus(train.Email.Corpus)
test.Clean.Corpus <- clean.email.corpus(test.Email.Corpus)

# Keep only terms that occur at least `min.term.freq` times in the training
# data. Very rare terms carry little signal, and dropping them keeps the
# matrices small enough to be converted to dense form later on.
min.term.freq <- 5
train_email_dtm <- DocumentTermMatrix(train.Clean.Corpus)
model.terms <- findFreqTerms(train_email_dtm, lowfreq = min.term.freq)
train_email_dtm <- train_email_dtm[, model.terms]

# Build the test matrix against the training vocabulary so that both matrices
# describe the same set of terms; a term absent from a test document is
# counted as 0 instead of being missing from the matrix.
test_email_dtm <- DocumentTermMatrix(test.Clean.Corpus,
                                     control = list(dictionary = model.terms))

# NOTE: these changes alter the fitted model; re-run the document to refresh
# the confusion matrix recorded under STEP 7.

# Presence indicator: maps counts to a two-level factor.
#   x: numeric vector of counts
# Returns a factor with levels "No" (count is 0 or less) and "Yes" (count is
# greater than 0); names of `x` are carried over and NA stays NA.
convert_count <- function(x) {
  # Adding 0 turns the logical test into 0/1 while keeping the names of x
  indicator <- (x > 0) + 0
  factor(indicator, levels = c(0, 1), labels = c("No", "Yes"))
}

# Apply convert_count() to every term column of the training and test matrices
train_sms <- apply(train_email_dtm, MARGIN = 2, FUN = convert_count)
test_sms <- apply(test_email_dtm, MARGIN = 2, FUN = convert_count)

library(e1071)
# classification of email: fit naive Bayes on the training indicators,
# using the ham/spam label of each training e-mail as the class
train.labels <- factor(train.Spam.Ham$type)
classifier <- naiveBayes(train_sms, train.labels)

STEP 7: Predict the Data

# Predicted class for every held-out e-mail
test_pred <- predict(classifier, test_sms)

# Cross-tabulate predicted class (rows) against the actual label (columns)
table(test_pred, test.Spam.Ham$type)
##          
## test_pred ham spam
##      ham  770   16
##      spam   1  398

STEP 8. Conclusion

Conclusion - Classification

We saw how naive Bayes can be used to predict ham vs. spam accurately.
We could similarly use other algorithms such as SVM, random forest or maximum entropy (MaxEnt) models.

My computer, despite having 16 GB of RAM and a GPU, kept running out of memory. This made the project a bit painful, as I kept waiting for hours for results only to see RStudio crash. I wish the dataset were a bit more manageable so that we could experiment more.

Professor, I would also appreciate it if you could teach us how to spin up instances on AWS to run this code. I would love that!