For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: http://spamassassin.apache.org/old/publiccorpus/
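The analysis below assumes the following packages are loaded; the setup chunk is not shown in the original, so treat this as a sketch of the required libraries:

library(tm)         # corpus handling and text cleaning
library(tidytext)   # tidy() method for document-term matrices
library(dplyr)      # data_frame(), pipes, and summarising
library(ggplot2)    # term-frequency plot
library(knitr)      # kable() tables
library(RTextTools) # create_container(), train_model(), classify_model()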
# read the ham and spam messages from local directories (paths are machine-specific)
ham.corpus <- VCorpus(DirSource('C:\\NITEEN\\CUNY\\Spring 2018\\DATA 607\\project4\\easy_ham'))
spam.corpus <- VCorpus(DirSource('C:\\NITEEN\\CUNY\\Spring 2018\\DATA 607\\project4\\spam_2'))
Exploring the corpus to view its content and metadata information.
text_df <- data_frame(text = ham.corpus[1])
text_df
## # A tibble: 1 x 1
## text
## <S3: VCorpus>
## 1 "list(list(content = c(\"From exmh-workers-admin@redhat.com Thu Aug 22~
## 1 list()
## 1 list()
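A more direct way to peek at a single message is tm's content() accessor (a small sketch; output omitted):

writeLines(head(content(ham.corpus[[1]])))  # first few lines of the first ham message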
print(ham.corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 2551
inspect(ham.corpus[1:4])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 4
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 4850
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 3281
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 3830
##
## [[4]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 3335
Combining the ‘spam’ and ‘ham’ corpora together, tagging each document’s type in the metadata, and preparing the combined corpus for further cleaning.
meta(spam.corpus, tag = "type") <- "spam"
meta(ham.corpus, tag = "type") <- "ham"
corpus_clean <- c(spam.corpus, ham.corpus)
# replace invalid UTF-8 bytes so later transformations do not fail
corpus_clean <- tm_map(corpus_clean, content_transformer(function(x) iconv(x, "UTF-8", sub="byte")))
Converting the entire corpus content to lower case.
corpus_clean <- tm_map(corpus_clean, content_transformer(tolower))
#as.character(corpus_clean[[1]])
Pre-processing the text data (corpus cleaning) with basic tm functions: removing numbers, stop words, and punctuation, then stripping extra whitespace.
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords("english"))
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, stripWhitespace)
as.character(corpus_clean[[1]])
## [1] " ilugadminlinuxie tue aug "
## [2] "returnpath ilugadminlinuxie"
## [3] "delivered yyyylocalhostnetnoteinccom"
## [4] "received localhost localhost "
## [5] " phoboslabsnetnoteinccom postfix esmtp id efdd"
## [6] " jmlocalhost tue aug edt"
## [7] "received phobos "
## [8] " localhost imap fetchmail"
## [9] " jmlocalhost singledrop tue aug ist"
## [10] "received lughtuathaorg rootlughtuathaorg "
## [11] " dogmaslashnullorg esmtp id glqwv "
## [12] " jmilugjmasonorg fri aug "
## [13] "received lugh rootlocalhost lughtuathaorg"
## [14] " esmtp id waa fri aug "
## [15] "received bettyjagessarcom wznycnydslcncnet"
## [16] " lughtuathaorg esmtp id waa "
## [17] " iluglinuxie fri aug "
## [18] "xauthenticationwarning lughtuathaorg host wznycnydslcncnet"
## [19] " claimed bettyjagessarcom"
## [20] "received bettyjagessarcom"
## [21] " smtpd eval id aafcf fri aug "
## [22] "messageid "
## [23] "date fri aug "
## [24] " iluglinuxie"
## [25] " start now startnowhotmailcom"
## [26] "mimeversion "
## [27] "contenttype textplain charsetusascii formatflowed"
## [28] "subject ilug stop mlm insanity"
## [29] "sender ilugadminlinuxie"
## [30] "errors ilugadminlinuxie"
## [31] "xmailmanversion "
## [32] "precedence bulk"
## [33] "listid irish linux users group iluglinuxie"
## [34] "xbeenthere iluglinuxie"
## [35] ""
## [36] "greetings"
## [37] ""
## [38] " receiving letter expressed interest "
## [39] "receiving information online business opportunities "
## [40] "erroneous please accept sincere apology onetime "
## [41] "mailing removal necessary"
## [42] ""
## [43] " burned betrayed backstabbed multilevel marketing "
## [44] "mlm please read letter important one "
## [45] " ever landed inbox"
## [46] ""
## [47] "multilevel marketing huge mistake people"
## [48] ""
## [49] "mlm failed deliver promises past years pursuit "
## [50] " mlm dream cost hundreds thousands people friends "
## [51] " fortunes sacred honor fact mlm fatally "
## [52] "flawed meaning work people"
## [53] ""
## [54] " companies earn big money mlm going "
## [55] "tell real story finally someone courage "
## [56] "cut hype lies tell truth mlm"
## [57] ""
## [58] " good news"
## [59] ""
## [60] " alternative mlm works works big yet "
## [61] "abandoned dreams need see earning kind income "
## [62] " dreamed easier think"
## [63] ""
## [64] " permission like send brief letter will tell "
## [65] " mlm work people will introduce "
## [66] "something new refreshing wonder heard "
## [67] " "
## [68] ""
## [69] " promise will unwanted follow sales pitch one "
## [70] "will call email address will used send "
## [71] "information period"
## [72] ""
## [73] " receive free lifechanging information simply click reply type "
## [74] "send info subject box hit send get information "
## [75] "within hours just look words mlm wall shame inbox"
## [76] ""
## [77] "cordially"
## [78] ""
## [79] "siddhi"
## [80] ""
## [81] "ps someone recently sent letter "
## [82] "eyeopening financially beneficial information ever received "
## [83] "honestly believe will feel way read "
## [84] " free"
## [85] ""
## [86] ""
## [87] ""
## [88] " email never sent unsolicited spam receiving "
## [89] " email explicitly signed list "
## [90] "online signup form use ffa links page emaildom "
## [91] "systems explicit terms use state use "
## [92] " agree receive emailings may also member altra "
## [93] "computer systems list one many numerous free marketing services "
## [94] " agreed signed list also "
## [95] "receiving emailing"
## [96] "due email message considered unsolicitated "
## [97] "spam"
## [98] ""
## [99] ""
## [100] ""
## [101] ""
## [102] ""
## [103] " "
## [104] "irish linux users group iluglinuxie"
## [105] "httpwwwlinuxiemailmanlistinfoilug unsubscription information"
## [106] "list maintainer listmasterlinuxie"
## [107] ""
## [108] ""
Storing the pre-processed text data in a document-term matrix (DTM).
dtm <- DocumentTermMatrix(corpus_clean)
dtm
## <<DocumentTermMatrix (documents: 3948, terms: 94217)>>
## Non-/sparse entries: 665088/371303628
## Sparsity : 100%
## Maximal term length: 868
## Weighting : term frequency (tf)
# extract the "spam"/"ham" type tag for every document
meta_type <- as.vector(unlist(meta(corpus_clean)))
meta_data <- data.frame(type = unlist(meta_type))
table(meta_data)
## meta_data
## ham spam
## 2551 1397
inspect(corpus_clean[1:2])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 1
## Content: documents: 2
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 2663
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 3223
corpus_clean[[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 2663
#dtm <- removeSparseTerms(dtm, 1-(10/length(corpus_clean)))
#dtm
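For reference, enabling the commented-out call above would keep only the terms that appear in at least roughly 10 of the 3,948 documents, shrinking the matrix considerably. A minimal sketch (dtm.small is a hypothetical name):

# keep terms appearing in at least ~10 documents
dtm.small <- removeSparseTerms(dtm, 1 - (10 / length(corpus_clean)))
dtm.small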
Using the tidytext library to tidy the DTM into one row per document-term pair, then arranging the rows by descending count.
corpus.tidy <- tidy(dtm)
head(corpus.tidy)
## # A tibble: 6 x 3
## document term count
## <chr> <chr> <dbl>
## 1 00001.317e78fa8ee2f54cd4890fdc09ba8176 aafcf 1.
## 2 00001.317e78fa8ee2f54cd4890fdc09ba8176 abandoned 1.
## 3 00001.317e78fa8ee2f54cd4890fdc09ba8176 accept 1.
## 4 00001.317e78fa8ee2f54cd4890fdc09ba8176 address 1.
## 5 00001.317e78fa8ee2f54cd4890fdc09ba8176 agree 1.
## 6 00001.317e78fa8ee2f54cd4890fdc09ba8176 agreed 1.
corpus.tidy.sort <- corpus.tidy %>%
  arrange(desc(count))
kable(head(corpus.tidy.sort))
| document | term | count |
|---|---|---|
| 00028.60393e49c90f750226bee6381eb3e69d | arial | 272 |
| 00028.60393e49c90f750226bee6381eb3e69d | colorfont | 270 |
| 00028.60393e49c90f750226bee6381eb3e69d | faceverdana | 270 |
| 00028.60393e49c90f750226bee6381eb3e69d | geneva | 270 |
| 00028.60393e49c90f750226bee6381eb3e69d | helvetica | 270 |
| 00028.60393e49c90f750226bee6381eb3e69d | sansseriffont | 270 |
inspect(dtm)
## <<DocumentTermMatrix (documents: 3948, terms: 94217)>>
## Non-/sparse entries: 665088/371303628
## Sparsity : 100%
## Maximal term length: 868
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs aug esmtp jmlocalhost localhost
## 00028.60393e49c90f750226bee6381eb3e69d 0 2 0 0
## 00051.8b17ce16ace4d5845e2299c0123e1f14 0 2 0 0
## 01083.a6b3c50be5abf782b585995d2c11176b 0 4 0 1
## 01094.91779ec04e5e6b27e84297c28fc7369f 0 1 2 3
## 0320.6c54ea1bb991c6fae395588219cfce37 0 5 0 4
## 0627.c9ad8730dad7bda1e1169ee00c4006fc 0 5 2 5
## 0730.9570ee3b6bf144198297b23bca5044e9 0 5 2 4
## 0737.aa298505cb31aac78d0dbf229fc45fb9 0 4 2 3
## 0826.082e92a79a15aa7f6dd5b85a40327abd 0 5 2 3
## 1022.73ab70b91862d709897cfe3dd5bb22a0 0 4 2 3
## Terms
## Docs mon oct postfix received sep thu
## 00028.60393e49c90f750226bee6381eb3e69d 4 0 2 2 0 0
## 00051.8b17ce16ace4d5845e2299c0123e1f14 0 0 2 2 0 0
## 01083.a6b3c50be5abf782b585995d2c11176b 0 0 0 8 0 9
## 01094.91779ec04e5e6b27e84297c28fc7369f 0 0 1 7 0 1
## 0320.6c54ea1bb991c6fae395588219cfce37 0 8 3 6 0 0
## 0627.c9ad8730dad7bda1e1169ee00c4006fc 0 0 3 6 8 8
## 0730.9570ee3b6bf144198297b23bca5044e9 0 0 3 7 8 0
## 0737.aa298505cb31aac78d0dbf229fc45fb9 0 0 3 7 9 0
## 0826.082e92a79a15aa7f6dd5b85a40327abd 0 0 3 6 8 0
## 1022.73ab70b91862d709897cfe3dd5bb22a0 0 9 3 6 0 4
Finding the top terms in the tidied data by total frequency across all documents.
term.frequency <- corpus.tidy.sort %>%
  select(term, count) %>%
  group_by(term) %>%
  summarise(termFrequency = sum(count)) %>%
  arrange(desc(termFrequency))
kable(head(term.frequency))
| term | termFrequency |
|---|---|
| received | 20576 |
| esmtp | 11848 |
| sep | 10013 |
| localhost | 9501 |
| aug | 6467 |
| mon | 5842 |
# bar chart of terms with total frequency above 4000
ggplot(data = filter(term.frequency, termFrequency > 4000), aes(x = term, y = termFrequency)) +
  geom_bar(stat = "identity", aes(fill = termFrequency)) +
  geom_text(aes(label = termFrequency), vjust = -0.2) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 65, hjust = 1), legend.position = 'none')
Preparing the training and test datasets.
# Total dataset size
totalSize <- length(meta_type)
totalSize
## [1] 3948
# 70% training dataset
traindata.size <-(round(length(meta_type)*.7))
traindata.size
## [1] 2764
# 30% test dataset
paste0('test data point starts from ',round(length(meta_type)*.7)+1, ' and ends at ',totalSize)
## [1] "test data point starts from 2765 and ends at 3948"
# Build an RTextTools container holding the train/test split for model fitting
data.train.test.container <- create_container(dtm, labels = meta_type,
                                              trainSize = 1:traindata.size,
                                              testSize = (traindata.size + 1):totalSize,
                                              virgin = FALSE)
slotNames(data.train.test.container)
## [1] "training_matrix" "classification_matrix" "training_codes"
## [4] "testing_codes" "column_names" "virgin"
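One caveat: because spam.corpus and ham.corpus were concatenated in order, this index-based 70/30 split is not random, and the test range 2765:3948 appears to fall entirely within the ham documents. A minimal sketch of a shuffled split, assuming the objects above (the seed and the *.shuffled names are my own):

set.seed(607)                             # assumed seed for reproducibility
shuffle.idx <- sample(length(meta_type))  # random permutation of document indices
dtm.shuffled <- dtm[shuffle.idx, ]        # reorder DTM rows
labels.shuffled <- meta_type[shuffle.idx] # reorder labels to match
container.shuffled <- create_container(dtm.shuffled, labels = labels.shuffled,
                                       trainSize = 1:traindata.size,
                                       testSize = (traindata.size + 1):totalSize,
                                       virgin = FALSE)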
svm.model <- train_model(data.train.test.container, "SVM")           # train an SVM classifier
svm.predict <- classify_model(data.train.test.container, svm.model)  # predict labels for the test range
data.label.svm <- data.frame(
  correct_label = meta_type[2765:3948],  # true labels for the test range
  svm = as.character(svm.predict[, 1]),  # predicted label column from classify_model()
  stringsAsFactors = F)
table(data.label.svm[,1] == data.label.svm[,2])
##
## FALSE TRUE
## 136 1048
prop.table(table(data.label.svm[,1] == data.label.svm[,2]))
##
## FALSE TRUE
## 0.1148649 0.8851351
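A per-class confusion table gives a finer-grained view than the overall accuracy above (a small sketch using the data frame already built):

table(actual = data.label.svm$correct_label, predicted = data.label.svm$svm)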
With roughly 88.5% of the held-out documents classified correctly, the SVM appears to be an effective classifier for this text dataset.