#download required packages
suppressMessages(suppressWarnings(library(tm)))
suppressMessages(suppressWarnings(library(RCurl)))
suppressMessages(suppressWarnings(library(stringr)))
suppressMessages(suppressWarnings(library(dplyr)))
suppressMessages(suppressWarnings(library(XML)))
suppressMessages(suppressWarnings(library(tidytext)))
suppressMessages(suppressWarnings(library(ggplot2)))

Training data set:

I uploaded the first 30 files from each of the spam and easy_ham folders as my data. Why 30 files each? Because the limit on the number of files in a folder in GitHub is 100, and my raw data folders have thousands of files. 30 is just a convenient number that saves upload time, but it reduces the accuracy of the prediction (more files would be better).

Spam variable: a binary (0/1) variable; Spam = 1 when the text comes from the spam files and Spam = 0 when it comes from the ham files.

All source files are from the link http://spamassassin.apache.org/old/publiccorpus/ .

(1) Create a list of the URLs of the 30 easy_ham files

#Get the 30 easy_ham file names
# The "cmds" index file is read line by line; the second space-separated
# field of every line is used as the message file name.
easy_ham_list <- "https://raw.githubusercontent.com/ada2802/607-Project-4-Text-Mining/master/easy_ham/cmds"

easy_ham_df <- readLines(easy_ham_list)

# vapply() (unlike sapply()) always returns a character vector, even when the
# index file is empty.
easy_ham_fl <- vapply(strsplit(easy_ham_df, " "), `[[`, character(1), 2)

easy_ham_30 <- head(easy_ham_fl, 30)

ham_path <- "https://raw.githubusercontent.com/ada2802/607-Project-4-Text-Mining/master/easy_ham/"

# paste0() is vectorised, so no index loop is needed; this also avoids padding
# the vector with bogus ".../NA" URLs when the index lists fewer than 30 files.
easy_ham_url_30 <- paste0(ham_path, easy_ham_30)

#head(easy_ham_url_30)

(2) Create a list of the URLs of the 30 spam2 files

#Get the 30 spam2 file names
# The "cmds" index file is read line by line; the second space-separated
# field of every line is used as the message file name.
spam2_list <- "https://raw.githubusercontent.com/ada2802/607-Project-4-Text-Mining/master/spam_2/cmds"

spam2_df <- readLines(spam2_list)

# vapply() (unlike sapply()) always returns a character vector, even when the
# index file is empty.
spam2_fl <- vapply(strsplit(spam2_df, " "), `[[`, character(1), 2)

spam2_30 <- head(spam2_fl, 30)

#correct the #7 file name from the cmds file because it does not match the
#actual file name
spam2_30[7] <- "00007.acefeee792b5298f8fee175f9f65c453"

spam_path <- "https://raw.githubusercontent.com/ada2802/607-Project-4-Text-Mining/master/spam_2/"

# paste0() is vectorised, so no index loop is needed; this also avoids padding
# the vector with bogus ".../NA" URLs when the index lists fewer than 30 files.
spam2_url_30 <- paste0(spam_path, spam2_30)

#head(spam2_url_30)

(3) Easy_Ham Raw Dataset: Read easy_ham Text Messages

#build the easy_ham train data set
# Each message is downloaded from its URL, its lines are joined with single
# spaces into one string, and the row is labelled Spam = 0.
# The per-file rows are collected in a list and bound once, instead of growing
# the data frame with rbind() inside a loop; tibble() replaces the deprecated
# data_frame().
easy_ham_train_set <- bind_rows(lapply(easy_ham_url_30, function(ham_url) {
  ham <- paste(readLines(ham_url), collapse = " ")
  tibble(Spam = 0, text = ham)
}))

str(easy_ham_train_set)
## Classes 'tbl_df', 'tbl' and 'data.frame':    30 obs. of  2 variables:
##  $ Spam: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ text: chr  "From exmh-workers-admin@redhat.com  Thu Aug 22 12:36:23 2002 Return-Path: <exmh-workers-admin@spamassassin.tain"| __truncated__ "From Steve_Burt@cursor-system.com  Thu Aug 22 12:46:39 2002 Return-Path: <Steve_Burt@cursor-system.com> Deliver"| __truncated__ "From timc@2ubh.com  Thu Aug 22 13:52:59 2002 Return-Path: <timc@2ubh.com> Delivered-To: zzzz@localhost.netnotei"| __truncated__ "From irregulars-admin@tb.tf  Thu Aug 22 14:23:39 2002 Return-Path: <irregulars-admin@tb.tf> Delivered-To: zzzz@"| __truncated__ ...

(4) Spam Raw Dataset: Read spam Text Messages

#build the spam train data set
# Each message is downloaded from its URL, its lines are joined with single
# spaces into one string, and the row is labelled Spam = 1.
# The per-file rows are collected in a list and bound once, instead of growing
# the data frame with rbind() inside a loop; tibble() replaces the deprecated
# data_frame().
spam_train_set <- bind_rows(lapply(spam2_url_30, function(spam_url) {
  spam <- paste(readLines(spam_url), collapse = " ")
  tibble(Spam = 1, text = spam)
}))

str(spam_train_set)
## Classes 'tbl_df', 'tbl' and 'data.frame':    30 obs. of  2 variables:
##  $ Spam: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ text: chr  "From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002 Return-Path: <ilug-admin@linux.ie> Delivered-To: yyyy@localh"| __truncated__ "From lmrn@mailexcite.com  Mon Jun 24 17:03:24 2002 Return-Path: merchantsworld2001@juno.com Delivery-Date: Mon "| __truncated__ "From amknight@mailexcite.com  Mon Jun 24 17:03:49 2002 Return-Path: merchantsworld2001@juno.com Delivery-Date: "| __truncated__ "From jordan23@mailexcite.com  Mon Jun 24 17:04:20 2002 Return-Path: merchantsworld2001@juno.com Delivery-Date: "| __truncated__ ...

(5) Raw Total Dataset: Easy_Ham and Spam Text Messages, 60 files in total.

# Stack the ham rows on top of the spam rows to form the full training set.
# (The former empty initialisation was dead code: the value was overwritten
# immediately.)
raw_train_data <- bind_rows(easy_ham_train_set, spam_train_set)

str(raw_train_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    60 obs. of  2 variables:
##  $ Spam: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ text: chr  "From exmh-workers-admin@redhat.com  Thu Aug 22 12:36:23 2002 Return-Path: <exmh-workers-admin@spamassassin.tain"| __truncated__ "From Steve_Burt@cursor-system.com  Thu Aug 22 12:46:39 2002 Return-Path: <Steve_Burt@cursor-system.com> Deliver"| __truncated__ "From timc@2ubh.com  Thu Aug 22 13:52:59 2002 Return-Path: <timc@2ubh.com> Delivered-To: zzzz@localhost.netnotei"| __truncated__ "From irregulars-admin@tb.tf  Thu Aug 22 14:23:39 2002 Return-Path: <irregulars-admin@tb.tf> Delivered-To: zzzz@"| __truncated__ ...
#table(raw_train_data$Spam)

(6) Basic data analysis: spam text by word count

#tidy data
# Keep only the spam messages, split their text into one word per row, drop
# the words listed in stop_words, and count how often each remaining word
# occurs (most frequent first).
spam_messages <- filter(raw_train_data, Spam == 1)
spam_tokens <- unnest_tokens(spam_messages, word, text)
spam_tokens <- anti_join(spam_tokens, stop_words)
spam_text <- count(spam_tokens, word, sort = TRUE)
## Joining, by = "word"
#data analysis
# Horizontal bar chart of the words counted more than 100 times in the spam
# messages, with the bars ordered by count.
frequent_spam_words <- mutate(
  filter(spam_text, n > 100),
  word = reorder(word, n)
)

ggplot(frequent_spam_words, aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

(7) Raw Dataset Cleaning by using Corpus

Using a corpus is a better way to clean the text of the raw training data messages (tdm).

# create a new corpus variable with one document per message
corpus <- Corpus(VectorSource(raw_train_data$text))

# remove all punctuation
corpus.tmp <- tm_map(corpus, removePunctuation)

# collapse runs of whitespace into a single blank
corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)

# convert text to lowercase; tolower() is a plain base function rather than a
# tm transformation, so it is wrapped in content_transformer() to keep the
# result a valid corpus
corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))

# remove all English stopwords
# NOTE(review): punctuation has already been stripped at this point, so
# contractions such as "won't" (now "wont") presumably no longer match the
# stop word list -- consider removing stop words before punctuation.
corpus.tmp <- tm_map(corpus.tmp, removeWords, stopwords("english"))

# stem the words
corpus.tmp <- tm_map(corpus.tmp, stemDocument)
#str(corpus.tmp)

# build a document term matrix - training data messages
tdm_text <- DocumentTermMatrix(corpus.tmp)
#tdm_text


#remove sparse terms: keep only the terms that appear in roughly at least 5%
#of the documents (sparsity threshold 0.95)
tdm_text_0.95 <- removeSparseTerms(tdm_text, 0.95)
tdm_text_Sparse <- as.data.frame(as.matrix(tdm_text_0.95))
# make.names() turns the terms into syntactically valid column names
colnames(tdm_text_Sparse) <- make.names(colnames(tdm_text_Sparse))

# total count of every remaining term, in ascending order
sort(colSums(tdm_text_Sparse))
##                                        actual 
##                                             4 
##                                       discuss 
##                                             4 
##                                          hand 
##                                             4 
##                                          mark 
##                                             4 
##                                        search 
##                                             4 
##                                         ideal 
##                                             4 
##                                      unverifi 
##                                             4 
##                                           arm 
##                                             4 
##                                         later 
##                                             4 
##                                         press 
##                                             4 
##                                         taken 
##                                             4 
##                                        across 
##                                             4 
##                                          hell 
##                                             4 
##                                       univers 
##                                             4 
##                                       adamson 
##                                             4 
##                                       attract 
##                                             4 
##                                         final 
##                                             4 
##                                        reason 
##                                             4 
##                                        recent 
##                                             4 
##                                           sit 
##                                             4 
##                                         thing 
##                                             4 
##                                          word 
##                                             4 
##                                         worth 
##                                             4 
##                                       X103113 
##                                             4 
##                                     X209sfnet 
##                                             4 
##                                 X216136171252 
##                                             4 
##                  helouswsflist1sourceforgenet 
##                                             4 
##                                      research 
##                                             4 
##                        uswsffw2sourceforgenet 
##                                             4 
##                     uswsflist1bsourceforgenet 
##                                             4 
##                      uswsflist1sourceforgenet 
##                                             4 
##                                    xoriginald 
##                                             4 
##                                           job 
##                                             4 
##                               contentdisposit 
##                                             4 
##                                       mention 
##                                             4 
##                                          best 
##                                             4 
##                                        public 
##                                             4 
##                                        repres 
##                                             4 
##                                         simpl 
##                                             4 
##                                        specif 
##                                             4 
##                                        redhat 
##                                             4 
##                                       reliabl 
##                                             4 
##                                          tool 
##                                             4 
##                                        enough 
##                                             4 
##                                        accept 
##                                             4 
##                                         X1980 
##                                             4 
##                                      X45405br 
##                                             4 
##                                charsetdefault 
##                                             4 
##                                 citystatezipa 
##                                             4 
##                                        dayton 
##                                             4 
##                                         expir 
##                                             4 
##                                          ohio 
##                                             4 
##                                       contain 
##                                             4 
##                                       size3d4 
##                                             4 
##                                         oblig 
##                                             4 
##                              jm7netnoteinccom 
##                                             4 
##                                           abl 
##                                             5 
##                                           ago 
##                                             5 
##                                          fail 
##                                             5 
##                          localhostlocaldomain 
##                                             5 
##                                        author 
##                                             5 
##                                        detail 
##                                             5 
##                                          edit 
##                                             5 
##                                          half 
##                                             5 
##                                        except 
##                                             5 
##                              xmimeautoconvert 
##                                             5 
##                                           bad 
##                                             5 
##                                          cell 
##                                             5 
##                                       current 
##                                             5 
##                                       exercis 
##                                             5 
##                                         figur 
##                                             5 
##                                        honest 
##                                             5 
##                                         howev 
##                                             5 
##                                         known 
##                                             5 
##                                          mean 
##                                             5 
##                                        result 
##                                             5 
##                                          wish 
##                                             5 
##                                        mimeol 
##                                             5 
##                                         sfnet 
##                                             5 
##                                  spamassassin 
##                                             5 
##                                     X81258125 
##                                             5 
##                                        looney 
##                                             5 
##                                           fix 
##                                             5 
##                                          full 
##                                             5 
##                                     function. 
##                                             5 
##                                          less 
##                                             5 
##                                         X2011 
##                                             5 
##                                   X6416122236 
##                                             5 
##                                      advertis 
##                                             5 
##                      httpxentcompipermailfork 
##                                             5 
##                                         khare 
##                                             5 
##                                   lairxentcom 
##                                             5 
##           mailtoforkrequestxentcomsubjecthelp 
##                                             5 
##       mailtoforkrequestxentcomsubjectsubscrib 
##                                             5 
##     mailtoforkrequestxentcomsubjectunsubscrib 
##                                             5 
##                mailtoforkspamassassintaintorg 
##                                             5 
##                                         rohit 
##                                             5 
##                                           act 
##                                             5 
##                                       general 
##                                             5 
##                                        employ 
##                                             5 
##                                         enjoy 
##                                             5 
##                                         next. 
##                                             5 
##                                        wonder 
##                                             5 
##                                           zip 
##                                             5 
##                                          stop 
##                                             5 
##                                     X81128112 
##                                             5 
##                                        namebr 
##                                             5 
##                                          neat 
##                                             5 
##                                        payabl 
##                                             5 
##                                          valu 
##                                             5 
##                                         X1000 
##                                             5 
##                                       absolut 
##                                             5 
##                                        assist 
##                                             5 
##                                      guarante 
##                                             5 
##                                           inc 
##                                             5 
##                                     X10000000 
##                                             5 
##                                          serv 
##                                             5 
##                                         along 
##                                             5 
##                                          sinc 
##                                             6 
##                                       content 
##                                             6 
##                                        martin 
##                                             6 
##                                     technolog 
##                                             6 
##                                        govern 
##                                             6 
##                                         offic 
##                                             6 
##                                        outsid 
##                                             6 
##                                           red 
##                                             6 
##                                      approach 
##                                             6 
##                                          firm 
##                                             6 
##                                          nice 
##                                             6 
##                                         owner 
##                                             6 
##                                        prefer 
##                                             6 
##                                         guess 
##                                             6 
##                                         resid 
##                                             6 
##                                         extra 
##                                             6 
##                                       freedom 
##                                             6 
##                                          past 
##                                             6 
##                                         spend 
##                                             6 
##                                         style 
##                                             6 
##                                          turn 
##                                             6 
##                                           via 
##                                             6 
##                                       xmimeol 
##                                             6 
##                                         allow 
##                                             6 
##                                    formatflow 
##                                             6 
##                                         updat 
##                                             6 
##                                       everyth 
##                                             6 
##                                          john 
##                                             6 
##                                           iii 
##                                             6 
##                                          leav 
##                                             6 
##                                           top 
##                                             6 
##                                       qualiti 
##                                             6 
##                                         typic 
##                                             6 
##                                        comput 
##                                             6 
##                                        pictur 
##                                             6 
##                                          step 
##                                             6 
##                                        server 
##                                             6 
##                                          unix 
##                                             6 
##                                        packag 
##                                             6 
##                                        welcom 
##                                             6 
##                                        easier 
##                                             6 
##                                 X213105180140 
##                                             6 
##                                     addressbr 
##                                             6 
##                                          inbr 
##                                             6 
##                     merchantsworld2001junocom 
##                                             6 
##                                        postal 
##                                             6 
##                                        return 
##                                             6 
##                                         minut 
##                                             6 
##                                        corpor 
##                                             6 
##                                         herea 
##                                             6 
##                                          head 
##                                             6 
##                                           eir 
##                                             6 
##                                       payment 
##                                             6 
##                                     uncollect 
##                                             6 
##                                        either 
##                                             6 
##                                        intern 
##                                             6 
##                       recipientsnetnoteinccom 
##                                             6 
##                                     undisclos 
##                                             6 
##                                       develop 
##                                             7 
##                                          issu 
##                                             7 
##                                          plan 
##                                             7 
##                                   X6621866218 
##                                             7 
##                                        august 
##                                             7 
##                                       continu 
##                                             7 
##                                           got 
##                                             7 
##                            mta3grpscdyahoocom 
##                                             7 
##                                          wont 
##                                             7 
##                                           ive 
##                                             7 
##                                     mozilla50 
##                                             7 
##                                          quit 
##                                             7 
##                                xacceptlanguag 
##                                             7 
##                                           end 
##                                             7 
##                                           fun 
##                                             7 
##                                          pass 
##                                             7 
##                                        promis 
##                                             7 
##                                         quick 
##                                             7 
##                                          soon 
##                                             7 
##                                        apolog 
##                                             7 
##                                         found 
##                                             7 
##                                          main 
##                                             7 
##                                          deal 
##                                             7 
##                                          fact 
##                                             7 
##                                          open 
##                                             7 
##                                        releas 
##                                             7 
##                                           old 
##                                             7 
##                                           set 
##                                             7 
##                                           box 
##                                             7 
##                                          X200 
##                                             7 
##                                           add 
##                                             7 
##                                          case 
##                                             7 
##                                          play 
##                                             7 
##                                          seek 
##                                             7 
##                                     deliveryd 
##                                             7 
##                                          fill 
##                                             7 
##                                      xkeyword 
##                                             7 
##                                           pay 
##                                             7 
##                                         stock 
##                                             7 
##                                         alway 
##                                             7 
##                                         creat 
##                                             8 
##                                  xyahooprofil 
##                                             8 
##                                          away 
##                                             8 
##                                          move 
##                                             8 
##                                        within 
##                                             8 
##                                          X300 
##                                             8 
##                                           bst 
##                                             8 
##                                        realli 
##                                             8 
##                                          life 
##                                             8 
##                                       possibl 
##                                             8 
##                                         anyon 
##                                             8 
##                                          face 
##                                             8 
##                                          hard 
##                                             8 
##                                         learn 
##                                             8 
##                                         power 
##                                             8 
##                                      transfer 
##                                             8 
##                                     X331vamm2 
##                                             8 
##                                         exist 
##                                             8 
##                      uswsflist2sourceforgenet 
##                                             8 
##                                          idea 
##                                             8 
##                  zzzzilugspamassassintaintorg 
##                                             8 
##                                      signatur 
##                                             8 
##                                           let 
##                                             8 
##                                         repli 
##                                             8 
##                                          base 
##                                             8 
##                                         bonus 
##                                             8 
##                                        famili 
##                                             8 
##                                          keep 
##                                             8 
##                                         legal 
##                                             8 
##                                           net 
##                                             8 
##                                       financi 
##                                             8 
##                                         bfont 
##                                             8 
##                                         match 
##                                             8 
##                                           usa 
##                                             8 
##                                          X100 
##                                             8 
##                                        select 
##                                             8 
##                                           win 
##                                             8 
##                                         X0800 
##                                             8 
##                                         error 
##                                             9 
##                                        happen 
##                                             9 
##                                       mercuri 
##                                             9 
##                                         still 
##                                             9 
##                                           say 
##                                             9 
##                                          noth 
##                                             9 
##                                        famous 
##                                             9 
##                                           big 
##                                             9 
##                                          diet 
##                                             9 
##                                         dream 
##                                             9 
##                                        expens 
##                                             9 
##                                          feel 
##                                             9 
##                                          made 
##                                             9 
##                                        secret 
##                                             9 
##                                          seem 
##                                             9 
##                                        produc 
##                                             9 
##                                          spam 
##                                             9 
##                                  X19412514545 
##                                             9 
##                                        friend 
##                                             9 
##                             rootlughtuathaorg 
##                                             9 
##                           xauthenticationwarn 
##                                             9 
##                                           etc 
##                                             9 
##                                          area 
##                                             9 
##                                        direct 
##                                             9 
##                                        effort 
##                                             9 
##                                         least 
##                                             9 
##                                          dell 
##                                             9 
##                                        import 
##                                             9 
##                                         trade 
##                                             9 
##                                          wait 
##                                             9 
##                                          bank 
##                                             9 
##                                         there 
##                                             9 
##                                        follow 
##                                             9 
##                                          sale 
##                                             9 
##                                   jmjmasonorg 
##                                             9 
##                                      texthtml 
##                                             9 
##                                       foreign 
##                                             9 
##                                          plus 
##                                             9 
##                                         place 
##                                             9 
##                                       size3d2 
##                                             9 
##                                          cant 
##                                            10 
##                                           hit 
##                                            10 
##                                     inreplyto 
##                                            10 
##                                          line 
##                                            10 
##                                          note 
##                                            10 
##                                         think 
##                                            10 
##                        forteanayahoogroupscom 
##                                            10 
##                                          high 
##                                            10 
##                                         claim 
##                                            10 
##                                          sign 
##                                            10 
##                                        effect 
##                                            10 
##                                         manag 
##                                            10 
##                                        better 
##                                            10 
##                                          youv 
##                                            10 
##                                         chanc 
##                                            10 
##                                         might 
##                                            10 
##                                         share 
##                                            10 
##                                        requir 
##                                            10 
##                                          X500 
##                                            10 
##                                         other 
##                                            10 
##                                          side 
##                                            10 
##                                         futur 
##                                            10 
##                                           bit 
##                                            10 
##                                       complet 
##                                            10 
##                                       exchang 
##                                            10 
##                                          citi 
##                                            10 
##                                        doesnt 
##                                            10 
##                                          back 
##                                            10 
##                                   jmlocalhost 
##                                            10 
##                                          debt 
##                                            10 
##                                           fax 
##                                            10 
##                                          safe 
##                                            10 
##                                          loss 
##                                            10 
##                                       purchas 
##                                            10 
##                                          rate 
##                                            10 
##                                       size3d3 
##                                            10 
##                                         court 
##                                            10 
##                                         chris 
##                                            11 
##                                          code 
##                                            11 
##                                          that 
##                                            11 
##                                          dvds 
##                                            11 
##                                           egp 
##                                            11 
##                   forteanaowneryahoogroupscom 
##                                            11 
##                      httpdocsyahoocominfoterm 
##                                            11 
## httpusclickyahoocompt6ybbnxieaamg3haa7gsolbtm 
##                                            11 
##                                      mail8101 
##                                            11 
##                                   mailinglist 
##                                            11 
##      mailtozzzzteanaunsubscribeyahoogroupscom 
##                                            11 
##                                          qmqp 
##                                            11 
##                                 xapparentlyto 
##                                            11 
##                                xegroupsreturn 
##                                            11 
##                                       outlook 
##                                            11 
##                                          talk 
##                                            11 
##                                          last 
##                                            11 
##                                       softwar 
##                                            11 
##                                       instead 
##                                            11 
##                                         never 
##                                            11 
##                                         organ 
##                                            11 
##                                          seen 
##                                            11 
##                                          cost 
##                                            11 
##                                        fortun 
##                                            11 
##                                           see 
##                                            11 
##                                        someth 
##                                            11 
##                                       special 
##                                            11 
##                                           yes 
##                                            11 
##                                        debian 
##                                            11 
##                                        script 
##                                            11 
##                                         great 
##                                            11 
##             httpwwwlinuxiemailmanlistinfoilug 
##                                            11 
##                              listmasterlinuxi 
##                                            11 
##                                       problem 
##                                            11 
##                                   unsubscript 
##                                            11 
##                                         anoth 
##                                            11 
##                                          bill 
##                                            11 
##                                   forkxentcom 
##                                            11 
##                                          paid 
##                                            11 
##                                       potenti 
##                                            11 
##                                        someon 
##                                            11 
##                                          save 
##                                            11 
##                                           sat 
##                                            11 
##                                       success 
##                                            11 
##                                         reach 
##                                            12 
##                                        featur 
##                                            12 
##                 forteanaunsubscribeegroupscom 
##                                            12 
##                                       xsender 
##                                            12 
##                                         avail 
##                                            12 
##                                         build 
##                                            12 
##                                          said 
##                                            12 
##                                           lot 
##                                            12 
##                                           low 
##                                            12 
##                                         X8bit 
##                                            12 
##                                         wrote 
##                                            12 
##                                          give 
##                                            12 
##                                        machin 
##                                            12 
##                                        origin 
##                                            12 
##                                         write 
##                                            12 
##                                         X0200 
##                                            12 
##                                         thank 
##                                            12 
##                                          lugh 
##                                            12 
##                                 rootlocalhost 
##                                            12 
##                                         asset 
##                                            12 
##                                          must 
##                                            12 
##                                          sure 
##                                            12 
##                      mandarklabsnetnoteinccom 
##                                            12 
##                                       protect 
##                                            12 
##                                         price 
##                                            12 
##                                          cash 
##                                            12 
##                             align3dcenterfont 
##                                            12 
##                                          meta 
##                                            12 
##                                        winner 
##                                            12 
##                                          come 
##                                            13 
##                                      listarch 
##                                            13 
##                                      listhelp 
##                                            13 
##                                      listpost 
##                                            13 
##                                  listsubscrib 
##                                            13 
##                                          part 
##                                            13 
##                                          pick 
##                                            13 
##                                         refer 
##                                            13 
##                                          post 
##                                            13 
##                                     zzzzteana 
##                                            13 
##                                       alreadi 
##                                            13 
##                                          ever 
##                                            13 
##                                           ask 
##                                            13 
##                                        userag 
##                                            13 
##                               charsetiso88591 
##                                            13 
##                                        believ 
##                                            13 
##                                           fit 
##                                            13 
##                                          unit 
##                                            13 
##                                        differ 
##                                            13 
##                                         choos 
##                                            13 
##                                       countri 
##                                            13 
##                                        experi 
##                                            13 
##                                         onlin 
##                                            13 
##                                           pdt 
##                                            13 
##                                       support 
##                                            13 
##                                          oper 
##                                            13 
##                                          long 
##                                            13 
##                                        dollar 
##                                            13 
##                                       billion 
##                                            13 
##                                         train 
##                                            13 
##                                       express 
##                                            14 
##                                         nnfmp 
##                                            14 
##                                        appear 
##                                            14 
##                                          exim 
##                                            14 
##                                         secur 
##                                            14 
##                                         X2000 
##                                            14 
##                                           web 
##                                            14 
##                                         world 
##                                            14 
##                                   quotedprint 
##                                            14 
##                                          sent 
##                                            14 
##                                         start 
##                                            14 
##                                      maintain 
##                                            14 
##                                        market 
##                                            14 
##                                         visit 
##                                            14 
##                                         X0500 
##                                            15 
##                                          form 
##                                            15 
##                                          helo 
##                                            15 
##                                         invok 
##                                            15 
##                                         qmail 
##                                            15 
##                                         didnt 
##                                            15 
##                      forkspamassassintaintorg 
##                                            15 
##                httpxentcommailmanlistinfofork 
##                                            15 
##                                           per 
##                                            15 
##                                       xentcom 
##                                            15 
##                                          real 
##                                            15 
##                                        system 
##                                            15 
##                                          amaz 
##                                            15 
##                                           run 
##                                            16 
##                                        person 
##                                            16 
##                                           put 
##                                            16 
##                                           tri 
##                                            16 
##                                         becom 
##                                            16 
##                                        provid 
##                                            16 
##                                        though 
##                                            16 
##                                          week 
##                                            16 
##                                          file 
##                                            16 
##                                          ilug 
##                                            16 
##                                          tell 
##                                            16 
##                                       control 
##                                            16 
##                                        suppli 
##                                            16 
##                                      fontfont 
##                                            16 
##                                        associ 
##                                            16 
##                                        judici 
##                                            16 
##                                          join 
##                                            17 
##                                       sponsor 
##                                            17 
##                      zzzzspamassassintaintorg 
##                                            17 
##                                          hour 
##                                            17 
##                                          name 
##                                            17 
##                                        number 
##                                            17 
##                                         incom 
##                                            17 
##                                      opportun 
##                                            17 
##                                         color 
##                                            17 
##                                          lose 
##                                            17 
##                                          size 
##                                            17 
##                                         tbodi 
##                                            17 
##                                       collect 
##                                            17 
##                             mailnetnoteinccom 
##                                            17 
##                                       version 
##                                            18 
##                                    unsubscrib 
##                                            18 
##                                        report 
##                                            18 
##                                       account 
##                                            18 
##                                          good 
##                                            18 
##                                          live 
##                                            18 
##                                          read 
##                                            18 
##                                  xmsmailprior 
##                                            18 
##                                          host 
##                                            18 
##                                          know 
##                                            18 
##                                      question 
##                                            18 
##                                        answer 
##                                            18 
##                                           div 
##                                            18 
##                                        size3d 
##                                            18 
##                                         offer 
##                                            19 
##                                         phone 
##                                            19 
##                                       product 
##                                            19 
##                                         today 
##                                            20 
##                                       without 
##                                            20 
##                                     xprioriti 
##                                            20 
##                                          home 
##                                            20 
##                                           way 
##                                            20 
##                                          mani 
##                                            20 
##                              forkadminxentcom 
##                                            20 
##                                          ship 
##                                            20 
##                                        websit 
##                                            20 
##                             yyyynetnoteinccom 
##                                            20 
##                                       network 
##                                            21 
##                                          well 
##                                            21 
##                                       xmailer 
##                                            21 
##                                          earn 
##                                            21 
##                                          even 
##                                            21 
##                                         irish 
##                                            21 
##                                       process 
##                                            21 
##                               jmnetnoteinccom 
##                                            21 
##                                        listid 
##                                            22 
##                                     xbeenther 
##                                            22 
##                                  xmailmanvers 
##                                            22 
##                                         right 
##                                            22 
##                                        design 
##                                            22 
##                                          tabl 
##                                            22 
##                                       X893893 
##                                            22 
##                                        solari 
##                                            22 
##                                charsetusascii 
##                                            23 
##                                      errorsto 
##                                            23 
##                                       replyto 
##                                            23 
##                                          find 
##                                            23 
##                                          dont 
##                                            23 
##                                         remov 
##                                            23 
##                                         click 
##                                            23 
##                                       program 
##                                            23 
##                                         cours 
##                                            23 
##                                listunsubscrib 
##                                            24 
##                                         X7bit 
##                                            25 
##                                       contact 
##                                            25 
##                                        messag 
##                                            25 
##                                          link 
##                                            25 
##                                         check 
##                                            25 
##                                        sender 
##                                            26 
##                                        window 
##                                            26 
##                                          also 
##                                            26 
##                                          need 
##                                            26 
##                                          take 
##                                            26 
##                                         everi 
##                                            27 
##                                         yahoo 
##                                            27 
##                                          enus 
##                                            27 
##                                           fat 
##                                            27 
##                                         money 
##                                            27 
##                                      interest 
##                                            27 
##                                          user 
##                                            27 
##                                       compani 
##                                            27 
##                                         state 
##                                            27 
##                                        servic 
##                                            28 
##                                     microsoft 
##                                            28 
##                                       million 
##                                            28 
##                                          much 
##                                            28 
##                                        normal 
##                                            29 
##                                          page 
##                                            29 
##                    zzzzlocalhostnetnoteinccom 
##                                            30 
##                                          call 
##                                            30 
##                                        includ 
##                                            30 
##                                          card 
##                                            30 
##                                          busi 
##                                            30 
##                                        center 
##                                            30 
##                                           wed 
##                                            31 
##                                           fri 
##                                            31 
##                                         phobo 
##                                            32 
##                                         first 
##                                            32 
##                                          help 
##                                            32 
##                                          html 
##                                            32 
##                                        preced 
##                                            33 
##                                           jul 
##                                            33 
##                                          bulk 
##                                            34 
##                                          imap 
##                                            34 
##                                     textplain 
##                                            34 
##                                      internet 
##                                            34 
##                                         order 
##                                            34 
##                                 color3d000000 
##                                            34 
##                                     face3dari 
##                                            34 
##                                  fetchmail590 
##                                            35 
##                       phoboslabsnetnoteinccom 
##                                            35 
##                                    singledrop 
##                                            35 
##                                         month 
##                                            35 
##                                           mon 
##                                            35 
##                                          send 
##                                            36 
##                                       unknown 
##                                            36 
##                                          look 
##                                            36 
##                               ilugadminlinuxi 
##                                            36 
##                                           day 
##                                            37 
##                                           sun 
##                                            38 
##                                          bodi 
##                                            38 
##                                         peopl 
##                                            38 
##                                 lughtuathaorg 
##                                            38 
##                                        credit 
##                                            38 
##                                          want 
##                                            39 
##                                          year 
##                                            39 
##                                           tue 
##                                            40 
##                                         pleas 
##                                            41 
##                                        inform 
##                                            42 
##                          contenttransferencod 
##                                            43 
##                                          just 
##                                            43 
##                                       address 
##                                            44 
##                                           edt 
##                                            45 
##                                      judgment 
##                                            45 
##                                         X0700 
##                                            46 
##                                           now 
##                                            47 
##                                    contenttyp 
##                                            48 
##                             dogmaslashnullorg 
##                                            50 
##                                         linux 
##                                            50 
##                                          make 
##                                            51 
##                                           ist 
##                                            53 
##                                      mimevers 
##                                            53 
##                                          like 
##                                            54 
##                                           get 
##                                            55 
##                                          free 
##                                            55 
##                                          work 
##                                            56 
##                                    iluglinuxi 
##                                            56 
##                                           jun 
##                                            56 
##                                          time 
##                                            57 
##                       zzzzteanayahoogroupscom 
##                                            57 
##                                     X81168116 
##                                            59 
##                                    returnpath 
##                                            59 
##                                          smtp 
##                                            60 
##                                 zzzzlocalhost 
##                                            60 
##                                     messageid 
##                                            61 
##                                          mail 
##                                            62 
##                                         group 
##                                            62 
##                                           may 
##                                            62 
##                                           new 
##                                            64 
##                                           one 
##                                            64 
##                                           use 
##                                            64 
##                                         X0400 
##                                            65 
##                                          date 
##                                            66 
##                                   deliveredto 
##                                            70 
##                                         email 
##                                            75 
##                                       postfix 
##                                            76 
##                                         X2001 
##                                            77 
##                                          list 
##                                            78 
##                                         X0000 
##                                            82 
##                                          will 
##                                            87 
##                                       subject 
##                                            88 
##                                       X127001 
##                                            93 
##                                        option 
##                                           100 
##                                           can 
##                                           101 
##                                          brbr 
##                                           106 
##                                          font 
##                                           111 
##                                     localhost 
##                                           118 
##                                         esmtp 
##                                           157 
##                                         X0100 
##                                           158 
##                                           thu 
##                                           247 
##                                           aug 
##                                           341 
##                                        receiv 
##                                           367 
##                                         X2002 
##                                           402
#dim(tdm_text_Sparse)
#head(tdm_text_Sparse)
#str(tdm_text_Sparse)

# Copy the 0/1 spam label from the raw training data onto the term
# data frame as its `Spam` column
tdm_text_Sparse$Spam <- raw_train_data$Spam
#head(tdm_text_Sparse)
#str(tdm_text_Sparse)

# Easy_Ham terms: per-column totals over the ham rows (Spam == 0).
# sort() is ascending, so head() prints the six smallest totals.
# NOTE(review): if the most frequent ham terms were intended, this
# would need a descending sort -- confirm.
tdm_text_Sparse %>%
  subset(Spam == 0) %>%
  colSums() %>%
  sort() %>%
  head()
##  accept    busi   click  easier financi  follow 
##       0       0       0       0       0       0
# Spam terms: per-column totals over the spam rows (Spam == 1).
# sort() is ascending, so head() prints the six smallest totals.
# NOTE(review): if the most frequent spam terms were intended, this
# would need a descending sort -- confirm.
tdm_text_Sparse %>%
  subset(Spam == 1) %>%
  colSums() %>%
  sort() %>%
  head()
##                chris            inreplyto localhostlocaldomain 
##                    0                    0                    0 
##              mercuri               search        zzzzlocalhost 
##                    0                    0                    0

Classification of text

(1) Method1: Train data set and test data set

# Method 1: sequential (non-random) split of the term data frame, which is
# expected to hold 60 rows (30 ham + 30 spam files).
# The first 42 rows are used for training.
train <- head(tdm_text_Sparse, 42)
# The remaining rows are used for testing. tail() with a negative n drops the
# first 42 rows, so `test` can never overlap `train`. The previous
# head(tdm_text_Sparse, -18) dropped the LAST 18 rows instead, which made
# `test` the same 42 rows as `train`.
# NOTE(review): rows are taken in their existing order; if the data is grouped
# by class, the two sets will have different ham/spam mixes -- confirm.
test <- tail(tdm_text_Sparse, -42)

(2) Method2: Train data set and test data set

# Seed the RNG so the random partition below is reproducible
set.seed(2802)

# caTools supplies sample.split()
library(caTools)

# Draw a logical row mask from the Spam label with a 0.7 split ratio:
# TRUE rows go to the training set, FALSE rows to the test set
spl <- sample.split(tdm_text_Sparse$Spam, SplitRatio = 0.7, group = NULL)
# Both partitions are stored as matrices, so columns must be addressed by
# name or index (e.g. train[, "Spam"]) rather than with `$`
train <- as.matrix(subset(tdm_text_Sparse, spl))
test <- as.matrix(subset(tdm_text_Sparse, !spl))

Classification of text by Naive Bayes Model

#library(e1071)
#model <- naiveBayes(class ~ ., data=as.matrix(train))
#class(model)
#preds <- predict(model, newdata=test)

#Accuracy
#conf_matrix <- table(preds, test$Spam)

Classification of text by SVM

#library(RTextTools)
#container <- create_container(as.numeric(tdm_text_Sparse), tdm_text_Sparse$text, trainSize=1:42, testSize=43:60,virgin=FALSE)
#models <- train_models(container, algorithms=c("MAXENT", "SVM"))
#results <- classify_models(container, models)