For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: http://spamassassin.apache.org/old/publiccorpus/
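The analysis below assumes the following packages are loaded; the setup chunk is not shown in the original, so treat this as a sketch of the required libraries:

library(tm)         # corpus handling and text cleaning
library(tidytext)   # tidy() method for document-term matrices
library(dplyr)      # data_frame(), pipes, and summarising
library(ggplot2)    # term-frequency plot
library(knitr)      # kable() tables
library(RTextTools) # create_container(), train_model(), classify_model()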
# read the ham and spam messages from local directories (paths are machine-specific)
ham.corpus <- VCorpus(DirSource('C:\\NITEEN\\CUNY\\Spring 2018\\DATA 607\\project4\\easy_ham'))
spam.corpus <- VCorpus(DirSource('C:\\NITEEN\\CUNY\\Spring 2018\\DATA 607\\project4\\spam_2'))
Exploring the corpus to view its content and metadata information.
text_df <- data_frame(text = ham.corpus[1])
text_df
## # A tibble: 1 x 1
## text
## <S3: VCorpus>
## 1 "list(list(content = c(\"From exmh-workers-admin@redhat.com Thu Aug 22~
## 1 list()
## 1 list()
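A more direct way to peek at a single message is tm's content() accessor (a small sketch; output omitted):

writeLines(head(content(ham.corpus[[1]])))  # first few lines of the first ham message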
print(ham.corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 2551
inspect(ham.corpus[1:4])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 4
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 4850
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 3281
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 3830
##
## [[4]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 3335
Combining the ‘spam’ and ‘ham’ corpora together, tagging each document’s type in the metadata, and preparing the combined corpus for further cleaning.
meta(spam.corpus, tag = "type") <- "spam"
meta(ham.corpus, tag = "type") <- "ham"
corpus_clean <- c(spam.corpus, ham.corpus)
# replace invalid UTF-8 bytes so later transformations do not fail
corpus_clean <- tm_map(corpus_clean, content_transformer(function(x) iconv(x, "UTF-8", sub="byte")))
Converting the entire corpus content to lower case.
corpus_clean <- tm_map(corpus_clean, content_transformer(tolower))
#as.character(corpus_clean[[1]])
Pre-processing the text data (corpus cleaning) with basic tm functions: removing numbers, stop words, and punctuation, then stripping extra whitespace.
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords("english"))
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, stripWhitespace)
as.character(corpus_clean[[1]])
## [1] " ilugadminlinuxie tue aug "
## [2] "returnpath ilugadminlinuxie"
## [3] "delivered yyyylocalhostnetnoteinccom"
## [4] "received localhost localhost "
## [5] " phoboslabsnetnoteinccom postfix esmtp id efdd"
## [6] " jmlocalhost tue aug edt"
## [7] "received phobos "
## [8] " localhost imap fetchmail"
## [9] " jmlocalhost singledrop tue aug ist"
## [10] "received lughtuathaorg rootlughtuathaorg "
## [11] " dogmaslashnullorg esmtp id glqwv "
## [12] " jmilugjmasonorg fri aug "
## [13] "received lugh rootlocalhost lughtuathaorg"
## [14] " esmtp id waa fri aug "
## [15] "received bettyjagessarcom wznycnydslcncnet"
## [16] " lughtuathaorg esmtp id waa "
## [17] " iluglinuxie fri aug "
## [18] "xauthenticationwarning lughtuathaorg host wznycnydslcncnet"
## [19] " claimed bettyjagessarcom"
## [20] "received bettyjagessarcom"
## [21] " smtpd eval id aafcf fri aug "
## [22] "messageid "
## [23] "date fri aug "
## [24] " iluglinuxie"
## [25] " start now startnowhotmailcom"
## [26] "mimeversion "
## [27] "contenttype textplain charsetusascii formatflowed"
## [28] "subject ilug stop mlm insanity"
## [29] "sender ilugadminlinuxie"
## [30] "errors ilugadminlinuxie"
## [31] "xmailmanversion "
## [32] "precedence bulk"
## [33] "listid irish linux users group iluglinuxie"
## [34] "xbeenthere iluglinuxie"
## [35] ""
## [36] "greetings"
## [37] ""
## [38] " receiving letter expressed interest "
## [39] "receiving information online business opportunities "
## [40] "erroneous please accept sincere apology onetime "
## [41] "mailing removal necessary"
## [42] ""
## [43] " burned betrayed backstabbed multilevel marketing "
## [44] "mlm please read letter important one "
## [45] " ever landed inbox"
## [46] ""
## [47] "multilevel marketing huge mistake people"
## [48] ""
## [49] "mlm failed deliver promises past years pursuit "
## [50] " mlm dream cost hundreds thousands people friends "
## [51] " fortunes sacred honor fact mlm fatally "
## [52] "flawed meaning work people"
## [53] ""
## [54] " companies earn big money mlm going "
## [55] "tell real story finally someone courage "
## [56] "cut hype lies tell truth mlm"
## [57] ""
## [58] " good news"
## [59] ""
## [60] " alternative mlm works works big yet "
## [61] "abandoned dreams need see earning kind income "
## [62] " dreamed easier think"
## [63] ""
## [64] " permission like send brief letter will tell "
## [65] " mlm work people will introduce "
## [66] "something new refreshing wonder heard "
## [67] " "
## [68] ""
## [69] " promise will unwanted follow sales pitch one "
## [70] "will call email address will used send "
## [71] "information period"
## [72] ""
## [73] " receive free lifechanging information simply click reply type "
## [74] "send info subject box hit send get information "
## [75] "within hours just look words mlm wall shame inbox"
## [76] ""
## [77] "cordially"
## [78] ""
## [79] "siddhi"
## [80] ""
## [81] "ps someone recently sent letter "
## [82] "eyeopening financially beneficial information ever received "
## [83] "honestly believe will feel way read "
## [84] " free"
## [85] ""
## [86] ""
## [87] ""
## [88] " email never sent unsolicited spam receiving "
## [89] " email explicitly signed list "
## [90] "online signup form use ffa links page emaildom "
## [91] "systems explicit terms use state use "
## [92] " agree receive emailings may also member altra "
## [93] "computer systems list one many numerous free marketing services "
## [94] " agreed signed list also "
## [95] "receiving emailing"
## [96] "due email message considered unsolicitated "
## [97] "spam"
## [98] ""
## [99] ""
## [100] ""
## [101] ""
## [102] ""
## [103] " "
## [104] "irish linux users group iluglinuxie"
## [105] "httpwwwlinuxiemailmanlistinfoilug unsubscription information"
## [106] "list maintainer listmasterlinuxie"
## [107] ""
## [108] ""
Storing the pre-processed text data in a document-term matrix (DTM).
dtm <- DocumentTermMatrix(corpus_clean)
dtm
## <<DocumentTermMatrix (documents: 3948, terms: 94217)>>
## Non-/sparse entries: 665088/371303628
## Sparsity : 100%
## Maximal term length: 868
## Weighting : term frequency (tf)
# extract the "spam"/"ham" type tag for every document
meta_type <- as.vector(unlist(meta(corpus_clean)))
meta_data <- data.frame(type = unlist(meta_type))
table(meta_data)
## meta_data
## ham spam
## 2551 1397
inspect(corpus_clean[1:2])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 1
## Content: documents: 2
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 2663
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 3223
corpus_clean[[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 2663
#dtm <- removeSparseTerms(dtm, 1-(10/length(corpus_clean)))
#dtm
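For reference, enabling the commented-out call above would keep only the terms that appear in at least roughly 10 of the 3,948 documents, shrinking the matrix considerably. A minimal sketch (dtm.small is a hypothetical name):

# keep terms appearing in at least ~10 documents
dtm.small <- removeSparseTerms(dtm, 1 - (10 / length(corpus_clean)))
dtm.small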
Using the tidytext library to tidy the DTM into one row per document-term pair, then arranging the rows by descending count.
corpus.tidy <- tidy(dtm)
head(corpus.tidy)
## # A tibble: 6 x 3
## document term count
## <chr> <chr> <dbl>
## 1 00001.317e78fa8ee2f54cd4890fdc09ba8176 aafcf 1.
## 2 00001.317e78fa8ee2f54cd4890fdc09ba8176 abandoned 1.
## 3 00001.317e78fa8ee2f54cd4890fdc09ba8176 accept 1.
## 4 00001.317e78fa8ee2f54cd4890fdc09ba8176 address 1.
## 5 00001.317e78fa8ee2f54cd4890fdc09ba8176 agree 1.
## 6 00001.317e78fa8ee2f54cd4890fdc09ba8176 agreed 1.
corpus.tidy.sort <- corpus.tidy %>%
  arrange(desc(count))
kable(head(corpus.tidy.sort))
| document | term | count |
|---|---|---|
| 00028.60393e49c90f750226bee6381eb3e69d | arial | 272 |
| 00028.60393e49c90f750226bee6381eb3e69d | colorfont | 270 |
| 00028.60393e49c90f750226bee6381eb3e69d | faceverdana | 270 |
| 00028.60393e49c90f750226bee6381eb3e69d | geneva | 270 |
| 00028.60393e49c90f750226bee6381eb3e69d | helvetica | 270 |
| 00028.60393e49c90f750226bee6381eb3e69d | sansseriffont | 270 |
inspect(dtm)
## <<DocumentTermMatrix (documents: 3948, terms: 94217)>>
## Non-/sparse entries: 665088/371303628
## Sparsity : 100%
## Maximal term length: 868
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs aug esmtp jmlocalhost localhost
## 00028.60393e49c90f750226bee6381eb3e69d 0 2 0 0
## 00051.8b17ce16ace4d5845e2299c0123e1f14 0 2 0 0
## 01083.a6b3c50be5abf782b585995d2c11176b 0 4 0 1
## 01094.91779ec04e5e6b27e84297c28fc7369f 0 1 2 3
## 0320.6c54ea1bb991c6fae395588219cfce37 0 5 0 4
## 0627.c9ad8730dad7bda1e1169ee00c4006fc 0 5 2 5
## 0730.9570ee3b6bf144198297b23bca5044e9 0 5 2 4
## 0737.aa298505cb31aac78d0dbf229fc45fb9 0 4 2 3
## 0826.082e92a79a15aa7f6dd5b85a40327abd 0 5 2 3
## 1022.73ab70b91862d709897cfe3dd5bb22a0 0 4 2 3
## Terms
## Docs mon oct postfix received sep thu
## 00028.60393e49c90f750226bee6381eb3e69d 4 0 2 2 0 0
## 00051.8b17ce16ace4d5845e2299c0123e1f14 0 0 2 2 0 0
## 01083.a6b3c50be5abf782b585995d2c11176b 0 0 0 8 0 9
## 01094.91779ec04e5e6b27e84297c28fc7369f 0 0 1 7 0 1
## 0320.6c54ea1bb991c6fae395588219cfce37 0 8 3 6 0 0
## 0627.c9ad8730dad7bda1e1169ee00c4006fc 0 0 3 6 8 8
## 0730.9570ee3b6bf144198297b23bca5044e9 0 0 3 7 8 0
## 0737.aa298505cb31aac78d0dbf229fc45fb9 0 0 3 7 9 0
## 0826.082e92a79a15aa7f6dd5b85a40327abd 0 0 3 6 8 0
## 1022.73ab70b91862d709897cfe3dd5bb22a0 0 9 3 6 0 4
Finding the top terms in the tidied data by total frequency across all documents.
term.frequency <- corpus.tidy.sort %>%
  select(term, count) %>%
  group_by(term) %>%
  summarise(termFrequency = sum(count)) %>%
  arrange(desc(termFrequency))
kable(head(term.frequency))
| term | termFrequency |
|---|---|
| received | 20576 |
| esmtp | 11848 |
| sep | 10013 |
| localhost | 9501 |
| aug | 6467 |
| mon | 5842 |
# bar chart of terms with total frequency above 4000
ggplot(data = filter(term.frequency, termFrequency > 4000), aes(x = term, y = termFrequency)) +
  geom_bar(stat = "identity", aes(fill = termFrequency)) +
  geom_text(aes(label = termFrequency), vjust = -0.2) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 65, hjust = 1), legend.position = 'none')
Preparing the training and test datasets.
# Total dataset size
totalSize <- length(meta_type)
totalSize
## [1] 3948
# 70% training dataset
traindata.size <-(round(length(meta_type)*.7))
traindata.size
## [1] 2764
# 30% test dataset
paste0('test data point starts from ',round(length(meta_type)*.7)+1, ' and ends at ',totalSize)
## [1] "test data point starts from 2765 and ends at 3948"
# Build an RTextTools container holding the train/test split for model fitting
data.train.test.container <- create_container(dtm, labels = meta_type,
                                              trainSize = 1:traindata.size,
                                              testSize = (traindata.size + 1):totalSize,
                                              virgin = FALSE)
slotNames(data.train.test.container)
## [1] "training_matrix" "classification_matrix" "training_codes"
## [4] "testing_codes" "column_names" "virgin"
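One caveat: because spam.corpus and ham.corpus were concatenated in order, this index-based 70/30 split is not random, and the test range 2765:3948 appears to fall entirely within the ham documents. A minimal sketch of a shuffled split, assuming the objects above (the seed and the *.shuffled names are my own):

set.seed(607)                             # assumed seed for reproducibility
shuffle.idx <- sample(length(meta_type))  # random permutation of document indices
dtm.shuffled <- dtm[shuffle.idx, ]        # reorder DTM rows
labels.shuffled <- meta_type[shuffle.idx] # reorder labels to match
container.shuffled <- create_container(dtm.shuffled, labels = labels.shuffled,
                                       trainSize = 1:traindata.size,
                                       testSize = (traindata.size + 1):totalSize,
                                       virgin = FALSE)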
svm.model <- train_model(data.train.test.container, "SVM")           # train an SVM classifier
svm.predict <- classify_model(data.train.test.container, svm.model)  # predict labels for the test range
data.label.svm <- data.frame(
  correct_label = meta_type[2765:3948],  # true labels for the test range
  svm = as.character(svm.predict[, 1]),  # predicted label column from classify_model()
  stringsAsFactors = F)
table(data.label.svm[,1] == data.label.svm[,2])
##
## FALSE TRUE
## 136 1048
prop.table(table(data.label.svm[,1] == data.label.svm[,2]))
##
## FALSE TRUE
## 0.1148649 0.8851351
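A per-class confusion table gives a finer-grained view than the overall accuracy above (a small sketch using the data frame already built):

table(actual = data.label.svm$correct_label, predicted = data.label.svm$svm)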
With roughly 88.5% of the held-out documents classified correctly, the SVM appears to be an effective classifier for this text dataset.