library(tidyverse)
library(tidyr)
library(dplyr)
library(stringr)
library(tidytext)
library(tm)
library(SnowballC)
library(ggplot2)
library(wordcloud)
library(caret)
library(gbm)
library(e1071)
library(SparseM)
library(caTools)
library(randomForest)
library(tree)
library(ipred)
library(glmnet)
library(tau)
library(devtools)
library(quanteda)
Our Project Team 4 (Banu Boopalan, Samuel Kigamba, James Mundy, Alain T Kuiete) will submit two RPubs documents (an RPubs link is provided by each team member). This is the first document, covering the first model. In this document we perform data transformations, exploratory data analysis, and visualizations using word clouds and word-frequency plots, then fit a Naive Bayes model and report its confusion-matrix results. We tried to visualize the fitted model with plot and mosaicplot but were not able to produce a useful plot of the model; this requires further understanding. Within the modeling step we create a document-term matrix, split the data into training and test sets, run the model, and report summary statistics. Each team member will report a different accuracy because each reads in their own files.
In the second document (RPubs link provided separately) we perform the same data transformations, exploratory data analysis, word clouds, and word-frequency plots, then fit an SVM model and report its confusion-matrix results. We tried to plot that model as well but were not successful: the number of support vectors is large, so we would have to dig deeper into how to represent the model graphically, for example through plot or the kernlab package. Within that model we create document-term and term-document matrices, split the data into training and test sets, and report model summaries. As with Naive Bayes, each teammate's reported SVM accuracy differs because each reads in their own files from their own directory. On first review, the SVM reported higher accuracy than Naive Bayes.
Collaboration was via PowerPoint, GitHub, and GoToMeeting, along with weekly meetings on Tuesdays and Fridays.
ham.dir="C:\\DATA607\\Project4\\spamHam\\20021010_easy_ham (1).tar\\easy_ham"
ham.file.names = list.files(ham.dir)
# List of docs
ham.docs <- ham.file.names[1]
for(i in 2:length(ham.file.names))
{
filepath<-paste0(ham.dir, "/", ham.file.names[i])
text <-readLines(filepath)
list1<- list(paste(text, collapse="\n"))
ham.docs = c(ham.docs,list1)
}senders <- unlist(str_extract(ham.docs[2], "(?<name>[\\w.-]+)\\@(?<domain>[-\\w+\\.\\w+]+)(\\.\\w+)?"))
for (i in 3:length(ham.docs)) {
s <- unlist(str_extract(ham.docs[i],"(?<name>[\\w.-]+)\\@(?<domain>[-\\w+\\.\\w+]+)(\\.\\w+)?"))
senders <- c(senders, s)
}
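An aside on style: str_extract() is already vectorized, so the grow-a-vector-in-a-loop pattern above can be collapsed into a single call. A minimal sketch, assuming ham.docs as built above (element 1 is dropped because it holds only a file name):

# One vectorized call replaces the for loop; unlist() flattens the list of
# messages into a character vector first.
email.pattern <- "(?<name>[\\w.-]+)\\@(?<domain>[-\\w+\\.\\w+]+)(\\.\\w+)?"
senders <- str_extract(unlist(ham.docs[-1]), email.pattern)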
summary(senders)
## Length Class Mode
## 2550 character character
## [1] "Steve_Burt@cursor-system.com" "timc@2ubh.com"
# sender.df: a tibble of the extracted ham sender addresses (one row per message)
sender.df %>%
  group_by(email) %>%
  summarise(n = n()) %>%
  top_n(10) %>%
  mutate(email = reorder(email, n)) %>%
  ggplot(aes(email, n, fill = email)) +
  geom_col(show.legend = FALSE) +
  labs(y = "Most Frequent Senders",
       x = NULL) +
  coord_flip()
## Selecting by n
emails <- unlist(str_extract_all(ham.docs[2],"(?<name>[\\w.-]+)\\@(?<domain>[-\\w+\\.\\w+]+)(\\.\\w+)?"))
for (i in 3:length(ham.docs)) {
s <- unlist(str_extract_all(ham.docs[i],"(?<name>[\\w.-]+)\\@(?<domain>[-\\w+\\.\\w+]+)(\\.\\w+)?"))
emails <- c(emails, s)
}
summary(emails)
## Length Class Mode
## 46500 character character
len <- nchar(emails[1])
for (i in 2:length(emails)) {
len <-c(len, nchar(emails[i]))
}
ham.emails <- tibble(mail = 1:length(emails), emails, len)
head(ham.emails, 2)
## # A tibble: 2 x 3
## mail emails len
## <int> <chr> <int>
## 1 1 Steve_Burt@cursor-system.com 28
## 2 2 Steve_Burt@cursor-system.com 28
ham.emails %>%
  group_by(emails) %>%
  summarise(n = n()) %>%
  top_n(20) %>%
  mutate(emails = reorder(emails, n)) %>%
  ggplot(aes(emails, n, fill = emails)) +
  geom_col(show.legend = FALSE) +
  labs(y = "Most Frequent emails",
       x = NULL) +
  coord_flip()
## Selecting by n
ham.block <- ham.list %>%
  unnest_tokens(word, text) %>%
  group_by(files) %>%
  mutate(n = n()) %>%
  ungroup()
head(ham.block, 2)
## # A tibble: 2 x 3
## files word n
## <int> <chr> <int>
## 1 1 0001 2
## 2 1 ea7e79d3153e7469e7a9c3e0af6a357e 2
## Warning in bind_tf_idf.data.frame(., word, files, n): A value for tf_idf is negative:
## Input should have exactly one row per document-term combination.
## # A tibble: 2 x 6
## files word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 1 ea7e79d3153e7469e7a9c3e0af6a357e 2 0.5 7.84 3.92
## 2 1 0001 2 0.5 4.71 2.35
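The warning above ("Input should have exactly one row per document-term combination") appears because ham.block keeps one row per token occurrence rather than one row per (file, word) pair. A minimal sketch of the counting approach that avoids the warning, assuming ham.list holds one row of raw text per file:

# Count each word once per file before computing tf-idf; bind_tf_idf()
# expects exactly one row per document-term combination.
ham.block <- ham.list %>%
  unnest_tokens(word, text) %>%
  count(files, word, name = "n") %>%
  bind_tf_idf(word, files, n)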
We keep only the words with an IDF greater than 0 and remove words containing numbers.
ham.block2 <- ham.block %>%
  filter(idf > 0, str_detect(word, "([^\\d.+\\w.+\\.\\,.+]+?)")) %>%
  arrange(desc(tf_idf))
head(ham.block2, 2)
## # A tibble: 2 x 6
## files word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 1795 laptop's 60 0.0167 6.46 0.108
## 2 1792 neale's 108 0.00926 7.15 0.0662
## # A tibble: 4 x 6
## files word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 1795 laptop's 60 0.0167 6.46 0.108
## 2 1300 laptop's 620 0.00161 6.46 0.0104
## 3 1336 laptop's 645 0.00155 6.46 0.0100
## 4 1301 laptop's 826 0.00121 6.46 0.00782
ham.block2 %>%
  arrange(desc(tf_idf)) %>%
  top_n(20) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  ggplot(aes(word, tf_idf, fill = files)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf", title = "Most Relevant Words in the Message Bodies") +
  coord_flip()
## Selecting by tf_idf
spam.dir="C:\\DATA607\\Project4\\spamHam\\spam4\\spam_2"
spam.file.names = list.files(spam.dir)
# List of docs
spam.docs <- spam.file.names[1]
for(i in 2:length(spam.file.names))
{
filepath<-paste0(spam.dir, "\\", spam.file.names[i])
text <-readLines(filepath)
l<- list(paste(text, collapse="\n"))
spam.docs = c(spam.docs,l)
}
## Warning in bind_tf_idf.data.frame(., word, block, n): A value for tf_idf is negative:
## Input should have exactly one row per document-term combination.
## # A tibble: 6 x 6
## block word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 1 00001.317e78fa8ee2f54cd4890fdc09ba8176 1 1 6.14 6.14
## 2 805 4.21.157.32 109 0.00917 7.24 0.0664
## 3 805 g6l6w9415993 109 0.00917 7.24 0.0664
## 4 805 1027225826.1122 109 0.00917 7.24 0.0664
## 5 805 winnereritmugu 109 0.00917 7.24 0.0664
## 6 805 winnergkrsvyyyyl 109 0.00917 7.24 0.0664
spam.block2 <- spam.block %>%
  filter(idf > 0, str_detect(word, "([^\\d.+\\w.+\\.\\,.+]+?)")) %>%
  arrange(desc(tf_idf))
head(spam.block2)
## # A tibble: 6 x 6
## block word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 743 luke's 127 0.00787 7.24 0.0570
## 2 58 mailto:angie_pepi 192 0.00521 7.24 0.0377
## 3 382 car's 195 0.00513 7.24 0.0371
## 4 996 mailto:remove_me123 196 0.00510 7.24 0.0369
## 5 536 ident:nobody 125 0.008 4.53 0.0363
## 6 362 mailto:bluejo 202 0.00495 7.24 0.0359
spam.senders <- unlist(str_extract(spam.docs[2], "(?<name>[\\w.-]+)\\@(?<domain>[-\\w+\\.\\w+]+)(\\.\\w+)?"))
for (i in 3:length(spam.docs)) {
s <- unlist(str_extract(spam.docs[i],"(?<name>[\\w.-]+)\\@(?<domain>[-\\w+\\.\\w+]+)(\\.\\w+)?"))
spam.senders <- c(spam.senders, s)
}
summary(spam.senders)
## Length Class Mode
## 1396 character character
## [1] "lmrn@mailexcite.com" "amknight@mailexcite.com"
## [3] "jordan23@mailexcite.com" "merchantsworld2001@juno.com"
## [5] "cypherpunks-forward@ds.pro-ns.net" "sales@outsrc-em.com"
spam.email.len <- nchar(spam.senders[1])
for (i in 2:length(spam.senders)) {
spam.email.len <-c(spam.email.len,nchar(spam.senders[i]))
}
spam.sender.df <- tibble(email=spam.senders, len=spam.email.len)
head(spam.sender.df)
## # A tibble: 6 x 2
## email len
## <chr> <int>
## 1 lmrn@mailexcite.com 19
## 2 amknight@mailexcite.com 23
## 3 jordan23@mailexcite.com 23
## 4 merchantsworld2001@juno.com 27
## 5 cypherpunks-forward@ds.pro-ns.net 33
## 6 sales@outsrc-em.com 19
spam.sender.df %>%
  group_by(email) %>%
  summarise(n = n()) %>%
  top_n(10) %>%
  mutate(email = reorder(email, n)) %>%
  ggplot(aes(email, n, fill = email)) +
  geom_col(show.legend = FALSE) +
  labs(y = "Most Frequent Senders",
       x = NULL) +
  coord_flip()
## Selecting by n
spam.emails <- unlist(str_extract_all(spam.docs[2],"(?<name>[\\w.-]+)\\@(?<domain>[-\\w+\\.\\w+]+)(\\.\\w+)?"))
for (i in 3:length(spam.docs)) {
s <- unlist(str_extract_all(spam.docs[i],"(?<name>[\\w.-]+)\\@(?<domain>[-\\w+\\.\\w+]+)(\\.\\w+)?"))
spam.emails <- c(spam.emails, s)
}
summary(spam.emails)
## Length Class Mode
## 22103 character character
len <- nchar(spam.emails[1])
for (i in 2:length(spam.emails)) {
len <-c(len, nchar(spam.emails[i]))
}
spam.emails <- tibble(mail = 1:length(spam.emails), spam.emails, len)
head(spam.emails)
## # A tibble: 6 x 3
## mail spam.emails len
## <int> <chr> <int>
## 1 1 lmrn@mailexcite.com 19
## 2 2 merchantsworld2001@juno.com 27
## 3 3 jm@jmason.org 13
## 4 4 jm@netnoteinc.com 17
## 5 5 B0000178595@203.129.205.5.205.129.203.in-addr.arpa 50
## 6 6 B0000178595@203.129.205.5.205.129.203.in-addr.arpa 50
spam.emails %>%
  group_by(spam.emails) %>%
  summarise(n = n()) %>%
  top_n(20) %>%
  mutate(spam.emails = reorder(spam.emails, n)) %>%
  ggplot(aes(spam.emails, n, fill = spam.emails)) +
  geom_col(show.legend = FALSE) +
  labs(y = "Most Frequent emails",
       x = NULL) +
  coord_flip()
## Selecting by n
spam.block2 %>%
  top_n(10) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  mutate(block = reorder(block, tf_idf)) %>%
  arrange(desc(tf_idf)) %>%
  ggplot(aes(word, tf_idf, fill = block)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf", title = "Most Relevant Words in the Bodies of Spam Email") +
  coord_flip()
## Selecting by tf_idf
We create a helper function that can loop through any directory of documents and build a corpus from it. This way we avoid duplicating the reading code for each and every set of documents we need to process.
# Read every file in a directory and wrap the contents in a tm VCorpus
to_VCorpus <- function(file_path) {
  corpus <- file_path %>%
    paste(., list.files(.), sep = "/") %>%
    lapply(readLines) %>%
    VectorSource() %>%
    VCorpus()
  return(corpus)
}
# Standard tm cleaning pipeline: strip numbers and punctuation, lowercase,
# remove English stopwords, collapse whitespace, and stem
docmnt_clean <- function(corpus) {
  corpus <- corpus %>%
    tm_map(removeNumbers) %>%
    tm_map(removePunctuation) %>%
    tm_map(tolower) %>%
    tm_map(PlainTextDocument) %>%  # re-wrap after tolower() returns plain text
    tm_map(removeWords, stopwords("en")) %>%
    tm_map(stripWhitespace) %>%
    tm_map(stemDocument)
  return(corpus)
}
# Attach a metadata tag (e.g. emails = "ham") to every document in a corpus
addTag <- function(corpus, tag, value) {
  for (i in 1:length(corpus)) {
    meta(corpus[[i]], tag) <- value
  }
  return(corpus)
}

# Ham
Ham_Corpus <- ham.dir %>%
  to_VCorpus %>%
  docmnt_clean %>%
  addTag(tag = "emails", value = "ham")
inspect(Ham_Corpus[1:5])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 5
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 8
## Content: chars: 2818
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 8
## Content: chars: 1925
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 8
## Content: chars: 2214
##
## [[4]]
## <<PlainTextDocument>>
## Metadata: 8
## Content: chars: 1904
##
## [[5]]
## <<PlainTextDocument>>
## Metadata: 8
## Content: chars: 2708
# Spam
Spam_Corpus <- spam.dir %>%
  to_VCorpus %>%
  docmnt_clean %>%
  addTag(tag = "emails", value = "spam")
inspect(Spam_Corpus[1:5])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 5
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 8
## Content: chars: 2334
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 8
## Content: chars: 2926
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 8
## Content: chars: 3602
##
## [[4]]
## <<PlainTextDocument>>
## Metadata: 8
## Content: chars: 3675
##
## [[5]]
## <<PlainTextDocument>>
## Metadata: 8
## Content: chars: 2183
## (Cleaned-corpus output truncated: the stemmed contents of the first two
## ham messages, which are mostly header tokens plus the stemmed bodies.)
## steveburtcursorsystemcom thu aug
## returnpath steveburtcursorsystemcom
## deliveredto zzzzlocalhostnetnoteinccom
## receiv localhost localhost
## ...
## subject zzzzteana re alexand
## ...
## martin post
## tasso papadopoulo greek sculptor behind plan judg
## limeston mount kerdylio mile east salonika far
## ...
## martinsrvemsedacuk thu aug
## returnpath martinsrvemsedacuk
## ...
## subject zzzzteana playboy want go bang
## ...
## scotsman august
## playboy want go bang
## ...
# TermDocumentMatrix for the ham corpus
docs <- Corpus(VectorSource(Ham_Corpus))
dtm1 <- TermDocumentMatrix(docs)
m <- as.matrix(dtm1)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
head(d, 10)
## word freq
## "", "", 33612
## character(0), character(0), 15306
## "receiv "receiv 14285
## esmtp esmtp 6534
## mon mon 5775
## sep", sep", 5337
## 17, 17, 5102
## ist", ist", 4382
## sep sep 4218
## "jmlocalhost "jmlocalhost 4136
## Classes 'tbl_df', 'tbl' and 'data.frame': 48553 obs. of 3 variables:
## $ term : chr "\"\")," "\"\"," "\"\\023c\\024" "\"aa" ...
## $ document: chr "1" "1" "1" "1" ...
## $ count : num 2486 33612 1 10 1 ...
# Join the tidied ham DTM (mydtm4) against the Bing sentiment lexicon
mydtm_sentiments4 <- slice(mydtm4, 1:60000) %>% inner_join(get_sentiments("bing"), by = c(term = "word"))
str(mydtm_sentiments4)
## Classes 'tbl_df', 'tbl' and 'data.frame': 891 obs. of 4 variables:
## $ term : chr "abolish" "abort" "abound" "absurd" ...
## $ document : chr "1" "1" "1" "1" ...
## $ count : num 1 6 3 16 19 11 2 2 3 10 ...
## $ sentiment: chr "negative" "negative" "positive" "negative" ...
docs4 <- Corpus(VectorSource(Spam_Corpus))
dtm5 <- TermDocumentMatrix(docs4)
m5 <- as.matrix(dtm5)
v5 <- sort(rowSums(m5),decreasing=TRUE)
d5 <- data.frame(word = names(v5),freq=v5)
head(d5, 10)
## word freq
## "", "", 28837
## character(0), character(0), 8382
## "tr", "tr", 6841
## "receiv "receiv 6116
## "td "td 5496
## mon mon 3230
## size size 3049
## "br", "br", 2833
## 17, 17, 2794
## esmtp esmtp 2605
mydtm_sentiments4 <- slice(mydtm4, 1:100000) %>% inner_join(get_sentiments("bing"), by = c(term = "word"))
str(mydtm_sentiments4)
## Classes 'tbl_df', 'tbl' and 'data.frame': 891 obs. of 4 variables:
## $ term : chr "abolish" "abort" "abound" "absurd" ...
## $ document : chr "1" "1" "1" "1" ...
## $ count : num 1 6 3 16 19 11 2 2 3 10 ...
## $ sentiment: chr "negative" "negative" "positive" "negative" ...
## Classes 'tbl_df', 'tbl' and 'data.frame': 77303 obs. of 3 variables:
## $ term : chr "\"\")," "\"\"," "\"aa" "\"aa\"," ...
## $ document: chr "1" "1" "1" "1" ...
## $ count : num 1186 28837 5 4 2 ...
mydtm_sentiments5 <- slice(mydtm5, 1:100000) %>% inner_join(get_sentiments("bing"), by = c(term = "word"))
str(mydtm_sentiments5)
## Classes 'tbl_df', 'tbl' and 'data.frame': 541 obs. of 4 variables:
## $ term : chr "abort" "abscond" "acclaim" "accomplish" ...
## $ document : chr "1" "1" "1" "1" ...
## $ count : num 1 1 3 3 1 2 7 27 4 1 ...
## $ sentiment: chr "negative" "negative" "positive" "positive" ...
# Side by side: create two panels to add the word clouds to
# par(mfrow = c(1, 2))
# set.seed(1234)
plot.new()
text(x = 0.5, y = 0.5, "Wordcloud using Bing Lexicon for Ham corpus")
wordcloud(words = mydtm_sentiments4$term, freq = mydtm_sentiments4$count,
          min.freq = 50, max.words = 1000, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
wordcloud(words = mydtm_sentiments5$term, freq = mydtm_sentiments5$count,
          min.freq = 50, max.words = 1000, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))

ham_DtFr = as.data.frame(unlist(Ham_Corpus), stringsAsFactors = FALSE)
ham_DtFr$type = "ham"
colnames(ham_DtFr) = c("text", "type")
spam_DtFr = as.data.frame(unlist(Spam_Corpus), stringsAsFactors = FALSE)
spam_DtFr$type = "spam"
colnames(spam_DtFr) = c("text", "type")
combined_DtFr = rbind(ham_DtFr[1:1000,], spam_DtFr[1:1000,]) # Combined data frame of both corpora
head(combined_DtFr, 10)
## text type
## 1 exmhworkersadminredhatcom thu aug ham
## 2 returnpath exmhworkersadminexamplecom ham
## 3 deliveredto zzzzlocalhostnetnoteinccom ham
## 4 receiv localhost localhost ham
## 5 phoboslabsnetnoteinccom postfix esmtp id dec ham
## 6 zzzzlocalhost thu aug edt ham
## 7 receiv phobo ham
## 8 localhost imap fetchmail ham
## 9 zzzzlocalhost singledrop thu aug ist ham
## 10 receiv listmanexamplecom listmanexamplecom ham
set.seed(100)
combined_DtFr$text[combined_DtFr$text == ""] = "NaN"
train_index = createDataPartition(combined_DtFr$type, p = 0.70, list = FALSE)
corpus_train = combined_DtFr[train_index,]
corpus_test = combined_DtFr[-train_index,]  # held-out complement of the training rows
head(corpus_train)
## text type
## 1 exmhworkersadminredhatcom thu aug ham
## 2 returnpath exmhworkersadminexamplecom ham
## 4 receiv localhost localhost ham
## 6 zzzzlocalhost thu aug edt ham
## 7 receiv phobo ham
## 9 zzzzlocalhost singledrop thu aug ist ham
head(corpus_test, 10)
## text type
## 3 deliveredto zzzzlocalhostnetnoteinccom ham
## 5 phoboslabsnetnoteinccom postfix esmtp id dec ham
## 8 localhost imap fetchmail ham
## 17 receiv intmxcorpexamplecom intmxcorpexamplecom ham
## 20 edt ham
## 22 id gmbyg exmhworkerslistmanredhatcom thu aug ham
## 25 intmxcorpredhatcom smtp id gmbyy ham
## 29 thu aug ham
## 33 receiv munnariozau localhost deltacsmuozau ham
## 35 ict ham
trainCorpus = Corpus(VectorSource(corpus_train$text))
testCorpus = Corpus(VectorSource(corpus_test$text))
train_clean_corpus <- tm_map(trainCorpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(trainCorpus, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(testCorpus, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(train_clean_corpus, removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(test_clean_corpus, removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(train_clean_corpus, removeWords,
## stopwords()): transformation drops documents
## Warning in tm_map.SimpleCorpus(test_clean_corpus, removeWords,
## stopwords()): transformation drops documents
## Warning in tm_map.SimpleCorpus(train_clean_corpus, stripWhitespace):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(test_clean_corpus, stripWhitespace):
## transformation drops documents
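The string of warnings above comes from tm_map() operating on a SimpleCorpus, which is what Corpus(VectorSource(...)) returns; the transformations still run, but document-level attributes are not preserved. A hedged sketch of one way to avoid the warnings is to build a VCorpus instead:

# VCorpus stores documents individually, so tm_map() applies transformations
# without the "transformation drops documents" warning SimpleCorpus emits.
trainCorpus <- VCorpus(VectorSource(corpus_train$text))
train_clean_corpus <- trainCorpus %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(content_transformer(tolower)) %>%  # wrap base functions for VCorpus
  tm_map(removeWords, stopwords()) %>%
  tm_map(stripWhitespace)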
# Wordcloud for train_clean_corpus
docs1 <- Corpus(VectorSource(train_clean_corpus))
dtm2 <- TermDocumentMatrix(docs1)
m <- as.matrix(dtm2)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
head(d, 10)
## word freq
## "nan", "nan", 196
## "", "", 93
## "receiv "receiv 93
## "brbr", "brbr", 63
## "br", "br", 59
## aug", aug", 57
## thu thu 51
## esmtp esmtp 27
## br", br", 24
## mail mail 23
## Classes 'tbl_df', 'tbl' and 'data.frame': 1603 obs. of 3 variables:
## $ term : chr "\"\"," "\"abandon" "\"absorb" "\"act" ...
## $ document: chr "1" "1" "1" "1" ...
## $ count : num 93 1 1 2 1 3 1 1 1 3 ...
## # A tibble: 100 x 3
## term document count
## <chr> <chr> <dbl>
## 1 "\"\"," 1 93
## 2 "\"abandon" 1 1
## 3 "\"absorb" 1 1
## 4 "\"act" 1 2
## 5 "\"ad" 1 1
## 6 "\"addressbr\"," 1 3
## 7 "\"age" 1 1
## 8 "\"agenc" 1 1
## 9 "\"agre" 1 1
## 10 "\"aid" 1 3
## # ... with 90 more rows
# Join the tidied training DTM against the Bing sentiment lexicon
mydtm_sentiments <- slice(mydtm, 1:100000) %>% inner_join(get_sentiments("bing"), by = c(term = "word"))
mydtm_sentiments
## # A tibble: 68 x 4
## term document count sentiment
## <chr> <chr> <dbl> <chr>
## 1 attack 1 3 negative
## 2 bad 1 4 negative
## 3 betray 1 1 negative
## 4 better 1 1 positive
## 5 blow 1 1 negative
## 6 bonus 1 4 positive
## 7 boost 1 2 positive
## 8 burn 1 1 negative
## 9 cold 1 1 negative
## 10 corrupt 1 1 negative
## # ... with 58 more rows
## Classes 'tbl_df', 'tbl' and 'data.frame': 68 obs. of 4 variables:
## $ term : chr "attack" "bad" "betray" "better" ...
## $ document : chr "1" "1" "1" "1" ...
## $ count : num 3 4 1 1 1 4 2 1 1 1 ...
## $ sentiment: chr "negative" "negative" "negative" "positive" ...
mydtm_sentiments %>%
count(sentiment, term, wt = count) %>%
top_n(50) %>%
ungroup() %>%
mutate(term = reorder(term, n)) %>%
ggplot(aes(term, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
## Selecting by n
# Wordcloud for test_clean_corpus
docs2 <- Corpus(VectorSource(test_clean_corpus))
dtm3 <- TermDocumentMatrix(docs2)
m3 <- as.matrix(dtm3)
v3 <- sort(rowSums(m3), decreasing = TRUE)
d3 <- data.frame(word = names(v3), freq = v3)
head(d3, 10)
## word freq
## "nan", "nan", 92
## "receiv "receiv 37
## "", "", 33
## "br", "br", 29
## aug", aug", 27
## thu thu 27
## "brbr", "brbr", 24
## esmtp esmtp 13
## mail mail 11
## "p", "p", 10
## Classes 'tbl_df', 'tbl' and 'data.frame': 896 obs. of 3 variables:
## $ term : chr "\"\"," "\"aabaabhaceadbdc\"," "\"abl" "\"absorb" ...
## $ document: chr "1" "1" "1" "1" ...
## $ count : num 33 1 1 1 1 1 1 1 1 1 ...
## # A tibble: 100 x 3
## term document count
## <chr> <chr> <dbl>
## 1 "\"\"," 1 33
## 2 "\"aabaabhaceadbdc\"," 1 1
## 3 "\"abl" 1 1
## 4 "\"absorb" 1 1
## 5 "\"add" 1 1
## 6 "\"addressbr\"," 1 1
## 7 "\"agre" 1 1
## 8 "\"aid" 1 1
## 9 "\"altern" 1 1
## 10 "\"anyon" 1 1
## # ... with 90 more rows
# Join the tidied test DTM against the Bing sentiment lexicon
mydtm_sentiments3 <- slice(mydtm3, 1:100000) %>% inner_join(get_sentiments("bing"), by = c(term = "word"))
mydtm_sentiments3
## # A tibble: 26 x 4
## term document count sentiment
## <chr> <chr> <dbl> <chr>
## 1 boost 1 2 positive
## 2 crime 1 2 negative
## 3 death 1 1 negative
## 4 debt 1 1 negative
## 5 easier 1 1 positive
## 6 enjoy 1 1 positive
## 7 fat 1 6 negative
## 8 free 1 3 positive
## 9 good 1 4 positive
## 10 ideal 1 1 positive
## # ... with 16 more rows
## Classes 'tbl_df', 'tbl' and 'data.frame': 26 obs. of 4 variables:
## $ term : chr "boost" "crime" "death" "debt" ...
## $ document : chr "1" "1" "1" "1" ...
## $ count : num 2 2 1 1 1 1 6 3 4 1 ...
## $ sentiment: chr "positive" "negative" "negative" "negative" ...
# Side by side: create two panels to add the word clouds to
# par(mfrow = c(1, 2))
plot.new()
text(x = 0.5, y = 0.5, "Wordcloud using Bing Lexicon for Train corpus")
wordcloud(words = mydtm_sentiments$term, freq = mydtm_sentiments$count, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
wordcloud(words = mydtm_sentiments3$term, freq = mydtm_sentiments3$count, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))

# Convert raw term counts to presence/absence indicators for Naive Bayes
train = apply(corpus_train_dtm, 2, convert_count)
test = apply(corpus_test_dtm, 2, convert_count)
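Neither the document-term matrices (corpus_train_dtm, corpus_test_dtm) nor the convert_count() helper are echoed in this document. Judging from the str() output below, convert_count() maps term counts to "1"/"0" strings; a plausible reconstruction, labeled as an assumption rather than the code actually run:

# Hypothetical reconstruction (this helper is not shown in the document):
# collapse term counts to presence ("1") / absence ("0"), the usual input
# for a Bernoulli-style Naive Bayes on text.
convert_count <- function(x) {
  ifelse(x > 0, "1", "0")
}

# The matrices it is applied to would be built along these lines:
corpus_train_dtm <- DocumentTermMatrix(Corpus(VectorSource(corpus_train$text)))
corpus_test_dtm  <- DocumentTermMatrix(Corpus(VectorSource(corpus_test$text)))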
str(train)
## chr [1:1400, 1:1270] "1" "0" "0" "1" "0" "1" "0" "0" "1" "0" "1" "0" ...
## - attr(*, "dimnames")=List of 2
## ..$ Docs : chr [1:1400] "1" "2" "3" "4" ...
## ..$ Terms: chr [1:1270] "aug" "exmhworkersadminredhatcom" "thu" "exmhworkersadminexamplecom" ...
str(test)
## chr [1:600, 1:732] "1" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" ...
## - attr(*, "dimnames")=List of 2
## ..$ Docs : chr [1:600] "1" "2" "3" "4" ...
## ..$ Terms: chr [1:732] "deliveredto" "zzzzlocalhostnetnoteinccom" "dec" "esmtp" ...
# Fit the Naive Bayes classifier on the training matrix and predict the test set
classifier = naiveBayes(train, factor(corpus_train$type))
pred = predict(classifier, newdata = test)
classifier$apriori
## factor(corpus_train$type)
## ham spam
## 700 700
## $aug
## aug
## factor(corpus_train$type) 0 1
## ham 0.88142857 0.11857143
## spam 0.98857143 0.01142857
##
## $exmhworkersadminredhatcom
## exmhworkersadminredhatcom
## factor(corpus_train$type) 0 1
## ham 0.998571429 0.001428571
## spam 1.000000000 0.000000000
##
## $thu
## thu
## factor(corpus_train$type) 0 1
## ham 0.927142857 0.072857143
## spam 0.997142857 0.002857143
##
## $exmhworkersadminexamplecom
## exmhworkersadminexamplecom
## factor(corpus_train$type) 0 1
## ham 0.997142857 0.002857143
## spam 1.000000000 0.000000000
##
## $returnpath
## returnpath
## factor(corpus_train$type) 0 1
## ham 0.990000000 0.010000000
## spam 0.994285714 0.005714286
##
## $localhost
## localhost
## factor(corpus_train$type) 0 1
## ham 0.977142857 0.022857143
## spam 0.994285714 0.005714286
##
## $receiv
## receiv
## factor(corpus_train$type) 0 1
## ham 0.89714286 0.10285714
## spam 0.96571429 0.03428571
##
## $edt
## edt
## factor(corpus_train$type) 0 1
## ham 0.987142857 0.012857143
## spam 0.998571429 0.001428571
##
## $zzzzlocalhost
## zzzzlocalhost
## factor(corpus_train$type) 0 1
## ham 0.98285714 0.01714286
## spam 1.00000000 0.00000000
##
## $phobo
## phobo
## factor(corpus_train$type) 0 1
## ham 0.98571429 0.01428571
## spam 1.00000000 0.00000000
##
## $ist
## ist
## factor(corpus_train$type) 0 1
## ham 0.991428571 0.008571429
## spam 0.997142857 0.002857143
##
## $singledrop
## singledrop
## factor(corpus_train$type) 0 1
## ham 0.991428571 0.008571429
## spam 0.997142857 0.002857143
##
## $listmanexamplecom
## listmanexamplecom
## factor(corpus_train$type) 0 1
## ham 0.995714286 0.004285714
## spam 1.000000000 0.000000000
##
## $dogmaslashnullorg
## dogmaslashnullorg
## factor(corpus_train$type) 0 1
## ham 0.988571429 0.011428571
## spam 0.997142857 0.002857143
##
## $esmtp
## esmtp
## factor(corpus_train$type) 0 1
## ham 0.97142857 0.02857143
## spam 0.98714286 0.01285714
## [1] "ham" "spam"
## naiveBayes.default(x = train, y = factor(corpus_train$type))
##
## pred ham spam
## ham 196 39
## spam 104 261
## Confusion Matrix and Statistics
##
## Reference
## Prediction ham spam
## ham 196 39
## spam 104 261
##
## Accuracy : 0.7617
## 95% CI : (0.7255, 0.7952)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5233
##
## Mcnemar's Test P-Value : 8.701e-08
##
## Sensitivity : 0.6533
## Specificity : 0.8700
## Pos Pred Value : 0.8340
## Neg Pred Value : 0.7151
## Prevalence : 0.5000
## Detection Rate : 0.3267
## Detection Prevalence : 0.3917
## Balanced Accuracy : 0.7617
##
## 'Positive' Class : ham
##
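Several conditional tables above contain probabilities of exactly 0 (for example $exmhworkersadminredhatcom for spam), which can distort posteriors. e1071's naiveBayes takes a laplace argument for add-one smoothing; a hedged variant worth trying:

# Laplace (add-one) smoothing keeps unseen term/class combinations from
# zeroing out the posterior probability of a class.
classifier_smooth <- naiveBayes(train, factor(corpus_train$type), laplace = 1)
pred_smooth <- predict(classifier_smooth, newdata = test)
confusionMatrix(pred_smooth, factor(corpus_test$type))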
fourfoldplot(confusion_matrix, color = c("#CC6666", "#99CC99"),
conf.level = 0, margin = 1, main = "Confusion Matrix")
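As noted in the introduction, plot() and mosaicplot() on the fitted model itself failed: naiveBayes objects carry no plot method in e1071. What can be drawn is the confusion table. A minimal sketch, assuming pred and corpus_test from above:

# Mosaic plot of predicted vs. actual classes; tile area is proportional
# to the cell count, giving a quick visual read of the confusion table.
conf_table <- table(Prediction = pred, Reference = factor(corpus_test$type))
mosaicplot(conf_table, color = c("#99CC99", "#CC6666"),
           main = "Naive Bayes: Predicted vs. Actual")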