It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/.
# Load the libraries
lib.list <- c("RCurl",
"DT",
"dplyr",
"tidyr",
"wordcloud",
"ggplot2",
"data.table",
"RMySQL",
"tidyverse",
"stringr",
"ggplot2",
"wordcloud",
"tm", #text mining https://cran.r-project.org/web/packages/tm/vignettes/tm.pdf
"RTextTools"
)
# Load all libraries at once
lapply(lib.list, require, character.only = TRUE)
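Note that require() only loads packages that are already installed, returning FALSE otherwise. A minimal sketch, assuming CRAN access, that installs any missing packages first:

# Install any packages from lib.list that are not yet installed
missing.pkgs <- lib.list[!(lib.list %in% installed.packages()[, "Package"])]
if (length(missing.pkgs) > 0) install.packages(missing.pkgs)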
setwd("C:\\Users\\PURA\\Dropbox\\CUNY\\607Data\\Assignments\\project04")
wdir <- getwd()
# Download and extract the corpora if they are not already present
if (!dir.exists("easy_ham")){
download.file(url = "http://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2",
destfile = "20021010_easy_ham.tar.bz2")
untar("20021010_easy_ham.tar.bz2",compressed = "bzip2")
}
ham.files = list.files(path = "easy_ham",full.names = TRUE)
if (!dir.exists("spam_2")){
download.file(url = "http://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2",
destfile = "20050311_spam_2.tar.bz2")
untar("20050311_spam_2.tar.bz2", compressed = "bzip2")
}
spam.files = list.files(path = "spam_2", full.names = TRUE)

# Read the ham directory and get the list of file names
dir <- paste(wdir,"easy_ham",sep="/")
ham.File.Names = list.files(dir)
ham.File.Path <- paste(dir, ham.File.Names, sep="/")
ham.body.df <- c()
# Read all files in a DF
for (i in ham.File.Path){
#con <- file(i, open='r')
text <- readLines(i)
ham.body<- list(paste(text, collapse="\n"))
ham.body.df = c(ham.body.df,ham.body)
}
#
ham.df <- c()
ham.df <- as.data.frame(unlist(ham.body.df))
names(ham.df) <- c("body")
ham.df$filename <- unlist(ham.File.Names)
ham.df$type <- "ham"dir <- paste(wdir,"spam_2",sep="/")
spam.File.Names = list.files(dir)
spam.File.Path <- paste(dir,spam.File.Names,sep="/")
spam.body.df <- c()
# Read all files in a DF
for (i in spam.File.Path){
#con <- file(i, open='r')
text <- readLines(i)
spam.body<- list(paste(text, collapse="\n"))
spam.body.df = c(spam.body.df,spam.body)
}
#
spam.df <- c()
spam.df <- as.data.frame(unlist(spam.body.df))
names(spam.df) <- c("body")
spam.df$filename <- unlist(spam.File.Names)
spam.df$type <- "spam"spam.ham.df <- rbind(spam.df, ham.df)# use the #tm to mine the text
# Use the tm package to mine the text
# Documentation: https://cran.r-project.org/web/packages/tm/vignettes/tm.pdf
spam.Ham.Corpus <- VCorpus(VectorSource(spam.ham.df$body))
clean.Email.Corpus <- tm_map(spam.Ham.Corpus, content_transformer(tolower))
clean.Email.Corpus <- tm_map(clean.Email.Corpus, removeNumbers)
clean.Email.Corpus <- tm_map(clean.Email.Corpus, removePunctuation)
clean.Email.Corpus <- tm_map(clean.Email.Corpus, stripWhitespace)
clean.Email.Corpus <- tm_map(clean.Email.Corpus, removeWords, stopwords("english"))
#clean.Email.Corpus <- tm_map(clean.Email.Corpus, stemDocument)
email.Dtm <- DocumentTermMatrix(clean.Email.Corpus)
email.Dtm <- removeSparseTerms(email.Dtm, 1-(10/length(clean.Email.Corpus)))
email.Tdm <- TermDocumentMatrix(clean.Email.Corpus)
email.Tdm <- removeSparseTerms(email.Tdm, 1-(10/length(clean.Email.Corpus)))
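Before building the word clouds, it can help to sanity-check the size of the pruned matrix. A quick sketch using tm's inspect(); the exact dimensions depend on the corpus snapshot:

# Documents x retained terms after removeSparseTerms
dim(email.Dtm)
# Peek at a small corner of the document-term matrix
inspect(email.Dtm[1:5, 1:5])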
spam_indices <- which(spam.ham.df$type == "spam")
suppressWarnings(wordcloud(clean.Email.Corpus[spam_indices], min.freq=100))
ham_indices <- which(spam.ham.df$type == "ham")
suppressWarnings(wordcloud(clean.Email.Corpus[ham_indices], min.freq=100))

## STEP 5. Short Analysis of the Existing Data

The word clouds above show the words that are predominantly present in ham and spam emails. We can also use the tm package for further analysis, as shown below.
findFreqTerms(email.Dtm, 500)

## [1] "address"
## [2] "aligncenter"
## [3] "aligndcenter"
## [4] "aligndcenterfont"
## [5] "also"
## [6] "arial"
## [7] "aug"
## [8] "available"
## [9] "back"
## [10] "best"
## [11] "bit"
## [12] "body"
## [13] "border"
## [14] "borderd"
## [15] "build"
## [16] "bulk"
## [17] "business"
## [18] "call"
## [19] "can"
## [20] "cdt"
## [21] "cellpadding"
## [22] "cellpaddingd"
## [23] "cellspacing"
## [24] "cellspacingd"
## [25] "center"
## [26] "change"
## [27] "charsetiso"
## [28] "charsetusascii"
## [29] "check"
## [30] "clean"
## [31] "click"
## [32] "color"
## [33] "colord"
## [34] "colordff"
## [35] "companies"
## [36] "company"
## [37] "contenttransferencoding"
## [38] "contenttype"
## [39] "credit"
## [40] "date"
## [41] "day"
## [42] "debian"
## [43] "deliveredto"
## [44] "deliverydate"
## [45] "discussion"
## [46] "div"
## [47] "dogmaslashnullorg"
## [48] "dont"
## [49] "easy"
## [50] "edt"
## [51] "egwn"
## [52] "egwnnet"
## [53] "email"
## [54] "encodingutf"
## [55] "end"
## [56] "envelopefrom"
## [57] "errorsto"
## [58] "esmtp"
## [59] "even"
## [60] "every"
## [61] "exim"
## [62] "exmh"
## [63] "facearial"
## [64] "facedarial"
## [65] "facedverdana"
## [66] "faceverdana"
## [67] "fetchmail"
## [68] "file"
## [69] "find"
## [70] "first"
## [71] "font"
## [72] "fontfont"
## [73] "forkadminxentcom"
## [74] "forkexamplecom"
## [75] "forkxentcom"
## [76] "form"
## [77] "found"
## [78] "free"
## [79] "fri"
## [80] "friends"
## [81] "get"
## [82] "good"
## [83] "government"
## [84] "group"
## [85] "head"
## [86] "height"
## [87] "heightd"
## [88] "help"
## [89] "helvetica"
## [90] "hits"
## [91] "home"
## [92] "hqpronsnet"
## [93] "html"
## [94] "httplistsfreshrpmsnetmailmanlistinforpmzzzlist"
## [95] "httpxentcommailmanlistinfofork"
## [96] "httpxentcompipermailfork"
## [97] "iluglinuxie"
## [98] "imap"
## [99] "img"
## [100] "information"
## [101] "input"
## [102] "inreplyto"
## [103] "internet"
## [104] "intmxcorpexamplecom"
## [105] "invoked"
## [106] "ist"
## [107] "ive"
## [108] "jalapeno"
## [109] "jmasonorg"
## [110] "jmjmasonorg"
## [111] "jmlocalhost"
## [112] "jmnetnoteinccom"
## [113] "jul"
## [114] "jun"
## [115] "just"
## [116] "khare"
## [117] "know"
## [118] "lairxentcom"
## [119] "life"
## [120] "like"
## [121] "line"
## [122] "link"
## [123] "linux"
## [124] "list"
## [125] "listarchive"
## [126] "listhelp"
## [127] "listid"
## [128] "listmanexamplecom"
## [129] "listpost"
## [130] "listsubscribe"
## [131] "listunsubscribe"
## [132] "localhost"
## [133] "locustmindernet"
## [134] "long"
## [135] "look"
## [136] "made"
## [137] "mail"
## [138] "mailing"
## [139] "mailtoforkexamplecom"
## [140] "mailtoforkrequestxentcomsubjecthelp"
## [141] "mailtoforkrequestxentcomsubjectsubscribe"
## [142] "mailtoforkrequestxentcomsubjectunsubscribe"
## [143] "make"
## [144] "mandarklabsnetnoteinccom"
## [145] "many"
## [146] "marketing"
## [147] "may"
## [148] "message"
## [149] "messageid"
## [150] "meta"
## [151] "microsoft"
## [152] "million"
## [153] "mimeversion"
## [154] "mon"
## [155] "money"
## [156] "much"
## [157] "name"
## [158] "need"
## [159] "network"
## [160] "never"
## [161] "new"
## [162] "normal"
## [163] "now"
## [164] "number"
## [165] "oct"
## [166] "offer"
## [167] "one"
## [168] "online"
## [169] "option"
## [170] "order"
## [171] "outlook"
## [172] "pdt"
## [173] "people"
## [174] "per"
## [175] "pfont"
## [176] "phobos"
## [177] "phoboslabsnetnoteinccom"
## [178] "phone"
## [179] "please"
## [180] "postfix"
## [181] "precedence"
## [182] "problem"
## [183] "program"
## [184] "qmail"
## [185] "quotedprintable"
## [186] "really"
## [187] "receive"
## [188] "received"
## [189] "references"
## [190] "remove"
## [191] "removed"
## [192] "replyto"
## [193] "report"
## [194] "required"
## [195] "returnpath"
## [196] "right"
## [197] "rohit"
## [198] "rpm"
## [199] "rpmlistfreshrpmsnet"
## [200] "rpmzzzlistadminfreshrpmsnet"
## [201] "rpmzzzlistfreshrpmsnet"
## [202] "rssfeedsexamplecom"
## [203] "rssfeedsjmasonorg"
## [204] "said"
## [205] "sansserif"
## [206] "sat"
## [207] "say"
## [208] "see"
## [209] "send"
## [210] "sender"
## [211] "sent"
## [212] "sep"
## [213] "service"
## [214] "sfnet"
## [215] "since"
## [216] "singledrop"
## [217] "site"
## [218] "size"
## [219] "sized"
## [220] "smtp"
## [221] "software"
## [222] "something"
## [223] "spam"
## [224] "states"
## [225] "still"
## [226] "subject"
## [227] "sun"
## [228] "system"
## [229] "table"
## [230] "take"
## [231] "technology"
## [232] "texthtml"
## [233] "textplain"
## [234] "thats"
## [235] "think"
## [236] "thu"
## [237] "time"
## [238] "times"
## [239] "today"
## [240] "tue"
## [241] "two"
## [242] "unknown"
## [243] "url"
## [244] "use"
## [245] "used"
## [246] "users"
## [247] "using"
## [248] "uswsflistsourceforgenet"
## [249] "vamm"
## [250] "version"
## [251] "versioncvs"
## [252] "want"
## [253] "way"
## [254] "web"
## [255] "webnotenet"
## [256] "wed"
## [257] "well"
## [258] "width"
## [259] "widthd"
## [260] "will"
## [261] "within"
## [262] "without"
## [263] "work"
## [264] "world"
## [265] "wrote"
## [266] "xbeenthere"
## [267] "xentcom"
## [268] "xkeywords"
## [269] "xmailer"
## [270] "xmailmanversion"
## [271] "xmsmailpriority"
## [272] "xoriginaldate"
## [273] "xpriority"
## [274] "xspamlevel"
## [275] "xspamstatus"
## [276] "year"
## [277] "years"
## [278] "yyyyexamplecom"
## [279] "yyyylocalhostexamplecom"
## [280] "yyyylocalhostnetnoteinccom"
## [281] "zzzzlocalhost"
set.seed(123)
train.size <- floor(0.70 * nrow(spam.ham.df))
train.size
## [1] 2763
train.Index <- sample(seq_len(nrow(spam.ham.df)), size = train.size)
train.Spam.Ham <- spam.ham.df[train.Index, ]
test.Spam.Ham <- spam.ham.df[-train.Index, ]
# count of spam and ham in train data set
spam<-subset(train.Spam.Ham,train.Spam.Ham$type == "spam")
ham<-subset(train.Spam.Ham,train.Spam.Ham$type == "ham")
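To actually see the class balance of the training split, a quick check (the counts depend on the random seed and corpus snapshot):

# Number of ham and spam messages in the training set
table(train.Spam.Ham$type)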
# Create corpus for training and test data
train.Email.Corpus <- Corpus(VectorSource(train.Spam.Ham$body))
test.Email.Corpus <- Corpus(VectorSource(test.Spam.Ham$body))
train.Clean.Corpus <- tm_map(train.Email.Corpus ,removeNumbers)
train.Clean.Corpus <- tm_map(train.Clean.Corpus, removePunctuation)
train.Clean.Corpus <- tm_map(train.Clean.Corpus, removeWords, stopwords())
train.Clean.Corpus<- tm_map(train.Clean.Corpus, stripWhitespace)
test.Clean.Corpus <- tm_map(test.Email.Corpus, removeNumbers)
test.Clean.Corpus <- tm_map(test.Clean.Corpus, removePunctuation)
test.Clean.Corpus <- tm_map(test.Clean.Corpus, removeWords, stopwords())
test.Clean.Corpus<- tm_map(test.Clean.Corpus, stripWhitespace)
train_email_dtm <- DocumentTermMatrix(train.Clean.Corpus)
test_email_dtm <- DocumentTermMatrix(test.Clean.Corpus)
# Convert raw term counts to a binary presence/absence factor ("Yes"/"No"),
# the representation expected by Bernoulli-style naive Bayes
convert_count <- function(x) {
  y <- ifelse(x > 0, 1, 0)
  y <- factor(y, levels=c(0,1), labels=c("No", "Yes"))
  y
}
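For example, applied to a vector of counts:

convert_count(c(0, 2, 5))
## [1] No  Yes Yes
## Levels: No Yes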
train_sms <- apply(train_email_dtm, 2, convert_count)
test_sms <- apply(test_email_dtm, 2, convert_count)
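One caveat: because the test DTM is built independently, its columns will not generally match the training DTM's vocabulary. A sketch that rebuilds the test matrix restricted to the terms seen in training, using tm's dictionary control option:

# Keep only terms the classifier saw during training so feature names align
test_email_dtm <- DocumentTermMatrix(test.Clean.Corpus,
                                     control = list(dictionary = Terms(train_email_dtm)))
test_sms <- apply(test_email_dtm, 2, convert_count)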
library(e1071)
# classification of email
classifier <- naiveBayes(train_sms, factor(train.Spam.Ham$type))
test_pred <- predict(classifier, newdata=test_sms)
table(test_pred, test.Spam.Ham$type)

##
## test_pred ham spam
##      ham  770   16
##      spam   1  398
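From this confusion matrix the overall accuracy follows directly; a short sketch:

# Accuracy = correctly classified / total test documents
conf.mat <- table(test_pred, test.Spam.Ham$type)
sum(diag(conf.mat)) / sum(conf.mat)
## [1] 0.985654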
## Conclusion - Classification

We saw how naive Bayes can be used to predict ham vs. spam with high accuracy (about 98.6% on the held-out test set above). We could similarly try other algorithms such as SVM, Random Forest, or maximum entropy (MaxEnt) models; a sketch follows.
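For instance, a minimal SVM sketch with e1071 on the same term matrices (assuming the test DTM has been restricted to the training vocabulary as noted earlier; on a corpus this size it may be slow or memory-hungry):

# Linear-kernel SVM on raw term counts
svm.model <- svm(as.matrix(train_email_dtm), factor(train.Spam.Ham$type),
                 kernel = "linear")
svm.pred <- predict(svm.model, as.matrix(test_email_dtm))
table(svm.pred, test.Spam.Ham$type)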
My computer, despite having 16 GB of RAM plus a GPU, kept running out of memory. This made the project a bit painful, as I kept waiting for hours only to watch RStudio crash. I wish the dataset had been a bit more manageable so that we could experiment more.

Professor, I would also appreciate it if you could teach us how to spin up instances on AWS to run this kind of code. I would love that!