library(tm)
library(tidytext)
library(tidyverse)
library(SnowballC)
library(wordcloud)
library(stopwords)
library(RColorBrewer)
library(e1071)
library(gmodels)
#infile.choose()
nospam_folder <- "C:\\Users\\Javern\\Documents\\Data Science MS\\DATA607\\spamnospam\\easy_ham"
ns <- list.files(path = nospam_folder)
ns <- paste(nospam_folder, "\\", ns, sep = "")
ns1 <- lapply(ns, FUN = readLines)
ns1 <- lapply(ns1, FUN = paste, collapse = "")
#Create a dataframe of non-spam files
ns2 <- data.frame(unlist(ns1), stringsAsFactors = F)
# add a score column
ns2$score <- 0
names(ns2) <- c("email", "score")
spam_folder <- "C:\\Users\\Javern\\Documents\\Data Science MS\\DATA607\\spamnospam\\spam_2"
s <- list.files(path = spam_folder)
s <- paste(spam_folder, "\\", s, sep = "")
s1 <- lapply(s, FUN = readLines)
s1 <- lapply(s1, FUN = paste, collapse = "")
#create a dataframe of spam files
s2 <- data.frame(unlist(s1), stringsAsFactors = F)
# add a score column
s2$score <- 1
names(s2) <- c("email", "score")
spamnospam <- rbind(ns2, s2)
#show dimesions
dim(spamnospam)
## [1] 3898 2
corpus <- Corpus(VectorSource(spamnospam$email))
corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 3898
#Print out the data on the 20th email
corpus[[20]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 2378
#print out the content of the 20th email
corpus[[20]][1]
## $content
## [1] "From ilug-admin@linux.ie Thu Aug 22 17:19:25 2002Return-Path: <ilug-admin@linux.ie>Delivered-To: zzzz@localhost.netnoteinc.comReceived: from localhost (localhost [127.0.0.1])\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id CD34B47C67\tfor <zzzz@localhost>; Thu, 22 Aug 2002 12:19:21 -0400 (EDT)Received: from phobos [127.0.0.1]\tby localhost with IMAP (fetchmail-5.9.0)\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 17:19:21 +0100 (IST)Received: from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7MGHJZ14177 for <zzzz-ilug@spamassassin.taint.org>; Thu, 22 Aug 2002 17:17:19 +0100Received: from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org (8.9.3/8.9.3) with ESMTP id RAA09581; Thu, 22 Aug 2002 17:16:28 +0100X-Authentication-Warning: lugh.tuatha.org: Host root@localhost [127.0.0.1] claimed to be lughReceived: from redpie.com (redpie.com [216.122.135.208] (may be forged)) by lugh.tuatha.org (8.9.3/8.9.3) with ESMTP id RAA09518 for <ilug@linux.ie>; Thu, 22 Aug 2002 17:16:08 +0100Received: from justin ([194.46.28.223]) by redpie.com (8.8.7/8.8.5) with SMTP id JAA05201 for <ilug@linux.ie>; Thu, 22 Aug 2002 09:15:59 -0700 (PDT)From: \"Kiall Mac Innes\" <kiall@redpie.com>To: \"ILUG\" <ilug@linux.ie>Date: Thu, 22 Aug 2002 17:23:15 +0100Message-Id: <BCEFLMCEIJHPCPLGADJIGEEFCAAA.kiall@redpie.com>MIME-Version: 1.0Content-Type: text/plain; charset=\"iso-8859-1\"Content-Transfer-Encoding: 7bitX-Priority: 3 (Normal)X-Msmail-Priority: NormalX-Mailer: Microsoft Outlook IMO, Build 9.0.2416 (9.0.2910.0)X-Mimeole: Produced By Microsoft MimeOLE V5.50.4522.1200Importance: NormalSubject: [ILUG] Sun Solaris..Sender: ilug-admin@linux.ieErrors-To: ilug-admin@linux.ieX-Mailman-Version: 1.1Precedence: bulkList-Id: Irish Linux Users' Group <ilug.linux.ie>X-Beenthere: ilug@linux.ieCan someone explain what type of operating system Solaris is... as ive neverseen or used it i dont know wheather to get a server from Sun or from DELL iwould prefer a linux based server and Sun seems to be the one for that butim not sure if Solaris is a distro of linux or a completely differentoperating system? can someone explain...Kiall Mac Innes-- Irish Linux Users' Group: ilug@linux.iehttp://www.linux.ie/mailman/listinfo/ilug for (un)subscription information.List maintainer: listmaster@linux.ie"
Removing special characters, stopwords, white spaces, punctuations, converting to lowercase letters and stemming words.
#Create function to clean corpus
cleancorpus <- function(cc){
for (j in seq(cc)) {
cc[[j]] <- gsub("/", " ", cc[[j]])
cc[[j]] <- gsub("[_]+", " ", cc[[j]])
cc[[j]] <- gsub("@", " ", cc[[j]])
cc[[j]] <- gsub("\\|", " ", cc[[j]])
}
cc2 <- tm_map(cc, removeNumbers)
cc2 <- tm_map(cc2, str_replace_all, pattern = "[[:punct:]]", replacement = " ")
cc2 <- tm_map(cc2, str_replace_all, pattern = "\\W", replacement = " ")
cc2 <- tm_map(cc2, tolower)
cc2 <- tm_map(cc2, removeWords, stopwords("english"))
cc2 <- tm_map(cc2, tolower)
cc2 <- tm_map(cc2, stemDocument)
cc2 <- tm_map(cc2, stripWhitespace)
return(cc2)
}
corpus2 <- cleancorpus(corpus)
The DocumentTermMatrix function takes a corpus and create a data structure where you want to treat each document as a row.
corpus2_dtm <- DocumentTermMatrix(corpus2)
inspect(corpus2_dtm[0:5, 300:305])
## <<DocumentTermMatrix (documents: 5, terms: 6)>>
## Non-/sparse entries: 8/22
## Sparsity : 73%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs carri chapman cite claim continu custodi
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 1 1 1 1 1 1
## 4 1 0 0 0 1 0
## 5 0 0 0 0 0 0
# removes terms that are more sparse than 0.99
corpus2_dtm <- removeSparseTerms(corpus2_dtm, .99)
corpus2_dtm
## <<DocumentTermMatrix (documents: 3898, terms: 2080)>>
## Non-/sparse entries: 487495/7620345
## Sparsity : 94%
## Maximal term length: 33
## Weighting : term frequency (tf)
nospam <- which(spamnospam$score == 0)
nospamdtm <-DocumentTermMatrix(corpus2[nospam])
nospamdtm <- removeSparseTerms(nospamdtm, .99)
findFreqTerms(nospamdtm, lowfreq=500) #appears atleast 500 times
## [1] "admin" "archiv" "aug" "beenther"
## [5] "bulklist" "can" "charset" "com"
## [9] "comreceiv" "content" "corp" "date"
## [13] "dec" "deliv" "develop" "discuss"
## [17] "dogma" "drop" "edt" "esmtp"
## [21] "exmh" "fetchmail" "get" "help"
## [25] "https" "imap" "int" "ist"
## [29] "lab" "like" "line" "list"
## [33] "listinfo" "listman" "localhost" "mail"
## [37] "mailman" "mailto" "messag" "mime"
## [41] "netnoteinc" "new" "org" "orgreceiv"
## [45] "orgx" "path" "phobo" "plain"
## [49] "post" "postfix" "preced" "receiv"
## [53] "redhat" "refer" "repli" "request"
## [57] "return" "run" "singl" "slashnul"
## [61] "smtp" "spamassassin" "subject" "subscrib"
## [65] "sun" "taint" "text" "think"
## [69] "thu" "time" "type" "unsubscrib"
## [73] "use" "version" "wed" "work"
## [77] "worker" "zzzz" "comx" "email"
## [81] "encod" "free" "group" "grp"
## [85] "helo" "http" "invok" "mailer"
## [89] "network" "qmail" "scd" "sender"
## [93] "system" "transfer" "unknown" "yahoo"
## [97] "yahoogroup" "zzzzteana" "build" "exim"
## [101] "microsoft" "one" "prioriti" "said"
## [105] "talk" "want" "make" "may"
## [109] "net" "say" "world" "www"
## [113] "just" "first" "way" "year"
## [117] "found" "look" "peopl" "right"
## [121] "tri" "wrote" "also" "chang"
## [125] "need" "now" "origin" "see"
## [129] "thing" "debian" "exampl" "neterror"
## [133] "netx" "sourceforg" "spam" "user"
## [137] "usw" "devel" "html" "don"
## [141] "even" "set" "file" "friend"
## [145] "ilug" "linux" "lugh" "mon"
## [149] "problem" "tuatha" "url" "pgp"
## [153] "comerror" "fork" "khare" "know"
## [157] "lair" "pdt" "pipermail" "rohit"
## [161] "state" "xent" "edu" "fri"
## [165] "rpm" "orgfrom" "hotmail" "habea"
## [169] "sep" "perl" "sat" "tue"
## [173] "egwn" "freshrpm" "zzzlist" "jmason"
## [177] "yyyi" "jalapeno" "oct" "razor"
## [181] "rssfeed" "utf" "matthia"
findAssocs(nospamdtm, "support", 0.8) # correlated with other words atleast 80% or more
## $support
## effort develop unit state institut opportun region must
## 0.85 0.84 0.84 0.84 0.84 0.84 0.84 0.83
## nation strategi challeng continu freedom trade america militari
## 0.83 0.83 0.83 0.82 0.82 0.82 0.82 0.82
## intern goal defend capabl effect seek prevent intellig
## 0.82 0.82 0.82 0.82 0.81 0.81 0.81 0.81
## emerg potenti advanc
## 0.81 0.80 0.80
set.seed(125)
wordcloud(corpus2[nospam], min.freq=400, max.words = 100, colors=brewer.pal(8, "Dark2"))
Frequent Spam Terms
spam <- which(spamnospam$score == 1)
spamdtm <-DocumentTermMatrix(corpus2[spam])
spamdtm <- removeSparseTerms(spamdtm, .99)
findFreqTerms(spamdtm, lowfreq=500)
## [1] "address" "admin" "aug" "back"
## [5] "busi" "call" "charset" "click"
## [9] "com" "compani" "comreceiv" "content"
## [13] "date" "deliv" "dogma" "drop"
## [17] "edt" "email" "esmtp" "fetchmail"
## [21] "form" "free" "fri" "get"
## [25] "hotmail" "info" "inform" "ist"
## [29] "jmason" "just" "lab" "life"
## [33] "like" "link" "list" "listinfo"
## [37] "localhost" "mail" "market" "may"
## [41] "messag" "mime" "money" "need"
## [45] "net" "netnoteinc" "new" "now"
## [49] "one" "onlin" "org" "path"
## [53] "peopl" "phobo" "plain" "pleas"
## [57] "postfix" "receiv" "remov" "repli"
## [61] "return" "send" "servic" "singl"
## [65] "slashnul" "start" "state" "subject"
## [69] "text" "time" "tue" "type"
## [73] "use" "version" "work" "www"
## [77] "year" "yyyi" "aol" "bodi"
## [81] "can" "center" "check" "color"
## [85] "credit" "day" "don" "find"
## [89] "font" "help" "home" "html"
## [93] "http" "jul" "jun" "keyword"
## [97] "mailto" "make" "mandark" "mon"
## [101] "month" "name" "number" "order"
## [105] "person" "phone" "product" "right"
## [109] "save" "site" "smtp" "style"
## [113] "today" "valu" "web" "href"
## [117] "includ" "line" "microsoft" "offer"
## [121] "price" "program" "report" "special"
## [125] "want" "wed" "yahoo" "thu"
## [129] "webnot" "grant" "million" "nbsp"
## [133] "sat" "size" "sun" "tabl"
## [137] "unsubscrib" "width" "align" "background"
## [141] "bgcolor" "bit" "black" "border"
## [145] "cellpad" "cellspac" "class" "colspan"
## [149] "cpunk" "cypherpunk" "div" "encod"
## [153] "envelop" "face" "famili" "ffffff"
## [157] "gif" "head" "height" "input"
## [161] "locust" "meta" "minder" "mso"
## [165] "option" "pro" "quot" "sight"
## [169] "sourceforg" "spamassassin" "span" "submit"
## [173] "tbodi" "titl" "transfer" "wast"
## [177] "iso" "taint" "arial" "first"
## [181] "mailer" "margin" "prioriti" "request"
## [185] "comx" "fork" "internet" "xent"
## [189] "strong" "top" "index" "rate"
## [193] "ptsize" "darial" "dcenter" "helvetica"
## [197] "san" "serif" "verdana" "alt"
## [201] "img" "src" "blockquot" "left"
## [205] "valign" "comcc" "imag" "softwar"
## [209] "bottom" "ffff" "nextpart" "jpg"
## [213] "cfont" "ffont" "einstein" "ssz"
findAssocs(spamdtm, "admin", 0.8)
## $admin
## beenther bulklist mailman listinfo comerror
## 0.97 0.92 0.89 0.86 0.81
set.seed(200)
wordcloud(corpus2[spam], min.freq=300, max.words = 100, colors=brewer.pal(8, "Dark2"))
samp <- round((0.80 * nrow(spamnospam)))
outcomes <- nrow(spamnospam)
set.seed(300)
train <- sample(outcomes, size = samp, replace = F)
training_data <- spamnospam[train,] #3118
testing_data <- spamnospam[-train, ] #780
How much spam and non-spam elements in training data from sample?
cat("Spam:", sum(training_data$score == 1)); cat("No Spam:", sum(training_data$score == 0))
## Spam: 1125
## No Spam: 1993
Create Corpus for both training and test data
training_corpus <- Corpus(VectorSource(training_data$email))
test_corpus <- Corpus(VectorSource(testing_data$email))
train_data <- cleancorpus(training_corpus)
train_dtm <- DocumentTermMatrix(train_data)
train_dtm <- removeSparseTerms(train_dtm, 0.99)
test_data <- cleancorpus(test_corpus)
test_dtm <- DocumentTermMatrix(test_data)
test_dtm <- removeSparseTerms(test_dtm, 0.99)
Naive Bayes classifiers is used to calculate the probability of a sample being part of a certain category based on prior knowledge. Based on the Bayes Theorem, the Naive Bayes Classifier assumes that every feature of a sample is independent of each other. That means that each character of a sample contributes independently to determine the probability of the classification of that sample hence outputting the category of the highest probability of the sample. Naive Bayes classification needs information on each word in a message so we count number of occurances and convert the document-term matrices.
# function to convert score to Not Spam (No) or Spam (Yes)
convert_score <- function(x) {
y <- ifelse(x > 0, 1,0)
y <- factor(y, levels=c(0,1), labels=c("Not Spam", "Spam"))
y
}
trains <- apply(train_dtm, 2, convert_score)
test <- apply(test_dtm, 2, convert_score)
classify <- naiveBayes(trains, factor(training_data$score)) #learns the data
test_prediction <- predict(classify, newdata=test) #predicts data
CrossTable(test_prediction, testing_data$score, prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE, dnn = c('Prediction', 'Actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 780
##
##
## | Actual
## Prediction | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 506 | 17 | 523 |
## | 0.996 | 0.062 | |
## -------------|-----------|-----------|-----------|
## 1 | 2 | 255 | 257 |
## | 0.004 | 0.938 | |
## -------------|-----------|-----------|-----------|
## Column Total | 508 | 272 | 780 |
## | 0.651 | 0.349 | |
## -------------|-----------|-----------|-----------|
##
##
There are 17
messages that were classified as spam that should have been non-spam and 2
that are spam, classified as non-spam. By using this model, the theorem provided over 97%
of all SMS messages were correct by type, spam or non-spam even though there is a 0.6%
chance (not bad) of the user missing an important email.
For more information, please visit these sites:
Download files to directory from here: