DATA607 WK 10 Project 4: Text Mining

library(tm)
library(tidytext)
library(tidyverse)
library(SnowballC)
library(wordcloud)
library(stopwords)
library(RColorBrewer)
library(e1071)
library(gmodels)

1. Load Files Into R

Non-Spam

#infile.choose()
nospam_folder <- "C:\\Users\\Javern\\Documents\\Data Science MS\\DATA607\\spamnospam\\easy_ham"

ns <- list.files(path = nospam_folder)
ns <- paste(nospam_folder, "\\", ns, sep = "")
ns1 <- lapply(ns, FUN = readLines)
ns1 <- lapply(ns1, FUN = paste, collapse = "")

#Create a dataframe of non-spam files
ns2 <- data.frame(unlist(ns1), stringsAsFactors = F)

# add a score column
ns2$score <- 0
names(ns2) <- c("email", "score")

Spam

spam_folder <- "C:\\Users\\Javern\\Documents\\Data Science MS\\DATA607\\spamnospam\\spam_2"

s <- list.files(path = spam_folder)
s <- paste(spam_folder, "\\", s, sep = "")
s1 <- lapply(s, FUN = readLines)
s1 <- lapply(s1, FUN = paste, collapse = "")

#create a dataframe of spam files
s2 <- data.frame(unlist(s1), stringsAsFactors = F)

# add a score column
s2$score <- 1
names(s2) <- c("email", "score")

As One Dataframe

spamnospam <- rbind(ns2, s2) 

#show dimesions
dim(spamnospam)

## [1] 3898    2

2. Create Corpus

corpus <- Corpus(VectorSource(spamnospam$email))
corpus

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 3898

#Print out the data on the 20th email
corpus[[20]]

## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 2378

#print out the content of the 20th email
corpus[[20]][1]

## $content
## [1] "From ilug-admin@linux.ie  Thu Aug 22 17:19:25 2002Return-Path: <ilug-admin@linux.ie>Delivered-To: zzzz@localhost.netnoteinc.comReceived: from localhost (localhost [127.0.0.1])\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id CD34B47C67\tfor <zzzz@localhost>; Thu, 22 Aug 2002 12:19:21 -0400 (EDT)Received: from phobos [127.0.0.1]\tby localhost with IMAP (fetchmail-5.9.0)\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 17:19:21 +0100 (IST)Received: from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7MGHJZ14177 for    <zzzz-ilug@spamassassin.taint.org>; Thu, 22 Aug 2002 17:17:19 +0100Received: from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org    (8.9.3/8.9.3) with ESMTP id RAA09581; Thu, 22 Aug 2002 17:16:28 +0100X-Authentication-Warning: lugh.tuatha.org: Host root@localhost [127.0.0.1]    claimed to be lughReceived: from redpie.com (redpie.com [216.122.135.208] (may be forged))    by lugh.tuatha.org (8.9.3/8.9.3) with ESMTP id RAA09518 for    <ilug@linux.ie>; Thu, 22 Aug 2002 17:16:08 +0100Received: from justin ([194.46.28.223]) by redpie.com (8.8.7/8.8.5) with    SMTP id JAA05201 for <ilug@linux.ie>; Thu, 22 Aug 2002 09:15:59 -0700    (PDT)From: \"Kiall Mac Innes\" <kiall@redpie.com>To: \"ILUG\" <ilug@linux.ie>Date: Thu, 22 Aug 2002 17:23:15 +0100Message-Id: <BCEFLMCEIJHPCPLGADJIGEEFCAAA.kiall@redpie.com>MIME-Version: 1.0Content-Type: text/plain; charset=\"iso-8859-1\"Content-Transfer-Encoding: 7bitX-Priority: 3 (Normal)X-Msmail-Priority: NormalX-Mailer: Microsoft Outlook IMO, Build 9.0.2416 (9.0.2910.0)X-Mimeole: Produced By Microsoft MimeOLE V5.50.4522.1200Importance: NormalSubject: [ILUG] Sun Solaris..Sender: ilug-admin@linux.ieErrors-To: ilug-admin@linux.ieX-Mailman-Version: 1.1Precedence: bulkList-Id: Irish Linux Users' Group <ilug.linux.ie>X-Beenthere: ilug@linux.ieCan someone explain what type of operating system Solaris is... as ive neverseen or used it i dont know wheather to get a server from Sun or from DELL iwould prefer a linux based server and Sun seems to be the one for that butim not sure if Solaris is a distro of linux or a completely differentoperating system? can someone explain...Kiall Mac Innes-- Irish Linux Users' Group: ilug@linux.iehttp://www.linux.ie/mailman/listinfo/ilug for (un)subscription information.List maintainer: listmaster@linux.ie"

3. Clean Corpus

Removing special characters, stopwords, white spaces, punctuations, converting to lowercase letters and stemming words.

#Create function to clean corpus
cleancorpus <- function(cc){

  for (j in seq(cc)) {
      cc[[j]] <- gsub("/", " ", cc[[j]])
      cc[[j]] <- gsub("[_]+", " ", cc[[j]])
      cc[[j]] <- gsub("@", " ", cc[[j]])
      cc[[j]] <- gsub("\\|", " ", cc[[j]])
  }

  cc2 <- tm_map(cc, removeNumbers)
  cc2 <- tm_map(cc2, str_replace_all, pattern = "[[:punct:]]", replacement = " ")
  cc2 <- tm_map(cc2, str_replace_all, pattern = "\\W", replacement = " ")
  cc2 <- tm_map(cc2, tolower)
  cc2 <- tm_map(cc2, removeWords, stopwords("english"))
  cc2 <- tm_map(cc2, tolower)
  cc2 <- tm_map(cc2, stemDocument)
  cc2 <- tm_map(cc2, stripWhitespace)

return(cc2)

}

corpus2 <- cleancorpus(corpus)

4. Document Term Matrix

The DocumentTermMatrix function takes a corpus and create a data structure where you want to treat each document as a row.

corpus2_dtm <- DocumentTermMatrix(corpus2)
inspect(corpus2_dtm[0:5, 300:305])

## <<DocumentTermMatrix (documents: 5, terms: 6)>>
## Non-/sparse entries: 8/22
## Sparsity           : 73%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs carri chapman cite claim continu custodi
##    1     0       0    0     0       0       0
##    2     0       0    0     0       0       0
##    3     1       1    1     1       1       1
##    4     1       0    0     0       1       0
##    5     0       0    0     0       0       0

# removes terms that are more sparse than 0.99
corpus2_dtm <- removeSparseTerms(corpus2_dtm, .99)
corpus2_dtm

## <<DocumentTermMatrix (documents: 3898, terms: 2080)>>
## Non-/sparse entries: 487495/7620345
## Sparsity           : 94%
## Maximal term length: 33
## Weighting          : term frequency (tf)

5. Exploration

No Spam

Frequent Non-Spam Terms

nospam <- which(spamnospam$score == 0)

nospamdtm <-DocumentTermMatrix(corpus2[nospam])
nospamdtm <- removeSparseTerms(nospamdtm, .99)

findFreqTerms(nospamdtm, lowfreq=500) #appears atleast 500 times

##   [1] "admin"        "archiv"       "aug"          "beenther"    
##   [5] "bulklist"     "can"          "charset"      "com"         
##   [9] "comreceiv"    "content"      "corp"         "date"        
##  [13] "dec"          "deliv"        "develop"      "discuss"     
##  [17] "dogma"        "drop"         "edt"          "esmtp"       
##  [21] "exmh"         "fetchmail"    "get"          "help"        
##  [25] "https"        "imap"         "int"          "ist"         
##  [29] "lab"          "like"         "line"         "list"        
##  [33] "listinfo"     "listman"      "localhost"    "mail"        
##  [37] "mailman"      "mailto"       "messag"       "mime"        
##  [41] "netnoteinc"   "new"          "org"          "orgreceiv"   
##  [45] "orgx"         "path"         "phobo"        "plain"       
##  [49] "post"         "postfix"      "preced"       "receiv"      
##  [53] "redhat"       "refer"        "repli"        "request"     
##  [57] "return"       "run"          "singl"        "slashnul"    
##  [61] "smtp"         "spamassassin" "subject"      "subscrib"    
##  [65] "sun"          "taint"        "text"         "think"       
##  [69] "thu"          "time"         "type"         "unsubscrib"  
##  [73] "use"          "version"      "wed"          "work"        
##  [77] "worker"       "zzzz"         "comx"         "email"       
##  [81] "encod"        "free"         "group"        "grp"         
##  [85] "helo"         "http"         "invok"        "mailer"      
##  [89] "network"      "qmail"        "scd"          "sender"      
##  [93] "system"       "transfer"     "unknown"      "yahoo"       
##  [97] "yahoogroup"   "zzzzteana"    "build"        "exim"        
## [101] "microsoft"    "one"          "prioriti"     "said"        
## [105] "talk"         "want"         "make"         "may"         
## [109] "net"          "say"          "world"        "www"         
## [113] "just"         "first"        "way"          "year"        
## [117] "found"        "look"         "peopl"        "right"       
## [121] "tri"          "wrote"        "also"         "chang"       
## [125] "need"         "now"          "origin"       "see"         
## [129] "thing"        "debian"       "exampl"       "neterror"    
## [133] "netx"         "sourceforg"   "spam"         "user"        
## [137] "usw"          "devel"        "html"         "don"         
## [141] "even"         "set"          "file"         "friend"      
## [145] "ilug"         "linux"        "lugh"         "mon"         
## [149] "problem"      "tuatha"       "url"          "pgp"         
## [153] "comerror"     "fork"         "khare"        "know"        
## [157] "lair"         "pdt"          "pipermail"    "rohit"       
## [161] "state"        "xent"         "edu"          "fri"         
## [165] "rpm"          "orgfrom"      "hotmail"      "habea"       
## [169] "sep"          "perl"         "sat"          "tue"         
## [173] "egwn"         "freshrpm"     "zzzlist"      "jmason"      
## [177] "yyyi"         "jalapeno"     "oct"          "razor"       
## [181] "rssfeed"      "utf"          "matthia"

Words Asscociation

findAssocs(nospamdtm, "support", 0.8) # correlated with other words atleast 80% or more

## $support
##   effort  develop     unit    state institut opportun   region     must 
##     0.85     0.84     0.84     0.84     0.84     0.84     0.84     0.83 
##   nation strategi challeng  continu  freedom    trade  america militari 
##     0.83     0.83     0.83     0.82     0.82     0.82     0.82     0.82 
##   intern     goal   defend   capabl   effect     seek  prevent intellig 
##     0.82     0.82     0.82     0.82     0.81     0.81     0.81     0.81 
##    emerg  potenti   advanc 
##     0.81     0.80     0.80

Word Cloud For Non-Spam

set.seed(125)
wordcloud(corpus2[nospam], min.freq=400, max.words = 100, colors=brewer.pal(8, "Dark2"))

Spam

Frequent Spam Terms

spam <- which(spamnospam$score == 1)

spamdtm <-DocumentTermMatrix(corpus2[spam])
spamdtm <- removeSparseTerms(spamdtm, .99)

findFreqTerms(spamdtm, lowfreq=500)

##   [1] "address"      "admin"        "aug"          "back"        
##   [5] "busi"         "call"         "charset"      "click"       
##   [9] "com"          "compani"      "comreceiv"    "content"     
##  [13] "date"         "deliv"        "dogma"        "drop"        
##  [17] "edt"          "email"        "esmtp"        "fetchmail"   
##  [21] "form"         "free"         "fri"          "get"         
##  [25] "hotmail"      "info"         "inform"       "ist"         
##  [29] "jmason"       "just"         "lab"          "life"        
##  [33] "like"         "link"         "list"         "listinfo"    
##  [37] "localhost"    "mail"         "market"       "may"         
##  [41] "messag"       "mime"         "money"        "need"        
##  [45] "net"          "netnoteinc"   "new"          "now"         
##  [49] "one"          "onlin"        "org"          "path"        
##  [53] "peopl"        "phobo"        "plain"        "pleas"       
##  [57] "postfix"      "receiv"       "remov"        "repli"       
##  [61] "return"       "send"         "servic"       "singl"       
##  [65] "slashnul"     "start"        "state"        "subject"     
##  [69] "text"         "time"         "tue"          "type"        
##  [73] "use"          "version"      "work"         "www"         
##  [77] "year"         "yyyi"         "aol"          "bodi"        
##  [81] "can"          "center"       "check"        "color"       
##  [85] "credit"       "day"          "don"          "find"        
##  [89] "font"         "help"         "home"         "html"        
##  [93] "http"         "jul"          "jun"          "keyword"     
##  [97] "mailto"       "make"         "mandark"      "mon"         
## [101] "month"        "name"         "number"       "order"       
## [105] "person"       "phone"        "product"      "right"       
## [109] "save"         "site"         "smtp"         "style"       
## [113] "today"        "valu"         "web"          "href"        
## [117] "includ"       "line"         "microsoft"    "offer"       
## [121] "price"        "program"      "report"       "special"     
## [125] "want"         "wed"          "yahoo"        "thu"         
## [129] "webnot"       "grant"        "million"      "nbsp"        
## [133] "sat"          "size"         "sun"          "tabl"        
## [137] "unsubscrib"   "width"        "align"        "background"  
## [141] "bgcolor"      "bit"          "black"        "border"      
## [145] "cellpad"      "cellspac"     "class"        "colspan"     
## [149] "cpunk"        "cypherpunk"   "div"          "encod"       
## [153] "envelop"      "face"         "famili"       "ffffff"      
## [157] "gif"          "head"         "height"       "input"       
## [161] "locust"       "meta"         "minder"       "mso"         
## [165] "option"       "pro"          "quot"         "sight"       
## [169] "sourceforg"   "spamassassin" "span"         "submit"      
## [173] "tbodi"        "titl"         "transfer"     "wast"        
## [177] "iso"          "taint"        "arial"        "first"       
## [181] "mailer"       "margin"       "prioriti"     "request"     
## [185] "comx"         "fork"         "internet"     "xent"        
## [189] "strong"       "top"          "index"        "rate"        
## [193] "ptsize"       "darial"       "dcenter"      "helvetica"   
## [197] "san"          "serif"        "verdana"      "alt"         
## [201] "img"          "src"          "blockquot"    "left"        
## [205] "valign"       "comcc"        "imag"         "softwar"     
## [209] "bottom"       "ffff"         "nextpart"     "jpg"         
## [213] "cfont"        "ffont"        "einstein"     "ssz"

Words Association

findAssocs(spamdtm, "admin", 0.8)

## $admin
## beenther bulklist  mailman listinfo comerror 
##     0.97     0.92     0.89     0.86     0.81

Word Cloud For Spam

set.seed(200)
wordcloud(corpus2[spam], min.freq=300, max.words = 100, colors=brewer.pal(8, "Dark2"))

6. Model for Assessing Non-Spam and Spam Emails

Divide the Corpus into training and test data (80:20)

samp <- round((0.80 * nrow(spamnospam)))
outcomes <- nrow(spamnospam)

set.seed(300)
train <- sample(outcomes, size = samp, replace = F)

training_data <- spamnospam[train,] #3118
testing_data <- spamnospam[-train, ] #780

How much spam and non-spam elements in training data from sample?

cat("Spam:", sum(training_data$score == 1)); cat("No Spam:", sum(training_data$score == 0))

## Spam: 1125

## No Spam: 1993

Create Corpus for both training and test data

training_corpus <- Corpus(VectorSource(training_data$email))
test_corpus <- Corpus(VectorSource(testing_data$email))


train_data <- cleancorpus(training_corpus)
train_dtm <- DocumentTermMatrix(train_data)
train_dtm <- removeSparseTerms(train_dtm, 0.99)

test_data <- cleancorpus(test_corpus)
test_dtm <- DocumentTermMatrix(test_data)
test_dtm <- removeSparseTerms(test_dtm, 0.99)

Naive Bayes Classifier

Naive Bayes classifiers is used to calculate the probability of a sample being part of a certain category based on prior knowledge. Based on the Bayes Theorem, the Naive Bayes Classifier assumes that every feature of a sample is independent of each other. That means that each character of a sample contributes independently to determine the probability of the classification of that sample hence outputting the category of the highest probability of the sample. Naive Bayes classification needs information on each word in a message so we count number of occurances and convert the document-term matrices.

# function to convert score to Not Spam (No) or Spam (Yes)
convert_score <- function(x) {
  y <- ifelse(x > 0, 1,0)
  y <- factor(y, levels=c(0,1), labels=c("Not Spam", "Spam"))
  y
}

trains <- apply(train_dtm, 2, convert_score)
test <- apply(test_dtm, 2, convert_score)

classify <- naiveBayes(trains, factor(training_data$score)) #learns the data

test_prediction <- predict(classify, newdata=test) #predicts data

CrossTable(test_prediction, testing_data$score, prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE, dnn = c('Prediction', 'Actual'))

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  780 
## 
##  
##              | Actual 
##   Prediction |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |       506 |        17 |       523 | 
##              |     0.996 |     0.062 |           | 
## -------------|-----------|-----------|-----------|
##            1 |         2 |       255 |       257 | 
##              |     0.004 |     0.938 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       508 |       272 |       780 | 
##              |     0.651 |     0.349 |           | 
## -------------|-----------|-----------|-----------|
## 
##

There are 17 messages that were classified as spam that should have been non-spam and 2 that are spam, classified as non-spam. By using this model, the theorem provided over 97% of all SMS messages were correct by type, spam or non-spam even though there is a 0.6% chance (not bad) of the user missing an important email.

Sources

For more information, please visit these sites:

Download files to directory from here:

Ham-Spam

DATA607 WK 10 Project 4: Text Mining

Javern Wilson

April 6, 2019

1. Load Files Into R

Non-Spam

Spam

As One Dataframe

2. Create Corpus

3. Clean Corpus

4. Document Term Matrix

5. Exploration

No Spam

Frequent Non-Spam Terms

Words Asscociation

Word Cloud For Non-Spam

Spam

Words Association

Word Cloud For Spam

6. Model for Assessing Non-Spam and Spam Emails

Divide the Corpus into training and test data (80:20)

Naive Bayes Classifier

Sources