R Ham and Spam

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder).

library(tm)

## Loading required package: NLP

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidytext)
library(stringr)
library(RTextTools)

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

Loading the data into R using the tm package

Spam and Ham emails

## Source directories handed to DirSource(): one for spam, one for ham.
spam.dir <- file.path("C:/Users/Patrizia/Desktop/Ambra MSDA/W9/spam")
ham.dir <- file.path("C:/Users/Patrizia/Desktop/Ambra MSDA/W9/easy_ham_2")

Creating 2 corpora and filtering out the cmds documents

## Read every file of each directory into its own corpus.
spamcorpus <- spam.dir %>% DirSource() %>% VCorpus()
hamcorpus <- ham.dir %>% DirSource() %>% VCorpus()

## Print the metadata of document 501 of the spam corpus.
meta(spamcorpus[[501]])

##   author       : character(0)
##   datetimestamp: 2017-04-16 13:34:49
##   description  : character(0)
##   heading      : character(0)
##   id           : cmds
##   language     : en
##   origin       : character(0)

## Flag the documents whose id is "cmds" (presumably the command listing that
## sits next to the e-mails in each folder -- verify against the download).
## unlist() turns the per-document ids into a plain vector, so the comparison
## yields one TRUE/FALSE per document.
idx <- unlist(meta(spamcorpus, "id")) == "cmds"
idxh <- unlist(meta(hamcorpus, "id")) == "cmds"

## Keep every document that is NOT flagged. The flags are logical, so they must
## be negated with `!`: unary minus would coerce them to 0 / -1 and drop the
## first document of the corpus instead of the flagged one.
spamcorpus <- spamcorpus[!idx]
hamcorpus <- hamcorpus[!idxh]

Word cloud: top spam words

library(wordcloud)

## Loading required package: RColorBrewer

## Clean up the spam corpus and remove a custom list of words that are found in
## the first and last rows of each e-mail.

spamcorpus1 <- tm_map(spamcorpus, content_transformer(function(x) str_replace_all(x, "If you wish to leave this list please use the link below", " ")))

## Remove URLs and e-mail addresses
spamcorpus1 <- tm_map(spamcorpus1, content_transformer(function(x) str_replace_all(x, "http.* *|\\S+@\\S+", " ")))

spamcorpus1 <- tm_map(spamcorpus1, content_transformer(tolower))

## Remove mail-header lines and HTML/SMTP leftovers. The text has already been
## lower-cased above, so every alternative has to be written in lower case to
## be able to match ("mime-version", "list-id:"). "esmtp" is listed before
## "smtp" so the longer token is removed as a whole.
spamcorpus1 <- tm_map(spamcorpus1, content_transformer(function(x) str_replace_all(x, "from.*$|return-path.*$|delivered.*$|received.*$|reply.*$|to:.*$|date:.*$|x-mailer:.*$|content-transfer-encoding:.*$|x-mime-autoconverted.*$|content-type.*$|mime-version.*$|message-id:.*$|\t.*$|sender:.*$|precedence:.*$|list-id:.*$|x-mailman-version:.*$|x-beenthere:.*$|list maintainer:.*$|subject|font|href|src|nbsp|esmtp|smtp|img|widthd|heightd", " ")))

## Remove HTML tags
spamcorpus1 <- tm_map(spamcorpus1, content_transformer(function(x) str_replace_all(x, "<.*?>", " ")))

## Drop digits, English stop words and punctuation, then collapse the runs of
## blanks left behind by the replacements above.
spamcorpus1 <- spamcorpus1 %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords(kind = "en")) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)

## Term-by-document counts of the cleaned spam corpus as a dense matrix
## (one row per term).
spamcorpus1tdm <- as.matrix(TermDocumentMatrix(spamcorpus1))

## Total number of occurrences of each term, most frequent first.
word.freq <- sort(rowSums(spamcorpus1tdm), decreasing = TRUE)

## Plot terms that occur at least 30 times, capped at the 50 most frequent.
## wordcloud() has no `max.freq` argument (it would be forwarded through `...`
## to the text-drawing calls); `max.words` is the argument that limits the plot.
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 30,
          max.words = 50, random.order = FALSE)

Adding a "type" tag to each document and merging the corpora

## Store the class label in each document's "type" metadata tag so that it
## travels with the document once both corpora are merged.
## seq_len(length(x)) is empty for an empty corpus, whereas seq(length(x))
## would iterate over c(1, 0).
for (i in seq_len(length(hamcorpus))) {
  meta(hamcorpus[[i]], tag = "type") <- "HAM"
}

for (i in seq_len(length(spamcorpus))) {
  meta(spamcorpus[[i]], tag = "type") <- "SPAM"
}

## Concatenate the two corpora: spam documents first, then ham.
emails <- c(spamcorpus, hamcorpus, recursive = TRUE)

Reshuffling the documents and cleaning the corpus

## Put the documents in random order so that spam and ham are mixed.
emails <- sample(emails)

## Lower-case first, then drop stop words while they are still in their
## dictionary form. Stripping punctuation or stemming beforehand turns e.g.
## "don't" into "dont" and "because" into "becaus", which no longer match the
## stop-word list. Punctuation, digits and stemming follow afterwards.
emails <- emails %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, words = stopwords("en")) %>%
  tm_map(content_transformer(removePunctuation)) %>%
  tm_map(content_transformer(removeNumbers)) %>%
  tm_map(content_transformer(stemDocument))

How many emails are spam or ham?

## Class label of every document as a factor, followed by the count per class.
spam_tags <- meta(emails, "type") %>%
  unlist() %>%
  factor()
table(spam_tags)

## spam_tags
##  HAM SPAM 
## 1400  500

Create TDMs, remove sparse terms, convert TDM to Dataframe

## Build a document-term matrix: one row per e-mail, one column per term.
## The classifier container further down selects training and test sets by
## row index (1 .. N documents) and pairs each row with a document label, so
## rows have to be documents. A term-document matrix would hand it rows of
## terms instead.
emailstdm <- DocumentTermMatrix(emails)

## Drop terms that are missing from more than 99% of the documents.
emailstdm <- removeSparseTerms(emailstdm, .99)

email.df <- as.data.frame(as.matrix(emailstdm), stringsAsFactors = FALSE)

## Remove words (columns) with a total frequency of 3 or less
email.df <- email.df[, colSums(email.df) > 3, drop = FALSE]

SVM Classifier

## Extract the spam tag of every document, in corpus order.
## vapply() builds the vector in one pass and fails loudly if a document has
## no single "type" tag; growing the vector with c() would silently skip such
## a document and shift all following labels.
spam <- vapply(
  seq_len(length(emails)),
  function(i) emails[[i]]$meta$type,
  character(1)
)

## Number of documents in the matrix built from the corpus.
N <- nDocs(emailstdm)

## Set up the model container using a 75/25 split between training and test
## data. Reference code as per chapter 10 of "Automated Data Collection with R".
## The split point is rounded down to a whole document: when N is not a
## multiple of 4, 0.75 * N is fractional and (0.75 * N + 1):N would produce
## non-integer indices that stop short of the last document.
n_train <- floor(0.75 * N)

container <- create_container(
    email.df,
    labels = spam,
    trainSize = seq_len(n_train),
    testSize = (n_train + 1):N,
    virgin = FALSE
)

## List the slots of the container object.
slotNames(container)

## [1] "training_matrix"       "classification_matrix" "training_codes"       
## [4] "testing_codes"         "column_names"          "virgin"

 ## Row-wise 75/25 split of the data frame, derived from N instead of the
 ## hard-coded 1425 / 1426 / 1900 so it follows the corpus size.
 n_train_rows <- floor(0.75 * N)
 emailsdf_train <- email.df[seq_len(n_train_rows), ]

 emailsdf_test <- email.df[(n_train_rows + 1):N, ]

## Fit a support vector machine on the training part of the container.
svm.model <- train_model(container = container, algorithm = "SVM")

## Classify the test part of the container with the fitted model.
svm.output <- classify_model(container = container, model = svm.model)

## Preview the first rows of the classification result.
head(svm.output)

##   SVM_LABEL  SVM_PROB
## 1       HAM 0.7140429
## 2       HAM 0.7118988
## 3       HAM 0.7486656
## 4       HAM 0.7554778
## 5       HAM 0.7206037
## 6       HAM 0.7670432

## Since we know the correct labels, we can investigate how often the
## algorithm has misclassified the e-mails.
## The test set is the tail of the corpus (testSize runs up to N), and
## svm.output is built from that test set, so the matching true labels are
## the last nrow(svm.output) entries of `spam`. This avoids hard-coding the
## first test index (1426).
labels_out <- data.frame(
  correct_label = tail(spam, nrow(svm.output)),
  svm = as.character(svm.output[, 1]),
  stringsAsFactors = FALSE
)

SVM performance

## Share of test e-mails whose predicted label differs from (FALSE) or equals
## (TRUE) the correct label.
round(
  prop.table(table(labels_out$correct_label == labels_out$svm)),
  digits = 3
)

## 
## FALSE  TRUE 
## 0.259 0.741

##SVM classified 74% of the test emails correctly either as HAM or SPAM

## By type: preview the correct label next to the SVM label.

head(labels_out, n = 6L)

##   correct_label svm
## 1           HAM HAM
## 2           HAM HAM
## 3           HAM HAM
## 4           HAM HAM
## 5           HAM HAM
## 6          SPAM HAM

library(tidyverse)

## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr

## Conflicts with tidy packages ----------------------------------------------

## annotate(): ggplot2, NLP
## filter():   dplyr, stats
## lag():      dplyr, stats

## Test e-mails whose correct label is HAM.
labels_out_ham <- filter(labels_out, correct_label == "HAM")

## SVM performance for ham: count of mismatches (FALSE) and matches (TRUE).
table(labels_out_ham$correct_label == labels_out_ham$svm)

## 
## TRUE 
##  352

## The same ham comparison expressed as proportions.
round(
  prop.table(table(labels_out_ham$correct_label == labels_out_ham$svm)),
  digits = 3
)

## 
## TRUE 
##    1

## Test e-mails whose correct label is SPAM.
labels_out_spam <- filter(labels_out, correct_label == "SPAM")

## SVM performance for spam: count of mismatches (FALSE) and matches (TRUE).
table(labels_out_spam$correct_label == labels_out_spam$svm)

## 
## FALSE 
##   123

## The same spam comparison expressed as proportions.
round(
  prop.table(table(labels_out_spam$correct_label == labels_out_spam$svm)),
  digits = 3
)

## 
## FALSE 
##     1

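##Did the model classify all test spam data incorrectly? Yes: all 123 spam e-mails in the test set were labeled HAM, while all 352 ham e-mails were labeled HAM as well, so the model predicted HAM for every test e-mail.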

Conclusions

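We ingested 1400 HAM e-mails and 500 SPAM e-mails downloaded from http://spamassassin.apache.org/old/publiccorpus/ into an SVM classifier, training the model with 75% of the data. The SVM classified 74% of the test e-mails correctly, but only because it labeled every test e-mail as HAM: all 352 ham e-mails were classified correctly and all 123 spam e-mails were misclassified.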

```

TXT classifier

Ambra

15 aprile 2017