library(tm)
library(corpus)
library(stringr)
library(RTextTools)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library(utils)
library(R.utils)
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.7.1 (2016-02-15) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.22.0 (2018-04-21) successfully loaded. See ?R.oo for help.
## 
## Attaching package: 'R.oo'
## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods
## The following objects are masked from 'package:base':
## 
##     attach, detach, gc, load, save
## R.utils v2.8.0 successfully loaded. See ?R.utils for help.
## 
## Attaching package: 'R.utils'
## The following object is masked from 'package:utils':
## 
##     timestamp
## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, inherits, isOpen, parse, warnings
#library(tm.plugin.webmining)
library(quanteda)
## Package version: 1.4.3
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
## 
##     View
library(plyr)
library(class)
library(knitr)
library(wordcloud)
## Loading required package: RColorBrewer
library(SnowballC)
## 
## Attaching package: 'SnowballC'
## The following objects are masked from 'package:RTextTools':
## 
##     getStemLanguages, wordStem

Download the spam archive to the working directory, decompress it, and extract the files

setwd("C:\\Jagdish\\Masters Programs\\CUNY\\DATA 607 Data Acquisition and Management\\Project4\\")

if (!dir.exists("spam")){
  download.file(url = "https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2", destfile = "spam.tar.bz2")
  bunzip2("spam.tar.bz2", remove = F, overwrite = T)
  untar("spam.tar",compressed = "bzip2")
}
spam_files = list.files(path = "spam",full.names = T)

Download the ham archive to the working directory, decompress it, and extract the files

setwd("C:\\Jagdish\\Masters Programs\\CUNY\\DATA 607 Data Acquisition and Management\\Project4\\")

if (!dir.exists("ham")){
  download.file(url = "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2", destfile = "ham.tar.bz2")
  bunzip2("ham.tar.bz2", remove = F, overwrite = T)
  untar("ham.tar",compressed = "bzip2")
}
ham_files = list.files(path = "ham",full.names = T)

Create three file lists: ham, spam, and combined.

# Remove the file that only contains the list of other files
remove_ham <- list.files(path="easy_ham/", full.names=T, recursive=FALSE, pattern="cmds")
file.remove(remove_ham)
## [1] TRUE
# Remove the file that only contains the list of other files
remove_spam <- list.files(path="spam/", full.names=T, recursive=FALSE, pattern="cmds")
file.remove(remove_spam)
## logical(0)
spam_files <-list.files(path="spam/", full.names=T, recursive=FALSE)
length(spam_files)
## [1] 500
# list of ham files
ham_files <- list.files(path="easy_ham/",full.names=T, recursive=FALSE)
length(ham_files)
## [1] 2500
# concatenate ham and spam file lists
ham_spam <- c(ham_files,spam_files)
length(ham_spam)
## [1] 3000
# Write a function to find the first blank line in a message; the mail body
# starts right after the blank line that separates it from the headers
blank_line_num <- function(a){
  for (i in 1:length(a)){
    if (str_detect(a[i], "^[:space:]*$")){
      return(i)
    }
  }
  return(0)  # no blank line found: treat the whole file as body
}
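
As a quick check, the function can be applied to a made-up message (hypothetical, not from the corpus); the header/body separator sits at index 3:

msg <- c("From: a@example.com", "Subject: test", "", "The body starts here.")
blank_line_num(msg)
## [1] 3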

Perform clean-up

# Read in the spam files, strip the headers and HTML tags, label each message
# as spam, and build a spam mail corpus

temp = readLines(con = spam_files[1])
temp = temp[(blank_line_num(temp) + 1):length(temp)]   # drop the headers
temp = gsub("</?[^>]+>", "", temp)                     # drop HTML tags
temp = str_c(temp, collapse = "")

spam_corpus = Corpus(VectorSource(temp))
meta(spam_corpus[[1]], "label") = "spam"

# Add the remaining documents from the folder
for (i in 2:length(spam_files)) {

  temp = readLines(con = spam_files[i])
  start <- blank_line_num(temp) + 1
  end <- length(temp)
  temp <- temp[start:end]
  temp = str_c(temp, collapse = "")

  # Strip HTML tags, collapse runs of whitespace, and trim
  temp <- gsub(pattern = "</?[^>]+>", replacement = "", x = temp, ignore.case = TRUE)
  temp <- str_trim(str_replace_all(temp, "\\s+", " "))

  if (nchar(temp) > 0) {
    temp_corpus1 = Corpus(VectorSource(temp))
    meta(temp_corpus1[[1]], "label") = "spam"
    spam_corpus = c(spam_corpus, temp_corpus1)
  }
}
## Warning in readLines(con = spam_files[i]): incomplete final line found on
## 'spam/00136.faa39d8e816c70f23b4bb8758d8a74f0'
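
The tag-stripping pattern "</?[^>]+>" deletes anything that looks like an HTML element. A minimal illustration on a made-up string:

x <- "<html><body>Save up to <b>75%</b> today!</body></html>"
gsub("</?[^>]+>", "", x)
## [1] "Save up to 75% today!"
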
# Check the length of the spam corpus, the metadata of the 1st email, and the body of the 50th email.

length(spam_corpus)
## [1] 500
meta(spam_corpus[[1]],"label")
## [1] "spam"
spam_corpus[[50]][1]
## [1] "Save up to75% on your Term LifeInsurance! Compare rates from top insurance companies aroundthe countryIn our life and times, it's important to plan foryour family's future, while being comfortable financially. Choose the rightLife Insurance policy today.Click the link below to compare the lowest ratesand save up to 75% COMPARE YOUR COVERAGEYou'll be able to compare rates and get a freeapplication in less than a minute!*Get your FREE instant quotes...*Compare the lowest prices, then...*Select a company and Apply Online.GET A FREE QUOTE NOW!You can't predict the future, but you can alwaysprepare for it.to beexcluded from future contacts graham_adlam"
# Read in the ham files, strip the headers and HTML tags, label each message
# as ham, and build a ham mail corpus
temp = readLines(con = ham_files[1])
temp = temp[(blank_line_num(temp) + 1):length(temp)]   # drop the headers
temp = gsub("</?[^>]+>", "", temp)                     # drop HTML tags
temp = str_c(temp, collapse = "")

ham_corpus = Corpus(VectorSource(temp))
meta(ham_corpus[[1]], "label") = "ham"

# Add the remaining documents from the folder
for (i in 2:length(ham_files)) {

  temp = readLines(con = ham_files[i])
  start <- blank_line_num(temp) + 1
  end <- length(temp)
  temp <- temp[start:end]
  temp = str_c(temp, collapse = "")

  # Strip HTML tags, collapse runs of whitespace, and trim
  temp <- gsub(pattern = "</?[^>]+>", replacement = "", x = temp, ignore.case = TRUE)
  temp <- str_trim(str_replace_all(temp, "\\s+", " "))

  if (nchar(temp) > 0) {
    temp_corpus2 = Corpus(VectorSource(temp))
    meta(temp_corpus2[[1]], "label") = "ham"
    ham_corpus = c(ham_corpus, temp_corpus2)
  }
}
# Check the length of the ham corpus, the metadata of the 1st email, and the body of the 50th email.
length(ham_corpus)
## [1] 2500
meta(ham_corpus[[1]], "label")
## [1] "ham"
head(ham_corpus[[50]][1])
## [1] "On 28 Aug 2002, Daniel Quinlan wrote:> Dan Kohn writes:> > > Daniel, it's easy enough for you to change the Habeas scores yourself> > on your installation. If Habeas fails to live up to its promise to> > only license the warrant mark to non-spammers and to place all> > violators on the HIL, then I have no doubt that Justin and Craig will> > quickly remove us from the next release. But, you're trying to kill> > Habeas before it has a chance to show any promise.> > I think I've worked on SA enough to understand that I can localize a> score. I'm just not comfortable with using SpamAssassin as a vehicle> for drumming up your business at the expense of our user base.I have to agree here. If Habeas is going to die just because SA does notsupport it, that's a serious problem with the business model; but that isnobody's problem but Habeas's.A possible solution is for Habeas's business model to include some kind ofincentive for users of SA to give it the benefit of the doubt. I have yetto think of an incentive that fits the bill ...On Thu, 29 Aug 2002, Justin Mason wrote:> I don't see a problem supporting it in SpamAssassin -- but I see Dan's> points.> > - high score: as far as I can see, that's because SpamAssassin is> assigning such high scores to legit newsletters these days, and the> Habeas mark has to bring it down below that. :( IMO we have to fix> the high-scorers anyway -- no spam ever *needs* to score over 5 in our> scoring system, 5 == tagged anyway.This is off the topic of the rest of this discussion, but amavisd (in allits incarnations) and MIMEDefang and several other MTA plugins all rejectat SMTP time messages that scores higher than some threshold (often 10). If some new release were to start scoring all spam no higher than 5.1,there'd better be _zero_ FPs, because all those filters would drop theirthresholds to 5.On Thu, 29 Aug 2002, Michael Moncur wrote:> But I agree that there needs to be more focus on eliminating rules that> frequently hit on newsletters. If any newsletters actually use the Habeas> mark, that will be one way to help.Newsletters won't use the mark. Habeas is priced way too high -- a factorof at least 20 over what the market will bear, IMO -- on a per-messagebasis for most typical mailing lists (Lockergnome, say) to afford it.On Thu, 29 Aug 2002, Harold Hallikainen wrote:> Habeus has come up with a very clever way to use existing law to battle> spam. It seems that at some point they could drop the licensing fee to> $1 or less and make all their income off suing the spammers for> copyright infringement.Sorry, that just can't work.If the Habeas mark actually becomes both widespread enough in non-spam,and effectively-enforced enough to be absent from spam, such that, e.g.,SA could assign a positive score to messages that do NOT have it, thenspammers are out of business and Habeas has no one to sue. 
## There's nobodyleft to charge except the people who want (or are forced against theirwill because their mail won't get through otherwise) to use the mark.Conversely, if there are enough spammers forging the mark for Habeas tomake all its income suing them, then the mark is useless for the purposefor which it was designed.Either way it seems to me that, after maybe a couple of lawsuits againstreal spammers and a lot of cease-and-desist letters to clueless Mom&Pops,then either (a) they're out of business, (b) they have to sell the rightsto use the mark to increasingly questionable senders, or (c) they've bothcreated and monopolized a market for \"internet postage stamps\" thateverybody has to pay them for.The latter would be quite a coup if they [*] could pull it off -- they doabsolutely nothing useful, unless you consider threatening people withlawsuits useful, yet still collect a fee either directly or indirectlyfrom everyone on the internet -- effectively we'll be paying them for theprivilege of policing their trademark for them. I don't believe they'llever get that far, but I don't particularly want to help them make it.[*] And I use the term \"they\" loosely, because the whole company could consist of one lawyer if it really got to that point.-------------------------------------------------------This sf.net email is sponsored by:ThinkGeekWelcome to geek heaven.http://thinkgeek.com/sf_______________________________________________Spamassassin-talk mailing listSpamassassin-talk@lists.sourceforge.nethttps://lists.sourceforge.net/lists/listinfo/spamassassin-talk"

Combine the corpora and shuffle

# Combine the spam and ham corpus. Check length of this corpus. Check body of 1000th email.

mail_corpus = c(ham_corpus,spam_corpus)
length(mail_corpus)
## [1] 3000
meta(mail_corpus[[1]],"label")
## [1] "ham"
meta(mail_corpus[[2600]],"label")
## [1] "spam"
# Shuffle the mails in the corpus
set.seed(2020)
mail_corpus <- sample(mail_corpus,length(mail_corpus))

head(mail_corpus[[1000]][1]) 
## [1] "On Mon, 30 Sep 2002, Tom wrote:> If the set passes around enough then more people have these works. the> more folks that have them now, while they are still legal to have, the> likely they will be left behind in the possible/probabale copyright> chillout..and if that doesnt happen then more folks than not will stillWe will be getting BlackNet-like guerilla P2P pretty soon. Packaging it into wormcode with an initial userbase of a few 100 k to Mnodes gives you pretty bulletproof plausible deniability.> have it for uses all manner of shades."

Additional clean-up

# Perform clean-up on the corpus, remove stop words and perform stemming.

mail_corpus <- tm_map(mail_corpus, content_transformer(tolower))  # wrap base R functions so documents keep their tm structure
mail_corpus <- tm_map(mail_corpus, removeNumbers)
mail_corpus <- tm_map(mail_corpus, removePunctuation)
mail_corpus <- tm_map(mail_corpus, stripWhitespace)
mail_corpus <- tm_map(mail_corpus, removeWords, words = stopwords("en"))
mail_corpus <- tm_map(mail_corpus, stemDocument)
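
To see what this pipeline does, the same chain can be applied to a one-document toy corpus (a made-up example; the final line shows the output SnowballC stemming should produce):

toy <- Corpus(VectorSource("Running 3 QUICK tests, quickly!"))
toy <- tm_map(toy, content_transformer(tolower))
toy <- tm_map(toy, removeNumbers)
toy <- tm_map(toy, removePunctuation)
toy <- tm_map(toy, stripWhitespace)
toy <- tm_map(toy, removeWords, words = stopwords("en"))
toy <- tm_map(toy, stemDocument)
as.character(toy[[1]])
## expected: "run quick test quick"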

Create a document term matrix

#Create a document term matrix before and after removing sparse terms

dtm.mail<-DocumentTermMatrix(mail_corpus) 
dtm.mail
## A document-term matrix (3000 documents, 45878 terms)
## 
## Non-/sparse entries: 244494/137389506
## Sparsity           : 100%
## Maximal term length: 196626 
## Weighting          : term frequency (tf)
dtm_mail<-removeSparseTerms(dtm.mail, 1-(10/length(mail_corpus))) #removing sparse terms. 
dtm_mail
## A document-term matrix (3000 documents, 3166 terms)
## 
## Non-/sparse entries: 173600/9324400
## Sparsity           : 98%
## Maximal term length: 97 
## Weighting          : term frequency (tf)
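
One way to verify the document-frequency cutoff implied by the sparsity threshold (a quick sketch; densifying a 3000 x 3166 matrix is cheap at this size):

doc_freq <- colSums(as.matrix(dtm_mail) > 0)   # number of documents containing each term
min(doc_freq)                                  # expected to be roughly 10 or more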

Extract the labels that indicate spam or ham

# Create labels to indicate spam or ham, and check counts
spam_labels <- character()

for (i in 1:length(mail_corpus)){
  tmp <- meta(mail_corpus[[i]], "label")
  spam_labels <- c(spam_labels, tmp)
}
spam_labels <- as.data.frame(spam_labels)
nrow(spam_labels)   # length() on a data frame counts columns, not rows
## [1] 3000
table(spam_labels)
## spam_labels
##  ham spam 
## 2500  500
colnames(spam_labels)<-"Spam_or_Ham"
dim(spam_labels)
## [1] 3000    1
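
Growing a vector with c() inside a loop is slow for large corpora; an equivalent one-liner using standard tm accessors (shown as a sketch) would be:

spam_labels_v <- sapply(mail_corpus, function(d) unlist(meta(d, "label")))
table(spam_labels_v)   # should match the counts above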

Create a container to serve as input to the three models, then train and classify

# Create a container for further analysis

N<-length(spam_labels)

container<-create_container(dtm_mail, labels = spam_labels$Spam_or_Ham, trainSize = 1:2250, testSize = 2251:N, virgin = FALSE)

svm_model <- train_model(container, "SVM")
tree_model <- train_model(container, "TREE")
maxent_model <- train_model(container, "MAXENT")

svm_out <- classify_model(container, svm_model)
tree_out <- classify_model(container, tree_model)
maxent_out <- classify_model(container, maxent_model)

Compare relative performance across 3 models

# Compare relative performance across 3 models

all_labels <- data.frame(
    correct_label = spam_labels$Spam_or_Ham[2251:N],
    svm = as.character(svm_out[,1]),
    tree = as.character(tree_out[,1]),
    maxent = as.character(maxent_out[,1]),
    stringAsFactors = F)


# SVM performance: proportion of test emails where the predicted label matches the true one
svm_table <- table(all_labels[,1] == all_labels[,2])
round(prop.table(svm_table),3)
## 
## FALSE  TRUE 
##  0.28  0.72
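# Tree performance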
tree_table <- table(all_labels[,1] == all_labels[,3])
round(prop.table(tree_table),3)
## 
## FALSE  TRUE 
## 0.254 0.746
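# Maximum entropy performance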
maxent_table <- table(all_labels[,1] == all_labels[,4])
round(prop.table(maxent_table),3)
## 
## FALSE  TRUE 
## 0.326 0.674
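
The raw agreement rates above hide the class imbalance (2500 ham vs. 500 spam). RTextTools can also report per-label precision, recall, and F-scores through its standard analytics step; a sketch:

analytics <- create_analytics(container, cbind(svm_out, tree_out, maxent_out))
summary(analytics)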

Conclusion

From the above, the decision tree model performed best on this dataset, classifying 74.6% of the test emails correctly, followed by the SVM at 72.0%, while the maximum entropy model performed worst at 67.4%.