#Load required packages
library(tm)
suppressWarnings(library(RTextTools))
The spam and ham files were downloaded and unzipped from https://spamassassin.apache.org/publiccorpus/.
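The archives can also be fetched and unpacked directly from R. A minimal sketch, assuming the 20030228 archive names from the corpus listing (adjust the file names to whichever snapshots you download):

corpusURL <- "https://spamassassin.apache.org/publiccorpus/"
archives <- c("20030228_spam.tar.bz2", "20030228_easy_ham.tar.bz2",
              "20030228_hard_ham.tar.bz2")
for (f in archives) {
  download.file(paste0(corpusURL, f), f, mode = "wb")  #binary mode for Windows
  untar(f)  #extracts into spam/, easy_ham/, hard_ham/
}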
#Set up working directory
setwd("C:/Users/Robert/Desktop/CUNY/GitHub/R/data/IS607_Homework11/")
#Remove the cmds index file from each folder first, so it is not read into the corpora
if (file.exists("spam/cmds")) file.remove("spam/cmds")
if (file.exists("easy_ham/cmds")) file.remove("easy_ham/cmds")
if (file.exists("hard_ham/cmds")) file.remove("hard_ham/cmds")
#Create spam and ham corpora
spam <- Corpus(DirSource("spam"), readerControl = list(language = "en"))
easy_ham <- Corpus(DirSource("easy_ham"), readerControl = list(language = "en"))
hard_ham <- Corpus(DirSource("hard_ham"), readerControl = list(language = "en"))
#Add meta labels
meta(spam, tag = "type") <- "spam"
meta(easy_ham, tag = "type") <- "easy_ham"
meta(hard_ham, tag = "type") <- "hard_ham"
#Combine the corpora: all three sets, spam + easy ham, and spam + hard ham
combinedcorpusAll <- c(spam, easy_ham, hard_ham, recursive = TRUE)
combinedcorpusEasy <- c(spam, easy_ham, recursive = TRUE)
combinedcorpusHard <- c(spam, hard_ham, recursive = TRUE)
#Create reduced and randomized corpora (seed set so the sample is reproducible)
set.seed(607)
combinedResampledCorpusAll <- sample(combinedcorpusAll, 750)
combinedResampledCorpusEasy <- sample(combinedcorpusEasy, 750)
combinedResampledCorpusHard <- sample(combinedcorpusHard, 750)
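Because sample() does not stratify by label, the spam/ham mix of each resampled corpus varies from run to run; it is worth checking the balance before modeling, e.g. for the combined sample:

table(unlist(meta(combinedResampledCorpusAll, "type")[,1]))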
#Build document-term matrices
spamTDMAll <- DocumentTermMatrix(combinedResampledCorpusAll)
spamTDMEasy <- DocumentTermMatrix(combinedResampledCorpusEasy)
spamTDMHard <- DocumentTermMatrix(combinedResampledCorpusHard)
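Checking the dimensions shows how wide these matrices are; if memory becomes an issue, tm's removeSparseTerms() can drop very rare terms (the 0.998 threshold below is illustrative, not tuned):

dim(spamTDMAll)  #documents x terms
spamTDMAllReduced <- removeSparseTerms(spamTDMAll, 0.998)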
#Collect meta labels
typeAll <- unlist(meta(combinedResampledCorpusAll, "type")[,1])
typeEasy <- unlist(meta(combinedResampledCorpusEasy, "type")[,1])
typeHard <- unlist(meta(combinedResampledCorpusHard, "type")[,1])
head(typeAll)
## [1] "easy_ham" "easy_ham" "easy_ham" "spam" "spam" "easy_ham"
head(typeEasy)
## [1] "easy_ham" "easy_ham" "spam" "easy_ham" "easy_ham" "easy_ham"
head(typeHard)
## [1] "spam" "spam" "spam" "hard_ham" "spam" "spam"
#Create containers and designate training vs. testing rows (first half train, second half test)
spamContainerAll <- create_container(spamTDMAll, labels = typeAll, trainSize = 1:375, testSize = 376:length(typeAll), virgin = FALSE)
spamContainerEasy <- create_container(spamTDMEasy, labels = typeEasy, trainSize = 1:375, testSize = 376:length(typeEasy), virgin = FALSE)
spamContainerHard <- create_container(spamTDMHard, labels = typeHard, trainSize = 1:375, testSize = 376:length(typeHard), virgin = FALSE)
#Train models on the combined set
svm_model_all <- train_model(spamContainerAll, "SVM")
boosting_model_all <- train_model(spamContainerAll, "BOOSTING")
glmnet_model_all <- train_model(spamContainerAll, "GLMNET")
maxent_model_all <- train_model(spamContainerAll, "MAXENT")
#Train models on the easy set
svm_model_easy <- train_model(spamContainerEasy, "SVM")
boosting_model_easy <- train_model(spamContainerEasy, "BOOSTING")
glmnet_model_easy <- train_model(spamContainerEasy, "GLMNET")
maxent_model_easy <- train_model(spamContainerEasy, "MAXENT")
#Train models on the hard set
svm_model_hard <- train_model(spamContainerHard, "SVM")
boosting_model_hard <- train_model(spamContainerHard, "BOOSTING")
glmnet_model_hard <- train_model(spamContainerHard, "GLMNET")
maxent_model_hard <- train_model(spamContainerHard, "MAXENT")
#Classify the combined set
svm_classified_all <- classify_model(spamContainerAll, svm_model_all)
boosting_classified_all <- classify_model(spamContainerAll, boosting_model_all)
glmnet_classified_all <- classify_model(spamContainerAll, glmnet_model_all)
maxent_classified_all <- classify_model(spamContainerAll, maxent_model_all)
#Classify the easy set
svm_classified_easy <- classify_model(spamContainerEasy, svm_model_easy)
boosting_classified_easy <- classify_model(spamContainerEasy, boosting_model_easy)
glmnet_classified_easy <- classify_model(spamContainerEasy, glmnet_model_easy)
maxent_classified_easy <- classify_model(spamContainerEasy, maxent_model_easy)
#Classify the hard set
svm_classified_hard <- classify_model(spamContainerHard, svm_model_hard)
boosting_classified_hard <- classify_model(spamContainerHard, boosting_model_hard)
glmnet_classified_hard <- classify_model(spamContainerHard, glmnet_model_hard)
maxent_classified_hard <- classify_model(spamContainerHard, maxent_model_hard)
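RTextTools can also fit and apply several algorithms in one pass with train_models() and classify_models(); a sketch of the equivalent batch workflow for the combined container:

algos <- c("SVM", "BOOSTING", "GLMNET", "MAXENT")
models_all <- train_models(spamContainerAll, algorithms = algos)
results_all <- classify_models(spamContainerAll, models_all)  #one data frame, all algorithms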
#Convert the combined results to a data frame
classified_DF_all <- data.frame(
  label = typeAll[376:length(typeAll)],
  svm = svm_classified_all[,1],
  boosting = boosting_classified_all[,1],
  glmnet = glmnet_classified_all[,1],
  maxent = maxent_classified_all[,1],
  stringsAsFactors = FALSE)
#Convert the easy results to a data frame
classified_DF_easy <- data.frame(
  label = typeEasy[376:length(typeEasy)],
  svm = svm_classified_easy[,1],
  boosting = boosting_classified_easy[,1],
  glmnet = glmnet_classified_easy[,1],
  maxent = maxent_classified_easy[,1],
  stringsAsFactors = FALSE)
#Convert the hard results to a data frame
classified_DF_hard <- data.frame(
  label = typeHard[376:length(typeHard)],
  svm = svm_classified_hard[,1],
  boosting = boosting_classified_hard[,1],
  glmnet = glmnet_classified_hard[,1],
  maxent = maxent_classified_hard[,1],
  stringsAsFactors = FALSE)
#Preview the classified results
head(classified_DF_all)
## label svm boosting glmnet maxent
## 1 spam spam spam spam spam
## 2 easy_ham easy_ham easy_ham easy_ham easy_ham
## 3 easy_ham easy_ham easy_ham easy_ham easy_ham
## 4 hard_ham hard_ham hard_ham easy_ham hard_ham
## 5 easy_ham easy_ham easy_ham easy_ham easy_ham
## 6 easy_ham easy_ham easy_ham easy_ham easy_ham
head(classified_DF_easy)
## label svm boosting glmnet maxent
## 1 easy_ham easy_ham easy_ham easy_ham easy_ham
## 2 easy_ham easy_ham easy_ham easy_ham easy_ham
## 3 easy_ham easy_ham easy_ham easy_ham easy_ham
## 4 easy_ham easy_ham easy_ham easy_ham easy_ham
## 5 easy_ham easy_ham easy_ham easy_ham easy_ham
## 6 easy_ham easy_ham easy_ham easy_ham easy_ham
head(classified_DF_hard)
## label svm boosting glmnet maxent
## 1 spam spam spam spam spam
## 2 spam spam spam spam spam
## 3 hard_ham hard_ham hard_ham hard_ham hard_ham
## 4 spam spam spam spam spam
## 5 spam spam spam spam spam
## 6 spam spam spam spam spam
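Beyond the raw agreement rates below, RTextTools' create_analytics() summarizes per-algorithm precision, recall, and F-scores in one step; a sketch for the combined container, using the four result data frames built above:

analytics_all <- create_analytics(spamContainerAll,
    cbind(svm_classified_all, boosting_classified_all,
          glmnet_classified_all, maxent_classified_all))
summary(analytics_all)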
##Support Vector Machine Performance (combined set)
prop.table(table(classified_DF_all[,1] == classified_DF_all[,2]))
##
## FALSE TRUE
## 0.03989362 0.96010638
##Boosting Performance (combined set)
prop.table(table(classified_DF_all[,1] == classified_DF_all[,3]))
##
## FALSE TRUE
## 0.005319149 0.994680851
##glmnet Performance (combined set)
prop.table(table(classified_DF_all[,1] == classified_DF_all[,4]))
##
## FALSE TRUE
## 0.02925532 0.97074468
##Max-Entropy Performance (combined set)
prop.table(table(classified_DF_all[,1] == classified_DF_all[,5]))
##
## FALSE TRUE
## 0.04787234 0.95212766
##Support Vector Machine Performance (easy set)
prop.table(table(classified_DF_easy[,1] == classified_DF_easy[,2]))
##
## FALSE TRUE
## 0.01861702 0.98138298
##Boosting Performance (easy set)
prop.table(table(classified_DF_easy[,1] == classified_DF_easy[,3]))
##
## TRUE
## 1
##glmnet Performance (easy set)
prop.table(table(classified_DF_easy[,1] == classified_DF_easy[,4]))
##
## TRUE
## 1
##Max-Entropy Performance (easy set)
prop.table(table(classified_DF_easy[,1] == classified_DF_easy[,5]))
##
## FALSE TRUE
## 0.01329787 0.98670213
##Support Vector Machine Performance (hard set)
prop.table(table(classified_DF_hard[,1] == classified_DF_hard[,2]))
##
## FALSE TRUE
## 0.06117021 0.93882979
##Boosting Performance (hard set)
prop.table(table(classified_DF_hard[,1] == classified_DF_hard[,3]))
##
## FALSE TRUE
## 0.02393617 0.97606383
##glmnet Performance (hard set)
prop.table(table(classified_DF_hard[,1] == classified_DF_hard[,4]))
##
## FALSE TRUE
## 0.03191489 0.96808511
##Max-Entropy Performance (hard set)
prop.table(table(classified_DF_hard[,1] == classified_DF_hard[,5]))
##
## FALSE TRUE
## 0.04787234 0.95212766
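The overall accuracies above do not show which direction the errors run (spam flagged as ham, or ham flagged as spam); a confusion matrix makes that visible, e.g. for SVM on the hard set:

table(actual = classified_DF_hard$label, predicted = classified_DF_hard$svm)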
When the models were first run without the hard_ham data, the classification algorithms all scored closer to 99%. Adding the more complex messages (and assuming the data source labelled the e-mails accurately) reduced their accuracy.
Classifying the "easy" set was successful at roughly 98% or better for each of the algorithms, with boosting and glmnet labelling every test message correctly. The "hard" set was more difficult to classify, with some of the algorithms missing up to about 6% of the messages. The combined set, as expected, fell between the two. For a better understanding of the text-mining and spam-classification process, it would be worth experimenting on other datasets, determining an optimal sample size, and finding novel ways to plot the results.
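As a starting point for plotting, a grouped barplot of the accuracies reported above (values transcribed and rounded from the prop.table output):

acc <- matrix(c(0.960, 0.995, 0.971, 0.952,
                0.981, 1.000, 1.000, 0.987,
                0.939, 0.976, 0.968, 0.952),
              nrow = 3, byrow = TRUE,
              dimnames = list(c("combined", "easy", "hard"),
                              c("SVM", "Boosting", "glmnet", "MaxEnt")))
barplot(t(acc), beside = TRUE, ylim = c(0, 1), legend.text = TRUE,
        args.legend = list(x = "bottomright"),
        main = "Classifier accuracy by corpus", ylab = "Accuracy")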