Assignment

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/


For the assignment I used the suggested dataset: I downloaded 20050311_spam_2.tar.bz2 and 20030228_easy_ham_2.tar.bz2 from the link provided (https://spamassassin.apache.org/publiccorpus/). Each archive was extracted in two steps (bzip2 decompression, then untarring) into a local directory, leaving two uncompressed folders containing the ham and spam emails. Below I load the emails into R, build a document corpus, split it into training and test data, and finally train and evaluate several supervised classification models (SVM, decision tree, and maximum entropy).
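
The extraction step can also be scripted directly from R; a minimal sketch, assuming the two archives were downloaded into the working directory and should be unpacked into the same folder used below (untar() handles both the bzip2 and tar layers in a single call):

# Extract the two bzip2-compressed tar archives into the local data directory
# (paths are assumptions; adjust to where the files were actually saved)
untar("20050311_spam_2.tar.bz2", exdir = "~/DATA 607/spamham")
untar("20030228_easy_ham_2.tar.bz2", exdir = "~/DATA 607/spamham")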

Setting up environment

Load Required Libraries

library(tm)
## Loading required package: NLP
library(stringr)
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.3.2
library(RTextTools)
## Warning: package 'RTextTools' was built under R version 3.3.2
## Loading required package: SparseM
## Warning: package 'SparseM' was built under R version 3.3.2
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## 
## Attaching package: 'RTextTools'
## The following objects are masked from 'package:SnowballC':
## 
##     getStemLanguages, wordStem

Load spam emails, create a temporary spam corpus and add meta information

spam <- DirSource("~/DATA 607/spamham/spam_2")
spam_corpus <- Corpus(spam, readerControl=list(reader=readPlain))
length(spam)
## [1] 1397
meta(spam_corpus, "filter") <- "spam"

Load ham emails, create a temporary ham corpus and add meta information

ham <- DirSource("~/DATA 607/spamham/easy_ham_2")
ham_corpus <- Corpus(ham, readerControl=list(reader=readPlain))
length(ham)
## [1] 1401
meta(ham_corpus, "filter") <- "ham"

Combine spam and ham corpus and check length

td_corpus = c(spam_corpus, ham_corpus)
length(td_corpus)
## [1] 2798
td_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 1
## Content:  documents: 2798
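
As a quick sanity check before shuffling, the document-level labels can be tabulated to confirm that both classes made it into the combined corpus; a small sketch (the counts should match the lengths reported above):

# Count documents per "filter" label in the combined corpus
table(unlist(meta(td_corpus, "filter")))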

Term Document Matrix

The documents were randomized and a term-document matrix was created from the shuffled corpus

set.seed(1)
tdf_corpus = sample(td_corpus)
head(meta(tdf_corpus, "filter"))
##      filter
## 743    spam
## 1041   spam
## 1602    ham
## 2539    ham
## 564    spam
## 2510    ham
tdm <- TermDocumentMatrix(tdf_corpus)
tdm
## <<TermDocumentMatrix (terms: 165784, documents: 2798)>>
## Non-/sparse entries: 751527/463112105
## Sparsity           : 100%
## Maximal term length: 990
## Weighting          : term frequency (tf)
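
To get a feel for the raw vocabulary before any cleaning, tm's findFreqTerms() can list the terms that exceed a frequency threshold; a sketch, with the threshold chosen arbitrarily and the output omitted:

# Terms that occur at least 2000 times across the raw term-document matrix
findFreqTerms(tdm, lowfreq = 2000)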

Data Cleansing

In this step numbers, punctuation, and stopwords were removed, the text was converted to lower case, and stemming was performed

tdf_corpus <- tm_map(tdf_corpus, removeNumbers)
tdf_corpus <- tm_map(tdf_corpus, str_replace_all, pattern = "[[:punct:]]", replacement = " ")
tdf_corpus <- tm_map(tdf_corpus, removeWords, words = stopwords("en"))
tdf_corpus <- tm_map(tdf_corpus, tolower)
tdf_corpus <- tm_map(tdf_corpus, stemDocument)
tdf_corpus <- tm_map(tdf_corpus, PlainTextDocument)
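
A quick look at one cleaned document helps verify that the transformations behaved as expected; a sketch (output omitted):

# Inspect the first document after cleaning; the text should be lower case,
# stemmed, and free of numbers, punctuation and English stopwords
inspect(tdf_corpus[1])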

Remove Sparse Terms

dtm <- DocumentTermMatrix(tdf_corpus)
dtm <- removeSparseTerms(dtm, 1 - (10/length(tdf_corpus)))
dtm
## <<DocumentTermMatrix (documents: 2798, terms: 6442)>>
## Non-/sparse entries: 489843/17534873
## Sparsity           : 97%
## Maximal term length: 70
## Weighting          : term frequency (tf)
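
The sparsity cutoff above is easiest to read as a minimum document frequency: with 2798 documents, any term that appears in fewer than roughly 10 of them is dropped.

# The cutoff passed to removeSparseTerms(), written out
1 - (10/2798)   # ~0.9964: terms absent from more than 99.64% of documents are removed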

Training and Testing

Create Container

An RTextTools container was created to train the different models

labels <- as.factor(unlist(meta(tdf_corpus, "filter")[,1]))
class(labels)
## [1] "factor"
N <- length(labels)
container <- create_container(
             dtm,
             labels = labels,
             trainSize = 1:2300,
             testSize = 2301:N,
             virgin = FALSE
             )
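
For reference, this split assigns roughly 82% of the shuffled documents to training and leaves 498 documents for testing:

# Proportion of documents used for training, and the size of the test set
2300 / 2798   # ~0.822
N - 2300      # 498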

Estimation procedure

svm_model <- train_model(container, "SVM")
tree_model <- train_model(container, "TREE")
maxent_model <- train_model(container, "MAXENT")
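
RTextTools can also fit several algorithms in a single call via train_models() and classify_models(); a sketch equivalent to the three separate calls above:

# Fit the SVM, decision tree and maximum entropy models in one step
models  <- train_models(container, algorithms = c("SVM", "TREE", "MAXENT"))
results <- classify_models(container, models)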

Evaluation

svm_out <- classify_model(container, svm_model)
tree_out <- classify_model(container, tree_model)
maxent_out <- classify_model(container, maxent_model)

labels_out <- data.frame(
              correct_label = labels[2301:N],
              svm = as.character(svm_out[,1]),
              tree = as.character(tree_out[,1]),
              maxent = as.character(maxent_out[,1]),
              stringsAsFactors = FALSE)

##SVM performance             
table(labels_out[,1] == labels_out[,2])
## 
## FALSE  TRUE 
##     7   491
prop.table(table(labels_out[,1] == labels_out[,2]))
## 
##      FALSE       TRUE 
## 0.01405622 0.98594378
##Decision tree performance
table(labels_out[,1] == labels_out[,3])
## 
## FALSE  TRUE 
##    26   472
prop.table(table(labels_out[,1] == labels_out[,3]))
## 
##      FALSE       TRUE 
## 0.05220884 0.94779116
##Maximum entropy performance
table(labels_out[,1] == labels_out[,4])
## 
## FALSE  TRUE 
##     6   492
prop.table(table(labels_out[,1] == labels_out[,4]))
## 
##      FALSE       TRUE 
## 0.01204819 0.98795181
SVM <- table(actual = labels_out[,1], predicted = labels_out[,2])
SVM
##       predicted
## actual ham spam
##   ham  221    4
##   spam   3  270
Decision_Tree <- table(actual = labels_out[,1], predicted = labels_out[,3])
Decision_Tree
##       predicted
## actual ham spam
##   ham  219    6
##   spam  20  253
Maximum_Entropy <- table(actual = labels_out[,1], predicted = labels_out[,4])
Maximum_Entropy
##       predicted
## actual ham spam
##   ham  224    1
##   spam   5  268
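
Beyond the raw confusion tables, RTextTools can summarise per-algorithm precision, recall and F-scores with create_analytics(); a sketch (output omitted):

# Combine the three prediction sets and compute summary analytics
analytics <- create_analytics(container, cbind(svm_out, tree_out, maxent_out))
summary(analytics)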

Confusion matrix plot

fourfoldplot(SVM, color = c("#CC6666", "#99CC99"),
            conf.level = 0, margin = 1, main = "SVM Confusion Matrix")

fourfoldplot(Decision_Tree, color = c("#CC6666", "#99CC99"),
            conf.level = 0, margin = 1, main = "Decision Tree Confusion Matrix")

fourfoldplot(Maximum_Entropy , color = c("#CC6666", "#99CC99"),
            conf.level = 0, margin = 1, main = "Maximum Entropy Confusion Matrix")

Conclusion

In this assignment I used about 82% of the emails as training data and the remainder as test data, and I compared three models on the held-out documents. All three achieved over 94% accuracy: maximum entropy performed best, followed closely by the SVM, while the decision tree was the least accurate. Overall, RTextTools proved to be a reliable, fast, and simple tool for this kind of basic text classification.
