Project 4 - Text Mining

It can be useful to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/
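
A minimal sketch of pulling the data into R is below. The archive file name is an assumption (check the corpus index page for the current list), and the extracted folders can then be arranged into the train/ and test/ directories used later.

# Sketch: download and unpack one SpamAssassin archive
resource_dir <- "/Users/Olga/Documents/R/"
corpus_url <- "https://spamassassin.apache.org/publiccorpus/"
archive <- "20030228_easy_ham.tar.bz2"   # assumed file name; see the index page

download.file(paste0(corpus_url, archive), paste0(resource_dir, archive))
untar(paste0(resource_dir, archive), exdir = resource_dir)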

library(tm)
## Loading required package: NLP
library(RTextTools)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library(SparseM)

Setting paths to the training and test folders

resource_dir <- "/Users/Olga/Documents/R/"
ham_train_dir <- paste0(resource_dir, "train/easy_ham/")
spam_test_dir <- paste0(resource_dir, "test/spam/")
spam_train_dir <- paste0(resource_dir, "train/spam_2/")
ham_test_dir <- paste0(resource_dir, "test/easy_ham_2/")

Creating a corpus for each message type

spam_train <- VCorpus(DirSource(spam_train_dir)) 
ham_train <- VCorpus(DirSource(ham_train_dir))
spam_test <- VCorpus(DirSource(spam_test_dir))
ham_test <- VCorpus(DirSource(ham_test_dir))
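
An optional sanity check that each corpus actually picked up the expected number of messages:

# How many documents ended up in each corpus
sapply(list(spam_train = spam_train,
            ham_train  = ham_train,
            spam_test  = spam_test,
            ham_test   = ham_test),
       length)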

Combining all corpora into one

corpus <- c(spam_train, ham_train,spam_test,ham_test)

Preparing the data: fixing encoding issues, converting to lower case, and removing punctuation.

corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, "us-ascii", sub="byte")))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
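
Depending on how noisy the e-mails are, a few more standard tm cleanup steps can help. A sketch of the usual candidates (these were not applied for the results below, so skip them if you want to reproduce the numbers):

# Optional extra cleanup (not used for the results below)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stripWhitespace)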

Building a document-term matrix

dtm_email <- DocumentTermMatrix(corpus)

Removing sparse terms, i.e. keeping only terms that appear in at least roughly 10% of the documents

dtm_email <- removeSparseTerms(dtm_email, 0.9)
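
A 0.9 threshold is fairly aggressive; a quick way to see how much of the vocabulary survived (the exact counts depend on the corpus snapshot):

# Documents and terms remaining after trimming sparse terms; compare with
# dim(DocumentTermMatrix(corpus)) to see how many terms were dropped
dim(dtm_email)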

Labeling the data. The labels are matched to documents by position, so they must follow the same order as the combined corpus: spam_train, ham_train, spam_test, ham_test.

email_labels <- c(rep("spam", length(spam_train)),
                  rep("ham", length(ham_train)),
                  rep("spam", length(spam_test)),
                  rep("ham", length(ham_test)))
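
A quick check that the label vector lines up with the combined corpus (the matching is purely positional, so the lengths must agree):

# Labels are matched to documents by position, so the lengths must agree
stopifnot(length(email_labels) == length(corpus))
table(email_labels)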

Calculating the train, test, and total dataset sizes

train_length <- length(spam_train) + length(ham_train)
test_length <- length(ham_test) + length(spam_test)
dataset_size <- length(corpus)

Creating a container for analysis

email_container <- create_container(dtm_email,
                                    labels = email_labels,
                                    trainSize = 1:train_length,
                                    testSize = (train_length+1):dataset_size,
                                    virgin = FALSE)

Let’s try all of the training algorithms RTextTools has to offer: “SVM”, “SLDA”, “BOOSTING”, “BAGGING”, “RF”, “GLMNET”, “TREE”, and “MAXENT”. I’m not going to use NNET, as it requires a slightly different approach.

svm_model <- train_model(email_container, "SVM")
slda_model <- train_model(email_container, "SLDA")
boosting_model <- train_model(email_container, "BOOSTING")
bagging_model <- train_model(email_container, "BAGGING")
rf_model <- train_model(email_container, "RF")
glmnet_model <- train_model(email_container, "GLMNET")
tree_model <- train_model(email_container, "TREE")
maxent_model <- train_model(email_container, "MAXENT")
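
For reference, RTextTools also ships batch wrappers that train and apply several algorithms at once; a sketch using train_models()/classify_models() (the per-model calls above are what the results below are based on):

# Sketch: the same eight models through the batch wrappers
algos <- c("SVM", "SLDA", "BOOSTING", "BAGGING", "RF", "GLMNET", "TREE", "MAXENT")
all_models <- train_models(email_container, algos)
all_results <- classify_models(email_container, all_models)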

Getting the original labels for the test dataset

correct_email_type <- email_labels[(train_length+1):dataset_size]

Classifying the test documents with each model and collecting the predictions into a data frame

classifying <- c(classify_model(email_container, svm_model),
                 classify_model(email_container, slda_model),
                 classify_model(email_container, boosting_model),
                 classify_model(email_container, bagging_model),
                 classify_model(email_container, rf_model),
                 classify_model(email_container, glmnet_model),
                 classify_model(email_container, tree_model),
                 classify_model(email_container, maxent_model),
                 as.data.frame(correct_email_type)
                )

results <- data.frame(classifying)
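
Each classify_model() call contributes two columns per model (a predicted label and a probability), so the predicted labels end up in the odd-numbered columns 1, 3, …, 15, with the true labels appended as column 17. A quick look at the layout:

# Two columns per model (label, probability) plus the true labels at the end
names(results)
ncol(results)   # 8 models * 2 columns + 1 = 17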

Results for the SVM model

table(results[,1] == results[,17])/test_length
## 
##     FALSE      TRUE 
## 0.2756444 0.7243556

Results for the SLDA model

table(results[,3] == results[,17])/test_length
## 
##     FALSE      TRUE 
## 0.2830089 0.7169911

Results for the BOOSTING model

table(results[,5] == results[,17])/test_length
## 
##     FALSE      TRUE 
## 0.2656497 0.7343503

Results for the BAGGING model

table(results[,7] == results[,17])/test_length
## 
##     FALSE      TRUE 
## 0.2503945 0.7496055

Results for the RF model

table(results[,9] == results[,17])/test_length
## 
##     FALSE      TRUE 
## 0.2640715 0.7359285

Results for the GLMNET model

table(results[,11] == results[,17])/test_length
## 
##     FALSE      TRUE 
## 0.2593372 0.7406628

Results for the TREE model

table(results[,13] == results[,17])/test_length
## 
##     FALSE      TRUE 
## 0.2735402 0.7264598

Results for the MAXENT model

table(results[,15] == results[,17])/test_length
## 
##     FALSE      TRUE 
## 0.2777486 0.7222514
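
Before declaring a winner, the same comparison can be run for every model at once; a small sketch that reproduces the TRUE shares above, one value per model:

# Accuracy of every model: predicted labels live in the odd-numbered columns
label_cols <- seq(1, 15, by = 2)
sapply(results[label_cols], function(pred) mean(pred == results[, 17]))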

And the winner is… the Bagging algorithm.