D607 Project 4 - Text Mining

Kyle Gilde

April 7, 2017

The Task

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder).

Loaded Packages

## prettydoc  TRUE
## SnowballC  TRUE
## knitr      TRUE
## rvest      TRUE
## stringr    TRUE
## tm         TRUE
## RTextTools TRUE
## caret      TRUE
## dplyr      TRUE
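
The loading code isn’t shown above; a minimal sketch that would produce a check like this (package names taken from the output) might be:

# attempt to load each required package, returning TRUE/FALSE for each
pkgs <- c("prettydoc", "SnowballC", "knitr", "rvest", "stringr",
          "tm", "RTextTools", "caret", "dplyr")
sapply(pkgs, require, character.only = TRUE)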

Download & Extract Email Files

url <- "http://spamassassin.apache.org/old/publiccorpus/"

# parse the HTML
files <- url %>% read_html() %>% html_nodes("a") %>% html_attr("href")

# define a string-concatenation operator
"%&%" <- function(x, y) paste0(x, y)

# select the 2 archives needed: 20030228_easy_ham.tar.bz2 & 20050311_spam_2.tar.bz2
my_files <- files[c(9, 14)]
file_urls <- url %&% my_files

# create the local data directory if it doesn't already exist
data_dir <- "data-sets/spamham"
if (!dir.exists(data_dir)) dir.create(data_dir, recursive = TRUE)

# download and extract the bzip2-compressed tarballs
for (i in seq_along(my_files)) {
    if (!my_files[i] %in% list.files(data_dir)) {
        my_dest <- file.path(data_dir, my_files[i])
        download.file(file_urls[i], destfile = my_dest)
        # untar() decompresses .tar.bz2 archives directly
        untar(my_dest, exdir = data_dir)
    }
}

Let’s take a look at the downloaded files and the directories created:

(afile <- list.files(data_dir))
## [1] "20030228_easy_ham.tar.bz2" "20050311_spam_2.tar.bz2"  
## [3] "easy_ham"                  "spam_2"
head(list.files(data_dir %&% "/" %&% afile[3]))
## [1] "00001.7c53336b37003a9286aba55d2945844c"
## [2] "00002.9c4069e25e1ef370c078db7ee85ff9ac"
## [3] "00003.860e3c3cee1b42ead714c5c874fe25f7"
## [4] "00004.864220c5b6930b209cc287c361c99af1"
## [5] "00005.bf27cdeaf0b8c4647ecd61b1d09da613"
## [6] "00006.253ea2f9a9cc36fa0b1129b04b806608"

Now let’s create the two corpora, one for spam and one for ham.

# Create the 2 corpora; the emails are English, so set the language
# metadata to "en" (it is used downstream by stemDocument)
spam_corpus <- VCorpus(DirSource(data_dir %&% "/" %&% "spam_2"),
                       readerControl = list(language = "en"))

ham_corpus <- VCorpus(DirSource(data_dir %&% "/" %&% "easy_ham"),
                      readerControl = list(language = "en"))

Add 2 metadata attributes: email type and group

For the “group” attribute, we alternate between assigning the values “test” and “train” to each document.

# email type attribute
email_attr <- "email_type"
attr_value1 <- "spam"
attr_value2 <- "ham"

# group attribute
group_attr <- "group"

# loop through spam corpus and create metadata
for (i in 1:length(spam_corpus)) {
    meta(spam_corpus[[i]], email_attr) <- attr_value1
    meta(spam_corpus[[i]], group_attr) <- ifelse(i%%2 == 0, "test", "train")
}

# loop through ham corpus and create metadata
for (i in 1:length(ham_corpus)) {
    meta(ham_corpus[[i]], email_attr) <- attr_value2
    meta(ham_corpus[[i]], group_attr) <- ifelse(i%%2 == 0, "test", "train")
}

Let’s combine the corpora & see how many documents were assigned to the Test and Train groups.

# combine corpora
my_corpus <- c(ham_corpus, spam_corpus)

email_types <- unlist(meta(my_corpus, type = "local", tag = "email_type"))
group_list <- unlist(meta(my_corpus, type = "local", tag = "group"))


addmargins(table(email_types, group_list))
##            group_list
## email_types test train  Sum
##        ham  1250  1251 2501
##        spam  698   699 1397
##        Sum  1948  1950 3898

Clean the Corpus & Create a Document-Term Matrix with Sparse Terms Removed

# normalize the text: drop numbers, lowercase, remove English stopwords,
# stem, then strip punctuation and extra whitespace
cleaned_corpus <- tm_map(my_corpus, removeNumbers)
cleaned_corpus <- tm_map(cleaned_corpus, content_transformer(tolower))
cleaned_corpus <- tm_map(cleaned_corpus, removeWords, words = stopwords("en"))
cleaned_corpus <- tm_map(cleaned_corpus, stemDocument)
cleaned_corpus <- tm_map(cleaned_corpus, removePunctuation, preserve_intra_word_dashes = TRUE)
cleaned_corpus <- tm_map(cleaned_corpus, stripWhitespace)


# Create the Document-Term Matrix
dtm <- DocumentTermMatrix(cleaned_corpus)
# remove sparse terms: keep only terms appearing in at least 10 documents
dtm <- removeSparseTerms(dtm, 1 - (10/length(cleaned_corpus)))

inspect(dtm)
## <<DocumentTermMatrix (documents: 3898, terms: 6056)>>
## Non-/sparse entries: 487070/23119218
## Sparsity           : 98%
## Maximal term length: 74
## Weighting          : term frequency (tf)
## Sample             :
##                                         Terms
## Docs                                     aug esmtp jmlocalhost localhost
##   00028.60393e49c90f750226bee6381eb3e69d   0     2           0         0
##   00051.8b17ce16ace4d5845e2299c0123e1f14   0     2           0         0
##   00265.d0ebd6ba8f3e2b8d71e9cdaa2ec6fd91   0     5           0         4
##   00570.d98ca90ac201b5d881f2397c95838eb2   0     5           2         5
##   00670.be029e37187b8615a231865e3663dcf9   0     5           2         4
##   00677.b957e34b4dd0d9263b56bf71b1168d8a   0     4           2         3
##   00765.ea01c46568902b1338c9685b55d77f6c   0     5           2         3
##   00907.74983d9d0d6ee3c681a48cf893f123b5   0     4           0         1
##   00942.727cb1619115cdee240fa418da19dd1f   0     4           2         3
##   01083.a6b3c50be5abf782b585995d2c11176b   0     4           0         1
##                                         Terms
## Docs                                     mon oct postfix received sep wed
##   00028.60393e49c90f750226bee6381eb3e69d   4   0       2        2   0   0
##   00051.8b17ce16ace4d5845e2299c0123e1f14   0   0       2        2   0   0
##   00265.d0ebd6ba8f3e2b8d71e9cdaa2ec6fd91   0   8       3        6   0   4
##   00570.d98ca90ac201b5d881f2397c95838eb2   0   0       3        6   8   0
##   00670.be029e37187b8615a231865e3663dcf9   0   0       3        6   8   0
##   00677.b957e34b4dd0d9263b56bf71b1168d8a   0   0       3        6   9   0
##   00765.ea01c46568902b1338c9685b55d77f6c   0   0       3        6   8   0
##   00907.74983d9d0d6ee3c681a48cf893f123b5   8   0       0        8   0   0
##   00942.727cb1619115cdee240fa418da19dd1f   0   9       3        6   0   4
##   01083.a6b3c50be5abf782b585995d2c11176b   0   0       0        8   0   0

Create the Container, Train the Models & Classify the Test Documents

training_indices <- which(group_list == "train")
test_indices <- which(group_list == "test")

container <- create_container(dtm, labels = email_types, trainSize = training_indices,
    testSize = test_indices, virgin = FALSE)

# Train models
svm_model <- train_model(container, "SVM")
tree_model <- train_model(container, "TREE")
maxent_model <- train_model(container, "MAXENT")

# Classify the test documents with each trained model
svm_out <- classify_model(container, svm_model)
tree_out <- classify_model(container, tree_model)
maxent_out <- classify_model(container, maxent_model)
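
For reference, classify_model() returns a two-column data frame per algorithm (a label column such as SVM_LABEL and a probability column such as SVM_PROB), so column 1 holds the predicted label used below; a quick way to confirm:

head(svm_out, 3)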

Aggregate the Results of the 3 Models with the Actuals

# Construct a data frame with the actual and predicted labels
results <- data.frame(actuals = email_types[test_indices],
                      svm = as.character(svm_out[, 1]),
                      tree = as.character(tree_out[, 1]),
                      maxent = as.character(maxent_out[, 1]),
                      stringsAsFactors = FALSE)

Create Confusion Matrices using the caret package
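
The calls that produced the matrices below aren’t shown; a sketch of what they likely looked like, using caret’s confusionMatrix() on the results data frame (the positive-class argument is an assumption, chosen to match what each output reports):

# rows are predictions, columns the reference (actual) labels
confusionMatrix(table(results$svm, results$actuals), positive = "spam")
confusionMatrix(table(results$tree, results$actuals))
confusionMatrix(table(results$maxent, results$actuals), positive = "spam")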

Support Vector Machine Results

Confusion Matrix and Statistics

          Reference
Prediction  ham spam
      ham  1194    4
      spam    4  688
                                          
               Accuracy : 0.9958          
                 95% CI : (0.9917, 0.9982)
    No Information Rate : 0.6339          
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.9909          
 Mcnemar's Test P-Value : 1               
                                          
            Sensitivity : 0.9942          
            Specificity : 0.9967          
         Pos Pred Value : 0.9942          
         Neg Pred Value : 0.9967          
             Prevalence : 0.3661          
         Detection Rate : 0.3640          
   Detection Prevalence : 0.3661          
      Balanced Accuracy : 0.9954          
                                          
       'Positive' Class : spam            
                                          

Decision Tree Results

Confusion Matrix and Statistics

          Reference
Prediction  ham spam
      ham  1185   20
      spam   13  672
                                         
               Accuracy : 0.9825         
                 95% CI : (0.9756, 0.988)
    No Information Rate : 0.6339         
    P-Value [Acc > NIR] : <2e-16         
                                         
                  Kappa : 0.9623         
 Mcnemar's Test P-Value : 0.2963         
                                         
            Sensitivity : 0.9891         
            Specificity : 0.9711         
         Pos Pred Value : 0.9834         
         Neg Pred Value : 0.9810         
             Prevalence : 0.6339         
         Detection Rate : 0.6270         
   Detection Prevalence : 0.6376         
      Balanced Accuracy : 0.9801         
                                         
       'Positive' Class : ham            
                                         

Maximum Entropy Results

Confusion Matrix and Statistics

          Reference
Prediction  ham spam
      ham  1196    8
      spam    2  684
                                          
               Accuracy : 0.9947          
                 95% CI : (0.9903, 0.9975)
    No Information Rate : 0.6339          
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.9886          
 Mcnemar's Test P-Value : 0.1138          
                                          
            Sensitivity : 0.9884          
            Specificity : 0.9983          
         Pos Pred Value : 0.9971          
         Neg Pred Value : 0.9934          
             Prevalence : 0.3661          
         Detection Rate : 0.3619          
   Detection Prevalence : 0.3630          
      Balanced Accuracy : 0.9934          
                                          
       'Positive' Class : spam            
                                          

Conclusions

  • Overall, all 3 supervised classifiers correctly classified nearly all of the documents as spam or ham. Their balanced accuracies ranged from .9801 to .9954.

  • Of the 3 models, the Support Vector Machine was the most accurate, followed by Maximum Entropy and then the decision tree (see the quick check sketched below).
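
As a quick check on that ranking, raw test-set accuracy can be computed directly from the results data frame (a minimal sketch, assuming the objects built above):

# proportion of test documents each model labeled correctly
sapply(results[, c("svm", "tree", "maxent")],
       function(pred) mean(pred == results$actuals))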

# Close connections & delete local data
closeAllConnections()
# unlink(data_dir, recursive = TRUE)