Document Classification

Assignment

Label a training set of spam and ham documents, then use machine learning to predict the class (spam or ham) of other documents.

Load Libraries

library(tm)
library(stringr)
library(dplyr)
library(RTextTools)

Load Ham and Spam Files

We list the files that were downloaded to our local machine. Notice that there are 2,551 ham messages and 501 spam messages.

# File names of the ham messages found in the local "easy_ham" folder
ham_files <- list.files(path = "easy_ham")
length(ham_files)

## [1] 2551

# File names of the spam messages found in the local "spam" folder
spam_files <- list.files(path = "spam")
length(spam_files)

## [1] 501

Make the Ham and Spam Corpuses

Loop over the ham and spam files: read each message, collapse runs of whitespace into single spaces, and label it (using meta) as ham or spam. Place the messages in their respective corpora, then look at the metadata of a sample ham and spam document.

# Build the ham corpus: one document per file listed in ham_files, each
# tagged with type = "Ham".
#
# Every file is read exactly once and every document receives the same
# treatment. Seeding the corpus with ham_files[1] and then looping from
# index 1 again would store the first message twice and leave the seed copy
# without a "type" tag and without whitespace normalisation.
ham_texts <- vapply(
  ham_files,
  function(file_name) {
    # NOTE(review): lines are joined with no separator, so the last word of
    # one line fuses with the first word of the next. Kept as-is so ham and
    # spam messages go through identical preprocessing.
    msg <- str_c(readLines(str_c("easy_ham/", file_name)), collapse = "")
    # Collapse runs of whitespace to one space and trim both ends.
    str_trim(str_replace_all(msg, "\\s+", " "))
  },
  character(1),
  USE.NAMES = FALSE
)

# Build the corpus in one step instead of growing it with c() inside a loop.
ham_corpus <- VCorpus(VectorSource(ham_texts))

# Tag every document (including the first) with its class.
for (i in seq_along(ham_corpus)) {
  meta(ham_corpus[[i]], "type") <- "Ham"
}

meta(ham_corpus[[5]])

##   author       : character(0)
##   datetimestamp: 2017-11-05 23:03:19
##   description  : character(0)
##   heading      : character(0)
##   id           : 1
##   language     : en
##   origin       : character(0)
##   type         : Ham

# Build the spam corpus: one document per file listed in spam_files, each
# tagged with type = "Spam".
#
# All documents are handled uniformly. Seeding the corpus with spam_files[1]
# and tagging only from the second document onward would leave the first
# message without a "type" tag (and without whitespace normalisation), which
# makes the label vector extracted later shorter than the corpus.
spam_texts <- vapply(
  spam_files,
  function(file_name) {
    # NOTE(review): lines are joined with no separator, so the last word of
    # one line fuses with the first word of the next. Kept as-is so spam and
    # ham messages go through identical preprocessing.
    msg <- str_c(readLines(str_c("spam/", file_name)), collapse = "")
    # Collapse runs of whitespace to one space and trim both ends.
    str_trim(str_replace_all(msg, "\\s+", " "))
  },
  character(1),
  USE.NAMES = FALSE
)

# Build the corpus in one step instead of growing it with c() inside a loop.
spam_corpus <- VCorpus(VectorSource(spam_texts))

# Tag every document (including the first) with its class.
for (i in seq_along(spam_corpus)) {
  meta(spam_corpus[[i]], "type") <- "Spam"
}

meta(spam_corpus[[5]])

##   author       : character(0)
##   datetimestamp: 2017-11-05 23:03:51
##   description  : character(0)
##   heading      : character(0)
##   id           : 1
##   language     : en
##   origin       : character(0)
##   type         : Spam

Combine and Clean Ham and Spam Corpuses

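Combine the ham and spam corpora into one corpus, then clean its contents: remove numbers, convert to lower case, remove English stop words, replace punctuation with spaces, and stem the remaining words.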

# Merge the two corpora (ham documents first, spam documents after) and run
# the cleaning steps in sequence: drop digits, lower-case, drop English stop
# words, turn punctuation into spaces, then stem.
total_corpus <- c(ham_corpus, spam_corpus) %>%
  tm_map(removeNumbers) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, words = stopwords("en")) %>%
  tm_map(
    content_transformer(str_replace_all),
    pattern = "[[:punct:]]",
    replacement = " "
  ) %>%
  tm_map(stemDocument)

Create TDM

Use the combined corpus to make a term document matrix to use for the machine learning models.

# Full term-document matrix: one row per term, one column per document.
# Very sparse at this point because every term is kept.
mix_tdm <- total_corpus %>% TermDocumentMatrix()
mix_tdm

## <<TermDocumentMatrix (terms: 64580, documents: 3053)>>
## Non-/sparse entries: 562964/196599776
## Sparsity           : 100%
## Maximal term length: 17434
## Weighting          : term frequency (tf)

# Drop the sparsest terms: the threshold is set so that, roughly, a term must
# occur in more than 20 of the documents to be kept.
sparsity_cutoff <- 1 - (20 / length(total_corpus))
mix_tdm <- removeSparseTerms(mix_tdm, sparse = sparsity_cutoff)
mix_tdm

## <<TermDocumentMatrix (terms: 2940, documents: 3053)>>
## Non-/sparse entries: 434541/8541279
## Sparsity           : 95%
## Maximal term length: 70
## Weighting          : term frequency (tf)

Create Container

Build the RTextTools container, which splits the labelled documents into a training set and a test set.

# Build the RTextTools container from the term matrix and the per-document
# class labels stored in each document's "type" metadata.

# One label per document, in corpus order. Documents that carry no "type"
# tag yield NA instead of being silently dropped (as unlist() would do),
# so labels stay aligned with the documents of the matrix.
doc_types <- meta(total_corpus, "type", type = "local")
labels <- vapply(
  doc_types,
  function(doc_type) {
    if (is.null(doc_type)) NA_character_ else as.character(doc_type)[1]
  },
  character(1),
  USE.NAMES = FALSE
)

# RTextTools' documented workflow passes create_container() a document-term
# matrix (documents in rows), so transpose a term-document matrix first.
doc_matrix <- if (inherits(mix_tdm, "TermDocumentMatrix")) {
  t(mix_tdm)
} else {
  mix_tdm
}
stopifnot(nrow(doc_matrix) == length(labels))

# Only labelled documents can be used for supervised training/testing.
labelled <- which(!is.na(labels))
if (length(labelled) < length(labels)) {
  warning(
    length(labels) - length(labelled),
    " document(s) without a \"type\" label were excluded",
    call. = FALSE
  )
}
stopifnot(length(labelled) >= 1000)

# total_corpus holds all ham documents first and the spam documents after
# them, so the first 1000 documents are all ham. Shuffle the documents (with
# a fixed seed for reproducibility) so that both the training and the test
# range contain ham and spam.
set.seed(20171105)
doc_order <- labelled[sample.int(length(labelled))]
doc_matrix <- doc_matrix[doc_order, ]
labels <- labels[doc_order]

# Same split sizes as before: 500 training and 500 test documents.
container <- create_container(doc_matrix,
                              labels = labels,
                              trainSize = 1:500,
                              testSize = 501:1000,
                              virgin = FALSE)

slotNames(container)

## [1] "training_matrix"       "classification_matrix" "training_codes"       
## [4] "testing_codes"         "column_names"          "virgin"

Start Model Training

Prepare and run three models to classify the test documents. Unfortunately, these did not seem to run on my machine. I’ll continue to try other methods to get them to run.

#svm_model <- train_model(container, "SVM") 
#tree_model <- train_model(container,"TREE")
#maxent_model <- train_model(container, "MAXENT")