DATA607 Project 4 - Kelly Shaffer and Latif Masud

Assignment:

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder).

One example corpus: https://spamassassin.apache.org/publiccorpus/

NOTE: We had to use http://spamassassin.apache.org/old/publiccorpus/ since the original link provided produces a 404 error.

#load packages
library(tm)
## Warning: package 'tm' was built under R version 3.3.3
## Loading required package: NLP
library(stringr)
library(SnowballC)
library(RTextTools)
## Warning: package 'RTextTools' was built under R version 3.3.3
## Loading required package: SparseM
## Warning: package 'SparseM' was built under R version 3.3.3
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## 
## Attaching package: 'RTextTools'
## The following objects are masked from 'package:SnowballC':
## 
##     getStemLanguages, wordStem
spam.url <- "http://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"

ham.url <- "http://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2"

#download and unzip the spam files, then list their paths in spam.files
download.file(spam.url, destfile = "20050311_spam_2.tar.bz2")
untar("20050311_spam_2.tar.bz2", compressed = "bzip2")
spam.files <- list.files(path = "spam_2", full.names = TRUE)

#download and unzip the ham files, then list their paths in ham.files
download.file(ham.url, destfile = "20021010_easy_ham.tar.bz2")
untar("20021010_easy_ham.tar.bz2", compressed = "bzip2")
ham.files <- list.files(path = "easy_ham", full.names = TRUE)

length(list.files("spam_2"))
## [1] 1397
length(list.files("easy_ham"))
## [1] 2551
#take a peek and make sure everything's good - these two methods are basically the same
list.files("spam_2")[1:6]
## [1] "00001.317e78fa8ee2f54cd4890fdc09ba8176"
## [2] "00002.9438920e9a55591b18e60d1ed37d992b"
## [3] "00003.590eff932f8704d8b0fcbe69d023b54d"
## [4] "00004.bdcc075fa4beb5157b5dd6cd41d8887b"
## [5] "00005.ed0aba4d386c5e62bc737cf3f0ed9589"
## [6] "00006.3ca1f399ccda5d897fecb8c57669a283"
list.files("easy_ham")[1:6]
## [1] "0001.ea7e79d3153e7469e7a9c3e0af6a357e"
## [2] "0002.b3120c4bcbf3101e661161ee7efcb8bf"
## [3] "0003.acfc5ad94bbd27118a0d8685d18c89dd"
## [4] "0004.e8d5727378ddde5c3be181df593f1712"
## [5] "0005.8c3b9e9c0f3f183ddaf7592a11b99957"
## [6] "0006.ee8b0dba12856155222be180ba122058"
#this method is better since you can see which files are spam and which are ham
head(spam.files)
## [1] "spam_2/00001.317e78fa8ee2f54cd4890fdc09ba8176"
## [2] "spam_2/00002.9438920e9a55591b18e60d1ed37d992b"
## [3] "spam_2/00003.590eff932f8704d8b0fcbe69d023b54d"
## [4] "spam_2/00004.bdcc075fa4beb5157b5dd6cd41d8887b"
## [5] "spam_2/00005.ed0aba4d386c5e62bc737cf3f0ed9589"
## [6] "spam_2/00006.3ca1f399ccda5d897fecb8c57669a283"
head(ham.files)
## [1] "easy_ham/0001.ea7e79d3153e7469e7a9c3e0af6a357e"
## [2] "easy_ham/0002.b3120c4bcbf3101e661161ee7efcb8bf"
## [3] "easy_ham/0003.acfc5ad94bbd27118a0d8685d18c89dd"
## [4] "easy_ham/0004.e8d5727378ddde5c3be181df593f1712"
## [5] "easy_ham/0005.8c3b9e9c0f3f183ddaf7592a11b99957"
## [6] "easy_ham/0006.ee8b0dba12856155222be180ba122058"

1397 spam files have been loaded

2551 ham files have been loaded

Now we need to take all of the separate files and store them in a single corpus

Classifying Files

We start by defining a classification function that runs through either the spam or the ham files. Inside this function, a for loop reads each file, builds a temporary corpus holding that file's text, and finishes by attaching a metadata tag recording whether the file is spam or ham.

Once we have this function built, we can simply run it over the list of spam files and the list of ham files to create a corpus for each, and then combine the two. After that, we randomize the order of the documents in mixed.corpus with the sample function (a sketch of that step appears after the loops below).

classification <- function (files, type) {
  # Seed the corpus with the first file so there is an object to append to
  tmp <- readLines(files[1])
  tmp <- str_c(tmp, collapse = "")
  txt.corpus <- VCorpus(VectorSource(tmp))
  meta(txt.corpus[[1]], "classification") <- type
  
  for (i in 2:length(files)) {
    tmp <- readLines(files[i])
    tmp <- str_c(tmp, collapse = "")
    tmp.corpus <- VCorpus(VectorSource(tmp))
    txt.corpus <- c(txt.corpus, tmp.corpus) # Adding the VCorpus objects together.
    # Now add the meta key-value pair, e.g. "classification" = "spam"
    meta(txt.corpus[[i]], "classification") <- type
  }
  
  return (txt.corpus)
}

spam.corpus <- classification(spam.files, "spam")
ham.corpus <- classification(ham.files, "ham")
mixed.corpus <- c(spam.corpus, ham.corpus, recursive = FALSE)

NOTE: the above step was not working when trying to build an Rmd, so the code above was simplified to the following:

tmp <- readLines(spam.files[1])
tmp <- str_c(tmp, collapse = "")
mixed.corpus <- VCorpus(VectorSource(tmp))
meta(mixed.corpus[[1]], "classification") <- "spam"

n <- 1
for (i in 2:length(spam.files)) {
  tmp <- readLines(spam.files[i])
  tmp <- str_c(tmp, collapse = "")
  tmp.corpus <- VCorpus(VectorSource(tmp))
  mixed.corpus <- c(mixed.corpus, tmp.corpus)
  n <- n + 1
  
  meta(mixed.corpus[[n]], "classification") <- "spam"
}

#this loop covers ham files 1 through length(ham.files) - 1, i.e. 2550 documents
for (i in 1:(length(ham.files) - 1)) {
  tmp <- readLines(ham.files[i])
  tmp <- str_c(tmp, collapse = "")
  tmp.corpus <- VCorpus(VectorSource(tmp))
  mixed.corpus <- c(mixed.corpus, tmp.corpus)
  n <- n + 1
  
  meta(mixed.corpus[[n]], "classification") <- "ham"
}
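
The randomization mentioned earlier was not applied in this run (which is why the metadata printed further down shows spam documents first), but a minimal sketch of that step, assuming it is done before building any matrices, could look like this:

#shuffle the combined corpus so spam and ham are interleaved (sketch only, not applied in this run)
#set.seed(123)                                  #assumed seed, for reproducibility
#shuffle.idx <- sample(length(mixed.corpus))    #random permutation of document indices
#mixed.corpus <- mixed.corpus[shuffle.idx]      #document-level metadata travels with each document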

Now that our spam and ham documents are combined into one corpus, we want to strip out anything that isn't a word. To do this, we run a gsub to replace any non-word character with a space. Once this step is done, we run the stemDocument function to reduce each word to its stem. We also remove "stop words" (the most common words in a language) to make sure we've removed anything we don't want to analyze. Lastly, we convert everything to lowercase.

toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus.tmp <- tm_map(mixed.corpus, toSpace, "\\W")

corpus.tmp <- tm_map(corpus.tmp, content_transformer(stemDocument))

#taking a look at the stop words and then removing them
length(stopwords("en"))
## [1] 174
stopwords("en")[1:10]
##  [1] "i"         "me"        "my"        "myself"    "we"       
##  [6] "our"       "ours"      "ourselves" "you"       "your"
corpus.tmp <- tm_map(corpus.tmp, removeWords, words = stopwords("en"))

#convert to lower case
corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))
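
To sanity-check the cleaning, one could peek at the start of the first cleaned document, for example (output not shown; the exact text depends on which file ended up first):

#first 200 characters of the first cleaned document
substr(content(corpus.tmp[[1]]), 1, 200)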

Once we have cleaned our corpus of mixed spam and ham documents, we can create a TDM, or term-document matrix. To do this, we simply call TermDocumentMatrix, and then remove sparse terms, keeping only terms that appear in at least 20 documents.

tdm <- TermDocumentMatrix(corpus.tmp)
tdm <- removeSparseTerms(tdm, 1-(20/length(corpus.tmp)))
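
For a quick look at what survived the sparsity filter, one could check the matrix dimensions and list a few frequent terms, for example (output not shown; the cutoff of 200 is arbitrary):

dim(tdm)                                  #terms x documents after removeSparseTerms
head(findFreqTerms(tdm, lowfreq = 200))   #terms appearing at least 200 times in total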

Now we begin setting ourselves up for the predictive modeling section of the assignment by creating a document-term matrix and a container. The models themselves produce the following error:
“Error in [.matrix.coo(x, rw, cl) : Subscripts out of bound”
We weren’t able to find any fix for the error online, so we contacted the maintainer of the RTextTools package, since there seems to be an issue with the train_model function. We sent the following email to tpjurka@ucdavis.edu:
“Hi there,

I’m in a Data Science Masters program and we’re doing a project using predictive modeling. We’re having trouble getting the train_model() function to work under the RTextTools package. I saw that you were the maintainer of the package, so I’m hoping you can help. It’s producing the following error: “Error in [.matrix.coo(x, rw, cl) : Subscripts out of bound”

Any idea how we should go about fixing it?

Thanks!

Kelly Shaffer"

Since today is Easter Sunday and many people are preoccupied with family activities, we aren’t expecting a response right away, but we will come back to finish this assignment if he responds with any key insights.

dtm <- DocumentTermMatrix(corpus.tmp)

meta_type <- as.vector(unlist(meta(corpus.tmp, type = "local", tag = "classification")))
head(meta_type, 10)
##  [1] "spam" "spam" "spam" "spam" "spam" "spam" "spam" "spam" "spam" "spam"
sh.label <- data.frame(type = unlist(meta_type))
table(sh.label)
## sh.label
##  ham spam 
## 2550 1397
N <- length(meta_type)
container <- create_container(dtm, labels = meta_type, trainSize = 1:400, testSize = 401:N, virgin = FALSE)

slotNames(container)
## [1] "training_matrix"       "classification_matrix" "training_codes"       
## [4] "testing_codes"         "column_names"          "virgin"
#SVM Model
#svm_model <- train_model(container, "SVM")
#svm_out <- classify_model(container, svm_model)

#Decision Tree Model (RTextTools' "TREE" algorithm fits a decision tree; "RF" is random forest)
#tree_model <- train_model(container, "TREE")
#tree_out <- classify_model(container, tree_model)

#Maximum Entropy Model
#maxent_model <- train_model(container, "MAXENT")
#maxent_out <- classify_model(container, maxent_model)

#Setup for performance testing on the models
#labels_out <- data.frame(
#  correct_label = meta_type[401:N],
#  svm = as.character(svm_out[,1]),
#  tree = as.character(tree_out[,1]),
#  maxent = as.character(maxent_out[,1]),
#  stringsAsFactors = FALSE)

#SVM Performance
#table(labels_out[,1] == labels_out[,2])
#prop.table(table(labels_out[,1] == labels_out[,2]))

#Decision Tree Performance
#table(labels_out[,1] == labels_out[,3])
#prop.table(table(labels_out[,1] == labels_out[,3]))

#Maximum Entropy Performance
#table(labels_out[,1] == labels_out[,4])
#prop.table(table(labels_out[,1] == labels_out[,4]))
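
Since train_model() was failing for us, one possible fallback (a sketch only, assuming the e1071 package is installed; this is not the RTextTools route used above) is to fit a classifier directly on the document-term matrix. The DTM is trimmed first so that as.matrix() stays a manageable size, and a random 400-document training set is drawn so both classes are represented.

#fallback classifier with e1071 (sketch, not run as part of this document)
#library(e1071)

#dtm.small <- removeSparseTerms(dtm, 1 - (20/length(corpus.tmp)))  #keep terms in roughly 20+ documents
#x <- as.matrix(dtm.small)                                         #documents x terms
#y <- factor(meta_type)

#set.seed(123)                                   #assumed seed
#train.idx <- sample(nrow(x), 400)               #random 400-document training set
#svm.fit <- svm(x[train.idx, ], y[train.idx])    #default radial-kernel SVM
#svm.pred <- predict(svm.fit, x[-train.idx, ])

#table(predicted = svm.pred, actual = y[-train.idx])
#prop.table(table(svm.pred == y[-train.idx]))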