Create Corpora
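
The steps below use three packages: tm for building and cleaning the corpora, rlist for the list.sample() function, and RTextTools for the supervised models.

library(tm)
library(rlist)
library(RTextTools)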

hamsrc <- DirSource("C:/Users/mike/Dropbox/CUNY/DATA607/spamham/easy_ham")
hamcrp <- VCorpus(hamsrc)

spamsrc <- DirSource("C:/Users/mike/Dropbox/CUNY/DATA607/spamham/spam_2")
spamcrp <- VCorpus(spamsrc)

Add spam/ham designations as metadata

for(i in 1:length(hamcrp)){
  meta(hamcrp[[i]], tag = "spam") <- "ham"
}

for(i in 1:length(spamcrp)){
  meta(spamcrp[[i]], tag = "spam") <- "spam"
}
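
As a quick check, the tag can be read back from any individual document:

meta(hamcrp[[1]], tag = "spam")   # should return "ham"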

Concatenate into one corpus

spamhamcrp <- c(spamcrp,hamcrp)

Take a large sample to ensure the training data is randomized
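
Setting a seed first makes the sample reproducible (607 here is just an arbitrary example value):

set.seed(607)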

sampled_corpus <- list.sample(spamhamcrp, size = 2000)

Data Cleansing

Set to lower case

sampled_corpus <- tm_map(sampled_corpus, content_transformer(tolower))

Remove stop words (lower-casing first lets removeWords catch capitalized stop words as well)

sampled_corpus <- tm_map(sampled_corpus, removeWords, words = stopwords("en"))
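
tm provides further transformations (punctuation, numbers, extra whitespace) that could be chained on here in the same way if more aggressive cleaning is wanted:

sampled_corpus <- tm_map(sampled_corpus, removePunctuation)
sampled_corpus <- tm_map(sampled_corpus, removeNumbers)
sampled_corpus <- tm_map(sampled_corpus, stripWhitespace)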

Create Document-Term Matrix

dtm <- DocumentTermMatrix(sampled_corpus)
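
The matrix has one row per message and one column per term; its dimensions give a sense of its size before trimming:

dim(dtm)   # documents x terms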

Remove sparse terms

dtm <- removeSparseTerms(dtm, 1-(10/length(sampled_corpus)))
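
With 2,000 sampled documents the threshold works out to 1 - 10/2000 = 0.995, so removeSparseTerms() keeps only the terms that appear in at least roughly 10 of the messages:

1 - (10/length(sampled_corpus))   # maximum allowed sparsity, here 0.995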

Pull Metadata

meta_data <- meta(sampled_corpus, tag = "spam")
meta_data <- data.frame(unlist(meta_data))
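
A quick tabulation shows how the random sample splits between the two classes:

table(meta_data[,1])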

Set up a Container for the Supervised Models (RTextTools)

The first 1,500 documents go to training and the remaining 500 to testing; virgin = FALSE tells RTextTools that the test documents also have known labels, so the predictions can be scored.

n <- length(meta_data[,1])
container <- create_container(dtm, labels = meta_data[,1], trainSize = 1:1500, testSize = 1501:n, virgin = FALSE)

Train Models

SVM <- train_model(container,"SVM")
GLMNET <- train_model(container,"GLMNET")
MAXENT <- train_model(container,"MAXENT")

Classify the Test Set

SVM_CLASSIFY <- classify_model(container, SVM)
GLMNET_CLASSIFY <- classify_model(container, GLMNET)
MAXENT_CLASSIFY <- classify_model(container, MAXENT)
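
RTextTools can also fit and apply several algorithms in one pass, which is a more compact way to produce the same three sets of predictions:

models  <- train_models(container, algorithms = c("SVM", "GLMNET", "MAXENT"))
results <- classify_models(container, models)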

Compare Actual and Predicted

labels_out <- data.frame(correct_label = meta_data[1501:2000,], svm = SVM_CLASSIFY, glmnet = GLMNET_CLASSIFY, maxent = MAXENT_CLASSIFY, stringsAsFactors = FALSE)
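
Because data.frame() prefixes the columns that come from each classifier's output, the label columns end up named svm.SVM_LABEL, glmnet.GLMNET_LABEL, and maxent.MAXENTROPY_LABEL, which is how they are referenced below:

names(labels_out)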

# Percentage Correct for SVM:

sum(labels_out$correct_label == labels_out$svm.SVM_LABEL)/500
## [1] 0.998
# Percentage Correct for GLMNET:

sum(labels_out$correct_label == labels_out$glmnet.GLMNET_LABEL)/500
## [1] 1
# Percentage Correct for MAXENT:


sum(labels_out$correct_label == labels_out$maxent.MAXENTROPY_LABEL)/500
## [1] 1

Looks like nearly perfect scores all around!
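
As a cross-check, RTextTools can bundle the scoring into a single analytics object with per-algorithm precision, recall, and F-scores (possible here because the container was built with virgin = FALSE):

analytics <- create_analytics(container, cbind(SVM_CLASSIFY, GLMNET_CLASSIFY, MAXENT_CLASSIFY))
summary(analytics)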

Personal Spam Folder

Let’s look at how this classifies 20 messages that were in my personal spam folder.

yahoosrc <- DirSource("C:/Users/mike/Dropbox/CUNY/DATA607/spamham/yahoo_spam")
yahoocrp <- VCorpus(yahoosrc)

for(i in 1:length(yahoocrp)){
  meta(yahoocrp[[i]], tag = "spam") <- "spam"
}

# Reset the sampled corpus used for the training data

sampled_corpus <- list.sample(spamhamcrp, size = 2000)

# Concatenate the Yahoo spam emails

sampled_corpus <- c(sampled_corpus, yahoocrp)
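
After appending the Yahoo messages the corpus holds 2,020 documents, which is why the container below trains on rows 1 through 2000 and tests on rows 2001 onward:

length(sampled_corpus)   # 2000 sampled + 20 Yahoo = 2020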

Perform the same cleaning as before

Set to lower case

sampled_corpus <- tm_map(sampled_corpus, content_transformer(tolower))

Remove stop words

sampled_corpus <- tm_map(sampled_corpus, removeWords, words = stopwords("en"))

Create Document-Term Matrix

dtm <- DocumentTermMatrix(sampled_corpus)

Remove sparse terms

dtm <- removeSparseTerms(dtm, 1-(10/length(sampled_corpus)))

Pull Metadata

meta_data <- meta(sampled_corpus, tag = "spam")
meta_data <- data.frame(unlist(meta_data))

Set up a Container for the Supervised Models (RTextTools)

This time all 2,000 sampled documents are used for training, and the 20 Yahoo messages (rows 2001 through n) form the test set.

n <- length(meta_data[,1])
container <- create_container(dtm, labels = meta_data[,1], trainSize = 1:2000, testSize = 2001:n, virgin = FALSE)

Train Models

SVM <- train_model(container,"SVM")
GLMNET <- train_model(container,"GLMNET")
MAXENT <- train_model(container,"MAXENT")

Classify the Test Set

SVM_CLASSIFY <- classify_model(container, SVM)
GLMNET_CLASSIFY <- classify_model(container, GLMNET)
MAXENT_CLASSIFY <- classify_model(container, MAXENT)

Compare Actual and Predicted

labels_out <- data.frame(correct_label = meta_data[2001:2020,], svm = SVM_CLASSIFY, glmnet = GLMNET_CLASSIFY, maxent = MAXENT_CLASSIFY, stringsAsFactors = FALSE)

# If a classifier predicts only one label for the 20 test messages, its factor
# levels will not match correct_label and the == comparisons below error out,
# so the levels are set explicitly first.

levels(labels_out$svm.SVM_LABEL) <- c("spam","ham")
levels(labels_out$maxent.MAXENTROPY_LABEL) <- c("spam","ham")
levels(labels_out$glmnet.GLMNET_LABEL) <- c("spam","ham")

# Percentage Correct for SVM:


sum(labels_out$correct_label == labels_out$svm.SVM_LABEL)/20
## [1] 1
# Percentage Correct for GLMNET:



sum(labels_out$correct_label == labels_out$glmnet.GLMNET_LABEL)/20
## [1] 1
# Percentage Correct for MAXENT:


sum(labels_out$correct_label == labels_out$maxent.MAXENTROPY_LABEL)/20
## [1] 0.25

Interesting that we see such wide variance here: SVM and GLMNET classify all 20 messages correctly, while MAXENT gets only 25% of them right.
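
One thing worth checking before reading too much into the MAXENT number is the level reassignment above: levels(x) <- c("spam","ham") relabels the existing levels by position, so a prediction column that already contains both classes (alphabetically "ham", "spam") would have its two labels swapped. A level-safe check is to rebuild the comparison from the raw classifier output as plain character strings and cross-tabulate actual versus predicted, using the objects created above:

actual    <- as.character(meta_data[2001:2020, ])
predicted <- as.character(MAXENT_CLASSIFY$MAXENTROPY_LABEL)

table(actual, predicted)      # confusion table for MAXENT
mean(actual == predicted)     # proportion classified correctly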