library(tm)          # DirSource, VCorpus, tm_map, DocumentTermMatrix
library(RTextTools)  # create_container, train_model, classify_model
library(rlist)       # assuming list.sample() used below is rlist::list.sample
# Read the ham and spam message folders into separate corpora
hamsrc <- DirSource("C:/Users/mike/Dropbox/CUNY/DATA607/spamham/easy_ham")
hamcrp <- VCorpus(hamsrc)
spamsrc <- DirSource("C:/Users/mike/Dropbox/CUNY/DATA607/spamham/spam_2")
spamcrp <- VCorpus(spamsrc)
# Tag each document with its true class in the document-level metadata
for(i in 1:length(hamcrp)){
  meta(hamcrp[[i]], tag = "spam") <- "ham"
}
for(i in 1:length(spamcrp)){
  meta(spamcrp[[i]], tag = "spam") <- "spam"
}
spamhamcrp <- c(spamcrp,hamcrp)
### Take a large sample to ensure the training data is randomized.
sampled_corpus <- list.sample(spamhamcrp, size = 2000)
# Lower-case first so that capitalized stop words are also caught by removeWords()
sampled_corpus <- tm_map(sampled_corpus, content_transformer(tolower))
sampled_corpus <- tm_map(sampled_corpus, removeWords, words = stopwords("en"))
dtm <- DocumentTermMatrix(sampled_corpus)
dtm <- removeSparseTerms(dtm, 1-(10/length(sampled_corpus)))
meta_data <- meta(sampled_corpus, tag = "spam")
meta_data <- data.frame(unlist(meta_data))
n <- length(meta_data[,1])
container <- create_container(dtm, labels = meta_data[,1], trainSize = 1:1500, testSize = 1501:n, virgin = FALSE)
SVM <- train_model(container,"SVM")
GLMNET <- train_model(container,"GLMNET")
MAXENT <- train_model(container,"MAXENT")
SVM_CLASSIFY <- classify_model(container, SVM)
GLMNET_CLASSIFY <- classify_model(container, GLMNET)
MAXENT_CLASSIFY <- classify_model(container, MAXENT)
labels_out <- data.frame(correct_label = meta_data[1501:2000,], svm = SVM_CLASSIFY, glmnet = GLMNET_CLASSIFY, maxent = MAXENT_CLASSIFY, stringsAsFactors = FALSE)
# Percentage Correct for SVM:
sum(labels_out$correct_label == labels_out$svm.SVM_LABEL)/500
## [1] 0.998
# Percentage Correct for GLMNET:
sum(labels_out$correct_label == labels_out$glmnet.GLMNET_LABEL)/500
## [1] 1
# Percentage Correct for MAXENT:
sum(labels_out$correct_label == labels_out$maxent.MAXENTROPY_LABEL)/500
## [1] 1
Looks like nearly perfect scores all around!
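Before moving on, it's worth checking which way the few misses go: spam slipping through, or ham getting flagged. A quick cross-tab of actual versus predicted labels, reusing the labels_out columns built above, would show both error types:
# Confusion tables: rows are the true labels, columns the predictions
table(actual = labels_out$correct_label, predicted = labels_out$svm.SVM_LABEL)
table(actual = labels_out$correct_label, predicted = labels_out$glmnet.GLMNET_LABEL)
table(actual = labels_out$correct_label, predicted = labels_out$maxent.MAXENTROPY_LABEL)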
Let’s look at how this classifies 20 messages that were in my personal spam folder.
yahoosrc <- DirSource("C:/Users/mike/Dropbox/CUNY/DATA607/spamham/yahoo_spam")
yahoocrp <- VCorpus(yahoosrc)
# Every Yahoo message is spam, so tag them all accordingly
for(i in 1:length(yahoocrp)){
  meta(yahoocrp[[i]], tag = "spam") <- "spam"
}
# Re-sample the corpus used for the training data
sampled_corpus <- list.sample(spamhamcrp, size = 2000)
# Append the Yahoo spam emails so they land at the end of the corpus (positions 2001-2020)
sampled_corpus <- c(sampled_corpus, yahoocrp)
# Lower-case first so that capitalized stop words are also caught by removeWords()
sampled_corpus <- tm_map(sampled_corpus, content_transformer(tolower))
sampled_corpus <- tm_map(sampled_corpus, removeWords, words = stopwords("en"))
dtm <- DocumentTermMatrix(sampled_corpus)
dtm <- removeSparseTerms(dtm, 1-(10/length(sampled_corpus)))
meta_data <- meta(sampled_corpus, tag = "spam")
meta_data <- data.frame(unlist(meta_data))
n <- length(meta_data[,1])
container <- create_container(dtm, labels = meta_data[,1], trainSize = 1:2000, testSize = 2001:n, virgin = FALSE)
SVM <- train_model(container,"SVM")
GLMNET <- train_model(container,"GLMNET")
MAXENT <- train_model(container,"MAXENT")
SVM_CLASSIFY <- classify_model(container, SVM)
GLMNET_CLASSIFY <- classify_model(container, GLMNET)
MAXENT_CLASSIFY <- classify_model(container, MAXENT)
labels_out <- data.frame(correct_label = meta_data[2001:2020,], svm = SVM_CLASSIFY, glmnet = GLMNET_CLASSIFY, maxent = MAXENT_CLASSIFY, stringsAsFactors = FALSE)
# If a model predicts only one class, its label factor lacks the other level and the
# comparisons below complain about mismatched level sets, so make both levels explicit.
# factor(..., levels = ...) keeps the predicted values while guaranteeing both levels exist.
labels_out$svm.SVM_LABEL <- factor(labels_out$svm.SVM_LABEL, levels = c("spam","ham"))
labels_out$maxent.MAXENTROPY_LABEL <- factor(labels_out$maxent.MAXENTROPY_LABEL, levels = c("spam","ham"))
labels_out$glmnet.GLMNET_LABEL <- factor(labels_out$glmnet.GLMNET_LABEL, levels = c("spam","ham"))
# Percentage Correct for SVM:
sum(labels_out$correct_label == labels_out$svm.SVM_LABEL)/20
## [1] 1
# Percentage Correct for GLMNET:
sum(labels_out$correct_label == labels_out$glmnet.GLMNET_LABEL)/20
## [1] 1
# Percentage Correct for MAXENT:
sum(labels_out$correct_label == labels_out$maxent.MAXENTROPY_LABEL)/20
## [1] 0.25
Interesting that we see such wide variance here: SVM and GLMNET flag all 20 of my personal spam messages correctly, while maximum entropy catches only 5 of them.
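One way to dig into that variance is to count how each model labeled the 20 Yahoo messages and to pull out the rows where maximum entropy disagreed (a quick sketch reusing the objects built above):
# How many of the 20 Yahoo messages did each model call spam vs. ham?
table(labels_out$svm.SVM_LABEL)
table(labels_out$glmnet.GLMNET_LABEL)
table(labels_out$maxent.MAXENTROPY_LABEL)
# Look at the full maxent output rows for the messages it did not call spam
subset(MAXENT_CLASSIFY, MAXENTROPY_LABEL != "spam")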