library(tidyr)
library(stringr)
library(tidytext)
library(RTextTools)
library(tm)   # provides Corpus(), TermDocumentMatrix() and the meta() accessor used below
Load and initialize the Spam and Easy Ham corpora.
These were downloaded and unzipped from http://spamassassin.apache.org/old/publiccorpus/
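The same archives can also be fetched and unpacked directly from R. The sketch below is illustrative only: it uses the two archive file names listed on that page and extracts them into the current directory, so the resulting folder layout may differ from the paths used in the next chunk.
# Illustrative sketch: download and unpack the two SpamAssassin archives.
base_url <- "http://spamassassin.apache.org/old/publiccorpus/"
for (f in c("20050311_spam_2.tar.bz2", "20030228_easy_ham_2.tar.bz2")) {
  download.file(paste0(base_url, f), destfile = f, mode = "wb")
  untar(f)   # extracts the spam_2 / easy_ham_2 directories
}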
setwd("C:/Users/vikas/cuny/data607/projects/Project_4/")
spam <- Corpus(DirSource("spam/20050311_spam_2.tar/spam_2"),
               readerControl = list(reader = readPlain, language = "en_US", load = TRUE))
length(spam)
## [1] 1396
easy_ham <- Corpus(DirSource("easy_ham/20030228_easy_ham_2.tar/easy_ham_2"),
                   readerControl = list(reader = readPlain, language = "en_US", load = TRUE))
length(easy_ham)
## [1] 1400
# Tag each document with its class ("spam" or "ham"). This metadata is needed for the
# subsequent term-document matrix, container creation, and model training and classification.
meta(spam, tag = "type") <- "spam"
meta(easy_ham, tag = "type") <- "ham"
spam
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 1
## Content: documents: 1396
# head(meta(spam))
easy_ham
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 1
## Content: documents: 1400
# head(meta(easy_ham))
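As a quick sanity check (output omitted here), the tagged type and the start of one raw message can be inspected directly:
# Inspect the document-level "type" tag and the beginning of the first spam message.
head(meta(spam, tag = "type"))
writeLines(substr(as.character(spam[[1]]), 1, 400))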
Create aggregate term-document matrices, with train and test portions drawn from the combined spam and easy_ham corpora.
l1 <- length(spam)
l2 <- length(easy_ham)
# Set aside portions for train and test.
TrainPercent <- 0.75
i1 <- sample(l1, round(TrainPercent*l1))
i2 <- sample(l2, round(TrainPercent*l2))
spam_train <- spam[i1]
spam_test <- spam[-i1]
easy_ham_train <- easy_ham[i2]
easy_ham_test <- easy_ham[-i2]
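Because sample() is random, the exact split (and therefore the accuracy figures reported below) will vary from run to run. Seeding the random number generator just before sampling makes the split reproducible, for example:
# Optional: fix the RNG state so the train/test split is reproducible.
set.seed(607)   # the seed value is an arbitrary, illustrative choice
i1 <- sample(l1, round(TrainPercent * l1))
i2 <- sample(l2, round(TrainPercent * l2))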
# Concatenate per-class TDMs. Document order is spam first, then ham, and for
# tdm_all it is train followed by test, matching the label vectors built below.
tdm_train <- c(TermDocumentMatrix(spam_train),
               TermDocumentMatrix(easy_ham_train))
tdm_test <- c(TermDocumentMatrix(spam_test),
              TermDocumentMatrix(easy_ham_test))
tdm_all <- c(TermDocumentMatrix(spam_train),
             TermDocumentMatrix(easy_ham_train),
             TermDocumentMatrix(spam_test),
             TermDocumentMatrix(easy_ham_test))
label.train <- c(rep("spam", length(spam_train)),
                 rep("ham", length(easy_ham_train)))
label.test <- c(rep("spam", length(spam_test)),
                rep("ham", length(easy_ham_test)))
label.all <- c(rep("spam", length(spam_train)),
               rep("ham", length(easy_ham_train)),
               rep("spam", length(spam_test)),
               rep("ham", length(easy_ham_test)))
Create a container and use it to train a Support Vector Machine model.
r1 <- length(spam_train) + length(easy_ham_train)
r2 <- length(spam_test) + length(easy_ham_test)
ctr <- create_container(tdm_train,
                        labels = label.train,
                        trainSize = 1:r1,
                        virgin = FALSE)
svm_model <- train_model(ctr, "SVM")
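Before classifying the held-out documents, RTextTools' n-fold cross-validation on the training container gives a rough sense of how the SVM generalizes; a minimal sketch:
# Optional: 4-fold cross-validation of the SVM on the training container.
# cross_validate() prints per-fold accuracy and returns the fold results.
svm_cv <- cross_validate(ctr, 4, "SVM")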
Use the trained SVM to classify the test documents.
ctr2 <- create_container(tdm_all,
                         labels = label.all,
                         testSize = (r1 + 1):(r1 + r2),   # test documents occupy positions r1+1 .. r1+r2
                         virgin = FALSE)
svm_out <- classify_model(ctr2, svm_model)
head(svm_out)
## SVM_LABEL SVM_PROB
## 1 spam 0.996471
## 2 spam 0.996471
## 3 spam 0.996471
## 4 spam 0.996471
## 5 spam 0.996471
## 6 spam 0.996471
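Before computing accuracy, tabulating the predicted labels shows how the SVM split the classified documents between the two classes:
# Distribution of predicted classes.
table(svm_out$SVM_LABEL)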
Also create models using a decision tree (TREE) and maximum entropy (MAXENT), and use them to classify.
tree_model <- train_model(ctr, "TREE")
maxent_model <- train_model(ctr, "MAXENT")
tree_out <- classify_model(ctr2, tree_model)
maxent_out <- classify_model(ctr2, maxent_model)
head(tree_out)
## TREE_LABEL TREE_PROB
## 1 spam 0.996633
## 2 spam 0.996633
## 3 spam 0.996633
## 4 spam 0.996633
## 5 spam 0.996633
## 6 spam 0.996633
head(maxent_out)
## MAXENTROPY_LABEL MAXENTROPY_PROB
## 1 spam 0.5
## 2 spam 0.5
## 3 spam 0.5
## 4 spam 0.5
## 5 spam 0.5
## 6 spam 0.5
Compare the results from the three models.
# classify_model() returns one prediction per test document, so the reference
# labels are label.test (in the same spam-then-ham order as the test TDM).
labels_out <- data.frame(
  correct_label = label.test,
  svm = as.character(svm_out[, 1]),
  tree = as.character(tree_out[, 1]),
  maxent = as.character(maxent_out[, 1]),
  stringsAsFactors = FALSE)
## SVM Performance (TRUE = proportion classified correctly; read the tables below the same way)
prop.table(table(labels_out[,1] == labels_out[,2]))
##
## FALSE TRUE
## 0.508226 0.491774
## Decision Tree Performance
prop.table(table(labels_out[,1] == labels_out[,3]))
##
## FALSE TRUE
## 0.5193133 0.4806867
## Maximum Entropy Performance
prop.table(table(labels_out[,1] == labels_out[,4]))
##
## FALSE TRUE
## 0.6312589 0.3687411
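RTextTools can also produce a combined precision, recall and F-score summary for all three classifiers from the same container and classification results; a sketch, assuming the objects created above:
# Summary analytics (precision, recall, F-score) for the three classifiers.
analytics <- create_analytics(ctr2, cbind(svm_out, tree_out, maxent_out))
summary(analytics)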