library(RTextTools)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library(tm)
## Loading required package: NLP
library(rlist)

Creating Corpora

# Root folder holding the two SpamAssassin directories. It defaults to the
# original location but can be redirected by setting the SPAM_HAM_DIR
# environment variable, so the script is not tied to a single machine.
data_dir <- Sys.getenv("SPAM_HAM_DIR", unset = NA)
if (is.na(data_dir) || !nzchar(data_dir)) {
  data_dir <- "C:/Users/jmcon/OneDrive/Desktop/spam_ham"
}

# One source and one VCorpus per class, read from the class's own folder.
hamsrc <- DirSource(file.path(data_dir, "easy_ham"))
hamcrp <- VCorpus(hamsrc)

spamsrc <- DirSource(file.path(data_dir, "spam_2"))
spamcrp <- VCorpus(spamsrc)

Designating “Spam/Ham” as metadata

# Attach the class label to every document as document-level metadata under
# the tag "spam". seq_along() is used instead of 1:length(x) so that an empty
# corpus yields zero iterations rather than iterating over c(1, 0).
for (i in seq_along(hamcrp)) {
  meta(hamcrp[[i]], tag = "spam") <- "ham"
}

for (i in seq_along(spamcrp)) {
  meta(spamcrp[[i]], tag = "spam") <- "spam"
}

Creating 1 Corpus: Merging Spam and Ham Corpora

# Combine both classes into one corpus (spam documents first, then ham).
spamhamcrp <- c(spamcrp, hamcrp)

# Fix the RNG seed so the random 2000-document subset -- and with it the
# positional train/test split and the reported accuracies -- is reproducible.
set.seed(2021)
corpus_sample <- list.sample(spamhamcrp, size = 2000)

Data Cleaning

Removing capitalized letters

# Lower-case first: stopwords("en") holds only lower-case words and removeWords
# matches case-sensitively, so capitalised stop words ("The", "You", ...) would
# survive if stop-word removal ran before this step.
corpus_sample <- tm_map(corpus_sample, content_transformer(tolower))

Removing Stop Words

# Strip English stop words from the now lower-cased text.
corpus_sample <- tm_map(corpus_sample, removeWords, words = stopwords("en"))

Creating a Document Term Matrix (dtm)

# Turn the cleaned sample into a documents-by-terms matrix.
dtm <- DocumentTermMatrix(x = corpus_sample)

Removing Sparse Terms/Outliers

# Sparsity threshold: 1 - 10 / (number of sampled documents). Terms at or above
# this sparsity are dropped from the matrix.
sparse_cutoff <- 1 - (10 / length(corpus_sample))
dtm <- removeSparseTerms(dtm, sparse = sparse_cutoff)

Pulling Metadata

# Read the per-document "spam" tag. type = "local" is spelled out so the lookup
# does not depend on meta()'s implicit choice between corpus-level, indexed and
# document-level metadata.
meta_data <- meta(corpus_sample, tag = "spam", type = "local")

# unlist() silently drops documents with no tag, which would shift every later
# label against its row in the document-term matrix. Fail loudly instead.
if (!all(lengths(meta_data) == 1L)) {
  stop("every document must carry exactly one 'spam' label", call. = FALSE)
}

# One-column data frame of labels, in the same order as the sampled documents.
meta_data <- data.frame(unlist(meta_data))

Establishing Supervised Model Container

# Number of labelled documents available for the split.
n <- nrow(meta_data)

# The first n_train documents train the models; the remainder is held out.
n_train <- 1500L

# Guard the split: with n <= n_train the range (n_train + 1):n would count
# downwards and silently reuse training rows as test rows.
if (n <= n_train) {
  stop("need more than ", n_train, " documents, got ", n, call. = FALSE)
}

# virgin = FALSE: the held-out rows have known labels, so they can be scored.
container <- create_container(
  dtm,
  labels = meta_data[, 1],
  trainSize = seq_len(n_train),
  testSize = (n_train + 1L):n,
  virgin = FALSE
)

str(container)
## Formal class 'matrix_container' [package "RTextTools"] with 6 slots
##   ..@ training_matrix      :Formal class 'matrix.csr' [package "SparseM"] with 4 slots
##   .. .. ..@ ra       : num [1:201450] 2 1 4 1 6 1 1 1 4 1 ...
##   .. .. ..@ ja       : int [1:201450] 2 8 21 25 29 88 107 125 128 130 ...
##   .. .. ..@ ia       : int [1:1501] 1 125 193 283 408 582 671 804 1238 1367 ...
##   .. .. ..@ dimension: int [1:2] 1500 4427
##   ..@ classification_matrix:Formal class 'matrix.csr' [package "SparseM"] with 4 slots
##   .. .. ..@ ra       : num [1:63507] 1 1 2 4 6 4 2 1 3 3 ...
##   .. .. ..@ ja       : int [1:63507] 8 17 21 25 27 107 110 111 112 123 ...
##   .. .. ..@ ia       : int [1:501] 1 236 391 511 668 930 1046 1255 1562 1654 ...
##   .. .. ..@ dimension: int [1:2] 500 4427
##   ..@ training_codes       : Factor w/ 2 levels "ham","spam": 1 2 2 2 2 1 1 2 1 2 ...
##   ..@ testing_codes        : Factor w/ 2 levels "ham","spam": 2 1 1 1 1 1 2 2 1 1 ...
##   ..@ column_names         : chr [1:4427] "'decline" "---" "-----" "-------" ...
##   ..@ virgin               : logi FALSE

Training Models

Support Vector Machines (SVM)

# Fit the "SVM" algorithm on the container, then classify with the fitted model.
svm_t <- train_model(container, algorithm = "SVM")
svm_classify <- classify_model(container, model = svm_t)

# Preview the first six predictions and their probabilities.
head(svm_classify, n = 6L)
##   SVM_LABEL  SVM_PROB
## 1      spam 0.9999999
## 2       ham 0.9999996
## 3       ham 0.9999999
## 4       ham 0.9999997
## 5       ham 0.9999962
## 6       ham 0.9897187

Classification Tree (TREE)

# Fit RTextTools' "TREE" algorithm on the container, then classify with it.
tree_t <- train_model(container, algorithm = "TREE")
tree_classify <- classify_model(container, model = tree_t)

# Preview the first six predictions and their probabilities.
head(tree_classify, n = 6L)
##   TREE_LABEL TREE_PROB
## 1       spam 0.9979036
## 2        ham 1.0000000
## 3        ham 1.0000000
## 4        ham 1.0000000
## 5        ham 1.0000000
## 6        ham 1.0000000

Lasso and Elastic-Net Regularized Generalized Linear Models (glmnet)

# Fit the "GLMNET" algorithm on the container, then classify with the fit.
glmnet_t <- train_model(container, algorithm = "GLMNET")
glmnet_classify <- classify_model(container, model = glmnet_t)

# Preview the first six predictions and their probabilities.
head(glmnet_classify, n = 6L)
##   GLMNET_LABEL GLMNET_PROB
## 1         spam   0.9982526
## 2          ham   0.9973727
## 3          ham   0.9980455
## 4          ham   0.9917614
## 5          ham   0.9613452
## 6          ham   0.9894338

Performance of Models

# True labels of the test documents. The test set is the tail of the sample, so
# take as many trailing labels as there are predictions instead of hard-coding
# rows 1501:2000.
test_labels <- tail(meta_data[, 1], nrow(svm_classify))

# One row per test document: the true label next to each model's predicted
# label and probability (columns are prefixed svm., glmnet. and tree.).
labels_out <- data.frame(
  correct_label = test_labels,
  svm = svm_classify,
  glmnet = glmnet_classify,
  tree = tree_classify,
  stringsAsFactors = FALSE
)

SVM

# SVM accuracy: share of test documents whose predicted label equals the true
# label. mean() divides by the actual number of rows instead of a fixed 500;
# as.character() keeps the comparison valid if the columns are factors.
mean(
  as.character(labels_out$correct_label) ==
    as.character(labels_out$svm.SVM_LABEL)
)
## [1] 0.996

GLMNET

# GLMNET accuracy: share of test documents whose predicted label equals the
# true label. mean() divides by the actual number of rows instead of a fixed
# 500; as.character() keeps the comparison valid if the columns are factors.
mean(
  as.character(labels_out$correct_label) ==
    as.character(labels_out$glmnet.GLMNET_LABEL)
)
## [1] 0.99

TREE

# TREE accuracy: share of test documents whose predicted label equals the true
# label. mean() divides by the actual number of rows instead of a fixed 500;
# as.character() keeps the comparison valid if the columns are factors.
mean(
  as.character(labels_out$correct_label) ==
    as.character(labels_out$tree.TREE_LABEL)
)
## [1] 0.976

Findings

Looking at the accuracy of each model on the 500 held-out messages (SVM 99.6%, GLMNET 99.0%, TREE 97.6%), we can see that all three models perform well after being trained on the SpamAssassin dataset.

References

https://youtu.be/6IzhRaSePKU
https://www.rdocumentation.org/packages/tm/versions/0.7-8/topics/meta
https://journal.r-project.org/archive/2013/RJ-2013-001/index.html
http://www.rtexttools.com/
https://www.svm-tutorial.com/2014/11/svm-classify-text-r/
https://journal.r-project.org/archive/2013/RJ-2013-001/RJ-2013-001.pdf
https://www.rdocumentation.org/packages/RTextTools/versions/1.4.3/topics/train_model