library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(tm)
## Loading required package: NLP
library(rlist)
# Load the two e-mail folders as tm corpora, one document per file.
# The machine-specific root is kept in a single place so the script can be
# pointed at another copy of the data by editing one line.
data_dir <- "C:/Users/jmcon/OneDrive/Desktop/spam_ham"
hamsrc <- DirSource(file.path(data_dir, "easy_ham"))
hamcrp <- VCorpus(hamsrc)
spamsrc <- DirSource(file.path(data_dir, "spam_2"))
spamcrp <- VCorpus(spamsrc)
Designating “Spam/Ham” as metadata
# Tag every document with its class in a document-level "spam" metadata field.
# seq_along() is used instead of 1:length() so an empty corpus is skipped
# rather than iterated over the indices c(1, 0).
for (i in seq_along(hamcrp)) {
  meta(hamcrp[[i]], tag = "spam") <- "ham"
}
for (i in seq_along(spamcrp)) {
  meta(spamcrp[[i]], tag = "spam") <- "spam"
}

# Combine both classes, then draw 2000 documents at random.
spamhamcrp <- c(spamcrp, hamcrp)
# Fix the RNG seed so the sample (and therefore the train/test split and the
# accuracies reported further down) can be reproduced on a re-run.
set.seed(1234)
corpus_sample <- list.sample(spamhamcrp, size = 2000)
Converting text to lowercase
# Lowercase first: stopwords("en") is an all-lowercase word list and
# removeWords matches case-sensitively, so capitalised stop words such as
# "The" or "And" would otherwise survive the stop-word filter below.
corpus_sample <- tm_map(corpus_sample, content_transformer(tolower))
Removing Stop Words
corpus_sample <- tm_map(corpus_sample, removeWords, words = stopwords("en"))
Creating a Document Term Matrix (dtm)
# Build the document-term matrix from the cleaned sample, using tm's default
# control settings.
dtm <- DocumentTermMatrix(x = corpus_sample)
Removing Sparse Terms/Outliers
# Drop very rare terms. The sparsity cutoff is expressed relative to the
# sample size: 1 - 10 / (number of documents), i.e. terms present in only
# about ten documents or fewer are discarded.
n_docs <- length(corpus_sample)
sparsity_cutoff <- 1 - (10 / n_docs)
dtm <- removeSparseTerms(dtm, sparsity_cutoff)
Pulling Metadata
# Collect each sampled document's "spam" tag and flatten it into a one-column
# data frame of class labels, in the same order as the rows of `dtm`.
meta_data <- meta(corpus_sample, tag = "spam")
meta_data <- data.frame(unlist(meta_data))
n <- nrow(meta_data)

# Rows 1:1500 are used for training and 1501:n for testing. Guard the split:
# with n <= 1500 the expression 1501:n would count backwards and silently
# reuse training rows as test rows.
stopifnot(n > 1500)
container <- create_container(
  dtm,
  labels = meta_data[, 1],
  trainSize = 1:1500,
  testSize = 1501:n,
  virgin = FALSE
)
str(container)
## Formal class 'matrix_container' [package "RTextTools"] with 6 slots
## ..@ training_matrix :Formal class 'matrix.csr' [package "SparseM"] with 4 slots
## .. .. ..@ ra : num [1:201450] 2 1 4 1 6 1 1 1 4 1 ...
## .. .. ..@ ja : int [1:201450] 2 8 21 25 29 88 107 125 128 130 ...
## .. .. ..@ ia : int [1:1501] 1 125 193 283 408 582 671 804 1238 1367 ...
## .. .. ..@ dimension: int [1:2] 1500 4427
## ..@ classification_matrix:Formal class 'matrix.csr' [package "SparseM"] with 4 slots
## .. .. ..@ ra : num [1:63507] 1 1 2 4 6 4 2 1 3 3 ...
## .. .. ..@ ja : int [1:63507] 8 17 21 25 27 107 110 111 112 123 ...
## .. .. ..@ ia : int [1:501] 1 236 391 511 668 930 1046 1255 1562 1654 ...
## .. .. ..@ dimension: int [1:2] 500 4427
## ..@ training_codes : Factor w/ 2 levels "ham","spam": 1 2 2 2 2 1 1 2 1 2 ...
## ..@ testing_codes : Factor w/ 2 levels "ham","spam": 2 1 1 1 1 1 2 2 1 1 ...
## ..@ column_names : chr [1:4427] "'decline" "---" "-----" "-------" ...
## ..@ virgin : logi FALSE
Support Vector Machines (SVM)
# Fit the SVM on the container, then score the container's test documents.
# The result has one row per test document with the predicted label and its
# probability (columns SVM_LABEL and SVM_PROB).
svm_t <- train_model(container, algorithm = "SVM")
svm_classify <- classify_model(container, model = svm_t)
head(svm_classify, n = 6L)
## SVM_LABEL SVM_PROB
## 1 spam 0.9999999
## 2 ham 0.9999996
## 3 ham 0.9999999
## 4 ham 0.9999997
## 5 ham 0.9999962
## 6 ham 0.9897187
Classification Tree (TREE)
# Fit the "TREE" model on the container, then score the container's test
# documents. The result has one row per test document with the predicted
# label and its probability (columns TREE_LABEL and TREE_PROB).
tree_t <- train_model(container, algorithm = "TREE")
tree_classify <- classify_model(container, model = tree_t)
head(tree_classify, n = 6L)
## TREE_LABEL TREE_PROB
## 1 spam 0.9979036
## 2 ham 1.0000000
## 3 ham 1.0000000
## 4 ham 1.0000000
## 5 ham 1.0000000
## 6 ham 1.0000000
Lasso and Elastic-Net Regularized Generalized Linear Models (glmnet)
# Fit the glmnet model on the container, then score the container's test
# documents. The result has one row per test document with the predicted
# label and its probability (columns GLMNET_LABEL and GLMNET_PROB).
glmnet_t <- train_model(container, algorithm = "GLMNET")
glmnet_classify <- classify_model(container, model = glmnet_t)
head(glmnet_classify, n = 6L)
## GLMNET_LABEL GLMNET_PROB
## 1 spam 0.9982526
## 2 ham 0.9973727
## 3 ham 0.9980455
## 4 ham 0.9917614
## 5 ham 0.9613452
## 6 ham 0.9894338
# Side-by-side table of the true label and each model's prediction for the
# test documents. The row range is 1501:n, the same test range that was
# passed to create_container(), instead of a hard-coded 1501:2000.
# Naming the data-frame arguments prefixes their columns, giving
# svm.SVM_LABEL, glmnet.GLMNET_LABEL, tree.TREE_LABEL, and so on.
labels_out <- data.frame(
  correct_label = meta_data[1501:n, ],
  svm = svm_classify,
  glmnet = glmnet_classify,
  tree = tree_classify,
  stringsAsFactors = FALSE
)
SVM
# SVM accuracy: share of test documents whose predicted label matches the
# true label. mean() divides by the actual number of test rows rather than a
# hard-coded 500; labels are compared as character so the check does not
# depend on how factor levels happen to be set.
mean(as.character(labels_out$correct_label) ==
       as.character(labels_out$svm.SVM_LABEL))
## [1] 0.996
GLMNET
# GLMNET accuracy: share of test documents whose predicted label matches the
# true label. mean() divides by the actual number of test rows rather than a
# hard-coded 500; labels are compared as character so the check does not
# depend on how factor levels happen to be set.
mean(as.character(labels_out$correct_label) ==
       as.character(labels_out$glmnet.GLMNET_LABEL))
## [1] 0.99
TREE
# TREE accuracy: share of test documents whose predicted label matches the
# true label. mean() divides by the actual number of test rows rather than a
# hard-coded 500; labels are compared as character so the check does not
# depend on how factor levels happen to be set.
mean(as.character(labels_out$correct_label) ==
       as.character(labels_out$tree.TREE_LABEL))
## [1] 0.976
Looking at the accuracy of each model on the held-out test messages (SVM 0.996, GLMNET 0.99, TREE 0.976), we can see that all three models perform well after being trained on the SpamAssassin dataset.
References:
https://youtu.be/6IzhRaSePKU
https://www.rdocumentation.org/packages/tm/versions/0.7-8/topics/meta
https://journal.r-project.org/archive/2013/RJ-2013-001/index.html
http://www.rtexttools.com/
https://www.svm-tutorial.com/2014/11/svm-classify-text-r/
https://journal.r-project.org/archive/2013/RJ-2013-001/RJ-2013-001.pdf
https://www.rdocumentation.org/packages/RTextTools/versions/1.4.3/topics/train_model