library(RTextTools)
library(tm)
library(tidyverse)
To replicate the process, first pull the project files from GitHub
and set the base.dir accordingly (in my case base.dir=“~/GitHub/Project4”)
# Root of the local project checkout; adjust to match your machine.
base.dir <- "~/GitHub/Project4"

# Number of messages of each class that are held out for testing.
ham.test.max <- 25
spam.test.max <- 14

# Each class uses 500 minus its test count as the number of training messages.
ham.train.max <- 500 - ham.test.max
spam.train.max <- 500 - spam.test.max

# Directories holding the training and test messages of each class.
ham.train.dir <- file.path(base.dir, "ham_training")
spam.train.dir <- file.path(base.dir, "spam_training")
ham.test.dir <- file.path(base.dir, "ham_testing")
spam.test.dir <- file.path(base.dir, "spam_testing")

# Take the first *.max file names from each directory. head() is used instead
# of `[1:n]` because the latter pads the result with NA when a directory
# contains fewer than n files.
ham.train.files <- head(dir(ham.train.dir), ham.train.max)
spam.train.files <- head(dir(spam.train.dir), spam.train.max)
ham.test.files <- head(dir(ham.test.dir), ham.test.max)
spam.test.files <- head(dir(spam.test.dir), spam.test.max)

# The train/test split further down is computed from the *.max counts, so
# stop right away if any directory came up short (or base.dir is wrong).
stopifnot(
  length(ham.train.files) == ham.train.max,
  length(spam.train.files) == spam.train.max,
  length(ham.test.files) == ham.test.max,
  length(spam.test.files) == spam.test.max
)
When processing the text files, use regular expressions to remove date and time entries. Each pattern below is followed by examples of the text it matches:
((Mon)|(Tue)|(Wed)|(Thu)|(Fri)|(Sat)|(Sun))\W+((Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec))\W+(\d){1,2}\W+\d{1,2}:\d{1,2}:\d{1,2}\W+\d{4}
Mon Oct 7 12:07:10 2002
Tue Nov 12 12:07:10 2002
((Mon)|(Tue)|(Wed)|(Thu)|(Fri)|(Sat)|(Sun))\W+((Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec))\W+\d{1,2}\W+\d{4}
Mon, Oct 07, 2002
((Mon)|(Tue)|(Wed)|(Thu)|(Fri)|(Sat)|(Sun))\W+\d{1,2}\W+((Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec))\W+\d{4}
Tue, 1 Oct 2002
\d{1,2}:\d{1,2}:\d{1,2}(\W?(AM|PM)?\W+(\+|\-)\d{4})?
09:43:11AM +0100
10:34:39 -0100
12:07:10
\d{4}(-|/)\d{1,2}(-|/)\d{1,2}
2002-10-03
2002/10/03
\d{1,2}(-|/)\d{1,2}(-|/)\d{4}
10-03-2002
10/03/2002
(\+|\-)\d{4}
+0100
-0100
((Mon)|(Tue)|(Wed)|(Thu)|(Fri)|(Sat)|(Sun))\W+((Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec))\W+(\d){1,2}
Sun Mar 17
((Mon)|(Tue)|(Wed)|(Thu)|(Fri)|(Sat)|(Sun))\W+\d{1,2}\W+((Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec))
Tue, 1 Oct
# Seed the document vector with the first ham training message: every
# date/time pattern is removed in turn from each line of the file, and the
# lines are then joined into a single string.
txt.entry <- Reduce(
  str_remove_all,
  c(
    "((Mon)|(Tue)|(Wed)|(Thu)|(Fri)|(Sat)|(Sun))\\W+((Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec))\\W+(\\d){1,2}\\W+\\d{1,2}:\\d{1,2}:\\d{1,2}\\W+\\d{4}",
    "((Mon)|(Tue)|(Wed)|(Thu)|(Fri)|(Sat)|(Sun))\\W+((Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec))\\W+\\d{1,2}\\W+\\d{4}",
    "((Mon)|(Tue)|(Wed)|(Thu)|(Fri)|(Sat)|(Sun))\\W+\\d{1,2}\\W+((Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec))\\W+\\d{4}",
    "\\d{1,2}:\\d{1,2}:\\d{1,2}(\\W?(AM|PM)?\\W+(\\+|\\-)\\d{4})?",
    "\\d{4}(-|/)\\d{1,2}(-|/)\\d{1,2}",
    "\\d{1,2}(-|/)\\d{1,2}(-|/)\\d{4}",
    "(\\+|\\-)\\d{4}",
    "((Mon)|(Tue)|(Wed)|(Thu)|(Fri)|(Sat)|(Sun))\\W+((Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec))\\W+(\\d){1,2}",
    "((Mon)|(Tue)|(Wed)|(Thu)|(Fri)|(Sat)|(Sun))\\W+\\d{1,2}\\W+((Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec))"
  ),
  read_lines(file.path(ham.train.dir, ham.train.files[1]))
)
txt.entry <- str_c(txt.entry, collapse = "")

# Seed the parallel label vector with the class of that first message.
txt.labels <- "ham"
Build up the vectors of text entries and classification labels for the model.
NOTE: each file is being reduced to a single text line and added to the vector.
# Read one message file, strip date/time text from every line, and return the
# whole file as a single string.
#
# path: path to a plain-text message file.
# Returns a length-one character vector.
clean.email.file <- function(path) {
  day <- "((Mon)|(Tue)|(Wed)|(Thu)|(Fri)|(Sat)|(Sun))"
  month <- "((Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec))"

  # Applied in this order: the longer date-with-time forms have to run before
  # the shorter fragments that would otherwise eat part of them.
  patterns <- c(
    paste0(day, "\\W+", month, "\\W+(\\d){1,2}\\W+\\d{1,2}:\\d{1,2}:\\d{1,2}\\W+\\d{4}"),
    paste0(day, "\\W+", month, "\\W+\\d{1,2}\\W+\\d{4}"),
    paste0(day, "\\W+\\d{1,2}\\W+", month, "\\W+\\d{4}"),
    "\\d{1,2}:\\d{1,2}:\\d{1,2}(\\W?(AM|PM)?\\W+(\\+|\\-)\\d{4})?",
    # yyyy-mm-dd / yyyy/mm/dd: applied to the first ham file but previously
    # missing from these loops, so files were cleaned inconsistently.
    "\\d{4}(-|/)\\d{1,2}(-|/)\\d{1,2}",
    "\\d{1,2}(-|/)\\d{1,2}(-|/)\\d{4}",
    "(\\+|\\-)\\d{4}",
    paste0(day, "\\W+", month, "\\W+(\\d){1,2}"),
    paste0(day, "\\W+\\d{1,2}\\W+", month),
    "^[[:blank:]]{0,}$[\\r\\n]"
  )

  lines <- read_lines(path)
  for (pattern in patterns) {
    lines <- str_remove_all(lines, pattern)
  }

  # NOTE(review): lines are joined with no separator, so the last word of one
  # line runs into the first word of the next. Kept as-is to preserve the
  # existing results; confirm whether a space separator was intended.
  str_c(lines, collapse = "")
}

# Remaining ham training messages. The first file already seeded txt.entry,
# hence [-1], which (unlike 2:length(x)) is safe when only one file exists.
ham.train.rest <- vapply(
  file.path(ham.train.dir, ham.train.files[-1]),
  clean.email.file,
  character(1),
  USE.NAMES = FALSE
)
txt.entry <- c(txt.entry, ham.train.rest)
txt.labels <- c(txt.labels, rep("ham", length(ham.train.rest)))

# All spam training messages, appended after the ham ones.
spam.train.all <- vapply(
  file.path(spam.train.dir, spam.train.files),
  clean.email.file,
  character(1),
  USE.NAMES = FALSE
)
txt.entry <- c(txt.entry, spam.train.all)
txt.labels <- c(txt.labels, rep("spam", length(spam.train.all)))
NOTE: Test entries can have a blank label
# Read one message file, strip date/time text from every line, and return the
# whole file as a single string.
#
# path: path to a plain-text message file.
# Returns a length-one character vector.
clean.email.file <- function(path) {
  day <- "((Mon)|(Tue)|(Wed)|(Thu)|(Fri)|(Sat)|(Sun))"
  month <- "((Jan)|(Feb)|(Mar)|(Apr)|(May)|(Jun)|(Jul)|(Aug)|(Sep)|(Oct)|(Nov)|(Dec))"

  # Applied in this order: the longer date-with-time forms have to run before
  # the shorter fragments that would otherwise eat part of them.
  patterns <- c(
    paste0(day, "\\W+", month, "\\W+(\\d){1,2}\\W+\\d{1,2}:\\d{1,2}:\\d{1,2}\\W+\\d{4}"),
    paste0(day, "\\W+", month, "\\W+\\d{1,2}\\W+\\d{4}"),
    paste0(day, "\\W+\\d{1,2}\\W+", month, "\\W+\\d{4}"),
    "\\d{1,2}:\\d{1,2}:\\d{1,2}(\\W?(AM|PM)?\\W+(\\+|\\-)\\d{4})?",
    # yyyy-mm-dd / yyyy/mm/dd: applied to the first ham file but previously
    # missing from these loops, so files were cleaned inconsistently.
    "\\d{4}(-|/)\\d{1,2}(-|/)\\d{1,2}",
    "\\d{1,2}(-|/)\\d{1,2}(-|/)\\d{4}",
    "(\\+|\\-)\\d{4}",
    paste0(day, "\\W+", month, "\\W+(\\d){1,2}"),
    paste0(day, "\\W+\\d{1,2}\\W+", month),
    "^[[:blank:]]{0,}$[\\r\\n]"
  )

  lines <- read_lines(path)
  for (pattern in patterns) {
    lines <- str_remove_all(lines, pattern)
  }

  # NOTE(review): lines are joined with no separator, so the last word of one
  # line runs into the first word of the next. Kept as-is to preserve the
  # existing results; confirm whether a space separator was intended.
  str_c(lines, collapse = "")
}

# Ham test messages come first, then spam test messages. Iterating over the
# file vectors directly (rather than 1:length(x)) is safe for an empty list.
ham.test.all <- vapply(
  file.path(ham.test.dir, ham.test.files),
  clean.email.file,
  character(1),
  USE.NAMES = FALSE
)
spam.test.all <- vapply(
  file.path(spam.test.dir, spam.test.files),
  clean.email.file,
  character(1),
  USE.NAMES = FALSE
)

txt.entry <- c(txt.entry, ham.test.all, spam.test.all)

# Test entries don't require a label, so each one gets an empty string.
txt.labels <- c(
  txt.labels,
  rep("", length(ham.test.all) + length(spam.test.all))
)
# Wrap the cleaned messages (one string per file) in a tm corpus, then print
# its summary.
the.corpus <- txt.entry %>%
  VectorSource() %>%
  Corpus()
the.corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1000
# Store each document's label as indexed metadata under the "polarity" tag,
# then preview the first few rows of that metadata.
meta(the.corpus, tag = "polarity", type = "indexed") <- txt.labels
the.corpus %>%
  meta() %>%
  head()
## polarity
## 1 ham
## 2 ham
## 3 ham
## 4 ham
## 5 ham
## 6 ham
# Build the document-term matrix and prune rare terms. The sparsity cap is
# 1 - 10/n for n documents (roughly: keep terms that occur in at least ten
# documents). Print the resulting matrix summary afterwards.
dtm <- the.corpus %>%
  DocumentTermMatrix() %>%
  removeSparseTerms(1 - (10 / length(the.corpus)))
dtm
## <<DocumentTermMatrix (documents: 1000, terms: 2671)>>
## Non-/sparse entries: 162571/2508429
## Sparsity : 94%
## Maximal term length: 21
## Weighting : term frequency (tf)
# Total number of documents (training followed by test).
N <- length(txt.labels)

# The first ham.train.max + spam.train.max documents form the training range;
# every document after them, up to N, forms the test range.
container <- create_container(
  dtm,
  labels = txt.labels,
  trainSize = seq_len(ham.train.max + spam.train.max),
  testSize = seq(1 + ham.train.max + spam.train.max, N),
  virgin = FALSE
)

# Train an SVM on the container, classify with it, and print the predicted
# label and probability for each classified document.
svm_model <- train_model(container, "SVM")
svm_out <- classify_model(container, svm_model)
svm_out
## SVM_LABEL SVM_PROB
## 1 ham 0.9966361
## 2 ham 0.9993780
## 3 ham 0.9990726
## 4 ham 0.9988386
## 5 ham 0.9981683
## 6 ham 0.9974880
## 7 ham 0.9858599
## 8 ham 0.9977158
## 9 ham 0.9981984
## 10 ham 0.9938239
## 11 ham 0.9990026
## 12 ham 0.9972042
## 13 ham 0.9987947
## 14 ham 0.9930310
## 15 ham 0.9356069
## 16 ham 0.7791482
## 17 ham 0.9831776
## 18 spam 0.9738938
## 19 ham 0.8331212
## 20 ham 0.7119607
## 21 ham 0.7916116
## 22 ham 0.8306141
## 23 ham 0.8224464
## 24 ham 0.8266482
## 25 ham 0.7500503
## 26 spam 0.9904520
## 27 spam 0.9756807
## 28 spam 0.9858320
## 29 spam 0.9837094
## 30 spam 0.9872383
## 31 spam 0.9746902
## 32 spam 0.9884193
## 33 spam 0.9512521
## 34 spam 0.8871192
## 35 spam 0.9474190
## 36 spam 0.9943547
## 37 ham 0.7119607
## 38 ham 0.7916116
## 39 ham 0.8306141
The results are nearly perfect. The only outlier is row 18, which was classified as spam even though the first 25 items in the test set are supposed to be ham. For the spam test set (rows 26-39), the first 8 were spam entries from my personal emails, and they were classified correctly. I then used 3 spam emails (rows 34-36) and 3 ham emails (rows 37-39) from the “downloaded” set, and they were all classified correctly as well.