Assignment
Label a training set of spam and ham documents so that a machine learning model can predict the class of other documents.
Load Libraries
library(tm)
library(stringr)
library(dplyr)
library(RTextTools)
Load Ham and Spam Files
We load the files that were downloaded to our local machine. Notice that there are 2,551 ham messages and 501 spam messages.
ham_files <- list.files("easy_ham")
length(ham_files)
## [1] 2551
spam_files <- list.files("spam")
length(spam_files)
## [1] 501
Make the Ham and Spam Corpora
Loop through the ham and spam messages to read each one, collapse its whitespace, and label it (using meta). Place the documents in their respective corpora, then look at the metadata of a sample ham and spam document.
# Seed the corpus with the first ham file, then loop over the rest
tempham <- readLines(str_c("easy_ham/", ham_files[1]))
tempham <- str_c(tempham, collapse = "")
tempham <- str_trim(str_replace_all(tempham, "\\s+", " "))
ham_corpus <- VCorpus(VectorSource(tempham))
meta(ham_corpus[[1]], "type") <- "Ham"
n <- 1
for (i in 2:length(ham_files)){
  n <- n + 1
  # Read the message, collapse it to one string, and squeeze the whitespace
  tempham <- readLines(str_c("easy_ham/", ham_files[i]))
  tempham <- str_c(tempham, collapse = "")
  tempham <- str_trim(str_replace_all(tempham, "\\s+", " "))
  # Append the document to the corpus and tag it as ham
  tmp_corpus <- VCorpus(VectorSource(tempham))
  ham_corpus <- c(ham_corpus, tmp_corpus)
  meta(ham_corpus[[n]], "type") <- "Ham"
}
meta(ham_corpus[[5]])
##   author       : character(0)
##   datetimestamp: 2017-11-05 23:03:19
##   description  : character(0)
##   heading      : character(0)
##   id           : 1
##   language     : en
##   origin       : character(0)
##   type         : Ham
# Seed the corpus with the first spam file, then loop over the rest
tempspam <- readLines(str_c("spam/", spam_files[1]))
tempspam <- str_c(tempspam, collapse = "")
tempspam <- str_trim(str_replace_all(tempspam, "\\s+", " "))
spam_corpus <- VCorpus(VectorSource(tempspam))
meta(spam_corpus[[1]], "type") <- "Spam"
n <- 1
for (i in 2:length(spam_files)){
  n <- n + 1
  # Read the message, collapse it to one string, and squeeze the whitespace
  tempspam <- readLines(str_c("spam/", spam_files[i]))
  tempspam <- str_c(tempspam, collapse = "")
  tempspam <- str_trim(str_replace_all(tempspam, "\\s+", " "))
  # Append the document to the corpus and tag it as spam
  tmp_corpus <- VCorpus(VectorSource(tempspam))
  spam_corpus <- c(spam_corpus, tmp_corpus)
  meta(spam_corpus[[n]], "type") <- "Spam"
}
meta(spam_corpus[[5]])
##   author       : character(0)
##   datetimestamp: 2017-11-05 23:03:51
##   description  : character(0)
##   heading      : character(0)
##   id           : 1
##   language     : en
##   origin       : character(0)
##   type         : Spam
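Before combining the two corpora, a quick sanity check (a small sketch added here, not part of the original write-up) can confirm the corpus sizes and that each document carries its type tag:
# Optional sanity check: corpus sizes and per-document type tags
length(ham_corpus)                               # should equal length(ham_files)
length(spam_corpus)                              # should equal length(spam_files)
table(unlist(lapply(ham_corpus, meta, "type")))  # all "Ham"
table(unlist(lapply(spam_corpus, meta, "type"))) # all "Spam"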
Combine and Clean the Ham and Spam Corpora
Combine the two corpora into one corpus and clean the contents of the documents.
total_corpus <- c(ham_corpus, spam_corpus)
total_corpus <- tm_map(total_corpus, removeNumbers)
total_corpus <- tm_map(total_corpus, content_transformer(tolower))
total_corpus <- tm_map(total_corpus, removeWords, words = stopwords("en"))
total_corpus <- tm_map(total_corpus, content_transformer(str_replace_all), pattern = "[[:punct:]]", replacement = " ")
total_corpus <- tm_map(total_corpus, stemDocument)
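To confirm the cleaning steps behaved as expected, we can peek at the beginning of one cleaned document (a small added check; the document index and character count are arbitrary choices):
# Look at the first 200 characters of a cleaned document
str_sub(content(total_corpus[[1]]), 1, 200)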
Create TDM
Use the combined corpus to make a term-document matrix for the machine learning models.
# High sparsity before trimming
mix_tdm <- TermDocumentMatrix(total_corpus)
mix_tdm
## <<TermDocumentMatrix (terms: 64580, documents: 3053)>>
## Non-/sparse entries: 562964/196599776
## Sparsity           : 100%
## Maximal term length: 17434
## Weighting          : term frequency (tf)
# Remove sparse terms
mix_tdm <- removeSparseTerms(mix_tdm, 1 - (20/length(total_corpus)))
mix_tdm
## <<TermDocumentMatrix (terms: 2940, documents: 3053)>>
## Non-/sparse entries: 434541/8541279
## Sparsity           : 95%
## Maximal term length: 70
## Weighting          : term frequency (tf)
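With the sparse terms trimmed, an optional look at frequently occurring terms (added here as a quick check; the threshold of 500 is an arbitrary choice) helps confirm the matrix looks reasonable:
# Terms that appear at least 500 times across the corpus
head(findFreqTerms(mix_tdm, lowfreq = 500), 20)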
Create Container
Build the RTextTools container, splitting the labeled documents into a training set and a test set.
labels <- unlist(meta(total_corpus, "type"))
container <- create_container(mix_tdm,
                              labels = labels,
                              trainSize = 1:500,
                              testSize = 501:1000,
                              virgin = FALSE)
slotNames(container)
## [1] "training_matrix"       "classification_matrix" "training_codes"
## [4] "testing_codes"         "column_names"          "virgin"
Start Model Training
Prepare and run three models to classify the test documents. Unfortunately, these did not run on my machine. I'll continue to try other approaches to get them to run.
#svm_model <- train_model(container, "SVM")
#tree_model <- train_model(container,"TREE")
#maxent_model <- train_model(container, "MAXENT")
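One possible reason the training calls fail is that create_container is handed a TermDocumentMatrix (terms in rows rather than documents) and a train/test split (1:500, 501:1000) that falls entirely within the ham documents, so the training data contains only one class. The sketch below is one way this might be made to run, not the method used above: it transposes the matrix so documents are rows, shuffles the documents so both classes appear in the training and test ranges, and trains a single SVM. The seed, the 80/20 split, and the model choice are my own assumptions.
# Sketch (see assumptions above): reorient the matrix and shuffle the documents
set.seed(123)                                              # arbitrary seed for reproducibility
doc_term   <- t(mix_tdm)                                   # transpose so documents are rows
doc_labels <- unlist(lapply(total_corpus, meta, "type"))   # one "Ham"/"Spam" tag per document
shuffle    <- sample(length(doc_labels))                   # random document order
doc_term   <- doc_term[shuffle, ]
doc_labels <- factor(doc_labels[shuffle])

# 80/20 train/test split over the shuffled documents
n_docs  <- length(doc_labels)
n_train <- floor(0.8 * n_docs)

container2 <- create_container(doc_term,
                               labels = doc_labels,
                               trainSize = 1:n_train,
                               testSize = (n_train + 1):n_docs,
                               virgin = FALSE)

# Train and score a single SVM; TREE and MAXENT could be swapped in the same way
svm_model   <- train_model(container2, "SVM")
svm_results <- classify_model(container2, svm_model)
head(svm_results)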