# Prepare data ----
# Clean the text of every document in a tm corpus.
#
# Steps, in order: replace punctuation with spaces, drop digits, lower-case,
# remove English stop words, stem, and finally collapse the whitespace left
# behind by the earlier removals.
#
# Args:
#   corpus: a tm corpus (e.g. the result of VCorpus()).
# Returns:
#   A corpus of the same length holding the cleaned documents.
clean_corp <- function(corpus) {
  # Functions that are not tm transformations (str_replace_all, tolower) are
  # wrapped in content_transformer() so tm_map() keeps every element a proper
  # text document with its metadata, instead of degrading it to a bare
  # character vector that has to be re-wrapped with PlainTextDocument later.
  punct_to_space <- content_transformer(function(x) {
    str_replace_all(x, pattern = "[[:punct:]]", replacement = " ")
  })
  release_corpus <- tm_map(corpus, punct_to_space)
  release_corpus <- tm_map(release_corpus, removeNumbers)
  release_corpus <- tm_map(release_corpus, content_transformer(tolower))
  release_corpus <- tm_map(release_corpus, removeWords,
                           words = stopwords("en"))
  release_corpus <- tm_map(release_corpus, stemDocument)
  # Strip whitespace last so the gaps left by removed words are collapsed too.
  tm_map(release_corpus, stripWhitespace)
}
# Tag every document in a corpus with a class label.
#
# Args:
#   corpus: a tm corpus.
#   cl: the label to store, e.g. "spam" or "ham".
# Returns:
#   A copy of the corpus in which the "classification" metadata tag of each
#   document is set to cl.
assign_class <- function(corpus, cl) {
  tmp <- corpus
  # seq_along() yields an empty sequence for an empty corpus, whereas
  # 1:length(tmp) would iterate over c(1, 0) and fail on tmp[[1]].
  for (i in seq_along(tmp)) {
    meta(tmp[[i]], "classification") <- cl
  }
  tmp
}
# Load files from the spam and ham directories into Vector Corpora.
# Each variable must read the directory that matches its label: "spam_2"
# holds the spam messages and "easy_ham" the ham messages. Loading them the
# other way round inverts every class label assigned below.
# The paths carry no trailing backslash so they also resolve outside Windows.
spam <- VCorpus(DirSource("spam_2"))
ham <- VCorpus(DirSource("easy_ham"))
# Clean the text within each corpus
spam_cleaned <- clean_corp(spam)
ham_cleaned <- clean_corp(ham)
# Assign a class to every document
spam_classified <- assign_class(spam_cleaned, "spam")
ham_classified <- assign_class(ham_cleaned, "ham")
# Combine the spam and ham corpora
full_corp <- c(spam_classified, ham_classified)
# Shuffle spam and ham records within the combined corpus
# (call set.seed() beforehand if a reproducible order is needed)
full_corp <- sample(full_corp)
# Create a Document Term Matrix and remove sparse terms: the threshold
# 1 - 10 / N is meant to keep only terms found in at least ~10 of N documents
dtm <- DocumentTermMatrix(full_corp)
dtm <- removeSparseTerms(dtm, 1 - (10 / length(full_corp)))
# Classification ----
# Create a vector with the class label of every document, and its length
class_labels <- unlist(meta(full_corp, type = "local", tag = "classification"))
n <- length(class_labels)
# The first 3000 documents are used for training and the remaining ones for
# testing (about 75% / 25% for the corpus used here).
n_train <- 3000
# Guard the split: with 3000 or fewer documents, 3001:n would count backwards
# and silently pick training documents as test data.
stopifnot(n > n_train)
# Create a container with the training and testing data and their class labels
# NOTE(review): the test labels are known here; confirm whether virgin = TRUE
# (RTextTools' flag for unlabelled data) is really intended.
container <- create_container(dtm,
                              labels = class_labels,
                              trainSize = seq_len(n_train),
                              testSize = (n_train + 1):n,
                              virgin = TRUE)
# Train the model using the Support Vector Machine algorithm
svm_model <- train_model(container, "SVM")
# Classify each test document and show the first six results
svm_out <- classify_model(container, svm_model)
head(svm_out)
## SVM_LABEL SVM_PROB
## 1 spam 0.9903427
## 2 ham 0.9995498
## 3 spam 0.9999998
## 4 ham 0.9994660
## 5 spam 0.9999993
## 6 ham 0.9999818
# Train the Tree and MaxEnt models on the same container and classify the
# test documents with each of them
tree_model <- train_model(container, "TREE")
maxent_model <- train_model(container, "MAXENT")
tree_out <- classify_model(container, tree_model)
maxent_out <- classify_model(container, maxent_model)
# Performance test: put the true labels of the test documents (3001..n) next
# to the label predicted by each model (first column of each result)
labels_out <- data.frame(correct_label = class_labels[3001:n],
                         svm = as.character(svm_out[, 1]),
                         tree = as.character(tree_out[, 1]),
                         maxent = as.character(maxent_out[, 1]),
                         # FALSE, not F: F is an ordinary variable that can
                         # be reassigned
                         stringsAsFactors = FALSE)
# SVM: share of test documents labelled incorrectly (FALSE) / correctly (TRUE)
prop.table(table(labels_out[["correct_label"]] == labels_out[["svm"]]))
##
## FALSE TRUE
## 0.006681514 0.993318486
# Tree: same comparison for the decision-tree predictions
prop.table(table(labels_out[["correct_label"]] == labels_out[["tree"]]))
##
## FALSE TRUE
## 0.01113586 0.98886414
# MaxEnt: same comparison for the maximum-entropy predictions
prop.table(table(labels_out[["correct_label"]] == labels_out[["maxent"]]))
##
## FALSE TRUE
## 0.003340757 0.996659243
# Accuracy ----