#Load required packages
library(tm)
suppressWarnings(library(RTextTools))
The spam and ham files were downloaded and unzipped from https://spamassassin.apache.org/publiccorpus/.
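The archives can also be fetched and unpacked directly from R. A minimal sketch, assuming the 20030228 archive names from the corpus listing (adjust the file names to whichever snapshots you download):

corpusURL <- "https://spamassassin.apache.org/publiccorpus/"
archives <- c("20030228_spam.tar.bz2", "20030228_easy_ham.tar.bz2",
              "20030228_hard_ham.tar.bz2")
for (f in archives) {
  download.file(paste0(corpusURL, f), f, mode = "wb")  #binary mode for Windows
  untar(f)  #extracts into spam/, easy_ham/, hard_ham/
}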
#Set up working directory
setwd("C:/Users/Robert/Desktop/CUNY/GitHub/R/data/IS607_Homework11/")
#Remove the cmds index file from each folder first, so it is not read into the corpora
if (file.exists("spam/cmds")) file.remove("spam/cmds")
if (file.exists("easy_ham/cmds")) file.remove("easy_ham/cmds")
if (file.exists("hard_ham/cmds")) file.remove("hard_ham/cmds")
#Create spam and ham corpora
spam <- Corpus(DirSource("spam"), readerControl = list(language = "en"))
easy_ham <- Corpus(DirSource("easy_ham"), readerControl = list(language = "en"))
hard_ham <- Corpus(DirSource("hard_ham"), readerControl = list(language = "en"))
#Add meta labels
meta(spam, tag = "type") <- "spam"
meta(easy_ham, tag = "type") <- "easy_ham"
meta(hard_ham, tag = "type") <- "hard_ham"
#Combine the corpora: all three sets, spam + easy ham, and spam + hard ham
combinedcorpusAll <- c(spam, easy_ham, hard_ham, recursive = TRUE)
combinedcorpusEasy <- c(spam, easy_ham, recursive = TRUE)
combinedcorpusHard <- c(spam, hard_ham, recursive = TRUE)
#Create reduced and randomized corpora (seed set so the sample is reproducible)
set.seed(607)
combinedResampledCorpusAll <- sample(combinedcorpusAll, 750)
combinedResampledCorpusEasy <- sample(combinedcorpusEasy, 750)
combinedResampledCorpusHard <- sample(combinedcorpusHard, 750)
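Because sample() does not stratify by label, the spam/ham mix of each resampled corpus varies from run to run; it is worth checking the balance before modeling, e.g. for the combined sample:

table(unlist(meta(combinedResampledCorpusAll, "type")[,1]))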
#Build document-term matrices
spamTDMAll <- DocumentTermMatrix(combinedResampledCorpusAll)
spamTDMEasy <- DocumentTermMatrix(combinedResampledCorpusEasy)
spamTDMHard <- DocumentTermMatrix(combinedResampledCorpusHard)
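Checking the dimensions shows how wide these matrices are; if memory becomes an issue, tm's removeSparseTerms() can drop very rare terms (the 0.998 threshold below is illustrative, not tuned):

dim(spamTDMAll)  #documents x terms
spamTDMAllReduced <- removeSparseTerms(spamTDMAll, 0.998)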
#Collect meta labels
typeAll <- unlist(meta(combinedResampledCorpusAll, "type")[,1])
typeEasy <- unlist(meta(combinedResampledCorpusEasy, "type")[,1])
typeHard <- unlist(meta(combinedResampledCorpusHard, "type")[,1])
head(typeAll)
## [1] "easy_ham" "easy_ham" "easy_ham" "spam" "spam" "easy_ham"
head(typeEasy)
## [1] "easy_ham" "easy_ham" "spam" "easy_ham" "easy_ham" "easy_ham"
head(typeHard)
## [1] "spam" "spam" "spam" "hard_ham" "spam" "spam"
#Create containers and designate training vs. testing rows (first half train, second half test)
spamContainerAll <- create_container(spamTDMAll, labels = typeAll, trainSize = 1:375, testSize = 376:length(typeAll), virgin = FALSE)
spamContainerEasy <- create_container(spamTDMEasy, labels = typeEasy, trainSize = 1:375, testSize = 376:length(typeEasy), virgin = FALSE)
spamContainerHard <- create_container(spamTDMHard, labels = typeHard, trainSize = 1:375, testSize = 376:length(typeHard), virgin = FALSE)
#Train models on the combined set
svm_model_all <- train_model(spamContainerAll, "SVM")
boosting_model_all <- train_model(spamContainerAll, "BOOSTING")
glmnet_model_all <- train_model(spamContainerAll, "GLMNET")
maxent_model_all <- train_model(spamContainerAll, "MAXENT")
#Train models on the easy set
svm_model_easy <- train_model(spamContainerEasy, "SVM")
boosting_model_easy <- train_model(spamContainerEasy, "BOOSTING")
glmnet_model_easy <- train_model(spamContainerEasy, "GLMNET")
maxent_model_easy <- train_model(spamContainerEasy, "MAXENT")
#Train models on the hard set
svm_model_hard <- train_model(spamContainerHard, "SVM")
boosting_model_hard <- train_model(spamContainerHard, "BOOSTING")
glmnet_model_hard <- train_model(spamContainerHard, "GLMNET")
maxent_model_hard <- train_model(spamContainerHard, "MAXENT")
#Classify the combined set
svm_classified_all <- classify_model(spamContainerAll, svm_model_all)
boosting_classified_all <- classify_model(spamContainerAll, boosting_model_all)
glmnet_classified_all <- classify_model(spamContainerAll, glmnet_model_all)
maxent_classified_all <- classify_model(spamContainerAll, maxent_model_all)
#Classify the easy set
svm_classified_easy <- classify_model(spamContainerEasy, svm_model_easy)
boosting_classified_easy <- classify_model(spamContainerEasy, boosting_model_easy)
glmnet_classified_easy <- classify_model(spamContainerEasy, glmnet_model_easy)
maxent_classified_easy <- classify_model(spamContainerEasy, maxent_model_easy)
#Classify the hard set
svm_classified_hard <- classify_model(spamContainerHard, svm_model_hard)
boosting_classified_hard <- classify_model(spamContainerHard, boosting_model_hard)
glmnet_classified_hard <- classify_model(spamContainerHard, glmnet_model_hard)
maxent_classified_hard <- classify_model(spamContainerHard, maxent_model_hard)
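RTextTools can also fit and apply several algorithms in one pass with train_models() and classify_models(); a sketch of the equivalent batch workflow for the combined container:

algos <- c("SVM", "BOOSTING", "GLMNET", "MAXENT")
models_all <- train_models(spamContainerAll, algorithms = algos)
results_all <- classify_models(spamContainerAll, models_all)  #one data frame, all algorithms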
#Convert the combined results to a data frame
classified_DF_all <- data.frame(
  label = typeAll[376:length(typeAll)],
  svm = svm_classified_all[,1],
  boosting = boosting_classified_all[,1],
  glmnet = glmnet_classified_all[,1],
  maxent = maxent_classified_all[,1],
  stringsAsFactors = FALSE)
#Convert the easy results to a data frame
classified_DF_easy <- data.frame(
  label = typeEasy[376:length(typeEasy)],
  svm = svm_classified_easy[,1],
  boosting = boosting_classified_easy[,1],
  glmnet = glmnet_classified_easy[,1],
  maxent = maxent_classified_easy[,1],
  stringsAsFactors = FALSE)
#Convert the hard results to a data frame
classified_DF_hard <- data.frame(
  label = typeHard[376:length(typeHard)],
  svm = svm_classified_hard[,1],
  boosting = boosting_classified_hard[,1],
  glmnet = glmnet_classified_hard[,1],
  maxent = maxent_classified_hard[,1],
  stringsAsFactors = FALSE)
#Preview the classified results
head(classified_DF_all)
## label svm boosting glmnet maxent
## 1 spam spam spam spam spam
## 2 easy_ham easy_ham easy_ham easy_ham easy_ham
## 3 easy_ham easy_ham easy_ham easy_ham easy_ham
## 4 hard_ham hard_ham hard_ham easy_ham hard_ham
## 5 easy_ham easy_ham easy_ham easy_ham easy_ham
## 6 easy_ham easy_ham easy_ham easy_ham easy_ham
head(classified_DF_easy)
## label svm boosting glmnet maxent
## 1 easy_ham easy_ham easy_ham easy_ham easy_ham
## 2 easy_ham easy_ham easy_ham easy_ham easy_ham
## 3 easy_ham easy_ham easy_ham easy_ham easy_ham
## 4 easy_ham easy_ham easy_ham easy_ham easy_ham
## 5 easy_ham easy_ham easy_ham easy_ham easy_ham
## 6 easy_ham easy_ham easy_ham easy_ham easy_ham
head(classified_DF_hard)
## label svm boosting glmnet maxent
## 1 spam spam spam spam spam
## 2 spam spam spam spam spam
## 3 hard_ham hard_ham hard_ham hard_ham hard_ham
## 4 spam spam spam spam spam
## 5 spam spam spam spam spam
## 6 spam spam spam spam spam
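Beyond the raw agreement rates below, RTextTools' create_analytics() summarizes per-algorithm precision, recall, and F-scores in one step; a sketch for the combined container, using the four result data frames built above:

analytics_all <- create_analytics(spamContainerAll,
    cbind(svm_classified_all, boosting_classified_all,
          glmnet_classified_all, maxent_classified_all))
summary(analytics_all)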
##Support Vector Machine Performance (combined set)
prop.table(table(classified_DF_all[,1] == classified_DF_all[,2]))
##
## FALSE TRUE
## 0.03989362 0.96010638
##Boosting Performance (combined set)
prop.table(table(classified_DF_all[,1] == classified_DF_all[,3]))
##
## FALSE TRUE
## 0.005319149 0.994680851
##glmnet Performance (combined set)
prop.table(table(classified_DF_all[,1] == classified_DF_all[,4]))
##
## FALSE TRUE
## 0.02925532 0.97074468
##Max-Entropy Performance (combined set)
prop.table(table(classified_DF_all[,1] == classified_DF_all[,5]))
##
## FALSE TRUE
## 0.04787234 0.95212766
##Support Vector Machine Performance (easy set)
prop.table(table(classified_DF_easy[,1] == classified_DF_easy[,2]))
##
## FALSE TRUE
## 0.01861702 0.98138298
##Boosting Performance (easy set)
prop.table(table(classified_DF_easy[,1] == classified_DF_easy[,3]))
##
## TRUE
## 1
##glmnet Performance (easy set)
prop.table(table(classified_DF_easy[,1] == classified_DF_easy[,4]))
##
## TRUE
## 1
##Max-Entropy Performance (easy set)
prop.table(table(classified_DF_easy[,1] == classified_DF_easy[,5]))
##
## FALSE TRUE
## 0.01329787 0.98670213
##Support Vector Machine Performance (hard set)
prop.table(table(classified_DF_hard[,1] == classified_DF_hard[,2]))
##
## FALSE TRUE
## 0.06117021 0.93882979
##Boosting Performance (hard set)
prop.table(table(classified_DF_hard[,1] == classified_DF_hard[,3]))
##
## FALSE TRUE
## 0.02393617 0.97606383
##glmnet Performance (hard set)
prop.table(table(classified_DF_hard[,1] == classified_DF_hard[,4]))
##
## FALSE TRUE
## 0.03191489 0.96808511
##Max-Entropy Performance (hard set)
prop.table(table(classified_DF_hard[,1] == classified_DF_hard[,5]))
##
## FALSE TRUE
## 0.04787234 0.95212766
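The overall accuracies above do not show which direction the errors run (spam flagged as ham, or ham flagged as spam); a confusion matrix makes that visible, e.g. for SVM on the hard set:

table(actual = classified_DF_hard$label, predicted = classified_DF_hard$svm)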
When the models were first run without the hard_ham data, the classification algorithms all scored closer to 99%. Adding the more complex messages (and assuming the data source labelled the e-mails accurately) reduced their accuracy.
Classifying the "easy" set was successful at roughly 98% or better for each of the algorithms, with boosting and glmnet labelling every test message correctly. The "hard" set was more difficult to classify, with some of the algorithms missing up to about 6% of the messages. The combined set, as expected, fell between the two. For a better understanding of the text-mining and spam-classification process, it would be worth experimenting on other datasets, determining an optimal sample size, and finding novel ways to plot the results.
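As a starting point for plotting, a grouped barplot of the accuracies reported above (values transcribed and rounded from the prop.table output):

acc <- matrix(c(0.960, 0.995, 0.971, 0.952,
                0.981, 1.000, 1.000, 0.987,
                0.939, 0.976, 0.968, 0.952),
              nrow = 3, byrow = TRUE,
              dimnames = list(c("combined", "easy", "hard"),
                              c("SVM", "Boosting", "glmnet", "MaxEnt")))
barplot(t(acc), beside = TRUE, ylim = c(0, 1), legend.text = TRUE,
        args.legend = list(x = "bottomright"),
        main = "Classifier accuracy by corpus", ylab = "Accuracy")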