library(tm)
## Loading required package: NLP
library(stringr)
library(SnowballC)
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
##
## Attaching package: 'RTextTools'
## The following objects are masked from 'package:SnowballC':
##
## getStemLanguages, wordStem
The spam and ham archives were downloaded from https://spamassassin.apache.org/publiccorpus/ and unzipped.
# Point R at the folder holding the unzipped ham and spam directories.
# NOTE(review): setwd() with a machine-specific path is kept as-is because
# later code may rely on the working directory.
setwd("C:/data")
pathToHam <- "easy_ham"
pathToSpam <- "spam_2"

# Read the ham messages as plain text; only file names matching the
# pattern (i.e. containing a digit) are picked up.
corpusHam <- Corpus(
  DirSource(directory = pathToHam, pattern = "\\d+"),
  readerControl = list(reader = readPlain)
)
length(corpusHam)
## [1] 2500

# Same procedure for the spam messages.
corpusSpam <- Corpus(
  DirSource(directory = pathToSpam, pattern = "\\d+"),
  readerControl = list(reader = readPlain)
)
length(corpusSpam)
## [1] 1396
Differentiate the email files and combine the 2 corpora (ham and spam emails)
# Attach the class label to every document of each corpus, then merge the
# two corpora into a single one.
meta(corpusHam, tag = "emailtype") <- "Ham"
meta(corpusSpam, tag = "emailtype") <- "Spam"
corpusHS <- c(corpusHam, corpusSpam)
length(corpusHS)
## [1] 3896
Perform text mapping cleansing techniques
# Cleanse the combined corpus: digits, case, stopwords, punctuation, stems.
#
# Functions that are not tm transformations (tolower, str_replace_all) are
# wrapped in content_transformer() so tm_map() updates each document's
# content in place instead of replacing the document with a bare character
# vector.
##HS_tdm = TermDocumentMatrix(corpusHS)
##HS_tdm
corpusHS <- tm_map(corpusHS, removeNumbers)
##HS_tdm = TermDocumentMatrix(corpusHS)
##HS_tdm
# Lower-case BEFORE removing stopwords: stopwords("en") is all lower case
# and the match is case-sensitive, so "The", "And", ... would otherwise
# survive the stopword filter.
corpusHS <- tm_map(corpusHS, content_transformer(tolower))
##HS_tdm = TermDocumentMatrix(corpusHS)
##HS_tdm
# Remove stopwords while punctuation is still present, so contractions in
# the stopword list (e.g. "don't") match before apostrophes are stripped.
corpusHS <- tm_map(corpusHS, removeWords, words = stopwords("en"))
##HS_tdm = TermDocumentMatrix(corpusHS)
##HS_tdm
# Replace punctuation with a space (rather than deleting it) so tokens
# joined by punctuation are split apart.
corpusHS <- tm_map(
  corpusHS,
  content_transformer(str_replace_all),
  pattern = "[[:punct:]]",
  replacement = " "
)
corpusHS <- tm_map(corpusHS, stemDocument)
#HS_tdm = TermDocumentMatrix(corpusHS, PlainTextDocument)
##HS_tdm
##HS_tdm = removeSparseTerms(HS_tdm, 1-(10/length(corpusHS)))
##HS_tdm
Convert the documents back to plain text, shuffle the corpus, and build the document-term matrix
# Re-wrap every document as a PlainTextDocument before building the matrix.
corpusHS <- tm_map(corpusHS, PlainTextDocument)
# Fix the RNG seed so the shuffle below, and the train/test split that is
# taken from the shuffled order, is reproducible. The seed value itself is
# arbitrary.
set.seed(1234)
corpusHSR <- sample(corpusHS)
HS_dtm <- DocumentTermMatrix(corpusHSR)
##HS_dtm
# Drop terms whose sparsity exceeds 1 - 10/N (N = number of documents),
# i.e. terms that occur in only about 10 documents or fewer.
HS_dtm <- removeSparseTerms(HS_dtm, 1 - (10 / length(corpusHSR)))
HS_dtm
## <<DocumentTermMatrix (documents: 3896, terms: 6961)>>
## Non-/sparse entries: 613091/26506965
## Sparsity : 98%
## Maximal term length: 70
## Weighting : term frequency (tf)
Divide the corpus into training and test datasets. Train a model for each of the classification algorithms, then use each trained model to classify the held-out test dataset.
# Peek at the class labels of the shuffled corpus.
head(meta(corpusHSR))
## emailtype
## 1034 Ham
## 2259 Ham
## 153 Ham
## 3004 Spam
## 983 Ham
## 1845 Ham
# Flatten the "emailtype" metadata column into a factor of class labels.
HS_labels <- as.factor(unlist(meta(corpusHSR, "emailtype")[, 1]))
class(HS_labels)
## [1] "factor"
N <- nrow(meta(corpusHSR))
# Rows 1..3000 of the shuffled matrix train the models; rows 3001..N are
# held out for testing. virgin = FALSE marks the test rows as labelled.
container <- create_container(
  HS_dtm,
  labels = HS_labels,
  trainSize = 1:3000,
  testSize = 3001:N,
  virgin = FALSE
)
slotNames(container)
## [1] "training_matrix" "classification_matrix" "training_codes"
## [4] "testing_codes" "column_names" "virgin"
# Fit one model per algorithm on the training rows of the container.
svm_model <- train_model(container, algorithm = "SVM")
tree_model <- train_model(container, algorithm = "TREE")
maxent_model <- train_model(container, algorithm = "MAXENT")
boost_model <- train_model(container, algorithm = "BOOSTING")
##bagg_model = train_model(container, "BAGGING")
##nnet_model = train_model(container, "NNET")

# Score the container's test rows with each fitted model.
svm_out <- classify_model(container, model = svm_model)
tree_out <- classify_model(container, model = tree_model)
maxent_out <- classify_model(container, model = maxent_model)
boost_out <- classify_model(container, model = boost_model)
##bagg_out = classify_model(container, bagg_model)
##nnet_out = classify_model(container, nnet_model)
Determine which of the classification algorithms is the most accurate
# Inspect the first few predictions (label + probability) of each model.
head(svm_out)
## SVM_LABEL SVM_PROB
## 1 Spam 0.9099099
## 2 Spam 0.9674704
## 3 Ham 0.9983343
## 4 Spam 1.0000000
## 5 Spam 1.0000000
## 6 Ham 0.9998960
head(tree_out)
## TREE_LABEL TREE_PROB
## 1 Spam 0.9969789
## 2 Spam 0.9969789
## 3 Ham 1.0000000
## 4 Spam 0.9969789
## 5 Spam 0.9969789
## 6 Ham 1.0000000
head(maxent_out)
## MAXENTROPY_LABEL MAXENTROPY_PROB
## 1 Spam 1
## 2 Spam 1
## 3 Ham 1
## 4 Spam 1
## 5 Spam 1
## 6 Ham 1
head(boost_out)
## LOGITBOOST_LABEL LOGITBOOST_PROB
## 1 Spam 1.0000000
## 2 Spam 1.0000000
## 3 Ham 1.0000000
## 4 Spam 0.9999999
## 5 Spam 0.9999999
## 6 Ham 1.0000000
# Put the true test labels next to each model's predicted label (first
# column of every classify_model() result). FALSE is spelled out because
# `F` is an ordinary variable that can be reassigned.
labels_out <- data.frame(
  correct_label = HS_labels[3001:N],
  svm = as.character(svm_out[, 1]),
  tree = as.character(tree_out[, 1]),
  maxent = as.character(maxent_out[, 1]),
  boost = as.character(boost_out[, 1]),
  stringsAsFactors = FALSE
)
# For each model: counts, then proportions, of wrong (FALSE) and correct
# (TRUE) predictions. Columns are addressed by name so the comparison does
# not depend on column order.
table(labels_out$correct_label == labels_out$svm)
##
## FALSE TRUE
## 6 890
prop.table(table(labels_out$correct_label == labels_out$svm))
##
## FALSE TRUE
## 0.006696429 0.993303571
table(labels_out$correct_label == labels_out$tree)
##
## FALSE TRUE
## 15 881
prop.table(table(labels_out$correct_label == labels_out$tree))
##
## FALSE TRUE
## 0.01674107 0.98325893
table(labels_out$correct_label == labels_out$maxent)
##
## FALSE TRUE
## 5 891
prop.table(table(labels_out$correct_label == labels_out$maxent))
##
## FALSE TRUE
## 0.005580357 0.994419643
table(labels_out$correct_label == labels_out$boost)
##
## FALSE TRUE
## 9 887
prop.table(table(labels_out$correct_label == labels_out$boost))
##
## FALSE TRUE
## 0.01004464 0.98995536
All 4 supervised machine learning algorithms performed well for the given training and testing datasets. In the run shown above, the Maximum Entropy algorithm performed best, misclassifying only 5 of the 896 test emails, followed by the SVM algorithm (6 misclassified) and the Boosting algorithm (9 misclassified), while the decision tree (TREE) algorithm was worst, misclassifying 15 of the 896 emails.