Load necessary libraries
library("easypackages")
libraries("knitr", "downloader", "R.utils", "tm", "wordcloud",
"topicmodels","SnowballC", "e1071", "data.table", "RMySQL",
"tidyverse", "tidyr", "dplyr", "stringr", "stats", "quanteda",
"plyr", "class", "stringi")## Loading required package: knitr
## Warning: package 'knitr' was built under R version 3.4.3
## Loading required package: downloader
## Loading required package: R.utils
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.7.1 (2016-02-15) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.21.0 (2016-10-30) successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
## The following objects are masked from 'package:base':
##
## attach, detach, gc, load, save
## R.utils v2.6.0 (2017-11-04) successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
## The following object is masked from 'package:utils':
##
## timestamp
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, inherits, isOpen, parse, warnings
## Loading required package: tm
## Warning: package 'tm' was built under R version 3.4.3
## Loading required package: NLP
## Loading required package: wordcloud
## Loading required package: RColorBrewer
## Loading required package: topicmodels
## Loading required package: SnowballC
## Loading required package: e1071
## Loading required package: data.table
## Loading required package: RMySQL
## Loading required package: DBI
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.1 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ dplyr::between() masks data.table::between()
## ✖ tidyr::extract() masks R.utils::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ purrr::transpose() masks data.table::transpose()
## Loading required package: quanteda
## Warning: package 'quanteda' was built under R version 3.4.4
## Package version: 1.1.1
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
##
## View
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
## Loading required package: class
## Loading required package: stringi
## All packages loaded successfully
# Base address of the SpamAssassin public corpus and the archives fetched
# from it by download_and_untar().
URL <- "http://spamassassin.apache.org/old/publiccorpus/"
files <- c(
  "20021010_easy_ham.tar.bz2",
  "20021010_spam.tar.bz2",
  "20021010_hard_ham.tar.bz2",
  "20030228_spam_2.tar.bz2"
)

# download_and_untar is used to download the files needed for analysis
#' Download one corpus archive and, unless told otherwise, unpack it
#'
#' Fetches `paste0(URL, filename)` (`URL` is the script-level base address)
#' into a file called `filename`. Unless `downloadOnly` is TRUE the archive is
#' then bunzip2-ed, untarred into ".", and the intermediate tar file removed.
#'
#' @param filename Name of a `.tar.bz2` archive located under `URL`.
#' @param downloadOnly If TRUE, stop after the download step.
#' @return The result of `file.remove()` on the intermediate tar file when the
#'   archive was unpacked; invisible NULL when `downloadOnly` is TRUE.
download_and_untar <- function(filename, downloadOnly = FALSE) {
  # mode = "wb": the archive is binary, and a text-mode transfer corrupts it
  # on Windows
  downloader::download(url = paste0(URL, filename), filename, mode = "wb")
  if (downloadOnly) {
    return(invisible(NULL))
  }
  # Strip only a literal trailing ".bz2". The previous pattern ".bz2" left the
  # dot unescaped and unanchored, so it matched any character followed by
  # "bz2" anywhere in the name.
  tar.file <- sub("\\.bz2$", "", filename)
  # bunzip2 the file (skip = TRUE leaves an already-existing tar file alone)
  bunzip2(filename, tar.file, remove = FALSE, skip = TRUE)
  # untar the file
  untar(tar.file, exdir = ".")
  # remove the tar file
  if (file.exists(tar.file)) file.remove(tar.file)
}

# createCorpus is used to convert a corpus into a quanteda corpus
#' Read a directory of e-mails into a labelled quanteda corpus
#'
#' @param directory Directory handed to `DirSource()`; files are read as UTF-8.
#' @param emailType Label stored in the `email_type` docvar of every document.
#' @return The corpus produced by `corpus()`, carrying `email_type` and
#'   `source` document variables.
createCorpus <- function(directory, emailType) {
  rawCorpus <- Corpus(
    DirSource(directory = directory, encoding = "UTF-8"),
    readerControl = list(language = "en_US")
  )
  # `notes = emailType` is no longer passed: corpus() discarded it with the
  # warning "Argument notes not used". The label is kept in a docvar instead.
  quantCorpus <- corpus(rawCorpus)
  docvars(quantCorpus, "email_type") <- emailType
  # Remove only a literal leading "./". The previous regex "./" matched any
  # character followed by a slash, anywhere in the path.
  docvars(quantCorpus, "source") <- sub("^\\./", "", directory)
  quantCorpus
}

# buildDFM accepts a corpus object and converts it to a document-feature matrix
#' Build a stemmed, stopword-free, trimmed document-feature matrix (dfm)
#'
#' @param corpus A quanteda corpus.
#' @param minDoc Passed to `dfm_trim()` as `min_docfreq`.
#' @param minCount Passed to `dfm_trim()` as the minimum term frequency.
#' @return The dfm returned by `dfm_trim()`.
buildDFM <- function(corpus, minDoc, minCount) {
  # create the document-feature matrix
  featureMatrix <- dfm(tokens(corpus))
  # `ignoredFeatures =` was ignored with a warning, so stopwords were never
  # dropped; remove them explicitly, then stem.
  featureMatrix <- dfm_remove(featureMatrix, stopwords("english"))
  featureMatrix <- dfm_wordstem(featureMatrix)
  # trim() is not a dfm method in the loaded quanteda, so the old call did not
  # return a dfm; dfm_trim() is the current function. The minimum term
  # frequency is passed positionally because its name changed between quanteda
  # releases (min_count, later min_termfreq) while it stayed the second
  # argument.
  dfm_trim(featureMatrix, minCount, min_docfreq = minDoc)
}
#' Plot a dfm as a word cloud
#'
#' @param dfm The document-feature matrix to plot.
#' @return Whatever the `plot()` method returns; called for its side effect.
plotDFM <- function(dfm) {
  # plot in colors with some additional options passed to wordcloud
  # NOTE(review): this relies on a plot() method existing for dfm objects;
  # quanteda also offers textplot_wordcloud() - confirm which one the
  # installed version supports.
  plot(
    dfm,
    random.color = TRUE,
    rot.per = .25,
    colors = sample(colors()[2:128], 5) # five colours drawn at random
  )
}

# create_df_matrix accepts a dfm object, applies the tf-idf function, and returns a data frame
#' Convert a dfm into a tf-idf weighted data frame tagged with its class
#'
#' @param dfm A quanteda document-feature matrix.
#' @param emailType Value written to the `Source` column of every row.
#' @return A data frame with one column per feature plus a `Source` column.
create_df_matrix <- function(dfm, emailType) {
  # apply tf-idf weighting; dfm_tfidf() replaces the deprecated tfidf()
  weighted <- dfm_tfidf(dfm)
  # convert to a dataframe
  df <- as.data.frame(as.matrix(weighted), stringsAsFactors = FALSE)
  df$Source <- emailType
  df
}

# First let’s download and create the spam and ham corpora
# use lapply to download and untar all files specified
lapply(files, download_and_untar)## [[1]]
## [1] TRUE
##
## [[2]]
## [1] TRUE
##
## [[3]]
## [1] TRUE
##
## [[4]]
## [1] TRUE
Next, we can create the Spam Corpus
spamCorpus <- createCorpus("./spam", "spam")## Warning in corpus.Corpus(Corpus(DirSource(directory = directory, encoding =
## "UTF-8"), : Argument notes not used.
spam2Corpus <- createCorpus("./spam_2", "spam")## Warning in corpus.Corpus(Corpus(DirSource(directory = directory, encoding =
## "UTF-8"), : Argument notes not used.
Combine the 2 Spam corpora
spamCorpusCombined <- spamCorpus + spam2Corpus
# summarize the combined corpus
summary(spamCorpusCombined, 20)## Corpus consisting of 1899 documents, showing 20 documents:
##
## Text Types Tokens Sentences email_type
## 0000.7b1b73cf36cf9dbc3d64e3f2ee2b91f1 1170 1835 1 spam
## 0001.bfc8d64d12b325ff385cca8d07b84288 326 1449 18 spam
## 0002.24b47bb3ce90708ae29d0aec1da08610 225 575 10 spam
## 0003.4b3d943b8df71af248d12f8b2e7a224a 190 436 9 spam
## 0004.1874ab60c71f0b31b580f313a3f6e777 354 1084 46 spam
## 0005.1f42bb885de0ef7fc5cd09d34dc2ba54 213 536 7 spam
## 0006.7a32642f8c22bbeb85d6c3b5f3890a2c 359 761 27 spam
## 0007.859c901719011d56f8b652ea071c1f8b 174 381 10 spam
## 0008.9562918b57e044abfbce260cc875acde 601 5897 22 spam
## 0009.c05e264fbf18783099b53dbc9a9aacda 407 893 40 spam
## 0010.7f5fb525755c45eb78efc18d7c9ea5aa 220 791 5 spam
## 0011.2a1247254a535bac29c476b86c708901 190 437 9 spam
## 0012.7bc8e619ad0264979edce15083e70a02 157 503 7 spam
## 0013.9034ac0917f6fdb82c5ee6a7509029ed 188 434 9 spam
## 0014.ed99ffe0f452b91be11684cbfe8d349c 297 1844 38 spam
## 0015.1b871d654560011a0aaa29bb4e9054f7 172 473 7 spam
## 0016.f9c349935955e1ccc7626270da898445 302 1639 10 spam
## 0017.49ab70c7a4042cb1c695a0e59a6ede54 349 756 40 spam
## 0018.259154a52bc55dcae491cfded60a5cd2 173 379 11 spam
## 0019.939e70d8367f315193e4bc5be80dc262 315 685 19 spam
## source
## spam
## spam
## spam
## spam
## spam
## spam
## spam
## spam
## spam
## spam
## spam
## spam
## spam
## spam
## spam
## spam
## spam
## spam
## spam
## spam
##
## Source: Combination of corpuses spamCorpus and spam2Corpus
## Created: Fri Apr 20 23:27:23 2018
## Notes:
Next, let’s build the document-feature matrix using the Spam corpus
# Build the spam dfm with minDoc = one tenth of the number of spam documents
# (rounded) and minCount = 50.
# NOTE(review): the captured output below (dim() is NULL and topfeatures()
# rejects the object) shows that buildDFM() did not return a dfm in this run.
dfmSpam <- buildDFM(spamCorpusCombined, round(length(docnames(spamCorpusCombined))/10), 50)## Warning: Argument ignoredFeatures not used.
## Warning: Argument ignoredFeatures not used.
dim(dfmSpam) # basic dimensions of the dfm## NULL
topfeatures(dfmSpam, 20) # top features of the spam dfm## Error in topfeatures.default(dfmSpam, 20): topfeatures() only works on dfm objects.
# Plot the 100 top feature counts on a log-scaled y axis.
plot(topfeatures(dfmSpam, 100), log = "y", cex = .6, ylab = "Term frequency", main = "Top Features of Spam")## Error in topfeatures.default(dfmSpam, 100): topfeatures() only works on dfm objects.
Wordcloud of 100 top spam words
plotDFM(dfmSpam[, 1:100])## Error in dfmSpam[, 1:100]: incorrect number of dimensions
Let’s now create the Ham corpus
hamCorpus <- createCorpus("./easy_ham", "ham")## Warning in corpus.Corpus(Corpus(DirSource(directory = directory, encoding =
## "UTF-8"), : Argument notes not used.
ham2Corpus <- createCorpus("./hard_ham", "ham")## Warning in corpus.Corpus(Corpus(DirSource(directory = directory, encoding =
## "UTF-8"), : Argument notes not used.
# combine the 2 ham corpora
hamCorpusCombined <- hamCorpus + ham2Corpus
# summarize the combined corpus
summary(hamCorpusCombined, 20)## Corpus consisting of 2801 documents, showing 20 documents:
##
## Text Types Tokens Sentences email_type
## 0001.ea7e79d3153e7469e7a9c3e0af6a357e 289 948 25 ham
## 0002.b3120c4bcbf3101e661161ee7efcb8bf 234 722 5 ham
## 0003.acfc5ad94bbd27118a0d8685d18c89dd 302 822 11 ham
## 0004.e8d5727378ddde5c3be181df593f1712 256 668 9 ham
## 0005.8c3b9e9c0f3f183ddaf7592a11b99957 358 991 23 ham
## 0006.ee8b0dba12856155222be180ba122058 253 732 10 ham
## 0007.c75188382f64b090022fa3b095b020b0 228 726 7 ham
## 0008.20bc0b4ba2d99aae1c7098069f611a9b 286 858 9 ham
## 0009.435ae292d75abb1ca492dcc2d5cf1570 273 780 14 ham
## 0010.4996141de3f21e858c22f88231a9f463 669 1818 42 ham
## 0011.07b11073b53634cff892a7988289a72e 307 1005 30 ham
## 0012.d354b2d2f24d1036caf1374dd94f4c94 246 644 11 ham
## 0013.ff597adee000d073ae72200b0af00cd1 221 629 14 ham
## 0014.532e0a17d0674ba7a9baa7b0afe5fb52 353 1017 34 ham
## 0015.a9ff8d7550759f6ab62cc200bdf156e7 239 638 10 ham
## 0016.d82758030e304d41fb3f4ebbb7d9dd91 286 756 17 ham
## 0017.d81093a2182fc9135df6d9158a8ebfd6 257 689 15 ham
## 0018.ba70ecbeea6f427b951067f34e23bae6 382 1227 45 ham
## 0019.a8a1b2767e83b3be653e4af0148e1897 527 1435 36 ham
## 0020.ef397cef16f8041242e3b6560e168053 207 513 6 ham
## source
## easy_ham
## easy_ham
## easy_ham
## easy_ham
## easy_ham
## easy_ham
## easy_ham
## easy_ham
## easy_ham
## easy_ham
## easy_ham
## easy_ham
## easy_ham
## easy_ham
## easy_ham
## easy_ham
## easy_ham
## easy_ham
## easy_ham
## easy_ham
##
## Source: Combination of corpuses hamCorpus and ham2Corpus
## Created: Fri Apr 20 23:31:13 2018
## Notes:
Next, let’s build the document-feature matrix using the Ham corpus
# Build the ham dfm with minDoc = one tenth of the number of ham documents
# (rounded) and minCount = 50.
# NOTE(review): the captured output below (dim() is NULL and topfeatures()
# rejects the object) shows that buildDFM() did not return a dfm in this run.
dfmHam <- buildDFM(hamCorpusCombined, round(length(docnames(hamCorpusCombined))/10), 50)## Warning: Argument ignoredFeatures not used.
## Warning: Argument ignoredFeatures not used.
dim(dfmHam)## NULL
# Plot the 100 top feature counts on a log-scaled y axis.
plot(topfeatures(dfmHam, 100), log = "y", cex = .6, ylab = "Term frequency", main = "Top Features of Ham")## Error in topfeatures.default(dfmHam, 100): topfeatures() only works on dfm objects.
Build the k-Nearest Neighbor Model for Document Classification
dfSpam <- create_df_matrix(dfmSpam, "spam") ## Warning: 'tfidf' is deprecated.
## Use 'dfm_tfidf' instead.
## See help("Deprecated")
## Error in dfm_tfidf.default(dfm): dfm_tfidf() only works on dfm objects.
dfHam <- create_df_matrix(dfmHam, "ham") ## Warning: 'tfidf' is deprecated.
## Use 'dfm_tfidf' instead.
## See help("Deprecated")
## Error in dfm_tfidf.default(dfm): dfm_tfidf() only works on dfm objects.
stacked.df <- rbind.fill(dfSpam, dfHam)## Error in rbind.fill(dfSpam, dfHam): object 'dfSpam' not found
# set NA values to 0
stacked.df[is.na(stacked.df)] <- 0## Error in stacked.df[is.na(stacked.df)] <- 0: object 'stacked.df' not found
Create the training and test datasets
# Draw 70% of the row indices (rounded up) for training; every remaining row
# goes to the test set.
n.rows <- nrow(stacked.df)
train.idx <- sample(n.rows, ceiling(n.rows * 0.7))## Error in nrow(stacked.df): object 'stacked.df' not found
# setdiff() over seq_len() stays correct for empty inputs, unlike negative
# indexing into 1:n
test.idx <- setdiff(seq_len(n.rows), train.idx)## Error in nrow(stacked.df): object 'stacked.df' not found
length(train.idx) ## Error in eval(expr, envir, enclos): object 'train.idx' not found
length(test.idx)## Error in eval(expr, envir, enclos): object 'test.idx' not found
tdm.email <- stacked.df[, "Source"]## Error in eval(expr, envir, enclos): object 'stacked.df' not found
stacked.nl <- stacked.df[, !colnames(stacked.df) %in% "Source"] #stacked.nl## Error in eval(expr, envir, enclos): object 'stacked.df' not found
#Run the kNN prediction using the training and test datasets
knn.pred <- knn(stacked.nl[train.idx, ], stacked.nl[test.idx, ], tdm.email[train.idx])## Error in as.matrix(train): object 'stacked.nl' not found
The resulting Confusion Matrix:
conf.mat <- table("Predictions" = knn.pred, Actual = tdm.email[test.idx])## Error in table(Predictions = knn.pred, Actual = tdm.email[test.idx]): object 'knn.pred' not found
conf.mat## Error in eval(expr, envir, enclos): object 'conf.mat' not found
The accuracy of the model =
# Share of test documents on the diagonal of the confusion matrix, as a
# percentage. The outer parentheses print the value as well as assigning it.
# (This line used to start with a stray `r `, which made R call an undefined
# function r() and fail with: could not find function "r".)
(accuracy <- sum(diag(conf.mat)) / length(test.idx) * 100)
To output the predictions
df.pred <- cbind(knn.pred, stacked.nl[test.idx, ])## Error in cbind(knn.pred, stacked.nl[test.idx, ]): object 'knn.pred' not found