The corpus for this analysis is located here: https://spamassassin.apache.org/publiccorpus/
Overview of quanteda: 1. Easy to manage texts in the form of a corpus. 2. quanteda includes tools to make it easy and fast to manuipulate the texts in a corpus, by performing the most common natural language processing tasks simply and quickly, such as tokenizing, stemming, or forming ngrams. quanteda’s functions for tokenizing texts and forming multiple tokenized documents into a document-feature matrix are both extremely fast and extremely simple to use. 3. quanteda can segment texts easily by words, paragraphs, sentences, or even user-supplied delimiters and tags. For details refer the below link https://cran.r-project.org/web/packages/quanteda/quanteda.pdf
#The code for this assignment requires the following R packages:
libr <- c("wordcloud","ggplot2","tm","plyr","class","stringr","stringi","RCurl","XML","SnowballC","R.utils","quanteda","class","knitr","Rweka","RTextTools")
lapply( libr, require, character.only = TRUE)
## Loading required package: wordcloud
## Loading required package: RColorBrewer
## Loading required package: ggplot2
## Loading required package: tm
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
## Loading required package: plyr
## Loading required package: class
## Loading required package: stringr
## Loading required package: stringi
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: XML
## Loading required package: SnowballC
## Loading required package: R.utils
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.7.1 (2016-02-15) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.21.0 (2016-10-30) successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
## The following object is masked from 'package:RCurl':
##
## clone
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
## The following objects are masked from 'package:base':
##
## attach, detach, gc, load, save
## R.utils v2.5.0 (2016-11-07) successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
## The following object is masked from 'package:RCurl':
##
## reset
## The following object is masked from 'package:utils':
##
## timestamp
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, inherits, isOpen, parse, warnings
## Loading required package: quanteda
## quanteda version 0.9.9.24
## Using 3 of 4 cores for parallel computing
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:R.oo':
##
## trim
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:NLP':
##
## ngrams
## The following object is masked from 'package:utils':
##
## View
## The following object is masked from 'package:base':
##
## sample
## Loading required package: knitr
## Loading required package: Rweka
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'Rweka'
## Loading required package: RTextTools
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
##
## Attaching package: 'RTextTools'
## The following objects are masked from 'package:SnowballC':
##
## getStemLanguages, wordStem
## [[1]]
## [1] TRUE
##
## [[2]]
## [1] TRUE
##
## [[3]]
## [1] TRUE
##
## [[4]]
## [1] TRUE
##
## [[5]]
## [1] TRUE
##
## [[6]]
## [1] TRUE
##
## [[7]]
## [1] TRUE
##
## [[8]]
## [1] TRUE
##
## [[9]]
## [1] TRUE
##
## [[10]]
## [1] TRUE
##
## [[11]]
## [1] TRUE
##
## [[12]]
## [1] TRUE
##
## [[13]]
## [1] TRUE
##
## [[14]]
## [1] TRUE
##
## [[15]]
## [1] FALSE
##
## [[16]]
## [1] TRUE
#Set Options
Options(stringsAsFactors = FALSE)
## [1] "Options: 0 options set."
spam_file_folder <- "/Users/Raghu/spamham/spam_2/"
spam_file_names <- list.files(spam_file_folder)
#check the files
head(spam_file_names ,10)
## [1] "00001.317e78fa8ee2f54cd4890fdc09ba8176"
## [2] "00002.9438920e9a55591b18e60d1ed37d992b"
## [3] "00003.590eff932f8704d8b0fcbe69d023b54d"
## [4] "00004.bdcc075fa4beb5157b5dd6cd41d8887b"
## [5] "00005.ed0aba4d386c5e62bc737cf3f0ed9589"
## [6] "00006.3ca1f399ccda5d897fecb8c57669a283"
## [7] "00007.acefeee792b5298f8fee175f9f65c453"
## [8] "00008.ccf927a6aec028f5472ca7b9db9eee20"
## [9] "00009.1e1a8cb4b57532ab38aa23287523659d"
## [10] "00010.2558d935f6439cb40d3acb8b8569aa9b"
createCorpus <- function(directory, emailType) {
quantCorpus <- corpus(VCorpus(DirSource(directory = directory, encoding = "UTF-8-MAC"),
readerControl = list(language="en_US")),
notes=emailType)
docvars(quantCorpus, "email_type") <- emailType
docvars(quantCorpus, "source") <- directory
return(quantCorpus)
}
buildDFM <- function(corpus, minDoc, minCount) {
# create the document-feature matrix(DFM)
dfm <- dfm(corpus, ignoredFeatures = stopwords("english"), stem = TRUE)
# word should atleast occur 10 times in 2 documents
dfm <- dfm_trim(dfm, min_count = 10, min_docfreq = 2, sparsity = NULL, verbose = TRUE)
return(dfm)
}
create_df_matrix <- function(dfm, emailType) {
# apply the tfidf function
#compute term frequency-inverse document frequency weights
mat <- data.matrix(tfidf(dfm))
# convert to a dataframe
df <- as.data.frame(mat, stringsAsFactors = FALSE)
df$Source <- emailType
#str(df)
return(df)
}
spamCorpus <- createCorpus("/Users/Raghu/spamham/spam_2", "spam")
## Warning in corpus.character(texts, docvars = metad, metacorpus =
## metacorpus, : Argument notes not used.
summary(spamCorpus,10)
## Corpus consisting of 1397 documents, showing 10 documents.
##
## Text Types Tokens Sentences author datetimestamp description
## text1 408 1079 29 <NA> 2017-04-10 22:39:47 <NA>
## text2 456 1509 31 <NA> 2017-04-10 22:39:47 <NA>
## text3 479 1742 38 <NA> 2017-04-10 22:39:47 <NA>
## text4 492 1780 38 <NA> 2017-04-10 22:39:47 <NA>
## text5 363 1114 19 <NA> 2017-04-10 22:39:47 <NA>
## text6 577 8962 15 <NA> 2017-04-10 22:39:47 <NA>
## text7 248 496 13 <NA> 2017-04-10 22:39:47 <NA>
## text8 698 4129 98 <NA> 2017-04-10 22:39:47 <NA>
## text9 482 1397 6 <NA> 2017-04-10 22:39:47 <NA>
## text10 304 956 15 <NA> 2017-04-10 22:39:47 <NA>
## heading id language origin email_type
## <NA> 00001.317e78fa8ee2f54cd4890fdc09ba8176 en_US <NA> spam
## <NA> 00002.9438920e9a55591b18e60d1ed37d992b en_US <NA> spam
## <NA> 00003.590eff932f8704d8b0fcbe69d023b54d en_US <NA> spam
## <NA> 00004.bdcc075fa4beb5157b5dd6cd41d8887b en_US <NA> spam
## <NA> 00005.ed0aba4d386c5e62bc737cf3f0ed9589 en_US <NA> spam
## <NA> 00006.3ca1f399ccda5d897fecb8c57669a283 en_US <NA> spam
## <NA> 00007.acefeee792b5298f8fee175f9f65c453 en_US <NA> spam
## <NA> 00008.ccf927a6aec028f5472ca7b9db9eee20 en_US <NA> spam
## <NA> 00009.1e1a8cb4b57532ab38aa23287523659d en_US <NA> spam
## <NA> 00010.2558d935f6439cb40d3acb8b8569aa9b en_US <NA> spam
## source
## /Users/Raghu/spamham/spam_2
## /Users/Raghu/spamham/spam_2
## /Users/Raghu/spamham/spam_2
## /Users/Raghu/spamham/spam_2
## /Users/Raghu/spamham/spam_2
## /Users/Raghu/spamham/spam_2
## /Users/Raghu/spamham/spam_2
## /Users/Raghu/spamham/spam_2
## /Users/Raghu/spamham/spam_2
## /Users/Raghu/spamham/spam_2
##
## Source: Converted from tm VCorpus 'VCorpus(DirSource(directory = directory, encoding = "UTF-8-MAC"), 'Converted from tm VCorpus ' readerControl = list(language = "en_US"))'
## Created: Mon Apr 10 18:39:49 2017
## Notes:
dfmSpam <- buildDFM(spamCorpus, docnames(spamCorpus), 50)
## Warning in tokens.character(x, ...): Argument ignoredFeatures not used.
## Removing features occurring:
## - fewer than 10 times: 57,866
## - in fewer than 2 documents: 47,471
## Total features removed: 58,156 (91.7%).
dfmSpam
## Document-feature matrix of: 1,397 documents, 5,262 features (95.1% sparse).
summary(dfmSpam,20)
## Length Class Mode
## 7351014 dfmSparse S4
dim(dfmSpam)
## [1] 1397 5262
topfeatures(dfmSpam, 20)
## > < = " / - : , . font
## 162640 162341 161302 118745 107197 103283 66258 40411 35690 33900
## ; 3d ) ( to br a * td the
## 32001 30177 18750 18518 17697 16751 16561 15947 15691 14901
plot(topfeatures(dfmSpam, 100), log = "y", cex = .6, ylab = "Term frequency", main = "Top Features of Spam")
textplot_wordcloud(dfmSpam, max.words = 100,random.color = TRUE, rot.per = .25, colors = sample(colors()[2:128], 5))
hamCorpus <- createCorpus("/Users/Raghu/spamham/easy_ham", "ham")
## Warning in corpus.character(texts, docvars = metad, metacorpus =
## metacorpus, : Argument notes not used.
summary(hamCorpus,10)
## Corpus consisting of 2501 documents, showing 10 documents.
##
## Text Types Tokens Sentences author datetimestamp description
## text1 300 1080 25 <NA> 2017-04-10 22:39:59 <NA>
## text2 250 802 5 <NA> 2017-04-10 22:39:59 <NA>
## text3 326 904 11 <NA> 2017-04-10 22:39:59 <NA>
## text4 275 742 9 <NA> 2017-04-10 22:39:59 <NA>
## text5 272 802 10 <NA> 2017-04-10 22:39:59 <NA>
## text6 246 792 7 <NA> 2017-04-10 22:39:59 <NA>
## text7 308 932 9 <NA> 2017-04-10 22:39:59 <NA>
## text8 291 850 14 <NA> 2017-04-10 22:39:59 <NA>
## text9 697 1948 42 <NA> 2017-04-10 22:39:59 <NA>
## text10 268 812 11 <NA> 2017-04-10 22:39:59 <NA>
## heading id language origin email_type
## <NA> 00001.7c53336b37003a9286aba55d2945844c en_US <NA> ham
## <NA> 00002.9c4069e25e1ef370c078db7ee85ff9ac en_US <NA> ham
## <NA> 00003.860e3c3cee1b42ead714c5c874fe25f7 en_US <NA> ham
## <NA> 00004.864220c5b6930b209cc287c361c99af1 en_US <NA> ham
## <NA> 00005.bf27cdeaf0b8c4647ecd61b1d09da613 en_US <NA> ham
## <NA> 00006.253ea2f9a9cc36fa0b1129b04b806608 en_US <NA> ham
## <NA> 00007.37a8af848caae585af4fe35779656d55 en_US <NA> ham
## <NA> 00008.5891548d921601906337dcf1ed8543cb en_US <NA> ham
## <NA> 00009.371eca25b0169ce5cb4f71d3e07b9e2d en_US <NA> ham
## <NA> 00010.145d22c053c1a0c410242e46c01635b3 en_US <NA> ham
## source
## /Users/Raghu/spamham/easy_ham
## /Users/Raghu/spamham/easy_ham
## /Users/Raghu/spamham/easy_ham
## /Users/Raghu/spamham/easy_ham
## /Users/Raghu/spamham/easy_ham
## /Users/Raghu/spamham/easy_ham
## /Users/Raghu/spamham/easy_ham
## /Users/Raghu/spamham/easy_ham
## /Users/Raghu/spamham/easy_ham
## /Users/Raghu/spamham/easy_ham
##
## Source: Converted from tm VCorpus 'VCorpus(DirSource(directory = directory, encoding = "UTF-8-MAC"), 'Converted from tm VCorpus ' readerControl = list(language = "en_US"))'
## Created: Mon Apr 10 18:40:00 2017
## Notes:
dfmham <- buildDFM(hamCorpus, docnames(hamCorpus), 50)
## Warning in tokens.character(x, ...): Argument ignoredFeatures not used.
## Removing features occurring:
## - fewer than 10 times: 46,707
## - in fewer than 2 documents: 35,872
## Total features removed: 46,775 (89.3%).
dfmham
## Document-feature matrix of: 2,501 documents, 5,615 features (96.2% sparse).
summary(dfmham,10)
## Length Class Mode
## 14043115 dfmSparse S4
dim(dfmham)
## [1] 2501 5615
topfeatures(dfmham, 20)
## - : > / , . ) ( < to
## 166462 127431 59497 56192 45884 43510 41828 41305 35539 23723
## the 2002 from ; = with ] [ by for
## 23554 21812 21330 19512 19307 16127 16082 15917 15895 15501
plot(topfeatures(dfmham, 100), log = "y", cex = .6, ylab = "Term frequency", main = "Top Features of Ham")
textplot_wordcloud(dfmham, max.words = 100,random.color = TRUE, rot.per = .25, colors = sample(colors()[2:128], 5))
dfSpam <- create_df_matrix(dfmSpam, "spam")
dfHam <- create_df_matrix(dfmham, "ham")
stacked.df <- rbind.fill(dfSpam, dfHam)
#Stack the data frames of Spam and ham
# set NA values to 0
stacked.df[is.na(stacked.df)] <- 0
dim(stacked.df)
## [1] 3898 8186
tdm.email <- stacked.df[, "Source"]
stacked.nl <- stacked.df[, !colnames(stacked.df) %in% "Source"]
n <- length(tdm.email)
# taking the training size as 1000 which is one fourth of the whole size.
container <- create_container(stacked.nl,
tdm.email, trainSize=1:1000,
testSize=1001:n, virgin=FALSE)
slotNames(container)
## [1] "training_matrix" "classification_matrix" "training_codes"
## [4] "testing_codes" "column_names" "virgin"
#MAXENT <- train_model(container,"MAXENT")
#BOOSTING <- train_model(container,"BOOSTING")
tree_model <- train_model(container, "TREE")
#Having problems with MAXENT,SVM and BOOSTING.
#MAXENT_out <- classify_model(container, MAXENT)
#BOOSTING_out <- classify_model(container, BOOSTING)
tree_model_out <- classify_model(container, tree_model)
head(tree_model_out,5)
## TREE_LABEL TREE_PROB
## 1 spam 1
## 2 spam 1
## 3 spam 1
## 4 spam 1
## 5 spam 1
labels_out <- data.frame(
correct_label = tdm.email[1001:n],
tree = as.character(tree_model_out[,1]),
stringAsFactors = F)
#ensure the dataframe does not have factor columns
labels_out$tree <- as.character(labels_out$tree)
#TREE Performance
table(labels_out[,1] == labels_out[,2])
##
## FALSE TRUE
## 2501 397
## Create training and test datasets
train.idx <- sample(nrow(stacked.df), ceiling(nrow(stacked.df) * 0.7))
test.idx <- (1:nrow(stacked.df)) [-train.idx]
head(train.idx,5)
## [1] 2486 2113 3653 3396 1157
head(test.idx,5)
## [1] 1 3 5 6 9
length(train.idx)
## [1] 2729
length(test.idx)
## [1] 1169
#kNN prediction using the training and test datasets
knn.pred <- knn(stacked.nl[train.idx, ], stacked.nl[test.idx, ], tdm.email[train.idx])
conf.mat <- table("Predictions" = knn.pred, Actual = tdm.email[test.idx])
conf.mat
## Actual
## Predictions ham spam
## ham 724 12
## spam 24 409
#df.pred <- cbind(knn.pred, stacked.nl[test.idx, ])
#head(df.pred,5)
Comparing the two modals- KNN and Tree, I could see that the predictions match. 397 Spam reported by these two modals.