The corpus for this analysis is located here: https://spamassassin.apache.org/publiccorpus/

Overview of quanteda: 1. Easy to manage texts in the form of a corpus. 2. quanteda includes tools to make it easy and fast to manuipulate the texts in a corpus, by performing the most common natural language processing tasks simply and quickly, such as tokenizing, stemming, or forming ngrams. quanteda’s functions for tokenizing texts and forming multiple tokenized documents into a document-feature matrix are both extremely fast and extremely simple to use. 3. quanteda can segment texts easily by words, paragraphs, sentences, or even user-supplied delimiters and tags. For details refer the below link https://cran.r-project.org/web/packages/quanteda/quanteda.pdf

1 Libraries

#The code for this assignment requires the following R packages:

libr <- c("wordcloud","ggplot2","tm","plyr","class","stringr","stringi","RCurl","XML","SnowballC","R.utils","quanteda","class","knitr","Rweka","RTextTools")
lapply( libr, require, character.only = TRUE)

## Loading required package: wordcloud

## Loading required package: RColorBrewer

## Loading required package: ggplot2

## Loading required package: tm

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

## Loading required package: plyr

## Loading required package: class

## Loading required package: stringr

## Loading required package: stringi

## Loading required package: RCurl

## Loading required package: bitops

## Loading required package: XML

## Loading required package: SnowballC

## Loading required package: R.utils

## Loading required package: R.oo

## Loading required package: R.methodsS3

## R.methodsS3 v1.7.1 (2016-02-15) successfully loaded. See ?R.methodsS3 for help.

## R.oo v1.21.0 (2016-10-30) successfully loaded. See ?R.oo for help.

## 
## Attaching package: 'R.oo'

## The following object is masked from 'package:RCurl':
## 
##     clone

## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods

## The following objects are masked from 'package:base':
## 
##     attach, detach, gc, load, save

## R.utils v2.5.0 (2016-11-07) successfully loaded. See ?R.utils for help.

## 
## Attaching package: 'R.utils'

## The following object is masked from 'package:RCurl':
## 
##     reset

## The following object is masked from 'package:utils':
## 
##     timestamp

## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, inherits, isOpen, parse, warnings

## Loading required package: quanteda

## quanteda version 0.9.9.24

## Using 3 of 4 cores for parallel computing

## 
## Attaching package: 'quanteda'

## The following object is masked from 'package:R.oo':
## 
##     trim

## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords

## The following object is masked from 'package:NLP':
## 
##     ngrams

## The following object is masked from 'package:utils':
## 
##     View

## The following object is masked from 'package:base':
## 
##     sample

## Loading required package: knitr

## Loading required package: Rweka

## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'Rweka'

## Loading required package: RTextTools

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

## 
## Attaching package: 'RTextTools'

## The following objects are masked from 'package:SnowballC':
## 
##     getStemLanguages, wordStem

## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE
## 
## [[3]]
## [1] TRUE
## 
## [[4]]
## [1] TRUE
## 
## [[5]]
## [1] TRUE
## 
## [[6]]
## [1] TRUE
## 
## [[7]]
## [1] TRUE
## 
## [[8]]
## [1] TRUE
## 
## [[9]]
## [1] TRUE
## 
## [[10]]
## [1] TRUE
## 
## [[11]]
## [1] TRUE
## 
## [[12]]
## [1] TRUE
## 
## [[13]]
## [1] TRUE
## 
## [[14]]
## [1] TRUE
## 
## [[15]]
## [1] FALSE
## 
## [[16]]
## [1] TRUE

#Set Options
Options(stringsAsFactors = FALSE)

## [1] "Options: 0 options set."

2 Step 1: List the spam files

spam_file_folder <- "/Users/Raghu/spamham/spam_2/"
spam_file_names <- list.files(spam_file_folder)
#check the files
head(spam_file_names ,10)

##  [1] "00001.317e78fa8ee2f54cd4890fdc09ba8176"
##  [2] "00002.9438920e9a55591b18e60d1ed37d992b"
##  [3] "00003.590eff932f8704d8b0fcbe69d023b54d"
##  [4] "00004.bdcc075fa4beb5157b5dd6cd41d8887b"
##  [5] "00005.ed0aba4d386c5e62bc737cf3f0ed9589"
##  [6] "00006.3ca1f399ccda5d897fecb8c57669a283"
##  [7] "00007.acefeee792b5298f8fee175f9f65c453"
##  [8] "00008.ccf927a6aec028f5472ca7b9db9eee20"
##  [9] "00009.1e1a8cb4b57532ab38aa23287523659d"
## [10] "00010.2558d935f6439cb40d3acb8b8569aa9b"

3 Step 2: Create functions 1.create corpus 2. Document feature matrix and 3. Data frame

createCorpus <- function(directory, emailType) {
    
    quantCorpus <- corpus(VCorpus(DirSource(directory = directory, encoding = "UTF-8-MAC"), 
                                    readerControl = list(language="en_US")),
                      notes=emailType)
    
    docvars(quantCorpus, "email_type") <- emailType
    docvars(quantCorpus, "source")     <- directory
    return(quantCorpus)
}

buildDFM <- function(corpus, minDoc, minCount) {
    # create the document-feature matrix(DFM)
    dfm <- dfm(corpus, ignoredFeatures = stopwords("english"), stem = TRUE)
    # word should atleast occur 10 times in 2 documents
    dfm <- dfm_trim(dfm, min_count = 10, min_docfreq = 2, sparsity = NULL, verbose = TRUE)
    return(dfm)
}

create_df_matrix <- function(dfm, emailType) {
    
    # apply the tfidf function
    #compute term frequency-inverse document frequency weights
    mat <- data.matrix(tfidf(dfm))
 
    # convert to a dataframe
    df <- as.data.frame(mat, stringsAsFactors =  FALSE)
    df$Source <- emailType
     #str(df)    
    return(df)
}

4 Step 3: Create spam corpus

spamCorpus <- createCorpus("/Users/Raghu/spamham/spam_2", "spam")

## Warning in corpus.character(texts, docvars = metad, metacorpus =
## metacorpus, : Argument notes not used.

summary(spamCorpus,10)

## Corpus consisting of 1397 documents, showing 10 documents.
## 
##    Text Types Tokens Sentences author       datetimestamp description
##   text1   408   1079        29   <NA> 2017-04-10 22:39:47        <NA>
##   text2   456   1509        31   <NA> 2017-04-10 22:39:47        <NA>
##   text3   479   1742        38   <NA> 2017-04-10 22:39:47        <NA>
##   text4   492   1780        38   <NA> 2017-04-10 22:39:47        <NA>
##   text5   363   1114        19   <NA> 2017-04-10 22:39:47        <NA>
##   text6   577   8962        15   <NA> 2017-04-10 22:39:47        <NA>
##   text7   248    496        13   <NA> 2017-04-10 22:39:47        <NA>
##   text8   698   4129        98   <NA> 2017-04-10 22:39:47        <NA>
##   text9   482   1397         6   <NA> 2017-04-10 22:39:47        <NA>
##  text10   304    956        15   <NA> 2017-04-10 22:39:47        <NA>
##  heading                                     id language origin email_type
##     <NA> 00001.317e78fa8ee2f54cd4890fdc09ba8176    en_US   <NA>       spam
##     <NA> 00002.9438920e9a55591b18e60d1ed37d992b    en_US   <NA>       spam
##     <NA> 00003.590eff932f8704d8b0fcbe69d023b54d    en_US   <NA>       spam
##     <NA> 00004.bdcc075fa4beb5157b5dd6cd41d8887b    en_US   <NA>       spam
##     <NA> 00005.ed0aba4d386c5e62bc737cf3f0ed9589    en_US   <NA>       spam
##     <NA> 00006.3ca1f399ccda5d897fecb8c57669a283    en_US   <NA>       spam
##     <NA> 00007.acefeee792b5298f8fee175f9f65c453    en_US   <NA>       spam
##     <NA> 00008.ccf927a6aec028f5472ca7b9db9eee20    en_US   <NA>       spam
##     <NA> 00009.1e1a8cb4b57532ab38aa23287523659d    en_US   <NA>       spam
##     <NA> 00010.2558d935f6439cb40d3acb8b8569aa9b    en_US   <NA>       spam
##                       source
##  /Users/Raghu/spamham/spam_2
##  /Users/Raghu/spamham/spam_2
##  /Users/Raghu/spamham/spam_2
##  /Users/Raghu/spamham/spam_2
##  /Users/Raghu/spamham/spam_2
##  /Users/Raghu/spamham/spam_2
##  /Users/Raghu/spamham/spam_2
##  /Users/Raghu/spamham/spam_2
##  /Users/Raghu/spamham/spam_2
##  /Users/Raghu/spamham/spam_2
## 
## Source:  Converted from tm VCorpus 'VCorpus(DirSource(directory = directory, encoding = "UTF-8-MAC"), 'Converted from tm VCorpus '    readerControl = list(language = "en_US"))'
## Created: Mon Apr 10 18:39:49 2017
## Notes:

5 Step 4: Create DFM(Document Feature Matrix) for Spam Corpus

dfmSpam <- buildDFM(spamCorpus, docnames(spamCorpus), 50)

## Warning in tokens.character(x, ...): Argument ignoredFeatures not used.

## Removing features occurring:

##   - fewer than 10 times: 57,866

##   - in fewer than 2 documents: 47,471

##   Total features removed: 58,156 (91.7%).

dfmSpam

## Document-feature matrix of: 1,397 documents, 5,262 features (95.1% sparse).

summary(dfmSpam,20)

##    Length     Class      Mode 
##   7351014 dfmSparse        S4

dim(dfmSpam)

## [1] 1397 5262

topfeatures(dfmSpam, 20)

##      >      <      =      "      /      -      :      ,      .   font 
## 162640 162341 161302 118745 107197 103283  66258  40411  35690  33900 
##      ;     3d      )      (     to     br      a      *     td    the 
##  32001  30177  18750  18518  17697  16751  16561  15947  15691  14901

6 Step 5: Create Plot and word cloud for Spam

plot(topfeatures(dfmSpam, 100), log = "y", cex = .6, ylab = "Term frequency", main = "Top Features of Spam")

textplot_wordcloud(dfmSpam, max.words = 100,random.color = TRUE, rot.per = .25, colors = sample(colors()[2:128], 5))

7 Step 6: Create Ham corpus

hamCorpus <- createCorpus("/Users/Raghu/spamham/easy_ham", "ham")

## Warning in corpus.character(texts, docvars = metad, metacorpus =
## metacorpus, : Argument notes not used.

summary(hamCorpus,10)

## Corpus consisting of 2501 documents, showing 10 documents.
## 
##    Text Types Tokens Sentences author       datetimestamp description
##   text1   300   1080        25   <NA> 2017-04-10 22:39:59        <NA>
##   text2   250    802         5   <NA> 2017-04-10 22:39:59        <NA>
##   text3   326    904        11   <NA> 2017-04-10 22:39:59        <NA>
##   text4   275    742         9   <NA> 2017-04-10 22:39:59        <NA>
##   text5   272    802        10   <NA> 2017-04-10 22:39:59        <NA>
##   text6   246    792         7   <NA> 2017-04-10 22:39:59        <NA>
##   text7   308    932         9   <NA> 2017-04-10 22:39:59        <NA>
##   text8   291    850        14   <NA> 2017-04-10 22:39:59        <NA>
##   text9   697   1948        42   <NA> 2017-04-10 22:39:59        <NA>
##  text10   268    812        11   <NA> 2017-04-10 22:39:59        <NA>
##  heading                                     id language origin email_type
##     <NA> 00001.7c53336b37003a9286aba55d2945844c    en_US   <NA>        ham
##     <NA> 00002.9c4069e25e1ef370c078db7ee85ff9ac    en_US   <NA>        ham
##     <NA> 00003.860e3c3cee1b42ead714c5c874fe25f7    en_US   <NA>        ham
##     <NA> 00004.864220c5b6930b209cc287c361c99af1    en_US   <NA>        ham
##     <NA> 00005.bf27cdeaf0b8c4647ecd61b1d09da613    en_US   <NA>        ham
##     <NA> 00006.253ea2f9a9cc36fa0b1129b04b806608    en_US   <NA>        ham
##     <NA> 00007.37a8af848caae585af4fe35779656d55    en_US   <NA>        ham
##     <NA> 00008.5891548d921601906337dcf1ed8543cb    en_US   <NA>        ham
##     <NA> 00009.371eca25b0169ce5cb4f71d3e07b9e2d    en_US   <NA>        ham
##     <NA> 00010.145d22c053c1a0c410242e46c01635b3    en_US   <NA>        ham
##                         source
##  /Users/Raghu/spamham/easy_ham
##  /Users/Raghu/spamham/easy_ham
##  /Users/Raghu/spamham/easy_ham
##  /Users/Raghu/spamham/easy_ham
##  /Users/Raghu/spamham/easy_ham
##  /Users/Raghu/spamham/easy_ham
##  /Users/Raghu/spamham/easy_ham
##  /Users/Raghu/spamham/easy_ham
##  /Users/Raghu/spamham/easy_ham
##  /Users/Raghu/spamham/easy_ham
## 
## Source:  Converted from tm VCorpus 'VCorpus(DirSource(directory = directory, encoding = "UTF-8-MAC"), 'Converted from tm VCorpus '    readerControl = list(language = "en_US"))'
## Created: Mon Apr 10 18:40:00 2017
## Notes:

8 Step 7: Create DFM(Document Feature Matrix) for Ham corpus

dfmham <- buildDFM(hamCorpus, docnames(hamCorpus), 50)

## Warning in tokens.character(x, ...): Argument ignoredFeatures not used.

## Removing features occurring:

##   - fewer than 10 times: 46,707

##   - in fewer than 2 documents: 35,872

##   Total features removed: 46,775 (89.3%).

dfmham

## Document-feature matrix of: 2,501 documents, 5,615 features (96.2% sparse).

summary(dfmham,10)

##    Length     Class      Mode 
##  14043115 dfmSparse        S4

dim(dfmham)

## [1] 2501 5615

topfeatures(dfmham, 20)

##      -      :      >      /      ,      .      )      (      <     to 
## 166462 127431  59497  56192  45884  43510  41828  41305  35539  23723 
##    the   2002   from      ;      =   with      ]      [     by    for 
##  23554  21812  21330  19512  19307  16127  16082  15917  15895  15501

9 Step 8: Create Plot and wordcloud for Ham

plot(topfeatures(dfmham, 100), log = "y", cex = .6, ylab = "Term frequency", main = "Top Features of Ham")

textplot_wordcloud(dfmham, max.words = 100,random.color = TRUE, rot.per = .25, colors = sample(colors()[2:128], 5))

10 Step 9: Stack the DFM of Spam and Ham

dfSpam <- create_df_matrix(dfmSpam, "spam")  
dfHam <- create_df_matrix(dfmham, "ham")  
stacked.df <- rbind.fill(dfSpam, dfHam)

#Stack the data frames of Spam and ham
# set NA values to 0
stacked.df[is.na(stacked.df)] <- 0
dim(stacked.df)

## [1] 3898 8186

11 Step 10: Create the tree modal

tdm.email <- stacked.df[, "Source"]
stacked.nl <- stacked.df[, !colnames(stacked.df) %in% "Source"]  
n <- length(tdm.email)

# taking the training size as 1000 which is one fourth of the whole size.
container <- create_container(stacked.nl,
            tdm.email, trainSize=1:1000,
            testSize=1001:n, virgin=FALSE)

slotNames(container)

## [1] "training_matrix"       "classification_matrix" "training_codes"       
## [4] "testing_codes"         "column_names"          "virgin"

#MAXENT <- train_model(container,"MAXENT")
#BOOSTING <- train_model(container,"BOOSTING")
tree_model <- train_model(container, "TREE")

#Having problems with MAXENT,SVM and BOOSTING.

#MAXENT_out <- classify_model(container, MAXENT)
#BOOSTING_out <- classify_model(container, BOOSTING)
tree_model_out <- classify_model(container, tree_model)

head(tree_model_out,5)

##   TREE_LABEL TREE_PROB
## 1       spam         1
## 2       spam         1
## 3       spam         1
## 4       spam         1
## 5       spam         1

labels_out <- data.frame(
  correct_label = tdm.email[1001:n],
  tree = as.character(tree_model_out[,1]),
  stringAsFactors = F)

#ensure the dataframe does not have factor columns
labels_out$tree <- as.character(labels_out$tree)

#TREE Performance
table(labels_out[,1] == labels_out[,2])

## 
## FALSE  TRUE 
##  2501   397

12 Step 11: Create KNN modal (k-Nearest Neighbor)

## Create training and test datasets 
train.idx <- sample(nrow(stacked.df), ceiling(nrow(stacked.df) * 0.7))
test.idx <- (1:nrow(stacked.df)) [-train.idx]
head(train.idx,5)

## [1] 2486 2113 3653 3396 1157

head(test.idx,5)

## [1] 1 3 5 6 9

length(train.idx)

## [1] 2729

length(test.idx)

## [1] 1169

#kNN prediction using the training and test datasets
knn.pred <- knn(stacked.nl[train.idx, ], stacked.nl[test.idx, ], tdm.email[train.idx])
conf.mat <- table("Predictions" = knn.pred, Actual = tdm.email[test.idx])
conf.mat

##            Actual
## Predictions ham spam
##        ham  724   12
##        spam  24  409

#df.pred <- cbind(knn.pred, stacked.nl[test.idx, ])
#head(df.pred,5)

13 Conclusion:

Comparing the two modals- KNN and Tree, I could see that the predictions match. 397 Spam reported by these two modals.

607_Project4 Document Classification

Raghu Ramnath

4/6/2017