SPAM vs HAM emails

Load necessary libraries

library("easypackages")
libraries("knitr", "downloader", "R.utils", "tm", "wordcloud", 
          "topicmodels","SnowballC", "e1071", "data.table", "RMySQL", 
          "tidyverse", "tidyr", "dplyr", "stringr", "stats", "quanteda", 
          "plyr", "class", "stringi")

## Loading required package: knitr

## Warning: package 'knitr' was built under R version 3.4.3

## Loading required package: downloader

## Loading required package: R.utils

## Loading required package: R.oo

## Loading required package: R.methodsS3

## R.methodsS3 v1.7.1 (2016-02-15) successfully loaded. See ?R.methodsS3 for help.

## R.oo v1.21.0 (2016-10-30) successfully loaded. See ?R.oo for help.

## 
## Attaching package: 'R.oo'

## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods

## The following objects are masked from 'package:base':
## 
##     attach, detach, gc, load, save

## R.utils v2.6.0 (2017-11-04) successfully loaded. See ?R.utils for help.

## 
## Attaching package: 'R.utils'

## The following object is masked from 'package:utils':
## 
##     timestamp

## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, inherits, isOpen, parse, warnings

## Loading required package: tm

## Warning: package 'tm' was built under R version 3.4.3

## Loading required package: NLP

## Loading required package: wordcloud

## Loading required package: RColorBrewer

## Loading required package: topicmodels

## Loading required package: SnowballC

## Loading required package: e1071

## Loading required package: data.table

## Loading required package: RMySQL

## Loading required package: DBI

## Loading required package: tidyverse

## ── Attaching packages ─────────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.1     ✔ dplyr   0.7.4
## ✔ tidyr   0.7.2     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0

## ── Conflicts ────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ dplyr::between()    masks data.table::between()
## ✖ tidyr::extract()    masks R.utils::extract()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::first()      masks data.table::first()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::last()       masks data.table::last()
## ✖ purrr::transpose()  masks data.table::transpose()

## Loading required package: quanteda

## Warning: package 'quanteda' was built under R version 3.4.4

## Package version: 1.1.1

## Parallel computing: 2 of 8 threads used.

## See https://quanteda.io for tutorials and examples.

## 
## Attaching package: 'quanteda'

## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords

## The following object is masked from 'package:utils':
## 
##     View

## Loading required package: plyr

## -------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## -------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following object is masked from 'package:purrr':
## 
##     compact

## Loading required package: class

## Loading required package: stringi

## All packages loaded successfully

Download the spam and ham files to the wd

URL <- "http://spamassassin.apache.org/old/publiccorpus/"

files <- c("20021010_easy_ham.tar.bz2", 
           "20021010_spam.tar.bz2", 
           "20021010_hard_ham.tar.bz2", 
           "20030228_spam_2.tar.bz2")

Download_and_untar used to download the files needed for analysis

download_and_untar <- function(filename, downloadOnly = FALSE) {

    downloader::download(url = paste0(URL, filename), filename )
    
    tar.file <- stri_replace_all_regex(filename, ".bz2", "")
    
    if (!downloadOnly) {
        
        # bunzip2 the file    
        bunzip2(filename, tar.file, remove = FALSE, skip = TRUE)
        
        # untar the file     
        untar(tar.file, exdir = ".")
        
        # remove the tar file
        if (file.exists(tar.file)) file.remove(tar.file)
        
    }
}

CreateCorpus used to to convert a corpus into a quanteda corpus

createCorpus <- function(directory, emailType) {
    
    quantCorpus <- corpus(Corpus(DirSource(directory = directory, encoding = "UTF-8"), 
                                 readerControl = list(language="en_US")),
                          notes=emailType)
    
    docvars(quantCorpus, "email_type") <- emailType
    docvars(quantCorpus, "source")     <- stri_replace_all_regex(directory, "./", "")
    
    return(quantCorpus)
    
}

BuildDFM accepts a corpus object and converts to a document-feature matrix

buildDFM <- function(corpus, minDoc, minCount) {
    # create the document-feature matrix
    
    # dfm = document-feature matrix
    dfm <- dfm(corpus, ignoredFeatures = stopwords("english"), stem = TRUE)
    
    dfm <- trim(dfm, minDoc = minDoc, minCount = minCount)
    
    return(dfm)
    
}

plotDFM <- function(dfm) {
    
    # plot in colors with some additional options passed to wordcloud
    plot(dfm, random.color = TRUE, rot.per = .25, colors = sample(colors()[2:128], 5))
    
}

Create_df_matrix accepts a dfm object, applies the td-idf function, and returns a dataframe

create_df_matrix <- function(dfm, emailType) {
    
    # apply the tfidf function
    mat <- data.matrix(tfidf(dfm))
    
    # convert to a dataframe
    df <- as.data.frame(mat, stringsAsFactors =  FALSE)
    df$Source <- emailType
    
    return(df)
}

First let’s download and create the spam and ham corpus

# use lapply to download and untar all files specified
lapply(files, download_and_untar)

## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE
## 
## [[3]]
## [1] TRUE
## 
## [[4]]
## [1] TRUE

Next, we can create the Spam Corpus

spamCorpus <- createCorpus("./spam", "spam")

## Warning in corpus.Corpus(Corpus(DirSource(directory = directory, encoding =
## "UTF-8"), : Argument notes not used.

spam2Corpus <- createCorpus("./spam_2", "spam")

## Warning in corpus.Corpus(Corpus(DirSource(directory = directory, encoding =
## "UTF-8"), : Argument notes not used.

Combine the 2 Spam corpora

spamCorpusCombined <- spamCorpus + spam2Corpus

# summarize the combined corpus
summary(spamCorpusCombined, 20)

## Corpus consisting of 1899 documents, showing 20 documents:
## 
##                                   Text Types Tokens Sentences email_type
##  0000.7b1b73cf36cf9dbc3d64e3f2ee2b91f1  1170   1835         1       spam
##  0001.bfc8d64d12b325ff385cca8d07b84288   326   1449        18       spam
##  0002.24b47bb3ce90708ae29d0aec1da08610   225    575        10       spam
##  0003.4b3d943b8df71af248d12f8b2e7a224a   190    436         9       spam
##  0004.1874ab60c71f0b31b580f313a3f6e777   354   1084        46       spam
##  0005.1f42bb885de0ef7fc5cd09d34dc2ba54   213    536         7       spam
##  0006.7a32642f8c22bbeb85d6c3b5f3890a2c   359    761        27       spam
##  0007.859c901719011d56f8b652ea071c1f8b   174    381        10       spam
##  0008.9562918b57e044abfbce260cc875acde   601   5897        22       spam
##  0009.c05e264fbf18783099b53dbc9a9aacda   407    893        40       spam
##  0010.7f5fb525755c45eb78efc18d7c9ea5aa   220    791         5       spam
##  0011.2a1247254a535bac29c476b86c708901   190    437         9       spam
##  0012.7bc8e619ad0264979edce15083e70a02   157    503         7       spam
##  0013.9034ac0917f6fdb82c5ee6a7509029ed   188    434         9       spam
##  0014.ed99ffe0f452b91be11684cbfe8d349c   297   1844        38       spam
##  0015.1b871d654560011a0aaa29bb4e9054f7   172    473         7       spam
##  0016.f9c349935955e1ccc7626270da898445   302   1639        10       spam
##  0017.49ab70c7a4042cb1c695a0e59a6ede54   349    756        40       spam
##  0018.259154a52bc55dcae491cfded60a5cd2   173    379        11       spam
##  0019.939e70d8367f315193e4bc5be80dc262   315    685        19       spam
##  source
##    spam
##    spam
##    spam
##    spam
##    spam
##    spam
##    spam
##    spam
##    spam
##    spam
##    spam
##    spam
##    spam
##    spam
##    spam
##    spam
##    spam
##    spam
##    spam
##    spam
## 
## Source: Combination of corpuses spamCorpus and spam2Corpus
## Created: Fri Apr 20 23:27:23 2018
## Notes:

Next, let’s build the document-feature matrix using the Spam corpus

dfmSpam <- buildDFM(spamCorpusCombined, round(length(docnames(spamCorpusCombined))/10), 50)

## Warning: Argument ignoredFeatures not used.

## Warning: Argument ignoredFeatures not used.

dim(dfmSpam)              # basic dimensions of the dfm

## NULL

topfeatures(dfmSpam, 20)  # top features of the spam dfm

## Error in topfeatures.default(dfmSpam, 20): topfeatures() only works on dfm objects.

plot(topfeatures(dfmSpam, 100), log = "y", cex = .6, ylab = "Term frequency", main = "Top Features of Spam")

## Error in topfeatures.default(dfmSpam, 100): topfeatures() only works on dfm objects.

Wordcloud of 100 top spam words

plotDFM(dfmSpam[, 1:100])

## Error in dfmSpam[, 1:100]: incorrect number of dimensions

Let’s now create the Ham corpus

hamCorpus <- createCorpus("./easy_ham", "ham")

## Warning in corpus.Corpus(Corpus(DirSource(directory = directory, encoding =
## "UTF-8"), : Argument notes not used.

ham2Corpus <- createCorpus("./hard_ham", "ham")

## Warning in corpus.Corpus(Corpus(DirSource(directory = directory, encoding =
## "UTF-8"), : Argument notes not used.

#combine the 2 ham corpa 
hamCorpusCombined <- hamCorpus + ham2Corpus

# summarize the combined corpus
summary(hamCorpusCombined, 20)

## Corpus consisting of 2801 documents, showing 20 documents:
## 
##                                   Text Types Tokens Sentences email_type
##  0001.ea7e79d3153e7469e7a9c3e0af6a357e   289    948        25        ham
##  0002.b3120c4bcbf3101e661161ee7efcb8bf   234    722         5        ham
##  0003.acfc5ad94bbd27118a0d8685d18c89dd   302    822        11        ham
##  0004.e8d5727378ddde5c3be181df593f1712   256    668         9        ham
##  0005.8c3b9e9c0f3f183ddaf7592a11b99957   358    991        23        ham
##  0006.ee8b0dba12856155222be180ba122058   253    732        10        ham
##  0007.c75188382f64b090022fa3b095b020b0   228    726         7        ham
##  0008.20bc0b4ba2d99aae1c7098069f611a9b   286    858         9        ham
##  0009.435ae292d75abb1ca492dcc2d5cf1570   273    780        14        ham
##  0010.4996141de3f21e858c22f88231a9f463   669   1818        42        ham
##  0011.07b11073b53634cff892a7988289a72e   307   1005        30        ham
##  0012.d354b2d2f24d1036caf1374dd94f4c94   246    644        11        ham
##  0013.ff597adee000d073ae72200b0af00cd1   221    629        14        ham
##  0014.532e0a17d0674ba7a9baa7b0afe5fb52   353   1017        34        ham
##  0015.a9ff8d7550759f6ab62cc200bdf156e7   239    638        10        ham
##  0016.d82758030e304d41fb3f4ebbb7d9dd91   286    756        17        ham
##  0017.d81093a2182fc9135df6d9158a8ebfd6   257    689        15        ham
##  0018.ba70ecbeea6f427b951067f34e23bae6   382   1227        45        ham
##  0019.a8a1b2767e83b3be653e4af0148e1897   527   1435        36        ham
##  0020.ef397cef16f8041242e3b6560e168053   207    513         6        ham
##    source
##  easy_ham
##  easy_ham
##  easy_ham
##  easy_ham
##  easy_ham
##  easy_ham
##  easy_ham
##  easy_ham
##  easy_ham
##  easy_ham
##  easy_ham
##  easy_ham
##  easy_ham
##  easy_ham
##  easy_ham
##  easy_ham
##  easy_ham
##  easy_ham
##  easy_ham
##  easy_ham
## 
## Source: Combination of corpuses hamCorpus and ham2Corpus
## Created: Fri Apr 20 23:31:13 2018
## Notes:

Next, let’s build the document-feature matrix using the Ham corpus

dfmHam <- buildDFM(hamCorpusCombined, round(length(docnames(hamCorpusCombined))/10), 50)

## Warning: Argument ignoredFeatures not used.

## Warning: Argument ignoredFeatures not used.

dim(dfmHam)

## NULL

plot(topfeatures(dfmHam, 100), log = "y", cex = .6, ylab = "Term frequency", main = "Top Features of Ham")

## Error in topfeatures.default(dfmHam, 100): topfeatures() only works on dfm objects.

Build the k-Nearest Neighbor Model for Document Classification

dfSpam <- create_df_matrix(dfmSpam, "spam")

## Warning: 'tfidf' is deprecated.
## Use 'dfm_tfidf' instead.
## See help("Deprecated")

## Error in dfm_tfidf.default(dfm): dfm_tfidf() only works on dfm objects.

dfHam <- create_df_matrix(dfmHam, "ham")

## Warning: 'tfidf' is deprecated.
## Use 'dfm_tfidf' instead.
## See help("Deprecated")

## Error in dfm_tfidf.default(dfm): dfm_tfidf() only works on dfm objects.

stacked.df <- rbind.fill(dfSpam, dfHam)

## Error in rbind.fill(dfSpam, dfHam): object 'dfSpam' not found

# set NA values to 0
stacked.df[is.na(stacked.df)] <- 0

## Error in stacked.df[is.na(stacked.df)] <- 0: object 'stacked.df' not found

Create the training and test datasets

train.idx <- sample(nrow(stacked.df), ceiling(nrow(stacked.df) * 0.7))

## Error in nrow(stacked.df): object 'stacked.df' not found

test.idx <- (1:nrow(stacked.df)) [-train.idx]

## Error in nrow(stacked.df): object 'stacked.df' not found

length(train.idx)

## Error in eval(expr, envir, enclos): object 'train.idx' not found

length(test.idx)

## Error in eval(expr, envir, enclos): object 'test.idx' not found

tdm.email <- stacked.df[, "Source"]

## Error in eval(expr, envir, enclos): object 'stacked.df' not found

stacked.nl <- stacked.df[, !colnames(stacked.df) %in% "Source"]  #stacked.nl

## Error in eval(expr, envir, enclos): object 'stacked.df' not found

#Run the kNN prediction using the training and test datasets
knn.pred <- knn(stacked.nl[train.idx, ], stacked.nl[test.idx, ], tdm.email[train.idx])

## Error in as.matrix(train): object 'stacked.nl' not found

The resulting Confusion Matrix:

conf.mat <- table("Predictions" = knn.pred, Actual = tdm.email[test.idx])

## Error in table(Predictions = knn.pred, Actual = tdm.email[test.idx]): object 'knn.pred' not found

conf.mat

## Error in eval(expr, envir, enclos): object 'conf.mat' not found

The accuracy of the model =

r (accuracy <- sum(diag(conf.mat)) / length(test.idx) * 100 )

## Error in r(accuracy <- sum(diag(conf.mat))/length(test.idx) * 100): could not find function "r"

To output the predictions

df.pred <- cbind(knn.pred, stacked.nl[test.idx, ])

## Error in cbind(knn.pred, stacked.nl[test.idx, ]): object 'knn.pred' not found

Data 607: Project 4

RSingh

4/16/2018

SPAM vs HAM emails

Download the spam and ham files to the wd