Spam classification

Data manipulation

# loading libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.4
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidytext)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(RTextTools)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## Registered S3 method overwritten by 'tree':
##   method     from
##   print.tree cli

Loading files from

# I was unable to copy the folders to github! Shame on me!

setwd("/Users/blam/Downloads/Project4/")
spam <- Corpus(DirSource("/Users/blam/Downloads/Project4/spam",  encoding = "UTF-8"))
easy_ham <- Corpus(DirSource("/Users/blam/Downloads/Project4/easy_ham",  encoding = "UTF-8"))

#hard_ham <- Corpus(DirSource(""), readerControl = list(language="lat"))

# adding email type
meta(spam, tag="class_type") <- "2"
meta(easy_ham, tag="class_type") <- "1"

#creating labels to be used in the training algorithm
labels<-unlist(c(meta(spam[]),meta(easy_ham[]))) # works

#labels_m<-as.data.frame(labels)
#labels_df<-data.frame(matrix(NA, nrow = 3051, ncol = 2),stringsAsFactors = FALSE)
#labels_df$X1<-labels_m[,1]
#labels_df$X2<-1:3051
#colnames(labels_df)<-c("type","index")

#labels<-as.numeric(factor(factor(labels))) # works

Combine corpus

#Combine corpus
corps <- c(spam,easy_ham,recursive=T)
corps <- iconv(corps, "ASCII", "UTF-8", sub="byte")

comb<- VCorpus(VectorSource(corps))

#Inspection
inspect(comb[3053])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## $content.2052.26af2a75e1e49e4ce02a01912ca86601
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 1151
meta(comb[[1000]])
##   author       : character(0)
##   datetimestamp: 2020-04-27 02:49:48
##   description  : character(0)
##   heading      : character(0)
##   id           : 1000
##   language     : en
##   origin       : character(0)

Applying transformations available

e_corpus <- tm_map(comb, removePunctuation)
e_corpus <- tm_map(comb, removeNumbers)
e_corpus <- tm_map(comb, removeWords, stopwords("english"))
e_corpus <- tm_map(comb, stripWhitespace)
# Also applying Porter's word stemmer - http://www.cs.odu.edu/~jbollen/IR04/readings/readings5.pdf
e_corpus <- tm_map(comb, stemDocument)

#Inspection
inspect(e_corpus[1000])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## $dmeta.class_type499
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 1
meta(e_corpus[[1000]])
##   author       : character(0)
##   datetimestamp: 2020-04-27 02:49:48
##   description  : character(0)
##   heading      : character(0)
##   id           : 1000
##   language     : en
##   origin       : character(0)
# resampling - was not able to resample due to issues with labeling
#corps_samp <- sample(e_corpus, 550)

#inspect(corps_samp[550])
#meta(corps_samp[[550]])

#Build document-term matrix

e_dtm <- DocumentTermMatrix(e_corpus)
e_dtm
## <<DocumentTermMatrix (documents: 6104, terms: 123591)>>
## Non-/sparse entries: 686252/753713212
## Sparsity           : 100%
## Maximal term length: 521
## Weighting          : term frequency (tf)
#suggestions to reduce dimension is to remove less frequent terms such that the sparsity is less than 95%

e_dtm <- removeSparseTerms(e_dtm, 0.99)
e_dtm
## <<DocumentTermMatrix (documents: 6104, terms: 1290)>>
## Non-/sparse entries: 388396/7485764
## Sparsity           : 95%
## Maximal term length: 76
## Weighting          : term frequency (tf)

Using RTextTools for Classification - WRAP THE DATA IN A CONTAINER

Reference: https://journal.r-project.org/archive/2013-1/collingwood-jurka-boydstun-etal.pdf

set.seed(95616)
train_size<-round(0.9*length(labels))
sample_size<-length(labels)

emails_cont<-create_container(e_dtm, labels=labels,trainSize = 1:train_size, testSize = (train_size+1):sample_size, virgin = FALSE)

# TRAIN THE ALGORITHMS USING THE CONTAINER
SVM <- train_model(emails_cont,"SVM")
GLMNET <- train_model(emails_cont,"GLMNET")
SLDA <- train_model(emails_cont,"SLDA")
BOOSTING <- train_model(emails_cont,"BOOSTING")
BAGGING <- train_model(emails_cont,"BAGGING")
RF <- train_model(emails_cont,"RF")
#NNET <- train_model(emails_cont,"NNET")
TREE <- train_model(emails_cont,"TREE")

# CLASSIFY THE TESTING DATA USING THE TRAINED MODELS.
SVM_CLASSIFY <- classify_model(emails_cont, SVM)
GLMNET_CLASSIFY <- classify_model(emails_cont, GLMNET)
SLDA_CLASSIFY <- classify_model(emails_cont, SLDA)
BOOSTING_CLASSIFY <- classify_model(emails_cont, BOOSTING)
BAGGING_CLASSIFY <- classify_model(emails_cont, BAGGING)
RF_CLASSIFY <- classify_model(emails_cont, RF)
#NNET_CLASSIFY <- classify_model(emails_cont, NNET)
TREE_CLASSIFY <- classify_model(emails_cont, TREE)

analytics <- create_analytics(emails_cont,cbind(SVM_CLASSIFY, SLDA_CLASSIFY,BOOSTING_CLASSIFY, BAGGING_CLASSIFY,RF_CLASSIFY, GLMNET_CLASSIFY, TREE_CLASSIFY))

summary(analytics)
## ENSEMBLE SUMMARY
## 
##        n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1                1.00                 1
## n >= 2                1.00                 1
## n >= 3                1.00                 1
## n >= 4                1.00                 1
## n >= 5                1.00                 1
## n >= 6                0.99                 1
## n >= 7                0.97                 1
## 
## 
## ALGORITHM PERFORMANCE
## 
##        SVM_PRECISION           SVM_RECALL           SVM_FSCORE 
##                    1                    1                    1 
##       SLDA_PRECISION          SLDA_RECALL          SLDA_FSCORE 
##                    1                    1                    1 
## LOGITBOOST_PRECISION    LOGITBOOST_RECALL    LOGITBOOST_FSCORE 
##                    1                    1                    1 
##    BAGGING_PRECISION       BAGGING_RECALL       BAGGING_FSCORE 
##                    1                    1                    1 
##    FORESTS_PRECISION       FORESTS_RECALL       FORESTS_FSCORE 
##                    1                    1                    1 
##     GLMNET_PRECISION        GLMNET_RECALL        GLMNET_FSCORE 
##                    1                    1                    1 
##       TREE_PRECISION          TREE_RECALL          TREE_FSCORE 
##                    1                    1                    1

Conclusion

Results look to good to be true, and this is because I was not able to resample. I struggled with the meta data manipulation in the corpus and was unable to retrieve labels.