# loading libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.4
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
## Registered S3 method overwritten by 'tree':
## method from
## print.tree cli
# I was unable to copy the folders to github! Shame on me!
setwd("/Users/blam/Downloads/Project4/")
spam <- Corpus(DirSource("/Users/blam/Downloads/Project4/spam", encoding = "UTF-8"))
easy_ham <- Corpus(DirSource("/Users/blam/Downloads/Project4/easy_ham", encoding = "UTF-8"))
#hard_ham <- Corpus(DirSource(""), readerControl = list(language="lat"))
# adding email type
meta(spam, tag="class_type") <- "2"
meta(easy_ham, tag="class_type") <- "1"
#creating labels to be used in the training algorithm
labels<-unlist(c(meta(spam[]),meta(easy_ham[]))) # works
#labels_m<-as.data.frame(labels)
#labels_df<-data.frame(matrix(NA, nrow = 3051, ncol = 2),stringsAsFactors = FALSE)
#labels_df$X1<-labels_m[,1]
#labels_df$X2<-1:3051
#colnames(labels_df)<-c("type","index")
#labels<-as.numeric(factor(factor(labels))) # works
#Combine corpus
corps <- c(spam,easy_ham,recursive=T)
corps <- iconv(corps, "ASCII", "UTF-8", sub="byte")
comb<- VCorpus(VectorSource(corps))
#Inspection
inspect(comb[3053])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## $content.2052.26af2a75e1e49e4ce02a01912ca86601
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 1151
meta(comb[[1000]])
## author : character(0)
## datetimestamp: 2020-04-27 02:49:48
## description : character(0)
## heading : character(0)
## id : 1000
## language : en
## origin : character(0)
e_corpus <- tm_map(comb, removePunctuation)
e_corpus <- tm_map(comb, removeNumbers)
e_corpus <- tm_map(comb, removeWords, stopwords("english"))
e_corpus <- tm_map(comb, stripWhitespace)
# Also applying Porter's word stemmer - http://www.cs.odu.edu/~jbollen/IR04/readings/readings5.pdf
e_corpus <- tm_map(comb, stemDocument)
#Inspection
inspect(e_corpus[1000])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## $dmeta.class_type499
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 1
meta(e_corpus[[1000]])
## author : character(0)
## datetimestamp: 2020-04-27 02:49:48
## description : character(0)
## heading : character(0)
## id : 1000
## language : en
## origin : character(0)
# resampling - was not able to resample due to issues with labeling
#corps_samp <- sample(e_corpus, 550)
#inspect(corps_samp[550])
#meta(corps_samp[[550]])
#Build document-term matrix
e_dtm <- DocumentTermMatrix(e_corpus)
e_dtm
## <<DocumentTermMatrix (documents: 6104, terms: 123591)>>
## Non-/sparse entries: 686252/753713212
## Sparsity : 100%
## Maximal term length: 521
## Weighting : term frequency (tf)
#suggestions to reduce dimension is to remove less frequent terms such that the sparsity is less than 95%
e_dtm <- removeSparseTerms(e_dtm, 0.99)
e_dtm
## <<DocumentTermMatrix (documents: 6104, terms: 1290)>>
## Non-/sparse entries: 388396/7485764
## Sparsity : 95%
## Maximal term length: 76
## Weighting : term frequency (tf)
Reference: https://journal.r-project.org/archive/2013-1/collingwood-jurka-boydstun-etal.pdf
set.seed(95616)
train_size<-round(0.9*length(labels))
sample_size<-length(labels)
emails_cont<-create_container(e_dtm, labels=labels,trainSize = 1:train_size, testSize = (train_size+1):sample_size, virgin = FALSE)
# TRAIN THE ALGORITHMS USING THE CONTAINER
SVM <- train_model(emails_cont,"SVM")
GLMNET <- train_model(emails_cont,"GLMNET")
SLDA <- train_model(emails_cont,"SLDA")
BOOSTING <- train_model(emails_cont,"BOOSTING")
BAGGING <- train_model(emails_cont,"BAGGING")
RF <- train_model(emails_cont,"RF")
#NNET <- train_model(emails_cont,"NNET")
TREE <- train_model(emails_cont,"TREE")
# CLASSIFY THE TESTING DATA USING THE TRAINED MODELS.
SVM_CLASSIFY <- classify_model(emails_cont, SVM)
GLMNET_CLASSIFY <- classify_model(emails_cont, GLMNET)
SLDA_CLASSIFY <- classify_model(emails_cont, SLDA)
BOOSTING_CLASSIFY <- classify_model(emails_cont, BOOSTING)
BAGGING_CLASSIFY <- classify_model(emails_cont, BAGGING)
RF_CLASSIFY <- classify_model(emails_cont, RF)
#NNET_CLASSIFY <- classify_model(emails_cont, NNET)
TREE_CLASSIFY <- classify_model(emails_cont, TREE)
analytics <- create_analytics(emails_cont,cbind(SVM_CLASSIFY, SLDA_CLASSIFY,BOOSTING_CLASSIFY, BAGGING_CLASSIFY,RF_CLASSIFY, GLMNET_CLASSIFY, TREE_CLASSIFY))
summary(analytics)
## ENSEMBLE SUMMARY
##
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1.00 1
## n >= 2 1.00 1
## n >= 3 1.00 1
## n >= 4 1.00 1
## n >= 5 1.00 1
## n >= 6 0.99 1
## n >= 7 0.97 1
##
##
## ALGORITHM PERFORMANCE
##
## SVM_PRECISION SVM_RECALL SVM_FSCORE
## 1 1 1
## SLDA_PRECISION SLDA_RECALL SLDA_FSCORE
## 1 1 1
## LOGITBOOST_PRECISION LOGITBOOST_RECALL LOGITBOOST_FSCORE
## 1 1 1
## BAGGING_PRECISION BAGGING_RECALL BAGGING_FSCORE
## 1 1 1
## FORESTS_PRECISION FORESTS_RECALL FORESTS_FSCORE
## 1 1 1
## GLMNET_PRECISION GLMNET_RECALL GLMNET_FSCORE
## 1 1 1
## TREE_PRECISION TREE_RECALL TREE_FSCORE
## 1 1 1
Results look to good to be true, and this is because I was not able to resample. I struggled with the meta data manipulation in the corpus and was unable to retrieve labels.