library(tm)
## Loading required package: NLP
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Load both e-mail corpora from disk.
Ham <- VCorpus(DirSource("/Users/zachdravis/Documents/GitHub/CUNY-DATA-607/Project 4/easy_ham"))
# The spam corpus is cut down to its first 500 documents; the author reported
# an unexplained error when working with the full set.
Spam <- VCorpus(DirSource("/Users/zachdravis/Documents/GitHub/CUNY-DATA-607/Project 4/spam_2"))[seq_len(500)]

# Tag every document with its class in a "type" metadata field.
meta(Ham, tag = "type") <- "Ham"
meta(Spam, tag = "type") <- "Spam"

# Merge into a single corpus. Order matters downstream: all Spam documents
# come first, followed by all Ham documents.
Combination <- c(Spam, Ham)
# Clean the text (textbook steps 1-3, then stemming) in one pipeline.
Combination <- Combination %>%
  tm_map(removeNumbers) %>%                         # step 1: strip digits
  tm_map(removePunctuation) %>%                     # step 2: strip punctuation
  tm_map(removeWords, words = stopwords("en")) %>%  # step 3: drop English stop words
  tm_map(stemDocument)                              # reduce terms to their stems

# Build the document-term matrix and drop very sparse terms. The sparsity
# threshold 1 - 10/N (N = number of documents) keeps only terms that are
# present in more than roughly 10 documents.
dtm <- removeSparseTerms(
  DocumentTermMatrix(Combination),
  1 - (10 / length(Combination))
)
dtm
## <<DocumentTermMatrix (documents: 3051, terms: 4699)>>
## Non-/sparse entries: 392272/13944377
## Sparsity : 97%
## Maximal term length: 73
## Weighting : term frequency (tf)
# Overall frequency of each retained term across the whole corpus.
dtmfreq <- as.matrix(dtm)      # dense documents x terms count matrix
FreqMat <- colSums(dtmfreq)    # named vector: total count per term
FreqDF <- data.frame(
  Term = names(FreqMat),
  Frequency = FreqMat
)
# Most frequent terms first, for inspection.
Frequencies <- arrange(FreqDF, desc(Frequency))
# Class balance of the combined corpus. This informs the train / test split
# below, which follows the commonly recommended 70% train / 30% test.
table(meta(Combination))
##
## Ham Spam
## 2551 500

# One label per document, in corpus order. Because the corpus was built as
# c(Spam, Ham), this is 500 "Spam" labels followed by 2551 "Ham" labels.
TypeLabel <- as.vector(unlist(meta(Combination)))
stopifnot(nrow(dtm) == length(TypeLabel))

# The corpus is ordered by class, so slicing rows 1:2136 / 2137:3051 directly
# puts every Spam message in the training set and leaves a Ham-only test set,
# which makes the measured accuracy meaningless for spam detection. Shuffle
# the documents (with a fixed seed for reproducibility) before splitting so
# that both partitions contain both classes.
set.seed(607)
ShuffleOrder <- sample(nrow(dtm))
ShuffledDtm <- dtm[ShuffleOrder, ]
ShuffledLabel <- TypeLabel[ShuffleOrder]

# 70% of the shuffled documents train the model; the rest are held out.
# Sizes are derived from the data rather than hard-coded (2136 / 915 for the
# 3051-document corpus).
DocCount <- length(ShuffledLabel)
TrainCount <- round(0.7 * DocCount)
stopifnot(TrainCount >= 1, TrainCount < DocCount)
TrainRows <- seq_len(TrainCount)
TestRows <- seq(TrainCount + 1, DocCount)

container <- create_container(ShuffledDtm,
                              labels = ShuffledLabel,
                              trainSize = TrainRows,
                              testSize = TestRows,
                              virgin = FALSE)
svm_model <- train_model(container, "SVM")

# Column 1 is the predicted label, column 2 the model's score, and the
# appended column 3 is the true label of each held-out document.
SvmTEst <- classify_model(container, svm_model)
SvmTEst <- cbind(SvmTEst, ShuffledLabel[TestRows])

# Overall hit / miss counts. Compare as character so the result does not
# depend on the two columns sharing identical factor levels.
table(as.character(SvmTEst[, 1]) == as.character(SvmTEst[, 3]))
# Per-class breakdown, so Spam and Ham performance can be read separately.
table(Predicted = as.character(SvmTEst[, 1]),
      Actual = as.character(SvmTEst[, 3]))
# NOTE: re-run to regenerate the output of the two tables above. The output
# recorded before the shuffle was added showed 915 of 915 correct, but that
# test set consisted solely of Ham documents.
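In the run recorded above, the model classifies all 915 test e-mails correctly (100%). Caveat: that run tested on rows 2137-3051, and because the corpus is ordered Spam-first those rows are all Ham, so the figure shows that Ham is recognised as Ham but does not measure how well Spam is detected.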