DATA 607 Project 4

library(tm)

## Loading required package: NLP

library(RTextTools)

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0

## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()

# Load the raw e-mail corpora ----
# Each directory holds one message per file; DirSource() feeds them to
# VCorpus(), which builds an in-memory corpus.
spam_dir <- "/Users/zachdravis/Documents/GitHub/CUNY-DATA-607/Project 4/spam_2"
ham_dir <- "/Users/zachdravis/Documents/GitHub/CUNY-DATA-607/Project 4/easy_ham"

# Only the first 500 spam messages are kept: the corpus was truncated to work
# around an error whose cause was not understood.
Spam <- VCorpus(DirSource(spam_dir))[1:500]
Ham <- VCorpus(DirSource(ham_dir))

# Tag every document with its class so the label travels with the corpus.
meta(Spam, tag = "type") <- "Spam"
meta(Ham, tag = "type") <- "Ham"

# Merge into one corpus. Order matters downstream: all Spam documents come
# first, followed by all Ham documents.
Combination <- c(Spam, Ham)

# Clean the text and build the document-term matrix ----
# The cleaning steps follow the textbook order: strip digits, strip
# punctuation, drop English stop words, then stem what is left.
Combination <- Combination %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, words = stopwords("en")) %>%
  tm_map(stemDocument)

# One row per document, one column per term.
dtm <- DocumentTermMatrix(Combination)

# Drop very rare terms. The sparsity threshold is expressed as a share of the
# corpus, so terms present in fewer than about 10 documents are removed.
min_doc_share <- 10 / length(Combination)
dtm <- removeSparseTerms(dtm, 1 - min_doc_share)
dtm

## <<DocumentTermMatrix (documents: 3051, terms: 4699)>>
## Non-/sparse entries: 392272/13944377
## Sparsity           : 97%
## Maximal term length: 73
## Weighting          : term frequency (tf)

# Term frequencies across the whole corpus ----
dtmfreq <- as.matrix(dtm)    # dense document-by-term count matrix
FreqMat <- colSums(dtmfreq)  # named vector: total count of each term

# Reuse the totals computed above rather than summing the columns again.
FreqDF <- data.frame(Term = names(FreqMat), Frequency = FreqMat)

# Most frequent terms first.
Frequencies <- arrange(FreqDF, desc(Frequency))

# Class balance of the combined corpus.
table(meta(Combination))

## 
##  Ham Spam 
## 2551  500

#2551 Ham, 500 Spam -- use to determine the test / training split, which I read was recommended to be 70% train, 30% test.

# Train and evaluate an SVM spam/ham classifier ----
#
# Combination was built as c(Spam, Ham), so its documents (and the rows of
# dtm) are ordered Spam first, then Ham. Splitting that ordering by position
# puts every Spam message in the training range and leaves a test range that
# contains only Ham, which makes the accuracy figure meaningless for spam
# detection. The rows are therefore shuffled (with a fixed seed, so the run is
# reproducible) before the 70% / 30% train / test split is taken.

# One class label per document, in corpus order. Left unmodified so that any
# later code relying on corpus-ordered labels keeps working.
TypeLabel <- as.vector(unlist(meta(Combination)))

n_docs <- length(TypeLabel)
stopifnot(nrow(dtm) == n_docs)  # labels and matrix rows must line up

set.seed(607)
shuffled_rows <- sample(n_docs)
shuffled_dtm <- dtm[shuffled_rows, ]
shuffled_labels <- TypeLabel[shuffled_rows]

# 70% of the documents for training, the remainder for testing.
n_train <- round(0.7 * n_docs)
train_rows <- seq_len(n_train)
test_rows <- (n_train + 1):n_docs

container <- create_container(shuffled_dtm,
                              labels = shuffled_labels,
                              trainSize = train_rows,
                              testSize = test_rows,
                              virgin = FALSE)

svm_model <- train_model(container, "SVM")
SvmTEst <- classify_model(container, svm_model)

# Append the true label of each test document as a third column.
SvmTEst <- cbind(SvmTEst, ACTUAL_LABEL = shuffled_labels[test_rows])

# Count correct vs. incorrect predictions. Both sides are compared as
# character so that differing factor level sets cannot raise an error.
table(as.character(SvmTEst[, 1]) == as.character(SvmTEst[, 3]))

# Confusion matrix: shows how Spam and Ham are each classified.
table(Predicted = SvmTEst[, 1], Actual = SvmTEst[, 3])

## 
## TRUE 
##  915

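In the run shown above, the model classifies all 915 test e-mails correctly (100%). Caveat: in that run the combined corpus was ordered Spam first (documents 1-500) and Ham after, so the test documents 2137-3051 were all Ham. The 100% figure therefore shows only that Ham is recognized as Ham; it does not measure how well spam is detected.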

DATA 607 Project 4

Zach Dravis

5/12/2018