rm(list = ls())
library(readr)
library(stringr)
library(tm)
## Warning: package 'tm' was built under R version 3.4.4
## Loading required package: NLP
library(listviewer)
## Warning: package 'listviewer' was built under R version 3.4.4
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.4.4
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.4
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
library(ggplot2)
setwd("C:/Users/weberr1/Desktop/CUNY/DATA 607/Assignment/Project 4/")
textList <- NULL
# Parse "ham" files -------------------------------------------------------
filenames <- list.files(str_c(getwd(), "/", "easy_ham"), full.names=TRUE)
for (i in 1:25) # first 25 files only; use 1:length(filenames) for the full set
{
print(str_c("processing text file:", i))
# Read in next item
text <- read_file(filenames[i])
# Format
text <- str_replace_all(text, "<.*?>", " ") # html tages
text <- str_replace_all(text, "[:space:]", " ") # spaces
text <- str_replace_all(text, "\\b[:alnum:]*@[:alnum:]+(\\.[:alnum:])*", "") # email addresses
text <- str_replace_all(text, "[:alpha:]*[:digit:][:alpha:]*", "") # words with numbers
text <- str_replace_all(text, ".*Date ", "") # Everything prior to the first "Date: " text
text <- str_replace_all(text, "[:punct:]", " ") # punctuation
textList <- c(textList, text)
}
## [1] "processing text file:1"
## [1] "processing text file:2"
## [1] "processing text file:3"
## [1] "processing text file:4"
## [1] "processing text file:5"
## [1] "processing text file:6"
## [1] "processing text file:7"
## [1] "processing text file:8"
## [1] "processing text file:9"
## [1] "processing text file:10"
## [1] "processing text file:11"
## [1] "processing text file:12"
## [1] "processing text file:13"
## [1] "processing text file:14"
## [1] "processing text file:15"
## [1] "processing text file:16"
## [1] "processing text file:17"
## [1] "processing text file:18"
## [1] "processing text file:19"
## [1] "processing text file:20"
## [1] "processing text file:21"
## [1] "processing text file:22"
## [1] "processing text file:23"
## [1] "processing text file:24"
## [1] "processing text file:25"
corpus1 <- VCorpus(VectorSource(textList))
meta(corpus1, "category") <- "Not spam"
### Add Parsed spam files ---------------------------------------------------
### Parse "spam" files
filenames <- list.files(str_c(getwd(), "/", "spam"), full.names=TRUE)
textList <- NULL
for (i in 1:25) # first 25 files only; use 1:length(filenames) for the full set
{
print(str_c("processing text file:", i))
# Read in next item
text <- read_file(filenames[i])
# Format
text <- str_replace_all(text, "<.*?>", " ") # html tages
text <- str_replace_all(text, "[:space:]", " ") # spaces
text <- str_replace_all(text, "\\b[:alnum:]*@[:alnum:]+(\\.[:alnum:])*", "") # email addresses
text <- str_replace_all(text, "[:alpha:]*[:digit:][:alpha:]*", "") # words with numbers
text <- str_replace_all(text, ".*Date ", "") # Everything prior to the first "Date: " text
text <- str_replace_all(text, "[:punct:]", " ") # punctuation
textList <- c(textList, text)
}
## [1] "processing text file:1"
## [1] "processing text file:2"
## [1] "processing text file:3"
## [1] "processing text file:4"
## [1] "processing text file:5"
## [1] "processing text file:6"
## [1] "processing text file:7"
## [1] "processing text file:8"
## [1] "processing text file:9"
## [1] "processing text file:10"
## [1] "processing text file:11"
## [1] "processing text file:12"
## [1] "processing text file:13"
## [1] "processing text file:14"
## [1] "processing text file:15"
## [1] "processing text file:16"
## [1] "processing text file:17"
## [1] "processing text file:18"
## [1] "processing text file:19"
## [1] "processing text file:20"
## [1] "processing text file:21"
## [1] "processing text file:22"
## [1] "processing text file:23"
## [1] "processing text file:24"
## [1] "processing text file:25"
# Fix encoding issue: mark the raw spam text as latin1 so stray non-ASCII bytes don't trip up tm
Encoding(textList) <- "latin1"
corpus2 <- VCorpus(VectorSource(textList))
meta(corpus2, "category") <- "Spam"
corpus <- c(corpus1, corpus2)
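# (Not run) A quick sanity check of the combined corpus: it should hold all 50
# documents, with 25 of each category in the indexed metadata.
# length(corpus)                            # expect 50
# table(unlist(meta(corpus, "category")))   # expect 25 "Not spam", 25 "Spam"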
# Create a document-term matrix, then drop very sparse terms.
dtm <- DocumentTermMatrix(corpus,
                          control = list(removePunctuation = TRUE, stopwords = TRUE,
                                         stemming = TRUE, removeNumbers = TRUE))
dtm <- removeSparseTerms(dtm, sparse = .96)
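# (Not run) To sanity-check the pruning: dim() gives documents x terms, and
# tm's findFreqTerms() lists the terms above a frequency floor.
# dim(dtm)
# findFreqTerms(dtm, lowfreq = 20)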
# Note: prescindMeta() was removed from tm (version 0.6+); meta() now returns
# the indexed document metadata directly, so it is used below instead.
library(RTextTools)
## Warning: package 'RTextTools' was built under R version 3.4.4
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
cat_labels <- unlist(meta(corpus, "category"))   # one label per document
# Documents are ordered ham (1-25) then spam (26-50), so shuffle before
# splitting; the seed is arbitrary, chosen only for reproducibility.
set.seed(607)
shuffle <- sample(length(corpus))
container <- create_container(dtm[shuffle, ],
                              labels = as.factor(cat_labels[shuffle]),
                              trainSize = 1:40,
                              testSize = 41:50,
                              virgin = FALSE)
# Next steps would be:
# Train a decision-tree model
# tree_model <- train_model(container, "TREE")
# Test against the held-out data
# tree_out <- classify_model(container, tree_model)
# Here, TREE_PROB in tree_out shows how confident each prediction was
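# (Not run) A sketch of the evaluation, assuming the container and model
# above: RTextTools' create_analytics() summarizes precision/recall, and
# caret's confusionMatrix() cross-tabulates predictions vs. the true labels
# of the test slice.
# analytics <- create_analytics(container, tree_out)
# summary(analytics)
# confusionMatrix(factor(tree_out$TREE_LABEL),
#                 factor(cat_labels[shuffle][41:50]))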