rm(list = ls())
library(readr)
library(stringr)
library(tm)
## Warning: package 'tm' was built under R version 3.4.4
## Loading required package: NLP
library(listviewer)
## Warning: package 'listviewer' was built under R version 3.4.4
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.4.4
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.4
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
library(ggplot2)
setwd("C:/Users/weberr1/Desktop/CUNY/DATA 607/Assignment/Project 4/")
textList <- NULL
# Parse "ham" files -------------------------------------------------------
filenames <- list.files(str_c(getwd(), "/", "easy_ham"), full.names=TRUE)
for (i in 1:25) # first 25 files only; use 1:length(filenames) for the full set
{
print(str_c("processing text file:", i))
# Read in next item
text <- read_file(filenames[i])
# Format
text <- str_replace_all(text, "<.*?>", " ") # html tages
text <- str_replace_all(text, "[:space:]", " ") # spaces
text <- str_replace_all(text, "\\b[:alnum:]*@[:alnum:]+(\\.[:alnum:])*", "") # email addresses
text <- str_replace_all(text, "[:alpha:]*[:digit:][:alpha:]*", "") # words with numbers
text <- str_replace_all(text, ".*Date ", "") # Everything prior to the first "Date: " text
text <- str_replace_all(text, "[:punct:]", " ") # punctuation
textList <- c(textList, text)
}
## [1] "processing text file:1"
## [1] "processing text file:2"
## [1] "processing text file:3"
## [1] "processing text file:4"
## [1] "processing text file:5"
## [1] "processing text file:6"
## [1] "processing text file:7"
## [1] "processing text file:8"
## [1] "processing text file:9"
## [1] "processing text file:10"
## [1] "processing text file:11"
## [1] "processing text file:12"
## [1] "processing text file:13"
## [1] "processing text file:14"
## [1] "processing text file:15"
## [1] "processing text file:16"
## [1] "processing text file:17"
## [1] "processing text file:18"
## [1] "processing text file:19"
## [1] "processing text file:20"
## [1] "processing text file:21"
## [1] "processing text file:22"
## [1] "processing text file:23"
## [1] "processing text file:24"
## [1] "processing text file:25"
corpus1 <- VCorpus(VectorSource(textList))
meta(corpus1, "category") <- "Not spam"
### Add Parsed spam files ---------------------------------------------------
### Parse "spam" files
filenames <- list.files(str_c(getwd(), "/", "spam"), full.names=TRUE)
textList <- NULL
for (i in 1:25) # first 25 files only; use 1:length(filenames) for the full set
{
print(str_c("processing text file:", i))
# Read in next item
text <- read_file(filenames[i])
# Format
text <- str_replace_all(text, "<.*?>", " ") # html tages
text <- str_replace_all(text, "[:space:]", " ") # spaces
text <- str_replace_all(text, "\\b[:alnum:]*@[:alnum:]+(\\.[:alnum:])*", "") # email addresses
text <- str_replace_all(text, "[:alpha:]*[:digit:][:alpha:]*", "") # words with numbers
text <- str_replace_all(text, ".*Date ", "") # Everything prior to the first "Date: " text
text <- str_replace_all(text, "[:punct:]", " ") # punctuation
textList <- c(textList, text)
}
## [1] "processing text file:1"
## [1] "processing text file:2"
## [1] "processing text file:3"
## [1] "processing text file:4"
## [1] "processing text file:5"
## [1] "processing text file:6"
## [1] "processing text file:7"
## [1] "processing text file:8"
## [1] "processing text file:9"
## [1] "processing text file:10"
## [1] "processing text file:11"
## [1] "processing text file:12"
## [1] "processing text file:13"
## [1] "processing text file:14"
## [1] "processing text file:15"
## [1] "processing text file:16"
## [1] "processing text file:17"
## [1] "processing text file:18"
## [1] "processing text file:19"
## [1] "processing text file:20"
## [1] "processing text file:21"
## [1] "processing text file:22"
## [1] "processing text file:23"
## [1] "processing text file:24"
## [1] "processing text file:25"
# Fix encoding issue: mark the raw spam text as latin1 so stray non-ASCII bytes don't trip up tm
Encoding(textList) <- "latin1"
corpus2 <- VCorpus(VectorSource(textList))
meta(corpus2, "category") <- "Spam"
corpus <- c(corpus1, corpus2)
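# (Not run) A quick sanity check of the combined corpus: it should hold all 50
# documents, with 25 of each category in the indexed metadata.
# length(corpus)                            # expect 50
# table(unlist(meta(corpus, "category")))   # expect 25 "Not spam", 25 "Spam"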
# Create a document-term matrix, then drop very sparse terms.
dtm <- DocumentTermMatrix(corpus,
                          control = list(removePunctuation = TRUE, stopwords = TRUE,
                                         stemming = TRUE, removeNumbers = TRUE))
dtm <- removeSparseTerms(dtm, sparse = .96)
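# (Not run) To sanity-check the pruning: dim() gives documents x terms, and
# tm's findFreqTerms() lists the terms above a frequency floor.
# dim(dtm)
# findFreqTerms(dtm, lowfreq = 20)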
# Note: prescindMeta() was removed from tm (version 0.6+); meta() now returns
# the indexed document metadata directly, so it is used below instead.
library(RTextTools)
## Warning: package 'RTextTools' was built under R version 3.4.4
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
cat_labels <- unlist(meta(corpus, "category"))   # one label per document
# Documents are ordered ham (1-25) then spam (26-50), so shuffle before
# splitting; the seed is arbitrary, chosen only for reproducibility.
set.seed(607)
shuffle <- sample(length(corpus))
container <- create_container(dtm[shuffle, ],
                              labels = as.factor(cat_labels[shuffle]),
                              trainSize = 1:40,
                              testSize = 41:50,
                              virgin = FALSE)
# Next steps would be:
# Train a decision-tree model
# tree_model <- train_model(container, "TREE")
# Test against the held-out data
# tree_out <- classify_model(container, tree_model)
# Here, TREE_PROB in tree_out shows how confident each prediction was
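# (Not run) A sketch of the evaluation, assuming the container and model
# above: RTextTools' create_analytics() summarizes precision/recall, and
# caret's confusionMatrix() cross-tabulates predictions vs. the true labels
# of the test slice.
# analytics <- create_analytics(container, tree_out)
# summary(analytics)
# confusionMatrix(factor(tree_out$TREE_LABEL),
#                 factor(cat_labels[shuffle][41:50]))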