#install.packages("tm")
#install.packages("SnowballC")
#install.packages("RTextTools")
#install.packages("caret")
library(RCurl)
## Loading required package: bitops
library(XML)
library(stringr)
library(tm)
## Warning: package 'tm' was built under R version 3.3.2
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.3.2
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
##
## complete
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.3.2
library(RTextTools)
## Warning: package 'RTextTools' was built under R version 3.3.2
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
##
## Attaching package: 'RTextTools'
## The following objects are masked from 'package:SnowballC':
##
## getStemLanguages, wordStem
library(caret)
## Warning: package 'caret' was built under R version 3.3.2
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
# ---- Load the ham (legitimate e-mail) messages ----------------------------
# Directory holding one raw message per file. The trailing slash matters:
# file paths below are built by plain string concatenation.
ham_dir <- "C:/Users/blin261/Desktop/DATA607/spamham/easy_ham/"
ham_files <- list.files(path = ham_dir)
head(ham_files)
## [1] "00001.7c53336b37003a9286aba55d2945844c"
## [2] "00002.9c4069e25e1ef370c078db7ee85ff9ac"
## [3] "00003.860e3c3cee1b42ead714c5c874fe25f7"
## [4] "00004.864220c5b6930b209cc287c361c99af1"
## [5] "00005.bf27cdeaf0b8c4647ecd61b1d09da613"
## [6] "00006.253ea2f9a9cc36fa0b1129b04b806608"
# NOTE(review): list.files() returns every entry in the directory, so any
# non-message file stored there is ingested as a document too -- confirm.

# Read each file into ONE string per message.
# * vapply() over the file names replaces the grow-by-c() loop and copes
#   with an empty directory (1:length(x) would iterate over c(1, 0)).
# * Lines are joined with a space, not "": joining with "" glues the last
#   word of a line to the first word of the next and creates bogus tokens.
# * paste() always yields exactly one string, even for an empty file.
ham <- vapply(
  ham_files,
  function(file_name) {
    paste(readLines(str_c(ham_dir, file_name)), collapse = " ")
  },
  character(1),
  USE.NAMES = FALSE
)

# VCorpus is requested explicitly because the per-document metadata
# assignment below needs documents that can be modified in place.
ham_corpus <- VCorpus(VectorSource(ham))

# Tag every document with its class label (document-level "Type" tag).
for (i in seq_along(ham_corpus)) {
  meta(ham_corpus[[i]], "Type") <- "Ham"
}
ham_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 2501
# ---- Load the spam messages -----------------------------------------------
# Directory holding one raw message per file. The trailing slash matters:
# file paths below are built by plain string concatenation.
spam_dir <- "C:/Users/blin261/Desktop/DATA607/spamham/spam_2/"
spam_files <- list.files(path = spam_dir)
head(spam_files)
## [1] "00001.317e78fa8ee2f54cd4890fdc09ba8176"
## [2] "00002.9438920e9a55591b18e60d1ed37d992b"
## [3] "00003.590eff932f8704d8b0fcbe69d023b54d"
## [4] "00004.bdcc075fa4beb5157b5dd6cd41d8887b"
## [5] "00005.ed0aba4d386c5e62bc737cf3f0ed9589"
## [6] "00006.3ca1f399ccda5d897fecb8c57669a283"
# NOTE(review): list.files() returns every entry in the directory, so any
# non-message file stored there is ingested as a document too -- confirm.

# Read each file into ONE string per message.
# * vapply() over the file names replaces the grow-by-c() loop and copes
#   with an empty directory (1:length(x) would iterate over c(1, 0)).
# * Lines are joined with a space, not "": joining with "" glues the last
#   word of a line to the first word of the next and creates bogus tokens.
# * paste() always yields exactly one string, even for an empty file.
spam <- vapply(
  spam_files,
  function(file_name) {
    paste(readLines(str_c(spam_dir, file_name)), collapse = " ")
  },
  character(1),
  USE.NAMES = FALSE
)

# VCorpus is requested explicitly because the per-document metadata
# assignment below needs documents that can be modified in place.
spam_corpus <- VCorpus(VectorSource(spam))

# Tag every document with its class label (document-level "Type" tag).
for (i in seq_along(spam_corpus)) {
  meta(spam_corpus[[i]], "Type") <- "Spam"
}
spam_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1397
# ---- Combine both classes and normalise the text --------------------------
# Ham documents come first, then spam; anything that derives labels by
# position relies on this order.
total_corpus <- c(ham_corpus, spam_corpus)

# Functions that are not tm transformations (tolower, str_replace_all) are
# wrapped in content_transformer() so each element stays a text document and
# keeps its metadata. Passing them to tm_map() bare replaces the documents
# with plain character vectors, which is what previously forced a final
# PlainTextDocument() pass (and wiped the "Type" tags).

# 1. Lower-case first: stopwords("en") is all lower case and removeWords()
#    matches case-sensitively, so "The"/"And" would otherwise survive.
total_corpus <- tm_map(total_corpus, content_transformer(tolower))

# 2. Drop digits.
total_corpus <- tm_map(total_corpus, removeNumbers)

# 3. Remove stop words while apostrophes are still present, so contractions
#    in the stop-word list (e.g. "don't") can match.
total_corpus <- tm_map(total_corpus, removeWords, words = stopwords("en"))

# 4. Replace punctuation with a space (not "") so that tokens separated only
#    by punctuation are split rather than fused.
total_corpus <- tm_map(
  total_corpus,
  content_transformer(function(txt) {
    str_replace_all(txt, pattern = "[[:punct:]]", replacement = " ")
  })
)

# 5. Reduce words to their stems.
total_corpus <- tm_map(total_corpus, stemDocument)

# Term-by-document counts for the cleaned corpus.
tdm <- TermDocumentMatrix(total_corpus)
tdm
# (The term/sparsity figures recorded from an earlier run were produced by
# the previous pipeline and no longer apply; re-run to refresh them.)
# ---- Class labels ---------------------------------------------------------
# Spot-check the document-level "Type" tag on each source corpus.
# type = "local" asks for the tag stored on every individual document.
head(meta(spam_corpus, "Type", type = "local"))
## $`1`
## [1] "Spam"
##
## $`2`
## [1] "Spam"
##
## $`3`
## [1] "Spam"
##
## $`4`
## [1] "Spam"
##
## $`5`
## [1] "Spam"
##
## $`6`
## [1] "Spam"
head(meta(ham_corpus, "Type", type = "local"))
## $`1`
## [1] "Ham"
##
## $`2`
## [1] "Ham"
##
## $`3`
## [1] "Ham"
##
## $`4`
## [1] "Ham"
##
## $`5`
## [1] "Ham"
##
## $`6`
## [1] "Ham"

# Label vector for the combined corpus, one entry per document.
# The previous `total_corpus$meta$type` looked at corpus-level metadata under
# a differently-cased tag and therefore evaluated to NULL. The labels are
# taken from the two source corpora instead, in the same ham-then-spam order
# used to build total_corpus, so they do not depend on whether the cleaning
# steps preserved per-document metadata.
meta_data <- c(
  unlist(meta(ham_corpus, "Type", type = "local"), use.names = FALSE),
  unlist(meta(spam_corpus, "Type", type = "local"), use.names = FALSE)
)
# Fail loudly if a document is missing its tag (unlist() drops NULLs) or the
# corpora are out of sync.
stopifnot(length(meta_data) == length(total_corpus))
head(meta_data)
## [1] "Ham" "Ham" "Ham" "Ham" "Ham" "Ham"
# ---- Document-term matrix with rare terms pruned --------------------------
# Build the document-by-term count matrix and, in the same step, discard
# terms whose sparsity exceeds 1 - 10 / (number of documents) -- roughly,
# terms that occur in no more than about ten documents.
dtm <- removeSparseTerms(
  DocumentTermMatrix(total_corpus),
  sparse = 1 - (10 / length(total_corpus))
)
print(dtm)
## <<DocumentTermMatrix (documents: 3898, terms: 7557)>>
## Non-/sparse entries: 616550/28840636
## Sparsity : 98%
## Maximal term length: 73
## Weighting : term frequency (tf)