install.packages(“tm”) install.packages(“SnowballC”) install.packages(“RTextTools”)
library(RCurl)
## Loading required package: bitops
library(XML)
library(stringr)
library(tm)
## Loading required package: NLP
library(SnowballC)
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
##
## Attaching package: 'RTextTools'
## The following objects are masked from 'package:SnowballC':
##
## getStemLanguages, wordStem
# Download the ham and spam files to the following paths
ham_path <- "C:/Users/jenny_000/Desktop/MSDA R/DATA-607/Project 4/easy_ham/"
spam_path <- "C:/Users/jenny_000/Desktop/MSDA R/DATA-607/Project 4/spam_2/"
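# The easy_ham and spam_2 folders match the SpamAssassin public corpus
# (https://spamassassin.apache.org/old/publiccorpus/). Sketch for fetching and
# unpacking an archive, not evaluated here; "archive_name" is a placeholder for
# one of the .tar.bz2 files listed on that page:
# archive_name <- "..."
# download.file(paste0("https://spamassassin.apache.org/old/publiccorpus/", archive_name),
#               destfile = archive_name, mode = "wb")
# untar(archive_name, exdir = "C:/Users/jenny_000/Desktop/MSDA R/DATA-607/Project 4/")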
# Load the ham and spam emails and create a corpus for each
ham_corpus <- Corpus(DirSource(ham_path), readerControl=list(reader=readPlain))
length(ham_corpus)
## [1] 2501
ham_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 2501
spam_corpus <- Corpus(DirSource(spam_path), readerControl = list(reader=readPlain))
length(spam_corpus)
## [1] 1397
spam_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1397
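# Sanity check (sketch, not evaluated here): look at one raw message before
# any cleaning is applied.
# writeLines(as.character(ham_corpus[[1]]))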
# Add meta information
meta(ham_corpus, "Type") <- "ham"
meta(spam_corpus, "Type") <- "spam"
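# Quick check of the indexed metadata (sketch, not evaluated here): meta() on
# the corpus returns a one-column data frame with one row per document.
# head(meta(ham_corpus, "Type"))
# table(meta(spam_corpus, "Type")[, 1])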
# Data Cleansing
ham_corpus <- tm_map(ham_corpus, removeNumbers)
ham_corpus <- tm_map(ham_corpus, str_replace_all, pattern = "[[:punct:]]", replacement = " ")
ham_corpus <- tm_map(ham_corpus, removeWords, words = stopwords("en"))
ham_corpus <- tm_map(ham_corpus, tolower)
ham_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 1
## Content: documents: 2501
spam_corpus <- tm_map(spam_corpus, removeNumbers)
spam_corpus <- tm_map(spam_corpus, str_replace_all, pattern = "[[:punct:]]", replacement = " ")
spam_corpus <- tm_map(spam_corpus, removeWords, words = stopwords("en"))
spam_corpus <- tm_map(spam_corpus, tolower)
spam_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 1
## Content: documents: 1397
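# A more idiomatic variant of the cleansing pipeline (sketch, not evaluated
# here; the results above use the pipeline exactly as written). Wrapping plain
# string functions in content_transformer() is the documented way to use them
# inside tm_map(), and lower-casing before stop word removal also catches
# capitalized stop words at the start of sentences.
# clean_corpus <- function(corpus) {
#   corpus <- tm_map(corpus, content_transformer(tolower))
#   corpus <- tm_map(corpus, removeNumbers)
#   corpus <- tm_map(corpus, content_transformer(function(x)
#     str_replace_all(x, "[[:punct:]]", " ")))
#   corpus <- tm_map(corpus, removeWords, stopwords("en"))
#   tm_map(corpus, stripWhitespace)
# }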
# Build the term-document matrices and remove sparse terms
ham_tdm <- TermDocumentMatrix(ham_corpus)
ham_tdm <- removeSparseTerms(ham_tdm, 1-(10/length(ham_corpus)))
spam_tdm <- TermDocumentMatrix(spam_corpus)
spam_tdm <- removeSparseTerms(spam_tdm, 1-(10/length(spam_corpus)))
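# Optional inspection of the trimmed matrices (sketch, not evaluated here):
# list the terms that occur at least 200 times in each class.
# findFreqTerms(ham_tdm, lowfreq = 200)
# findFreqTerms(spam_tdm, lowfreq = 200)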
# Combine the two term-document matrices by concatenation
total_tdm <- c(ham_tdm, spam_tdm)
total_tdm
## <<TermDocumentMatrix (terms: 5291, documents: 3898)>>
## Non-/sparse entries: 551645/20072673
## Sparsity : 97%
## Maximal term length: 33
## Weighting : term frequency (tf)
total_corpus <- c(ham_corpus, spam_corpus)
attributes(total_corpus) <- c(attributes(ham_corpus), attributes(spam_corpus))
length(total_corpus)
## [1] 2501
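# Note: length(total_corpus) reports 2501 rather than the expected 3898
# (2501 ham + 1397 spam); concatenating two SimpleCorpus objects and then
# overwriting attributes() leaves an object that effectively exposes only the
# ham documents. A sketch of one alternative (not evaluated here; assumes
# DirSource() accepts a vector of directories, as documented in current tm):
# total_corpus <- Corpus(DirSource(c(ham_path, spam_path)),
#                        readerControl = list(reader = readPlain))
# length(total_corpus)  # expected: 3898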
# Training
label_ham <- as.factor(unlist(meta(ham_corpus, "Type")[,1]))
class(label_ham)
## [1] "factor"
length(label_ham)
## [1] 2501
label_spam <- as.factor(unlist(meta(spam_corpus, "Type")[,1]))
class(label_spam)
## [1] "factor"
length(label_spam)
## [1] 1397
labels <- as.factor(c(label_ham, label_spam))
length(labels)
## [1] 3898
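# Note: in R versions before 4.1.0, c() on two factors returns their integer
# codes, so both classes collapse to the single level "1". A safer construction
# (sketch, not evaluated here) builds the labels from character vectors:
# labels <- factor(c(rep("ham", length(ham_corpus)), rep("spam", length(spam_corpus))))
# table(labels)  # expected: ham 2501, spam 1397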
N <- length(labels)
container <- create_container(
total_tdm,
labels = labels,
trainSize = 1:2000,
testSize = 2001:N,
virgin = FALSE
)
# Training the models failed with "Error in svm.default(x = container@training_matrix,
# y = container@training_codes, ...): x and y don't match", so the model-training
# and evaluation calls below are left commented out.
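# The mismatch most likely comes from passing a TermDocumentMatrix (terms as
# rows, 5291 x 3898) where create_container() expects documents as rows, so the
# row count disagrees with length(labels). A sketch of a possible fix (not
# evaluated here; assumes t() on a TermDocumentMatrix returns a
# DocumentTermMatrix and that create_container() accepts it -- otherwise
# convert with as.matrix() first). Shuffling is added because the documents are
# ordered ham first, spam second, so an unshuffled 1:2000 training slice would
# contain only ham.
# set.seed(123)
# shuffle <- sample(N)
# total_dtm <- t(total_tdm)[shuffle, ]
# container <- create_container(total_dtm, labels = labels[shuffle],
#                               trainSize = 1:2000, testSize = 2001:N,
#                               virgin = FALSE)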
#svm_model <- train_model(container, "SVM")
#tree_model <- train_model(container, "TREE")
#maxent_model <- train_model(container, "MAXENT")
#svm_out <- classify_model(container, svm_model)
#tree_out <- classify_model(container, tree_model)
#maxent_out <- classify_model(container, maxent_model)
#head(svm_out)
#head(tree_out)
#head(maxent_out)
#labels_out <- data.frame(
# correct_label = labels[2001:N],
# svm = as.character(svm_out[,1]),
# tree = as.character(tree_out[,1]),
# maxent = as.character(maxent_out[,1]),
# stringsAsFactors = FALSE)
# SVM performance
#table(labels_out[,1] == labels_out[,2])
#prop.table(table(labels_out[,1] == labels_out[,2]))
# Decision tree (TREE) performance
#table(labels_out[,1] == labels_out[,3])
#prop.table(table(labels_out[,1] == labels_out[,3]))
# Maximum entropy performance
#table(labels_out[,1] == labels_out[,4])
#prop.table(table(labels_out[,1] == labels_out[,4]))
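# RTextTools also bundles the comparison step (sketch, only runnable once the
# container issue above is resolved): create_analytics() summarises precision,
# recall and F-scores for all classifiers at once.
# analytics <- create_analytics(container, cbind(svm_out, tree_out, maxent_out))
# summary(analytics)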
# Reference: Automated Data Collection with R, Chapter 10