It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/
This R Markdown file and all related input data can be found on my GitHub.
library(tm)
library(stringr)
library(SnowballC)
library(ggplot2)
library(knitr)
library(tidyr)
library(tidytext)
library(wordcloud)
library(caret)
library(gbm)
library(e1071)
# Download and unpack the SpamAssassin "easy ham" archive. The download is
# skipped when the archive is already on disk so that re-running the script
# does not fetch it again.
ham_archive <- "20021010_easy_ham.tar.bz2"
if (!file.exists(ham_archive)) {
  download.file(
    url = "http://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2",
    destfile = ham_archive,
    mode = "wb" # the archive is binary
  )
}
untar(ham_archive, exdir = "emails", compressed = "bzip2")
# file.path() builds the directory with a portable separator; the previous
# hard-coded "emails\\easy_ham\\" only resolved on Windows.
ham.dir <- file.path("emails", "easy_ham")
ham_files <- list.files(path = ham.dir, full.names = TRUE)

# Same steps for the "spam_2" archive.
spam_archive <- "20050311_spam_2.tar.bz2"
if (!file.exists(spam_archive)) {
  download.file(
    url = "http://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2",
    destfile = spam_archive,
    mode = "wb"
  )
}
untar(spam_archive, exdir = "emails", compressed = "bzip2")
spam.dir <- file.path("emails", "spam_2")
spam_files <- list.files(path = spam.dir, full.names = TRUE)

# The file vectors above already list each directory, so count them directly
# instead of listing the directories a second time.
number_spam <- length(spam_files)
print(paste("There is a total of", number_spam, "spam emails"))
## [1] "There is a total of 1397 spam emails"
number_ham <- length(ham_files)
print(paste("There is a total of", number_ham, "ham emails"))
## [1] "There is a total of 2551 ham emails"
I applied the following steps to clean the spam_2 and easy_ham files:
1-Create a vector of file paths
2- Read the text in each file
3- Turn into VectorSource
4- Remove numbers
5- Remove punctuation symbols
6- Remove stopwords
7- Remove white spaces
# Remove the "cmds" file from both file lists. The vectors hold full paths
# (they were built with full.names = TRUE), so the comparison has to be made on
# the file name alone: comparing a whole path to "cmds" never matches and
# silently keeps the file.
spam_files <- spam_files[basename(spam_files) != "cmds"]
ham_files <- ham_files[basename(ham_files) != "cmds"]
# Read every file in a directory and wrap the texts in a tm VCorpus.
#
# file_path: directory that holds one e-mail per file.
# exclude:   file names to skip. Defaults to "cmds", the file the script
#            intends to drop from each folder; pass character(0) to read
#            every file.
# Returns a VCorpus with one document per file that was read; each document's
# content is the character vector of lines produced by readLines().
toVCorpus <- function(file_path, exclude = "cmds") {
  # Let list.files() assemble the full paths instead of pasting the directory
  # and file names together by hand.
  email_paths <- list.files(file_path, full.names = TRUE)
  # Filter on the bare file name, because the paths include the directory.
  email_paths <- email_paths[!basename(email_paths) %in% exclude]
  email_paths %>%
    lapply(readLines) %>%
    VectorSource() %>%
    VCorpus()
}
# Apply the text-cleaning pipeline to a tm corpus.
#
# corpus: a tm corpus of e-mail documents.
# Returns the corpus after, in order: removing numbers, removing punctuation,
# lower-casing, removing English stop words, collapsing whitespace, stemming.
docClean <- function(corpus) {
  corpus %>%
    tm_map(removeNumbers) %>%
    tm_map(removePunctuation) %>%
    # content_transformer() makes tolower() operate on the document content
    # and return a document again. The earlier bare tolower followed by
    # PlainTextDocument rebuilt every document from scratch, which reset its
    # metadata to the defaults.
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, stopwords("en")) %>%
    tm_map(stripWhitespace) %>%
    tm_map(stemDocument)
}
# Set the same metadata tag on every document of a corpus.
#
# corpus: a tm corpus (or list of documents).
# tag:    name of the metadata field to set.
# value:  value stored under that field for each document.
# Returns the corpus with the tag set on every document; an empty corpus is
# returned unchanged.
addTag <- function(corpus, tag, value) {
  # seq_along() yields no iterations for an empty corpus, whereas
  # 1:length(corpus) would run with i = 1 and i = 0 and fail.
  for (i in seq_along(corpus)) {
    meta(corpus[[i]], tag) <- value
  }
  corpus
}
# Ham corpus: read the files, clean the text, then tag each document "ham"
hamCorp <- addTag(
  docClean(toVCorpus(ham.dir)),
  tag = "emails",
  value = "ham"
)
# Spam corpus: same steps, tagged "spam"
spamCorp <- addTag(
  docClean(toVCorpus(spam.dir)),
  tag = "emails",
  value = "spam"
)
Now I combine the two corpora into a single corpus and begin building a classifier that can accurately tell the difference between ham and spam.
First, we combine the corpora into a single one:
# Build a data frame with exactly one row per e-mail.
#
# corpus: a tm corpus whose documents hold the lines of one e-mail each.
# type:   class label stored in the `type` column ("ham" or "spam").
# Returns a data frame with columns `text` (the e-mail's lines joined by a
# single space) and `type`.
#
# unlist() on a corpus flattens every line of every message, together with the
# metadata fields, into separate elements. The previous data frames therefore
# had one row per line, and taking rows 1:1000 sampled 1000 lines rather than
# 1000 e-mails. Results computed from the old data frames need to be
# regenerated.
corpus_to_df <- function(corpus, type) {
  text <- vapply(
    seq_along(corpus),
    function(i) paste(content(corpus[[i]]), collapse = " "),
    character(1)
  )
  data.frame(
    text = text,
    type = rep(type, length(text)),
    stringsAsFactors = FALSE
  )
}

# Ham
hamDF <- corpus_to_df(hamCorp, "ham")
# Spam
spamDF <- corpus_to_df(spamCorp, "spam")
# Balanced sample: the first 1000 e-mails of each class (head() returns fewer
# rows instead of NA-filled rows if a class has less than 1000 documents).
spam_ham_df <- rbind(head(hamDF, 1000), head(spamDF, 1000))
# Combine both corpora
clean_corpus <- c(spamCorp, hamCorp)
Splitting the data: 70 percent training and 30 percent test
# Word cloud of the combined corpus: at most 70 words, only terms that occur
# at least 1000 times, plotted in decreasing order of frequency.
wordcloud(
  clean_corpus,
  min.freq = 1000,
  max.words = 70,
  random.order = FALSE
)
Here I split the data so that 70% of the e-mails are used for training and the remaining 30% for testing.
# Labels and document-term matrix for the full combined corpus
corpus_labels <- unlist(meta(clean_corpus, "emails"))
corpus_dtm <- DocumentTermMatrix(clean_corpus)

# Fix the RNG seed so the train/test partition is reproducible
set.seed(123)
# Replace empty texts with a placeholder token
spam_ham_df$text[spam_ham_df$text == ""] <- "NaN"
# Stratified 70/30 split on the class label
train_index <- createDataPartition(spam_ham_df$type, p = 0.70, list = FALSE)
email_train <- spam_ham_df[train_index, ]
email_test <- spam_ham_df[-train_index, ]

# Create corpus for training and test data
train_email_corpus <- Corpus(VectorSource(email_train$text))
test_email_corpus <- Corpus(VectorSource(email_test$text))

# Apply the same cleaning steps to a corpus: remove numbers, punctuation and
# stop words, then collapse whitespace. Sharing one helper keeps the training
# and test pipelines from drifting apart.
strip_noise <- function(corpus) {
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords())
  tm_map(corpus, stripWhitespace)
}
train_clean_corpus <- strip_noise(train_email_corpus)
test_clean_corpus <- strip_noise(test_email_corpus)

train_email_dtm <- DocumentTermMatrix(train_clean_corpus)
# Tabulate the test documents against the training vocabulary so both matrices
# have the same term columns. Built independently, the test matrix would have
# its own vocabulary and would not line up with the features the model was
# trained on.
test_email_dtm <- DocumentTermMatrix(
  test_clean_corpus,
  control = list(dictionary = Terms(train_email_dtm))
)
# Turn term counts into a presence/absence factor: values greater than zero
# map to level "1", all other values to level "0".
convert_count <- function(x) {
  present <- (x > 0) * 1
  factor(present, levels = c(0, 1), labels = c(0, 1))
}
# Recode each term column of both matrices into 0/1 presence indicators
train_sms <- apply(X = train_email_dtm, MARGIN = 2, FUN = convert_count)
test_sms <- apply(X = test_email_dtm, MARGIN = 2, FUN = convert_count)
# Fit naive Bayes on the training indicators, then classify the test e-mails
classifier <- naiveBayes(x = train_sms, y = factor(email_train$type))
test_pred <- predict(object = classifier, newdata = test_sms)
- True positives (TP): we predicted spam, and the e-mail actually is spam.
- True negatives (TN): we predicted ham, and the e-mail actually is ham.
- False positives (FP): we predicted spam, but the e-mail actually is ham (also known as a “Type I error”).
- False negatives (FN): we predicted ham, but the e-mail actually is spam (also known as a “Type II error”).
# Confusion matrix: predicted class in rows, actual class in columns
table(test_pred, email_test[["type"]])
##
## test_pred ham spam
## ham 200 23
## spam 100 277
Accuracy rate = (True Positives + True Negatives) / Total
Accuracy rate = (277 + 200) / 600 = 477 / 600 ≈ 0.80