It is often useful to classify new “test” documents using documents that have already been classified (“training” documents). A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source, such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/
# Packages used below: tm/SnowballC (text processing), wordcloud/RColorBrewer
# (plots), caret (data splitting), e1071 (Naive Bayes), knitr (tables), magrittr (pipe)
library(tm); library(SnowballC); library(wordcloud); library(RColorBrewer)
library(caret); library(e1071); library(knitr); library(magrittr)
# Download both archives and extract them into project_4/
download.file(url = "http://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2", destfile = "20021010_easy_ham.tar.bz2")
download.file(url = "http://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2", destfile = "20050311_spam_2.tar.bz2")
untar("20021010_easy_ham.tar.bz2", exdir = "project_4", compressed = "bzip2")
untar("20050311_spam_2.tar.bz2", exdir = "project_4", compressed = "bzip2")
# Directories created by untar() above; forward slashes keep the paths portable
ham.dir <- "project_4/easy_ham"
ham_files <- list.files(path = ham.dir, full.names = TRUE)
spam.dir <- "project_4/spam_2"
spam_files <- list.files(path = spam.dir, full.names = TRUE)
length(spam_files)
## [1] 1397
length(ham_files)
## [1] 2551
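It can help to look at a few lines of one raw message before any cleaning (an optional peek using base R; the index 1 is arbitrary):
head(readLines(ham_files[1]), 5)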
# Build a VCorpus from a directory: one document per file, where each
# document's content is the vector of lines in that file
m_corpus <- function(file_path) {
  corpus <- file_path %>%
    paste(., list.files(.), sep = "/") %>%  # full path to every file
    lapply(readLines) %>%                   # read each message's lines
    VectorSource() %>%
    VCorpus()
  return(corpus)
}
# Standard tm cleaning pipeline; tolower is a base function, so it must be
# wrapped in content_transformer() to keep the documents as PlainTextDocuments
Clean_data <- function(corpus) {
  corpus <- corpus %>%
    tm_map(removeNumbers) %>%
    tm_map(removePunctuation) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, stopwords("en")) %>%
    tm_map(stripWhitespace) %>%
    tm_map(stemDocument)   # stemming via SnowballC
  return(corpus)
}
# Attach the same metadata tag (e.g. emails = "ham") to every document
addTag <- function(corpus, tag, value) {
  for (i in seq_along(corpus)) {
    meta(corpus[[i]], tag) <- value
  }
  return(corpus)
}
hamCorp <- ham.dir %>%
  m_corpus %>%
  Clean_data %>%
  addTag(tag = "emails", value = "ham")
# unlist() flattens each document into its individual lines, so hamDF holds
# one row per line of text rather than one row per e-mail
hamDF <- as.data.frame(unlist(hamCorp), stringsAsFactors = FALSE)
hamDF$Type <- "HAM"
colnames(hamDF) <- c("Message", "Type")
wordcloud(hamCorp, scale = c(3, 0.5), max.words = 80, random.order = FALSE,
          rot.per = 0.35, use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))
title("Ham Wordcloud", col.main = "grey14")
spamCorp <- spam.dir %>%
  m_corpus %>%
  Clean_data %>%
  addTag(tag = "emails", value = "spam")
spamDF <- as.data.frame(unlist(spamCorp), stringsAsFactors = FALSE)
spamDF$Type <- "SPAM"
colnames(spamDF) <- c("Message", "Type")
wordcloud(spamCorp, scale = c(3, 0.5), max.words = 80, random.order = FALSE,
          rot.per = 0.35, use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))
title("Spam Wordcloud", col.main = "grey14")
# Take 1,000 rows from each class so the modeling data are balanced
joinedDF <- rbind(hamDF[1:1000,], spamDF[1:1000,])
clean_corpus <- c(spamCorp, hamCorp)
kable(head(joinedDF))
| Message | Type |
|---|---|
| exmhworkersadminredhatcom thu aug | HAM |
| returnpath exmhworkersadminexamplecom | HAM |
| deliveredto zzzzlocalhostnetnoteinccom | HAM |
| receiv localhost localhost | HAM |
| phoboslabsnetnoteinccom postfix esmtp id dec | HAM |
| zzzzlocalhost thu aug edt | HAM |
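As the table shows, each row is a single cleaned line rather than a whole e-mail. If one row per message were wanted instead, a minimal sketch (hamDF2 is a hypothetical name, not used below) could collapse each document's lines first:
hamDF2 <- data.frame(Message = sapply(hamCorp, function(d) paste(content(d), collapse = " ")),
                     Type = "HAM", stringsAsFactors = FALSE)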
# Labels and a DTM for the full combined corpus (kept for reference; the
# model below is built from joinedDF instead)
corpus_labels <- unlist(meta(clean_corpus, "emails"))
corpus_dtm <- DocumentTermMatrix(clean_corpus)
set.seed(1010)
# Empty lines would otherwise become empty documents, so give them a placeholder
joinedDF$Message[joinedDF$Message == ""] <- "NaN"
# Stratified 70/30 train/test split on the class label
train_index <- createDataPartition(joinedDF$Type, p = 0.70, list = FALSE)
email_train <- joinedDF[train_index,]
email_test <- joinedDF[-train_index,]
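createDataPartition() stratifies on the supplied label, which is easy to verify (an optional check, not part of the original pipeline):
prop.table(table(email_train$Type))
prop.table(table(email_test$Type))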
# The train and test messages get identical cleaning, so wrap the steps once;
# suppressWarnings() mirrors the original chunks
clean_for_model <- function(corpus) {
  suppressWarnings({
    corpus <- tm_map(corpus, removeNumbers)
    corpus <- tm_map(corpus, removePunctuation)
    corpus <- tm_map(corpus, removeWords, words = stopwords("en"))
    corpus <- tm_map(corpus, stripWhitespace)
  })
  return(corpus)
}
train_clean_corpus <- clean_for_model(Corpus(VectorSource(email_train$Message)))
test_clean_corpus <- clean_for_model(Corpus(VectorSource(email_test$Message)))
train_dtm <- DocumentTermMatrix(train_clean_corpus)
train_dtm
## <<DocumentTermMatrix (documents: 1400, terms: 1254)>>
## Non-/sparse entries: 3624/1751976
## Sparsity : 100%
## Maximal term length: 75
## Weighting : term frequency (tf)
test_dtm <- DocumentTermMatrix(test_clean_corpus)
test_dtm
## <<DocumentTermMatrix (documents: 600, terms: 719)>>
## Non-/sparse entries: 1450/429950
## Sparsity : 100%
## Maximal term length: 75
## Weighting : term frequency (tf)
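Note that the two matrices have different vocabularies (1254 vs. 719 terms). e1071's predict() matches predictors by name, so the mismatch does not break anything, but a common refinement (not applied here; test_dtm2 is a hypothetical name) is to restrict the test matrix to the training vocabulary:
test_dtm2 <- DocumentTermMatrix(test_clean_corpus,
                                control = list(dictionary = Terms(train_dtm)))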
# Naive Bayes here treats each term as categorical, so collapse the raw
# counts to presence (1) / absence (0)
convert_count <- function(x) {
  y <- ifelse(x > 0, 1, 0)
  y <- factor(y, levels = c(0, 1), labels = c(0, 1))
  y
}
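For example, on a toy count vector:
convert_count(c(0, 2, 5, 0))
## [1] 0 1 1 0
## Levels: 0 1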
# Recode every term column of both matrices to presence/absence
train_x <- apply(train_dtm, 2, convert_count)
test_x <- apply(test_dtm, 2, convert_count)
# Fit Naive Bayes: one conditional presence/absence table per term, per class
NBclassifier <- naiveBayes(train_x, factor(email_train$Type))
head(NBclassifier$tables, 3)
## $aug
## aug
## factor(email_train$Type) 0 1
## HAM 0.88714286 0.11285714
## SPAM 0.98428571 0.01571429
##
## $exmhworkersadminredhatcom
## exmhworkersadminredhatcom
## factor(email_train$Type) 0 1
## HAM 0.998571429 0.001428571
## SPAM 1.000000000 0.000000000
##
## $thu
## thu
## factor(email_train$Type) 0 1
## HAM 0.927142857 0.072857143
## SPAM 0.994285714 0.005714286
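Each table is just the class-conditional frequency of a term's presence: with the default laplace = 0, the "thu" table above should match a direct cross-tabulation (a sanity check on objects built earlier):
prop.table(table(email_train$Type, train_x[, "thu"]), margin = 1)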
# Predict on the held-out rows and cross-tabulate against the true labels
Pred <- predict(NBclassifier, newdata = test_x)
table(Pred, actual = email_test$Type)
## actual
## Pred HAM SPAM
## HAM 191 23
## SPAM 109 277
mean(Pred == email_test$Type) * 100
## [1] 78
The model classifies 78% of the held-out rows correctly ((191 + 277) / 600). Most of its errors are ham rows flagged as spam (109 of 300), which is not surprising given that each "document" here is a single line of text rather than a full message.
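For fuller diagnostics (sensitivity, specificity, balanced accuracy), caret's confusionMatrix() can be run on the same predictions; a minimal sketch treating SPAM as the positive class:
confusionMatrix(Pred, factor(email_test$Type), positive = "SPAM")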