Task
It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/
Loading Library
library(tm)
library(knitr)
library(plyr)
library(wordcloud)
library(tidyverse)
library(tm)
library(magrittr)
library(data.table)
library(e1071)
library(caret)
library(randomForest)
Loading Data
data link
spam<- '/Users/chunjienan/Desktop/MS Data Science/DATA 607 DADA ACQUISITION MANAGEMENT/2021 fall/Week 12. Nov 8 - Nov 14/Project4/spamdata/spam'
count_spam<-length(list.files(path = spam))
count_spam
## [1] 501
ham<-'/Users/chunjienan/Desktop/MS Data Science/DATA 607 DADA ACQUISITION MANAGEMENT/2021 fall/Week 12. Nov 8 - Nov 14/Project4/spamdata/easy_ham'
count_ham<-length(list.files(path = ham))
count_ham
## [1] 2501
spam_list<-list.files(spam)
ham_list<-list.files(ham)
There are 501 Spam and 2501 Ham in the data set.
Data Cleaning
ham_doc <- NA
for(i in 1:length(ham_list))
{
path<-paste0(ham, "/", ham_list[1])
text <-readLines(path)
list<- list(paste(text, collapse="\n"))
ham_doc = c(ham_doc,list)
}
spam_doc <- NA
for(i in 1:length(spam_list))
{
path<-paste0(spam, "/", spam_list[1])
text <-readLines(path)
list<- list(paste(text, collapse="\n"))
spam_doc = c(spam_doc,list)
}
ham_data_frame <-as.data.frame(unlist(ham_doc),stringsAsFactors = FALSE)
ham_data_frame$type <- "ham"
colnames(ham_data_frame) <- c("text","type")
spam_data_frame <-as.data.frame(unlist(spam_doc),stringsAsFactors = FALSE)
spam_data_frame$type <- "spam"
colnames(spam_data_frame) <- c("text","type")
# Bind the data frames
spam_ham_data <- rbind(ham_data_frame, spam_data_frame)
nrow(spam_ham_data)
## [1] 3004
table(spam_ham_data$type)
##
## ham spam
## 2502 502
There are total of 3004 of observations is in the data set, and 2052 are ham and 502 are spam in this combined data set.
Create Corpuses from Email
corpus <- VCorpus(VectorSource(spam_ham_data$text))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords())
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)
Word Cloud and Matrix
dtm<- DocumentTermMatrix(corpus)
Spam<-which(spam_ham_data$type == 'spam')
Ham<-which(spam_ham_data$type == 'ham')
wordcloud(corpus[Spam], max.words = 100, random.order = FALSE, colors=c('Red'))

wordcloud(corpus[Ham],max.words = 100 ,random.order = FALSE, colors=c('Blue'))

Split Data and Corpus for Traing Data and Test Data
spam_ham_data$text[spam_ham_data$text==""] <- "NA"
split_index <- createDataPartition(spam_ham_data$type, p=0.70, list=FALSE)
train <- spam_ham_data[split_index,]
test <- spam_ham_data[-split_index,]
# Corpus for training data and test data
train_corpus <- Corpus(VectorSource(train$text))
train_corp <- tm_map(train_corpus ,removeNumbers)
train_corp <- tm_map(train_corp,removePunctuation)
train_corp<- tm_map(train_corp, removeWords,stopwords())
train_corp<- tm_map(train_corp,stripWhitespace)
test_corpus <- Corpus(VectorSource(test$text))
test_corp<- tm_map(test_corpus,removeNumbers)
test_corp <- tm_map(test_corp,removePunctuation)
test_corp <- tm_map(test_corp,removeWords, stopwords())
test_corp<- tm_map(test_corp,stripWhitespace)
train_dtm <- DocumentTermMatrix(train_corp)
test_dtm <- DocumentTermMatrix(test_corp)
convert <- function(x) {
y <- ifelse(x > 0, 1,0)
y <- factor(y, levels=c(0,1), labels=c("No", "Yes"))
y
}
sms.train <- apply(train_dtm, 2, convert)
sms.test <- apply(test_dtm, 2, convert)
Modeling and Prediction
Naive Bayes Classification
# classification of email
naive_classifier <- naiveBayes(sms.train, factor(train$type))
pred_naive <- predict(naive_classifier, newdata=sms.test)
naive_matrix<-table(pred_naive, test$type)
naive_matrix
##
## pred_naive ham spam
## ham 750 0
## spam 0 150
Accuracy of Naive Classification
Accuracy = (TP + TN)/Total
TP<- 750 + 150
TN<- 0 + 0
Accuracy_rate<-(TP + TN)/sum(naive_matrix)*100
confusionMatrix(pred_naive, as.factor(test$type), positive = 'spam', dnn = c('pred','actual'))
## Confusion Matrix and Statistics
##
## actual
## pred ham spam
## ham 750 0
## spam 0 150
##
## Accuracy : 1
## 95% CI : (0.9959, 1)
## No Information Rate : 0.8333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.1667
## Detection Rate : 0.1667
## Detection Prevalence : 0.1667
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : spam
##
print(paste0('The accuracy_rate of naive classification model is ' , Accuracy_rate,'%'))
## [1] "The accuracy_rate of naive classification model is 100%"
Conclusion
Naive Bayes Classification is considered one of the best options to do classification analysis. It allowed me to get the 100% of accuracy of the spam email detection. According to the confusion matrix function, the model classified 100 % of spams and hams correctly.
LS0tCnRpdGxlOiAiREFUQTYwN19Qcm9qZWN0NF9DaHVuamllX05hbiIKYXV0aG9yOiAiQ2h1bmppZSBOYW4iCmRhdGU6ICIxMS8xMy8yMDIxIgpvdXRwdXQ6CiAgaHRtbF9kb2N1bWVudDoKICAgIGNvZGVfZG93bmxvYWQ6IHllcwogICAgY29kZV9mb2xkaW5nOiBoaWRlCiAgICBoaWdobGlnaHQ6IHB5Z21lbnRzCiAgICBudW1iZXJfc2VjdGlvbnM6IHllcwogICAgdGhlbWU6IGZsYXRseQogICAgdG9jOiB5ZXMKICAgIHRvY19mbG9hdDogeWVzCiAgcGRmX2RvY3VtZW50OgogICAgdG9jOiB5ZXMKLS0tCgpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KHdhcm5pbmcgPSBGQUxTRSwgbWVzc2FnZSA9IEZBTFNFKSAKYGBgCgoKCgojIyBUYXNrCkl0IGNhbiBiZSB1c2VmdWwgdG8gYmUgYWJsZSB0byBjbGFzc2lmeSBuZXcgInRlc3QiIGRvY3VtZW50cyB1c2luZyBhbHJlYWR5IGNsYXNzaWZpZWQgInRyYWluaW5nIiBkb2N1bWVudHMuICBBIGNvbW1vbiBleGFtcGxlIGlzIHVzaW5nIGEgY29ycHVzIG9mIGxhYmVsZWQgc3BhbSBhbmQgaGFtIChub24tc3BhbSkgZS1tYWlscyB0byBwcmVkaWN0IHdoZXRoZXIgb3Igbm90IGEgbmV3IGRvY3VtZW50IGlzIHNwYW0uICAKCkZvciB0aGlzIHByb2plY3QsIHlvdSBjYW4gc3RhcnQgd2l0aCBhIHNwYW0vaGFtIGRhdGFzZXQsIHRoZW4gcHJlZGljdCB0aGUgY2xhc3Mgb2YgbmV3IGRvY3VtZW50cyAoZWl0aGVyIHdpdGhoZWxkIGZyb20gdGhlIHRyYWluaW5nIGRhdGFzZXQgb3IgZnJvbSBhbm90aGVyIHNvdXJjZSBzdWNoIGFzIHlvdXIgb3duIHNwYW0gZm9sZGVyKS4gICBPbmUgZXhhbXBsZSBjb3JwdXM6ICAgaHR0cHM6Ly9zcGFtYXNzYXNzaW4uYXBhY2hlLm9yZy9vbGQvcHVibGljY29ycHVzLyAKCgojIyBMb2FkaW5nIExpYnJhcnkKYGBge3J9CmxpYnJhcnkodG0pCmxpYnJhcnkoa25pdHIpCmxpYnJhcnkocGx5cikKbGlicmFyeSh3b3JkY2xvdWQpCmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KHRtKQpsaWJyYXJ5KG1hZ3JpdHRyKQpsaWJyYXJ5KGRhdGEudGFibGUpCmxpYnJhcnkoZTEwNzEpCmxpYnJhcnkoY2FyZXQpCmxpYnJhcnkocmFuZG9tRm9yZXN0KQpgYGAKCgojIyBMb2FkaW5nIERhdGEKCltkYXRhIGxpbmtdKGh0dHBzOi8vc3BhbWFzc2Fzc2luLmFwYWNoZS5vcmcvb2xkL3B1YmxpY2NvcnB1cy8pCgpgYGB7cn0Kc3BhbTwtICcvVXNlcnMvY2h1bmppZW5hbi9EZXNrdG9wL01TIERhdGEgU2NpZW5jZS9EQVRBIDYwNyBEQURBIEFDUVVJU0lUSU9OIE1BTkFHRU1FTlQvMjAyMSBmYWxsL1dlZWsgMTIuIE5vdiA4IC0gTm92IDE0L1Byb2plY3Q0L3NwYW1kYXRhL3NwYW0nCmNvdW50X3NwYW08LWxlbmd0aChsaXN0LmZpbGVzKHBhdGggPSBzcGFtKSkKY291bnRfc3BhbQpoYW08LScvVXNlcnMvY2h1bmppZW5hbi9EZXNrdG9wL01TIERhdGEgU2NpZW5jZS9EQVRBIDYwNyBEQURBIEFDUVVJU0lUSU9OIE1BTkFHRU1FTlQvMjAyMSBmYWxsL1dlZWsgMTIuIE5vdiA4IC0gTm92IDE0L1Byb2plY3Q0L3NwYW1kYXRhL2Vhc3lfaGFtJwpjb3VudF9oYW08LWxlbmd0aChsaXN0LmZpbGVzKHBhdGggPSBoYW0pKQpjb3VudF9oYW0KCnNwYW1fbGlzdDwtbGlzdC5maWxlcyhzcGFtKQpoYW1fbGlzdDwtbGlzdC5maWxlcyhoYW0pCmBgYAoKClRoZXJlIGFyZSA1MDEgU3BhbSBhbmQgMjUwMSBIYW0gaW4gdGhlIGRhdGEgc2V0LiAKCgojIyBEYXRhIENsZWFuaW5nCgoKYGBge3J9CmhhbV9kb2MgPC0gTkEKZm9yKGkgaW4gMTpsZW5ndGgoaGFtX2xpc3QpKQp7CiAgcGF0aDwtcGFzdGUwKGhhbSwgIi8iLCBoYW1fbGlzdFsxXSkgIAogIHRleHQgPC1yZWFkTGluZXMocGF0aCkKICBsaXN0PC0gbGlzdChwYXN0ZSh0ZXh0LCBjb2xsYXBzZT0iXG4iKSkKICBoYW1fZG9jID0gYyhoYW1fZG9jLGxpc3QpCiAgCn0KCnNwYW1fZG9jIDwtIE5BCmZvcihpIGluIDE6bGVuZ3RoKHNwYW1fbGlzdCkpCnsKICBwYXRoPC1wYXN0ZTAoc3BhbSwgIi8iLCBzcGFtX2xpc3RbMV0pICAKICB0ZXh0IDwtcmVhZExpbmVzKHBhdGgpCiAgbGlzdDwtIGxpc3QocGFzdGUodGV4dCwgY29sbGFwc2U9IlxuIikpCiAgc3BhbV9kb2MgPSBjKHNwYW1fZG9jLGxpc3QpCn0KCgpoYW1fZGF0YV9mcmFtZSA8LWFzLmRhdGEuZnJhbWUodW5saXN0KGhhbV9kb2MpLHN0cmluZ3NBc0ZhY3RvcnMgPSBGQUxTRSkKaGFtX2RhdGFfZnJhbWUkdHlwZSA8LSAiaGFtIgpjb2xuYW1lcyhoYW1fZGF0YV9mcmFtZSkgPC0gYygidGV4dCIsInR5cGUiKQoKc3BhbV9kYXRhX2ZyYW1lIDwtYXMuZGF0YS5mcmFtZSh1bmxpc3Qoc3BhbV9kb2MpLHN0cmluZ3NBc0ZhY3RvcnMgPSBGQUxTRSkKc3BhbV9kYXRhX2ZyYW1lJHR5cGUgPC0gInNwYW0iCmNvbG5hbWVzKHNwYW1fZGF0YV9mcmFtZSkgPC0gYygidGV4dCIsInR5cGUiKQoKIyBCaW5kIHRoZSBkYXRhIGZyYW1lcwpzcGFtX2hhbV9kYXRhIDwtIHJiaW5kKGhhbV9kYXRhX2ZyYW1lLCBzcGFtX2RhdGFfZnJhbWUpCm5yb3coc3BhbV9oYW1fZGF0YSkKdGFibGUoc3BhbV9oYW1fZGF0YSR0eXBlKQpgYGAKClRoZXJlIGFyZSB0b3RhbCBvZiAzMDA0IG9mIG9ic2VydmF0aW9ucyBpcyBpbiB0aGUgZGF0YSBzZXQsIGFuZCAyMDUyIGFyZSBoYW0gYW5kIDUwMiBhcmUgc3BhbSBpbiB0aGlzIGNvbWJpbmVkIGRhdGEgc2V0LgoKCiMjIENyZWF0ZSBDb3JwdXNlcyBmcm9tIEVtYWlsCgpgYGB7cn0KY29ycHVzIDwtIFZDb3JwdXMoVmVjdG9yU291cmNlKHNwYW1faGFtX2RhdGEkdGV4dCkpCmNvcnB1cyA8LSB0bV9tYXAoY29ycHVzLCBjb250ZW50X3RyYW5zZm9ybWVyKHRvbG93ZXIpKQpjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cywgcmVtb3ZlTnVtYmVycykKY29ycHVzIDwtIHRtX21hcChjb3JwdXMsIHJlbW92ZVB1bmN0dWF0aW9uKQpjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cywgcmVtb3ZlV29yZHMsIHN0b3B3b3JkcygpKQpjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cywgc3RlbURvY3VtZW50KQpjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cywgc3RyaXBXaGl0ZXNwYWNlKQpgYGAKCgojIyBXb3JkIENsb3VkIGFuZCBNYXRyaXgKCmBgYHtyfQpkdG08LSBEb2N1bWVudFRlcm1NYXRyaXgoY29ycHVzKQpTcGFtPC13aGljaChzcGFtX2hhbV9kYXRhJHR5cGUgPT0gJ3NwYW0nKQpIYW08LXdoaWNoKHNwYW1faGFtX2RhdGEkdHlwZSA9PSAnaGFtJykKd29yZGNsb3VkKGNvcnB1c1tTcGFtXSwgbWF4LndvcmRzID0gMTAwLCByYW5kb20ub3JkZXIgPSBGQUxTRSwgY29sb3JzPWMoJ1JlZCcpKQp3b3JkY2xvdWQoY29ycHVzW0hhbV0sbWF4LndvcmRzID0gMTAwICxyYW5kb20ub3JkZXIgPSBGQUxTRSwgY29sb3JzPWMoJ0JsdWUnKSkKCgoKYGBgCgoKIyMgU3BsaXQgRGF0YSBhbmQgQ29ycHVzIGZvciBUcmFpbmcgRGF0YSBhbmQgVGVzdCBEYXRhCgpgYGB7cn0Kc3BhbV9oYW1fZGF0YSR0ZXh0W3NwYW1faGFtX2RhdGEkdGV4dD09IiJdIDwtICJOQSIKc3BsaXRfaW5kZXggPC0gY3JlYXRlRGF0YVBhcnRpdGlvbihzcGFtX2hhbV9kYXRhJHR5cGUsIHA9MC43MCwgbGlzdD1GQUxTRSkKdHJhaW4gPC0gc3BhbV9oYW1fZGF0YVtzcGxpdF9pbmRleCxdCnRlc3QgPC0gc3BhbV9oYW1fZGF0YVstc3BsaXRfaW5kZXgsXQoKCiMgQ29ycHVzIGZvciB0cmFpbmluZyBkYXRhIGFuZCB0ZXN0IGRhdGEKdHJhaW5fY29ycHVzIDwtIENvcnB1cyhWZWN0b3JTb3VyY2UodHJhaW4kdGV4dCkpCnRyYWluX2NvcnAgPC0gdG1fbWFwKHRyYWluX2NvcnB1cyAscmVtb3ZlTnVtYmVycykKdHJhaW5fY29ycCA8LSB0bV9tYXAodHJhaW5fY29ycCxyZW1vdmVQdW5jdHVhdGlvbikKdHJhaW5fY29ycDwtIHRtX21hcCh0cmFpbl9jb3JwLCByZW1vdmVXb3JkcyxzdG9wd29yZHMoKSkKdHJhaW5fY29ycDwtIHRtX21hcCh0cmFpbl9jb3JwLHN0cmlwV2hpdGVzcGFjZSkKCgp0ZXN0X2NvcnB1cyA8LSBDb3JwdXMoVmVjdG9yU291cmNlKHRlc3QkdGV4dCkpCnRlc3RfY29ycDwtIHRtX21hcCh0ZXN0X2NvcnB1cyxyZW1vdmVOdW1iZXJzKQp0ZXN0X2NvcnAgPC0gdG1fbWFwKHRlc3RfY29ycCxyZW1vdmVQdW5jdHVhdGlvbikKdGVzdF9jb3JwICA8LSB0bV9tYXAodGVzdF9jb3JwLHJlbW92ZVdvcmRzLCBzdG9wd29yZHMoKSkKdGVzdF9jb3JwPC0gdG1fbWFwKHRlc3RfY29ycCxzdHJpcFdoaXRlc3BhY2UpCgoKdHJhaW5fZHRtIDwtIERvY3VtZW50VGVybU1hdHJpeCh0cmFpbl9jb3JwKQp0ZXN0X2R0bSA8LSBEb2N1bWVudFRlcm1NYXRyaXgodGVzdF9jb3JwKQoKCmNvbnZlcnQgPC0gZnVuY3Rpb24oeCkgewogIHkgPC0gaWZlbHNlKHggPiAwLCAxLDApCiAgeSA8LSBmYWN0b3IoeSwgbGV2ZWxzPWMoMCwxKSwgbGFiZWxzPWMoIk5vIiwgIlllcyIpKQogIHkKfQoKc21zLnRyYWluIDwtIGFwcGx5KHRyYWluX2R0bSwgMiwgY29udmVydCkKc21zLnRlc3QgPC0gYXBwbHkodGVzdF9kdG0sIDIsIGNvbnZlcnQpCmBgYAoKCiMjIE1vZGVsaW5nIGFuZCBQcmVkaWN0aW9uCgojIyMgTmFpdmUgQmF5ZXMgQ2xhc3NpZmljYXRpb24KYGBge3J9CiMgY2xhc3NpZmljYXRpb24gb2YgZW1haWwKbmFpdmVfY2xhc3NpZmllciA8LSBuYWl2ZUJheWVzKHNtcy50cmFpbiwgZmFjdG9yKHRyYWluJHR5cGUpKQpwcmVkX25haXZlIDwtIHByZWRpY3QobmFpdmVfY2xhc3NpZmllciwgbmV3ZGF0YT1zbXMudGVzdCkKbmFpdmVfbWF0cml4PC10YWJsZShwcmVkX25haXZlLCB0ZXN0JHR5cGUpCm5haXZlX21hdHJpeApgYGAKCgojIyBBY2N1cmFjeSBvZiBOYWl2ZSBDbGFzc2lmaWNhdGlvbiAKCkFjY3VyYWN5ID0gKFRQICsgVE4pL1RvdGFsCmBgYHtyfQpUUDwtIDc1MCArIDE1MApUTjwtIDAgKyAwCkFjY3VyYWN5X3JhdGU8LShUUCArIFROKS9zdW0obmFpdmVfbWF0cml4KSoxMDAKCmNvbmZ1c2lvbk1hdHJpeChwcmVkX25haXZlLCBhcy5mYWN0b3IodGVzdCR0eXBlKSwgcG9zaXRpdmUgPSAnc3BhbScsIGRubiA9IGMoJ3ByZWQnLCdhY3R1YWwnKSkKCnByaW50KHBhc3RlMCgnVGhlIGFjY3VyYWN5X3JhdGUgb2YgbmFpdmUgY2xhc3NpZmljYXRpb24gbW9kZWwgaXMgJyAsIEFjY3VyYWN5X3JhdGUsJyUnKSkKCmBgYAoKCiMjIENvbmNsdXNpb24KCk5haXZlIEJheWVzIENsYXNzaWZpY2F0aW9uIGlzIGNvbnNpZGVyZWQgb25lIG9mIHRoZSBiZXN0IG9wdGlvbnMgdG8gZG8gY2xhc3NpZmljYXRpb24gYW5hbHlzaXMuIEl0IGFsbG93ZWQgbWUgdG8gZ2V0IHRoZSAxMDAlIG9mIAphY2N1cmFjeSBvZiB0aGUgc3BhbSBlbWFpbCBkZXRlY3Rpb24uIEFjY29yZGluZyB0byB0aGUgY29uZnVzaW9uIG1hdHJpeCBmdW5jdGlvbiwgdGhlIG1vZGVsIGNsYXNzaWZpZWQgMTAwICUgb2Ygc3BhbXMgYW5kIGhhbXMgY29ycmVjdGx5LgoK