0.1 Task

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/

0.2 Loading Library

library(tm)
library(knitr)
library(plyr)
library(wordcloud)
library(tidyverse)
library(tm)
library(magrittr)
library(data.table)
library(e1071)
library(caret)
library(randomForest)

0.3 Loading Data

data link

spam<- '/Users/chunjienan/Desktop/MS Data Science/DATA 607 DADA ACQUISITION MANAGEMENT/2021 fall/Week 12. Nov 8 - Nov 14/Project4/spamdata/spam'
count_spam<-length(list.files(path = spam))
count_spam
## [1] 501
ham<-'/Users/chunjienan/Desktop/MS Data Science/DATA 607 DADA ACQUISITION MANAGEMENT/2021 fall/Week 12. Nov 8 - Nov 14/Project4/spamdata/easy_ham'
count_ham<-length(list.files(path = ham))
count_ham
## [1] 2501
spam_list<-list.files(spam)
ham_list<-list.files(ham)

There are 501 Spam and 2501 Ham in the data set.

0.4 Data Cleaning

ham_doc <- NA
for(i in 1:length(ham_list))
{
  path<-paste0(ham, "/", ham_list[1])  
  text <-readLines(path)
  list<- list(paste(text, collapse="\n"))
  ham_doc = c(ham_doc,list)
  
}

spam_doc <- NA
for(i in 1:length(spam_list))
{
  path<-paste0(spam, "/", spam_list[1])  
  text <-readLines(path)
  list<- list(paste(text, collapse="\n"))
  spam_doc = c(spam_doc,list)
}


ham_data_frame <-as.data.frame(unlist(ham_doc),stringsAsFactors = FALSE)
ham_data_frame$type <- "ham"
colnames(ham_data_frame) <- c("text","type")

spam_data_frame <-as.data.frame(unlist(spam_doc),stringsAsFactors = FALSE)
spam_data_frame$type <- "spam"
colnames(spam_data_frame) <- c("text","type")

# Bind the data frames
spam_ham_data <- rbind(ham_data_frame, spam_data_frame)
nrow(spam_ham_data)
## [1] 3004
table(spam_ham_data$type)
## 
##  ham spam 
## 2502  502

There are total of 3004 of observations is in the data set, and 2052 are ham and 502 are spam in this combined data set.

0.5 Create Corpuses from Email

corpus <- VCorpus(VectorSource(spam_ham_data$text))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords())
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)

0.6 Word Cloud and Matrix

dtm<- DocumentTermMatrix(corpus)
Spam<-which(spam_ham_data$type == 'spam')
Ham<-which(spam_ham_data$type == 'ham')
wordcloud(corpus[Spam], max.words = 100, random.order = FALSE, colors=c('Red'))

wordcloud(corpus[Ham],max.words = 100 ,random.order = FALSE, colors=c('Blue'))

0.7 Split Data and Corpus for Traing Data and Test Data

spam_ham_data$text[spam_ham_data$text==""] <- "NA"
split_index <- createDataPartition(spam_ham_data$type, p=0.70, list=FALSE)
train <- spam_ham_data[split_index,]
test <- spam_ham_data[-split_index,]


# Corpus for training data and test data
train_corpus <- Corpus(VectorSource(train$text))
train_corp <- tm_map(train_corpus ,removeNumbers)
train_corp <- tm_map(train_corp,removePunctuation)
train_corp<- tm_map(train_corp, removeWords,stopwords())
train_corp<- tm_map(train_corp,stripWhitespace)


test_corpus <- Corpus(VectorSource(test$text))
test_corp<- tm_map(test_corpus,removeNumbers)
test_corp <- tm_map(test_corp,removePunctuation)
test_corp  <- tm_map(test_corp,removeWords, stopwords())
test_corp<- tm_map(test_corp,stripWhitespace)


train_dtm <- DocumentTermMatrix(train_corp)
test_dtm <- DocumentTermMatrix(test_corp)


convert <- function(x) {
  y <- ifelse(x > 0, 1,0)
  y <- factor(y, levels=c(0,1), labels=c("No", "Yes"))
  y
}

sms.train <- apply(train_dtm, 2, convert)
sms.test <- apply(test_dtm, 2, convert)

0.8 Modeling and Prediction

0.8.1 Naive Bayes Classification

# classification of email
naive_classifier <- naiveBayes(sms.train, factor(train$type))
pred_naive <- predict(naive_classifier, newdata=sms.test)
naive_matrix<-table(pred_naive, test$type)
naive_matrix
##           
## pred_naive ham spam
##       ham  750    0
##       spam   0  150

0.9 Accuracy of Naive Classification

Accuracy = (TP + TN)/Total

TP<- 750 + 150
TN<- 0 + 0
Accuracy_rate<-(TP + TN)/sum(naive_matrix)*100

confusionMatrix(pred_naive, as.factor(test$type), positive = 'spam', dnn = c('pred','actual'))
## Confusion Matrix and Statistics
## 
##       actual
## pred   ham spam
##   ham  750    0
##   spam   0  150
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9959, 1)
##     No Information Rate : 0.8333     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.1667     
##          Detection Rate : 0.1667     
##    Detection Prevalence : 0.1667     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : spam       
## 
print(paste0('The accuracy_rate of naive classification model is ' , Accuracy_rate,'%'))
## [1] "The accuracy_rate of naive classification model is 100%"

0.10 Conclusion

Naive Bayes Classification is considered one of the best options to do classification analysis. It allowed me to get the 100% of accuracy of the spam email detection. According to the confusion matrix function, the model classified 100 % of spams and hams correctly.

LS0tCnRpdGxlOiAiREFUQTYwN19Qcm9qZWN0NF9DaHVuamllX05hbiIKYXV0aG9yOiAiQ2h1bmppZSBOYW4iCmRhdGU6ICIxMS8xMy8yMDIxIgpvdXRwdXQ6CiAgaHRtbF9kb2N1bWVudDoKICAgIGNvZGVfZG93bmxvYWQ6IHllcwogICAgY29kZV9mb2xkaW5nOiBoaWRlCiAgICBoaWdobGlnaHQ6IHB5Z21lbnRzCiAgICBudW1iZXJfc2VjdGlvbnM6IHllcwogICAgdGhlbWU6IGZsYXRseQogICAgdG9jOiB5ZXMKICAgIHRvY19mbG9hdDogeWVzCiAgcGRmX2RvY3VtZW50OgogICAgdG9jOiB5ZXMKLS0tCgpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KHdhcm5pbmcgPSBGQUxTRSwgbWVzc2FnZSA9IEZBTFNFKSAKYGBgCgoKCgojIyBUYXNrCkl0IGNhbiBiZSB1c2VmdWwgdG8gYmUgYWJsZSB0byBjbGFzc2lmeSBuZXcgInRlc3QiIGRvY3VtZW50cyB1c2luZyBhbHJlYWR5IGNsYXNzaWZpZWQgInRyYWluaW5nIiBkb2N1bWVudHMuICBBIGNvbW1vbiBleGFtcGxlIGlzIHVzaW5nIGEgY29ycHVzIG9mIGxhYmVsZWQgc3BhbSBhbmQgaGFtIChub24tc3BhbSkgZS1tYWlscyB0byBwcmVkaWN0IHdoZXRoZXIgb3Igbm90IGEgbmV3IGRvY3VtZW50IGlzIHNwYW0uICAKCkZvciB0aGlzIHByb2plY3QsIHlvdSBjYW4gc3RhcnQgd2l0aCBhIHNwYW0vaGFtIGRhdGFzZXQsIHRoZW4gcHJlZGljdCB0aGUgY2xhc3Mgb2YgbmV3IGRvY3VtZW50cyAoZWl0aGVyIHdpdGhoZWxkIGZyb20gdGhlIHRyYWluaW5nIGRhdGFzZXQgb3IgZnJvbSBhbm90aGVyIHNvdXJjZSBzdWNoIGFzIHlvdXIgb3duIHNwYW0gZm9sZGVyKS4gICBPbmUgZXhhbXBsZSBjb3JwdXM6ICAgaHR0cHM6Ly9zcGFtYXNzYXNzaW4uYXBhY2hlLm9yZy9vbGQvcHVibGljY29ycHVzLyAKCgojIyBMb2FkaW5nIExpYnJhcnkKYGBge3J9CmxpYnJhcnkodG0pCmxpYnJhcnkoa25pdHIpCmxpYnJhcnkocGx5cikKbGlicmFyeSh3b3JkY2xvdWQpCmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KHRtKQpsaWJyYXJ5KG1hZ3JpdHRyKQpsaWJyYXJ5KGRhdGEudGFibGUpCmxpYnJhcnkoZTEwNzEpCmxpYnJhcnkoY2FyZXQpCmxpYnJhcnkocmFuZG9tRm9yZXN0KQpgYGAKCgojIyBMb2FkaW5nIERhdGEKCltkYXRhIGxpbmtdKGh0dHBzOi8vc3BhbWFzc2Fzc2luLmFwYWNoZS5vcmcvb2xkL3B1YmxpY2NvcnB1cy8pCgpgYGB7cn0Kc3BhbTwtICcvVXNlcnMvY2h1bmppZW5hbi9EZXNrdG9wL01TIERhdGEgU2NpZW5jZS9EQVRBIDYwNyBEQURBIEFDUVVJU0lUSU9OIE1BTkFHRU1FTlQvMjAyMSBmYWxsL1dlZWsgMTIuIE5vdiA4IC0gTm92IDE0L1Byb2plY3Q0L3NwYW1kYXRhL3NwYW0nCmNvdW50X3NwYW08LWxlbmd0aChsaXN0LmZpbGVzKHBhdGggPSBzcGFtKSkKY291bnRfc3BhbQpoYW08LScvVXNlcnMvY2h1bmppZW5hbi9EZXNrdG9wL01TIERhdGEgU2NpZW5jZS9EQVRBIDYwNyBEQURBIEFDUVVJU0lUSU9OIE1BTkFHRU1FTlQvMjAyMSBmYWxsL1dlZWsgMTIuIE5vdiA4IC0gTm92IDE0L1Byb2plY3Q0L3NwYW1kYXRhL2Vhc3lfaGFtJwpjb3VudF9oYW08LWxlbmd0aChsaXN0LmZpbGVzKHBhdGggPSBoYW0pKQpjb3VudF9oYW0KCnNwYW1fbGlzdDwtbGlzdC5maWxlcyhzcGFtKQpoYW1fbGlzdDwtbGlzdC5maWxlcyhoYW0pCmBgYAoKClRoZXJlIGFyZSA1MDEgU3BhbSBhbmQgMjUwMSBIYW0gaW4gdGhlIGRhdGEgc2V0LiAKCgojIyBEYXRhIENsZWFuaW5nCgoKYGBge3J9CmhhbV9kb2MgPC0gTkEKZm9yKGkgaW4gMTpsZW5ndGgoaGFtX2xpc3QpKQp7CiAgcGF0aDwtcGFzdGUwKGhhbSwgIi8iLCBoYW1fbGlzdFsxXSkgIAogIHRleHQgPC1yZWFkTGluZXMocGF0aCkKICBsaXN0PC0gbGlzdChwYXN0ZSh0ZXh0LCBjb2xsYXBzZT0iXG4iKSkKICBoYW1fZG9jID0gYyhoYW1fZG9jLGxpc3QpCiAgCn0KCnNwYW1fZG9jIDwtIE5BCmZvcihpIGluIDE6bGVuZ3RoKHNwYW1fbGlzdCkpCnsKICBwYXRoPC1wYXN0ZTAoc3BhbSwgIi8iLCBzcGFtX2xpc3RbMV0pICAKICB0ZXh0IDwtcmVhZExpbmVzKHBhdGgpCiAgbGlzdDwtIGxpc3QocGFzdGUodGV4dCwgY29sbGFwc2U9IlxuIikpCiAgc3BhbV9kb2MgPSBjKHNwYW1fZG9jLGxpc3QpCn0KCgpoYW1fZGF0YV9mcmFtZSA8LWFzLmRhdGEuZnJhbWUodW5saXN0KGhhbV9kb2MpLHN0cmluZ3NBc0ZhY3RvcnMgPSBGQUxTRSkKaGFtX2RhdGFfZnJhbWUkdHlwZSA8LSAiaGFtIgpjb2xuYW1lcyhoYW1fZGF0YV9mcmFtZSkgPC0gYygidGV4dCIsInR5cGUiKQoKc3BhbV9kYXRhX2ZyYW1lIDwtYXMuZGF0YS5mcmFtZSh1bmxpc3Qoc3BhbV9kb2MpLHN0cmluZ3NBc0ZhY3RvcnMgPSBGQUxTRSkKc3BhbV9kYXRhX2ZyYW1lJHR5cGUgPC0gInNwYW0iCmNvbG5hbWVzKHNwYW1fZGF0YV9mcmFtZSkgPC0gYygidGV4dCIsInR5cGUiKQoKIyBCaW5kIHRoZSBkYXRhIGZyYW1lcwpzcGFtX2hhbV9kYXRhIDwtIHJiaW5kKGhhbV9kYXRhX2ZyYW1lLCBzcGFtX2RhdGFfZnJhbWUpCm5yb3coc3BhbV9oYW1fZGF0YSkKdGFibGUoc3BhbV9oYW1fZGF0YSR0eXBlKQpgYGAKClRoZXJlIGFyZSB0b3RhbCBvZiAzMDA0IG9mIG9ic2VydmF0aW9ucyBpcyBpbiB0aGUgZGF0YSBzZXQsIGFuZCAyMDUyIGFyZSBoYW0gYW5kIDUwMiBhcmUgc3BhbSBpbiB0aGlzIGNvbWJpbmVkIGRhdGEgc2V0LgoKCiMjIENyZWF0ZSBDb3JwdXNlcyBmcm9tIEVtYWlsCgpgYGB7cn0KY29ycHVzIDwtIFZDb3JwdXMoVmVjdG9yU291cmNlKHNwYW1faGFtX2RhdGEkdGV4dCkpCmNvcnB1cyA8LSB0bV9tYXAoY29ycHVzLCBjb250ZW50X3RyYW5zZm9ybWVyKHRvbG93ZXIpKQpjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cywgcmVtb3ZlTnVtYmVycykKY29ycHVzIDwtIHRtX21hcChjb3JwdXMsIHJlbW92ZVB1bmN0dWF0aW9uKQpjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cywgcmVtb3ZlV29yZHMsIHN0b3B3b3JkcygpKQpjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cywgc3RlbURvY3VtZW50KQpjb3JwdXMgPC0gdG1fbWFwKGNvcnB1cywgc3RyaXBXaGl0ZXNwYWNlKQpgYGAKCgojIyBXb3JkIENsb3VkIGFuZCBNYXRyaXgKCmBgYHtyfQpkdG08LSBEb2N1bWVudFRlcm1NYXRyaXgoY29ycHVzKQpTcGFtPC13aGljaChzcGFtX2hhbV9kYXRhJHR5cGUgPT0gJ3NwYW0nKQpIYW08LXdoaWNoKHNwYW1faGFtX2RhdGEkdHlwZSA9PSAnaGFtJykKd29yZGNsb3VkKGNvcnB1c1tTcGFtXSwgbWF4LndvcmRzID0gMTAwLCByYW5kb20ub3JkZXIgPSBGQUxTRSwgY29sb3JzPWMoJ1JlZCcpKQp3b3JkY2xvdWQoY29ycHVzW0hhbV0sbWF4LndvcmRzID0gMTAwICxyYW5kb20ub3JkZXIgPSBGQUxTRSwgY29sb3JzPWMoJ0JsdWUnKSkKCgoKYGBgCgoKIyMgU3BsaXQgRGF0YSBhbmQgQ29ycHVzIGZvciBUcmFpbmcgRGF0YSBhbmQgVGVzdCBEYXRhCgpgYGB7cn0Kc3BhbV9oYW1fZGF0YSR0ZXh0W3NwYW1faGFtX2RhdGEkdGV4dD09IiJdIDwtICJOQSIKc3BsaXRfaW5kZXggPC0gY3JlYXRlRGF0YVBhcnRpdGlvbihzcGFtX2hhbV9kYXRhJHR5cGUsIHA9MC43MCwgbGlzdD1GQUxTRSkKdHJhaW4gPC0gc3BhbV9oYW1fZGF0YVtzcGxpdF9pbmRleCxdCnRlc3QgPC0gc3BhbV9oYW1fZGF0YVstc3BsaXRfaW5kZXgsXQoKCiMgQ29ycHVzIGZvciB0cmFpbmluZyBkYXRhIGFuZCB0ZXN0IGRhdGEKdHJhaW5fY29ycHVzIDwtIENvcnB1cyhWZWN0b3JTb3VyY2UodHJhaW4kdGV4dCkpCnRyYWluX2NvcnAgPC0gdG1fbWFwKHRyYWluX2NvcnB1cyAscmVtb3ZlTnVtYmVycykKdHJhaW5fY29ycCA8LSB0bV9tYXAodHJhaW5fY29ycCxyZW1vdmVQdW5jdHVhdGlvbikKdHJhaW5fY29ycDwtIHRtX21hcCh0cmFpbl9jb3JwLCByZW1vdmVXb3JkcyxzdG9wd29yZHMoKSkKdHJhaW5fY29ycDwtIHRtX21hcCh0cmFpbl9jb3JwLHN0cmlwV2hpdGVzcGFjZSkKCgp0ZXN0X2NvcnB1cyA8LSBDb3JwdXMoVmVjdG9yU291cmNlKHRlc3QkdGV4dCkpCnRlc3RfY29ycDwtIHRtX21hcCh0ZXN0X2NvcnB1cyxyZW1vdmVOdW1iZXJzKQp0ZXN0X2NvcnAgPC0gdG1fbWFwKHRlc3RfY29ycCxyZW1vdmVQdW5jdHVhdGlvbikKdGVzdF9jb3JwICA8LSB0bV9tYXAodGVzdF9jb3JwLHJlbW92ZVdvcmRzLCBzdG9wd29yZHMoKSkKdGVzdF9jb3JwPC0gdG1fbWFwKHRlc3RfY29ycCxzdHJpcFdoaXRlc3BhY2UpCgoKdHJhaW5fZHRtIDwtIERvY3VtZW50VGVybU1hdHJpeCh0cmFpbl9jb3JwKQp0ZXN0X2R0bSA8LSBEb2N1bWVudFRlcm1NYXRyaXgodGVzdF9jb3JwKQoKCmNvbnZlcnQgPC0gZnVuY3Rpb24oeCkgewogIHkgPC0gaWZlbHNlKHggPiAwLCAxLDApCiAgeSA8LSBmYWN0b3IoeSwgbGV2ZWxzPWMoMCwxKSwgbGFiZWxzPWMoIk5vIiwgIlllcyIpKQogIHkKfQoKc21zLnRyYWluIDwtIGFwcGx5KHRyYWluX2R0bSwgMiwgY29udmVydCkKc21zLnRlc3QgPC0gYXBwbHkodGVzdF9kdG0sIDIsIGNvbnZlcnQpCmBgYAoKCiMjIE1vZGVsaW5nIGFuZCBQcmVkaWN0aW9uCgojIyMgTmFpdmUgQmF5ZXMgQ2xhc3NpZmljYXRpb24KYGBge3J9CiMgY2xhc3NpZmljYXRpb24gb2YgZW1haWwKbmFpdmVfY2xhc3NpZmllciA8LSBuYWl2ZUJheWVzKHNtcy50cmFpbiwgZmFjdG9yKHRyYWluJHR5cGUpKQpwcmVkX25haXZlIDwtIHByZWRpY3QobmFpdmVfY2xhc3NpZmllciwgbmV3ZGF0YT1zbXMudGVzdCkKbmFpdmVfbWF0cml4PC10YWJsZShwcmVkX25haXZlLCB0ZXN0JHR5cGUpCm5haXZlX21hdHJpeApgYGAKCgojIyBBY2N1cmFjeSBvZiBOYWl2ZSBDbGFzc2lmaWNhdGlvbiAKCkFjY3VyYWN5ID0gKFRQICsgVE4pL1RvdGFsCmBgYHtyfQpUUDwtIDc1MCArIDE1MApUTjwtIDAgKyAwCkFjY3VyYWN5X3JhdGU8LShUUCArIFROKS9zdW0obmFpdmVfbWF0cml4KSoxMDAKCmNvbmZ1c2lvbk1hdHJpeChwcmVkX25haXZlLCBhcy5mYWN0b3IodGVzdCR0eXBlKSwgcG9zaXRpdmUgPSAnc3BhbScsIGRubiA9IGMoJ3ByZWQnLCdhY3R1YWwnKSkKCnByaW50KHBhc3RlMCgnVGhlIGFjY3VyYWN5X3JhdGUgb2YgbmFpdmUgY2xhc3NpZmljYXRpb24gbW9kZWwgaXMgJyAsIEFjY3VyYWN5X3JhdGUsJyUnKSkKCmBgYAoKCiMjIENvbmNsdXNpb24KCk5haXZlIEJheWVzIENsYXNzaWZpY2F0aW9uIGlzIGNvbnNpZGVyZWQgb25lIG9mIHRoZSBiZXN0IG9wdGlvbnMgdG8gZG8gY2xhc3NpZmljYXRpb24gYW5hbHlzaXMuIEl0IGFsbG93ZWQgbWUgdG8gZ2V0IHRoZSAxMDAlIG9mIAphY2N1cmFjeSBvZiB0aGUgc3BhbSBlbWFpbCBkZXRlY3Rpb24uIEFjY29yZGluZyB0byB0aGUgY29uZnVzaW9uIG1hdHJpeCBmdW5jdGlvbiwgdGhlIG1vZGVsIGNsYXNzaWZpZWQgMTAwICUgb2Ygc3BhbXMgYW5kIGhhbXMgY29ycmVjdGx5LgoK