It can be useful to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam. For this project, you can start with a spam/ham data set, then predict the class of new documents (either withheld from the training data set or from another source such as your own spam folder). Here, I will build a Naive Bayes model to predict whether or not a new email is spam.
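Under the hood, Naive Bayes scores each class by combining a class prior with per-word likelihoods, assuming the words are conditionally independent given the class:

P(spam | w1, …, wn) ∝ P(spam) × P(w1 | spam) × … × P(wn | spam)

An email is labeled spam when this score exceeds the corresponding score for ham, with the probabilities estimated from counts in the training data.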
library(tm)
## Loading required package: NLP
library(reader)
## Loading required package: NCmisc
##
## Attaching package: 'reader'
## The following objects are masked from 'package:NCmisc':
##
## cat.path, get.ext, rmv.ext
## The following object is masked from 'package:tm':
##
## reader
library(NLP)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(e1071)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(stringr)
The spam (20050311_spam_2) and ham (20030228_easy_ham) folders of raw email files were downloaded from the SpamAssassin public corpus at https://spamassassin.apache.org/old/publiccorpus/.
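The archives can also be fetched and unpacked directly from R. A minimal sketch, assuming the .tar.bz2 archive names on the corpus page match the folder names above:
base_url <- "https://spamassassin.apache.org/old/publiccorpus/"
for (f in c("20050311_spam_2.tar.bz2", "20030228_easy_ham.tar.bz2")) {
  download.file(paste0(base_url, f), f, mode = "wb")   # mode = "wb" matters on Windows
  untar(f)   # expected to extract into ./spam_2 and ./easy_ham
}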
spam_dir<-"F:\\CUNY masters\\assignment607\\project 4-spamham\\spam_2"
ham_dir<-"F:\\CUNY masters\\assignment607\\project 4-spamham\\easy_ham"
file1<-list.files(spam_dir)
file2<-list.files(ham_dir)
# Read every file in the spam folder and collapse each email into a single string
spamlist <- list()
for (i in seq_along(file1)) {
  path <- paste0(spam_dir, "/", file1[i])
  text <- readLines(path, warn = FALSE)
  spamlist <- c(spamlist, list(paste(text, collapse = "\n")))
}
# Creating data frame
spam <- as.data.frame(unlist(spamlist), stringsAsFactors = FALSE)
spam$classification <- 1
colnames(spam) <- c("email", "classification")
# Read every file in the ham folder and collapse each email into a single string
hamlist <- list()
for (i in seq_along(file2)) {
  path <- paste0(ham_dir, "/", file2[i])
  text <- readLines(path, warn = FALSE)
  hamlist <- c(hamlist, list(paste(text, collapse = "\n")))
}
# Creating data frame
ham <- as.data.frame(unlist(hamlist), stringsAsFactors = FALSE)
ham$classification <- 0
colnames(ham) <- c("email", "classification")
df_final <- rbind(ham, spam)
prop.table(table(df_final$classification))
##
## 0 1
## 0.6415385 0.3584615
dfcorpus <- Corpus(VectorSource(df_final$email))
dfcorpus[[1]]$meta
## author : character(0)
## datetimestamp: 2022-11-25 03:20:49
## description : character(0)
## heading : character(0)
## id : 1
## language : en
## origin : character(0)
# Create an "addspace" function that finds a user-specified pattern and substitutes it with a space
addspace <- content_transformer(function(x, pattern) {
  return(gsub(pattern, " ", x))
})
# Replace "-" with space
my_corpus <- tm_map(dfcorpus, addspace, "-")
## Warning in tm_map.SimpleCorpus(dfcorpus, addspace, "-"): transformation drops
## documents
# Remove numbers
my_corpus<-tm_map(my_corpus,content_transformer(removeNumbers))
## Warning in tm_map.SimpleCorpus(my_corpus, content_transformer(removeNumbers)):
## transformation drops documents
# Collapse runs of whitespace into single spaces
my_corpus<-tm_map(my_corpus,stripWhitespace)
## Warning in tm_map.SimpleCorpus(my_corpus, stripWhitespace): transformation drops
## documents
# Convert text to lowercase
corpus1<-tm_map(my_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(my_corpus, content_transformer(tolower)):
## transformation drops documents
# Remove punctuation transformation
corpus2 <- tm_map(corpus1, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus1, removePunctuation): transformation drops
## documents
# Stem document transformation
corpus3<- tm_map(corpus2, stemDocument)
## Warning in tm_map.SimpleCorpus(corpus2, stemDocument): transformation drops
## documents
# Remove English stopwords (note: applied after stemming here, so some stemmed stopword forms may survive)
corpus4<-tm_map(corpus3, removeWords, stopwords("en"))
## Warning in tm_map.SimpleCorpus(corpus3, removeWords, stopwords("en")):
## transformation drops documents
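Before building the document-term matrix, it can be worth spot-checking one cleaned document to confirm the transformations behaved as intended; for example:
# Peek at the first 300 characters of the first cleaned document
cat(substr(as.character(corpus4[[1]]), 1, 300))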
dtm1 <- DocumentTermMatrix(corpus4)
# Reducing sparsity: keep only terms that appear in roughly 10 or more documents
dtm1 <- removeSparseTerms(dtm1, 1 - (10/length(corpus4)))
# Creating data frame
dtm1_df <- as.data.frame(as.matrix(dtm1))
dtm1_df$classification <- df_final$classification
# 30 most frequent terms across all emails
sort(colSums(dtm1_df), decreasing = TRUE)[1:30]
## receiv list esmtp sep
## 21641 15240 11549 9793
## localhost content size date
## 9246 6714 6498 6271
## messag mon aug postfix
## 6238 6013 5764 5628
## version wed oct jmlocalhost
## 5299 5268 5256 5227
## thu font ist mail
## 5194 4877 4841 4831
## subject type tue jul
## 4755 4629 4537 4434
## deliv return dogmaslashnullorg will
## 4393 4269 4252 4050
## path jalapeno
## 3799 3753
# 30 most frequent terms in spam emails
sort(colSums(dtm1_df %>% filter(classification == 1)), decreasing = TRUE)[1:30]
## receiv size font jul widthd esmtp tabl content
## 7411 6307 4794 4382 3547 3141 3125 2919
## width san mail will helvetica email may list
## 2913 2728 2695 2617 2520 2439 2419 2387
## serif messag type date color mon localhost facedari
## 2139 2131 2080 2067 1986 1978 1899 1862
## free subject can div version nsnet
## 1857 1855 1774 1725 1672 1624
# 30 most frequent terms in ham emails
sort(colSums(dtm1_df %>% filter(classification == 0)), decreasing = TRUE)[1:30]
## receiv list sep esmtp
## 14230 12853 9788 8408
## localhost oct postfix aug
## 7347 5251 4662 4476
## ist date jmlocalhost messag
## 4224 4204 4144 4107
## mon wed thu content
## 4035 3860 3839 3795
## jalapeno version deliv rpm
## 3705 3627 3540 3529
## exmh user fork dogmaslashnullorg
## 3364 3307 3093 3048
## tue subject return adminxentcom
## 3004 2900 2839 2775
## use path
## 2620 2584
# 80/20 train/test split
sample_size <- floor(0.80 * nrow(dtm1_df))
set.seed(123)
train <- sample(seq_len(nrow(dtm1_df)), size = sample_size)
#Training data set
dtm1_train <- dtm1_df[train, ]
# Test data set
dtm1_test <- dtm1_df[-train, ]
train_labels <- dtm1_train$classification
test_labels <- dtm1_test$classification
prop.table(table(train_labels))
## train_labels
## 0 1
## 0.6384615 0.3615385
prop.table(table(test_labels))
## test_labels
## 0 1
## 0.6538462 0.3461538
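The random split happens to preserve the class balance fairly well. Since caret is already loaded, a stratified split via createDataPartition would preserve the spam/ham ratio by construction; a sketch (not used for the results below):
# Alternative: stratified 80/20 split on the class label
set.seed(123)
idx <- createDataPartition(as.factor(dtm1_df$classification), p = 0.80, list = FALSE)
strat_train <- dtm1_df[idx, ]
strat_test <- dtm1_df[-idx, ]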
# Minimum frequency threshold: 0.1% of the number of documents
threshold <- 0.1
min_freq <- round(dtm1$nrow * (threshold/100), 0)
min_freq
## [1] 4
# Create vector of most frequent words
freq_words <- findFreqTerms(x = dtm1, lowfreq = min_freq)
str(freq_words)
## chr [1:5571] "abl" "actual" "adminredhatcom" "adminspamassassintaintorg" ...
# Filter the DTM
dtm1_freq_train <- dtm1_train[ , freq_words]
dtm1_freq_test <- dtm1_test[ , freq_words]
dim(dtm1_freq_train)
## [1] 3120 5571
dim(dtm1_freq_test)
## [1] 780 5571
The training and test data sets now include 5571 features, corresponding to terms that occur at least four times across the corpus (the min_freq computed above).
# Convert word counts to categorical "Yes"/"No" features, so naiveBayes treats
# word presence/absence as categorical rather than fitting a Gaussian to counts
convert_values <- function(x) {
  ifelse(x > 0, "Yes", "No")
}
train1 <- apply(dtm1_freq_train, MARGIN = 2, convert_values)
test1 <- apply(dtm1_freq_test, MARGIN = 2, convert_values)
# Train the Naive Bayes classifier, predict the held-out test set, and evaluate
email_classifier <- naiveBayes(train1, train_labels)
test_pred <- predict(email_classifier, test1)
confusionMatrix(test_pred, as.factor(test_labels))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 507 18
## 1 3 252
##
## Accuracy : 0.9731
## 95% CI : (0.9591, 0.9833)
## No Information Rate : 0.6538
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9397
##
## Mcnemar's Test P-Value : 0.00225
##
## Sensitivity : 0.9941
## Specificity : 0.9333
## Pos Pred Value : 0.9657
## Neg Pred Value : 0.9882
## Prevalence : 0.6538
## Detection Rate : 0.6500
## Detection Prevalence : 0.6731
## Balanced Accuracy : 0.9637
##
## 'Positive' Class : 0
##
By applying the Naive Bayes algorithm, I have created a model that predicts whether a given email is spam, reaching 97.31% accuracy on the test set. Of the misclassifications, 18 spam emails were missed (classified as ham) and 3 ham emails were wrongly flagged as spam.
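To close the loop on the original goal of classifying a brand-new email (say, one saved from your own spam folder), the same preprocessing must be applied before calling predict. A minimal sketch, assuming the new message lives in a hypothetical file new_email.txt:
# Hypothetical new message; the file name is illustrative
new_text <- paste(readLines("new_email.txt", warn = FALSE), collapse = "\n")
new_corpus <- Corpus(VectorSource(new_text))
# Apply the same cleaning pipeline used on the training corpus
new_corpus <- tm_map(new_corpus, addspace, "-")
new_corpus <- tm_map(new_corpus, content_transformer(removeNumbers))
new_corpus <- tm_map(new_corpus, stripWhitespace)
new_corpus <- tm_map(new_corpus, content_transformer(tolower))
new_corpus <- tm_map(new_corpus, removePunctuation)
new_corpus <- tm_map(new_corpus, stemDocument)
new_corpus <- tm_map(new_corpus, removeWords, stopwords("en"))
# Build the DTM over the training vocabulary so the feature columns line up
new_dtm <- DocumentTermMatrix(new_corpus, control = list(dictionary = freq_words))
new_mat <- as.matrix(new_dtm)
new_features <- ifelse(new_mat > 0, "Yes", "No")   # keeps the one-row matrix shape
predict(email_classifier, new_features)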