For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/
suppressMessages(library(stringr))
## Warning: package 'stringr' was built under R version 4.1.2
suppressMessages(library(dplyr))
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.1.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2
library(tidyr)
suppressMessages(library(wordcloud))
## Warning: package 'wordcloud' was built under R version 4.1.3
library(readr)
library(purrr)
library(tm)
## Warning: package 'tm' was built under R version 4.1.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 4.1.1
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
suppressMessages(library(data.table))
## Warning: package 'data.table' was built under R version 4.1.3
suppressMessages(library(magrittr))
## Warning: package 'magrittr' was built under R version 4.1.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.3
suppressMessages(library(caret))
## Warning: package 'caret' was built under R version 4.1.3
# Folders that hold the raw spam and ham message files.
spam_folder <- 'C:/Users/Uzma/CUNY_SPS_PROJECTS/Data_607_Project4/spam/'
ham_folder <- 'C:/Users/Uzma/CUNY_SPS_PROJECTS/Data_607_Project4/easy_ham/'

# Full paths of every file in each folder; these are the paths read later on.
spam_files <- list.files(path = spam_folder, full.names = TRUE)
ham_files <- list.files(path = ham_folder, full.names = TRUE)

# Sanity check: how many files were found in the spam folder.
length(spam_files)
## [1] 500
# Build the spam table: one row per file, holding the file name, the whole
# message as a single string, the label "spam" and a numeric flag of 1.
# The file names and the contents read from `spam_files` are matched purely by
# position, so both listings must come back in the same order.
spam <- data.frame(file = list.files(path = spam_folder)) %>%
  # One list element of lines per file, then expanded to one row per line.
  mutate(text = lapply(spam_files, read_lines)) %>%
  unnest(c(text)) %>%
  mutate(class = "spam", spam = 1) %>%
  # Join every line of a file with single spaces ...
  group_by(file) %>%
  mutate(text = paste(text, collapse = " ")) %>%
  ungroup() %>%
  # ... and collapse the resulting identical per-line rows into one per file.
  distinct()

# Preview the first few spam rows.
head(spam)
## # A tibble: 6 x 4
## file text class spam
## <chr> <chr> <chr> <dbl>
## 1 0001.bfc8d64d12b325ff~ "From 12a1mailbot1@web.de Thu Aug 22 13:1~ spam 1
## 2 0002.24b47bb3ce90708a~ "From ilug-admin@linux.ie Thu Aug 22 13:2~ spam 1
## 3 0003.4b3d943b8df71af2~ "From sabrina@mx3.1premio.com Thu Aug 22 ~ spam 1
## 4 0004.1874ab60c71f0b31~ "From wsup@playful.com Thu Aug 22 16:17:0~ spam 1
## 5 0005.1f42bb885de0ef7f~ "From social-admin@linux.ie Thu Aug 22 16~ spam 1
## 6 0006.7a32642f8c22bbeb~ "From Thecashsystem@firemail.de Thu Aug 2~ spam 1
# Build the ham table: one row per file, holding the file name, the whole
# message as a single string, the label "ham" and a numeric flag of 0.
# The file names and the contents read from `ham_files` are matched purely by
# position, so both listings must come back in the same order.
ham <- data.frame(file = list.files(path = ham_folder)) %>%
  # One list element of lines per file, then expanded to one row per line.
  mutate(text = lapply(ham_files, read_lines)) %>%
  unnest(c(text)) %>%
  mutate(class = "ham", spam = 0) %>%
  # Join every line of a file with single spaces ...
  group_by(file) %>%
  mutate(text = paste(text, collapse = " ")) %>%
  ungroup() %>%
  # ... and collapse the resulting identical per-line rows into one per file.
  distinct()

# Preview the first few ham rows.
head(ham)
## # A tibble: 6 x 4
## file text class spam
## <chr> <chr> <chr> <dbl>
## 1 00001.7c53336b37003a92~ "From exmh-workers-admin@redhat.com Thu ~ ham 0
## 2 00002.9c4069e25e1ef370~ "From Steve_Burt@cursor-system.com Thu A~ ham 0
## 3 00003.860e3c3cee1b42ea~ "From timc@2ubh.com Thu Aug 22 13:52:59 ~ ham 0
## 4 00004.864220c5b6930b20~ "From irregulars-admin@tb.tf Thu Aug 22 ~ ham 0
## 5 00005.bf27cdeaf0b8c464~ "From Stewart.Smith@ee.ed.ac.uk Thu Aug ~ ham 0
## 6 00006.253ea2f9a9cc36fa~ "From martin@srv0.ems.ed.ac.uk Thu Aug 2~ ham 0
# Stack the ham and spam tables into one data set, label columns first.
ham_spam <- rbind(ham, spam) %>%
  select(class, spam, file, text)

# Replace every run of carriage returns, newlines or tabs with a single space.
# str_replace_all() is required here: str_replace() only rewrites the first
# match in each string, leaving later control characters in place. A space
# (rather than "") keeps the tokens on either side of the run separate.
ham_spam$text <- str_replace_all(ham_spam$text, "[\\r\\n\\t]+", " ")
# tm transformer that swaps each punctuation character for a space, so that
# tokens joined by punctuation are split instead of being fused together.
replacePunctuation <- content_transformer(function(x) {
  gsub("[[:punct:]]", " ", x)
})

# Build the corpus once (one document per message) and normalise it:
# lower-case, drop English stop words, replace punctuation with spaces,
# remove digits, then collapse repeated whitespace.
corpus <- Corpus(VectorSource(ham_spam$text)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(replacePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., content_transformer(tolower)): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(., replacePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
# Document-term matrix of the cleaned corpus, with rare terms removed.
# The sparsity threshold 1 - 10/N (N = number of documents) discards terms
# that are absent from all but roughly ten or fewer documents.
dtm <- corpus %>%
  DocumentTermMatrix() %>%
  removeSparseTerms(sparse = 1 - (10 / length(corpus)))

# Documents x retained terms.
dim(dtm)
## [1] 3001 5298
# Dense data frame of term counts (one row per message, one column per term)
# with the ham/spam label added as a factor in the first column.
# as.matrix() on the document-term matrix already yields numeric counts, so no
# per-column numeric coercion is needed afterwards.
# NOTE(review): if the vocabulary contains a term literally named "class",
# the mutate() below overwrites that term's counts with the label - confirm
# whether that term matters for the model.
ham_spam_dtm <- dtm %>%
  as.matrix() %>%
  as.data.frame() %>%
  mutate(class = factor(ham_spam$class)) %>%
  select(class, everything())
# Reproducible random 80/20 split of the rows into training and test sets.
set.seed(1500)
sample_size <- floor(nrow(ham_spam_dtm) * 0.8)
index <- sample(seq_len(nrow(ham_spam_dtm)), size = sample_size)

# Rows drawn by `index` train the model; all remaining rows are held out.
dtm_train <- ham_spam_dtm[index, ]
dtm_test <- ham_spam_dtm[-index, ]

# Keep the true labels of each partition for fitting and evaluation.
train_labels <- dtm_train$class
test_labels <- dtm_test$class

# Class balance of the training partition.
prop.table(table(train_labels))
## train_labels
## ham spam
## 0.8341667 0.1658333
# Convert term counts into presence/absence indicators ("No" = count of zero,
# "Yes" = term occurs at least once) for the categorical naive Bayes model.
# Term columns are selected by name (everything except the label) instead of a
# hard-coded column range, so the step keeps working when the vocabulary size
# changes. Each column becomes a factor with the fixed levels No/Yes so that
# the training and test sets share the same coding even when a term takes only
# one of the two values in a partition.
term_cols <- setdiff(names(dtm_train), "class")
to_presence <- function(counts) {
  factor(ifelse(counts == 0, "No", "Yes"), levels = c("No", "Yes"))
}
dtm_train[term_cols] <- lapply(dtm_train[term_cols], to_presence)
dtm_test[term_cols] <- lapply(dtm_test[term_cols], to_presence)
# Fit and evaluate the naive Bayes spam classifier.
# The label column `class` must not be part of the predictors: passing the
# full data frame hands the model the answer it is supposed to predict (label
# leakage) and inflates every metric. Only the term columns are used, for both
# fitting and prediction.
# Metrics recorded from earlier runs that included the label as a feature will
# not match; re-run this step to refresh them.
feature_cols <- setdiff(names(dtm_train), "class")
model_classifier <- naiveBayes(dtm_train[feature_cols], train_labels)
test_pred <- predict(model_classifier, dtm_test[feature_cols])

# Confusion matrix on the held-out messages, treating "spam" as the positive
# class.
confusionMatrix(test_pred, test_labels, positive = "spam",
                dnn = c("Prediction", "Actual"))
## Confusion Matrix and Statistics
##
## Actual
## Prediction ham spam
## ham 497 1
## spam 2 101
##
## Accuracy : 0.995
## 95% CI : (0.9855, 0.999)
## No Information Rate : 0.8303
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9824
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9902
## Specificity : 0.9960
## Pos Pred Value : 0.9806
## Neg Pred Value : 0.9980
## Prevalence : 0.1697
## Detection Rate : 0.1681
## Detection Prevalence : 0.1714
## Balanced Accuracy : 0.9931
##
## 'Positive' Class : spam
##
The model classifies 99.5% of the test emails into the correct category. The sensitivity of 99.0% means that 99.0% of the spam emails were classified correctly, and the specificity of 99.6% means that 99.6% of the ham emails were classified correctly.