For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/

Loading the libraries:

suppressMessages(library(stringr))
## Warning: package 'stringr' was built under R version 4.1.2
suppressMessages(library(dplyr))
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.1.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2
library(tidyr)
suppressMessages(library(wordcloud))
## Warning: package 'wordcloud' was built under R version 4.1.3
library(readr)
library(purrr)
library(tm)
## Warning: package 'tm' was built under R version 4.1.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 4.1.1
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
suppressMessages(library(data.table))
## Warning: package 'data.table' was built under R version 4.1.3
suppressMessages(library(magrittr))
## Warning: package 'magrittr' was built under R version 4.1.3
library(e1071) 
## Warning: package 'e1071' was built under R version 4.1.3
suppressMessages(library(caret))
## Warning: package 'caret' was built under R version 4.1.3

Store the paths of the directories that hold the spam and ham messages in ‘spam_folder’ and ‘ham_folder’.

# Local copies of the SpamAssassin folders (machine-specific paths; adjust
# them when running on another computer).
ham_folder <- "C:/Users/Uzma/CUNY_SPS_PROJECTS/Data_607_Project4/easy_ham/"
spam_folder <- "C:/Users/Uzma/CUNY_SPS_PROJECTS/Data_607_Project4/spam/"

# Number of files found in the spam folder.
spam_folder %>%
  list.files() %>%
  length()
## [1] 500

Calling ‘list.files’ on each folder with ‘full.names = TRUE’ produces a character vector containing the full path of every file in that folder.

# Full path of every message file, one character vector per folder.
ham_files <- list.files(ham_folder, full.names = TRUE)
spam_files <- list.files(spam_folder, full.names = TRUE)

Read the files into a data frame and assign column names

# Build one row per spam message: file name, full text collapsed to a single
# string, and the class label in both character and numeric form.
#
# File names are derived from `spam_files` itself (instead of listing the
# directory a second time) so that names and contents cannot get out of step,
# and each file is collapsed right after reading rather than being expanded
# to one row per line and de-duplicated afterwards.
spam <- tibble(
  file = basename(spam_files),
  lines = map(spam_files, read_lines)
) %>%
  # Files with no lines at all are dropped, as the previous unnest() step did.
  filter(lengths(lines) > 0) %>%
  transmute(
    file,
    text = map_chr(lines, paste, collapse = " "),
    class = "spam",
    spam = 1
  )

head(spam)
## # A tibble: 6 x 4
##   file                   text                                        class  spam
##   <chr>                  <chr>                                       <chr> <dbl>
## 1 0001.bfc8d64d12b325ff~ "From 12a1mailbot1@web.de  Thu Aug 22 13:1~ spam      1
## 2 0002.24b47bb3ce90708a~ "From ilug-admin@linux.ie  Thu Aug 22 13:2~ spam      1
## 3 0003.4b3d943b8df71af2~ "From sabrina@mx3.1premio.com  Thu Aug 22 ~ spam      1
## 4 0004.1874ab60c71f0b31~ "From wsup@playful.com  Thu Aug 22 16:17:0~ spam      1
## 5 0005.1f42bb885de0ef7f~ "From social-admin@linux.ie  Thu Aug 22 16~ spam      1
## 6 0006.7a32642f8c22bbeb~ "From Thecashsystem@firemail.de  Thu Aug 2~ spam      1

The ‘lapply’ function takes a list, vector or data frame as input and returns a list with one element per input element.

# Build one row per ham message: file name, full text collapsed to a single
# string, and the class label in both character and numeric form.
#
# File names are derived from `ham_files` itself (instead of listing the
# directory a second time) so that names and contents cannot get out of step,
# and each file is collapsed right after reading rather than being expanded
# to one row per line and de-duplicated afterwards.
ham <- tibble(
  file = basename(ham_files),
  lines = map(ham_files, read_lines)
) %>%
  # Files with no lines at all are dropped, as the previous unnest() step did.
  filter(lengths(lines) > 0) %>%
  transmute(
    file,
    text = map_chr(lines, paste, collapse = " "),
    class = "ham",
    spam = 0
  )

head(ham)
## # A tibble: 6 x 4
##   file                    text                                       class  spam
##   <chr>                   <chr>                                      <chr> <dbl>
## 1 00001.7c53336b37003a92~ "From exmh-workers-admin@redhat.com  Thu ~ ham       0
## 2 00002.9c4069e25e1ef370~ "From Steve_Burt@cursor-system.com  Thu A~ ham       0
## 3 00003.860e3c3cee1b42ea~ "From timc@2ubh.com  Thu Aug 22 13:52:59 ~ ham       0
## 4 00004.864220c5b6930b20~ "From irregulars-admin@tb.tf  Thu Aug 22 ~ ham       0
## 5 00005.bf27cdeaf0b8c464~ "From Stewart.Smith@ee.ed.ac.uk  Thu Aug ~ ham       0
## 6 00006.253ea2f9a9cc36fa~ "From martin@srv0.ems.ed.ac.uk  Thu Aug 2~ ham       0

Using the rbind() function to combine the ham and spam data frames

# Stack ham on top of spam and put the label columns first.
ham_spam <- rbind(ham, spam)[c("class", "spam", "file", "text")]

Tidy the text with stringr (carriage returns, newlines and tabs) and define a helper that replaces punctuation with spaces

# Replace every run of carriage returns, newlines and tabs with one space.
# str_replace() only touches the first match in each string, and an empty
# replacement can glue the neighbouring words together, so use
# str_replace_all() with a space instead.
ham_spam$text <- str_replace_all(ham_spam$text, "[\\r\\n\\t]+", " ")

# tm transformer that turns each punctuation character into a space, so that
# tokens separated only by punctuation stay separate.
replacePunctuation <- content_transformer(function(x) {
  gsub("[[:punct:]]", " ", x)
})

Document Term Matrix:

# Build the corpus once (the earlier duplicate construction was immediately
# overwritten) and normalise it: lower-case, drop English stop words, replace
# punctuation with spaces, drop digits, then squeeze repeated whitespace.
# Stop words are removed before punctuation so that contractions in the
# stop-word list still match.
corpus <- Corpus(VectorSource(ham_spam$text)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(replacePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., content_transformer(tolower)): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(., replacePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
# Document-term matrix of the cleaned corpus, pruned with a sparsity
# threshold of 1 - 10 / (number of documents), i.e. terms found in only
# a handful of documents (roughly ten or fewer) are discarded.
dtm <- corpus %>%
  DocumentTermMatrix() %>%
  removeSparseTerms(sparse = 1 - 10 / length(corpus))

# Rows = documents, columns = retained terms.
dim(dtm)
## [1] 3001 5298

Now Training and Testing Data

# Dense data frame version of the document-term matrix: every term column is
# coerced to numeric, and the class label is added as a factor in the first
# column.
ham_spam_dtm <- as.data.frame(as.matrix(dtm)) %>%
  sapply(as.numeric) %>%
  as.data.frame() %>%
  mutate(class = as.factor(ham_spam$class)) %>%
  select(class, everything())

Creating training and testing set

# Reproducible 80/20 split: draw 80% of the row numbers at random for
# training and keep the remaining rows for testing.
set.seed(1500)
sample_size <- floor(nrow(ham_spam_dtm) * 0.8)
index <- sample.int(nrow(ham_spam_dtm), size = sample_size)

dtm_test <- ham_spam_dtm[-index, ]
dtm_train <- ham_spam_dtm[index, ]

Extract the class labels of the training and testing sets

# Class labels (factor) for each split, kept separately for fitting and
# for evaluation.
test_labels <- dtm_test[["class"]]
train_labels <- dtm_train[["class"]]

Proportion of ham and spam messages in the training set

# Share of each class among the training labels.
table(train_labels) %>%
  prop.table()
## train_labels
##       ham      spam 
## 0.8341667 0.1658333

Training the Model Using Naive Bayes model:

# Term columns are every column except the label. Selecting them by name
# (rather than with a hard-coded 2:5298 range) keeps this step correct when
# the vocabulary size changes.
term_cols <- setdiff(names(dtm_train), "class")

# Recode term counts to presence/absence ("No" = term absent, "Yes" = term
# present) so that naiveBayes() treats each term as a categorical feature.
dtm_train[, term_cols] <- ifelse(dtm_train[, term_cols] == 0, "No", "Yes")
dtm_test[, term_cols] <- ifelse(dtm_test[, term_cols] == 0, "No", "Yes")

# Fit on the term columns only. The `class` column must not be part of the
# predictors: passing it in lets the model see the answer it is asked to
# predict (label leakage) and inflates every evaluation metric.
# NOTE: metrics recorded from earlier runs were computed with the label
# included as a predictor; re-run this chunk to refresh them.
model_classifier <- naiveBayes(
  dtm_train[, term_cols, drop = FALSE],
  train_labels
)

test_pred <- predict(model_classifier, dtm_test[, term_cols, drop = FALSE])

# Compare predictions with the held-out labels, treating "spam" as the
# positive class.
confusionMatrix(test_pred, test_labels, positive = "spam",
                dnn = c("Prediction", "Actual"))
## Confusion Matrix and Statistics
## 
##           Actual
## Prediction ham spam
##       ham  497    1
##       spam   2  101
##                                          
##                Accuracy : 0.995          
##                  95% CI : (0.9855, 0.999)
##     No Information Rate : 0.8303         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9824         
##                                          
##  Mcnemar's Test P-Value : 1              
##                                          
##             Sensitivity : 0.9902         
##             Specificity : 0.9960         
##          Pos Pred Value : 0.9806         
##          Neg Pred Value : 0.9980         
##              Prevalence : 0.1697         
##          Detection Rate : 0.1681         
##    Detection Prevalence : 0.1714         
##       Balanced Accuracy : 0.9931         
##                                          
##        'Positive' Class : spam           
## 

Conclusion:

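The model classified about 99.5% of the test emails into the correct category. The sensitivity of 99.0% means that 99.0% of the spam emails were correctly identified as spam, and the specificity of 99.6% means that 99.6% of the ham emails were correctly identified as ham. Note that these figures come from a run in which the `class` label column was still part of the predictor table passed to `naiveBayes()`, so they are likely to overstate how well the model would perform on unseen mail.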