## Loading libraries
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidyr)
library(tm)
## Warning: package 'tm' was built under R version 4.3.3
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.3.3
## Loading required package: RColorBrewer
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 4.3.3
## naivebayes 1.0.0 loaded
## For more information please visit:
## https://majkamichal.github.io/naivebayes/
# set the directories containing the spam and hard ham email files
spam_directory <- "C:/Users/ajay2/Downloads/20050311_spam_2/spam_2"
hard_ham_directory <- "C:/Users/ajay2/Downloads/20030228_hard_ham/hard_ham"
spam_files <- list.files(spam_directory)
hard_ham_files <- list.files(hard_ham_directory)
# hard_ham folder files
# list the directory, read every line of each document, wrap the results in a VectorSource, and build a VCorpus from that source
hard_ham_corpus <- hard_ham_directory %>%
paste(., list.files(.), sep = "/") %>%
lapply(readLines) %>%
VectorSource() %>%
VCorpus()
## Warning in FUN(X[[i]], ...): incomplete final line found on
## 'C:/Users/ajay2/Downloads/20030228_hard_ham/hard_ham/00228.0eaef7857bbbf3ebf5edbbdae2b30493'
# spam folder files
# list the directory, read every line of each document, wrap the results in a VectorSource, and build a VCorpus from that source
spam_corpus <- spam_directory %>%
paste(., list.files(.), sep = "/") %>%
lapply(readLines) %>%
VectorSource() %>%
VCorpus()
# hard ham emails
# the tm package expects all text to be UTF-8; convert everything to that
# encoding, otherwise later transformations fail with an "invalid UTF-8" error
hard_ham_corpus <- tm_map(hard_ham_corpus, content_transformer(function(x) iconv(enc2utf8(x), sub = "byte")))
# preprocessing before analysis: remove numbers, convert to lower case, strip punctuation, remove stop words, collapse extra whitespace, and stem the terms
hard_ham_corpus <- hard_ham_corpus %>%
tm_map(removeNumbers) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removePunctuation) %>%
tm_map(removeWords, stopwords()) %>%
tm_map(stripWhitespace) %>%
tm_map(stemDocument)
# spam emails
# Convert text data to UTF-8 encoding
spam_corpus <- tm_map(spam_corpus, content_transformer(function(x) iconv(enc2utf8(x), sub = "byte")))
# preprocessing before analysis: remove numbers, convert to lower case, strip punctuation, remove stop words, collapse extra whitespace, and stem the terms
spam_corpus <- spam_corpus %>%
tm_map(removeNumbers) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removePunctuation) %>%
tm_map(removeWords, stopwords()) %>%
tm_map(stripWhitespace) %>%
tm_map(stemDocument)
ham_or_spam_corpus <- c(hard_ham_corpus, spam_corpus)
# term-document matrix:
# rows are the terms
# columns are the documents they appear in
doc_term <- TermDocumentMatrix(ham_or_spam_corpus)
# Convert the term-document matrix to an ordinary matrix for inspection
doc_term_mtx<- as.matrix(doc_term)
glimpse(doc_term_mtx)
## num [1:90491, 1:1646] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "dimnames")=List of 2
## ..$ Terms: chr [1:90491] "\033b\033b" "\033b\033bservic" "\033babjnhmqurdkhgannvncgooj\033b" "\033bank\033bservic" ...
## ..$ Docs : chr [1:1646] "1" "2" "3" "4" ...
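With roughly 90,000 distinct terms the matrix is extremely sparse. As an optional aside (a sketch, not part of the workflow above), tm's findFreqTerms() and removeSparseTerms() can be used to inspect or shrink it:
findFreqTerms(doc_term, lowfreq = 100) # terms that occur at least 100 times across the whole corpus
doc_term_small <- removeSparseTerms(doc_term, 0.99) # keep only terms appearing in more than 1% of documents
dim(doc_term_small)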
wordcloud(ham_or_spam_corpus, max.words = 100, random.order = FALSE, rot.per=0.15, min.freq=5, colors = brewer.pal(8, "Dark2"))
I use the Naive Bayes classification method, which uses the appearance of words in the matrix to predict whether an email is spam or ham.
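To make the idea concrete, here is a toy Bayes-rule calculation; the word counts and equal priors below are hypothetical, not taken from this corpus:
# P(spam | word) = P(word | spam) * P(spam) / P(word)  -- purely illustrative numbers
p_word_given_spam <- 40 / 100 # word appears in 40 of 100 spam emails (hypothetical)
p_word_given_ham <- 2 / 100 # word appears in 2 of 100 ham emails (hypothetical)
p_spam <- 0.5 # assume equal class priors
p_ham <- 0.5
p_spam_given_word <- (p_word_given_spam * p_spam) / (p_word_given_spam * p_spam + p_word_given_ham * p_ham)
p_spam_given_word # roughly 0.95, so this word would be strong evidence of spam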
# build data frames holding the email text and a class label (stored in the "email" column)
df_ham <- as.data.frame(unlist(hard_ham_corpus), stringsAsFactors = FALSE)
df_ham$type <- "ham"
colnames(df_ham) <- c("text", "email")
df_spam <- as.data.frame(unlist(spam_corpus), stringsAsFactors = FALSE)
df_spam$type <- "spam"
colnames(df_spam) <- c("text", "email")
df_ham_or_spam <- rbind(df_ham, df_spam)
head(df_ham_or_spam)
## text email
## 1 returnpath foolmotleyfoolcom ham
## 2 deliveryd wed jan ham
## 3 returnpath expresserrorsmotleyfoolcom ham
## 4 receiv hmailhomecom ham
## 5 femailsdcsfbahomecom ham
## 6 intermail vm esmtp ham
sample_size <- floor(0.80 * nrow(df_ham_or_spam)) # selecting sample size of 80% of the data for training.
set.seed(123)
df_ham_or_spam$text[df_ham_or_spam$text == ""] <- "NaN" # replace empty strings with a placeholder so no document is empty
train <- sample(seq_len(nrow(df_ham_or_spam)), size = sample_size)
train_ham_or_spam <- df_ham_or_spam[train, ]
test_ham_or_spam <- df_ham_or_spam[-train, ]
train_corpus <- Corpus(VectorSource(train_ham_or_spam$text)) # corpus of training data
test_corpus <- Corpus(VectorSource(test_ham_or_spam$text)) # corpus of test data
# Corpus cleaning
train_corpus <- train_corpus %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(removeWords, stopwords()) %>%
tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords()): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
test_corpus <- test_corpus %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(removeWords, stopwords()) %>%
tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords()): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
# Create document-term matrices for the training and test sets
train_tdm <- DocumentTermMatrix(train_corpus)
test_tdm <- DocumentTermMatrix(test_corpus)
# Split the training data into spam and ham subsets.
spam <- subset(train_ham_or_spam, email == "spam")
ham <- subset(train_ham_or_spam, email == "ham")
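A sketch of the model-fitting step that naturally follows; the frequency threshold of 5, the Yes/No recoding, and the restriction to terms shared by both matrices are my own illustrative choices, not necessarily the original approach:
# keep terms that are reasonably frequent in training and also present in the test matrix,
# so both matrices end up with the same feature columns
freq_terms <- intersect(findFreqTerms(train_tdm, lowfreq = 5), Terms(test_tdm))
train_x <- as.matrix(train_tdm[, freq_terms])
test_x <- as.matrix(test_tdm[, freq_terms])
# recode counts as presence/absence factors, which naive_bayes() treats as categorical features
to_yn <- function(x) factor(ifelse(x > 0, "Yes", "No"), levels = c("No", "Yes"))
train_x <- as.data.frame(lapply(as.data.frame(train_x), to_yn))
test_x <- as.data.frame(lapply(as.data.frame(test_x), to_yn))
# fit the Naive Bayes model with Laplace smoothing and evaluate on the held-out 20%
nb_model <- naive_bayes(x = train_x, y = factor(train_ham_or_spam$email), laplace = 1)
nb_pred <- predict(nb_model, newdata = test_x)
table(predicted = nb_pred, actual = test_ham_or_spam$email)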