Project 4

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/

Loading libraries

library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidyr)
library(tm)
## Warning: package 'tm' was built under R version 4.3.3
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.3.3
## Loading required package: RColorBrewer
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 4.3.3
## naivebayes 1.0.0 loaded
## For more information please visit: 
## https://majkamichal.github.io/naivebayes/

Loading files

# Folders holding the downloaded spam and hard-ham messages; the training and
# test sets are both drawn from these files later on.
spam_directory <- "C:/Users/ajay2/Downloads/20050311_spam_2/spam_2"
hard_ham_directory <- "C:/Users/ajay2/Downloads/20030228_hard_ham/hard_ham"

# File names (without the folder prefix) found in each directory
spam_files <- list.files(path = spam_directory)
hard_ham_files <- list.files(path = hard_ham_directory)

Create Corpus

# hard_ham folder files
# Build "<folder>/<file>" for every file in the folder, read each file as a
# character vector of lines, wrap the resulting list in a source object and
# create a VCorpus from that source (one document per file).
hard_ham_corpus <- local({
  file_paths <- paste(
    hard_ham_directory,
    list.files(hard_ham_directory),
    sep = "/"
  )
  file_lines <- lapply(file_paths, readLines)
  VCorpus(VectorSource(file_lines))
})
## Warning in FUN(X[[i]], ...): incomplete final line found on
## 'C:/Users/ajay2/Downloads/20030228_hard_ham/hard_ham/00228.0eaef7857bbbf3ebf5edbbdae2b30493'
# spam folder files
# Build "<folder>/<file>" for every file in the folder, read each file as a
# character vector of lines, wrap the resulting list in a source object and
# create a VCorpus from that source (one document per file).
spam_corpus <- local({
  file_paths <- paste(
    spam_directory,
    list.files(spam_directory),
    sep = "/"
  )
  file_lines <- lapply(file_paths, readLines)
  VCorpus(VectorSource(file_lines))
})

Tidying the data

# Both corpora go through exactly the same cleaning steps, so the steps live
# in one helper that is applied to each corpus in turn.
#
# Steps, in order:
#   1. re-encode the text (tm raised an "invalid UTF-8" error without this;
#      bytes that cannot be converted are substituted via sub = "byte")
#   2. remove numbers
#   3. convert to lower case
#   4. remove punctuation
#   5. remove stop words
#   6. collapse extra whitespace
#   7. stem the remaining terms
tidy_email_corpus <- function(corpus) {
  to_utf8 <- content_transformer(
    function(x) iconv(enc2utf8(x), sub = "byte")
  )

  corpus <- tm_map(corpus, to_utf8)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords())
  corpus <- tm_map(corpus, stripWhitespace)
  tm_map(corpus, stemDocument)
}

# hard ham emails
hard_ham_corpus <- tidy_email_corpus(hard_ham_corpus)

# spam emails
spam_corpus <- tidy_email_corpus(spam_corpus)

Combine the two corpora

# Concatenate the two corpora: the hard-ham documents come first, followed by
# the spam documents.
ham_or_spam_corpus <- c(hard_ham_corpus, spam_corpus)

Create the Document Matrix

# Term-document matrix of the combined corpus.
# Note the orientation of TermDocumentMatrix():
#   rows    are the terms
#   columns are the documents
doc_term <- TermDocumentMatrix(ham_or_spam_corpus)

# Convert the term-document object to a plain matrix so it can be inspected.
# NOTE(review): this is a dense terms x documents matrix of numeric counts,
# so it needs far more memory than doc_term itself on a large vocabulary.
doc_term_mtx<- as.matrix(doc_term)

glimpse(doc_term_mtx)
##  num [1:90491, 1:1646] 0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ Terms: chr [1:90491] "\033b\033b" "\033b\033bservic" "\033babjnhmqurdkhgannvncgooj\033b" "\033bank\033bservic" ...
##   ..$ Docs : chr [1:1646] "1" "2" "3" "4" ...
# Word cloud of the combined corpus: at most 100 words, each occurring at
# least 5 times, most frequent words placed first, 15% of words rotated.
wordcloud(
  ham_or_spam_corpus,
  min.freq = 5,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.15,
  colors = brewer.pal(8, "Dark2")
)

Prepare the model

I am using a Naive Bayes classifier, which uses how often each word appears in the document-term matrix to predict whether an email is spam or ham.

Create a data frame for the ham terms and another for the spam terms, then combine them into one.

# Build one labelled data frame with exactly one row per email.
#
# Calling unlist() on a corpus flattens every message into its individual
# lines, so each *line* became a row carrying the ham/spam label. That
# inflates the row count enormously and lets lines of the same email end up in
# both the training and the test split. Collapsing each document's lines into
# a single string keeps "one row = one email".
#
# corpus: a tm corpus whose documents hold character vectors of lines
# label:  class label written to the `email` column ("ham" or "spam")
# Returns a data frame with two character columns: `text` and `email`.
corpus_to_df <- function(corpus, label) {
  text <- vapply(
    corpus,
    function(doc) paste(as.character(doc), collapse = " "),
    character(1),
    USE.NAMES = FALSE
  )
  data.frame(
    text = text,
    # rep() keeps the column lengths equal even for an empty corpus
    email = rep(label, length(text)),
    stringsAsFactors = FALSE
  )
}

df_ham <- corpus_to_df(hard_ham_corpus, "ham")
df_spam <- corpus_to_df(spam_corpus, "spam")

# Stack ham on top of spam; rows are shuffled later by the train/test split
df_ham_or_spam <- rbind(df_ham, df_spam)

head(df_ham_or_spam)
##                                    text email
## 1          returnpath foolmotleyfoolcom   ham
## 2                     deliveryd wed jan   ham
## 3 returnpath expresserrorsmotleyfoolcom   ham
## 4                   receiv hmailhomecom   ham
## 5                  femailsdcsfbahomecom   ham
## 6                    intermail vm esmtp   ham

Create the training and test data frames, then build a corpus from each

# Fix the random seed so the train/test split below is reproducible
set.seed(123)

# 80% of the rows go to training, the remainder to testing
sample_size <- floor(nrow(df_ham_or_spam) * 0.80)

# Rows whose text is the empty string get the literal placeholder "NaN"
df_ham_or_spam$text <- replace(
  df_ham_or_spam$text,
  df_ham_or_spam$text == "",
  "NaN"
)

# Row indices drawn for the training set
train <- sample.int(nrow(df_ham_or_spam), size = sample_size)

train_ham_or_spam <- df_ham_or_spam[train, ]
test_ham_or_spam <- df_ham_or_spam[-train, ]

# One corpus per split, built from the text column
train_corpus <- Corpus(VectorSource(train_ham_or_spam$text))
test_corpus <- Corpus(VectorSource(test_ham_or_spam$text))

# Corpus cleaning (training data): drop numbers, punctuation and stop words,
# then collapse repeated whitespace
train_corpus <- tm_map(train_corpus, removeNumbers)
train_corpus <- tm_map(train_corpus, removePunctuation)
train_corpus <- tm_map(train_corpus, removeWords, stopwords())
train_corpus <- tm_map(train_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords()): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
# Corpus cleaning (test data): same steps as for the training corpus
test_corpus <- tm_map(test_corpus, removeNumbers)
test_corpus <- tm_map(test_corpus, removePunctuation)
test_corpus <- tm_map(test_corpus, removeWords, stopwords())
test_corpus <- tm_map(test_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords()): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
# Create document term matrices (rows = documents, columns = terms)

train_tdm <- DocumentTermMatrix(train_corpus)

# Tabulate the test documents against the training vocabulary only. Built
# independently, the test matrix would get its own set of term columns, and a
# model fitted on train_tdm could not be applied to it column-for-column.
test_tdm <- DocumentTermMatrix(
  test_corpus,
  control = list(dictionary = Terms(train_tdm))
)

# Separate training data to spam and ham.
spam <- subset(train_ham_or_spam, email == "spam")
ham <- subset(train_ham_or_spam, email == "ham")

Build the model

I tried to build the model, but it did not run and gave me this error:

Error: cannot allocate vector of size 144.1 Gb

I could not get past the error, so I could not build a model or make predictions.

Conclusion

  • I downloaded the data and saved it to my folders.
  • I created the corpus.
  • I tidied the data.
  • I created the document-term matrix and prepared the word cloud.
  • I see that the following are the most used words: width, receiv, size, tabl, height, font, widthd.