Project 4

Introduction

“For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder)”. I decided to use the spam/ham dataset provided by the instructor, I created a folder to unzip the files on it, and from there import it to R from my C drive.

Load libraries required for this project

I am going to use two new packages that I think it would be useful for this project, “NLP” which is used for Natural Language Processing, and also “tm”, a library that is used for collecting data from corpus. In addition, I am going to use the e1071 package which is new for me, this library is used for model prediction, it contains the Naive Bayes classifier that I will use for predicting this model, another new package to me is the CARET package, I will use it to produce a matrix to the clasifier.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tm)

## Warning: package 'tm' was built under R version 4.3.3

## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate

library(NLP)
library(e1071)

## Warning: package 'e1071' was built under R version 4.3.3

library(caret)

## Warning: package 'caret' was built under R version 4.3.3

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(magrittr)

## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

Import both easy_ham and spam files into R.

I’m using the list.files function which produces a character vector of the names of files or directories in the named directory.

spam_folder <- "C:/Users/vitug/OneDrive/Desktop/DATA_607/Project4/spamham/spam_folder"
easy_folder <- "C:/Users/vitug/OneDrive/Desktop/DATA_607/Project4/spamham/easy_folder"

length(list.files(path = spam_folder))

## [1] 1397

length(list.files(path = easy_folder))

## [1] 1401

Create a dataframe of the spam_folder, using list.files and lapply function, unnest the files by text and group by file.

spam_files <- list.files(path = spam_folder, full.names = TRUE)
spam <- list.files(path = spam_folder) %>%
  as.data.frame() %>%
  set_colnames("file") %>%
  mutate(text = lapply(spam_files, read_lines)) %>%
  unnest(c(text)) %>%
  mutate(class = "spam",
         spam = 1) %>%
  group_by(file) %>%
  mutate(text = paste(text, collapse = " ")) %>%
  ungroup() %>%
  distinct()

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `text = lapply(spam_files, read_lines)`.
## Caused by warning:
## ! One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

Perform the same operation with the easy_folder.

ham_files <- list.files(path = easy_folder, full.names = TRUE) 
ham <- list.files(path = easy_folder) %>%
  as.data.frame() %>%
  set_colnames("file") %>%
  mutate(text = lapply(ham_files, read_lines)) %>%
  unnest(c(text)) %>%
  mutate(class = "ham",
         spam = 0) %>%
  group_by(file) %>%
  mutate(text = paste(text, collapse = " ")) %>%
  ungroup() %>%
  distinct()

Cleaning and Tyding data using the rbind function to combine dataframe by rows,use the str_replace function to remove empty spaces in columns and rows, and the content_transformer function to replace puntuation signs with an space.

ham_spam <- rbind(ham, spam) %>%
  select(class, spam, file, text)

ham_spam$text <- ham_spam$text %>%
  str_replace(.,"[\\r\\n\\t]+", "")

replacePunctuation <- content_transformer(function(x) {return (gsub("[[:punct:]]", " ", x))})
head(ham_spam)

## # A tibble: 6 × 4
##   class  spam file                                   text                       
##   <chr> <dbl> <chr>                                  <chr>                      
## 1 ham       0 00001.1a31cc283af0060967a233d26548a6ce "Return-Path: <exmh-worker…
## 2 ham       0 00002.5a587ae61666c5aa097c8e866aedcc59 "From exmh-workers-admin@r…
## 3 ham       0 00003.19be8acd739ad589cd00d8425bac7115 "From exmh-workers-admin@r…
## 4 ham       0 00004.b2ed6c3c62bbdfab7683d60e214d1445 "From exmh-workers-admin@r…
## 5 ham       0 00005.07b9d4aa9e6c596440295a5170111392 "From exmh-workers-admin@r…
## 6 ham       0 00006.654c4ec7c059531accf388a807064363 "From exmh-workers-admin@r…

Create the corpus using TM library, and using some of the tm functions to clean the data on it, create a doxument term matrix, and removing some unused words(-10) with the remove sparse terms function

corpus <- Corpus(VectorSource(ham_spam$text)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(replacePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)

## Warning in tm_map.SimpleCorpus(., content_transformer(tolower)): transformation
## drops documents

## Warning in tm_map.SimpleCorpus(., removeWords, stopwords("english")):
## transformation drops documents

## Warning in tm_map.SimpleCorpus(., replacePunctuation): transformation drops
## documents

## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops
## documents

## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents

dtm <- DocumentTermMatrix(corpus)

dtm <- removeSparseTerms(dtm, 1-(10/length(corpus)))

inspect(dtm)

## <<DocumentTermMatrix (documents: 2798, terms: 5657)>>
## Non-/sparse entries: 440778/15387508
## Sparsity           : 97%
## Maximal term length: 33
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   com font http list localhost nbsp net org received size
##   1317   4    1  242   16        13    0 254   5       10    2
##   1380  18    0    5   18         8    0   0   3        9    2
##   1429 149 1627   80    4         0    0   8   5        2  273
##   1452 167   41   83   12         0  567  14   0        2   24
##   1478 165   41   93    0         0  283  24   4        2   24
##   2456  22  542    4    6         5   18  30   0       16  114
##   2480  19    0    9    5         5    0  25   2        8    0
##   2484  20  542    4    6         5   18  31   0       14  114
##   2491 204 1102  516    5         6  339  43   4        7  447
##   2492 204 1102  516    2         6  339  43   4        7  447

dim(dtm)

## [1] 2798 5657

I am going to convert the ‘DTM’ to a dataframe, also add a column to get the results of spam or not spam, and mutate the spam column into a factor.

mail_dtm <- dtm %>%
  as.matrix() %>%
  as.data.frame() %>%
  sapply(., as.numeric) %>%
  as.data.frame() %>%
  mutate(class = ham_spam$class) %>%
  select(class, everything())
  mail_dtm$class <- as.factor(mail_dtm$class)

Set the sample_size to 0.85 as a training data, set a sed, and setup the training DTM with the test data.

sample_size <- floor(0.85 * nrow(mail_dtm))

set.seed(2779)
index <- sample(seq_len(nrow(mail_dtm)), size = sample_size)
  
dtm_train <- mail_dtm[index,]
dtm_test <-  mail_dtm[-index,]

training_lab <- dtm_train$class
test_lab <- dtm_test$class

prop.table(table(training_lab))

## training_lab
##       ham      spam 
## 0.4987384 0.5012616

prop.table(table(test_lab))

## test_lab
##       ham      spam 
## 0.5119048 0.4880952

Finalize the training model using the Naive Bayes Model, dispalying the results in the table called matrix and statistics.

dtm_train[ , 2:1914] <- ifelse(dtm_train[ , 2:1914] == 0, "No", "Yes")
dtm_test[ , 2:1914] <- ifelse(dtm_test[ , 2:1914] == 0, "No", "Yes")

model_classifier <- naiveBayes(dtm_train, training_lab) 

test_pred <- predict(model_classifier, dtm_test)

confusionMatrix(test_pred, test_lab, positive = "spam", 
                dnn = c("Prediction","Actual"))

## Confusion Matrix and Statistics
## 
##           Actual
## Prediction ham spam
##       ham  211   85
##       spam   4  120
##                                           
##                Accuracy : 0.7881          
##                  95% CI : (0.7459, 0.8262)
##     No Information Rate : 0.5119          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.572           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.5854          
##             Specificity : 0.9814          
##          Pos Pred Value : 0.9677          
##          Neg Pred Value : 0.7128          
##              Prevalence : 0.4881          
##          Detection Rate : 0.2857          
##    Detection Prevalence : 0.2952          
##       Balanced Accuracy : 0.7834          
##                                           
##        'Positive' Class : spam            
##

I created a small visualization of words in ham_spam, using wordcloud

library(wordcloud)

## Warning: package 'wordcloud' was built under R version 4.3.3

## Loading required package: RColorBrewer

wordcloud(ham_spam, max.words = 100, random.order = FALSE, rot.per=0.15, min.freq=5, colors = brewer.pal(8, "Dark2"))