Libraries

library(tidyverse)
library(stringr)
library(tm)
library(dplyr)
library(tidymodels)

Importing Data

# Directory holding the files that are later labelled "ham_email".
ham_dir <- "C:\\Users\\erodr\\Documents\\R\\data\\easy_ham"

# full.names = TRUE makes list.files() return paths that already include the
# directory, so no path separator has to be pasted in by hand.
ham_list <- list.files(ham_dir, full.names = TRUE)

# Read each file into a character vector of lines (one list element per file).
ham <- lapply(ham_list, readLines)

# Collapse the lines of each file into a single space-separated string.
ham_corpus <- lapply(ham, paste, collapse = " ")

# Directory holding the files that are later labelled "spam_email".
spam_dir <- "C:\\Users\\erodr\\Documents\\R\\data\\spam"

# full.names = TRUE makes list.files() return paths that already include the
# directory, so no path separator has to be pasted in by hand.
spam_list <- list.files(spam_dir, full.names = TRUE)

# Read each file into a character vector of lines (one list element per file).
spam <- lapply(spam_list, readLines)

# Collapse the lines of each file into a single space-separated string.
spam_corpus <- lapply(spam, paste, collapse = " ")

# Flatten each list of collapsed messages into a character vector and wrap it
# in a one-column data frame (one row per message). The column receives an
# auto-generated name at this point; it is renamed to "text" further down.
# stringsAsFactors = FALSE keeps the message text as character data.
spam_df <- as.data.frame(unlist(spam_corpus), stringsAsFactors = FALSE)

# Same conversion for the ham messages.
ham_df <- as.data.frame(unlist(ham_corpus), stringsAsFactors = FALSE)

# Add a `type` column identifying each row as ham or spam. The label is
# created as a factor directly, so no second pass with the superseded
# mutate_at() is needed.
ham_df1 <- ham_df %>%
  mutate(type = factor("ham_email"))

spam_df1 <- spam_df %>%
  mutate(type = factor("spam_email"))

# Give the message column the same name ("text") in both data frames so that
# they can be stacked row-wise.
colnames(ham_df1)[1] <- "text"
colnames(spam_df1)[1] <- "text"

# Stack ham on top of spam; this combined data frame feeds the corpus and the
# document-term matrix built later.
spam_ham <- do.call(rbind, list(ham_df1, spam_df1))

Cleaning Data

I used the tm package to clean the data set. Although this package has convenient functions for cleaning text, the data first needs to be converted into a VCorpus. The corpus cannot be used directly in the model, so it then needs to be converted back into a data frame.

# Build a volatile corpus from the message text and normalise it.
# Order matters:
#   1. lowercase first, so capitalised stop words match the lowercase list;
#   2. remove stop words BEFORE stripping punctuation, because the English
#      stop-word list contains contractions such as "don't" that can no
#      longer match once the apostrophe is gone;
#   3. then drop punctuation and digits, and finally squeeze the whitespace
#      left behind by the removals.
spam_ham_clean <- VCorpus(VectorSource(spam_ham$text)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("en")) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)
 

# Convert the cleaned corpus into a document-term matrix (one row per
# message, one column per term), then into a plain data frame.
spam_ham_dtm <- DocumentTermMatrix(spam_ham_clean)

email_df <- data.frame(as.matrix(spam_ham_dtm), stringsAsFactors = FALSE)

# Reserve the name "type" for the label column: if the corpus contains the
# term "type", its count column is renamed (to "type.1") so it cannot be
# confused with, or overwritten by, the label.
names(email_df) <- make.unique(c("type", names(email_df)))[-1]

# Attach the ham/spam label under an explicit name. An unnamed
# cbind(email_df, spam_ham$type) would call the column "spam_ham$type", and
# later references to `type` would fail or pick up a term-count column.
email_df1 <- cbind(email_df, type = spam_ham$type)

# Make sure the label is a factor, as required for classification.
email_df2 <- email_df1 %>% mutate(type = factor(type))

Data Modeling

# Fix the RNG seed so the train/test split is reproducible between runs.
set.seed(123)

# 75/25 split, stratified on the ham/spam label.
email_split <- initial_split(email_df1, prop = 0.75, strata = type)

email_training <- training(email_split)

email_test <- testing(email_split)

# Logistic regression specification using the base glm engine.
logistic_model <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

# Model fitting and prediction are still disabled. The prediction calls
# originally referenced `leads_test`, which is not defined in this script;
# the test split created above is `email_test`.
#logistic_fit <- logistic_model %>% fit(type ~ ., data = email_training)

#class_preds <- logistic_fit %>% predict(new_data = email_test, type = "class")

#prob_preds <- logistic_fit %>% predict(new_data = email_test, type = "prob")

The code that is commented out produces errors.

References:
1. DataDaft, https://www.youtube.com/watch?v=34sbvhr_pm8
2. Jalayer Academy, https://www.youtube.com/watch?v=pFinlXYLZ-A
3. Jalayer Academy, https://www.youtube.com/watch?v=jCrQYOsAcv4
4. DataCamp, Cleaning and Preprocessing Text