library(tidyverse)
library(stringr)
library(tm)
library(dplyr)
library(tidymodels)
# Load the raw e-mails. Each directory holds one message per file; every
# message is read line by line and then collapsed into a single string.
# NOTE(review): the directories are absolute paths on one machine; adjust them
# (or switch to a project-relative path) before running elsewhere.
ham_dir <- "C:\\Users\\erodr\\Documents\\R\\data\\easy_ham"
# full.names = TRUE returns ready-to-open paths, so no separator has to be
# pasted in by hand.
ham_list <- list.files(ham_dir, full.names = TRUE)
ham <- lapply(ham_list, FUN = readLines)
ham_corpus <- lapply(ham, FUN = paste, collapse = " ")

spam_dir <- "C:\\Users\\erodr\\Documents\\R\\data\\spam"
spam_list <- list.files(spam_dir, full.names = TRUE)
spam <- lapply(spam_list, FUN = readLines)
spam_corpus <- lapply(spam, FUN = paste, collapse = " ")

# One row per message; the single column is renamed further down the script.
spam_df <- as.data.frame(unlist(spam_corpus), stringsAsFactors = FALSE)
ham_df <- as.data.frame(unlist(ham_corpus), stringsAsFactors = FALSE)
# Add a factor column identifying each row as ham or spam.
ham_df1 <- ham_df %>%
  mutate(type = factor("ham_email"))
spam_df1 <- spam_df %>%
  mutate(type = factor("spam_email"))

# Give the message column the same name in both data frames so they can be
# bound row-wise.
names(ham_df1)[1] <- "text"
names(spam_df1)[1] <- "text"

# Bind the data frames in preparation for building the corpus and the
# document-term matrix; rbind() merges the two single-level factors into one
# factor with both levels.
spam_ham <- rbind(ham_df1, spam_df1)
I used the tm package to clean the data set. Although this package provides convenient functions for cleaning text, the data must first be converted into a VCorpus. A corpus cannot be passed to the model directly, so the cleaned result is then converted back into a data frame.
# Clean the message text with tm.
# Order matters: the text is lower-cased and stop words are removed BEFORE
# punctuation is stripped. The English stop word list contains contractions
# such as "don't"; stripping the apostrophe first would turn them into "dont",
# which no longer matches the list and would survive as a junk term.
spam_ham_clean <- VCorpus(VectorSource(spam_ham$text)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("en")) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)
# Convert the cleaned corpus into a document-term matrix, turn it back into a
# data frame, attach the e-mail class of every document, then split and model.
spam_ham_dtm <- DocumentTermMatrix(spam_ham_clean)

# Keep only terms that are not too sparse, so that glm gets a workable number
# of predictors instead of one per distinct token.
# NOTE(review): 0.95 is a starting point, not a tuned value.
spam_ham_dtm_small <- removeSparseTerms(spam_ham_dtm, sparse = 0.95)
email_df <- data.frame(as.matrix(spam_ham_dtm_small), stringsAsFactors = FALSE)

# Name the label column explicitly. An unnamed cbind() calls the column
# `spam_ham$type`, so a later reference to `type` either fails or picks up the
# term-count column for the word "type". Cleaned terms contain no underscore,
# so `email_type` cannot collide with a term column.
email_df1 <- cbind(email_df, email_type = spam_ham$type)
email_df2 <- email_df1 %>%
  mutate(email_type = factor(email_type))

# Fix the seed so the train/test split is reproducible.
set.seed(123)
email_split <- initial_split(email_df2, prop = 0.75, strata = email_type)
email_training <- email_split %>% training()
email_test <- email_split %>% testing()

logistic_model <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

# Fit on the training rows and predict on the held-out rows.
logistic_fit <- logistic_model %>%
  fit(email_type ~ ., data = email_training)
class_preds <- logistic_fit %>%
  predict(new_data = email_test, type = "class")
prob_preds <- logistic_fit %>%
  predict(new_data = email_test, type = "prob")
Any code that is commented out above produced errors when it was run.
References:
1. DataDaft, https://www.youtube.com/watch?v=34sbvhr_pm8
2. Jalayer Academy, https://www.youtube.com/watch?v=pFinlXYLZ-A
3. Jalayer Academy, https://www.youtube.com/watch?v=jCrQYOsAcv4
4. DataCamp, Cleaning and Preprocessing Text