# Project 4 - Document classification

library(tidymodels)
library(tidytext)
library(tinytex)
library(tidyverse)
library(RCurl)
library(knitr)
library(R.utils)
library(tm)

# Download one archive from the SpamAssassin public corpus and unpack it.
#
# Args:
#   tar_file_nm: archive file name, e.g. "20030228_easy_ham.tar.bz2".
#   out_dir:     directory (written with a trailing slash) that receives both
#                the downloaded archive and its extracted contents. Defaults
#                to the global `outDir`, so existing one-argument calls keep
#                working.
#
# Returns: the value of untar() (invisible).
get_tar_untar <- function(tar_file_nm, out_dir = outDir) {
  tarDir <- "https://spamassassin.apache.org/old/publiccorpus/"
  tar_url <- paste0(tarDir, tar_file_nm)
  tar_path <- paste0(out_dir, tar_file_nm)

  # Make sure the destination exists; silent when it is already there.
  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

  # The archive is binary, so request a binary transfer explicitly rather than
  # relying on platform-specific defaults.
  status <- download.file(tar_url, destfile = tar_path, mode = "wb")
  if (status != 0) {
    stop("Download failed for ", tar_url, " (status ", status, ")",
         call. = FALSE)
  }

  untar(tar_path, exdir = out_dir)
}

# Separate the message body from the header of a single e-mail file.
#
# The header ends at the first empty line; everything after that line is the
# body.
#
# Args:
#   all_emails_dir: path to one e-mail file (read as latin1 text).
#
# Returns: a length-one character string holding the body lines joined with
#   "\n". Returns "" when the file has no empty line (no body separator) or
#   when nothing follows the separator.
get_email_body <- function(all_emails_dir) {
  emails <- file(all_emails_dir, open = "rt", encoding = "latin1")
  # Close the connection even if reading fails part-way through.
  on.exit(close(emails), add = TRUE)

  email_text <- readLines(emails)
  first_blank <- which(email_text == "")[1]

  # No separator, or separator on the last line: there is no body. Without
  # this guard seq() would be called with NA or with start > end and error.
  if (is.na(first_blank) || first_blank >= length(email_text)) {
    return("")
  }

  email_body <- email_text[seq(first_blank + 1, length(email_text))]
  paste(email_body, collapse = "\n")
}

# Read the message body of every e-mail file named in a directory listing.
#
# Args:
#   email_list: character vector of file names (not full paths). Entries
#               named "cmds" are dropped before reading.
#   dir_path:   folder (written with a trailing slash) that contains the
#               files. Defaults to the global `email_dir`, so existing
#               one-argument calls keep working.
#
# Returns: a character vector of message bodies, named by file name. The
#   result is always a character vector, including for an empty listing.
get_all_emails <- function(email_list, dir_path = email_dir) {
  email_list <- email_list[which(email_list != "cmds")]
  # vapply() pins the result type; sapply() would return list() on empty input.
  vapply(
    email_list,
    function(p) get_email_body(paste0(dir_path, p)),
    character(1)
  )
}

# Global parameters: the working data directory and the two folders that the
# archives unpack into.
outDir <- "/Users/Audiorunner13/CUNY MSDS Course Work/DATA607 Spring 2021/Week12/Data/"
ham_dir <- paste0(outDir, "easy_ham/")
spam_dir <- paste0(outDir, "spam_2/")

# Archive names for the ham and spam corpora.
ham_file <- "20030228_easy_ham.tar.bz2"
spam_file <- "20050311_spam_2.tar.bz2"

# Fetch and unpack each archive, ham first and then spam.
invisible(lapply(c(ham_file, spam_file), get_tar_untar))

# ---- Ham messages (spam = 0) ----

# File names present in the easy_ham directory.
ham_list <- list.files(ham_dir)

# get_all_emails() reads its folder from the global `email_dir`, so point it at
# the ham folder before extracting the message bodies.
email_dir <- ham_dir
all_email_body <- get_all_emails(ham_list)

# One row per ham message: body text plus the label 0 (non-spam). unname()
# keeps the file names from becoming row names.
df_easy_ham <- data.frame(text = unname(all_email_body), spam = 0)

# ---- Spam messages (spam = 1) ----

# File names present in the spam_2 directory.
spam_list <- list.files(spam_dir)

# Re-point the global folder at the spam directory and extract the bodies.
email_dir <- spam_dir
all_email_body <- get_all_emails(spam_list)

# One row per spam message: body text plus the label 1 (spam).
df_spam_2 <- data.frame(text = unname(all_email_body), spam = 1)

# Preview the first rows of each data frame.
head(df_easy_ham, 10)

head(df_spam_2, 10)

# Stack ham on top of spam, record the total row count, and tabulate how many
# messages fall in each class.
email_data <- rbind(df_easy_ham, df_spam_2)
email_data_num <- nrow(email_data)
table(email_data$spam)

## 
##    0    1 
## 2500 1396

# Build the corpus from the message text and clean it in one pipeline. The
# steps run in this order: lower-case the text, pass each document through
# PlainTextDocument, strip punctuation, drop English stop words, stem.
corpus <- VCorpus(VectorSource(email_data$text)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(PlainTextDocument) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords("en")) %>%
  tm_map(stemDocument)

# Extract the word frequencies used for prediction: a DocumentTermMatrix whose
# rows correspond to documents (emails) and whose columns correspond to words.
dtm <- DocumentTermMatrix(corpus)
dtm

## <<DocumentTermMatrix (documents: 3896, terms: 77949)>>
## Non-/sparse entries: 429763/303259541
## Sparsity           : 100%
## Maximal term length: 880
## Weighting          : term frequency (tf)

# Drop rarely used terms: with sparse = 0.95 only terms that appear in at
# least 5% of the documents are kept, which leaves a manageable number of
# columns.
spdtm <- removeSparseTerms(dtm, sparse = 0.95)
spdtm

## <<DocumentTermMatrix (documents: 3896, terms: 330)>>
## Non-/sparse entries: 125999/1159681
## Sparsity           : 90%
## Maximal term length: 43
## Weighting          : term frequency (tf)

# Build the modelling data frame. `emailsSparse` was referenced here without
# ever being created, so construct it from the reduced term matrix: one row
# per e-mail, one column per retained term.
emailsSparse <- as.data.frame(as.matrix(spdtm))

# Terms can start with digits or contain other characters that are not valid
# in R variable names; make them safe for use in a model formula.
term_names <- make.names(colnames(emailsSparse), unique = TRUE)
# A term literally called "spam" would be overwritten by the label column
# below, so move it out of the way first.
term_names[term_names == "spam"] <- "spam_term"
colnames(emailsSparse) <- term_names

# Attach the class label; rows of spdtm are in the same order as email_data.
emailsSparse$spam <- email_data$spam

# convert the dependent variable to a factor
emailsSparse$spam <- as.factor(emailsSparse$spam)

# Hold out 25% of the e-mails for testing, stratified on the spam label, then
# report how many rows landed in each partition.
email_split <- initial_split(emailsSparse, prop = 0.75, strata = spam)
email_training <- training(email_split)
email_test <- testing(email_split)
nrow(email_training)

## [1] 2922

nrow(email_test)

## [1] 974

# Logistic regression specification: glm engine, classification mode.
glm_model <- set_mode(set_engine(logistic_reg(), "glm"), "classification")

# Fit the specification on the training partition, using every term column as
# a predictor of the spam label.
glm_fit <- fit(glm_model, spam ~ ., data = email_training)

# Predicted class for every e-mail in the test partition (stored, then shown).
class_preds <- predict(glm_fit, new_data = email_test, type = "class")
class_preds

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

# Predicted class probabilities for every e-mail in the test partition
# (stored, then shown).
prob_preds <- predict(glm_fit, new_data = email_test, type = "prob")
prob_preds

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

# Put the true label next to the predicted class and the predicted
# probabilities, one row per test e-mail (stored, then shown).
spam_results <- bind_cols(select(email_test, spam), class_preds, prob_preds)
spam_results

# Project 4 - Document classification

# Peter Gatica

# May 1, 2021