Project 4

CodeBase

library(tm)

Warning: package 'tm' was built under R version 4.5.3

Loading required package: NLP

library(stringr)
library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(caret)

Loading required package: ggplot2


Attaching package: 'ggplot2'

The following object is masked from 'package:NLP':

    annotate

Loading required package: lattice

library(e1071)


Attaching package: 'e1071'

The following object is masked from 'package:ggplot2':

    element

Function to clean the emails

# Read one raw email file and return its body as a single cleaned string.
#
# path: path to a raw email file (header block, blank line, then body).
# Returns a length-one character vector: the message with the header block
# removed, "=3D" decoded to "=", HTML tags replaced by spaces, and every run
# of whitespace (including line breaks) collapsed to a single space.
clean_email <- function(path) {
  # Read the file; the lines are marked as latin1
  raw_text <- readLines(path, warn = FALSE, encoding = "latin1")

  # Join with newlines (not spaces): the blank line that separates the
  # headers from the body must survive so the pattern below can find it.
  full_text <- paste(raw_text, collapse = "\n")

  # Remove the email headers: drop everything up to and including the first
  # blank line or the first "<html>" tag, whichever comes first.
  # dotall = TRUE lets "." run across line breaks.
  header_pattern <- regex("^.*?(\\r?\\n\\r?\\n|<html>)", dotall = TRUE)
  body_text <- str_replace(full_text, header_pattern, "")

  body_text %>%
    str_replace_all("=3D", "=") %>%              # Decode quoted-printable "="
    str_replace_all("<[^>]+>", " ") %>%          # Remove HTML tags
    str_replace_all("\\s+", " ")                 # Collapse whitespace/newlines
}

# Getting the datasets
#
# NOTE(review): these are absolute, machine-specific paths; the script only
# runs as-is on a machine with this exact directory layout.
ham_dir <- "C:/Users/Jeovany/Documents/R studio/Data 607/Project 4/easy_ham"
spam_dir <- "C:/Users/Jeovany/Documents/R studio/Data 607/Project 4/spam"

# NOTE(review): list.files() returns every entry in the directory, so any
# non-email file stored there would be read as a message -- confirm the
# folders contain only emails.
ham_files <- list.files(ham_dir, full.names = TRUE)
spam_files <- list.files(spam_dir, full.names = TRUE)

# Fail early with a clear message instead of a confusing data.frame() error
# when a directory is missing or empty.
if (length(ham_files) == 0) {
  stop("No files found in ham_dir: ", ham_dir, call. = FALSE)
}
if (length(spam_files) == 0) {
  stop("No files found in spam_dir: ", spam_dir, call. = FALSE)
}

# Creating dataframes: one row per email, with the cleaned text and its label.
# vapply() guarantees one string per file (sapply() may silently change type).
data_ham <- data.frame(
  text = vapply(ham_files, clean_email, character(1)),
  label = "ham",
  stringsAsFactors = FALSE
)
data_spam <- data.frame(
  text = vapply(spam_files, clean_email, character(1)),
  label = "spam",
  stringsAsFactors = FALSE
)
email_df <- bind_rows(data_ham, data_spam)

Creating the Corpus and Document Term Matrix

# Wrap every cleaned email in a tm corpus, one document per row of email_df
corpus <- VCorpus(VectorSource(email_df[["text"]]))

# Normalise the text one transformation at a time. Stop words are removed
# before punctuation is stripped.
corpus_clean <- tm_map(corpus, content_transformer(tolower))
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords("english"))
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, stripWhitespace)

# Build the document-term matrix, then drop the rarely used words
# (sparsity threshold 0.99)
dtm <- DocumentTermMatrix(corpus_clean)
dtm_filtered <- removeSparseTerms(dtm, sparse = 0.99)

# Recode term counts as a presence/absence label.
#
# x: numeric vector of term counts (e.g. one column of a term matrix).
# Returns a character vector of the same length: "Yes" where the count is
# greater than zero, "No" otherwise (NA stays NA).
convert_counts <- function(x) {
  # Return the expression directly: ending the body on `x <- ...` would hand
  # the result back invisibly.
  ifelse(x > 0, "Yes", "No")
}

# Recode every term column of the DTM from counts to "Yes"/"No".
# apply() works column by column (MARGIN = 2) and returns a character matrix.
dtm_binary <- apply(dtm_filtered, MARGIN = 2, convert_counts)
labeled_data <- as.data.frame(dtm_binary)

# Store each term as a factor with the same two levels in every column.
# Character columns report zero levels, which throws off the Laplace
# correction in naiveBayes(), and fixed levels keep "No"/"Yes" coded the same
# way in the training and test sets even if a term never appears in one.
labeled_data[] <- lapply(labeled_data, factor, levels = c("No", "Yes"))

# Attach the ham/spam label as the response variable
labeled_data$class_label <- as.factor(email_df$label)

Training a Naive Bayes model with Laplace smoothing

# Fix the RNG seed so the train/test split is reproducible
set.seed(67)

# Hold out 30% of the rows for testing; the split is driven by class_label
train_index <- createDataPartition(
  y = labeled_data$class_label,
  p = 0.7,
  list = FALSE
)
train_set <- labeled_data[train_index, ]
test_set <- labeled_data[-train_index, ]

# Fit Naive Bayes on the training rows; laplace = 1 keeps a term that was
# never seen with a class from producing a zero probability
model <- naiveBayes(class_label ~ ., data = train_set, laplace = 1)

# Evaluation: predict the held-out rows and compare with the true labels
predictions <- predict(model, newdata = test_set)
confusionMatrix(data = predictions, reference = test_set$class_label)

Confusion Matrix and Statistics

          Reference
Prediction ham spam
      ham  706    2
      spam  59  148
                                          
               Accuracy : 0.9333          
                 95% CI : (0.9152, 0.9486)
    No Information Rate : 0.8361          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.789           
                                          
 Mcnemar's Test P-Value : 7.496e-13       
                                          
            Sensitivity : 0.9229          
            Specificity : 0.9867          
         Pos Pred Value : 0.9972          
         Neg Pred Value : 0.7150          
             Prevalence : 0.8361          
         Detection Rate : 0.7716          
   Detection Prevalence : 0.7738          
      Balanced Accuracy : 0.9548          
                                          
       'Positive' Class : ham