Data607Project4.knit

library(tm)

## Warning: package 'tm' was built under R version 4.4.2

## Loading required package: NLP

library(caret)

## Warning: package 'caret' was built under R version 4.4.2

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

## Loading required package: lattice

library(e1071)

## Warning: package 'e1071' was built under R version 4.4.2

library(text)

## Warning: package 'text' was built under R version 4.4.2

## This is text (version 1.2.3).
## Text is new and still rapidly improving.
##                
## Newer versions may have improved functions and updated defaults to reflect current understandings of the state-of-the-art.
##                Please send us feedback based on your experience.
## 
## For more information about the package see www.r-text.org.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ purrr::lift()       masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(knitr)
library(kableExtra)

## Warning: package 'kableExtra' was built under R version 4.4.2

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(wordcloud)

## Warning: package 'wordcloud' was built under R version 4.4.2

## Loading required package: RColorBrewer

# Directories holding the ham and spam email files.
# The defaults are the original machine-specific locations; set the
# EASY_HAM_DIR / SPAM_DIR environment variables to point the script at a
# different copy of the corpus without editing the code.
easy_ham_path <- Sys.getenv(
  "EASY_HAM_DIR",
  unset = "C:/Users/poiso/Downloads/easy_ham"
)
spam_path <- Sys.getenv("SPAM_DIR", unset = "C:/Users/poiso/Downloads/spam")

# Read every file in a directory into a character vector.
#
# path: directory containing one email per file.
# Returns a character vector with one element per file (all lines of the file
# joined with single spaces), named by the file's full path. An empty
# directory yields a zero-length character vector.
read_emails <- function(path) {
  files <- list.files(path, full.names = TRUE)  # Get full paths
  files <- files[!dir.exists(files)]            # readLines() cannot read directories
  # vapply() guarantees a character result; sapply() returns list() when
  # there are no files to read.
  vapply(
    files,
    function(x) paste(readLines(x, warn = FALSE), collapse = " "),
    character(1)
  )
}

# Load both corpora from disk
easy_ham <- read_emails(easy_ham_path)
spam <- read_emails(spam_path)

# Assemble one data frame: ham documents first, then spam, each row tagged
# with its class as a factor
data <- data.frame(
  text = c(easy_ham, spam),
  label = factor(
    rep(c("ham", "spam"), times = c(length(easy_ham), length(spam)))
  )
)
str(data)

## 'data.frame':    3052 obs. of  2 variables:
##  $ text : chr  "From exmh-workers-admin@redhat.com  Thu Aug 22 12:36:23 2002 Return-Path: <exmh-workers-admin@example.com> Deli"| __truncated__ "From Steve_Burt@cursor-system.com  Thu Aug 22 12:46:39 2002 Return-Path: <Steve_Burt@cursor-system.com> Deliver"| __truncated__ "From timc@2ubh.com  Thu Aug 22 13:52:59 2002 Return-Path: <timc@2ubh.com> Delivered-To: zzzz@localhost.netnotei"| __truncated__ "From irregulars-admin@tb.tf  Thu Aug 22 14:23:39 2002 Return-Path: <irregulars-admin@tb.tf> Delivered-To: zzzz@"| __truncated__ ...
##  $ label: Factor w/ 2 levels "ham","spam": 1 1 1 1 1 1 1 1 1 1 ...

# Normalise raw email text before it is turned into a corpus.
#
# text: character vector of raw email text.
# Returns the text re-encoded from latin1 to UTF-8 (sub = "" drops bytes that
# cannot be converted), lower-cased, with numbers, punctuation and English
# stop words removed and extra whitespace stripped.
clean_text <- function(text) {
  text %>%
    iconv(from = "latin1", to = "UTF-8", sub = "") %>%
    tolower() %>%
    removeNumbers() %>%
    removePunctuation() %>%
    removeWords(stopwords("en")) %>%
    stripWhitespace()
}

# Applies the clean_text function to every email.
# vapply() enforces exactly one cleaned string per email. USE.NAMES = FALSE
# keeps the result unnamed: sapply() on a character vector names each result
# (by the input string itself when the input has no names), which would
# attach a second copy of the text to the column.
data$text <- vapply(data$text, clean_text, character(1), USE.NAMES = FALSE)

# Hold out 20% of the emails for testing; createDataPartition() samples
# within each label so both sets keep the ham/spam proportions.
# The seed is fixed so the split is reproducible.
set.seed(123)
train_index <- createDataPartition(y = data$label, p = 0.8, list = FALSE)
train_data <- data[train_index, , drop = FALSE]
test_data <- data[-train_index, , drop = FALSE]

# Balance the training set by undersampling ham down to the number of spam
# emails (ham heavily outnumbers spam in the raw data)
spam_sample <- filter(train_data, label == "spam")

ham_sample <- sample_n(
  filter(train_data, label == "ham"),
  size = nrow(spam_sample)
)

# Stack the two equally sized samples: ham rows first, then spam
balanced_train_data <- bind_rows(ham_sample, spam_sample)

# Confirm both classes now have the same count
table(balanced_train_data$label)

## 
##  ham spam 
##  401  401

# Wrap the balanced training text in a tm corpus
balanced_corpus <- Corpus(VectorSource(balanced_train_data$text))

# Build the document-term matrix, keeping terms of any length, then drop
# terms whose sparsity exceeds 0.99
balanced_dtm <- balanced_corpus %>%
  DocumentTermMatrix(control = list(wordLengths = c(1, Inf))) %>%
  removeSparseTerms(sparse = 0.99)

# Dense data frame of term counts plus the class label for modelling
train_matrix <- balanced_dtm %>%
  as.matrix() %>%
  as.data.frame()
train_matrix[["label"]] <- balanced_train_data$label

test_corpus <- Corpus(VectorSource(test_data$text))

# Create the DTM for test data using the same terms as the training DTM.
# wordLengths must match the training call: tm's default of c(3, Inf) would
# discard the one- and two-character terms that the training vocabulary was
# allowed to keep, so those features would never be counted in the test set.
test_dtm <- DocumentTermMatrix(
  test_corpus,
  control = list(
    dictionary = Terms(balanced_dtm),
    wordLengths = c(1, Inf)
  )
)
test_matrix <- as.data.frame(as.matrix(test_dtm))
test_matrix$label <- test_data$label

# Fit a Naive Bayes classifier on the balanced training features
model <- naiveBayes(label ~ ., data = train_matrix)

# Classify the held-out test documents
predictions <- predict(object = model, newdata = test_matrix)

# Cross-tabulate predictions against the true labels and print the summary.
# No `positive` class is given, so caret reports "ham" as the positive class.
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_matrix$label
)
print(conf_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction ham spam
##       ham  502   31
##       spam   8   69
##                                           
##                Accuracy : 0.9361          
##                  95% CI : (0.9136, 0.9541)
##     No Information Rate : 0.8361          
##     P-Value [Acc > NIR] : 8.57e-14        
##                                           
##                   Kappa : 0.743           
##                                           
##  Mcnemar's Test P-Value : 0.000427        
##                                           
##             Sensitivity : 0.9843          
##             Specificity : 0.6900          
##          Pos Pred Value : 0.9418          
##          Neg Pred Value : 0.8961          
##              Prevalence : 0.8361          
##          Detection Rate : 0.8230          
##    Detection Prevalence : 0.8738          
##       Balanced Accuracy : 0.8372          
##                                           
##        'Positive' Class : ham             
##

# Print the overall test-set accuracy on its own line
cat("Accuracy:", conf_matrix$overall[["Accuracy"]], "\n")

## Accuracy: 0.9360656

# Example new emails to classify
new_docs <- c(
  "Congratulations! You have won a $1,000 Amazon gift card. Click here to claim now!",
  "Hi team, please review the annual reports here."
)

# Cleans the new documents.
# vapply() guarantees one cleaned string per document; USE.NAMES = FALSE keeps
# the raw documents from being attached to the result as element names.
new_docs_clean <- vapply(new_docs, clean_text, character(1), USE.NAMES = FALSE)

# Create a Corpus and DTM for new documents, restricted to the training terms.
# wordLengths mirrors the training DTM; with tm's default of c(3, Inf) the
# one- and two-character training terms would never be counted here.
new_corpus <- Corpus(VectorSource(new_docs_clean))
new_dtm <- DocumentTermMatrix(
  new_corpus,
  control = list(
    dictionary = Terms(balanced_dtm),
    wordLengths = c(1, Inf)
  )
)

# Convert the new DTM to a dataframe
new_matrix <- as.data.frame(as.matrix(new_dtm))

# Classify the new documents and show each one beside its predicted label
new_predictions <- predict(object = model, newdata = new_matrix)
data.frame(Document = new_docs, Prediction = new_predictions) %>%
  print()

##                                                                            Document
## 1 Congratulations! You have won a $1,000 Amazon gift card. Click here to claim now!
## 2                                   Hi team, please review the annual reports here.
##   Prediction
## 1        ham
## 2        ham

# Bar chart of the number of emails per label in the full data set
ggplot(count(data, label), aes(x = label, y = n, fill = label)) +
  geom_col() +
  labs(title = "Spam vs Ham Distribution", x = "Label", y = "Count") +
  theme_minimal()

# Word cloud of the spam emails, capped at 100 words and drawn in
# decreasing-frequency order
spam_corpus <- Corpus(VectorSource(with(data, text[label == "spam"])))
wordcloud(spam_corpus, scale = c(3, 0.5), max.words = 100, random.order = FALSE)

# Same word cloud for the ham emails
ham_corpus <- Corpus(VectorSource(with(data, text[label == "ham"])))
wordcloud(ham_corpus, scale = c(3, 0.5), max.words = 100, random.order = FALSE)

# Render the confusion-matrix counts as a striped HTML table
conf_matrix_table <- as.data.frame(conf_matrix$table)
conf_matrix_table %>%
  kable(format = "html", caption = "Confusion Matrix") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Confusion Matrix
Prediction	Reference	Freq
ham	ham	502
spam	ham	8
ham	spam	31
spam	spam	69