library(rvest)
library(httr)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
##
## content
library(tidytext)
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(knitr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ NLP::content() masks httr::content()
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(wordcloud)
## Loading required package: RColorBrewer
library(e1071)
# load ham files from the local GitHub Desktop clone
ham <- "/Users/angelgallardo/Documents/GitHub/DATA607-P4/easy_ham"
hamfiles <- list.files(ham)
hamlist <- list()
for (i in seq_along(hamfiles)) {
  # read the i-th message file
  filepath <- paste0(ham, "/", hamfiles[i])
  text <- readLines(filepath)
  # collapse each message's lines into a single string
  hamlist <- c(hamlist, list(paste(text, collapse = "\n")))
}
# ham data frame: type "0" marks ham
hamDF <- as.data.frame(unlist(hamlist), stringsAsFactors = FALSE)
hamDF$type <- "0"
colnames(hamDF) <- c("text", "type")
# Create spam data frame
spam <- "/Users/angelgallardo/Documents/GitHub/DATA607-P4/spam"
spamfiles <- list.files(spam)
spamlist <- list()
for (i in seq_along(spamfiles)) {
  # read the i-th spam file and collapse its lines into one string
  filepath <- paste0(spam, "/", spamfiles[i])
  text <- readLines(filepath)
  spamlist <- c(spamlist, list(paste(text, collapse = "\n")))
}
spamDF <- as.data.frame(unlist(spamlist), stringsAsFactors = FALSE)
spamDF$type <- "1"  # type "1" marks spam
colnames(spamDF) <- c("text", "type")
# creating combined data frame of spam and ham
spam_ham_df <- rbind(hamDF, spamDF)
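# Quick sanity check (optional, not part of the original workflow): confirm
# both classes are present and see the ham/spam balance before modeling.
table(spam_ham_df$type)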
emails <- Corpus(VectorSource(spam_ham_df$text))
cleanCorpus <- tm_map(emails, removeNumbers)
## Warning in tm_map.SimpleCorpus(emails, removeNumbers): transformation drops
## documents
cleanCorpus <- tm_map(cleanCorpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(cleanCorpus, removePunctuation): transformation
## drops documents
cleanCorpus <- tm_map(cleanCorpus, removeWords, stopwords())
## Warning in tm_map.SimpleCorpus(cleanCorpus, removeWords, stopwords()):
## transformation drops documents
cleanCorpus <- tm_map(cleanCorpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(cleanCorpus, stripWhitespace): transformation
## drops documents
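# Optional peek (illustrative; later steps do not depend on it): show the
# start of one cleaned document to confirm that numbers, punctuation,
# stopwords, and extra whitespace were removed.
strtrim(as.character(cleanCorpus[[1]]), 200)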
# 80/20 train/test split
sample_size <- floor(0.80 * nrow(spam_ham_df))
# set the seed to make the partition reproducible
set.seed(555)
train_ind <- sample(seq_len(nrow(spam_ham_df)), size = sample_size)
train_spam_ham <- spam_ham_df[train_ind, ]
test_spam_ham <- spam_ham_df[-train_ind, ]
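# Optional check on the split (illustrative): partition sizes and the class
# proportions in the training piece.
c(train = nrow(train_spam_ham), test = nrow(test_spam_ham))
prop.table(table(train_spam_ham$type))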
# counts of ham and spam in the train data set (type "0" = ham, "1" = spam)
ham_train <- subset(train_spam_ham, train_spam_ham$type == "0")
spam_train <- subset(train_spam_ham, train_spam_ham$type == "1")
train_email_corpus <- Corpus(VectorSource(train_spam_ham$text))
test_email_corpus <- Corpus(VectorSource(test_spam_ham$text))
train_clean_corpus <- tm_map(train_email_corpus ,removeNumbers)
## Warning in tm_map.SimpleCorpus(train_email_corpus, removeNumbers):
## transformation drops documents
test_clean_corpus <- tm_map(test_email_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(test_email_corpus, removeNumbers):
## transformation drops documents
train_clean_corpus <- tm_map(train_clean_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(train_clean_corpus, removePunctuation):
## transformation drops documents
test_clean_corpus <- tm_map(test_clean_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(test_clean_corpus, removePunctuation):
## transformation drops documents
train_clean_corpus <- tm_map(train_clean_corpus, removeWords, stopwords())
## Warning in tm_map.SimpleCorpus(train_clean_corpus, removeWords, stopwords()):
## transformation drops documents
test_clean_corpus <- tm_map(test_clean_corpus, removeWords, stopwords())
## Warning in tm_map.SimpleCorpus(test_clean_corpus, removeWords, stopwords()):
## transformation drops documents
train_clean_corpus<- tm_map(train_clean_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(train_clean_corpus, stripWhitespace):
## transformation drops documents
test_clean_corpus<- tm_map(test_clean_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(test_clean_corpus, stripWhitespace):
## transformation drops documents
train_email_dtm <- DocumentTermMatrix(train_clean_corpus)
# build the test DTM over the training vocabulary so its columns
# match the features the SVM is trained on
test_email_dtm <- DocumentTermMatrix(test_clean_corpus,
                                     control = list(dictionary = Terms(train_email_dtm)))
train_matrix <- as.matrix(train_email_dtm)
# Extract labels from training data
labels <- train_spam_ham$type
# Ensure labels are factors for classification
labels <- as.factor(labels)
# Train SVM model for classification
svm_model <- svm(x = train_matrix, y = labels, kernel = "linear", cost = 1)
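# Optional sketch (not part of the fitted model above): cost = 1 is a default
# choice; e1071::tune can cross-validate a small grid of costs. The grid here
# is illustrative, and this step can be slow on a large document-term matrix.
tuned <- tune(svm, train.x = train_matrix, train.y = labels,
              kernel = "linear", ranges = list(cost = c(0.1, 1, 10)))
summary(tuned)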
# Convert Document-Term Matrix of test data to matrix
test_matrix <- as.matrix(test_email_dtm)
# Make predictions on test data using the trained SVM model
predictions <- predict(svm_model, newdata = test_matrix)
# Evaluate model performance (e.g., accuracy)
accuracy <- mean(predictions == test_spam_ham$type)
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.9983633
# Reading the confusion matrix below:
# (ham)  515 instances correctly predicted as class 0
# (spam) 1 instance incorrectly predicted as class 0,
#        and 95 instances correctly predicted as class 1
# Create confusion matrix
conf_matrix <- table(Actual = test_spam_ham$type, Predicted = predictions)
print(conf_matrix)
## Predicted
## Actual 0 1
## 0 515 0
## 1 1 95
# Calculate precision, recall, and F1-score, treating spam ("1") as the positive class
precision <- conf_matrix[2, 2] / sum(conf_matrix[, 2])
recall <- conf_matrix[2, 2] / sum(conf_matrix[2, ])
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Precision:", precision, "\n")
## Precision: 1
cat("Recall (Sensitivity):", recall, "\n")
## Recall (Sensitivity): 0.9895833
cat("F1-score:", f1_score, "\n")
## F1-score: 0.9947644
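# Optional summary (illustrative): collect the metrics computed above into
# one table with knitr::kable, which is already loaded.
kable(data.frame(Metric = c("Accuracy", "Precision", "Recall (Sensitivity)", "F1-score"),
                 Value = round(c(accuracy, precision, recall, f1_score), 4)))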
# Ham word cloud
ham_indices <- which(spam_ham_df$type == "0")
suppressWarnings(wordcloud(cleanCorpus[ham_indices], min.freq = 40))
# Spam word cloud
spam_indices <- which(spam_ham_df$type == "1")
suppressWarnings(wordcloud(cleanCorpus[spam_indices], min.freq = 40))
The test set achieved 99.8% accuracy: every ham instance was correctly predicted, and only one spam instance was misclassified as ham.