library(tm)
## Loading required package: NLP
library(stringr)
library(SnowballC)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(knitr)
library(tidyr)
library(tidytext)
library(wordcloud)
## Loading required package: RColorBrewer
library(caret)
## Loading required package: lattice
library(gbm)
## Loaded gbm 2.1.8.1
library(e1071)
load data
spam <- DirSource("/Users/ariananolan/Documents/MSDS Fall 2022/Data 607/spam_2")
spam_corpus <- Corpus(spam, readerControl=list(reader=readPlain))
length(spam_corpus)
## [1] 1397
ham <- DirSource("/Users/ariananolan/Documents/MSDS Fall 2022/Data 607/easy_ham")
ham_corpus <- Corpus(ham, readerControl=list(reader=readPlain))
length(ham_corpus)
## [1] 2551
clean data
# Drop the "cmds" index file that ships in each SpamAssassin folder.
# DirSource names each document after its file, so filter on names()
# rather than comparing the documents themselves to "cmds".
spam_files <- spam_corpus[names(spam_corpus) != "cmds"]
ham_files <- ham_corpus[names(ham_corpus) != "cmds"]
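A quick sanity check (a suggested addition): each folder ships one cmds file, so each filtered length should drop by one.
length(spam_files)
length(ham_files)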
# Read every file in a directory into a VCorpus, one document per file
toVCorpus <- function(file_path) {
  corpus <- file_path %>%
    paste(., list.files(.), sep = "/") %>%
    lapply(readLines) %>%
    VectorSource() %>%
    VCorpus()
  return(corpus)
}
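toVCorpus is defined but never called above; a minimal usage sketch, reusing the spam directory path from the DirSource call:
spam_vcorpus <- toVCorpus("/Users/ariananolan/Documents/MSDS Fall 2022/Data 607/spam_2")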
# Standard tm cleaning pipeline: strip numbers, punctuation, case,
# stopwords, and extra whitespace, then stem
docClean <- function(corpus) {
  corpus <- corpus %>%
    tm_map(removeNumbers) %>%
    tm_map(removePunctuation) %>%
    # content_transformer() keeps the corpus structure intact, so the
    # tm_map(PlainTextDocument) workaround is not needed
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, stopwords("en")) %>%
    tm_map(stripWhitespace) %>%
    tm_map(stemDocument)
  return(corpus)
}
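A usage sketch for the cleaner (a suggestion; the pipeline below does its cleaning inline instead):
spam_cleaned <- docClean(spam_corpus)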
# Attach the same metadata tag (e.g. a class label) to every document
addTag <- function(corpus, tag, value) {
  for (i in seq_along(corpus)) {
    meta(corpus[[i]], tag) <- value
  }
  return(corpus)
}
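addTag is also unused; a sketch of labeling a corpus before combining, assuming a VCorpus (per-document metadata assignment needs VCorpus, not SimpleCorpus) and a hypothetical "class" tag:
spam_vcorpus <- addTag(spam_vcorpus, "class", "spam")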
visualization
# Ham
hamDF <- as.data.frame(unlist(ham_corpus), stringsAsFactors = FALSE)
hamDF$type <- "ham"
colnames(hamDF) <- c("text", "type")
# Spam
spamDF <- as.data.frame(unlist(spam_corpus), stringsAsFactors = FALSE)
spamDF$type <- "spam"
colnames(spamDF) <- c("text", "type")
# Take the first 1,000 emails of each class for a balanced sample
spam_ham_df <- rbind(hamDF[1:1000,], spamDF[1:1000,])
# Combine both corpora (note: docClean() has not been applied at this point)
clean_corpus <- c(spam_corpus, ham_corpus)
word cloud
wordcloud(clean_corpus,max.words = 70, random.order = FALSE, min.freq=1000)
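A possible extension (not part of the original analysis): contrast the two classes with one cloud each; wordcloud() builds the term frequencies itself when freq is omitted.
par(mfrow = c(1, 2))
wordcloud(spam_ham_df$text[spam_ham_df$type == "spam"], max.words = 50, random.order = FALSE)
wordcloud(spam_ham_df$text[spam_ham_df$type == "ham"], max.words = 50, random.order = FALSE)
par(mfrow = c(1, 1))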
using caret to split data
corpus_labels <- unlist(clean_corpus) # unlist() takes recursive/use.names, not a name string
corpus_dtm <- DocumentTermMatrix(clean_corpus)
set.seed(123)
# Replace empty bodies with a placeholder string so every email keeps a row
# in the document-term matrix
spam_ham_df$text[spam_ham_df$text == ""] <- "NaN"
train_index <- createDataPartition(spam_ham_df$type, p=0.70, list=FALSE)
email_train <- spam_ham_df[train_index,]
email_test <- spam_ham_df[-train_index,]
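createDataPartition stratifies on type, so both splits should stay close to 50/50 spam/ham; a quick check (a suggested addition):
prop.table(table(email_train$type))
prop.table(table(email_test$type))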
# Create corpus for training and test data
train_email_corpus <- Corpus(VectorSource(email_train$text))
test_email_corpus <- Corpus(VectorSource(email_test$text))
train_clean_corpus <- tm_map(train_email_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(train_email_corpus, removeNumbers):
## transformation drops documents
test_clean_corpus <- tm_map(test_email_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(test_email_corpus, removeNumbers): transformation
## drops documents
train_clean_corpus <- tm_map(train_clean_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(train_clean_corpus, removePunctuation):
## transformation drops documents
test_clean_corpus <- tm_map(test_clean_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(test_clean_corpus, removePunctuation):
## transformation drops documents
# Stopword removal was skipped for this run; uncomment to apply it:
# train_clean_corpus <- tm_map(train_clean_corpus, removeWords, stopwords())
# test_clean_corpus <- tm_map(test_clean_corpus, removeWords, stopwords())
train_clean_corpus <- tm_map(train_clean_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(train_clean_corpus, stripWhitespace):
## transformation drops documents
test_clean_corpus <- tm_map(test_clean_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(test_clean_corpus, stripWhitespace):
## transformation drops documents
train_email_dtm <- DocumentTermMatrix(train_clean_corpus)
test_email_dtm <- DocumentTermMatrix(test_clean_corpus)
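The matrices above keep every term; a common refinement (an assumption, not applied in the original run) is to keep only terms appearing in at least 5 training emails, restricting the test matrix to the same vocabulary so the feature spaces match:
freq_terms <- findFreqTerms(train_email_dtm, 5)
train_dtm_small <- train_email_dtm[, freq_terms]
test_dtm_small <- DocumentTermMatrix(test_clean_corpus, control = list(dictionary = freq_terms))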
# Convert term counts to a binary present/absent factor, the categorical
# representation naiveBayes expects
convert_count <- function(x) {
  y <- ifelse(x > 0, 1, 0)
  y <- factor(y, levels = c(0, 1), labels = c(0, 1))
  return(y)
}
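A quick check of the converter on a toy count vector:
convert_count(c(0, 2, 0, 5))
## [1] 0 1 0 1
## Levels: 0 1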
train_sms <- apply(train_email_dtm, 2, convert_count)
test_sms <- apply(test_email_dtm, 2, convert_count)
naiveBayes
classifier <- naiveBayes(train_sms, factor(email_train$type))
test_pred <- predict(classifier, newdata=test_sms)
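caret's confusionMatrix() reports accuracy, sensitivity, and specificity in one call; treating "spam" as the positive class here is an assumption:
confusionMatrix(test_pred, factor(email_test$type), positive = "spam")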
summary:
true positives (TP): we predicted spam, and the email actually is spam.
true negatives (TN): we predicted ham, and the email actually is ham.
false positives (FP): we predicted spam, but the email is actually ham (also known as a "Type I error").
false negatives (FN): we predicted ham, but the email is actually spam (also known as a "Type II error").
table(test_pred, email_test$type)
##
## test_pred ham spam
## ham 297 3
## spam 3 297
Accuracy rate = (True Positives + True Negatives) / Total
(297 + 297)/600
## [1] 0.99
Accuracy rate = 0.99
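The same figure can be computed directly instead of from the table (a suggested addition):
mean(test_pred == email_test$type)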