library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.5.3
## Loading required package: RColorBrewer
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.5.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages ------------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.1.1 v purrr 0.3.1
## v tibble 2.0.1 v dplyr 0.8.0.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts --------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(SnowballC)
library(tm)
## Warning: package 'tm' was built under R version 3.5.3
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.3
Load data
spam <- 'C:\\Users\\HR\\Desktop\\data 607\\project 4\\spam_2\\'
ham <- 'C:\\Users\\HR\\Desktop\\data 607\\project 4\\easy_ham\\'
Create data frame for Ham and Spam
ham.files <- list.files(ham)
spam.files <- list.files(spam)
List documents of Ham
ham_docs_list <- NA
for (i in 1:length(ham.files)) {
  # read each ham file and collapse it into a single text string
  filepath <- paste0(ham, "/", ham.files[i])
  text <- readLines(filepath)
  list1 <- list(paste(text, collapse = "\n"))
  ham_docs_list <- c(ham_docs_list, list1)
}
hamDF <- as.data.frame(unlist(ham_docs_list), stringsAsFactors = FALSE)
hamDF$type <- "ham"
colnames(hamDF) <- c("text", "type")
nrow(hamDF)
## [1] 2502
List documents of Spam
spam_docs_list <- NA
for (i in 1:length(spam.files)) {
  # read each spam file and collapse it into a single text string
  filepath <- paste0(spam, "/", spam.files[i])
  text <- readLines(filepath)
  list1 <- list(paste(text, collapse = "\n"))
  spam_docs_list <- c(spam_docs_list, list1)
}
spamDF <- as.data.frame(unlist(spam_docs_list), stringsAsFactors = FALSE)
spamDF$type <- "spam"
colnames(spamDF) <- c("text", "type")
nrow(spamDF)
## [1] 1398
ham_spam_DF <- rbind(hamDF, spamDF)
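As a more compact alternative to the two loops above, the files could be read in one step (a minimal sketch; read_folder is a hypothetical helper, and because it does not seed the result with NA the row counts may differ by one from the outputs shown above):
read_folder <- function(path) {
  files <- list.files(path, full.names = TRUE)
  # read each file and collapse it into one string per message
  vapply(files, function(f) paste(readLines(f, warn = FALSE), collapse = "\n"), character(1))
}
# e.g. hamDF <- data.frame(text = read_folder(ham), type = "ham", stringsAsFactors = FALSE)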
Prepare Corpus
ham.corpus <- VCorpus(VectorSource(hamDF$text))
spam.corpus <- VCorpus(VectorSource(spamDF$text))
corpus <- c(ham.corpus,spam.corpus)
Clean Corpus: remove numbers, punctuation, and stop words; strip whitespace
# build a single corpus from the combined data frame (ham rows first, then spam)
emailCorpus <- Corpus(VectorSource(ham_spam_DF$text))
cleanCorpus <- tm_map(emailCorpus, removeNumbers)
cleanCorpus <- tm_map(cleanCorpus, removePunctuation)
cleanCorpus <- tm_map(cleanCorpus, removeWords, stopwords())
cleanCorpus <- tm_map(cleanCorpus, stripWhitespace)
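SnowballC is loaded above but not used; as an optional extension (a sketch that is not reflected in the outputs reported below), the cleaning pipeline could also lower-case the text and stem words:
# optional: lower-case and stem the cleaned corpus (uses SnowballC via tm)
cleanCorpus <- tm_map(cleanCorpus, content_transformer(tolower))
cleanCorpus <- tm_map(cleanCorpus, stemDocument)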
Word Clouds for Spam and Ham
spam_indices <- which(ham_spam_DF$type == "spam")
wordcloud(corpus[spam_indices], max.words = 75, random.order = FALSE, random.color = TRUE, colors = palette())

ham_indices <- which(ham_spam_DF$type == "ham")
wordcloud(corpus[ham_indices], max.words = 75, random.order = FALSE, random.color = TRUE, colors = palette())
## Warning in wordcloud(corpus[ham_indices], max.words = 75, random.order =
## FALSE, : <https://listman.spamassassin.taint.org/mailman/listinfo/exmh-
## workers>, could not be fit on page. It will not be plotted.
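The word clouds above are drawn from the raw corpus; as a sketch, they could instead use cleanCorpus (which is built from ham_spam_DF$text, so the same spam/ham row indices apply), so that numbers, punctuation, and stop words do not dominate the plot:
# hypothetical variant: word cloud from the cleaned corpus
wordcloud(cleanCorpus[spam_indices], max.words = 75, random.order = FALSE,
          random.color = TRUE, colors = palette())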

Prepare test and train data
set.seed(100)
sample_size <- floor(0.70 * nrow(ham_spam_DF))
train_idx <- sample(seq_len(nrow(ham_spam_DF)), size = sample_size)
training_df <- ham_spam_DF[train_idx, ]
testing_df <- ham_spam_DF[-train_idx, ]
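Since caret is already loaded, a stratified split that preserves the ham/spam proportions in both sets could be used instead of a plain random sample (a sketch; the objects with a "2" suffix are hypothetical, and the results below use the sample() split above):
train_idx2 <- createDataPartition(ham_spam_DF$type, p = 0.70, list = FALSE)
training_df2 <- ham_spam_DF[train_idx2, ]
testing_df2 <- ham_spam_DF[-train_idx2, ]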
training_corp <- Corpus(VectorSource(training_df$text))
testing_corp <- Corpus(VectorSource(testing_df$text))
training_dtm <- DocumentTermMatrix(training_corp)
testing_dtm <- DocumentTermMatrix(testing_corp)
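Because the training and testing document-term matrices are built from separate corpora, their columns do not necessarily match. A common refinement (a sketch, not applied to the results below; the _freq objects are hypothetical) is to restrict both matrices to terms that appear at least a few times in the training set:
# keep only terms that occur at least 5 times in the training documents
freq_terms <- findFreqTerms(training_dtm, lowfreq = 5)
train_dtm_freq <- DocumentTermMatrix(training_corp, control = list(dictionary = freq_terms))
test_dtm_freq <- DocumentTermMatrix(testing_corp, control = list(dictionary = freq_terms))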
counter <- function(x) {
  # convert raw term counts into a Yes/No factor marking term presence
  y <- ifelse(x > 0, 1, 0)
  y <- factor(y, levels = c(0, 1), labels = c("No", "Yes"))
  y
}
train_sms <- apply(training_dtm, 2, counter)
test_sms <- apply(testing_dtm, 2, counter)
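A toy example of what counter does to a single column of term counts:
counter(c(0, 3, 1))  # returns the factor No, Yes, Yes (levels: No, Yes)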
Train Naive Bayes classifier and predict on test data
classifier <- naiveBayes(train_sms, factor(training_df$type))
predict_test <- predict(classifier, newdata = test_sms)
table(predict_test, testing_df$type)
##
## predict_test ham spam
## ham 756 0
## spam 0 414
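For a fuller summary of the same result, caret's confusionMatrix() reports accuracy, sensitivity, and specificity (a sketch using the objects above, treating "spam" as the positive class):
confusionMatrix(predict_test, factor(testing_df$type), positive = "spam")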