library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.5.3
## Loading required package: RColorBrewer
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.5.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages ------------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.1.1 v purrr 0.3.1
## v tibble 2.0.1 v dplyr 0.8.0.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts --------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(SnowballC)
library(tm)
## Warning: package 'tm' was built under R version 3.5.3
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.3
Load data
spam <- 'C:\\Users\\HR\\Desktop\\data 607\\project 4\\spam_2\\'
ham <- 'C:\\Users\\HR\\Desktop\\data 607\\project 4\\easy_ham\\'
Create data frame for Ham and Spam
ham.files <- list.files(ham)
spam.files <- list.files(spam)
List documents of Ham
ham_docs_list <- NA
for (i in 1:length(ham.files)) {
  # read each ham file and collapse it into a single text string
  filepath <- paste0(ham, "/", ham.files[i])
  text <- readLines(filepath)
  list1 <- list(paste(text, collapse = "\n"))
  ham_docs_list <- c(ham_docs_list, list1)
}
hamDF <- as.data.frame(unlist(ham_docs_list), stringsAsFactors = FALSE)
hamDF$type <- "ham"
colnames(hamDF) <- c("text", "type")
nrow(hamDF)
## [1] 2502
List documents of Spam
spam_docs_list <- NA
for (i in 1:length(spam.files)) {
  # read each spam file and collapse it into a single text string
  filepath <- paste0(spam, "/", spam.files[i])
  text <- readLines(filepath)
  list1 <- list(paste(text, collapse = "\n"))
  spam_docs_list <- c(spam_docs_list, list1)
}
spamDF <- as.data.frame(unlist(spam_docs_list), stringsAsFactors = FALSE)
spamDF$type <- "spam"
colnames(spamDF) <- c("text", "type")
nrow(spamDF)
## [1] 1398
ham_spam_DF <- rbind(hamDF, spamDF)
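As a more compact alternative to the two loops above, the files could be read in one step (a minimal sketch; read_folder is a hypothetical helper, and because it does not seed the result with NA the row counts may differ by one from the outputs shown above):
read_folder <- function(path) {
  files <- list.files(path, full.names = TRUE)
  # read each file and collapse it into one string per message
  vapply(files, function(f) paste(readLines(f, warn = FALSE), collapse = "\n"), character(1))
}
# e.g. hamDF <- data.frame(text = read_folder(ham), type = "ham", stringsAsFactors = FALSE)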
Prepare Corpus
ham.corpus <- VCorpus(VectorSource(hamDF$text))
spam.corpus <- VCorpus(VectorSource(spamDF$text))
corpus <- c(ham.corpus,spam.corpus)
Clean Corpus: remove numbers, punctuation, and stop words; strip whitespace
# build a single corpus from the combined data frame (ham rows first, then spam)
emailCorpus <- Corpus(VectorSource(ham_spam_DF$text))
cleanCorpus <- tm_map(emailCorpus, removeNumbers)
cleanCorpus <- tm_map(cleanCorpus, removePunctuation)
cleanCorpus <- tm_map(cleanCorpus, removeWords, stopwords())
cleanCorpus <- tm_map(cleanCorpus, stripWhitespace)
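SnowballC is loaded above but not used; as an optional extension (a sketch that is not reflected in the outputs reported below), the cleaning pipeline could also lower-case the text and stem words:
# optional: lower-case and stem the cleaned corpus (uses SnowballC via tm)
cleanCorpus <- tm_map(cleanCorpus, content_transformer(tolower))
cleanCorpus <- tm_map(cleanCorpus, stemDocument)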
Word Clouds for Spam and Ham
spam_indices <- which(ham_spam_DF$type == "spam")
wordcloud(corpus[spam_indices], max.words = 75, random.order = FALSE, random.color = TRUE, colors = palette())

ham_indices <- which(ham_spam_DF$type == "ham")
wordcloud(corpus[ham_indices], max.words = 75, random.order = FALSE, random.color = TRUE, colors = palette())
## Warning in wordcloud(corpus[ham_indices], max.words = 75, random.order =
## FALSE, : <https://listman.spamassassin.taint.org/mailman/listinfo/exmh-
## workers>, could not be fit on page. It will not be plotted.
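The word clouds above are drawn from the raw corpus; as a sketch, they could instead use cleanCorpus (which is built from ham_spam_DF$text, so the same spam/ham row indices apply), so that numbers, punctuation, and stop words do not dominate the plot:
# hypothetical variant: word cloud from the cleaned corpus
wordcloud(cleanCorpus[spam_indices], max.words = 75, random.order = FALSE,
          random.color = TRUE, colors = palette())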

Prepare test and train data
set.seed(100)
sample_size <- floor(0.70 * nrow(ham_spam_DF))
train_idx <- sample(seq_len(nrow(ham_spam_DF)), size = sample_size)
training_df <- ham_spam_DF[train_idx, ]
testing_df <- ham_spam_DF[-train_idx, ]
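Since caret is already loaded, a stratified split that preserves the ham/spam proportions in both sets could be used instead of a plain random sample (a sketch; the objects with a "2" suffix are hypothetical, and the results below use the sample() split above):
train_idx2 <- createDataPartition(ham_spam_DF$type, p = 0.70, list = FALSE)
training_df2 <- ham_spam_DF[train_idx2, ]
testing_df2 <- ham_spam_DF[-train_idx2, ]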
training_corp <- Corpus(VectorSource(training_df$text))
testing_corp <- Corpus(VectorSource(testing_df$text))
training_dtm <- DocumentTermMatrix(training_corp)
testing_dtm <- DocumentTermMatrix(testing_corp)
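Because the training and testing document-term matrices are built from separate corpora, their columns do not necessarily match. A common refinement (a sketch, not applied to the results below; the _freq objects are hypothetical) is to restrict both matrices to terms that appear at least a few times in the training set:
# keep only terms that occur at least 5 times in the training documents
freq_terms <- findFreqTerms(training_dtm, lowfreq = 5)
train_dtm_freq <- DocumentTermMatrix(training_corp, control = list(dictionary = freq_terms))
test_dtm_freq <- DocumentTermMatrix(testing_corp, control = list(dictionary = freq_terms))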
counter <- function(x) {
  # convert raw term counts into a Yes/No factor marking term presence
  y <- ifelse(x > 0, 1, 0)
  y <- factor(y, levels = c(0, 1), labels = c("No", "Yes"))
  y
}
train_sms <- apply(training_dtm, 2, counter)
test_sms <- apply(testing_dtm, 2, counter)
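A toy example of what counter does to a single column of term counts:
counter(c(0, 3, 1))  # returns the factor No, Yes, Yes (levels: No, Yes)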
Train Naive Bayes classifier and predict on test data
classifier <- naiveBayes(train_sms, factor(training_df$type))
predict_test <- predict(classifier, newdata = test_sms)
table(predict_test, testing_df$type)
##
## predict_test ham spam
## ham 756 0
## spam 0 414
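For a fuller summary of the same result, caret's confusionMatrix() reports accuracy, sensitivity, and specificity (a sketch using the objects above, treating "spam" as the positive class):
confusionMatrix(predict_test, factor(testing_df$type), positive = "spam")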