library(tidyverse)
library(tm)
library(tidytext)
library(wordcloud)
# paths to the local easy_ham and spam folders
easy_ham <- "/Users/zhianna/Downloads/easy_ham/"
easy_spam <- "/Users/zhianna/Downloads/spam2/"
# list the file names in each folder
ham_files <- list.files(easy_ham)
spam_files <- list.files(easy_spam)
# loop over the easy_ham files and collect their lines
corpus <- c()
for (x in ham_files) {
  doc <- paste0(easy_ham, x) # full path to one ham file
  read <- readLines(doc)     # read the file line by line
  corpus <- c(corpus, read)  # append its lines to the running corpus
}
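# The loop above grows `corpus` one file at a time by copying; an equivalent
# vectorized base-R alternative (shown commented out so the files are not read
# twice) would be:
# corpus <- unlist(lapply(paste0(easy_ham, ham_files), readLines))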
# tokenize the raw lines into one word per row
corpus <- data.frame(text = corpus, stringsAsFactors = FALSE) %>%
  unnest_tokens(word, text)
# collapse the tokens into a single document, then build a volatile corpus
corpus2 <- VectorSource(paste(corpus$word, collapse = " "))
corpus3 <- VCorpus(corpus2)
# clean_corpus: helper used to clean up a corpus
clean_corpus <- function(corpus) {
  corpus <- tm_map(corpus, removePunctuation)                    # remove punctuation
  corpus <- tm_map(corpus, content_transformer(tolower))         # convert to lowercase
  corpus <- tm_map(corpus, removeWords, words = stopwords("en")) # remove English stopwords
  corpus <- tm_map(corpus, stripWhitespace)                      # collapse extra whitespace
  corpus <- tm_map(corpus, removeNumbers)                        # remove numbers
  return(corpus)
}
# clean the ham corpus with the helper above
ham_clean <- clean_corpus(corpus3)
# build a document-term matrix for ham
ham_dtm <- DocumentTermMatrix(ham_clean)
ham_dtm
## <<DocumentTermMatrix (documents: 1, terms: 36250)>>
## Non-/sparse entries: 36250/0
## Sparsity : 0%
## Maximal term length: 76
## Weighting : term frequency (tf)
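# As a quick sanity check (an added step, not part of the original write-up),
# tm's findFreqTerms() lists terms at or above a frequency threshold; the
# lowfreq value of 100 is an arbitrary illustrative cutoff.
findFreqTerms(ham_dtm, lowfreq = 100)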
# convert the DTM to a data frame, reshape to long form, and label it as ham
ham_df <- as.data.frame(as.matrix(ham_dtm))
ham3 <- pivot_longer(ham_df, cols = everything()) %>%
  mutate(label = "ham")
# word cloud of the cleaned ham corpus
wordcloud(ham_clean, max.words = 70)
## Warning in wordcloud(ham_clean, max.words = 70): received could not be fit on
## page. It will not be plotted.
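# The warning above means one long term ("received") could not be drawn within
# the plot region at the default size. Shrinking wordcloud()'s scale argument
# (the values below are illustrative, not from the original analysis) usually
# lets every word fit:
wordcloud(ham_clean, max.words = 70, scale = c(3, 0.5))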

# loop over the spam files and collect their lines
spamcorpus <- c()
for (x in spam_files) {
  doc <- paste0(easy_spam, x)       # full path to one spam file
  read <- readLines(doc)            # read the file line by line
  spamcorpus <- c(spamcorpus, read) # append its lines to the running corpus
}
# tokenize the raw lines into one word per row
spamcorpus <- data.frame(text = spamcorpus, stringsAsFactors = FALSE) %>%
  unnest_tokens(word, text)
# collapse the tokens into a single document, then build a volatile corpus
spamcorpus2 <- VectorSource(paste(spamcorpus$word, collapse = " "))
spamcorpus3 <- VCorpus(spamcorpus2)
# clean the spam corpus with the helper above
spam_clean <- clean_corpus(spamcorpus3)
# build a document-term matrix for spam
spam_dtm <- DocumentTermMatrix(spam_clean)
spam_dtm
# convert the DTM to a data frame, reshape to long form, and label it as spam
spam_df <- as.data.frame(as.matrix(spam_dtm))
spam3 <- pivot_longer(spam_df, cols = everything()) %>%
  mutate(label = "spam")
# word cloud of spam terms, weighted by their counts
wordcloud(spam3$name, spam3$value, max.words = 70)

# combine the labelled spam and ham tokens into one data frame
together <- rbind(spam3, ham3) %>%
  select(name, label)
together$label <- as.factor(together$label)
str(together)
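# A quick class-balance check before splitting (an added step, not in the
# original write-up):
table(together$label)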
# use 70% of the dataset as the training set and the remaining 30% as the test set
set.seed(1)
in_train <- sample(c(TRUE, FALSE), nrow(together), replace = TRUE, prob = c(0.7, 0.3))
train <- together[in_train, ]
test <- together[!in_train, ]
# view dimensions of the training set
dim(train)
## [1] 53003 2
# view dimensions of the test set
dim(test)
## [1] 22898 2
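# With a train/test split in hand, a classifier can be fit. The sketch below is
# an added illustration, not part of the original analysis: it assumes the
# e1071 package is installed and uses each token as a single (very crude)
# predictor of its label.
library(e1071)
train$name <- factor(train$name)
test$name <- factor(test$name, levels = levels(train$name)) # unseen tokens become NA
model <- naiveBayes(label ~ name, data = train)
pred <- predict(model, test)
table(pred, test$label) # confusion matrix of predicted vs. actual labels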