library(tm)
## Loading required package: NLP
library(stringr)
library(SnowballC)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(knitr)
library(tidyr)
library(tidytext)
library(wordcloud)
## Loading required package: RColorBrewer
library(caret)
## Loading required package: lattice
library(gbm)
## Loaded gbm 2.1.8.1
library(e1071)
# Build the ham corpus from the files under the easy_ham_2 folder,
# reading each one with tm's plain-text reader.
ham <- Corpus(
  DirSource("C:/Users/paola/Downloads/spamham/easy_ham_2"),
  readerControl = list(reader = readPlain)
)
length(ham)
## [1] 1401
# Build the spam corpus the same way from the spam_2 folder.
spam <- Corpus(
  DirSource("C:/Users/paola/Downloads/spamham/spam_2"),
  readerControl = list(reader = readPlain)
)
length(spam)
## [1] 1398
# Clean the ham corpus: lower-case the text, then drop English stop words
# and digits. Punctuation is left in place, so tokens such as "received:"
# keep their trailing colon.
ham_docs <- tm_map(ham, content_transformer(tolower))
ham_docs <- tm_map(ham_docs, removeWords, stopwords("english"))
ham_docs <- tm_map(ham_docs, removeNumbers)

# Term-document matrix (terms in rows), converted to a dense matrix so the
# per-term totals can be computed with rowSums().
a <- TermDocumentMatrix(ham_docs)
b <- as.matrix(a)

# Total count of each term across all ham documents, largest first.
c <- sort(rowSums(b), decreasing = TRUE)

# One row per term; row names are the terms because `c` is a named vector.
ham_df <- data.frame(word = names(c), freq = c)
head(ham_df)
## word freq
## received: received: 9473
## aug aug 7917
## esmtp esmtp 6134
## jul jul 5458
## [...]) [...]) 5438
## (../..) (../..) 3409
# Clean the spam corpus: lower-case the text, then drop English stop words
# and digits. Punctuation and HTML markup are left in place, so tokens such
# as "<td" and "<br>" survive.
spam_docs <- tm_map(spam, content_transformer(tolower))
spam_docs <- tm_map(spam_docs, removeWords, stopwords("english"))
spam_docs <- tm_map(spam_docs, removeNumbers)

# Term-document matrix (terms in rows), converted to a dense matrix so the
# per-term totals can be computed with rowSums().
d <- TermDocumentMatrix(spam_docs)
e <- as.matrix(d)

# Total count of each term across all spam documents, largest first.
f <- sort(rowSums(e), decreasing = TRUE)

# One row per term; row names are the terms because `f` is a named vector.
spam_df <- data.frame(word = names(f), freq = f)
head(spam_df)
## word freq
## received: received: 5976
## <td <td 5493
## <br> <br> 4655
## jul jul 4279
## </tr> </tr> 3267
## esmtp esmtp 3085
The most common term in both the spam and ham files is “received:”, which is to be expected: every email carries “Received:” header lines that are added by the mail servers that relay it. After that, the most common term in the ham files is “aug”, while in the spam files it is the HTML fragment “<td”, which is markup rather than an actual word.