library(tm)
## Loading required package: NLP
library(stringr)
library(SnowballC)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(knitr)
library(tidyr)
library(tidytext)
library(wordcloud)
## Loading required package: RColorBrewer
library(caret)
## Loading required package: lattice
library(gbm)
## Loaded gbm 2.1.8.1
library(e1071)

Load Data

# Read every file in the ham directory as one plain-text document.
ham <- Corpus(
  DirSource("C:/Users/paola/Downloads/spamham/easy_ham_2"),
  readerControl = list(reader = readPlain)
)
length(ham)
## [1] 1401
# Read every file in the spam directory the same way.
spam <- Corpus(
  DirSource("C:/Users/paola/Downloads/spamham/spam_2"),
  readerControl = list(reader = readPlain)
)
length(spam)
## [1] 1398

Analyze the Data

# Clean the ham corpus: lower-case the text, drop English stop words, then
# strip digits. No punctuation or header removal is applied here, so raw
# tokens are counted as they appear in the message files.
ham_docs <- tm_map(ham, content_transformer(tolower))
ham_docs <- tm_map(ham_docs, removeWords, stopwords("english"))
ham_docs <- tm_map(ham_docs, removeNumbers)

# Total occurrences of each term across all ham documents, highest first.
a <- TermDocumentMatrix(ham_docs)
b <- as.matrix(a)
c <- sort(rowSums(b), decreasing = TRUE)
ham_df <- data.frame(word = names(c), freq = c)
head(ham_df)
##                word freq
## received: received: 9473
## aug             aug 7917
## esmtp         esmtp 6134
## jul             jul 5458
## [...])       [...]) 5438
## (../..)     (../..) 3409
# Clean the spam corpus: lower-case the text, drop English stop words, then
# strip digits. No punctuation or markup removal is applied here, so raw
# tokens are counted as they appear in the message files.
spam_docs <- tm_map(spam, content_transformer(tolower))
spam_docs <- tm_map(spam_docs, removeWords, stopwords("english"))
spam_docs <- tm_map(spam_docs, removeNumbers)

# Total occurrences of each term across all spam documents, highest first.
d <- TermDocumentMatrix(spam_docs)
e <- as.matrix(d)
f <- sort(rowSums(e), decreasing = TRUE)
spam_df <- data.frame(word = names(f), freq = f)
head(spam_df)
##                word freq
## received: received: 5976
## <td             <td 5493
## <br>           <br> 4655
## jul             jul 4279
## </tr>         </tr> 3267
## esmtp         esmtp 3085

Conclusion

The most common token in both the spam and ham files is “received:”, which is to be expected, since each mail server that relays a message adds a “Received:” header line to it. For the ham files, “aug” is the next most common token, while in the spam files the next most common “word” is the HTML fragment “<td”.