SpamAssassin

library(tm)

## Loading required package: NLP

library(stringr)
library(SnowballC)
library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

library(knitr)
library(tidyr)
library(tidytext)
library(wordcloud)

## Loading required package: RColorBrewer

library(caret)

## Loading required package: lattice

library(gbm)

## Loaded gbm 2.1.8.1

library(e1071)

Load Data

# Read every file in the easy_ham_2 directory into a corpus as plain text,
# then report how many documents were loaded.
ham_source <- DirSource("C:/Users/paola/Downloads/spamham/easy_ham_2")
ham <- Corpus(ham_source, readerControl = list(reader = readPlain))
length(ham)

## [1] 1401

# Read every file in the spam_2 directory into a corpus as plain text,
# then report how many documents were loaded.
spam_source <- DirSource("C:/Users/paola/Downloads/spamham/spam_2")
spam <- Corpus(spam_source, readerControl = list(reader = readPlain))
length(spam)

## [1] 1398

Analyze the Data

# Build a word-frequency table for the ham corpus.
# Clean-up steps, applied in this order: lower-case the text, remove English
# stop words, remove numbers. The original `ham` corpus is left unmodified.
ham_docs <- tm_map(ham, content_transformer(tolower))
ham_docs <- tm_map(ham_docs, removeWords, stopwords("english"))
ham_docs <- tm_map(ham_docs, removeNumbers)

# Sum each term's row of the term-document matrix to get its total count
# across all documents, most frequent first.
# Descriptive names are used for the intermediates; in particular nothing is
# assigned to `c`, which would mask base::c() in the global environment.
ham_tdm <- TermDocumentMatrix(ham_docs)
ham_term_freq <- sort(rowSums(as.matrix(ham_tdm)), decreasing = TRUE)
ham_df <- data.frame(word = names(ham_term_freq), freq = ham_term_freq)
head(ham_df)

##                word freq
## received: received: 9473
## aug             aug 7917
## esmtp         esmtp 6134
## jul             jul 5458
## [...])       [...]) 5438
## (../..)     (../..) 3409

# Build a word-frequency table for the spam corpus.
# Clean-up steps, applied in this order: lower-case the text, remove English
# stop words, remove numbers. The original `spam` corpus is left unmodified.
spam_docs <- tm_map(spam, content_transformer(tolower))
spam_docs <- tm_map(spam_docs, removeWords, stopwords("english"))
spam_docs <- tm_map(spam_docs, removeNumbers)

# Total count of each term across all documents, most frequent first.
spam_tdm <- TermDocumentMatrix(spam_docs)
spam_term_freq <- rowSums(as.matrix(spam_tdm))
spam_term_freq <- sort(spam_term_freq, decreasing = TRUE)
spam_df <- data.frame(word = names(spam_term_freq), freq = spam_term_freq)
head(spam_df)

##                word freq
## received: received: 5976
## <td             <td 5493
## <br>           <br> 4655
## jul             jul 4279
## </tr>         </tr> 3267
## esmtp         esmtp 3085

Conclusion

The most common token in both the spam and ham files is “received:”, which is to be expected, as each message records its delivery in “Received:” header lines. After that, “aug” is the most common word in the ham files, while the spam files have “<td” as the next most common “word”.

SpamAssassin

Tyler Brown

2022-12-01

Load Data

Analyze the Data

Conclusion