Introduction

In Project 4 we were tasked with creating a document classification model to act as a spam filter. I took spam and non-spam (ham) emails from https://spamassassin.apache.org/old/publiccorpus/.

Data Collection

First I created a couple of for loops that reference the local directories and read in the text files for the ham and spam emails.

library(readr)

ham_directory <- "C:/Users/gerso/Documents/DATA-607/Project 4/20030228_easy_ham_2.tar/20030228_easy_ham_2/easy_ham_2"
ham_list <- list.files(ham_directory)

h <- data.frame()

# Read each ham email file and append it as a row
for (f in ham_list){
  fl <- paste(ham_directory, '/', f, sep = "")
  x <- data.frame(read_file(fl))
  h <- rbind(h, x)
}

spam_directory <- "C:/Users/gerso/Documents/DATA-607/Project 4/20050311_spam_2.tar/20050311_spam_2/spam_2"
spam_list <- list.files(spam_directory)

s <- data.frame()

# Read each spam email file and append it as a row
for (f in spam_list){
  sl <- paste(spam_directory, '/', f, sep = "")
  x <- data.frame(read_file(sl))
  s <- rbind(s, x)
}
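
As an aside, the same data frames could be built without explicit loops. This is only a sketch of an alternative, assuming readr is loaded and the same directory paths; the helper read_dir is hypothetical and not part of the original code:

# Sketch: read every file in a directory into a one-column data frame
read_dir <- function(dir) {
  files <- list.files(dir, full.names = TRUE)
  data.frame(Msg = vapply(files, read_file, character(1)),
             stringsAsFactors = FALSE)
}

h <- read_dir(ham_directory)
s <- read_dir(spam_directory)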

Data Analysis

After getting the data into data frames, I shuffled the combined data frame so that splitting it into training and test sets would not produce a set that was all spam or all ham. I then converted the "Msg" data into a corpus and cleaned it of things like white space and punctuation using the tm package. Finally, I trained a random forest classifier (randomForest package) and used it to predict which emails in the test dataset were ham versus spam. I relied heavily on online research and used snippets of code from the following sources:

https://towardsdatascience.com/random-forest-text-classification-trump-v-obama-c09f947173dc

https://towardsdatascience.com/sms-text-classification-a51defc2361c

library(tm)
## Warning: package 'tm' was built under R version 3.6.3
## Loading required package: NLP
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(caTools)
## Warning: package 'caTools' was built under R version 3.6.3
library(e1071)
## Warning: package 'e1071' was built under R version 3.6.3
library(caret) 
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## The following object is masked from 'package:NLP':
## 
##     annotate
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.6.3
colnames(h) <- c("Msg")
colnames(s) <- c("Msg")

# Label ham as 0 and spam as 1
h <- sqldf("select '0' as type, Msg from h")
s <- sqldf("select '1' as type, Msg from s")

# Combine into one data frame and drop everything from the first
# "Message-Id" header onward
hs <- rbind(h, s)

hs$Msg <- sub("Message-Id.*", "", hs$Msg)



# Shuffle the rows so neither class is clustered at one end of the data frame
set.seed(99)
rows <- sample(nrow(hs))
hs <- hs[rows, ]


# Build a corpus from the combined messages and clean it: force UTF-8,
# lowercase, remove numbers, punctuation, and stop words, stem, and
# collapse whitespace
hamcorpus <- Corpus(VectorSource(hs$Msg))
hamcorpus<-tm_map(hamcorpus, content_transformer(function(x) iconv(enc2utf8(x), sub = "byte")))
## Warning in tm_map.SimpleCorpus(hamcorpus, content_transformer(function(x)
## iconv(enc2utf8(x), : transformation drops documents
hamcorpus <- tm_map(hamcorpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(hamcorpus, content_transformer(tolower)):
## transformation drops documents
hamcorpus <- tm_map(hamcorpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(hamcorpus, removeNumbers): transformation drops
## documents
hamcorpus <- tm_map(hamcorpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(hamcorpus, removePunctuation): transformation
## drops documents
hamcorpus <- tm_map(hamcorpus, removeWords, stopwords())
## Warning in tm_map.SimpleCorpus(hamcorpus, removeWords, stopwords()):
## transformation drops documents
hamcorpus <- tm_map(hamcorpus, stemDocument)
## Warning in tm_map.SimpleCorpus(hamcorpus, stemDocument): transformation drops
## documents
hamcorpus <- tm_map(hamcorpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(hamcorpus, stripWhitespace): transformation drops
## documents
# Document-term matrix, keeping only terms that appear in at least 10% of documents
dtm <- DocumentTermMatrix(hamcorpus)
dtm <- removeSparseTerms(dtm, 0.9)

data <- as.data.frame(as.matrix(dtm))
data$type <- hs$type

# 75/25 train/test split, stratified on the type label
set.seed(1234)
split <- sample.split(data$type, SplitRatio = 0.75)
training_set <- subset(data, split == TRUE)
test_set <- subset(data, split == FALSE)

ts<-as.data.frame(test_set)

# Drop the class label from the predictors by name rather than by a fixed
# column position, and fit a small random forest
predictors <- setdiff(names(training_set), "type")

classifier <- randomForest(x = training_set[, predictors],
                           y = as.factor(training_set$type),
                           ntree = 10)

y <- predict(classifier, newdata = test_set[, predictors])


pred<-as.data.frame(y)
pred$type<-ts$type


tableset<-sqldf("select y,type,case when y = 1 then 'Spam Prediction' else 'Ham Prediction' end pred,
                case when type = 1 then 'Spam Actual' else 'Ham Actual' end actual
                from pred")

table<-table(tableset$pred,tableset$actual)

table
##                  
##                   Ham Actual Spam Actual
##   Ham Prediction         349           0
##   Spam Prediction          1         349
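
Since caret is already loaded, the same comparison can also be summarized with caret's confusionMatrix(), which reports accuracy, sensitivity, and specificity alongside the contingency table. A minimal sketch, treating spam ("1") as the positive class:

# Summarize the predictions with caret, using spam ("1") as the positive class
cm <- confusionMatrix(factor(y,       levels = c("0", "1")),
                      factor(ts$type, levels = c("0", "1")),
                      positive = "1")
cm$overall["Accuracy"]
cm$byClass[c("Sensitivity", "Specificity")]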

Conclusion

My spam classifier produced one false positive (a ham email predicted as spam) and did not miss any actual spam emails. To further this analysis I would score a dataset external to the one used to create the training and test data frames, which would allow for more robust testing of the model's effectiveness.
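
As a sketch of what that would look like, new messages would need to go through the same cleaning steps and then be scored against the vocabulary the model was trained on. The directory path below is hypothetical, and using a dictionary to align the new document-term matrix with the training terms is one possible approach, not part of the original code:

# Sketch: score an external batch of emails with the existing classifier
new_dir   <- "C:/path/to/external_emails"   # hypothetical directory of new emails
new_files <- list.files(new_dir, full.names = TRUE)
new_msgs  <- vapply(new_files, read_file, character(1))

# Apply the same cleaning steps used on the training corpus
new_corpus <- Corpus(VectorSource(new_msgs))
new_corpus <- tm_map(new_corpus, content_transformer(function(x) iconv(enc2utf8(x), sub = "byte")))
new_corpus <- tm_map(new_corpus, content_transformer(tolower))
new_corpus <- tm_map(new_corpus, removeNumbers)
new_corpus <- tm_map(new_corpus, removePunctuation)
new_corpus <- tm_map(new_corpus, removeWords, stopwords())
new_corpus <- tm_map(new_corpus, stemDocument)
new_corpus <- tm_map(new_corpus, stripWhitespace)

# Restrict the new document-term matrix to the terms the model was trained on
new_dtm  <- DocumentTermMatrix(new_corpus, control = list(dictionary = Terms(dtm)))
new_data <- as.data.frame(as.matrix(new_dtm))

new_pred <- predict(classifier, newdata = new_data)
table(new_pred)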