In Project 4 we were tasked with creating a document classification filter that would act as a spam filter. I took spam and non-spam (ham) emails from https://spamassassin.apache.org/old/publiccorpus/.
First I created two for loops that walk the local directories and read in the text files associated with the ham and spam emails.
library(readr)
# Load every file in the easy-ham directory into `h`, a one-column data frame
# holding the raw text of one file per row.
ham_directory <- "C:/Users/gerso/Documents/DATA-607/Project 4/20030228_easy_ham_2.tar/20030228_easy_ham_2/easy_ham_2"
# NOTE(review): list.files() returns every entry in the directory, so any
# non-email file stored alongside the messages is read as a message too.
ham_list <- list.files(ham_directory)
# Collect the file contents in a preallocated vector and build the data frame
# once, instead of copying a growing data frame with rbind() on each pass.
ham_text <- character(length(ham_list))
for (i in seq_along(ham_list)) {
  ham_text[i] <- read_file(file.path(ham_directory, ham_list[i]))
}
h <- data.frame(Msg = ham_text, stringsAsFactors = FALSE)
# Load every file in the spam directory into `s`, a one-column data frame
# holding the raw text of one file per row.
spam_directory <- "C:/Users/gerso/Documents/DATA-607/Project 4/20050311_spam_2.tar/20050311_spam_2/spam_2"
# NOTE(review): list.files() returns every entry in the directory, so any
# non-email file stored alongside the messages is read as a message too.
spam_list <- list.files(spam_directory)
# Collect the file contents in a preallocated vector and build the data frame
# once, instead of copying a growing data frame with rbind() on each pass.
spam_text <- character(length(spam_list))
for (i in seq_along(spam_list)) {
  spam_text[i] <- read_file(file.path(spam_directory, spam_list[i]))
}
s <- data.frame(Msg = spam_text, stringsAsFactors = FALSE)
After getting the data into data frames I shuffled the rows of the combined data frame so that splitting it would not leave me with a set containing only spam or only ham. I then converted the “Msg” column into a corpus and cleaned the corpus of things like white space and punctuation using the tm package. I then used a random forest analysis and the caret package to predict which emails in the test dataset were ham and which were spam. I relied heavily on online research and used snippets of code from the following sources:
https://towardsdatascience.com/random-forest-text-classification-trump-v-obama-c09f947173dc
https://towardsdatascience.com/sms-text-classification-a51defc2361c
library(tm)
## Warning: package 'tm' was built under R version 3.6.3
## Loading required package: NLP
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(caTools)
## Warning: package 'caTools' was built under R version 3.6.3
library(e1071)
## Warning: package 'e1071' was built under R version 3.6.3
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## The following object is masked from 'package:NLP':
##
## annotate
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.6.3
# Label the two sets of messages, stack them, and shuffle the rows so that a
# later train/test split does not see all ham followed by all spam.
colnames(h) <- "Msg"
colnames(s) <- "Msg"
# Class labels are the strings "0" (ham) and "1" (spam). The label column is
# added directly rather than by copying every message through a SQL query.
h <- data.frame(
  type = rep("0", nrow(h)),
  Msg = as.character(h$Msg),
  stringsAsFactors = FALSE
)
s <- data.frame(
  type = rep("1", nrow(s)),
  Msg = as.character(s$Msg),
  stringsAsFactors = FALSE
)
hs <- rbind(h, s)
# NOTE(review): with R's default regex engine `.` may also match newlines, in
# which case this removes everything from the first "Message-Id" to the end of
# the message rather than just that header line -- confirm this is intended.
hs$Msg <- sub("Message-Id.*", "", hs$Msg)
# Fixed seed so the shuffled row order is reproducible.
set.seed(99)
rows <- sample(nrow(hs))
hs <- hs[rows, ]
# Build a tm corpus from the shuffled messages (ham and spam together) and
# normalise the text before it is turned into a document-term matrix.
#
# In the recorded run every tm_map() step below emitted the warning
# "transformation drops documents" from tm_map.SimpleCorpus().
clean_corpus <- function(corpus) {
  # Re-encode first; bytes that cannot be converted are substituted via
  # iconv()'s sub = "byte" rather than producing NA.
  reencode <- content_transformer(function(x) iconv(enc2utf8(x), sub = "byte"))
  lowercase <- content_transformer(tolower)

  corpus <- tm_map(corpus, reencode)
  corpus <- tm_map(corpus, lowercase)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords())
  corpus <- tm_map(corpus, stemDocument)
  # Whitespace is collapsed last, after the removals above have run.
  tm_map(corpus, stripWhitespace)
}

hamcorpus <- clean_corpus(Corpus(VectorSource(hs$Msg)))
# Turn the cleaned corpus into a data frame of term counts plus the class
# label, then split it into training and test sets.
dtm <- DocumentTermMatrix(hamcorpus)
# Drop sparse terms (sparsity threshold 0.9) to keep the matrix manageable.
dtm <- removeSparseTerms(dtm, 0.9)
data <- as.data.frame(as.matrix(dtm))
# NOTE(review): if the matrix already contains a term column called "type",
# this assignment overwrites it instead of appending the label column.
data$type <- hs$type
# Fixed seed so the train/test split is reproducible.
set.seed(1234)
split <- sample.split(data$type, SplitRatio = 0.75)
# Index with the logical vector directly. subset() evaluates its condition
# inside `data` first, so a term column that happened to be named "split"
# would silently be used in place of this vector.
training_set <- data[split, , drop = FALSE]
test_set <- data[!split, , drop = FALSE]
# Fit a random forest on the training set and predict the class of every
# message in the test set.
ts <- as.data.frame(test_set)
# Select the predictors by name. A fixed position such as column 250 removes
# the label only when the matrix happens to have exactly 249 term columns;
# otherwise the label itself is left among the predictors.
feature_cols <- setdiff(names(training_set), "type")
# randomForest()'s argument is spelled `ntree`. The previous spelling `nTree`
# does not match it (R argument names are case-sensitive), so the requested 10
# trees were not applied and the default of 500 was used. The default is
# written out here so the model matches the one behind the reported results.
classifier <- randomForest(
  x = training_set[, feature_cols, drop = FALSE],
  y = as.factor(training_set$type),
  ntree = 500
)
y <- predict(classifier, newdata = test_set[, feature_cols, drop = FALSE])
# Compare the predictions with the true labels of the test set and print the
# confusion matrix (rows: predicted class, columns: actual class).
pred <- as.data.frame(y)
pred$type <- ts$type
# Labels are factors with fixed levels so the matrix is always 2 x 2, even if
# one class never appears among the predictions or the actual labels.
# Anything other than "1" is treated as ham.
tableset <- data.frame(
  y = pred$y,
  type = pred$type,
  pred = factor(
    ifelse(pred$y %in% "1", "Spam Prediction", "Ham Prediction"),
    levels = c("Ham Prediction", "Spam Prediction")
  ),
  actual = factor(
    ifelse(pred$type %in% "1", "Spam Actual", "Ham Actual"),
    levels = c("Ham Actual", "Spam Actual")
  ),
  stringsAsFactors = FALSE
)
table <- table(tableset$pred, tableset$actual)
table
##
## Ham Actual Spam Actual
## Ham Prediction 349 0
## Spam Prediction 1 349
My document classification spam filter produced one false positive (a ham email predicted as spam) and did not miss any actual spam emails. To further this analysis I would introduce a dataset that would be external to the dataset I used to create the test and training data frames. This would allow for more robust testing of the effectiveness of this model.