loading packages
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.0.5
library(e1071)
## Warning: package 'e1071' was built under R version 4.0.5
library(tm)
## Warning: package 'tm' was built under R version 4.0.5
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
functions
# function for cleaning up text.
# Takes in string, removes various, non word characters, and splits it into a vector of phrases
prepare_str = function(str){
prepared = str%>%
# remove junk
str_replace_all("\\&|\\*|\\_|\'|\"|\r|\t|\n|\\$|\\-|\\/|,|\\@|\\.|\\<|\\>|\\;|\\:|\\[|\\]|\\)|\\(|\\=|\\!|\\?",' ')%>%
str_to_lower()%>%
removeWords(stopwords('en'))%>%
removeNumbers()%>%
stripWhitespace()%>%
return()
}
loading data
spam_emails = list.files("spam", full.names = TRUE)
ham_emails = list.files("ham", full.names = TRUE)
# dataframe with text and flags 1:spam 0: ham
email = c()
text = c()
for (spam in spam_emails){
email = c(email,1)
txt = read_file(spam)
text = c(text, txt)
}
for (ham in ham_emails){
email = c(email,0)
txt = read_file(ham)
text = c(text, txt)
}
emails = data.frame(email = email, text = text)
#head(emails)
word dictionary
# create Document Term Matrix using tm() library
dtm = emails$text%>%
prepare_str()%>%
VectorSource()%>%
Corpus()%>%
DocumentTermMatrix()%>%
# remove uncommon terms
removeSparseTerms(.9)
inspect(dtm)
## <<DocumentTermMatrix (documents: 3898, terms: 218)>>
## Non-/sparse entries: 260274/589490
## Sparsity : 69%
## Maximal term length: 12
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs com font fork http list localhost net org received spamassassin
## 1055 22 542 0 4 6 5 30 0 16 3
## 1083 20 542 0 4 6 5 31 0 14 3
## 1090 204 1102 0 516 5 6 43 4 7 1
## 1091 204 1102 0 516 2 6 43 4 7 1
## 1300 176 36 17 156 6 7 0 6 5 4
## 1563 175 198 0 141 2 5 11 14 4 11
## 28 146 1627 0 80 4 0 8 5 2 0
## 51 167 41 0 83 12 0 14 0 2 0
## 77 165 41 0 93 0 0 24 4 2 0
## 909 26 340 0 11 24 5 33 0 11 3
Create table for modeling
# transform dtm created with tm() package to a dataframe
word_matrix = dtm %>%
as.matrix()%>%
as.data.frame() %>%
mutate(spam = emails$email)
head(word_matrix)
## address admin also ascii aug back beenthere bulk business call charset click
## 1 1 4 2 1 8 1 1 1 1 1 1 1
## 2 10 0 1 0 0 0 0 0 0 0 1 0
## 3 10 0 0 0 0 0 0 0 0 0 1 1
## 4 10 0 0 0 0 0 0 0 0 0 1 1
## 5 4 0 0 0 5 1 0 0 0 0 1 2
## 6 0 0 0 7 0 0 0 0 0 0 2 0
## com content date delivered dogma drop edt email errors esmtp fetchmail form
## 1 6 1 1 1 1 1 1 4 1 4 1 1
## 2 14 1 2 0 1 0 0 3 0 1 0 0
## 3 20 1 2 0 1 0 0 3 0 2 0 0
## 4 20 1 2 0 1 0 0 3 0 3 0 0
## 5 10 1 1 1 0 1 1 2 0 2 1 0
## 6 2 6 1 0 0 0 1 1 0 4 0 3
## format free fri friends get good group hotmail http imap info information ist
## 1 1 3 5 1 1 1 2 1 1 1 1 6 1
## 2 0 3 0 0 1 0 0 0 4 0 0 2 0
## 3 0 4 0 0 2 5 0 0 4 0 0 2 0
## 4 0 4 0 0 2 5 0 0 4 0 0 2 0
## 5 0 0 0 0 1 1 0 0 2 0 0 3 1
## 6 0 0 1 0 0 0 0 0 3 0 0 0 0
## jmason just labs like linux list listinfo localhost mailing mailman many may
## 1 1 1 1 1 13 5 1 7 1 2 1 1
## 2 1 0 2 0 0 0 0 0 0 0 0 6
## 3 1 0 2 1 0 1 0 0 0 0 0 6
## 4 1 0 2 1 0 1 0 0 0 0 0 7
## 5 0 3 1 2 0 0 0 6 0 0 0 2
## 6 0 0 0 0 0 1 1 3 1 0 0 0
## message mime money need net netnoteinc new now one org path people phobos
## 1 2 1 1 1 2 2 1 1 4 7 1 4 2
## 2 1 1 0 1 1 3 0 1 3 2 1 0 0
## 3 1 1 0 0 0 3 4 3 2 2 1 0 0
## 4 1 1 0 0 4 4 4 3 2 2 1 0 0
## 5 1 1 1 1 4 3 0 0 0 0 1 2 1
## 6 1 2 0 0 28 0 12 0 0 0 0 0 0
## plain please postfix precedence real receive received reply return see send
## 1 1 2 1 1 1 2 7 1 1 1 4
## 2 0 4 0 0 2 0 3 0 2 0 1
## 3 0 4 0 0 0 1 3 0 1 0 1
## 4 0 4 0 0 0 1 4 0 1 0 1
## 5 0 6 1 0 0 1 5 1 1 0 1
## 6 0 0 0 0 0 0 7 0 0 0 0
## sender sent single slashnull subject text think time tue type use used users
## 1 1 2 1 1 2 1 1 1 3 2 3 1 2
## 2 0 0 0 1 1 1 0 2 0 1 1 0 0
## 3 0 0 0 1 2 1 0 1 0 1 1 0 0
## 4 0 0 0 1 2 1 0 1 0 1 1 0 0
## 5 0 0 1 0 1 1 0 0 3 1 0 0 0
## 6 0 0 0 0 1 3 0 1 7 13 0 0 0
## version way will work www years yyyy body can center color day delivery don
## 1 2 1 6 2 1 1 1 0 0 0 0 0 0 0
## 2 1 0 3 1 4 0 0 2 1 2 3 1 1 3
## 3 1 1 4 0 4 0 0 7 2 4 4 2 1 2
## 4 1 1 4 0 4 0 0 7 2 4 4 2 1 2
## 5 1 0 1 0 2 1 3 1 3 2 7 1 0 1
## 6 3 0 0 0 1 0 0 2 0 19 36 0 0 0
## even express font help home href html jul jun keywords mail mailto make
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 1 1 6 1 1 4 4 1 1 1 1 2 1
## 3 0 0 6 2 0 5 4 1 1 1 2 3 1
## 4 0 0 6 2 0 5 4 1 1 1 3 3 1
## 5 0 0 20 0 0 2 4 0 0 0 2 0 1
## 6 0 0 230 0 0 0 6 8 0 0 2 0 0
## mandark mon name phone right service smtp take today web well line microsoft
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 2 6 2 1 1 1 2 1 1 1 2 0 0
## 3 2 1 2 1 1 0 0 2 1 2 0 1 1
## 4 2 1 2 1 1 0 0 2 1 2 0 1 1
## 5 0 0 1 0 0 0 1 0 1 0 0 1 0
## 6 0 0 23 0 0 0 0 0 0 0 0 0 1
## remove removed still want wed without yahoo thu know nbsp oct old post sat
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 1 1 1 1 5 2 1 0 0 0 0 0 0 0
## 4 1 1 1 1 1 2 1 5 0 0 0 0 0 0
## 5 1 0 0 0 0 0 0 0 1 1 1 1 1 1
## 6 0 0 0 0 0 0 0 0 0 60 0 0 2 0
## size sun table unsubscribe width align bgcolor bit border cellpadding
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 3 2 2 1 2 0 0 0 0 0
## 6 83 0 4 0 25 17 2 1 2 1
## cellspacing encoding face head height https lists php printable quoted
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 1 1 49 2 2 1 3 1 2 2
## sourceforge spamassassin sponsored title transfer iso system taint user
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 2 3 1 2 1 0 0 0 0
## windows arial build example find first found mailer msmail normal original
## 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0
## outlook priority produced unknown world archive best contact debian exim helo
## 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0
## request subscribe discussion fork internet khare lair pdt pipermail rohit
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## xent top link much network using url invoked qmail really references wrote
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## sep agent utf jalapeno rssfeeds spam
## 1 0 0 0 0 0 1
## 2 0 0 0 0 0 1
## 3 0 0 0 0 0 1
## 4 0 0 0 0 0 1
## 5 0 0 0 0 0 1
## 6 0 0 0 0 0 1
training data
len = dim(word_matrix)[1]
set.seed(1111)
# create training/ evaluation sets
training = sample(seq(len),size = round(len*.7))
training_set = word_matrix[training,]
evaluation_set = word_matrix[-training,]
# naive bayes model
model1 = naiveBayes(training_set,training_set$spam)
predictions
predictions = data.frame(train = training_set$spam)
predictions$model1 = predict(model1,newdata =training_set)
predictions = predictions %>%
mutate(train = as.factor(train),
model1 = as.factor(model1)
)
# 95% accurate on itself
confusionMatrix(predictions$model1,predictions$train)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1752 84
## 1 12 881
##
## Accuracy : 0.9648
## 95% CI : (0.9572, 0.9714)
## No Information Rate : 0.6464
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9217
##
## Mcnemar's Test P-Value : 4.28e-13
##
## Sensitivity : 0.9932
## Specificity : 0.9130
## Pos Pred Value : 0.9542
## Neg Pred Value : 0.9866
## Prevalence : 0.6464
## Detection Rate : 0.6420
## Detection Prevalence : 0.6728
## Balanced Accuracy : 0.9531
##
## 'Positive' Class : 0
##
testing on evaluation set
eval = data.frame(train = evaluation_set$spam)
eval$model1 = predict(model1,evaluation_set)
eval = eval%>%
mutate(train = as.factor(train),
model1 = as.factor(model1)
)
# showed similar accuracy in evaluation set
confusionMatrix(eval$model1,eval$train)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 732 40
## 1 5 392
##
## Accuracy : 0.9615
## 95% CI : (0.9488, 0.9718)
## No Information Rate : 0.6305
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.916
##
## Mcnemar's Test P-Value : 4.011e-07
##
## Sensitivity : 0.9932
## Specificity : 0.9074
## Pos Pred Value : 0.9482
## Neg Pred Value : 0.9874
## Prevalence : 0.6305
## Detection Rate : 0.6262
## Detection Prevalence : 0.6604
## Balanced Accuracy : 0.9503
##
## 'Positive' Class : 0
##