Loading packages

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.0.5
library(e1071)
## Warning: package 'e1071' was built under R version 4.0.5
library(tm)
## Warning: package 'tm' was built under R version 4.0.5
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift

Functions

# Function for cleaning up text.
# Takes a character vector, replaces punctuation and other non-word characters
# with spaces, lowercases, removes English stop words and numbers, and
# collapses repeated whitespace.
prepare_str = function(str){
  str %>%
    # replace junk characters with spaces
    str_replace_all("\\&|\\*|\\_|\'|\"|\r|\t|\n|\\$|\\-|\\/|,|\\@|\\.|\\<|\\>|\\;|\\:|\\[|\\]|\\)|\\(|\\=|\\!|\\?", ' ') %>%
    str_to_lower() %>%
    removeWords(stopwords('en')) %>%
    removeNumbers() %>%
    stripWhitespace()
}
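
A quick sanity check on a made-up string (the example text below is purely illustrative) shows what the cleaning does:

# punctuation and numbers are stripped, the text is lowercased,
# and English stop words ("here", "now", ...) are dropped
prepare_str("Get $100 FREE!!! Click <here> now, don't wait...")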

Loading data

spam_emails = list.files("spam", full.names = TRUE)
ham_emails = list.files("ham", full.names = TRUE)

# build a data frame of email text with flags (1 = spam, 0 = ham)
email = c()
text = c()
for (spam in spam_emails){
  email = c(email,1)
  txt = read_file(spam)
  text = c(text, txt)
}

for (ham in ham_emails){
  email = c(email,0)
  txt = read_file(ham)
  text = c(text, txt)
}

emails = data.frame(email = email, text = text)
#head(emails)
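
The same table can be built without the explicit loops. Below is a minimal sketch using purrr::map_chr (attached via the tidyverse); emails_alt is just an illustrative name:

emails_alt = data.frame(
  email = c(rep(1, length(spam_emails)), rep(0, length(ham_emails))),
  text  = map_chr(c(spam_emails, ham_emails), read_file)
)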

Word dictionary

# create a document-term matrix with the tm package
dtm = emails$text %>%
  prepare_str() %>%
  VectorSource() %>%
  Corpus() %>%
  DocumentTermMatrix() %>%
  # drop terms absent from more than 90% of documents
  removeSparseTerms(.9)

inspect(dtm)
## <<DocumentTermMatrix (documents: 3898, terms: 218)>>
## Non-/sparse entries: 260274/589490
## Sparsity           : 69%
## Maximal term length: 12
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   com font fork http list localhost net org received spamassassin
##   1055  22  542    0    4    6         5  30   0       16            3
##   1083  20  542    0    4    6         5  31   0       14            3
##   1090 204 1102    0  516    5         6  43   4        7            1
##   1091 204 1102    0  516    2         6  43   4        7            1
##   1300 176   36   17  156    6         7   0   6        5            4
##   1563 175  198    0  141    2         5  11  14        4           11
##   28   146 1627    0   80    4         0   8   5        2            0
##   51   167   41    0   83   12         0  14   0        2            0
##   77   165   41    0   93    0         0  24   4        2            0
##   909   26  340    0   11   24         5  33   0       11            3
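
The .9 cutoff passed to removeSparseTerms() keeps only terms that appear in at least roughly 10% of the documents. To see which terms are common at a given raw count, tm's findFreqTerms() can be used; the cutoff of 100 below is arbitrary:

# terms that occur at least 100 times across the corpus
findFreqTerms(dtm, lowfreq = 100)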

Create table for modeling

# convert the document-term matrix to a data frame and attach the spam flag
word_matrix = dtm %>%
  as.matrix() %>%
  as.data.frame() %>%
  mutate(spam = emails$email)

head(word_matrix)
##   address admin also ascii aug back beenthere bulk business call charset click
## 1       1     4    2     1   8    1         1    1        1    1       1     1
## 2      10     0    1     0   0    0         0    0        0    0       1     0
## 3      10     0    0     0   0    0         0    0        0    0       1     1
## 4      10     0    0     0   0    0         0    0        0    0       1     1
## 5       4     0    0     0   5    1         0    0        0    0       1     2
## 6       0     0    0     7   0    0         0    0        0    0       2     0
##   com content date delivered dogma drop edt email errors esmtp fetchmail form
## 1   6       1    1         1     1    1   1     4      1     4         1    1
## 2  14       1    2         0     1    0   0     3      0     1         0    0
## 3  20       1    2         0     1    0   0     3      0     2         0    0
## 4  20       1    2         0     1    0   0     3      0     3         0    0
## 5  10       1    1         1     0    1   1     2      0     2         1    0
## 6   2       6    1         0     0    0   1     1      0     4         0    3
##   format free fri friends get good group hotmail http imap info information ist
## 1      1    3   5       1   1    1     2       1    1    1    1           6   1
## 2      0    3   0       0   1    0     0       0    4    0    0           2   0
## 3      0    4   0       0   2    5     0       0    4    0    0           2   0
## 4      0    4   0       0   2    5     0       0    4    0    0           2   0
## 5      0    0   0       0   1    1     0       0    2    0    0           3   1
## 6      0    0   1       0   0    0     0       0    3    0    0           0   0
##   jmason just labs like linux list listinfo localhost mailing mailman many may
## 1      1    1    1    1    13    5        1         7       1       2    1   1
## 2      1    0    2    0     0    0        0         0       0       0    0   6
## 3      1    0    2    1     0    1        0         0       0       0    0   6
## 4      1    0    2    1     0    1        0         0       0       0    0   7
## 5      0    3    1    2     0    0        0         6       0       0    0   2
## 6      0    0    0    0     0    1        1         3       1       0    0   0
##   message mime money need net netnoteinc new now one org path people phobos
## 1       2    1     1    1   2          2   1   1   4   7    1      4      2
## 2       1    1     0    1   1          3   0   1   3   2    1      0      0
## 3       1    1     0    0   0          3   4   3   2   2    1      0      0
## 4       1    1     0    0   4          4   4   3   2   2    1      0      0
## 5       1    1     1    1   4          3   0   0   0   0    1      2      1
## 6       1    2     0    0  28          0  12   0   0   0    0      0      0
##   plain please postfix precedence real receive received reply return see send
## 1     1      2       1          1    1       2        7     1      1   1    4
## 2     0      4       0          0    2       0        3     0      2   0    1
## 3     0      4       0          0    0       1        3     0      1   0    1
## 4     0      4       0          0    0       1        4     0      1   0    1
## 5     0      6       1          0    0       1        5     1      1   0    1
## 6     0      0       0          0    0       0        7     0      0   0    0
##   sender sent single slashnull subject text think time tue type use used users
## 1      1    2      1         1       2    1     1    1   3    2   3    1     2
## 2      0    0      0         1       1    1     0    2   0    1   1    0     0
## 3      0    0      0         1       2    1     0    1   0    1   1    0     0
## 4      0    0      0         1       2    1     0    1   0    1   1    0     0
## 5      0    0      1         0       1    1     0    0   3    1   0    0     0
## 6      0    0      0         0       1    3     0    1   7   13   0    0     0
##   version way will work www years yyyy body can center color day delivery don
## 1       2   1    6    2   1     1    1    0   0      0     0   0        0   0
## 2       1   0    3    1   4     0    0    2   1      2     3   1        1   3
## 3       1   1    4    0   4     0    0    7   2      4     4   2        1   2
## 4       1   1    4    0   4     0    0    7   2      4     4   2        1   2
## 5       1   0    1    0   2     1    3    1   3      2     7   1        0   1
## 6       3   0    0    0   1     0    0    2   0     19    36   0        0   0
##   even express font help home href html jul jun keywords mail mailto make
## 1    0       0    0    0    0    0    0   0   0        0    0      0    0
## 2    1       1    6    1    1    4    4   1   1        1    1      2    1
## 3    0       0    6    2    0    5    4   1   1        1    2      3    1
## 4    0       0    6    2    0    5    4   1   1        1    3      3    1
## 5    0       0   20    0    0    2    4   0   0        0    2      0    1
## 6    0       0  230    0    0    0    6   8   0        0    2      0    0
##   mandark mon name phone right service smtp take today web well line microsoft
## 1       0   0    0     0     0       0    0    0     0   0    0    0         0
## 2       2   6    2     1     1       1    2    1     1   1    2    0         0
## 3       2   1    2     1     1       0    0    2     1   2    0    1         1
## 4       2   1    2     1     1       0    0    2     1   2    0    1         1
## 5       0   0    1     0     0       0    1    0     1   0    0    1         0
## 6       0   0   23     0     0       0    0    0     0   0    0    0         1
##   remove removed still want wed without yahoo thu know nbsp oct old post sat
## 1      0       0     0    0   0       0     0   0    0    0   0   0    0   0
## 2      0       0     0    0   0       0     0   0    0    0   0   0    0   0
## 3      1       1     1    1   5       2     1   0    0    0   0   0    0   0
## 4      1       1     1    1   1       2     1   5    0    0   0   0    0   0
## 5      1       0     0    0   0       0     0   0    1    1   1   1    1   1
## 6      0       0     0    0   0       0     0   0    0   60   0   0    2   0
##   size sun table unsubscribe width align bgcolor bit border cellpadding
## 1    0   0     0           0     0     0       0   0      0           0
## 2    0   0     0           0     0     0       0   0      0           0
## 3    0   0     0           0     0     0       0   0      0           0
## 4    0   0     0           0     0     0       0   0      0           0
## 5    3   2     2           1     2     0       0   0      0           0
## 6   83   0     4           0    25    17       2   1      2           1
##   cellspacing encoding face head height https lists php printable quoted
## 1           0        0    0    0      0     0     0   0         0      0
## 2           0        0    0    0      0     0     0   0         0      0
## 3           0        0    0    0      0     0     0   0         0      0
## 4           0        0    0    0      0     0     0   0         0      0
## 5           0        0    0    0      0     0     0   0         0      0
## 6           1        1   49    2      2     1     3   1         2      2
##   sourceforge spamassassin sponsored title transfer iso system taint user
## 1           0            0         0     0        0   0      0     0    0
## 2           0            0         0     0        0   0      0     0    0
## 3           0            0         0     0        0   0      0     0    0
## 4           0            0         0     0        0   0      0     0    0
## 5           0            0         0     0        0   0      0     0    0
## 6           2            3         1     2        1   0      0     0    0
##   windows arial build example find first found mailer msmail normal original
## 1       0     0     0       0    0     0     0      0      0      0        0
## 2       0     0     0       0    0     0     0      0      0      0        0
## 3       0     0     0       0    0     0     0      0      0      0        0
## 4       0     0     0       0    0     0     0      0      0      0        0
## 5       0     0     0       0    0     0     0      0      0      0        0
## 6       0     0     0       0    0     0     0      0      0      0        0
##   outlook priority produced unknown world archive best contact debian exim helo
## 1       0        0        0       0     0       0    0       0      0    0    0
## 2       0        0        0       0     0       0    0       0      0    0    0
## 3       0        0        0       0     0       0    0       0      0    0    0
## 4       0        0        0       0     0       0    0       0      0    0    0
## 5       0        0        0       0     0       0    0       0      0    0    0
## 6       0        0        0       0     0       0    0       0      0    0    0
##   request subscribe discussion fork internet khare lair pdt pipermail rohit
## 1       0         0          0    0        0     0    0   0         0     0
## 2       0         0          0    0        0     0    0   0         0     0
## 3       0         0          0    0        0     0    0   0         0     0
## 4       0         0          0    0        0     0    0   0         0     0
## 5       0         0          0    0        0     0    0   0         0     0
## 6       0         0          0    0        0     0    0   0         0     0
##   xent top link much network using url invoked qmail really references wrote
## 1    0   0    0    0       0     0   0       0     0      0          0     0
## 2    0   0    0    0       0     0   0       0     0      0          0     0
## 3    0   0    0    0       0     0   0       0     0      0          0     0
## 4    0   0    0    0       0     0   0       0     0      0          0     0
## 5    0   0    0    0       0     0   0       0     0      0          0     0
## 6    0   0    0    0       0     0   0       0     0      0          0     0
##   sep agent utf jalapeno rssfeeds spam
## 1   0     0   0        0        0    1
## 2   0     0   0        0        0    1
## 3   0     0   0        0        0    1
## 4   0     0   0        0        0    1
## 5   0     0   0        0        0    1
## 6   0     0   0        0        0    1
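
One caveat: e1071::naiveBayes() treats numeric predictors as Gaussian, so the raw counts above are modeled as continuous features. A common variant (not used here) is to recode each term as a presence/absence factor; a minimal sketch, with word_matrix_bin as an illustrative name:

# recode word counts as "yes"/"no" factors, leaving the spam flag untouched
word_matrix_bin = word_matrix %>%
  mutate(across(-spam, ~ factor(ifelse(. > 0, "yes", "no"))))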

Training data

len = nrow(word_matrix)

set.seed(1111)
# split rows 70/30 into training and evaluation sets
training = sample(seq(len), size = round(len * .7))
training_set = word_matrix[training,]
evaluation_set = word_matrix[-training,]

# fit a naive Bayes model (e1071) on the training set
model1 = naiveBayes(training_set, training_set$spam)
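
Note that training_set still contains the spam column, so it is passed to naiveBayes() along with the word counts. A variant that keeps the label out of the predictors would look like the sketch below (model1_alt is an illustrative name; the results that follow were produced with the call as written above):

model1_alt = naiveBayes(training_set %>% select(-spam), as.factor(training_set$spam))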

Predictions

predictions = data.frame(train = training_set$spam)
predictions$model1 = predict(model1, newdata = training_set)

predictions = predictions %>%
  mutate(train = as.factor(train),
         model1 = as.factor(model1)
         )
# ~96% accurate on the training data itself
confusionMatrix(predictions$model1, predictions$train)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1752   84
##          1   12  881
##                                           
##                Accuracy : 0.9648          
##                  95% CI : (0.9572, 0.9714)
##     No Information Rate : 0.6464          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9217          
##                                           
##  Mcnemar's Test P-Value : 4.28e-13        
##                                           
##             Sensitivity : 0.9932          
##             Specificity : 0.9130          
##          Pos Pred Value : 0.9542          
##          Neg Pred Value : 0.9866          
##              Prevalence : 0.6464          
##          Detection Rate : 0.6420          
##    Detection Prevalence : 0.6728          
##       Balanced Accuracy : 0.9531          
##                                           
##        'Positive' Class : 0               
## 
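
caret treats the first factor level ("0", i.e. ham) as the positive class by default, which is why sensitivity above refers to ham. To report the same table with spam as the positive class, the positive argument can be set explicitly:

# same confusion matrix, with spam ("1") as the positive class
confusionMatrix(predictions$model1, predictions$train, positive = "1")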

Testing on the evaluation set

eval = data.frame(train = evaluation_set$spam)
eval$model1 = predict(model1, evaluation_set)

eval = eval %>%
  mutate(train = as.factor(train),
         model1 = as.factor(model1)
         )

# accuracy on the held-out evaluation set is similar (~96%)
confusionMatrix(eval$model1, eval$train)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 732  40
##          1   5 392
##                                           
##                Accuracy : 0.9615          
##                  95% CI : (0.9488, 0.9718)
##     No Information Rate : 0.6305          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.916           
##                                           
##  Mcnemar's Test P-Value : 4.011e-07       
##                                           
##             Sensitivity : 0.9932          
##             Specificity : 0.9074          
##          Pos Pred Value : 0.9482          
##          Neg Pred Value : 0.9874          
##              Prevalence : 0.6305          
##          Detection Rate : 0.6262          
##    Detection Prevalence : 0.6604          
##       Balanced Accuracy : 0.9503          
##                                           
##        'Positive' Class : 0               
##
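
For pulling individual metrics out programmatically rather than reading them off the printout, the object returned by confusionMatrix() can be indexed directly; cm below is just an illustrative name:

cm = confusionMatrix(eval$model1, eval$train)
cm$overall["Accuracy"]    # overall accuracy on the evaluation set
cm$byClass["Sensitivity"] # sensitivity for the positive ("0" = ham) class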