Project 4

John Kellogg

2019-11-17

This project is to attempt to create a method to codify and classify SPAM and HAM emails. All of the dataset is pulled from https://spamassassin.apache.org/old/publiccorpus/.

Data Loading

Loading the dataset for the “HAM” messages.

ham_dir <- "~/dumps/easy_ham_2"
hamFileNames <- list.files(ham_dir)

head(hamFileNames, 20)

##  [1] "00001.1a31cc283af0060967a233d26548a6ce"
##  [2] "00002.5a587ae61666c5aa097c8e866aedcc59"
##  [3] "00003.19be8acd739ad589cd00d8425bac7115"
##  [4] "00004.b2ed6c3c62bbdfab7683d60e214d1445"
##  [5] "00005.07b9d4aa9e6c596440295a5170111392"
##  [6] "00006.654c4ec7c059531accf388a807064363"
##  [7] "00007.2e086b13730b68a21ee715db145522b9"
##  [8] "00008.6b73027e1e56131377941ff1db17ff12"
##  [9] "00009.13c349859b09264fa131872ed4fb6e4e"
## [10] "00010.d1b4dbbad797c5c0537c5a0670c373fd"
## [11] "00011.bc1aa4dca14300a8eec8b7658e568f29"
## [12] "00012.3c1ff7380f10a806321027fc0ad09560"
## [13] "00013.245fc5b9e5719b033d5d740c51af92e0"
## [14] "00014.8e21078a89bd9c57255d302f346551e8"
## [15] "00015.d5c8f360cf052b222819718165db24c6"
## [16] "00016.bc1f434b566619637a0de033cd3380d1"
## [17] "00017.8b965080dfffada165a54c041c27e33f"
## [18] "00018.3b6a8c5da4043f2a6a63a1ae12bd9824"
## [19] "00019.c6b272a04ec32252f7c685f464ae3942"
## [20] "00020.83ef024f76cc42b8245a683ed9b38406"

We need to strip out the header information as the function below only needs the body of the email. Using the function identified above, we are able to extract just the meat of the message and save into a Dataframe.

ham_docs_list <- NA
for(i in 1:length(hamFileNames))
{
  filepath<-paste0(ham_dir, "/", hamFileNames[1])  
  Content <-suppressWarnings(warning(readtext(filepath)))
  message <- get_email_body (Content)
  message <- gsub("<.*?>", " ", message)
  list1<- list(paste(message, collapse="\n"))
  ham_docs_list = c(ham_docs_list,list1)
  
}

hamDF <-as.data.frame(unlist(ham_docs_list),stringsAsFactors = FALSE)
hamDF$type <- "ham"
colnames(hamDF) <- c("text","type")

Loading the dataset for the “SPAM” messages.

spam_dir <- "~/dumps/spam_2"
spamFileNames <- list.files(spam_dir)

head(spamFileNames, 20)

##  [1] "00001.317e78fa8ee2f54cd4890fdc09ba8176"
##  [2] "00002.9438920e9a55591b18e60d1ed37d992b"
##  [3] "00003.590eff932f8704d8b0fcbe69d023b54d"
##  [4] "00004.bdcc075fa4beb5157b5dd6cd41d8887b"
##  [5] "00005.ed0aba4d386c5e62bc737cf3f0ed9589"
##  [6] "00006.3ca1f399ccda5d897fecb8c57669a283"
##  [7] "00007.acefeee792b5298f8fee175f9f65c453"
##  [8] "00008.ccf927a6aec028f5472ca7b9db9eee20"
##  [9] "00009.1e1a8cb4b57532ab38aa23287523659d"
## [10] "00010.2558d935f6439cb40d3acb8b8569aa9b"
## [11] "00011.bd8c904d9f7b161a813d222230214d50"
## [12] "00012.cb9c9f2a25196f5b16512338625a85b4"
## [13] "00013.372ec9dc663418ca71f7d880a76f117a"
## [14] "00014.13574737e55e51fe6737a475b88b5052"
## [15] "00015.206d5a5d1d34272ae32fc286788fdf55"
## [16] "00016.4fb07c8dff1a5a2b4889dc5024c55023"
## [17] "00017.6430f3b8dedf51ba3c3fcb9304e722e7"
## [18] "00018.336cb9e7b0358594cf002e7bf669eaf5"
## [19] "00019.86ce6f6c2e9f4ae0415860fecdf055db"
## [20] "00020.7d36d16fd2be07c4f6a5616590cdea07"

Again, just as in the ‘HAM’ messages, we only need the meat of the email saved to a Dataframe.

spam_docs_list <- NA
for(i in 1:length(spamFileNames))
{
  filepath<-paste0(spam_dir, "/", spamFileNames[1])  
  Content <-suppressWarnings(warning(readtext(filepath)))
  message <- get_email_body (Content)
  message <- gsub("<.*?>", " ", message)
  list1<- list(paste(message, collapse="\n"))
  spam_docs_list = c(spam_docs_list,list1)
  
}

spamDF <-as.data.frame(unlist(spam_docs_list),stringsAsFactors = FALSE)
spamDF$type <- "spam"
colnames(spamDF) <- c("text","type")

Combining the two dataframes into one DF.

spam_ham_comb <- rbind(hamDF, spamDF)

Creating the Corpus

Using the TM function, we need to create a ‘corpus’ from the text which was extracted in the earlier section.

corpus_ham = VCorpus(VectorSource(hamDF$text))
corpus_ham = tm_map(corpus_ham, content_transformer(tolower))
corpus_ham = tm_map(corpus_ham, removeNumbers)
corpus_ham = tm_map(corpus_ham, removePunctuation)
corpus_ham = tm_map(corpus_ham, removeWords, stopwords())
#corpus_ham = tm_map(corpus_ham, stemDocument) # I left the "Stem" function in however I didn't like what it did to the clouds below
corpus_ham = tm_map(corpus_ham, stripWhitespace)
corpus_ham = tm_map(corpus_ham, removeWords, c("lbrace", "rbrace"))

corpus_spam = VCorpus(VectorSource(spamDF$text))
corpus_spam = tm_map(corpus_spam, content_transformer(tolower))
corpus_spam = tm_map(corpus_spam, removeNumbers)
corpus_spam = tm_map(corpus_spam, removePunctuation)
corpus_spam = tm_map(corpus_spam, removeWords, stopwords())
#corpus_spam = tm_map(corpus_spam, stemDocument) # I left the "Stem" function in however I didn't like what it did to the clouds below
corpus_spam = tm_map(corpus_spam, stripWhitespace)
corpus_spam = tm_map(corpus_spam, removeWords, c("lbrace", "rbrace"))

corpus_all <- c(corpus_ham, corpus_spam)


for(i in 1:length(corpus_ham)){
  meta(corpus_all[[i]],"classification") <- "Ham"}
for(i in (length(corpus_ham)+1):(length(corpus_spam)+length(corpus_ham))){
  meta(corpus_all[[i]],"classification") <- "Spam"}

What does this data now look like? Creating a word cloud, I can see at quick glance what the data now looks like. There are a few “non” words which in the future I would want to go abck and clean up. They come from the footer of the email. This would become a problem in companies who use email disclaimers in their post signature section. Luckily, most of those are the same text so it can be filtered out.

SPAM word cloud

wordcloud(corpus_spam, max.words = 50)

HAM word cloud

wordcloud(corpus_ham, max.words = 50)

Next, I wanted to play with getting the Sparsity of the messages down to 0%. What words start to emerge from the data?

HAM Sparsity:0%

ham_subset <- corpus_ham
ham_subset_dtm <- DocumentTermMatrix(ham_subset)
ham_subset_tdm <- TermDocumentMatrix (ham_subset)

dtm_ham_Sparse <- removeSparseTerms(ham_subset_dtm, 0.2)
inspect(dtm_ham_Sparse)

## <<DocumentTermMatrix (documents: 1402, terms: 291)>>
## Non-/sparse entries: 407691/291
## Sparsity           : 0%
## Maximal term length: 59
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs exmh folder invoked line list message procedure sequences window
##   10    7      9      25   23   10       8        18        13      8
##   11    7      9      25   23   10       8        18        13      8
##   2     7      9      25   23   10       8        18        13      8
##   3     7      9      25   23   10       8        18        13      8
##   4     7      9      25   23   10       8        18        13      8
##   5     7      9      25   23   10       8        18        13      8
##   6     7      9      25   23   10       8        18        13      8
##   7     7      9      25   23   10       8        18        13      8
##   8     7      9      25   23   10       8        18        13      8
##   9     7      9      25   23   10       8        18        13      8
##     Terms
## Docs within
##   10     25
##   11     25
##   2      25
##   3      25
##   4      25
##   5      25
##   6      25
##   7      25
##   8      25
##   9      25

dtm_ham_freq <- sort(colSums(as.matrix(dtm_ham_Sparse)), decreasing = TRUE)
head(dtm_ham_freq, 50)

##         invoked          within            line       procedure 
##           35025           35025           32223           25218 
##       sequences            list          folder         message 
##           18213           14010           12609           11208 
##          window            exmh             one            just 
##           11208            9807            9807            8406 
##             new        sequence            show          unseen 
##            8406            8406            8406            8406 
##            args    folderchange       msgchange         msgshow 
##            7005            7005            7005            7005 
##             now            pick            also      background 
##            7005            7005            5604            5604 
##            menu           msgid         uplevel            body 
##            5604            5604            5604            4203 
##  busycursorhack             cmd           didnt         display 
##            4203            4203            4203            4203 
##            eval           inbox            name       pickinner 
##            4203            4203            4203            4203 
##            time             use          useful           black 
##            4203            4203            4203            2802 
##          bottom            busy busycursorinner             can 
##            2802            2802            2802            2802 
##          chance         changes             cur         current 
##            2802            2802            2802            2802 
##          cursor         defined 
##            2802            2802

SPAM Sparsity:0%

spam_subset <- corpus_spam
spam_subset_dtm <- DocumentTermMatrix(spam_subset)
spam_subset_tdm <- TermDocumentMatrix (spam_subset)

dtm_spam_Sparse <- removeSparseTerms(spam_subset_dtm, 0.2)
inspect(dtm_spam_Sparse)

## <<DocumentTermMatrix (documents: 1398, terms: 182)>>
## Non-/sparse entries: 254254/182
## Sparsity           : 0%
## Maximal term length: 33
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs email free information letter list mlm people receiving send will
##   10     4    3           6      4    4   9      4         4    4    6
##   11     4    3           6      4    4   9      4         4    4    6
##   2      4    3           6      4    4   9      4         4    4    6
##   3      4    3           6      4    4   9      4         4    4    6
##   4      4    3           6      4    4   9      4         4    4    6
##   5      4    3           6      4    4   9      4         4    4    6
##   6      4    3           6      4    4   9      4         4    4    6
##   7      4    3           6      4    4   9      4         4    4    6
##   8      4    3           6      4    4   9      4         4    4    6
##   9      4    3           6      4    4   9      4         4    4    6

dtm_spam_freq <- sort(colSums(as.matrix(dtm_spam_Sparse)), decreasing = TRUE)
head(dtm_spam_freq, 50)

##         mlm information        will       email      letter        list 
##       12573        8382        8382        5588        5588        5588 
##      people   receiving        send        free   marketing         one 
##        5588        5588        5588        4191        4191        4191 
##        tell         use       youve        also         big        ever 
##        4191        4191        4191        2794        2794        2794 
##      havent       inbox  multilevel      online      please        read 
##        2794        2794        2794        2794        2794        2794 
##     receive        sent      signed     someone        spam     systems 
##        2794        2794        2794        2794        2794        2794 
##        work       works   abandoned      accept     address       agree 
##        2794        2794        1397        1397        1397        1397 
##      agreed alternative       altra     apology backstabbed     believe 
##        1397        1397        1397        1397        1397        1397 
##  beneficial    betrayed         box       brief      burned    business 
##        1397        1397        1397        1397        1397        1397 
##        call       click 
##        1397        1397

For the actual model, I wanted more words than the Sparsity O% would give me. Setting the removal back up to 0.90 gives us a sparsity of 47%, meaning only terms in 47% of the documents were returned. I also wanted the terms to be in a dataframe which can then be shuffled. Shuffling gives us less chance of a huge block of HAM and a huge block of SPAM when setting the training/test sets.

terms_matrix <- DocumentTermMatrix(corpus_all)
dtm_training <- removeSparseTerms(terms_matrix, 0.90)
inspect(dtm_training)

## <<DocumentTermMatrix (documents: 2800, terms: 446)>>
## Non-/sparse entries: 661945/586855
## Sparsity           : 47%
## Maximal term length: 59
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs folder invoked line list message mlm one procedure sequences within
##   10      9      25   23   10       8   0   7        18        13     25
##   11      9      25   23   10       8   0   7        18        13     25
##   2       9      25   23   10       8   0   7        18        13     25
##   3       9      25   23   10       8   0   7        18        13     25
##   4       9      25   23   10       8   0   7        18        13     25
##   5       9      25   23   10       8   0   7        18        13     25
##   6       9      25   23   10       8   0   7        18        13     25
##   7       9      25   23   10       8   0   7        18        13     25
##   8       9      25   23   10       8   0   7        18        13     25
##   9       9      25   23   10       8   0   7        18        13     25

training_DF <- as.data.frame(as.matrix(dtm_training))
training_DF$class = spam_ham_comb$type

#Shuffling the data
training_DF <- training_DF[sample(1:nrow(training_DF)),]

In order for the next functions to really work, I had to make the entire dataframe as numberic.

# In order for the next functions to really work, I had to make the entire dataframe as numberic.
training_DF <- training_DF %>% mutate(class = replace(class, class == "ham", 0))
training_DF <- training_DF %>% mutate(class = replace(class, class == "spam", 1))
training_DF$class <- as.numeric(as.character(training_DF$class))
training_DF <- as.data.frame(apply(training_DF, 2, as.numeric))

# splitting the dataframe into Training and Test

set.seed(500)
split = sample.split(training_DF$class, SplitRatio = 0.75)
training_set = subset(training_DF, split == TRUE)
test_set = subset(training_DF, split == FALSE)

# QC of the Sets
nrow(training_set)

## [1] 2100

nrow(test_set)

## [1] 700

# setting up the total number of observations 
num_observation <- ncol(training_set) - 1
num_observation

## [1] 446

I set up a Random Forest model to randomly pull values and be sampled independently. I grow 5 trees.

rand_class = randomForest(x = training_set[-num_observation],
                          y = training_set$class,
                          ntree = 5)

## Warning in randomForest.default(x = training_set[-num_observation], y =
## training_set$class, : The response has five or fewer unique values. Are you
## sure you want to do regression?

Creating the prediction of the results

predicting_y = predict(rand_class, newdata = test_set[-num_observation])

test_matrix <- table(predicting_y>0,test_set$class)
cm

## function (x) 
## 2.54 * x
## <bytecode: 0x000000000e74d648>
## <environment: namespace:grDevices>

Is my model accurate?

success <- test_matrix['TRUE', 2] + test_matrix['FALSE', 1] 
accuracy <- success/nrow(test_set) * 100
accuracy

## [1] 100