Grando Project 4

Note, I decided to take all the latest spam/ham files available so if you try to re-run this analysis, please be aware that it will take a while.

options(width = 100)
# This is a standard setup I include so that my working
# directory is set correctly whether I work on one of my
# windows or linux machines.
if (Sys.info()["sysname"] == "Windows") {
    setwd("~/Masters/DATA607/Project4")
} else {
    setwd("~/Documents/Masters/DATA607/Project4")
}

Load the ham/spam files

For this assignment, I took all the updated spam and ham emails located on the referenced website. I loaded all of these emails into dataframes and did some simple text formatting. I left warnings enabled for the text processing to ensure that there are not numerous file errors (there is only one noted below).

suppressWarnings(suppressMessages(library(stringr)))
suppressWarnings(suppressMessages(library(plyr)))
suppressWarnings(suppressMessages(library("tm")))
suppressWarnings(suppressMessages(library(tidytext)))
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(tidyr)))

format_list <- function(mail_type, y) {
    lapply(y, function(x) {
        to_list <- str_replace(unlist(str_match_all(x, "\\s*To:\\s(.*)"))[1], 
            "To: ", "")
        from_list <- str_replace(unlist(str_match_all(x, "\\s*From:\\s(.*)"))[1], 
            "From: ", "")
        content_list <- str_replace(unlist(str_match_all(x, "\\s*Content-Type:\\s*(.*)"))[1], 
            "\\s*Content-Type: ", "")
        subject_list <- str_replace(unlist(str_match_all(x, "\\s*Subject:\\s(.*)"))[1], 
            "\\s*Subject: ", "")
        body_text <- paste(str_match_all(paste(x, collapse = " "), 
            "([[:alpha:]]+)")[[1]], collapse = " ")
        x <- c(mail_type, to_list, from_list, content_list, subject_list, 
            body_text)
    })
}

easy_ham_file_list <- list.files(path = "./data/easy_ham", full.names = TRUE, 
    pattern = "0.*")
easy_ham_files <- lapply(easy_ham_file_list, readLines)
easy_ham_list <- format_list("ham", easy_ham_files)
easy_ham_df <- lapply(easy_ham_list, function(x) {
    x <- data.frame(matrix(x, ncol = 6), stringsAsFactors = FALSE)
})
easy_ham_df <- rbind.fill(easy_ham_df)

easy_ham_2_file_list <- list.files(path = "./data/easy_ham_2", 
    full.names = TRUE, pattern = "0.*")
easy_ham_2_files <- lapply(easy_ham_2_file_list, readLines)
easy_ham_2_list <- format_list("ham", easy_ham_2_files)
easy_ham_2_df <- lapply(easy_ham_2_list, function(x) {
    x <- data.frame(matrix(x, ncol = 6), stringsAsFactors = FALSE)
})
easy_ham_2_df <- rbind.fill(easy_ham_2_df)

hard_ham_file_list <- list.files(path = "./data/hard_ham", full.names = TRUE, 
    pattern = "0.*")
hard_ham_files <- lapply(hard_ham_file_list, readLines)
## Warning in FUN(X[[i]], ...): incomplete final line found on './data/hard_ham/
## 00228.0eaef7857bbbf3ebf5edbbdae2b30493'
hard_ham_list <- format_list("ham", hard_ham_files)
hard_ham_df <- lapply(hard_ham_list, function(x) {
    x <- data.frame(matrix(x, ncol = 6), stringsAsFactors = FALSE)
})
hard_ham_df <- rbind.fill(hard_ham_df)

spam_file_list <- list.files(path = "./data/spam", full.names = TRUE, 
    pattern = "0.*")
spam_files <- lapply(spam_file_list, readLines)
spam_list <- format_list("spam", spam_files)
spam_df <- lapply(spam_list, function(x) {
    x <- data.frame(matrix(x, ncol = 6), stringsAsFactors = FALSE)
})
spam_df <- rbind.fill(spam_df)

spam_2_file_list <- list.files(path = "./data/spam_2", full.names = TRUE, 
    pattern = "0.*")
spam_2_files <- lapply(spam_2_file_list, readLines)
spam_2_list <- format_list("spam", spam_2_files)
spam_2_df <- lapply(spam_2_list, function(x) {
    x <- data.frame(matrix(x, ncol = 6), stringsAsFactors = FALSE)
})
spam_2_df <- rbind.fill(spam_2_df)

full_set_df <- rbind(easy_ham_df, easy_ham_2_df, hard_ham_df, 
    spam_df, spam_2_df)
colnames(full_set_df) <- c("type", "to", "from", "content_type", 
    "subject", "body")

full_set_df$body <- removeWords(str_replace_all(tolower(full_set_df$body), 
    "[\\W]", " "), stopwords("english"))
full_set_df$subject <- removeWords(str_replace_all(tolower(full_set_df$subject), 
    "[\\W]", " "), stopwords("english"))
full_set_df$from <- removeWords(str_replace_all(tolower(full_set_df$from), 
    "[\\W]", " "), c("com"))
full_set_df$content_type <- str_replace_all(tolower(full_set_df$content_type), 
    "[^[:alnum:]]", " ")

Separate the data into training, development, and test data

I decided to use 25% of the data to use for the comparisons (training data), 25% to run the generalized linear model (development), and 50% as holdout data to test the results (test data).

sample_size <- floor(0.5 * nrow(full_set_df))
set.seed(869)
train_dev_ind <- sample(seq_len(nrow(full_set_df)), size = sample_size)
train_dev_df <- full_set_df[train_dev_ind, ]
test_df <- full_set_df[-train_dev_ind, ]

train_sample_size <- floor(0.5 * nrow(train_dev_df))
train_ind <- sample(seq_len(nrow(train_dev_df)), size = train_sample_size)
train_df <- train_dev_df[train_ind, ]
dev_df <- train_dev_df[-train_ind, ]

Create some attributes to test

After tidying up the data, I created some comparisons to use for classifying the data:

  1. Count of most common spam/ham words in the body (ham_count + spam_count)
  2. Length of email body (body_length)
  3. Count of most common spam/ham words in the subject line (ham_subject_count + spam_subject_count)
  4. Length of email subject line (subject_length)
  5. Count of most common spam/ham words in the from data (ham_from_count + spam_from_count)
  6. Unique ham/spam words found in the body (top 10) (ham_compare_count + spam_compare_count)
  7. Unique ham/spam words found in the subject lines (top ten) (hamsubject_compare_count + spamsubject_compare_count)
  8. Count of most common spam/ham words in the content data (hamcontent_from_count + spamcontent_from_count)
  9. Unique ham/spam words found in the content data (hamcontent_compare_count + spamcontent_compare_count)
spamwords_full_df <- train_df[train_df$type == "spam", ] %>% 
    unnest_tokens(text, body) %>% count(text, sort = TRUE) %>% 
    filter(n > 1000)
hamwords_full_df <- train_df[train_df$type == "ham", ] %>% unnest_tokens(text, 
    body) %>% count(text, sort = TRUE) %>% filter(n > 1000)

spamsubject_full_df <- train_df[train_df$type == "spam", ] %>% 
    unnest_tokens(text, subject) %>% count(text, sort = TRUE) %>% 
    filter(n > 10)
hamsubject_full_df <- train_df[train_df$type == "ham", ] %>% 
    unnest_tokens(text, subject) %>% count(text, sort = TRUE) %>% 
    filter(n > 10)

spamsubject_compare_df <- anti_join(x = spamsubject_full_df, 
    y = hamsubject_full_df, by = "text")[1:10, ]
hamsubject_compare_df <- anti_join(x = hamsubject_full_df, y = spamsubject_full_df, 
    by = "text")[1:10, ]

spamfrom_full_df <- train_df[train_df$type == "spam", ] %>% unnest_tokens(text, 
    from) %>% count(text, sort = TRUE) %>% filter(n > 20)
hamfrom_full_df <- train_df[train_df$type == "ham", ] %>% unnest_tokens(text, 
    from) %>% count(text, sort = TRUE) %>% filter(n > 20)

spamwords_compare_df <- anti_join(x = spamwords_full_df, y = hamwords_full_df, 
    by = "text")[1:10, ]
hamwords_compare_df <- anti_join(x = hamwords_full_df, y = spamwords_full_df, 
    by = "text")[1:10, ]

spamcontent_full_df <- train_df[train_df$type == "spam", ] %>% 
    unnest_tokens(text, content_type) %>% count(text, sort = TRUE) %>% 
    filter(n > 20)
hamcontent_full_df <- train_df[train_df$type == "ham", ] %>% 
    unnest_tokens(text, content_type) %>% count(text, sort = TRUE) %>% 
    filter(n > 20)

spamcontent_compare_df <- anti_join(x = spamcontent_full_df, 
    y = hamcontent_full_df, by = "text")
hamcontent_compare_df <- anti_join(x = hamcontent_full_df, y = spamcontent_full_df, 
    by = "text")

word_compare <- function(body, compare_list) {
    body_list <- unlist(strsplit(body, " "))
    x <- ifelse(body_list %in% compare_list, 1, 0)
    x <- sum(x)
}

train_df$ham_count <- sapply(train_df$body, function(x) {
    word_compare(x, hamwords_full_df$text)
})

dev_df$ham_count <- sapply(dev_df$body, function(x) {
    word_compare(x, hamwords_full_df$text)
})

test_df$ham_count <- sapply(test_df$body, function(x) {
    word_compare(x, hamwords_full_df$text)
})

train_df$spam_count <- sapply(train_df$body, function(x) {
    word_compare(x, spamwords_full_df$text)
})

dev_df$spam_count <- sapply(dev_df$body, function(x) {
    word_compare(x, spamwords_full_df$text)
})

test_df$spam_count <- sapply(test_df$body, function(x) {
    word_compare(x, spamwords_full_df$text)
})

train_df$body_length <- sapply(train_df$body, function(x) {
    length(unlist(strsplit(x, " ")))
})

dev_df$body_length <- sapply(dev_df$body, function(x) {
    length(unlist(strsplit(x, " ")))
})

test_df$body_length <- sapply(test_df$body, function(x) {
    length(unlist(strsplit(x, " ")))
})

train_df$subject_length <- sapply(train_df$subject, function(x) {
    length(unlist(strsplit(x, " ")))
})

dev_df$subject_length <- sapply(dev_df$subject, function(x) {
    length(unlist(strsplit(x, " ")))
})

test_df$subject_length <- sapply(test_df$subject, function(x) {
    length(unlist(strsplit(x, " ")))
})

train_df$spam_subject_count <- sapply(train_df$subject, function(x) {
    word_compare(x, spamsubject_full_df$text)
})

dev_df$spam_subject_count <- sapply(dev_df$subject, function(x) {
    word_compare(x, spamsubject_full_df$text)
})

test_df$spam_subject_count <- sapply(test_df$subject, function(x) {
    word_compare(x, spamsubject_full_df$text)
})

train_df$ham_subject_count <- sapply(train_df$subject, function(x) {
    word_compare(x, hamsubject_full_df$text)
})

dev_df$ham_subject_count <- sapply(dev_df$subject, function(x) {
    word_compare(x, hamsubject_full_df$text)
})

test_df$ham_subject_count <- sapply(test_df$subject, function(x) {
    word_compare(x, hamsubject_full_df$text)
})

train_df$spam_from_count <- sapply(train_df$from, function(x) {
    word_compare(x, spamfrom_full_df$text)
})

dev_df$spam_from_count <- sapply(dev_df$from, function(x) {
    word_compare(x, spamfrom_full_df$text)
})

test_df$spam_from_count <- sapply(test_df$from, function(x) {
    word_compare(x, spamfrom_full_df$text)
})

train_df$ham_from_count <- sapply(train_df$from, function(x) {
    word_compare(x, hamfrom_full_df$text)
})

dev_df$ham_from_count <- sapply(dev_df$from, function(x) {
    word_compare(x, hamfrom_full_df$text)
})

test_df$ham_from_count <- sapply(test_df$from, function(x) {
    word_compare(x, hamfrom_full_df$text)
})

train_df$spam_compare_count <- sapply(train_df$body, function(x) {
    word_compare(x, spamwords_compare_df$text)
})

dev_df$spam_compare_count <- sapply(dev_df$body, function(x) {
    word_compare(x, spamwords_compare_df$text)
})

test_df$spam_compare_count <- sapply(test_df$body, function(x) {
    word_compare(x, spamwords_compare_df$text)
})

train_df$ham_compare_count <- sapply(train_df$body, function(x) {
    word_compare(x, hamwords_compare_df$text)
})

dev_df$ham_compare_count <- sapply(dev_df$body, function(x) {
    word_compare(x, hamwords_compare_df$text)
})

test_df$ham_compare_count <- sapply(test_df$body, function(x) {
    word_compare(x, hamwords_compare_df$text)
})

train_df$spamsubject_compare_count <- sapply(train_df$body, function(x) {
    word_compare(x, spamsubject_compare_df$text)
})

dev_df$spamsubject_compare_count <- sapply(dev_df$body, function(x) {
    word_compare(x, spamsubject_compare_df$text)
})

test_df$spamsubject_compare_count <- sapply(test_df$body, function(x) {
    word_compare(x, spamsubject_compare_df$text)
})

train_df$hamsubject_compare_count <- sapply(train_df$body, function(x) {
    word_compare(x, hamsubject_compare_df$text)
})

dev_df$hamsubject_compare_count <- sapply(dev_df$body, function(x) {
    word_compare(x, hamsubject_compare_df$text)
})

test_df$hamsubject_compare_count <- sapply(test_df$body, function(x) {
    word_compare(x, hamsubject_compare_df$text)
})

train_df$hamcontent_compare_count <- sapply(train_df$body, function(x) {
    word_compare(x, hamcontent_compare_df$text)
})

dev_df$hamcontent_compare_count <- sapply(dev_df$body, function(x) {
    word_compare(x, hamcontent_compare_df$text)
})

test_df$hamcontent_compare_count <- sapply(test_df$body, function(x) {
    word_compare(x, hamcontent_compare_df$text)
})

train_df$hamcontent_from_count <- sapply(train_df$from, function(x) {
    word_compare(x, hamcontent_full_df$text)
})

dev_df$hamcontent_from_count <- sapply(dev_df$from, function(x) {
    word_compare(x, hamcontent_full_df$text)
})

test_df$hamcontent_from_count <- sapply(test_df$from, function(x) {
    word_compare(x, hamcontent_full_df$text)
})

train_df$spamcontent_compare_count <- sapply(train_df$body, function(x) {
    word_compare(x, spamcontent_compare_df$text)
})

dev_df$spamcontent_compare_count <- sapply(dev_df$body, function(x) {
    word_compare(x, spamcontent_compare_df$text)
})

test_df$spamcontent_compare_count <- sapply(test_df$body, function(x) {
    word_compare(x, spamcontent_compare_df$text)
})

train_df$spamcontent_from_count <- sapply(train_df$from, function(x) {
    word_compare(x, spamcontent_full_df$text)
})

dev_df$spamcontent_from_count <- sapply(dev_df$from, function(x) {
    word_compare(x, spamcontent_full_df$text)
})

test_df$spamcontent_from_count <- sapply(test_df$from, function(x) {
    word_compare(x, spamcontent_full_df$text)
})

train_df$type_binary <- ifelse(train_df$type == "ham", 0, 1)
dev_df$type_binary <- ifelse(dev_df$type == "ham", 0, 1)
test_df$type_binary <- ifelse(test_df$type == "ham", 0, 1)

Select the signficant categories

A generalized linear model was run on the development data and then a step function was applied to determine the most appropriate categories to use.

email_glm <- glm(data = dev_df, type_binary ~ ham_count + spam_count + 
    body_length + ham_subject_count + spam_subject_count + subject_length + 
    ham_from_count + spam_from_count + ham_compare_count + spam_compare_count + 
    hamsubject_compare_count + spamsubject_compare_count + hamcontent_from_count + 
    spamcontent_from_count + hamcontent_compare_count + spamcontent_compare_count, 
    family = "binomial")
summary(email_glm)
## 
## Call:
## glm(formula = type_binary ~ ham_count + spam_count + body_length + 
##     ham_subject_count + spam_subject_count + subject_length + 
##     ham_from_count + spam_from_count + ham_compare_count + spam_compare_count + 
##     hamsubject_compare_count + spamsubject_compare_count + hamcontent_from_count + 
##     spamcontent_from_count + hamcontent_compare_count + spamcontent_compare_count, 
##     family = "binomial", data = dev_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1217  -0.1380  -0.0040   0.0989   3.6665  
## 
## Coefficients: (1 not defined because of singularities)
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                2.434e+00  3.243e-01   7.505 6.13e-14 ***
## ham_count                 -1.192e-02  1.998e-03  -5.970 2.38e-09 ***
## spam_count                 1.115e-02  1.930e-03   5.780 7.49e-09 ***
## body_length                2.577e-04  9.135e-05   2.821 0.004794 ** 
## ham_subject_count         -2.308e+00  2.363e-01  -9.770  < 2e-16 ***
## spam_subject_count         1.267e+00  2.710e-01   4.674 2.96e-06 ***
## subject_length             1.028e-01  2.771e-02   3.709 0.000208 ***
## ham_from_count            -3.250e+00  4.099e-01  -7.930 2.20e-15 ***
## spam_from_count            2.931e+00  4.391e-01   6.676 2.46e-11 ***
## ham_compare_count         -1.216e-02  4.846e-03  -2.508 0.012131 *  
## spam_compare_count        -9.666e-04  4.246e-03  -0.228 0.819896    
## hamsubject_compare_count  -2.329e-02  1.993e-02  -1.169 0.242527    
## spamsubject_compare_count  3.271e-02  1.693e-02   1.932 0.053336 .  
## hamcontent_from_count      1.984e-01  4.680e-01   0.424 0.671677    
## spamcontent_from_count            NA         NA      NA       NA    
## hamcontent_compare_count  -1.657e-01  6.351e-02  -2.609 0.009070 ** 
## spamcontent_compare_count  8.052e-02  3.341e-02   2.410 0.015952 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1898.90  on 1511  degrees of freedom
## Residual deviance:  472.51  on 1496  degrees of freedom
## AIC: 504.51
## 
## Number of Fisher Scoring iterations: 9
step_email_glm <- step(email_glm)
## Start:  AIC=504.51
## type_binary ~ ham_count + spam_count + body_length + ham_subject_count + 
##     spam_subject_count + subject_length + ham_from_count + spam_from_count + 
##     ham_compare_count + spam_compare_count + hamsubject_compare_count + 
##     spamsubject_compare_count + hamcontent_from_count + spamcontent_from_count + 
##     hamcontent_compare_count + spamcontent_compare_count
## 
## 
## Step:  AIC=504.51
## type_binary ~ ham_count + spam_count + body_length + ham_subject_count + 
##     spam_subject_count + subject_length + ham_from_count + spam_from_count + 
##     ham_compare_count + spam_compare_count + hamsubject_compare_count + 
##     spamsubject_compare_count + hamcontent_from_count + hamcontent_compare_count + 
##     spamcontent_compare_count
## 
##                             Df Deviance    AIC
## - spam_compare_count         1   472.56 502.56
## - hamcontent_from_count      1   472.69 502.69
## - hamsubject_compare_count   1   473.90 503.90
## <none>                           472.51 504.51
## - spamsubject_compare_count  1   477.28 507.28
## - ham_compare_count          1   478.19 508.19
## - spamcontent_compare_count  1   478.43 508.43
## - hamcontent_compare_count   1   479.39 509.39
## - body_length                1   480.25 510.25
## - subject_length             1   489.59 519.59
## - spam_subject_count         1   499.05 529.05
## - spam_count                 1   521.83 551.83
## - ham_count                  1   525.58 555.58
## - spam_from_count            1   531.83 561.83
## - ham_subject_count          1   632.34 662.34
## - ham_from_count             1   712.25 742.25
## 
## Step:  AIC=502.56
## type_binary ~ ham_count + spam_count + body_length + ham_subject_count + 
##     spam_subject_count + subject_length + ham_from_count + spam_from_count + 
##     ham_compare_count + hamsubject_compare_count + spamsubject_compare_count + 
##     hamcontent_from_count + hamcontent_compare_count + spamcontent_compare_count
## 
##                             Df Deviance    AIC
## - hamcontent_from_count      1   472.76 500.76
## - hamsubject_compare_count   1   473.99 501.99
## <none>                           472.56 502.56
## - spamsubject_compare_count  1   477.34 505.34
## - spamcontent_compare_count  1   478.51 506.51
## - ham_compare_count          1   479.38 507.38
## - hamcontent_compare_count   1   479.63 507.63
## - body_length                1   480.26 508.26
## - subject_length             1   489.70 517.70
## - spam_subject_count         1   499.24 527.24
## - spam_from_count            1   531.96 559.96
## - ham_count                  1   544.33 572.33
## - spam_count                 1   545.02 573.02
## - ham_subject_count          1   632.87 660.87
## - ham_from_count             1   713.12 741.12
## 
## Step:  AIC=500.76
## type_binary ~ ham_count + spam_count + body_length + ham_subject_count + 
##     spam_subject_count + subject_length + ham_from_count + spam_from_count + 
##     ham_compare_count + hamsubject_compare_count + spamsubject_compare_count + 
##     hamcontent_compare_count + spamcontent_compare_count
## 
##                             Df Deviance    AIC
## - hamsubject_compare_count   1   474.20 500.20
## <none>                           472.76 500.76
## - spamsubject_compare_count  1   477.55 503.55
## - spamcontent_compare_count  1   479.12 505.12
## - ham_compare_count          1   479.51 505.51
## - hamcontent_compare_count   1   479.81 505.81
## - body_length                1   480.48 506.48
## - subject_length             1   489.83 515.83
## - spam_subject_count         1   500.04 526.04
## - spam_from_count            1   532.30 558.30
## - ham_count                  1   544.78 570.78
## - spam_count                 1   545.55 571.55
## - ham_subject_count          1   632.98 658.98
## - ham_from_count             1   713.23 739.23
## 
## Step:  AIC=500.2
## type_binary ~ ham_count + spam_count + body_length + ham_subject_count + 
##     spam_subject_count + subject_length + ham_from_count + spam_from_count + 
##     ham_compare_count + spamsubject_compare_count + hamcontent_compare_count + 
##     spamcontent_compare_count
## 
##                             Df Deviance    AIC
## <none>                           474.20 500.20
## - spamsubject_compare_count  1   478.51 502.51
## - spamcontent_compare_count  1   480.56 504.56
## - ham_compare_count          1   480.71 504.71
## - hamcontent_compare_count   1   481.02 505.02
## - body_length                1   481.16 505.16
## - subject_length             1   490.89 514.89
## - spam_subject_count         1   502.92 526.92
## - spam_from_count            1   534.20 558.20
## - ham_count                  1   547.68 571.68
## - spam_count                 1   547.87 571.87
## - ham_subject_count          1   645.07 669.07
## - ham_from_count             1   713.86 737.86
summary(step_email_glm)
## 
## Call:
## glm(formula = type_binary ~ ham_count + spam_count + body_length + 
##     ham_subject_count + spam_subject_count + subject_length + 
##     ham_from_count + spam_from_count + ham_compare_count + spamsubject_compare_count + 
##     hamcontent_compare_count + spamcontent_compare_count, family = "binomial", 
##     data = dev_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2469  -0.1517  -0.0043   0.0972   3.6820  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                2.427e+00  3.244e-01   7.480 7.41e-14 ***
## ham_count                 -1.197e-02  1.840e-03  -6.506 7.74e-11 ***
## spam_count                 1.117e-02  1.742e-03   6.409 1.46e-10 ***
## body_length                2.408e-04  8.981e-05   2.682 0.007328 ** 
## ham_subject_count         -2.349e+00  2.361e-01  -9.947  < 2e-16 ***
## spam_subject_count         1.309e+00  2.693e-01   4.862 1.16e-06 ***
## subject_length             1.015e-01  2.770e-02   3.663 0.000249 ***
## ham_from_count            -3.232e+00  4.081e-01  -7.921 2.37e-15 ***
## spam_from_count            2.937e+00  4.381e-01   6.705 2.01e-11 ***
## ham_compare_count         -1.240e-02  4.704e-03  -2.637 0.008368 ** 
## spamsubject_compare_count  3.068e-02  1.642e-02   1.869 0.061646 .  
## hamcontent_compare_count  -1.649e-01  6.367e-02  -2.590 0.009605 ** 
## spamcontent_compare_count  8.044e-02  3.218e-02   2.500 0.012429 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1898.9  on 1511  degrees of freedom
## Residual deviance:  474.2  on 1499  degrees of freedom
## AIC: 500.2
## 
## Number of Fisher Scoring iterations: 9
step_email_glm <- glm(formula = type_binary ~ ham_count + spam_count + 
    body_length + ham_subject_count + spam_subject_count + subject_length + 
    ham_from_count + spam_from_count + ham_compare_count + spamsubject_compare_count + 
    hamcontent_compare_count + spamcontent_compare_count, family = "binomial", 
    data = dev_df)
summary(step_email_glm)
## 
## Call:
## glm(formula = type_binary ~ ham_count + spam_count + body_length + 
##     ham_subject_count + spam_subject_count + subject_length + 
##     ham_from_count + spam_from_count + ham_compare_count + spamsubject_compare_count + 
##     hamcontent_compare_count + spamcontent_compare_count, family = "binomial", 
##     data = dev_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2469  -0.1517  -0.0043   0.0972   3.6820  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                2.427e+00  3.244e-01   7.480 7.41e-14 ***
## ham_count                 -1.197e-02  1.840e-03  -6.506 7.74e-11 ***
## spam_count                 1.117e-02  1.742e-03   6.409 1.46e-10 ***
## body_length                2.408e-04  8.981e-05   2.682 0.007328 ** 
## ham_subject_count         -2.349e+00  2.361e-01  -9.947  < 2e-16 ***
## spam_subject_count         1.309e+00  2.693e-01   4.862 1.16e-06 ***
## subject_length             1.015e-01  2.770e-02   3.663 0.000249 ***
## ham_from_count            -3.232e+00  4.081e-01  -7.921 2.37e-15 ***
## spam_from_count            2.937e+00  4.381e-01   6.705 2.01e-11 ***
## ham_compare_count         -1.240e-02  4.704e-03  -2.637 0.008368 ** 
## spamsubject_compare_count  3.068e-02  1.642e-02   1.869 0.061646 .  
## hamcontent_compare_count  -1.649e-01  6.367e-02  -2.590 0.009605 ** 
## spamcontent_compare_count  8.044e-02  3.218e-02   2.500 0.012429 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1898.9  on 1511  degrees of freedom
## Residual deviance:  474.2  on 1499  degrees of freedom
## AIC: 500.2
## 
## Number of Fisher Scoring iterations: 9

Apply the results to development data

Now, we can use the results of the glm to check the results of our predictions in the devlopment data:

  1. Count of most common spam/ham words in the body (ham_count + spam_count)
  2. Length of email body (body_length)
  3. Count of most common spam/ham words in the subject line (ham_subject_count + spam_subject_count)
  4. Length of email subject line (subject_length)
  5. Count of most common spam/ham words in the from data (ham_from_count + spam_from_count)
  6. Unique ham words found in the body (top 10) (ham_compare_count)
  7. Unique spam words found in the subject lines (top ten) (spamsubject_compare_count)
  8. Unique ham/spam words found in the content data (hamcontent_compare_count + spamcontent_compare_count)
classifier_function <- function(hamcount, spamcount, bodylength, 
    hamsubjectcount, spamsubjectcount, subjectlength, hamfromcount, 
    spamfromcount, hamcomparecount, spamsubjectcomparecount, 
    hamcontentcomparecount, spamcontentcomprecount) {
    x <- email_glm$coefficients[[1]] + email_glm$coefficients[[2]] * 
        hamcount + email_glm$coefficients[[3]] * spamcount + 
        email_glm$coefficients[[4]] * bodylength + email_glm$coefficients[[5]] * 
        hamsubjectcount + email_glm$coefficients[[6]] * spamsubjectcount + 
        email_glm$coefficients[[7]] * subjectlength + email_glm$coefficients[[8]] * 
        hamfromcount + email_glm$coefficients[[9]] * spamfromcount + 
        email_glm$coefficients[[10]] * hamcomparecount + email_glm$coefficients[[11]] * 
        spamsubjectcomparecount + email_glm$coefficients[[12]] * 
        hamcontentcomparecount + email_glm$coefficients[[13]] * 
        spamcontentcomprecount
}

logit2prob <- function(logit) {
    odds <- exp(logit)
    prob <- odds/(1 + odds)
    return(prob)
}

dev_df$fit_logit <- mapply(classifier_function, dev_df$ham_count, 
    dev_df$spam_count, dev_df$body_length, dev_df$ham_subject_count, 
    dev_df$spam_subject_count, dev_df$subject_length, dev_df$ham_from_count, 
    dev_df$spam_from_count, dev_df$ham_compare_count, dev_df$spamsubject_compare_count, 
    dev_df$hamcontent_compare_count, dev_df$spamcontent_compare_count)

dev_df$fit <- logit2prob(dev_df$fit_logit)

Probability Selection

But wait, here’s another question. At what probability do we determine an email is spam? 50%? 60%? Here, we can run a data frame of varying probabiblities to return the best results. By doing this analysis, we see that using a probablity of 40% to determine whether an email is spam returns the best results.

cm_function <- function(x) {
    dev_df$guess <- ifelse(dev_df$fit > x, 1, 0)
    dev_df$result <- ifelse(dev_df$type == "spam" & dev_df$guess == 
        1, "tp", ifelse(dev_df$type == "ham" & dev_df$guess == 
        0, "tn", ifelse(dev_df$type == "ham" & dev_df$guess == 
        1, "fp", ifelse(dev_df$type == "spam" & dev_df$guess == 
        0, "fn", ""))))
    tp <- nrow(dev_df[dev_df$result == "tp", ])
    tn <- nrow(dev_df[dev_df$result == "tn", ])
    fp <- nrow(dev_df[dev_df$result == "fp", ])
    fn <- nrow(dev_df[dev_df$result == "fn", ])
    cm <- as.matrix(table(Actual = dev_df$type_binary, Predicted = dev_df$guess))
    n = sum(cm)  # number of instances
    nc = nrow(cm)  # number of classes
    diag = diag(cm)  # number of correctly classified instances per class 
    rowsums = apply(cm, 1, sum)  # number of instances per class
    colsums = apply(cm, 2, sum)  # number of predictions per class
    p = rowsums/n  # distribution of instances over the actual classes
    q = colsums/n  # distribution of instances over the predicted classes
    accuracy = sum(diag)/n
    precision = diag/colsums
    recall = diag/rowsums
    f1 = 2 * precision * recall/(precision + recall)
    df <- data.frame(x, accuracy, precision, recall, f1)
    df$pred <- row.names(df)
    df
}

cm_table <- function(x) {
    dev_df$guess <- ifelse(dev_df$fit > x, 1, 0)
    dev_df$result <- ifelse(dev_df$type == "spam" & dev_df$guess == 
        1, "tp", ifelse(dev_df$type == "ham" & dev_df$guess == 
        0, "tn", ifelse(dev_df$type == "ham" & dev_df$guess == 
        1, "fp", ifelse(dev_df$type == "spam" & dev_df$guess == 
        0, "fn", ""))))
    tp <- nrow(dev_df[dev_df$result == "tp", ])
    tn <- nrow(dev_df[dev_df$result == "tn", ])
    fp <- nrow(dev_df[dev_df$result == "fp", ])
    fn <- nrow(dev_df[dev_df$result == "fn", ])
    cm <- as.matrix(table(Actual = dev_df$type_binary, Predicted = dev_df$guess))
    cm
}

probability_dev_df <- data.frame(prob = seq(0.1, 0.9, 0.05))
probability_dev_df$result <- sapply(probability_dev_df$prob, 
    function(x) {
        nest(cm_function(x), -x)[, 2]
    })
unnest(probability_dev_df, result) %>% group_by(accuracy, prob) %>% 
    summarise(f1avg = mean(f1)) %>% arrange(desc(accuracy))
## # A tibble: 17 x 3
## # Groups:   accuracy [14]
##     accuracy  prob     f1avg
##        <dbl> <dbl>     <dbl>
##  1 0.9411376  0.40 0.9329979
##  2 0.9404762  0.50 0.9316252
##  3 0.9398148  0.35 0.9318465
##  4 0.9391534  0.45 0.9303330
##  5 0.9378307  0.60 0.9276992
##  6 0.9371693  0.55 0.9274668
##  7 0.9371693  0.65 0.9264597
##  8 0.9338624  0.30 0.9259589
##  9 0.9338624  0.70 0.9217921
## 10 0.9305556  0.75 0.9173176
## 11 0.9252646  0.25 0.9173084
## 12 0.9252646  0.80 0.9105642
## 13 0.9226190  0.85 0.9061744
## 14 0.9080688  0.20 0.8999382
## 15 0.9034392  0.90 0.8800209
## 16 0.8862434  0.15 0.8779563
## 17 0.8518519  0.10 0.8440591
nest(cm_function(0.4), -x)[, 2][[1]]
##    accuracy precision    recall        f1 pred
## 0 0.9411376 0.9624877 0.9502924 0.9563512    0
## 1 0.9411376 0.8977956 0.9218107 0.9096447    1
cm_table(0.4)
##       Predicted
## Actual   0   1
##      0 975  51
##      1  38 448

Run the adjusted function on the test data

Here we see that the test results return similar summary values as the development data. Unfortunately, the 40% limit chosen in the development set is not the most optimized solution, but still does a good job of predicting spam emails without accidentally identifying ham emails.

test_df$fit_logit <- mapply(classifier_function, test_df$ham_count, 
    test_df$spam_count, test_df$body_length, test_df$ham_subject_count, 
    test_df$spam_subject_count, test_df$subject_length, test_df$ham_from_count, 
    test_df$spam_from_count, test_df$ham_compare_count, test_df$spamsubject_compare_count, 
    test_df$hamcontent_compare_count, test_df$spamcontent_compare_count)

test_df$fit <- logit2prob(test_df$fit_logit)

cm_function_test <- function(x) {
    test_df$guess <- ifelse(test_df$fit > x, 1, 0)
    test_df$result <- ifelse(test_df$type == "spam" & test_df$guess == 
        1, "tp", ifelse(test_df$type == "ham" & test_df$guess == 
        0, "tn", ifelse(test_df$type == "ham" & test_df$guess == 
        1, "fp", ifelse(test_df$type == "spam" & test_df$guess == 
        0, "fn", ""))))
    tp <- nrow(test_df[test_df$result == "tp", ])
    tn <- nrow(test_df[test_df$result == "tn", ])
    fp <- nrow(test_df[test_df$result == "fp", ])
    fn <- nrow(test_df[test_df$result == "fn", ])
    cm <- as.matrix(table(Actual = test_df$type_binary, Predicted = test_df$guess))
    n = sum(cm)  # number of instances
    nc = nrow(cm)  # number of classes
    diag = diag(cm)  # number of correctly classified instances per class 
    rowsums = apply(cm, 1, sum)  # number of instances per class
    colsums = apply(cm, 2, sum)  # number of predictions per class
    p = rowsums/n  # distribution of instances over the actual classes
    q = colsums/n  # distribution of instances over the predicted classes
    accuracy = sum(diag)/n
    precision = diag/colsums
    recall = diag/rowsums
    f1 = 2 * precision * recall/(precision + recall)
    df <- data.frame(x, accuracy, precision, recall, f1)
    df$pred <- row.names(df)
    df
}

cm_table_test <- function(x) {
    test_df$guess <- ifelse(test_df$fit > x, 1, 0)
    test_df$result <- ifelse(test_df$type == "spam" & test_df$guess == 
        1, "tp", ifelse(test_df$type == "ham" & test_df$guess == 
        0, "tn", ifelse(test_df$type == "ham" & test_df$guess == 
        1, "fp", ifelse(test_df$type == "spam" & test_df$guess == 
        0, "fn", ""))))
    tp <- nrow(test_df[test_df$result == "tp", ])
    tn <- nrow(test_df[test_df$result == "tn", ])
    fp <- nrow(test_df[test_df$result == "fp", ])
    fn <- nrow(test_df[test_df$result == "fn", ])
    cm <- as.matrix(table(Actual = test_df$type_binary, Predicted = test_df$guess))
    cm
}

probability_test_df <- data.frame(prob = seq(0.1, 0.9, 0.05))
probability_test_df$result <- sapply(probability_test_df$prob, 
    function(x) {
        nest(cm_function_test(x), -x)[, 2]
    })

unnest(probability_test_df, result) %>% group_by(accuracy, prob) %>% 
    summarise(f1avg = mean(f1)) %>% arrange(desc(accuracy))
## # A tibble: 17 x 3
## # Groups:   accuracy [14]
##     accuracy  prob     f1avg
##        <dbl> <dbl>     <dbl>
##  1 0.9318558  0.65 0.9175007
##  2 0.9295402  0.45 0.9176181
##  3 0.9295402  0.50 0.9172257
##  4 0.9285478  0.60 0.9144546
##  5 0.9285478  0.75 0.9123666
##  6 0.9272246  0.55 0.9138529
##  7 0.9272246  0.70 0.9112966
##  8 0.9268938  0.40 0.9155013
##  9 0.9249090  0.35 0.9138289
## 10 0.9225935  0.80 0.9037701
## 11 0.9186239  0.30 0.9075319
## 12 0.9166391  0.85 0.8954468
## 13 0.9120079  0.25 0.9013652
## 14 0.8971221  0.90 0.8673581
## 15 0.8961297  0.20 0.8850377
## 16 0.8772742  0.15 0.8666304
## 17 0.8471717  0.10 0.8376250
nest(cm_function_test(0.4), -x)[, 2][[1]]
##    accuracy precision    recall        f1 pred
## 0 0.9268938 0.9564792 0.9367816 0.9465279    0
## 1 0.9268938 0.8650307 0.9048128 0.8844746    1
cm_table_test(0.4)
##       Predicted
## Actual    0    1
##      0 1956  132
##      1   89  846