Note, I decided to take all the latest spam/ham files available so if you try to re-run this analysis, please be aware that it will take a while.
options(width = 100)
# This is a standard setup I include so that my working
# directory is set correctly whether I work on one of my
# windows or linux machines.
if (Sys.info()["sysname"] == "Windows") {
setwd("~/Masters/DATA607/Project4")
} else {
setwd("~/Documents/Masters/DATA607/Project4")
}
For this assignment, I took all the updated spam and ham emails located on the referenced website. I loaded all of these emails into dataframes and did some simple text formatting. I left warnings enabled for the text processing to ensure that there are not numerous file errors (there is only one noted below).
suppressWarnings(suppressMessages(library(stringr)))
suppressWarnings(suppressMessages(library(plyr)))
suppressWarnings(suppressMessages(library("tm")))
suppressWarnings(suppressMessages(library(tidytext)))
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(tidyr)))
format_list <- function(mail_type, y) {
lapply(y, function(x) {
to_list <- str_replace(unlist(str_match_all(x, "\\s*To:\\s(.*)"))[1],
"To: ", "")
from_list <- str_replace(unlist(str_match_all(x, "\\s*From:\\s(.*)"))[1],
"From: ", "")
content_list <- str_replace(unlist(str_match_all(x, "\\s*Content-Type:\\s*(.*)"))[1],
"\\s*Content-Type: ", "")
subject_list <- str_replace(unlist(str_match_all(x, "\\s*Subject:\\s(.*)"))[1],
"\\s*Subject: ", "")
body_text <- paste(str_match_all(paste(x, collapse = " "),
"([[:alpha:]]+)")[[1]], collapse = " ")
x <- c(mail_type, to_list, from_list, content_list, subject_list,
body_text)
})
}
easy_ham_file_list <- list.files(path = "./data/easy_ham", full.names = TRUE,
pattern = "0.*")
easy_ham_files <- lapply(easy_ham_file_list, readLines)
easy_ham_list <- format_list("ham", easy_ham_files)
easy_ham_df <- lapply(easy_ham_list, function(x) {
x <- data.frame(matrix(x, ncol = 6), stringsAsFactors = FALSE)
})
easy_ham_df <- rbind.fill(easy_ham_df)
easy_ham_2_file_list <- list.files(path = "./data/easy_ham_2",
full.names = TRUE, pattern = "0.*")
easy_ham_2_files <- lapply(easy_ham_2_file_list, readLines)
easy_ham_2_list <- format_list("ham", easy_ham_2_files)
easy_ham_2_df <- lapply(easy_ham_2_list, function(x) {
x <- data.frame(matrix(x, ncol = 6), stringsAsFactors = FALSE)
})
easy_ham_2_df <- rbind.fill(easy_ham_2_df)
hard_ham_file_list <- list.files(path = "./data/hard_ham", full.names = TRUE,
pattern = "0.*")
hard_ham_files <- lapply(hard_ham_file_list, readLines)
## Warning in FUN(X[[i]], ...): incomplete final line found on './data/hard_ham/
## 00228.0eaef7857bbbf3ebf5edbbdae2b30493'
hard_ham_list <- format_list("ham", hard_ham_files)
hard_ham_df <- lapply(hard_ham_list, function(x) {
x <- data.frame(matrix(x, ncol = 6), stringsAsFactors = FALSE)
})
hard_ham_df <- rbind.fill(hard_ham_df)
spam_file_list <- list.files(path = "./data/spam", full.names = TRUE,
pattern = "0.*")
spam_files <- lapply(spam_file_list, readLines)
spam_list <- format_list("spam", spam_files)
spam_df <- lapply(spam_list, function(x) {
x <- data.frame(matrix(x, ncol = 6), stringsAsFactors = FALSE)
})
spam_df <- rbind.fill(spam_df)
spam_2_file_list <- list.files(path = "./data/spam_2", full.names = TRUE,
pattern = "0.*")
spam_2_files <- lapply(spam_2_file_list, readLines)
spam_2_list <- format_list("spam", spam_2_files)
spam_2_df <- lapply(spam_2_list, function(x) {
x <- data.frame(matrix(x, ncol = 6), stringsAsFactors = FALSE)
})
spam_2_df <- rbind.fill(spam_2_df)
full_set_df <- rbind(easy_ham_df, easy_ham_2_df, hard_ham_df,
spam_df, spam_2_df)
colnames(full_set_df) <- c("type", "to", "from", "content_type",
"subject", "body")
full_set_df$body <- removeWords(str_replace_all(tolower(full_set_df$body),
"[\\W]", " "), stopwords("english"))
full_set_df$subject <- removeWords(str_replace_all(tolower(full_set_df$subject),
"[\\W]", " "), stopwords("english"))
full_set_df$from <- removeWords(str_replace_all(tolower(full_set_df$from),
"[\\W]", " "), c("com"))
full_set_df$content_type <- str_replace_all(tolower(full_set_df$content_type),
"[^[:alnum:]]", " ")
I decided to use 25% of the data to use for the comparisons (training data), 25% to run the generalized linear model (development), and 50% as holdout data to test the results (test data).
sample_size <- floor(0.5 * nrow(full_set_df))
set.seed(869)
train_dev_ind <- sample(seq_len(nrow(full_set_df)), size = sample_size)
train_dev_df <- full_set_df[train_dev_ind, ]
test_df <- full_set_df[-train_dev_ind, ]
train_sample_size <- floor(0.5 * nrow(train_dev_df))
train_ind <- sample(seq_len(nrow(train_dev_df)), size = train_sample_size)
train_df <- train_dev_df[train_ind, ]
dev_df <- train_dev_df[-train_ind, ]
After tidying up the data, I created some comparisons to use for classifying the data:
spamwords_full_df <- train_df[train_df$type == "spam", ] %>%
unnest_tokens(text, body) %>% count(text, sort = TRUE) %>%
filter(n > 1000)
hamwords_full_df <- train_df[train_df$type == "ham", ] %>% unnest_tokens(text,
body) %>% count(text, sort = TRUE) %>% filter(n > 1000)
spamsubject_full_df <- train_df[train_df$type == "spam", ] %>%
unnest_tokens(text, subject) %>% count(text, sort = TRUE) %>%
filter(n > 10)
hamsubject_full_df <- train_df[train_df$type == "ham", ] %>%
unnest_tokens(text, subject) %>% count(text, sort = TRUE) %>%
filter(n > 10)
spamsubject_compare_df <- anti_join(x = spamsubject_full_df,
y = hamsubject_full_df, by = "text")[1:10, ]
hamsubject_compare_df <- anti_join(x = hamsubject_full_df, y = spamsubject_full_df,
by = "text")[1:10, ]
spamfrom_full_df <- train_df[train_df$type == "spam", ] %>% unnest_tokens(text,
from) %>% count(text, sort = TRUE) %>% filter(n > 20)
hamfrom_full_df <- train_df[train_df$type == "ham", ] %>% unnest_tokens(text,
from) %>% count(text, sort = TRUE) %>% filter(n > 20)
spamwords_compare_df <- anti_join(x = spamwords_full_df, y = hamwords_full_df,
by = "text")[1:10, ]
hamwords_compare_df <- anti_join(x = hamwords_full_df, y = spamwords_full_df,
by = "text")[1:10, ]
spamcontent_full_df <- train_df[train_df$type == "spam", ] %>%
unnest_tokens(text, content_type) %>% count(text, sort = TRUE) %>%
filter(n > 20)
hamcontent_full_df <- train_df[train_df$type == "ham", ] %>%
unnest_tokens(text, content_type) %>% count(text, sort = TRUE) %>%
filter(n > 20)
spamcontent_compare_df <- anti_join(x = spamcontent_full_df,
y = hamcontent_full_df, by = "text")
hamcontent_compare_df <- anti_join(x = hamcontent_full_df, y = spamcontent_full_df,
by = "text")
word_compare <- function(body, compare_list) {
body_list <- unlist(strsplit(body, " "))
x <- ifelse(body_list %in% compare_list, 1, 0)
x <- sum(x)
}
train_df$ham_count <- sapply(train_df$body, function(x) {
word_compare(x, hamwords_full_df$text)
})
dev_df$ham_count <- sapply(dev_df$body, function(x) {
word_compare(x, hamwords_full_df$text)
})
test_df$ham_count <- sapply(test_df$body, function(x) {
word_compare(x, hamwords_full_df$text)
})
train_df$spam_count <- sapply(train_df$body, function(x) {
word_compare(x, spamwords_full_df$text)
})
dev_df$spam_count <- sapply(dev_df$body, function(x) {
word_compare(x, spamwords_full_df$text)
})
test_df$spam_count <- sapply(test_df$body, function(x) {
word_compare(x, spamwords_full_df$text)
})
train_df$body_length <- sapply(train_df$body, function(x) {
length(unlist(strsplit(x, " ")))
})
dev_df$body_length <- sapply(dev_df$body, function(x) {
length(unlist(strsplit(x, " ")))
})
test_df$body_length <- sapply(test_df$body, function(x) {
length(unlist(strsplit(x, " ")))
})
train_df$subject_length <- sapply(train_df$subject, function(x) {
length(unlist(strsplit(x, " ")))
})
dev_df$subject_length <- sapply(dev_df$subject, function(x) {
length(unlist(strsplit(x, " ")))
})
test_df$subject_length <- sapply(test_df$subject, function(x) {
length(unlist(strsplit(x, " ")))
})
train_df$spam_subject_count <- sapply(train_df$subject, function(x) {
word_compare(x, spamsubject_full_df$text)
})
dev_df$spam_subject_count <- sapply(dev_df$subject, function(x) {
word_compare(x, spamsubject_full_df$text)
})
test_df$spam_subject_count <- sapply(test_df$subject, function(x) {
word_compare(x, spamsubject_full_df$text)
})
train_df$ham_subject_count <- sapply(train_df$subject, function(x) {
word_compare(x, hamsubject_full_df$text)
})
dev_df$ham_subject_count <- sapply(dev_df$subject, function(x) {
word_compare(x, hamsubject_full_df$text)
})
test_df$ham_subject_count <- sapply(test_df$subject, function(x) {
word_compare(x, hamsubject_full_df$text)
})
train_df$spam_from_count <- sapply(train_df$from, function(x) {
word_compare(x, spamfrom_full_df$text)
})
dev_df$spam_from_count <- sapply(dev_df$from, function(x) {
word_compare(x, spamfrom_full_df$text)
})
test_df$spam_from_count <- sapply(test_df$from, function(x) {
word_compare(x, spamfrom_full_df$text)
})
train_df$ham_from_count <- sapply(train_df$from, function(x) {
word_compare(x, hamfrom_full_df$text)
})
dev_df$ham_from_count <- sapply(dev_df$from, function(x) {
word_compare(x, hamfrom_full_df$text)
})
test_df$ham_from_count <- sapply(test_df$from, function(x) {
word_compare(x, hamfrom_full_df$text)
})
train_df$spam_compare_count <- sapply(train_df$body, function(x) {
word_compare(x, spamwords_compare_df$text)
})
dev_df$spam_compare_count <- sapply(dev_df$body, function(x) {
word_compare(x, spamwords_compare_df$text)
})
test_df$spam_compare_count <- sapply(test_df$body, function(x) {
word_compare(x, spamwords_compare_df$text)
})
train_df$ham_compare_count <- sapply(train_df$body, function(x) {
word_compare(x, hamwords_compare_df$text)
})
dev_df$ham_compare_count <- sapply(dev_df$body, function(x) {
word_compare(x, hamwords_compare_df$text)
})
test_df$ham_compare_count <- sapply(test_df$body, function(x) {
word_compare(x, hamwords_compare_df$text)
})
train_df$spamsubject_compare_count <- sapply(train_df$body, function(x) {
word_compare(x, spamsubject_compare_df$text)
})
dev_df$spamsubject_compare_count <- sapply(dev_df$body, function(x) {
word_compare(x, spamsubject_compare_df$text)
})
test_df$spamsubject_compare_count <- sapply(test_df$body, function(x) {
word_compare(x, spamsubject_compare_df$text)
})
train_df$hamsubject_compare_count <- sapply(train_df$body, function(x) {
word_compare(x, hamsubject_compare_df$text)
})
dev_df$hamsubject_compare_count <- sapply(dev_df$body, function(x) {
word_compare(x, hamsubject_compare_df$text)
})
test_df$hamsubject_compare_count <- sapply(test_df$body, function(x) {
word_compare(x, hamsubject_compare_df$text)
})
train_df$hamcontent_compare_count <- sapply(train_df$body, function(x) {
word_compare(x, hamcontent_compare_df$text)
})
dev_df$hamcontent_compare_count <- sapply(dev_df$body, function(x) {
word_compare(x, hamcontent_compare_df$text)
})
test_df$hamcontent_compare_count <- sapply(test_df$body, function(x) {
word_compare(x, hamcontent_compare_df$text)
})
train_df$hamcontent_from_count <- sapply(train_df$from, function(x) {
word_compare(x, hamcontent_full_df$text)
})
dev_df$hamcontent_from_count <- sapply(dev_df$from, function(x) {
word_compare(x, hamcontent_full_df$text)
})
test_df$hamcontent_from_count <- sapply(test_df$from, function(x) {
word_compare(x, hamcontent_full_df$text)
})
train_df$spamcontent_compare_count <- sapply(train_df$body, function(x) {
word_compare(x, spamcontent_compare_df$text)
})
dev_df$spamcontent_compare_count <- sapply(dev_df$body, function(x) {
word_compare(x, spamcontent_compare_df$text)
})
test_df$spamcontent_compare_count <- sapply(test_df$body, function(x) {
word_compare(x, spamcontent_compare_df$text)
})
train_df$spamcontent_from_count <- sapply(train_df$from, function(x) {
word_compare(x, spamcontent_full_df$text)
})
dev_df$spamcontent_from_count <- sapply(dev_df$from, function(x) {
word_compare(x, spamcontent_full_df$text)
})
test_df$spamcontent_from_count <- sapply(test_df$from, function(x) {
word_compare(x, spamcontent_full_df$text)
})
train_df$type_binary <- ifelse(train_df$type == "ham", 0, 1)
dev_df$type_binary <- ifelse(dev_df$type == "ham", 0, 1)
test_df$type_binary <- ifelse(test_df$type == "ham", 0, 1)
A generalized linear model was run on the development data and then a step function was applied to determine the most appropriate categories to use.
email_glm <- glm(data = dev_df, type_binary ~ ham_count + spam_count +
body_length + ham_subject_count + spam_subject_count + subject_length +
ham_from_count + spam_from_count + ham_compare_count + spam_compare_count +
hamsubject_compare_count + spamsubject_compare_count + hamcontent_from_count +
spamcontent_from_count + hamcontent_compare_count + spamcontent_compare_count,
family = "binomial")
summary(email_glm)
##
## Call:
## glm(formula = type_binary ~ ham_count + spam_count + body_length +
## ham_subject_count + spam_subject_count + subject_length +
## ham_from_count + spam_from_count + ham_compare_count + spam_compare_count +
## hamsubject_compare_count + spamsubject_compare_count + hamcontent_from_count +
## spamcontent_from_count + hamcontent_compare_count + spamcontent_compare_count,
## family = "binomial", data = dev_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1217 -0.1380 -0.0040 0.0989 3.6665
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.434e+00 3.243e-01 7.505 6.13e-14 ***
## ham_count -1.192e-02 1.998e-03 -5.970 2.38e-09 ***
## spam_count 1.115e-02 1.930e-03 5.780 7.49e-09 ***
## body_length 2.577e-04 9.135e-05 2.821 0.004794 **
## ham_subject_count -2.308e+00 2.363e-01 -9.770 < 2e-16 ***
## spam_subject_count 1.267e+00 2.710e-01 4.674 2.96e-06 ***
## subject_length 1.028e-01 2.771e-02 3.709 0.000208 ***
## ham_from_count -3.250e+00 4.099e-01 -7.930 2.20e-15 ***
## spam_from_count 2.931e+00 4.391e-01 6.676 2.46e-11 ***
## ham_compare_count -1.216e-02 4.846e-03 -2.508 0.012131 *
## spam_compare_count -9.666e-04 4.246e-03 -0.228 0.819896
## hamsubject_compare_count -2.329e-02 1.993e-02 -1.169 0.242527
## spamsubject_compare_count 3.271e-02 1.693e-02 1.932 0.053336 .
## hamcontent_from_count 1.984e-01 4.680e-01 0.424 0.671677
## spamcontent_from_count NA NA NA NA
## hamcontent_compare_count -1.657e-01 6.351e-02 -2.609 0.009070 **
## spamcontent_compare_count 8.052e-02 3.341e-02 2.410 0.015952 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1898.90 on 1511 degrees of freedom
## Residual deviance: 472.51 on 1496 degrees of freedom
## AIC: 504.51
##
## Number of Fisher Scoring iterations: 9
step_email_glm <- step(email_glm)
## Start: AIC=504.51
## type_binary ~ ham_count + spam_count + body_length + ham_subject_count +
## spam_subject_count + subject_length + ham_from_count + spam_from_count +
## ham_compare_count + spam_compare_count + hamsubject_compare_count +
## spamsubject_compare_count + hamcontent_from_count + spamcontent_from_count +
## hamcontent_compare_count + spamcontent_compare_count
##
##
## Step: AIC=504.51
## type_binary ~ ham_count + spam_count + body_length + ham_subject_count +
## spam_subject_count + subject_length + ham_from_count + spam_from_count +
## ham_compare_count + spam_compare_count + hamsubject_compare_count +
## spamsubject_compare_count + hamcontent_from_count + hamcontent_compare_count +
## spamcontent_compare_count
##
## Df Deviance AIC
## - spam_compare_count 1 472.56 502.56
## - hamcontent_from_count 1 472.69 502.69
## - hamsubject_compare_count 1 473.90 503.90
## <none> 472.51 504.51
## - spamsubject_compare_count 1 477.28 507.28
## - ham_compare_count 1 478.19 508.19
## - spamcontent_compare_count 1 478.43 508.43
## - hamcontent_compare_count 1 479.39 509.39
## - body_length 1 480.25 510.25
## - subject_length 1 489.59 519.59
## - spam_subject_count 1 499.05 529.05
## - spam_count 1 521.83 551.83
## - ham_count 1 525.58 555.58
## - spam_from_count 1 531.83 561.83
## - ham_subject_count 1 632.34 662.34
## - ham_from_count 1 712.25 742.25
##
## Step: AIC=502.56
## type_binary ~ ham_count + spam_count + body_length + ham_subject_count +
## spam_subject_count + subject_length + ham_from_count + spam_from_count +
## ham_compare_count + hamsubject_compare_count + spamsubject_compare_count +
## hamcontent_from_count + hamcontent_compare_count + spamcontent_compare_count
##
## Df Deviance AIC
## - hamcontent_from_count 1 472.76 500.76
## - hamsubject_compare_count 1 473.99 501.99
## <none> 472.56 502.56
## - spamsubject_compare_count 1 477.34 505.34
## - spamcontent_compare_count 1 478.51 506.51
## - ham_compare_count 1 479.38 507.38
## - hamcontent_compare_count 1 479.63 507.63
## - body_length 1 480.26 508.26
## - subject_length 1 489.70 517.70
## - spam_subject_count 1 499.24 527.24
## - spam_from_count 1 531.96 559.96
## - ham_count 1 544.33 572.33
## - spam_count 1 545.02 573.02
## - ham_subject_count 1 632.87 660.87
## - ham_from_count 1 713.12 741.12
##
## Step: AIC=500.76
## type_binary ~ ham_count + spam_count + body_length + ham_subject_count +
## spam_subject_count + subject_length + ham_from_count + spam_from_count +
## ham_compare_count + hamsubject_compare_count + spamsubject_compare_count +
## hamcontent_compare_count + spamcontent_compare_count
##
## Df Deviance AIC
## - hamsubject_compare_count 1 474.20 500.20
## <none> 472.76 500.76
## - spamsubject_compare_count 1 477.55 503.55
## - spamcontent_compare_count 1 479.12 505.12
## - ham_compare_count 1 479.51 505.51
## - hamcontent_compare_count 1 479.81 505.81
## - body_length 1 480.48 506.48
## - subject_length 1 489.83 515.83
## - spam_subject_count 1 500.04 526.04
## - spam_from_count 1 532.30 558.30
## - ham_count 1 544.78 570.78
## - spam_count 1 545.55 571.55
## - ham_subject_count 1 632.98 658.98
## - ham_from_count 1 713.23 739.23
##
## Step: AIC=500.2
## type_binary ~ ham_count + spam_count + body_length + ham_subject_count +
## spam_subject_count + subject_length + ham_from_count + spam_from_count +
## ham_compare_count + spamsubject_compare_count + hamcontent_compare_count +
## spamcontent_compare_count
##
## Df Deviance AIC
## <none> 474.20 500.20
## - spamsubject_compare_count 1 478.51 502.51
## - spamcontent_compare_count 1 480.56 504.56
## - ham_compare_count 1 480.71 504.71
## - hamcontent_compare_count 1 481.02 505.02
## - body_length 1 481.16 505.16
## - subject_length 1 490.89 514.89
## - spam_subject_count 1 502.92 526.92
## - spam_from_count 1 534.20 558.20
## - ham_count 1 547.68 571.68
## - spam_count 1 547.87 571.87
## - ham_subject_count 1 645.07 669.07
## - ham_from_count 1 713.86 737.86
summary(step_email_glm)
##
## Call:
## glm(formula = type_binary ~ ham_count + spam_count + body_length +
## ham_subject_count + spam_subject_count + subject_length +
## ham_from_count + spam_from_count + ham_compare_count + spamsubject_compare_count +
## hamcontent_compare_count + spamcontent_compare_count, family = "binomial",
## data = dev_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2469 -0.1517 -0.0043 0.0972 3.6820
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.427e+00 3.244e-01 7.480 7.41e-14 ***
## ham_count -1.197e-02 1.840e-03 -6.506 7.74e-11 ***
## spam_count 1.117e-02 1.742e-03 6.409 1.46e-10 ***
## body_length 2.408e-04 8.981e-05 2.682 0.007328 **
## ham_subject_count -2.349e+00 2.361e-01 -9.947 < 2e-16 ***
## spam_subject_count 1.309e+00 2.693e-01 4.862 1.16e-06 ***
## subject_length 1.015e-01 2.770e-02 3.663 0.000249 ***
## ham_from_count -3.232e+00 4.081e-01 -7.921 2.37e-15 ***
## spam_from_count 2.937e+00 4.381e-01 6.705 2.01e-11 ***
## ham_compare_count -1.240e-02 4.704e-03 -2.637 0.008368 **
## spamsubject_compare_count 3.068e-02 1.642e-02 1.869 0.061646 .
## hamcontent_compare_count -1.649e-01 6.367e-02 -2.590 0.009605 **
## spamcontent_compare_count 8.044e-02 3.218e-02 2.500 0.012429 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1898.9 on 1511 degrees of freedom
## Residual deviance: 474.2 on 1499 degrees of freedom
## AIC: 500.2
##
## Number of Fisher Scoring iterations: 9
step_email_glm <- glm(formula = type_binary ~ ham_count + spam_count +
body_length + ham_subject_count + spam_subject_count + subject_length +
ham_from_count + spam_from_count + ham_compare_count + spamsubject_compare_count +
hamcontent_compare_count + spamcontent_compare_count, family = "binomial",
data = dev_df)
summary(step_email_glm)
##
## Call:
## glm(formula = type_binary ~ ham_count + spam_count + body_length +
## ham_subject_count + spam_subject_count + subject_length +
## ham_from_count + spam_from_count + ham_compare_count + spamsubject_compare_count +
## hamcontent_compare_count + spamcontent_compare_count, family = "binomial",
## data = dev_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2469 -0.1517 -0.0043 0.0972 3.6820
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.427e+00 3.244e-01 7.480 7.41e-14 ***
## ham_count -1.197e-02 1.840e-03 -6.506 7.74e-11 ***
## spam_count 1.117e-02 1.742e-03 6.409 1.46e-10 ***
## body_length 2.408e-04 8.981e-05 2.682 0.007328 **
## ham_subject_count -2.349e+00 2.361e-01 -9.947 < 2e-16 ***
## spam_subject_count 1.309e+00 2.693e-01 4.862 1.16e-06 ***
## subject_length 1.015e-01 2.770e-02 3.663 0.000249 ***
## ham_from_count -3.232e+00 4.081e-01 -7.921 2.37e-15 ***
## spam_from_count 2.937e+00 4.381e-01 6.705 2.01e-11 ***
## ham_compare_count -1.240e-02 4.704e-03 -2.637 0.008368 **
## spamsubject_compare_count 3.068e-02 1.642e-02 1.869 0.061646 .
## hamcontent_compare_count -1.649e-01 6.367e-02 -2.590 0.009605 **
## spamcontent_compare_count 8.044e-02 3.218e-02 2.500 0.012429 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1898.9 on 1511 degrees of freedom
## Residual deviance: 474.2 on 1499 degrees of freedom
## AIC: 500.2
##
## Number of Fisher Scoring iterations: 9
Now, we can use the results of the glm to check the results of our predictions in the devlopment data:
classifier_function <- function(hamcount, spamcount, bodylength,
hamsubjectcount, spamsubjectcount, subjectlength, hamfromcount,
spamfromcount, hamcomparecount, spamsubjectcomparecount,
hamcontentcomparecount, spamcontentcomprecount) {
x <- email_glm$coefficients[[1]] + email_glm$coefficients[[2]] *
hamcount + email_glm$coefficients[[3]] * spamcount +
email_glm$coefficients[[4]] * bodylength + email_glm$coefficients[[5]] *
hamsubjectcount + email_glm$coefficients[[6]] * spamsubjectcount +
email_glm$coefficients[[7]] * subjectlength + email_glm$coefficients[[8]] *
hamfromcount + email_glm$coefficients[[9]] * spamfromcount +
email_glm$coefficients[[10]] * hamcomparecount + email_glm$coefficients[[11]] *
spamsubjectcomparecount + email_glm$coefficients[[12]] *
hamcontentcomparecount + email_glm$coefficients[[13]] *
spamcontentcomprecount
}
logit2prob <- function(logit) {
odds <- exp(logit)
prob <- odds/(1 + odds)
return(prob)
}
dev_df$fit_logit <- mapply(classifier_function, dev_df$ham_count,
dev_df$spam_count, dev_df$body_length, dev_df$ham_subject_count,
dev_df$spam_subject_count, dev_df$subject_length, dev_df$ham_from_count,
dev_df$spam_from_count, dev_df$ham_compare_count, dev_df$spamsubject_compare_count,
dev_df$hamcontent_compare_count, dev_df$spamcontent_compare_count)
dev_df$fit <- logit2prob(dev_df$fit_logit)
But wait, here’s another question. At what probability do we determine an email is spam? 50%? 60%? Here, we can run a data frame of varying probabiblities to return the best results. By doing this analysis, we see that using a probablity of 40% to determine whether an email is spam returns the best results.
cm_function <- function(x) {
dev_df$guess <- ifelse(dev_df$fit > x, 1, 0)
dev_df$result <- ifelse(dev_df$type == "spam" & dev_df$guess ==
1, "tp", ifelse(dev_df$type == "ham" & dev_df$guess ==
0, "tn", ifelse(dev_df$type == "ham" & dev_df$guess ==
1, "fp", ifelse(dev_df$type == "spam" & dev_df$guess ==
0, "fn", ""))))
tp <- nrow(dev_df[dev_df$result == "tp", ])
tn <- nrow(dev_df[dev_df$result == "tn", ])
fp <- nrow(dev_df[dev_df$result == "fp", ])
fn <- nrow(dev_df[dev_df$result == "fn", ])
cm <- as.matrix(table(Actual = dev_df$type_binary, Predicted = dev_df$guess))
n = sum(cm) # number of instances
nc = nrow(cm) # number of classes
diag = diag(cm) # number of correctly classified instances per class
rowsums = apply(cm, 1, sum) # number of instances per class
colsums = apply(cm, 2, sum) # number of predictions per class
p = rowsums/n # distribution of instances over the actual classes
q = colsums/n # distribution of instances over the predicted classes
accuracy = sum(diag)/n
precision = diag/colsums
recall = diag/rowsums
f1 = 2 * precision * recall/(precision + recall)
df <- data.frame(x, accuracy, precision, recall, f1)
df$pred <- row.names(df)
df
}
cm_table <- function(x) {
dev_df$guess <- ifelse(dev_df$fit > x, 1, 0)
dev_df$result <- ifelse(dev_df$type == "spam" & dev_df$guess ==
1, "tp", ifelse(dev_df$type == "ham" & dev_df$guess ==
0, "tn", ifelse(dev_df$type == "ham" & dev_df$guess ==
1, "fp", ifelse(dev_df$type == "spam" & dev_df$guess ==
0, "fn", ""))))
tp <- nrow(dev_df[dev_df$result == "tp", ])
tn <- nrow(dev_df[dev_df$result == "tn", ])
fp <- nrow(dev_df[dev_df$result == "fp", ])
fn <- nrow(dev_df[dev_df$result == "fn", ])
cm <- as.matrix(table(Actual = dev_df$type_binary, Predicted = dev_df$guess))
cm
}
probability_dev_df <- data.frame(prob = seq(0.1, 0.9, 0.05))
probability_dev_df$result <- sapply(probability_dev_df$prob,
function(x) {
nest(cm_function(x), -x)[, 2]
})
unnest(probability_dev_df, result) %>% group_by(accuracy, prob) %>%
summarise(f1avg = mean(f1)) %>% arrange(desc(accuracy))
## # A tibble: 17 x 3
## # Groups: accuracy [14]
## accuracy prob f1avg
## <dbl> <dbl> <dbl>
## 1 0.9411376 0.40 0.9329979
## 2 0.9404762 0.50 0.9316252
## 3 0.9398148 0.35 0.9318465
## 4 0.9391534 0.45 0.9303330
## 5 0.9378307 0.60 0.9276992
## 6 0.9371693 0.55 0.9274668
## 7 0.9371693 0.65 0.9264597
## 8 0.9338624 0.30 0.9259589
## 9 0.9338624 0.70 0.9217921
## 10 0.9305556 0.75 0.9173176
## 11 0.9252646 0.25 0.9173084
## 12 0.9252646 0.80 0.9105642
## 13 0.9226190 0.85 0.9061744
## 14 0.9080688 0.20 0.8999382
## 15 0.9034392 0.90 0.8800209
## 16 0.8862434 0.15 0.8779563
## 17 0.8518519 0.10 0.8440591
nest(cm_function(0.4), -x)[, 2][[1]]
## accuracy precision recall f1 pred
## 0 0.9411376 0.9624877 0.9502924 0.9563512 0
## 1 0.9411376 0.8977956 0.9218107 0.9096447 1
cm_table(0.4)
## Predicted
## Actual 0 1
## 0 975 51
## 1 38 448
Here we see that the test results return similar summary values as the development data. Unfortunately, the 40% limit chosen in the development set is not the most optimized solution, but still does a good job of predicting spam emails without accidentally identifying ham emails.
test_df$fit_logit <- mapply(classifier_function, test_df$ham_count,
test_df$spam_count, test_df$body_length, test_df$ham_subject_count,
test_df$spam_subject_count, test_df$subject_length, test_df$ham_from_count,
test_df$spam_from_count, test_df$ham_compare_count, test_df$spamsubject_compare_count,
test_df$hamcontent_compare_count, test_df$spamcontent_compare_count)
test_df$fit <- logit2prob(test_df$fit_logit)
cm_function_test <- function(x) {
test_df$guess <- ifelse(test_df$fit > x, 1, 0)
test_df$result <- ifelse(test_df$type == "spam" & test_df$guess ==
1, "tp", ifelse(test_df$type == "ham" & test_df$guess ==
0, "tn", ifelse(test_df$type == "ham" & test_df$guess ==
1, "fp", ifelse(test_df$type == "spam" & test_df$guess ==
0, "fn", ""))))
tp <- nrow(test_df[test_df$result == "tp", ])
tn <- nrow(test_df[test_df$result == "tn", ])
fp <- nrow(test_df[test_df$result == "fp", ])
fn <- nrow(test_df[test_df$result == "fn", ])
cm <- as.matrix(table(Actual = test_df$type_binary, Predicted = test_df$guess))
n = sum(cm) # number of instances
nc = nrow(cm) # number of classes
diag = diag(cm) # number of correctly classified instances per class
rowsums = apply(cm, 1, sum) # number of instances per class
colsums = apply(cm, 2, sum) # number of predictions per class
p = rowsums/n # distribution of instances over the actual classes
q = colsums/n # distribution of instances over the predicted classes
accuracy = sum(diag)/n
precision = diag/colsums
recall = diag/rowsums
f1 = 2 * precision * recall/(precision + recall)
df <- data.frame(x, accuracy, precision, recall, f1)
df$pred <- row.names(df)
df
}
cm_table_test <- function(x) {
test_df$guess <- ifelse(test_df$fit > x, 1, 0)
test_df$result <- ifelse(test_df$type == "spam" & test_df$guess ==
1, "tp", ifelse(test_df$type == "ham" & test_df$guess ==
0, "tn", ifelse(test_df$type == "ham" & test_df$guess ==
1, "fp", ifelse(test_df$type == "spam" & test_df$guess ==
0, "fn", ""))))
tp <- nrow(test_df[test_df$result == "tp", ])
tn <- nrow(test_df[test_df$result == "tn", ])
fp <- nrow(test_df[test_df$result == "fp", ])
fn <- nrow(test_df[test_df$result == "fn", ])
cm <- as.matrix(table(Actual = test_df$type_binary, Predicted = test_df$guess))
cm
}
probability_test_df <- data.frame(prob = seq(0.1, 0.9, 0.05))
probability_test_df$result <- sapply(probability_test_df$prob,
function(x) {
nest(cm_function_test(x), -x)[, 2]
})
unnest(probability_test_df, result) %>% group_by(accuracy, prob) %>%
summarise(f1avg = mean(f1)) %>% arrange(desc(accuracy))
## # A tibble: 17 x 3
## # Groups: accuracy [14]
## accuracy prob f1avg
## <dbl> <dbl> <dbl>
## 1 0.9318558 0.65 0.9175007
## 2 0.9295402 0.45 0.9176181
## 3 0.9295402 0.50 0.9172257
## 4 0.9285478 0.60 0.9144546
## 5 0.9285478 0.75 0.9123666
## 6 0.9272246 0.55 0.9138529
## 7 0.9272246 0.70 0.9112966
## 8 0.9268938 0.40 0.9155013
## 9 0.9249090 0.35 0.9138289
## 10 0.9225935 0.80 0.9037701
## 11 0.9186239 0.30 0.9075319
## 12 0.9166391 0.85 0.8954468
## 13 0.9120079 0.25 0.9013652
## 14 0.8971221 0.90 0.8673581
## 15 0.8961297 0.20 0.8850377
## 16 0.8772742 0.15 0.8666304
## 17 0.8471717 0.10 0.8376250
nest(cm_function_test(0.4), -x)[, 2][[1]]
## accuracy precision recall f1 pred
## 0 0.9268938 0.9564792 0.9367816 0.9465279 0
## 1 0.9268938 0.8650307 0.9048128 0.8844746 1
cm_table_test(0.4)
## Predicted
## Actual 0 1
## 0 1956 132
## 1 89 846