Project 4 - Spam/Ham Classification

Overview

The assignment was to build a classifier that separates a set of email messages into spam and ham. I used a set of messages from Kaggle (Wessel van Lit, https://www.kaggle.com/veleon/ham-and-spam-dataset, 2019). Rather than simply feeding the data into canned functions, I built the Bayesian model manually to gain a better understanding of how the process works. My approach was as follows:

  1. Partition data into folds for training and holdout purposes.
  2. Read and tokenize the data.
  3. Clean up, normalize, and lemmatize the terms, removing stopwords and terms common to both sets.
  4. Calculate TF and word probabilities on the training set.
  5. Calculate Bayesian probabilities on the holdout fold for each set (a minimal scoring sketch follows this list).
  6. Evaluate the performance of the model using the holdout set.
  7. Cross-validate by iterating over the remaining folds.
  8. Generate ROCs, cumulative response curves, and lift curves.
  9. Compare the results of each fold and select the best one.
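
Step 5 is the core naive Bayes calculation: a document's score for a class is the class prior multiplied by the product of the per-word probabilities estimated from the training folds, and the larger score wins. The following is only an illustrative sketch of that scoring step, not the project code (the full implementation appears in the Training section); score_message, p_word_spam, and p_word_ham are hypothetical names I use here for clarity.

# Illustrative naive Bayes scoring sketch. p_word_spam and p_word_ham are
# hypothetical named vectors of P(word | class) estimated from training
# data; pspam and pham are the class priors.
score_message <- function(words, p_word_spam, p_word_ham, pspam, pham) {
    # prod() over an empty vector is 1, so words unseen in training drop out
    spam_score <- pspam * prod(p_word_spam[names(p_word_spam) %in% words])
    ham_score <- pham * prod(p_word_ham[names(p_word_ham) %in% words])
    if (spam_score > ham_score) "spam" else "ham"
}

# Example: "free" and "click" are far more likely under the spam model,
# so the spam product (0.5 * 0.30 * 0.25) beats the ham product (0.5 * 0.05)
p_word_spam <- c(free = 0.30, click = 0.25)
p_word_ham <- c(meeting = 0.20, free = 0.05)
score_message(c("free", "click"), p_word_spam, p_word_ham, pspam = 0.5, pham = 0.5)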

Folds

Create directory structure and split messages into five folds for training and holdout data.

# Set parameters
# Number of folds to partition the data into
numfolds <- 5
# Use only the top [maxwords] terms to decrease runtime;
# using every single tokenized word resulted in near-perfect models, which
# seemed suspect and not reasonable for practical use.
maxwords <- 200

# See if the dataset has been unzipped yet; if not, automatically unzip it.
if (!dir.exists("hamnspam")) {
  if (file.exists("kaggle_spam_ham.zip")) {
    print("Unzipping")
    unzip("kaggle_spam_ham.zip", overwrite = T)
  }
}

# Partition into folds for training and holdout sets
for (k in seq(-1, 1, 2)) {

  # Set variable indicating if this is the "ham" loop or "spam loop"
  hamorspam <- "ham"
  if (k == 1) {
    hamorspam <- "spam"
  }
  print(paste0("Partitioning ", hamorspam," directory"))

  # Iterate over the number of folds
  for (i in c(0:(numfolds - 1))) {

    # Create the ham or spam folder if it doesn't exist
    if (!dir.exists(paste0(hamorspam, "/", i))) {
      dir.create(paste0(hamorspam, "/", i))
    }
  }

  # Generate list of files and iterate through them, moving them into a different fold
  tmpfiles <- list.files(path=hamorspam, full.names=F, recursive=F, include.dirs=F)
  
  ct <- 0
  for (f in tmpfiles) {
    fromfile <- paste0(hamorspam, "/", f)
    tofile <- paste0(hamorspam, "/", as.character(ct %% numfolds), "/", f)
    if (file.info(fromfile)$isdir) {
      next
    }
    ct <- ct + 1
    if (ct %% 100 == 0) {
        print(paste0("    moving file ", ct, " of ", length(tmpfiles), ": ", fromfile, " -> ", tofile))
    }
    file.rename(from = fromfile, to = tofile)
  }

}
## [1] "Partitioning ham directory"
## [1] "Partitioning spam directory"
# Count total ham and spam
print("Getting file counts")
## [1] "Getting file counts"
totalham <- 0
totalspam <- 0
for (k in seq(-1, 1, 2)) {

    # Set variable indicating if this is the "ham" loop or "spam loop"
    hamorspam <- "ham"
    if (k == 1) {
    hamorspam <- "spam"
    }
    tmpfiles <- list.files(path=hamorspam, full.names=F, recursive=T, include.dirs=F)
    if(k == 1) {
        totalspam <- length(tmpfiles)
    } else {
        totalham <- length(tmpfiles)
    }
}

Tokenize

Ingest the data and tokenize it into terms.

# To keep the classes balanced, cap each class (ham or spam) at the size of
# the smaller one, then divide by the number of folds
maxset <- min(totalham, totalspam) / numfolds

# Iterate through each file in the specified fold
processFold <- function(custom_stopwords, k, foldnum, holdoutflag, maxset) {

  # Set variable indicating if this is the "ham" loop or "spam loop"
  hamorspam <- "ham"
  if (k == 1) {
    hamorspam <- "spam"
  }
  print(paste0("    Fold #", foldnum, ", reading ", hamorspam," directory"))
    
  # Generate the list of files in this fold and iterate through them
  current_path <- paste0(hamorspam, "/", foldnum)
  tmpfiles <- list.files(path=current_path, full.names=F, recursive=F, include.dirs=F)

  ctf <- 0  #file counter
  for (f in tmpfiles) {
    
    # Exit as soon as we reach maxset to maintain a balanced sample set from each fold
    if (ctf >= maxset) {
        break
    }

    # Increment counters
    ct <<- ct + 1
    ctf <- ctf + 1
    if (ct %% 100 == 0) {
        print(paste0("        reading file #", ct, ": ", current_path, "/", f))
    }

    # Read the file and extract its text
    filename <- paste0(current_path, "/", f)
    txt <- readtext(filename, verbosity = 0)$text

    # Extract subject from the header (the header ends at the first blank line)
    header_end <- str_locate(txt, '\n\n')
    m <- str_match(str_sub(txt, 1, header_end[1] - 1), 'Subject: *(.+?)\n')
    subj <- m[2]

    # Extract message body, discarding the header
    msgbody <- str_sub(txt, header_end[2] + 1)

    # Tokenize into unigrams
    dftmp <- data.frame(text = paste(subj, msgbody))
    dftmp <- dftmp %>% unnest_tokens(word, text, token = "ngrams", n_min = 1, n = 1)

    # Remove leading and trailing underscores
    dftmp <- dftmp %>% mutate(word = str_replace(word, '^_+(.+)$', '\\1'))
    dftmp <- dftmp %>% mutate(word = str_replace(word, '^(.+)_+$', '\\1'))

    # Lemmatize words
    dftmp <- dftmp %>% mutate(word = lemmatize_words(word))

    # Remove words containing digits
    dftmp <- dftmp %>% filter(!grepl('\\d', word))

    # Remove tokens consisting only of underscores
    dftmp <- dftmp %>% filter(!grepl('^_+$', word))

    # Remove stopwords
    dftmp <- dftmp %>% anti_join(custom_stopwords, by = c("word"))

    # Term frequency within this document
    dftmp <- dftmp %>% count(word, sort = F) %>%
        rename(tf = n)

    # Add doc_id, spam, and fold fields
    dftmp <- dftmp %>% mutate(doc_id = f, spam = k, fold = foldnum)

    # Bind to the main data frame
    df <<- bind_rows(df, dftmp)

  }  # for each file

}

# Load standard stop_words and read in custom stop_words from file
data(stop_words)
custom_stopwords <- read.delim("custom_stopwords.txt", sep = '\n', header = F) %>%
    rename(word = V1) %>% mutate(lexicon = "custom")
custom_stopwords <- bind_rows(custom_stopwords, stop_words)

# Create the main data frame with the correct column types
df <- data.frame(doc_id = character(), word = character(), tf = integer(),
    spam = numeric(), fold = numeric(), stringsAsFactors = F)

# Do this twice - once for ham, once for spam
ct <- 0
for (k in seq(-1, 1, 2)) {

  # Iterate over folds
  for (foldnum in c(0:(numfolds-1))) {

      # Read and tokenize files
      processFold(custom_stopwords, k, foldnum, 0, maxset)

  }  # for each fold

}  # once for ham, once for spam
## [1] "    Fold #0, reading ham directory"
## [1] "        reading file #100: ham/0/0496.397e015dabee7ac3c6e655ad9bf66052"
## [1] "    Fold #1, reading ham directory"
## [1] "        reading file #200: ham/1/0492.3aa3aaa0ac9343fd1aa11864e8c281b1"
## [1] "    Fold #2, reading ham directory"
## [1] "        reading file #300: ham/2/0488.8fddac859ba93d27a041fe770be65d29"
## [1] "    Fold #3, reading ham directory"
## [1] "        reading file #400: ham/3/0484.9c63b894f85b12f32ad5e5b97cd5c49c"
## [1] "    Fold #4, reading ham directory"
## [1] "        reading file #500: ham/4/0480.ac223ce8c5be563214bdc8e3af2a18b6"
## [1] "    Fold #0, reading spam directory"
## [1] "        reading file #600: spam/0/0470.b9e513715695ea1c79c1e5af0fb0eea9"
## [1] "    Fold #1, reading spam directory"
## [1] "        reading file #700: spam/1/0466.11bc31540055c320b62e5886ef27c4b2"
## [1] "    Fold #2, reading spam directory"
## [1] "        reading file #800: spam/2/0467.b59a3337d0979ba8d587bf4c166db8b1"
## [1] "    Fold #3, reading spam directory"
## [1] "        reading file #900: spam/3/0468.8edb99340b9a96a81813be2d3362605d"
## [1] "    Fold #4, reading spam directory"
## [1] "        reading file #1000: spam/4/0469.c1d9ab5918d50dac4242ef53c4aaf678"

Training

Train the model by iterating over the folds, reserving a holdout fold each iteration.

# Iterate over training folds
dfscores <- NULL
for (holdoutfold in c((numfolds-1):0)) {
    print(paste0("Set holdout fold to ", holdoutfold))

    # Get total number of docs by ham or spam, along with the probabilities of ham or spam
    print("    Calculating counts")
    numham <- df %>%
        filter(fold != holdoutfold) %>%
        filter(spam == -1) %>%
        summarize(n = n_distinct(doc_id)) %>%
        pull(n)
    numspam <- df %>%
        filter(fold != holdoutfold) %>%
        filter(spam == 1) %>%
        summarize(n = n_distinct(doc_id)) %>%
        pull(n)
    pham <- numham / (numham + numspam)
    pspam <- numspam / (numham + numspam)
    
    # Calculate the probability of seeing each word in the training set (which excludes the holdout fold)
    print("    Calculating p_word")
    dfwords <- df %>%
        filter(fold != holdoutfold) %>%
        group_by(spam, word) %>%
        summarise(docct = n()) %>%
        mutate(p = ifelse(spam == 1, docct / numspam, docct / numham)) %>%
        rename(spam_of_word = spam) %>%
        slice_max(n = maxwords, order_by = p, with_ties = F)

    # Using the holdout fold, calculate Bayes probabilities of ham and spam
    print("    Calculating Bayes probabilities")
    dfjoin <- df %>%
        filter(fold == holdoutfold) %>%
        inner_join(dfwords, by = c("word")) %>%
        group_by(doc_id, spam, spam_of_word) %>%
        summarise(prod_p = prod(p)) %>%
        mutate(prod_p = ifelse(spam_of_word == 1, prod_p * pspam, prod_p * pham))

    # Calculate final probability scores
    print("    Calculating final probability scores")
    dfresult <- dfjoin %>%
        spread(spam_of_word, prod_p) %>%
        rename(prod_p_ham = `-1`, prod_p_spam = `1`) %>%
        mutate(prod_p_ham = ifelse(is.na(prod_p_ham), 0, prod_p_ham)) %>%
        mutate(prod_p_spam = ifelse(is.na(prod_p_spam), 0, prod_p_spam)) %>%
        mutate(p_spam = ifelse(prod_p_spam > prod_p_ham, prod_p_spam, 1 - prod_p_ham)) %>%
        mutate(is_spam = ifelse(spam == 1, 1, 0)) %>%
        mutate(is_ham = ifelse(spam == -1, 1, 0)) %>%
        arrange(desc(p_spam))
    
    # Generate thresholds for ROC curves
    print("    Generating thresholds for ROC curves")
    currentThreshold <- 1
    dfthreshold <- dfresult
    for (i in 1:nrow(dfresult)) {
        if (dfthreshold[i, 'is_spam'] == 1 & dfthreshold[i, 'p_spam'] >= currentThreshold) {
            dfthreshold[i, 'true_pos'] <- 1
            dfthreshold[i, 'false_pos'] <- 0
        }
        else if (dfthreshold[i, 'is_spam'] == 1 & dfthreshold[i, 'p_spam'] < currentThreshold) {
            dfthreshold[i, 'true_pos'] <- 0
            dfthreshold[i, 'false_pos'] <- 1
        }
        else {
            dfthreshold[i, 'true_pos'] <- 0
            dfthreshold[i, 'false_pos'] <- 0
        }
        dfthreshold[i, 'pct_instances'] <- i / nrow(dfresult)
        currentThreshold <- dfthreshold[i, 'p_spam']
    }
    print("    Calculating cumulative rates")
    dfthreshold$cum_true_pos <- cumsum(dfthreshold$true_pos)
    dfthreshold$cum_false_pos <- cumsum(dfthreshold$false_pos)
    dfthreshold$cum_true_pos_rate <- dfthreshold$cum_true_pos / sum(dfthreshold$true_pos)
    dfthreshold$cum_false_pos_rate <- dfthreshold$cum_false_pos / sum(dfthreshold$false_pos)
    dfthreshold$lift <- dfthreshold$cum_true_pos_rate / dfthreshold$pct_instances
    dfthreshold$holdoutfold <- holdoutfold
    
    # Append this fold's scores to the cumulative data frame so we can reference them outside of the loop
    if (exists('dfscores') && is.data.frame(get('dfscores'))) {
        dfscores <- dfscores %>%
            bind_rows(dfthreshold)
    } else {
        dfscores <- dfthreshold
    }

    # Generate ROC curve
    print("    Ploting ROC curves")
    if (!exists('rocplot')) {
        rocplot <- NULL
    }
    rocplot[[holdoutfold + 1]] <- dfthreshold %>%
        ggplot() +
        theme(text = element_text(size = 10)) + 
        geom_point(mapping = aes(x = cum_false_pos_rate, y = cum_true_pos_rate)) + 
        ggtitle(paste0("Holdout fold #", holdoutfold)) + 
        xlab("False positive rate") +
        ylab("True positive rate")
    if (!exists('auc_result')) {
        auc_result <- NULL
    }
    auc_result[holdoutfold + 1] <- area_under_curve(dfthreshold$cum_false_pos_rate, 
        dfthreshold$cum_true_pos_rate, method = "trapezoid")
    print(paste0("   AUC (holdout fold #", holdoutfold, ") = ", auc_result[holdoutfold + 1]))

}  # for each cross-validation set
## [1] "Set holdout fold to 4"
## [1] "    Calculating counts"
## [1] "    Calculating p_word"
## [1] "    Calculating Bayes probabilities"
## [1] "    Calculating final probability scores"
## [1] "    Generating thresholds for ROC curves"
## [1] "    Calculating cumulative rates"
## [1] "    Ploting ROC curves"
## [1] "   AUC (holdout fold #4) = 0.871783088235294"
## [1] "Set holdout fold to 3"
## [1] "    Calculating counts"
## [1] "    Calculating p_word"
## [1] "    Calculating Bayes probabilities"
## [1] "    Calculating final probability scores"
## [1] "    Generating thresholds for ROC curves"
## [1] "    Calculating cumulative rates"
## [1] "    Ploting ROC curves"
## [1] "   AUC (holdout fold #3) = 0.917157275021026"
## [1] "Set holdout fold to 2"
## [1] "    Calculating counts"
## [1] "    Calculating p_word"
## [1] "    Calculating Bayes probabilities"
## [1] "    Calculating final probability scores"
## [1] "    Generating thresholds for ROC curves"
## [1] "    Calculating cumulative rates"
## [1] "    Ploting ROC curves"
## [1] "   AUC (holdout fold #2) = 0.950363714163457"
## [1] "Set holdout fold to 1"
## [1] "    Calculating counts"
## [1] "    Calculating p_word"
## [1] "    Calculating Bayes probabilities"
## [1] "    Calculating final probability scores"
## [1] "    Generating thresholds for ROC curves"
## [1] "    Calculating cumulative rates"
## [1] "    Ploting ROC curves"
## [1] "   AUC (holdout fold #1) = 0.911993097497843"
## [1] "Set holdout fold to 0"
## [1] "    Calculating counts"
## [1] "    Calculating p_word"
## [1] "    Calculating Bayes probabilities"
## [1] "    Calculating final probability scores"
## [1] "    Generating thresholds for ROC curves"
## [1] "    Calculating cumulative rates"
## [1] "    Ploting ROC curves"
## [1] "   AUC (holdout fold #0) = 0.852112676056338"

Graphs

Present the results.

# Plot ROC curves
grid.arrange(ncol = 2, rocplot[[1]], rocplot[[2]], rocplot[[3]], rocplot[[4]], rocplot[[5]], 
    top = textGrob("ROC curves",gp=gpar(fontsize=12,font=3)))

# Show AUC values
df_auc <- data.frame(holdoutfold = c(0:(numfolds-1)), auc = auc_result)
df_auc
# Plot cumulative response curves
dfscores %>% 
    ggplot(aes(x = 100 * pct_instances, y = 100 * cum_true_pos_rate, 
        color = as.character(holdoutfold), linetype = as.character(holdoutfold))) +
    theme(text = element_text(size = 10)) + 
    geom_point(alpha = 0.01) +
    geom_line() + geom_abline(slope = 1, intercept = 0) +
    xlab("Percentage of test instances") + 
    ylab("Percentage of positives") + 
    ggtitle("Cumulative resposne curve")

dfscores %>% ggplot(aes(x = 100 * pct_instances, y = lift,  
        color = as.character(holdoutfold), linetype = as.character(holdoutfold))) +
    theme(text = element_text(size = 10)) + 
    geom_point(alpha = 0.01) + 
    geom_line() + 
    xlab("Percentage of test instances") + 
    ylab("Lift") + 
    ggtitle("Lift curve")

Top words

Display the most common hammy and spammy words.

# Top hammy words
dftopwords <- df %>%
    filter(spam == -1) %>%
    count(word, spam, sort = T) %>%
    ungroup()
head(dftopwords, 20)
# Top spammy words
dftopwords <- df %>%
    filter(spam == 1) %>%
    count(word, spam, sort = T) %>%
    ungroup()
head(dftopwords, 20)
# Word cloud - hammy words
df %>%
    filter(spam == -1) %>%
    count(word) %>% 
    with(wordcloud(word, n, max.words = 75))

# Word cloud - spammy words
df %>%
    filter(spam == 1) %>%
    count(word) %>% 
    with(wordcloud(word, n, max.words = 75))

Conclusions

As shown in the ROC plots, AUC table, and cumulative response curves, the model that performed best was the one in which holdout fold #2 was excluded from the training data and used for testing. That model performs very well, with an AUC of 0.95.

It should be noted that when the full tokenized word set from each fold was used for training, some models achieved an AUC of 1.000, but at the expense of runtimes of well over two minutes. To balance model performance against resource utilization, I introduced a “maxwords” variable and set it to 200. While I didn't benchmark the effect of maxwords on resource consumption, setting it to 200 still achieved a respectable 0.95 AUC while reducing the runtime to about 30 seconds.
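
If one did want to quantify that trade-off, a simple approach would be to time the cross-validation loop at several “maxwords” settings and compare the resulting AUCs. The sketch below assumes the Training section's loop has been wrapped in a hypothetical function train_all_folds(maxwords) that returns the per-fold AUC vector; that wrapper is not part of the code above.

# Hypothetical benchmarking sketch; train_all_folds() is assumed to wrap
# the cross-validation loop above and return the vector of fold AUCs.
for (mw in c(50, 100, 200, 400)) {
    elapsed <- system.time(aucs <- train_all_folds(maxwords = mw))["elapsed"]
    print(paste0("maxwords = ", mw, ": mean AUC = ", round(mean(aucs), 3),
        ", runtime = ", round(elapsed, 1), " sec"))
}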