Download Files

The first step is to procure the spam and ham files from the SpamAssassin public corpus, which is accomplished with download.file.
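
The pipeline below leans on a handful of packages; the original setup chunk is not shown, so this load list is an assumption inferred from the functions used:

library(R.utils)     # bunzip2
library(readr)       # read_lines
library(stringr)     # str_c, str_replace, str_trim, str_detect
library(purrr)       # keep, map_chr, reduce
library(dplyr)       # data frame verbs and %>%
library(tidytext)    # unnest_tokens, stop_words
library(ggplot2)     # plots
library(RTextTools)  # create_matrix, create_container, train_model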

url <- 'http://spamassassin.apache.org/old/publiccorpus/'
spamZip <- '20050311_spam_2.tar.bz2'
hamZip <- '20030228_easy_ham.tar.bz2'


# download function
dloadBZ2 <- function(baseURL = NULL, bz2 = NULL) {
  # full URL
  fullUrl <- paste(baseURL, bz2, sep = '')
  
  # destination (create the folder if it does not exist yet)
  destFolder <- './spamham/'
  if (!dir.exists(destFolder)) dir.create(destFolder)
  saveTo <- paste(destFolder, bz2, sep = '')
  
  # download file only if it is not already present
  if (!file.exists(saveTo)) download.file(fullUrl, destfile = saveTo)
  
  # return path to downloaded file
  return(saveTo)
}

# download files
spamZip <- dloadBZ2(url, spamZip)
hamZip <- dloadBZ2(url, hamZip)
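
Both calls return the local file paths, so a quick existence check (my addition, not in the original) confirms the downloads landed:

file.exists(c(spamZip, hamZip))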

Unzipping Spam and Ham Files

After downloading, each archive must be decompressed (bunzip2) and its contents extracted (untar). The function below does both and returns the list of files contained in the archive.

# function to decompress an archive and untar its contents
decompress <- function(zipFile = NULL) {
  # decompress the .bz2, then list the files inside the resulting tar
  files <- zipFile %>%
    bunzip2(overwrite = TRUE, remove = FALSE) %>%
    str_replace(pattern = "\\.bz2$", replacement = "") %>%
    untar(compressed = "bzip2", list = TRUE)
  
  # extract the tar contents
  zipFile %>%
    str_replace(pattern = "\\.bz2$", replacement = "") %>%
    untar(compressed = "bzip2", exdir = "./spamham/")
  
  # return list of files
  return(files)
}

# decompress zipfiles
spamList <- decompress(spamZip)
hamList <- decompress(hamZip)
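
decompress returns the archive's file listing, which can be inspected directly (a quick, hypothetical peek):

head(spamList, 3)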

Get File Names

This section builds a list of all the file names extracted from the archives.

# function to get file names
getFileNames <- function(list = NULL) {
  # prepend the folder path, trim, and keep only the relevant files
  names <- list %>%
    # drop directory entries and other short, non-email names
    keep(function(x) nchar(x) > 40) %>%
    # build the full file path (map_chr keeps the result a character vector)
    map_chr(function(x) str_c("./spamham/", x)) %>%
    # trim for cleanliness :)
    str_trim()
  
  # return file names
  return(names)
}

spamFiles <- getFileNames(spamList)
hamFiles <- getFileNames(hamList)

Preliminary Results:

Spam Emails: 1396

Ham Emails: 2500

Total Emails: 3896
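
These counts can be reproduced directly from the file-name vectors built above:

length(spamFiles)                     # 1396
length(hamFiles)                      # 2500
length(spamFiles) + length(hamFiles)  # 3896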

Get the Contents of Files

In this section, we create a single data frame of all the results. The columns will be word, count, filename, and email_type (ham or spam). This data frame will then power the rest of the visualizations and analysis.

## load stop words so that they can be removed from each set of data.
data(stop_words)

# function to clean and tidy the data
cleanTidy <- function(file = NULL, type = NULL) {
  lines <- file %>%
    # read lines from email
    read_lines %>%
    # put lines into dataframe
    data_frame(text = .) %>%
    # get every individual word in email
    unnest_tokens(word, text) %>%
    # remove stop words
    anti_join(stop_words, by = "word") %>%
    # count the remaining words
    count(word, sort = T) %>%
    # add additional columns that could potentially help later
    mutate(filename = str_replace(file, "^\\./spamham/.+/", ""),
           email_type = type,
           word = as.character(word)) %>%
    # rename n to count
    rename(count = n)
}

merge_df <- function(df, x, type = NULL) {
  # read lines and convert to data frame
  lines <- cleanTidy(x, type)

  # merge dfs
  return(bind_rows(df, lines))
}

# function to get contents
getFileContents <- function(files = NULL, type = NULL) {
  # reduce file list to a single data frame
  files %>%
    reduce(merge_df,
           .init = data_frame(word = character(0),
                              count = integer(0),
                              filename = character(0),
                              email_type = character(0)),
           type = type)
}

spam <- getFileContents(spamFiles, "spam")
ham <- getFileContents(hamFiles, "ham")
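
A quick peek (my addition, not part of the original output) confirms the intended columns of word, count, filename, and email_type:

bind_rows(spam, ham) %>%
  glimpse()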

Summary Statistics on Each Dataset

HAM Summary

hsummary <- ham %>%
  group_by(filename) %>%
  summarise(email_length = sum(count),
            unique_words = n_distinct(word)) %>%
  arrange(desc(unique_words))

hsummary %>%
  head(10)

More statistics summarizing HAM data

hsummary %>%
  select(2:3) %>%
  summary()
##   email_length     unique_words   
##  Min.   :  48.0   Min.   :  37.0  
##  1st Qu.: 175.0   1st Qu.: 106.0  
##  Median : 330.0   Median : 164.0  
##  Mean   : 340.6   Mean   : 173.1  
##  3rd Qu.: 414.0   3rd Qu.: 203.2  
##  Max.   :6607.0   Max.   :2268.0

SPAM Summary

ssummary <- spam %>%
  group_by(filename) %>%
  summarise(email_length = sum(count),
            unique_words = n_distinct(word)) %>%
  arrange(desc(unique_words)) 

ssummary %>%
  head(10)

More statistics summarizing SPAM data

ssummary %>%
  select(2:3) %>%
  summary()
##   email_length     unique_words   
##  Min.   :  75.0   Min.   :  54.0  
##  1st Qu.: 252.0   1st Qu.: 147.0  
##  Median : 407.0   Median : 203.0  
##  Mean   : 641.4   Mean   : 241.7  
##  3rd Qu.: 768.0   3rd Qu.: 275.0  
##  Max.   :7844.0   Max.   :2514.0

Across both metrics, spam emails typically contain more unique words and are generally longer than ham emails.
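
A direct side-by-side of the two summaries, built from the hsummary and ssummary objects above, makes the comparison concrete (a sketch):

bind_rows(mutate(hsummary, email_type = "ham"),
          mutate(ssummary, email_type = "spam")) %>%
  group_by(email_type) %>%
  summarise(median_length = median(email_length),
            median_unique = median(unique_words))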

Visualizations

ham %>%
  select(word:count) %>%
  group_by(word) %>%
  summarise(word_count = sum(count)) %>%
  filter(!str_detect(word, '[[:punct:][:digit:]]')) %>%
  # sort first so head() takes the ten most frequent words
  arrange(desc(word_count)) %>%
  mutate(word = reorder(word, word_count)) %>%
  head(10) %>%
  ggplot(aes(word, word_count)) +
  geom_col() +
  xlab("Ham") +
  coord_flip()

spam %>%
  select(word:count) %>%
  group_by(word) %>%
  summarise(word_count = sum(count)) %>%
  filter(!str_detect(word, '[[:punct:][:digit:]]')) %>%
  # sort first so head() takes the ten most frequent words
  arrange(desc(word_count)) %>%
  mutate(word = reorder(word, word_count)) %>%
  head(10) %>%
  ggplot(aes(word, word_count)) +
  geom_col() +
  xlab("Spam") +
  coord_flip()

From the graphs, you can see there is a lot of gibberish in a typical spam email.
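
One rough way to quantify the gibberish, a heuristic of my own rather than part of the original analysis, is to look for implausibly long tokens and count how many emails contain them:

spam %>%
  filter(nchar(word) > 20) %>%   # implausibly long "words"
  count(word, sort = TRUE) %>%
  head(10)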

Training

Create separate datasets for training and testing; here each email is read whole, as a single row of text.

# function to read a whole email as one row of text
contentGetter <- function(file = NULL, type = NULL) {
  lines <- file %>%
    # read lines from email
    read_lines %>%
    # collapse email into a single string (space keeps words separated)
    str_c(collapse = " ") %>%
    # put text into dataframe
    data_frame(text = .) %>%
    # add email type
    mutate(type = type)
}

whole_df <- function(df, x, type = NULL) {
  # read lines and convert to data frame
  lines <- contentGetter(x, type)

  # merge dfs
  return(bind_rows(df, lines))
}

# function to get contents
getWholeContents <- function(files = NULL, type = NULL) {
  # reduce file list to single df
  files %>%
    reduce(whole_df, .init = data_frame(text = character(0), type = character(0)), type = type)
}

spamWhole <- getWholeContents(spamFiles, "spam")
hamWhole <- getWholeContents(hamFiles, "ham")

Set up training/test data

# divide test and training 75/25
set.seed(1234)  # seed added for a reproducible split (not in the original)
emails <- bind_rows(spamWhole, hamWhole)
randomEmails <- emails[sample(nrow(emails)), ]
trainLength <- floor(nrow(randomEmails) * .75)

trainEmails <- randomEmails[1:trainLength, ]
testEmails <- randomEmails[(trainLength + 1):nrow(randomEmails), ]
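
A quick check of the class balance in each split (my addition, not in the original):

table(trainEmails$type)
table(testEmails$type)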

Term Document Matrix

fullTTData <- bind_rows(trainEmails, testEmails) %>%
  mutate(type = if_else(type == 'spam', 1, 0)) %>%
  as.data.frame()
fullTTTypes <- fullTTData$type
fullTTMsg <- fullTTData$text

# create document-term matrix (named dtm rather than matrix, to avoid
# shadowing base::matrix)
dtm <- create_matrix(fullTTMsg,
                     language = "english", 
                     minWordLength = 3, 
                     removeNumbers = TRUE, 
                     stemWords = FALSE, 
                     removePunctuation = TRUE)


# create container; labels are passed as a plain factor vector
container <- create_container(dtm, 
                              factor(fullTTTypes),
                              trainSize = 1:trainLength,
                              testSize = (trainLength + 1):nrow(fullTTData),
                              virgin = FALSE)
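
A dimension check on the document-term matrix (a quick sanity check) should show one row per email and one column per term:

dim(dtm)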

Model

# training an SVM is slow, so the modeling steps are left commented out;
# uncommenting them trains the model and classifies the test set
# model <- train_model(container, "SVM")
# result <- classify_model(container, model)
# analytics <- create_analytics(container, result)

# document summary of predicted vs. actual labels
# docsum <- analytics@document_summary
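
Once the lines above are uncommented and run, a confusion table can be built from the document summary; the column names here assume RTextTools' standard document_summary layout (MANUAL_CODE for the true label, SVM_LABEL for the prediction):

# table(docsum$MANUAL_CODE, docsum$SVM_LABEL)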