Download Files
The first step is to procure the spam and ham archives from the SpamAssassin public corpus. This is accomplished with download.file.
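The pipeline below leans on a handful of packages; a minimal setup sketch, assuming the tidyverse for data wrangling, tidytext for tokenization, R.utils for bunzip2, and RTextTools for the document-term matrix and modeling:
# packages assumed throughout this document
library(tidyverse)   # dplyr, purrr, stringr, readr, ggplot2
library(tidytext)    # unnest_tokens, stop_words
library(R.utils)     # bunzip2
library(RTextTools)  # create_matrix, create_container, train_model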
url <- 'http://spamassassin.apache.org/old/publiccorpus/'
spamZip <- '20050311_spam_2.tar.bz2'
hamZip <- '20030228_easy_ham.tar.bz2'
# download function
dloadBZ2 <- function(baseURL = NULL, bz2 = NULL) {
  # full URL
  fullUrl <- paste(baseURL, bz2, sep = '')
  # destination
  destFolder <- './spamham/'
  saveTo <- paste(destFolder, bz2, sep = '')
  # create the destination folder if it does not exist
  if (!dir.exists(destFolder)) dir.create(destFolder)
  # download file only if it is not already present
  if (!file.exists(saveTo)) download.file(fullUrl, destfile = saveTo)
  # return downloaded file
  return(saveTo)
}
# download files
spamZip <- dloadBZ2(url, spamZip)
hamZip <- dloadBZ2(url, hamZip)
Unzipping Spam and Ham Files
After downloading, each .tar.bz2 archive must be decompressed and untarred.
# functions to decompress files
decompress <- function(zipFile = NULL) {
  # decompress the .bz2 archive, keeping the original, and get the .tar path
  tarFile <- bunzip2(zipFile, overwrite = TRUE, remove = FALSE)
  # get list of files inside the tarball
  files <- untar(tarFile, list = TRUE)
  # extract the files
  untar(tarFile, exdir = "./spamham/")
  # return list of files
  return(files)
}
# decompress zipfiles
spamList <- decompress(spamZip)
hamList <- decompress(hamZip)
Get File Names
This section builds the list of file paths for the emails extracted from each archive.
# function to get file names
getFileNames <- function(list = NULL) {
  # keep only the email files, build full paths, and trim whitespace
  names <- list %>%
    # drop the short directory/metadata entries from the list
    keep(function(x) nchar(x) > 40) %>%
    # build the full file path as a character vector
    map_chr(function(x) str_c("./spamham/", x)) %>%
    # trim for cleanliness :)
    str_trim()
  # return file names
  return(names)
}
spamFiles <- getFileNames(spamList)
hamFiles <- getFileNames(hamList)
Preliminary Results
Spam Emails: 1396
Ham Emails: 2500
Total Emails: 3896
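These counts follow directly from the lengths of the two file lists; a quick check:
# sanity check on corpus sizes
length(spamFiles)                      # 1396
length(hamFiles)                       # 2500
length(spamFiles) + length(hamFiles)   # 3896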
Get the Contents of Files
In this section, we create a single data frame of all our results, with columns word, count, filename, and email_type (ham or spam). This data frame then powers the rest of the visualizations and analysis.
## load stop words so that they can be removed from each set of data.
data(stop_words)
# function to clean and tidy the data
cleanTidy <- function(file = NULL, type = NULL) {
  file %>%
    # read lines from email
    read_lines() %>%
    # put lines into a data frame
    data_frame(text = .) %>%
    # get every individual word in the email
    unnest_tokens(word, text) %>%
    # remove stop words
    anti_join(stop_words, by = "word") %>%
    # count the remaining words
    count(word, sort = TRUE) %>%
    # add additional columns that could help later
    mutate(filename = str_replace(file, "^\\./spamham/.+/", ""),
           email_type = type,
           word = as.character(word)) %>%
    # rename n to count
    rename(count = n)
}
merge_df <- function(df, x, type = NULL) {
  # read lines and convert to data frame
  lines <- cleanTidy(x, type)
  # merge data frames
  return(bind_rows(df, lines))
}
# function to get contents
getFileContents <- function(files = NULL, type = NULL) {
  # reduce file list to a single data frame
  files %>%
    reduce(merge_df,
           .init = data_frame(word = character(0), count = integer(0),
                              filename = character(0), email_type = character(0)),
           type = type)
}
spam <- getFileContents(spamFiles, "spam")
ham <- getFileContents(hamFiles, "ham")
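A quick peek confirms the expected tidy shape (one row per word per email):
# inspect the combined word counts
head(spam)
head(ham)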
Summary Statistics on Each Dataset
HAM Summary
hsummary <- ham %>%
  group_by(filename) %>%
  summarise(email_length = sum(count),
            unique_words = n_distinct(word)) %>%
  arrange(desc(unique_words))
hsummary %>%
  head(10)
More statistics summarizing HAM data
hsummary %>%
  select(2:3) %>%
  summary()
## email_length unique_words
## Min. : 48.0 Min. : 37.0
## 1st Qu.: 175.0 1st Qu.: 106.0
## Median : 330.0 Median : 164.0
## Mean : 340.6 Mean : 173.1
## 3rd Qu.: 414.0 3rd Qu.: 203.2
## Max. :6607.0 Max. :2268.0
SPAM Summary
ssummary <- spam %>%
  group_by(filename) %>%
  summarise(email_length = sum(count),
            unique_words = n_distinct(word)) %>%
  arrange(desc(unique_words))
ssummary %>%
  head(10)
More statistics summarizing SPAM data
ssummary %>%
  select(2:3) %>%
  summary()
## email_length unique_words
## Min. : 75.0 Min. : 54.0
## 1st Qu.: 252.0 1st Qu.: 147.0
## Median : 407.0 Median : 203.0
## Mean : 641.4 Mean : 241.7
## 3rd Qu.: 768.0 3rd Qu.: 275.0
## Max. :7844.0 Max. :2514.0
Across both metrics, SPAM emails typically contain more unique words and are generally longer.
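The comparison can be made explicit by stacking the two summaries and computing medians side by side; a short sketch using the objects above:
# median length and vocabulary size for ham vs. spam
bind_rows(mutate(hsummary, email_type = "ham"),
          mutate(ssummary, email_type = "spam")) %>%
  group_by(email_type) %>%
  summarise(median_length = median(email_length),
            median_unique = median(unique_words))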
Visualizations
ham %>%
  select(word:count) %>%
  group_by(word) %>%
  summarise(word_count = sum(count)) %>%
  filter(!str_detect(word, '[[:punct:][:digit:]]')) %>%
  # order by frequency so head() picks the most common words
  arrange(desc(word_count)) %>%
  mutate(word = reorder(word, word_count)) %>%
  head(10) %>%
  ggplot(aes(word, word_count)) +
  geom_col() +
  xlab("Ham") +
  coord_flip()
spam %>%
  select(word:count) %>%
  group_by(word) %>%
  summarise(word_count = sum(count)) %>%
  filter(!str_detect(word, '[[:punct:][:digit:]]')) %>%
  # order by frequency so head() picks the most common words
  arrange(desc(word_count)) %>%
  mutate(word = reorder(word, word_count)) %>%
  head(10) %>%
  ggplot(aes(word, word_count)) +
  geom_col() +
  xlab("Spam") +
  coord_flip()
From the graphs, you can see that a typical spam email contains a lot of gibberish.
Training
Create separate datasets for training and testing.
# function to create datasets
contentGetter <- function(file = NULL, type = NULL) {
  file %>%
    # read lines from email
    read_lines() %>%
    # collapse the email into a single string (a space keeps words separated)
    str_c(collapse = " ") %>%
    # put the text into a data frame
    data_frame(text = .) %>%
    # add email type
    mutate(type = type)
}
whole_df <- function(df, x, type = NULL) {
  # read lines and convert to data frame
  lines <- contentGetter(x, type)
  # merge data frames
  return(bind_rows(df, lines))
}
# function to get contents
getWholeContents <- function(files = NULL, type = NULL) {
  # reduce file list to a single data frame
  files %>%
    reduce(whole_df,
           .init = data_frame(text = character(0), type = character(0)),
           type = type)
}
spamWhole <- getWholeContents(spamFiles, "spam")
hamWhole <- getWholeContents(hamFiles, "ham")
Set up training/test data
# divide test and training 75/25
emails <- bind_rows(spamWhole, hamWhole)
# seed for a reproducible shuffle (arbitrary value)
set.seed(1234)
randomEmails <- emails[sample(nrow(emails)), ]
trainLength <- floor(nrow(randomEmails) * .75)
trainEmails <- randomEmails[1:trainLength, ]
testEmails <- randomEmails[(trainLength + 1):nrow(randomEmails), ]
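Since the split relies on a random shuffle, it is worth confirming that both classes appear in each partition; a quick check:
# class balance in each partition
table(trainEmails$type)
table(testEmails$type)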
Term Document Matrix
fullTTData <- bind_rows(trainEmails, testEmails) %>%
  mutate(type = if_else(type == 'spam', 1, 0)) %>%
  as.data.frame()
fullTTTypes <- fullTTData$type
fullTTMsg <- fullTTData$text
# create the document-term matrix (named dtm to avoid shadowing base::matrix)
dtm <- create_matrix(fullTTMsg,
                     language = "english",
                     minWordLength = 3,
                     removeNumbers = TRUE,
                     stemWords = FALSE,
                     removePunctuation = TRUE)
# create container with the numeric labels and the train/test split
container <- create_container(dtm,
                              fullTTTypes,
                              trainSize = 1:trainLength,
                              testSize = (trainLength + 1):nrow(fullTTData),
                              virgin = FALSE)
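Before modeling, a quick dimension check on the matrix is a useful sanity test; one row per email is expected:
# rows = number of emails, columns = vocabulary terms
dim(dtm)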
Model
The SVM workflow is left commented out below; uncommenting it trains the model on the container, classifies the held-out emails, and collects the results into an analytics object.
# model <- train_model(container, "SVM")
# result <- classify_model(container, model)
# analytics <- create_analytics(container, result)
# document summary
# docsum <- analytics@document_summary
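Once the block above has been run, the analytics object exposes performance summaries; a sketch of how they might be inspected (slot names per RTextTools):
# precision / recall / f-score per algorithm
# analytics@algorithm_summary
# per-label performance breakdown
# analytics@label_summary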