This project is to build a Supervised Learning techniques based document classification model to detect Spam Vs. Ham e-mail documents. Training and Test data has been obtained based on the public data sets available under below location -
https://github.com/soumya2g/R-CUNY-MSDS/tree/master/DATA-607/E-mail%20Spam%20Detection%20Model
Load necessary libraries -
library(kableExtra)
library(dplyr)
library(tm)
library(stringr)
library(ggplot2)
library(rvest)
library(R.utils)
library(quanteda)
library(readtext)
The goal of this project is to build a supervised document classification solution to work with the already labelled Training data sets (Spam Vs. Ham) and build corpus of documents, extract tokens, build a document feature matrix following a structured workflow approach as depicted below -
I have tried to modularise the solution using few wrapper functions-
downloadTAR() : The purpose of this function is to read a set of input zipped files and a file type label for ‘Spam’ or ‘Ham’ and extract the raw text files into appropriate Training data directory and return a data frame containing the listing of files for a specific label (Spam or Ham).
buildCorpus() : This function receives the file type label and the dataframe containing the list of training data files. It reads all these files from appropriate training data directory and parses each of the files to build a corpus of documents including document level variables like e-mail sender address, subject line etc.
Note: I have used rvest package to read the e-mail documents body section based on HTML Tags. While using this library there were quite a few documents which threw errors for which I had to create a list of error files and exclude them in my Training data build process.
### Function to download all Spam and Ham Files
downloadTAR <- function(filetype=NULL, url=NULL, rootfile=NULL){
directoryReplace <- str_sub(rootfile,10,str_locate(rootfile,"\\.")[1,1]-1)
myfile <- paste(url,rootfile,sep="")
destfile <- paste(filetype,".tar.bz2", sep="")
download.file(myfile, destfile= destfile)
destTarFile <- paste(filetype,".tar", sep="")
bunzip2(destfile,destname=destTarFile, overwrite=TRUE, remove=TRUE)
untar(destTarFile,exdir = paste(getwd(),"/",filetype,sep = ""))
filenamesDF <- data_frame(filetype,filename = str_trim(str_sub(untar(destTarFile, list = TRUE),nchar(directoryReplace)+2,-1)))
return(filenamesDF)
}
### Function to build corpus
buildCorpus <- function(importtype=NULL, filenames=NULL){
if (importtype == "Spam") {
filePaths <- paste(getwd(),"/",importtype,"/",importtype,"/",filenames$filename, sep = "")
}
if (importtype == "Ham") {
filePaths <- paste(getwd(),"/",importtype,"/easy_ham/",filenames$filename, sep = "")
}
tempEmailBody <- vector()
tempEmailFrom <- vector()
tempEmailSubject <- vector()
docid <- vector()
for(i in 1:length(filenames$filename)){
if(filenames$filename[i] != "cmds" & nchar(filenames$filename[i])>10 & filenames$filename[i] != "0000.7b1b73cf36cf9dbc3d64e3f2ee2b91f1" &
filenames$filename[i] != "0143.260a940290dcb61f9327b224a368d4af" & filenames$filename[i] != "00136.faa39d8e816c70f23b4bb8758d8a74f0" &
filenames$filename[i] != "0231.7c6cc716ce3f3bfad7130dd3c8d7b072" & filenames$filename[i] != "0250.7c6cc716ce3f3bfad7130dd3c8d7b072")
{
con <- file(filePaths[i], open="rt")
text <- readLines(con)
if(length(text) > 1)
{
msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
eMailBodyDoc <- msg
emailSubject <- str_trim(str_replace_all(str_sub(unlist(str_extract_all(eMailBodyDoc,"Subject:.*")),10,-1)," ",""))
emailFrom <- str_trim(str_sub(unlist(str_extract_all(eMailBodyDoc,"From:.*")),7,-1))
eMailBodyDoc <- eMailBodyDoc %>% str_replace_all("\n","")
eMailBodyDoc <- eMailBodyDoc %>% str_replace_all("<.*>","")
tempEmailBody[i] <- ifelse(length(eMailBodyDoc) == 0,"Body not available",eMailBodyDoc)
tempEmailFrom[i] <- ifelse(length(emailFrom) == 0,"Sender not available",emailFrom)
tempEmailSubject[i] <- ifelse(length(emailSubject) == 0,"Subject not available",emailSubject)
docid[i] <- ifelse(nchar(filenames$filename[i]) == 0,"E-mail name not available",filenames$filename[i])
}
close(con)
}
}
email_corp <- corpus(tempEmailBody, docvars = data_frame(Email_From = tempEmailFrom, Email_Subject = tempEmailSubject, Tag = importtype))
email_corp <- corpus_subset(email_corp,!is.na(Email_From))
docid <- docid[ifelse(!is.na(docid), TRUE,FALSE)]
docnames(email_corp) <- docid
return(email_corp)
}
The 9 folders available in the source were decompressed and copied into two folders, one containing the “ham” and the other containing the “spam”. I found out that there were some duplicated files in the original raw dataset. After removing the duplicates, I obtained 6,952 ham and 2,398 spam files, for a total of 9350 files.
I downloaded the training data files from the source and extracted them under appropriate local source directories using the wrapper function downloadTAR()
URL <- "https://spamassassin.apache.org/old/publiccorpus/"
spamFiles <- c("20021010_spam.tar.bz2","20030228_spam.tar.bz2","20030228_spam_2.tar.bz2")
hamFiles <- c("20021010_easy_ham.tar.bz2","20021010_hard_ham.tar.bz2","20030228_easy_ham.tar.bz2",
"20030228_easy_ham_2.tar.bz2")
### Set Local Working directory
#setwd("C:/CUNY/Semester1 (Fall)/DATA 607/Project 4/Source/Training Data")
## Extract all the Spam Files under Training Data
spamFileNames <- data_frame()
for(i in 1:length(spamFiles))
{
spamFileNames <- rbind(spamFileNames,downloadTAR("Spam", URL, spamFiles[i]))
}
spamTrainCount <- spamFileNames %>% group_by(filetype) %>% summarise(file_count = n())
## Extract all the Ham Files under Training Data
hamFileNames <- data_frame()
for(i in 1:length(hamFiles))
{
hamFileNames <- rbind(hamFileNames,downloadTAR("Ham", URL, hamFiles[i]))
}
hamTrainCount <- hamFileNames %>% group_by(filetype) %>% summarise(file_count = n())
#### Training Data Summary
rbind(spamTrainCount,hamTrainCount) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
filetype | file_count |
---|---|
Spam | 2403 |
Ham | 6707 |
#### Build Spam Corpus and Summarise
spamEmailCorp <- buildCorpus(importtype = "Spam", filenames = spamFileNames)
summary(spamEmailCorp) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
Text | Types | Tokens | Sentences | Email_From | Email_Subject | Tag |
---|---|---|---|---|---|---|
0103.8c39bfed2079f865e9dfb75f4416a468 | 9 | 9 | 1 | Sender not available | Subject not available | Spam |
0107.f1d4194b57840ea6587b9a73ed88e075 | 1 | 1 | 1 | Sender not available | Subject not available | Spam |
0012.7bc8e619ad0264979edce15083e70a02 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0114.c104ada3a249e1e1846c0cd156a303e9 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0118.4be8b50c2a818c62b62e70c4b5456113 | 11 | 12 | 1 | Sender not available | Subject not available | Spam |
0121.772c3ccd1b6c1a2e0e2ec0356082c77b | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0125.44381546181fc6c5d7ea59e917f232c5 | 11 | 13 | 1 | Sender not available | Subject not available | Spam |
0129.78a705ff6b3bde3395d067459e6e46e2 | 13 | 13 | 2 | Sender not available | Subject not available | Spam |
0132.7ac2141ed9a163f934ac65b3f59a2a03 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0136.7e7d6adf293fa0a3dc56b3f796cf00d1 | 3 | 3 | 1 | Sender not available | Subject not available | Spam |
0140.a2bb669eaf743ed123fca884a40cfbd4 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0147.65cf30538f09402e4d1bd4aa91d9532a | 51 | 74 | 5 | Sender not available | Subject not available | Spam |
0016.f9c349935955e1ccc7626270da898445 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0154.e39fc51ffdb9c2ecd480ce972078aeaa | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0158.ff5dce5446d2ec91f0caffeffdd48852 | 9 | 9 | 1 | Sender not available | Subject not available | Spam |
0161.00e60d1a3478f1ae99ff49fbd4b30605 | 7 | 8 | 2 | Sender not available | Subject not available | Spam |
0165.6eedc001155da3cbd75a60eba2b19448 | 202 | 375 | 12 | Sender not available | Subject not available | Spam |
0169.bc6e1356af0602fb96dd3f721fb17c48 | 4 | 5 | 1 | Sender not available | Subject not available | Spam |
0172.e524e85cab354337018e1d0d2fc21ffd | 15 | 15 | 2 | Sender not available | Subject not available | Spam |
0176.70022adaab1a9dfe64ae7588ffa5add9 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0180.afdbcd7acb65828c217eea90ff92c3b0 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0183.4aaadeb40e3362e71e3e4aba15624e3a | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0187.e2178f6d01a70dfbdf9c84c4dcaf58dc | 5 | 5 | 1 | Sender not available | Subject not available | Spam |
0020.4120dc06a0124a8688e96f8cff029113 | 38 | 46 | 5 | Sender not available | Subject not available | Spam |
0194.dd4dd86bdd8e1113889af9afaf299d6c | 5 | 5 | 1 | Sender not available | Subject not available | Spam |
0201.9da0b5702a864a8ffd06cfb4c724f9c8 | 9 | 9 | 1 | Sender not available | Subject not available | Spam |
0003.4b3d943b8df71af248d12f8b2e7a224a | 8 | 8 | 1 | Sender not available | Subject not available | Spam |
0206.806263422d55d38a151fe3b89d56192f | 7 | 7 | 1 | Sender not available | Subject not available | Spam |
0022.4b5cf3c16feb88dd6932a8c46a41946c | 4 | 4 | 1 | Sender not available | Subject not available | Spam |
0215.57c4f4d8e2f582088f8aca38239059f7 | 3 | 8 | 1 | Sender not available | Subject not available | Spam |
0216.feb2a8df9887bc2d84e80c9d2a8faf56 | 5 | 10 | 2 | Sender not available | Subject not available | Spam |
0220.15583875f5ef9e2cf6450ebc821f0dff | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0271.24302cf2e759401d1f9975fe4fc1def9 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0301.ad155a30cca1f9d16e75e8934030edae | 9 | 9 | 1 | Sender not available | Subject not available | Spam |
0341.7c13df68bb4feae35d9ea86001a3ecad | 11 | 11 | 2 | Sender not available | Subject not available | Spam |
0381.492ed1e5eed1b631560e2009be5b8c9a | 3 | 8 | 1 | Sender not available | Subject not available | Spam |
0411.e6e37cbb02ad33b4e0ba5fb6caf2bbcf | 17 | 19 | 1 | Sender not available | Subject not available | Spam |
0451.588b22df28f4036ff3895447afbcb7f3 | 3 | 8 | 1 | Sender not available | Subject not available | Spam |
0491.f47154f78397c57b14e05450a16745d5 | 5 | 5 | 1 | Sender not available | Subject not available | Spam |
0054.839a9c0a07f13718570da944986a898a | 13 | 13 | 1 | Sender not available | Subject not available | Spam |
0058.abb13c5db31d26a58607aac809573ed4 | 6 | 9 | 2 | Sender not available | Subject not available | Spam |
0061.c148ebba16540e48c7aae2e3f733a8a3 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0065.18d2edcf9aa0e940651b5fdd218ac019 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0069.a0b6cfde0e477af7f406ee756ba53826 | 6 | 6 | 1 | Sender not available | Subject not available | Spam |
0072.f97a14d667569ebbc0502bb2c7beec27 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0076.770f0e7b8378a47a945043434f6f43df | 1 | 1 | 1 | Sender not available | Subject not available | Spam |
0080.77af9ca7f967f055062aade45001129e | 4 | 4 | 1 | Sender not available | Subject not available | Spam |
0083.a042c7512d5db5f9fc1857fdc6bbdcc3 | 4 | 4 | 1 | Sender not available | Subject not available | Spam |
0087.1cbd88a0c1564cb5d6c9b12c8c4175d8 | 9 | 9 | 1 | Sender not available | Subject not available | Spam |
0010.7f5fb525755c45eb78efc18d7c9ea5aa | 3 | 3 | 1 | Sender not available | Subject not available | Spam |
0094.3ba780eac7dce1c2b063cd1fc12738be | 3 | 8 | 1 | Sender not available | Subject not available | Spam |
0098.01d2958ccb7c2e4c02d0920593962436 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0001.bfc8d64d12b325ff385cca8d07b84288 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0049.625bab436c7fc6299cfceeaa24e198ae | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0500.2e8762b67913d1b07bc8da293448d27f | 5 | 5 | 1 | Sender not available | Subject not available | Spam |
0488.6d41f6d7222978a3ee2b6cfbfce55a02 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0492.f2d030fd71d7c3075626195b5c0b56f7 | 8 | 13 | 5 | Sender not available | Subject not available | Spam |
0495.a13bce4369913c929a48b073f2b320c9 | 8 | 8 | 1 | Sender not available | Subject not available | Spam |
0307.2e4dc0cdb1e3b49f0986c19c1f324224 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0308.1245e8fa9e6092687b535e36b367d8fb | 9 | 9 | 1 | Sender not available | Subject not available | Spam |
0309.2a74113b0330ea76cecd28571fc6f7fe | 3 | 8 | 1 | Sender not available | Subject not available | Spam |
0032.081c3615bc9b91d09b6cbb9239ba8c99 | 9 | 10 | 1 | Sender not available | Subject not available | Spam |
0312.a0e7f2633bd0ceaddf16fba58be54778 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0315.26ca39910895a935e2b8bca93a44ebfe | 7 | 12 | 1 | Sender not available | Subject not available | Spam |
0319.e4a20802d12937998f3b3bf805362a3f | 9 | 9 | 1 | Sender not available | Subject not available | Spam |
0323.badf0273f656afd0dfebaa63af1c81f6 | 3 | 3 | 1 | Sender not available | Subject not available | Spam |
0327.5df76bb4359800b5408821285677b5cf | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0329.5c22249fa35fff050675e7df4433b89f | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0332.b82bddb316d2e12418d6ea8791ce5896 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0336.b864dd710e659f0ef5826dc4d80714f8 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0340.8e191c37e2d30a639013203aacf60086 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0344.8bbe5c7c8269a039761968a1b10a936a | 2 | 7 | 1 | Sender not available | Subject not available | Spam |
0348.e0b89978fa806cf3e7fd3ba0869b3c65 | 54 | 75 | 5 | Sender not available | Subject not available | Spam |
0352.f7adb4aa267e50a8db1e4bcacfe863f3 | 9 | 9 | 1 | Sender not available | Subject not available | Spam |
0356.86a795300367f707a8b648e0c50253ad | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0360.5f5fc66c831d845705efac502121308a | 4 | 4 | 1 | Sender not available | Subject not available | Spam |
0364.8e5f3385c2deb2c0c32794b403851ec4 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0368.3a53888c2f7fbe52a7293f223375c245 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0372.216f90ef52558ed24402e192586a40e8 | 5 | 5 | 2 | Sender not available | Subject not available | Spam |
0376.d87b4313e6c43a986060d57a0b8515a6 | 2 | 15 | 1 | Sender not available | Subject not available | Spam |
0380.c4d530b5816543f4f1a23b8ce0d281f5 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0383.5b89d5a9c0152070a77e133734f7cd83 | 2 | 2 | 1 | Sender not available | Subject not available | Spam |
0389.ed4ca8aceef91808c783909351c7bdb4 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0393.d3a4d296a35c6a7f39429247c007eeae | 1 | 69 | 1 | Sender not available | Subject not available | Spam |
0397.c02eba1386b00d640c954e5117dd1aa0 | 12 | 12 | 2 | Sender not available | Subject not available | Spam |
0005.1f42bb885de0ef7fc5cd09d34dc2ba54 | 7 | 7 | 1 | Sender not available | Subject not available | Spam |
0405.18a5c3d971e1def2c3b4a2df122f3583 | 2 | 2 | 1 | Sender not available | Subject not available | Spam |
0409.09cb28cd8753bff06fc8a547c3ed8fe2 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0042.21cc985cc36d931916863aed24de8c27 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0235.77e9a7e398ce81359c08e64bf20e9825 | 5 | 5 | 1 | Sender not available | Subject not available | Spam |
0246.3b997087302d48ff57ab5afb3d400d5b | 3 | 3 | 1 | Sender not available | Subject not available | Spam |
0245.39c15852204971c72e8d89f9f3f9bb38 | 4 | 4 | 1 | Sender not available | Subject not available | Spam |
0413.4c74110f6640067c2172a04543dea670 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0260.737eefb83e7eedbd531117c273c56241 | 7 | 7 | 1 | Sender not available | Subject not available | Spam |
0256.ad88c1a165392a509a8b0b8df6d56cbd | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
0265.1120a7d868b23e83b91ad00ec8b79e08 | 7 | 7 | 1 | Sender not available | Subject not available | Spam |
0028.83a43dd97923463030349506a56226c1 | 8 | 8 | 1 | Sender not available | Subject not available | Spam |
0274.85756abb8d0bcfe267e464a2f33ce686 | 4 | 5 | 2 | Sender not available | Subject not available | Spam |
0279.4ef122899a70a2225ffb9b5c54fde1fc | 9 | 9 | 1 | Sender not available | Subject not available | Spam |
0292.3e12964912377bd9b52d223e37812e56 | 0 | 0 | 0 | Sender not available | Subject not available | Spam |
#### Build Ham Corpus and Summarise
hamEmailCorp <- buildCorpus(importtype = "Ham", filenames = hamFileNames)
summary(hamEmailCorp) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
Text | Types | Tokens | Sentences | Email_From | Email_Subject | Tag |
---|---|---|---|---|---|---|
1011.82f644586fced13704dd79e22c3d8fb9 | 0 | 0 | 0 | fork-admin@xent.com [mailto:fork-admin@xent.com] On Behalf Of | Re: The Curse of India’s Socialism | Ham |
1051.cf81a19208b703f18497a0d6fedb1f13 | 2 | 2 | 1 | Sender not available | Subject not available | Ham |
1091.0bade8676340d304cae87dad02efa8ce | 13 | 14 | 1 | Brent Welch <welch@panasas.com>; | Subject not available | Ham |
1121.51f7e5e557bde451a6b36e527211ed04 | 11 | 13 | 1 | Sender not available | Subject not available | Ham |
1161.9bcd69bccfeb05378e3e36fa62b16f7d | 15 | 16 | 1 | Sender not available | Subject not available | Ham |
0121.b475478456e52de66ef0b0fb501bbfd3 | 3 | 3 | 1 | Sender not available | Subject not available | Ham |
1231.9a7db322df8f2bdf4eeb2d589cb51e34 | 3 | 5 | 1 | Hal DeVore <haldevore@acm.org>; | Subject not available | Ham |
1271.1af7c90a1459165ff18d621de40239c5 | 8 | 8 | 1 | Sender not available | Subject not available | Ham |
1301.7d4abb56d43695d968aafd6f15f5bbb3 | 9 | 9 | 1 | Sender not available | Subject not available | Ham |
1341.91bc30d50566e71807217c8977f7a793 | 8 | 8 | 1 | Sender not available | Subject not available | Ham |
1381.3c5527db01789ad42005006ac2ed2fcc | 8 | 8 | 1 | “” Angles " Puglisi" <angles@aminvestments.com>; | some (null) eyecandy packages | Ham |
1411.a455fcdcc40a1a29551cc0153a9450bf | 3 | 3 | 1 | Sender not available | Subject not available | Ham |
1451.64eda615fc56b4bbfdce6291a59c12f4 | 0 | 0 | 0 | Sender not available | Subject not available | Ham |
1491.2e82a3803e51f420c6398d963052469f | 2 | 2 | 1 | Sender not available | Subject not available | Ham |
1521.692de69e480a819f6d32578f93fca74b | 10 | 11 | 1 | Sender not available | Subject not available | Ham |
0191.314e2f68086989044e631e347a03b979 | 7 | 7 | 1 | Sender not available | Subject not available | Ham |
0231.1bc071b16de2cc1e2adab334be65f7c6 | 14 | 14 | 1 | Sender not available | Subject not available | Ham |
0271.349d737f101674586c61996593772a63 | 16 | 20 | 1 | Sender not available | Subject not available | Ham |
0311.de3984f9da9dba841ba515681fa065a6 | 0 | 0 | 0 | Sender not available | Subject not available | Ham |
0351.a7381397d31e8511581dc5cd59f39959 | 12 | 13 | 2 | Sender not available | Subject not available | Ham |
0391.e1f15b5f5a6dbbb8cde6571055be3127 | 13 | 14 | 2 | Sender not available | Subject not available | Ham |
0431.26f19fa47fab85e813b1aba8ff6139d6 | 16 | 17 | 1 | Sender not available | Subject not available | Ham |
0471.55f1c6122fd60d151c2c42182ecb734a | 17 | 18 | 1 | Sender not available | Subject not available | Ham |
0511.7c59aca5d737ae0f1a94b8e08d5def67 | 9 | 9 | 1 | Sender not available | Subject not available | Ham |
0551.5bdfb0c299c60e442e39346dc08bad68 | 12 | 13 | 1 | Sender not available | Subject not available | Ham |
0591.f533bea095d75cc4dc282bdcba69072a | 4 | 4 | 1 | fork-admin@xent.com [mailto:fork-admin@xent.com] On Behalf Of | Re: The Big Jump | Ham |
0631.6a6516bb19c38ae705d06f9518231f49 | 0 | 0 | 0 | Sender not available | Subject not available | Ham |
0671.ea82754ba3b836e43d376b32c07b79f0 | 12 | 14 | 1 | Sender not available | Subject not available | Ham |
0711.27203d4f43e71f7e1ced0cdd7f8685c8 | 4 | 4 | 1 | Sender not available | Subject not available | Ham |
0751.58282452f1adc8ad703ddc4cf12c2e37 | 0 | 0 | 0 | “R. A. Hettinga” <rah@shipwright.com>; | Subject not available | Ham |
0791.0b633957da3fa40b511d8e56ad877722 | 0 | 0 | 0 | “John Hall” <johnhall@evergo.net>; | Subject not available | Ham |
0831.0162ac7b4cc1c62fb35803bc4e8db70d | 11 | 13 | 1 | Sender not available | Subject not available | Ham |
0871.79be1926ade2b8fc591f9f51abf66224 | 4 | 4 | 1 | Sender not available | Subject not available | Ham |
0911.dcc71630eed7821469e4c26e5b768aee | 0 | 0 | 0 | Sender not available | Subject not available | Ham |
0951.542debe6e20315751c6e2c9bdedff85d | 7 | 8 | 1 | “Mr. FoRK” <fork_list@hotmail.com>; | Documentum Acquires E-Room, Melding Content, Collaboration | Ham |
0991.f273a70df275a44e46f4544897eaee23 | 10 | 11 | 1 | Sender not available | Subject not available | Ham |
1561.b968a0929d29009dbb76603323c1862f | 6 | 9 | 3 | spamassassin-talk-admin@example.sourceforge.net | RE: [SAtalk] 2.41/2.50 spamd/spamc problem | Ham |
1601.e586e85a3d75cc48f9b913f244d52632 | 5 | 5 | 1 | Sender not available | Subject not available | Ham |
1641.7555c5920365e6315e3f20d83211d558 | 5 | 5 | 1 | “rODbegbie” <rOD@arsecandle.org>; | Re: [SAtalk] spamd error messages | Ham |
1681.b17d16768d9543099dd7fe511f14ca9e | 2 | 2 | 1 | Sender not available | Subject not available | Ham |
1721.2f654b5e99867bebf86ebb0280fb8e48 | 13 | 13 | 1 | bmord@icon-nicholson.com [mailto:bmord@icon-nicholson.com] | use of base image / delta image for automated recovery from | Ham |
1761.eeb706ce24cbbf2cd21648a4781a1464 | 12 | 12 | 2 | Sender not available | Subject not available | Ham |
1801.906dd11cca6bee22c6843afb597c87a3 | 8 | 8 | 1 | Sender not available | Subject not available | Ham |
1841.780eecf7f0db1db00bd0a6248de51260 | 4 | 4 | 1 | Sender not available | Subject not available | Ham |
1881.7a4b4e4c68a852fb5fb5876ec76899ab | 5 | 5 | 1 | Sender not available | Subject not available | Ham |
1921.fa34fb56fd0a04ce46ac71d176e6ec55 | 4 | 4 | 1 | Sender not available | Subject not available | Ham |
1981.2482770c04473ab0060a30b04366cf37 | 12 | 21 | 1 | Sender not available | Subject not available | Ham |
2011.9604a6dfecb37414590009d7c0c04f30 | 12 | 21 | 1 | Sender not available | Subject not available | Ham |
2051.e7d9645f8b4b87954a4445b9129a7ca5 | 8 | 13 | 1 | Sender not available | Subject not available | Ham |
2091.1a1d629678aa1ab8953772a2cd006183 | 8 | 13 | 1 | Sender not available | Subject not available | Ham |
2131.87a32fea62016d67931fa6138ad51748 | 8 | 13 | 1 | Sender not available | Subject not available | Ham |
0051.9281d3f8a3faf47d09a7fafdf2caf26e | 4 | 4 | 1 | Sender not available | Subject not available | Ham |
0091.3bdd7b578973ee005733480a8b6c9b54 | 14 | 16 | 2 | Sender not available | Subject not available | Ham |
2181.3f950d2c7e806d81476ecbd529759d0e | 10 | 15 | 1 | Sender not available | Subject not available | Ham |
2221.13e25582abf522bfe1afc5c9ad180bde | 8 | 13 | 1 | Sender not available | Subject not available | Ham |
2261.db40430b71ffa3e8f18a564181567788 | 8 | 13 | 1 | Sender not available | Subject not available | Ham |
2301.12cd963ea74e881168b6bb865dedb892 | 6 | 9 | 1 | Sender not available | Subject not available | Ham |
2341.23fc100bad93aab582c5fe870225bac2 | 8 | 13 | 1 | Sender not available | Subject not available | Ham |
2381.b2da2e96c499fbf2c4cc0ff9491cf337 | 8 | 13 | 1 | Sender not available | Subject not available | Ham |
2421.aa75fb78c4f596e7ee91808fbdc34ab8 | 8 | 13 | 1 | Sender not available | Subject not available | Ham |
2461.20f3b1b3e7e2bd1bd6a016f9f09374d3 | 6 | 9 | 1 | Sender not available | Subject not available | Ham |
2501.36311f6ceac129fa31540555ffe3cc97 | 8 | 13 | 1 | Sender not available | Subject not available | Ham |
2541.892b976061b9ea684b9b3009d0b87dab | 8 | 13 | 1 | Sender not available | Subject not available | Ham |
0260.b68400a28ee29cb2f24149a03db1fd9e | 6 | 6 | 2 | Sender not available | Subject not available | Ham |
0264.a1183a59e4f0a71e80378d9404a3212f | 6 | 6 | 2 | Sender not available | Subject not available | Ham |
0268.77ef28e27a9ee085646f260418072111 | 4 | 4 | 1 | Sender not available | Subject not available | Ham |
0272.c3e06e3aa72f63dee68faaed6fbaaa1a | 3 | 3 | 1 | Sender not available | Subject not available | Ham |
0276.394eddb373972d985cabf6f63953a3d0 | 14 | 15 | 1 | Sender not available | Subject not available | Ham |
0280.6dca279b3e6aa252197d8439841032b4 | 10 | 15 | 1 | Sender not available | Subject not available | Ham |
0284.4af1f0641ea6aae6a9645dcbeabe95e9 | 2 | 2 | 1 | Sender not available | Subject not available | Ham |
0288.3ae040d2c993ea997470df41f31aebcb | 12 | 14 | 1 | Sender not available | Subject not available | Ham |
0292.6599ff4b0c1593297dd17247919b3fcf | 1 | 1 | 1 | Sender not available | Subject not available | Ham |
0296.42216a75e0256510b216eaba6893d40d | 14 | 14 | 1 | Sender not available | Subject not available | Ham |
0300.2e30b3bffb4f2887df203c197d11e936 | 2 | 2 | 1 | Sender not available | Subject not available | Ham |
0304.73bb5ec3f02f4db15750531e226b1cb8 | 3 | 3 | 1 | “Hunt, Bryan” <B.Hunt@emuse-tech.com>; | OT: RE: [ILUG] Newby to Linux looking for information on cvs | Ham |
0308.5c3218cf2a6260c6178cbea1b9e345f7 | 3 | 3 | 1 | Sender not available | Subject not available | Ham |
0312.d17a25919e7985a270c6fdee37a8f83e | 15 | 19 | 1 | Sender not available | Subject not available | Ham |
0316.0b7a8e1acbd09115574dc58120d93000 | 10 | 13 | 2 | Sender not available | Subject not available | Ham |
0320.6c54ea1bb991c6fae395588219cfce37 | 11 | 12 | 1 | Sender not available | Subject not available | Ham |
0324.d425c24d444091807e283e66449853b0 | 4 | 4 | 1 | Sender not available | Subject not available | Ham |
0328.dd38a12b955002695e7a0ec81129b043 | 10 | 11 | 1 | Sender not available | Subject not available | Ham |
0332.daed28f33b65dd9f1c91fa3737d21340 | 4 | 4 | 1 | Sender not available | Subject not available | Ham |
0336.d8d6d93ff9918e7a6b4b83a5bda3043e | 13 | 15 | 1 | Sender not available | Subject not available | Ham |
0340.aae9e33fb151ae061354b8cfe9f90b3d | 11 | 13 | 1 | Sender not available | Subject not available | Ham |
0345.c30e766af45337ac505a52ad592ab954 | 2 | 2 | 1 | Sender not available | Subject not available | Ham |
0349.1f77fea2fe759b72c8f29740558ffae8 | 10 | 15 | 1 | Sender not available | Subject not available | Ham |
0353.f4c84bdca20621cd5aa9bf8d6210e0b4 | 3 | 3 | 1 | Sender not available | Subject not available | Ham |
0462.6b066ed01f1856371a7ca52580774a20 | 8 | 8 | 1 | Sender not available | Subject not available | Ham |
0466.831f4b97805c8e03cf6630716309e89e | 10 | 11 | 1 | Sender not available | Subject not available | Ham |
0470.d5ad7286cd913b0cfdff5b954b9338ac | 14 | 17 | 1 | Sender not available | Subject not available | Ham |
0474.0b5f82aa1324baf9cf718849c48d679e | 0 | 0 | 0 | A guy who models plasma all day… | Re: Electric car an Edsel… | Ham |
0477.b7958d4d9cb312c16965859705826a7e | 5 | 6 | 1 | Sender not available | Subject not available | Ham |
0049.bda43370915afa1f557f7edab6913e04 | 13 | 14 | 1 | Sender not available | Subject not available | Ham |
0485.4657c96dd864d02d6273cb268b631015 | 9 | 9 | 1 | Sender not available | Subject not available | Ham |
0488.8fddac859ba93d27a041fe770be65d29 | 0 | 0 | 0 | “Jim Whitehead” <ejw@cse.ucsc.edu>; | Subject not available | Ham |
0492.3aa3aaa0ac9343fd1aa11864e8c281b1 | 11 | 12 | 1 | Sender not available | Subject not available | Ham |
0496.397e015dabee7ac3c6e655ad9bf66052 | 15 | 15 | 1 | Sender not available | Subject not available | Ham |
0500.26e81584fd9c739be4898acd4870143e | 16 | 16 | 2 | Sender not available | Subject not available | Ham |
0503.5364d68940343b854595410487aaa1ba | 9 | 9 | 1 | Sender not available | Subject not available | Ham |
0507.3e5a5811fbfca49dc665f982e81ea271 | 3 | 4 | 1 | Sender not available | Subject not available | Ham |
While tokenizing the corpuses using Quanteda’s tokens() function, I have removed the numbers and punctuation symbols.
### Spam Tokens
spamTokens <- tokens(spamEmailCorp,remove_punct = TRUE,remove_numbers = TRUE)
head(spamTokens)
## tokens from 6 documents.
## 0103.8c39bfed2079f865e9dfb75f4416a468 :
## [1] "This" "is" "a" "multi-part" "message"
## [6] "in" "MIME" "format"
##
## 0107.f1d4194b57840ea6587b9a73ed88e075 :
## character(0)
##
## 0012.7bc8e619ad0264979edce15083e70a02 :
## character(0)
##
## 0114.c104ada3a249e1e1846c0cd156a303e9 :
## character(0)
##
## 0118.4be8b50c2a818c62b62e70c4b5456113 :
## [1] "Whiter" "teeth" "and" "a" "brighter" "smile"
## [7] "are" "just" "a" "click" "away"
##
## 0121.772c3ccd1b6c1a2e0e2ec0356082c77b :
## character(0)
### Ham Token
hamTokens <- tokens(hamEmailCorp,remove_punct = TRUE,remove_numbers = TRUE)
head(hamTokens)
## tokens from 6 documents.
## 1011.82f644586fced13704dd79e22c3d8fb9 :
## character(0)
##
## 1051.cf81a19208b703f18497a0d6fedb1f13 :
## [1] "Hi"
##
## 1091.0bade8676340d304cae87dad02efa8ce :
## [1] "On" "Fri" "Sep" "at" "Robert" "Elz" "wrote"
##
## 1121.51f7e5e557bde451a6b36e527211ed04 :
## [1] "On" "Thu" "at" "Matthias" "Saou" "wrote"
##
## 1161.9bcd69bccfeb05378e3e36fa62b16f7d :
## [1] "Hi" "I'm" "building" "an" "rpm"
## [6] "for" "the" "resin" "webserver" "and"
## [11] "I" "basically" "want" "to"
##
## 0121.b475478456e52de66ef0b0fb501bbfd3 :
## [1] "Hi" "All"
### Keyword in Context based search for "Spam"
kwic(spamEmailCorp, "spam", valuetype = "regex") %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
docname | from | to | pre | keyword | post |
---|---|---|---|---|---|
00161.ae33257753c9bdaaadc9221347868496 | 3 | 3 | HI , | zzzz@spamassassin.taint.org | today , |
00354.dca4b8984863a76ffd01a33888498288 | 5 | 5 | New Account For : | zzzz@spamassassin.taint.org | |
00462.868771c8074e480f540a1d2e6a5ac7cb | 5 | 5 | New Account For : | zzzz@spamassassin.taint.org | |
00824.eec96f74d95afedbe574498808d29395 | 4 | 4 | This is NOT | spam | . = 20 |
00952.1a3c371c56be9de3bfb258b93af71649 | 3 | 3 |
|
DeathToSpamDeathToSpamDeathToSpam |
|
00953.906b4905eb02cfb9a093162f3c143252 | 3 | 3 |
|
DeathToSpamDeathToSpamDeathToSpam |
|
01073.bfc8a301fd274efde213c0249d5a85b7 | 4 | 4 | This is not | spam | . Thanks for posting to |
01380.fa9b4e89ba485def2921e01ae9fb7671 | 165 | 165 | of unsolicited email broadcasting ( | spam | ) . . . Therefore |
01380.fa9b4e89ba485def2921e01ae9fb7671 | 305 | 305 | clean the databases and be | spam | free . Thank you again |
In order to perform statistical analysis such as document scaling, we must extract a matrix associating values for certain features with each document. I have used quanteda library’s dfm function to produce such a matrix. “dfm” is short for document-feature matrix, and always refers to documents in rows and “features” as columns. We fix this dimensional orientation because it is standard in data analysis to have a unit of analysis as a row, and features or variables pertaining to each unit as columns. We call them “features” rather than terms, because features are more general than terms: they can be defined as raw terms, stemmed terms, the parts of speech of terms, terms after stopwords have been removed, or a dictionary class to which a term belongs.
### Document Feature Matrix: Spam
spamDFM <- dfm(spamEmailCorp, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)
spamDFM <- dfm_trim(spamDFM, min_termfreq = 4, max_docfreq = 10)
head(spamDFM)
## Document-feature matrix of: 6 documents, 325 features (99.9% sparse).
topfeatures(spamDFM,20)
## à ª age 2f sent sell pleas day name
## 16 16 14 13 12 12 11 11 10
## chat bank month us moment fill low qualifi smoker
## 10 10 10 10 10 10 10 10 10
## termlif comput
## 10 10
### Document Feature Matrix: Ham
hamDFM <- dfm(hamEmailCorp, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)
hamDFM <- dfm_trim(hamDFM, min_termfreq = 4, max_docfreq = 10)
head(hamDFM)
## Document-feature matrix of: 6 documents, 626 features (99.9% sparse).
topfeatures(hamDFM,20)
## anyth folder bad insur frank via french everyon
## 12 11 11 11 10 10 10 10
## love agre phone similar talk heard web notic
## 10 10 10 10 10 10 10 10
## debug lanc peltonen point
## 10 10 10 10
### Spam Wordcloud
set.seed(100)
textplot_wordcloud(spamDFM, min_count = 6, random_order = FALSE,
rotation = .25,
color = RColorBrewer::brewer.pal(8,"Dark2"))
### Ham Wordcloud
set.seed(100)
textplot_wordcloud(hamDFM, min_count = 6, random_order = FALSE,
rotation = .25,
color = RColorBrewer::brewer.pal(8,"Dark2"))