Project Summary:

This project builds a document classification model, based on supervised learning techniques, to distinguish spam from ham e-mail documents. The training and test data were obtained from the public data sets available at the location below:

https://spamassassin.apache.org/old/publiccorpus/

GitHub Location:

https://github.com/soumya2g/R-CUNY-MSDS/tree/master/DATA-607/E-mail%20Spam%20Detection%20Model

R Libraries:

Load necessary libraries -

library(kableExtra)
library(dplyr)
library(tm)
library(stringr)
library(ggplot2)
library(rvest)
library(R.utils)
library(quanteda)
library(readtext)

Project Goal :

The goal of this project is to build a supervised document classification solution that works with the already-labelled training data sets (spam vs. ham): build a corpus of documents, extract tokens, and build a document-feature matrix, following the structured workflow depicted below:

Wrapper functions :

I have tried to modularise the solution using a few wrapper functions:

Note: I used the rvest package to read the body section of the e-mail documents based on HTML tags. With this library, quite a few documents threw errors, so I had to create a list of the failing files and exclude them from the training data build process.

### Function to download all Spam and Ham Files 

downloadTAR <- function(filetype=NULL, url=NULL, rootfile=NULL){
  
  directoryReplace <- str_sub(rootfile,10,str_locate(rootfile,"\\.")[1,1]-1)
 
  myfile <- paste(url,rootfile,sep="")

  destfile <- paste(filetype,".tar.bz2", sep="")
    
  download.file(myfile, destfile= destfile)
    
  destTarFile <- paste(filetype,".tar", sep="")
  bunzip2(destfile,destname=destTarFile, overwrite=TRUE, remove=TRUE)
  untar(destTarFile,exdir = paste(getwd(),"/",filetype,sep = ""))

  filenamesDF <- data_frame(filetype,filename = str_trim(str_sub(untar(destTarFile, list = TRUE),nchar(directoryReplace)+2,-1)))

  return(filenamesDF)
}

### Function to build corpus

buildCorpus <- function(importtype=NULL, filenames=NULL){

  if (importtype == "Spam") {
    filePaths <- paste(getwd(),"/",importtype,"/",importtype,"/",filenames$filename, sep = "")
  }
  if (importtype == "Ham") {
    filePaths <- paste(getwd(),"/",importtype,"/easy_ham/",filenames$filename, sep = "")
  }

  tempEmailBody <- vector()
  tempEmailFrom <- vector()
  tempEmailSubject <- vector()
  docid <- vector()

  for(i in 1:length(filenames$filename)){
    if(filenames$filename[i] != "cmds" & nchar(filenames$filename[i])>10 & filenames$filename[i] != "0000.7b1b73cf36cf9dbc3d64e3f2ee2b91f1" &
       filenames$filename[i] != "0143.260a940290dcb61f9327b224a368d4af" & filenames$filename[i] != "00136.faa39d8e816c70f23b4bb8758d8a74f0" &
       filenames$filename[i] != "0231.7c6cc716ce3f3bfad7130dd3c8d7b072" & filenames$filename[i] != "0250.7c6cc716ce3f3bfad7130dd3c8d7b072")
    {
      con <- file(filePaths[i], open="rt")
      text <- readLines(con)
      if(length(text) > 1)
      {
        msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
        eMailBodyDoc <- msg
        emailSubject <- str_trim(str_replace_all(str_sub(unlist(str_extract_all(eMailBodyDoc,"Subject:.*")),10,-1),"  ",""))
        emailFrom <- str_trim(str_sub(unlist(str_extract_all(eMailBodyDoc,"From:.*")),7,-1))
        eMailBodyDoc <- eMailBodyDoc %>% str_replace_all("\n","") 
        eMailBodyDoc <- eMailBodyDoc %>% str_replace_all("<.*>","") 
      
        tempEmailBody[i] <- ifelse(length(eMailBodyDoc) == 0,"Body not available",eMailBodyDoc)
        tempEmailFrom[i] <- ifelse(length(emailFrom) == 0,"Sender not available",emailFrom)
        tempEmailSubject[i] <- ifelse(length(emailSubject) == 0,"Subject not available",emailSubject)
        docid[i] <- ifelse(nchar(filenames$filename[i]) == 0,"E-mail name not available",filenames$filename[i])
      }
      close(con)
    }

  }
  email_corp <- corpus(tempEmailBody, docvars = data_frame(Email_From = tempEmailFrom, Email_Subject = tempEmailSubject, Tag = importtype)) 
  email_corp <- corpus_subset(email_corp,!is.na(Email_From))
  docid <- docid[ifelse(!is.na(docid), TRUE,FALSE)]
  docnames(email_corp) <- docid 
  
  return(email_corp)
}

Training Data Preparation :

The 9 folders available in the source were decompressed and copied into two folders, one containing the “ham” and the other containing the “spam”. I found that there were some duplicated files in the original raw dataset. After removing the duplicates, I obtained 6,952 ham and 2,398 spam files, for a total of 9,350 files.

  1. Document Loading :

I downloaded the training data files from the source and extracted them into the appropriate local source directories using the wrapper function downloadTAR().

URL <- "https://spamassassin.apache.org/old/publiccorpus/"

spamFiles <- c("20021010_spam.tar.bz2","20030228_spam.tar.bz2","20030228_spam_2.tar.bz2")

hamFiles <- c("20021010_easy_ham.tar.bz2","20021010_hard_ham.tar.bz2","20030228_easy_ham.tar.bz2",
              "20030228_easy_ham_2.tar.bz2")

### Set Local Working directory
#setwd("C:/CUNY/Semester1 (Fall)/DATA 607/Project 4/Source/Training Data")
  1. Extract all the Spam Files from source :
## Extract all the Spam Files under Training Data

spamFileNames <- data_frame()

for(i in 1:length(spamFiles))
{
  spamFileNames <- rbind(spamFileNames,downloadTAR("Spam", URL, spamFiles[i]))
}

spamTrainCount <- spamFileNames %>% group_by(filetype) %>% summarise(file_count = n())
  1. Extract all the Ham Files from source :
## Extract all the Ham Files under Training Data

hamFileNames <- data_frame()

for(i in 1:length(hamFiles))
{
  hamFileNames <- rbind(hamFileNames,downloadTAR("Ham", URL, hamFiles[i]))
}

hamTrainCount <- hamFileNames %>% group_by(filetype) %>% summarise(file_count = n())

#### Training Data Summary

rbind(spamTrainCount,hamTrainCount) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
filetype file_count
Spam 2403
Ham 6707

Build Document Corpuses:

  1. Build Document Corpuses using Training Data set :
#### Build Spam Corpus and Summarise

spamEmailCorp <- buildCorpus(importtype = "Spam", filenames = spamFileNames)
summary(spamEmailCorp) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
Text Types Tokens Sentences Email_From Email_Subject Tag
0103.8c39bfed2079f865e9dfb75f4416a468 9 9 1 Sender not available Subject not available Spam
0107.f1d4194b57840ea6587b9a73ed88e075 1 1 1 Sender not available Subject not available Spam
0012.7bc8e619ad0264979edce15083e70a02 0 0 0 Sender not available Subject not available Spam
0114.c104ada3a249e1e1846c0cd156a303e9 0 0 0 Sender not available Subject not available Spam
0118.4be8b50c2a818c62b62e70c4b5456113 11 12 1 Sender not available Subject not available Spam
0121.772c3ccd1b6c1a2e0e2ec0356082c77b 0 0 0 Sender not available Subject not available Spam
0125.44381546181fc6c5d7ea59e917f232c5 11 13 1 Sender not available Subject not available Spam
0129.78a705ff6b3bde3395d067459e6e46e2 13 13 2 Sender not available Subject not available Spam
0132.7ac2141ed9a163f934ac65b3f59a2a03 0 0 0 Sender not available Subject not available Spam
0136.7e7d6adf293fa0a3dc56b3f796cf00d1 3 3 1 Sender not available Subject not available Spam
0140.a2bb669eaf743ed123fca884a40cfbd4 0 0 0 Sender not available Subject not available Spam
0147.65cf30538f09402e4d1bd4aa91d9532a 51 74 5 Sender not available Subject not available Spam
0016.f9c349935955e1ccc7626270da898445 0 0 0 Sender not available Subject not available Spam
0154.e39fc51ffdb9c2ecd480ce972078aeaa 0 0 0 Sender not available Subject not available Spam
0158.ff5dce5446d2ec91f0caffeffdd48852 9 9 1 Sender not available Subject not available Spam
0161.00e60d1a3478f1ae99ff49fbd4b30605 7 8 2 Sender not available Subject not available Spam
0165.6eedc001155da3cbd75a60eba2b19448 202 375 12 Sender not available Subject not available Spam
0169.bc6e1356af0602fb96dd3f721fb17c48 4 5 1 Sender not available Subject not available Spam
0172.e524e85cab354337018e1d0d2fc21ffd 15 15 2 Sender not available Subject not available Spam
0176.70022adaab1a9dfe64ae7588ffa5add9 0 0 0 Sender not available Subject not available Spam
0180.afdbcd7acb65828c217eea90ff92c3b0 0 0 0 Sender not available Subject not available Spam
0183.4aaadeb40e3362e71e3e4aba15624e3a 0 0 0 Sender not available Subject not available Spam
0187.e2178f6d01a70dfbdf9c84c4dcaf58dc 5 5 1 Sender not available Subject not available Spam
0020.4120dc06a0124a8688e96f8cff029113 38 46 5 Sender not available Subject not available Spam
0194.dd4dd86bdd8e1113889af9afaf299d6c 5 5 1 Sender not available Subject not available Spam
0201.9da0b5702a864a8ffd06cfb4c724f9c8 9 9 1 Sender not available Subject not available Spam
0003.4b3d943b8df71af248d12f8b2e7a224a 8 8 1 Sender not available Subject not available Spam
0206.806263422d55d38a151fe3b89d56192f 7 7 1 Sender not available Subject not available Spam
0022.4b5cf3c16feb88dd6932a8c46a41946c 4 4 1 Sender not available Subject not available Spam
0215.57c4f4d8e2f582088f8aca38239059f7 3 8 1 Sender not available Subject not available Spam
0216.feb2a8df9887bc2d84e80c9d2a8faf56 5 10 2 Sender not available Subject not available Spam
0220.15583875f5ef9e2cf6450ebc821f0dff 0 0 0 Sender not available Subject not available Spam
0271.24302cf2e759401d1f9975fe4fc1def9 0 0 0 Sender not available Subject not available Spam
0301.ad155a30cca1f9d16e75e8934030edae 9 9 1 Sender not available Subject not available Spam
0341.7c13df68bb4feae35d9ea86001a3ecad 11 11 2 Sender not available Subject not available Spam
0381.492ed1e5eed1b631560e2009be5b8c9a 3 8 1 Sender not available Subject not available Spam
0411.e6e37cbb02ad33b4e0ba5fb6caf2bbcf 17 19 1 Sender not available Subject not available Spam
0451.588b22df28f4036ff3895447afbcb7f3 3 8 1 Sender not available Subject not available Spam
0491.f47154f78397c57b14e05450a16745d5 5 5 1 Sender not available Subject not available Spam
0054.839a9c0a07f13718570da944986a898a 13 13 1 Sender not available Subject not available Spam
0058.abb13c5db31d26a58607aac809573ed4 6 9 2 Sender not available Subject not available Spam
0061.c148ebba16540e48c7aae2e3f733a8a3 0 0 0 Sender not available Subject not available Spam
0065.18d2edcf9aa0e940651b5fdd218ac019 0 0 0 Sender not available Subject not available Spam
0069.a0b6cfde0e477af7f406ee756ba53826 6 6 1 Sender not available Subject not available Spam
0072.f97a14d667569ebbc0502bb2c7beec27 0 0 0 Sender not available Subject not available Spam
0076.770f0e7b8378a47a945043434f6f43df 1 1 1 Sender not available Subject not available Spam
0080.77af9ca7f967f055062aade45001129e 4 4 1 Sender not available Subject not available Spam
0083.a042c7512d5db5f9fc1857fdc6bbdcc3 4 4 1 Sender not available Subject not available Spam
0087.1cbd88a0c1564cb5d6c9b12c8c4175d8 9 9 1 Sender not available Subject not available Spam
0010.7f5fb525755c45eb78efc18d7c9ea5aa 3 3 1 Sender not available Subject not available Spam
0094.3ba780eac7dce1c2b063cd1fc12738be 3 8 1 Sender not available Subject not available Spam
0098.01d2958ccb7c2e4c02d0920593962436 0 0 0 Sender not available Subject not available Spam
0001.bfc8d64d12b325ff385cca8d07b84288 0 0 0 Sender not available Subject not available Spam
0049.625bab436c7fc6299cfceeaa24e198ae 0 0 0 Sender not available Subject not available Spam
0500.2e8762b67913d1b07bc8da293448d27f 5 5 1 Sender not available Subject not available Spam
0488.6d41f6d7222978a3ee2b6cfbfce55a02 0 0 0 Sender not available Subject not available Spam
0492.f2d030fd71d7c3075626195b5c0b56f7 8 13 5 Sender not available Subject not available Spam
0495.a13bce4369913c929a48b073f2b320c9 8 8 1 Sender not available Subject not available Spam
0307.2e4dc0cdb1e3b49f0986c19c1f324224 0 0 0 Sender not available Subject not available Spam
0308.1245e8fa9e6092687b535e36b367d8fb 9 9 1 Sender not available Subject not available Spam
0309.2a74113b0330ea76cecd28571fc6f7fe 3 8 1 Sender not available Subject not available Spam
0032.081c3615bc9b91d09b6cbb9239ba8c99 9 10 1 Sender not available Subject not available Spam
0312.a0e7f2633bd0ceaddf16fba58be54778 0 0 0 Sender not available Subject not available Spam
0315.26ca39910895a935e2b8bca93a44ebfe 7 12 1 Sender not available Subject not available Spam
0319.e4a20802d12937998f3b3bf805362a3f 9 9 1 Sender not available Subject not available Spam
0323.badf0273f656afd0dfebaa63af1c81f6 3 3 1 Sender not available Subject not available Spam
0327.5df76bb4359800b5408821285677b5cf 0 0 0 Sender not available Subject not available Spam
0329.5c22249fa35fff050675e7df4433b89f 0 0 0 Sender not available Subject not available Spam
0332.b82bddb316d2e12418d6ea8791ce5896 0 0 0 Sender not available Subject not available Spam
0336.b864dd710e659f0ef5826dc4d80714f8 0 0 0 Sender not available Subject not available Spam
0340.8e191c37e2d30a639013203aacf60086 0 0 0 Sender not available Subject not available Spam
0344.8bbe5c7c8269a039761968a1b10a936a 2 7 1 Sender not available Subject not available Spam
0348.e0b89978fa806cf3e7fd3ba0869b3c65 54 75 5 Sender not available Subject not available Spam
0352.f7adb4aa267e50a8db1e4bcacfe863f3 9 9 1 Sender not available Subject not available Spam
0356.86a795300367f707a8b648e0c50253ad 0 0 0 Sender not available Subject not available Spam
0360.5f5fc66c831d845705efac502121308a 4 4 1 Sender not available Subject not available Spam
0364.8e5f3385c2deb2c0c32794b403851ec4 0 0 0 Sender not available Subject not available Spam
0368.3a53888c2f7fbe52a7293f223375c245 0 0 0 Sender not available Subject not available Spam
0372.216f90ef52558ed24402e192586a40e8 5 5 2 Sender not available Subject not available Spam
0376.d87b4313e6c43a986060d57a0b8515a6 2 15 1 Sender not available Subject not available Spam
0380.c4d530b5816543f4f1a23b8ce0d281f5 0 0 0 Sender not available Subject not available Spam
0383.5b89d5a9c0152070a77e133734f7cd83 2 2 1 Sender not available Subject not available Spam
0389.ed4ca8aceef91808c783909351c7bdb4 0 0 0 Sender not available Subject not available Spam
0393.d3a4d296a35c6a7f39429247c007eeae 1 69 1 Sender not available Subject not available Spam
0397.c02eba1386b00d640c954e5117dd1aa0 12 12 2 Sender not available Subject not available Spam
0005.1f42bb885de0ef7fc5cd09d34dc2ba54 7 7 1 Sender not available Subject not available Spam
0405.18a5c3d971e1def2c3b4a2df122f3583 2 2 1 Sender not available Subject not available Spam
0409.09cb28cd8753bff06fc8a547c3ed8fe2 0 0 0 Sender not available Subject not available Spam
0042.21cc985cc36d931916863aed24de8c27 0 0 0 Sender not available Subject not available Spam
0235.77e9a7e398ce81359c08e64bf20e9825 5 5 1 Sender not available Subject not available Spam
0246.3b997087302d48ff57ab5afb3d400d5b 3 3 1 Sender not available Subject not available Spam
0245.39c15852204971c72e8d89f9f3f9bb38 4 4 1 Sender not available Subject not available Spam
0413.4c74110f6640067c2172a04543dea670 0 0 0 Sender not available Subject not available Spam
0260.737eefb83e7eedbd531117c273c56241 7 7 1 Sender not available Subject not available Spam
0256.ad88c1a165392a509a8b0b8df6d56cbd 0 0 0 Sender not available Subject not available Spam
0265.1120a7d868b23e83b91ad00ec8b79e08 7 7 1 Sender not available Subject not available Spam
0028.83a43dd97923463030349506a56226c1 8 8 1 Sender not available Subject not available Spam
0274.85756abb8d0bcfe267e464a2f33ce686 4 5 2 Sender not available Subject not available Spam
0279.4ef122899a70a2225ffb9b5c54fde1fc 9 9 1 Sender not available Subject not available Spam
0292.3e12964912377bd9b52d223e37812e56 0 0 0 Sender not available Subject not available Spam
#### Build Ham Corpus and Summarise

hamEmailCorp <- buildCorpus(importtype = "Ham", filenames = hamFileNames)
summary(hamEmailCorp) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
Text Types Tokens Sentences Email_From Email_Subject Tag
1011.82f644586fced13704dd79e22c3d8fb9 0 0 0 fork-admin@xent.com [mailto:fork-admin@xent.com] On Behalf Of Re: The Curse of India’s Socialism Ham
1051.cf81a19208b703f18497a0d6fedb1f13 2 2 1 Sender not available Subject not available Ham
1091.0bade8676340d304cae87dad02efa8ce 13 14 1 Brent Welch <welch@panasas.com>; Subject not available Ham
1121.51f7e5e557bde451a6b36e527211ed04 11 13 1 Sender not available Subject not available Ham
1161.9bcd69bccfeb05378e3e36fa62b16f7d 15 16 1 Sender not available Subject not available Ham
0121.b475478456e52de66ef0b0fb501bbfd3 3 3 1 Sender not available Subject not available Ham
1231.9a7db322df8f2bdf4eeb2d589cb51e34 3 5 1 Hal DeVore <haldevore@acm.org>; Subject not available Ham
1271.1af7c90a1459165ff18d621de40239c5 8 8 1 Sender not available Subject not available Ham
1301.7d4abb56d43695d968aafd6f15f5bbb3 9 9 1 Sender not available Subject not available Ham
1341.91bc30d50566e71807217c8977f7a793 8 8 1 Sender not available Subject not available Ham
1381.3c5527db01789ad42005006ac2ed2fcc 8 8 1 “” Angles " Puglisi" <angles@aminvestments.com>; some (null) eyecandy packages Ham
1411.a455fcdcc40a1a29551cc0153a9450bf 3 3 1 Sender not available Subject not available Ham
1451.64eda615fc56b4bbfdce6291a59c12f4 0 0 0 Sender not available Subject not available Ham
1491.2e82a3803e51f420c6398d963052469f 2 2 1 Sender not available Subject not available Ham
1521.692de69e480a819f6d32578f93fca74b 10 11 1 Sender not available Subject not available Ham
0191.314e2f68086989044e631e347a03b979 7 7 1 Sender not available Subject not available Ham
0231.1bc071b16de2cc1e2adab334be65f7c6 14 14 1 Sender not available Subject not available Ham
0271.349d737f101674586c61996593772a63 16 20 1 Sender not available Subject not available Ham
0311.de3984f9da9dba841ba515681fa065a6 0 0 0 Sender not available Subject not available Ham
0351.a7381397d31e8511581dc5cd59f39959 12 13 2 Sender not available Subject not available Ham
0391.e1f15b5f5a6dbbb8cde6571055be3127 13 14 2 Sender not available Subject not available Ham
0431.26f19fa47fab85e813b1aba8ff6139d6 16 17 1 Sender not available Subject not available Ham
0471.55f1c6122fd60d151c2c42182ecb734a 17 18 1 Sender not available Subject not available Ham
0511.7c59aca5d737ae0f1a94b8e08d5def67 9 9 1 Sender not available Subject not available Ham
0551.5bdfb0c299c60e442e39346dc08bad68 12 13 1 Sender not available Subject not available Ham
0591.f533bea095d75cc4dc282bdcba69072a 4 4 1 fork-admin@xent.com [mailto:fork-admin@xent.com] On Behalf Of Re: The Big Jump Ham
0631.6a6516bb19c38ae705d06f9518231f49 0 0 0 Sender not available Subject not available Ham
0671.ea82754ba3b836e43d376b32c07b79f0 12 14 1 Sender not available Subject not available Ham
0711.27203d4f43e71f7e1ced0cdd7f8685c8 4 4 1 Sender not available Subject not available Ham
0751.58282452f1adc8ad703ddc4cf12c2e37 0 0 0 “R. A. Hettinga” <rah@shipwright.com>; Subject not available Ham
0791.0b633957da3fa40b511d8e56ad877722 0 0 0 “John Hall” <johnhall@evergo.net>; Subject not available Ham
0831.0162ac7b4cc1c62fb35803bc4e8db70d 11 13 1 Sender not available Subject not available Ham
0871.79be1926ade2b8fc591f9f51abf66224 4 4 1 Sender not available Subject not available Ham
0911.dcc71630eed7821469e4c26e5b768aee 0 0 0 Sender not available Subject not available Ham
0951.542debe6e20315751c6e2c9bdedff85d 7 8 1 “Mr. FoRK” <fork_list@hotmail.com>; Documentum Acquires E-Room, Melding Content, Collaboration Ham
0991.f273a70df275a44e46f4544897eaee23 10 11 1 Sender not available Subject not available Ham
1561.b968a0929d29009dbb76603323c1862f 6 9 3 spamassassin-talk-admin@example.sourceforge.net RE: [SAtalk] 2.41/2.50 spamd/spamc problem Ham
1601.e586e85a3d75cc48f9b913f244d52632 5 5 1 Sender not available Subject not available Ham
1641.7555c5920365e6315e3f20d83211d558 5 5 1 “rODbegbie” <rOD@arsecandle.org>; Re: [SAtalk] spamd error messages Ham
1681.b17d16768d9543099dd7fe511f14ca9e 2 2 1 Sender not available Subject not available Ham
1721.2f654b5e99867bebf86ebb0280fb8e48 13 13 1 bmord@icon-nicholson.com [mailto:bmord@icon-nicholson.com] use of base image / delta image for automated recovery from Ham
1761.eeb706ce24cbbf2cd21648a4781a1464 12 12 2 Sender not available Subject not available Ham
1801.906dd11cca6bee22c6843afb597c87a3 8 8 1 Sender not available Subject not available Ham
1841.780eecf7f0db1db00bd0a6248de51260 4 4 1 Sender not available Subject not available Ham
1881.7a4b4e4c68a852fb5fb5876ec76899ab 5 5 1 Sender not available Subject not available Ham
1921.fa34fb56fd0a04ce46ac71d176e6ec55 4 4 1 Sender not available Subject not available Ham
1981.2482770c04473ab0060a30b04366cf37 12 21 1 Sender not available Subject not available Ham
2011.9604a6dfecb37414590009d7c0c04f30 12 21 1 Sender not available Subject not available Ham
2051.e7d9645f8b4b87954a4445b9129a7ca5 8 13 1 Sender not available Subject not available Ham
2091.1a1d629678aa1ab8953772a2cd006183 8 13 1 Sender not available Subject not available Ham
2131.87a32fea62016d67931fa6138ad51748 8 13 1 Sender not available Subject not available Ham
0051.9281d3f8a3faf47d09a7fafdf2caf26e 4 4 1 Sender not available Subject not available Ham
0091.3bdd7b578973ee005733480a8b6c9b54 14 16 2 Sender not available Subject not available Ham
2181.3f950d2c7e806d81476ecbd529759d0e 10 15 1 Sender not available Subject not available Ham
2221.13e25582abf522bfe1afc5c9ad180bde 8 13 1 Sender not available Subject not available Ham
2261.db40430b71ffa3e8f18a564181567788 8 13 1 Sender not available Subject not available Ham
2301.12cd963ea74e881168b6bb865dedb892 6 9 1 Sender not available Subject not available Ham
2341.23fc100bad93aab582c5fe870225bac2 8 13 1 Sender not available Subject not available Ham
2381.b2da2e96c499fbf2c4cc0ff9491cf337 8 13 1 Sender not available Subject not available Ham
2421.aa75fb78c4f596e7ee91808fbdc34ab8 8 13 1 Sender not available Subject not available Ham
2461.20f3b1b3e7e2bd1bd6a016f9f09374d3 6 9 1 Sender not available Subject not available Ham
2501.36311f6ceac129fa31540555ffe3cc97 8 13 1 Sender not available Subject not available Ham
2541.892b976061b9ea684b9b3009d0b87dab 8 13 1 Sender not available Subject not available Ham
0260.b68400a28ee29cb2f24149a03db1fd9e 6 6 2 Sender not available Subject not available Ham
0264.a1183a59e4f0a71e80378d9404a3212f 6 6 2 Sender not available Subject not available Ham
0268.77ef28e27a9ee085646f260418072111 4 4 1 Sender not available Subject not available Ham
0272.c3e06e3aa72f63dee68faaed6fbaaa1a 3 3 1 Sender not available Subject not available Ham
0276.394eddb373972d985cabf6f63953a3d0 14 15 1 Sender not available Subject not available Ham
0280.6dca279b3e6aa252197d8439841032b4 10 15 1 Sender not available Subject not available Ham
0284.4af1f0641ea6aae6a9645dcbeabe95e9 2 2 1 Sender not available Subject not available Ham
0288.3ae040d2c993ea997470df41f31aebcb 12 14 1 Sender not available Subject not available Ham
0292.6599ff4b0c1593297dd17247919b3fcf 1 1 1 Sender not available Subject not available Ham
0296.42216a75e0256510b216eaba6893d40d 14 14 1 Sender not available Subject not available Ham
0300.2e30b3bffb4f2887df203c197d11e936 2 2 1 Sender not available Subject not available Ham
0304.73bb5ec3f02f4db15750531e226b1cb8 3 3 1 “Hunt, Bryan” <B.Hunt@emuse-tech.com>; OT: RE: [ILUG] Newby to Linux looking for information on cvs Ham
0308.5c3218cf2a6260c6178cbea1b9e345f7 3 3 1 Sender not available Subject not available Ham
0312.d17a25919e7985a270c6fdee37a8f83e 15 19 1 Sender not available Subject not available Ham
0316.0b7a8e1acbd09115574dc58120d93000 10 13 2 Sender not available Subject not available Ham
0320.6c54ea1bb991c6fae395588219cfce37 11 12 1 Sender not available Subject not available Ham
0324.d425c24d444091807e283e66449853b0 4 4 1 Sender not available Subject not available Ham
0328.dd38a12b955002695e7a0ec81129b043 10 11 1 Sender not available Subject not available Ham
0332.daed28f33b65dd9f1c91fa3737d21340 4 4 1 Sender not available Subject not available Ham
0336.d8d6d93ff9918e7a6b4b83a5bda3043e 13 15 1 Sender not available Subject not available Ham
0340.aae9e33fb151ae061354b8cfe9f90b3d 11 13 1 Sender not available Subject not available Ham
0345.c30e766af45337ac505a52ad592ab954 2 2 1 Sender not available Subject not available Ham
0349.1f77fea2fe759b72c8f29740558ffae8 10 15 1 Sender not available Subject not available Ham
0353.f4c84bdca20621cd5aa9bf8d6210e0b4 3 3 1 Sender not available Subject not available Ham
0462.6b066ed01f1856371a7ca52580774a20 8 8 1 Sender not available Subject not available Ham
0466.831f4b97805c8e03cf6630716309e89e 10 11 1 Sender not available Subject not available Ham
0470.d5ad7286cd913b0cfdff5b954b9338ac 14 17 1 Sender not available Subject not available Ham
0474.0b5f82aa1324baf9cf718849c48d679e 0 0 0 A guy who models plasma all day… Re: Electric car an Edsel… Ham
0477.b7958d4d9cb312c16965859705826a7e 5 6 1 Sender not available Subject not available Ham
0049.bda43370915afa1f557f7edab6913e04 13 14 1 Sender not available Subject not available Ham
0485.4657c96dd864d02d6273cb268b631015 9 9 1 Sender not available Subject not available Ham
0488.8fddac859ba93d27a041fe770be65d29 0 0 0 “Jim Whitehead” <ejw@cse.ucsc.edu>; Subject not available Ham
0492.3aa3aaa0ac9343fd1aa11864e8c281b1 11 12 1 Sender not available Subject not available Ham
0496.397e015dabee7ac3c6e655ad9bf66052 15 15 1 Sender not available Subject not available Ham
0500.26e81584fd9c739be4898acd4870143e 16 16 2 Sender not available Subject not available Ham
0503.5364d68940343b854595410487aaa1ba 9 9 1 Sender not available Subject not available Ham
0507.3e5a5811fbfca49dc665f982e81ea271 3 4 1 Sender not available Subject not available Ham

Apply Tokenization :

  1. Tokenize the Spam and Ham Corpuses :

While tokenizing the corpora with quanteda’s tokens() function, I removed numbers and punctuation symbols.

### Spam Tokens
spamTokens <- tokens(spamEmailCorp,remove_punct = TRUE,remove_numbers = TRUE)
head(spamTokens) 
## tokens from 6 documents.
## 0103.8c39bfed2079f865e9dfb75f4416a468 :
## [1] "This"       "is"         "a"          "multi-part" "message"   
## [6] "in"         "MIME"       "format"    
## 
## 0107.f1d4194b57840ea6587b9a73ed88e075 :
## character(0)
## 
## 0012.7bc8e619ad0264979edce15083e70a02 :
## character(0)
## 
## 0114.c104ada3a249e1e1846c0cd156a303e9 :
## character(0)
## 
## 0118.4be8b50c2a818c62b62e70c4b5456113 :
##  [1] "Whiter"   "teeth"    "and"      "a"        "brighter" "smile"   
##  [7] "are"      "just"     "a"        "click"    "away"    
## 
## 0121.772c3ccd1b6c1a2e0e2ec0356082c77b :
## character(0)
### Ham Token
hamTokens <- tokens(hamEmailCorp,remove_punct = TRUE,remove_numbers = TRUE)
head(hamTokens) 
## tokens from 6 documents.
## 1011.82f644586fced13704dd79e22c3d8fb9 :
## character(0)
## 
## 1051.cf81a19208b703f18497a0d6fedb1f13 :
## [1] "Hi"
## 
## 1091.0bade8676340d304cae87dad02efa8ce :
## [1] "On"     "Fri"    "Sep"    "at"     "Robert" "Elz"    "wrote" 
## 
## 1121.51f7e5e557bde451a6b36e527211ed04 :
## [1] "On"       "Thu"      "at"       "Matthias" "Saou"     "wrote"   
## 
## 1161.9bcd69bccfeb05378e3e36fa62b16f7d :
##  [1] "Hi"        "I'm"       "building"  "an"        "rpm"      
##  [6] "for"       "the"       "resin"     "webserver" "and"      
## [11] "I"         "basically" "want"      "to"       
## 
## 0121.b475478456e52de66ef0b0fb501bbfd3 :
## [1] "Hi"  "All"
  1. Sample context based search on Tokens :
### Keyword in Context based search for "Spam"

kwic(spamEmailCorp, "spam", valuetype = "regex") %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
docname from to pre keyword post
00161.ae33257753c9bdaaadc9221347868496 3 3 HI , zzzz@spamassassin.taint.org today ,
00354.dca4b8984863a76ffd01a33888498288 5 5 New Account For : zzzz@spamassassin.taint.org
00462.868771c8074e480f540a1d2e6a5ac7cb 5 5 New Account For : zzzz@spamassassin.taint.org
00824.eec96f74d95afedbe574498808d29395 4 4 This is NOT spam . = 20
00952.1a3c371c56be9de3bfb258b93af71649 3 3
DeathToSpamDeathToSpamDeathToSpam
00953.906b4905eb02cfb9a093162f3c143252 3 3
DeathToSpamDeathToSpamDeathToSpam
01073.bfc8a301fd274efde213c0249d5a85b7 4 4 This is not spam . Thanks for posting to
01380.fa9b4e89ba485def2921e01ae9fb7671 165 165 of unsolicited email broadcasting ( spam ) . . . Therefore
01380.fa9b4e89ba485def2921e01ae9fb7671 305 305 clean the databases and be spam free . Thank you again

Build Document Feature Matrix :

  1. Build Document Feature Matrix :

In order to perform statistical analysis such as document scaling, we must extract a matrix associating values for certain features with each document. I have used quanteda library’s dfm function to produce such a matrix. “dfm” is short for document-feature matrix, and always refers to documents in rows and “features” as columns. We fix this dimensional orientation because it is standard in data analysis to have a unit of analysis as a row, and features or variables pertaining to each unit as columns. We call them “features” rather than terms, because features are more general than terms: they can be defined as raw terms, stemmed terms, the parts of speech of terms, terms after stopwords have been removed, or a dictionary class to which a term belongs.

### Document Feature Matrix: Spam

spamDFM <- dfm(spamEmailCorp, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)

spamDFM <- dfm_trim(spamDFM, min_termfreq = 4, max_docfreq = 10)

head(spamDFM)
## Document-feature matrix of: 6 documents, 325 features (99.9% sparse).
topfeatures(spamDFM,20)
##       à       ª     age      2f    sent    sell   pleas     day    name 
##      16      16      14      13      12      12      11      11      10 
##    chat    bank   month      us  moment    fill     low qualifi  smoker 
##      10      10      10      10      10      10      10      10      10 
## termlif  comput 
##      10      10
### Document Feature Matrix: Ham

hamDFM <- dfm(hamEmailCorp, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)

hamDFM <- dfm_trim(hamDFM, min_termfreq = 4, max_docfreq = 10)

head(hamDFM)
## Document-feature matrix of: 6 documents, 626 features (99.9% sparse).
topfeatures(hamDFM,20) 
##    anyth   folder      bad    insur    frank      via   french  everyon 
##       12       11       11       11       10       10       10       10 
##     love     agre    phone  similar     talk    heard      web    notic 
##       10       10       10       10       10       10       10       10 
##    debug     lanc peltonen    point 
##       10       10       10       10

Wordcloud Visualization :

  1. Build Wordclouds to identify top occurring tokens in Spam and Ham data sets :
### Spam Wordcloud
set.seed(100)
textplot_wordcloud(spamDFM, min_count = 6, random_order = FALSE,
                   rotation = .25, 
                   color = RColorBrewer::brewer.pal(8,"Dark2"))

### Ham Wordcloud
set.seed(100)
textplot_wordcloud(hamDFM, min_count = 6, random_order = FALSE,
                   rotation = .25, 
                   color = RColorBrewer::brewer.pal(8,"Dark2"))