Exploring a Spam-Ham Classifier

Overview

Classification is an important branch of machine learning that involves categorizing items. A common example that’s used to demonstrate classification is a ham-spam classifier, which attempts to differentiate spam emails from regular emails.

Load libraries.

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1

## Warning: package 'tidyr' was built under R version 4.1.3

## Warning: package 'dplyr' was built under R version 4.1.3

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(tidytext)
library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.1.3

## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --

## v broom        0.8.0     v rsample      0.1.1
## v dials        0.1.1     v tune         0.2.0
## v infer        1.0.0     v workflows    0.2.6
## v modeldata    0.1.1     v workflowsets 0.2.1
## v parsnip      0.2.1     v yardstick    0.0.9
## v recipes      0.2.0

## Warning: package 'broom' was built under R version 4.1.3

## Warning: package 'dials' was built under R version 4.1.3

## Warning: package 'modeldata' was built under R version 4.1.3

## Warning: package 'parsnip' was built under R version 4.1.3

## Warning: package 'recipes' was built under R version 4.1.3

## Warning: package 'rsample' was built under R version 4.1.3

## Warning: package 'tune' was built under R version 4.1.3

## Warning: package 'workflows' was built under R version 4.1.3

## Warning: package 'workflowsets' was built under R version 4.1.3

## Warning: package 'yardstick' was built under R version 4.1.3

## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Dig deeper into tidy modeling with R at https://www.tmwr.org

library(rvest)

## 
## Attaching package: 'rvest'

## The following object is masked from 'package:readr':
## 
##     guess_encoding

library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

## The following object is masked from 'package:purrr':
## 
##     transpose

library(R.utils)

## Loading required package: R.oo

## Loading required package: R.methodsS3

## R.methodsS3 v1.8.1 (2020-08-26 16:20:06 UTC) successfully loaded. See ?R.methodsS3 for help.

## R.oo v1.24.0 (2020-08-26 16:11:58 UTC) successfully loaded. See ?R.oo for help.

## 
## Attaching package: 'R.oo'

## The following object is masked from 'package:R.methodsS3':
## 
##     throw

## The following object is masked from 'package:recipes':
## 
##     check

## The following object is masked from 'package:dials':
## 
##     finalize

## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods

## The following objects are masked from 'package:base':
## 
##     attach, detach, load, save

## R.utils v2.11.0 (2021-09-26 08:30:02 UTC) successfully loaded. See ?R.utils for help.

## 
## Attaching package: 'R.utils'

## The following object is masked from 'package:tidyr':
## 
##     extract

## The following object is masked from 'package:utils':
## 
##     timestamp

## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, inherits, isOpen, nullfile, parse,
##     warnings

library(tm)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

library(SnowballC)
library(caTools)

## Warning: package 'caTools' was built under R version 4.1.3

Get corpus links by scraping .

url <- "https://spamassassin.apache.org/old/publiccorpus/"
links <- read_html(url) %>%
            html_elements(xpath='//a') %>% 
            html_attr("href")
corpus_links <- links[6:14]

Download data into a temp directory.

tmpdir <- tempdir()

for (link in corpus_links) {
  download.file(paste0(url,link), link)
  unzipped <- bunzip2(link)
  folder_name <- gsub("[.]tar","", unzipped)  # will be a subdirectory of the temp directory, tmpdir.
  untar(unzipped, exdir=paste0(tmpdir,"\\",folder_name), compressed = 'gzip')
}

## Warning: untar(compressed=) is deprecated

## Warning: untar(compressed=) is deprecated

## Warning: untar(compressed=) is deprecated

## Warning: untar(compressed=) is deprecated

## Warning: untar(compressed=) is deprecated

## Warning: untar(compressed=) is deprecated

## Warning: untar(compressed=) is deprecated

## Warning: untar(compressed=) is deprecated

## Warning: untar(compressed=) is deprecated

my_files <- list.files(tmpdir)
my_files

##  [1] "20021010_easy_ham"   "20021010_hard_ham"   "20021010_spam"      
##  [4] "20030228_easy_ham"   "20030228_easy_ham_2" "20030228_hard_ham"  
##  [7] "20030228_spam"       "20030228_spam_2"     "20050311_spam_2"    
## [10] "file54d81f667cb6"    "file54d82ccb5b82"    "file54d842413abc"

#unlink(tmpdir, recursive = TRUE, force=TRUE)

get_parts is a function for extracting body of email.

get_parts <- function(email) {
  
  Encoding(email) <- "UTF-8"
  email <- iconv(email, "UTF-8", "UTF-8",sub='')

  parts = unlist(regmatches(email, regexpr("\n\n", email), invert = TRUE))   # splits into header and body

  return (parts)
}

makedf <- function(folder_path, email_tag) { # folder path: paste0(tmpdir,"\\20021010_easy_ham\\easy_ham\\0001.ea7e79d3153e7469e7a9c3e0af6a357e")
  #folder_path:  paste0(tmpdir,"\\20021010_easy_ham\\easy_ham\\")
  
  
  files.names <- list.files(folder_path) 
  headers=c()
  body=c()
  tag=c()
  
  for (file.name in files.names) {
    file.path <- paste0(folder_path,"\\",file.name)
    file.content <- read_file(file.path)
    email_parts <- get_parts(file.content)
    
    headers=c(headers, email_parts[1])
    body=c(body, email_parts[2])
    tag=c(tag, email_tag)
  }
  
  
  df <- data.frame(headers=headers, body=body, tag=tag)  # tag is either "ham" or "spam"
  return (df)
}

easyham1.df <- makedf(paste0(tmpdir,"\\20021010_easy_ham\\easy_ham\\"),"ham")
easyham2.df <- makedf(paste0(tmpdir,"\\20030228_easy_ham\\easy_ham\\"),"ham")
easyham3.df <- makedf(paste0(tmpdir,"\\20030228_easy_ham_2\\easy_ham_2\\"),"ham")

spam1.df <- makedf(paste0(tmpdir,"\\20021010_spam\\spam\\"), "spam") 
spam2.df <- makedf(paste0(tmpdir,"\\20030228_spam\\spam\\"), "spam") 
spam3.df <- makedf(paste0(tmpdir,"\\20030228_spam_2\\spam_2\\"), "spam") 
spam4.df <- makedf(paste0(tmpdir,"\\20050311_spam_2\\spam_2\\"), "spam")

ham_count = nrow(easyham1.df)+nrow(easyham2.df)+nrow(easyham3.df)  #6453
spam_count = nrow(spam1.df)+nrow(spam2.df)+nrow(spam3.df)+nrow(spam4.df)  # 3797
print(paste0("No. of ham emails: ", ham_count, ", No. spam emails: ", spam_count))

## [1] "No. of ham emails: 6453, No. spam emails: 3797"

We can see that there are fewer spam emails than ham emails. Create an even data set, for balance.

easyham.df <- rbind(easyham1.df, easyham2.df, easyham3.df)
spam.df <- rbind(spam1.df,spam2.df,spam3.df,spam4.df)
df <- rbind(easyham.df[1:spam_count, ], spam.df)

set.seed(70)
rows <- sample(nrow(df))
df <- df[rows, ]

Now df is a data frame that contains all the emails, shuffled. Perform a series of cleaning steps of the text, including removing punctuation and numbers.

df <- df %>% 
  unite(email, c("headers","body"), sep="\n\n", remove=FALSE) # unite headers and body column into email column

corpus <- Corpus(VectorSource(df[,c(1)]))

corpus <- corpus %>% 
    tm_map(PlainTextDocument)  %>%
    tm_map(tolower) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeWords, c(stopwords("english"))) %>%
    tm_map(removeNumbers) %>%
    tm_map(stemDocument) %>%
    tm_map(stripWhitespace)

## Warning in tm_map.SimpleCorpus(., PlainTextDocument): transformation drops
## documents

## Warning in tm_map.SimpleCorpus(., tolower): transformation drops documents

## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents

## Warning in tm_map.SimpleCorpus(., removeWords, c(stopwords("english"))):
## transformation drops documents

## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops documents

## Warning in tm_map.SimpleCorpus(., stemDocument): transformation drops documents

## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents

Create document-term matrix, where each row represents an email and each column is a word that appears in an email. This can be a very sparse matrix, so remove the words that are very rare amongst all the emails.

dtm.frequencies = DocumentTermMatrix(corpus)
dtm.sparse = removeSparseTerms(dtm.frequencies, 0.995)
df.sparse = as.data.frame(as.matrix(dtm.sparse))

colnames(df.sparse) = make.names(colnames(df.sparse))

df.sparse$tag = df$tag

split = sample.split(df.sparse$tag, SplitRatio = 0.7)

train_set = subset(df.sparse, split==TRUE)

test_set = subset(df.sparse, split==FALSE)

There are different types of models used for classification, like logistic regression and SVM. Here we use RandomForest.

library(randomForest)

## Warning: package 'randomForest' was built under R version 4.1.3

## randomForest 4.7-1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

train_set$tag = as.factor(train_set$tag)

test_set$tag = as.factor(test_set$tag)


RF_model = randomForest(tag ~ ., data=train_set)

predictRF = predict(RF_model, newdata=test_set)

table(test_set$tag, predictRF)

##       predictRF
##         ham spam
##   ham  3745    4
##   spam 3704   73

Evaluate accuracy.

cm <- table(test_set$tag, predictRF)
accuracy <- (cm[1]+cm[2,][2])/(sum(cm))
print(paste0("Accuracy: ", accuracy))

## [1] "Accuracy: 0.507307998937018"

Exploring a Spam-Ham Classifier

Tora Mullings

4/19/2022

Overview

Conclusion

Sources