Overview

Classification is an important branch of machine learning that involves categorizing items. A common example that’s used to demonstrate classification is a ham-spam classifier, which attempts to differentiate spam emails from regular emails.

Load libraries.

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidytext)
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.1.3
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom        0.8.0     v rsample      0.1.1
## v dials        0.1.1     v tune         0.2.0
## v infer        1.0.0     v workflows    0.2.6
## v modeldata    0.1.1     v workflowsets 0.2.1
## v parsnip      0.2.1     v yardstick    0.0.9
## v recipes      0.2.0
## Warning: package 'broom' was built under R version 4.1.3
## Warning: package 'dials' was built under R version 4.1.3
## Warning: package 'modeldata' was built under R version 4.1.3
## Warning: package 'parsnip' was built under R version 4.1.3
## Warning: package 'recipes' was built under R version 4.1.3
## Warning: package 'rsample' was built under R version 4.1.3
## Warning: package 'tune' was built under R version 4.1.3
## Warning: package 'workflows' was built under R version 4.1.3
## Warning: package 'workflowsets' was built under R version 4.1.3
## Warning: package 'yardstick' was built under R version 4.1.3
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Dig deeper into tidy modeling with R at https://www.tmwr.org
library(rvest)
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(R.utils)
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.8.1 (2020-08-26 16:20:06 UTC) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.24.0 (2020-08-26 16:11:58 UTC) successfully loaded. See ?R.oo for help.
## 
## Attaching package: 'R.oo'
## The following object is masked from 'package:R.methodsS3':
## 
##     throw
## The following object is masked from 'package:recipes':
## 
##     check
## The following object is masked from 'package:dials':
## 
##     finalize
## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods
## The following objects are masked from 'package:base':
## 
##     attach, detach, load, save
## R.utils v2.11.0 (2021-09-26 08:30:02 UTC) successfully loaded. See ?R.utils for help.
## 
## Attaching package: 'R.utils'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following object is masked from 'package:utils':
## 
##     timestamp
## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, inherits, isOpen, nullfile, parse,
##     warnings
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(SnowballC)
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.3

Get corpus links by scraping .

url <- "https://spamassassin.apache.org/old/publiccorpus/"
links <- read_html(url) %>%
            html_elements(xpath='//a') %>% 
            html_attr("href")
corpus_links <- links[6:14]

Download data into a temp directory.

tmpdir <- tempdir()

for (link in corpus_links) {
  download.file(paste0(url,link), link)
  unzipped <- bunzip2(link)
  folder_name <- gsub("[.]tar","", unzipped)  # will be a subdirectory of the temp directory, tmpdir.
  untar(unzipped, exdir=paste0(tmpdir,"\\",folder_name), compressed = 'gzip')
}
## Warning: untar(compressed=) is deprecated

## Warning: untar(compressed=) is deprecated

## Warning: untar(compressed=) is deprecated

## Warning: untar(compressed=) is deprecated

## Warning: untar(compressed=) is deprecated

## Warning: untar(compressed=) is deprecated

## Warning: untar(compressed=) is deprecated

## Warning: untar(compressed=) is deprecated

## Warning: untar(compressed=) is deprecated
my_files <- list.files(tmpdir)
my_files
##  [1] "20021010_easy_ham"   "20021010_hard_ham"   "20021010_spam"      
##  [4] "20030228_easy_ham"   "20030228_easy_ham_2" "20030228_hard_ham"  
##  [7] "20030228_spam"       "20030228_spam_2"     "20050311_spam_2"    
## [10] "file54d81f667cb6"    "file54d82ccb5b82"    "file54d842413abc"
#unlink(tmpdir, recursive = TRUE, force=TRUE)

get_parts is a function for extracting body of email.

get_parts <- function(email) {
  
  Encoding(email) <- "UTF-8"
  email <- iconv(email, "UTF-8", "UTF-8",sub='')

  parts = unlist(regmatches(email, regexpr("\n\n", email), invert = TRUE))   # splits into header and body

  return (parts)
}
makedf <- function(folder_path, email_tag) { # folder path: paste0(tmpdir,"\\20021010_easy_ham\\easy_ham\\0001.ea7e79d3153e7469e7a9c3e0af6a357e")
  #folder_path:  paste0(tmpdir,"\\20021010_easy_ham\\easy_ham\\")
  
  
  files.names <- list.files(folder_path) 
  headers=c()
  body=c()
  tag=c()
  
  for (file.name in files.names) {
    file.path <- paste0(folder_path,"\\",file.name)
    file.content <- read_file(file.path)
    email_parts <- get_parts(file.content)
    
    headers=c(headers, email_parts[1])
    body=c(body, email_parts[2])
    tag=c(tag, email_tag)
  }
  
  
  df <- data.frame(headers=headers, body=body, tag=tag)  # tag is either "ham" or "spam"
  return (df)
}
easyham1.df <- makedf(paste0(tmpdir,"\\20021010_easy_ham\\easy_ham\\"),"ham")
easyham2.df <- makedf(paste0(tmpdir,"\\20030228_easy_ham\\easy_ham\\"),"ham")
easyham3.df <- makedf(paste0(tmpdir,"\\20030228_easy_ham_2\\easy_ham_2\\"),"ham")
spam1.df <- makedf(paste0(tmpdir,"\\20021010_spam\\spam\\"), "spam") 
spam2.df <- makedf(paste0(tmpdir,"\\20030228_spam\\spam\\"), "spam") 
spam3.df <- makedf(paste0(tmpdir,"\\20030228_spam_2\\spam_2\\"), "spam") 
spam4.df <- makedf(paste0(tmpdir,"\\20050311_spam_2\\spam_2\\"), "spam") 
ham_count = nrow(easyham1.df)+nrow(easyham2.df)+nrow(easyham3.df)  #6453
spam_count = nrow(spam1.df)+nrow(spam2.df)+nrow(spam3.df)+nrow(spam4.df)  # 3797
print(paste0("No. of ham emails: ", ham_count, ", No. spam emails: ", spam_count))
## [1] "No. of ham emails: 6453, No. spam emails: 3797"

We can see that there are fewer spam emails than ham emails. Create an even data set, for balance.

easyham.df <- rbind(easyham1.df, easyham2.df, easyham3.df)
spam.df <- rbind(spam1.df,spam2.df,spam3.df,spam4.df)
df <- rbind(easyham.df[1:spam_count, ], spam.df)

set.seed(70)
rows <- sample(nrow(df))
df <- df[rows, ]

Now df is a data frame that contains all the emails, shuffled. Perform a series of cleaning steps of the text, including removing punctuation and numbers.

df <- df %>% 
  unite(email, c("headers","body"), sep="\n\n", remove=FALSE) # unite headers and body column into email column

corpus <- Corpus(VectorSource(df[,c(1)]))

corpus <- corpus %>% 
    tm_map(PlainTextDocument)  %>%
    tm_map(tolower) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeWords, c(stopwords("english"))) %>%
    tm_map(removeNumbers) %>%
    tm_map(stemDocument) %>%
    tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., PlainTextDocument): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., tolower): transformation drops documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeWords, c(stopwords("english"))):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops documents
## Warning in tm_map.SimpleCorpus(., stemDocument): transformation drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents

Create document-term matrix, where each row represents an email and each column is a word that appears in an email. This can be a very sparse matrix, so remove the words that are very rare amongst all the emails.

dtm.frequencies = DocumentTermMatrix(corpus)
dtm.sparse = removeSparseTerms(dtm.frequencies, 0.995)
df.sparse = as.data.frame(as.matrix(dtm.sparse))

colnames(df.sparse) = make.names(colnames(df.sparse))

df.sparse$tag = df$tag
split = sample.split(df.sparse$tag, SplitRatio = 0.7)

train_set = subset(df.sparse, split==TRUE)

test_set = subset(df.sparse, split==FALSE)

There are different types of models used for classification, like logistic regression and SVM. Here we use RandomForest.

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.3
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
train_set$tag = as.factor(train_set$tag)

test_set$tag = as.factor(test_set$tag)


RF_model = randomForest(tag ~ ., data=train_set)

predictRF = predict(RF_model, newdata=test_set)

table(test_set$tag, predictRF)
##       predictRF
##         ham spam
##   ham  3745    4
##   spam 3704   73

Evaluate accuracy.

cm <- table(test_set$tag, predictRF)
accuracy <- (cm[1]+cm[2,][2])/(sum(cm))
print(paste0("Accuracy: ", accuracy))
## [1] "Accuracy: 0.507307998937018"

Conclusion

Feature engineering is a large part of preparing to feed data to a classifier. It highly affects the prediction accuracy.

Let’s review the series of feature engineering steps we took:

Even after cleaning the raw emails, the classifier only had a prediction accuracy slightly above 50%, which means that it’s performance is hardly better than random guessing. This poor performance is likely due to either the wrong type of feature engineering or an unsuitable classifier. Perhaps an SVM classifier would have been better.


Sources

Singh, D. (2019, August 12). Machine Learning with Text Data Using R. Pluralsight. Retrieved April 20, 2022, from https://www.pluralsight.com/guides/machine-learning-text-data-using-r