1 Project description

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, we start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder).

The data used in the project was retrieved from: https://spamassassin.apache.org/old/publiccorpus/


2 Load libraries

library(stringr)
suppressMessages(library(dplyr))
library(tidytext)
library(ggplot2)
library(tidyr)
suppressMessages(library(wordcloud))
library(readr)
library(purrr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(e1071)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift

3 Extracting files

Before starting the statistical processing of the text, we collect files of e-mail data that will serve as the basis for the corpus.

We download and decompress the files into individual folders in the working directory.

base_url <-"https://spamassassin.apache.org/old/publiccorpus/"
spam_file <- "20030228_spam.tar.bz2"
ham_file <- "20030228_easy_ham.tar.bz2" 
files <- c(spam_file,ham_file)

# Download function:
download_file <- function(files,baseurl){
        n <- length(files)
        for (i in 1:n){
                fileurl <- str_c(baseurl,files[i])
                download.file(fileurl, destfile = files[i])
                Sys.sleep(1)
                untar(files[i])
        }
}

download_file(files,base_url)

# Names sub-directories including the current working directory:
list.dirs(path = ".", full.names = TRUE, recursive = TRUE)
## [1] "."                                                    
## [2] "./easy_ham"                                           
## [3] "./rsconnect"                                          
## [4] "./rsconnect/documents"                                
## [5] "./rsconnect/documents/Project4_v2.Rmd"                
## [6] "./rsconnect/documents/Project4_v2.Rmd/rpubs.com"      
## [7] "./rsconnect/documents/Project4_v2.Rmd/rpubs.com/rpubs"
## [8] "./spam"
# Number of files in spam folder:
length(list.files("./spam"))
## [1] 501
# Number of files in ham folder:
length(list.files("./easy_ham"))
## [1] 2501
# file names:
spam_fnames <- list.files("./spam")
spam_fnames[1:5]
## [1] "00001.7848dde101aa985090474a91ec93fcf0"
## [2] "00002.d94f1b97e48ed3b553b3508d116e6a09"
## [3] "00003.2ee33bc6eacdb11f38d052c44819ba6c"
## [4] "00004.eac8de8d759b7e74154f142194282724"
## [5] "00005.57696a39d7d84318ce497886896bf90d"
ham_fnames <- list.files("./easy_ham")
ham_fnames[1:5]
## [1] "00001.7c53336b37003a9286aba55d2945844c"
## [2] "00002.9c4069e25e1ef370c078db7ee85ff9ac"
## [3] "00003.860e3c3cee1b42ead714c5c874fe25f7"
## [4] "00004.864220c5b6930b209cc287c361c99af1"
## [5] "00005.bf27cdeaf0b8c4647ecd61b1d09da613"

We check whether any file in either directory fails to match the standard file-name pattern, that is, a name starting with digits followed by a period.

# spam folder:
not_stand_spam_name <- str_detect(spam_fnames,"^\\d+\\.[:alnum:]")
not_spam <- spam_fnames[not_stand_spam_name == FALSE]
not_spam
## [1] "cmds"
# ham folder:
not_stand_ham_name <- str_detect(ham_fnames,"^\\d+\\.[:alnum:]")
not_ham <- ham_fnames[not_stand_ham_name == FALSE]
not_ham 
## [1] "cmds"

The existing cmds files correspond to neither spam nor ham messages, so we remove them from their directories.

if (file.exists("spam/cmds")) file.remove("./spam/cmds")
## [1] TRUE
if (file.exists("easy_ham/cmds")) file.remove("./easy_ham/cmds")
## [1] TRUE

If we refresh the file-name vectors and re-run the detection code, the result is now empty.

spam_fnames <- list.files("./spam")
ham_fnames <- list.files("./easy_ham")
not_stand_spam_name <- str_detect(spam_fnames,"^\\d+\\.[:alnum:]")
not_spam <- spam_fnames[not_stand_spam_name == FALSE]
not_spam
## character(0)

4 Pre-processing

We define a function to read all files from the folders into a data frame.

folders <- c("./spam", "./easy_ham")
categories <- c("spam", "ham")


# Function: read every file in each folder into one data frame,
# one row per line of text, labelled with its category
read_folder_to_df <- function(folders, categories){
        df <- tibble()
        n <- length(folders)
        for (i in 1:n){
                folder <- folders[i]
                category <- categories[i]

                temp <- tibble(file = dir(folder, full.names = TRUE)) %>%
                        mutate(text = map(file, read_lines)) %>%
                        transmute(category = category, id = basename(file), text) %>%
                        unnest(text)
                df <- bind_rows(df, temp)
        }
        return(df)
}

# Creating a corpus
# 
corpus_df <- read_folder_to_df(folders, categories)
corpus_df
## # A tibble: 264,563 x 3
##    category id                     text                                   
##    <chr>    <chr>                  <chr>                                  
##  1 spam     00001.7848dde101aa985… From 12a1mailbot1@web.de  Thu Aug 22 1…
##  2 spam     00001.7848dde101aa985… Return-Path: <12a1mailbot1@web.de>     
##  3 spam     00001.7848dde101aa985… Delivered-To: zzzz@localhost.spamassas…
##  4 spam     00001.7848dde101aa985… Received: from localhost (localhost [1…
##  5 spam     00001.7848dde101aa985… "\tby phobos.labs.spamassassin.taint.o…
##  6 spam     00001.7848dde101aa985… "\tfor <zzzz@localhost>; Thu, 22 Aug 2…
##  7 spam     00001.7848dde101aa985… Received: from mail.webnote.net [193.1…
##  8 spam     00001.7848dde101aa985… "\tby localhost with POP3 (fetchmail-5…
##  9 spam     00001.7848dde101aa985… "\tfor zzzz@localhost (single-drop); T…
## 10 spam     00001.7848dde101aa985… Received: from dd_it7 ([210.97.77.167])
## # ... with 264,553 more rows
corpus_df <- tibble::rowid_to_column(corpus_df, "linenumber")
corpus_df
## # A tibble: 264,563 x 4
##    linenumber category id                  text                           
##         <int> <chr>    <chr>               <chr>                          
##  1          1 spam     00001.7848dde101aa… From 12a1mailbot1@web.de  Thu …
##  2          2 spam     00001.7848dde101aa… Return-Path: <12a1mailbot1@web…
##  3          3 spam     00001.7848dde101aa… Delivered-To: zzzz@localhost.s…
##  4          4 spam     00001.7848dde101aa… Received: from localhost (loca…
##  5          5 spam     00001.7848dde101aa… "\tby phobos.labs.spamassassin…
##  6          6 spam     00001.7848dde101aa… "\tfor <zzzz@localhost>; Thu, …
##  7          7 spam     00001.7848dde101aa… Received: from mail.webnote.ne…
##  8          8 spam     00001.7848dde101aa… "\tby localhost with POP3 (fet…
##  9          9 spam     00001.7848dde101aa… "\tfor zzzz@localhost (single-…
## 10         10 spam     00001.7848dde101aa… Received: from dd_it7 ([210.97…
## # ... with 264,553 more rows
# Extract rows for the first occurrence of a category in the data frame
corpus_df %>%
        group_by(category) %>% 
        filter(linenumber == min(linenumber)) %>% 
        slice(1) %>% # takes the first occurrence if there is a tie
        ungroup()
## # A tibble: 2 x 4
##   linenumber category id                    text                          
##        <int> <chr>    <chr>                 <chr>                         
## 1      70952 ham      00001.7c53336b37003a… From exmh-workers-admin@redha…
## 2          1 spam     00001.7848dde101aa98… From 12a1mailbot1@web.de  Thu…

The category column records which type of email each message comes from, and the id column identifies a unique message within each category. Next we check how many messages each category contains.

The results correspond to the file counts shown above minus the removed cmds files.

corpus_df %>% group_by(category) %>% 
        summarize(messages = n_distinct(id)) %>%
        ungroup()
## # A tibble: 2 x 2
##   category messages
##   <chr>       <int>
## 1 ham          2500
## 2 spam          500
new_df <- corpus_df

5 Create corpus

# Replace all non-graphical characters with spaces
# (must be done before using tm_map, otherwise certain characters cause errors)
usableText <- str_replace_all(corpus_df$text, "[^[:graph:]]", " ")

email_corpus <- Corpus(VectorSource(usableText))
print(email_corpus)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 264563
inspect(email_corpus[1:25])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 25
## 
##  [1] From 12a1mailbot1@web.de  Thu Aug 22 13:17:22 2002                                                
##  [2] Return-Path: <12a1mailbot1@web.de>                                                                
##  [3] Delivered-To: zzzz@localhost.spamassassin.taint.org                                               
##  [4] Received: from localhost (localhost [127.0.0.1])                                                  
##  [5]  by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32                         
##  [6]  for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)                                      
##  [7] Received: from mail.webnote.net [193.120.211.219]                                                 
##  [8]  by localhost with POP3 (fetchmail-5.9.0)                                                         
##  [9]  for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)                          
## [10] Received: from dd_it7 ([210.97.77.167])                                                           
## [11]  by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623                                              
## [12]  for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100                               
## [13] From: 12a1mailbot1@web.de                                                                         
## [14] Received: from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);
## [15]   Sat, 24 Aug 2002 09:42:10 +0900                                                                 
## [16] To: <dcek1a1@netsgo.com>                                                                          
## [17] Subject: Life Insurance - Why Pay More?                                                           
## [18] Date: Wed, 21 Aug 2002 20:31:57 -1600                                                             
## [19] MIME-Version: 1.0                                                                                 
## [20] Message-ID: <0103c1042001882DD_IT7@dd_it7>                                                        
## [21] Content-Type: text/html; charset="iso-8859-1"                                                     
## [22] Content-Transfer-Encoding: quoted-printable                                                       
## [23]                                                                                                   
## [24] <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">                                    
## [25] <HTML><HEAD>

6 Data cleansing

Colons and hyphens are often used in e-mail text with no spaces around them. Applying the removePunctuation transformation without fixing this first would merge the two words on either side of the symbol into a single token, so we fix this before applying the standard transformations.
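
A minimal standalone illustration using toy strings (separate from the corpus):

removePunctuation("spam-filter")                 # the two words merge into one token
## [1] "spamfilter"
removePunctuation(gsub("-", " ", "spam-filter")) # replacing "-" with a space first keeps them apart
## [1] "spam filter"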

The code below was retrieved from https://eight2late.wordpress.com/2015/05/27/a-gentle-introduction-to-text-mining-using-r/ and shows that the tm package provides the ability to create custom transformations.

length(corpus_df$text)
## [1] 264563
#create the toSpace content transformer
toSpace <- content_transformer(function(x, pattern) {return (gsub(pattern, " ", x))})

# remove "-", ":", ".", and "'" and replace them with space
email_corpus <- tm_map(email_corpus, toSpace, "-")
## Warning in tm_map.SimpleCorpus(email_corpus, toSpace, "-"): transformation
## drops documents
email_corpus <- tm_map(email_corpus, toSpace, ":")
## Warning in tm_map.SimpleCorpus(email_corpus, toSpace, ":"): transformation
## drops documents
email_corpus <- tm_map(email_corpus, toSpace, "\\.")
## Warning in tm_map.SimpleCorpus(email_corpus, toSpace, "\\."):
## transformation drops documents
email_corpus <- tm_map(email_corpus, toSpace, "'")
## Warning in tm_map.SimpleCorpus(email_corpus, toSpace, "'"): transformation
## drops documents

The next data-cleansing steps remove punctuation and numbers, convert all letters to lower case, strip extra white space, and remove stop words:

# remove punctuation
email_corpus <- tm_map(email_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(email_corpus, removePunctuation):
## transformation drops documents
# remove numbers
email_corpus <- tm_map(email_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(email_corpus, removeNumbers): transformation
## drops documents
#translate all letters to lower case
email_corpus <- tm_map(email_corpus, tolower)
## Warning in tm_map.SimpleCorpus(email_corpus, tolower): transformation drops
## documents
# remove white spaces
email_corpus <- tm_map(email_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(email_corpus, stripWhitespace):
## transformation drops documents
# Remove stop-words
email_corpus <- tm_map(email_corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(email_corpus, removeWords,
## stopwords("english")): transformation drops documents
length(email_corpus)
## [1] 264563
inspect(email_corpus[1:25])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 25
## 
##  [1]  amailbotweb de thu aug                                  
##  [2] return path amailbotweb de                               
##  [3] delivered  zzzzlocalhost spamassassin taint org          
##  [4] received  localhost localhost                            
##  [5]   phobos labs spamassassin taint org postfix  esmtp id bc
##  [6]   zzzzlocalhost thu aug edt                              
##  [7] received  mail webnote net                               
##  [8]   localhost  pop fetchmail                               
##  [9]   zzzzlocalhost single drop thu aug ist                  
## [10] received  ddit                                           
## [11]   webnote net  esmtp id naa                              
## [12]   zzzzspamassassin taint org thu aug                     
## [13]  amailbotweb de                                          
## [14] received  r smtp korea com  ddit  microsoft smtpsvc      
## [15]  sat aug                                                 
## [16]  dcekanetsgo com                                         
## [17] subject life insurance  pay                              
## [18] date wed aug                                             
## [19] mime version                                             
## [20] message id cdditddit                                     
## [21] content type texthtml charsetiso                         
## [22] content transfer encoding quoted printable               
## [23]                                                          
## [24] doctype html public wcdtd html transitionalen            
## [25] htmlhead

7 Document term matrix

email_dtm <- DocumentTermMatrix(email_corpus)
inspect(email_dtm[1:4, 1:30])
## <<DocumentTermMatrix (documents: 4, terms: 30)>>
## Non-/sparse entries: 13/107
## Sparsity           : 89%
## Maximal term length: 16
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs amailbotweb aug delivered localhost org path return spamassassin
##    1           1   1         0         0   0    0      0            0
##    2           1   0         0         0   0    1      1            0
##    3           0   0         1         0   1    0      0            1
##    4           0   0         0         2   0    0      0            0
##     Terms
## Docs taint thu
##    1     0   1
##    2     0   0
##    3     1   0
##    4     0   0

8 Mining the corpus

In constructing the DTM, we have converted a corpus of text into a mathematical object that can be analysed using quantitative techniques from matrix algebra.

To get the frequency of occurrence of each word in the corpus, one could use the DTM directly. Unfortunately, that process exhausted the available computational resources, so we turn back to the data frame instead.
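
For reference, a sketch of how the frequencies could be computed straight from the DTM. The as.matrix() route densifies the whole matrix, which is what exhausts memory at this scale; slam::col_sums() (slam is a dependency of tm) works on the sparse representation and may avoid the problem. Neither line is run as part of this analysis.

# Sketch only, not used below:
# freq <- sort(colSums(as.matrix(email_dtm)), decreasing = TRUE) # densifies: memory-heavy
# freq <- sort(slam::col_sums(email_dtm), decreasing = TRUE)     # sparse-aware alternative
# head(freq)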

8.1 Replace the text in the data frame with the cleaned data

head(corpus_df, n=5)
## # A tibble: 5 x 4
##   linenumber category id                  text                            
##        <int> <chr>    <chr>               <chr>                           
## 1          1 spam     00001.7848dde101aa… From 12a1mailbot1@web.de  Thu A…
## 2          2 spam     00001.7848dde101aa… Return-Path: <12a1mailbot1@web.…
## 3          3 spam     00001.7848dde101aa… Delivered-To: zzzz@localhost.sp…
## 4          4 spam     00001.7848dde101aa… Received: from localhost (local…
## 5          5 spam     00001.7848dde101aa… "\tby phobos.labs.spamassassin.…
df <- data.frame(text = get("content", email_corpus))

head(df, n=5)
##                                                        text
## 1                                   amailbotweb de thu aug 
## 2                                return path amailbotweb de
## 3           delivered  zzzzlocalhost spamassassin taint org
## 4                            received  localhost localhost 
## 5   phobos labs spamassassin taint org postfix  esmtp id bc
corpus_df$text <- as.character(df$text)

head(corpus_df, n=5)
## # A tibble: 5 x 4
##   linenumber category id                    text                          
##        <int> <chr>    <chr>                 <chr>                         
## 1          1 spam     00001.7848dde101aa98… " amailbotweb de thu aug "    
## 2          2 spam     00001.7848dde101aa98… return path amailbotweb de    
## 3          3 spam     00001.7848dde101aa98… delivered  zzzzlocalhost spam…
## 4          4 spam     00001.7848dde101aa98… "received  localhost localhos…
## 5          5 spam     00001.7848dde101aa98… "  phobos labs spamassassin t…

8.2 Tidying data

tidy_corpus <- corpus_df %>% unnest_tokens(word, text)

tidy_corpus
## # A tibble: 1,042,276 x 4
##    linenumber category id                                     word         
##         <int> <chr>    <chr>                                  <chr>        
##  1          1 spam     00001.7848dde101aa985090474a91ec93fcf0 amailbotweb  
##  2          1 spam     00001.7848dde101aa985090474a91ec93fcf0 de           
##  3          1 spam     00001.7848dde101aa985090474a91ec93fcf0 thu          
##  4          1 spam     00001.7848dde101aa985090474a91ec93fcf0 aug          
##  5          2 spam     00001.7848dde101aa985090474a91ec93fcf0 return       
##  6          2 spam     00001.7848dde101aa985090474a91ec93fcf0 path         
##  7          2 spam     00001.7848dde101aa985090474a91ec93fcf0 amailbotweb  
##  8          2 spam     00001.7848dde101aa985090474a91ec93fcf0 de           
##  9          3 spam     00001.7848dde101aa985090474a91ec93fcf0 delivered    
## 10          3 spam     00001.7848dde101aa985090474a91ec93fcf0 zzzzlocalhost
## # ... with 1,042,266 more rows
spam_words <- tidy_corpus %>%
        group_by(category) %>% 
        filter(category == "spam") %>% 
        count(word, sort = TRUE) %>%
        ungroup()
head(spam_words, n=15)
## # A tibble: 15 x 3
##    category word              n
##    <chr>    <chr>         <int>
##  1 spam     com            3560
##  2 spam     org            2964
##  3 spam     td             2825
##  4 spam     tr             2622
##  5 spam     received       2492
##  6 spam     id             2135
##  7 spam     sep            2128
##  8 spam     net            1724
##  9 spam     widthd         1523
## 10 spam     www            1516
## 11 spam     font           1406
## 12 spam     zzzzlocalhost  1362
## 13 spam     size           1239
## 14 spam     table          1197
## 15 spam     localhost      1189
tail(spam_words)
## # A tibble: 6 x 3
##   category word      n
##   <chr>    <chr> <int>
## 1 spam     飬        1
## 2 spam     駼        1
## 3 spam     䰡        1
## 4 spam     鵵        1
## 5 spam     鹰        1
## 6 spam     鼰        1
ham_words <- tidy_corpus %>%
        group_by(category) %>% 
        filter(category == "ham") %>% 
        count(word, sort = TRUE) %>%
        ungroup()
head(ham_words, n=15)
## # A tibble: 15 x 3
##    category word             n
##    <chr>    <chr>        <int>
##  1 ham      org          20970
##  2 ham      com          19660
##  3 ham      net          14668
##  4 ham      id           14222
##  5 ham      received     14084
##  6 ham      list         13877
##  7 ham      taint        11007
##  8 ham      sep           9795
##  9 ham      x             8704
## 10 ham      esmtp         8407
## 11 ham      localhost     7684
## 12 ham      spamassassin  7623
## 13 ham      http          7402
## 14 ham      sourceforge   6584
## 15 ham      mailto        6157
tail(ham_words)
## # A tibble: 6 x 3
##   category word                                                          n
##   <chr>    <chr>                                                     <int>
## 1 ham      zxaokhbyawicikxxuiiksiebfksbpziakbbxycnkcjsjieluaxrpywxp…     1
## 2 ham      zxcgzmlszsailzhcisbcvehhlmxvzyicjsjcxjvdgfzsavdmfylxvzye…     1
## 3 ham      zxnjcmlwdglvbgogvghliefkdmfuyvkiexpbnvifnvdwkiefyyhpdgvj…     1
## 4 ham      zzmp                                                          1
## 5 ham      zzyonavetuutqxkktstjzdidpftoojdogberry                        1
## 6 ham      zzzzcc                                                        1

Interestingly enough, the cleaned data contains some Chinese characters, each with a very low frequency. Some argue that the least frequent terms can be more interesting than one might think, because terms that occur rarely tend to be more descriptive of specific documents.

Not being sure how best to deal with them, we leave the foreign characters in the data set.
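
If one did want to drop them, a minimal sketch (not applied here) could filter out every token containing a non-ASCII character; tidy_corpus_ascii is a hypothetical name introduced only for this illustration:

# Sketch (not applied): keep only tokens made of ASCII characters
tidy_corpus_ascii <- tidy_corpus %>%
        filter(!str_detect(word, "[^\\x01-\\x7F]"))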

9 Frequency visualization

9.1 Bar graph

spam_words %>% 
        top_n(15) %>% 
        mutate(word = reorder(word, n)) %>%
        ggplot(aes(word, n)) +
        geom_col(fill = "darkblue", show.legend = FALSE) +
        labs(y = "Spam Word Count", x = NULL) +
        coord_flip()
## Selecting by n

ham_words %>% 
        top_n(15) %>% 
        mutate(word = reorder(word, n)) %>%
        ggplot(aes(word, n)) +
        geom_col(fill = "darkred", show.legend = FALSE) +
        labs(y = "Ham Word Count", x = NULL) +
        coord_flip()
## Selecting by n

9.2 Word clouds

spam_indices <- which(corpus_df$category == "spam")
spam_indices[1:3]
## [1] 1 2 3
ham_indices <- which(corpus_df$category == "ham")
ham_indices[1:3]
## [1] 70952 70953 70954

9.2.1 Word cloud for spam

#setting the same seed each time ensures consistent look across clouds
set.seed(7)
suppressMessages(wordcloud(email_corpus[spam_indices], min.freq=500, colors=brewer.pal(6,"Dark2")))

9.2.2 Word cloud for ham

#setting the same seed each time ensures consistent look across clouds
set.seed(7)
suppressMessages(wordcloud(email_corpus[ham_indices], min.freq=1000, colors=brewer.pal(6,"Dark2")))

10 Training and test data

We divide the corpus into training and test data, using 75% (three quarters) of the randomly ordered rows as training data and the remaining 25% (one quarter) as test data. Note that the split is at the line level (each row of corpus_df is one line of an e-mail), so lines from the same message can appear in both sets.

The following code is based on a tutorial available at https://towardsdatascience.com/sms-text-classification-a51defc2361c.

# Randomize the order of the rows
set.seed(12)
(random_df <- corpus_df[sample(nrow(corpus_df)),])
## # A tibble: 264,563 x 4
##    linenumber category id                  text                           
##         <int> <chr>    <chr>               <chr>                          
##  1      18351 spam     00182.1b9ba0f95506… method  placing free ads   int…
##  2     216353 ham      01503.5e13994a5676… " reply  gadeimos caltech edu" 
##  3     249381 ham      02151.d9959f781f81… " rssfeedsjmason org tue oct " 
##  4      71268 ham      00004.864220c5b693… "already   prolific virus ever…
##  5      44803 spam     00341.99b463b92346… " nextpartcabafefc"            
##  6       8968 spam     00089.7e7baae6ef4a… "buyers    marketing   key "   
##  7      47299 spam     00341.99b463b92346… xucaxkcidaorcmkeyyybkmizaffsab…
##  8     169757 ham      00996.01a4386651fb… "  enter    safeguard  "       
##  9       6053 spam     00056.c56d61cadd81… " "                            
## 10       2203 spam     00021.effe1449462a… content transfer encoding bit  
## # ... with 264,553 more rows
random_corpus <- Corpus(VectorSource(random_df$text))

print(random_corpus)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 264563
random_dtm <- DocumentTermMatrix(random_corpus)
inspect(random_dtm[1:4, 1:8])
## <<DocumentTermMatrix (documents: 4, terms: 8)>>
## Non-/sparse entries: 8/24
## Sparsity           : 75%
## Maximal term length: 8
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs ads caltech edu free gadeimos internet method placing
##    1   1       0   0    1        0        1      1       1
##    2   0       1   1    0        1        0      0       0
##    3   0       0   0    0        0        0      0       0
##    4   0       0   0    0        0        0      0       0
(n_train <- dim(random_df)[1] %/% 4 * 3) # 75% of the rows
## [1] 198420
(n_text <- dim(random_df)[1]) # total number of rows
## [1] 264563
# Training and test data frames
(train_df <- random_df[1:n_train,])
## # A tibble: 198,420 x 4
##    linenumber category id                  text                           
##         <int> <chr>    <chr>               <chr>                          
##  1      18351 spam     00182.1b9ba0f95506… method  placing free ads   int…
##  2     216353 ham      01503.5e13994a5676… " reply  gadeimos caltech edu" 
##  3     249381 ham      02151.d9959f781f81… " rssfeedsjmason org tue oct " 
##  4      71268 ham      00004.864220c5b693… "already   prolific virus ever…
##  5      44803 spam     00341.99b463b92346… " nextpartcabafefc"            
##  6       8968 spam     00089.7e7baae6ef4a… "buyers    marketing   key "   
##  7      47299 spam     00341.99b463b92346… xucaxkcidaorcmkeyyybkmizaffsab…
##  8     169757 ham      00996.01a4386651fb… "  enter    safeguard  "       
##  9       6053 spam     00056.c56d61cadd81… " "                            
## 10       2203 spam     00021.effe1449462a… content transfer encoding bit  
## # ... with 198,410 more rows
(test_df <- random_df[(n_train+1):n_text,])
## # A tibble: 66,143 x 4
##    linenumber category id                      text                       
##         <int> <chr>    <chr>                   <chr>                      
##  1        607 spam     00008.dfd941deb10f5eed… ""                         
##  2      98747 ham      00298.8b7d79e9cff4860a… ""                         
##  3      10205 spam     00103.2eef38789b4ecce7… return path safetyol newna…
##  4     149011 ham      00802.c2f1957d9e67ae45… return path fork adminxent…
##  5      50869 spam     00366.f0bfcc3c84da11ae… "table widthd borderd alig…
##  6      90615 ham      00214.ccc58960373c957d… "  zzzzlocalhost wed aug e…
##  7      37943 spam     00302.544366fa4cd0f5d2… option valuenj new jersey  
##  8     107383 ham      00392.1a94887ca585cbda… " hal devore haldevoreacm …
##  9     216201 ham      01501.78f2f952275ec4dd… ""                         
## 10      22147 spam     00209.5276f967533f2ce0… "aligndrightimg srcdhttp i…
## # ... with 66,133 more rows
# Training and test DTMs
train_dtm <- random_dtm[1:n_train,]
test_dtm <- random_dtm[(n_train+1):n_text,]
#Training & Test Label
train_labels <- train_df$category
test_labels <- test_df$category

#Proportion for training & test labels
prop.table(table(train_labels))
## train_labels
##       ham      spam 
## 0.7320683 0.2679317
prop.table(table(test_labels))
## test_labels
##       ham      spam 
## 0.7310675 0.2689325

We see that both the training and test sets contain roughly 73% ham and 27% spam lines.

We now transform the DTM into something the Naive Bayes model can train on. We use the function findFreqTerms() to extract the most frequent words: it takes a DTM and returns a character vector of the terms whose total frequency is at least lowfreq.
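
As a toy illustration, findFreqTerms() applied to a made-up three-document corpus (hypothetical data, unrelated to the project corpus):

toy_dtm <- DocumentTermMatrix(Corpus(VectorSource(
        c("free money free", "meeting agenda", "free offer"))))
findFreqTerms(toy_dtm, lowfreq = 2) # only "free" occurs at least twice overall
## [1] "free"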

threshold <- 0.1 # percentage of documents

min_freq <- round(random_dtm$nrow * (threshold/100), 0)

min_freq
## [1] 265
# Create vector of most frequent words
freq_words <- findFreqTerms(x = random_dtm, lowfreq = min_freq)

str(freq_words)
##  chr [1:414] "free" "internet" "edu" "reply" "oct" "org" ...
#Filter the DTM
train_dtm_freq <- train_dtm[ , freq_words]
test_dtm_freq <- test_dtm[ , freq_words]

dim(train_dtm_freq)
## [1] 198420    414
dim(test_dtm_freq)
## [1] 66143   414

Since this Naive Bayes implementation trains on categorical data, the numeric term counts are converted to categorical values: a helper function maps any positive count to “Yes” and any zero to “No”, indicating whether a given term is present in the document.

convert_values <- function(x) {
  ifelse(x > 0, "Yes", "No")
}
text_train <- apply(train_dtm_freq,  MARGIN =  2, convert_values)
text_test <- apply(test_dtm_freq, MARGIN = 2, convert_values)

11 Naive Bayes

Naive Bayes is a simple technique for constructing classifiers that often performs well in practice, and it is a natural choice when all input features are categorical. We therefore use Naive Bayes to classify the test data and check how well it performs.

The Naive Bayes model works on the assumption that the features of the dataset are independent of each other.

This works reasonably well for text documents because, in the bag-of-words representation used here, each term is a separate feature and the occurrence of one term is modelled as independent of the occurrence of any other term given the class, which approximately satisfies the independence assumption of the Naive Bayes model. Hence, it is commonly used for text classification, sentiment analysis, spam filtering, and recommender systems.
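
To make the mechanics concrete, here is a hand-computed sketch of the posterior for a single feature. The conditional probabilities are assumed for illustration, not estimated from this corpus; only the prior matches the label proportions above.

# Hypothetical numbers, for illustration only
p_spam <- 0.27                   # prior P(spam), matching the label proportions above
p_ham  <- 0.73                   # prior P(ham)
p_free_spam <- 0.20              # assumed P("free" present | spam)
p_free_ham  <- 0.01              # assumed P("free" present | ham)
num_spam <- p_free_spam * p_spam
num_ham  <- p_free_ham * p_ham
num_spam / (num_spam + num_ham)  # posterior P(spam | "free" present)
## [1] 0.8809135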

#Create model from the training dataset
text_classifier <- naiveBayes(text_train, factor(train_labels))

#Make predictions on test set
text_test_pred <- predict(text_classifier, text_test)


#Create confusion matrix
confusionMatrix(data = text_test_pred, reference = factor(test_labels),
                positive = "spam", dnn = c("Prediction", "Actual"))
## Confusion Matrix and Statistics
## 
##           Actual
## Prediction   ham  spam
##       ham  31671  4367
##       spam 16684 13421
##                                           
##                Accuracy : 0.6817          
##                  95% CI : (0.6782, 0.6853)
##     No Information Rate : 0.7311          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3359          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7545          
##             Specificity : 0.6550          
##          Pos Pred Value : 0.4458          
##          Neg Pred Value : 0.8788          
##              Prevalence : 0.2689          
##          Detection Rate : 0.2029          
##    Detection Prevalence : 0.4552          
##       Balanced Accuracy : 0.7047          
##                                           
##        'Positive' Class : spam            
## 
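
The accuracy (0.6817) falls below the no-information rate (0.7311), so there is clear room for improvement. One common refinement is Laplace smoothing, which keeps terms never observed in one class from contributing zero probabilities. A sketch reusing the objects above (not evaluated here):

# Possible refinement (sketch): Naive Bayes with Laplace smoothing
text_classifier2 <- naiveBayes(text_train, factor(train_labels), laplace = 1)
text_test_pred2 <- predict(text_classifier2, text_test)
confusionMatrix(data = text_test_pred2, reference = factor(test_labels),
                positive = "spam", dnn = c("Prediction", "Actual"))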

12 References

SpamAssassin public corpus: https://spamassassin.apache.org/old/publiccorpus/

A gentle introduction to text mining using R: https://eight2late.wordpress.com/2015/05/27/a-gentle-introduction-to-text-mining-using-r/

SMS text classification: https://towardsdatascience.com/sms-text-classification-a51defc2361c