Document classification

It can be useful to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether a new document is spam. For this project, you can start with a spam/ham data set, then predict the class of new documents (either withheld from the training data set or from another source such as your own spam folder). Here, I build a Naive Bayes model to predict whether or not a new email is spam.
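
As a quick refresher, Naive Bayes scores each class by combining the class prior with per-word likelihoods that are assumed to be conditionally independent:

P(spam | w1, ..., wn) ∝ P(spam) × P(w1 | spam) × ... × P(wn | spam)

An email is labeled spam when this score exceeds the corresponding score for ham; both the prior and the per-word probabilities are estimated from the training emails.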

Loading required libraries

library(tm)
## Loading required package: NLP
library(reader)
## Loading required package: NCmisc
## 
## Attaching package: 'reader'
## The following objects are masked from 'package:NCmisc':
## 
##     cat.path, get.ext, rmv.ext
## The following object is masked from 'package:tm':
## 
##     reader
library(NLP)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
library(e1071) 
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(stringr)

Setting the data directories

The spam (20050311_spam_2) and ham (20030228_easy_ham) folders of emails were downloaded from https://spamassassin.apache.org/old/publiccorpus/.

spam_dir<-"F:\\CUNY masters\\assignment607\\project 4-spamham\\spam_2"
ham_dir<-"F:\\CUNY masters\\assignment607\\project 4-spamham\\easy_ham"
file1<-list.files(spam_dir)
file2<-list.files(ham_dir)
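
Before reading anything in, it is worth checking how many files were picked up; the exact counts depend on the corpus snapshot that was downloaded (each folder may also contain a non-email cmds index file):

# Number of files found in each folder
length(file1)
length(file2)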

Creating a spam email data frame with email text and classification (spam = 1) columns

# Building a list of spam email texts by iterating through all files in the spam folder
spamlist <- list()   # start empty; initializing with NA would add a spurious NA document
for(i in 1:length(file1))
{
  path <- paste0(spam_dir, "/", file1[i])
  text <- readLines(path)
  tmp <- list(paste(text, collapse="\n"))
  spamlist <- c(spamlist, tmp)
}

# Creating data frame
spam <-as.data.frame(unlist(spamlist),stringsAsFactors = FALSE)
spam$classification <- 1
colnames(spam)<- c("email","classification")

Creating a ham email data frame with email text and classification (ham = 0) columns

# Building a list of ham email texts by iterating through all files in the ham folder
hamlist <- list()   # start empty, as above
for(i in 1:length(file2))
{
  path <- paste0(ham_dir, "/", file2[i])
  text <- readLines(path)
  tmp <- list(paste(text, collapse="\n"))
  hamlist <- c(hamlist, tmp)
}

# Creating data frame 
ham <-as.data.frame(unlist(hamlist),stringsAsFactors = FALSE)
ham$classification <- 0
colnames(ham)<-c("email","classification")
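
As an aside, both read loops can be written without growing a list one element at a time; a minimal vectorized sketch under the same folder layout:

# Read each file once and collapse it to a single string
read_emails <- function(dir, files) {
  vapply(files,
         function(f) paste(readLines(file.path(dir, f)), collapse = "\n"),
         character(1), USE.NAMES = FALSE)
}
# spam <- data.frame(email = read_emails(spam_dir, file1), classification = 1,
#                    stringsAsFactors = FALSE)
# ham  <- data.frame(email = read_emails(ham_dir, file2), classification = 0,
#                    stringsAsFactors = FALSE)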

Creating the final data frame by combining the ham and spam data frames

df_final <- rbind(ham, spam)

Looking at the proportion of spam and ham emails in the dataset

prop.table(table(df_final$classification))  
## 
##         0         1 
## 0.6415385 0.3584615

Creating corpus

dfcorpus <- Corpus(VectorSource(df_final$email))
dfcorpus[[1]]$meta
##   author       : character(0)
##   datetimestamp: 2022-11-25 03:20:49
##   description  : character(0)
##   heading      : character(0)
##   id           : 1
##   language     : en
##   origin       : character(0)

Cleaning corpus

# Create an "addspace" content transformer that replaces a user-specified pattern with a space
addspace <- content_transformer(function(x, pattern) {
 return(gsub(pattern, " ", x))
  })

# Replace "-" with space 
my_corpus <- tm_map(dfcorpus, addspace, "-")
## Warning in tm_map.SimpleCorpus(dfcorpus, addspace, "-"): transformation drops
## documents
# Remove numbers
my_corpus<-tm_map(my_corpus,content_transformer(removeNumbers))
## Warning in tm_map.SimpleCorpus(my_corpus, content_transformer(removeNumbers)):
## transformation drops documents
# Remove white spaces
my_corpus<-tm_map(my_corpus,stripWhitespace)
## Warning in tm_map.SimpleCorpus(my_corpus, stripWhitespace): transformation drops
## documents
# To Lowercase Transformation
corpus1<-tm_map(my_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(my_corpus, content_transformer(tolower)):
## transformation drops documents
# Remove punctuation transformation
corpus2 <- tm_map(corpus1, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus1, removePunctuation): transformation drops
## documents
# Stem document transformation
corpus3<- tm_map(corpus2, stemDocument)
## Warning in tm_map.SimpleCorpus(corpus2, stemDocument): transformation drops
## documents
# Remove stopwords transformation
corpus4<-tm_map(corpus3, removeWords, stopwords("en"))
## Warning in tm_map.SimpleCorpus(corpus3, removeWords, stopwords("en")):
## transformation drops documents
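
Two notes on this pipeline. The “transformation drops documents” warnings are a known quirk of tm_map() on a SimpleCorpus and can generally be ignored; no documents are actually removed. Also, because stopwords are removed after stemming here, stemmed forms of some stopwords may slip through, so swapping those two steps is worth trying. A quick preview of the cleaned text:

# Inspect the first 200 characters of the first cleaned document
substr(as.character(corpus4[[1]]), 1, 200)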

Creating a sparse document-term matrix

dtm1 <- DocumentTermMatrix(corpus4)

# Reducing sparsity
dtm1 <- removeSparseTerms(dtm1, 1-(10/length(corpus4)))

# Creating data frame
dtm1_df<-(as.data.frame(as.matrix(dtm1)))
dtm1_df$classification<-df_final$classification
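
The sparsity cutoff of 1-(10/length(corpus4)) keeps only terms that appear in at least 10 documents; checking the dimensions shows how far the matrix shrank:

# Documents x retained terms (plus the classification column)
dim(dtm1_df)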

Top 30 terms in the combined dataset

sort(colSums(dtm1_df %>% select(-classification)), decreasing = TRUE)[1:30]
##            receiv              list             esmtp               sep 
##             21641             15240             11549              9793 
##         localhost           content              size              date 
##              9246              6714              6498              6271 
##            messag               mon               aug           postfix 
##              6238              6013              5764              5628 
##           version               wed               oct       jmlocalhost 
##              5299              5268              5256              5227 
##               thu              font               ist              mail 
##              5194              4877              4841              4831 
##           subject              type               tue               jul 
##              4755              4629              4537              4434 
##             deliv            return dogmaslashnullorg              will 
##              4393              4269              4252              4050 
##              path          jalapeno 
##              3799              3753

Top 30 terms in spam emails

sort(colSums(dtm1_df %>% filter(classification == 1) %>% select(-classification)), decreasing = TRUE)[1:30]
##    receiv      size      font       jul    widthd     esmtp      tabl   content 
##      7411      6307      4794      4382      3547      3141      3125      2919 
##     width       san      mail      will helvetica     email       may      list 
##      2913      2728      2695      2617      2520      2439      2419      2387 
##     serif    messag      type      date     color       mon localhost  facedari 
##      2139      2131      2080      2067      1986      1978      1899      1862 
##      free   subject       can       div   version     nsnet 
##      1857      1855      1774      1725      1672      1624

Top 30 terms in ham emails

sort(colSums(dtm1_df %>% filter(classification == 0) %>% select(-classification)), decreasing = TRUE)[1:30]
##            receiv              list               sep             esmtp 
##             14230             12853              9788              8408 
##         localhost               oct           postfix               aug 
##              7347              5251              4662              4476 
##               ist              date       jmlocalhost            messag 
##              4224              4204              4144              4107 
##               mon               wed               thu           content 
##              4035              3860              3839              3795 
##          jalapeno           version             deliv               rpm 
##              3705              3627              3540              3529 
##              exmh              user              fork dogmaslashnullorg 
##              3364              3307              3093              3048 
##               tue           subject            return      adminxentcom 
##              3004              2900              2839              2775 
##               use              path 
##              2620              2584

Splitting the data into two portions: 80 percent for training and 20 percent for testing

sample_size <- floor(0.80 * nrow(dtm1_df))  
set.seed(123)
train <- sample(seq_len(nrow(dtm1_df)), size = sample_size)
#Training data set
dtm1_train <- dtm1_df[train, ]
# Test data set
dtm1_test <- dtm1_df[-train, ]
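
An alternative worth knowing is caret’s createDataPartition(), which draws a stratified sample so the spam/ham proportions are preserved exactly in both splits; a sketch (not the split used for the results below):

# Stratified 80/20 split on the class label
set.seed(123)
idx <- createDataPartition(factor(dtm1_df$classification), p = 0.8, list = FALSE)
strat_train <- dtm1_df[idx, ]
strat_test  <- dtm1_df[-idx, ]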

Extracting the class labels for the training and test rows

train_labels <- dtm1_train$classification
test_labels <- dtm1_test$classification

Label proportions in the training and test sets

prop.table(table(train_labels))
## train_labels
##         0         1 
## 0.6384615 0.3615385
prop.table(table(test_labels))
## test_labels
##         0         1 
## 0.6538462 0.3461538

Trimming the data

# Finding the minimum frequency threshold (0.1% of the number of documents)
threshold <- 0.1
min_freq <- round(dtm1$nrow*(threshold/100),0)
min_freq
## [1] 4
# Create vector of most frequent words
freq_words <- findFreqTerms(x = dtm1, lowfreq = min_freq)
str(freq_words)
##  chr [1:5571] "abl" "actual" "adminredhatcom" "adminspamassassintaintorg" ...
# Filter the DTM
dtm1_freq_train <- dtm1_train[ , freq_words]
dtm1_freq_test <- dtm1_test[ , freq_words]
dim(dtm1_freq_train)
## [1] 3120 5571
dim(dtm1_freq_test)
## [1]  780 5571

The training and test data sets now include 5571 features, corresponding to terms that appear at least four times across the corpus (findFreqTerms() uses total term frequency, not the number of emails a term appears in).

Converting each word’s frequency count in the sparse matrix to a categorical value, “Yes” or “No”, depending on whether the word appears at all

convert_values <- function(x) {
  ifelse(x > 0, "Yes", "No")
}

Converting the training and test matrices to character matrices, each with cells indicating “Yes” or “No”

train1 <- apply(dtm1_freq_train, MARGIN = 2,
                   convert_values)
test1 <- apply(dtm1_freq_test, MARGIN = 2,
                  convert_values)
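
Since apply() returns character matrices, naiveBayes() will coerce the values itself. An optional hardening step, sketched here, is to convert the columns to factors with both levels fixed, so a word that happens to appear in every (or no) training email still carries both “Yes” and “No” levels:

# Convert every column to a factor with fixed levels
to_factor_df <- function(m) {
  as.data.frame(lapply(as.data.frame(m), factor, levels = c("No", "Yes")),
                check.names = FALSE)
}
train1_f <- to_factor_df(train1)
test1_f  <- to_factor_df(test1)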

Creating Naive Bayes model from the training data

email_classifier <- naiveBayes(train1, train_labels)
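
naiveBayes() also accepts a laplace argument for add-one smoothing, which prevents zero conditional probabilities for words seen in only one class; a variant sketch using the factor-converted matrices from the previous step (not the model evaluated below):

# Same model with Laplace smoothing
email_classifier_l <- naiveBayes(train1_f, as.factor(train_labels), laplace = 1)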

Generating predictions on the test set to evaluate the model’s performance

test_pred <- predict(email_classifier, test1)
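
Besides hard class labels, predict() can return the posterior probabilities (type = "raw"), which is handy for inspecting borderline emails:

# Posterior class probabilities for the first few test emails
head(predict(email_classifier, test1, type = "raw"))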

Comparing the predicted labels with the reference labels from the test set and summarizing the comparison

confusionMatrix(test_pred, as.factor(test_labels))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 507  18
##          1   3 252
##                                           
##                Accuracy : 0.9731          
##                  95% CI : (0.9591, 0.9833)
##     No Information Rate : 0.6538          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.9397          
##                                           
##  Mcnemar's Test P-Value : 0.00225         
##                                           
##             Sensitivity : 0.9941          
##             Specificity : 0.9333          
##          Pos Pred Value : 0.9657          
##          Neg Pred Value : 0.9882          
##              Prevalence : 0.6538          
##          Detection Rate : 0.6500          
##    Detection Prevalence : 0.6731          
##       Balanced Accuracy : 0.9637          
##                                           
##        'Positive' Class : 0               
## 
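
Note that confusionMatrix() took 0 (ham) as the positive class by default, so the sensitivity reported above refers to ham. To read the same table with spam-oriented metrics, the positive class can be set explicitly:

# Spam (1) as the positive class
confusionMatrix(test_pred, as.factor(test_labels), positive = "1")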

Conclusion

Here, by applying the Naive Bayes algorithm, I have created a model that predicts whether a given email is spam. The model’s accuracy on the test set is 97.31%! Reading the confusion matrix, it misclassified 18 spam emails as ham and 3 ham emails as spam.