Document Classification - Ham / Spam

This assignment classifies new “test” documents using already classified “training” documents from Apache SpamAssassin, an open-source anti-spam platform.

Load Libraries

We will begin by loading the required libraries.

library(tidyverse)
library(e1071)
library(tm)
library(tidytext)
library(dplyr)
library(caret)
library(xgboost)
library(SnowballC)
library(RColorBrewer)
library(ggplot2)
library(wordcloud)

Load Files from Local Drive

The files were downloaded from the Apache SpamAssassin “Old - Public Corpus” page, which provides labeled email for building filters that block spam (unsolicited email).

This analysis used two datasets from the parent directory:

* 20021010 Easy Ham
* 20021010 Spam

The original files have a ‘.tar.bz2’ extension; they were downloaded to my local drive and unzipped.
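As an alternative to the manual download, the archives can be fetched and unpacked directly from R. A minimal sketch, assuming the archive file names listed on the SpamAssassin public corpus page:

base_url <- "https://spamassassin.apache.org/old/publiccorpus/"
# Archive names assumed from the public corpus listing
archives <- c("20021010_easy_ham.tar.bz2", "20021010_spam.tar.bz2")
for (f in archives) {
  download.file(paste0(base_url, f), destfile = f, mode = "wb")
  untar(f, exdir = "Data")  # untar handles .tar.bz2 compression directly
}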

spam_path <- "C:/Users/andre/OneDrive/Documents/GitHub/D607_Project4/Data/spam"
ham_path <- "C:/Users/andre/OneDrive/Documents/GitHub/D607_Project4/Data/easy_ham"

This function crawls through the directories and extracts the messages in their raw form.

make.data.frame <- function(path, class){
  # Dig through the directories for messages
  files <- list.files(path = path,
                      full.names = TRUE,
                      recursive = TRUE)
  # Read each file once the directories are expanded
  message <- lapply(files, read_file)
  # Collect into a dataframe and attach the class label
  message <- unlist(message)
  data <- as.data.frame(message)
  data$class <- class
  return(data)
}

Make SPAM and HAM dataframes and bind them

data_spam<-make.data.frame(spam_path, class="SPAM")
data_ham<-make.data.frame(ham_path, class="HAM")
data<-rbind(data_spam, data_ham)

Text Clean up

The dataframe includes a class column designating the document type, SPAM or HAM. We also add a numeric target column (1 = SPAM, 0 = HAM), which some of the classifiers below require.

data_spam<-data %>%
  filter(class == "SPAM") %>%
  mutate(target = 1)
data_ham<- data %>%
  filter(class == "HAM") %>%
  mutate(target = 0)
data<-rbind(data_spam, data_ham)
data$id <- 1:nrow(data)
DT::datatable(data %>%
              count(class, target),
         extensions = c('FixedColumns',"FixedHeader"),
          options = list(scrollX = TRUE,
                         paging=TRUE,
                         fixedHeader=TRUE))

Clean the message text: remove the HTML formatting, digits, punctuation, and newlines; lowercase the text; tokenize into paragraphs; and drop stop words.

data<-data %>%
  mutate(message= str_remove_all(message, pattern = "<.*?>")) %>%
  mutate(message= str_remove_all(message, pattern = "[:digit:]")) %>%
  mutate(message= str_remove_all(message, pattern = "[:punct:]")) %>%
  mutate(message= str_remove_all(message, pattern = "[\n]")) %>%
  mutate(message= str_to_lower(message)) %>%
  unnest_tokens(output=text,input=message,
                token="paragraphs",
                format="text") %>%
  anti_join(stop_words, by=c("text"="word"))
glimpse(data)
## Rows: 3,052
## Columns: 4
## $ class  <chr> "SPAM", "SPAM", "SPAM", "SPAM", "SPAM", "SPAM", "SPAM", "SPAM",~
## $ target <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ id     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ~
## $ text   <chr> "mv  bfcddbffccadbmv  ffbcebefcdceaaamv  cdcbcfabaemv  dfdeeaeb~
table(data$class)
## 
##  HAM SPAM 
## 2551  501

We will shuffle the rows of the dataset before creating the train/test splits.

set.seed(9450)
# randomize index
row_shuffle <- sample(nrow(data))
# reorder index
data<-data[row_shuffle,]
DT::datatable(data[1:5,c("class","id")],
         extensions = c('FixedColumns',"FixedHeader"),
          options = list(scrollX = TRUE,
                         paging=TRUE,
                         fixedHeader=TRUE,
                         pageLength = 5))

Convert the ‘class’ variable from character to factor

data$class <- as.factor(data$class)
prop.table(table(data$class))
## 
##       HAM      SPAM 
## 0.8358453 0.1641547

Document Term Matrix

The tm package is used to build a corpus from the messages, which is then vectorized into a document-term matrix.

text_corpus <- VCorpus(VectorSource(data$text))

Clean up the corpus

text_corpus = tm_map(text_corpus, content_transformer(stringi::stri_trans_tolower))
text_corpus = tm_map(text_corpus, removeNumbers)
text_corpus = tm_map(text_corpus, removePunctuation)
text_corpus = tm_map(text_corpus, stripWhitespace)
text_corpus = tm_map(text_corpus, removeWords, stopwords("english"))
text_corpus = tm_map(text_corpus, stemDocument)

Create a Document Term Matrix, a bag-of-words representation of the dataset: each row is a message, each column is a term, and each entry is the count of that term in the message.

# For tokens by message
text_dtm <- DocumentTermMatrix(text_corpus, control =
                                 list(stemming = TRUE))
dim(text_dtm)  # 3052 / 60912
## [1]  3052 60912

Remove sparse terms to shrink the matrix; the 0.999 threshold drops any term that is absent from more than 99.9% of the documents.

text_dtm <- removeSparseTerms(text_dtm, 0.999)
dim(text_dtm)  # 3052 / 9313
## [1] 3052 9313

Inspect a slice of the document-term matrix

inspect(text_dtm[50:70, 30:50])
## <<DocumentTermMatrix (documents: 21, terms: 21)>>
## Non-/sparse entries: 2/439
## Sparsity           : 100%
## Maximal term length: 8
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs abd abett abf abidjan abil abit abl aboard abolit abus
##   50   0     0   0       0    0    0   0      0      0    0
##   51   0     0   0       0    0    0   0      0      0    0
##   52   0     0   0       0    0    0   0      0      0    0
##   53   0     0   0       0    0    0   0      0      0    0
##   54   0     0   0       0    0    0   0      0      0    0
##   55   0     0   0       0    0    0   0      0      0    0
##   56   0     0   0       0    0    0   0      0      0    0
##   57   0     0   0       0    0    0   0      0      0    0
##   59   0     0   0       0    0    0   1      0      0    0
##   66   0     0   0       0    0    0   0      0      0    2

Word Frequency

freq<- sort(colSums(as.matrix(text_dtm)), decreasing=TRUE)
head(freq, 10) # the ten most frequent terms
##       sep     esmtp localhost       aug    receiv       oct   postfix istreceiv 
##     12104      9824      8775      6348      6030      5507      5412      5021 
##       thu       mon 
##      4718      4555

Visualizing Word Frequency

BarPlot

word_freq<- data.frame(word=names(freq), freq=freq)
head(word_freq)
##                word  freq
## sep             sep 12104
## esmtp         esmtp  9824
## localhost localhost  8775
## aug             aug  6348
## receiv       receiv  6030
## oct             oct  5507
word_freq_bp <- ggplot(subset(word_freq, freq > 2000), aes(x=reorder(word, -freq), y =freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x=element_text(angle=45, hjust=1))
word_freq_bp

Word Frequency in Emails

set.seed(3000)
wordcloud(words = word_freq$word, freq = word_freq$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(12, "Paired"))

Text Classification

Convert word frequency into a logical value.

The term frequencies are replaced by Boolean presence/absence features for the spam/ham classification.

convert_count <- function(x) {
  y <- ifelse(x > 0, 1,0)
  y <- factor(y, levels=c(0,1), labels=c("No", "Yes"))
  y
}

# Apply the convert_count function to get final training and testing DTMs
datasetNB <- apply(text_dtm, 2, convert_count)

dataset = as.data.frame(as.matrix(datasetNB))
dataset$class = data$class
str(dataset$class)
##  Factor w/ 2 levels "HAM","SPAM": 1 1 1 1 1 1 1 2 1 2 ...

Data Splitting Based on the Outcome

The function createDataPartition can be used to create balanced splits of the data. If the y argument to this function is a factor, the random sampling occurs within each class and should preserve the overall class distribution of the data. For example, to create a single 80/20% split of the data:

set.seed(3000)

# createDataPartition is a caret function
train.set <- createDataPartition(dataset$class, p=.8, list = FALSE, times = 1)
head(train.set)
##      Resample1
## [1,]         1
## [2,]         2
## [3,]         3
## [4,]         4
## [5,]         5
## [6,]         7

Create the train and test sets and check that the split preserves the class proportions.

dataTrain <- data[ train.set,]
dataTest <- data[ -train.set,]

prop.table(table(dataTrain$class))
## 
##       HAM      SPAM 
## 0.8357903 0.1642097
prop.table(table(dataTest$class))
## 
##       HAM      SPAM 
## 0.8360656 0.1639344

Model Fitting

Three machine learning algorithms are compared for this text classification:

* Random Forest, an ensemble of decision trees (fit with the randomForest package)
* XGBoost: eXtreme Gradient Boosting, a boosting package
* Naïve Bayes Classifier (NBC)

We build a model with each algorithm and compare the results to decide which performs best.

Random Forest Classifier

Random forest, like its name implies, consists of a large number of individual decision trees that operate as an ensemble. Each individual tree in the random forest spits out a class prediction and the class with the most votes becomes our model’s prediction.
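Load the randomForest package and fit a 300-tree classifier on the training set (printing the fitted object echoes the call and the out-of-bag confusion matrix below):

library(randomForest)
rf_classifier <- randomForest(x = dataTrain, y = dataTrain$class, ntree = 300)
rf_classifier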

## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## Call:
##  randomForest(x = dataTrain, y = dataTrain$class, ntree = 300) 
##                Type of random forest: classification
##                      Number of trees: 300
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 0%
## Confusion matrix:
##       HAM SPAM class.error
## HAM  2041    0           0
## SPAM    0  401           0

The rf_classifier accurately classified the messages as ham and spam, with a class error of 0, suggesting 100% accuracy of the model on the training set of observations.

Making Predictions and evaluating the Random Forest Classifier.

We evaluate the model on the test set to see whether it matches the 100% accuracy obtained on the training set.

# Predicting the Test set results
rf_pred = predict(rf_classifier, newdata = dataTest)

# Making the Confusion Matrix
confusionMatrix(table(rf_pred,dataTest$class))
## Confusion Matrix and Statistics
## 
##        
## rf_pred HAM SPAM
##    HAM  510    0
##    SPAM   0  100
##                                     
##                Accuracy : 1         
##                  95% CI : (0.994, 1)
##     No Information Rate : 0.8361    
##     P-Value [Acc > NIR] : < 2.2e-16 
##                                     
##                   Kappa : 1         
##                                     
##  Mcnemar's Test P-Value : NA        
##                                     
##             Sensitivity : 1.0000    
##             Specificity : 1.0000    
##          Pos Pred Value : 1.0000    
##          Neg Pred Value : 1.0000    
##              Prevalence : 0.8361    
##          Detection Rate : 0.8361    
##    Detection Prevalence : 0.8361    
##       Balanced Accuracy : 1.0000    
##                                     
##        'Positive' Class : HAM       
## 

The Random Forest Classifier (rf_classifier) performed well on this data set: the model accuracy is 1.0 with a 95% CI of (0.994, 1).

Naive Bayes Classifier

Naive Bayes is a machine learning model based on conditional probability, as given by Bayes’ Theorem, with the simplifying assumption that features are independent given the class. It is fast and easy to train.
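As a minimal illustration of that conditional probability (a sketch, assuming the Boolean dataset built above; “receiv” is one of the frequent stemmed terms from the frequency table), Bayes’ theorem gives the posterior probability that a message is SPAM given that a term is present:

# Sketch: P(SPAM | term present) for a single term, via Bayes' theorem
present <- dataset$receiv == "Yes"                              # term indicator
p_spam <- mean(dataset$class == "SPAM")                         # prior P(SPAM)
p_present_given_spam <- mean(present[dataset$class == "SPAM"])  # likelihood
p_present <- mean(present)                                      # evidence
p_present_given_spam * p_spam / p_present                       # posterior P(SPAM | present)

naiveBayes combines such per-term conditional probabilities across all terms, under the assumption that terms are independent given the class.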

control <- trainControl(method="repeatedcv", number=10, repeats=3)
# Note: trControl and tuneLength are caret::train() arguments; e1071::naiveBayes
# silently ignores them, so only the Laplace smoothing takes effect here.
system.time( classifier_nb <- naiveBayes(dataTrain, dataTrain$class, laplace = 1,
                                         trControl = control, tuneLength = 7) )
##    user  system elapsed 
##    4.81    0.00    4.81

Making Predictions and evaluating the Naive Bayes Classifier.

nb_pred = predict(classifier_nb, type = 'class', newdata = dataTest)

confusionMatrix(nb_pred, dataTest$class)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction HAM SPAM
##       HAM  510    0
##       SPAM   0  100
##                                     
##                Accuracy : 1         
##                  95% CI : (0.994, 1)
##     No Information Rate : 0.8361    
##     P-Value [Acc > NIR] : < 2.2e-16 
##                                     
##                   Kappa : 1         
##                                     
##  Mcnemar's Test P-Value : NA        
##                                     
##             Sensitivity : 1.0000    
##             Specificity : 1.0000    
##          Pos Pred Value : 1.0000    
##          Neg Pred Value : 1.0000    
##              Prevalence : 0.8361    
##          Detection Rate : 0.8361    
##    Detection Prevalence : 0.8361    
##       Balanced Accuracy : 1.0000    
##                                     
##        'Positive' Class : HAM       
## 

The Naive Bayes Classifier performed well on this data set: the model accuracy is 1.0 with a 95% CI of (0.994, 1).

XGBoost: eXtreme Gradient Boosting
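XGBoost fits an ensemble of gradient-boosted decision trees, adding trees round by round so that each new tree corrects the errors of the earlier rounds. Here it is trained directly on the (already shuffled) document-term matrix with the numeric target, using a separate 75/25 train/test split.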

# Hold out the first 25% of the (already shuffled) rows as the test set
test_split <- round(.25 * dim(text_dtm)[1])

test_text <- text_dtm[1:test_split, ]
train_text <- text_dtm[(test_split + 1):dim(text_dtm)[1], ]
test_target <- data$target[1:test_split]
train_target <- data$target[(test_split + 1):dim(data)[1]]
xgb <- xgboost(data = as.matrix(train_text), 
               label = as.vector(train_target),
               max.depth = 7, eta = 1, 
               nthread = 2, nrounds = 2,
               objective = "binary:logistic")
## [23:34:23] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [1]  train-logloss:0.129703 
## [2]  train-logloss:0.044723
xg_pred <- predict(xgb, as.matrix(test_text))
# Convert probabilities to binary
xg_pred<- ifelse(xg_pred >0.5, 1,0)
# Evaluate
confusionMatrix(data = factor(xg_pred, levels=c(1,0)),
                reference = factor(test_target, levels=c(1,0)),
                positive = "1", dnn = c("Prediction", "Actual"))
## Confusion Matrix and Statistics
## 
##           Actual
## Prediction   1   0
##          1 126   0
##          0   0 637
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9952, 1)
##     No Information Rate : 0.8349     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.1651     
##          Detection Rate : 0.1651     
##    Detection Prevalence : 0.1651     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 1          
## 

Conclusion

This text classification of emails was performed using three algorithms for comparison: Random Forest, Naive Bayes, and XGBoost. Each algorithm was trained on a subset of the data, and the resulting model’s effectiveness was tested on a held-out test subset. One of the difficulties faced during this project was making the program extract the data from a GitHub repository. To reproduce this project, change the spam_path and ham_path variables to the data’s path on your machine.

Source: GitHub | RPubs