Document Classification - Ham / Spam
This assignment is to classify new “test” documents using already classified “training” documents from an open source anti-spam platform, Apache SpamAssassin.
Load Libraries
We will being to load libraries
library(tidyverse)
library(e1071)
library(tm)
library(tidytext)
library(dplyr)
library(caret)
library(xgboost)
library(SnowballC)
library(RColorBrewer)
library(ggplot2)
library(wordcloud)Load Files from Local Drive
The files were download from Apache SpamAssassin - Old - Public Corpus platform to access email and block spam (unsolicited email) to filter and classify.
This analysis used two datasets for the parent directory: * 20021010 Easy Ham * 20021010 Spam
The original file extension are ‘tar.bz2’, that was downloaded to my local drive and unzipped.
spam_path <- "C:/Users/andre/OneDrive/Documents/GitHub/D607_Project4/Data/spam"
ham_path <- "C:/Users/andre/OneDrive/Documents/GitHub/D607_Project4/Data/easy_ham"This function crawls through the files and extracts the messages in there raw form.
make.data.frame<- function(path, class){
# Dig through the directories for messages
files <- list.files(path=path,
full.names=TRUE,
recursive=TRUE)
# Read a file once directories are gone
message<-lapply(files, function(x) {
text_body<-read_file(x)
})
# Add to dataframe and assign "id" column
message<-unlist(message)
data<-as.data.frame(message)
data$class<-class
return (data)
}Make SPAM and HAM dataframes and bind them
data_spam<-make.data.frame(spam_path, class="SPAM")
data_ham<-make.data.frame(ham_path, class="HAM")
data<-rbind(data_spam, data_ham)Text Clean up
The dataframe will include a class column designated by document type, SPAM and HAM. Creating the dataframe with two variables and a numeric target class will be used in some classifiers.
data_spam<-data %>%
filter(class == "SPAM") %>%
mutate(target = 1)
data_ham<- data %>%
filter(class == "HAM") %>%
mutate(target = 0)
data<-rbind(data_spam, data_ham)
data$id <- 1:nrow(data)
DT::datatable(data %>%
count(class, target),
extensions = c('FixedColumns',"FixedHeader"),
options = list(scrollX = TRUE,
paging=TRUE,
fixedHeader=TRUE))Clean the dataframe text files, remove the html formatting, all punctuation, new lines, and digits.
data<-data %>%
mutate(message= str_remove_all(message, pattern = "<.*?>")) %>%
mutate(message= str_remove_all(message, pattern = "[:digit:]")) %>%
mutate(message= str_remove_all(message, pattern = "[:punct:]")) %>%
mutate(message= str_remove_all(message, pattern = "[\n]")) %>%
mutate(message= str_to_lower(message)) %>%
unnest_tokens(output=text,input=message,
token="paragraphs",
format="text") %>%
anti_join(stop_words, by=c("text"="word"))glimpse(data)## Rows: 3,052
## Columns: 4
## $ class <chr> "SPAM", "SPAM", "SPAM", "SPAM", "SPAM", "SPAM", "SPAM", "SPAM",~
## $ target <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ~
## $ text <chr> "mv bfcddbffccadbmv ffbcebefcdceaaamv cdcbcfabaemv dfdeeaeb~
table(data$class)##
## HAM SPAM
## 2551 501
We will rearrange the dataset for use in the train, test splits.
set.seed(9450)
# randomize index
row_shuffle <- sample(nrow(data))
# reorder index
data<-data[row_shuffle,]
DT::datatable(data[1:5,c("class","id")],
extensions = c('FixedColumns',"FixedHeader"),
options = list(scrollX = TRUE,
paging=TRUE,
fixedHeader=TRUE,
pageLength = 5))Convert the ‘class’ variable from character to factor
data$class <- as.factor(data$class)
prop.table(table(data$class))##
## HAM SPAM
## 0.8358453 0.1641547
Document Term Matrix
The tm package was used to vectorize the words into a corpus of messages..
text_corpus <- VCorpus(VectorSource(data$text))Clean-up corpus
text_corpus = tm_map(text_corpus, content_transformer(stringi::stri_trans_tolower))
text_corpus = tm_map(text_corpus, removeNumbers)
text_corpus = tm_map(text_corpus, removePunctuation)
text_corpus = tm_map(text_corpus, stripWhitespace)
text_corpus = tm_map(text_corpus, removeWords, stopwords("english"))
text_corpus = tm_map(text_corpus, stemDocument)Create a Document Term Matrix, presenting a bag-of-words vectorizer for each message in the dataset. The columns represents the count frequency of each word in the corpus.
# For tokens by message
text_dtm <- DocumentTermMatrix(text_corpus, control =
list(stemming = TRUE))
dim(text_dtm) # 3052 / 60912## [1] 3052 60912
Sparse terms removed, reducing the matrix
text_dtm <- removeSparseTerms(text_dtm, 0.999)
dim(text_dtm) # 3052 / 9313## [1] 3052 9313
Inspect the corpus
inspect(text_dtm[50:70, 30:50])## <<DocumentTermMatrix (documents: 21, terms: 21)>>
## Non-/sparse entries: 2/439
## Sparsity : 100%
## Maximal term length: 8
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs abd abett abf abidjan abil abit abl aboard abolit abus
## 50 0 0 0 0 0 0 0 0 0 0
## 51 0 0 0 0 0 0 0 0 0 0
## 52 0 0 0 0 0 0 0 0 0 0
## 53 0 0 0 0 0 0 0 0 0 0
## 54 0 0 0 0 0 0 0 0 0 0
## 55 0 0 0 0 0 0 0 0 0 0
## 56 0 0 0 0 0 0 0 0 0 0
## 57 0 0 0 0 0 0 0 0 0 0
## 59 0 0 0 0 0 0 1 0 0 0
## 66 0 0 0 0 0 0 0 0 0 2
Word Frequency
freq<- sort(colSums(as.matrix(text_dtm)), decreasing=TRUE)
head(freq, 10) # this is the least ## sep esmtp localhost aug receiv oct postfix istreceiv
## 12104 9824 8775 6348 6030 5507 5412 5021
## thu mon
## 4718 4555
Visualizing Word Frequency
BarPlot
word_freq<- data.frame(word=names(freq), freq=freq)
head(word_freq)## word freq
## sep sep 12104
## esmtp esmtp 9824
## localhost localhost 8775
## aug aug 6348
## receiv receiv 6030
## oct oct 5507
word_freq_bp <- ggplot(subset(word_freq, freq > 2000), aes(x=reorder(word, -freq), y =freq)) +
geom_bar(stat = "identity") +
theme(axis.text.x=element_text(angle=45, hjust=1))
word_freq_bpWord Frequency in Emails
set.seed(3000)
wordcloud(words = word_freq$word, freq = word_freq$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(12, "Paired"))Text Classification
Convert word frequency into logical value.
The term frequencies are replaced by Boolean presence/absence features for sentiment classification.
convert_count <- function(x) {
y <- ifelse(x > 0, 1,0)
y <- factor(y, levels=c(0,1), labels=c("No", "Yes"))
y
}
# Apply the convert_count function to get final training and testing DTMs
datasetNB <- apply(text_dtm, 2, convert_count)
dataset = as.data.frame(as.matrix(datasetNB))dataset$class = data$class
str(dataset$class)## Factor w/ 2 levels "HAM","SPAM": 1 1 1 1 1 1 1 2 1 2 ...
Data Splitting Based on the Outcome
The function createDataPartition can be used to create balanced splits of the data. If the y argument to this function is a factor, the random sampling occurs within each class and should preserve the overall class distribution of the data. For example, to create a single 80/20% split of the data:
set.seed(3000) # caret function
train.set <- createDataPartition(dataset$class, p=.8, list = FALSE, times = 1)
head(train.set)## Resample1
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 4
## [5,] 5
## [6,] 7
Create both train and test and check the proportion of the data split.
dataTrain <- data[ train.set,]
dataTest <- data[ -train.set,]
prop.table(table(dataTrain$class))##
## HAM SPAM
## 0.8357903 0.1642097
prop.table(table(dataTest$class))##
## HAM SPAM
## 0.8360656 0.1639344
Model Fitting
Analysis for this text classification
Random Forest via Ranger, an alternative package for fitting a random forest XGBoost, an alternative boosting package *Naïve Bayes Classifier (NBC)
We will be building our model on 3 different Machine Learning algorithms which are Random Forest, Naive Bayes and XGBoost: Extreme Gradient Boosting for the purpose of deciding which perform the best.
Random Forest Classifier
Random forest, like its name implies, consists of a large number of individual decision trees that operate as an ensemble. Each individual tree in the random forest spits out a class prediction and the class with the most votes becomes our model’s prediction.
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
##
## Call:
## randomForest(x = dataTrain, y = dataTrain$class, ntree = 300)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 0%
## Confusion matrix:
## HAM SPAM class.error
## HAM 2041 0 0
## SPAM 0 401 0
The rf_classifier was able to accurately classify the text messages as ham and spam respectively with the class error of 0 which suggest that there is 100% accuracy of the model on the training set of observations.
Making Predictions and evaluating the Random Forest Classifier.
We want to evaluate the model using the test_set and see if our model can match the 100% accuracy on this new set of data in comparison to the one obtained from the training set.
# Predicting the Test set results
rf_pred = predict(rf_classifier, newdata = dataTest)
# Making the Confusion Matrix
confusionMatrix(table(rf_pred,dataTest$class))## Confusion Matrix and Statistics
##
##
## rf_pred HAM SPAM
## HAM 510 0
## SPAM 0 100
##
## Accuracy : 1
## 95% CI : (0.994, 1)
## No Information Rate : 0.8361
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.8361
## Detection Rate : 0.8361
## Detection Prevalence : 0.8361
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : HAM
##
The Random Forest Classifier (rf_classifier) performed well on this data set as the model accuracy is 1.0 with a 95% CI of 0.994.
Naive Bayes Classifier
It is a Machine Learning model that is based upon the assumptions of conditional probability as proposed by Bayes’ Theorem. It is fast and easy.
control <- trainControl(method="repeatedcv", number=10, repeats=3)
system.time( classifier_nb <- naiveBayes(dataTrain, dataTrain$class, laplace = 1,
trControl = control,tuneLength = 7) )## user system elapsed
## 4.81 0.00 4.81
Making Predictions and evaluating the Naive Bayes Classifier.
nb_pred = predict(classifier_nb, type = 'class', newdata = dataTest)
confusionMatrix(nb_pred, dataTest$class)## Confusion Matrix and Statistics
##
## Reference
## Prediction HAM SPAM
## HAM 510 0
## SPAM 0 100
##
## Accuracy : 1
## 95% CI : (0.994, 1)
## No Information Rate : 0.8361
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.8361
## Detection Rate : 0.8361
## Detection Prevalence : 0.8361
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : HAM
##
Naive Bayes Classifier performed well on this data set as the model accuracy is 1.0 with a 95% CI of 0.994.
XGBoost: eXtreme Gradient Boosting
test_split<-round(.25*dim(text_dtm)[1])
test_text<-text_dtm[1:test_split,]
train_text<-text_dtm[(test_split+1):dim(text_dtm)[1],]test_target<-data$target[1:test_split]
train_target<-data$target[(test_split+1):dim(data)[1]]xgb <- xgboost(data = as.matrix(train_text),
label = as.vector(train_target),
max.depth = 7, eta = 1,
nthread = 2, nrounds = 2,
objective = "binary:logistic")## [23:34:23] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [1] train-logloss:0.129703
## [2] train-logloss:0.044723
xg_pred <- predict(xgb, as.matrix(test_text))# Convert probabilities to binary
xg_pred<- ifelse(xg_pred >0.5, 1,0)# Evaluate
confusionMatrix(data = factor(xg_pred, levels=c(1,0)),
reference = factor(test_target, levels=c(1,0)),
positive = "1", dnn = c("Prediction", "Actual"))## Confusion Matrix and Statistics
##
## Actual
## Prediction 1 0
## 1 126 0
## 0 0 637
##
## Accuracy : 1
## 95% CI : (0.9952, 1)
## No Information Rate : 0.8349
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.1651
## Detection Rate : 0.1651
## Detection Prevalence : 0.1651
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 1
##
Conclusion
This text classification of emails is performed using three algorithms for comparison purposes. The three algorithms used were Random Forest, Naive Bayes, and XGBoost. The algorithms created models and trained them using some of the data and tested the effectiveness of the model using a test subset of the data. One of the difficulties faced during this project was to make the program extract the data from a github repository. In order to reproduce this project, change the data_spam and data_ham variables to the respective data’s path on your machine.
References:
- Naïve Bayes Classifier · UC Business Analytics R Programming Guide. (2018). Github.io. http://uc-r.github.io/naive_bayes
- Kuhn, M. (n.d.). 3 Pre-Processing | The caret Package. In topepo.github.io. Retrieved November 15, 2021, from https://topepo.github.io/caret/pre-processing.html#putting-it-all-together
- Index of /old/publiccorpus. (n.d.). Spamassassin.apache.org. https://spamassassin.apache.org/old/publiccorpus/
Source: