Document Classification

Bikram Barua

11/11/2021

Project 4 (Document Classification)

Load the libraries

library(R.utils)
library(readr)
library(stringr)
library(tidytext)
library(dplyr)
library(tidyr)
library(tm)
library(RTextTools)
library(wordcloud2)
library(wordcloud)
library(e1071)

Introduction

The task is to assign documents to one or more classes or categories. For this project we have spam and ham folders, each containing a set of email documents. Classification techniques like this can be applied to discern spam from legitimate messages, route documents, determine the language of a text, analyze sentiment, and more.

The dataset is taken from the SpamAssassin public corpus. The files used are 20030228_easy_ham.tar.bz2 and 20030228_spam.tar.bz2.
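If the archives are not already on disk, they can be fetched and unpacked from R. A minimal sketch, assuming the SpamAssassin public corpus URL and the local target directory used below:

corpus_url <- "https://spamassassin.apache.org/old/publiccorpus/"
for (f in c("20030228_easy_ham.tar.bz2", "20030228_spam.tar.bz2")) {
  download.file(paste0(corpus_url, f), f, mode = "wb")  # save archive locally
  untar(f, exdir = "C:/MSDS/Project4")  # untar() decompresses .tar.bz2 directly
}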

Read the spam and ham files from the local PC
############# spam file processing #################
spam_path <- "C:/MSDS/Project4/spam/spam/"
spam_filenames <- list.files(spam_path)
              
spam_docs <- list()  # start empty (seeding with NA would add a bogus first document)
# Iterate over the files, collapsing each one into a single string
for (i in seq_along(spam_filenames)) {
  full_path <- paste0(spam_path, spam_filenames[i])
  text <- readLines(full_path, warn = FALSE)
  spam_docs <- c(spam_docs, paste(text, collapse = "\n"))
}
spam_df <- as.data.frame(unlist(spam_docs), stringsAsFactors = FALSE)
spam_df$type <- "spam"
colnames(spam_df) <- c("text","type")

################ ham file processing #################
ham_path <- "C:/MSDS/Project4/easy_ham/easy_ham/"
ham_filenames <- list.files(ham_path)

ham_docs <- list()  # start empty (seeding with NA would add a bogus first document)
# Iterate over the files, collapsing each one into a single string
for (i in seq_along(ham_filenames)) {
  full_path <- paste0(ham_path, ham_filenames[i])
  text <- readLines(full_path, warn = FALSE)
  ham_docs <- c(ham_docs, paste(text, collapse = "\n"))
}

ham_df <- as.data.frame(unlist(ham_docs), stringsAsFactors = FALSE)
ham_df$type <- "ham"
colnames(ham_df) <- c("text","type")

# Merge both dataframes
spam_ham_df <- rbind(ham_df, spam_df)
str(spam_ham_df)
## 'data.frame':    3004 obs. of  2 variables:
##  $ text: chr  NA "From exmh-workers-admin@redhat.com  Thu Aug 22 12:36:23 2002\nReturn-Path: <exmh-workers-admin@spamassassin.tai"| __truncated__ "From Steve_Burt@cursor-system.com  Thu Aug 22 12:46:39 2002\nReturn-Path: <Steve_Burt@cursor-system.com>\nDeliv"| __truncated__ "From timc@2ubh.com  Thu Aug 22 13:52:59 2002\nReturn-Path: <timc@2ubh.com>\nDelivered-To: zzzz@localhost.netnot"| __truncated__ ...
##  $ type: chr  "ham" "ham" "ham" "ham" ...
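Before shuffling, a quick sanity check that both classes made it into the merged frame:

table(spam_ham_df$type)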
Randomize the dataframe
set.seed(45)
rows <- sample(nrow(spam_ham_df))

random_df <- spam_ham_df[rows, ]
Clean data with Corpus
rand_corpus <- Corpus(VectorSource(random_df$text))


# Translate all letters to lower case
clean_corpus <- tm_map(rand_corpus, content_transformer(tolower))

# Remove URLs before stripping punctuation, while "http..." is still intact
removeURL <- function(x) gsub("http\\S*", "", x)
clean_corpus <- tm_map(clean_corpus, content_transformer(removeURL))

# Replace remaining non-word characters with spaces
clean_corpus <- tm_map(clean_corpus, content_transformer(gsub),
                       pattern = "\\W", replacement = " ")

# remove numbers
clean_corpus <- tm_map(clean_corpus, removeNumbers)
# remove punctuation
clean_corpus <- tm_map(clean_corpus, removePunctuation)
# remove stop words
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords())
# remove whitespace
clean_corpus <- tm_map(clean_corpus, stripWhitespace)
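A spot check on a couple of cleaned documents helps confirm the transformations behaved as intended:

inspect(clean_corpus[1:2])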
Create Document Term Matrix
email_dtm <- DocumentTermMatrix(clean_corpus)

# Most frequent terms for each document:
head(findMostFreqTerms(email_dtm))
## $`1`
##        com       said friendship       fork       xent    friends 
##         30         23         20         18         17         15 
## 
## $`2`
##     com    fork    xent     aug     mon writing 
##      25      19      18       8       8       8 
## 
## $`3`
##       net freshrpms       rpm      alsa      list   zzzlist 
##        32        26        25        22        16        12 
## 
## $`4`
##   font    div  color  align   face darial 
##     62     44     25     22     17     16 
## 
## $`5`
##    social     linux       aug       org  received localhost 
##        15        14        11         9         9         8 
## 
## $`6`
##  com fork xent  aug  org  wed 
##   24   16   16    9    9    8
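Since tidytext is loaded, the same matrix can also be viewed in tidy form to get corpus-wide term totals rather than per-document ones; a small sketch:

email_tidy <- tidy(email_dtm)  # one row per (document, term, count)
email_tidy %>%
  count(term, wt = count, sort = TRUE) %>%  # corpus-wide term totals
  head(10)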
# Remove sparse terms: the second argument is the maximum allowed sparsity,
# so 0.10 keeps only terms present in at least 90% of the documents
email_dtm <- removeSparseTerms(email_dtm, 0.10)
inspect(email_dtm)
## <<DocumentTermMatrix (documents: 3004, terms: 22)>>
## Non-/sparse entries: 62890/3198
## Sparsity           : 5%
## Maximal term length: 12
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   com esmtp from http localhost org postfix received spamassassin subject
##   1527  43     9    3    0         9  22       3       13           20       4
##   2154  61     3    2   52         5   8       1        6            5       1
##   2388  49     3    2   37         5  17       2        5           14       1
##   2574  33     8    9    1         9  26       3       12           23       4
##   2731 175     2    2  141         5  16       1        4           11       1
##   72    79     3    2   68         5   8       1        6            5       1
##   736    0     1    2    0         5  58       1        4            1       1
##   829    1     1    2    0         5  40       1        4            1       1
##   852   40     5    4    4        11  24       3       14           21       4
##   870   42     4    3    4        11  23       3       13           21       4
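The sparse argument is the maximum share of documents a term may be absent from, so 0.10 keeps only near-universal terms (mostly email-header vocabulary, as the sample above shows). A quick sketch of how the vocabulary size responds to the threshold, rebuilding the full matrix first:

full_dtm <- DocumentTermMatrix(clean_corpus)
sapply(c(0.10, 0.50, 0.95), function(s) ncol(removeSparseTerms(full_dtm, s)))  # terms kept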
Separate Spam and Ham text
## Spam
spam_only <- which(random_df$type == "spam")
spam_only[1:5]
## [1]  4  5 31 49 50
## Ham
ham_only <- which(random_df$type == "ham")
ham_only[1:5]
## [1] 1 2 3 6 7
Create Word Cloud for Spam emails
wordcloud(clean_corpus[spam_only], min.freq=50, max.words=75, random.order=FALSE, rot.per=0.60, 
          colors=c(1:4),random.color=TRUE)

Create Word Cloud for Ham emails
wordcloud(clean_corpus[ham_only], min.freq=50, max.words=75, random.order=FALSE, rot.per=0.60, 
          colors=c(1:4),random.color=TRUE)
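wordcloud2 is loaded above but never used; it renders an interactive HTML widget from a word-frequency data frame. A hedged sketch for the spam subset:

spam_dtm <- DocumentTermMatrix(clean_corpus[spam_only])
spam_freq <- sort(colSums(as.matrix(spam_dtm)), decreasing = TRUE)  # term totals
wordcloud2(data.frame(word = names(spam_freq), freq = spam_freq)[1:100, ])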

Building the Spam filter

## Divide the corpus into training and test datasets
# Split the dataframe (the rows are already shuffled, so a head/tail split is random)
training_df <- random_df[1:1800, ]
test_df <- random_df[1801:nrow(random_df), ]

# Split the Corpus over the same rows so features and labels stay aligned
corpus_train <- clean_corpus[1:1800]
corpus_test <- clean_corpus[1801:length(clean_corpus)]
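Because the rows were shuffled, the two splits should carry roughly the same spam/ham mix; a quick check:

prop.table(table(training_df$type))
prop.table(table(test_df$type))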
Create DocumentTermMatrix for training and test
training_dtm <- DocumentTermMatrix(corpus_train)
# Restrict the test matrix to the training vocabulary so the columns
# seen by predict() match the ones the model was trained on
test_dtm <- DocumentTermMatrix(corpus_test,
                               control = list(dictionary = Terms(training_dtm)))
Create the function to convert count information to “Yes” or “No”
# Naive Bayes works with categorical features, so each word count in the
# document-term matrices is converted to a presence/absence factor ("Yes"/"No")
convert_count <- function(x) {
  y <- ifelse(x > 0, 1,0)
  y <- factor(y, levels=c(0,1), labels=c("No", "Yes"))
  y
}
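A quick call shows what the function does to a vector of counts:

convert_count(c(0, 2, 5, 0))
## [1] No  Yes Yes No 
## Levels: No Yes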
Convert the Document-Term Matrix
training_dtm <- apply(training_dtm, 2, convert_count)
test_dtm <- apply(test_dtm, 2, convert_count)
The Naive Bayes Function
# With the corrected corpus split above, features and labels now have
# matching lengths (1800 each), so training should proceed without error
training_classifier <- naiveBayes(training_dtm, factor(training_df$type))
class(training_classifier)
Predict Function to test model
test_pred <- predict(training_classifier, newdata = test_dtm)
Check predictions
table(test_pred, test_df$type)
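Once the prediction runs, overall accuracy can be read off the confusion matrix:

conf_mat <- table(test_pred, test_df$type)
sum(diag(conf_mat)) / sum(conf_mat)  # proportion of emails classified correctly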

Conclusion

The Naive Bayes classifier is a classification algorithm based on Bayes' theorem. It treats all the features of a data object as independent of each other given the class. It is fast, scales well to large datasets, and often achieves good accuracy with relatively little training data.
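In symbols, for a message containing words w1, …, wn, the classifier scores each class as

P(spam | w1, …, wn) ∝ P(spam) × P(w1 | spam) × … × P(wn | spam)

and the "naive" independence assumption is exactly what allows the per-word probabilities to be multiplied.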

Naive Bayes is one of the standard methods for spam filtering, and the confusion matrix from the prediction step shows how many ham and spam emails the model classifies correctly. My original run failed with the error "all arguments must have the same length" in the naiveBayes function. The cause was a mismatch in the train/test split: the corpus was subset as clean_corpus[1:800] while the corresponding labels covered rows 1 to 1800, so the feature matrix and the label vector had different lengths. Subsetting the corpus over the same 1:1800 range as the dataframe, as corrected above, should resolve the error.