Project 4 (Document Classification)
Load the libraries
library(R.utils)
library(readr)
library(stringr)
library(tidytext)
library(dplyr)
library(tidyr)
library(tm)
library(RTextTools)
library(wordcloud2)
library(wordcloud)
library(e1071)
Introduction
The task is to assign documents to one or more classes or categories. For this project we have a spam folder and a ham folder, each containing a set of documents. Classification techniques can be applied to discern messages, understand routing, determine the language of a text, analyse sentiment, etc.
The dataset is taken from the public corpus (publiccorpus_link). The files used are 20030228_easy_ham.tar.bz2 and 20030228_spam.tar.bz2.
Read the spam and ham files from the local PC
############# spam file processing #################
spam_path <- "C:/MSDS/Project4/spam/spam/"
spam_filenames <- list.files(spam_path)
# Read every file and collapse its lines into one newline-separated string,
# giving exactly one character element per document.
# The vector is built directly with vapply() rather than appended to an NA
# placeholder, which would add an all-NA document row labelled "spam".
# vapply() also copes with an empty directory, where a 1:length() loop would
# iterate over c(1, 0).
spam_docs <- vapply(
  spam_filenames,
  function(file_name) {
    paste(readLines(file.path(spam_path, file_name)), collapse = "\n")
  },
  character(1),
  USE.NAMES = FALSE
)
# One row per document: the raw message text and its class label.
# rep() keeps the construction valid even when no files were found.
spam_df <- data.frame(
  text = spam_docs,
  type = rep("spam", length(spam_docs)),
  stringsAsFactors = FALSE
)
################ ham file processing #################
ham_path <- "C:/MSDS/Project4/easy_ham/easy_ham/"
ham_filenames <- list.files(ham_path)
# Read every file and collapse its lines into one newline-separated string,
# giving exactly one character element per document.
# The vector is built directly with vapply() rather than appended to an NA
# placeholder, which would add an all-NA document row labelled "ham".
# vapply() also copes with an empty directory, where a 1:length() loop would
# iterate over c(1, 0).
ham_docs <- vapply(
  ham_filenames,
  function(file_name) {
    paste(readLines(file.path(ham_path, file_name)), collapse = "\n")
  },
  character(1),
  USE.NAMES = FALSE
)
# One row per document: the raw message text and its class label.
# rep() keeps the construction valid even when no files were found.
ham_df <- data.frame(
  text = ham_docs,
  type = rep("ham", length(ham_docs)),
  stringsAsFactors = FALSE
)
# Merge both dataframes: ham rows first, spam rows appended below them
spam_ham_df <- rbind(ham_df, spam_df)
# Show the structure (row count, column names and types) of the merged data
str(object = spam_ham_df)
## 'data.frame': 3004 obs. of 2 variables:
## $ text: chr NA "From exmh-workers-admin@redhat.com Thu Aug 22 12:36:23 2002\nReturn-Path: <exmh-workers-admin@spamassassin.tai"| __truncated__ "From Steve_Burt@cursor-system.com Thu Aug 22 12:46:39 2002\nReturn-Path: <Steve_Burt@cursor-system.com>\nDeliv"| __truncated__ "From timc@2ubh.com Thu Aug 22 13:52:59 2002\nReturn-Path: <timc@2ubh.com>\nDelivered-To: zzzz@localhost.netnot"| __truncated__ ...
## $ type: chr "ham" "ham" "ham" "ham" ...
Randomize the dataframe
# Fix the RNG seed so the shuffle below is repeatable from run to run
set.seed(45)
# A random permutation of the row indices of the merged data frame
rows <- sample.int(nrow(spam_ham_df))
# Reorder the rows according to that permutation
random_df <- spam_ham_df[rows, , drop = FALSE]
# Clean data with Corpus
rand_corpus <- Corpus(VectorSource(random_df$text))
# Translate all letters to lower case.
# The lower-cased corpus feeds the rest of the pipeline, so that stop-word
# removal (which compares against the lower-case stopwords() list) also
# catches capitalised words.
lower_corpus <- tm_map(rand_corpus, content_transformer(tolower))
# Remove URLs: "http" followed by any run of non-whitespace characters.
# This must run BEFORE non-word characters are blanked out, because after that
# step "://", "." and "/" are gone and only the bare token "http" would match.
removeURL <- function(x) gsub("http\\S*", "", x)
clean_corpus <- tm_map(lower_corpus, content_transformer(removeURL))
# Clean data: replace every non-word character with a space
clean_corpus <- tm_map(
  clean_corpus,
  content_transformer(function(x) gsub("\\W", " ", x))
)
# remove numbers
clean_corpus <- tm_map(clean_corpus, removeNumbers)
# remove punctuation
clean_corpus <- tm_map(clean_corpus, removePunctuation)
# remove stop words
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords())
# remove whitespace
clean_corpus <- tm_map(clean_corpus, stripWhitespace)
# Create Document Term Matrix
# Build the document-term matrix from the cleaned corpus
email_dtm <- DocumentTermMatrix(x = clean_corpus)
# Most frequent terms for each document (head() shows the first six):
head(findMostFreqTerms(email_dtm))
## $`1`
## com said friendship fork xent friends
## 30 23 20 18 17 15
##
## $`2`
## com fork xent aug mon writing
## 25 19 18 8 8 8
##
## $`3`
## net freshrpms rpm alsa list zzzlist
## 32 26 25 22 16 12
##
## $`4`
## font div color align face darial
## 62 44 25 22 17 16
##
## $`5`
## social linux aug org received localhost
## 15 14 11 9 9 8
##
## $`6`
## com fork xent aug org wed
## 24 16 16 9 9 8
# Removing Sparse terms
# Keep only terms whose sparsity is below 0.10, then print a summary and a
# sample of the reduced matrix.
# NOTE(review): 0.10 is a very strict threshold and leaves only a handful of
# terms; confirm this value is intended.
email_dtm <- removeSparseTerms(email_dtm, sparse = 0.10)
inspect(email_dtm)
## <<DocumentTermMatrix (documents: 3004, terms: 22)>>
## Non-/sparse entries: 62890/3198
## Sparsity : 5%
## Maximal term length: 12
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs com esmtp from http localhost org postfix received spamassassin subject
## 1527 43 9 3 0 9 22 3 13 20 4
## 2154 61 3 2 52 5 8 1 6 5 1
## 2388 49 3 2 37 5 17 2 5 14 1
## 2574 33 8 9 1 9 26 3 12 23 4
## 2731 175 2 2 141 5 16 1 4 11 1
## 72 79 3 2 68 5 8 1 6 5 1
## 736 0 1 2 0 5 58 1 4 1 1
## 829 1 1 2 0 5 40 1 4 1 1
## 852 40 5 4 4 11 24 3 14 21 4
## 870 42 4 3 4 11 23 3 13 21 4
Separate Spam and Ham text
## Spam
# Row positions in the shuffled data frame whose label is "spam"
spam_only <- which(random_df[["type"]] == "spam")
spam_only[seq_len(5)]
## [1] 4 5 31 49 50
## Ham
# Row positions in the shuffled data frame whose label is "ham"
ham_only <- which(random_df[["type"]] == "ham")
ham_only[seq_len(5)]
## [1] 1 2 3 6 7
Create Word Cloud for Spam emails
# Word cloud of the cleaned spam documents: only terms occurring at least 50
# times, at most 75 words, drawn in decreasing order of frequency
wordcloud(
  words = clean_corpus[spam_only],
  min.freq = 50,
  max.words = 75,
  random.order = FALSE,
  rot.per = 0.60,
  colors = 1:4,
  random.color = TRUE
)
# Create Word Cloud for Ham emails
# Word cloud of the cleaned ham documents: only terms occurring at least 50
# times, at most 75 words, drawn in decreasing order of frequency
wordcloud(
  words = clean_corpus[ham_only],
  min.freq = 50,
  max.words = 75,
  random.order = FALSE,
  rot.per = 0.60,
  colors = 1:4,
  random.color = TRUE
)
# Building the Spam filter
## Divide the corpus into training and test datasets
# Compute the row indices once and reuse them for both the data frame and the
# corpus, so the documents and their labels are always cut at the same rows.
# Cutting the corpus at a different size than the data frame leaves the
# classifier with fewer feature rows than labels.
train_size <- 1800L
n_docs <- nrow(random_df)
train_idx <- seq_len(min(train_size, n_docs))
test_idx <- setdiff(seq_len(n_docs), train_idx)
# Split the dataframe
training_df <- random_df[train_idx, ]
test_df <- random_df[test_idx, ]
# Split the Corpus (built from random_df$text, so it shares the same ordering)
corpus_train <- clean_corpus[train_idx]
corpus_test <- clean_corpus[test_idx]
# Create DocumentTermMatrix for training and test
# Build the training matrix first; its terms define the feature space
training_dtm <- DocumentTermMatrix(corpus_train)
# Tabulate the test documents against the training vocabulary only, so both
# matrices share the same term columns. Building the two matrices
# independently gives each its own vocabulary, and a model fitted on one
# cannot be applied column-for-column to the other.
test_dtm <- DocumentTermMatrix(
  corpus_test,
  control = list(dictionary = Terms(training_dtm))
)
# Create the function to convert count information to “Yes” or “No”
# Turn term counts into a presence/absence factor for Naive Bayes.
# x: numeric vector of counts.
# Returns a factor with levels "No" (count of 0 or less) and "Yes" (count
# above 0); NA counts stay NA and element names are kept.
convert_count <- function(x) {
  is_present <- x > 0
  factor(is_present, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
}
# Convert the Document-Term Matrix
# Apply convert_count to every column (MARGIN = 2, i.e. every term) of the
# training and test matrices, replacing counts with "No"/"Yes" indicators
training_dtm <- apply(X = training_dtm, MARGIN = 2, FUN = convert_count)
test_dtm <- apply(X = test_dtm, MARGIN = 2, FUN = convert_count)
# The Naive Bayes Function
##training_classifier <- naiveBayes(training_dtm, factor(training_df$type))
##class(training_classifier)
Predict Function to test model
##test_pred <- predict(training_classifier, newdata=test_dtm)
Check predictions
##table(test_pred, test_df$type)
Conclusion
The Naive Bayes classifier is a classification algorithm based on Bayes’s theorem. It treats all the features of a data object as independent of each other. It is very fast and useful for large datasets, and it can achieve very accurate results with very little training.
Naive Bayes is one of the best methods for spam filtering, and the results should have correctly classified the ham and spam emails. Unfortunately, I ran into the error “all arguments must have the same length” in the naiveBayes function.