The task is to create a program that classifies a text document using training documents that have already been classified. Specifically, the program will classify email as 'spam' (unwanted email) or 'ham' (wanted email):
# Load required packages
library(tidyverse)
library(stringr)
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(dplyr)
library(e1071)
library(caTools)
library(caret)
library(kernlab)
library(rpart)
library(plyr)
library(class)
library(knitr)
The data come from the SpamAssassin public corpus at http://spamassassin.apache.org/old/publiccorpus/:
20050311_spam_2.tar.bz2 for the spam data
20030228_easy_ham.tar.bz2 for the ham data
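A minimal sketch of fetching and unpacking the two corpora, assuming internet access and a writable working directory (the easy_ham and spam_2 folders are created by the archives themselves):
# Download the two SpamAssassin corpora and extract them locally
base_url <- "http://spamassassin.apache.org/old/publiccorpus"
for (f in c("20050311_spam_2.tar.bz2", "20030228_easy_ham.tar.bz2")) {
  download.file(file.path(base_url, f), destfile = f, mode = "wb")
  untar(f)  # creates the spam_2 / easy_ham directories
}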
# Create Ham Dataframe
ham_dir <- "C:/Desk top Stuff/CUNY Fall 2019 Folder/Data Acquisition/Project 4/easy_ham"
hamFileNames <- list.files(ham_dir)
nnrow <- length(hamFileNames)
nt <- 100 # use only the first 100 files for illustration; processing the full corpus would take too long
# List of docs
ham_docs_list <- list()
for (i in 1:nt) {
  filepath <- paste0(ham_dir, "/", hamFileNames[i])
  text <- readLines(filepath)
  list1 <- list(paste(text, collapse = "\n"))
  ham_docs_list <- c(ham_docs_list, list1)
}
# Ham data frame
hamDF <- as.data.frame(unlist(ham_docs_list), stringsAsFactors = FALSE)
# Assign the label "ham" to the ham data frame
hamDF$type <- "ham"
names(hamDF) <- c("text", "type")
# Create Spam Dataframe
spam_dir <- "C:/Desk top Stuff/CUNY Fall 2019 Folder/Data Acquisition/Project 4/spam_2"
spamFileNames <- list.files(spam_dir)
nnrow1 <- length(spamFileNames)
# List of docs
spam_docs_list <- list()
# again read only the first nt files, to match the ham sample
for (i in 1:nt) {
  filepath1 <- paste0(spam_dir, "/", spamFileNames[i])
  text1 <- readLines(filepath1)
  list2 <- list(paste(text1, collapse = "\n"))
  spam_docs_list <- c(spam_docs_list, list2)
}
# Spam data frame
spamDF <- as.data.frame(unlist(spam_docs_list), stringsAsFactors = FALSE)
# Assign the label "spam" to the spam data frame
spamDF$type <- "spam"
names(spamDF) <- c("text", "type")
# Combine ham and spam into a single data frame
df <- rbind(hamDF, spamDF)
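The two read loops above could also be expressed once as a small helper; the following is only a sketch (the helper name read_email_dir is hypothetical) and is equivalent in effect to the loops already shown:
# Read the first n files of a directory into a text/type data frame
read_email_dir <- function(dir, type, n = 100) {
  files <- head(list.files(dir, full.names = TRUE), n)
  texts <- sapply(files, function(f) paste(readLines(f), collapse = "\n"))
  data.frame(text = texts, type = type, stringsAsFactors = FALSE)
}
# df <- rbind(read_email_dir(ham_dir, "ham"), read_email_dir(spam_dir, "spam"))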
The clean-up process involves the following steps:
Creating the corpus
Removing numbers
Removing punctuation
Removing stopwords (common non-content words such as to, and, the that carry little meaning)
Removing excess white space
# These locale settings were needed because some messages contain Japanese and Chinese characters that caused errors
Sys.setlocale('LC_ALL','C')
## [1] "C"
Sys.setlocale("LC_ALL", "Japanese_Japan.932")
## [1] "LC_COLLATE=Japanese_Japan.932;LC_CTYPE=Japanese_Japan.932;LC_MONETARY=Japanese_Japan.932;LC_NUMERIC=C;LC_TIME=Japanese_Japan.932"
Sys.setlocale("LC_ALL","Japanese_Japan.20932")
## [1] "LC_COLLATE=Japanese_Japan.20932;LC_CTYPE=Japanese_Japan.20932;LC_MONETARY=Japanese_Japan.20932;LC_NUMERIC=C;LC_TIME=Japanese_Japan.20932"
# Custom transformer that replaces a matched pattern with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# Create a corpus from the text column of the data frame and clean it up with the tm package
df3 <- df %>%
filter(type=="spam")
emailCorpus <- Corpus(VectorSource(df3$text)) %>%
tm_map(toSpace, "/") %>%
tm_map(toSpace, "@") %>%
tm_map(toSpace, "\\|") %>%
tm_map(PlainTextDocument) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(removeWords, stopwords()) %>%
tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., toSpace, "/"): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., toSpace, "@"): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., toSpace, "\\|"): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., PlainTextDocument): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords()): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
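The "transformation drops documents" messages are warnings that tm_map() routinely emits for a SimpleCorpus; they are generally reported as benign and no documents appear to be lost here. If desired, the pattern below (a sketch; stripWhitespace is idempotent) silences them:
# Example pattern: wrap a tm_map() call in suppressWarnings() to silence the benign warning
emailCorpus <- suppressWarnings(tm_map(emailCorpus, stripWhitespace))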
# Create a term-document matrix for the spam emails
# A term-document matrix is the mathematical matrix that describes the frequency of terms occurring in a collection of documents
email_dtm <- TermDocumentMatrix(emailCorpus)
# Remove outliers consisting of very rare terms (infrequent-word outliers); sparse = 0.90 keeps terms that appear in at least 10% of documents
tdms.90 <- removeSparseTerms(email_dtm, sparse = 0.90)
m <- as.matrix(tdms.90 )
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
## word freq
## font font 1445
## will will 371
## email email 333
## received received 313
## can can 284
## hrefhttp hrefhttp 254
## jul jul 251
## free free 225
## netnoteinccom netnoteinccom 221
## sized sized 210
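To make the term-document structure concrete, a small slice of the matrix can be printed; each row is a term and each column a document (output omitted, and the exact counts will vary with the sample):
# Peek at the first few terms and documents of the spam term-document matrix
inspect(email_dtm[1:5, 1:3])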
# spam word cloud
set.seed(1234)
suppressWarnings(wordcloud(words = d$word, freq = d$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2")))
# Repeat the same cleaning for the ham subset
df4 <- df %>%
filter(type=="ham")
emailCorpus1 <- Corpus(VectorSource(df4$text)) %>%
tm_map(toSpace, "/") %>%
tm_map(toSpace, "@") %>%
tm_map(toSpace, "\\|") %>%
tm_map(PlainTextDocument) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(removeWords, stopwords()) %>%
tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., toSpace, "/"): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., toSpace, "@"): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., toSpace, "\\|"): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., PlainTextDocument): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords()): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
email_dtm1 <- TermDocumentMatrix(emailCorpus1)
# Remove outliers consisting of very rare terms (infrequent-word outliers)
tdms1.90 <- removeSparseTerms(email_dtm1, sparse = 0.90)
m1 <- as.matrix(tdms1.90 )
v1 <- sort(rowSums(m1),decreasing=TRUE)
d1 <- data.frame(word = names(v1),freq=v1)
head(d1, 10)
## word freq
## aug aug 683
## received received 678
## localhost localhost 620
## thu thu 367
## esmtp esmtp 366
## xentcom xentcom 351
## linuxie linuxie 339
## http http 329
## zzzz zzzz 318
## ilug ilug 254
# Ham word cloud
set.seed(1234)
suppressWarnings(wordcloud(words = d1$word, freq = d1$freq, min.freq = 1,
max.words=100, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2")))
Observation 1:
The ham word cloud shows more meaningful words than the spam word cloud.
# Share of ham vs. spam across the full corpora, based on the directory file counts
ham_percent <- nnrow / (nnrow + nnrow1)
spam_percent <- nnrow1 / (nnrow + nnrow1)
# Table of % split between Hams vs. Spams
pie <- matrix(c(64,36),ncol=2,byrow=TRUE)
colnames(pie) <- c("Ham","Spam")
rownames(pie) <- c("% split")
pie <- as.table(pie)
pie
## Ham Spam
## % split 64 36
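The same split can also be derived from the file counts rather than typed in by hand; a small sketch (the values should approximately reproduce the 64/36 split shown above):
# Derive the ham/spam percentage split from the directory file counts
slices_calc <- round(c(Ham = nnrow, Spam = nnrow1) / (nnrow + nnrow1) * 100)
slices_calc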
# 3D Exploded Pie Chart
library(plotrix)
slices <- c(64,36)
lbls <- c("Ham","Spam")
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add % to labels
lbls <- paste(lbls, "%", sep="") # add % sign
pie3D(slices, labels = lbls, col = rainbow(length(lbls)), main = "Pie Chart of Original Datasets, w/o Cutoffs: Hams vs. Spams")
80% of the data is partitioned for training
20% of the data is partitioned for testing (the hold-out set)
# Splitting the dataset into the Training set and Test set
df$type <- factor(df$type)
# Shuffle the rows of the data frame
set.seed(42)
rows <- sample(nrow(df))
df_final <- df[rows, ]
trainIndex <- createDataPartition(df_final$type, p=0.80, list=FALSE)
dataTrain <- df_final[trainIndex,]
dataTest <- df_final[-trainIndex,]
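createDataPartition() samples within each class, so the 80/20 split should preserve the ham/spam proportions in both sets; a quick check (not part of the original output):
# Verify that the class balance is similar in the training and test sets
prop.table(table(dataTrain$type))
prop.table(table(dataTest$type))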
# Clean the full corpus (lower-casing, removing numbers, punctuation, and stopwords) to improve model performance
cleanCorpus <- Corpus(VectorSource(df_final$text)) %>%
tm_map(toSpace, "/") %>%
tm_map(toSpace, "@") %>%
tm_map(toSpace, "\\|") %>%
tm_map(PlainTextDocument) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(removeWords, stopwords()) %>%
tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., toSpace, "/"): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., toSpace, "@"): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., toSpace, "\\|"): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., PlainTextDocument): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords()): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
# Subset the cleaned corpus with the same partition indices, then build document-term matrices
corpus_train <- cleanCorpus[trainIndex]
corpus_test <- cleanCorpus[-trainIndex]
email_train <- DocumentTermMatrix(corpus_train)
email_test <- DocumentTermMatrix(corpus_test)
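One refinement not applied in this run is to build the test matrix against the training vocabulary, so that both matrices share exactly the same columns; tm supports this through the dictionary control option (a sketch):
# Restrict the test document-term matrix to terms seen during training
email_test <- DocumentTermMatrix(corpus_test,
                                 control = list(dictionary = Terms(email_train)))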
The following helper, adapted from a snippet of code found online, converts the numeric entries in the term matrices into
factors that indicate whether a term is present or not; it turns count information into "Yes"/"No".
# Convert term counts to presence/absence factors
convert_count <- function(x) {
y <- ifelse(x > 0, 1,0)
y <- factor(y, levels=c(0,1), labels=c("No", "Yes"))
y
}
email_train <- apply(email_train, 2, convert_count)
email_test <- apply(email_test, 2, convert_count)
# Train the Naive Bayes classifier on the binarized training matrix
classifier <- naiveBayes(email_train, factor(dataTrain$type))
# Predicting the Test set results
test_pred <- predict(classifier, newdata=email_test)
confusionMatrix(test_pred, dataTest$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ham spam
## ham 19 0
## spam 1 20
##
## Accuracy : 0.975
## 95% CI : (0.8684, 0.9994)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 3.729e-11
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9500
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9524
## Prevalence : 0.5000
## Detection Rate : 0.4750
## Detection Prevalence : 0.4750
## Balanced Accuracy : 0.9750
##
## 'Positive' Class : ham
##
Observation 2: (Note: this was based on a small sample, for demonstration purposes only)
The Naive Bayes classifier performed very well on the hold-out test set, achieving 97.5% overall accuracy.
The model has a 95% sensitivity rate (the proportion of the positive class, ham, predicted as positive) and
achieved 100% specificity, the proportion of the negative class (spam) predicted accurately.
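One natural refinement, not run here, is Laplace smoothing: without it, a term seen in only one class during training drives the other class's conditional probability to zero for any message containing that term. e1071's naiveBayes() exposes this through the laplace argument (a sketch):
# Add-one smoothing so unseen term/class combinations do not zero out a class
classifier_smooth <- naiveBayes(email_train, factor(dataTrain$type), laplace = 1)
test_pred_smooth <- predict(classifier_smooth, newdata = email_test)
confusionMatrix(test_pred_smooth, dataTest$type)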