DATA 607 Project 4 - Document Classification
HAZAL GUNDUZ
It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/
Load Libraries
library(tm)
## Loading required package: NLP
library(RCurl)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
library(wordcloud)
## Loading required package: RColorBrewer
library(data.table)
##
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
## The following object is masked from 'package:RCurl':
##
## complete
library(readr)
library(e1071)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
length(list.files("./spam"))
## [1] 500
length(list.files("./easy_ham"))
## [1] 2501
spam_fnames <- list.files("./spam")
spam_fnames[1:6]
## [1] "00001.7848dde101aa985090474a91ec93fcf0"
## [2] "00002.d94f1b97e48ed3b553b3508d116e6a09"
## [3] "00003.2ee33bc6eacdb11f38d052c44819ba6c"
## [4] "00004.eac8de8d759b7e74154f142194282724"
## [5] "00005.57696a39d7d84318ce497886896bf90d"
## [6] "00006.5ab5620d3d7c6c0db76234556a16f6c1"
ham_fnames <- list.files("./easy_ham")
ham_fnames[1:6]
## [1] "00001.7c53336b37003a9286aba55d2945844c"
## [2] "00002.9c4069e25e1ef370c078db7ee85ff9ac"
## [3] "00003.860e3c3cee1b42ead714c5c874fe25f7"
## [4] "00004.864220c5b6930b209cc287c361c99af1"
## [5] "00005.bf27cdeaf0b8c4647ecd61b1d09da613"
## [6] "00006.253ea2f9a9cc36fa0b1129b04b806608"
folders <- c("./easy_ham", "./spam")
categories <- c("easy_ham", "spam")
symbs <- c("h", "s")
read_folder_to_df <- function(folders, categories, symbs, vec_fnames) {
  df <- tibble()
  n <- length(folders)
  for (i in 1:n) {
    folder   <- folders[i]
    category <- categories[i]
    # read every file in the folder, keeping one row per line of text
    temp <- tibble(file = dir(folder, full.names = TRUE)) %>%
      mutate(text = map(file, read_lines)) %>%
      transmute(category = category, id = basename(file), text) %>%
      unnest(text)
    df <- bind_rows(df, temp)
  }
  return(df)
}
# Creating the corpus tibble with the helper defined above
corpus_tibble <- read_folder_to_df(folders, categories, symbs,
                                   list(ham_fnames, spam_fnames))
corpus_tibble
str(ham_fnames)
## chr [1:2501] "00001.7c53336b37003a9286aba55d2945844c" ...
str(spam_fnames)
## chr [1:500] "00001.7848dde101aa985090474a91ec93fcf0" ...
Read the CSV File
spam_ham_csv <- read.csv("~/Dropbox/Mac/Downloads/spam_ham.csv", stringsAsFactors = FALSE)
str(spam_ham_csv)
## 'data.frame': 2801 obs. of 3 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ text: chr NA "From ilug-admin@linux.ie Tue Aug 6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "From ilug-admin@linux.ie Tue Aug 6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ "From ilug-admin@linux.ie Tue Aug 6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ ...
## $ type: chr "ham" "ham" "ham" "ham" ...
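The shuffle below is not seeded, so the exact split (and the accuracy figures at the end) will vary from run to run; a minimal sketch, assuming any fixed seed is acceptable:
# optional: fix the RNG seed so the shuffle below is reproducible (607 is arbitrary)
set.seed(607)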
random_spam_ham <- spam_ham_csv[sample(nrow(spam_ham_csv)),]
str(random_spam_ham)
## 'data.frame': 2801 obs. of 3 variables:
## $ X : int 2784 1953 1630 130 1580 95 1528 613 612 2697 ...
## $ text: chr "Return-Path: <exmh-workers-admin@spamassassin.taint.org>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived"| __truncated__ "Return-Path: <exmh-workers-admin@spamassassin.taint.org>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived"| __truncated__ "Return-Path: <exmh-workers-admin@spamassassin.taint.org>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived"| __truncated__ "From ilug-admin@linux.ie Tue Aug 6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@loca"| __truncated__ ...
## $ type: chr "spam" "spam" "spam" "ham" ...
sms_corpus <- Corpus(VectorSource(random_spam_ham$text))
print(sms_corpus)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 2801
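The "transformation drops documents" warnings in the next section are an artifact of tm's SimpleCorpus backend and are harmless here. A sketch of an alternative that avoids them, assuming the rest of the pipeline stays the same:
# optional: a VCorpus is not subject to the SimpleCorpus warnings below
sms_corpus <- VCorpus(VectorSource(random_spam_ham$text))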
Cleaning the Corpus Text
clean_corpus <- tm_map(sms_corpus, tolower)
## Warning in tm_map.SimpleCorpus(sms_corpus, tolower): transformation drops
## documents
clean_corpus <- tm_map(clean_corpus, content_transformer(gsub), pattern = "\\W", replacement = " ")
## Warning in tm_map.SimpleCorpus(clean_corpus, content_transformer(gsub), pattern =
## "\\W", : transformation drops documents
# removing URLs: "http" followed by any run of non-space characters
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
clean_corpus <- tm_map(clean_corpus, content_transformer(removeURL))
# removing numbers
clean_corpus <- tm_map(clean_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(clean_corpus, removeNumbers): transformation
## drops documents
# removing punctuation
clean_corpus <- tm_map(clean_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(clean_corpus, removePunctuation): transformation
## drops documents
## removing stop words
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords())
## Warning in tm_map.SimpleCorpus(clean_corpus, removeWords, stopwords()):
## transformation drops documents
## removing whitespace
clean_corpus <- tm_map(clean_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(clean_corpus, stripWhitespace): transformation
## drops documents
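A quick sanity check on the cleaned corpus is to look at the start of the first document:
# peek at the first 200 characters of the first cleaned document
substr(as.character(clean_corpus[[1]]), 1, 200)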
Convert to a Document-Term Matrix
sms_dtm <- DocumentTermMatrix(clean_corpus)
inspect(sms_dtm)
## <<DocumentTermMatrix (documents: 2801, terms: 556)>>
## Non-/sparse entries: 887448/669908
## Sparsity : 43%
## Maximal term length: 15
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs com exmh invoked line list org spamassassin taint within workers
## 1 18 33 25 23 17 24 22 22 25 22
## 10 18 33 25 23 17 24 22 22 25 22
## 15 18 33 25 23 17 24 22 22 25 22
## 16 18 33 25 23 17 24 22 22 25 22
## 17 18 33 25 23 17 24 22 22 25 22
## 19 18 33 25 23 17 24 22 22 25 22
## 2 18 33 25 23 17 24 22 22 25 22
## 3 18 33 25 23 17 24 22 22 25 22
## 5 18 33 25 23 17 24 22 22 25 22
## 7 18 33 25 23 17 24 22 22 25 22
sms_dtm <- removeSparseTerms(sms_dtm, 0.10)
After Removing the Sparse Terms
A sparse threshold of 0.10 keeps only terms that appear in at least 90% of the documents, which cuts the matrix from 556 terms down to 78.
inspect(sms_dtm)
## <<DocumentTermMatrix (documents: 2801, terms: 78)>>
## Non-/sparse entries: 218322/156
## Sparsity : 0%
## Maximal term length: 10
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs aug com esmtp list localhost message one org received within
## 1 12 18 6 17 9 10 7 24 10 25
## 10 12 18 6 17 9 10 7 24 10 25
## 15 12 18 6 17 9 10 7 24 10 25
## 16 12 18 6 17 9 10 7 24 10 25
## 17 12 18 6 17 9 10 7 24 10 25
## 19 12 18 6 17 9 10 7 24 10 25
## 2 12 18 6 17 9 10 7 24 10 25
## 3 12 18 6 17 9 10 7 24 10 25
## 5 12 18 6 17 9 10 7 24 10 25
## 7 12 18 6 17 9 10 7 24 10 25
Spam
just_spam <- which(random_spam_ham$type == "spam")
just_spam[1:5]
## [1] 1 2 3 5 7
Ham
just_ham <- which(random_spam_ham$type == "ham")
just_ham[1:5]
## [1] 4 6 8 9 11
# ~60/40 split of the shuffled data into training and test sets
sms_raw_train <- random_spam_ham[1:1680,]
sms_raw_test <- random_spam_ham[1681:2801,]
sms_dtm_train <- sms_dtm[1:1680, ]
sms_dtm_test <- sms_dtm[1681:2801,]
sms_corpus_train <- clean_corpus[1:1680]
sms_corpus_test <- clean_corpus[1681:2801]
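Because the rows were shuffled, this simple index split should leave similar class proportions in each set; a quick check:
# compare spam/ham proportions in the train and test sets
prop.table(table(sms_raw_train$type))
prop.table(table(sms_raw_test$type))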
spam <- subset(sms_raw_train, type == "spam")
ham <- subset(sms_raw_train, type == "ham")
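The wordcloud package was loaded above but never used; with the spam and ham subsets in hand, a quick visual comparison of frequent terms could look like this (max.words and scale are arbitrary choices):
# illustrative word clouds for each class
wordcloud(spam$text, max.words = 50, scale = c(3, 0.5))
wordcloud(ham$text, max.words = 50, scale = c(3, 0.5))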
Create Document-Term Matrices for Train and Test
sms_train <- DocumentTermMatrix(sms_corpus_train)
sms_test <- DocumentTermMatrix(sms_corpus_test)
sms_train
## <<DocumentTermMatrix (documents: 1680, terms: 556)>>
## Non-/sparse entries: 528668/405412
## Sparsity : 43%
## Maximal term length: 15
## Weighting : term frequency (tf)
sms_test
## <<DocumentTermMatrix (documents: 1121, terms: 556)>>
## Non-/sparse entries: 358780/264496
## Sparsity : 42%
## Maximal term length: 15
## Weighting : term frequency (tf)
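One caveat before converting to features: building separate DTMs for the train and test corpora can produce different vocabularies, which would misalign the columns the classifier sees. A sketch of a safer construction, restricting the test DTM to the training vocabulary (dictionary is a standard DocumentTermMatrix control option):
# rebuild the test DTM using only terms seen during training
train_vocab <- Terms(sms_train)
sms_test <- DocumentTermMatrix(sms_corpus_test, control = list(dictionary = train_vocab))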
# convert raw term counts to a presence/absence factor, since naiveBayes()
# treats factor predictors as categorical features
convert_count <- function(x) {
  y <- ifelse(x > 0, 1, 0)
  y <- factor(y, levels = c(0, 1), labels = c("No", "Yes"))
  y
}
sms_train <- apply(sms_train, 2, convert_count)
sms_test <- apply(sms_test, 2, convert_count)
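As a quick illustration, convert_count maps a vector of raw counts to the presence/absence factor:
convert_count(c(0, 3, 1))
## [1] No  Yes Yes
## Levels: No Yes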
The Naive Bayes Classifier
The classifier used here is naiveBayes() from the e1071 package.
Naive Bayes assigns the probability that a new message is spam or ham using Bayes' rule, P(spam | words) ∝ P(words | spam) P(spam), applied to the observed word occurrences.
The "naive" part is the assumption that words occur independently of one another given the class.
sms_classifier <- naiveBayes(sms_train, factor(sms_raw_train$type))
class(sms_classifier)
## [1] "naiveBayes"
Conclusion
Naive Bayes is one of the most effective simple methods for spam filtering. Based on the results, the model correctly classified 99.84% of ham and 100% of spam. Strangely enough, though, when I classified based only on the most frequent terms, it got the classification wrong.
Rpubs => https://rpubs.com/gunduzhazal/833684