It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/
In this project, we will use the SMS Spam Collection dataset from the UCI Machine Learning Repository to build a spam classifier for SMS messages. Each message in the dataset carries a label indicating whether it is spam (unwanted) or ham (legitimate).
library(tm)
## Loading required package: NLP
library(SnowballC)
library(RColorBrewer)
library(wordcloud) # word cloud visualisation
library(e1071) # Naive Bayes classifier
library(gmodels) # provides the CrossTable() function for model evaluation
library(class) # k-NN utilities (loaded here but not used below)
# Loading data
raw_text <- read.csv("https://raw.githubusercontent.com/theoracley/Data607/master/Project4/sms_spam_ham.csv", stringsAsFactors = FALSE)
str(raw_text)
## 'data.frame': 5559 obs. of 2 variables:
## $ type: chr "ham" "ham" "ham" "spam" ...
## $ text: chr "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline "| __truncated__ ...
# Converting the character vector to a factor (categorical variable)
raw_text$type <- factor(raw_text$type)
str(raw_text$type) # verifying the conversion
## Factor w/ 2 levels "ham","spam": 1 1 1 2 2 1 1 1 2 1 ...
table(raw_text$type)
##
## ham spam
## 4812 747
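Note the class imbalance: roughly 87% of the messages are ham, so a trivial classifier that always predicts "ham" would already be about 87% accurate, and any model we build has to beat that baseline. A quick check of the proportions:
# class proportions; always predicting "ham" would match the larger share
round(prop.table(table(raw_text$type)), 3)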
# creating our corpus
text_corpus <- VCorpus(VectorSource(raw_text$text))
# get a summary of the first 5 texts
inspect(text_corpus[1:5])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 5
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 49
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 23
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 43
##
## [[4]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 149
##
## [[5]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 161
# Viewing the content of the first text
as.character(text_corpus[[1]])
## [1] "Hope you are having a good week. Just checking in"
# Viewing the content of more than one text using the lapply() function
lapply(text_corpus[1:5], as.character)
## $`1`
## [1] "Hope you are having a good week. Just checking in"
##
## $`2`
## [1] "K..give back my thanks."
##
## $`3`
## [1] "Am also doing in cbe only. But have to pay."
##
## $`4`
## [1] "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out! Box434SK38WP150PPM18+"
##
## $`5`
## [1] "okmail: Dear Dave this is your final notice to collect your 4* Tenerife Holiday or #5000 CASH award! Call 09061743806 from landline. TCs SAE Box326 CW25WX 150ppm"
# The usual text clean-up
cleanCorpus <- tm_map(text_corpus, content_transformer(tolower)) # lowercase all texts
cleanCorpus <- tm_map(cleanCorpus, removeNumbers) # remove all numbers
cleanCorpus <- tm_map(cleanCorpus, removeWords, stopwords('english')) # remove common stop words such as "to", "but", "and"
cleanCorpus <- tm_map(cleanCorpus, removePunctuation) # remove all punctuation
cleanCorpus <- tm_map(cleanCorpus, stripWhitespace) # collapse runs of whitespace into a single space
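Before building the document-term matrix, it is worth eyeballing a few messages before and after cleaning as a quick sanity check:
# raw vs. cleaned versions of the first three messages
lapply(text_corpus[1:3], as.character)
lapply(cleanCorpus[1:3], as.character)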
text_dtm <- DocumentTermMatrix(cleanCorpus)
inspect(text_dtm)
## <<DocumentTermMatrix (documents: 5559, terms: 7908)>>
## Non-/sparse entries: 42628/43917944
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs call can free get got just know like now will
## 1814 0 0 0 0 0 0 0 0 0 0
## 2046 0 0 0 0 0 0 1 1 0 0
## 295 0 0 1 0 0 0 0 1 0 0
## 2993 0 1 0 0 0 0 0 0 0 0
## 313 0 0 0 1 0 0 0 1 0 11
## 3201 0 0 1 0 0 0 0 1 0 0
## 3522 0 0 0 0 0 0 0 0 0 0
## 399 0 0 0 0 0 0 0 0 0 0
## 5068 0 0 0 0 0 0 0 0 0 0
## 5279 0 3 1 1 0 0 0 0 0 0
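As an optional peek before splitting the data, the overall term frequencies can be summed directly on the sparse matrix via the slam package (a dependency of tm), without densifying it; a small sketch:
# ten most frequent terms across the whole (sparse) document-term matrix
term_freq <- slam::col_sums(text_dtm)
head(sort(term_freq, decreasing = TRUE), 10)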
# Creating train and test portions
train <- text_dtm[1:4169, ] # 75% for training
test <- text_dtm[4170:5559, ] # 25% for testing
train_type <- raw_text[1:4169, ]$type
test_type <- raw_text[4170:5559, ]$type
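The 75/25 split above simply takes the first 4,169 rows, which is fine here: the train and test class proportions shown below come out nearly identical. If the messages had been ordered by type, a random split along these lines (a sketch; the seed and the *_alt names are only illustrative) would avoid ordering bias:
# random 75/25 split as an alternative to a positional split
set.seed(123)
train_idx <- sample(nrow(raw_text), size = floor(0.75 * nrow(raw_text)))
train_alt <- text_dtm[train_idx, ]
test_alt  <- text_dtm[-train_idx, ]
train_type_alt <- raw_text$type[train_idx]
test_type_alt  <- raw_text$type[-train_idx]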
#training portion
tbl_train <- prop.table(table(train_type))
tbl_train
## train_type
## ham spam
## 0.8647158 0.1352842
#testing portion
tbl_test <- prop.table(table(test_type))
tbl_test
## test_type
## ham spam
## 0.8683453 0.1316547
# Visualizing the messages by type (spam vs. ham)
spamText <- subset(raw_text, type == "spam") # selecting spam texts
wordcloud(spamText$text, max.words = 50, scale = c(5, 0.3), random.order = FALSE, rot.per = 0.15, colors = brewer.pal(8, "Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
hamText <- subset(raw_text, type =="ham") # selecting ham texts
wordcloud(hamText$text, max.words = 50, scale = c(5, 0.3), random.order = FALSE, rot.per = 0.15, colors = brewer.pal(8, "Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
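The warnings above are harmless: when wordcloud() is handed raw text it builds and cleans a corpus internally. One way to avoid the repeated work (and the warnings) is to compute term frequencies from the corpus we already cleaned and pass words and freq explicitly; a sketch for the spam subset:
# frequencies from the cleaned corpus (spam messages only), passed explicitly
spam_idx  <- which(raw_text$type == "spam")
spam_tdm  <- TermDocumentMatrix(cleanCorpus[spam_idx])
spam_freq <- sort(slam::row_sums(spam_tdm), decreasing = TRUE)
wordcloud(words = names(spam_freq), freq = spam_freq, max.words = 50,
          scale = c(5, 0.3), random.order = FALSE, rot.per = 0.15,
          colors = brewer.pal(8, "Dark2"))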
# keeping only words that appear in at least 5 training texts
freq_words <- findFreqTerms(train, 5)
str(freq_words)
## chr [1:1216] "£wk" "\200\230m" "\200\230s" "abiola" "able" "abt" ...
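The filter shrinks the vocabulary considerably; a quick comparison of the term counts before and after:
# vocabulary size before vs. after the frequency filter
c(all_terms = ncol(text_dtm), frequent_terms = length(freq_words))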
# Selecting only the frequent words from the train and test datasets
freq_words_train <- train[ , freq_words]
freq_words_test <- test[ , freq_words]
# creating a helper that converts word counts into a categorical "y"/"n" indicator
convert <- function(x) { ifelse(x > 0, "y", "n") }
train <- apply(freq_words_train, MARGIN = 2, convert)
test <- apply(freq_words_test, MARGIN = 2, convert)
str(train) # verifying the conversion
## chr [1:4169, 1:1216] "n" "n" "n" "n" "n" "n" "n" "n" "n" "n" "n" "n" ...
## - attr(*, "dimnames")=List of 2
## ..$ Docs : chr [1:4169] "1" "2" "3" "4" ...
## ..$ Terms: chr [1:1216] "£wk" "\200\230m" "\200\230s" "abiola" ...
# Creating a Naive Bayes classifier
sms_classifier <- naiveBayes(train, train_type)
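Before predicting on the test set, it can be instructive to peek inside the fitted model: the naiveBayes object stores the class counts behind the prior probabilities and one conditional table per word. (A sketch; "call" is used only as an example term and assumes it survived the frequency filter above.)
# class counts used for the prior probabilities
sms_classifier$apriori
# conditional "n"/"y" table for one example term, assuming "call" is in freq_words
sms_classifier$tables[["call"]]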
# Making prediction & evaluation with the classifier
test_prediction <- predict(sms_classifier, test)
CrossTable(test_prediction, test_type,
prop.chisq = FALSE, prop.t = FALSE,
dnn = c('predicted', 'actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1390
##
##
## | actual
## predicted | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1203 | 32 | 1235 |
## | 0.974 | 0.026 | 0.888 |
## | 0.997 | 0.175 | |
## -------------|-----------|-----------|-----------|
## spam | 4 | 151 | 155 |
## | 0.026 | 0.974 | 0.112 |
## | 0.003 | 0.825 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1207 | 183 | 1390 |
## | 0.868 | 0.132 | |
## -------------|-----------|-----------|-----------|
##
##
This classifier gives us about 97.4% accuracy, with only 36 (32 + 4) mislabeled messages out of the 1,390 in the test set.
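The overall accuracy can also be computed directly from the predictions rather than read off the table:
# proportion of test messages whose predicted label matches the actual label
mean(test_prediction == test_type)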
Let's tweak one of the parameters of the naiveBayes() classifier, the laplace parameter, and see whether it improves the model.
# Laplace smoothing prevents a word that never appears with one class in the training data from forcing that class's probability to zero
sms_classifier_improved <- naiveBayes(train, train_type, laplace = 1)
test_prediction_improved <- predict(sms_classifier_improved, test)
CrossTable(test_prediction_improved, test_type,
prop.chisq = FALSE, prop.t = FALSE,
dnn = c('predicted', 'actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1390
##
##
## | actual
## predicted | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1204 | 31 | 1235 |
## | 0.975 | 0.025 | 0.888 |
## | 0.998 | 0.169 | |
## -------------|-----------|-----------|-----------|
## spam | 3 | 152 | 155 |
## | 0.019 | 0.981 | 0.112 |
## | 0.002 | 0.831 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1207 | 183 | 1390 |
## | 0.868 | 0.132 | |
## -------------|-----------|-----------|-----------|
##
##
As we can see, only 34 (31 + 3) messages are now mislabeled, a small improvement that brings the accuracy to roughly 97.6%.
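As a closing sketch (not part of the original output), a brand-new message can be classified by running it through the same cleaning steps and restricting it to the freq_words vocabulary. The classify_sms() helper and the example message below are purely illustrative; the explicit "n"/"y" factor levels make sure a single message, where most words are absent, still lines up with the model's conditional tables:
classify_sms <- function(msg, classifier, vocab) {
  corp <- VCorpus(VectorSource(msg))
  corp <- tm_map(corp, content_transformer(tolower))
  corp <- tm_map(corp, removeNumbers)
  corp <- tm_map(corp, removeWords, stopwords('english'))
  corp <- tm_map(corp, removePunctuation)
  corp <- tm_map(corp, stripWhitespace)
  # build a DTM restricted to the training vocabulary so the columns line up
  dtm <- DocumentTermMatrix(corp, control = list(dictionary = vocab))
  # convert counts to "n"/"y" factors with both levels fixed
  newdata <- as.data.frame(lapply(as.data.frame(as.matrix(dtm)),
                                  function(x) factor(ifelse(x > 0, "y", "n"),
                                                     levels = c("n", "y"))))
  predict(classifier, newdata)
}
classify_sms("Congratulations! You have won a free prize, call now to claim",
             sms_classifier_improved, freq_words)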