It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/
In this project, we will use the SMS Spam Collection dataset from the UCI Machine Learning Repository to build a spam classifier for SMS messages. Each message in the dataset carries a label indicating whether it is spam (unwanted) or ham (legitimate).
library(tm)
## Loading required package: NLP
library(SnowballC)
library(RColorBrewer)
library(wordcloud) # word cloud visualisation
library(e1071) # Naive Bayes classifier
library(gmodels) # provides the CrossTable() function for model evaluation
library(class) # k-NN utilities (loaded here but not used below)
# Loading data
raw_text <- read.csv("https://raw.githubusercontent.com/theoracley/Data607/master/Project4/sms_spam_ham.csv", stringsAsFactors = FALSE)
str(raw_text)
## 'data.frame': 5559 obs. of 2 variables:
## $ type: chr "ham" "ham" "ham" "spam" ...
## $ text: chr "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline "| __truncated__ ...
# Converting the character vector to a factor (categorical variable)
raw_text$type <- factor(raw_text$type)
str(raw_text$type) # verifying the conversion
## Factor w/ 2 levels "ham","spam": 1 1 1 2 2 1 1 1 2 1 ...
table(raw_text$type)
##
## ham spam
## 4812 747
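Note the class imbalance: roughly 87% of the messages are ham, so a trivial classifier that always predicts "ham" would already be about 87% accurate, and any model we build has to beat that baseline. A quick check of the proportions:
# class proportions; always predicting "ham" would match the larger share
round(prop.table(table(raw_text$type)), 3)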
# creating our corpus
text_corpus <- VCorpus(VectorSource(raw_text$text))
# get a summary of the first 5 texts
inspect(text_corpus[1:5])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 5
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 49
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 23
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 43
##
## [[4]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 149
##
## [[5]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 161
# Viewing the content of the first text
as.character(text_corpus[[1]])
## [1] "Hope you are having a good week. Just checking in"
# Viewing the content of more than one text using the lapply() function
lapply(text_corpus[1:5], as.character)
## $`1`
## [1] "Hope you are having a good week. Just checking in"
##
## $`2`
## [1] "K..give back my thanks."
##
## $`3`
## [1] "Am also doing in cbe only. But have to pay."
##
## $`4`
## [1] "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out! Box434SK38WP150PPM18+"
##
## $`5`
## [1] "okmail: Dear Dave this is your final notice to collect your 4* Tenerife Holiday or #5000 CASH award! Call 09061743806 from landline. TCs SAE Box326 CW25WX 150ppm"
# The usual text clean-up
cleanCorpus <- tm_map(text_corpus, content_transformer(tolower)) # lowercase all texts
cleanCorpus <- tm_map(cleanCorpus, removeNumbers) # remove all numbers
cleanCorpus <- tm_map(cleanCorpus, removeWords, stopwords('english')) # remove common stop words such as "to", "but", "and"
cleanCorpus <- tm_map(cleanCorpus, removePunctuation) # remove all punctuation
cleanCorpus <- tm_map(cleanCorpus, stripWhitespace) # collapse runs of whitespace into a single space
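Before building the document-term matrix, it is worth eyeballing a few messages before and after cleaning as a quick sanity check:
# raw vs. cleaned versions of the first three messages
lapply(text_corpus[1:3], as.character)
lapply(cleanCorpus[1:3], as.character)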
text_dtm <- DocumentTermMatrix(cleanCorpus)
inspect(text_dtm)
## <<DocumentTermMatrix (documents: 5559, terms: 7908)>>
## Non-/sparse entries: 42628/43917944
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs call can free get got just know like now will
## 1814 0 0 0 0 0 0 0 0 0 0
## 2046 0 0 0 0 0 0 1 1 0 0
## 295 0 0 1 0 0 0 0 1 0 0
## 2993 0 1 0 0 0 0 0 0 0 0
## 313 0 0 0 1 0 0 0 1 0 11
## 3201 0 0 1 0 0 0 0 1 0 0
## 3522 0 0 0 0 0 0 0 0 0 0
## 399 0 0 0 0 0 0 0 0 0 0
## 5068 0 0 0 0 0 0 0 0 0 0
## 5279 0 3 1 1 0 0 0 0 0 0
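As an optional peek before splitting the data, the overall term frequencies can be summed directly on the sparse matrix via the slam package (a dependency of tm), without densifying it; a small sketch:
# ten most frequent terms across the whole (sparse) document-term matrix
term_freq <- slam::col_sums(text_dtm)
head(sort(term_freq, decreasing = TRUE), 10)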
# Creating train and test portions
train <- text_dtm[1:4169, ] # 75% for training
test <- text_dtm[4170:5559, ] # 25% for testing
train_type <- raw_text[1:4169, ]$type
test_type <- raw_text[4170:5559, ]$type
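The 75/25 split above simply takes the first 4,169 rows, which is fine here: the train and test class proportions shown below come out nearly identical. If the messages had been ordered by type, a random split along these lines (a sketch; the seed and the *_alt names are only illustrative) would avoid ordering bias:
# random 75/25 split as an alternative to a positional split
set.seed(123)
train_idx <- sample(nrow(raw_text), size = floor(0.75 * nrow(raw_text)))
train_alt <- text_dtm[train_idx, ]
test_alt  <- text_dtm[-train_idx, ]
train_type_alt <- raw_text$type[train_idx]
test_type_alt  <- raw_text$type[-train_idx]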
#training portion
tbl_train <- prop.table(table(train_type))
tbl_train
## train_type
## ham spam
## 0.8647158 0.1352842
#testing portion
tbl_test <- prop.table(table(test_type))
tbl_test
## test_type
## ham spam
## 0.8683453 0.1316547
# Visualizing the messages by type (spam vs. ham)
spamText <- subset(raw_text, type == "spam") # selecting spam texts
wordcloud(spamText$text, max.words = 50, scale = c(5, 0.3), random.order = FALSE, rot.per = 0.15, colors = brewer.pal(8, "Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
hamText <- subset(raw_text, type =="ham") # selecting ham texts
wordcloud(hamText$text, max.words = 50, scale = c(5, 0.3), random.order = FALSE, rot.per = 0.15, colors = brewer.pal(8, "Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
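The warnings above are harmless: when wordcloud() is handed raw text it builds and cleans a corpus internally. One way to avoid the repeated work (and the warnings) is to compute term frequencies from the corpus we already cleaned and pass words and freq explicitly; a sketch for the spam subset:
# frequencies from the cleaned corpus (spam messages only), passed explicitly
spam_idx  <- which(raw_text$type == "spam")
spam_tdm  <- TermDocumentMatrix(cleanCorpus[spam_idx])
spam_freq <- sort(slam::row_sums(spam_tdm), decreasing = TRUE)
wordcloud(words = names(spam_freq), freq = spam_freq, max.words = 50,
          scale = c(5, 0.3), random.order = FALSE, rot.per = 0.15,
          colors = brewer.pal(8, "Dark2"))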
# keeping only words that appear in at least 5 training texts
freq_words <- findFreqTerms(train, 5)
str(freq_words)
## chr [1:1216] "£wk" "\200\230m" "\200\230s" "abiola" "able" "abt" ...
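The filter shrinks the vocabulary considerably; a quick comparison of the term counts before and after:
# vocabulary size before vs. after the frequency filter
c(all_terms = ncol(text_dtm), frequent_terms = length(freq_words))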
# Selecting only the frequent words from the train and test datasets
freq_words_train <- train[ , freq_words]
freq_words_test <- test[ , freq_words]
# creating a helper that converts word counts into a categorical "y"/"n" indicator
convert <- function(x) { ifelse(x > 0, "y", "n") }
train <- apply(freq_words_train, MARGIN = 2, convert)
test <- apply(freq_words_test, MARGIN = 2, convert)
str(train) # verifying the conversion
## chr [1:4169, 1:1216] "n" "n" "n" "n" "n" "n" "n" "n" "n" "n" "n" "n" ...
## - attr(*, "dimnames")=List of 2
## ..$ Docs : chr [1:4169] "1" "2" "3" "4" ...
## ..$ Terms: chr [1:1216] "£wk" "\200\230m" "\200\230s" "abiola" ...
# Creating a Naive Bayes classifier
sms_classifier <- naiveBayes(train, train_type)
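Before predicting on the test set, it can be instructive to peek inside the fitted model: the naiveBayes object stores the class counts behind the prior probabilities and one conditional table per word. (A sketch; "call" is used only as an example term and assumes it survived the frequency filter above.)
# class counts used for the prior probabilities
sms_classifier$apriori
# conditional "n"/"y" table for one example term, assuming "call" is in freq_words
sms_classifier$tables[["call"]]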
# Making prediction & evaluation with the classifier
test_prediction <- predict(sms_classifier, test)
CrossTable(test_prediction, test_type,
prop.chisq = FALSE, prop.t = FALSE,
dnn = c('predicted', 'actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1390
##
##
## | actual
## predicted | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1203 | 32 | 1235 |
## | 0.974 | 0.026 | 0.888 |
## | 0.997 | 0.175 | |
## -------------|-----------|-----------|-----------|
## spam | 4 | 151 | 155 |
## | 0.026 | 0.974 | 0.112 |
## | 0.003 | 0.825 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1207 | 183 | 1390 |
## | 0.868 | 0.132 | |
## -------------|-----------|-----------|-----------|
##
##
This classifier gives us about 97.4% accuracy, with only 36 (32 + 4) mislabeled messages out of the 1,390 in the test set.
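The overall accuracy can also be computed directly from the predictions rather than read off the table:
# proportion of test messages whose predicted label matches the actual label
mean(test_prediction == test_type)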
Let's tweak one of the parameters of the naiveBayes() classifier, the laplace parameter, and see whether it improves the model.
# Laplace smoothing prevents a word that never appears with one class in the training data from forcing that class's probability to zero
sms_classifier_improved <- naiveBayes(train, train_type, laplace = 1)
test_prediction_improved <- predict(sms_classifier_improved, test)
CrossTable(test_prediction_improved, test_type,
prop.chisq = FALSE, prop.t = FALSE,
dnn = c('predicted', 'actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1390
##
##
## | actual
## predicted | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1204 | 31 | 1235 |
## | 0.975 | 0.025 | 0.888 |
## | 0.998 | 0.169 | |
## -------------|-----------|-----------|-----------|
## spam | 3 | 152 | 155 |
## | 0.019 | 0.981 | 0.112 |
## | 0.002 | 0.831 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1207 | 183 | 1390 |
## | 0.868 | 0.132 | |
## -------------|-----------|-----------|-----------|
##
##
As we can see, only 34 (31 + 3) messages are now mislabeled, a small improvement that brings the accuracy to roughly 97.6%.
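As a closing sketch (not part of the original output), a brand-new message can be classified by running it through the same cleaning steps and restricting it to the freq_words vocabulary. The classify_sms() helper and the example message below are purely illustrative; the explicit "n"/"y" factor levels make sure a single message, where most words are absent, still lines up with the model's conditional tables:
classify_sms <- function(msg, classifier, vocab) {
  corp <- VCorpus(VectorSource(msg))
  corp <- tm_map(corp, content_transformer(tolower))
  corp <- tm_map(corp, removeNumbers)
  corp <- tm_map(corp, removeWords, stopwords('english'))
  corp <- tm_map(corp, removePunctuation)
  corp <- tm_map(corp, stripWhitespace)
  # build a DTM restricted to the training vocabulary so the columns line up
  dtm <- DocumentTermMatrix(corp, control = list(dictionary = vocab))
  # convert counts to "n"/"y" factors with both levels fixed
  newdata <- as.data.frame(lapply(as.data.frame(as.matrix(dtm)),
                                  function(x) factor(ifelse(x > 0, "y", "n"),
                                                     levels = c("n", "y"))))
  predict(classifier, newdata)
}
classify_sms("Congratulations! You have won a free prize, call now to claim",
             sms_classifier_improved, freq_words)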