Example: Filtering spam SMS messages

Step 1: Exploring and preparing the data

Read the SMS data into the sms_raw data frame

# Load the messages from sms_spam.csv; text columns are kept as plain
# character vectors rather than being converted to factors on read.
sms_raw <- read.csv(file = "sms_spam.csv", stringsAsFactors = FALSE)

Examine the structure of the sms data

str(sms_raw)
## 'data.frame':    5559 obs. of  2 variables:
##  $ type: chr  "ham" "ham" "ham" "spam" ...
##  $ text: chr  "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out"| __truncated__ ...

Convert spam/ham to factor.

# The label column is categorical, so store it as a factor
# (one level per distinct label value).
sms_raw[["type"]] <- factor(sms_raw[["type"]])

Examine the type variable more carefully

str(sms_raw$type)
##  Factor w/ 2 levels "ham","spam": 1 1 1 2 2 1 1 1 2 1 ...
table(sms_raw$type)
## 
##  ham spam 
## 4812  747

Build a corpus using the text mining (tm) package

library(tm)
## Loading required package: NLP
# Wrap the message texts in a vector source (one document per element)
# and build the tm corpus from it.
sms_corpus <- Corpus(VectorSource(x = sms_raw[["text"]]))

examine the sms corpus

print(sms_corpus)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5559
inspect(sms_corpus[1:3])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 3
## 
## [1] Hope you are having a good week. Just checking in
## [2] K..give back my thanks.                          
## [3] Am also doing in cbe only. But have to pay.

clean up the corpus using tm_map()

# Normalise the message text step by step; each tm_map() call takes the
# corpus produced by the previous step and returns the transformed corpus.
#
# tolower() is a base R function rather than a tm transformation, so it is
# wrapped in content_transformer(): the wrapper applies it to the document
# content and hands back a proper document, which keeps this step valid
# whichever corpus class Corpus() produced.
corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
# Drop digits.
corpus_clean <- tm_map(corpus_clean, removeNumbers)
# Drop the words in tm's default stop-word list.
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
# Drop punctuation. The characters are deleted, not replaced by a space, so
# "K..give" becomes "kgive" (see the inspected output of the clean corpus).
corpus_clean <- tm_map(corpus_clean, removePunctuation)
# Collapse the runs of blanks left behind by the removals above.
corpus_clean <- tm_map(corpus_clean, stripWhitespace)

examine the clean corpus

inspect(sms_corpus[1:3])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 3
## 
## [1] Hope you are having a good week. Just checking in
## [2] K..give back my thanks.                          
## [3] Am also doing in cbe only. But have to pay.
inspect(corpus_clean[1:3])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 3
## 
## [1] hope good week just checking  kgive back thanks            
## [3]  also cbe pay

create a document-term sparse matrix

# Build the document-term matrix from the cleaned corpus (one row per
# message, one column per term); the outer parentheses make the assignment
# visible so the matrix summary is displayed as well.
(sms_dtm <- DocumentTermMatrix(x = corpus_clean))
## <<DocumentTermMatrix (documents: 5559, terms: 7923)>>
## Non-/sparse entries: 42675/44001282
## Sparsity           : 100%
## Maximal term length: 40
## Weighting          : term frequency (tf)

creating training and test datasets

# Positional hold-out split applied identically to the raw data frame, the
# document-term matrix and the cleaned corpus, so row i refers to the same
# message in all three objects.

# Training portion: messages 1 to 4169.
sms_raw_train    <- sms_raw[seq_len(4169), ]
sms_dtm_train    <- sms_dtm[seq_len(4169), ]
sms_corpus_train <- corpus_clean[seq_len(4169)]

# Test portion: messages 4170 to 5559 (1390 messages).
sms_raw_test    <- sms_raw[4170:5559, ]
sms_dtm_test    <- sms_dtm[4170:5559, ]
sms_corpus_test <- corpus_clean[4170:5559]

check that the proportion of spam is similar

prop.table(table(sms_raw_train$type))
## 
##       ham      spam 
## 0.8647158 0.1352842
prop.table(table(sms_raw_test$type))
## 
##       ham      spam 
## 0.8683453 0.1316547

word cloud visualization

library(wordcloud)
## Loading required package: RColorBrewer
wordcloud(sms_corpus_train, min.freq = 30, random.order = FALSE)

subset the training data into spam and ham groups

# Split the training rows by label; which() keeps only rows where the
# comparison is TRUE, so rows with a missing label are left out.
spam <- sms_raw_train[which(sms_raw_train$type == "spam"), ]
ham  <- sms_raw_train[which(sms_raw_train$type == "ham"), ]

# One cloud per class, drawn from the raw message text of the training
# subset; each cloud is capped at 40 words and uses the same font scale.
wordcloud(words = spam$text, scale = c(3, 0.5), max.words = 40)

wordcloud(words = ham$text, scale = c(3, 0.5), max.words = 40)

indicator features for frequent words

# Vocabulary: terms of the training document-term matrix selected with a
# lower frequency bound of 5. The plain character vector is used directly
# as the dictionary (an earlier variant wrapped it in Dictionary()).
sms_dict <- findFreqTerms(sms_dtm_train, lowfreq = 5)

# Rebuild both matrices restricted to that vocabulary so the training and
# test data share the same set of term columns.
sms_train <- DocumentTermMatrix(
  sms_corpus_train,
  control = list(dictionary = sms_dict)
)
sms_test <- DocumentTermMatrix(
  sms_corpus_test,
  control = list(dictionary = sms_dict)
)

convert counts to a factor

# Convert term counts into a "No"/"Yes" presence indicator.
#
# Args:
#   x: numeric vector of term counts (for example one column of a
#      document-term matrix).
#
# Returns:
#   A factor of the same length as `x` with levels "No" (count not above 0)
#   and "Yes" (count above 0). Missing counts stay NA.
convert_counts <- function(x) {
  # TRUE/FALSE -> 1/0, so the values line up with the `levels` below.
  present <- as.integer(x > 0)
  # Finish on the value itself rather than on an assignment, so the factor
  # is returned visibly instead of invisibly.
  factor(present, levels = c(0, 1), labels = c("No", "Yes"))
}

Apply convert_counts() to each column of the train/test data with apply()

# Recode every term column (MARGIN = 2) of both matrices from counts to the
# No/Yes indicator produced by convert_counts().
# NOTE(review): apply() assembles the per-column results into a matrix, so
# the result is presumably a character matrix of "No"/"Yes" rather than
# factor columns - confirm this is what the classifier is expected to get.
sms_train <- apply(X = sms_train, MARGIN = 2, FUN = convert_counts)
sms_test  <- apply(X = sms_test, MARGIN = 2, FUN = convert_counts)

Step 3: Training a model on the data

library(e1071)
# Fit naive Bayes: predictors are the No/Yes term indicators of the training
# messages, the response is their ham/spam label.
sms_classifier <- naiveBayes(x = sms_train, y = sms_raw_train$type)
# Uncomment to print the fitted model:
# sms_classifier

Step 4: Evaluating model performance

# Predict a label for every held-out message from its term indicators.
sms_test_pred <- predict(object = sms_classifier, newdata = sms_test)

library(gmodels)
# Confusion table of predicted vs. actual labels on the test messages; only
# counts and column proportions are shown (row, table and chi-square
# proportions are switched off).
# NOTE(review): the recorded table has 159 of the 183 spam messages predicted
# as ham - worth checking that the features and labels line up as intended.
CrossTable(
  x = sms_test_pred,
  y = sms_raw_test$type,
  prop.r = FALSE,
  prop.t = FALSE,
  prop.chisq = FALSE,
  dnn = c("predicted", "actual")
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1390 
## 
##  
##              | actual 
##    predicted |       ham |      spam | Row Total | 
## -------------|-----------|-----------|-----------|
##          ham |      1128 |       159 |      1287 | 
##              |     0.935 |     0.869 |           | 
## -------------|-----------|-----------|-----------|
##         spam |        79 |        24 |       103 | 
##              |     0.065 |     0.131 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |      1207 |       183 |      1390 | 
##              |     0.868 |     0.132 |           | 
## -------------|-----------|-----------|-----------|
## 
## 

Step 5: Improving model performance

# Refit on the same training data with laplace = 1 (Laplace smoothing), then
# score the same test messages and tabulate them exactly like the first model.
sms_classifier2 <- naiveBayes(
  x = sms_train,
  y = sms_raw_train$type,
  laplace = 1
)
sms_test_pred2 <- predict(object = sms_classifier2, newdata = sms_test)
CrossTable(
  x = sms_test_pred2,
  y = sms_raw_test$type,
  prop.r = FALSE,
  prop.t = FALSE,
  prop.chisq = FALSE,
  dnn = c("predicted", "actual")
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1390 
## 
##  
##              | actual 
##    predicted |       ham |      spam | Row Total | 
## -------------|-----------|-----------|-----------|
##          ham |      1111 |       153 |      1264 | 
##              |     0.920 |     0.836 |           | 
## -------------|-----------|-----------|-----------|
##         spam |        96 |        30 |       126 | 
##              |     0.080 |     0.164 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |      1207 |       183 |      1390 | 
##              |     0.868 |     0.132 |           | 
## -------------|-----------|-----------|-----------|
## 
##