Step 2: Exploring and preparing the data
Read the SMS data into the sms_raw data frame
sms_raw <- read.csv("sms_spam.csv", stringsAsFactors = FALSE)
Examine the structure of the sms data
str(sms_raw)
## 'data.frame': 5559 obs. of 2 variables:
## $ type: chr "ham" "ham" "ham" "spam" ...
## $ text: chr "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out"| __truncated__ ...
Convert spam/ham to factor.
sms_raw$type <- factor(sms_raw$type)
Examine the type variable more carefully
str(sms_raw$type)
## Factor w/ 2 levels "ham","spam": 1 1 1 2 2 1 1 1 2 1 ...
table(sms_raw$type)
##
## ham spam
## 4812 747
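the same counts as proportions, for a quick sanity check
prop.table(table(sms_raw$type))
# roughly 0.866 ham and 0.134 spam (4812/5559 and 747/5559)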
Build a corpus using the text mining (tm) package
library(tm)
## Loading required package: NLP
sms_corpus <- Corpus(VectorSource(sms_raw$text))
examine the sms corpus
print(sms_corpus)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5559
inspect(sms_corpus[1:3])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 3
##
## [1] Hope you are having a good week. Just checking in
## [2] K..give back my thanks.
## [3] Am also doing in cbe only. But have to pay.
clean up the corpus using tm_map(); tolower() is a base R function, so it is wrapped in content_transformer() to keep the corpus structure intact
corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, stripWhitespace)
compare the original corpus with the cleaned corpus
inspect(sms_corpus[1:3])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 3
##
## [1] Hope you are having a good week. Just checking in
## [2] K..give back my thanks.
## [3] Am also doing in cbe only. But have to pay.
inspect(corpus_clean[1:3])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 3
##
## [1] hope good week just checking
## [2] kgive back thanks
## [3] also cbe pay
create a document-term sparse matrix
sms_dtm <- DocumentTermMatrix(corpus_clean)
sms_dtm
## <<DocumentTermMatrix (documents: 5559, terms: 7923)>>
## Non-/sparse entries: 42675/44001282
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
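the DTM is stored sparsely; to see what it actually holds, inspect a small slice (the row/column ranges below are arbitrary)
inspect(sms_dtm[1:5, 1:8])   # term counts for the first 8 terms across the first 5 messages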
create training and test datasets (a 75/25 split, preserving the original order)
sms_raw_train <- sms_raw[1:4169, ]
sms_raw_test <- sms_raw[4170:5559, ]
sms_dtm_train <- sms_dtm[1:4169, ]
sms_dtm_test <- sms_dtm[4170:5559, ]
sms_corpus_train <- corpus_clean[1:4169]
sms_corpus_test <- corpus_clean[4170:5559]
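this sequential split assumes the messages are already in random order; if not, a random split is safer. a minimal sketch, with an arbitrary seed:
set.seed(123)   # arbitrary seed, for reproducibility only
train_idx <- sample(nrow(sms_raw), floor(0.75 * nrow(sms_raw)))
# sms_raw_train <- sms_raw[train_idx, ]   # and likewise for sms_dtm and the corpus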
check that the proportion of spam is similar
prop.table(table(sms_raw_train$type))
##
## ham spam
## 0.8647158 0.1352842
prop.table(table(sms_raw_test$type))
##
## ham spam
## 0.8683453 0.1316547
word cloud visualization
library(wordcloud)
## Loading required package: RColorBrewer
wordcloud(sms_corpus_train, min.freq = 30, random.order = FALSE)
subset the training data into spam and ham groups
spam <- subset(sms_raw_train, type == "spam")
ham <- subset(sms_raw_train, type == "ham")
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))
wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))
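wordcloud() places words at random, so layouts differ between runs; calling set.seed() first (any fixed value works) makes a plot reproducible
set.seed(42)   # arbitrary value; fixes the random word placement
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))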
indicator features for frequent words
sms_dict <- findFreqTerms(sms_dtm_train, 5)
# note: older versions of tm required Dictionary(findFreqTerms(sms_dtm_train, 5)); Dictionary() has since been removed
sms_train <- DocumentTermMatrix(sms_corpus_train, list(dictionary = sms_dict))
sms_test <- DocumentTermMatrix(sms_corpus_test, list(dictionary = sms_dict))
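since both matrices were built from the same dictionary, they should have identical columns; a quick check with tm's Terms() accessor
identical(Terms(sms_train), Terms(sms_test))   # should be TRUE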
convert counts to a factor
convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  factor(x, levels = c(0, 1), labels = c("No", "Yes"))
}
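a quick check of the helper on a toy vector: any count above zero becomes "Yes"
convert_counts(c(0, 1, 3))
# No  Yes Yes (a factor with levels No, Yes)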
use apply() to run convert_counts() on each column (MARGIN = 2) of the train/test data
sms_train <- apply(sms_train, MARGIN = 2, convert_counts)
sms_test <- apply(sms_test, MARGIN = 2, convert_counts)
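after the conversion each cell is "No"/"Yes" rather than a count (apply() returns a character matrix, which naiveBayes() treats as categorical); the slice below is arbitrary
sms_train[1:3, 1:5]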
Step 3: Training a model on the data
library(e1071)
sms_classifier <- naiveBayes(sms_train, sms_raw_train$type)
# sms_classifier   # uncomment to print the full model (verbose)
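the fitted object stores one conditional probability table per feature, giving P(feature | ham) and P(feature | spam); for example, the first table (which feature comes first depends on the dictionary order)
sms_classifier$tables[[1]]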
Step 4: Evaluating model performance
sms_test_pred <- predict(sms_classifier, sms_test)
library(gmodels)
CrossTable(sms_test_pred, sms_raw_test$type,
prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
dnn = c('predicted', 'actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1390
##
##
## | actual
## predicted | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1128 | 159 | 1287 |
## | 0.935 | 0.869 | |
## -------------|-----------|-----------|-----------|
## spam | 79 | 24 | 103 |
## | 0.065 | 0.131 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1207 | 183 | 1390 |
## | 0.868 | 0.132 | |
## -------------|-----------|-----------|-----------|
##
##
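the table shows 79 + 159 = 238 of the 1390 test messages misclassified; expressed as overall accuracy:
mean(sms_test_pred == sms_raw_test$type)   # (1128 + 24) / 1390, about 0.829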
Step 5: Improving model performance
sms_classifier2 <- naiveBayes(sms_train, sms_raw_train$type, laplace = 1)
sms_test_pred2 <- predict(sms_classifier2, sms_test)
CrossTable(sms_test_pred2, sms_raw_test$type,
prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
dnn = c('predicted', 'actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1390
##
##
## | actual
## predicted | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1111 | 153 | 1264 |
## | 0.920 | 0.836 | |
## -------------|-----------|-----------|-----------|
## spam | 96 | 30 | 126 |
## | 0.080 | 0.164 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1207 | 183 | 1390 |
## | 0.868 | 0.132 | |
## -------------|-----------|-----------|-----------|
##
##
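the same accuracy figure for the Laplace-smoothed model, for comparison
mean(sms_test_pred2 == sms_raw_test$type)   # (1111 + 30) / 1390, about 0.821
here smoothing catches more spam (30 vs. 24) at the cost of flagging more legitimate messages (96 vs. 79)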