Chapter 4: Classification using Naive Bayes ----

Example: Filtering spam SMS messages ----

Step 2: Exploring and preparing the data ----

read the sms data into the sms data frame

sms_raw <- read.csv("sms_spam.csv", stringsAsFactors = FALSE)

examine the structure of the sms data

str(sms_raw)

convert spam/ham to factor

sms_raw$type <- factor(sms_raw$type)

examine the type variable more carefully

str(sms_raw$type)
table(sms_raw$type)

build a corpus using the text mining (tm) package

library(tm)
sms_corpus <- VCorpus(VectorSource(sms_raw$text))

examine the sms corpus

print(sms_corpus)
inspect(sms_corpus[1:2])

as.character(sms_corpus[[1]])
lapply(sms_corpus[1:2], as.character)
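note: double brackets select a single document, while single brackets return a sub-corpus; a quick way to confirm the distinction

class(sms_corpus[[1]]) # a single PlainTextDocument
class(sms_corpus[1])   # a one-element VCorpus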

clean up the corpus using tm_map()

sms_corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
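note: tolower() is a base R function rather than one of tm's built-in transformations, which is why it must be wrapped in content_transformer(); the built-in transformations can be listed to confirm

getTransformations() # tm's built-in transformations; tolower() is not among them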

show the difference between sms_corpus and sms_corpus_clean

as.character(sms_corpus[[1]])
as.character(sms_corpus_clean[[1]])

sms_corpus_clean <- tm_map(sms_corpus_clean, removeNumbers) # remove numbers
sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, stopwords()) # remove stop words
sms_corpus_clean <- tm_map(sms_corpus_clean, removePunctuation) # remove punctuation

tip: create a custom function to replace (rather than remove) punctuation

removePunctuation("hello...world")
replacePunctuation <- function(x) { gsub("[[:punct:]]+", " ", x) }
replacePunctuation("hello...world")
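note the difference: removePunctuation() strips the characters outright, so "hello...world" becomes "helloworld", while the replacement version yields "hello world", keeping the two words from being fused into a single token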

illustration of word stemming

library(SnowballC)
wordStem(c("learn", "learned", "learning", "learns"))

sms_corpus_clean <- tm_map(sms_corpus_clean, stemDocument)

sms_corpus_clean <- tm_map(sms_corpus_clean, stripWhitespace) # eliminate unneeded whitespace

examine the final clean corpus

lapply(sms_corpus[1:3], as.character)
lapply(sms_corpus_clean[1:3], as.character)

create a document-term sparse matrix

sms_dtm <- DocumentTermMatrix(sms_corpus_clean)
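sanity-check the matrix by looking at its dimensions and a small corner (a quick sketch; the exact term columns depend on the corpus)

dim(sms_dtm) # rows = messages, columns = distinct terms
inspect(sms_dtm[1:5, 1:10]) # a 5 x 10 corner of the sparse matrix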

alternative solution: create a document-term sparse matrix directly from the SMS corpus

sms_dtm2 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE
))

alternative solution: using a custom stop words function ensures an identical result

sms_dtm3 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = function(x) { removeWords(x, stopwords()) },
  removePunctuation = TRUE,
  stemming = TRUE
))

compare the result

sms_dtm
sms_dtm2
sms_dtm3
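the small discrepancy between sms_dtm and sms_dtm2 arises because DocumentTermMatrix() applies its cleanup steps in a slightly different order than the tm_map() sequence above; supplying the same removeWords(x, stopwords()) call, as in sms_dtm3, reproduces the original result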

creating training and test datasets

sms_dtm_train <- sms_dtm[1:4169, ]
sms_dtm_test <- sms_dtm[4170:5559, ]
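the messages arrive in random order, so a sequential split is safe here; if the rows were sorted, a random split would be needed instead (a sketch with a hypothetical seed; the matching labels would need the same index)

set.seed(123) # hypothetical seed for reproducibility
train_index <- sample(nrow(sms_dtm), 0.75 * nrow(sms_dtm))
sms_dtm_train2 <- sms_dtm[train_index, ]
sms_dtm_test2 <- sms_dtm[-train_index, ]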

also save the labels

sms_train_labels <- sms_raw[1:4169, ]$type
sms_test_labels <- sms_raw[4170:5559, ]$type

check that the proportion of spam is similar

prop.table(table(sms_train_labels))
prop.table(table(sms_test_labels))

word cloud visualization

library(wordcloud)
wordcloud(sms_corpus_clean, min.freq = 50, random.order = FALSE)

subset the training data into spam and ham groups

spam <- subset(sms_raw, type == "spam")
ham <- subset(sms_raw, type == "ham")

wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))
wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))

sms_dtm_freq_train <- removeSparseTerms(sms_dtm_train, 0.999)
sms_dtm_freq_train
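with a sparsity threshold of 0.999, removeSparseTerms() keeps only terms appearing in at least 0.1 percent of the training messages (roughly five of the 4,169), discarding the long tail of rare terms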

indicator features for frequent words

findFreqTerms(sms_dtm_train, 5)

save frequently-appearing terms to a character vector

sms_freq_words <- findFreqTerms(sms_dtm_train, 5)
str(sms_freq_words)

create DTMs with only the frequent terms

sms_dtm_freq_train <- sms_dtm_train[ , sms_freq_words]
sms_dtm_freq_test <- sms_dtm_test[ , sms_freq_words]

convert counts to Yes/No values

convert_counts <- function(x) { x <- ifelse(x > 0, "Yes", "No") }
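a quick check of the helper on a toy count vector

convert_counts(c(0, 1, 3)) # returns "No" "Yes" "Yes"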

apply convert_counts() to the columns of the train/test data

sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)

Step 3: Training a model on the data ----

library(e1071)
sms_classifier <- naiveBayes(sms_train, sms_train_labels)
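the fitted model stores one conditional probability table per feature; peeking at the first few shows how each word's Yes/No indicator is distributed across ham and spam (the exact terms depend on the vocabulary)

sms_classifier$tables[1:2] # conditional probabilities for the first two terms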

Step 4: Evaluating model performance ----

sms_test_pred <- predict(sms_classifier, sms_test)

head(sms_test_pred)
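predict() returns class labels by default; e1071 also supports type = "raw" to obtain the estimated posterior probabilities

sms_test_prob <- predict(sms_classifier, sms_test, type = "raw")
head(sms_test_prob) # one row per message: P(ham) and P(spam)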

library(gmodels)
CrossTable(sms_test_pred, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))
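overall accuracy can be read off the diagonal of the cross table, or computed directly

mean(sms_test_pred == sms_test_labels) # proportion of correctly classified messages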

Step 5: Improving model performance ----

sms_classifier2 <- naiveBayes(sms_train, sms_train_labels, laplace = 5)
sms_test_pred2 <- predict(sms_classifier2, sms_test)
CrossTable(sms_test_pred2, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))