Step 1-Collecting Data
To develop the Naive Bayes classifier, we will use data adapted from the SMS Spam Collection at http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/.
This dataset includes the text of SMS messages along with a label indicating whether the message is unwanted. Junk messages are labeled spam, while legitimate messages are labeled ham.
Step 2-Exploring and preparing the data
# read the sms data into the sms data frame
sms_raw <- read.csv("sms_spam.csv", stringsAsFactors = FALSE)
# examine the structure of the sms data
str(sms_raw)
'data.frame': 5559 obs. of 2 variables:
$ type: Factor w/ 2 levels "ham","spam": 1 1 1 2 2 1 1 1 2 1 ...
$ text: chr "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out!"| __truncated__ ...
# convert spam/ham to factor. Factor converts charactor vector to numerical like 0,1.
sms_raw$type <- factor(sms_raw$type)
# build a corpus using the text mining (tm) package
library(tm)
sms_corpus <- VCorpus(VectorSource(sms_raw$text))
# vectorSource takes apart email,it's a reader function.Vcourpus put into a nice structure.Vcorpus stores in memory as oppose to Pcorpus, stored in Disk
# Corpus is a collection of text documents
# examine the sms corpus
print(sms_corpus)
<<VCorpus>>
Metadata: corpus specific: 0, document level (indexed): 0
Content: documents: 5559
inspect(sms_corpus[1:2]) # summary of the 1st and 2nd SMS message in the corpus
<<VCorpus>>
Metadata: corpus specific: 0, document level (indexed): 0
Content: documents: 2
[[1]]
<<PlainTextDocument>>
Metadata: 7
Content: chars: 49
[[2]]
<<PlainTextDocument>>
Metadata: 7
Content: chars: 23
as.character(sms_corpus[[1]])
[1] "Hope you are having a good week. Just checking in"
lapply(sms_corpus[1:2], as.character) # lapply apply a procedure to each element in the data sturture
$`1`
[1] "Hope you are having a good week. Just checking in"
$`2`
[1] "K..give back my thanks."
# clean up the corpus using tm_map()
sms_corpus_clean <- tm_map(sms_corpus, content_transformer(tolower)) #tolower : transform all letters to lower case
# show the difference between sms_corpus and corpus_clean
as.character(sms_corpus[[1]]) # as.charactor to view the actual text
[1] "Hope you are having a good week. Just checking in"
as.character(sms_corpus_clean[[1]])
[1] "hope you are having a good week. just checking in"
sms_corpus_clean <- tm_map(sms_corpus_clean, removeNumbers)
# remove numbers
sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, stopwords()) # remove stop words like and, or the
sms_corpus_clean <- tm_map(sms_corpus_clean, removePunctuation) # remove punctuation
# tip: create a custom function to replace (rather than remove) punctuation
removePunctuation("hello...world")
[1] "helloworld"
replacePunctuation <- function(x) { gsub("[[:punct:]]+", " ", x) } # replace the punctuations in x with a space rather than remove
replacePunctuation("hello...world")
[1] "hello world"
# illustration of word stemming
library(SnowballC) # snowwball put words into stems: eg: left to leave
wordStem(c("learn", "learned", "learning", "learns"))
[1] "learn" "learn" "learn" "learn"
sms_corpus_clean <- tm_map(sms_corpus_clean, stemDocument)
sms_corpus_clean <- tm_map(sms_corpus_clean, stripWhitespace) # eliminate unneeded whitespace
# create a document-term sparse matrix: DTM:rows indicates text messgaes and columns indicate words
sms_dtm <- DocumentTermMatrix(sms_corpus_clean)
# alternative solution: create a document-term sparse matrix directly from the SMS corpus # NICE:do all the data cleaning in one step.
sms_dtm2 <- DocumentTermMatrix(sms_corpus, control = list(
tolower = TRUE,
removeNumbers = TRUE,
stopwords = TRUE,
removePunctuation = TRUE,
stemming = TRUE
))
# alternative solution: using custom stop words function ensures identical result
sms_dtm3 <- DocumentTermMatrix(sms_corpus, control = list(
tolower = TRUE,
removeNumbers = TRUE,
stopwords = function(x) { removeWords(x, stopwords()) },
removePunctuation = TRUE,
stemming = TRUE
))
# compare the result
sms_dtm
<<DocumentTermMatrix (documents: 5559, terms: 6518)>>
Non-/sparse entries: 42113/36191449
Sparsity : 100%
Maximal term length: 40
Weighting : term frequency (tf)
sms_dtm2
<<DocumentTermMatrix (documents: 5559, terms: 6909)>>
Non-/sparse entries: 43192/38363939
Sparsity : 100%
Maximal term length: 40
Weighting : term frequency (tf)
sms_dtm3
<<DocumentTermMatrix (documents: 5559, terms: 6518)>>
Non-/sparse entries: 42113/36191449
Sparsity : 100%
Maximal term length: 40
Weighting : term frequency (tf)
# creating training and test datasets
# Here I used dtm3 for my clean data as oppose to dtm was used in the lecture
sms_dtm_train <- sms_dtm3[1:4169, ]
sms_dtm_test <- sms_dtm3[4170:5559, ]
# also save the labels
sms_train_labels <- sms_raw[1:4169, ]$type # label data by their types: ham or spam
sms_test_labels <- sms_raw[4170:5559, ]$type
# check that the proportion of spam is similar in training vs testing
prop.table(table(sms_train_labels))
sms_train_labels
ham spam
0.8647158 0.1352842
prop.table(table(sms_test_labels))
sms_test_labels
ham spam
0.8683453 0.1316547
# word cloud visualization
library(wordcloud)
wordcloud(sms_corpus_clean, min.freq = 50, random.order = FALSE)
# random =false means the higher frequence word will appear closer to the center
# min.fre=50 means the word has to accur in the corpus=50/5559=1%.
# subset the training data into spam and ham groups
spam <- subset(sms_raw, type == "spam")
ham <- subset(sms_raw, type == "ham")
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))
wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))
sms_dtm_freq_train <- removeSparseTerms(sms_dtm_train, 0.999)
sms_dtm_freq_train
<<DocumentTermMatrix (documents: 4169, terms: 1101)>>
Non-/sparse entries: 24834/4565235
Sparsity : 99%
Maximal term length: 19
Weighting : term frequency (tf)
# indicator features for frequent words
sms_freq_words <- findFreqTerms(sms_dtm_train, 5)
str(sms_freq_words)
chr [1:1136] "abiola" "abl" "abt" ...
# 5 represents at least 5 accurances
#findFreqTerms: takes a DTM and returns a character vector containing
# the words that appear for at least the specified number of times
# create DTMs with only the frequent terms
sms_dtm_freq_train <- sms_dtm_train[ , sms_freq_words] # remove colums that does not appear at least 5 times
sms_dtm_freq_test <- sms_dtm_test[ , sms_freq_words]
# convert counts to a factor
convert_counts <- function(x) {
x <- ifelse(x > 0, "Yes", "No")
}
#The ifelse(x > 0, "Yes", "No") statement transforms the values in x, so that if the value is greater than 0, then it will be replaced by "Yes",otherwise it will be replaced by a "No" string. Lastly, the newly transformed x vector is returned.
# apply() convert_counts() to columns of train/test data
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)
# margin= 2 is to apply to column, margin=1 is to row.
Step 3- Training a model on the data
library(e1071)
sms_classifier <- naiveBayes(sms_train, sms_train_labels)
# what's in the classifier: conditional probability for each
Step 4-Evaluating model performance
sms_test_pred <- predict(sms_classifier, sms_test)
sms_test_pred
[1] ham ham ham ham spam ham ham ham ham
[10] spam ham ham ham spam spam ham ham ham
[19] ham ham ham ham ham ham ham ham ham
[28] ham ham ham ham ham ham ham spam ham
[37] ham ham ham spam ham ham ham ham ham
[46] ham ham ham ham spam ham ham ham ham
[55] ham ham ham ham ham ham ham ham ham
[64] ham ham ham ham ham ham ham ham ham
[73] ham ham ham ham ham ham ham ham spam
[82] ham ham ham ham ham ham ham ham ham
[91] ham ham ham ham ham ham ham spam ham
[100] ham ham ham ham ham ham ham ham ham
[109] ham spam spam ham ham ham ham ham ham
[118] ham ham ham ham ham ham ham ham spam
[127] ham ham ham ham ham ham spam ham ham
[136] ham ham ham ham ham ham ham ham ham
[145] ham ham spam ham ham ham ham ham spam
[154] ham spam ham ham ham spam ham ham ham
[163] ham ham spam ham ham ham spam ham ham
[172] ham ham ham ham ham ham ham spam ham
[181] ham ham ham ham ham ham spam ham ham
[190] ham spam ham ham ham ham ham ham ham
[199] ham spam spam ham ham ham ham ham ham
[208] ham ham spam ham ham ham ham ham ham
[217] spam ham ham ham ham ham ham ham ham
[226] ham ham ham ham ham ham ham ham spam
[235] ham ham ham ham spam ham ham ham ham
[244] ham ham ham ham ham ham spam ham ham
[253] ham ham ham ham ham ham ham ham ham
[262] ham ham ham ham ham ham ham ham spam
[271] ham ham ham ham ham ham ham ham ham
[280] ham ham ham ham ham ham ham ham ham
[289] ham ham ham spam ham ham ham ham ham
[298] spam ham ham ham ham ham ham ham ham
[307] ham spam ham ham ham spam ham ham ham
[316] ham ham ham ham ham ham ham ham ham
[325] ham ham ham spam ham ham ham ham ham
[334] ham ham ham spam ham ham ham ham ham
[343] ham ham ham ham spam spam ham ham ham
[352] ham ham ham ham ham ham ham ham ham
[361] ham ham ham ham ham ham ham spam ham
[370] ham ham ham ham ham ham ham ham ham
[379] spam ham ham ham ham spam ham spam ham
[388] ham ham ham ham ham ham ham ham ham
[397] ham ham ham ham ham ham ham ham ham
[406] spam ham ham spam ham ham ham spam spam
[415] ham ham ham ham ham ham spam ham ham
[424] ham ham ham ham ham ham ham ham ham
[433] ham ham spam ham spam ham ham ham ham
[442] ham ham ham ham ham ham spam ham ham
[451] ham spam ham ham ham ham ham ham ham
[460] ham ham ham ham spam ham ham ham ham
[469] ham ham ham ham ham ham ham ham ham
[478] ham spam ham ham ham spam ham ham ham
[487] ham ham ham spam ham ham ham ham ham
[496] ham spam ham ham spam spam spam ham ham
[505] ham ham ham ham spam ham ham ham ham
[514] spam ham ham ham ham ham ham ham ham
[523] ham spam ham ham ham ham ham ham ham
[532] ham ham ham ham spam ham spam ham ham
[541] ham ham ham ham spam ham ham ham ham
[550] ham ham ham ham ham ham ham ham ham
[559] ham ham ham ham ham spam ham ham ham
[568] ham ham ham ham ham ham ham ham ham
[577] spam ham ham ham ham ham ham ham ham
[586] ham ham ham ham spam ham ham ham ham
[595] ham ham ham ham ham ham ham ham ham
[604] ham ham ham ham spam ham spam ham ham
[613] ham ham ham ham ham ham ham ham ham
[622] ham ham ham spam ham ham ham spam ham
[631] ham ham ham ham ham ham ham ham ham
[640] spam ham ham ham ham ham ham ham ham
[649] ham spam ham ham ham ham ham ham ham
[658] ham ham ham ham ham ham ham ham ham
[667] ham ham ham ham ham ham ham ham ham
[676] ham spam ham ham ham ham ham ham ham
[685] ham spam spam ham ham spam ham ham spam
[694] ham ham ham ham ham ham ham ham ham
[703] ham ham ham ham ham spam ham ham ham
[712] ham ham ham ham ham spam ham ham ham
[721] ham ham ham ham ham ham ham ham ham
[730] ham spam ham spam ham ham spam ham ham
[739] ham ham spam ham spam ham ham ham ham
[748] ham spam ham spam ham ham ham ham ham
[757] ham ham ham ham ham ham ham ham ham
[766] ham ham ham ham spam ham ham ham ham
[775] ham ham ham ham ham ham spam ham ham
[784] ham ham ham ham ham ham ham ham ham
[793] ham ham ham ham ham ham ham ham ham
[802] spam ham ham ham ham ham ham ham ham
[811] ham ham ham spam ham ham spam ham spam
[820] ham ham ham ham spam ham spam ham spam
[829] ham ham ham spam ham ham ham ham spam
[838] ham ham ham ham ham ham ham ham ham
[847] ham ham ham ham ham ham ham ham ham
[856] ham ham spam ham ham spam ham ham ham
[865] ham ham ham ham ham ham ham ham ham
[874] ham ham ham ham ham ham ham ham ham
[883] ham ham ham ham ham ham ham ham ham
[892] ham ham ham ham spam spam ham ham spam
[901] ham ham ham ham ham ham ham ham spam
[910] ham ham ham ham spam ham ham ham ham
[919] ham ham ham ham ham ham ham ham ham
[928] ham ham ham spam ham ham ham ham ham
[937] ham ham ham ham ham ham ham ham ham
[946] ham ham ham ham ham ham ham ham ham
[955] ham spam ham ham ham ham ham ham ham
[964] ham ham ham spam ham ham ham ham ham
[973] ham ham ham ham ham ham ham spam ham
[982] ham ham ham ham ham ham ham ham ham
[991] ham ham ham spam ham ham ham ham ham
[1000] spam
[ reached getOption("max.print") -- omitted 390 entries ]
Levels: ham spam
library(gmodels)
CrossTable(sms_test_pred, sms_test_labels,
prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
dnn = c('predicted', 'actual'))
Cell Contents
|-------------------------|
| N |
| N / Col Total |
|-------------------------|
Total Observations in Table: 1390
| actual
predicted | ham | spam | Row Total |
-------------|-----------|-----------|-----------|
ham | 1201 | 30 | 1231 |
| 0.995 | 0.164 | |
-------------|-----------|-----------|-----------|
spam | 6 | 153 | 159 |
| 0.005 | 0.836 | |
-------------|-----------|-----------|-----------|
Column Total | 1207 | 183 | 1390 |
| 0.868 | 0.132 | |
-------------|-----------|-----------|-----------|
Step 5: Improving model performancestep
sms_classifier2 <- naiveBayes(sms_train, sms_train_labels, laplace = 1)
sms_test_pred2 <- predict(sms_classifier2, sms_test)
CrossTable(sms_test_pred2, sms_test_labels,
prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
dnn = c('predicted', 'actual'))
Cell Contents
|-------------------------|
| N |
| N / Col Total |
|-------------------------|
Total Observations in Table: 1390
| actual
predicted | ham | spam | Row Total |
-------------|-----------|-----------|-----------|
ham | 1202 | 28 | 1230 |
| 0.996 | 0.153 | |
-------------|-----------|-----------|-----------|
spam | 5 | 155 | 160 |
| 0.004 | 0.847 | |
-------------|-----------|-----------|-----------|
Column Total | 1207 | 183 | 1390 |
| 0.868 | 0.132 | |
-------------|-----------|-----------|-----------|
## change lplace =1 makes the model a lil bit beter (cuz FN and FP decrease)
## tuning parameter is to add 1.5 to each value is that to move them a lil away from 0
## Naive Bayes is just as slow as k-NN
Accuracy when laplace=0: Accuracy=(1201+153)/1390= 97%
Accuracy when laplace=1: Accuracy=(1202+155)/1390= 98%