##### Chapter 4: Classification using Naive Bayes --------------------
## Example: Filtering spam SMS messages ----
## Step 2: Exploring and preparing the data ----
# read the sms data into the sms data frame
sms_raw <- read.csv("sms2.csv", stringsAsFactors = FALSE)
# examine the structure of the sms data
str(sms_raw)
## 'data.frame': 5572 obs. of 2 variables:
## $ type: chr "ham" "ham" "spam" "ham" ...
## $ text: chr "Go until jurong point" "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C"| __truncated__ "U dun say so early hor... U c already then say..." ...
# convert spam/ham to factor.
sms_raw$type <- factor(sms_raw$type)
# examine the type variable more carefully
str(sms_raw$type)
## Factor w/ 2 levels "ham","spam": 1 1 2 1 1 2 1 1 2 2 ...
table(sms_raw$type)
##
## ham spam
## 4825 747
# build a corpus using the text mining (tm) package
library(tm)
## Warning: package 'tm' was built under R version 3.3.3
## Loading required package: NLP
sms_corpus <- VCorpus(VectorSource(sms_raw$text))
# examine the sms corpus
print(sms_corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 5572
inspect(sms_corpus[1:2])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 2
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 21
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 29
as.character(sms_corpus[[1]])
## [1] "Go until jurong point"
lapply(sms_corpus[1:2], as.character)
## $`1`
## [1] "Go until jurong point"
##
## $`2`
## [1] "Ok lar... Joking wif u oni..."
# clean up the corpus using tm_map()
sms_corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
# show the difference between sms_corpus and corpus_clean
as.character(sms_corpus[[1]])
## [1] "Go until jurong point"
as.character(sms_corpus_clean[[1]])
## [1] "go until jurong point"
sms_corpus_clean <- tm_map(sms_corpus_clean, removeNumbers) # remove numbers
sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, stopwords()) # remove stop words
sms_corpus_clean <- tm_map(sms_corpus_clean, removePunctuation) # remove punctuation
# tip: create a custom function to replace (rather than remove) punctuation
removePunctuation("hello...world")
## [1] "helloworld"
replacePunctuation <- function(x) { gsub("[[:punct:]]+", " ", x) }
replacePunctuation("hello...world")
## [1] "hello world"
# illustration of word stemming
library(SnowballC)
wordStem(c("learn", "learned", "learning", "learns"))
## [1] "learn" "learn" "learn" "learn"
sms_corpus_clean <- tm_map(sms_corpus_clean, stemDocument)
sms_corpus_clean <- tm_map(sms_corpus_clean, stripWhitespace) # eliminate unneeded whitespace
# examine the final clean corpus
lapply(sms_corpus[1:3], as.character)
## $`1`
## [1] "Go until jurong point"
##
## $`2`
## [1] "Ok lar... Joking wif u oni..."
##
## $`3`
## [1] "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
lapply(sms_corpus_clean[1:3], as.character)
## $`1`
## [1] "go jurong point"
##
## $`2`
## [1] "ok lar joke wif u oni"
##
## $`3`
## [1] "free entri wkli comp win fa cup final tkts st may text fa receiv entri questionstd txt ratetc appli s"
# create a document-term sparse matrix
sms_dtm <- DocumentTermMatrix(sms_corpus_clean)
# alternative solution: create a document-term sparse matrix directly from the SMS corpus
sms_dtm2 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE
))
# alternative solution: using a custom stop words function ensures an identical result
sms_dtm3 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = function(x) { removeWords(x, stopwords()) },
  removePunctuation = TRUE,
  stemming = TRUE
))
# compare the result
sms_dtm
## <<DocumentTermMatrix (documents: 5572, terms: 5669)>>
## Non-/sparse entries: 34183/31553485
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
sms_dtm2
## <<DocumentTermMatrix (documents: 5572, terms: 5984)>>
## Non-/sparse entries: 34983/33307865
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
sms_dtm3
## <<DocumentTermMatrix (documents: 5572, terms: 5669)>>
## Non-/sparse entries: 34183/31553485
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
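# a quick sanity check of the comparison above: sms_dtm and sms_dtm3 should share
# exactly the same vocabulary, while sms_dtm2 differs slightly because the built-in
# cleaning options are applied in a different order (a sketch using tm's Terms() accessor)
identical(Terms(sms_dtm), Terms(sms_dtm3))
length(setdiff(Terms(sms_dtm2), Terms(sms_dtm)))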
# creating training and test datasets (sequential split: messages 1-4,169 for training,
# 4,170-5,559 for testing; note that the last 13 of the 5,572 rows are not used)
sms_dtm_train <- sms_dtm[1:4169, ]
sms_dtm_test <- sms_dtm[4170:5559, ]
# also save the labels
sms_train_labels <- sms_raw[1:4169, ]$type
sms_test_labels <- sms_raw[4170:5559, ]$type
# check that the proportion of spam is similar
prop.table(table(sms_train_labels))
## sms_train_labels
## ham spam
## 0.8644759 0.1355241
prop.table(table(sms_test_labels))
## sms_test_labels
## ham spam
## 0.8705036 0.1294964
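# the sequential split above assumes the messages appear in essentially random order;
# the similar spam proportions suggest that is reasonable here. For reference, a
# randomized 75/25 split could be drawn instead (hypothetical alternative -- train_idx
# and the _rnd objects are new names and are not used for the results that follow):
set.seed(123)   # for a reproducible sample
train_idx <- sample(nrow(sms_dtm), floor(0.75 * nrow(sms_dtm)))
sms_dtm_train_rnd    <- sms_dtm[train_idx, ]
sms_dtm_test_rnd     <- sms_dtm[-train_idx, ]
sms_train_labels_rnd <- sms_raw$type[train_idx]
sms_test_labels_rnd  <- sms_raw$type[-train_idx]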
# word cloud visualization
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.3.3
## Loading required package: RColorBrewer
wordcloud(sms_corpus_clean, min.freq = 50, random.order = FALSE)

# subset the raw data into spam and ham groups
spam <- subset(sms_raw, type == "spam")
ham <- subset(sms_raw, type == "ham")
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))

wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))

# alternative feature selection: drop terms that appear in fewer than about 0.1% of training messages
sms_dtm_freq_train <- removeSparseTerms(sms_dtm_train, 0.999)
sms_dtm_freq_train
## <<DocumentTermMatrix (documents: 4169, terms: 939)>>
## Non-/sparse entries: 19581/3895110
## Sparsity : 99%
## Maximal term length: 13
## Weighting : term frequency (tf)
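# the 0.999 threshold keeps terms whose document frequency exceeds 0.1% of the
# 4,169 training messages -- roughly comparable to the findFreqTerms(5) cutoff used below
ceiling(4169 * (1 - 0.999))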
# indicator features for frequent words
findFreqTerms(sms_dtm_train, 5)
## [1] "â<U+0080><U+0093>" "abiola" "abl" "abt"
## [5] "accept" "access" "account" "activ"
## [9] "actual" "add" "address" "admir"
## [13] "adult" "advanc" "aft" "afternoon"
## [17] "age" "ago" "aight" "aint"
## [21] "air" "aiyah" "almost" "alon"
## [25] "alreadi" "alright" "alrit" "also"
## [29] "alway" "amp" "angri" "announc"
## [33] "anoth" "answer" "anymor" "anyon"
## [37] "anyth" "anytim" "anyway" "apart"
## [41] "app" "appli" "appoint" "appreci"
## [45] "april" "ard" "area" "arm"
## [49] "around" "ask" "askd" "ass"
## [53] "attempt" "auction" "avail" "ave"
## [57] "await" "award" "away" "awesom"
## [61] "babe" "babi" "back" "bad"
## [65] "bag" "bak" "bank" "bath"
## [69] "batteri" "bcum" "bday" "beauti"
## [73] "becom" "bed" "believ" "best"
## [77] "better" "bid" "big" "bill"
## [81] "bird" "birthday" "bit" "black"
## [85] "bless" "bonus" "boo" "book"
## [89] "bore" "bout" "box" "boy"
## [93] "boytoy" "brand" "break" "bring"
## [97] "brother" "bugi" "bus" "busi"
## [101] "buy" "buzz" "cabin" "call"
## [105] "caller" "callertun" "camcord" "came"
## [109] "camera" "can" "cant" "car"
## [113] "card" "care" "carlo" "case"
## [117] "cash" "cashbal" "catch" "caus"
## [121] "chanc" "chang" "charact" "charg"
## [125] "chariti" "chat" "check" "cheer"
## [129] "chennai" "chikku" "choos" "christma"
## [133] "cine" "cinema" "claim" "class"
## [137] "clean" "clear" "clock" "close"
## [141] "club" "code" "coffe" "coin"
## [145] "collect" "colleg" "colour" "come"
## [149] "comin" "comp" "compani" "competit"
## [153] "complet" "complimentari" "comput" "concentr"
## [157] "condit" "confirm" "congrat" "congratul"
## [161] "connect" "contact" "content" "convey"
## [165] "cool" "copi" "correct" "cos"
## [169] "cost" "countri" "cours" "cover"
## [173] "coz" "crave" "crazi" "credit"
## [177] "cum" "cup" "current" "custcar"
## [181] "custom" "cut" "cute" "dad"
## [185] "daddi" "damn" "darl" "darlin"
## [189] "darren" "dat" "date" "day"
## [193] "deal" "dear" "decid" "definit"
## [197] "del" "deliveri" "den" "depend"
## [201] "detail" "didnt" "die" "differ"
## [205] "difficult" "digit" "din" "dinner"
## [209] "direct" "dis" "discount" "disturb"
## [213] "dnt" "doctor" "doesnt" "dog"
## [217] "doin" "don" "donâ<U+0080><U+0098>t" "done"
## [221] "dont" "door" "doubl" "download"
## [225] "draw" "dream" "drink" "drive"
## [229] "drop" "drug" "dude" "dun"
## [233] "dunno" "dvd" "earli" "earlier"
## [237] "easi" "eat" "either" "els"
## [241] "email" "empti" "end" "enemi"
## [245] "england" "enjoy" "enough" "enter"
## [249] "entri" "euro" "eve" "even"
## [253] "ever" "everi" "everyon" "everyth"
## [257] "exact" "exam" "excel" "excit"
## [261] "excus" "expect" "experi" "expir"
## [265] "eye" "face" "facebook" "fact"
## [269] "fall" "famili" "fanci" "fantasi"
## [273] "far" "fast" "fat" "father"
## [277] "feel" "fetch" "fight" "figur"
## [281] "film" "final" "find" "fine"
## [285] "finger" "finish" "first" "five"
## [289] "fix" "flight" "flirt" "flower"
## [293] "follow" "fone" "food" "forget"
## [297] "forgot" "forward" "found" "free"
## [301] "freemsg" "freephon" "fren" "fri"
## [305] "friday" "friend" "friendship" "frm"
## [309] "frnd" "frnds" "fuck" "full"
## [313] "fun" "funni" "gal" "game"
## [317] "gas" "gave" "gay" "get"
## [321] "gettin" "gift" "girl" "give"
## [325] "glad" "god" "goe" "goin"
## [329] "gone" "gonna" "good" "goodmorn"
## [333] "got" "gotta" "great" "green"
## [337] "greet" "group" "guarante" "gud"
## [341] "guess" "guy" "gym" "haf"
## [345] "haha" "hair" "half" "hand"
## [349] "happen" "happi" "hard" "hav"
## [353] "havent" "head" "hear" "heard"
## [357] "heart" "heavi" "hee" "hello"
## [361] "help" "hey" "hgsuiteland" "high"
## [365] "hit" "hiya" "hmm" "hmmm"
## [369] "hmv" "hold" "holder" "holiday"
## [373] "home" "honey" "hook" "hop"
## [377] "hope" "horni" "hospit" "hot"
## [381] "hotel" "hour" "hous" "how"
## [385] "howz" "hrs" "huh" "hungri"
## [389] "hurri" "hurt" "ice" "idea"
## [393] "identifi" "ignor" "ill" "imma"
## [397] "immedi" "import" "inc" "india"
## [401] "indian" "info" "inform" "invit"
## [405] "ipod" "ish" "issu" "ive"
## [409] "izzit" "jay" "job" "john"
## [413] "join" "joke" "joy" "jus"
## [417] "just" "juz" "kalli" "kate"
## [421] "keep" "kept" "kick" "kid"
## [425] "kind" "kinda" "kiss" "knew"
## [429] "know" "ladi" "land" "landlin"
## [433] "laptop" "lar" "last" "late"
## [437] "later" "latest" "laugh" "lazi"
## [441] "ldn" "learn" "least" "leav"
## [445] "lect" "left" "leh" "lei"
## [449] "lesson" "let" "liao" "librari"
## [453] "lick" "life" "lift" "light"
## [457] "like" "line" "link" "listen"
## [461] "littl" "live" "load" "loan"
## [465] "locat" "log" "lol" "long"
## [469] "look" "lor" "lose" "lost"
## [473] "lot" "lovabl" "love" "lover"
## [477] "loyalti" "ltd" "ltdecimalgt" "ltgt"
## [481] "luck" "lucki" "lunch" "luv"
## [485] "made" "mah" "mail" "make"
## [489] "man" "mani" "march" "marri"
## [493] "match" "mate" "maxim" "may"
## [497] "mayb" "mean" "meant" "med"
## [501] "medic" "meet" "meh" "mell"
## [505] "member" "men" "menu" "merri"
## [509] "messag" "met" "mid" "might"
## [513] "min" "mind" "mine" "minut"
## [517] "miss" "mistak" "moan" "mob"
## [521] "mobil" "mobileupd" "mode" "mom"
## [525] "moment" "mon" "monday" "money"
## [529] "month" "moon" "morn" "motorola"
## [533] "move" "movi" "mrng" "msg"
## [537] "msgs" "mths" "much" "mum"
## [541] "murder" "music" "must" "muz"
## [545] "nah" "name" "natur" "naughti"
## [549] "near" "need" "network" "neva"
## [553] "never" "new" "news" "next"
## [557] "nice" "nigeria" "night" "nite"
## [561] "nobodi" "noe" "nokia" "noon"
## [565] "nope" "normal" "noth" "notic"
## [569] "now" "ntt" "num" "number"
## [573] "nyt" "offer" "offic" "okay"
## [577] "oki" "old" "one" "onlin"
## [581] "oop" "open" "oper" "opinion"
## [585] "opt" "optout" "orang" "orchard"
## [589] "order" "oredi" "oso" "otherwis"
## [593] "outsid" "page" "pain" "paper"
## [597] "parent" "park" "part" "parti"
## [601] "partner" "pass" "password" "pay"
## [605] "peopl" "per" "person" "phone"
## [609] "photo" "pic" "pick" "pictur"
## [613] "pix" "pizza" "place" "plan"
## [617] "plane" "play" "player" "pleas"
## [621] "pls" "plus" "plz" "pmin"
## [625] "pmsg" "pobox" "point" "poli"
## [629] "pop" "possibl" "post" "pound"
## [633] "power" "pple" "ppm" "practic"
## [637] "pray" "prepar" "press" "pretti"
## [641] "price" "princess" "privat" "prize"
## [645] "prob" "probabl" "problem" "process"
## [649] "project" "promis" "pub" "put"
## [653] "qualiti" "question" "quick" "quit"
## [657] "quiz" "quot" "rain" "rate"
## [661] "rcvd" "reach" "read" "readi"
## [665] "real" "realiz" "realli" "reason"
## [669] "receipt" "receiv" "red" "regard"
## [673] "regist" "rememb" "remind" "rent"
## [677] "rental" "repli" "repres" "request"
## [681] "respond" "result" "return" "reveal"
## [685] "right" "ring" "rington" "rite"
## [689] "road" "rock" "room" "rose"
## [693] "round" "rowwjhl" "rpli" "rreveal"
## [697] "run" "sad" "sae" "safe"
## [701] "said" "sale" "sam" "sat"
## [705] "saturday" "savamob" "save" "saw"
## [709] "say" "sch" "school" "score"
## [713] "scream" "search" "second" "secret"
## [717] "see" "seem" "select" "sell"
## [721] "semest" "send" "sent" "serious"
## [725] "servic" "set" "settl" "sex"
## [729] "sexi" "shall" "shd" "shirt"
## [733] "shit" "shop" "short" "show"
## [737] "shower" "shuhui" "sick" "side"
## [741] "sign" "silent" "simpl" "sinc"
## [745] "sing" "singl" "sir" "sis"
## [749] "sister" "sit" "situat" "sky"
## [753] "sleep" "slept" "slow" "slowli"
## [757] "smile" "smoke" "sms" "smth"
## [761] "snow" "solv" "somebodi" "someon"
## [765] "someth" "sometim" "somewher" "song"
## [769] "soni" "soon" "sorri" "sort"
## [773] "sound" "space" "speak" "special"
## [777] "specialcal" "spend" "spent" "spoke"
## [781] "sport" "spree" "stand" "start"
## [785] "statement" "station" "stay" "still"
## [789] "stop" "store" "stori" "str"
## [793] "street" "strong" "student" "studi"
## [797] "stuff" "stupid" "style" "sub"
## [801] "subscrib" "success" "summer" "sun"
## [805] "sunday" "sunshin" "support" "suppos"
## [809] "sure" "sweet" "system" "take"
## [813] "talk" "tampa" "tcs" "teach"
## [817] "team" "tel" "tell" "ten"
## [821] "tenerif" "term" "test" "text"
## [825] "thank" "thanx" "that" "thing"
## [829] "think" "thinkin" "thk" "thnk"
## [833] "tho" "though" "thought" "ticket"
## [837] "til" "till" "time" "tire"
## [841] "tmr" "tncs" "today" "togeth"
## [845] "told" "tomo" "tomorrow" "tone"
## [849] "tonight" "took" "top" "tot"
## [853] "total" "touch" "tough" "tour"
## [857] "town" "train" "transact" "treat"
## [861] "tri" "trip" "troubl" "true"
## [865] "trust" "truth" "tscs" "tuesday"
## [869] "turn" "two" "txt" "txting"
## [873] "txts" "type" "ufind" "ugh"
## [877] "umma" "uncl" "understand" "unlimit"
## [881] "unredeem" "unsubscrib" "updat" "ure"
## [885] "urgent" "use" "usual" "uve"
## [889] "valentin" "valid" "valu" "via"
## [893] "video" "voic" "voucher" "wait"
## [897] "wake" "walk" "wan" "wana"
## [901] "wanna" "want" "warm" "wast"
## [905] "wat" "watch" "water" "way"
## [909] "weak" "wear" "weather" "weed"
## [913] "week" "weekend" "welcom" "well"
## [917] "wen" "went" "wer" "wet"
## [921] "what" "whatev" "whole" "wid"
## [925] "wif" "wife" "will" "win"
## [929] "wine" "winner" "wish" "wit"
## [933] "within" "without" "wiv" "wkli"
## [937] "wnt" "woke" "won" "wonder"
## [941] "wont" "word" "work" "world"
## [945] "worri" "worth" "wot" "wow"
## [949] "wrong" "wun" "wwwgetzedcouk" "xmas"
## [953] "xxx" "yahoo" "yar" "yeah"
## [957] "year" "yep" "yes" "yest"
## [961] "yesterday" "yet" "yrs" "yun"
## [965] "yup"
# save frequently-appearing terms to a character vector
# (the first entry is a mis-encoded en dash carried over from the raw text)
sms_freq_words <- findFreqTerms(sms_dtm_train, 5)
str(sms_freq_words)
## chr [1:965] "â<U+0080><U+0093>""| __truncated__ "abiola" "abl" "abt" "accept" "access" ...
# create DTMs with only the frequent terms
sms_dtm_freq_train <- sms_dtm_train[ , sms_freq_words]
sms_dtm_freq_test <- sms_dtm_test[ , sms_freq_words]
# convert counts to "Yes"/"No" strings indicating word presence
convert_counts <- function(x) {
  x <- ifelse(x > 0, "Yes", "No")
}
# use apply() with MARGIN = 2 to run convert_counts() on the columns of the train/test data
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)
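# a quick look at a corner of the converted training matrix; every cell should now be
# "Yes" or "No" ("call" is used only because it appears in the frequent-term list above)
sms_train[1:5, 1:5]
table(sms_train[ , "call"])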
## Step 3: Training a model on the data ----
library(e1071)
sms_classifier <- naiveBayes(sms_train, sms_train_labels)
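# the fitted e1071 model stores the class priors and a conditional probability table
# for each feature; a minimal sketch of inspecting them ("call" chosen only because
# it appears in the frequent-term list above)
sms_classifier$apriori
sms_classifier$tables[["call"]]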
## Step 4: Evaluating model performance ----
sms_test_pred <- predict(sms_classifier, sms_test)
head(sms_test_pred)
## [1] ham ham ham ham ham ham
## Levels: ham spam
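# predict() can also return the posterior probabilities rather than class labels
# (type = "raw" in e1071); sms_test_prob is a new object introduced just for this peek
sms_test_prob <- predict(sms_classifier, sms_test, type = "raw")
head(sms_test_prob)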
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.3.3
CrossTable(sms_test_pred, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1390
##
##
## | actual
## predicted | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1203 | 29 | 1232 |
## | 0.994 | 0.161 | |
## -------------|-----------|-----------|-----------|
## spam | 7 | 151 | 158 |
## | 0.006 | 0.839 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1210 | 180 | 1390 |
## | 0.871 | 0.129 | |
## -------------|-----------|-----------|-----------|
##
##
## Step 5: Improving model performance ----
sms_classifier2 <- naiveBayes(sms_train, sms_train_labels, laplace = 1)
sms_test_pred2 <- predict(sms_classifier2, sms_test)
CrossTable(sms_test_pred2, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1390
##
##
## | actual
## predicted | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1205 | 31 | 1236 |
## | 0.996 | 0.172 | |
## -------------|-----------|-----------|-----------|
## spam | 5 | 149 | 154 |
## | 0.004 | 0.828 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1210 | 180 | 1390 |
## | 0.871 | 0.129 | |
## -------------|-----------|-----------|-----------|
##
##
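# overall accuracy works out the same for both models (1354 / 1390, about 97.4%); the
# laplace = 1 version trades two fewer ham messages flagged as spam for two more
# missed spam messages, as the cross tables above show
mean(sms_test_pred == sms_test_labels)
mean(sms_test_pred2 == sms_test_labels)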