# Filtering spam SMS using Naive Bayes
# load the raw SMS messages, keeping text columns as character vectors
sms_raw <- read.csv(file = "sms_spam.csv", stringsAsFactors = FALSE)
# inspect the structure of the loaded data frame
str(object = sms_raw)
## 'data.frame': 5559 obs. of 2 variables:
## $ type: chr "ham" "ham" "ham" "spam" ...
## $ text: chr "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out!"| __truncated__ ...
# recode the spam/ham label column as a factor
sms_raw[["type"]] <- factor(sms_raw[["type"]])
# inspect the recoded label column
str(sms_raw[["type"]])
## Factor w/ 2 levels "ham","spam": 1 1 1 2 2 1 1 1 2 1 ...
table(sms_raw[["type"]])
##
## ham spam
## 4812 747
# assemble a tm corpus with one document per SMS message text
library(tm)
sms_corpus <- Corpus(x = VectorSource(x = sms_raw[["text"]]))
# print a short summary of the corpus
print(sms_corpus)
## A corpus with 5559 text documents
# show the first three documents of the raw corpus
inspect(sms_corpus[seq_len(3)])
## A corpus with 3 text documents
##
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
## create_date creator
## Available variables in the data frame are:
## MetaID
##
## [[1]]
## Hope you are having a good week. Just checking in
##
## [[2]]
## K..give back my thanks.
##
## [[3]]
## Am also doing in cbe only. But have to pay.
# clean up the corpus using tm_map()
# tolower() is a base R function that returns a plain character vector rather
# than a tm text document, so it is wrapped in content_transformer(); passing
# it bare to tm_map() on tm >= 0.6 leaves the corpus in a state that later
# steps such as DocumentTermMatrix() reject
corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
inspect(corpus_clean[1:3])
## A corpus with 3 text documents
##
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
## create_date creator
## Available variables in the data frame are:
## MetaID
##
## [[1]]
## hope you are having a good week. just checking in
##
## [[2]]
## k..give back my thanks.
##
## [[3]]
## am also doing in cbe only. but have to pay.
# strip digits, then drop the words returned by stopwords()
corpus_clean <- tm_map(x = corpus_clean, FUN = removeNumbers)
corpus_clean <- tm_map(x = corpus_clean, FUN = removeWords, stopwords())
# show the first three documents after these two steps
inspect(corpus_clean[seq_len(3)])
## A corpus with 3 text documents
##
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
## create_date creator
## Available variables in the data frame are:
## MetaID
##
## [[1]]
## hope good week. just checking
##
## [[2]]
## k..give back thanks.
##
## [[3]]
## also cbe . pay.
# strip punctuation characters from every document
corpus_clean <- tm_map(x = corpus_clean, FUN = removePunctuation)
# show the first three documents after punctuation removal
inspect(corpus_clean[seq_len(3)])
## A corpus with 3 text documents
##
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
## create_date creator
## Available variables in the data frame are:
## MetaID
##
## [[1]]
## hope good week just checking
##
## [[2]]
## kgive back thanks
##
## [[3]]
## also cbe pay
# collapse the extra whitespace left behind by the earlier removals
corpus_clean <- tm_map(x = corpus_clean, FUN = stripWhitespace)
# show the first three documents after whitespace stripping
inspect(corpus_clean[seq_len(3)])
## A corpus with 3 text documents
##
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
## create_date creator
## Available variables in the data frame are:
## MetaID
##
## [[1]]
## hope good week just checking
##
## [[2]]
## kgive back thanks
##
## [[3]]
## also cbe pay
# show the first three raw (uncleaned) documents as a baseline for comparison
inspect(sms_corpus[seq_len(3)])
## A corpus with 3 text documents
##
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
## create_date creator
## Available variables in the data frame are:
## MetaID
##
## [[1]]
## Hope you are having a good week. Just checking in
##
## [[2]]
## K..give back my thanks.
##
## [[3]]
## Am also doing in cbe only. But have to pay.
# show the same three documents from the cleaned corpus
inspect(corpus_clean[seq_len(3)])
## A corpus with 3 text documents
##
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
## create_date creator
## Available variables in the data frame are:
## MetaID
##
## [[1]]
## hope good week just checking
##
## [[2]]
## kgive back thanks
##
## [[3]]
## also cbe pay
# build the sparse document-term matrix from the cleaned corpus
sms_dtm <- DocumentTermMatrix(x = corpus_clean)
# auto-print the matrix summary
sms_dtm
## A document-term matrix (5559 documents, 7877 terms)
##
## Non-/sparse entries: 42599/43745644
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
# split by position: rows 1-4169 form the training set, rows 4170-5559 the
# test set; the same ranges are applied to the raw data frame, the
# document-term matrix and the cleaned corpus so the three stay aligned
sms_raw_train <- sms_raw[seq_len(4169), ]
sms_raw_test <- sms_raw[seq(4170, 5559), ]
sms_dtm_train <- sms_dtm[seq_len(4169), ]
sms_dtm_test <- sms_dtm[seq(4170, 5559), ]
sms_corpus_train <- corpus_clean[seq_len(4169)]
sms_corpus_test <- corpus_clean[seq(4170, 5559)]
# confirm the spam/ham class balance is comparable across the two splits
prop.table(table(sms_raw_train[["type"]]))
##
## ham spam
## 0.8647 0.1353
prop.table(table(sms_raw_test[["type"]]))
##
## ham spam
## 0.8683 0.1317
# word cloud visualization
library(wordcloud)
## Loading required package: Rcpp
## Loading required package: RColorBrewer
# draw word clouds of the training corpus with different frequency cut-offs
# and word orderings
wordcloud(words = sms_corpus_train, min.freq = 30)
wordcloud(words = sms_corpus_train, random.order = FALSE, min.freq = 30)
wordcloud(words = sms_corpus_train, random.order = FALSE, min.freq = 50)
# split the raw training messages by label
# which() discards any NA comparison results, as subset() would
spam <- sms_raw_train[which(sms_raw_train$type == "spam"), ]
ham <- sms_raw_train[which(sms_raw_train$type == "ham"), ]
# draw one 40-word cloud per label; the ham cloud uses a blue palette
wordcloud(words = spam[["text"]], scale = c(3, 0.5), max.words = 40)
wordcloud(
  words = ham[["text"]],
  scale = c(3, 0.5),
  max.words = 40,
  colors = brewer.pal(9, "Blues")
)
# indicator features: list the terms findFreqTerms() returns for the training
# matrix with a lower frequency bound of 5
findFreqTerms(x = sms_dtm_train, lowfreq = 5)
## [1] "abiola" "able" "abt"
## [4] "accept" "access" "account"
## [7] "across" "activate" "actually"
## [10] "add" "added" "address"
## [13] "admirer" "advance" "aft"
## [16] "afternoon" "age" "ago"
## [19] "aha" "ahead" "aight"
## [22] "aint" "air" "aiyo"
## [25] "alex" "almost" "alone"
## [28] "already" "alright" "also"
## [31] "always" "angry" "announcement"
## [34] "another" "ans" "answer"
## [37] "anymore" "anyone" "anything"
## [40] "anytime" "anyway" "apartment"
## [43] "apply" "appreciate" "arcade"
## [46] "ard" "area" "argument"
## [49] "around" "arrive" "asap"
## [52] "ask" "askd" "asked"
## [55] "asking" "attempt" "auction"
## [58] "available" "ave" "await"
## [61] "awaiting" "awake" "award"
## [64] "awarded" "away" "awesome"
## [67] "babe" "babes" "baby"
## [70] "back" "bad" "bag"
## [73] "bank" "barely" "bathe"
## [76] "battery" "bcoz" "bday"
## [79] "beautiful" "become" "bed"
## [82] "bedroom" "believe" "best"
## [85] "better" "bid" "big"
## [88] "bill" "birthday" "bit"
## [91] "blue" "bluetooth" "bold"
## [94] "bonus" "boo" "book"
## [97] "booked" "boost" "bored"
## [100] "boss" "bother" "bout"
## [103] "box" "boy" "boys"
## [106] "boytoy" "break" "bring"
## [109] "brings" "brother" "bslvyl"
## [112] "btnationalrate" "bucks" "bus"
## [115] "busy" "buy" "buying"
## [118] "cabin" "call" "called"
## [121] "caller" "callertune" "calling"
## [124] "calls" "camcorder" "came"
## [127] "camera" "campus" "can"
## [130] "cancel" "cancer" "cant"
## [133] "car" "card" "care"
## [136] "carlos" "case" "cash"
## [139] "cashbalance" "catch" "cause"
## [142] "cell" "centre" "chance"
## [145] "change" "charge" "charged"
## [148] "charges" "chat" "cheap"
## [151] "cheaper" "check" "checked"
## [154] "checking" "cheers" "chennai"
## [157] "chikku" "childish" "children"
## [160] "choose" "christmas" "claim"
## [163] "class" "clean" "close"
## [166] "club" "code" "coffee"
## [169] "cold" "colleagues" "collect"
## [172] "collection" "college" "colour"
## [175] "come" "comes" "comin"
## [178] "coming" "comp" "company"
## [181] "competition" "completely" "complimentary"
## [184] "computer" "confirm" "congrats"
## [187] "congratulations" "contact" "content"
## [190] "contract" "cool" "copy"
## [193] "correct" "cos" "cost"
## [196] "costa" "costpm" "couple"
## [199] "course" "cover" "coz"
## [202] "crave" "crazy" "created"
## [205] "credit" "credits" "cry"
## [208] "cum" "cup" "currently"
## [211] "custcare" "customer" "cut"
## [214] "cute" "cuz" "dad"
## [217] "daddy" "darlin" "darren"
## [220] "dat" "date" "dating"
## [223] "day" "days" "dead"
## [226] "deal" "dear" "decide"
## [229] "decided" "decimal" "deep"
## [232] "definitely" "del" "delivery"
## [235] "den" "depends" "details"
## [238] "didnt" "die" "died"
## [241] "different" "difficult" "digital"
## [244] "din" "dinner" "direct"
## [247] "dis" "discount" "disturb"
## [250] "dnt" "doctor" "doesnt"
## [253] "dog" "dogging" "doin"
## [256] "don" "done" "dont"
## [259] "door" "double" "download"
## [262] "draw" "dream" "dreams"
## [265] "drink" "drive" "driving"
## [268] "drop" "dropped" "drug"
## [271] "drugs" "dude" "due"
## [274] "dun" "dunno" "dvd"
## [277] "earlier" "early" "earth"
## [280] "easy" "eat" "eatin"
## [283] "eating" "either" "else"
## [286] "email" "end" "ending"
## [289] "ends" "energy" "england"
## [292] "enjoy" "enough" "enter"
## [295] "entered" "entitled" "entry"
## [298] "envelope" "etc" "euro"
## [301] "eve" "even" "evening"
## [304] "ever" "every" "everyone"
## [307] "everything" "exactly" "exam"
## [310] "exams" "excellent" "exciting"
## [313] "excuse" "expecting" "experience"
## [316] "expires" "extra" "eyes"
## [319] "face" "facebook" "fact"
## [322] "family" "fancy" "fantastic"
## [325] "far" "fast" "fat"
## [328] "father" "fault" "feb"
## [331] "feel" "feeling" "feels"
## [334] "felt" "figure" "film"
## [337] "final" "finally" "find"
## [340] "fine" "fingers" "finish"
## [343] "finished" "first" "fixed"
## [346] "flag" "flat" "flower"
## [349] "following" "fone" "food"
## [352] "forever" "forget" "forgot"
## [355] "forward" "forwarded" "found"
## [358] "free" "freemsg" "freephone"
## [361] "frens" "fri" "friday"
## [364] "friend" "friends" "friendship"
## [367] "frm" "frnd" "frnds"
## [370] "full" "fullonsmscom" "fun"
## [373] "funny" "future" "gal"
## [376] "game" "games" "gap"
## [379] "gas" "gave" "gay"
## [382] "get" "gets" "gettin"
## [385] "getting" "gift" "girl"
## [388] "girlfrnd" "girls" "give"
## [391] "glad" "god" "goes"
## [394] "goin" "going" "gone"
## [397] "gonna" "good" "goodmorning"
## [400] "goodnight" "got" "goto"
## [403] "gotta" "great" "grins"
## [406] "guaranteed" "gud" "guess"
## [409] "guy" "guys" "gym"
## [412] "haf" "haha" "hai"
## [415] "hair" "half" "hand"
## [418] "handset" "happen" "happened"
## [421] "happening" "happens" "happiness"
## [424] "happy" "hard" "hate"
## [427] "hav" "havent" "head"
## [430] "hear" "heard" "heart"
## [433] "heavy" "hee" "hell"
## [436] "hello" "help" "hey"
## [439] "hgsuitelands" "hit" "hiya"
## [442] "hmm" "hmmm" "hmv"
## [445] "hold" "holder" "holding"
## [448] "holiday" "home" "hop"
## [451] "hope" "hoping" "horny"
## [454] "hospital" "hot" "hour"
## [457] "hours" "house" "however"
## [460] "hows" "howz" "hrs"
## [463] "httpwwwurawinnercom" "huh" "hungry"
## [466] "hurry" "hurt" "hurts"
## [469] "ice" "identifier" "ill"
## [472] "immediately" "important" "inc"
## [475] "india" "info" "information"
## [478] "inside" "instead" "interested"
## [481] "invited" "ipod" "ish"
## [484] "ive" "izzit" "january"
## [487] "jay" "job" "john"
## [490] "join" "joined" "joke"
## [493] "joking" "jst" "jus"
## [496] "just" "juz" "kate"
## [499] "keep" "keeping" "kept"
## [502] "kick" "kids" "kind"
## [505] "kinda" "king" "kiss"
## [508] "knew" "know" "knows"
## [511] "knw" "lady" "land"
## [514] "landline" "laptop" "lar"
## [517] "last" "late" "later"
## [520] "latest" "laugh" "lazy"
## [523] "ldn" "learn" "least"
## [526] "leave" "leaves" "leaving"
## [529] "lect" "left" "leh"
## [532] "lei" "less" "lesson"
## [535] "lessons" "let" "lets"
## [538] "liao" "library" "life"
## [541] "lift" "light" "like"
## [544] "liked" "line" "link"
## [547] "listen" "little" "live"
## [550] "lmao" "loads" "loan"
## [553] "local" "log" "lol"
## [556] "london" "long" "longer"
## [559] "look" "lookin" "looking"
## [562] "looks" "lor" "lose"
## [565] "lost" "lot" "lots"
## [568] "lovable" "love" "loved"
## [571] "lovely" "lover" "loves"
## [574] "loving" "loyalty" "ltd"
## [577] "luck" "lucky" "lunch"
## [580] "luv" "mad" "made"
## [583] "mah" "mail" "make"
## [586] "makes" "making" "malaria"
## [589] "man" "many" "march"
## [592] "mark" "married" "marry"
## [595] "match" "matches" "mate"
## [598] "mates" "maximize" "maxmins"
## [601] "may" "mayb" "maybe"
## [604] "mean" "meaning" "means"
## [607] "meant" "medical" "meds"
## [610] "meet" "meeting" "meh"
## [613] "member" "merry" "message"
## [616] "messages" "met" "midnight"
## [619] "might" "min" "mind"
## [622] "mine" "mins" "minute"
## [625] "minutes" "miracle" "miss"
## [628] "missed" "missing" "mistake"
## [631] "moan" "mob" "mobile"
## [634] "mobiles" "mobileupd" "mode"
## [637] "mom" "moment" "moms"
## [640] "mon" "monday" "money"
## [643] "month" "months" "morning"
## [646] "mother" "motorola" "move"
## [649] "movie" "movies" "mrng"
## [652] "mrt" "mrw" "msg"
## [655] "msgs" "mths" "much"
## [658] "mum" "music" "must"
## [661] "muz" "nah" "naked"
## [664] "name" "national" "nature"
## [667] "naughty" "near" "need"
## [670] "needs" "net" "network"
## [673] "neva" "never" "new"
## [676] "news" "next" "nice"
## [679] "nigeria" "night" "nite"
## [682] "nobody" "noe" "nokia"
## [685] "noon" "nope" "normal"
## [688] "normptone" "nothing" "notice"
## [691] "now" "num" "number"
## [694] "numbers" "nyt" "obviously"
## [697] "offer" "offers" "office"
## [700] "official" "okay" "okie"
## [703] "old" "omg" "one"
## [706] "ones" "online" "onto"
## [709] "oops" "open" "operator"
## [712] "opinion" "opt" "optout"
## [715] "orange" "orchard" "order"
## [718] "oredi" "oso" "others"
## [721] "otherwise" "outside" "paid"
## [724] "pain" "paper" "parents"
## [727] "park" "part" "partner"
## [730] "party" "pass" "password"
## [733] "past" "pay" "paying"
## [736] "people" "per" "person"
## [739] "persons" "pete" "phone"
## [742] "phones" "photo" "photos"
## [745] "pic" "pick" "picked"
## [748] "picking" "pics" "pictures"
## [751] "pin" "pix" "pizza"
## [754] "place" "plan" "planned"
## [757] "planning" "plans" "play"
## [760] "player" "players" "please"
## [763] "pleasure" "plenty" "pls"
## [766] "plus" "plz" "pmin"
## [769] "pmsg" "pobox" "point"
## [772] "points" "police" "poly"
## [775] "polys" "poor" "possible"
## [778] "post" "posted" "pound"
## [781] "pounds" "ppm" "pray"
## [784] "press" "pretty" "price"
## [787] "princess" "private" "prize"
## [790] "prob" "probably" "problem"
## [793] "project" "promise" "pub"
## [796] "put" "putting" "quality"
## [799] "question" "questions" "quick"
## [802] "quite" "quiz" "rain"
## [805] "raining" "rate" "rates"
## [808] "rather" "rcvd" "reach"
## [811] "reached" "reaching" "read"
## [814] "reading" "ready" "real"
## [817] "really" "realy" "reason"
## [820] "receipt" "receive" "recently"
## [823] "records" "reference" "regards"
## [826] "registered" "relation" "relax"
## [829] "remember" "remind" "remove"
## [832] "rent" "rental" "replied"
## [835] "reply" "replying" "representative"
## [838] "request" "rest" "review"
## [841] "reward" "right" "ring"
## [844] "ringtone" "rite" "road"
## [847] "rock" "role" "room"
## [850] "rose" "round" "rowwjhl"
## [853] "rply" "rreveal" "run"
## [856] "sad" "sae" "safe"
## [859] "said" "sale" "sat"
## [862] "saturday" "savamob" "save"
## [865] "saw" "say" "saying"
## [868] "says" "sch" "school"
## [871] "screaming" "sea" "search"
## [874] "sec" "second" "secret"
## [877] "see" "seeing" "seems"
## [880] "seen" "selected" "self"
## [883] "sell" "semester" "send"
## [886] "sending" "sense" "sent"
## [889] "serious" "seriously" "service"
## [892] "services" "set" "sex"
## [895] "sexy" "shall" "share"
## [898] "shd" "shop" "shopping"
## [901] "short" "show" "shower"
## [904] "shows" "sick" "side"
## [907] "sight" "sign" "silent"
## [910] "simple" "since" "single"
## [913] "sipix" "sir" "sis"
## [916] "sister" "sit" "sitting"
## [919] "situation" "skxh" "slave"
## [922] "sleep" "sleeping" "slept"
## [925] "slow" "slowly" "small"
## [928] "smile" "smiling" "smoke"
## [931] "sms" "smth" "snow"
## [934] "sofa" "sol" "somebody"
## [937] "someone" "something" "sometimes"
## [940] "somewhere" "song" "sony"
## [943] "sonyericsson" "soon" "sorry"
## [946] "sort" "sound" "sounds"
## [949] "south" "space" "speak"
## [952] "special" "specialcall" "specially"
## [955] "spend" "spent" "spoke"
## [958] "spree" "stand" "start"
## [961] "started" "starting" "starts"
## [964] "statement" "station" "stay"
## [967] "staying" "std" "still"
## [970] "stockport" "stop" "store"
## [973] "story" "street" "study"
## [976] "studying" "stuff" "stupid"
## [979] "sub" "sucks" "summer"
## [982] "sun" "sunday" "sunshine"
## [985] "sup" "support" "supposed"
## [988] "sure" "surely" "surprise"
## [991] "sweet" "swing" "system"
## [994] "take" "takes" "taking"
## [997] "talk" "talking" "tampa"
## [1000] "tariffs" "tcs" "tea"
## [1003] "teach" "tear" "tel"
## [1006] "tell" "telling" "tells"
## [1009] "ten" "tenerife" "terms"
## [1012] "test" "text" "texting"
## [1015] "texts" "thank" "thanks"
## [1018] "thanx" "thats" "thing"
## [1021] "things" "think" "thinkin"
## [1024] "thinking" "thinks" "thk"
## [1027] "tho" "though" "thought"
## [1030] "thru" "tht" "thurs"
## [1033] "tick" "ticket" "tickets"
## [1036] "til" "till" "time"
## [1039] "times" "timing" "tired"
## [1042] "tmr" "toclaim" "today"
## [1045] "todays" "together" "told"
## [1048] "tomo" "tomorrow" "tone"
## [1051] "tones" "tonight" "tonite"
## [1054] "took" "top" "torch"
## [1057] "tot" "touch" "tough"
## [1060] "tour" "towards" "town"
## [1063] "track" "train" "training"
## [1066] "transaction" "treat" "tried"
## [1069] "trip" "trouble" "true"
## [1072] "trust" "truth" "try"
## [1075] "trying" "tscs" "ttyl"
## [1078] "tuesday" "twice" "two"
## [1081] "txt" "txting" "txts"
## [1084] "type" "ufind" "ugh"
## [1087] "uks" "ull" "uncle"
## [1090] "understand" "unless" "unlimited"
## [1093] "unredeemed" "unsub" "unsubscribe"
## [1096] "update" "ure" "urgent"
## [1099] "urself" "use" "used"
## [1102] "user" "usf" "using"
## [1105] "usual" "uve" "valentine"
## [1108] "valentines" "valid" "valued"
## [1111] "via" "video" "visit"
## [1114] "vodafone" "voice" "vomit"
## [1117] "voucher" "vouchers" "wait"
## [1120] "waiting" "wake" "waking"
## [1123] "walk" "walking" "wan"
## [1126] "wana" "wanna" "want"
## [1129] "wanted" "wants" "wap"
## [1132] "warm" "waste" "wat"
## [1135] "watch" "watching" "water"
## [1138] "wats" "way" "weather"
## [1141] "wed" "wednesday" "weed"
## [1144] "week" "weekend" "weekends"
## [1147] "weekly" "weeks" "welcome"
## [1150] "well" "wen" "went"
## [1153] "whatever" "whats" "whenever"
## [1156] "whole" "wid" "wif"
## [1159] "wife" "wil" "will"
## [1162] "willing" "win" "wine"
## [1165] "winner" "wins" "wish"
## [1168] "wishing" "wit" "within"
## [1171] "without" "wiv" "wkly"
## [1174] "wks" "wnt" "woke"
## [1177] "won" "wonder" "wonderful"
## [1180] "wont" "word" "words"
## [1183] "work" "workin" "working"
## [1186] "works" "world" "worried"
## [1189] "worries" "worry" "worse"
## [1192] "worth" "wot" "wow"
## [1195] "write" "wrong" "wwq"
## [1198] "wwwgetzedcouk" "xmas" "xxx"
## [1201] "yahoo" "yar" "yeah"
## [1204] "year" "years" "yep"
## [1207] "yes" "yesterday" "yet"
## [1210] "yoga" "yup"
# store the frequent terms as the dictionary shared by both splits
sms_dict <- findFreqTerms(x = sms_dtm_train, lowfreq = 5)
# rebuild the train/test document-term matrices restricted to that dictionary
sms_train <- DocumentTermMatrix(
  x = sms_corpus_train,
  control = list(dictionary = sms_dict)
)
sms_test <- DocumentTermMatrix(
  x = sms_corpus_test,
  control = list(dictionary = sms_dict)
)
# convert counts to a factor
#
# Recode a vector of term counts as a two-level presence indicator.
#   x: numeric vector of counts
# Returns a factor with levels "No" (count not above zero) and "Yes"
# (count above zero); NA counts stay NA.
convert_counts <- function(x) {
  # the comparison already yields TRUE/FALSE, so it is labelled directly;
  # the factor is the final expression so the result is returned visibly
  # instead of as the invisible value of an assignment
  factor(x > 0, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
}
# recode every term column of the train/test matrices with convert_counts()
sms_train <- apply(X = sms_train, MARGIN = 2, FUN = convert_counts)
sms_test <- apply(X = sms_test, MARGIN = 2, FUN = convert_counts)
# summarise the first five recoded term columns
summary(sms_train[, seq_len(5)])
## abiola able abt accept access
## No :4162 No :4150 No :4154 No :4164 No :4165
## Yes: 7 Yes: 19 Yes: 15 Yes: 5 Yes: 4
## Step 3: Training a model on the data ----
library(e1071)
# fit the naive Bayes classifier on the training indicators and labels;
# the label argument is written as sms_raw_train$type because that expression
# is what labels the printed tables
sms_classifier <- naiveBayes(x = sms_train, y = sms_raw_train$type)
# list the components stored in the fitted model
names(sms_classifier)
## [1] "apriori" "tables" "levels" "call"
# show the stored per-term tables for the first two terms
sms_classifier[["tables"]][1:2]
## $abiola
## abiola
## sms_raw_train$type No Yes
## ham 0.998058 0.001942
## spam 1.000000 0.000000
##
## $able
## able
## sms_raw_train$type No Yes
## ham 0.99473 0.00527
## spam 1.00000 0.00000
## Step 4: Evaluating model performance ----
# predict a class for every test message
sms_test_pred <- predict(object = sms_classifier, newdata = sms_test)
# cross-tabulate actual labels (rows) against predicted labels (columns)
table(sms_raw_test$type, sms_test_pred)
## sms_test_pred
## ham spam
## ham 1202 5
## spam 32 151