# Filtering spam SMS using Naive Bayes

# load the raw SMS messages; character columns stay character (no factors)
sms_raw <- read.csv(file = "sms_spam.csv", stringsAsFactors = FALSE)

# show each column's type together with its first few values
str(sms_raw)
## 'data.frame':    5559 obs. of  2 variables:
##  $ type: chr  "ham" "ham" "ham" "spam" ...
##  $ text: chr  "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out!"| __truncated__ ...

# the label column is categorical ("ham"/"spam"), so store it as a factor
sms_raw[["type"]] <- factor(sms_raw[["type"]])

# inspect the recoded labels: factor structure first, then per-class counts
str(sms_raw[["type"]])
##  Factor w/ 2 levels "ham","spam": 1 1 1 2 2 1 1 1 2 1 ...
table(sms_raw[["type"]])
## 
##  ham spam 
## 4812  747

# build a corpus using the text mining (tm) package
library(tm)
# wrap the message texts in a tm corpus, one document per SMS
sms_corpus <- Corpus(VectorSource(sms_raw[["text"]]))

# summarise the corpus, then show its first three documents
print(sms_corpus)
## A corpus with 5559 text documents
inspect(sms_corpus[seq_len(3)])
## A corpus with 3 text documents
## 
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
##   create_date creator 
## Available variables in the data frame are:
##   MetaID 
## 
## [[1]]
## Hope you are having a good week. Just checking in
## 
## [[2]]
## K..give back my thanks.
## 
## [[3]]
## Am also doing in cbe only. But have to pay.

# clean up the corpus using tm_map()
# Step 1: lower-case every message.
# tolower() is a base R function that returns a plain character vector, so it
# is wrapped in content_transformer() to keep each element a tm document.
# Passing it bare to tm_map() strips the document class on list-based corpora
# in tm >= 0.6, which makes the later DocumentTermMatrix() call fail.
corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
inspect(corpus_clean[1:3])
## A corpus with 3 text documents
## 
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
##   create_date creator 
## Available variables in the data frame are:
##   MetaID 
## 
## [[1]]
## hope you are having a good week. just checking in
## 
## [[2]]
## k..give back my thanks.
## 
## [[3]]
## am also doing in cbe only. but have to pay.

# strip digits, then drop the words in tm's English stop word list
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords(kind = "en"))
inspect(corpus_clean[seq_len(3)])
## A corpus with 3 text documents
## 
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
##   create_date creator 
## Available variables in the data frame are:
##   MetaID 
## 
## [[1]]
## hope     good week. just checking 
## 
## [[2]]
## k..give back  thanks.
## 
## [[3]]
##  also   cbe .    pay.

# delete punctuation characters; note that words separated only by
# punctuation are joined by this step (e.g. "k..give" becomes "kgive")
corpus_clean <- tm_map(corpus_clean, removePunctuation)
inspect(corpus_clean[seq_len(3)])
## A corpus with 3 text documents
## 
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
##   create_date creator 
## Available variables in the data frame are:
##   MetaID 
## 
## [[1]]
## hope     good week just checking 
## 
## [[2]]
## kgive back  thanks
## 
## [[3]]
##  also   cbe     pay

# collapse the runs of blanks left behind by the earlier removals
corpus_clean <- tm_map(corpus_clean, stripWhitespace)
inspect(corpus_clean[seq_len(3)])
## A corpus with 3 text documents
## 
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
##   create_date creator 
## Available variables in the data frame are:
##   MetaID 
## 
## [[1]]
## hope good week just checking 
## 
## [[2]]
## kgive back thanks
## 
## [[3]]
##  also cbe pay

# show the first three original messages, for comparison with their
# cleaned versions
inspect(sms_corpus[seq_len(3)])
## A corpus with 3 text documents
## 
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
##   create_date creator 
## Available variables in the data frame are:
##   MetaID 
## 
## [[1]]
## Hope you are having a good week. Just checking in
## 
## [[2]]
## K..give back my thanks.
## 
## [[3]]
## Am also doing in cbe only. But have to pay.
# first three messages of the cleaned corpus
inspect(corpus_clean[seq_len(3)])
## A corpus with 3 text documents
## 
## The metadata consists of 2 tag-value pairs and a data frame
## Available tags are:
##   create_date creator 
## Available variables in the data frame are:
##   MetaID 
## 
## [[1]]
## hope good week just checking 
## 
## [[2]]
## kgive back thanks
## 
## [[3]]
##  also cbe pay

# tokenise the cleaned corpus into a sparse documents-by-terms matrix
sms_dtm <- DocumentTermMatrix(x = corpus_clean)
# auto-print the matrix summary (dimensions, sparsity, weighting)
sms_dtm
## A document-term matrix (5559 documents, 7877 terms)
## 
## Non-/sparse entries: 42599/43745644
## Sparsity           : 100%
## Maximal term length: 40 
## Weighting          : term frequency (tf)

# creating training and test datasets
# The first n_train messages form the training set and the remaining ones the
# test set. The split is purely positional, so it presumably relies on the rows
# of sms_raw already being in random order -- TODO confirm.
n_train <- 4169L
n_total <- nrow(sms_raw)

# The corpus and the DTM were both built from sms_raw, so a single pair of
# index vectors must address the same messages in all three objects.
stopifnot(
  n_total > n_train,
  length(corpus_clean) == n_total,
  nrow(sms_dtm) == n_total
)
train_idx <- seq_len(n_train)
test_idx <- (n_train + 1L):n_total

# raw data frame (labels + original text)
sms_raw_train <- sms_raw[train_idx, ]
sms_raw_test <- sms_raw[test_idx, ]

# document-term matrix
sms_dtm_train <- sms_dtm[train_idx, ]
sms_dtm_test <- sms_dtm[test_idx, ]

# cleaned corpus
sms_corpus_train <- corpus_clean[train_idx]
sms_corpus_test <- corpus_clean[test_idx]

# the share of spam vs. ham should be close in the two partitions
prop.table(table(sms_raw_train[["type"]]))
## 
##    ham   spam 
## 0.8647 0.1353
prop.table(table(sms_raw_test[["type"]]))
## 
##    ham   spam 
## 0.8683 0.1317


# word cloud visualization
library(wordcloud)
## Loading required package: Rcpp
## Loading required package: RColorBrewer

# Word clouds of the cleaned training corpus; each call draws one plot.
# The "plot of chunk" placeholders left over from the rendered report are kept
# as comments so that this file parses as R.

# only words with a frequency of at least 30, default (random) placement order
wordcloud(sms_corpus_train, min.freq = 30)

## plot of chunk unnamed-chunk-1

# same threshold, but without random placement order
wordcloud(sms_corpus_train, min.freq = 30, random.order = FALSE)

## plot of chunk unnamed-chunk-1

# stricter frequency threshold of 50
wordcloud(sms_corpus_train, min.freq = 50, random.order = FALSE)

## plot of chunk unnamed-chunk-1


# subset the training data into spam and ham groups
spam <- subset(sms_raw_train, type == "spam")
ham <- subset(sms_raw_train, type == "ham")

# Word cloud per class, drawn from the raw (uncleaned) message text and
# limited to 40 words each. The "plot of chunk" placeholders left over from
# the rendered report are kept as comments so that this file parses as R.
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))

## plot of chunk unnamed-chunk-1

# ham cloud uses the 9-step "Blues" palette from RColorBrewer
wordcloud(
  ham$text,
  max.words = 40,
  scale = c(3, 0.5),
  colors = brewer.pal(9, "Blues")
)

## plot of chunk unnamed-chunk-1


# indicator features for frequent words
# list every term whose frequency in the training DTM is at least 5
findFreqTerms(sms_dtm_train, lowfreq = 5)
##    [1] "abiola"              "able"                "abt"                
##    [4] "accept"              "access"              "account"            
##    [7] "across"              "activate"            "actually"           
##   [10] "add"                 "added"               "address"            
##   [13] "admirer"             "advance"             "aft"                
##   [16] "afternoon"           "age"                 "ago"                
##   [19] "aha"                 "ahead"               "aight"              
##   [22] "aint"                "air"                 "aiyo"               
##   [25] "alex"                "almost"              "alone"              
##   [28] "already"             "alright"             "also"               
##   [31] "always"              "angry"               "announcement"       
##   [34] "another"             "ans"                 "answer"             
##   [37] "anymore"             "anyone"              "anything"           
##   [40] "anytime"             "anyway"              "apartment"          
##   [43] "apply"               "appreciate"          "arcade"             
##   [46] "ard"                 "area"                "argument"           
##   [49] "around"              "arrive"              "asap"               
##   [52] "ask"                 "askd"                "asked"              
##   [55] "asking"              "attempt"             "auction"            
##   [58] "available"           "ave"                 "await"              
##   [61] "awaiting"            "awake"               "award"              
##   [64] "awarded"             "away"                "awesome"            
##   [67] "babe"                "babes"               "baby"               
##   [70] "back"                "bad"                 "bag"                
##   [73] "bank"                "barely"              "bathe"              
##   [76] "battery"             "bcoz"                "bday"               
##   [79] "beautiful"           "become"              "bed"                
##   [82] "bedroom"             "believe"             "best"               
##   [85] "better"              "bid"                 "big"                
##   [88] "bill"                "birthday"            "bit"                
##   [91] "blue"                "bluetooth"           "bold"               
##   [94] "bonus"               "boo"                 "book"               
##   [97] "booked"              "boost"               "bored"              
##  [100] "boss"                "bother"              "bout"               
##  [103] "box"                 "boy"                 "boys"               
##  [106] "boytoy"              "break"               "bring"              
##  [109] "brings"              "brother"             "bslvyl"             
##  [112] "btnationalrate"      "bucks"               "bus"                
##  [115] "busy"                "buy"                 "buying"             
##  [118] "cabin"               "call"                "called"             
##  [121] "caller"              "callertune"          "calling"            
##  [124] "calls"               "camcorder"           "came"               
##  [127] "camera"              "campus"              "can"                
##  [130] "cancel"              "cancer"              "cant"               
##  [133] "car"                 "card"                "care"               
##  [136] "carlos"              "case"                "cash"               
##  [139] "cashbalance"         "catch"               "cause"              
##  [142] "cell"                "centre"              "chance"             
##  [145] "change"              "charge"              "charged"            
##  [148] "charges"             "chat"                "cheap"              
##  [151] "cheaper"             "check"               "checked"            
##  [154] "checking"            "cheers"              "chennai"            
##  [157] "chikku"              "childish"            "children"           
##  [160] "choose"              "christmas"           "claim"              
##  [163] "class"               "clean"               "close"              
##  [166] "club"                "code"                "coffee"             
##  [169] "cold"                "colleagues"          "collect"            
##  [172] "collection"          "college"             "colour"             
##  [175] "come"                "comes"               "comin"              
##  [178] "coming"              "comp"                "company"            
##  [181] "competition"         "completely"          "complimentary"      
##  [184] "computer"            "confirm"             "congrats"           
##  [187] "congratulations"     "contact"             "content"            
##  [190] "contract"            "cool"                "copy"               
##  [193] "correct"             "cos"                 "cost"               
##  [196] "costa"               "costpm"              "couple"             
##  [199] "course"              "cover"               "coz"                
##  [202] "crave"               "crazy"               "created"            
##  [205] "credit"              "credits"             "cry"                
##  [208] "cum"                 "cup"                 "currently"          
##  [211] "custcare"            "customer"            "cut"                
##  [214] "cute"                "cuz"                 "dad"                
##  [217] "daddy"               "darlin"              "darren"             
##  [220] "dat"                 "date"                "dating"             
##  [223] "day"                 "days"                "dead"               
##  [226] "deal"                "dear"                "decide"             
##  [229] "decided"             "decimal"             "deep"               
##  [232] "definitely"          "del"                 "delivery"           
##  [235] "den"                 "depends"             "details"            
##  [238] "didnt"               "die"                 "died"               
##  [241] "different"           "difficult"           "digital"            
##  [244] "din"                 "dinner"              "direct"             
##  [247] "dis"                 "discount"            "disturb"            
##  [250] "dnt"                 "doctor"              "doesnt"             
##  [253] "dog"                 "dogging"             "doin"               
##  [256] "don"                 "done"                "dont"               
##  [259] "door"                "double"              "download"           
##  [262] "draw"                "dream"               "dreams"             
##  [265] "drink"               "drive"               "driving"            
##  [268] "drop"                "dropped"             "drug"               
##  [271] "drugs"               "dude"                "due"                
##  [274] "dun"                 "dunno"               "dvd"                
##  [277] "earlier"             "early"               "earth"              
##  [280] "easy"                "eat"                 "eatin"              
##  [283] "eating"              "either"              "else"               
##  [286] "email"               "end"                 "ending"             
##  [289] "ends"                "energy"              "england"            
##  [292] "enjoy"               "enough"              "enter"              
##  [295] "entered"             "entitled"            "entry"              
##  [298] "envelope"            "etc"                 "euro"               
##  [301] "eve"                 "even"                "evening"            
##  [304] "ever"                "every"               "everyone"           
##  [307] "everything"          "exactly"             "exam"               
##  [310] "exams"               "excellent"           "exciting"           
##  [313] "excuse"              "expecting"           "experience"         
##  [316] "expires"             "extra"               "eyes"               
##  [319] "face"                "facebook"            "fact"               
##  [322] "family"              "fancy"               "fantastic"          
##  [325] "far"                 "fast"                "fat"                
##  [328] "father"              "fault"               "feb"                
##  [331] "feel"                "feeling"             "feels"              
##  [334] "felt"                "figure"              "film"               
##  [337] "final"               "finally"             "find"               
##  [340] "fine"                "fingers"             "finish"             
##  [343] "finished"            "first"               "fixed"              
##  [346] "flag"                "flat"                "flower"             
##  [349] "following"           "fone"                "food"               
##  [352] "forever"             "forget"              "forgot"             
##  [355] "forward"             "forwarded"           "found"              
##  [358] "free"                "freemsg"             "freephone"          
##  [361] "frens"               "fri"                 "friday"             
##  [364] "friend"              "friends"             "friendship"         
##  [367] "frm"                 "frnd"                "frnds"              
##  [370] "full"                "fullonsmscom"        "fun"                
##  [373] "funny"               "future"              "gal"                
##  [376] "game"                "games"               "gap"                
##  [379] "gas"                 "gave"                "gay"                
##  [382] "get"                 "gets"                "gettin"             
##  [385] "getting"             "gift"                "girl"               
##  [388] "girlfrnd"            "girls"               "give"               
##  [391] "glad"                "god"                 "goes"               
##  [394] "goin"                "going"               "gone"               
##  [397] "gonna"               "good"                "goodmorning"        
##  [400] "goodnight"           "got"                 "goto"               
##  [403] "gotta"               "great"               "grins"              
##  [406] "guaranteed"          "gud"                 "guess"              
##  [409] "guy"                 "guys"                "gym"                
##  [412] "haf"                 "haha"                "hai"                
##  [415] "hair"                "half"                "hand"               
##  [418] "handset"             "happen"              "happened"           
##  [421] "happening"           "happens"             "happiness"          
##  [424] "happy"               "hard"                "hate"               
##  [427] "hav"                 "havent"              "head"               
##  [430] "hear"                "heard"               "heart"              
##  [433] "heavy"               "hee"                 "hell"               
##  [436] "hello"               "help"                "hey"                
##  [439] "hgsuitelands"        "hit"                 "hiya"               
##  [442] "hmm"                 "hmmm"                "hmv"                
##  [445] "hold"                "holder"              "holding"            
##  [448] "holiday"             "home"                "hop"                
##  [451] "hope"                "hoping"              "horny"              
##  [454] "hospital"            "hot"                 "hour"               
##  [457] "hours"               "house"               "however"            
##  [460] "hows"                "howz"                "hrs"                
##  [463] "httpwwwurawinnercom" "huh"                 "hungry"             
##  [466] "hurry"               "hurt"                "hurts"              
##  [469] "ice"                 "identifier"          "ill"                
##  [472] "immediately"         "important"           "inc"                
##  [475] "india"               "info"                "information"        
##  [478] "inside"              "instead"             "interested"         
##  [481] "invited"             "ipod"                "ish"                
##  [484] "ive"                 "izzit"               "january"            
##  [487] "jay"                 "job"                 "john"               
##  [490] "join"                "joined"              "joke"               
##  [493] "joking"              "jst"                 "jus"                
##  [496] "just"                "juz"                 "kate"               
##  [499] "keep"                "keeping"             "kept"               
##  [502] "kick"                "kids"                "kind"               
##  [505] "kinda"               "king"                "kiss"               
##  [508] "knew"                "know"                "knows"              
##  [511] "knw"                 "lady"                "land"               
##  [514] "landline"            "laptop"              "lar"                
##  [517] "last"                "late"                "later"              
##  [520] "latest"              "laugh"               "lazy"               
##  [523] "ldn"                 "learn"               "least"              
##  [526] "leave"               "leaves"              "leaving"            
##  [529] "lect"                "left"                "leh"                
##  [532] "lei"                 "less"                "lesson"             
##  [535] "lessons"             "let"                 "lets"               
##  [538] "liao"                "library"             "life"               
##  [541] "lift"                "light"               "like"               
##  [544] "liked"               "line"                "link"               
##  [547] "listen"              "little"              "live"               
##  [550] "lmao"                "loads"               "loan"               
##  [553] "local"               "log"                 "lol"                
##  [556] "london"              "long"                "longer"             
##  [559] "look"                "lookin"              "looking"            
##  [562] "looks"               "lor"                 "lose"               
##  [565] "lost"                "lot"                 "lots"               
##  [568] "lovable"             "love"                "loved"              
##  [571] "lovely"              "lover"               "loves"              
##  [574] "loving"              "loyalty"             "ltd"                
##  [577] "luck"                "lucky"               "lunch"              
##  [580] "luv"                 "mad"                 "made"               
##  [583] "mah"                 "mail"                "make"               
##  [586] "makes"               "making"              "malaria"            
##  [589] "man"                 "many"                "march"              
##  [592] "mark"                "married"             "marry"              
##  [595] "match"               "matches"             "mate"               
##  [598] "mates"               "maximize"            "maxmins"            
##  [601] "may"                 "mayb"                "maybe"              
##  [604] "mean"                "meaning"             "means"              
##  [607] "meant"               "medical"             "meds"               
##  [610] "meet"                "meeting"             "meh"                
##  [613] "member"              "merry"               "message"            
##  [616] "messages"            "met"                 "midnight"           
##  [619] "might"               "min"                 "mind"               
##  [622] "mine"                "mins"                "minute"             
##  [625] "minutes"             "miracle"             "miss"               
##  [628] "missed"              "missing"             "mistake"            
##  [631] "moan"                "mob"                 "mobile"             
##  [634] "mobiles"             "mobileupd"           "mode"               
##  [637] "mom"                 "moment"              "moms"               
##  [640] "mon"                 "monday"              "money"              
##  [643] "month"               "months"              "morning"            
##  [646] "mother"              "motorola"            "move"               
##  [649] "movie"               "movies"              "mrng"               
##  [652] "mrt"                 "mrw"                 "msg"                
##  [655] "msgs"                "mths"                "much"               
##  [658] "mum"                 "music"               "must"               
##  [661] "muz"                 "nah"                 "naked"              
##  [664] "name"                "national"            "nature"             
##  [667] "naughty"             "near"                "need"               
##  [670] "needs"               "net"                 "network"            
##  [673] "neva"                "never"               "new"                
##  [676] "news"                "next"                "nice"               
##  [679] "nigeria"             "night"               "nite"               
##  [682] "nobody"              "noe"                 "nokia"              
##  [685] "noon"                "nope"                "normal"             
##  [688] "normptone"           "nothing"             "notice"             
##  [691] "now"                 "num"                 "number"             
##  [694] "numbers"             "nyt"                 "obviously"          
##  [697] "offer"               "offers"              "office"             
##  [700] "official"            "okay"                "okie"               
##  [703] "old"                 "omg"                 "one"                
##  [706] "ones"                "online"              "onto"               
##  [709] "oops"                "open"                "operator"           
##  [712] "opinion"             "opt"                 "optout"             
##  [715] "orange"              "orchard"             "order"              
##  [718] "oredi"               "oso"                 "others"             
##  [721] "otherwise"           "outside"             "paid"               
##  [724] "pain"                "paper"               "parents"            
##  [727] "park"                "part"                "partner"            
##  [730] "party"               "pass"                "password"           
##  [733] "past"                "pay"                 "paying"             
##  [736] "people"              "per"                 "person"             
##  [739] "persons"             "pete"                "phone"              
##  [742] "phones"              "photo"               "photos"             
##  [745] "pic"                 "pick"                "picked"             
##  [748] "picking"             "pics"                "pictures"           
##  [751] "pin"                 "pix"                 "pizza"              
##  [754] "place"               "plan"                "planned"            
##  [757] "planning"            "plans"               "play"               
##  [760] "player"              "players"             "please"             
##  [763] "pleasure"            "plenty"              "pls"                
##  [766] "plus"                "plz"                 "pmin"               
##  [769] "pmsg"                "pobox"               "point"              
##  [772] "points"              "police"              "poly"               
##  [775] "polys"               "poor"                "possible"           
##  [778] "post"                "posted"              "pound"              
##  [781] "pounds"              "ppm"                 "pray"               
##  [784] "press"               "pretty"              "price"              
##  [787] "princess"            "private"             "prize"              
##  [790] "prob"                "probably"            "problem"            
##  [793] "project"             "promise"             "pub"                
##  [796] "put"                 "putting"             "quality"            
##  [799] "question"            "questions"           "quick"              
##  [802] "quite"               "quiz"                "rain"               
##  [805] "raining"             "rate"                "rates"              
##  [808] "rather"              "rcvd"                "reach"              
##  [811] "reached"             "reaching"            "read"               
##  [814] "reading"             "ready"               "real"               
##  [817] "really"              "realy"               "reason"             
##  [820] "receipt"             "receive"             "recently"           
##  [823] "records"             "reference"           "regards"            
##  [826] "registered"          "relation"            "relax"              
##  [829] "remember"            "remind"              "remove"             
##  [832] "rent"                "rental"              "replied"            
##  [835] "reply"               "replying"            "representative"     
##  [838] "request"             "rest"                "review"             
##  [841] "reward"              "right"               "ring"               
##  [844] "ringtone"            "rite"                "road"               
##  [847] "rock"                "role"                "room"               
##  [850] "rose"                "round"               "rowwjhl"            
##  [853] "rply"                "rreveal"             "run"                
##  [856] "sad"                 "sae"                 "safe"               
##  [859] "said"                "sale"                "sat"                
##  [862] "saturday"            "savamob"             "save"               
##  [865] "saw"                 "say"                 "saying"             
##  [868] "says"                "sch"                 "school"             
##  [871] "screaming"           "sea"                 "search"             
##  [874] "sec"                 "second"              "secret"             
##  [877] "see"                 "seeing"              "seems"              
##  [880] "seen"                "selected"            "self"               
##  [883] "sell"                "semester"            "send"               
##  [886] "sending"             "sense"               "sent"               
##  [889] "serious"             "seriously"           "service"            
##  [892] "services"            "set"                 "sex"                
##  [895] "sexy"                "shall"               "share"              
##  [898] "shd"                 "shop"                "shopping"           
##  [901] "short"               "show"                "shower"             
##  [904] "shows"               "sick"                "side"               
##  [907] "sight"               "sign"                "silent"             
##  [910] "simple"              "since"               "single"             
##  [913] "sipix"               "sir"                 "sis"                
##  [916] "sister"              "sit"                 "sitting"            
##  [919] "situation"           "skxh"                "slave"              
##  [922] "sleep"               "sleeping"            "slept"              
##  [925] "slow"                "slowly"              "small"              
##  [928] "smile"               "smiling"             "smoke"              
##  [931] "sms"                 "smth"                "snow"               
##  [934] "sofa"                "sol"                 "somebody"           
##  [937] "someone"             "something"           "sometimes"          
##  [940] "somewhere"           "song"                "sony"               
##  [943] "sonyericsson"        "soon"                "sorry"              
##  [946] "sort"                "sound"               "sounds"             
##  [949] "south"               "space"               "speak"              
##  [952] "special"             "specialcall"         "specially"          
##  [955] "spend"               "spent"               "spoke"              
##  [958] "spree"               "stand"               "start"              
##  [961] "started"             "starting"            "starts"             
##  [964] "statement"           "station"             "stay"               
##  [967] "staying"             "std"                 "still"              
##  [970] "stockport"           "stop"                "store"              
##  [973] "story"               "street"              "study"              
##  [976] "studying"            "stuff"               "stupid"             
##  [979] "sub"                 "sucks"               "summer"             
##  [982] "sun"                 "sunday"              "sunshine"           
##  [985] "sup"                 "support"             "supposed"           
##  [988] "sure"                "surely"              "surprise"           
##  [991] "sweet"               "swing"               "system"             
##  [994] "take"                "takes"               "taking"             
##  [997] "talk"                "talking"             "tampa"              
## [1000] "tariffs"             "tcs"                 "tea"                
## [1003] "teach"               "tear"                "tel"                
## [1006] "tell"                "telling"             "tells"              
## [1009] "ten"                 "tenerife"            "terms"              
## [1012] "test"                "text"                "texting"            
## [1015] "texts"               "thank"               "thanks"             
## [1018] "thanx"               "thats"               "thing"              
## [1021] "things"              "think"               "thinkin"            
## [1024] "thinking"            "thinks"              "thk"                
## [1027] "tho"                 "though"              "thought"            
## [1030] "thru"                "tht"                 "thurs"              
## [1033] "tick"                "ticket"              "tickets"            
## [1036] "til"                 "till"                "time"               
## [1039] "times"               "timing"              "tired"              
## [1042] "tmr"                 "toclaim"             "today"              
## [1045] "todays"              "together"            "told"               
## [1048] "tomo"                "tomorrow"            "tone"               
## [1051] "tones"               "tonight"             "tonite"             
## [1054] "took"                "top"                 "torch"              
## [1057] "tot"                 "touch"               "tough"              
## [1060] "tour"                "towards"             "town"               
## [1063] "track"               "train"               "training"           
## [1066] "transaction"         "treat"               "tried"              
## [1069] "trip"                "trouble"             "true"               
## [1072] "trust"               "truth"               "try"                
## [1075] "trying"              "tscs"                "ttyl"               
## [1078] "tuesday"             "twice"               "two"                
## [1081] "txt"                 "txting"              "txts"               
## [1084] "type"                "ufind"               "ugh"                
## [1087] "uks"                 "ull"                 "uncle"              
## [1090] "understand"          "unless"              "unlimited"          
## [1093] "unredeemed"          "unsub"               "unsubscribe"        
## [1096] "update"              "ure"                 "urgent"             
## [1099] "urself"              "use"                 "used"               
## [1102] "user"                "usf"                 "using"              
## [1105] "usual"               "uve"                 "valentine"          
## [1108] "valentines"          "valid"               "valued"             
## [1111] "via"                 "video"               "visit"              
## [1114] "vodafone"            "voice"               "vomit"              
## [1117] "voucher"             "vouchers"            "wait"               
## [1120] "waiting"             "wake"                "waking"             
## [1123] "walk"                "walking"             "wan"                
## [1126] "wana"                "wanna"               "want"               
## [1129] "wanted"              "wants"               "wap"                
## [1132] "warm"                "waste"               "wat"                
## [1135] "watch"               "watching"            "water"              
## [1138] "wats"                "way"                 "weather"            
## [1141] "wed"                 "wednesday"           "weed"               
## [1144] "week"                "weekend"             "weekends"           
## [1147] "weekly"              "weeks"               "welcome"            
## [1150] "well"                "wen"                 "went"               
## [1153] "whatever"            "whats"               "whenever"           
## [1156] "whole"               "wid"                 "wif"                
## [1159] "wife"                "wil"                 "will"               
## [1162] "willing"             "win"                 "wine"               
## [1165] "winner"              "wins"                "wish"               
## [1168] "wishing"             "wit"                 "within"             
## [1171] "without"             "wiv"                 "wkly"               
## [1174] "wks"                 "wnt"                 "woke"               
## [1177] "won"                 "wonder"              "wonderful"          
## [1180] "wont"                "word"                "words"              
## [1183] "work"                "workin"              "working"            
## [1186] "works"               "world"               "worried"            
## [1189] "worries"             "worry"               "worse"              
## [1192] "worth"               "wot"                 "wow"                
## [1195] "write"               "wrong"               "wwq"                
## [1198] "wwwgetzedcouk"       "xmas"                "xxx"                
## [1201] "yahoo"               "yar"                 "yeah"               
## [1204] "year"                "years"               "yep"                
## [1207] "yes"                 "yesterday"           "yet"                
## [1210] "yoga"                "yup"
# build the shared vocabulary: terms whose frequency in the training DTM is
# at least 5, then rebuild the train/test matrices restricted to those terms
sms_dict <- findFreqTerms(sms_dtm_train, lowfreq = 5)
sms_train <- DocumentTermMatrix(
    sms_corpus_train,
    control = list(dictionary = sms_dict)
)
sms_test <- DocumentTermMatrix(
    sms_corpus_test,
    control = list(dictionary = sms_dict)
)

# convert a vector of term counts to a presence/absence factor
#
# x: numeric vector of counts (e.g. one column of a document-term matrix)
# returns: a factor the same length as x with levels "No" (count of 0 or
#          less) and "Yes" (count above 0); NA counts stay NA
convert_counts <- function(x) {
    # build the factor straight from the logical test and make it the last
    # expression, so the result is returned visibly (ending on an assignment
    # would return it invisibly and print nothing at the console)
    factor(x > 0, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
}

# run convert_counts() over every column (MARGIN = 2) of the train and test
# matrices so each term becomes a "No"/"Yes" indicator
sms_test <- apply(X = sms_test, MARGIN = 2, FUN = convert_counts)
sms_train <- apply(X = sms_train, MARGIN = 2, FUN = convert_counts)

# quick look at the first five term columns of the training data
summary(sms_train[, seq_len(5)])
##  abiola      able       abt       accept     access    
##  No :4162   No :4150   No :4154   No :4164   No :4165  
##  Yes:   7   Yes:  19   Yes:  15   Yes:   5   Yes:   4
## Step 3: Training a model on the data ----
library(e1071)
# fit the naive Bayes model on the indicator features and the message types;
# the y expression is kept as-is because it is used to label the tables below
sms_classifier <- naiveBayes(x = sms_train, y = sms_raw_train$type)
names(sms_classifier)
## [1] "apriori" "tables"  "levels"  "call"
# per-class tables for the first two terms
sms_classifier[["tables"]][1:2]
## $abiola
##                   abiola
## sms_raw_train$type       No      Yes
##               ham  0.998058 0.001942
##               spam 1.000000 0.000000
## 
## $able
##                   able
## sms_raw_train$type      No     Yes
##               ham  0.99473 0.00527
##               spam 1.00000 0.00000

## Step 4: Evaluating model performance ----
# predict a class for every test message, then cross-tabulate the actual
# types (rows) against the predictions (columns)
sms_test_pred <- predict(object = sms_classifier, newdata = sms_test)
table(sms_raw_test$type, sms_test_pred)
##       sms_test_pred
##         ham spam
##   ham  1202    5
##   spam   32  151