This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).

Example: Filtering spam SMS messages

Step 2: Exploring and preparing the data ----

# read the SMS data from sms_spam.csv into the sms_raw data frame,
# keeping string columns as character vectors (stringsAsFactors = FALSE)
sms_raw <- read.csv("sms_spam.csv", stringsAsFactors = FALSE)
# examine the structure of the sms data
str(sms_raw)
'data.frame':   5559 obs. of  2 variables:
 $ type: chr  "ham" "ham" "ham" "spam" ...
 $ text: chr  "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out!"| __truncated__ ...
# convert spam/ham to factor.
sms_raw$type <- factor(sms_raw$type)
# examine the type variable more carefully
str(sms_raw$type)
 Factor w/ 2 levels "ham","spam": 1 1 1 2 2 1 1 1 2 1 ...
table(sms_raw$type)

 ham spam 
4812  747 
# build a corpus using the text mining (tm) package
library(tm)
Loading required package: NLP
sms_corpus <- VCorpus(VectorSource(sms_raw$text))
# examine the sms corpus
print(sms_corpus)
<<VCorpus>>
Metadata:  corpus specific: 0, document level (indexed): 0
Content:  documents: 5559
inspect(sms_corpus[1:2])
<<VCorpus>>
Metadata:  corpus specific: 0, document level (indexed): 0
Content:  documents: 2

[[1]]
<<PlainTextDocument>>
Metadata:  7
Content:  chars: 49

[[2]]
<<PlainTextDocument>>
Metadata:  7
Content:  chars: 23
as.character(sms_corpus[[1]])
[1] "Hope you are having a good week. Just checking in"
lapply(sms_corpus[1:2], as.character)
$`1`
[1] "Hope you are having a good week. Just checking in"

$`2`
[1] "K..give back my thanks."
# clean up the corpus using tm_map(): start by lower-casing every message
# (tolower is wrapped in content_transformer() before being passed to tm_map)
sms_corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
# show the difference between sms_corpus and sms_corpus_clean
as.character(sms_corpus[[1]])
[1] "Hope you are having a good week. Just checking in"
as.character(sms_corpus_clean[[1]])
[1] "hope you are having a good week. just checking in"
sms_corpus_clean <- tm_map(sms_corpus_clean, removeNumbers) # remove numbers
sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, stopwords()) # remove stop words
sms_corpus_clean <- tm_map(sms_corpus_clean, removePunctuation) # remove punctuation
# tip: create a custom function to replace (rather than remove) punctuation
removePunctuation("hello...world")
[1] "helloworld"
# Replace each run of one or more punctuation characters with a single
# space, so that words joined only by punctuation stay separate
# (e.g. "hello...world" becomes "hello world").
#
# x: a character vector.
# Returns a character vector of the same length as x.
replacePunctuation <- function(x) {
  punct_run <- "[[:punct:]]+"
  gsub(pattern = punct_run, replacement = " ", x = x)
}
replacePunctuation("hello...world")
[1] "hello world"
# illustration of word stemming
library(SnowballC)
wordStem(c("learn", "learned", "learning", "learns"))
[1] "learn" "learn" "learn" "learn"
sms_corpus_clean <- tm_map(sms_corpus_clean, stemDocument)
sms_corpus_clean <- tm_map(sms_corpus_clean, stripWhitespace) # eliminate unneeded whitespace
# examine the final clean corpus
lapply(sms_corpus[1:3], as.character)
$`1`
[1] "Hope you are having a good week. Just checking in"

$`2`
[1] "K..give back my thanks."

$`3`
[1] "Am also doing in cbe only. But have to pay."
lapply(sms_corpus_clean[1:3], as.character)
$`1`
[1] "hope good week just check"

$`2`
[1] "kgive back thank"

$`3`
[1] " also cbe pay"
# create a document-term sparse matrix
sms_dtm <- DocumentTermMatrix(sms_corpus_clean)
# alternative solution: create a document-term sparse matrix directly from the SMS corpus
sms_dtm2 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE
))
# alternative solution: using custom stop words function ensures identical result
sms_dtm3 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = function(x) { removeWords(x, stopwords()) },
  removePunctuation = TRUE,
  stemming = TRUE
))
# compare the result
sms_dtm
<<DocumentTermMatrix (documents: 5559, terms: 6518)>>
Non-/sparse entries: 42113/36191449
Sparsity           : 100%
Maximal term length: 40
Weighting          : term frequency (tf)
sms_dtm2
<<DocumentTermMatrix (documents: 5559, terms: 6909)>>
Non-/sparse entries: 43192/38363939
Sparsity           : 100%
Maximal term length: 40
Weighting          : term frequency (tf)
sms_dtm3
<<DocumentTermMatrix (documents: 5559, terms: 6518)>>
Non-/sparse entries: 42113/36191449
Sparsity           : 100%
Maximal term length: 40
Weighting          : term frequency (tf)
# creating training and test datasets
# The first 75% of the rows form the training set and the remaining rows the
# test set. The split point is derived from the number of rows in sms_raw
# rather than hard-coded; for the 5559 messages here it is rows 1-4169 for
# training and rows 4170-5559 for testing.
# NOTE(review): a sequential split assumes the rows are in no particular
# order -- confirm against the data source.
sms_n_total   <- nrow(sms_raw)
sms_train_idx <- seq_len(floor(0.75 * sms_n_total))
# setdiff() (rather than (n_train + 1):n_total) stays correct for tiny inputs
sms_test_idx  <- setdiff(seq_len(sms_n_total), sms_train_idx)
sms_dtm_train <- sms_dtm[sms_train_idx, ]
sms_dtm_test  <- sms_dtm[sms_test_idx, ]
# also save the labels
sms_train_labels <- sms_raw$type[sms_train_idx]
sms_test_labels  <- sms_raw$type[sms_test_idx]
# check that the proportion of spam is similar
prop.table(table(sms_train_labels))
sms_train_labels
      ham      spam 
0.8647158 0.1352842 
prop.table(table(sms_test_labels))
sms_test_labels
      ham      spam 
0.8683453 0.1316547 
# word cloud visualization
library(wordcloud)
Loading required package: RColorBrewer
wordcloud(sms_corpus_clean, min.freq = 50, random.order = FALSE)

# subset the messages into spam and ham groups
# NOTE(review): this filters sms_raw (every message), not only the rows
# used for training
spam <- subset(sms_raw, type == "spam")
ham  <- subset(sms_raw, type == "ham")
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))

wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))

# drop sparse terms from the training DTM using a sparsity threshold of 0.999
# NOTE(review): sms_dtm_freq_train is reassigned further down from
# sms_freq_words, so this result only feeds the printout that follows
sms_dtm_freq_train <- removeSparseTerms(sms_dtm_train, 0.999)
sms_dtm_freq_train
<<DocumentTermMatrix (documents: 4169, terms: 1101)>>
Non-/sparse entries: 24834/4565235
Sparsity           : 99%
Maximal term length: 19
Weighting          : term frequency (tf)
# indicator features for frequent words
findFreqTerms(sms_dtm_train, 5)
   [1] "abiola"              "abl"                 "abt"                
   [4] "accept"              "access"              "account"            
   [7] "across"              "act"                 "activ"              
  [10] "actual"              "add"                 "address"            
  [13] "admir"               "adult"               "advanc"             
  [16] "aft"                 "afternoon"           "age"                
  [19] "ago"                 "aha"                 "ahead"              
  [22] "aight"               "aint"                "air"                
  [25] "aiyo"                "alex"                "almost"             
  [28] "alon"                "alreadi"             "alright"            
  [31] "also"                "alway"               "angri"              
  [34] "announc"             "anoth"               "answer"             
  [37] "anymor"              "anyon"               "anyth"              
  [40] "anytim"              "anyway"              "apart"              
  [43] "app"                 "appli"               "appreci"            
  [46] "arcad"               "ard"                 "area"               
  [49] "argu"                "argument"            "armand"             
  [52] "around"              "arrang"              "arriv"              
  [55] "asap"                "ask"                 "askd"               
  [58] "attempt"             "auction"             "avail"              
  [61] "ave"                 "avoid"               "await"              
  [64] "awak"                "award"               "away"               
  [67] "awesom"              "babe"                "babi"               
  [70] "back"                "bad"                 "bag"                
  [73] "bank"                "bare"                "basic"              
  [76] "bath"                "batteri"             "bcoz"               
  [79] "bday"                "beauti"              "becom"              
  [82] "bed"                 "bedroom"             "beer"               
  [85] "begin"               "believ"              "best"               
  [88] "better"              "bid"                 "big"                
  [91] "bill"                "bird"                "birthday"           
  [94] "bit"                 "black"               "blank"              
  [97] "bless"               "blue"                "bluetooth"          
 [100] "bold"                "bonus"               "boo"                
 [103] "book"                "boost"               "bore"               
 [106] "boss"                "bother"              "bout"               
 [109] "box"                 "boy"                 "boytoy"             
 [112] "break"               "breath"              "bring"              
 [115] "brother"             "bslvyl"              "btnationalr"        
 [118] "buck"                "bus"                 "busi"               
 [121] "buy"                 "cabin"               "call"               
 [124] "caller"              "callertun"           "camcord"            
 [127] "came"                "camera"              "campus"             
 [130] "can"                 "cancel"              "cancer"             
 [133] "cant"                "car"                 "card"               
 [136] "care"                "carlo"               "case"               
 [139] "cash"                "cashbal"             "catch"              
 [142] "caus"                "celebr"              "cell"               
 [145] "centr"               "chanc"               "chang"              
 [148] "charg"               "chat"                "cheap"              
 [151] "cheaper"             "check"               "cheer"              
 [154] "chennai"             "chikku"              "childish"           
 [157] "children"            "choic"               "choos"              
 [160] "christma"            "claim"               "class"              
 [163] "clean"               "clear"               "close"              
 [166] "club"                "code"                "coffe"              
 [169] "cold"                "colleagu"            "collect"            
 [172] "colleg"              "colour"              "come"               
 [175] "comin"               "comp"                "compani"            
 [178] "competit"            "complet"             "complimentari"      
 [181] "comput"              "condit"              "confirm"            
 [184] "congrat"             "congratul"           "connect"            
 [187] "contact"             "content"             "contract"           
 [190] "cook"                "cool"                "copi"               
 [193] "correct"             "cos"                 "cost"               
 [196] "costa"               "costpm"              "coupl"              
 [199] "cours"               "cover"               "coz"                
 [202] "crave"               "crazi"               "creat"              
 [205] "credit"              "cri"                 "cross"              
 [208] "cuddl"               "cum"                 "cup"                
 [211] "current"             "custcar"             "custom"             
 [214] "cut"                 "cute"                "cuz"                
 [217] "dad"                 "daddi"               "darl"               
 [220] "darlin"              "darren"              "dat"                
 [223] "date"                "day"                 "dead"               
 [226] "deal"                "dear"                "decid"              
 [229] "decim"               "decis"               "deep"               
 [232] "definit"             "del"                 "deliv"              
 [235] "deliveri"            "den"                 "depend"             
 [238] "detail"              "didnt"               "die"                
 [241] "diet"                "differ"              "difficult"          
 [244] "digit"               "din"                 "dinner"             
 [247] "direct"              "dis"                 "discount"           
 [250] "discuss"             "disturb"             "dnt"                
 [253] "doc"                 "doctor"              "doesnt"             
 [256] "dog"                 "doin"                "don"                
 [259] "done"                "dont"                "door"               
 [262] "doubl"               "download"            "draw"               
 [265] "dream"               "drink"               "drive"              
 [268] "drop"                "drug"                "dude"               
 [271] "due"                 "dun"                 "dunno"              
 [274] "dvd"                 "earli"               "earlier"            
 [277] "earth"               "easi"                "eat"                
 [280] "eatin"               "egg"                 "either"             
 [283] "els"                 "email"               "embarass"           
 [286] "end"                 "energi"              "england"            
 [289] "enjoy"               "enough"              "enter"              
 [292] "entitl"              "entri"               "envelop"            
 [295] "etc"                 "euro"                "eve"                
 [298] "even"                "ever"                "everi"              
 [301] "everybodi"           "everyon"             "everyth"            
 [304] "exact"               "exam"                "excel"              
 [307] "excit"               "excus"               "expect"             
 [310] "experi"              "expir"               "extra"              
 [313] "eye"                 "face"                "facebook"           
 [316] "fact"                "fall"                "famili"             
 [319] "fanci"               "fantasi"             "fantast"            
 [322] "far"                 "fast"                "fat"                
 [325] "father"              "fault"               "feb"                
 [328] "feel"                "felt"                "fetch"              
 [331] "fight"               "figur"               "file"               
 [334] "fill"                "film"                "final"              
 [337] "find"                "fine"                "finger"             
 [340] "finish"              "first"               "fix"                
 [343] "flag"                "flat"                "flight"             
 [346] "flower"              "follow"              "fone"               
 [349] "food"                "forev"               "forget"             
 [352] "forgot"              "forward"             "found"              
 [355] "freak"               "free"                "freemsg"            
 [358] "freephon"            "fren"                "fri"                
 [361] "friday"              "friend"              "friendship"         
 [364] "frm"                 "frnd"                "frnds"              
 [367] "full"                "fullonsmscom"        "fun"                
 [370] "funni"               "futur"               "gal"                
 [373] "game"                "gap"                 "gas"                
 [376] "gave"                "gay"                 "gentl"              
 [379] "get"                 "gettin"              "gift"               
 [382] "girl"                "girlfrnd"            "give"               
 [385] "glad"                "god"                 "goe"                
 [388] "goin"                "gone"                "gonna"              
 [391] "good"                "goodmorn"            "goodnight"          
 [394] "got"                 "goto"                "gotta"              
 [397] "great"               "grin"                "guarante"           
 [400] "gud"                 "guess"               "guy"                
 [403] "gym"                 "haf"                 "haha"               
 [406] "hai"                 "hair"                "half"               
 [409] "hand"                "handset"             "hang"               
 [412] "happen"              "happi"               "hard"               
 [415] "hate"                "hav"                 "havent"             
 [418] "head"                "hear"                "heard"              
 [421] "heart"               "heavi"               "hee"                
 [424] "hell"                "hello"               "help"               
 [427] "hey"                 "hgsuiteland"         "hit"                
 [430] "hiya"                "hmm"                 "hmmm"               
 [433] "hmv"                 "hol"                 "hold"               
 [436] "holder"              "holiday"             "home"               
 [439] "hook"                "hop"                 "hope"               
 [442] "horni"               "hospit"              "hot"                
 [445] "hotel"               "hour"                "hous"               
 [448] "how"                 "howev"               "howz"               
 [451] "hrs"                 "httpwwwurawinnercom" "hug"                
 [454] "huh"                 "hungri"              "hurri"              
 [457] "hurt"                "ice"                 "idea"               
 [460] "identifi"            "ignor"               "ill"                
 [463] "immedi"              "import"              "inc"                
 [466] "includ"              "india"               "info"               
 [469] "inform"              "insid"               "instead"            
 [472] "interest"            "invit"               "ipod"               
 [475] "irrit"               "ish"                 "island"             
 [478] "issu"                "ive"                 "izzit"              
 [481] "januari"             "jay"                 "job"                
 [484] "john"                "join"                "joke"               
 [487] "joy"                 "jst"                 "jus"                
 [490] "just"                "juz"                 "kate"               
 [493] "keep"                "kept"                "kick"               
 [496] "kid"                 "kill"                "kind"               
 [499] "kinda"               "king"                "kiss"               
 [502] "knew"                "know"                "knw"                
 [505] "ladi"                "land"                "landlin"            
 [508] "laptop"              "lar"                 "last"               
 [511] "late"                "later"               "latest"             
 [514] "laugh"               "lazi"                "ldn"                
 [517] "lead"                "learn"               "least"              
 [520] "leav"                "lect"                "left"               
 [523] "leh"                 "lei"                 "less"               
 [526] "lesson"              "let"                 "letter"             
 [529] "liao"                "librari"             "lie"                
 [532] "life"                "lift"                "light"              
 [535] "like"                "line"                "link"               
 [538] "list"                "listen"              "littl"              
 [541] "live"                "lmao"                "load"               
 [544] "loan"                "local"               "locat"              
 [547] "log"                 "lol"                 "london"             
 [550] "long"                "longer"              "look"               
 [553] "lookin"              "lor"                 "lose"               
 [556] "lost"                "lot"                 "lovabl"             
 [559] "love"                "lover"               "loyalti"            
 [562] "ltd"                 "luck"                "lucki"              
 [565] "lunch"               "luv"                 "mad"                
 [568] "made"                "mah"                 "mail"               
 [571] "make"                "malaria"             "man"                
 [574] "mani"                "march"               "mark"               
 [577] "marri"               "match"               "mate"               
 [580] "matter"              "maxim"               "maxmin"             
 [583] "may"                 "mayb"                "meal"               
 [586] "mean"                "meant"               "med"                
 [589] "medic"               "meet"                "meetin"             
 [592] "meh"                 "member"              "men"                
 [595] "merri"               "messag"              "met"                
 [598] "mid"                 "midnight"            "might"              
 [601] "min"                 "mind"                "mine"               
 [604] "minut"               "miracl"              "miss"               
 [607] "mistak"              "moan"                "mob"                
 [610] "mobil"               "mobileupd"           "mode"               
 [613] "mom"                 "moment"              "mon"                
 [616] "monday"              "money"               "month"              
 [619] "morn"                "mother"              "motorola"           
 [622] "move"                "movi"                "mrng"               
 [625] "mrt"                 "mrw"                 "msg"                
 [628] "msgs"                "mths"                "much"               
 [631] "mum"                 "murder"              "music"              
 [634] "must"                "muz"                 "nah"                
 [637] "nake"                "name"                "nation"             
 [640] "natur"               "naughti"             "near"               
 [643] "need"                "net"                 "network"            
 [646] "neva"                "never"               "new"                
 [649] "news"                "next"                "nice"               
 [652] "nigeria"             "night"               "nite"               
 [655] "nobodi"              "noe"                 "nokia"              
 [658] "noon"                "nope"                "normal"             
 [661] "normpton"            "noth"                "notic"              
 [664] "now"                 "num"                 "number"             
 [667] "nyt"                 "obvious"             "offer"              
 [670] "offic"               "offici"              "okay"               
 [673] "oki"                 "old"                 "omg"                
 [676] "one"                 "onlin"               "onto"               
 [679] "oop"                 "open"                "oper"               
 [682] "opinion"             "opt"                 "optout"             
 [685] "orang"               "orchard"             "order"              
 [688] "oredi"               "oso"                 "other"              
 [691] "otherwis"            "outsid"              "pack"               
 [694] "page"                "paid"                "pain"               
 [697] "paper"               "parent"              "park"               
 [700] "part"                "parti"               "partner"            
 [703] "pass"                "passion"             "password"           
 [706] "past"                "pay"                 "peopl"              
 [709] "per"                 "person"              "pete"               
 [712] "phone"               "photo"               "pic"                
 [715] "pick"                "pictur"              "pin"                
 [718] "piss"                "pix"                 "pizza"              
 [721] "place"               "plan"                "play"               
 [724] "player"              "pleas"               "pleasur"            
 [727] "plenti"              "pls"                 "plus"               
 [730] "plz"                 "pmin"                "pmsg"               
 [733] "pobox"               "point"               "poli"               
 [736] "polic"               "poor"                "pop"                
 [739] "possess"             "possibl"             "post"               
 [742] "pound"               "power"               "ppm"                
 [745] "pray"                "present"             "press"              
 [748] "pretti"              "previous"            "price"              
 [751] "princess"            "privat"              "prize"              
 [754] "prob"                "probabl"             "problem"            
 [757] "project"             "promis"              "pub"                
 [760] "put"                 "qualiti"             "question"           
 [763] "quick"               "quit"                "quiz"               
 [766] "quot"                "rain"                "random"             
 [769] "rang"                "rate"                "rather"             
 [772] "rcvd"                "reach"               "read"               
 [775] "readi"               "real"                "reali"              
 [778] "realli"              "reason"              "receipt"            
 [781] "receiv"              "recent"              "record"             
 [784] "refer"               "regard"              "regist"             
 [787] "relat"               "relax"               "remain"             
 [790] "rememb"              "remind"              "remov"              
 [793] "rent"                "rental"              "repli"              
 [796] "repres"              "request"             "respond"            
 [799] "respons"             "rest"                "result"             
 [802] "return"              "reveal"              "review"             
 [805] "reward"              "right"               "ring"               
 [808] "rington"             "rite"                "road"               
 [811] "rock"                "role"                "room"               
 [814] "roommat"             "rose"                "round"              
 [817] "rowwjhl"             "rpli"                "rreveal"            
 [820] "run"                 "rush"                "sad"                
 [823] "sae"                 "safe"                "said"               
 [826] "sale"                "sat"                 "saturday"           
 [829] "savamob"             "save"                "saw"                
 [832] "say"                 "sch"                 "school"             
 [835] "scream"              "sea"                 "search"             
 [838] "sec"                 "second"              "secret"             
 [841] "see"                 "seem"                "seen"               
 [844] "select"              "self"                "sell"               
 [847] "semest"              "send"                "sens"               
 [850] "sent"                "serious"             "servic"             
 [853] "set"                 "settl"               "sex"                
 [856] "sexi"                "shall"               "share"              
 [859] "shd"                 "ship"                "shirt"              
 [862] "shop"                "short"               "show"               
 [865] "shower"              "sick"                "side"               
 [868] "sigh"                "sight"               "sign"               
 [871] "silent"              "simpl"               "sinc"               
 [874] "singl"               "sipix"               "sir"                
 [877] "sis"                 "sister"              "sit"                
 [880] "situat"              "skxh"                "skype"              
 [883] "slave"               "sleep"               "slept"              
 [886] "slow"                "slowli"              "small"              
 [889] "smile"               "smoke"               "sms"                
 [892] "smth"                "snow"                "sofa"               
 [895] "sol"                 "somebodi"            "someon"             
 [898] "someth"              "sometim"             "somewher"           
 [901] "song"                "soni"                "sonyericsson"       
 [904] "soon"                "sorri"               "sort"               
 [907] "sound"               "south"               "space"              
 [910] "speak"               "special"             "specialcal"         
 [913] "spend"               "spent"               "spoke"              
 [916] "spree"               "stand"               "start"              
 [919] "statement"           "station"             "stay"               
 [922] "std"                 "step"                "still"              
 [925] "stockport"           "stone"               "stop"               
 [928] "store"               "stori"               "street"             
 [931] "student"             "studi"               "stuff"              
 [934] "stupid"              "style"               "sub"                
 [937] "subscrib"            "success"             "suck"               
 [940] "suit"                "summer"              "sun"                
 [943] "sunday"              "sunshin"             "sup"                
 [946] "support"             "suppos"              "sure"               
 [949] "surf"                "surpris"             "sweet"              
 [952] "swing"               "system"              "take"               
 [955] "talk"                "tampa"               "tariff"             
 [958] "tcs"                 "tea"                 "teach"              
 [961] "tear"                "teas"                "tel"                
 [964] "tell"                "ten"                 "tenerif"            
 [967] "term"                "test"                "text"               
 [970] "thank"               "thanx"               "that"               
 [973] "thing"               "think"               "thinkin"            
 [976] "thk"                 "tho"                 "though"             
 [979] "thought"             "throw"               "thru"               
 [982] "tht"                 "thur"                "tick"               
 [985] "ticket"              "til"                 "till"               
 [988] "time"                "tire"                "titl"               
 [991] "tmr"                 "toclaim"             "today"              
 [994] "togeth"              "told"                "tomo"               
 [997] "tomorrow"            "tone"                "tonight"            
[1000] "tonit"              
 [ reached getOption("max.print") -- omitted 136 entries ]
# save frequently-appearing terms to a character vector
sms_freq_words <- findFreqTerms(sms_dtm_train, 5)
str(sms_freq_words)
 chr [1:1136] "abiola" "abl" "abt" "accept" "access" "account" ...
# create DTMs with only the frequent terms
sms_dtm_freq_train <- sms_dtm_train[ , sms_freq_words]
sms_dtm_freq_test <- sms_dtm_test[ , sms_freq_words]
# convert term counts to "Yes"/"No" presence indicators
#
# x: a numeric vector of term counts (here, one column of a document-term
#    matrix, as handed over by apply()).
# Returns, for each element of x, "Yes" when the count is greater than zero
# and "No" otherwise. The result is the function's last expression, so it is
# returned visibly and prints when the function is called interactively.
convert_counts <- function(x) {
  ifelse(x > 0, "Yes", "No")
}
# apply() convert_counts() to columns of train/test data
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test  <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)

Step 3: Training a model on the data ----

library(e1071)
sms_classifier <- naiveBayes(sms_train, sms_train_labels)

Step 4: Evaluating model performance ----

sms_test_pred <- predict(sms_classifier, sms_test)
head(sms_test_pred)
[1] ham  ham  ham  ham  spam ham 
Levels: ham spam
library(gmodels)
CrossTable(sms_test_pred, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))

 
   Cell Contents
|-------------------------|
|                       N |
|           N / Col Total |
|-------------------------|

 
Total Observations in Table:  1390 

 
             | actual 
   predicted |       ham |      spam | Row Total | 
-------------|-----------|-----------|-----------|
         ham |      1201 |        30 |      1231 | 
             |     0.995 |     0.164 |           | 
-------------|-----------|-----------|-----------|
        spam |         6 |       153 |       159 | 
             |     0.005 |     0.836 |           | 
-------------|-----------|-----------|-----------|
Column Total |      1207 |       183 |      1390 | 
             |     0.868 |     0.132 |           | 
-------------|-----------|-----------|-----------|

 

Step 5: Improving model performance ----

# refit with laplace = 1, then evaluate on the same test set
sms_classifier2 <- naiveBayes(x = sms_train, y = sms_train_labels, laplace = 1)
sms_test_pred2 <- predict(object = sms_classifier2, newdata = sms_test)
CrossTable(
  x = sms_test_pred2, y = sms_test_labels,
  prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
  dnn = c("predicted", "actual")
)

 
   Cell Contents
|-------------------------|
|                       N |
|           N / Col Total |
|-------------------------|

 
Total Observations in Table:  1390 

 
             | actual 
   predicted |       ham |      spam | Row Total | 
-------------|-----------|-----------|-----------|
         ham |      1202 |        28 |      1230 | 
             |     0.996 |     0.153 |           | 
-------------|-----------|-----------|-----------|
        spam |         5 |       155 |       160 | 
             |     0.004 |     0.847 |           | 
-------------|-----------|-----------|-----------|
Column Total |      1207 |       183 |      1390 | 
             |     0.868 |     0.132 |           | 
-------------|-----------|-----------|-----------|

 
LS0tCnRpdGxlOiAiQ2hhcHRlciA0OiBDbGFzc2lmaWNhdGlvbiB1c2luZyBOYWl2ZSBCYXllcyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKVGhpcyBpcyBhbiBbUiBNYXJrZG93bl0oaHR0cDovL3JtYXJrZG93bi5yc3R1ZGlvLmNvbSkgTm90ZWJvb2suIFdoZW4geW91IGV4ZWN1dGUgY29kZSB3aXRoaW4gdGhlIG5vdGVib29rLCB0aGUgcmVzdWx0cyBhcHBlYXIgYmVuZWF0aCB0aGUgY29kZS4gCgpUcnkgZXhlY3V0aW5nIHRoaXMgY2h1bmsgYnkgY2xpY2tpbmcgdGhlICpSdW4qIGJ1dHRvbiB3aXRoaW4gdGhlIGNodW5rIG9yIGJ5IHBsYWNpbmcgeW91ciBjdXJzb3IgaW5zaWRlIGl0IGFuZCBwcmVzc2luZyAqQ3RybCtTaGlmdCtFbnRlciouIAoKQWRkIGEgbmV3IGNodW5rIGJ5IGNsaWNraW5nIHRoZSAqSW5zZXJ0IENodW5rKiBidXR0b24gb24gdGhlIHRvb2xiYXIgb3IgYnkgcHJlc3NpbmcgKkN0cmwrQWx0K0kqLgoKV2hlbiB5b3Ugc2F2ZSB0aGUgbm90ZWJvb2ssIGFuIEhUTUwgZmlsZSBjb250YWluaW5nIHRoZSBjb2RlIGFuZCBvdXRwdXQgd2lsbCBiZSBzYXZlZCBhbG9uZ3NpZGUgaXQgKGNsaWNrIHRoZSAqUHJldmlldyogYnV0dG9uIG9yIHByZXNzICpDdHJsK1NoaWZ0K0sqIHRvIHByZXZpZXcgdGhlIEhUTUwgZmlsZSkuCgojICoqRXhhbXBsZTogRmlsdGVyaW5nIHNwYW0gU01TIG1lc3NhZ2VzKioKCiMjIFN0ZXAgMjogRXhwbG9yaW5nIGFuZCBwcmVwYXJpbmcgdGhlIGRhdGEgLS0tLSAKCmBgYHtyfQojIHJlYWQgdGhlIHNtcyBkYXRhIGludG8gdGhlIHNtcyBkYXRhIGZyYW1lCnNtc19yYXcgPC0gcmVhZC5jc3YoInNtc19zcGFtLmNzdiIsIHN0cmluZ3NBc0ZhY3RvcnMgPSBGQUxTRSkKCiMgZXhhbWluZSB0aGUgc3RydWN0dXJlIG9mIHRoZSBzbXMgZGF0YQpzdHIoc21zX3JhdykKCmBgYApgYGB7cn0KIyBjb252ZXJ0IHNwYW0vaGFtIHRvIGZhY3Rvci4Kc21zX3JhdyR0eXBlIDwtIGZhY3RvcihzbXNfcmF3JHR5cGUpCgojIGV4YW1pbmUgdGhlIHR5cGUgdmFyaWFibGUgbW9yZSBjYXJlZnVsbHkKc3RyKHNtc19yYXckdHlwZSkKdGFibGUoc21zX3JhdyR0eXBlKQoKYGBgCmBgYHtyfQojIGJ1aWxkIGEgY29ycHVzIHVzaW5nIHRoZSB0ZXh0IG1pbmluZyAodG0pIHBhY2thZ2UKbGlicmFyeSh0bSkKc21zX2NvcnB1cyA8LSBWQ29ycHVzKFZlY3RvclNvdXJjZShzbXNfcmF3JHRleHQpKQoKIyBleGFtaW5lIHRoZSBzbXMgY29ycHVzCnByaW50KHNtc19jb3JwdXMpCmluc3BlY3Qoc21zX2NvcnB1c1sxOjJdKQpgYGAKYGBge3J9CmFzLmNoYXJhY3RlcihzbXNfY29ycHVzW1sxXV0pCmxhcHBseShzbXNfY29ycHVzWzE6Ml0sIGFzLmNoYXJhY3RlcikKYGBgCmBgYHtyfQojIGNsZWFuIHVwIHRoZSBjb3JwdXMgdXNpbmcgdG1fbWFwKCkKc21zX2NvcnB1c19jbGVhbiA8LSB0bV9tYXAoc21zX2NvcnB1cywgY29udGVudF90cmFuc2Zvcm1lcih0b2xvd2VyKSkKCiMgc2hvdyB0aGUgZGlmZmVyZW5jZSBiZXR3ZWVuIHNtc19j
b3JwdXMgYW5kIGNvcnB1c19jbGVhbgphcy5jaGFyYWN0ZXIoc21zX2NvcnB1c1tbMV1dKQphcy5jaGFyYWN0ZXIoc21zX2NvcnB1c19jbGVhbltbMV1dKQpgYGAKYGBge3J9CnNtc19jb3JwdXNfY2xlYW4gPC0gdG1fbWFwKHNtc19jb3JwdXNfY2xlYW4sIHJlbW92ZU51bWJlcnMpICMgcmVtb3ZlIG51bWJlcnMKc21zX2NvcnB1c19jbGVhbiA8LSB0bV9tYXAoc21zX2NvcnB1c19jbGVhbiwgcmVtb3ZlV29yZHMsIHN0b3B3b3JkcygpKSAjIHJlbW92ZSBzdG9wIHdvcmRzCnNtc19jb3JwdXNfY2xlYW4gPC0gdG1fbWFwKHNtc19jb3JwdXNfY2xlYW4sIHJlbW92ZVB1bmN0dWF0aW9uKSAjIHJlbW92ZSBwdW5jdHVhdGlvbgpgYGAKYGBge3J9CiMgdGlwOiBjcmVhdGUgYSBjdXN0b20gZnVuY3Rpb24gdG8gcmVwbGFjZSAocmF0aGVyIHRoYW4gcmVtb3ZlKSBwdW5jdHVhdGlvbgpyZW1vdmVQdW5jdHVhdGlvbigiaGVsbG8uLi53b3JsZCIpCnJlcGxhY2VQdW5jdHVhdGlvbiA8LSBmdW5jdGlvbih4KSB7IGdzdWIoIltbOnB1bmN0Ol1dKyIsICIgIiwgeCkgfQpyZXBsYWNlUHVuY3R1YXRpb24oImhlbGxvLi4ud29ybGQiKQpgYGAKYGBge3J9CiMgaWxsdXN0cmF0aW9uIG9mIHdvcmQgc3RlbW1pbmcKbGlicmFyeShTbm93YmFsbEMpCndvcmRTdGVtKGMoImxlYXJuIiwgImxlYXJuZWQiLCAibGVhcm5pbmciLCAibGVhcm5zIikpCgpzbXNfY29ycHVzX2NsZWFuIDwtIHRtX21hcChzbXNfY29ycHVzX2NsZWFuLCBzdGVtRG9jdW1lbnQpCgpzbXNfY29ycHVzX2NsZWFuIDwtIHRtX21hcChzbXNfY29ycHVzX2NsZWFuLCBzdHJpcFdoaXRlc3BhY2UpICMgZWxpbWluYXRlIHVubmVlZGVkIHdoaXRlc3BhY2UKCiMgZXhhbWluZSB0aGUgZmluYWwgY2xlYW4gY29ycHVzCmxhcHBseShzbXNfY29ycHVzWzE6M10sIGFzLmNoYXJhY3RlcikKbGFwcGx5KHNtc19jb3JwdXNfY2xlYW5bMTozXSwgYXMuY2hhcmFjdGVyKQpgYGAKYGBge3J9CiMgY3JlYXRlIGEgZG9jdW1lbnQtdGVybSBzcGFyc2UgbWF0cml4CnNtc19kdG0gPC0gRG9jdW1lbnRUZXJtTWF0cml4KHNtc19jb3JwdXNfY2xlYW4pCgojIGFsdGVybmF0aXZlIHNvbHV0aW9uOiBjcmVhdGUgYSBkb2N1bWVudC10ZXJtIHNwYXJzZSBtYXRyaXggZGlyZWN0bHkgZnJvbSB0aGUgU01TIGNvcnB1cwpzbXNfZHRtMiA8LSBEb2N1bWVudFRlcm1NYXRyaXgoc21zX2NvcnB1cywgY29udHJvbCA9IGxpc3QoCiAgdG9sb3dlciA9IFRSVUUsCiAgcmVtb3ZlTnVtYmVycyA9IFRSVUUsCiAgc3RvcHdvcmRzID0gVFJVRSwKICByZW1vdmVQdW5jdHVhdGlvbiA9IFRSVUUsCiAgc3RlbW1pbmcgPSBUUlVFCikpCgojIGFsdGVybmF0aXZlIHNvbHV0aW9uOiB1c2luZyBjdXN0b20gc3RvcCB3b3JkcyBmdW5jdGlvbiBlbnN1cmVzIGlkZW50aWNhbCByZXN1bHQKc21zX2R0bTMgPC0gRG9jdW1lbnRUZXJtTWF0cml4KHNtc19jb3JwdXMsIGNvbnRyb2wgPSBsaXN0KAogIHRvbG93ZXIgPSBUUlVFLAogIHJlbW92ZU51bWJlcnMg
PSBUUlVFLAogIHN0b3B3b3JkcyA9IGZ1bmN0aW9uKHgpIHsgcmVtb3ZlV29yZHMoeCwgc3RvcHdvcmRzKCkpIH0sCiAgcmVtb3ZlUHVuY3R1YXRpb24gPSBUUlVFLAogIHN0ZW1taW5nID0gVFJVRQopKQoKIyBjb21wYXJlIHRoZSByZXN1bHQKc21zX2R0bQpzbXNfZHRtMgpzbXNfZHRtMwpgYGAKYGBge3J9CiMgY3JlYXRpbmcgdHJhaW5pbmcgYW5kIHRlc3QgZGF0YXNldHMKc21zX2R0bV90cmFpbiA8LSBzbXNfZHRtWzE6NDE2OSwgXQpzbXNfZHRtX3Rlc3QgIDwtIHNtc19kdG1bNDE3MDo1NTU5LCBdCgojIGFsc28gc2F2ZSB0aGUgbGFiZWxzCnNtc190cmFpbl9sYWJlbHMgPC0gc21zX3Jhd1sxOjQxNjksIF0kdHlwZQpzbXNfdGVzdF9sYWJlbHMgIDwtIHNtc19yYXdbNDE3MDo1NTU5LCBdJHR5cGUKCiMgY2hlY2sgdGhhdCB0aGUgcHJvcG9ydGlvbiBvZiBzcGFtIGlzIHNpbWlsYXIKcHJvcC50YWJsZSh0YWJsZShzbXNfdHJhaW5fbGFiZWxzKSkKcHJvcC50YWJsZSh0YWJsZShzbXNfdGVzdF9sYWJlbHMpKQpgYGAKYGBge3J9CiMgd29yZCBjbG91ZCB2aXN1YWxpemF0aW9uCmxpYnJhcnkod29yZGNsb3VkKQp3b3JkY2xvdWQoc21zX2NvcnB1c19jbGVhbiwgbWluLmZyZXEgPSA1MCwgcmFuZG9tLm9yZGVyID0gRkFMU0UpCmBgYApgYGB7cn0KIyBzdWJzZXQgdGhlIHRyYWluaW5nIGRhdGEgaW50byBzcGFtIGFuZCBoYW0gZ3JvdXBzCnNwYW0gPC0gc3Vic2V0KHNtc19yYXcsIHR5cGUgPT0gInNwYW0iKQpoYW0gIDwtIHN1YnNldChzbXNfcmF3LCB0eXBlID09ICJoYW0iKQoKd29yZGNsb3VkKHNwYW0kdGV4dCwgbWF4LndvcmRzID0gNDAsIHNjYWxlID0gYygzLCAwLjUpKQp3b3JkY2xvdWQoaGFtJHRleHQsIG1heC53b3JkcyA9IDQwLCBzY2FsZSA9IGMoMywgMC41KSkKYGBgCmBgYHtyfQpzbXNfZHRtX2ZyZXFfdHJhaW4gPC0gcmVtb3ZlU3BhcnNlVGVybXMoc21zX2R0bV90cmFpbiwgMC45OTkpCnNtc19kdG1fZnJlcV90cmFpbgoKIyBpbmRpY2F0b3IgZmVhdHVyZXMgZm9yIGZyZXF1ZW50IHdvcmRzCmZpbmRGcmVxVGVybXMoc21zX2R0bV90cmFpbiwgNSkKCiMgc2F2ZSBmcmVxdWVudGx5LWFwcGVhcmluZyB0ZXJtcyB0byBhIGNoYXJhY3RlciB2ZWN0b3IKc21zX2ZyZXFfd29yZHMgPC0gZmluZEZyZXFUZXJtcyhzbXNfZHRtX3RyYWluLCA1KQpzdHIoc21zX2ZyZXFfd29yZHMpCgojIGNyZWF0ZSBEVE1zIHdpdGggb25seSB0aGUgZnJlcXVlbnQgdGVybXMKc21zX2R0bV9mcmVxX3RyYWluIDwtIHNtc19kdG1fdHJhaW5bICwgc21zX2ZyZXFfd29yZHNdCnNtc19kdG1fZnJlcV90ZXN0IDwtIHNtc19kdG1fdGVzdFsgLCBzbXNfZnJlcV93b3Jkc10KCiMgY29udmVydCBjb3VudHMgdG8gYSBmYWN0b3IKY29udmVydF9jb3VudHMgPC0gZnVuY3Rpb24oeCkgewogIHggPC0gaWZlbHNlKHggPiAwLCAiWWVzIiwgIk5vIikKfQoKIyBhcHBseSgpIGNvbnZlcnRfY291bnRzKCkgdG8gY29sdW1ucyBvZiB0cmFpbi90ZXN0IGRhdGEKc21zX3Ry
YWluIDwtIGFwcGx5KHNtc19kdG1fZnJlcV90cmFpbiwgTUFSR0lOID0gMiwgY29udmVydF9jb3VudHMpCnNtc190ZXN0ICA8LSBhcHBseShzbXNfZHRtX2ZyZXFfdGVzdCwgTUFSR0lOID0gMiwgY29udmVydF9jb3VudHMpCmBgYAoKIyMgU3RlcCAzOiBUcmFpbmluZyBhIG1vZGVsIG9uIHRoZSBkYXRhIC0tLS0KCmBgYHtyfQpsaWJyYXJ5KGUxMDcxKQpzbXNfY2xhc3NpZmllciA8LSBuYWl2ZUJheWVzKHNtc190cmFpbiwgc21zX3RyYWluX2xhYmVscykKCmBgYAoKIyMgU3RlcCA0OiBFdmFsdWF0aW5nIG1vZGVsIHBlcmZvcm1hbmNlIC0tLS0KCmBgYHtyfQpzbXNfdGVzdF9wcmVkIDwtIHByZWRpY3Qoc21zX2NsYXNzaWZpZXIsIHNtc190ZXN0KQoKaGVhZChzbXNfdGVzdF9wcmVkKQoKbGlicmFyeShnbW9kZWxzKQpDcm9zc1RhYmxlKHNtc190ZXN0X3ByZWQsIHNtc190ZXN0X2xhYmVscywKICAgICAgICAgICBwcm9wLmNoaXNxID0gRkFMU0UsIHByb3AudCA9IEZBTFNFLCBwcm9wLnIgPSBGQUxTRSwKICAgICAgICAgICBkbm4gPSBjKCdwcmVkaWN0ZWQnLCAnYWN0dWFsJykpCmBgYAoKIyMgU3RlcCA1OiBJbXByb3ZpbmcgbW9kZWwgcGVyZm9ybWFuY2UgLS0tLQoKYGBge3J9CnNtc19jbGFzc2lmaWVyMiA8LSBuYWl2ZUJheWVzKHNtc190cmFpbiwgc21zX3RyYWluX2xhYmVscywgbGFwbGFjZSA9IDEpCnNtc190ZXN0X3ByZWQyIDwtIHByZWRpY3Qoc21zX2NsYXNzaWZpZXIyLCBzbXNfdGVzdCkKQ3Jvc3NUYWJsZShzbXNfdGVzdF9wcmVkMiwgc21zX3Rlc3RfbGFiZWxzLAogICAgICAgICAgIHByb3AuY2hpc3EgPSBGQUxTRSwgcHJvcC50ID0gRkFMU0UsIHByb3AuciA9IEZBTFNFLAogICAgICAgICAgIGRubiA9IGMoJ3ByZWRpY3RlZCcsICdhY3R1YWwnKSkKYGBgCgoKCg==