Exploring and preparing the data
sms_raw <- read.csv("C:/Users/Justice2/Desktop/Machine Learning & Data Science/R/data/sms_spam.csv", stringsAsFactors = FALSE)
str(sms_raw)
## 'data.frame': 5574 obs. of 2 variables:
## $ type: chr "ham" "ham" "spam" "ham" ...
## $ text: chr "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C"| __truncated__ "U dun say so early hor... U c already then say..." ...
sms_raw$type <- factor(sms_raw$type)
str(sms_raw$type)
## Factor w/ 2 levels "ham","spam": 1 1 2 1 1 2 1 1 2 2 ...
table(sms_raw$type)
##
## ham spam
## 4827 747
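Before building a corpus it is worth confirming that no message text is missing or empty. A quick sanity check, not part of the original session:

# Hedged check: both counts should be 0 for this dataset.
sum(is.na(sms_raw$text))
sum(nchar(sms_raw$text) == 0)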
Data preparation - cleaning and standardizing text data
library(tm)
## Loading required package: NLP
library(SnowballC)
sms_corpus <- VCorpus(VectorSource(sms_raw$text))
print(sms_corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 5574
inspect(sms_corpus[1:2])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 2
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 111
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 29
as.character(sms_corpus[[9]])
## [1] "WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."
lapply(sms_corpus[1:5], as.character)
## $`1`
## [1] "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
##
## $`2`
## [1] "Ok lar... Joking wif u oni..."
##
## $`3`
## [1] "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
##
## $`4`
## [1] "U dun say so early hor... U c already then say..."
##
## $`5`
## [1] "Nah I don't think he goes to usf, he lives around here though"
sms_corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
as.character(sms_corpus[[1]])
## [1] "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
as.character(sms_corpus_clean[[1]])
## [1] "go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat..."
sms_corpus_clean <- tm_map(sms_corpus_clean, removeNumbers)
# getTransformations() lists the built-in transformations available to tm_map()
sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, stopwords())
sms_corpus_clean <- tm_map(sms_corpus_clean, removePunctuation)
sms_corpus_clean <- tm_map(sms_corpus_clean, stemDocument)
sms_corpus_clean <- tm_map(sms_corpus_clean, stripWhitespace)
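stopwords() with no arguments uses tm's default English list. If texting shorthand such as "u" or "ur" should also be dropped, the list can be extended; a sketch under that assumption (the extra words are illustrative, and the call is left commented out to keep the session above intact):

# Illustrative only: append messaging shorthand to the default stop word list.
my_stopwords <- c(stopwords(), "u", "ur", "gt", "lt")
# sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, my_stopwords)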
lapply(sms_corpus[21:22], as.character)
## $`21`
## [1] "Is that seriously how you spell his name?"
##
## $`22`
## [1] "Iâ<U+0080><U+0098>m going to try for 2 months ha ha only joking"
lapply(sms_corpus_clean[21:22], as.character)
## $`21`
## [1] " serious spell name"
##
## $`22`
## [1] "iâ<U+0080><U+0098>m go tri month ha ha joke"
Data preparation - splitting text documents into words
sms_dtm <- DocumentTermMatrix(sms_corpus_clean)
sms_dtm2 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE
))
sms_dtm
## <<DocumentTermMatrix (documents: 5574, terms: 6604)>>
## Non-/sparse entries: 42631/36768065
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
sms_dtm2
## <<DocumentTermMatrix (documents: 5574, terms: 6998)>>
## Non-/sparse entries: 43720/38963132
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
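sms_dtm and sms_dtm2 differ slightly (6,604 vs. 6,998 terms) because the control list applies its cleanup steps in its own internal order, whereas the tm_map() pipeline ran them in the sequence written above. A sketch of one way to reconcile them, overriding the stopwords option with an explicit function (behavior assumed from tm's documented control semantics, not verified in this session):

# Assumed fix: force stop word removal to go through removeWords(), matching
# the tm_map() pipeline, so the two matrices should line up.
sms_dtm3 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = function(x) removeWords(x, stopwords()),
  removePunctuation = TRUE,
  stemming = TRUE
))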
Data preparation - creating training and test datasets
# Note: these split indices assume a 5,559-row version of this dataset, so the
# last 15 of the 5,574 messages here end up in neither the training nor test set.
sms_dtm_train <- sms_dtm[1:4169, ]
sms_dtm_test <- sms_dtm[4170:5559, ]
sms_train_labels <- sms_raw[1:4169, ]$type
sms_test_labels <- sms_raw[4170:5559, ]$type
prop.table(table(sms_train_labels))
## sms_train_labels
## ham spam
## 0.8647158 0.1352842
prop.table(table(sms_test_labels))
## sms_test_labels
## ham spam
## 0.8697842 0.1302158
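Taking the first 75% of rows as training data is safe here only because the messages arrive in no particular order, as the matching ham/spam proportions above confirm. For data with any ordering, a random split is the safer default; a sketch (the seed is arbitrary, and the assignment lines are left commented out):

# Hedged alternative: random 75/25 split instead of the first n rows.
set.seed(123)
train_idx <- sample(nrow(sms_raw), floor(0.75 * nrow(sms_raw)))
# sms_dtm_train <- sms_dtm[train_idx, ]
# sms_dtm_test  <- sms_dtm[-train_idx, ]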
Visualizing text data - word clouds
library(wordcloud)
## Loading required package: RColorBrewer
wordcloud(sms_corpus_clean, min.freq = 50, random.order = FALSE)
spam <- subset(sms_raw, type == "spam")
ham <- subset(sms_raw, type == "ham")
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))
wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))
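Because wordcloud() draws with base graphics, the spam and ham clouds can also be placed side by side for easier comparison; a small optional sketch:

# Optional: put both clouds in one plotting window, then reset the layout.
par(mfrow = c(1, 2))
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))
wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))
par(mfrow = c(1, 1))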
Data preparation - creating indicator features for frequent words
findFreqTerms(sms_dtm_train, 5)
## [1] "â<U+0080><U+0093>" "abiola" "abl" "abt"
## [5] "accept" "access" "account" "across"
## [9] "activ" "actual" "add" "address"
## [13] "admir" "adult" "advanc" "aft"
## [17] "afternoon" "aftr" "age" "ago"
## [21] "ahead" "aight" "aint" "air"
## [25] "aiyah" "alex" "almost" "alon"
## [29] "alreadi" "alright" "alrit" "also"
## [33] "alway" "amp" "angri" "announc"
## [37] "anoth" "answer" "anybodi" "anymor"
## [41] "anyon" "anyth" "anytim" "anyway"
## [45] "apart" "app" "appli" "appoint"
## [49] "appreci" "april" "ard" "area"
## [53] "argument" "arm" "around" "arrang"
## [57] "arrest" "arriv" "asap" "ask"
## [61] "askd" "asleep" "ass" "attempt"
## [65] "auction" "avail" "ave" "avoid"
## [69] "await" "award" "away" "awesom"
## [73] "âwk" "babe" "babi" "back"
## [77] "bad" "bag" "bak" "balanc"
## [81] "bank" "bare" "bath" "batteri"
## [85] "bcoz" "bcum" "bday" "beauti"
## [89] "becom" "bed" "bedroom" "begin"
## [93] "believ" "belli" "best" "better"
## [97] "bid" "big" "bill" "bird"
## [101] "birthday" "bit" "black" "blank"
## [105] "bless" "blue" "bluetooth" "bodi"
## [109] "bold" "bonus" "boo" "book"
## [113] "bore" "boss" "bother" "bout"
## [117] "bowl" "box" "boy" "boytoy"
## [121] "brand" "break" "breath" "brilliant"
## [125] "bring" "brother" "bslvyl" "btnationalr"
## [129] "budget" "bugi" "bus" "busi"
## [133] "buy" "buzz" "cabin" "cafe"
## [137] "cal" "caller" "callertun" "call"
## [141] "camcord" "came" "camera" "can"
## [145] "cancel" "cant" "car" "card"
## [149] "care" "carlo" "case" "cash"
## [153] "cashbal" "catch" "caus" "chanc"
## [157] "chang" "charact" "charg" "chariti"
## [161] "chat" "cheap" "check" "cheer"
## [165] "chennai" "chikku" "childish" "children"
## [169] "chines" "choic" "choos" "christma"
## [173] "cine" "cinema" "claim" "class"
## [177] "clean" "clear" "click" "clock"
## [181] "close" "club" "code" "coffe"
## [185] "coin" "cold" "colleagu" "collect"
## [189] "colleg" "colour" "come" "comin"
## [193] "comp" "compani" "competit" "complet"
## [197] "complimentari" "comput" "concentr" "condit"
## [201] "confid" "confirm" "congrat" "congratul"
## [205] "connect" "contact" "content" "convey"
## [209] "cook" "cool" "copi" "correct"
## [213] "cos" "cost" "countri" "coupl"
## [217] "cours" "cover" "coz" "crave"
## [221] "crazi" "credit" "cri" "croydon"
## [225] "cuddl" "cum" "cup" "current"
## [229] "custcar" "custom" "cut" "cute"
## [233] "cuz" "dad" "daddi" "damn"
## [237] "darl" "darlin" "darren" "dat"
## [241] "date" "day" "dead" "deal"
## [245] "dear" "decid" "deep" "definit"
## [249] "del" "delet" "deliv" "deliveri"
## [253] "den" "depend" "detail" "dey"
## [257] "didnt" "die" "differ" "difficult"
## [261] "digit" "din" "dinner" "direct"
## [265] "dis" "discount" "discuss" "disturb"
## [269] "dnt" "doctor" "doesnt" "dog"
## [273] "doin" "dollar" "don" "donâ<U+0080><U+0098>t"
## [277] "done" "dont" "door" "doubl"
## [281] "download" "draw" "dream" "drink"
## [285] "drive" "drop" "drug" "dude"
## [289] "dun" "dunno" "dvd" "earli"
## [293] "earlier" "easi" "eat" "eatin"
## [297] "either" "els" "email" "embarass"
## [301] "empti" "end" "enemi" "energi"
## [305] "england" "enjoy" "enough" "enter"
## [309] "entri" "envelop" "especi" "etc"
## [313] "euro" "eve" "even" "ever"
## [317] "everi" "everyon" "everyth" "exact"
## [321] "exam" "excel" "excit" "excus"
## [325] "expect" "experi" "expir" "extra"
## [329] "eye" "face" "facebook" "fact"
## [333] "fall" "famili" "fanci" "fantasi"
## [337] "fantast" "far" "fast" "fat"
## [341] "father" "fault" "feel" "felt"
## [345] "fetch" "fight" "figur" "file"
## [349] "fill" "film" "final" "find"
## [353] "fine" "finger" "finish" "first"
## [357] "five" "fix" "flight" "flirt"
## [361] "flower" "follow" "fone" "food"
## [365] "forev" "forget" "forgot" "forward"
## [369] "found" "free" "freemsg" "freephon"
## [373] "fren" "fri" "friday" "friend"
## [377] "friendship" "frm" "frnd" "frnds"
## [381] "fuck" "full" "fullonsmscom" "fun"
## [385] "funni" "futur" "gal" "game"
## [389] "gap" "gas" "gave" "gay"
## [393] "gentl" "get" "gettin" "gift"
## [397] "girl" "give" "glad" "god"
## [401] "goe" "goin" "gone" "gonna"
## [405] "good" "goodmorn" "goodnight" "got"
## [409] "goto" "gotta" "great" "green"
## [413] "greet" "grin" "group" "guarante"
## [417] "gud" "guess" "guy" "gym"
## [421] "haf" "haha" "hai" "hair"
## [425] "half" "hand" "hang" "happen"
## [429] "happi" "hard" "hav" "havent"
## [433] "head" "hear" "heard" "heart"
## [437] "heavi" "hee" "hell" "hello"
## [441] "help" "hey" "hgsuiteland" "high"
## [445] "hit" "hiya" "hmm" "hmmm"
## [449] "hmv" "hol" "hold" "holder"
## [453] "holiday" "home" "honey" "hook"
## [457] "hop" "hope" "horni" "hospit"
## [461] "hot" "hotel" "hour" "hous"
## [465] "housemaid" "how" "howev" "howz"
## [469] "hrs" "hug" "huh" "hungri"
## [473] "hurri" "hurt" "iam" "ice"
## [477] "idea" "identifi" "ignor" "ill"
## [481] "imagin" "imma" "immedi" "import"
## [485] "inc" "inch" "includ" "india"
## [489] "indian" "info" "inform" "instead"
## [493] "interest" "interview" "invit" "ipod"
## [497] "irrit" "ish" "issu" "ive"
## [501] "izzit" "januari" "jay" "job"
## [505] "john" "join" "joke" "joy"
## [509] "jus" "just" "juz" "kalli"
## [513] "kate" "keep" "kept" "key"
## [517] "kick" "kid" "kill" "kind"
## [521] "kinda" "king" "kiss" "knew"
## [525] "know" "knw" "ladi" "land"
## [529] "landlin" "laptop" "lar" "last"
## [533] "late" "later" "latest" "laugh"
## [537] "lazi" "ldn" "lead" "learn"
## [541] "least" "leav" "lect" "left"
## [545] "leh" "lei" "lemm" "less"
## [549] "lesson" "let" "letter" "liao"
## [553] "librari" "lick" "lie" "life"
## [557] "lift" "light" "like" "line"
## [561] "link" "list" "listen" "littl"
## [565] "live" "load" "loan" "local"
## [569] "locat" "log" "login" "lol"
## [573] "long" "longer" "look" "lor"
## [577] "lose" "lost" "lot" "lovabl"
## [581] "love" "lover" "loverboy" "loyalti"
## [585] "ltd" "ltdecimalgt" "ltgt" "lttimegt"
## [589] "luck" "lucki" "lunch" "luv"
## [593] "made" "mah" "mail" "make"
## [597] "man" "mani" "march" "mark"
## [601] "marri" "marriag" "match" "mate"
## [605] "matter" "maxim" "may" "mayb"
## [609] "mean" "meant" "med" "medic"
## [613] "meet" "meh" "mell" "member"
## [617] "men" "menu" "merri" "messag"
## [621] "met" "mid" "midnight" "might"
## [625] "min" "mind" "mine" "minut"
## [629] "miracl" "miss" "mistak" "moan"
## [633] "mob" "mobil" "mobileupd" "mode"
## [637] "mom" "moment" "mon" "monday"
## [641] "money" "month" "mood" "moon"
## [645] "morn" "motorola" "move" "movi"
## [649] "mrng" "mrt" "msg" "msgs"
## [653] "mths" "much" "mum" "murder"
## [657] "music" "must" "muz" "nah"
## [661] "nake" "name" "nation" "natur"
## [665] "naughti" "near" "need" "net"
## [669] "network" "neva" "never" "new"
## [673] "news" "next" "nice" "nigeria"
## [677] "night" "nite" "nobodi" "noe"
## [681] "nokia" "none" "noon" "nope"
## [685] "normal" "noth" "notic" "now"
## [689] "ntt" "num" "number" "nxt"
## [693] "nyt" "offer" "offic" "offici"
## [697] "okay" "oki" "old" "omw"
## [701] "one" "onlin" "oop" "open"
## [705] "oper" "opinion" "opt" "optout"
## [709] "orang" "orchard" "order" "oredi"
## [713] "oso" "other" "otherwis" "outsid"
## [717] "pack" "page" "paid" "pain"
## [721] "paper" "parent" "park" "part"
## [725] "parti" "partner" "pass" "passion"
## [729] "password" "past" "pay" "peac"
## [733] "peopl" "per" "person" "pete"
## [737] "phone" "photo" "pic" "pick"
## [741] "pictur" "piec" "pix" "pizza"
## [745] "place" "plan" "plane" "play"
## [749] "player" "pleas" "pleasur" "pls"
## [753] "plus" "plz" "pmin" "pmsg"
## [757] "pobox" "poboxwwq" "point" "poli"
## [761] "polic" "poor" "pop" "possibl"
## [765] "post" "pound" "power" "pple"
## [769] "ppm" "practic" "pray" "prefer"
## [773] "prepar" "press" "pretti" "price"
## [777] "princess" "privat" "prize" "prob"
## [781] "probabl" "problem" "process" "project"
## [785] "promis" "pub" "put" "qualiti"
## [789] "question" "quick" "quit" "quiz"
## [793] "quot" "rain" "rate" "rather"
## [797] "rcvd" "reach" "read" "readi"
## [801] "real" "realiz" "realli" "reason"
## [805] "receipt" "receiv" "recent" "record"
## [809] "refer" "regard" "regist" "remain"
## [813] "rememb" "remind" "remov" "rent"
## [817] "rental" "repli" "repres" "request"
## [821] "respond" "respons" "rest" "result"
## [825] "return" "reveal" "review" "right"
## [829] "ring" "rington" "rite" "road"
## [833] "rock" "room" "roommat" "rose"
## [837] "round" "rowwjhl" "rpli" "rreveal"
## [841] "run" "sad" "sae" "safe"
## [845] "said" "sale" "sam" "sat"
## [849] "saturday" "savamob" "save" "saw"
## [853] "say" "sch" "school" "score"
## [857] "scream" "sea" "search" "season"
## [861] "sec" "second" "secret" "see"
## [865] "seem" "seen" "select" "self"
## [869] "sell" "semest" "send" "sens"
## [873] "sent" "serious" "servic" "set"
## [877] "settl" "sex" "sexi" "shall"
## [881] "share" "shd" "ship" "shirt"
## [885] "shit" "shop" "short" "show"
## [889] "shower" "shuhui" "sick" "side"
## [893] "sigh" "sight" "sign" "silent"
## [897] "simpl" "sinc" "sing" "singl"
## [901] "sir" "sis" "sister" "sit"
## [905] "situat" "sky" "slave" "sleep"
## [909] "slept" "slow" "slowli" "small"
## [913] "smile" "smoke" "sms" "smth"
## [917] "snow" "sofa" "solv" "somebodi"
## [921] "someon" "someth" "sometim" "somewher"
## [925] "song" "soni" "sonyericsson" "soon"
## [929] "sorri" "sort" "sound" "space"
## [933] "speak" "special" "specialcal" "spend"
## [937] "spent" "spoke" "sport" "spree"
## [941] "stand" "star" "start" "statement"
## [945] "station" "stay" "std" "still"
## [949] "stock" "stop" "store" "stori"
## [953] "str" "straight" "street" "strong"
## [957] "student" "studi" "stuff" "stupid"
## [961] "style" "sub" "subscrib" "success"
## [965] "summer" "sun" "sunday" "sunshin"
## [969] "support" "suppos" "sure" "surpris"
## [973] "sweet" "swing" "system" "take"
## [977] "talk" "tampa" "tcs" "teach"
## [981] "team" "tear" "teas" "tel"
## [985] "tell" "ten" "tenerif" "term"
## [989] "test" "text" "thank" "thanx"
## [993] "that" "thing" "think" "thinkin"
## [997] "thk" "thnk" "tho" "though"
## [1001] "thought" "throw" "thru" "tht"
## [1005] "thur" "ticket" "til" "till"
## [1009] "time" "tire" "titl" "tmr"
## [1013] "tncs" "today" "togeth" "told"
## [1017] "tomo" "tomorrow" "tone" "tonight"
## [1021] "tonit" "took" "top" "tot"
## [1025] "total" "touch" "tough" "tour"
## [1029] "toward" "town" "track" "train"
## [1033] "transact" "treat" "tri" "trip"
## [1037] "troubl" "true" "trust" "truth"
## [1041] "tscs" "ttyl" "tuesday" "turn"
## [1045] "twice" "two" "txt" "txting"
## [1049] "txts" "type" "ufind" "ugh"
## [1053] "umma" "uncl" "understand" "unless"
## [1057] "unlimit" "unredeem" "unsub" "unsubscrib"
## [1061] "updat" "ure" "urgent" "urself"
## [1065] "use" "usf" "usual" "uve"
## [1069] "valentin" "valid" "valu" "vari"
## [1073] "verifi" "via" "video" "visit"
## [1077] "voic" "voucher" "wait" "wake"
## [1081] "walk" "wan" "wana" "wanna"
## [1085] "want" "wap" "warm" "wast"
## [1089] "wat" "watch" "water" "way"
## [1093] "weak" "wear" "weather" "wed"
## [1097] "wednesday" "weed" "week" "weekend"
## [1101] "weight" "welcom" "well" "wen"
## [1105] "went" "wer" "wet" "what"
## [1109] "whatev" "whenev" "whole" "wid"
## [1113] "wif" "wife" "wil" "will"
## [1117] "win" "wine" "winner" "wish"
## [1121] "wit" "within" "without" "wiv"
## [1125] "wkli" "wnt" "woke" "won"
## [1129] "wonder" "wont" "word" "work"
## [1133] "workin" "world" "worri" "worth"
## [1137] "wot" "wow" "write" "wrong"
## [1141] "wun" "wwwgetzedcouk" "xmas" "xxx"
## [1145] "yahoo" "yar" "yeah" "year"
## [1149] "yep" "yes" "yest" "yesterday"
## [1153] "yet" "yoga" "yogasana" "yrs"
## [1157] "yun" "yup"
sms_freq_words <- findFreqTerms(sms_dtm_train, 5)
str(sms_freq_words)
## chr [1:1158] "â<U+0080><U+0093>""| __truncated__ "abiola" "abl" "abt" "accept" "access" ...
sms_dtm_freq_train<- sms_dtm_train[ , sms_freq_words]
sms_dtm_freq_test <- sms_dtm_test[ , sms_freq_words]
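A quick dimension check (not in the original session) confirms both matrices now share the same 1,158 frequent-term columns:

# Expect 4169 x 1158 and 1390 x 1158 respectively.
dim(sms_dtm_freq_train)
dim(sms_dtm_freq_test)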
convert_counts <- function(x) {
  # Recode numeric word counts as a categorical indicator: "Yes" if the
  # word appears in the message at least once, "No" otherwise.
  ifelse(x > 0, "Yes", "No")
}
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)
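apply() with MARGIN = 2 runs convert_counts() on each column, so the result is a character matrix of "Yes"/"No" indicators rather than counts. A quick peek at one corner of it:

# First few messages and terms; values are "Yes"/"No" rather than counts.
sms_train[1:4, 1:3]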
Training a model on the data
library(e1071)
sms_classifier <- naiveBayes(sms_train, sms_train_labels)
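For every word, naiveBayes() estimates the probability of the "Yes"/"No" indicator within each class, and the fitted object exposes these as a list of tables. A sketch of how to look one up (the term "call" is assumed to be among the 1,158 features, which the frequent-term listing above suggests):

# Conditional probability table for one word; rows are the classes (ham/spam),
# columns are the "No"/"Yes" indicator levels.
sms_classifier$tables[["call"]]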
Evaluating model performance
library(gmodels)
sms_test_pred <- predict(sms_classifier, sms_test)
CrossTable(sms_test_pred, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE,
           dnn = c('predicted', 'actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1390
##
##
## | actual
## predicted | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1200 | 20 | 1220 |
## | 0.984 | 0.016 | 0.878 |
## | 0.993 | 0.110 | |
## -------------|-----------|-----------|-----------|
## spam | 9 | 161 | 170 |
## | 0.053 | 0.947 | 0.122 |
## | 0.007 | 0.890 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1209 | 181 | 1390 |
## | 0.870 | 0.130 | |
## -------------|-----------|-----------|-----------|
##
##
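From the table, overall accuracy is (1200 + 161) / 1390, roughly 97.9%, with 9 legitimate messages flagged as spam and 20 spam messages missed. Computed directly:

# Overall accuracy of the first classifier (~0.979 given the table above).
mean(sms_test_pred == sms_test_labels)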
Improving model performance
sms_classifier2 <- naiveBayes(sms_train, sms_train_labels, laplace = 1)
sms_test_pred2 <- predict(sms_classifier2, sms_test)
CrossTable(sms_test_pred2, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1390
##
##
## | actual
## predicted | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1202 | 28 | 1230 |
## | 0.994 | 0.155 | |
## -------------|-----------|-----------|-----------|
## spam | 7 | 153 | 160 |
## | 0.006 | 0.845 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1209 | 181 | 1390 |
## | 0.870 | 0.130 | |
## -------------|-----------|-----------|-----------|
##
##
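With laplace = 1 the false positives fall from 9 to 7, at the cost of more missed spam (28 vs. 20), so overall accuracy dips slightly to (1202 + 153) / 1390, about 97.5%. Whether that trade is worthwhile depends on which error is costlier; for spam filtering, discarding legitimate mail is usually worse than letting spam through. A direct comparison:

# Side-by-side accuracy of the two classifiers.
mean(sms_test_pred == sms_test_labels)    # no smoothing: ~0.979
mean(sms_test_pred2 == sms_test_labels)   # laplace = 1:  ~0.975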