library(openxlsx)
library(tm)
library(car)
library(foreign) 
library(readr)
library(dplyr)
library(RWeka)
library(RODBC)
library(class)
library(gmodels)
library(wordcloud)
library(e1071) 

This project explores a basic application of naive Bayes classification to text strings. The data are for practice and were drawn from the sms_spam data file and the text “Machine Learning with R”.

# load the SMS data from an Excel workbook
# NOTE(review): the path is absolute and machine-specific; the script will
# only run where this exact file location exists
s23<-read.xlsx("C:\\Users\\Jaire\\OneDrive\\Desktop\\Exploratory Research\\ML\\smsdata.xlsx")

Features & Transformations

# check data
str(s23)
## 'data.frame':    5574 obs. of  2 variables:
##  $ type: chr  "ham" "ham" "spam" "ham" ...
##  $ text: chr  "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question("| __truncated__ "U dun say so early hor... U c already then say..." ...
# transform type feature to factor
s23$type <- factor(s23$type)
table(s23$type)
## 
##  ham spam 
## 4827  747
str(s23$type)
##  Factor w/ 2 levels "ham","spam": 1 1 2 1 1 2 1 1 2 2 ...
# create corpus
s23_corpus <- Corpus(VectorSource(s23$text))
print(s23_corpus)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5574
inspect(s23_corpus[1:10])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 10
## 
##  [1] Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...                                                 
##  [2] Ok lar... Joking wif u oni...                                                                                                                                   
##  [3] Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's     
##  [4] U dun say so early hor... U c already then say...                                                                                                               
##  [5] Nah I don't think he goes to usf, he lives around here though                                                                                                   
##  [6] FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv             
##  [7] Even my brother is not like to speak with me. They treat me like aids patent.                                                                                   
##  [8] As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
##  [9] WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.   
## [10] Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030
# convert text to lowercase and remove numbers in corpus
corpus_clean <- tm_map(s23_corpus, tolower)
## Warning in tm_map.SimpleCorpus(s23_corpus, tolower): transformation drops
## documents
corpus_clean <- tm_map(corpus_clean, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus_clean, removeNumbers): transformation
## drops documents
# remove stop words 
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
## Warning in tm_map.SimpleCorpus(corpus_clean, removeWords, stopwords()):
## transformation drops documents
# remove punctuation
corpus_clean <- tm_map(corpus_clean, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus_clean, removePunctuation): transformation
## drops documents
# remove whitespace
corpus_clean <- tm_map(corpus_clean, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus_clean, stripWhitespace): transformation
## drops documents
# inspect corpus clean
inspect(corpus_clean[1:10])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 10
## 
##  [1] go jurong point crazy available bugis n great world la e buffet cine got amore wat                              
##  [2] ok lar joking wif u oni                                                                                         
##  [3] free entry wkly comp win fa cup final tkts st may text fa receive entry questionstd txt ratetcs apply s         
##  [4] u dun say early hor u c already say                                                                             
##  [5] nah think goes usf lives around though                                                                          
##  [6] freemsg hey darling weeks now word back like fun still tb ok xxx std chgs send £ rcv                            
##  [7] even brother like speak treat like aids patent                                                                  
##  [8]  per request melle melle oru minnaminunginte nurungu vettam set callertune callers press copy friends callertune
##  [9] winner valued network customer selected receivea £ prize reward claim call claim code kl valid hours            
## [10]  mobile months u r entitled update latest colour mobiles camera free call mobile update co free
# create sparse matrix 
s23_dtm <- DocumentTermMatrix(corpus_clean)

Train and Test Preparation

# Positional split shared by the raw data, the sparse matrix and the corpus.
# NOTE(review): str(s23) reports 5574 rows, so rows 5560-5574 end up in
# neither set; confirm that leaving them out is intended.
train_rows <- 1:4169
test_rows <- 4170:5559
# raw data
s23_train <- s23[train_rows, ]
s23_test <- s23[test_rows, ]
# sparse matrix
s23_dtm_train <- s23_dtm[train_rows, ]
s23_dtm_test <- s23_dtm[test_rows, ]
# corpus
s23_corpus_train <- corpus_clean[train_rows]
s23_corpus_test <- corpus_clean[test_rows]
# check proportions and dimensions 
prop.table(table(s23_train$type))
## 
##       ham      spam 
## 0.8647158 0.1352842
prop.table(table(s23_test$type))
## 
##       ham      spam 
## 0.8697842 0.1302158
dim(s23_train)
## [1] 4169    2
dim(s23_test)
## [1] 1390    2
dim(s23_dtm_train)
## [1] 4169 7951
dim(s23_dtm_test)
## [1] 1390 7951
# a corpus has no dimensions (dim() returns NULL); length() gives the number
# of documents, which is the figure comparable to the row counts above
length(s23_corpus_train)
## [1] 4169
length(s23_corpus_test)
## [1] 1390
# word cloud from corpus
wordcloud(s23_corpus_train, min.freq = 40, random.order = FALSE)

# compare clouds for spam and ham
spam <- subset(s23_train, type == "spam")
ham <- subset(s23_train, type == "ham")
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents

## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents

# creating indicator features for frequent words (frequency of at least 5 appearances)
findFreqTerms(s23_dtm_train, 5)
##    [1] "available"       "bugis"           "cine"            "crazy"          
##    [5] "got"             "great"           "point"           "wat"            
##    [9] "world"           "lar"             "wif"             "apply"          
##   [13] "comp"            "cup"             "entry"           "final"          
##   [17] "free"            "may"             "receive"         "text"           
##   [21] "txt"             "win"             "wkly"            "already"        
##   [25] "dun"             "early"           "say"             "around"         
##   [29] "goes"            "nah"             "think"           "though"         
##   [33] "usf"             "back"            "freemsg"         "fun"            
##   [37] "hey"             "like"            "now"             "send"           
##   [41] "std"             "still"           "weeks"           "word"           
##   [45] "xxx"             "brother"         "even"            "speak"          
##   [49] "treat"           "callers"         "callertune"      "copy"           
##   [53] "friends"         "melle"           "per"             "press"          
##   [57] "request"         "set"             "call"            "claim"          
##   [61] "code"            "customer"        "hours"           "network"        
##   [65] "prize"           "selected"        "valid"           "valued"         
##   [69] "winner"          "camera"          "colour"          "latest"         
##   [73] "mobile"          "mobiles"         "months"          "update"         
##   [77] "anymore"         "enough"          "gonna"           "home"           
##   [81] "soon"            "stuff"           "talk"            "today"          
##   [85] "tonight"         "want"            "cash"            "cost"           
##   [89] "days"            "info"            "pounds"          "reply"          
##   [93] "pobox"           "urgent"          "week"            "won"            
##   [97] "help"            "promise"         "right"           "take"           
##  [101] "thank"           "times"           "will"            "wonderful"      
##  [105] "wont"            "words"           "date"            "sunday"         
##  [109] "click"           "credit"          "link"            "message"        
##  [113] "next"            "use"             "wap"             "watching"       
##  [117] "make"            "name"            "naughty"         "remember"       
##  [121] "wet"             "yes"             "feel"            "fine"           
##  [125] "way"             "dont"            "england"         "miss"           
##  [129] "national"        "news"            "team"            "seriously"      
##  [133] "going"           "try"             "comin"           "first"          
##  [137] "pay"             "aft"             "ard"             "finish"         
##  [141] "lor"             "lunch"           "smth"            "str"            
##  [145] "alright"         "can"             "meet"            "eat"            
##  [149] "getting"         "hungry"          "just"            "knows"          
##  [153] "lol"             "pizza"           "really"          "sick"           
##  [157] "tho"             "worried"         "always"          "bus"            
##  [161] "catch"           "dinner"          "eating"          "left"           
##  [165] "love"            "moms"            "amp"             "car"            
##  [169] "know"            "let"             "room"            "work"           
##  [173] "live"            "sure"            "wait"            "till"           
##  [177] "yeah"            "anything"        "tell"            "quick"          
##  [181] "charged"         "confirm"         "please"          "replying"       
##  [185] "ringtone"        "thanks"          "learn"           "lesson"         
##  [189] "look"            "msg"             "yup"             "done"           
##  [193] "oops"            "see"             "decide"          "decided"        
##  [197] "hello"           "saturday"        "tomo"            "trying"         
##  [201] "abiola"          "ahead"           "pls"             "wanted"         
##  [205] "weekend"         "crave"           "forget"          "need"           
##  [209] "sweet"           "camcorder"       "delivery"        "nokia"          
##  [213] "sms"             "tomorrow"        "tried"           "seeing"         
##  [217] "hope"            "man"             "well"            "calls"          
##  [221] "get"             "nigeria"         "ask"             "bit"            
##  [225] "maybe"           "kept"            "telling"         "class"          
##  [229] "saw"             "time"            "almost"          "gets"           
##  [233] "half"            "second"          "usually"         "whole"          
##  [237] "morning"         "place"           "best"            "give"           
##  [241] "happy"           "liked"           "never"           "since"          
##  [245] "sorry"           "thought"         "wow"             "correct"        
##  [249] "end"             "ice"             "new"             "play"           
##  [253] "find"            "yesterday"       "cinema"          "congrats"       
##  [257] "etc"             "pass"            "special"         "year"           
##  [261] "later"           "meeting"         "reached"         "move"           
##  [265] "pain"            "pick"            "girls"           "good"           
##  [269] "situation"       "checking"        "part"            "come"           
##  [273] "forever"         "took"            "check"           "cut"            
##  [277] "double"          "hair"            "nice"            "said"           
##  [281] "short"           "wun"             "awarded"         "bonus"          
##  [285] "mob"             "pleased"         "review"          "day"            
##  [289] "frnds"           "rply"            "song"            "complimentary"  
##  [293] "dis"             "trip"            "comes"           "hear"           
##  [297] "month"           "plane"           "hee"             "lucky"          
##  [301] "money"           "save"            "finished"        "babe"           
##  [305] "something"       "wanna"           "waiting"         "cool"           
##  [309] "thats"           "much"            "job"             "looking"        
##  [313] "stop"            "one"             "real"            "tickets"        
##  [317] "used"            "bed"             "came"            "gotta"          
##  [321] "started"         "download"        "wen"             "close"          
##  [325] "don‘t"           "stand"           "afternoon"       "another"        
##  [329] "late"            "means"           "night"           "spent"          
##  [333] "loves"           "pleasure"        "rain"            "smile"          
##  [337] "smiling"         "someone"         "trouble"         "guaranteed"     
##  [341] "representative"  "service"         "buy"             "havent"         
##  [345] "planning"        "show"            "box"             "password"       
##  [349] "abt"             "loads"           "cause"           "forgot"         
##  [353] "prob"            "run"             "shower"          "coffee"         
##  [357] "else"            "nothing"         "ave"             "long"           
##  [361] "okay"            "price"           "driving"         "gone"           
##  [365] "test"            "yet"             "changed"         "cuz"            
##  [369] "gave"            "guess"           "mean"            "men"            
##  [373] "page"            "says"            "search"          "dear"           
##  [377] "life"            "lot"             "umma"            "birthday"       
##  [381] "making"          "wishes"          "aight"           "hit"            
##  [385] "address"         "computer"        "better"          "mom"            
##  [389] "old"             "people"          "busy"            "worry"          
##  [393] "cos"             "mah"             "things"          "contact"        
##  [397] "draw"            "hrs"             "last"            "ppm"            
##  [401] "shows"           "weekends"        "anyway"          "eatin"          
##  [405] "happened"        "juz"             "apartment"       "askd"           
##  [409] "bday"            "boss"            "cabin"           "entered"        
##  [413] "felt"            "invited"         "went"            "flights"        
##  [417] "holiday"         "inc"             "operator"        "pmin"           
##  [421] "specially"       "friday"          "must"            "food"           
##  [425] "hmm"             "paying"          "school"          "uncle"          
##  [429] "account"         "expires"         "identifier"      "points"         
##  [433] "private"         "statement"       "unredeemed"      "caller"         
##  [437] "landline"        "app"             "award"           "ending"         
##  [441] "match"           "numbers"         "rates"           "todays"         
##  [445] "bother"          "sending"         "sent"            "bak"            
##  [449] "del"             "girl"            "answer"          "question"       
##  [453] "country"         "dvd"             "player"          "quiz"           
##  [457] "sony"            "sunshine"        "top"             "direct"         
##  [461] "dogging"         "join"            "locations"       "txting"         
##  [465] "uks"             "haf"             "age"             "chat"           
##  [469] "msgs"            "services"        "lazy"            "lect"           
##  [473] "type"            "mail"            "sir"             "gud"            
##  [477] "little"          "lovable"         "persons"         "tired"          
##  [481] "open"            "taking"          "whats"           "local"          
##  [485] "ltd"             "luv"             "replied"         "sexy"           
##  [489] "hard"            "pray"            "wine"            "thk"            
##  [493] "become"          "dream"           "lots"            "sometimes"      
##  [497] "without"         "house"           "leaving"         "boy"            
##  [501] "missing"         "years"           "everyone"        "keep"           
##  [505] "safe"            "hand"            "parents"         "spend"          
##  [509] "friend"          "frnd"            "content"         "goto"           
##  [513] "menu"            "order"           "ones"            "wit"            
##  [517] "fancy"           "needs"           "also"            "completely"     
##  [521] "waste"           "bank"            "hmmm"            "hop"            
##  [525] "liao"            "muz"             "coming"          "believe"        
##  [529] "cant"            "hell"            "ill"             "bath"           
##  [533] "carlos"          "smoke"           "staying"         "til"            
##  [537] "turns"           "worth"           "doesnt"          "log"            
##  [541] "experience"      "spoke"           "especially"      "offer"          
##  [545] "studying"        "trust"           "guys"            "boytoy"         
##  [549] "net"             "towards"         "working"         "awesome"        
##  [553] "haha"            "minute"          "freephone"       "xmas"           
##  [557] "bathe"           "jus"             "sis"             "using"          
##  [561] "deal"            "joined"          "personal"        "touch"          
##  [565] "course"          "finally"         "able"            "every"          
##  [569] "however"         "hav"             "mrng"            "story"          
##  [573] "dead"            "mrt"             "orchard"         "tmr"            
##  [577] "evening"         "kate"            "found"           "college"        
##  [581] "darlin"          "ive"             "balance"         "decimal"        
##  [585] "transaction"     "goodmorning"     "sleeping"        "dat"            
##  [589] "oredi"           "oso"             "straight"        "bill"           
##  [593] "big"             "ready"           "break"           "semester"       
##  [597] "leh"             "noe"             "sounds"          "past"           
##  [601] "slept"           "easy"            "exam"            "march"          
##  [605] "called"          "important"       "shop"            "system"         
##  [609] "happen"          "nite"            "collect"         "optout"         
##  [613] "appreciate"      "partner"         "sign"            "start"          
##  [617] "company"         "bcoz"            "lessons"         "road"           
##  [621] "side"            "street"          "walk"            "battery"        
##  [625] "died"            "flirt"           "sam"             "reach"          
##  [629] "wil"             "kick"            "person"          "admirer"        
##  [633] "rreveal"         "secret"          "specialcall"     "thinks"         
##  [637] "ufind"           "case"            "laptop"          "tel"            
##  [641] "meant"           "everything"      "face"            "thanx"          
##  [645] "told"            "uve"             "watch"           "asked"          
##  [649] "kallis"          "didnt"           "goodnight"       "missed"         
##  [653] "sleep"           "wake"            "congratulations" "gift"           
##  [657] "music"           "tncs"            "vouchers"        "cal"            
##  [661] "hold"            "min"             "angry"           "care"           
##  [665] "childish"        "coz"             "deep"            "dnt"            
##  [669] "showing"         "true"            "wid"             "takes"          
##  [673] "lemme"           "lover"           "anytime"         "mins"           
##  [677] "unlimited"       "video"           "disturb"         "shopping"       
##  [681] "ring"            "horny"           "hot"             "naked"          
##  [685] "unsubscribe"     "plan"            "wana"            "choose"         
##  [689] "club"            "credits"         "charge"          "quality"        
##  [693] "singles"         "blue"            "ended"           "leaves"         
##  [697] "worries"         "hmv"             "questions"       "might"          
##  [701] "full"            "swing"           "definitely"      "far"            
##  [705] "okie"            "usual"           "lets"            "baby"           
##  [709] "fone"            "hour"            "sense"           "stupid"         
##  [713] "card"            "ends"            "loyalty"         "phone"          
##  [717] "unless"          "bslvyl"          "die"             "hurt"           
##  [721] "plz"             "rose"            "high"            "somebody"       
##  [725] "imagine"         "shit"            "somewhere"       "book"           
##  [729] "friendship"      "games"           "tones"           "accept"         
##  [733] "sister"          "weekly"          "lost"            "normal"         
##  [737] "rest"            "wot"             "made"            "dunno"          
##  [741] "power"           "yoga"            "dude"            "mths"           
##  [745] "christmas"       "merry"           "pete"            "plans"          
##  [749] "problem"         "reading"         "track"           "light"          
##  [753] "read"            "movie"           "immediately"     "access"         
##  [757] "fixed"           "line"            "number"          "via"            
##  [761] "chance"          "custcare"        "pmsg"            "rcvd"           
##  [765] "valentines"      "calling"         "post"            "texts"          
##  [769] "wiv"             "round"           "two"             "num"            
##  [773] "small"           "txts"            "ever"            "fault"          
##  [777] "urself"          "figure"          "jay"             "weed"           
##  [781] "ago"             "ish"             "minutes"         "met"            
##  [785] "cashbalance"     "currently"       "hgsuitelands"    "maximize"       
##  [789] "rowwjhl"         "moment"          "cold"            "posted"         
##  [793] "chikku"          "forward"         "air"             "bluetooth"      
##  [797] "mobileupd"       "motorola"        "orange"          "sonyericsson"   
##  [801] "discount"        "messages"        "wish"            "woke"           
##  [805] "talking"         "willing"         "reference"       "seen"           
##  [809] "happening"       "sighs"           "brings"          "mistake"        
##  [813] "project"         "body"            "quite"           "reason"         
##  [817] "slow"            "couple"          "leave"           "phones"         
##  [821] "rental"          "huh"             "sat"             "office"         
##  [825] "bout"            "actually"        "rock"            "ass"            
##  [829] "facebook"        "put"             "putting"         "god"            
##  [833] "india"           "change"          "poly"            "tone"           
##  [837] "none"            "starts"          "yep"             "stay"           
##  [841] "drink"           "fullonsmscom"    "thing"           "den"            
##  [845] "bring"           "dating"          "competition"     "head"           
##  [849] "eve"             "heart"           "poboxwwq"        "yahoo"          
##  [853] "contacted"       "land"            "funny"           "voice"          
##  [857] "giving"          "lift"            "mind"            "wnt"            
##  [861] "fucking"         "ldn"             "vary"            "booked"         
##  [865] "ticket"          "ten"             "tough"           "supposed"       
##  [869] "din"             "group"           "tot"             "doin"           
##  [873] "kinda"           "loan"            "welcome"         "beautiful"      
##  [877] "ure"             "asking"          "kind"            "ttyl"           
##  [881] "bad"             "different"       "thru"            "gives"          
##  [885] "opt"             "tcs"             "enjoy"           "princess"       
##  [889] "style"           "many"            "notice"          "sae"            
##  [893] "tenerife"        "details"         "rate"            "remove"         
##  [897] "moan"            "nyt"             "cum"             "thinking"       
##  [901] "sec"             "activate"        "terms"           "visit"          
##  [905] "depends"         "meh"             "monday"          "nope"           
##  [909] "either"          "lose"            "water"           "bored"          
##  [913] "outside"         "near"            "park"            "rent"           
##  [917] "character"       "opinion"         "silent"          "simple"         
##  [921] "ipod"            "fri"             "less"            "children"       
##  [925] "shall"           "attempt"         "member"          "offers"         
##  [929] "savamob"         "sub"             "unsub"           "lady"           
##  [933] "pretty"          "single"          "within"          "hoping"         
##  [937] "across"          "kiss"            "sea"             "probably"       
##  [941] "fat"             "fingers"         "confidence"      "listen"         
##  [945] "married"         "quote"           "self"            "mine"           
##  [949] "rather"          "hotel"           "omw"             "hurry"          
##  [953] "warm"            "weight"          "cheap"           "‘s"             
##  [957] "online"          "pics"            "fast"            "workin"         
##  [961] "fuck"            "gym"             "whatever"        "daddy"          
##  [965] "scream"          "clean"           "mum"             "sch"            
##  [969] "yar"             "door"            "marriage"        "izzit"          
##  [973] "kids"            "yogasana"        "spree"           "tscs"           
##  [977] "pound"           "announcement"    "train"           "noon"           
##  [981] "neva"            "imma"            "euro"            "game"           
##  [985] "future"          "shuhui"          "family"          "happiness"      
##  [989] "snow"            "together"        "weather"         "alex"           
##  [993] "pub"             "paid"            "darren"          "longer"         
##  [997] "area"            "matches"         "forwarded"       "–"              
## [1001] "holder"          "voucher"         "dad"             "login"          
## [1005] "sound"           "email"           "starting"        "tuesday"        
## [1009] "drugs"           "town"            "drug"            "sun"            
## [1013] "envelope"        "paper"           "fetch"           "study"          
## [1017] "belly"           "laugh"           "saying"          "knw"            
## [1021] "bedroom"         "sex"             "king"            "ans"            
## [1025] "bold"            "looks"           "…"               "idea"           
## [1029] "away"            "fantastic"       "aftr"            "dey"            
## [1033] "sit"             "students"        "wer"             "seems"          
## [1037] "sell"            "feeling"         "wants"           "twice"          
## [1041] "add"             "tonite"          "rite"            "glad"           
## [1045] "eyes"            "library"         "anyone"          "makes"          
## [1049] "yest"            "advance"         "wishing"         "store"          
## [1053] "summer"          "goin"            "wonder"          "drive"          
## [1057] "damn"            "wats"            "space"           "picking"        
## [1061] "guy"             "lovely"          "slave"           "mates"          
## [1065] "wwwgetzedcouk"   "aint"            "ugh"             "bag"            
## [1069] "bid"             "boo"             "gettin"          "teasing"        
## [1073] "extra"           "charity"         "polys"           "green"          
## [1077] "mate"            "five"            "wed"             "understand"     
## [1081] "await"           "collection"      "wan"             "poor"           
## [1085] "fall"            "hai"             "tampa"           "whenever"       
## [1089] "sort"            "yrs"             "asap"            "drop"           
## [1093] "otherwise"       "ntt"             "mid"             "knew"           
## [1097] "midnight"        "training"        "sad"             "murder"         
## [1101] "wife"            "possible"        "teach"           "plus"           
## [1105] "gay"             "january"         "sale"            "iam"            
## [1109] "wrong"           "photo"           "registered"      "least"          
## [1113] "medical"         "places"          "recently"        "feels"          
## [1117] "heavy"           "truth"           "brand"           "earlier"        
## [1121] "nxt"             "loving"          "yun"             "party"          
## [1125] "miracle"         "heard"           "frm"             "don"            
## [1129] "asleep"          "loverboy"        "serious"         "april"          
## [1133] "hiya"            "flower"          "support"         "movies"         
## [1137] "convey"          "pic"             "digital"         "doctor"         
## [1141] "receipt"         "black"           "aiyah"           "information"    
## [1145] "surprise"        "grins"           "gal"             "howz"           
## [1149] "film"            "luck"            "raining"         "cute"           
## [1153] "energy"          "chennai"         "£wk"             "choice"         
## [1157] "enter"           "strong"          "chinese"         "nobody"         
## [1161] "honey"           "picked"          "mode"            "others"         
## [1165] "wondering"       "tour"            "dreams"          "alrite"         
## [1169] "shd"             "frens"           "tht"             "marry"          
## [1173] "pple"            "arrested"        "croydon"         "fantasies"      
## [1177] "cover"           "write"           "fact"            "cafe"           
## [1181] "alone"           "loved"           "hows"            "arrive"         
## [1185] "screaming"       "auction"         "difficult"       "gas"            
## [1189] "excellent"       "john"            "instead"         "buzz"           
## [1193] "lei"             "waking"          "sight"           "father"         
## [1197] "hook"            "slowly"          "joys"            "holding"        
## [1201] "exciting"        "btnationalrate"  "wednesday"       "thnk"           
## [1205] "excuse"          "season"          "thinkin"         "mon"            
## [1209] "sitting"         "expecting"       "pix"             "colleagues"     
## [1213] "mood"            "sofa"            "empty"           "checked"        
## [1217] "sky"             "housemaid"       "murdered"        "murderer"       
## [1221] "police"          "budget"          "happens"         "thurs"
# dictionary: terms with a frequency of at least 5 in the training matrix
s23_dict <- findFreqTerms(s23_dtm_train, 5)
# rebuild the training and test matrices restricted to the dictionary terms
dict_control <- list(dictionary = s23_dict)
s23_train2 <- DocumentTermMatrix(s23_corpus_train, control = dict_control)
s23_test2 <- DocumentTermMatrix(s23_corpus_test, control = dict_control)
# Recode counts as a two-level presence factor.
#   x: numeric vector of counts.
#   Returns a factor with levels "No" and "Yes": "Yes" where x is above 0,
#   "No" otherwise.
convert_counts <- function(x) {
  present <- ifelse(x > 0, 1, 0)
  factor(present, levels = c(0, 1), labels = c("No", "Yes"))
}
# recode every term column of both matrices with convert_counts
s23_train2 <- apply(s23_train2, MARGIN = 2, FUN = convert_counts)
s23_test2 <- apply(s23_test2, MARGIN = 2, FUN = convert_counts)
# fit the naive Bayes classifier on the training features and labels
s23_classifier <- naiveBayes(x = s23_train2, y = s23_train$type)
# classify the held-out messages
s23_test_pred <- predict(s23_classifier, newdata = s23_test2)
# cross-tabulate predicted against actual labels
CrossTable(
  x = s23_test_pred,
  y = s23_test$type,
  prop.chisq = FALSE,
  prop.t = FALSE,
  dnn = c("predicted", "actual")
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1390 
## 
##  
##              | actual 
##    predicted |       ham |      spam | Row Total | 
## -------------|-----------|-----------|-----------|
##          ham |      1203 |        28 |      1231 | 
##              |     0.977 |     0.023 |     0.886 | 
##              |     0.995 |     0.155 |           | 
## -------------|-----------|-----------|-----------|
##         spam |         6 |       153 |       159 | 
##              |     0.038 |     0.962 |     0.114 | 
##              |     0.005 |     0.845 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |      1209 |       181 |      1390 | 
##              |     0.870 |     0.130 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
153 /1390 
## [1] 0.1100719

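Treating spam as the positive class, the model classified 86.5% of the test messages as true negatives (ham correctly kept), about 0.4% as false positives (ham flagged as spam), 2% as false negatives (spam missed as ham), and 11% as true positives (spam correctly flagged), for an overall accuracy of about 97.6%.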