Exploring and preparing the data
sms_raw <- read.csv("C:/Users/Justice2/Desktop/Machine Learning & Data Science/R/data/sms_spam.csv", stringsAsFactors = FALSE)
str(sms_raw)
## 'data.frame': 5574 obs. of 2 variables:
## $ type: chr "ham" "ham" "spam" "ham" ...
## $ text: chr "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C"| __truncated__ "U dun say so early hor... U c already then say..." ...
sms_raw$type <- factor(sms_raw$type)
str(sms_raw$type)
## Factor w/ 2 levels "ham","spam": 1 1 2 1 1 2 1 1 2 2 ...
table(sms_raw$type)
##
## ham spam
## 4827 747
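Before building a corpus it is worth confirming that no message text is missing or empty. A quick sanity check, not part of the original session:

# Hedged check: both counts should be 0 for this dataset.
sum(is.na(sms_raw$text))
sum(nchar(sms_raw$text) == 0)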
Data preparation - cleaning and standardizing text data
library(tm)
## Loading required package: NLP
library(SnowballC)
sms_corpus <- VCorpus(VectorSource(sms_raw$text))
print(sms_corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 5574
inspect(sms_corpus[1:2])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 2
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 111
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 29
as.character(sms_corpus[[9]])
## [1] "WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."
lapply(sms_corpus[1:5], as.character)
## $`1`
## [1] "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
##
## $`2`
## [1] "Ok lar... Joking wif u oni..."
##
## $`3`
## [1] "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
##
## $`4`
## [1] "U dun say so early hor... U c already then say..."
##
## $`5`
## [1] "Nah I don't think he goes to usf, he lives around here though"
sms_corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
as.character(sms_corpus[[1]])
## [1] "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
as.character(sms_corpus_clean[[1]])
## [1] "go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat..."
sms_corpus_clean <- tm_map(sms_corpus_clean, removeNumbers)
# getTransformations() lists the built-in transformations available to tm_map()
sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, stopwords())
sms_corpus_clean <- tm_map(sms_corpus_clean, removePunctuation)
sms_corpus_clean <- tm_map(sms_corpus_clean, stemDocument)
sms_corpus_clean <- tm_map(sms_corpus_clean, stripWhitespace)
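stopwords() with no arguments uses tm's default English list. If texting shorthand such as "u" or "ur" should also be dropped, the list can be extended; a sketch under that assumption (the extra words are illustrative, and the call is left commented out to keep the session above intact):

# Illustrative only: append messaging shorthand to the default stop word list.
my_stopwords <- c(stopwords(), "u", "ur", "gt", "lt")
# sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, my_stopwords)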
lapply(sms_corpus[21:22], as.character)
## $`21`
## [1] "Is that seriously how you spell his name?"
##
## $`22`
## [1] "Iâ<U+0080><U+0098>m going to try for 2 months ha ha only joking"
lapply(sms_corpus_clean[21:22], as.character)
## $`21`
## [1] " serious spell name"
##
## $`22`
## [1] "iâ<U+0080><U+0098>m go tri month ha ha joke"
Data preparation - splitting text documents into words
sms_dtm <- DocumentTermMatrix(sms_corpus_clean)
sms_dtm2 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE
))
sms_dtm
## <<DocumentTermMatrix (documents: 5574, terms: 6604)>>
## Non-/sparse entries: 42631/36768065
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
sms_dtm2
## <<DocumentTermMatrix (documents: 5574, terms: 6998)>>
## Non-/sparse entries: 43720/38963132
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
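sms_dtm and sms_dtm2 differ slightly (6,604 vs. 6,998 terms) because the control list applies its cleanup steps in its own internal order, whereas the tm_map() pipeline ran them in the sequence written above. A sketch of one way to reconcile them, overriding the stopwords option with an explicit function (behavior assumed from tm's documented control semantics, not verified in this session):

# Assumed fix: force stop word removal to go through removeWords(), matching
# the tm_map() pipeline, so the two matrices should line up.
sms_dtm3 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = function(x) removeWords(x, stopwords()),
  removePunctuation = TRUE,
  stemming = TRUE
))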
Data preparation - creating training and test datasets
# Note: these split indices assume a 5,559-row version of this dataset, so the
# last 15 of the 5,574 messages here end up in neither the training nor test set.
sms_dtm_train <- sms_dtm[1:4169, ]
sms_dtm_test <- sms_dtm[4170:5559, ]
sms_train_labels <- sms_raw[1:4169, ]$type
sms_test_labels <- sms_raw[4170:5559, ]$type
prop.table(table(sms_train_labels))
## sms_train_labels
## ham spam
## 0.8647158 0.1352842
prop.table(table(sms_test_labels))
## sms_test_labels
## ham spam
## 0.8697842 0.1302158
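Taking the first 75% of rows as training data is safe here only because the messages arrive in no particular order, as the matching ham/spam proportions above confirm. For data with any ordering, a random split is the safer default; a sketch (the seed is arbitrary, and the assignment lines are left commented out):

# Hedged alternative: random 75/25 split instead of the first n rows.
set.seed(123)
train_idx <- sample(nrow(sms_raw), floor(0.75 * nrow(sms_raw)))
# sms_dtm_train <- sms_dtm[train_idx, ]
# sms_dtm_test  <- sms_dtm[-train_idx, ]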
Visualizing text data - word clouds
library(wordcloud)
## Loading required package: RColorBrewer
wordcloud(sms_corpus_clean, min.freq = 50, random.order = FALSE)
spam <- subset(sms_raw, type == "spam")
ham <- subset(sms_raw, type == "ham")
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))
wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))
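Because wordcloud() draws with base graphics, the spam and ham clouds can also be placed side by side for easier comparison; a small optional sketch:

# Optional: put both clouds in one plotting window, then reset the layout.
par(mfrow = c(1, 2))
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))
wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))
par(mfrow = c(1, 1))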
Data preparation - creating indicator features for frequent words
findFreqTerms(sms_dtm_train, 5)
## [1] "â<U+0080><U+0093>" "abiola" "abl" "abt"
## [5] "accept" "access" "account" "across"
## [9] "activ" "actual" "add" "address"
## [13] "admir" "adult" "advanc" "aft"
## [17] "afternoon" "aftr" "age" "ago"
## [21] "ahead" "aight" "aint" "air"
## [25] "aiyah" "alex" "almost" "alon"
## [29] "alreadi" "alright" "alrit" "also"
## [33] "alway" "amp" "angri" "announc"
## [37] "anoth" "answer" "anybodi" "anymor"
## [41] "anyon" "anyth" "anytim" "anyway"
## [45] "apart" "app" "appli" "appoint"
## [49] "appreci" "april" "ard" "area"
## [53] "argument" "arm" "around" "arrang"
## [57] "arrest" "arriv" "asap" "ask"
## [61] "askd" "asleep" "ass" "attempt"
## [65] "auction" "avail" "ave" "avoid"
## [69] "await" "award" "away" "awesom"
## [73] "âwk" "babe" "babi" "back"
## [77] "bad" "bag" "bak" "balanc"
## [81] "bank" "bare" "bath" "batteri"
## [85] "bcoz" "bcum" "bday" "beauti"
## [89] "becom" "bed" "bedroom" "begin"
## [93] "believ" "belli" "best" "better"
## [97] "bid" "big" "bill" "bird"
## [101] "birthday" "bit" "black" "blank"
## [105] "bless" "blue" "bluetooth" "bodi"
## [109] "bold" "bonus" "boo" "book"
## [113] "bore" "boss" "bother" "bout"
## [117] "bowl" "box" "boy" "boytoy"
## [121] "brand" "break" "breath" "brilliant"
## [125] "bring" "brother" "bslvyl" "btnationalr"
## [129] "budget" "bugi" "bus" "busi"
## [133] "buy" "buzz" "cabin" "cafe"
## [137] "cal" "caller" "callertun" "call"
## [141] "camcord" "came" "camera" "can"
## [145] "cancel" "cant" "car" "card"
## [149] "care" "carlo" "case" "cash"
## [153] "cashbal" "catch" "caus" "chanc"
## [157] "chang" "charact" "charg" "chariti"
## [161] "chat" "cheap" "check" "cheer"
## [165] "chennai" "chikku" "childish" "children"
## [169] "chines" "choic" "choos" "christma"
## [173] "cine" "cinema" "claim" "class"
## [177] "clean" "clear" "click" "clock"
## [181] "close" "club" "code" "coffe"
## [185] "coin" "cold" "colleagu" "collect"
## [189] "colleg" "colour" "come" "comin"
## [193] "comp" "compani" "competit" "complet"
## [197] "complimentari" "comput" "concentr" "condit"
## [201] "confid" "confirm" "congrat" "congratul"
## [205] "connect" "contact" "content" "convey"
## [209] "cook" "cool" "copi" "correct"
## [213] "cos" "cost" "countri" "coupl"
## [217] "cours" "cover" "coz" "crave"
## [221] "crazi" "credit" "cri" "croydon"
## [225] "cuddl" "cum" "cup" "current"
## [229] "custcar" "custom" "cut" "cute"
## [233] "cuz" "dad" "daddi" "damn"
## [237] "darl" "darlin" "darren" "dat"
## [241] "date" "day" "dead" "deal"
## [245] "dear" "decid" "deep" "definit"
## [249] "del" "delet" "deliv" "deliveri"
## [253] "den" "depend" "detail" "dey"
## [257] "didnt" "die" "differ" "difficult"
## [261] "digit" "din" "dinner" "direct"
## [265] "dis" "discount" "discuss" "disturb"
## [269] "dnt" "doctor" "doesnt" "dog"
## [273] "doin" "dollar" "don" "donâ<U+0080><U+0098>t"
## [277] "done" "dont" "door" "doubl"
## [281] "download" "draw" "dream" "drink"
## [285] "drive" "drop" "drug" "dude"
## [289] "dun" "dunno" "dvd" "earli"
## [293] "earlier" "easi" "eat" "eatin"
## [297] "either" "els" "email" "embarass"
## [301] "empti" "end" "enemi" "energi"
## [305] "england" "enjoy" "enough" "enter"
## [309] "entri" "envelop" "especi" "etc"
## [313] "euro" "eve" "even" "ever"
## [317] "everi" "everyon" "everyth" "exact"
## [321] "exam" "excel" "excit" "excus"
## [325] "expect" "experi" "expir" "extra"
## [329] "eye" "face" "facebook" "fact"
## [333] "fall" "famili" "fanci" "fantasi"
## [337] "fantast" "far" "fast" "fat"
## [341] "father" "fault" "feel" "felt"
## [345] "fetch" "fight" "figur" "file"
## [349] "fill" "film" "final" "find"
## [353] "fine" "finger" "finish" "first"
## [357] "five" "fix" "flight" "flirt"
## [361] "flower" "follow" "fone" "food"
## [365] "forev" "forget" "forgot" "forward"
## [369] "found" "free" "freemsg" "freephon"
## [373] "fren" "fri" "friday" "friend"
## [377] "friendship" "frm" "frnd" "frnds"
## [381] "fuck" "full" "fullonsmscom" "fun"
## [385] "funni" "futur" "gal" "game"
## [389] "gap" "gas" "gave" "gay"
## [393] "gentl" "get" "gettin" "gift"
## [397] "girl" "give" "glad" "god"
## [401] "goe" "goin" "gone" "gonna"
## [405] "good" "goodmorn" "goodnight" "got"
## [409] "goto" "gotta" "great" "green"
## [413] "greet" "grin" "group" "guarante"
## [417] "gud" "guess" "guy" "gym"
## [421] "haf" "haha" "hai" "hair"
## [425] "half" "hand" "hang" "happen"
## [429] "happi" "hard" "hav" "havent"
## [433] "head" "hear" "heard" "heart"
## [437] "heavi" "hee" "hell" "hello"
## [441] "help" "hey" "hgsuiteland" "high"
## [445] "hit" "hiya" "hmm" "hmmm"
## [449] "hmv" "hol" "hold" "holder"
## [453] "holiday" "home" "honey" "hook"
## [457] "hop" "hope" "horni" "hospit"
## [461] "hot" "hotel" "hour" "hous"
## [465] "housemaid" "how" "howev" "howz"
## [469] "hrs" "hug" "huh" "hungri"
## [473] "hurri" "hurt" "iam" "ice"
## [477] "idea" "identifi" "ignor" "ill"
## [481] "imagin" "imma" "immedi" "import"
## [485] "inc" "inch" "includ" "india"
## [489] "indian" "info" "inform" "instead"
## [493] "interest" "interview" "invit" "ipod"
## [497] "irrit" "ish" "issu" "ive"
## [501] "izzit" "januari" "jay" "job"
## [505] "john" "join" "joke" "joy"
## [509] "jus" "just" "juz" "kalli"
## [513] "kate" "keep" "kept" "key"
## [517] "kick" "kid" "kill" "kind"
## [521] "kinda" "king" "kiss" "knew"
## [525] "know" "knw" "ladi" "land"
## [529] "landlin" "laptop" "lar" "last"
## [533] "late" "later" "latest" "laugh"
## [537] "lazi" "ldn" "lead" "learn"
## [541] "least" "leav" "lect" "left"
## [545] "leh" "lei" "lemm" "less"
## [549] "lesson" "let" "letter" "liao"
## [553] "librari" "lick" "lie" "life"
## [557] "lift" "light" "like" "line"
## [561] "link" "list" "listen" "littl"
## [565] "live" "load" "loan" "local"
## [569] "locat" "log" "login" "lol"
## [573] "long" "longer" "look" "lor"
## [577] "lose" "lost" "lot" "lovabl"
## [581] "love" "lover" "loverboy" "loyalti"
## [585] "ltd" "ltdecimalgt" "ltgt" "lttimegt"
## [589] "luck" "lucki" "lunch" "luv"
## [593] "made" "mah" "mail" "make"
## [597] "man" "mani" "march" "mark"
## [601] "marri" "marriag" "match" "mate"
## [605] "matter" "maxim" "may" "mayb"
## [609] "mean" "meant" "med" "medic"
## [613] "meet" "meh" "mell" "member"
## [617] "men" "menu" "merri" "messag"
## [621] "met" "mid" "midnight" "might"
## [625] "min" "mind" "mine" "minut"
## [629] "miracl" "miss" "mistak" "moan"
## [633] "mob" "mobil" "mobileupd" "mode"
## [637] "mom" "moment" "mon" "monday"
## [641] "money" "month" "mood" "moon"
## [645] "morn" "motorola" "move" "movi"
## [649] "mrng" "mrt" "msg" "msgs"
## [653] "mths" "much" "mum" "murder"
## [657] "music" "must" "muz" "nah"
## [661] "nake" "name" "nation" "natur"
## [665] "naughti" "near" "need" "net"
## [669] "network" "neva" "never" "new"
## [673] "news" "next" "nice" "nigeria"
## [677] "night" "nite" "nobodi" "noe"
## [681] "nokia" "none" "noon" "nope"
## [685] "normal" "noth" "notic" "now"
## [689] "ntt" "num" "number" "nxt"
## [693] "nyt" "offer" "offic" "offici"
## [697] "okay" "oki" "old" "omw"
## [701] "one" "onlin" "oop" "open"
## [705] "oper" "opinion" "opt" "optout"
## [709] "orang" "orchard" "order" "oredi"
## [713] "oso" "other" "otherwis" "outsid"
## [717] "pack" "page" "paid" "pain"
## [721] "paper" "parent" "park" "part"
## [725] "parti" "partner" "pass" "passion"
## [729] "password" "past" "pay" "peac"
## [733] "peopl" "per" "person" "pete"
## [737] "phone" "photo" "pic" "pick"
## [741] "pictur" "piec" "pix" "pizza"
## [745] "place" "plan" "plane" "play"
## [749] "player" "pleas" "pleasur" "pls"
## [753] "plus" "plz" "pmin" "pmsg"
## [757] "pobox" "poboxwwq" "point" "poli"
## [761] "polic" "poor" "pop" "possibl"
## [765] "post" "pound" "power" "pple"
## [769] "ppm" "practic" "pray" "prefer"
## [773] "prepar" "press" "pretti" "price"
## [777] "princess" "privat" "prize" "prob"
## [781] "probabl" "problem" "process" "project"
## [785] "promis" "pub" "put" "qualiti"
## [789] "question" "quick" "quit" "quiz"
## [793] "quot" "rain" "rate" "rather"
## [797] "rcvd" "reach" "read" "readi"
## [801] "real" "realiz" "realli" "reason"
## [805] "receipt" "receiv" "recent" "record"
## [809] "refer" "regard" "regist" "remain"
## [813] "rememb" "remind" "remov" "rent"
## [817] "rental" "repli" "repres" "request"
## [821] "respond" "respons" "rest" "result"
## [825] "return" "reveal" "review" "right"
## [829] "ring" "rington" "rite" "road"
## [833] "rock" "room" "roommat" "rose"
## [837] "round" "rowwjhl" "rpli" "rreveal"
## [841] "run" "sad" "sae" "safe"
## [845] "said" "sale" "sam" "sat"
## [849] "saturday" "savamob" "save" "saw"
## [853] "say" "sch" "school" "score"
## [857] "scream" "sea" "search" "season"
## [861] "sec" "second" "secret" "see"
## [865] "seem" "seen" "select" "self"
## [869] "sell" "semest" "send" "sens"
## [873] "sent" "serious" "servic" "set"
## [877] "settl" "sex" "sexi" "shall"
## [881] "share" "shd" "ship" "shirt"
## [885] "shit" "shop" "short" "show"
## [889] "shower" "shuhui" "sick" "side"
## [893] "sigh" "sight" "sign" "silent"
## [897] "simpl" "sinc" "sing" "singl"
## [901] "sir" "sis" "sister" "sit"
## [905] "situat" "sky" "slave" "sleep"
## [909] "slept" "slow" "slowli" "small"
## [913] "smile" "smoke" "sms" "smth"
## [917] "snow" "sofa" "solv" "somebodi"
## [921] "someon" "someth" "sometim" "somewher"
## [925] "song" "soni" "sonyericsson" "soon"
## [929] "sorri" "sort" "sound" "space"
## [933] "speak" "special" "specialcal" "spend"
## [937] "spent" "spoke" "sport" "spree"
## [941] "stand" "star" "start" "statement"
## [945] "station" "stay" "std" "still"
## [949] "stock" "stop" "store" "stori"
## [953] "str" "straight" "street" "strong"
## [957] "student" "studi" "stuff" "stupid"
## [961] "style" "sub" "subscrib" "success"
## [965] "summer" "sun" "sunday" "sunshin"
## [969] "support" "suppos" "sure" "surpris"
## [973] "sweet" "swing" "system" "take"
## [977] "talk" "tampa" "tcs" "teach"
## [981] "team" "tear" "teas" "tel"
## [985] "tell" "ten" "tenerif" "term"
## [989] "test" "text" "thank" "thanx"
## [993] "that" "thing" "think" "thinkin"
## [997] "thk" "thnk" "tho" "though"
## [1001] "thought" "throw" "thru" "tht"
## [1005] "thur" "ticket" "til" "till"
## [1009] "time" "tire" "titl" "tmr"
## [1013] "tncs" "today" "togeth" "told"
## [1017] "tomo" "tomorrow" "tone" "tonight"
## [1021] "tonit" "took" "top" "tot"
## [1025] "total" "touch" "tough" "tour"
## [1029] "toward" "town" "track" "train"
## [1033] "transact" "treat" "tri" "trip"
## [1037] "troubl" "true" "trust" "truth"
## [1041] "tscs" "ttyl" "tuesday" "turn"
## [1045] "twice" "two" "txt" "txting"
## [1049] "txts" "type" "ufind" "ugh"
## [1053] "umma" "uncl" "understand" "unless"
## [1057] "unlimit" "unredeem" "unsub" "unsubscrib"
## [1061] "updat" "ure" "urgent" "urself"
## [1065] "use" "usf" "usual" "uve"
## [1069] "valentin" "valid" "valu" "vari"
## [1073] "verifi" "via" "video" "visit"
## [1077] "voic" "voucher" "wait" "wake"
## [1081] "walk" "wan" "wana" "wanna"
## [1085] "want" "wap" "warm" "wast"
## [1089] "wat" "watch" "water" "way"
## [1093] "weak" "wear" "weather" "wed"
## [1097] "wednesday" "weed" "week" "weekend"
## [1101] "weight" "welcom" "well" "wen"
## [1105] "went" "wer" "wet" "what"
## [1109] "whatev" "whenev" "whole" "wid"
## [1113] "wif" "wife" "wil" "will"
## [1117] "win" "wine" "winner" "wish"
## [1121] "wit" "within" "without" "wiv"
## [1125] "wkli" "wnt" "woke" "won"
## [1129] "wonder" "wont" "word" "work"
## [1133] "workin" "world" "worri" "worth"
## [1137] "wot" "wow" "write" "wrong"
## [1141] "wun" "wwwgetzedcouk" "xmas" "xxx"
## [1145] "yahoo" "yar" "yeah" "year"
## [1149] "yep" "yes" "yest" "yesterday"
## [1153] "yet" "yoga" "yogasana" "yrs"
## [1157] "yun" "yup"
sms_freq_words <- findFreqTerms(sms_dtm_train, 5)
str(sms_freq_words)
## chr [1:1158] "â<U+0080><U+0093>""| __truncated__ "abiola" "abl" "abt" "accept" "access" ...
sms_dtm_freq_train<- sms_dtm_train[ , sms_freq_words]
sms_dtm_freq_test <- sms_dtm_test[ , sms_freq_words]
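A quick dimension check (not in the original session) confirms both matrices now share the same 1,158 frequent-term columns:

# Expect 4169 x 1158 and 1390 x 1158 respectively.
dim(sms_dtm_freq_train)
dim(sms_dtm_freq_test)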
convert_counts <- function(x) {
  # Recode numeric word counts as a categorical indicator: "Yes" if the
  # word appears in the message at least once, "No" otherwise.
  ifelse(x > 0, "Yes", "No")
}
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)
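apply() with MARGIN = 2 runs convert_counts() on each column, so the result is a character matrix of "Yes"/"No" indicators rather than counts. A quick peek at one corner of it:

# First few messages and terms; values are "Yes"/"No" rather than counts.
sms_train[1:4, 1:3]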
Training a model on the data
library(e1071)
sms_classifier <- naiveBayes(sms_train, sms_train_labels)
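For every word, naiveBayes() estimates the probability of the "Yes"/"No" indicator within each class, and the fitted object exposes these as a list of tables. A sketch of how to look one up (the term "call" is assumed to be among the 1,158 features, which the frequent-term listing above suggests):

# Conditional probability table for one word; rows are the classes (ham/spam),
# columns are the "No"/"Yes" indicator levels.
sms_classifier$tables[["call"]]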
Evaluating model performance
library(gmodels)
sms_test_pred <- predict(sms_classifier, sms_test)
CrossTable(sms_test_pred, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE,
           dnn = c('predicted', 'actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1390
##
##
## | actual
## predicted | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1200 | 20 | 1220 |
## | 0.984 | 0.016 | 0.878 |
## | 0.993 | 0.110 | |
## -------------|-----------|-----------|-----------|
## spam | 9 | 161 | 170 |
## | 0.053 | 0.947 | 0.122 |
## | 0.007 | 0.890 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1209 | 181 | 1390 |
## | 0.870 | 0.130 | |
## -------------|-----------|-----------|-----------|
##
##
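From the table, overall accuracy is (1200 + 161) / 1390, roughly 97.9%, with 9 legitimate messages flagged as spam and 20 spam messages missed. Computed directly:

# Overall accuracy of the first classifier (~0.979 given the table above).
mean(sms_test_pred == sms_test_labels)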
Improving model performance
sms_classifier2 <- naiveBayes(sms_train, sms_train_labels, laplace = 1)
sms_test_pred2 <- predict(sms_classifier2, sms_test)
CrossTable(sms_test_pred2, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1390
##
##
## | actual
## predicted | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1202 | 28 | 1230 |
## | 0.994 | 0.155 | |
## -------------|-----------|-----------|-----------|
## spam | 7 | 153 | 160 |
## | 0.006 | 0.845 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1209 | 181 | 1390 |
## | 0.870 | 0.130 | |
## -------------|-----------|-----------|-----------|
##
##
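With laplace = 1 the false positives fall from 9 to 7, at the cost of more missed spam (28 vs. 20), so overall accuracy dips slightly to (1202 + 153) / 1390, about 97.5%. Whether that trade is worthwhile depends on which error is costlier; for spam filtering, discarding legitimate mail is usually worse than letting spam through. A direct comparison:

# Side-by-side accuracy of the two classifiers.
mean(sms_test_pred == sms_test_labels)    # no smoothing: ~0.979
mean(sms_test_pred2 == sms_test_labels)   # laplace = 1:  ~0.975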