library(openxlsx)
library(tm)
library(car)
library(foreign)
library(readr)
library(dplyr)
library(RWeka)
library(RODBC)
library(class)
library(gmodels)
library(wordcloud)
library(e1071)
This project explores a basic application of naive Bayes classification to text strings. The data are used for practice and were drawn from the sms_spam data file that accompanies the text "Machine Learning with R".
# load the SMS data from the Excel workbook
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere
s23 <- read.xlsx(
  xlsxFile = "C:\\Users\\Jaire\\OneDrive\\Desktop\\Exploratory Research\\ML\\smsdata.xlsx"
)
# inspect the structure of the loaded data
str(s23)
## 'data.frame': 5574 obs. of 2 variables:
## $ type: chr "ham" "ham" "spam" "ham" ...
## $ text: chr "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question("| __truncated__ "U dun say so early hor... U c already then say..." ...
# convert the type label (ham/spam) from character to factor
s23$type <- factor(s23$type)
# number of messages in each class
table(s23$type)
##
## ham spam
## 4827 747
# confirm that type is now a two-level factor
str(s23$type)
## Factor w/ 2 levels "ham","spam": 1 1 2 1 1 2 1 1 2 2 ...
# create a corpus with one document per SMS message text
s23_corpus <- Corpus(VectorSource(s23$text))
# print a summary of the corpus
print(s23_corpus)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5574
# view the first ten messages before any cleaning
inspect(s23_corpus[1:10])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 10
##
## [1] Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
## [2] Ok lar... Joking wif u oni...
## [3] Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
## [4] U dun say so early hor... U c already then say...
## [5] Nah I don't think he goes to usf, he lives around here though
## [6] FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
## [7] Even my brother is not like to speak with me. They treat me like aids patent.
## [8] As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
## [9] WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
## [10] Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030
# convert text to lowercase and remove numbers from the corpus
# NOTE(review): every step below warns "transformation drops documents", yet
# the cleaned corpus inspected afterwards still holds its documents; this looks
# like a spurious tm warning -- confirm against the installed tm version
corpus_clean <- tm_map(s23_corpus, tolower)
## Warning in tm_map.SimpleCorpus(s23_corpus, tolower): transformation drops
## documents
corpus_clean <- tm_map(corpus_clean, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus_clean, removeNumbers): transformation
## drops documents
# remove stop words (before punctuation is stripped)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
## Warning in tm_map.SimpleCorpus(corpus_clean, removeWords, stopwords()):
## transformation drops documents
# remove punctuation
corpus_clean <- tm_map(corpus_clean, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus_clean, removePunctuation): transformation
## drops documents
# collapse the extra whitespace left behind by the removals above
corpus_clean <- tm_map(corpus_clean, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus_clean, stripWhitespace): transformation
## drops documents
# inspect the first ten messages of the cleaned corpus
inspect(corpus_clean[1:10])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 10
##
## [1] go jurong point crazy available bugis n great world la e buffet cine got amore wat
## [2] ok lar joking wif u oni
## [3] free entry wkly comp win fa cup final tkts st may text fa receive entry questionstd txt ratetcs apply s
## [4] u dun say early hor u c already say
## [5] nah think goes usf lives around though
## [6] freemsg hey darling weeks now word back like fun still tb ok xxx std chgs send £ rcv
## [7] even brother like speak treat like aids patent
## [8] per request melle melle oru minnaminunginte nurungu vettam set callertune callers press copy friends callertune
## [9] winner valued network customer selected receivea £ prize reward claim call claim code kl valid hours
## [10] mobile months u r entitled update latest colour mobiles camera free call mobile update co free
# create sparse document-term matrix from the cleaned corpus
s23_dtm <- DocumentTermMatrix(corpus_clean)
# Split into training (rows 1-4169) and test (rows 4170-5559) sets. The same
# row ranges are applied to the raw data, the sparse matrix, and the cleaned
# corpus so that the three stay aligned.
# NOTE(review): str(s23) reports 5574 observations, so rows 5560-5574 end up
# in neither set -- confirm whether that is intended.
# raw data
s23_train <- s23[1:4169, ]
s23_test <- s23[4170:5559, ]
# sparse matrix
s23_dtm_train <- s23_dtm[1:4169, ]
s23_dtm_test <- s23_dtm[4170:5559, ]
# corpus
s23_corpus_train <- corpus_clean[1:4169]
s23_corpus_test <- corpus_clean[4170:5559]
# check that the class proportions are similar in the training and test sets
prop.table(table(s23_train$type))
##
## ham spam
## 0.8647158 0.1352842
prop.table(table(s23_test$type))
##
## ham spam
## 0.8697842 0.1302158
# check dimensions of the raw data and the sparse matrices
dim(s23_train)
## [1] 4169 2
dim(s23_test)
## [1] 1390 2
dim(s23_dtm_train)
## [1] 4169 7951
dim(s23_dtm_test)
## [1] 1390 7951
# a corpus has no dim attribute (dim() returns NULL for it), so the number of
# documents is checked with length() instead
length(s23_corpus_train)
## [1] 4169
length(s23_corpus_test)
## [1] 1390
# word cloud from the cleaned training corpus, limited by min.freq = 40
wordcloud(s23_corpus_train, min.freq = 40, random.order = FALSE)
# compare clouds for spam and ham, drawn from the raw training text
spam <- subset(s23_train, type == "spam")
ham <- subset(s23_train, type == "ham")
# raw text is passed here; the warnings below show wordcloud() running its own
# punctuation and stop-word removal on it
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
# list the frequent terms (frequency of at least 5 appearances) in the training
# matrix; these are the candidates for the indicator features built later
findFreqTerms(s23_dtm_train, 5)
## [1] "available" "bugis" "cine" "crazy"
## [5] "got" "great" "point" "wat"
## [9] "world" "lar" "wif" "apply"
## [13] "comp" "cup" "entry" "final"
## [17] "free" "may" "receive" "text"
## [21] "txt" "win" "wkly" "already"
## [25] "dun" "early" "say" "around"
## [29] "goes" "nah" "think" "though"
## [33] "usf" "back" "freemsg" "fun"
## [37] "hey" "like" "now" "send"
## [41] "std" "still" "weeks" "word"
## [45] "xxx" "brother" "even" "speak"
## [49] "treat" "callers" "callertune" "copy"
## [53] "friends" "melle" "per" "press"
## [57] "request" "set" "call" "claim"
## [61] "code" "customer" "hours" "network"
## [65] "prize" "selected" "valid" "valued"
## [69] "winner" "camera" "colour" "latest"
## [73] "mobile" "mobiles" "months" "update"
## [77] "anymore" "enough" "gonna" "home"
## [81] "soon" "stuff" "talk" "today"
## [85] "tonight" "want" "cash" "cost"
## [89] "days" "info" "pounds" "reply"
## [93] "pobox" "urgent" "week" "won"
## [97] "help" "promise" "right" "take"
## [101] "thank" "times" "will" "wonderful"
## [105] "wont" "words" "date" "sunday"
## [109] "click" "credit" "link" "message"
## [113] "next" "use" "wap" "watching"
## [117] "make" "name" "naughty" "remember"
## [121] "wet" "yes" "feel" "fine"
## [125] "way" "dont" "england" "miss"
## [129] "national" "news" "team" "seriously"
## [133] "going" "try" "comin" "first"
## [137] "pay" "aft" "ard" "finish"
## [141] "lor" "lunch" "smth" "str"
## [145] "alright" "can" "meet" "eat"
## [149] "getting" "hungry" "just" "knows"
## [153] "lol" "pizza" "really" "sick"
## [157] "tho" "worried" "always" "bus"
## [161] "catch" "dinner" "eating" "left"
## [165] "love" "moms" "amp" "car"
## [169] "know" "let" "room" "work"
## [173] "live" "sure" "wait" "till"
## [177] "yeah" "anything" "tell" "quick"
## [181] "charged" "confirm" "please" "replying"
## [185] "ringtone" "thanks" "learn" "lesson"
## [189] "look" "msg" "yup" "done"
## [193] "oops" "see" "decide" "decided"
## [197] "hello" "saturday" "tomo" "trying"
## [201] "abiola" "ahead" "pls" "wanted"
## [205] "weekend" "crave" "forget" "need"
## [209] "sweet" "camcorder" "delivery" "nokia"
## [213] "sms" "tomorrow" "tried" "seeing"
## [217] "hope" "man" "well" "calls"
## [221] "get" "nigeria" "ask" "bit"
## [225] "maybe" "kept" "telling" "class"
## [229] "saw" "time" "almost" "gets"
## [233] "half" "second" "usually" "whole"
## [237] "morning" "place" "best" "give"
## [241] "happy" "liked" "never" "since"
## [245] "sorry" "thought" "wow" "correct"
## [249] "end" "ice" "new" "play"
## [253] "find" "yesterday" "cinema" "congrats"
## [257] "etc" "pass" "special" "year"
## [261] "later" "meeting" "reached" "move"
## [265] "pain" "pick" "girls" "good"
## [269] "situation" "checking" "part" "come"
## [273] "forever" "took" "check" "cut"
## [277] "double" "hair" "nice" "said"
## [281] "short" "wun" "awarded" "bonus"
## [285] "mob" "pleased" "review" "day"
## [289] "frnds" "rply" "song" "complimentary"
## [293] "dis" "trip" "comes" "hear"
## [297] "month" "plane" "hee" "lucky"
## [301] "money" "save" "finished" "babe"
## [305] "something" "wanna" "waiting" "cool"
## [309] "thats" "much" "job" "looking"
## [313] "stop" "one" "real" "tickets"
## [317] "used" "bed" "came" "gotta"
## [321] "started" "download" "wen" "close"
## [325] "don‘t" "stand" "afternoon" "another"
## [329] "late" "means" "night" "spent"
## [333] "loves" "pleasure" "rain" "smile"
## [337] "smiling" "someone" "trouble" "guaranteed"
## [341] "representative" "service" "buy" "havent"
## [345] "planning" "show" "box" "password"
## [349] "abt" "loads" "cause" "forgot"
## [353] "prob" "run" "shower" "coffee"
## [357] "else" "nothing" "ave" "long"
## [361] "okay" "price" "driving" "gone"
## [365] "test" "yet" "changed" "cuz"
## [369] "gave" "guess" "mean" "men"
## [373] "page" "says" "search" "dear"
## [377] "life" "lot" "umma" "birthday"
## [381] "making" "wishes" "aight" "hit"
## [385] "address" "computer" "better" "mom"
## [389] "old" "people" "busy" "worry"
## [393] "cos" "mah" "things" "contact"
## [397] "draw" "hrs" "last" "ppm"
## [401] "shows" "weekends" "anyway" "eatin"
## [405] "happened" "juz" "apartment" "askd"
## [409] "bday" "boss" "cabin" "entered"
## [413] "felt" "invited" "went" "flights"
## [417] "holiday" "inc" "operator" "pmin"
## [421] "specially" "friday" "must" "food"
## [425] "hmm" "paying" "school" "uncle"
## [429] "account" "expires" "identifier" "points"
## [433] "private" "statement" "unredeemed" "caller"
## [437] "landline" "app" "award" "ending"
## [441] "match" "numbers" "rates" "todays"
## [445] "bother" "sending" "sent" "bak"
## [449] "del" "girl" "answer" "question"
## [453] "country" "dvd" "player" "quiz"
## [457] "sony" "sunshine" "top" "direct"
## [461] "dogging" "join" "locations" "txting"
## [465] "uks" "haf" "age" "chat"
## [469] "msgs" "services" "lazy" "lect"
## [473] "type" "mail" "sir" "gud"
## [477] "little" "lovable" "persons" "tired"
## [481] "open" "taking" "whats" "local"
## [485] "ltd" "luv" "replied" "sexy"
## [489] "hard" "pray" "wine" "thk"
## [493] "become" "dream" "lots" "sometimes"
## [497] "without" "house" "leaving" "boy"
## [501] "missing" "years" "everyone" "keep"
## [505] "safe" "hand" "parents" "spend"
## [509] "friend" "frnd" "content" "goto"
## [513] "menu" "order" "ones" "wit"
## [517] "fancy" "needs" "also" "completely"
## [521] "waste" "bank" "hmmm" "hop"
## [525] "liao" "muz" "coming" "believe"
## [529] "cant" "hell" "ill" "bath"
## [533] "carlos" "smoke" "staying" "til"
## [537] "turns" "worth" "doesnt" "log"
## [541] "experience" "spoke" "especially" "offer"
## [545] "studying" "trust" "guys" "boytoy"
## [549] "net" "towards" "working" "awesome"
## [553] "haha" "minute" "freephone" "xmas"
## [557] "bathe" "jus" "sis" "using"
## [561] "deal" "joined" "personal" "touch"
## [565] "course" "finally" "able" "every"
## [569] "however" "hav" "mrng" "story"
## [573] "dead" "mrt" "orchard" "tmr"
## [577] "evening" "kate" "found" "college"
## [581] "darlin" "ive" "balance" "decimal"
## [585] "transaction" "goodmorning" "sleeping" "dat"
## [589] "oredi" "oso" "straight" "bill"
## [593] "big" "ready" "break" "semester"
## [597] "leh" "noe" "sounds" "past"
## [601] "slept" "easy" "exam" "march"
## [605] "called" "important" "shop" "system"
## [609] "happen" "nite" "collect" "optout"
## [613] "appreciate" "partner" "sign" "start"
## [617] "company" "bcoz" "lessons" "road"
## [621] "side" "street" "walk" "battery"
## [625] "died" "flirt" "sam" "reach"
## [629] "wil" "kick" "person" "admirer"
## [633] "rreveal" "secret" "specialcall" "thinks"
## [637] "ufind" "case" "laptop" "tel"
## [641] "meant" "everything" "face" "thanx"
## [645] "told" "uve" "watch" "asked"
## [649] "kallis" "didnt" "goodnight" "missed"
## [653] "sleep" "wake" "congratulations" "gift"
## [657] "music" "tncs" "vouchers" "cal"
## [661] "hold" "min" "angry" "care"
## [665] "childish" "coz" "deep" "dnt"
## [669] "showing" "true" "wid" "takes"
## [673] "lemme" "lover" "anytime" "mins"
## [677] "unlimited" "video" "disturb" "shopping"
## [681] "ring" "horny" "hot" "naked"
## [685] "unsubscribe" "plan" "wana" "choose"
## [689] "club" "credits" "charge" "quality"
## [693] "singles" "blue" "ended" "leaves"
## [697] "worries" "hmv" "questions" "might"
## [701] "full" "swing" "definitely" "far"
## [705] "okie" "usual" "lets" "baby"
## [709] "fone" "hour" "sense" "stupid"
## [713] "card" "ends" "loyalty" "phone"
## [717] "unless" "bslvyl" "die" "hurt"
## [721] "plz" "rose" "high" "somebody"
## [725] "imagine" "shit" "somewhere" "book"
## [729] "friendship" "games" "tones" "accept"
## [733] "sister" "weekly" "lost" "normal"
## [737] "rest" "wot" "made" "dunno"
## [741] "power" "yoga" "dude" "mths"
## [745] "christmas" "merry" "pete" "plans"
## [749] "problem" "reading" "track" "light"
## [753] "read" "movie" "immediately" "access"
## [757] "fixed" "line" "number" "via"
## [761] "chance" "custcare" "pmsg" "rcvd"
## [765] "valentines" "calling" "post" "texts"
## [769] "wiv" "round" "two" "num"
## [773] "small" "txts" "ever" "fault"
## [777] "urself" "figure" "jay" "weed"
## [781] "ago" "ish" "minutes" "met"
## [785] "cashbalance" "currently" "hgsuitelands" "maximize"
## [789] "rowwjhl" "moment" "cold" "posted"
## [793] "chikku" "forward" "air" "bluetooth"
## [797] "mobileupd" "motorola" "orange" "sonyericsson"
## [801] "discount" "messages" "wish" "woke"
## [805] "talking" "willing" "reference" "seen"
## [809] "happening" "sighs" "brings" "mistake"
## [813] "project" "body" "quite" "reason"
## [817] "slow" "couple" "leave" "phones"
## [821] "rental" "huh" "sat" "office"
## [825] "bout" "actually" "rock" "ass"
## [829] "facebook" "put" "putting" "god"
## [833] "india" "change" "poly" "tone"
## [837] "none" "starts" "yep" "stay"
## [841] "drink" "fullonsmscom" "thing" "den"
## [845] "bring" "dating" "competition" "head"
## [849] "eve" "heart" "poboxwwq" "yahoo"
## [853] "contacted" "land" "funny" "voice"
## [857] "giving" "lift" "mind" "wnt"
## [861] "fucking" "ldn" "vary" "booked"
## [865] "ticket" "ten" "tough" "supposed"
## [869] "din" "group" "tot" "doin"
## [873] "kinda" "loan" "welcome" "beautiful"
## [877] "ure" "asking" "kind" "ttyl"
## [881] "bad" "different" "thru" "gives"
## [885] "opt" "tcs" "enjoy" "princess"
## [889] "style" "many" "notice" "sae"
## [893] "tenerife" "details" "rate" "remove"
## [897] "moan" "nyt" "cum" "thinking"
## [901] "sec" "activate" "terms" "visit"
## [905] "depends" "meh" "monday" "nope"
## [909] "either" "lose" "water" "bored"
## [913] "outside" "near" "park" "rent"
## [917] "character" "opinion" "silent" "simple"
## [921] "ipod" "fri" "less" "children"
## [925] "shall" "attempt" "member" "offers"
## [929] "savamob" "sub" "unsub" "lady"
## [933] "pretty" "single" "within" "hoping"
## [937] "across" "kiss" "sea" "probably"
## [941] "fat" "fingers" "confidence" "listen"
## [945] "married" "quote" "self" "mine"
## [949] "rather" "hotel" "omw" "hurry"
## [953] "warm" "weight" "cheap" "‘s"
## [957] "online" "pics" "fast" "workin"
## [961] "fuck" "gym" "whatever" "daddy"
## [965] "scream" "clean" "mum" "sch"
## [969] "yar" "door" "marriage" "izzit"
## [973] "kids" "yogasana" "spree" "tscs"
## [977] "pound" "announcement" "train" "noon"
## [981] "neva" "imma" "euro" "game"
## [985] "future" "shuhui" "family" "happiness"
## [989] "snow" "together" "weather" "alex"
## [993] "pub" "paid" "darren" "longer"
## [997] "area" "matches" "forwarded" "–"
## [1001] "holder" "voucher" "dad" "login"
## [1005] "sound" "email" "starting" "tuesday"
## [1009] "drugs" "town" "drug" "sun"
## [1013] "envelope" "paper" "fetch" "study"
## [1017] "belly" "laugh" "saying" "knw"
## [1021] "bedroom" "sex" "king" "ans"
## [1025] "bold" "looks" "…" "idea"
## [1029] "away" "fantastic" "aftr" "dey"
## [1033] "sit" "students" "wer" "seems"
## [1037] "sell" "feeling" "wants" "twice"
## [1041] "add" "tonite" "rite" "glad"
## [1045] "eyes" "library" "anyone" "makes"
## [1049] "yest" "advance" "wishing" "store"
## [1053] "summer" "goin" "wonder" "drive"
## [1057] "damn" "wats" "space" "picking"
## [1061] "guy" "lovely" "slave" "mates"
## [1065] "wwwgetzedcouk" "aint" "ugh" "bag"
## [1069] "bid" "boo" "gettin" "teasing"
## [1073] "extra" "charity" "polys" "green"
## [1077] "mate" "five" "wed" "understand"
## [1081] "await" "collection" "wan" "poor"
## [1085] "fall" "hai" "tampa" "whenever"
## [1089] "sort" "yrs" "asap" "drop"
## [1093] "otherwise" "ntt" "mid" "knew"
## [1097] "midnight" "training" "sad" "murder"
## [1101] "wife" "possible" "teach" "plus"
## [1105] "gay" "january" "sale" "iam"
## [1109] "wrong" "photo" "registered" "least"
## [1113] "medical" "places" "recently" "feels"
## [1117] "heavy" "truth" "brand" "earlier"
## [1121] "nxt" "loving" "yun" "party"
## [1125] "miracle" "heard" "frm" "don"
## [1129] "asleep" "loverboy" "serious" "april"
## [1133] "hiya" "flower" "support" "movies"
## [1137] "convey" "pic" "digital" "doctor"
## [1141] "receipt" "black" "aiyah" "information"
## [1145] "surprise" "grins" "gal" "howz"
## [1149] "film" "luck" "raining" "cute"
## [1153] "energy" "chennai" "£wk" "choice"
## [1157] "enter" "strong" "chinese" "nobody"
## [1161] "honey" "picked" "mode" "others"
## [1165] "wondering" "tour" "dreams" "alrite"
## [1169] "shd" "frens" "tht" "marry"
## [1173] "pple" "arrested" "croydon" "fantasies"
## [1177] "cover" "write" "fact" "cafe"
## [1181] "alone" "loved" "hows" "arrive"
## [1185] "screaming" "auction" "difficult" "gas"
## [1189] "excellent" "john" "instead" "buzz"
## [1193] "lei" "waking" "sight" "father"
## [1197] "hook" "slowly" "joys" "holding"
## [1201] "exciting" "btnationalrate" "wednesday" "thnk"
## [1205] "excuse" "season" "thinkin" "mon"
## [1209] "sitting" "expecting" "pix" "colleagues"
## [1213] "mood" "sofa" "empty" "checked"
## [1217] "sky" "housemaid" "murdered" "murderer"
## [1221] "police" "budget" "happens" "thurs"
# dictionary of frequent terms (at least 5 appearances in the training matrix)
s23_dict <- findFreqTerms(s23_dtm_train, lowfreq = 5)
# rebuild the training and test matrices using only the words in s23_dict
s23_train2 <- DocumentTermMatrix(
  s23_corpus_train,
  control = list(dictionary = s23_dict)
)
s23_test2 <- DocumentTermMatrix(
  s23_corpus_test,
  control = list(dictionary = s23_dict)
)
# Recode term counts as a "No"/"Yes" factor marking whether a term occurs.
#   x: numeric vector of term counts (one column of a document-term matrix)
#   returns: factor with levels "No" (count is 0) and "Yes" (count above 0);
#            names of x are kept and NA counts stay NA
convert_counts <- function(x) {
  term_present <- x > 0
  factor(term_present, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
}
# recode every term column of both matrices as "No"/"Yes" indicators
s23_train2 <- apply(X = s23_train2, MARGIN = 2, FUN = convert_counts)
s23_test2 <- apply(X = s23_test2, MARGIN = 2, FUN = convert_counts)
# train the naive Bayes model on the training indicators and their labels
s23_classifier <- naiveBayes(x = s23_train2, y = s23_train$type)
# predict the class of each message in the test set
s23_test_pred <- predict(object = s23_classifier, newdata = s23_test2)
# performance evaluation: cross-tabulate predicted against actual classes
CrossTable(
  s23_test_pred, s23_test$type,
  prop.chisq = FALSE, prop.t = FALSE,
  dnn = c("predicted", "actual")
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1390
##
##
## | actual
## predicted | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1203 | 28 | 1231 |
## | 0.977 | 0.023 | 0.886 |
## | 0.995 | 0.155 | |
## -------------|-----------|-----------|-----------|
## spam | 6 | 153 | 159 |
## | 0.038 | 0.962 | 0.114 |
## | 0.005 | 0.845 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1209 | 181 | 1390 |
## | 0.870 | 0.130 | |
## -------------|-----------|-----------|-----------|
##
##
# true positives (153 spam messages predicted as spam) as a share of all
# 1390 test messages
153 / 1390
## [1] 0.1100719
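The model classified 86.5% of the test SMS messages as true negatives (ham correctly identified), 2% as false negatives (spam predicted as ham), 0.4% as false positives (ham predicted as spam), and 11% as true positives (spam correctly identified), for an overall accuracy of about 97.6%.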