load in the data

library(tidyverse)
Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ------------------------------------------------------------------------------------
filter(): dplyr, stats
lag():    dplyr, stats
sms=read.csv("sms_spam.txt",stringsAsFactors = FALSE)
str(sms)
'data.frame':   5574 obs. of  2 variables:
 $ type: chr  "ham" "ham" "spam" "ham" ...
 $ text: chr  "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C"| __truncated__ "U dun say so early hor... U c already then say..." ...

need to convert type into a factor for classification analysis

sms$type=sms$type%>%factor
str(sms)
'data.frame':   5574 obs. of  2 variables:
 $ type: Factor w/ 2 levels "ham","spam": 1 1 2 1 1 2 1 1 2 2 ...
 $ text: chr  "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C"| __truncated__ "U dun say so early hor... U c already then say..." ...

process the text data

The first step in processing text data involves creating a corpus, which is a collection of text documents.

Instead of following the steps in Machine Learning with R, we are more interested in how to do this with the recently developed tidytext package.
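
For reference, the corpus route from the book would look roughly like the sketch below, assuming the tm package (which is not used in the rest of this notebook):

library(tm)
sms_corpus = VCorpus(VectorSource(sms$text))                   # one document per message
sms_corpus = tm_map(sms_corpus, content_transformer(tolower))  # lower-case the text
sms_corpus = tm_map(sms_corpus, removeNumbers)                 # drop digits
sms_corpus = tm_map(sms_corpus, removeWords, stopwords())      # drop stop words
sms_corpus = tm_map(sms_corpus, removePunctuation)             # drop punctuation
sms_dtm = DocumentTermMatrix(sms_corpus)                       # rows = messages, columns = terms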

library(tidytext)
sms_tidy = sms %>%
  unnest_tokens(output = word, input = text, token = "words") %>%
  anti_join(stop_words, by = "word")
sms_tidy%>%head(20)

visualizing text data using the famous wordcloud package
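
The word frequencies from sms_tidy are summarised and passed to wordcloud():

#install.packages("wordcloud")
library(wordcloud)
sms_sum = sms_tidy %>% group_by(word) %>% summarise(n = n()) %>% arrange(desc(n))
sms_sum
wordcloud(words = sms_sum$word, freq = sms_sum$n, min.freq = 50, random.order = FALSE, random.color = FALSE)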

# create a similar document term matrix
# keep the words that appear more than five times across all messages
names = sms %>%
  unnest_tokens(input = text, output = word, token = "words") %>%
  anti_join(stop_words, by = "word") %>%
  group_by(word) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  filter(n > 5)
col_dim = dim(names)[1]  # number of retained words (DTM columns)
row_dim = dim(sms)[1]    # number of messages (DTM rows)
library(foreach)
library(stringr)
# flag whether each retained word occurs in each message; str_detect() treats
# the word as a regex, so fixed(names$word[i]) would be safer here
DTM = foreach(i = 1:col_dim, .combine = cbind) %do% {
  ifelse(str_detect(sms$text, names$word[i]), "yes", "no") %>% factor()
}
# cbind() coerces the factors to integer codes, so DTM is a matrix of
# 1 ("no") and 2 ("yes"), which the summary below reflects
colnames(DTM) = names$word
summary(DTM[, 1:7])
      call             2               ur             â               gt              4              lt      
 Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.00  
 1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.00   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.00   1st Qu.:1.00  
 Median :1.000   Median :1.000   Median :1.00   Median :1.000   Median :1.000   Median :1.00   Median :1.00  
 Mean   :1.084   Mean   :1.159   Mean   :1.25   Mean   :1.009   Mean   :1.052   Mean   :1.12   Mean   :1.06  
 3rd Qu.:1.000   3rd Qu.:1.000   3rd Qu.:1.00   3rd Qu.:1.000   3rd Qu.:1.000   3rd Qu.:1.00   3rd Qu.:1.00  
 Max.   :2.000   Max.   :2.000   Max.   :2.00   Max.   :2.000   Max.   :2.000   Max.   :2.00   Max.   :2.00  
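
For comparison, the same kind of matrix can be built directly with tidytext by counting words per message and casting the counts; a minimal sketch, assuming the tm backend is installed and using a hypothetical doc_id column added for the purpose:

dtm_counts = sms %>%
  mutate(doc_id = row_number()) %>%          # hypothetical per-message id
  unnest_tokens(output = word, input = text) %>%
  anti_join(stop_words, by = "word") %>%
  count(doc_id, word)
sms_dtm_tidy = dtm_counts %>%
  cast_dtm(document = doc_id, term = word, value = n)  # tm-style DocumentTermMatrix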

create training and testing data sets

library(caret)
# note: this partitions on the first DTM column; stratifying on the class
# labels, createDataPartition(sms$type, p = 0.8, list = FALSE), would be the
# more usual choice
index = createDataPartition(DTM[, 1], p = 0.8, list = FALSE)
sms_train = DTM[index, ]
sms_test = DTM[-index, ]
sms_train_cl = sms$type[index]
sms_test_cl = sms$type[-index]
DTM %>% dim
[1] 5574 1255
sms_train %>% dim
[1] 4460 1255
sms_test %>% dim
[1] 1114 1255

apply naive bayes

library(e1071)
sms_cl = naiveBayes(sms_train, sms_train_cl, laplace = 0)
# threshold replaces (near-)zero conditional probabilities at prediction time
sms_pred = predict(sms_cl, sms_test, type = "class", threshold = 0.35)
table(sms_pred, sms_test_cl)
        sms_test_cl
sms_pred ham spam
    ham  953   63
    spam  20   78
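
From this table the overall accuracy is (953 + 78) / 1114 ≈ 0.925, but 63 of the 141 spam messages are still labelled ham. caret's confusionMatrix() reports the same table along with sensitivity and specificity:

confusionMatrix(table(sms_pred, sms_test_cl), positive = "spam")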

when I changed the threshold to 0.1 it gave a much better result
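
A sketch of that re-run, using a hypothetical sms_pred_01 name (the resulting table is not reproduced here):

sms_pred_01 = predict(sms_cl, sms_test, type = "class", threshold = 0.1)
table(sms_pred_01, sms_test_cl)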
