Data Analysis and Preprocessing
sms$type<-factor(sms$type)
table(sms$type)
ham spam
4827 747
Corpus creation
sms_corpus<-VCorpus(VectorSource(sms$text))
sms_corpus
<<VCorpus>>
Metadata: corpus specific: 0, document level (indexed): 0
Content: documents: 5574
as.character(sms_corpus[[1]]) # single document
[1] "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
lapply(sms_corpus[1:10],as.character) # multiple document
$`1`
[1] "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
$`2`
[1] "Ok lar... Joking wif u oni..."
$`3`
[1] "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
$`4`
[1] "U dun say so early hor... U c already then say..."
$`5`
[1] "Nah I don't think he goes to usf, he lives around here though"
$`6`
[1] "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"
$`7`
[1] "Even my brother is not like to speak with me. They treat me like aids patent."
$`8`
[1] "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"
$`9`
[1] "WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."
$`10`
[1] "Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030"
Converting to lowercase
sms_corpus_clean<-tm_map(sms_corpus,content_transformer(tolower))
sms_corpus_clean
<<VCorpus>>
Metadata: corpus specific: 0, document level (indexed): 0
Content: documents: 5574
as.character(sms_corpus[[1]])
[1] "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
as.character(sms_corpus_clean[[1]])
[1] "go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat..."
Removing Numbers,Punctuations and whitespaces
sms_corpus_clean<-tm_map(sms_corpus_clean,removeNumbers)
as.character(sms_corpus_clean[[1]])
[1] "go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat..."
sms_corpus_clean<-tm_map(sms_corpus_clean,removePunctuation)
as.character(sms_corpus_clean[[1]])
[1] "go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat"
sms_corpus_clean<-tm_map(sms_corpus_clean,stripWhitespace)
as.character(sms_corpus_clean[[1]])
[1] "go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat"
Removing stopwords
sms_corpus_clean<-tm_map(sms_corpus_clean,removeWords,stopwords())
as.character(sms_corpus_clean[[1]])
[1] "go jurong point crazy available bugis n great world la e buffet cine got amore wat"
Stemming
#install.packages("SnowballC")
library(SnowballC)
sms_corpus_clean<-tm_map(sms_corpus_clean,stemDocument)
as.character(sms_corpus_clean[[1]])
[1] "go jurong point crazi avail bugi n great world la e buffet cine got amor wat"
Tokenization
sms_dtm<-DocumentTermMatrix(sms_corpus_clean)
sms_dtm
<<DocumentTermMatrix (documents: 5574, terms: 6942)>>
Non-/sparse entries: 43749/38650959
Sparsity : 100%
Maximal term length: 40
Weighting : term frequency (tf)
Data Analyzed and Cleaned
Splitting data into train and test categories
sms_train<-sms_dtm[1:4169,]
head(sms_train)
<<DocumentTermMatrix (documents: 6, terms: 6942)>>
Non-/sparse entries: 60/41592
Sparsity : 100%
Maximal term length: 40
Weighting : term frequency (tf)
sms_test<-sms_dtm[4170:5574,]
sms_train_labels<-sms$type[1:4169]
head(sms_train_labels)
[1] ham ham spam ham ham spam
Levels: ham spam
nrow(sms)
[1] 5574
sms_test_labels<-sms$type[4170:5574]
Proportion of spam in training and test data
prop.table(table(sms_train_labels))
sms_train_labels
ham spam
0.8647158 0.1352842
prop.table(table(sms_test_labels))
sms_test_labels
ham spam
0.8697509 0.1302491
Visualizing Wordcloud
#install.packages("wordcloud")
library(wordcloud)
wordcloud(sms_corpus_clean,min.freq = 50,random.order=TRUE)
Warning in wordcloud(sms_corpus_clean, min.freq = 50, random.order = TRUE) :
will could not be fit on page. It will not be plotted.

Finding Frequent terms
sms_freq_words<-findFreqTerms(sms_train,5)
str(sms_freq_words)
chr [1:1173] "£wk" "abiola" "abl" "abt" "accept" "access" "account" "across" ...
sms_dtm_freq_train<-sms_train[,sms_freq_words]
head(sms_dtm_freq_train)
<<DocumentTermMatrix (documents: 6, terms: 1173)>>
Non-/sparse entries: 49/6989
Sparsity : 99%
Maximal term length: 13
Weighting : term frequency (tf)
sms_dtm_freq_test<-sms_test[,sms_freq_words]
head(sms_dtm_freq_test)
<<DocumentTermMatrix (documents: 6, terms: 1173)>>
Non-/sparse entries: 57/6981
Sparsity : 99%
Maximal term length: 13
Weighting : term frequency (tf)
Converting 1 to ‘Yes’ and 0 to ‘No’
convert_counts<-function(x){
x<-ifelse(x>0,'Yes','No')
}
sms_train<-apply(sms_dtm_freq_train,MARGIN = 2,convert_counts)
sms_test<-apply(sms_dtm_freq_test,MARGIN = 2,convert_counts)
library(e1071)
sms_classifier<-naiveBayes(sms_train,sms_train_labels)
sms_test_pred<-predict(sms_classifier,sms_test)
library(gmodels)
confusion_matrix<-CrossTable(sms_test_pred,sms_test_labels,chisq=FALSE,prop.t = FALSE,dnn=c('predicted','actual'))
Cell Contents
|-------------------------|
| N |
| Chi-square contribution |
| N / Row Total |
| N / Col Total |
|-------------------------|
Total Observations in Table: 1405
| actual
predicted | ham | spam | Row Total |
-------------|-----------|-----------|-----------|
ham | 1214 | 21 | 1235 |
| 18.210 | 121.599 | |
| 0.983 | 0.017 | 0.879 |
| 0.993 | 0.115 | |
-------------|-----------|-----------|-----------|
spam | 8 | 162 | 170 |
| 132.290 | 883.382 | |
| 0.047 | 0.953 | 0.121 |
| 0.007 | 0.885 | |
-------------|-----------|-----------|-----------|
Column Total | 1222 | 183 | 1405 |
| 0.870 | 0.130 | |
-------------|-----------|-----------|-----------|
confusion_matrix_matrix <- confusion_matrix$t
accuracy<-sum(diag(confusion_matrix_matrix))/sum(confusion_matrix_matrix)
cat("Accuracy is",accuracy*100,"%")
Accuracy is 97.93594 %
Accuracy is 97.9%
LS0tDQp0aXRsZTogIlNNUyBTcGFtIENsYXNzaWZpY2F0aW9uIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCg0KIyBTcGFtIGFuZCBIYW0gQ2xhc3NpZmljYXRpb24NCiANCiMjIEltcG9ydGluZyBEYXRhDQpgYGB7cn0NCiNpbnN0YWxsLnBhY2thZ2VzKCJ0bSIpDQpsaWJyYXJ5KHRtKQ0Kc21zPC1yZWFkLmNzdigiRDovQ2xhc3MgUERGcy9JTlQyMzQvTm90ZXMvUiBEYXRhL3Ntc19zcGFtLmNzdiIpDQpzdHIoc21zKQ0KYGBgDQojIERhdGEgQW5hbHlzaXMgYW5kIFByZXByb2Nlc3NpbmcNCjxocj4NCmBgYHtyfQ0Kc21zJHR5cGU8LWZhY3RvcihzbXMkdHlwZSkNCnRhYmxlKHNtcyR0eXBlKQ0KYGBgIA0KDQojIyBDb3JwdXMgY3JlYXRpb24NCmBgYHtyfQ0Kc21zX2NvcnB1czwtVkNvcnB1cyhWZWN0b3JTb3VyY2Uoc21zJHRleHQpKQ0Kc21zX2NvcnB1cw0KYGBgDQoNCg0KYGBge3J9DQphcy5jaGFyYWN0ZXIoc21zX2NvcnB1c1tbMV1dKSAjIHNpbmdsZSBkb2N1bWVudA0KbGFwcGx5KHNtc19jb3JwdXNbMToxMF0sYXMuY2hhcmFjdGVyKSAjIG11bHRpcGxlIGRvY3VtZW50DQpgYGANCg0KIyMgQ29udmVydGluZyB0byBsb3dlcmNhc2UNCmBgYHtyfQ0Kc21zX2NvcnB1c19jbGVhbjwtdG1fbWFwKHNtc19jb3JwdXMsY29udGVudF90cmFuc2Zvcm1lcih0b2xvd2VyKSkNCnNtc19jb3JwdXNfY2xlYW4NCmFzLmNoYXJhY3RlcihzbXNfY29ycHVzW1sxXV0pDQphcy5jaGFyYWN0ZXIoc21zX2NvcnB1c19jbGVhbltbMV1dKQ0KYGBgDQoNCiMjIFJlbW92aW5nIE51bWJlcnMsUHVuY3R1YXRpb25zIGFuZCB3aGl0ZXNwYWNlcw0KYGBge3J9DQpzbXNfY29ycHVzX2NsZWFuPC10bV9tYXAoc21zX2NvcnB1c19jbGVhbixyZW1vdmVOdW1iZXJzKQ0KYXMuY2hhcmFjdGVyKHNtc19jb3JwdXNfY2xlYW5bWzFdXSkNCnNtc19jb3JwdXNfY2xlYW48LXRtX21hcChzbXNfY29ycHVzX2NsZWFuLHJlbW92ZVB1bmN0dWF0aW9uKQ0KYXMuY2hhcmFjdGVyKHNtc19jb3JwdXNfY2xlYW5bWzFdXSkNCnNtc19jb3JwdXNfY2xlYW48LXRtX21hcChzbXNfY29ycHVzX2NsZWFuLHN0cmlwV2hpdGVzcGFjZSkNCmFzLmNoYXJhY3RlcihzbXNfY29ycHVzX2NsZWFuW1sxXV0pDQpgYGANCg0KIyMgUmVtb3Zpbmcgc3RvcHdvcmRzDQpgYGB7cn0NCnNtc19jb3JwdXNfY2xlYW48LXRtX21hcChzbXNfY29ycHVzX2NsZWFuLHJlbW92ZVdvcmRzLHN0b3B3b3JkcygpKQ0KYXMuY2hhcmFjdGVyKHNtc19jb3JwdXNfY2xlYW5bWzFdXSkNCmBgYA0KDQojIyBTdGVtbWluZw0KYGBge3J9DQojaW5zdGFsbC5wYWNrYWdlcygiU25vd2JhbGxDIikNCmxpYnJhcnkoU25vd2JhbGxDKQ0Kc21zX2NvcnB1c19jbGVhbjwtdG1fbWFwKHNtc19jb3JwdXNfY2xlYW4sc3RlbURvY3VtZW50KQ0KYXMuY2hhcmFjdGVyKHNtc19jb3JwdXNfY2xlYW5bWzFdXSkNCmBgYA0KDQojIyBUb2tlbml6YXRpb24NCmBgYHtyfQ0Kc21zX2R0bTwtRG9jdW1lbnRUZXJtTWF0cml4KHNtc19jb3JwdXNfY2xlYW4pDQpzbXNfZHRtDQpgYGANCjxocj4NCiMgRGF0YSBBbmFseXplZCBhbmQgQ2xlYW5lZA0KPGhyPg0KIyMgU3BsaXR0aW5nIGRhdGEgaW50byB0cmFpbiBhbmQgdGVzdCBjYXRlZ29yaWVzDQoNCmBgYHtyfQ0Kc21zX3RyYWluPC1zbXNfZHRtWzE6NDE2OSxdDQpoZWFkKHNtc190cmFpbikNCnNtc190ZXN0PC1zbXNfZHRtWzQxNzA6NTU3NCxdDQpzbXNfdHJhaW5fbGFiZWxzPC1zbXMkdHlwZVsxOjQxNjldDQpoZWFkKHNtc190cmFpbl9sYWJlbHMpDQpucm93KHNtcykNCnNtc190ZXN0X2xhYmVsczwtc21zJHR5cGVbNDE3MDo1NTc0XQ0KYGBgDQoNCiMjIFByb3BvcnRpb24gb2Ygc3BhbSBpbiB0cmFpbmluZyBhbmQgdGVzdCBkYXRhDQpgYGB7cn0NCnByb3AudGFibGUodGFibGUoc21zX3RyYWluX2xhYmVscykpDQpwcm9wLnRhYmxlKHRhYmxlKHNtc190ZXN0X2xhYmVscykpDQpgYGANCg0KIyMgVmlzdWFsaXppbmcgV29yZGNsb3VkDQpgYGB7cn0NCiNpbnN0YWxsLnBhY2thZ2VzKCJ3b3JkY2xvdWQiKQ0KbGlicmFyeSh3b3JkY2xvdWQpDQp3b3JkY2xvdWQoc21zX2NvcnB1c19jbGVhbixtaW4uZnJlcSA9IDUwLHJhbmRvbS5vcmRlcj1UUlVFKQ0KYGBgDQoNCiMjIEZpbmRpbmcgRnJlcXVlbnQgdGVybXMNCmBgYHtyfQ0Kc21zX2ZyZXFfd29yZHM8LWZpbmRGcmVxVGVybXMoc21zX3RyYWluLDUpDQpzdHIoc21zX2ZyZXFfd29yZHMpDQpzbXNfZHRtX2ZyZXFfdHJhaW48LXNtc190cmFpblssc21zX2ZyZXFfd29yZHNdDQpoZWFkKHNtc19kdG1fZnJlcV90cmFpbikNCnNtc19kdG1fZnJlcV90ZXN0PC1zbXNfdGVzdFssc21zX2ZyZXFfd29yZHNdDQpoZWFkKHNtc19kdG1fZnJlcV90ZXN0KQ0KYGBgDQoNCiMjIENvbnZlcnRpbmcgMSB0byAnWWVzJyBhbmQgMCB0byAnTm8nDQpgYGB7cn0NCmNvbnZlcnRfY291bnRzPC1mdW5jdGlvbih4KXsNCiAgeDwtaWZlbHNlKHg+MCwnWWVzJywnTm8nKQ0KfQ0KYGBgDQoNCg0KYGBge3J9DQpzbXNfdHJhaW48LWFwcGx5KHNtc19kdG1fZnJlcV90cmFpbixNQVJHSU4gPSAyLGNvbnZlcnRfY291bnRzKQ0Kc21zX3Rlc3Q8LWFwcGx5KHNtc19kdG1fZnJlcV90ZXN0LE1BUkdJTiA9IDIsY29udmVydF9jb3VudHMpDQpgYGANCg0KDQpgYGB7cn0NCmxpYnJhcnkoZTEwNzEpDQpzbXNfY2xhc3NpZmllcjwtbmFpdmVCYXllcyhzbXNfdHJhaW4sc21zX3RyYWluX2xhYmVscykNCnNtc190ZXN0X3ByZWQ8LXByZWRpY3Qoc21zX2NsYXNzaWZpZXIsc21zX3Rlc3QpDQpsaWJyYXJ5KGdtb2RlbHMpDQpjb25mdXNpb25fbWF0cml4PC1Dcm9zc1RhYmxlKHNtc190ZXN0X3ByZWQsc21zX3Rlc3RfbGFiZWxzLGNoaXNxPUZBTFNFLHByb3AudCA9IEZBTFNFLGRubj1jKCdwcmVkaWN0ZWQnLCdhY3R1YWwnKSkNCmNvbmZ1c2lvbl9tYXRyaXhfbWF0cml4IDwtIGNvbmZ1c2lvbl9tYXRyaXgkdA0KYWNjdXJhY3k8LXN1bShkaWFnKGNvbmZ1c2lvbl9tYXRyaXhfbWF0cml4KSkvc3VtKGNvbmZ1c2lvbl9tYXRyaXhfbWF0cml4KQ0KY2F0KCJBY2N1cmFjeSBpcyIsYWNjdXJhY3kqMTAwLCIlIikNCmBgYA0KIyMjIEFjY3VyYWN5IGlzIDk3Ljkl