Spam and Ham Classification

Importing Data

#install.packages("tm")
library(tm)
sms<-read.csv("D:/Class PDFs/INT234/Notes/R Data/sms_spam.csv")
str(sms)
'data.frame':   5574 obs. of  2 variables:
 $ type: chr  "ham" "ham" "spam" "ham" ...
 $ text: chr  "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question("| __truncated__ "U dun say so early hor... U c already then say..." ...

Data Analysis and Preprocessing


sms$type<-factor(sms$type)
table(sms$type)

 ham spam 
4827  747 

Corpus creation

sms_corpus<-VCorpus(VectorSource(sms$text))
sms_corpus
<<VCorpus>>
Metadata:  corpus specific: 0, document level (indexed): 0
Content:  documents: 5574
as.character(sms_corpus[[1]]) # single document
[1] "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
lapply(sms_corpus[1:10],as.character) # multiple document
$`1`
[1] "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."

$`2`
[1] "Ok lar... Joking wif u oni..."

$`3`
[1] "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

$`4`
[1] "U dun say so early hor... U c already then say..."

$`5`
[1] "Nah I don't think he goes to usf, he lives around here though"

$`6`
[1] "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"

$`7`
[1] "Even my brother is not like to speak with me. They treat me like aids patent."

$`8`
[1] "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"

$`9`
[1] "WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."

$`10`
[1] "Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030"

Converting to lowercase

sms_corpus_clean<-tm_map(sms_corpus,content_transformer(tolower))
sms_corpus_clean
<<VCorpus>>
Metadata:  corpus specific: 0, document level (indexed): 0
Content:  documents: 5574
as.character(sms_corpus[[1]])
[1] "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
as.character(sms_corpus_clean[[1]])
[1] "go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat..."

Removing Numbers,Punctuations and whitespaces

sms_corpus_clean<-tm_map(sms_corpus_clean,removeNumbers)
as.character(sms_corpus_clean[[1]])
[1] "go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat..."
sms_corpus_clean<-tm_map(sms_corpus_clean,removePunctuation)
as.character(sms_corpus_clean[[1]])
[1] "go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat"
sms_corpus_clean<-tm_map(sms_corpus_clean,stripWhitespace)
as.character(sms_corpus_clean[[1]])
[1] "go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat"

Removing stopwords

sms_corpus_clean<-tm_map(sms_corpus_clean,removeWords,stopwords())
as.character(sms_corpus_clean[[1]])
[1] "go  jurong point crazy available   bugis n great world la e buffet cine  got amore wat"

Stemming

#install.packages("SnowballC")
library(SnowballC)
sms_corpus_clean<-tm_map(sms_corpus_clean,stemDocument)
as.character(sms_corpus_clean[[1]])
[1] "go jurong point crazi avail bugi n great world la e buffet cine got amor wat"

Tokenization

sms_dtm<-DocumentTermMatrix(sms_corpus_clean)
sms_dtm
<<DocumentTermMatrix (documents: 5574, terms: 6942)>>
Non-/sparse entries: 43749/38650959
Sparsity           : 100%
Maximal term length: 40
Weighting          : term frequency (tf)

Data Analyzed and Cleaned


Splitting data into train and test categories

sms_train<-sms_dtm[1:4169,]
head(sms_train)
<<DocumentTermMatrix (documents: 6, terms: 6942)>>
Non-/sparse entries: 60/41592
Sparsity           : 100%
Maximal term length: 40
Weighting          : term frequency (tf)
sms_test<-sms_dtm[4170:5574,]
sms_train_labels<-sms$type[1:4169]
head(sms_train_labels)
[1] ham  ham  spam ham  ham  spam
Levels: ham spam
nrow(sms)
[1] 5574
sms_test_labels<-sms$type[4170:5574]

Proportion of spam in training and test data

prop.table(table(sms_train_labels))
sms_train_labels
      ham      spam 
0.8647158 0.1352842 
prop.table(table(sms_test_labels))
sms_test_labels
      ham      spam 
0.8697509 0.1302491 

Visualizing Wordcloud

#install.packages("wordcloud")
library(wordcloud)
wordcloud(sms_corpus_clean,min.freq = 50,random.order=TRUE)
Warning in wordcloud(sms_corpus_clean, min.freq = 50, random.order = TRUE) :
  will could not be fit on page. It will not be plotted.

Finding Frequent terms

sms_freq_words<-findFreqTerms(sms_train,5)
str(sms_freq_words)
 chr [1:1173] "£wk" "abiola" "abl" "abt" "accept" "access" "account" "across" ...
sms_dtm_freq_train<-sms_train[,sms_freq_words]
head(sms_dtm_freq_train)
<<DocumentTermMatrix (documents: 6, terms: 1173)>>
Non-/sparse entries: 49/6989
Sparsity           : 99%
Maximal term length: 13
Weighting          : term frequency (tf)
sms_dtm_freq_test<-sms_test[,sms_freq_words]
head(sms_dtm_freq_test)
<<DocumentTermMatrix (documents: 6, terms: 1173)>>
Non-/sparse entries: 57/6981
Sparsity           : 99%
Maximal term length: 13
Weighting          : term frequency (tf)

Converting 1 to ‘Yes’ and 0 to ‘No’

convert_counts<-function(x){
  x<-ifelse(x>0,'Yes','No')
}
sms_train<-apply(sms_dtm_freq_train,MARGIN = 2,convert_counts)
sms_test<-apply(sms_dtm_freq_test,MARGIN = 2,convert_counts)
library(e1071)
sms_classifier<-naiveBayes(sms_train,sms_train_labels)
sms_test_pred<-predict(sms_classifier,sms_test)
library(gmodels)
confusion_matrix<-CrossTable(sms_test_pred,sms_test_labels,chisq=FALSE,prop.t = FALSE,dnn=c('predicted','actual'))

 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|-------------------------|

 
Total Observations in Table:  1405 

 
             | actual 
   predicted |       ham |      spam | Row Total | 
-------------|-----------|-----------|-----------|
         ham |      1214 |        21 |      1235 | 
             |    18.210 |   121.599 |           | 
             |     0.983 |     0.017 |     0.879 | 
             |     0.993 |     0.115 |           | 
-------------|-----------|-----------|-----------|
        spam |         8 |       162 |       170 | 
             |   132.290 |   883.382 |           | 
             |     0.047 |     0.953 |     0.121 | 
             |     0.007 |     0.885 |           | 
-------------|-----------|-----------|-----------|
Column Total |      1222 |       183 |      1405 | 
             |     0.870 |     0.130 |           | 
-------------|-----------|-----------|-----------|

 
confusion_matrix_matrix <- confusion_matrix$t
accuracy<-sum(diag(confusion_matrix_matrix))/sum(confusion_matrix_matrix)
cat("Accuracy is",accuracy*100,"%")
Accuracy is 97.93594 %

Accuracy is 97.9%

LS0tDQp0aXRsZTogIlNNUyBTcGFtIENsYXNzaWZpY2F0aW9uIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCg0KIyBTcGFtIGFuZCBIYW0gQ2xhc3NpZmljYXRpb24NCiANCiMjIEltcG9ydGluZyBEYXRhDQpgYGB7cn0NCiNpbnN0YWxsLnBhY2thZ2VzKCJ0bSIpDQpsaWJyYXJ5KHRtKQ0Kc21zPC1yZWFkLmNzdigiRDovQ2xhc3MgUERGcy9JTlQyMzQvTm90ZXMvUiBEYXRhL3Ntc19zcGFtLmNzdiIpDQpzdHIoc21zKQ0KYGBgDQojIERhdGEgQW5hbHlzaXMgYW5kIFByZXByb2Nlc3NpbmcNCjxocj4NCmBgYHtyfQ0Kc21zJHR5cGU8LWZhY3RvcihzbXMkdHlwZSkNCnRhYmxlKHNtcyR0eXBlKQ0KYGBgIA0KDQojIyBDb3JwdXMgY3JlYXRpb24NCmBgYHtyfQ0Kc21zX2NvcnB1czwtVkNvcnB1cyhWZWN0b3JTb3VyY2Uoc21zJHRleHQpKQ0Kc21zX2NvcnB1cw0KYGBgDQoNCg0KYGBge3J9DQphcy5jaGFyYWN0ZXIoc21zX2NvcnB1c1tbMV1dKSAjIHNpbmdsZSBkb2N1bWVudA0KbGFwcGx5KHNtc19jb3JwdXNbMToxMF0sYXMuY2hhcmFjdGVyKSAjIG11bHRpcGxlIGRvY3VtZW50DQpgYGANCg0KIyMgQ29udmVydGluZyB0byBsb3dlcmNhc2UNCmBgYHtyfQ0Kc21zX2NvcnB1c19jbGVhbjwtdG1fbWFwKHNtc19jb3JwdXMsY29udGVudF90cmFuc2Zvcm1lcih0b2xvd2VyKSkNCnNtc19jb3JwdXNfY2xlYW4NCmFzLmNoYXJhY3RlcihzbXNfY29ycHVzW1sxXV0pDQphcy5jaGFyYWN0ZXIoc21zX2NvcnB1c19jbGVhbltbMV1dKQ0KYGBgDQoNCiMjIFJlbW92aW5nIE51bWJlcnMsUHVuY3R1YXRpb25zIGFuZCB3aGl0ZXNwYWNlcw0KYGBge3J9DQpzbXNfY29ycHVzX2NsZWFuPC10bV9tYXAoc21zX2NvcnB1c19jbGVhbixyZW1vdmVOdW1iZXJzKQ0KYXMuY2hhcmFjdGVyKHNtc19jb3JwdXNfY2xlYW5bWzFdXSkNCnNtc19jb3JwdXNfY2xlYW48LXRtX21hcChzbXNfY29ycHVzX2NsZWFuLHJlbW92ZVB1bmN0dWF0aW9uKQ0KYXMuY2hhcmFjdGVyKHNtc19jb3JwdXNfY2xlYW5bWzFdXSkNCnNtc19jb3JwdXNfY2xlYW48LXRtX21hcChzbXNfY29ycHVzX2NsZWFuLHN0cmlwV2hpdGVzcGFjZSkNCmFzLmNoYXJhY3RlcihzbXNfY29ycHVzX2NsZWFuW1sxXV0pDQpgYGANCg0KIyMgUmVtb3Zpbmcgc3RvcHdvcmRzDQpgYGB7cn0NCnNtc19jb3JwdXNfY2xlYW48LXRtX21hcChzbXNfY29ycHVzX2NsZWFuLHJlbW92ZVdvcmRzLHN0b3B3b3JkcygpKQ0KYXMuY2hhcmFjdGVyKHNtc19jb3JwdXNfY2xlYW5bWzFdXSkNCmBgYA0KDQojIyBTdGVtbWluZw0KYGBge3J9DQojaW5zdGFsbC5wYWNrYWdlcygiU25vd2JhbGxDIikNCmxpYnJhcnkoU25vd2JhbGxDKQ0Kc21zX2NvcnB1c19jbGVhbjwtdG1fbWFwKHNtc19jb3JwdXNfY2xlYW4sc3RlbURvY3VtZW50KQ0KYXMuY2hhcmFjdGVyKHNtc19jb3JwdXNfY2xlYW5bWzFdXSkNCmBgYA0KDQojIyBUb2tlbml6YXRpb24NCmBgYHtyfQ0Kc21zX2R0bTwtRG9jdW1lbnRUZXJtTWF0cml4KHNtc19jb3JwdXNfY2xlYW4pDQpzbXNfZHRtDQpgYGANCjxocj4NCiMgRGF0YSBBbmFseXplZCBhbmQgQ2xlYW5lZA0KPGhyPg0KIyMgU3BsaXR0aW5nIGRhdGEgaW50byB0cmFpbiBhbmQgdGVzdCBjYXRlZ29yaWVzDQoNCmBgYHtyfQ0Kc21zX3RyYWluPC1zbXNfZHRtWzE6NDE2OSxdDQpoZWFkKHNtc190cmFpbikNCnNtc190ZXN0PC1zbXNfZHRtWzQxNzA6NTU3NCxdDQpzbXNfdHJhaW5fbGFiZWxzPC1zbXMkdHlwZVsxOjQxNjldDQpoZWFkKHNtc190cmFpbl9sYWJlbHMpDQpucm93KHNtcykNCnNtc190ZXN0X2xhYmVsczwtc21zJHR5cGVbNDE3MDo1NTc0XQ0KYGBgDQoNCiMjIFByb3BvcnRpb24gb2Ygc3BhbSBpbiB0cmFpbmluZyBhbmQgdGVzdCBkYXRhDQpgYGB7cn0NCnByb3AudGFibGUodGFibGUoc21zX3RyYWluX2xhYmVscykpDQpwcm9wLnRhYmxlKHRhYmxlKHNtc190ZXN0X2xhYmVscykpDQpgYGANCg0KIyMgVmlzdWFsaXppbmcgV29yZGNsb3VkDQpgYGB7cn0NCiNpbnN0YWxsLnBhY2thZ2VzKCJ3b3JkY2xvdWQiKQ0KbGlicmFyeSh3b3JkY2xvdWQpDQp3b3JkY2xvdWQoc21zX2NvcnB1c19jbGVhbixtaW4uZnJlcSA9IDUwLHJhbmRvbS5vcmRlcj1UUlVFKQ0KYGBgDQoNCiMjIEZpbmRpbmcgRnJlcXVlbnQgdGVybXMNCmBgYHtyfQ0Kc21zX2ZyZXFfd29yZHM8LWZpbmRGcmVxVGVybXMoc21zX3RyYWluLDUpDQpzdHIoc21zX2ZyZXFfd29yZHMpDQpzbXNfZHRtX2ZyZXFfdHJhaW48LXNtc190cmFpblssc21zX2ZyZXFfd29yZHNdDQpoZWFkKHNtc19kdG1fZnJlcV90cmFpbikNCnNtc19kdG1fZnJlcV90ZXN0PC1zbXNfdGVzdFssc21zX2ZyZXFfd29yZHNdDQpoZWFkKHNtc19kdG1fZnJlcV90ZXN0KQ0KYGBgDQoNCiMjIENvbnZlcnRpbmcgMSB0byAnWWVzJyBhbmQgMCB0byAnTm8nDQpgYGB7cn0NCmNvbnZlcnRfY291bnRzPC1mdW5jdGlvbih4KXsNCiAgeDwtaWZlbHNlKHg+MCwnWWVzJywnTm8nKQ0KfQ0KYGBgDQoNCg0KYGBge3J9DQpzbXNfdHJhaW48LWFwcGx5KHNtc19kdG1fZnJlcV90cmFpbixNQVJHSU4gPSAyLGNvbnZlcnRfY291bnRzKQ0Kc21zX3Rlc3Q8LWFwcGx5KHNtc19kdG1fZnJlcV90ZXN0LE1BUkdJTiA9IDIsY29udmVydF9jb3VudHMpDQpgYGANCg0KDQpgYGB7cn0NCmxpYnJhcnkoZTEwNzEpDQpzbXNfY2xhc3NpZmllcjwtbmFpdmVCYXllcyhzbXNfdHJhaW4sc21zX3RyYWluX2xhYmVscykNCnNtc190ZXN0X3ByZWQ8LXByZWRpY3Qoc21zX2NsYXNzaWZpZXIsc21zX3Rlc3QpDQpsaWJyYXJ5KGdtb2RlbHMpDQpjb25mdXNpb25fbWF0cml4PC1Dcm9zc1RhYmxlKHNtc190ZXN0X3ByZWQsc21zX3Rlc3RfbGFiZWxzLGNoaXNxPUZBTFNFLHByb3AudCA9IEZBTFNFLGRubj1jKCdwcmVkaWN0ZWQnLCdhY3R1YWwnKSkNCmNvbmZ1c2lvbl9tYXRyaXhfbWF0cml4IDwtIGNvbmZ1c2lvbl9tYXRyaXgkdA0KYWNjdXJhY3k8LXN1bShkaWFnKGNvbmZ1c2lvbl9tYXRyaXhfbWF0cml4KSkvc3VtKGNvbmZ1c2lvbl9tYXRyaXhfbWF0cml4KQ0KY2F0KCJBY2N1cmFjeSBpcyIsYWNjdXJhY3kqMTAwLCIlIikNCmBgYA0KIyMjIEFjY3VyYWN5IGlzIDk3Ljkl