library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(e1071)
library(gmodels)
library(SnowballC)

Data loading & cleansing

The dataset is available on Kaggle (https://www.kaggle.com/uciml/sms-spam-collection-dataset). It has 5 columns, with no meaningful data in the third to fifth columns. First, the file is loaded as a data frame; then the empty columns are dropped and the remaining two columns are given meaningful names.

# Read the raw Kaggle export (the first line holds the column names).
sms_df <- read.csv(
  file = "C:\\Users\\Sunil\\Downloads\\spam.csv",
  header = TRUE
)
dim(sms_df)                             ## 5572 rows
## [1] 5572    5
colnames(sms_df)
## [1] "v1"  "v2"  "X"   "X.1" "X.2"
summary(object = sms_df)
##       v1                 v2                 X                 X.1           
##  Length:5572        Length:5572        Length:5572        Length:5572       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##      X.2           
##  Length:5572       
##  Class :character  
##  Mode  :character
# Preview the first six rows of the raw data.
head(sms_df, n = 6L)
##     v1
## 1  ham
## 2  ham
## 3 spam
## 4  ham
## 5  ham
## 6 spam
##                                                                                                                                                            v2
## 1                                             Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
## 2                                                                                                                               Ok lar... Joking wif u oni...
## 3 Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
## 4                                                                                                           U dun say so early hor... U c already then say...
## 5                                                                                               Nah I don't think he goes to usf, he lives around here though
## 6        FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv
##   X X.1 X.2
## 1          
## 2          
## 3          
## 4          
## 5          
## 6
# Show the structure of the raw data frame.
str(sms_df)
## 'data.frame':    5572 obs. of  5 variables:
##  $ v1 : chr  "ham" "ham" "spam" "ham" ...
##  $ v2 : chr  "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question("| __truncated__ "U dun say so early hor... U c already then say..." ...
##  $ X  : chr  "" "" "" "" ...
##  $ X.1: chr  "" "" "" "" ...
##  $ X.2: chr  "" "" "" "" ...
# Drop columns 3 to 5, rename the two remaining columns,
# and store the class label as a factor.
sms_df <- sms_df[-(3:5)]
colnames(sms_df) <- c("type", "message")
sms_df$type <- factor(sms_df$type)

Subsetting to SPAM & HAM, Create wordclouds

Let’s look at the most frequent words in spam and ham messages. The data frame is split into a spam subset and a ham subset, and the wordcloud function is applied to each subset as well as to the full dataset.

# Split the messages by class label.
spam_df <- sms_df[which(sms_df$type == "spam"), ]  #747 rows
ham_df <- sms_df[which(sms_df$type == "ham"), ]    #4825 rows

# Class balance, as percentages.
prop.table(table(sms_df$type))*100
## 
##      ham     spam 
## 86.59368 13.40632
### wordcloud ###
# Spam messages only (top 20 words).
wordcloud(words = spam_df$message, max.words = 20, colors = rainbow(n = 10))

# Ham messages only (top 10 words).
wordcloud(words = ham_df$message, max.words = 10, colors = rainbow(n = 10))

# All messages (top 10 words).
wordcloud(words = sms_df$message, max.words = 10, colors = rainbow(n = 10))

Creation of Corpus & Document term matrix

The words in the messages have to be analyzed to estimate how likely each word is to appear in a spam or a ham message. The messages are first converted into a corpus (a collection of documents) and then tokenized with the DocumentTermMatrix function, which produces a matrix holding the count of each word in each message. The cleansing options passed to DocumentTermMatrix lower-case the text and remove common English words (stop words), punctuation, numbers, etc.

# Build a corpus with one document per SMS message.
sms_corpus <- VCorpus(VectorSource(sms_df$message))

# Tokenise the corpus into a document-term matrix, cleaning terms on the way.
# The stemming option in the control list is named `stemming`; the earlier
# `stemDocument = T` entry is not a recognised option, so terms were left
# unstemmed (the recorded output shows e.g. "already" and "available").
sms_dtm <- DocumentTermMatrix(
  sms_corpus,
  control = list(
    tolower = TRUE,
    removeNumbers = TRUE,
    removePunctuation = TRUE,
    stopwords = TRUE,
    stemming = TRUE
  )
)
# NOTE: the recorded output that follows was produced without stemming;
# re-run the notebook to refresh it.
inspect(sms_dtm[1:4, ])
## <<DocumentTermMatrix (documents: 4, terms: 8239)>>
## Non-/sparse entries: 37/32919
## Sparsity           : 100%
## Maximal term length: 40
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs already amore apply available buffet bugis cine comp entry say
##    1       0     1     0         1      1     1    1    0     0   0
##    2       0     0     0         0      0     0    0    0     0   0
##    3       0     0     1         0      0     0    0    1     2   0
##    4       1     0     0         0      0     0    0    0     0   2

Partition into Train & Test Dataset

In order to predict whether a message is spam or ham, the dataset has to be divided into a train and a test dataset. The first 75% of the rows are assigned to train and the remaining 25% are assigned to test.

# Split sequentially: the first 75% of the rows train the model and the
# remaining rows test it. For the 5572-row dataset this gives rows 1-4179
# and 4180-5572, without hard-coding the row count.
n_total <- nrow(sms_df)
train_rows <- seq_len(floor(0.75 * n_total))
test_rows <- setdiff(seq_len(n_total), train_rows)

sms_train <- sms_dtm[train_rows, ]
sms_test <- sms_dtm[test_rows, ]

# Class labels aligned with the rows of each partition.
sms_train_label <- sms_df$type[train_rows]
sms_test_label <- sms_df$type[test_rows]

# Check that both partitions keep roughly the same ham/spam balance.
prop.table(table(sms_train_label))*100
## sms_train_label
##      ham     spam 
## 86.48002 13.51998
prop.table(table(sms_test_label))*100
## sms_test_label
##      ham     spam 
## 86.93467 13.06533

Find Frequent words

Terms that occur at least 5 times are selected using the ‘findFreqTerms’ function (lowfreq = 5). The resulting term names are then used to keep only those columns of the train and test document-term matrices.

# Keep only terms that occur at least 5 times in the training documents.
sms_freq_train <- findFreqTerms(sms_train, lowfreq = 5)
# The vocabulary must come from the training data alone, so that the test
# matrix has exactly the features the classifier is trained on. Selecting
# terms by test-set frequency gave the two matrices different columns.
# `sms_freq_test` is kept as a name for backward compatibility.
sms_freq_test <- sms_freq_train
freq_train <- sms_train[, sms_freq_train]
freq_test <- sms_test[, sms_freq_test]
# NOTE: the recorded output that follows was produced with a test-derived
# vocabulary; re-run the notebook to refresh it.
inspect(freq_test[5, ])     #fifth row in freq_test
## <<DocumentTermMatrix (documents: 1, terms: 510)>>
## Non-/sparse entries: 8/502
## Sparsity           : 98%
## Maximal term length: 11
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   able account just know like need really shit tomorrow wont
##   4184    0       0    1    1    1    1      1    1        1    1

Naive Bayes classification

Naive Bayes classification is used to analyse the data. As this Naive Bayes model works on categorical data, the word counts are converted with a user-defined function: a count greater than 0 becomes ‘yes’ and a count of 0 becomes ‘no’.

# Recode term counts as categorical indicators.
#
# x: a numeric vector (or matrix) of term counts.
# Returns an object shaped like `x` holding "yes" where the count is
# greater than 0 and "no" otherwise.
convert_count <- function(x) {
  # Return the expression directly: assigning it to a local first made the
  # function return its value invisibly.
  ifelse(x > 0, "yes", "no")
}

# Recode both count matrices, column by column, into "yes"/"no" indicators.
train <- apply(X = freq_train, MARGIN = 2, FUN = convert_count)
test <- apply(X = freq_test, MARGIN = 2, FUN = convert_count)

# Overall tally of indicator values in each matrix.
table(train)
## train
##      no     yes 
## 5161569   24570
table(test)
## test
##     no    yes 
## 703845   6585
# Fit the baseline model (default laplace = 0) and score the test set.
classifier <- naiveBayes(x = train, y = sms_train_label)
sms_predict <- predict(object = classifier, newdata = test)
# Confusion matrix of actual vs. predicted labels.
CrossTable(
  x = sms_test_label,
  y = sms_predict,
  prop.t = FALSE,
  prop.chisq = FALSE,
  dnn = c("Actual", "Predicted")
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1393 
## 
##  
##              | Predicted 
##       Actual |       ham |      spam | Row Total | 
## -------------|-----------|-----------|-----------|
##          ham |      1195 |        16 |      1211 | 
##              |     0.987 |     0.013 |     0.869 | 
##              |     0.980 |     0.092 |           | 
## -------------|-----------|-----------|-----------|
##         spam |        24 |       158 |       182 | 
##              |     0.132 |     0.868 |     0.131 | 
##              |     0.020 |     0.908 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |      1219 |       174 |      1393 | 
##              |     0.875 |     0.125 |           | 
## -------------|-----------|-----------|-----------|
## 
## 

16 messages that are not spam (ham) are predicted as spam, and 24 messages that are spam are predicted as ham by the model. To reduce these misclassifications, the model has to be improved. In the Naive Bayes implementation used here, the laplace value is 0 by default. So, let's change the laplace value and check the results.

Laplace = 1

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1393 
## 
##  
##              | Predicted 
##       Actual |       ham |      spam | Row Total | 
## -------------|-----------|-----------|-----------|
##          ham |      1191 |        20 |      1211 | 
##              |     0.983 |     0.017 |     0.869 | 
##              |     0.983 |     0.110 |           | 
## -------------|-----------|-----------|-----------|
##         spam |        20 |       162 |       182 | 
##              |     0.110 |     0.890 |     0.131 | 
##              |     0.017 |     0.890 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |      1211 |       182 |      1393 | 
##              |     0.869 |     0.131 |           | 
## -------------|-----------|-----------|-----------|
## 
## 

With laplace = 1, the number of ham messages misclassified as spam increased by 4 (from 16 to 20), and the number of spam messages misclassified as ham decreased by 4 (from 24 to 20).

laplace = -1

# Refit with laplace = -1 and score the test set again.
# NOTE(review): the e1071 documentation describes `laplace` as a positive
# smoothing constant (0 disables smoothing); a negative value appears to be
# outside that range - confirm this experiment is intended.
classifier <- naiveBayes(x = train, y = sms_train_label, laplace = -1)
sms_predict <- predict(object = classifier, newdata = test)
CrossTable(
  x = sms_test_label,
  y = sms_predict,
  prop.t = FALSE,
  prop.chisq = FALSE,
  dnn = c("Actual", "Predicted")
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1393 
## 
##  
##              | Predicted 
##       Actual |       ham |      spam | Row Total | 
## -------------|-----------|-----------|-----------|
##          ham |      1202 |         9 |      1211 | 
##              |     0.993 |     0.007 |     0.869 | 
##              |     0.975 |     0.056 |           | 
## -------------|-----------|-----------|-----------|
##         spam |        31 |       151 |       182 | 
##              |     0.170 |     0.830 |     0.131 | 
##              |     0.025 |     0.944 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |      1233 |       160 |      1393 | 
##              |     0.885 |     0.115 |           | 
## -------------|-----------|-----------|-----------|
## 
## 

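laplace = -1 produced the best result for non-spam data: only 9 ham messages were misclassified as spam. This came at the cost of more missed spam (31 spam messages predicted as ham). Note also that Laplace smoothing is normally a non-negative constant, so the result for a negative value should be interpreted with caution.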