library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(e1071)
library(gmodels)
library(SnowballC)
The dataset is available on Kaggle (https://www.kaggle.com/uciml/sms-spam-collection-dataset). It has five columns, but the third to fifth contain no meaningful data. First the file is read into a data frame; the empty columns are then dropped and the remaining two renamed to something meaningful.
sms_df <- read.csv("C:\\Users\\Sunil\\Downloads\\spam.csv")
dim(sms_df) ## 5572 rows
## [1] 5572 5
names(sms_df)
## [1] "v1" "v2" "X" "X.1" "X.2"
summary(sms_df)
## v1 v2 X X.1
## Length:5572 Length:5572 Length:5572 Length:5572
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## X.2
## Length:5572
## Class :character
## Mode :character
head(sms_df)
## v1
## 1 ham
## 2 ham
## 3 spam
## 4 ham
## 5 ham
## 6 spam
## v2
## 1 Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
## 2 Ok lar... Joking wif u oni...
## 3 Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
## 4 U dun say so early hor... U c already then say...
## 5 Nah I don't think he goes to usf, he lives around here though
## 6 FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
## X X.1 X.2
## 1
## 2
## 3
## 4
## 5
## 6
str(sms_df)
## 'data.frame': 5572 obs. of 5 variables:
## $ v1 : chr "ham" "ham" "spam" "ham" ...
## $ v2 : chr "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question("| __truncated__ "U dun say so early hor... U c already then say..." ...
## $ X : chr "" "" "" "" ...
## $ X.1: chr "" "" "" "" ...
## $ X.2: chr "" "" "" "" ...
sms_df <- sms_df[-3:-5]                  # drop the three empty columns
names(sms_df) <- c("type","message")     # meaningful column names
sms_df$type <- as.factor(sms_df$type)    # ham/spam label as a factor
Let’s look at the most frequent words in spam and ham messages. The data frame is split into spam and ham subsets, and the wordcloud function is applied to each.
spam_df <- subset(sms_df,type == 'spam') #747 rows
ham_df <- subset(sms_df,type=='ham') #4825 rows
prop.table(table(sms_df$type))*100
##
## ham spam
## 86.59368 13.40632
### wordcloud ###
wordcloud(spam_df$message,max.words = 20,colors = rainbow(10))
wordcloud(ham_df$message,max.words = 10,colors = rainbow(10))
wordcloud(sms_df$message,max.words = 10,colors = rainbow(10))
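The clouds are easier to read when the most frequent words sit in the centre; wordcloud supports this via random.order, and min.freq suppresses rare words. A small variation on the spam cloud (the threshold of 5 is an arbitrary choice):
wordcloud(spam_df$message, max.words = 20, min.freq = 5,
          random.order = FALSE,            # most frequent words in the centre
          colors = brewer.pal(8, "Dark2")) # palette from RColorBrewer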
To analyse which words appear in spam versus ham messages, the messages have to be converted into a corpus and tokenized with the DocumentTermMatrix function, which turns them into a matrix of per-message word counts. The control list cleans the text along the way: it lowercases everything and removes numbers, punctuation and common English stop words.
sms_corpus <- VCorpus(VectorSource(sms_df$message))
sms_dtm <- DocumentTermMatrix(sms_corpus,control = list(
  tolower = T,            # lowercase all text
  removeNumbers = T,      # drop digits
  removePunctuation = T,  # drop punctuation
  stopwords = T,          # drop common English stop words
  stemDocument = T        # note: tm's control option is 'stemming = TRUE';
                          # 'stemDocument' is not recognised here, so the
                          # terms below are left unstemmed
))
inspect(sms_dtm[1:4,])
## <<DocumentTermMatrix (documents: 4, terms: 8239)>>
## Non-/sparse entries: 37/32919
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs already amore apply available buffet bugis cine comp entry say
## 1 0 1 0 1 1 1 1 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 1 0 0 0 0 1 2 0
## 4 1 0 0 0 0 0 0 0 0 2
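For reference, the same cleaning can be written out step by step with tm_map before building the matrix; this variant also applies real stemming through SnowballC. A minimal sketch (clean_corpus and sms_dtm_stemmed are illustrative names, not used in the rest of this analysis):
clean_corpus <- tm_map(sms_corpus, content_transformer(tolower))
clean_corpus <- tm_map(clean_corpus, removeNumbers)
clean_corpus <- tm_map(clean_corpus, removePunctuation)
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords("english"))
clean_corpus <- tm_map(clean_corpus, stemDocument)    # SnowballC stemmer
clean_corpus <- tm_map(clean_corpus, stripWhitespace)
sms_dtm_stemmed <- DocumentTermMatrix(clean_corpus)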
In order to predict whether a message is spam or ham, the dataset has to be divided into training and test sets: 75% of the data is assigned to training and 25% to test.
sms_train <- sms_dtm[1:4179, ]
sms_test <- sms_dtm[4180:5572, ]
sms_train_label <- sms_df[1:4179, ]$type
sms_test_label <- sms_df[4180:5572, ]$type
prop.table(table(sms_train_label))*100
## sms_train_label
## ham spam
## 86.48002 13.51998
prop.table(table(sms_test_label))*100
## sms_test_label
## ham spam
## 86.93467 13.06533
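The sequential split is reasonable here because, as the tables above show, both parts keep class proportions close to the full dataset (roughly 87% ham / 13% spam). If the rows were ordered by type, a random split would be needed instead; a minimal sketch with illustrative names (not used in the rest of this analysis, and the seed is arbitrary):
set.seed(42)                                      # arbitrary seed, for reproducibility
idx <- sample(nrow(sms_df), 0.75 * nrow(sms_df))  # random 75% of row indices
rand_train <- sms_dtm[idx, ]
rand_test  <- sms_dtm[-idx, ]
rand_train_label <- sms_df$type[idx]
rand_test_label  <- sms_df$type[-idx]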
Terms that appear at least five times are selected with the findFreqTerms function (lowfreq = 5 sets a minimum frequency, so rare terms are the ones filtered out). The train and test matrices are then restricted to those columns.
sms_freq_train <- findFreqTerms(sms_train,lowfreq = 5)
sms_freq_test <- findFreqTerms(sms_test,lowfreq = 5)
freq_train <- sms_train[,sms_freq_train]
freq_test <- sms_test[ ,sms_freq_test]
inspect(freq_test[5,]) #fifth row in freq_test
## <<DocumentTermMatrix (documents: 1, terms: 510)>>
## Non-/sparse entries: 8/502
## Sparsity : 98%
## Maximal term length: 11
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs able account just know like need really shit tomorrow wont
## 4184 0 0 1 1 1 1 1 1 1 1
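Note that the test matrix is filtered by its own frequent terms above, so train and test end up with slightly different vocabularies; e1071 matches features by name at prediction time, so this still runs, but the more conventional choice is to reuse the training vocabulary for both. Since both matrices were sliced from the same DTM and share the same columns, that is a one-line change (an alternative, not what was run above):
freq_test <- sms_test[ ,sms_freq_train]   # reuse the training vocabulary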
Naive Bayes classification is used to model the data. Since Naive Bayes works best with categorical features, every non-zero count is converted to 'yes' and every zero to 'no' using a user-defined function.
convert_count <- function(x){
  ifelse(x > 0, 'yes', 'no')   # non-zero counts become 'yes', zeros become 'no'
}
train <-apply(freq_train,MARGIN = 2,convert_count)
test <- apply(freq_test,MARGIN = 2,convert_count)
table(train)
## train
## no yes
## 5161569 24570
table(test)
## test
## no yes
## 703845 6585
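A quick peek at a corner of the converted matrix confirms the counts are now categorical (the row and column ranges are arbitrary):
train[1:3, 1:5]   # first three documents, first five terms of the training set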
classifier <- naiveBayes(train,sms_train_label)
sms_predict <- predict(classifier,test)
CrossTable(sms_test_label,sms_predict, prop.t = F,prop.chisq = F,dnn = c("Actual","Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1393
##
##
## | Predicted
## Actual | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1195 | 16 | 1211 |
## | 0.987 | 0.013 | 0.869 |
## | 0.980 | 0.092 | |
## -------------|-----------|-----------|-----------|
## spam | 24 | 158 | 182 |
## | 0.132 | 0.868 | 0.131 |
## | 0.020 | 0.908 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1219 | 174 | 1393 |
## | 0.875 | 0.125 | |
## -------------|-----------|-----------|-----------|
##
##
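The overall accuracy can be read straight off the table: (1195 + 158) / 1393 ≈ 97.1%. It can also be computed directly, here with base R's table rather than CrossTable:
conf <- table(Actual = sms_test_label, Predicted = sms_predict)
sum(diag(conf)) / sum(conf)   # fraction of messages classified correctly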
The model predicted 16 ham (not-spam) messages as spam and 24 spam messages as ham. To reduce these misclassifications, the model has to be improved. In the Naive Bayes algorithm the laplace smoothing parameter is 0 by default, so let's change its value and check the results.
laplace = 1
classifier <- naiveBayes(train,sms_train_label,laplace = 1)
sms_predict <- predict(classifier,test)
CrossTable(sms_test_label,sms_predict, prop.t = F,prop.chisq = F,dnn = c("Actual","Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1393
##
##
## | Predicted
## Actual | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1191 | 20 | 1211 |
## | 0.983 | 0.017 | 0.869 |
## | 0.983 | 0.110 | |
## -------------|-----------|-----------|-----------|
## spam | 20 | 162 | 182 |
## | 0.110 | 0.890 | 0.131 |
## | 0.017 | 0.890 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1211 | 182 | 1393 |
## | 0.869 | 0.131 | |
## -------------|-----------|-----------|-----------|
##
##
laplace = 1 increased the number of ham messages misclassified as spam from 16 to 20 and decreased the number of spam messages misclassified as ham from 24 to 20.
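Rather than re-fitting by hand for each value, a short loop can compare several laplace settings at once; a minimal sketch (the candidate values are arbitrary):
for (lp in c(0, 0.5, 1, 2)) {
  m <- naiveBayes(train, sms_train_label, laplace = lp)  # fit with this smoothing
  p <- predict(m, test)
  cat("laplace =", lp, "->", sum(p != sms_test_label), "misclassified\n")
}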
laplace = -1
classifier <- naiveBayes(train,sms_train_label,laplace = -1)
sms_predict <- predict(classifier,test)
CrossTable(sms_test_label,sms_predict, prop.t = F,prop.chisq = F,dnn = c("Actual","Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1393
##
##
## | Predicted
## Actual | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1202 | 9 | 1211 |
## | 0.993 | 0.007 | 0.869 |
## | 0.975 | 0.056 | |
## -------------|-----------|-----------|-----------|
## spam | 31 | 151 | 182 |
## | 0.170 | 0.830 | 0.131 |
## | 0.025 | 0.944 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1233 | 160 | 1393 |
## | 0.885 | 0.115 | |
## -------------|-----------|-----------|-----------|
##
##
laplace = -1 produced the best result for ham messages, cutting ham-to-spam misclassifications to 9, though at the cost of letting more spam through (31 spam messages predicted as ham). Interestingly, all three models have the same overall accuracy (1353/1393 ≈ 97.1%); they differ only in how the 40 errors are split between the two classes. A note of caution: Laplace smoothing is normally defined with a positive pseudo-count, so a negative laplace value is non-standard and the apparent improvement on ham should be treated sceptically.