Libraries

#library
library(tm)
## Loading required package: NLP
library(SnowballC)
library(Matrix)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(caret)
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(glmnet) #logistic regression
## Loading required package: foreach
## Loaded glmnet 2.0-9
library(e1071) #naive bayes

Exploring and preparing the data

set.seed(123)

# read the sms data into the sms data frame
sms_raw <- read.csv("sms_spam.csv", stringsAsFactors = FALSE)

# examine the structure of the sms data
str(sms_raw)
## 'data.frame':    5559 obs. of  2 variables:
##  $ type: chr  "ham" "ham" "ham" "spam" ...
##  $ text: chr  "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out"| __truncated__ ...
# convert spam/ham to factor.
sms_raw$type <- factor(sms_raw$type)

# create DTM using TM package

sms_dtm <- Corpus(VectorSource(sms_raw$text)) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace) %>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(stemDocument) %>%
  DocumentTermMatrix()

Logistic Regression

Creating train and test data

#creating train and test data
index = sample(5559, 5559*0.8)

sms_train_matrix <- as.matrix(sms_dtm[index, ])
sms_test_matrix <- as.matrix(sms_dtm[-index, ])

sms_dtm_train <- Matrix(sms_train_matrix, sparse = T)
sms_dtm_test  <- Matrix(sms_test_matrix, sparse = T)

# save the labels
sms_train_labels <- sms_raw[index, ]$type
sms_test_labels  <- sms_raw[-index, ]$type

# check for the proportion of train and test
prop.table(table(sms_train_labels))
## sms_train_labels
##       ham      spam 
## 0.8682258 0.1317742
prop.table(table(sms_test_labels))
## sms_test_labels
##       ham      spam 
## 0.8552158 0.1447842

Train model

#create model and prediction
fit_ridge <- glmnet(sms_dtm_train, sms_train_labels, family = 'binomial',alpha = 0)
fit_lasso <- glmnet(sms_dtm_train, sms_train_labels, family = 'binomial',alpha = 1)
pred <- predict(fit_ridge, sms_dtm_test, type = 'response')
head(pred[,1:11])
##           s0        s1        s2        s3        s4        s5        s6
## 25 0.1317742 0.1310068 0.1309697 0.1309309 0.1308902 0.1308476 0.1308029
## 37 0.1317742 0.1317430 0.1317408 0.1317383 0.1317357 0.1317328 0.1317297
## 40 0.1317742 0.1309899 0.1309521 0.1309125 0.1308711 0.1308276 0.1307820
## 43 0.1317742 0.1311696 0.1311402 0.1311094 0.1310770 0.1310431 0.1310074
## 46 0.1317742 0.1309396 0.1308994 0.1308573 0.1308132 0.1307669 0.1307184
## 51 0.1317742 0.1312852 0.1312613 0.1312362 0.1312099 0.1311822 0.1311532
##           s7        s8        s9       s10
## 25 0.1307559 0.1307067 0.1306551 0.1306010
## 37 0.1317264 0.1317228 0.1317188 0.1317146
## 40 0.1307342 0.1306841 0.1306315 0.1305764
## 43 0.1309700 0.1309308 0.1308895 0.1308463
## 46 0.1306676 0.1306142 0.1305583 0.1304996
## 51 0.1311227 0.1310907 0.1310570 0.1310216
head(coef(fit_ridge, s= 1.52))
## 6 x 1 sparse Matrix of class "dgCMatrix"
##                       1
## (Intercept) -2.41854617
## check       -0.03471553
## good        -0.02413085
## hope        -0.03213501
## just         0.03199336
## week         0.10763024
glmnet_predict <- predict(fit_ridge, newx=sms_dtm_test,s=c(1.52, 0.1))
head(glmnet_predict)
##            1         2
## 25 -2.500983 -2.522771
## 37 -2.184031 -2.197195
## 40 -2.473129 -2.493386
## 43 -2.462126 -2.483747
## 46 -2.523148 -2.545249
## 51 -2.379942 -2.398343
#plotting Lasso and Ridge method
plot(fit_ridge, xlab = 'Ridge Method')
abline(0,0)

plot(fit_lasso, xlab = 'Lasso Method')
abline(0,0)

#lambda, comparison between lasso and ridge
glmnet_fit <- cv.glmnet(x=sms_dtm_train , y=sms_train_labels, family='binomial', alpha=0)
head(coef(glmnet_fit, s='lambda.min'))
## 6 x 1 sparse Matrix of class "dgCMatrix"
##                       1
## (Intercept) -2.41925304
## check       -0.03475428
## good        -0.02415112
## hope        -0.03216682
## just         0.03202778
## week         0.10772625
plot(glmnet_fit)

glmnet_fit$lambda.min
## [1] 1.517223
glmnet_lasso <- cv.glmnet(x=sms_dtm_train , y=sms_train_labels, family='binomial', alpha=1)
plot(glmnet_lasso)

glmnet_lasso$lambda.min
## [1] 0.002306044
# Generate predictions
preds_logit <- predict(glmnet_fit, newx=sms_dtm_test, type='response', s='lambda.min')
head(preds_logit)
##             1
## 25 0.07573374
## 37 0.10115024
## 40 0.07771065
## 43 0.07849947
## 46 0.07419597
## 51 0.08466318
summary(preds_logit)
##        1          
##  Min.   :0.04811  
##  1st Qu.:0.07278  
##  Median :0.07909  
##  Mean   :0.12678  
##  3rd Qu.:0.09089  
##  Max.   :0.70894
preds_newlogit <- rep('ham',length(preds_logit))
preds_newlogit[preds_logit>=0.5] <- 'spam'

Check performance of Logistic Regression

#confusion matrix
confusionMatrix(sms_test_labels, preds_newlogit)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction ham spam
##       ham  951    0
##       spam  97   64
##                                           
##                Accuracy : 0.9128          
##                  95% CI : (0.8946, 0.9287)
##     No Information Rate : 0.9424          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.5302          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9074          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.3975          
##              Prevalence : 0.9424          
##          Detection Rate : 0.8552          
##    Detection Prevalence : 0.8552          
##       Balanced Accuracy : 0.9537          
##                                           
##        'Positive' Class : ham             
## 
# Put results into dataframe for plotting.
results <- data.frame(pred=preds_logit, actual=sms_test_labels )


ggplot(results, aes(x=preds_logit, fill=actual)) + geom_density(alpha = 0.2)

prediction_logit <- prediction(preds_logit, sms_test_labels)
perf <- performance(prediction_logit, measure = "tpr", x.measure = "fpr")

auc <- performance(prediction_logit, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.9835805
roc_logit <- data.frame(fpr=unlist(perf@x.values), tpr=unlist(perf@y.values))

ggplot(roc_logit, aes(x=fpr, ymin=0, ymax=tpr)) +
  geom_ribbon(alpha=0.1) +
  geom_line(aes(y=tpr)) +
  geom_abline(slope=1, intercept=0, linetype='dashed') +
  ggtitle("ROC Curve For Logistic Regression") +
  ylab('True Positive Rate') +
  xlab('False Positive Rate')

Naive Bayes

# creating training and test datasets
sms_naive_train <- sms_dtm[index, ]
sms_naive_test  <- sms_dtm[-index, ]

# save the labels
sms_naive_train_labels <- sms_raw[index, ]$type
sms_naive_test_labels  <- sms_raw[-index, ]$type

# check for the proportion of train and test
prop.table(table(sms_naive_train_labels))
## sms_naive_train_labels
##       ham      spam 
## 0.8682258 0.1317742
prop.table(table(sms_naive_test_labels))
## sms_naive_test_labels
##       ham      spam 
## 0.8552158 0.1447842
# save frequently-appearing terms to a character vector
sms_freq_words <- findFreqTerms(sms_naive_train, 5)

# create DTMs with only the frequent terms
sms_dtm_freq_train <- sms_naive_train[ , sms_freq_words]
sms_dtm_freq_test <- sms_naive_test[ , sms_freq_words]

# convert counts to a factor
convert_counts <- function(x) {
  x <- ifelse(x > 0, "Yes", "No")
}

# apply() convert_counts() to columns of train/test data
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test  <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)

#apply model
sms_classifier <- naiveBayes(sms_train, sms_naive_train_labels)

Check performance of Naive Bayes

#confusion matrix
sms_test_pred_confusion <- predict(sms_classifier, sms_test)
confusionMatrix(sms_naive_test_labels, sms_test_pred_confusion)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction ham spam
##       ham  943    8
##       spam  15  146
##                                           
##                Accuracy : 0.9793          
##                  95% CI : (0.9691, 0.9868)
##     No Information Rate : 0.8615          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9149          
##  Mcnemar's Test P-Value : 0.2109          
##                                           
##             Sensitivity : 0.9843          
##             Specificity : 0.9481          
##          Pos Pred Value : 0.9916          
##          Neg Pred Value : 0.9068          
##              Prevalence : 0.8615          
##          Detection Rate : 0.8480          
##    Detection Prevalence : 0.8552          
##       Balanced Accuracy : 0.9662          
##                                           
##        'Positive' Class : ham             
## 
sms_naive_pred <- predict(sms_classifier, sms_test,type = 'raw')

#auc and roc curve
prediction_naive <- prediction(sms_naive_pred[,"spam"], sms_naive_test_labels)

perf_naive <- performance(prediction_naive, measure = "tpr", x.measure = "fpr")

auc_naive <- performance(prediction_naive, measure = "auc")
auc_naive <- auc_naive@y.values[[1]]
auc_naive
## [1] 0.9850795
roc_naive <- data.frame(fpr=unlist(perf_naive@x.values), tpr=unlist(perf_naive@y.values))

ggplot(roc_naive, aes(x=fpr, ymin=0, ymax=tpr)) +
  geom_ribbon(alpha=0.1) +
  geom_line(aes(y=tpr)) +
  geom_abline(slope=1, intercept=0, linetype='dashed') +
  ggtitle("ROC Curve For Naive Bayes") +
  ylab('True Positive Rate') +
  xlab('False Positive Rate')

Comparing Logistic Regression and Naive Bayes

roc_naive$model <- 'Naive Bayes'
roc_logit$model <- 'Logistic Regression'

bind1 <- rbind(roc_naive, roc_logit)
bind1 %>%
  ggplot(data=., aes(x=fpr, y=tpr, linetype=model, color=model)) +
  geom_line() +
  geom_abline(slope=1, intercept=0, linetype='dashed')  +
  theme(legend.position=c(0.8,0.3), legend.background = element_rect(fill = 'grey90')) +
  ggtitle("Logistic Regression vs. Naive Bayes") +
  ylab('True Positive Rate') +
  xlab('False Positive Rate')

Improving Model

Use bigram to improve our accuracy. NGramTokenizer from RWeka package splits strings into n-grams with given minimal and miximal numbers of gram.

# tokenizer
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))

control <- list(
  tokenize=BigramTokenizer,
  bounds = list(global = c(5, 500))
  
)

# creating dtm using 'BigramTokenzier'
sms_dtm_bi <- Corpus(VectorSource(sms_raw$text)) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace) %>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(stemDocument) %>%
  DocumentTermMatrix(control=control)

# create training and test data
sms_bitr_matrix <- as.matrix(sms_dtm_bi[index, ])
sms_bite_matrix <- as.matrix(sms_dtm_bi[-index, ])

sms_bi_train <- Matrix(sms_bitr_matrix, sparse = T)
sms_bi_test  <- Matrix(sms_bite_matrix, sparse = T)

# labels
sms_trainb_labels <- sms_raw[index, ]$type
sms_testb_labels  <- sms_raw[-index, ]$type

# train the model - using Lasso method
glmnet_bi <- cv.glmnet(sms_bi_train, sms_trainb_labels, family='binomial', alpha=0)

# generate predictions
preds_bi <- predict(glmnet_bi, sms_bi_test, type='response', s="lambda.min")

# put results into dataframe for plotting
results_bi <- data.frame(pred=preds_bi, actual = sms_testb_labels)

ggplot(results_bi, aes(x=preds_bi, fill = actual)) + geom_density(alpha = 0.2)

Check performance of bigram method

preds_newbi <- rep('ham',length(preds_bi))
preds_newbi[preds_bi>=0.5] <- 'spam'

#confusion matrix
confusionMatrix(sms_testb_labels, preds_newbi)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction ham spam
##       ham  945    6
##       spam  26  135
##                                           
##                Accuracy : 0.9712          
##                  95% CI : (0.9596, 0.9802)
##     No Information Rate : 0.8732          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8775          
##  Mcnemar's Test P-Value : 0.0007829       
##                                           
##             Sensitivity : 0.9732          
##             Specificity : 0.9574          
##          Pos Pred Value : 0.9937          
##          Neg Pred Value : 0.8385          
##              Prevalence : 0.8732          
##          Detection Rate : 0.8498          
##    Detection Prevalence : 0.8552          
##       Balanced Accuracy : 0.9653          
##                                           
##        'Positive' Class : ham             
## 
# auc and roc curve
prediction_bi <- prediction(preds_bi, sms_testb_labels)
perf_bi <- performance(prediction_bi, measure = "tpr", x.measure = "fpr")

auc_bi <- performance(prediction_bi, measure = "auc")
auc_bi <- auc_bi@y.values[[1]]
auc_bi
## [1] 0.9792634
roc_bi <- data.frame(fpr=unlist(perf_bi@x.values), tpr=unlist(perf_bi@y.values))

ggplot(roc_bi, aes(x=fpr, ymin=0, ymax=tpr)) +
  geom_ribbon(alpha=0.1) +
  geom_line(aes(y=tpr)) +
  geom_abline(slope=1, intercept=0, linetype='dashed') +
  ggtitle("ROC Curve") +
  ylab('True Positive Rate') +
  xlab('False Positive Rate')

Comparing Logistic Regression, Naive Bayes and bigram

#comparing three roc curves
roc_bi$model <- 'With Bigram'
roc_logit$model <- 'Logistic Regression'
roc_naive$model <- 'Naive Bayes'
bind2 <- rbind(roc_bi, roc_naive, roc_logit)

bind2 %>%
  ggplot(data=., aes(x=fpr, y=tpr, linetype=model, color=model)) +
  geom_line() +
  geom_abline(slope=1, intercept=0, linetype='dashed')  +
  theme(legend.position=c(0.8,0.3), legend.background = element_rect(fill = 'grey90')) +
  ggtitle("Logistic Regression vs. Naive Bayes vs. Bigram model") +
  ylab('True Positive Rate') +
  xlab('False Positive Rate')