Libraries

#library
library(tm)

## Loading required package: NLP

library(SnowballC)
library(Matrix)
library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library(caret)

## Loading required package: lattice

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(glmnet) #logistic regression

## Loading required package: foreach

## Loaded glmnet 2.0-9

library(e1071) #naive bayes

Exploring and preparing the data

set.seed(123)

# read the sms data into the sms data frame
sms_raw <- read.csv("sms_spam.csv", stringsAsFactors = FALSE)

# examine the structure of the sms data
str(sms_raw)

## 'data.frame':    5559 obs. of  2 variables:
##  $ type: chr  "ham" "ham" "ham" "spam" ...
##  $ text: chr  "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or Â£10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out"| __truncated__ ...

# convert spam/ham to factor.
sms_raw$type <- factor(sms_raw$type)

# create DTM using TM package

sms_dtm <- Corpus(VectorSource(sms_raw$text)) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace) %>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(stemDocument) %>%
  DocumentTermMatrix()

Logistic Regression

Creating train and test data

#creating train and test data
index = sample(5559, 5559*0.8)

sms_train_matrix <- as.matrix(sms_dtm[index, ])
sms_test_matrix <- as.matrix(sms_dtm[-index, ])

sms_dtm_train <- Matrix(sms_train_matrix, sparse = T)
sms_dtm_test  <- Matrix(sms_test_matrix, sparse = T)

# save the labels
sms_train_labels <- sms_raw[index, ]$type
sms_test_labels  <- sms_raw[-index, ]$type

# check for the proportion of train and test
prop.table(table(sms_train_labels))

## sms_train_labels
##       ham      spam 
## 0.8682258 0.1317742

prop.table(table(sms_test_labels))

## sms_test_labels
##       ham      spam 
## 0.8552158 0.1447842

Train model

#create model and prediction
fit_ridge <- glmnet(sms_dtm_train, sms_train_labels, family = 'binomial',alpha = 0)
fit_lasso <- glmnet(sms_dtm_train, sms_train_labels, family = 'binomial',alpha = 1)
pred <- predict(fit_ridge, sms_dtm_test, type = 'response')
head(pred[,1:11])

##           s0        s1        s2        s3        s4        s5        s6
## 25 0.1317742 0.1310068 0.1309697 0.1309309 0.1308902 0.1308476 0.1308029
## 37 0.1317742 0.1317430 0.1317408 0.1317383 0.1317357 0.1317328 0.1317297
## 40 0.1317742 0.1309899 0.1309521 0.1309125 0.1308711 0.1308276 0.1307820
## 43 0.1317742 0.1311696 0.1311402 0.1311094 0.1310770 0.1310431 0.1310074
## 46 0.1317742 0.1309396 0.1308994 0.1308573 0.1308132 0.1307669 0.1307184
## 51 0.1317742 0.1312852 0.1312613 0.1312362 0.1312099 0.1311822 0.1311532
##           s7        s8        s9       s10
## 25 0.1307559 0.1307067 0.1306551 0.1306010
## 37 0.1317264 0.1317228 0.1317188 0.1317146
## 40 0.1307342 0.1306841 0.1306315 0.1305764
## 43 0.1309700 0.1309308 0.1308895 0.1308463
## 46 0.1306676 0.1306142 0.1305583 0.1304996
## 51 0.1311227 0.1310907 0.1310570 0.1310216

head(coef(fit_ridge, s= 1.52))

## 6 x 1 sparse Matrix of class "dgCMatrix"
##                       1
## (Intercept) -2.41854617
## check       -0.03471553
## good        -0.02413085
## hope        -0.03213501
## just         0.03199336
## week         0.10763024

glmnet_predict <- predict(fit_ridge, newx=sms_dtm_test,s=c(1.52, 0.1))
head(glmnet_predict)

##            1         2
## 25 -2.500983 -2.522771
## 37 -2.184031 -2.197195
## 40 -2.473129 -2.493386
## 43 -2.462126 -2.483747
## 46 -2.523148 -2.545249
## 51 -2.379942 -2.398343

#plotting Lasso and Ridge method
plot(fit_ridge, xlab = 'Ridge Method')
abline(0,0)

plot(fit_lasso, xlab = 'Lasso Method')
abline(0,0)

#lambda, comparison between lasso and ridge
glmnet_fit <- cv.glmnet(x=sms_dtm_train , y=sms_train_labels, family='binomial', alpha=0)
head(coef(glmnet_fit, s='lambda.min'))

## 6 x 1 sparse Matrix of class "dgCMatrix"
##                       1
## (Intercept) -2.41925304
## check       -0.03475428
## good        -0.02415112
## hope        -0.03216682
## just         0.03202778
## week         0.10772625

plot(glmnet_fit)

glmnet_fit$lambda.min

## [1] 1.517223

glmnet_lasso <- cv.glmnet(x=sms_dtm_train , y=sms_train_labels, family='binomial', alpha=1)
plot(glmnet_lasso)

glmnet_lasso$lambda.min

## [1] 0.002306044

# Generate predictions
preds_logit <- predict(glmnet_fit, newx=sms_dtm_test, type='response', s='lambda.min')
head(preds_logit)

##             1
## 25 0.07573374
## 37 0.10115024
## 40 0.07771065
## 43 0.07849947
## 46 0.07419597
## 51 0.08466318

summary(preds_logit)

##        1          
##  Min.   :0.04811  
##  1st Qu.:0.07278  
##  Median :0.07909  
##  Mean   :0.12678  
##  3rd Qu.:0.09089  
##  Max.   :0.70894

preds_newlogit <- rep('ham',length(preds_logit))
preds_newlogit[preds_logit>=0.5] <- 'spam'

Check performance of Logistic Regression

#confusion matrix
confusionMatrix(sms_test_labels, preds_newlogit)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction ham spam
##       ham  951    0
##       spam  97   64
##                                           
##                Accuracy : 0.9128          
##                  95% CI : (0.8946, 0.9287)
##     No Information Rate : 0.9424          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.5302          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9074          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.3975          
##              Prevalence : 0.9424          
##          Detection Rate : 0.8552          
##    Detection Prevalence : 0.8552          
##       Balanced Accuracy : 0.9537          
##                                           
##        'Positive' Class : ham             
##

# Put results into dataframe for plotting.
results <- data.frame(pred=preds_logit, actual=sms_test_labels )


ggplot(results, aes(x=preds_logit, fill=actual)) + geom_density(alpha = 0.2)

prediction_logit <- prediction(preds_logit, sms_test_labels)
perf <- performance(prediction_logit, measure = "tpr", x.measure = "fpr")

auc <- performance(prediction_logit, measure = "auc")
auc <- auc@y.values[[1]]
auc

## [1] 0.9835805

roc_logit <- data.frame(fpr=unlist(perf@x.values), tpr=unlist(perf@y.values))

ggplot(roc_logit, aes(x=fpr, ymin=0, ymax=tpr)) +
  geom_ribbon(alpha=0.1) +
  geom_line(aes(y=tpr)) +
  geom_abline(slope=1, intercept=0, linetype='dashed') +
  ggtitle("ROC Curve For Logistic Regression") +
  ylab('True Positive Rate') +
  xlab('False Positive Rate')

Naive Bayes

# creating training and test datasets
sms_naive_train <- sms_dtm[index, ]
sms_naive_test  <- sms_dtm[-index, ]

# save the labels
sms_naive_train_labels <- sms_raw[index, ]$type
sms_naive_test_labels  <- sms_raw[-index, ]$type

# check for the proportion of train and test
prop.table(table(sms_naive_train_labels))

## sms_naive_train_labels
##       ham      spam 
## 0.8682258 0.1317742

prop.table(table(sms_naive_test_labels))

## sms_naive_test_labels
##       ham      spam 
## 0.8552158 0.1447842

# save frequently-appearing terms to a character vector
sms_freq_words <- findFreqTerms(sms_naive_train, 5)

# create DTMs with only the frequent terms
sms_dtm_freq_train <- sms_naive_train[ , sms_freq_words]
sms_dtm_freq_test <- sms_naive_test[ , sms_freq_words]

# convert counts to a factor
convert_counts <- function(x) {
  x <- ifelse(x > 0, "Yes", "No")
}

# apply() convert_counts() to columns of train/test data
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test  <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)

#apply model
sms_classifier <- naiveBayes(sms_train, sms_naive_train_labels)

Check performance of Naive Bayes

#confusion matrix
sms_test_pred_confusion <- predict(sms_classifier, sms_test)
confusionMatrix(sms_naive_test_labels, sms_test_pred_confusion)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction ham spam
##       ham  943    8
##       spam  15  146
##                                           
##                Accuracy : 0.9793          
##                  95% CI : (0.9691, 0.9868)
##     No Information Rate : 0.8615          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9149          
##  Mcnemar's Test P-Value : 0.2109          
##                                           
##             Sensitivity : 0.9843          
##             Specificity : 0.9481          
##          Pos Pred Value : 0.9916          
##          Neg Pred Value : 0.9068          
##              Prevalence : 0.8615          
##          Detection Rate : 0.8480          
##    Detection Prevalence : 0.8552          
##       Balanced Accuracy : 0.9662          
##                                           
##        'Positive' Class : ham             
##

sms_naive_pred <- predict(sms_classifier, sms_test,type = 'raw')

#auc and roc curve
prediction_naive <- prediction(sms_naive_pred[,"spam"], sms_naive_test_labels)

perf_naive <- performance(prediction_naive, measure = "tpr", x.measure = "fpr")

auc_naive <- performance(prediction_naive, measure = "auc")
auc_naive <- auc_naive@y.values[[1]]
auc_naive

## [1] 0.9850795

roc_naive <- data.frame(fpr=unlist(perf_naive@x.values), tpr=unlist(perf_naive@y.values))

ggplot(roc_naive, aes(x=fpr, ymin=0, ymax=tpr)) +
  geom_ribbon(alpha=0.1) +
  geom_line(aes(y=tpr)) +
  geom_abline(slope=1, intercept=0, linetype='dashed') +
  ggtitle("ROC Curve For Naive Bayes") +
  ylab('True Positive Rate') +
  xlab('False Positive Rate')

Comparing Logistic Regression and Naive Bayes

roc_naive$model <- 'Naive Bayes'
roc_logit$model <- 'Logistic Regression'

bind1 <- rbind(roc_naive, roc_logit)
bind1 %>%
  ggplot(data=., aes(x=fpr, y=tpr, linetype=model, color=model)) +
  geom_line() +
  geom_abline(slope=1, intercept=0, linetype='dashed')  +
  theme(legend.position=c(0.8,0.3), legend.background = element_rect(fill = 'grey90')) +
  ggtitle("Logistic Regression vs. Naive Bayes") +
  ylab('True Positive Rate') +
  xlab('False Positive Rate')

Improving Model

Use bigram to improve our accuracy. NGramTokenizer from RWeka package splits strings into n-grams with given minimal and miximal numbers of gram.

# tokenizer
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))

control <- list(
  tokenize=BigramTokenizer,
  bounds = list(global = c(5, 500))
  
)

# creating dtm using 'BigramTokenzier'
sms_dtm_bi <- Corpus(VectorSource(sms_raw$text)) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace) %>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(stemDocument) %>%
  DocumentTermMatrix(control=control)

# create training and test data
sms_bitr_matrix <- as.matrix(sms_dtm_bi[index, ])
sms_bite_matrix <- as.matrix(sms_dtm_bi[-index, ])

sms_bi_train <- Matrix(sms_bitr_matrix, sparse = T)
sms_bi_test  <- Matrix(sms_bite_matrix, sparse = T)

# labels
sms_trainb_labels <- sms_raw[index, ]$type
sms_testb_labels  <- sms_raw[-index, ]$type

# train the model - using Lasso method
glmnet_bi <- cv.glmnet(sms_bi_train, sms_trainb_labels, family='binomial', alpha=0)

# generate predictions
preds_bi <- predict(glmnet_bi, sms_bi_test, type='response', s="lambda.min")

# put results into dataframe for plotting
results_bi <- data.frame(pred=preds_bi, actual = sms_testb_labels)

ggplot(results_bi, aes(x=preds_bi, fill = actual)) + geom_density(alpha = 0.2)

Check performance of bigram method

preds_newbi <- rep('ham',length(preds_bi))
preds_newbi[preds_bi>=0.5] <- 'spam'

#confusion matrix
confusionMatrix(sms_testb_labels, preds_newbi)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction ham spam
##       ham  945    6
##       spam  26  135
##                                           
##                Accuracy : 0.9712          
##                  95% CI : (0.9596, 0.9802)
##     No Information Rate : 0.8732          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8775          
##  Mcnemar's Test P-Value : 0.0007829       
##                                           
##             Sensitivity : 0.9732          
##             Specificity : 0.9574          
##          Pos Pred Value : 0.9937          
##          Neg Pred Value : 0.8385          
##              Prevalence : 0.8732          
##          Detection Rate : 0.8498          
##    Detection Prevalence : 0.8552          
##       Balanced Accuracy : 0.9653          
##                                           
##        'Positive' Class : ham             
##

# auc and roc curve
prediction_bi <- prediction(preds_bi, sms_testb_labels)
perf_bi <- performance(prediction_bi, measure = "tpr", x.measure = "fpr")

auc_bi <- performance(prediction_bi, measure = "auc")
auc_bi <- auc_bi@y.values[[1]]
auc_bi

## [1] 0.9792634

roc_bi <- data.frame(fpr=unlist(perf_bi@x.values), tpr=unlist(perf_bi@y.values))

ggplot(roc_bi, aes(x=fpr, ymin=0, ymax=tpr)) +
  geom_ribbon(alpha=0.1) +
  geom_line(aes(y=tpr)) +
  geom_abline(slope=1, intercept=0, linetype='dashed') +
  ggtitle("ROC Curve") +
  ylab('True Positive Rate') +
  xlab('False Positive Rate')

Comparing Logistic Regression, Naive Bayes and bigram

#comparing three roc curves
roc_bi$model <- 'With Bigram'
roc_logit$model <- 'Logistic Regression'
roc_naive$model <- 'Naive Bayes'
bind2 <- rbind(roc_bi, roc_naive, roc_logit)

bind2 %>%
  ggplot(data=., aes(x=fpr, y=tpr, linetype=model, color=model)) +
  geom_line() +
  geom_abline(slope=1, intercept=0, linetype='dashed')  +
  theme(legend.position=c(0.8,0.3), legend.background = element_rect(fill = 'grey90')) +
  ggtitle("Logistic Regression vs. Naive Bayes vs. Bigram model") +
  ylab('True Positive Rate') +
  xlab('False Positive Rate')

Logistic Regression and Naive Bayes with the SMS Spam Collection

Kyeong Yang

Libraries

Exploring and preparing the data

Logistic Regression

Creating train and test data

Train model

Check performance of Logistic Regression

Naive Bayes

Check performance of Naive Bayes

Comparing Logistic Regression and Naive Bayes

Improving Model

Check performance of bigram method

Comparing Logistic Regression, Naive Bayes and bigram