#library
library(tm)
## Loading required package: NLP
library(SnowballC)
library(Matrix)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(caret)
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(glmnet) #logistic regression
## Loading required package: foreach
## Loaded glmnet 2.0-9
library(e1071) #naive bayes
set.seed(123)
# read the sms data into the sms data frame
sms_raw <- read.csv("sms_spam.csv", stringsAsFactors = FALSE)
# examine the structure of the sms data
str(sms_raw)
## 'data.frame': 5559 obs. of 2 variables:
## $ type: chr "ham" "ham" "ham" "spam" ...
## $ text: chr "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out"| __truncated__ ...
# convert spam/ham to factor.
sms_raw$type <- factor(sms_raw$type)
# create DTM using TM package
sms_dtm <- Corpus(VectorSource(sms_raw$text)) %>%
tm_map(removeNumbers) %>%
tm_map(stripWhitespace) %>%
tm_map(removeWords, stopwords()) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removePunctuation) %>%
tm_map(stemDocument) %>%
DocumentTermMatrix()
#creating train and test data
index = sample(5559, 5559*0.8)
sms_train_matrix <- as.matrix(sms_dtm[index, ])
sms_test_matrix <- as.matrix(sms_dtm[-index, ])
sms_dtm_train <- Matrix(sms_train_matrix, sparse = T)
sms_dtm_test <- Matrix(sms_test_matrix, sparse = T)
# save the labels
sms_train_labels <- sms_raw[index, ]$type
sms_test_labels <- sms_raw[-index, ]$type
# check for the proportion of train and test
prop.table(table(sms_train_labels))
## sms_train_labels
## ham spam
## 0.8682258 0.1317742
prop.table(table(sms_test_labels))
## sms_test_labels
## ham spam
## 0.8552158 0.1447842
#create model and prediction
fit_ridge <- glmnet(sms_dtm_train, sms_train_labels, family = 'binomial',alpha = 0)
fit_lasso <- glmnet(sms_dtm_train, sms_train_labels, family = 'binomial',alpha = 1)
pred <- predict(fit_ridge, sms_dtm_test, type = 'response')
head(pred[,1:11])
## s0 s1 s2 s3 s4 s5 s6
## 25 0.1317742 0.1310068 0.1309697 0.1309309 0.1308902 0.1308476 0.1308029
## 37 0.1317742 0.1317430 0.1317408 0.1317383 0.1317357 0.1317328 0.1317297
## 40 0.1317742 0.1309899 0.1309521 0.1309125 0.1308711 0.1308276 0.1307820
## 43 0.1317742 0.1311696 0.1311402 0.1311094 0.1310770 0.1310431 0.1310074
## 46 0.1317742 0.1309396 0.1308994 0.1308573 0.1308132 0.1307669 0.1307184
## 51 0.1317742 0.1312852 0.1312613 0.1312362 0.1312099 0.1311822 0.1311532
## s7 s8 s9 s10
## 25 0.1307559 0.1307067 0.1306551 0.1306010
## 37 0.1317264 0.1317228 0.1317188 0.1317146
## 40 0.1307342 0.1306841 0.1306315 0.1305764
## 43 0.1309700 0.1309308 0.1308895 0.1308463
## 46 0.1306676 0.1306142 0.1305583 0.1304996
## 51 0.1311227 0.1310907 0.1310570 0.1310216
head(coef(fit_ridge, s= 1.52))
## 6 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -2.41854617
## check -0.03471553
## good -0.02413085
## hope -0.03213501
## just 0.03199336
## week 0.10763024
glmnet_predict <- predict(fit_ridge, newx=sms_dtm_test,s=c(1.52, 0.1))
head(glmnet_predict)
## 1 2
## 25 -2.500983 -2.522771
## 37 -2.184031 -2.197195
## 40 -2.473129 -2.493386
## 43 -2.462126 -2.483747
## 46 -2.523148 -2.545249
## 51 -2.379942 -2.398343
#plotting Lasso and Ridge method
plot(fit_ridge, xlab = 'Ridge Method')
abline(0,0)
plot(fit_lasso, xlab = 'Lasso Method')
abline(0,0)
#lambda, comparison between lasso and ridge
glmnet_fit <- cv.glmnet(x=sms_dtm_train , y=sms_train_labels, family='binomial', alpha=0)
head(coef(glmnet_fit, s='lambda.min'))
## 6 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -2.41925304
## check -0.03475428
## good -0.02415112
## hope -0.03216682
## just 0.03202778
## week 0.10772625
plot(glmnet_fit)
glmnet_fit$lambda.min
## [1] 1.517223
glmnet_lasso <- cv.glmnet(x=sms_dtm_train , y=sms_train_labels, family='binomial', alpha=1)
plot(glmnet_lasso)
glmnet_lasso$lambda.min
## [1] 0.002306044
# Generate predictions
preds_logit <- predict(glmnet_fit, newx=sms_dtm_test, type='response', s='lambda.min')
head(preds_logit)
## 1
## 25 0.07573374
## 37 0.10115024
## 40 0.07771065
## 43 0.07849947
## 46 0.07419597
## 51 0.08466318
summary(preds_logit)
## 1
## Min. :0.04811
## 1st Qu.:0.07278
## Median :0.07909
## Mean :0.12678
## 3rd Qu.:0.09089
## Max. :0.70894
preds_newlogit <- rep('ham',length(preds_logit))
preds_newlogit[preds_logit>=0.5] <- 'spam'
#confusion matrix
confusionMatrix(sms_test_labels, preds_newlogit)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ham spam
## ham 951 0
## spam 97 64
##
## Accuracy : 0.9128
## 95% CI : (0.8946, 0.9287)
## No Information Rate : 0.9424
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.5302
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9074
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.3975
## Prevalence : 0.9424
## Detection Rate : 0.8552
## Detection Prevalence : 0.8552
## Balanced Accuracy : 0.9537
##
## 'Positive' Class : ham
##
# Put results into dataframe for plotting.
results <- data.frame(pred=preds_logit, actual=sms_test_labels )
ggplot(results, aes(x=preds_logit, fill=actual)) + geom_density(alpha = 0.2)
prediction_logit <- prediction(preds_logit, sms_test_labels)
perf <- performance(prediction_logit, measure = "tpr", x.measure = "fpr")
auc <- performance(prediction_logit, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.9835805
roc_logit <- data.frame(fpr=unlist(perf@x.values), tpr=unlist(perf@y.values))
ggplot(roc_logit, aes(x=fpr, ymin=0, ymax=tpr)) +
geom_ribbon(alpha=0.1) +
geom_line(aes(y=tpr)) +
geom_abline(slope=1, intercept=0, linetype='dashed') +
ggtitle("ROC Curve For Logistic Regression") +
ylab('True Positive Rate') +
xlab('False Positive Rate')
# creating training and test datasets
sms_naive_train <- sms_dtm[index, ]
sms_naive_test <- sms_dtm[-index, ]
# save the labels
sms_naive_train_labels <- sms_raw[index, ]$type
sms_naive_test_labels <- sms_raw[-index, ]$type
# check for the proportion of train and test
prop.table(table(sms_naive_train_labels))
## sms_naive_train_labels
## ham spam
## 0.8682258 0.1317742
prop.table(table(sms_naive_test_labels))
## sms_naive_test_labels
## ham spam
## 0.8552158 0.1447842
# save frequently-appearing terms to a character vector
sms_freq_words <- findFreqTerms(sms_naive_train, 5)
# create DTMs with only the frequent terms
sms_dtm_freq_train <- sms_naive_train[ , sms_freq_words]
sms_dtm_freq_test <- sms_naive_test[ , sms_freq_words]
# convert counts to a factor
convert_counts <- function(x) {
x <- ifelse(x > 0, "Yes", "No")
}
# apply() convert_counts() to columns of train/test data
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)
#apply model
sms_classifier <- naiveBayes(sms_train, sms_naive_train_labels)
#confusion matrix
sms_test_pred_confusion <- predict(sms_classifier, sms_test)
confusionMatrix(sms_naive_test_labels, sms_test_pred_confusion)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ham spam
## ham 943 8
## spam 15 146
##
## Accuracy : 0.9793
## 95% CI : (0.9691, 0.9868)
## No Information Rate : 0.8615
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9149
## Mcnemar's Test P-Value : 0.2109
##
## Sensitivity : 0.9843
## Specificity : 0.9481
## Pos Pred Value : 0.9916
## Neg Pred Value : 0.9068
## Prevalence : 0.8615
## Detection Rate : 0.8480
## Detection Prevalence : 0.8552
## Balanced Accuracy : 0.9662
##
## 'Positive' Class : ham
##
sms_naive_pred <- predict(sms_classifier, sms_test,type = 'raw')
#auc and roc curve
prediction_naive <- prediction(sms_naive_pred[,"spam"], sms_naive_test_labels)
perf_naive <- performance(prediction_naive, measure = "tpr", x.measure = "fpr")
auc_naive <- performance(prediction_naive, measure = "auc")
auc_naive <- auc_naive@y.values[[1]]
auc_naive
## [1] 0.9850795
roc_naive <- data.frame(fpr=unlist(perf_naive@x.values), tpr=unlist(perf_naive@y.values))
ggplot(roc_naive, aes(x=fpr, ymin=0, ymax=tpr)) +
geom_ribbon(alpha=0.1) +
geom_line(aes(y=tpr)) +
geom_abline(slope=1, intercept=0, linetype='dashed') +
ggtitle("ROC Curve For Naive Bayes") +
ylab('True Positive Rate') +
xlab('False Positive Rate')
roc_naive$model <- 'Naive Bayes'
roc_logit$model <- 'Logistic Regression'
bind1 <- rbind(roc_naive, roc_logit)
bind1 %>%
ggplot(data=., aes(x=fpr, y=tpr, linetype=model, color=model)) +
geom_line() +
geom_abline(slope=1, intercept=0, linetype='dashed') +
theme(legend.position=c(0.8,0.3), legend.background = element_rect(fill = 'grey90')) +
ggtitle("Logistic Regression vs. Naive Bayes") +
ylab('True Positive Rate') +
xlab('False Positive Rate')
Use bigram to improve our accuracy. NGramTokenizer from RWeka package splits strings into n-grams with given minimal and miximal numbers of gram.
# tokenizer
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
control <- list(
tokenize=BigramTokenizer,
bounds = list(global = c(5, 500))
)
# creating dtm using 'BigramTokenzier'
sms_dtm_bi <- Corpus(VectorSource(sms_raw$text)) %>%
tm_map(removeNumbers) %>%
tm_map(stripWhitespace) %>%
tm_map(removeWords, stopwords()) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removePunctuation) %>%
tm_map(stemDocument) %>%
DocumentTermMatrix(control=control)
# create training and test data
sms_bitr_matrix <- as.matrix(sms_dtm_bi[index, ])
sms_bite_matrix <- as.matrix(sms_dtm_bi[-index, ])
sms_bi_train <- Matrix(sms_bitr_matrix, sparse = T)
sms_bi_test <- Matrix(sms_bite_matrix, sparse = T)
# labels
sms_trainb_labels <- sms_raw[index, ]$type
sms_testb_labels <- sms_raw[-index, ]$type
# train the model - using Lasso method
glmnet_bi <- cv.glmnet(sms_bi_train, sms_trainb_labels, family='binomial', alpha=0)
# generate predictions
preds_bi <- predict(glmnet_bi, sms_bi_test, type='response', s="lambda.min")
# put results into dataframe for plotting
results_bi <- data.frame(pred=preds_bi, actual = sms_testb_labels)
ggplot(results_bi, aes(x=preds_bi, fill = actual)) + geom_density(alpha = 0.2)
preds_newbi <- rep('ham',length(preds_bi))
preds_newbi[preds_bi>=0.5] <- 'spam'
#confusion matrix
confusionMatrix(sms_testb_labels, preds_newbi)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ham spam
## ham 945 6
## spam 26 135
##
## Accuracy : 0.9712
## 95% CI : (0.9596, 0.9802)
## No Information Rate : 0.8732
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8775
## Mcnemar's Test P-Value : 0.0007829
##
## Sensitivity : 0.9732
## Specificity : 0.9574
## Pos Pred Value : 0.9937
## Neg Pred Value : 0.8385
## Prevalence : 0.8732
## Detection Rate : 0.8498
## Detection Prevalence : 0.8552
## Balanced Accuracy : 0.9653
##
## 'Positive' Class : ham
##
# auc and roc curve
prediction_bi <- prediction(preds_bi, sms_testb_labels)
perf_bi <- performance(prediction_bi, measure = "tpr", x.measure = "fpr")
auc_bi <- performance(prediction_bi, measure = "auc")
auc_bi <- auc_bi@y.values[[1]]
auc_bi
## [1] 0.9792634
roc_bi <- data.frame(fpr=unlist(perf_bi@x.values), tpr=unlist(perf_bi@y.values))
ggplot(roc_bi, aes(x=fpr, ymin=0, ymax=tpr)) +
geom_ribbon(alpha=0.1) +
geom_line(aes(y=tpr)) +
geom_abline(slope=1, intercept=0, linetype='dashed') +
ggtitle("ROC Curve") +
ylab('True Positive Rate') +
xlab('False Positive Rate')
#comparing three roc curves
roc_bi$model <- 'With Bigram'
roc_logit$model <- 'Logistic Regression'
roc_naive$model <- 'Naive Bayes'
bind2 <- rbind(roc_bi, roc_naive, roc_logit)
bind2 %>%
ggplot(data=., aes(x=fpr, y=tpr, linetype=model, color=model)) +
geom_line() +
geom_abline(slope=1, intercept=0, linetype='dashed') +
theme(legend.position=c(0.8,0.3), legend.background = element_rect(fill = 'grey90')) +
ggtitle("Logistic Regression vs. Naive Bayes vs. Bigram model") +
ylab('True Positive Rate') +
xlab('False Positive Rate')