library(ElemStatLearn)
library(e1071)
library(ROCR)
## Loading required package: gplots
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
library(reshape)
theme_set(theme_bw())
# partially adapted from http://joshwalters.com/2012/11/27/naive-bayes-classification-in-r.html
# create train/test indices for a 90/10 split of the spam data
ndx <- sample(nrow(spam), floor(nrow(spam) * 0.9))
train <- spam[ndx,]
test <- spam[-ndx,]
# separate features (first 57 columns) from labels (the spam column)
xTrain <- train[,-58]
yTrain <- train$spam
xTest <- test[,-58]
yTest <- test$spam
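# (added sketch, not in the original analysis) quick sanity check that the
# class balance is similar in the train and test splits
prop.table(table(yTrain))
prop.table(table(yTest))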
# fit naive bayes model with default params
model <- naiveBayes(xTrain, yTrain)
# look at confusion matrix
table(predict(model, xTest), yTest)
##        yTest
##         email spam
##   email   159   12
##   spam    123  167
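# (added sketch, not in the original analysis) summarize the confusion matrix
# above into accuracy, plus precision and recall for the spam class
conf <- table(predicted=predict(model, xTest), actual=yTest)
accuracy <- sum(diag(conf)) / sum(conf)
precision <- conf["spam", "spam"] / sum(conf["spam", ])  # of messages flagged spam, fraction truly spam
recall <- conf["spam", "spam"] / sum(conf[, "spam"])     # of truly spam messages, fraction flagged
c(accuracy=accuracy, precision=precision, recall=recall)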
# plot histogram of predicted probabilities
# note the overconfident predictions piled up near 0 and 1
probs <- predict(model, xTest, type="raw")
qplot(x=probs[, "spam"], geom="histogram")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
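# (added sketch) the message above suggests an explicit binwidth; one possible choice:
qplot(x=probs[, "spam"], geom="histogram", binwidth=0.05)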

# plot ROC curve
pred <- prediction(probs[, "spam"], yTest)
perf_nb <- performance(pred, measure='tpr', x.measure='fpr')
plot(perf_nb)

performance(pred, 'auc')
## An object of class "performance"
## Slot "x.name":
## [1] "None"
##
## Slot "y.name":
## [1] "Area under the ROC curve"
##
## Slot "alpha.name":
## [1] "none"
##
## Slot "x.values":
## list()
##
## Slot "y.values":
## [[1]]
## [1] 0.8825
##
##
## Slot "alpha.values":
## list()
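# (added sketch) the scalar AUC can be pulled out of the y.values slot of the
# ROCR performance object printed above
unlist(performance(pred, "auc")@y.values)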
# approximate the AUC by sampling positive/negative pairs:
# the AUC is the probability that a randomly chosen positive example scores higher than a randomly chosen negative one
predicted <- probs[, "spam"]
actual <- yTest == "spam"
ndx_pos <- sample(which(actual == 1), size=100, replace=T)
ndx_neg <- sample(which(actual == 0), size=100, replace=T)
mean(predicted[ndx_pos] > predicted[ndx_neg])
## [1] 0.84
# plot calibration: bucket predictions and compare each bucket's average prediction to its actual spam rate
# note the overconfidence: points far from the diagonal
data.frame(predicted=probs[, "spam"], actual=yTest) %>%
group_by(predicted=round(predicted*10)/10) %>%
summarize(num=n(), actual=mean(actual == "spam")) %>%
ggplot(data=., aes(x=predicted, y=actual, size=num)) +
geom_point() +
geom_abline(slope=1, intercept=0, linetype=2) +
scale_x_continuous(labels=percent, lim=c(0,1)) +
scale_y_continuous(labels=percent, lim=c(0,1))

# fit logistic regression
model <- glm(spam ~ ., data=spam[ndx, ], family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# look at confusion matrix
table(predict(model, spam[-ndx, ]) > 0, spam[-ndx, "spam"])
##
##         email spam
##   FALSE   270   16
##   TRUE     12  163
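# (added sketch) the same confusion matrix on the probability scale:
# a log-odds cutoff of 0 is equivalent to a probability cutoff of 0.5
table(predict(model, spam[-ndx, ], type="response") > 0.5, spam[-ndx, "spam"])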
# plot histogram of predicted probabilities
probs <- predict(model, spam[-ndx, ], type="response")
qplot(x=probs, geom="histogram")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# plot ROC curve
pred <- prediction(probs, yTest)
perf_lr <- performance(pred, measure='tpr', x.measure='fpr')
plot(perf_lr)

performance(pred, 'auc')
## An object of class "performance"
## Slot "x.name":
## [1] "None"
##
## Slot "y.name":
## [1] "Area under the ROC curve"
##
## Slot "alpha.name":
## [1] "none"
##
## Slot "x.values":
## list()
##
## Slot "y.values":
## [[1]]
## [1] 0.9671
##
##
## Slot "alpha.values":
## list()
# plot(performance(pred, "cal"))
# plot calibration
data.frame(predicted=probs, actual=yTest) %>%
group_by(predicted=round(predicted*10)/10) %>%
summarize(num=n(), actual=mean(actual == "spam")) %>%
ggplot(data=., aes(x=predicted, y=actual, size=num)) +
geom_point() +
geom_abline(slope=1, intercept=0, linetype=2) +
scale_x_continuous(labels=percent, lim=c(0,1)) +
scale_y_continuous(labels=percent, lim=c(0,1))
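# (added sketch) an alternative calibration summary using equal-count bins via
# dplyr::ntile, instead of the fixed-width buckets plotted above
data.frame(predicted=probs, actual=yTest) %>%
  group_by(bin=ntile(predicted, 10)) %>%
  summarize(num=n(), avg_predicted=mean(predicted), frac_spam=mean(actual == "spam"))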

# plot distribution of predicted probabilities split by actual label
data.frame(predicted=probs, actual=yTest) %>%
ggplot(data=., aes(x=predicted)) +
geom_density(aes(fill=actual), alpha=0.5) +
xlab('Predicted probability of spam') +
scale_fill_discrete(name="Actual label") +
theme(legend.position=c(0.8,0.8))

# plot ROC for each method
roc_nb <- data.frame(fpr=unlist(perf_nb@x.values), tpr=unlist(perf_nb@y.values))
roc_nb$method <- "naive bayes"
roc_lr <- data.frame(fpr=unlist(perf_lr@x.values), tpr=unlist(perf_lr@y.values))
roc_lr$method <- "logistic regression"
rbind(roc_nb, roc_lr) %>%
ggplot(data=., aes(x=fpr, y=tpr, linetype=method, color=method)) +
geom_line() +
geom_abline(slope=1, intercept=0, linetype=2) +
scale_x_continuous(labels=percent, lim=c(0,1)) +
scale_y_continuous(labels=percent, lim=c(0,1)) +
theme(legend.position=c(0.8,0.2), legend.title=element_blank())
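# (added sketch) scalar AUCs for the two curves via trapezoidal integration of the
# ROC points extracted above; ROCR returns points ordered by decreasing cutoff,
# so fpr is non-decreasing and the trapezoid rule applies directly
trapezoid_auc <- function(roc)
  sum(diff(roc$fpr) * (head(roc$tpr, -1) + tail(roc$tpr, -1)) / 2)
data.frame(method=c("naive bayes", "logistic regression"),
           auc=c(trapezoid_auc(roc_nb), trapezoid_auc(roc_lr)))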

# use sampling to approximate the AUC
# sample pairs of randomly selected positive and negative examples
# compute the fraction of time the positive example scored higher than the negative one
predicted <- probs
actual <- yTest == "spam"
ndx_pos <- sample(which(actual == 1), size=100, replace=T)
ndx_neg <- sample(which(actual == 0), size=100, replace=T)
mean(predicted[ndx_pos] > predicted[ndx_neg])
## [1] 0.97
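# (added sketch) with more sampled pairs the Monte Carlo estimate converges
# toward the exact ROCR AUC reported above (ties between scores are ignored here)
n_pairs <- 100000
mean(predicted[sample(which(actual), n_pairs, replace=T)] >
     predicted[sample(which(!actual), n_pairs, replace=T)])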