library(ElemStatLearn)
library(e1071)
library(ROCR)
## Loading required package: gplots
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
library(reshape)

theme_set(theme_bw())

# partially adapted from http://joshwalters.com/2012/11/27/naive-bayes-classification-in-r.html

# create train/text indices from toy spam data
ndx <- sample(nrow(spam), floor(nrow(spam) * 0.9))
train <- spam[ndx,]
test <- spam[-ndx,]

# split into train/test data
xTrain <- train[,-58]
yTrain <- train$spam
xTest <- test[,-58]
yTest <- test$spam

# fit naive bayes model with default params
model <- naiveBayes(xTrain, yTrain)

# look at confusion matrix
table(predict(model, xTest), yTest)
##        yTest
##         email spam
##   email   159   12
##   spam    123  167
# plot histogram of predicted probabilities
# note overconfident predictions
probs <- predict(model, xTest, type="raw")
qplot(x=probs[, "spam"], geom="histogram")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-1

# plot ROC curve
pred <- prediction(probs[, "spam"], yTest)
perf_nb <- performance(pred, measure='tpr', x.measure='fpr')
plot(perf_nb)

plot of chunk unnamed-chunk-1

performance(pred, 'auc')
## An object of class "performance"
## Slot "x.name":
## [1] "None"
## 
## Slot "y.name":
## [1] "Area under the ROC curve"
## 
## Slot "alpha.name":
## [1] "none"
## 
## Slot "x.values":
## list()
## 
## Slot "y.values":
## [[1]]
## [1] 0.8825
## 
## 
## Slot "alpha.values":
## list()
# sample pos/neg pairs
predicted <- probs[, "spam"]
actual <- yTest == "spam"
ndx_pos <- sample(which(actual == 1), size=100, replace=T)
ndx_neg <- sample(which(actual == 0), size=100, replace=T)
mean(predicted[ndx_pos] > predicted[ndx_neg])
## [1] 0.84
# plot calibration
# note overconfidence
data.frame(predicted=probs[, "spam"], actual=yTest) %>%
  group_by(predicted=round(predicted*10)/10) %>%
  summarize(num=n(), actual=mean(actual == "spam")) %>%
  ggplot(data=., aes(x=predicted, y=actual, size=num)) +
  geom_point() +
  geom_abline(a=1, b=0, linetype=2) +
  scale_x_continuous(labels=percent, lim=c(0,1)) +
  scale_y_continuous(labels=percent, lim=c(0,1))

plot of chunk unnamed-chunk-1

# fit logistic regression
model <- glm(spam ~ ., data=spam[ndx, ], family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# look at confusion matrix
table(predict(model, spam[-ndx, ]) > 0, spam[-ndx, "spam"])
##        
##         email spam
##   FALSE   270   16
##   TRUE     12  163
# plot histogram of predicted probabilities
probs <- predict(model, spam[-ndx, ], type="response")
qplot(x=probs, geom="histogram")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-1

# plot ROC curve
pred <- prediction(probs, yTest)
perf_lr <- performance(pred, measure='tpr', x.measure='fpr')
plot(perf_lr)

plot of chunk unnamed-chunk-1

performance(pred, 'auc')
## An object of class "performance"
## Slot "x.name":
## [1] "None"
## 
## Slot "y.name":
## [1] "Area under the ROC curve"
## 
## Slot "alpha.name":
## [1] "none"
## 
## Slot "x.values":
## list()
## 
## Slot "y.values":
## [[1]]
## [1] 0.9671
## 
## 
## Slot "alpha.values":
## list()
# plot(performance(pred, "cal"))

# plot calibration
data.frame(predicted=probs, actual=yTest) %>%
  group_by(predicted=round(predicted*10)/10) %>%
  summarize(num=n(), actual=mean(actual == "spam")) %>%
  ggplot(data=., aes(x=predicted, y=actual, size=num)) +
  geom_point() +
  geom_abline(a=1, b=0, linetype=2) +
  scale_x_continuous(labels=percent, lim=c(0,1)) +
  scale_y_continuous(labels=percent, lim=c(0,1))

plot of chunk unnamed-chunk-1

# plot distribution of predicted labels split by actual label
data.frame(predicted=probs, actual=yTest) %>%
  ggplot(data=., aes(x=probs)) +
  geom_density(aes(fill=yTest), alpha=0.5) +
  xlab('Predicted probability of spam') +
  scale_fill_discrete(name="Actual label") +
  theme(legend.position=c(0.8,0.8))

plot of chunk unnamed-chunk-1

# plot ROC for each method
roc_nb <- data.frame(fpr=unlist(perf_nb@x.values), tpr=unlist(perf_nb@y.values))
roc_nb$method <- "naive bayes"
roc_lr <- data.frame(fpr=unlist(perf_lr@x.values), tpr=unlist(perf_lr@y.values))
roc_lr$method <- "logistic regression"
rbind(roc_nb, roc_lr) %>%
  ggplot(data=., aes(x=fpr, y=tpr, linetype=method, color=method)) + 
  geom_line() +
  geom_abline(a=1, b=0, linetype=2) +
  scale_x_continuous(labels=percent, lim=c(0,1)) +
  scale_y_continuous(labels=percent, lim=c(0,1)) +
  theme(legend.position=c(0.8,0.2), legend.title=element_blank())

plot of chunk unnamed-chunk-1

# use sampling to approximate the AUC
# sample pairs of randomly selected positive and negative examples 
# compute the fraction of time the positive example scored higher than the negative one
predicted <- probs
actual <- yTest == "spam"
ndx_pos <- sample(which(actual == 1), size=100, replace=T)
ndx_neg <- sample(which(actual == 0), size=100, replace=T)
mean(predicted[ndx_pos] > predicted[ndx_neg])
## [1] 0.97