AUC/Overfitting

I.

# Load the spam data set and fix the random seed
library(kernlab)
data(spam)
set.seed(333)

# Density of the 'your' word frequency: nonspam in blue, spam in red
plot(
  density(spam$your[spam$type == 'nonspam']),
  col = 'blue',
  main = '',
  xlab = '',
  ylab = "Frequency of word 'your'",
  frame.plot = FALSE
)
lines(density(spam$your[spam$type == 'spam']), col = 'red')

# Benchmark cut-off at v = 0.5: almost all of the blue (nonspam) density
# lies to the left of this vertical line
abline(v = 0.5, lwd = 2)

# Simple classification rule: a 'your' frequency of 0.5 or more means spam
prediction <- ifelse(spam$your >= 0.5, 'spam', 'nonspam')
# Confusion table expressed as proportions of all e-mails
round(table(prediction, spam$type) / nrow(spam), 2)

          
prediction nonspam spam
   nonspam    0.46 0.10
   spam       0.15 0.29

# Accuracy = sum of the diagonal of the proportion table
# (nonspam predicted as nonspam + spam predicted as spam)
0.46 + 0.29

[1] 0.75

II.

# draw a random subset of 10 rows from the spam data set
smallSpam <- spam[sample(nrow(spam), size = 10), ]
# colour code per row: spam = 2 and ham = 1
spamLabel <- as.numeric(smallSpam$type == "spam") + 1
# scatter the capitalAve values of the subset, coloured by class (2 vs 1)
plot(
  smallSpam$capitalAve,
  col = spamLabel,
  ylab = 'average capital letters in the e-mail'
)

# first rule (over-fitting: hand-picked bands meant to capture all variation
# in the small sample)
# x: numeric vector of capitalAve values
# returns a character vector of "spam" / "nonspam" labels, NA where x is NA
rule1 <- function(x) {
  # bands labelled "spam": x > 2.7 and 2.4 <= x <= 2.45
  spam_band <- (x > 2.7) | (x >= 2.4 & x <= 2.45)
  # bands labelled "nonspam": x < 2.4 and 2.45 < x <= 2.7
  ham_band <- (x < 2.4) | (x > 2.45 & x <= 2.7)

  labels <- rep(NA_character_, length(x))
  labels[spam_band] <- "spam"
  labels[ham_band] <- "nonspam"
  labels
}
# tabulate the predictions of algorithm 1 against the true labels of the
# small sample (in-sample error -> no error in this case)
# this is a case of overfitting: the rule was tuned to these 10 rows, so its
# in-sample accuracy is an optimistic guide to how it predicts new samples
table(rule1(smallSpam$capitalAve),smallSpam$type)

         
          nonspam spam
  nonspam       5    0
  spam          0    5

# second rule (simple: one threshold on capitalAve)
# x: numeric vector of capitalAve values
# returns a character vector: "spam" above 2.8, "nonspam" at or below 2.8,
# NA where x is NA
rule2 <- function(x) {
  labels <- rep(NA_character_, length(x))
  above_cut <- x > 2.8
  at_or_below_cut <- x <= 2.8
  labels[above_cut] <- "spam"
  labels[at_or_below_cut] <- "nonspam"
  labels
}
# tabulate the predictions of algorithm 2 against the true labels of the
# small sample (in-sample error -> 10% in this case: 1 of 10 misclassified)
table(rule2(smallSpam$capitalAve),smallSpam$type)

         
          nonspam spam
  nonspam       5    1
  spam          0    4

# tabulate out-of-sample error for algorithm 1:
# apply the rule to every row of the full spam data set and
# cross-tabulate predicted label (rows) against true label (columns)
table(rule1(spam$capitalAve),spam$type)

         
          nonspam spam
  nonspam    2141  588
  spam        647 1225

# tabulate out-of-sample error for algorithm 2:
# apply the rule to every row of the full spam data set and
# cross-tabulate predicted label (rows) against true label (columns)
table(rule2(spam$capitalAve),spam$type)

         
          nonspam spam
  nonspam    2224  642
  spam        564 1171

# accuracy (share correct) and total number correct for algorithm 1 and 2,
# one row per rule, evaluated on the full spam data set
do.call(
  rbind,
  lapply(
    list("Rule 1" = rule1, "Rule 2" = rule2),
    function(rule) {
      hits <- rule(spam$capitalAve) == spam$type
      c(Accuracy = mean(hits), "Total Correct" = sum(hits))
    }
  )
)

        Accuracy Total Correct
Rule 1 0.7315801          3366
Rule 2 0.7378831          3395

## Plot ROC curves
# Each rule is a hard (spam / nonspam) classifier, so it has exactly one
# operating point (1 - specificity, sensitivity). Its ROC curve is the
# piecewise-linear path (0, 0) -> operating point -> (1, 1).
# The rates are computed from the whole data set at once, so the curves do
# not depend on the order of the rows in `spam`.
actual_spam <- spam$type == 'spam'
pred1_spam  <- rule1(spam$capitalAve) == 'spam'
pred2_spam  <- rule2(spam$capitalAve) == 'spam'

# sensitivity (true positive rate): share of actual spam predicted as spam
sensitivity  <- c(0, mean(pred1_spam[actual_spam]), 1)
sensitivity2 <- c(0, mean(pred2_spam[actual_spam]), 1)

# 1 - specificity (false positive rate): share of actual nonspam predicted
# as spam; the same definition is used for both rules
onemspecificity  <- c(0, mean(pred1_spam[!actual_spam]), 1)
onemspecificity2 <- c(0, mean(pred2_spam[!actual_spam]), 1)

plot(onemspecificity, sensitivity, type = 'l', col = 'red', lwd = 1.2,
     xlab = '1 - specificity', xlim = c(0, 1), ylim = c(0, 1))
lines(onemspecificity2, sensitivity2, col = 'blue', lwd = 1.2)
# diagonal = performance of random guessing
abline(0, 1, col = 'black', lwd = 2)
# legend line styles match the solid lines actually drawn
legend('bottomright', legend = c('rule 1','rule 2','random guessing'),
       col = c('red','blue','black'), lty = 1, lwd = c(1.2, 1.2, 2))

AUC/Overfitting

George Papadopoulos

4/25/2019

I.

II.