# I. Single-cutoff classifier on the frequency of the word 'your'
# Load the spam data set shipped with kernlab and fix the RNG seed so the
# random sampling later in the script is reproducible.
library(kernlab)
data(spam)
set.seed(333)

# Kernel density of the 'your' feature, split by class:
# blue = nonspam, red = spam. The x-axis carries the feature value, the
# y-axis the estimated density.
plot(density(spam$your[spam$type == "nonspam"]),
  col = "blue", main = "", frame.plot = FALSE,
  xlab = "Frequency of word 'your'", ylab = "Density"
)
lines(density(spam$your[spam$type == "spam"]), col = "red")
# Benchmark cutoff v = 0.5: almost all of the blue (nonspam) density lies to
# the left of the vertical line.
abline(v = 0.5, lwd = 2)

# One-variable classifier: an e-mail is called spam once its 'your' value
# reaches the 0.5 cutoff, nonspam otherwise.
prediction <- ifelse(spam$your >= 0.5, "spam", "nonspam")
# Confusion table expressed as a share of all e-mails
# (rows = predicted label, columns = actual label), rounded to 2 decimals.
round(table(prediction, spam$type) / nrow(spam), 2)
# prediction nonspam spam
#    nonspam    0.46 0.10
#    spam       0.15 0.29
# Accuracy = share of e-mails whose predicted label equals the actual label,
# i.e. the sum of the diagonal of the proportion table above. It is computed
# from the data instead of retyping the rounded cells (the original notes
# added 0.46 + 0.29 and reported 0.75).
round(mean(prediction == spam$type), 2)
# II. In-sample vs. out-of-sample error: an overfitted rule vs. a simple threshold
# Draw 10 e-mails at random from the full spam data set.
smallSpam <- spam[sample(nrow(spam), size = 10), ]
# Colour code per sampled e-mail: 2 for spam, 1 for ham (nonspam).
spamLabel <- ifelse(smallSpam$type == "spam", 2, 1)
# Plot capitalAve for the sampled e-mails, coloured by class (2 vs 1).
plot(smallSpam$capitalAve,
  col = spamLabel,
  ylab = "average capital letters in the e-mail"
)

# Rule 1: deliberately over-fitted rule, tuned to separate every point of the
# small sample.
#
# x: numeric vector of capitalAve values.
# Returns a character vector of the same length holding "spam" or "nonspam"
# (NA where x is NA).
#   spam    : x > 2.7, or 2.4 <= x <= 2.45
#   nonspam : x < 2.4, or 2.45 < x <= 2.7
rule1 <- function(x) {
  flagged <- x > 2.7 | (x >= 2.4 & x <= 2.45)
  labels <- rep(NA_character_, length(x))
  # which() drops NA comparisons, so missing inputs keep their NA label.
  labels[which(flagged)] <- "spam"
  labels[which(!flagged)] <- "nonspam"
  labels
}
# In-sample confusion table for rule 1 on the 10 sampled e-mails
# (rows = predicted label, columns = actual label); no errors in this case.
# The rule was tuned to exactly these points, so it overfits and is expected
# to do worse on new samples.
table(rule1(smallSpam[["capitalAve"]]), smallSpam[["type"]])
#         nonspam spam
# nonspam       5    0
# spam          0    5
# Rule 2: simple single-threshold rule.
#
# x: numeric vector of capitalAve values.
# Returns a character vector of the same length: "spam" where x > 2.8,
# "nonspam" where x <= 2.8, and NA where x is NA.
rule2 <- function(x) {
  # (x > 2.8) + 1L is 1 for nonspam and 2 for spam; NA stays NA.
  c("nonspam", "spam")[(x > 2.8) + 1L]
}
# In-sample confusion table for rule 2 on the 10 sampled e-mails
# (rows = predicted label, columns = actual label); 10% in-sample error here.
table(rule2(smallSpam[["capitalAve"]]), smallSpam[["type"]])
#         nonspam spam
# nonspam       5    1
# spam          0    4
# Out-of-sample check for rule 1: apply it to the complete spam data set
# (rows = predicted label, columns = actual label).
table(rule1(spam[["capitalAve"]]), spam[["type"]])
#         nonspam spam
# nonspam    2141  588
# spam        647 1225
# Out-of-sample check for rule 2: apply it to the complete spam data set
# (rows = predicted label, columns = actual label).
table(rule2(spam[["capitalAve"]]), spam[["type"]])
#         nonspam spam
# nonspam    2224  642
# spam        564 1171
# Summary matrix over the full data set: one row per rule, with the share
# ("Accuracy") and the count ("Total Correct") of e-mails labelled correctly.
t(vapply(
  list("Rule 1" = rule1, "Rule 2" = rule2),
  function(rule) {
    hits <- rule(spam$capitalAve) == spam$type
    c(Accuracy = mean(hits), "Total Correct" = sum(hits))
  },
  FUN.VALUE = c(Accuracy = 0, "Total Correct" = 0)
))
#         Accuracy Total Correct
# Rule 1 0.7315801          3366
# Rule 2 0.7378831          3395
## Plot ROC-style curves for rule 1 and rule 2
# NOTE(review): each rule is a fixed classifier, and the curves below
# accumulate per-row indicators in data-set row order rather than sweeping a
# decision threshold -- confirm this is the plot that is wanted.

# Both rules are vectorised, so they are applied to the whole column once
# instead of row by row; the number of rows is taken from the data.
is_spam <- spam$type == "spam"
is_nonspam <- spam$type == "nonspam"
pred1 <- rule1(spam$capitalAve)
pred2 <- rule2(spam$capitalAve)

# Sensitivity indicator: 1 for a true positive (predicted spam, actually
# spam), 0 otherwise; then the running share of all true positives.
sensitivity <- as.numeric(pred1 == "spam" & is_spam)
sensitivity2 <- as.numeric(pred2 == "spam" & is_spam)
sensitivity <- cumsum(sensitivity) / sum(sensitivity)
sensitivity2 <- cumsum(sensitivity2) / sum(sensitivity2)

# 1 - specificity indicator: 0 for a true negative (predicted nonspam,
# actually nonspam), 1 otherwise. Rule 1 previously tested for a predicted
# "spam" here, which inverted its indicator relative to rule 2; both rules
# now use the same definition.
onemspecificity <- as.numeric(!(pred1 == "nonspam" & is_nonspam))
onemspecificity2 <- as.numeric(!(pred2 == "nonspam" & is_nonspam))
onemspecificity <- cumsum(onemspecificity) / sum(onemspecificity)
onemspecificity2 <- cumsum(onemspecificity2) / sum(onemspecificity2)

plot(onemspecificity, sensitivity,
  type = "l", col = "red", lwd = 1.2, xlab = "1 - specificity"
)
lines(onemspecificity2, sensitivity2, col = "blue", lwd = 1.2)
# Diagonal reference line = random guessing.
abline(0, 1, col = "black", lwd = 2)
# Legend line type and widths match the solid lines actually drawn.
legend("bottomright",
  legend = c("rule 1", "rule 2", "random guessing"),
  col = c("red", "blue", "black"), lty = 1, lwd = c(1.2, 1.2, 2)
)
