Receiver Operating Characteristics curve

The ROC curve is created by plotting the true positive rate (TPR) against the false positive rate (FPR) at various threshold settings.

In statistics, a receiver operating characteristic curve, i.e. ROC curve, is a graphical plot that illustrates the diagnostic ability of a binary classifier system as its discrimination threshold is varied.

Using Breast Cancer dataset

Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image. A few of the images can be found at https://dollar.biz.uiowa.edu/~street/research/cc97_02.pdf.

The completed dataset can be found: http://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28diagnostic%29.

# load the mlbench package which has the BreastCancer data set
require(mlbench)
## Loading required package: mlbench
## Warning: package 'mlbench' was built under R version 3.4.3
# if you don't have any required package, use the install.packages() command
# load the data set
data(BreastCancer)
head(BreastCancer)
##        Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025            5         1          1             1            2
## 2 1002945            5         4          4             5            7
## 3 1015425            3         1          1             1            2
## 4 1016277            6         8          8             1            3
## 5 1017023            4         1          1             3            2
## 6 1017122            8        10         10             8            7
##   Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses     Class
## 1           1           3               1       1    benign
## 2          10           3               2       1    benign
## 3           2           3               1       1    benign
## 4           4           3               7       1    benign
## 5           1           3               1       1    benign
## 6          10           9               7       1 malignant
summary(BreastCancer)
##       Id             Cl.thickness   Cell.size     Cell.shape 
##  Length:699         1      :145   1      :384   1      :353  
##  Class :character   5      :130   10     : 67   2      : 59  
##  Mode  :character   3      :108   3      : 52   10     : 58  
##                     4      : 80   2      : 45   3      : 56  
##                     10     : 69   4      : 40   4      : 44  
##                     2      : 50   5      : 30   5      : 34  
##                     (Other):117   (Other): 81   (Other): 95  
##  Marg.adhesion  Epith.c.size  Bare.nuclei   Bl.cromatin  Normal.nucleoli
##  1      :407   2      :386   1      :402   2      :166   1      :443    
##  2      : 58   3      : 72   10     :132   3      :165   10     : 61    
##  3      : 58   4      : 48   2      : 30   1      :152   3      : 44    
##  10     : 55   1      : 47   5      : 30   7      : 73   2      : 36    
##  4      : 33   6      : 41   3      : 28   4      : 40   8      : 24    
##  8      : 25   5      : 39   (Other): 61   5      : 34   6      : 22    
##  (Other): 63   (Other): 66   NA's   : 16   (Other): 69   (Other): 69    
##     Mitoses          Class    
##  1      :579   benign   :458  
##  2      : 35   malignant:241  
##  3      : 33                  
##  10     : 14                  
##  4      : 12                  
##  7      :  9                  
##  (Other): 17
# some algorithms don't like missing values, so remove rows with missing values
BreastCancer <- na.omit(BreastCancer) 
# remove the unique identifier, which is useless and would confuse the machine learning algorithms
BreastCancer$Id <- NULL 
# partition the data set for 80% training and 20% evaluation (adapted from ?randomForest)
set.seed(2)
ind <- sample(2, nrow(BreastCancer), replace = TRUE, prob=c(0.8, 0.2))

Comparison of various classifiers

create model using conditional inference trees

Statistics-based approach that uses non-parametric tests as splitting criteria, corrected for multiple testing to avoid overfitting. This approach results in unbiased predictor selection and does not require pruning.

require(party)
## Loading required package: party
## Warning: package 'party' was built under R version 3.4.3
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 3.4.3
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.4.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.4.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.4.3
x.ct <- ctree(Class ~ ., data=BreastCancer[ind == 1,])
x.ct.pred <- predict(x.ct, newdata=BreastCancer[ind == 2,])
x.ct.prob <-  1- unlist(treeresponse(x.ct, BreastCancer[ind == 2,]), use.names=F)[seq(1,nrow(BreastCancer[ind == 2,])*2,2)]
# To view the decision tree, uncomment this line.
plot(x.ct, main="Decision tree created using condition inference trees")

create model using random forest

x.cf <- cforest(Class ~ ., data=BreastCancer[ind == 1,], control = cforest_unbiased(mtry = ncol(BreastCancer)-2))
x.cf.pred <- predict(x.cf, newdata=BreastCancer[ind == 2,])
x.cf.prob <-  1- unlist(treeresponse(x.cf, BreastCancer[ind == 2,]), use.names=F)[seq(1,nrow(BreastCancer[ind == 2,])*2,2)]

create model using svm (support vector machine)

require(e1071)
## Loading required package: e1071
## Warning: package 'e1071' was built under R version 3.4.3
# svm requires tuning
x.svm.tune <- tune(svm, Class~., data = BreastCancer[ind == 1,],
                   ranges = list(gamma = 2^(-8:1), cost = 2^(0:4)),
                   tunecontrol = tune.control(sampling = "fix"))
# display the tuning results (in text format)
x.svm.tune
## 
## Parameter tuning of 'svm':
## 
## - sampling method: fixed training/validation set 
## 
## - best parameters:
##  gamma cost
##  0.125    1
## 
## - best performance: 0.01675978
# If the tuning results are on the margin of the parameters (e.g., gamma = 2^-8), 
# then widen the parameters.
# I manually copied the cost and gamma from console messages above to parameters below.
x.svm <- svm(Class~., data = BreastCancer[ind == 1,], cost=4, gamma=0.0625, probability = TRUE)
x.svm.prob <- predict(x.svm, type="prob", newdata=BreastCancer[ind == 2,], probability = TRUE)

plot ROC curves to compare the performance of the individual classifiers

# load the ROCR package which draws the ROC curves
require(ROCR)
## Loading required package: ROCR
## Warning: package 'ROCR' was built under R version 3.4.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.3
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# ctree
x.ct.prob.rocr <- prediction(x.ct.prob, BreastCancer[ind == 2,'Class'])
x.ct.perf <- performance(x.ct.prob.rocr, "tpr","fpr")
# add=TRUE draws on the existing chart 
plot(x.ct.perf, col=4, main="ROC curves of different machine learning classifier")

# Draw a legend.
legend(0.6, 0.6, c('ctree', 'cforest','svm'), 4:6)

# cforest
x.cf.prob.rocr <- prediction(x.cf.prob, BreastCancer[ind == 2,'Class'])
x.cf.perf <- performance(x.cf.prob.rocr, "tpr","fpr")
plot(x.cf.perf, col=5, add=TRUE)

# svm
x.svm.prob.rocr <- prediction(attr(x.svm.prob, "probabilities")[,2], BreastCancer[ind == 2,'Class'])
x.svm.perf <- performance(x.svm.prob.rocr, "tpr","fpr")
plot(x.svm.perf, col=6, add=TRUE)