ROC on Classifiers

Receiver Operating Characteristics curve
- Using Breast Cancer dataset
Comparison of various classifiers
plot ROC curves to compare the performance of the individual classifiers

Receiver Operating Characteristics curve

The ROC curve is created by plotting the true positive rate (TPR) against the false positive rate (FPR) at various threshold settings.

In statistics, a receiver operating characteristic curve, i.e. ROC curve, is a graphical plot that illustrates the diagnostic ability of a binary classifier system as its discrimination threshold is varied.

Using Breast Cancer dataset

Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image. A few of the images can be found at https://dollar.biz.uiowa.edu/~street/research/cc97_02.pdf.

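The complete dataset can be found at http://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28diagnostic%29. Note that the `BreastCancer` data loaded below from the mlbench package is the original Wisconsin Breast Cancer Database (699 samples, nine cytological attributes scored 1-10), described at http://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28original%29.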

# load the mlbench package which has the BreastCancer data set
require(mlbench)

## Loading required package: mlbench

## Warning: package 'mlbench' was built under R version 3.4.3

# Any package that is missing can be installed first with install.packages().
# Bring the BreastCancer data frame into the workspace and preview its first
# six rows.
data(list = "BreastCancer")
head(BreastCancer, n = 6L)

##        Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025            5         1          1             1            2
## 2 1002945            5         4          4             5            7
## 3 1015425            3         1          1             1            2
## 4 1016277            6         8          8             1            3
## 5 1017023            4         1          1             3            2
## 6 1017122            8        10         10             8            7
##   Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses     Class
## 1           1           3               1       1    benign
## 2          10           3               2       1    benign
## 3           2           3               1       1    benign
## 4           4           3               7       1    benign
## 5           1           3               1       1    benign
## 6          10           9               7       1 malignant

# Summarise every column: level counts for the factor columns plus any NA
# counts, which shows whether rows with missing values have to be handled
# before modelling.
summary(BreastCancer)

##       Id             Cl.thickness   Cell.size     Cell.shape 
##  Length:699         1      :145   1      :384   1      :353  
##  Class :character   5      :130   10     : 67   2      : 59  
##  Mode  :character   3      :108   3      : 52   10     : 58  
##                     4      : 80   2      : 45   3      : 56  
##                     10     : 69   4      : 40   4      : 44  
##                     2      : 50   5      : 30   5      : 34  
##                     (Other):117   (Other): 81   (Other): 95  
##  Marg.adhesion  Epith.c.size  Bare.nuclei   Bl.cromatin  Normal.nucleoli
##  1      :407   2      :386   1      :402   2      :166   1      :443    
##  2      : 58   3      : 72   10     :132   3      :165   10     : 61    
##  3      : 58   4      : 48   2      : 30   1      :152   3      : 44    
##  10     : 55   1      : 47   5      : 30   7      : 73   2      : 36    
##  4      : 33   6      : 41   3      : 28   4      : 40   8      : 24    
##  8      : 25   5      : 39   (Other): 61   5      : 34   6      : 22    
##  (Other): 63   (Other): 66   NA's   : 16   (Other): 69   (Other): 69    
##     Mitoses          Class    
##  1      :579   benign   :458  
##  2      : 35   malignant:241  
##  3      : 33                  
##  10     : 14                  
##  4      : 12                  
##  7      :  9                  
##  (Other): 17

# Some of the algorithms used below do not accept missing values, so keep only
# the complete rows.
BreastCancer <- na.omit(BreastCancer)
# The unique sample identifier has no predictive value and would confuse the
# learners when the formula `Class ~ .` is used, so drop that column.
BreastCancer[["Id"]] <- NULL
# Randomly label every row 1 (training, about 80%) or 2 (evaluation, about
# 20%); the approach is adapted from ?randomForest. The fixed seed makes the
# partition reproducible.
set.seed(2)
ind <- sample(
  x = 2,
  size = nrow(BreastCancer),
  replace = TRUE,
  prob = c(0.8, 0.2)
)

Comparison of various classifiers

create model using conditional inference trees

Statistics-based approach that uses non-parametric tests as splitting criteria, corrected for multiple testing to avoid overfitting. This approach results in unbiased predictor selection and does not require pruning.

require(party)

## Loading required package: party

## Warning: package 'party' was built under R version 3.4.3

## Loading required package: grid

## Loading required package: mvtnorm

## Warning: package 'mvtnorm' was built under R version 3.4.3

## Loading required package: modeltools

## Loading required package: stats4

## Loading required package: strucchange

## Warning: package 'strucchange' was built under R version 3.4.3

## Loading required package: zoo

## Warning: package 'zoo' was built under R version 3.4.3

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

## Warning: package 'sandwich' was built under R version 3.4.3

# Fit a conditional inference tree on the training partition (ind == 1).
x.ct <- ctree(Class ~ ., data = BreastCancer[ind == 1, ])
# Hard class predictions for the evaluation partition (ind == 2).
x.ct.pred <- predict(x.ct, newdata = BreastCancer[ind == 2, ])
# treeresponse() yields one vector of class probabilities per evaluation row.
# The first entry is taken as P(benign), so the ROC score is
# P(malignant) = 1 - P(benign). Extracting per row with vapply() avoids the
# hand-computed stride index over an unlist()ed vector and fails loudly if an
# element is not a single number.
x.ct.prob <- 1 - vapply(
  treeresponse(x.ct, newdata = BreastCancer[ind == 2, ]),
  function(p) p[1],
  numeric(1),
  USE.NAMES = FALSE
)
# Plot the fitted tree; comment this line out to skip the figure.
plot(x.ct, main = "Decision tree created using conditional inference trees")

create model using random forest (cforest: a random forest built from conditional inference trees)

# Fit a forest of conditional inference trees on the training partition
# (ind == 1). mtry is set to the number of columns minus two.
x.cf <- cforest(
  Class ~ .,
  data = BreastCancer[ind == 1, ],
  control = cforest_unbiased(mtry = ncol(BreastCancer) - 2)
)
# Hard class predictions for the evaluation partition (ind == 2).
x.cf.pred <- predict(x.cf, newdata = BreastCancer[ind == 2, ])
# treeresponse() yields one set of class probabilities per evaluation row.
# The first entry is taken as P(benign), so the ROC score is
# P(malignant) = 1 - P(benign). Extracting per row with vapply() avoids the
# hand-computed stride index over an unlist()ed vector and fails loudly if an
# element is not a single number.
x.cf.prob <- 1 - vapply(
  treeresponse(x.cf, newdata = BreastCancer[ind == 2, ]),
  function(p) p[1],
  numeric(1),
  USE.NAMES = FALSE
)

create model using svm (support vector machine)

require(e1071)

## Loading required package: e1071

## Warning: package 'e1071' was built under R version 3.4.3

# The SVM needs its gamma and cost parameters tuned. Search a grid of
# candidate values on the training partition (ind == 1), scoring each
# combination on a fixed training/validation split.
svm.grid <- list(gamma = 2^(-8:1), cost = 2^(0:4))
svm.tune.control <- tune.control(sampling = "fix")
x.svm.tune <- tune(
  svm,
  Class ~ .,
  data = BreastCancer[ind == 1, ],
  ranges = svm.grid,
  tunecontrol = svm.tune.control
)
# Show the tuning results (in text format).
print(x.svm.tune)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: fixed training/validation set 
## 
## - best parameters:
##  gamma cost
##  0.125    1
## 
## - best performance: 0.01675978

# If the tuning results are on the margin of the parameters (e.g., gamma = 2^-8),
# then widen the parameters and tune again.
# Read cost and gamma from the tuning result instead of copying them by hand:
# hand-copied values go stale as soon as the tuning run changes (the values
# previously hard-coded here did not match the printed best parameters).
svm.best <- x.svm.tune$best.parameters
x.svm <- svm(
  Class ~ .,
  data = BreastCancer[ind == 1, ],
  cost = svm.best$cost,
  gamma = svm.best$gamma,
  probability = TRUE
)
# With probability = TRUE the class probabilities for the evaluation partition
# (ind == 2) are attached to the result as the "probabilities" attribute.
x.svm.prob <- predict(
  x.svm,
  newdata = BreastCancer[ind == 2, ],
  probability = TRUE
)

plot ROC curves to compare the performance of the individual classifiers

# load the ROCR package which draws the ROC curves
require(ROCR)

## Loading required package: ROCR

## Warning: package 'ROCR' was built under R version 3.4.3

## Loading required package: gplots

## Warning: package 'gplots' was built under R version 3.4.3

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

# True classes of the evaluation partition (ind == 2), shared by all curves.
x.test.class <- BreastCancer[ind == 2, "Class"]

# ctree: the first curve, so this call creates the chart.
x.ct.prob.rocr <- prediction(x.ct.prob, x.test.class)
x.ct.perf <- performance(x.ct.prob.rocr, "tpr", "fpr")
plot(x.ct.perf, col = 4, main = "ROC curves of different machine learning classifier")

# cforest: add = TRUE draws on the existing chart.
x.cf.prob.rocr <- prediction(x.cf.prob, x.test.class)
x.cf.perf <- performance(x.cf.prob.rocr, "tpr", "fpr")
plot(x.cf.perf, col = 5, add = TRUE)

# svm: pick the probability column by class name rather than by position, so
# the score is P(malignant) regardless of the column order of the
# "probabilities" matrix.
x.svm.prob.rocr <- prediction(
  attr(x.svm.prob, "probabilities")[, "malignant"],
  x.test.class
)
x.svm.perf <- performance(x.svm.prob.rocr, "tpr", "fpr")
plot(x.svm.perf, col = 6, add = TRUE)

# Draw the legend last so no curve is painted over it; the fill colours match
# the col values (4, 5, 6) used for the three curves.
legend(0.6, 0.6, legend = c("ctree", "cforest", "svm"), fill = 4:6)