library(data.table) # Importing data
library(car) # model diagnostics
library(MASS) # step function for model selection
library(rpart) # decision tree
library(rattle) # decision tree plots
library(rpart.plot) # decision tree plotting
library(RColorBrewer) # coloring the decision tree
library(formatR) # code formatting
library(e1071) # support vector machines
library(corrplot) # correlation plot
library(class) # k-nearest neighbors
Determining the cancer subtype is a crucial step in treating cancer. In this analysis I compare several supervised learning techniques, namely decision trees, logistic regression, support vector machines and k-nearest neighbors, using gene expression data from patients suffering from Acute Lymphoblastic Leukemia. The data set comes from the paper “A subtype of childhood acute lymphoblastic leukemia with poor treatment outcome: a genome-wide classification study” by M. L. Den Boer et al.
The data set contains 40 genes (features) and a class label with 11 different cancer subtypes. For this analysis the subtypes are grouped into two categories: (Bo, T) and (other).
datafile = read.table("GSE13425-Bo-T.txt",check.names = F) # Importing data
df = as.data.frame(t(datafile)) # Transposing
df[,length(df)+1] = rownames(df) # Class column
for (i in 1:length(df)) { # Renaming columns
colnames(df)[i] = paste("v", i, sep = "")
if (i == length(df)){
colnames(df)[i] = "class"
}
}
attach(df)
df$class = factor((class == "T")|(class == "Bo"), # Bo or T vs others
levels = c(FALSE, TRUE),
labels = c("other", "Bo|T"))
rownames(df) = 1:nrow(df)
Since some of the algorithms, logistic regression in particular, are sensitive to multicollinearity, let’s first have a look at the correlations between the covariates.
z = cor(df[,1:40]) # Create a correlation matrix
z[lower.tri(z,diag=TRUE)] = NA # Drop duplicates
z = as.data.frame(as.table(z)) # Turn into a 3-column table
z = na.omit(z) # Get rid of NAs
z = z[order(-abs(z$Freq)),] # Sort by highest correlation
subset(z, abs(Freq) > 0.8) # Most correlated variables
## Var1 Var2 Freq
## 1124 v4 v29 0.9819464
## 1598 v38 v40 0.8862302
## 533 v13 v14 0.8852458
## 1254 v14 v32 0.8834559
## 523 v3 v14 0.8556361
## 1253 v13 v32 0.8452803
## 1122 v2 v29 0.8194641
## 122 v2 v4 0.8159920
## 883 v3 v23 -0.8134034
## 1183 v23 v30 0.8111894
## 1174 v14 v30 -0.8095060
## 1416 v16 v36 0.8048125
We can see high correlations between some of the variables. For example, v4 and v29 are almost perfectly correlated (r = 0.98).
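Since the corrplot package is already loaded, a heat-map style view of the whole correlation matrix can make these clusters of correlated genes easier to see. This is an optional sketch, not part of the original output:
# Optional: heat-map view of the correlation matrix (corrplot is loaded above);
# ordering by hierarchical clustering groups correlated genes together
corrplot(cor(df[, 1:40]),
method = "color",
order = "hclust",
tl.cex = 0.6)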
Let’s divide the data into a training and a testing set (75% vs 25%). The training set is used to fit each model and the testing set to evaluate its performance.
# Set random seed for reproducible research
set.seed(97459)
# Shuffle the data
n = nrow(df)
df = df[sample(n), ]
# Split the data in train and test
split = 0.75
df_train = df[1:round(split * n), ]
df_test = df[(round(split * n) + 1):n, ]
Let’s check whether the two sets are well balanced, that is, whether they have similar proportions of both classes.
# Check class bias
table(df$class, dnn = "Overal")## Overal
## other Bo|T
## 110 80
table(df_train$class, dnn = "Train")
## Train
## other Bo|T
## 81 61
table(df_test$class, dnn = "Test")
## Test
## other Bo|T
## 29 19
Based on the counts above, the training and testing sets are reasonably well balanced.
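As a quick additional check (not in the original code), the class proportions in each split can be compared directly:
# Class proportions in the training and testing sets
round(prop.table(table(df_train$class)), 2)
round(prop.table(table(df_test$class)), 2)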
For the decision tree I consider both the Gini index and entropy as splitting criteria. The initial trees are grown with a very small value of the complexity parameter cp.
# Build a tree model using GINI index
tree_g = rpart(class ~ .,
data = df_train,
method = "class",
control = rpart.control(cp=0.0001),
parms = list(split = "gini"))
# Build a tree model using Entropy index
tree_e = rpart(class ~ .,
data = df_train,
method = "class",
control = rpart.control(cp=0.0001),
parms = list(split = "information"))
par(mfrow = c(1,2))
# Plotting the Gini-based tree
fancyRpartPlot(tree_g,
sub = "Tree based on GINI Index")
# Plotting the entropy-based tree
fancyRpartPlot(tree_e,
sub = "Tree based on Entropy")
Based on the plots above, we can see that the tree built with entropy differs from the tree built with the Gini index. Let’s try to prune both trees.
# Cp table Gini
printcp(tree_g)
##
## Classification tree:
## rpart(formula = class ~ ., data = df_train, method = "class",
## parms = list(split = "gini"), control = rpart.control(cp = 1e-04))
##
## Variables actually used in tree construction:
## [1] v16 v21 v22 v24
##
## Root node error: 61/142 = 0.42958
##
## n= 142
##
## CP nsplit rel error xerror xstd
## 1 0.622951 0 1.00000 1.00000 0.096702
## 2 0.065574 1 0.37705 0.49180 0.079744
## 3 0.049180 2 0.31148 0.49180 0.079744
## 4 0.032787 3 0.26230 0.49180 0.079744
## 5 0.000100 4 0.22951 0.44262 0.076658
plotcp(tree_g)
Based on the plot above, we prune the Gini-based tree with a cp value of 10^{-4}, the value that minimizes the cross-validated error.
# Cp table Entropy
printcp(tree_e)
##
## Classification tree:
## rpart(formula = class ~ ., data = df_train, method = "class",
## parms = list(split = "information"), control = rpart.control(cp = 1e-04))
##
## Variables actually used in tree construction:
## [1] v21 v26 v30
##
## Root node error: 61/142 = 0.42958
##
## n= 142
##
## CP nsplit rel error xerror xstd
## 1 0.622951 0 1.00000 1.00000 0.096702
## 2 0.065574 1 0.37705 0.52459 0.081620
## 3 0.000100 3 0.24590 0.57377 0.084188
plotcp(tree_e)
For the entropy-based model the cross-validated error is minimized at cp = 0.0655738, so that is the value used for pruning.
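A common alternative to taking the cp with the smallest cross-validated error is the one-standard-error rule. The sketch below (not part of the original analysis) picks the simplest tree whose xerror lies within one standard error of the minimum:
# One-standard-error rule applied to the entropy tree's cp table
cp_tab = tree_e$cptable
min_row = which.min(cp_tab[, "xerror"])
thresh = cp_tab[min_row, "xerror"] + cp_tab[min_row, "xstd"]
cp_1se = cp_tab[which(cp_tab[, "xerror"] <= thresh)[1], "CP"]
cp_1se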
# Pruning
pruned_g = prune(tree_g, cp = tree_g$cptable[which.min(tree_g$cptable[,"xerror"]),"CP"])
pruned_e = prune(tree_e, cp = tree_e$cptable[which.min(tree_e$cptable[,"xerror"]),"CP"])
# Gini
par(mfrow = c(1,2))
fancyRpartPlot(tree_g,
sub = "Initial tree")
fancyRpartPlot(pruned_g,
sub = "Pruned tree")We can notice that for Gini based tree, the pruned tree is the same.
# Entropy
par(mfrow = c(1,2))
fancyRpartPlot(tree_e,
sub = "Initial tree")
fancyRpartPlot(pruned_e,
sub = "Pruned tree")For Entropy based tree, the pruned tree has only one split at the very beginning. We can notice that, the variable on which we are splitting is v21. Let’s look at this variable closer.
plot(df$class, df$v21)
The boxplots above show that v21 by itself already separates the two classes quite well.
# Function that returns the accuracy, false positive rate and true positive rate
# from a 2x2 confusion matrix (rows = actual classes, columns = predicted classes)
ac = function(conf) {
accuracy = round(sum(diag(conf))/sum(conf), 2)
False.positive = round(conf[1, 2]/sum(conf[1,]), 2)
True.positive = round(conf[2, 2]/sum(conf[2,]), 2)
output = list(accuracy = accuracy,
False.positive = False.positive,
True.positive = True.positive)
return(output)
}
# Using our model to predict the test set
# Gini
pred_g = predict(tree_g,
df_test,
type = "class")
pred_g_pruned = predict(pruned_g,
df_test,
type = "class")
# Entropy
pred_e = predict(tree_e,
df_test,
type = "class")
pred_e_pruned = predict(pruned_e,
df_test,
type = "class")
# Construct the confusion matrix
conf_g = table(df_test$class,
pred_g, dnn=c("Actual:", "Predicted:"))
conf_g
## Predicted:
## Actual: other Bo|T
## other 22 7
## Bo|T 5 14
conf_g_pruned = table(df_test$class,
pred_g_pruned, dnn=c("Actual:", "Predicted:"))
conf_g_pruned
## Predicted:
## Actual: other Bo|T
## other 22 7
## Bo|T 5 14
conf_e = table(df_test$class,
pred_e, dnn=c("Actual:", "Predicted:"))
conf_e
## Predicted:
## Actual: other Bo|T
## other 25 4
## Bo|T 1 18
conf_e_pruned = table(df_test$class,
pred_e_pruned, dnn=c("Actual:", "Predicted:"))
conf_e_pruned
## Predicted:
## Actual: other Bo|T
## other 28 1
## Bo|T 4 15
summ = rbind(dtree_g = ac(conf_g),
dtree_g_prune = ac(conf_g_pruned),
dtree_e = ac(conf_e),
dtree_e_pruned = ac(conf_e_pruned))
summ
## accuracy False.positive True.positive
## dtree_g 0.75 0.24 0.74
## dtree_g_prune 0.75 0.24 0.74
## dtree_e 0.9 0.14 0.95
## dtree_e_pruned 0.9 0.03 0.79
The entropy-based models clearly predict better. The pruned entropy model gives the lowest false positive rate, while the unpruned entropy model gives the highest true positive rate.
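As a quick aside (not part of the original comparison), rpart also stores a variable importance score, which shows which genes drive the entropy-based tree beyond the splits visible in the plot:
# Variable importance scores stored by rpart for the entropy-based tree
round(tree_e$variable.importance, 2)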
Next, let’s fit a logistic regression, using a stepwise model selection procedure based on the AIC criterion.
logit_full = glm(class~., data = df_train, family = binomial)
logit_null = glm(class~1, data = df_train, family = binomial)
# Stepwise selection in both directions
both = step(logit_full, scope = list(lower = logit_null, upper = logit_full), direction = "both")
Below is the model selected by the AIC criterion.
both$formula
## class ~ v3 + v4 + v6 + v7 + v12 + v19 + v20 + v22 + v23 + v24 +
## v27 + v30 + v34 + v36 + v40
# Model after the model selection procedure
logit = glm(both$formula, data=df_train, family = binomial)
summary(logit)
##
## Call:
## glm(formula = both$formula, family = binomial, data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.182e-04 -2.000e-08 -2.000e-08 2.000e-08 3.259e-04
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1881.8 278364.5 0.007 0.995
## v3 -198.6 12100.4 -0.016 0.987
## v4 -911.5 51833.6 -0.018 0.986
## v6 -547.4 31006.9 -0.018 0.986
## v7 -107.2 6369.7 -0.017 0.987
## v12 -111.5 6782.0 -0.016 0.987
## v19 1462.3 86905.6 0.017 0.987
## v20 -454.9 26355.3 -0.017 0.986
## v22 -1563.3 91095.7 -0.017 0.986
## v23 197.6 11307.3 0.017 0.986
## v24 220.3 13179.2 0.017 0.987
## v27 -700.6 39966.3 -0.018 0.986
## v30 264.0 16226.8 0.016 0.987
## v34 -251.5 14573.9 -0.017 0.986
## v36 218.2 12555.9 0.017 0.986
## v40 520.2 31749.9 0.016 0.987
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.9403e+02 on 141 degrees of freedom
## Residual deviance: 7.9146e-07 on 126 degrees of freedom
## AIC: 32
##
## Number of Fisher Scoring iterations: 25
We can see that all of the variables are insignificant, with high p-values and huge standard errors, which suggests a multicollinearity issue. Let’s check the correlations between these covariates and calculate variance inflation factors.
c = as.vector(attributes(both$terms)$term.labels)
z = cor(df_train[ , c[1:length(c)]]) # Create a correlation matrix
z[lower.tri(z,diag=TRUE)] = NA # Drop duplicates
z = as.data.frame(as.table(z)) # Turn into a 3-column table
z = na.omit(z) # Get rid of NAs
z = z[order(-abs(z$Freq)),] # Sort by highest correlation
subset(z, abs(Freq) > 0.6)
## Var1 Var2 Freq
## 121 v3 v23 -0.8158225
## 174 v23 v30 0.7891883
## 166 v3 v30 -0.7797809
## 196 v3 v36 0.7473499
## 207 v30 v36 -0.6952707
## 200 v12 v36 0.6910695
## 16 v3 v4 -0.6749409
## 167 v4 v30 0.6696646
## 144 v23 v24 -0.6373471
## 224 v36 v40 0.6367633
## 197 v4 v36 -0.6360559
## 202 v20 v36 0.6326354
## 62 v4 v12 -0.6305739
## 136 v3 v24 0.6275150
## 61 v3 v12 0.6251031
## 211 v3 v40 0.6192273
## 169 v7 v30 -0.6147119
## 159 v23 v27 -0.6067034
## 199 v7 v36 0.6034343
## 170 v12 v30 -0.6023743
## 219 v23 v40 -0.6012588
## 151 v3 v27 0.6008704
as.data.frame(vif(logit))
## vif(logit)
## v3 33.48283
## v4 1047.67976
## v6 534.38246
## v7 31.11210
## v12 61.77942
## v19 309.30714
## v20 108.44294
## v22 598.92020
## v23 116.69826
## v24 99.38380
## v27 379.05176
## v30 110.80431
## v34 141.95973
## v36 280.69093
## v40 317.73377
sqrt(vif(logit)) > 2 # Exclude v4
## v3 v4 v6 v7 v12 v19 v20 v22 v23 v24 v27 v30 v34 v36 v40
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Updated model (without v4)
logit_1 = glm(class ~ v3 + v6 + v7 + v12 + v19 + v20 + v22 + v23 + v24 +
v27 + v30 + v34 + v36 + v40, data=df_train, family = binomial(link='logit'))
summary(logit_1)
##
## Call:
## glm(formula = class ~ v3 + v6 + v7 + v12 + v19 + v20 + v22 +
## v23 + v24 + v27 + v30 + v34 + v36 + v40, family = binomial(link = "logit"),
## data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.82690 -0.23057 -0.02470 0.03454 2.85365
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.55277 26.45472 0.172 0.863362
## v3 0.22899 0.91213 0.251 0.801779
## v6 -1.82863 0.72035 -2.539 0.011132 *
## v7 -0.13874 0.45577 -0.304 0.760812
## v12 0.03635 0.33748 0.108 0.914224
## v19 2.29510 2.00935 1.142 0.253365
## v20 -1.84401 0.78685 -2.344 0.019102 *
## v22 -5.54700 1.63027 -3.403 0.000668 ***
## v23 1.15180 0.40071 2.874 0.004048 **
## v24 1.30132 0.60597 2.148 0.031753 *
## v27 -2.41564 0.88055 -2.743 0.006082 **
## v30 0.45185 0.81917 0.552 0.581222
## v34 -1.04703 0.49369 -2.121 0.033937 *
## v36 0.47742 0.36408 1.311 0.189762
## v40 1.00203 1.09295 0.917 0.359245
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 194.028 on 141 degrees of freedom
## Residual deviance: 54.778 on 127 degrees of freedom
## AIC: 84.778
##
## Number of Fisher Scoring iterations: 8
as.data.frame(vif(logit_1))
## vif(logit_1)
## v3 2.655741
## v6 2.189958
## v7 1.344187
## v12 2.076312
## v19 1.760441
## v20 1.596779
## v22 2.467322
## v23 1.763671
## v24 2.510195
## v27 2.153361
## v30 1.837583
## v34 1.241599
## v36 2.871578
## v40 2.037418
sqrt(vif(logit_1)) > 2
## v3 v6 v7 v12 v19 v20 v22 v23 v24 v27 v30 v34
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## v36 v40
## FALSE FALSE
# Reduced model keeping only the predictors that were significant in logit_1
logit_2 = glm(class ~ v6 + v20 + v22 + v23 + v24 +
v27 + v34, data=df_train, family = binomial(link='logit'))
summary(logit_2)
##
## Call:
## glm(formula = class ~ v6 + v20 + v22 + v23 + v24 + v27 + v34,
## family = binomial(link = "logit"), data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.85125 -0.24869 -0.02757 0.03701 2.99127
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 29.9413 7.5720 3.954 7.68e-05 ***
## v6 -1.5231 0.5059 -3.010 0.002609 **
## v20 -1.2236 0.6156 -1.988 0.046856 *
## v22 -4.5785 1.2437 -3.681 0.000232 ***
## v23 1.2175 0.3592 3.390 0.000700 ***
## v24 1.2115 0.4884 2.481 0.013110 *
## v27 -1.9812 0.7030 -2.818 0.004830 **
## v34 -0.9210 0.4545 -2.027 0.042711 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 194.028 on 141 degrees of freedom
## Residual deviance: 59.667 on 134 degrees of freedom
## AIC: 75.667
##
## Number of Fisher Scoring iterations: 8
# Checking whether the insignificant variables can be dropped from the model
anova(logit_1, logit_2, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: class ~ v3 + v6 + v7 + v12 + v19 + v20 + v22 + v23 + v24 + v27 +
## v30 + v34 + v36 + v40
## Model 2: class ~ v6 + v20 + v22 + v23 + v24 + v27 + v34
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 127 54.778
## 2 134 59.667 -7 -4.8892 0.6735
Based on the VIF criterion we eliminate v4, which has by far the largest VIF. After removing it the multicollinearity issue disappears: all remaining VIFs are small. However, several coefficients are still insignificant, so to check whether they are needed we compare the full and reduced models with an analysis of deviance (likelihood ratio) test. The non-significant p-value above (0.67) shows that the reduced model fits about as well as the full one, so the insignificant variables can be dropped.
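As a simple cross-check (a sketch, not in the original code), the two models can also be compared directly on AIC; the reduced model should have the lower value:
# Direct AIC comparison of the full and reduced logistic models
AIC(logit_1, logit_2)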
log_pred_df = predict(logit_2, # model chosen by the selection procedure
df_test, # predicting on the testing data
type="response")
optim = function(precision, # number of thresholds to evaluate
pred, # predicted probabilities
dataset) { # data set containing the true classes
accuracy = matrix(0, nrow = precision, ncol = 4)
accur = 0
treshold = 0
tp = 0
fp = 1
k = 1/precision
for (i in 1:precision) {
f = factor(pred > i*k,
levels = c(FALSE, TRUE),
labels = c("other", "Bo|T"))
t = table(dataset$class, f, dnn = c("Actual", "Predicted"))
accuracy[i, 1] = i*k
accuracy[i, 2] = sum(diag(t)/sum(t))
accuracy[i, 3] = t[1, 2]/sum(t[1,])
accuracy[i, 4] = t[2, 2]/sum(t[2,])
if (accuracy[i, 2] > accur) {
accur = accuracy[i, 2]
treshold = accuracy[i, 1]
co = t
fp = accuracy[i, 3]
tp = accuracy[i, 4]
}
}
output = list(highest_accuracy = accur, # Highest accuracy
optimal_treshold = treshold, # Optimal threshold
matrix = accuracy, # All possible models
False.positive = fp,
True.positive = tp,
conf = co)
return(output)
}
opt_d1 = optim(1000, log_pred_df, df_test)
The plot below shows how the accuracy changes as the threshold varies.
# Ploting Accuracy vs Threshold
plot(x = opt_d1$matrix[, 1],
y = opt_d1$matrix[, 2],
col = ifelse(opt_d1$matrix[, 1] == opt_d1$optimal_treshold, "red", "black"),
ylab = "Accuracy",
xlab = "Treshold",
type = 'l',
main = "Best treshold for log_d2")
abline(v = opt_d1$optimal_treshold, lty = 2)Based on the above plot, we can see that Accuracy maximizes when threshold is 0.604.
# The best combination of threshold and accuracy
data.frame("Optimal Threshold" = opt_d1$optimal_treshold,
"Highest Accuracy" = opt_d1$highest_accuracy,
"False.positive" = opt_d1$False.positive,
"True.positive" = opt_d1$True.positive)
## Optimal.Threshold Highest.Accuracy False.positive True.positive
## 1 0.604 0.8333333 0.03448276 0.6315789
summ = rbind(summ, log_d2 = ac(opt_d1$conf))
Based on the results above, logistic regression achieves a very low false positive rate, but its accuracy is lower than that of the entropy-based decision tree.
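An alternative to scanning thresholds by hand is to summarize the threshold trade-off with an ROC curve. The sketch below assumes the pROC package is installed; it is not used anywhere in the original analysis:
# Sketch, assuming pROC is available (not used above)
library(pROC)
roc_obj = roc(response = df_test$class,
predictor = log_pred_df,
levels = c("other", "Bo|T"), # controls, cases
direction = "<") # higher fitted probability = more likely Bo|T
plot(roc_obj) # ROC curve
auc(roc_obj) # area under the curve
coords(roc_obj, "best", ret = c("threshold", "accuracy")) # Youden-optimal cutoff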
To find the best SVM model, we use a tuning procedure with a linear kernel.
set.seed(11)
# Tuning svm fom d1
tune = tune(svm, #ten-fold cross-validation on a set of models of interest.
class ~ .,
data = df_train,
kernel = "linear",
ranges = list(cost = seq(0.01, 5, 0.1)))
# Assigning the best model based on cost component
bestmod = tune$best.model
summary(bestmod)
##
## Call:
## best.tune(method = svm, train.x = class ~ ., data = df_train,
## ranges = list(cost = seq(0.01, 5, 0.1)), kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.11
## gamma: 0.025
##
## Number of Support Vectors: 51
##
## ( 26 25 )
##
##
## Number of Classes: 2
##
## Levels:
## other Bo|T
The best model has a cost value of 0.11 and uses 51 support vectors.
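To see how sensitive this result is to the cost parameter, the cross-validation errors stored in the tuning object can be plotted (a quick aside, not in the original output):
# Cross-validation error as a function of cost
plot(tune)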
# Predicting on testing set
pred_svm = predict(bestmod, df_test)
svm.conf = table(df_test$class,
pred_svm,
dnn = c("Actual", "Predicted"))
svm.conf
## Predicted
## Actual other Bo|T
## other 28 1
## Bo|T 3 16
summ = rbind(summ,
svm_d1 = ac(svm.conf))
summ
## accuracy False.positive True.positive
## dtree_g 0.75 0.24 0.74
## dtree_g_prune 0.75 0.24 0.74
## dtree_e 0.9 0.14 0.95
## dtree_e_pruned 0.9 0.03 0.79
## log_d2 0.83 0.03 0.63
## svm_d1 0.92 0.03 0.84
Based on the results above, the SVM performs very well; it even beats the entropy-based decision tree in terms of accuracy.
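For comparison, the same tuning procedure can be run with a radial kernel. The parameter grid below is an illustrative choice, not taken from the original analysis:
# Sketch: tuning an SVM with a radial kernel over an illustrative grid
set.seed(11)
tune_rbf = e1071::tune(svm, # explicit namespace, since "tune" is reused as an object above
class ~ .,
data = df_train,
kernel = "radial",
ranges = list(cost = c(0.1, 1, 10),
gamma = c(0.01, 0.05, 0.1)))
pred_rbf = predict(tune_rbf$best.model, df_test)
ac(table(df_test$class, pred_rbf, dnn = c("Actual", "Predicted")))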
Finally, for k-nearest neighbors we search for the value of k that gives the highest accuracy on the test set.
knn_matrix = matrix(0, nrow = 15, ncol = 2, byrow = TRUE)
set.seed(5)
# Find the best k in terms of highest accuracy
for (i in 1:15) {
knn.pred = knn(df_train[,-which(colnames(df_train)=="class")],
df_test[,-which(colnames(df_test)=="class")],
df_train$class,
k = i)
knn_matrix[i, 1] = i
knn_matrix[i, 2] = mean(knn.pred == df_test$class)
}
# find the k that gives the highest accuracy
plot(knn_matrix,
type = "l",
col = "red",
xlab = "k",
ylab = "Accuracy")
abline(v = which.max(knn_matrix[,2]), lty = 2)
From the plot above, the best value is k = 4, which gives approximately 94% accuracy.
bestknn = knn(df_train[,-which(colnames(df_train)=="class")],
df_test[,-which(colnames(df_test)=="class")],
df_train$class,
k = which.max(knn_matrix[,2]))
conf = table(df_test$class, bestknn, dnn = c("Actual", "Predicted")) # rows = actual, as ac() expects
conf
## Predicted
## Actual other Bo|T
## other 28 1
## Bo|T 2 17
summ = rbind(summ, knn = ac(conf))
summ
## accuracy False.positive True.positive
## dtree_g 0.75 0.24 0.74
## dtree_g_prune 0.75 0.24 0.74
## dtree_e 0.9 0.14 0.95
## dtree_e_pruned 0.9 0.03 0.79
## log_d2 0.83 0.03 0.63
## svm_d1 0.92 0.03 0.84
## knn 0.94 0.03 0.89
We can see that kNN gives the best model overall: about 94% accuracy, a very low false positive rate and a high true positive rate. We therefore choose the kNN-based model.
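One caveat: kNN is distance based, so its results can depend on the scale of the expression values. Below is a minimal sketch of refitting the chosen model on standardized predictors; it is not part of the analysis above.
# Standardize predictors using training-set means and standard deviations
x_train = scale(df_train[, -which(colnames(df_train) == "class")])
x_test = scale(df_test[, -which(colnames(df_test) == "class")],
center = attr(x_train, "scaled:center"),
scale = attr(x_train, "scaled:scale"))
knn_scaled = knn(x_train, x_test, df_train$class, k = which.max(knn_matrix[, 2]))
mean(knn_scaled == df_test$class) # accuracy on the standardized data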