library(data.table) # Importing data
library(car) # model diagnostics
library(MASS) # step function for model selection
library(rpart) # decision tree
library(rattle) # decision tree plots
library(rpart.plot) # decision tree plotting
library(RColorBrewer) # coloring the decision tree
library(formatR) # code formatting
library(e1071) # support vector machines
library(corrplot) # correlation plot
library(class) # k-nearest neighbors
Determining the cancer subtype is a crucial step in treating cancer. In this analysis I compare several supervised learning techniques, namely decision trees, logistic regression, support vector machines and k-nearest neighbors, using gene expression data from patients suffering from Acute Lymphoblastic Leukemia. The data set comes from the paper “A subtype of childhood acute lymphoblastic leukemia with poor treatment outcome: a genome-wide classification study” by M. L. Den Boer et al.
The data set contains 40 genes (features) and a class label with 11 different cancer subtypes. For this analysis the subtypes are grouped into two categories: (Bo, T) and (other).
datafile = read.table("GSE13425-Bo-T.txt",check.names = F) # Importing data
df = as.data.frame(t(datafile)) # Transposing
df[,length(df)+1] = rownames(df) # Class column
for (i in 1:length(df)) { # Renaming columns
colnames(df)[i] = paste("v", i, sep = "")
if (i == length(df)){
colnames(df)[i] = "class"
}
}
attach(df)
df$class = factor((class == "T")|(class == "Bo"), # Bo or T vs others
levels = c(FALSE, TRUE),
labels = c("other", "Bo|T"))
rownames(df) = 1:nrow(df)
Since some of the algorithms, logistic regression in particular, are sensitive to multicollinearity, let’s first have a look at the correlations between the covariates.
z = cor(df[,1:40]) # Create a correlation matrix
z[lower.tri(z,diag=TRUE)] = NA # Drop duplicates
z = as.data.frame(as.table(z)) # Turn into a 3-column table
z = na.omit(z) # Get rid of NAs
z = z[order(-abs(z$Freq)),] # Sort by highest correlation
subset(z, abs(Freq) > 0.8) # Most correlated variables
## Var1 Var2 Freq
## 1124 v4 v29 0.9819464
## 1598 v38 v40 0.8862302
## 533 v13 v14 0.8852458
## 1254 v14 v32 0.8834559
## 523 v3 v14 0.8556361
## 1253 v13 v32 0.8452803
## 1122 v2 v29 0.8194641
## 122 v2 v4 0.8159920
## 883 v3 v23 -0.8134034
## 1183 v23 v30 0.8111894
## 1174 v14 v30 -0.8095060
## 1416 v16 v36 0.8048125
We can see high correlations between some of the variables. For example, v4 and v29 are almost perfectly correlated (r = 0.98).
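Since the corrplot package is already loaded, a heat-map style view of the whole correlation matrix can make these clusters of correlated genes easier to see. This is an optional sketch, not part of the original output:
# Optional: heat-map view of the correlation matrix (corrplot is loaded above);
# ordering by hierarchical clustering groups correlated genes together
corrplot(cor(df[, 1:40]),
method = "color",
order = "hclust",
tl.cex = 0.6)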
Let’s divide the data into a training and a testing set (75% vs 25%). The training set is used to fit each model and the testing set to evaluate its performance.
# Set random seed for reproducible research
set.seed(97459)
# Shuffle the data
n = nrow(df)
df = df[sample(n), ]
# Split the data in train and test
split = 0.75
df_train = df[1:round(split * n), ]
df_test = df[(round(split * n) + 1):n, ]
Let’s check whether the two sets are well balanced, that is, whether they have similar proportions of both classes.
# Check class bias
table(df$class, dnn = "Overal")## Overal
## other Bo|T
## 110 80
table(df_train$class, dnn = "Train")
## Train
## other Bo|T
## 81 61
table(df_test$class, dnn = "Test")
## Test
## other Bo|T
## 29 19
Based on the counts above, the training and testing sets are reasonably well balanced.
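As a quick additional check (not in the original code), the class proportions in each split can be compared directly:
# Class proportions in the training and testing sets
round(prop.table(table(df_train$class)), 2)
round(prop.table(table(df_test$class)), 2)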
For the decision tree I consider both the Gini index and entropy as splitting criteria. The initial trees are grown with a very small value of the complexity parameter cp.
# Build a tree model using GINI index
tree_g = rpart(class ~ .,
data = df_train,
method = "class",
control = rpart.control(cp=0.0001),
parms = list(split = "gini"))
# Build a tree model using Entropy index
tree_e = rpart(class ~ .,
data = df_train,
method = "class",
control = rpart.control(cp=0.0001),
parms = list(split = "information"))
par(mfrow = c(1,2))
# Plotting the Gini-based tree
fancyRpartPlot(tree_g,
sub = "Tree based on GINI Index")
# Plotting the entropy-based tree
fancyRpartPlot(tree_e,
sub = "Tree based on Entropy")
Based on the plots above, we can see that the tree built with entropy differs from the tree built with the Gini index. Let’s try to prune both trees.
# Cp table Gini
printcp(tree_g)
##
## Classification tree:
## rpart(formula = class ~ ., data = df_train, method = "class",
## parms = list(split = "gini"), control = rpart.control(cp = 1e-04))
##
## Variables actually used in tree construction:
## [1] v16 v21 v22 v24
##
## Root node error: 61/142 = 0.42958
##
## n= 142
##
## CP nsplit rel error xerror xstd
## 1 0.622951 0 1.00000 1.00000 0.096702
## 2 0.065574 1 0.37705 0.49180 0.079744
## 3 0.049180 2 0.31148 0.49180 0.079744
## 4 0.032787 3 0.26230 0.49180 0.079744
## 5 0.000100 4 0.22951 0.44262 0.076658
plotcp(tree_g)
Based on the plot above, we prune the Gini-based tree with a cp value of 10^{-4}, the value that minimizes the cross-validated error.
# Cp table Entropy
printcp(tree_e)
##
## Classification tree:
## rpart(formula = class ~ ., data = df_train, method = "class",
## parms = list(split = "information"), control = rpart.control(cp = 1e-04))
##
## Variables actually used in tree construction:
## [1] v21 v26 v30
##
## Root node error: 61/142 = 0.42958
##
## n= 142
##
## CP nsplit rel error xerror xstd
## 1 0.622951 0 1.00000 1.00000 0.096702
## 2 0.065574 1 0.37705 0.52459 0.081620
## 3 0.000100 3 0.24590 0.57377 0.084188
plotcp(tree_e)
For the entropy-based model the cross-validated error is minimized at cp = 0.0655738, so that is the value used for pruning.
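A common alternative to taking the cp with the smallest cross-validated error is the one-standard-error rule. The sketch below (not part of the original analysis) picks the simplest tree whose xerror lies within one standard error of the minimum:
# One-standard-error rule applied to the entropy tree's cp table
cp_tab = tree_e$cptable
min_row = which.min(cp_tab[, "xerror"])
thresh = cp_tab[min_row, "xerror"] + cp_tab[min_row, "xstd"]
cp_1se = cp_tab[which(cp_tab[, "xerror"] <= thresh)[1], "CP"]
cp_1se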
# Pruning
pruned_g = prune(tree_g, cp = tree_g$cptable[which.min(tree_g$cptable[,"xerror"]),"CP"])
pruned_e = prune(tree_e, cp = tree_e$cptable[which.min(tree_e$cptable[,"xerror"]),"CP"])
# Gini
par(mfrow = c(1,2))
fancyRpartPlot(tree_g,
sub = "Initial tree")
fancyRpartPlot(pruned_g,
sub = "Pruned tree")We can notice that for Gini based tree, the pruned tree is the same.
# Entropy
par(mfrow = c(1,2))
fancyRpartPlot(tree_e,
sub = "Initial tree")
fancyRpartPlot(pruned_e,
sub = "Pruned tree")For Entropy based tree, the pruned tree has only one split at the very beginning. We can notice that, the variable on which we are splitting is v21. Let’s look at this variable closer.
plot(df$class, df$v21)
The boxplots above show that v21 by itself already separates the two classes quite well.
# Function that returns the accuracy, false positive rate and true positive rate
# from a 2x2 confusion matrix (rows = actual classes, columns = predicted classes)
ac = function(conf) {
accuracy = round(sum(diag(conf))/sum(conf), 2)
False.positive = round(conf[1, 2]/sum(conf[1,]), 2)
True.positive = round(conf[2, 2]/sum(conf[2,]), 2)
output = list(accuracy = accuracy,
False.positive = False.positive,
True.positive = True.positive)
return(output)
}
# Using our model to predict the test set
# Gini
pred_g = predict(tree_g,
df_test,
type = "class")
pred_g_pruned = predict(pruned_g,
df_test,
type = "class")
# Entropy
pred_e = predict(tree_e,
df_test,
type = "class")
pred_e_pruned = predict(pruned_e,
df_test,
type = "class")
# Construct the confusion matrix
conf_g = table(df_test$class,
pred_g, dnn=c("Actual:", "Predicted:"))
conf_g
## Predicted:
## Actual: other Bo|T
## other 22 7
## Bo|T 5 14
conf_g_pruned = table(df_test$class,
pred_g_pruned, dnn=c("Actual:", "Predicted:"))
conf_g_pruned
## Predicted:
## Actual: other Bo|T
## other 22 7
## Bo|T 5 14
conf_e = table(df_test$class,
pred_e, dnn=c("Actual:", "Predicted:"))
conf_e
## Predicted:
## Actual: other Bo|T
## other 25 4
## Bo|T 1 18
conf_e_pruned = table(df_test$class,
pred_e_pruned, dnn=c("Actual:", "Predicted:"))
conf_e_pruned
## Predicted:
## Actual: other Bo|T
## other 28 1
## Bo|T 4 15
summ = rbind(dtree_g = ac(conf_g),
dtree_g_prune = ac(conf_g_pruned),
dtree_e = ac(conf_e),
dtree_e_pruned = ac(conf_e_pruned))
summ
## accuracy False.positive True.positive
## dtree_g 0.75 0.24 0.74
## dtree_g_prune 0.75 0.24 0.74
## dtree_e 0.9 0.14 0.95
## dtree_e_pruned 0.9 0.03 0.79
The entropy-based models clearly predict better. The pruned entropy model gives the lowest false positive rate, while the unpruned entropy model gives the highest true positive rate.
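As a quick aside (not part of the original comparison), rpart also stores a variable importance score, which shows which genes drive the entropy-based tree beyond the splits visible in the plot:
# Variable importance scores stored by rpart for the entropy-based tree
round(tree_e$variable.importance, 2)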
Next, let’s fit a logistic regression, using a stepwise model selection procedure based on the AIC criterion.
logit_full = glm(class~., data = df_train, family = binomial)
logit_null = glm(class~1, data = df_train, family = binomial)
# Stepwise selection in both directions
both = step(logit_full, scope = list(lower = logit_null, upper = logit_full), direction = "both")
Below is the model selected by the AIC criterion.
both$formula
## class ~ v3 + v4 + v6 + v7 + v12 + v19 + v20 + v22 + v23 + v24 +
## v27 + v30 + v34 + v36 + v40
# Model after the model selection procedure
logit = glm(both$formula, data=df_train, family = binomial)
summary(logit)
##
## Call:
## glm(formula = both$formula, family = binomial, data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.182e-04 -2.000e-08 -2.000e-08 2.000e-08 3.259e-04
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1881.8 278364.5 0.007 0.995
## v3 -198.6 12100.4 -0.016 0.987
## v4 -911.5 51833.6 -0.018 0.986
## v6 -547.4 31006.9 -0.018 0.986
## v7 -107.2 6369.7 -0.017 0.987
## v12 -111.5 6782.0 -0.016 0.987
## v19 1462.3 86905.6 0.017 0.987
## v20 -454.9 26355.3 -0.017 0.986
## v22 -1563.3 91095.7 -0.017 0.986
## v23 197.6 11307.3 0.017 0.986
## v24 220.3 13179.2 0.017 0.987
## v27 -700.6 39966.3 -0.018 0.986
## v30 264.0 16226.8 0.016 0.987
## v34 -251.5 14573.9 -0.017 0.986
## v36 218.2 12555.9 0.017 0.986
## v40 520.2 31749.9 0.016 0.987
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.9403e+02 on 141 degrees of freedom
## Residual deviance: 7.9146e-07 on 126 degrees of freedom
## AIC: 32
##
## Number of Fisher Scoring iterations: 25
We can see that all of the variables are insignificant, with high p-values and huge standard errors, which suggests a multicollinearity issue. Let’s check the correlations between these covariates and calculate variance inflation factors.
c = as.vector(attributes(both$terms)$term.labels)
z = cor(df_train[ , c[1:length(c)]]) # Create a correlation matrix
z[lower.tri(z,diag=TRUE)] = NA # Drop duplicates
z = as.data.frame(as.table(z)) # Turn into a 3-column table
z = na.omit(z) # Get rid of NAs
z = z[order(-abs(z$Freq)),] # Sort by highest correlation
subset(z, abs(Freq) > 0.6)
## Var1 Var2 Freq
## 121 v3 v23 -0.8158225
## 174 v23 v30 0.7891883
## 166 v3 v30 -0.7797809
## 196 v3 v36 0.7473499
## 207 v30 v36 -0.6952707
## 200 v12 v36 0.6910695
## 16 v3 v4 -0.6749409
## 167 v4 v30 0.6696646
## 144 v23 v24 -0.6373471
## 224 v36 v40 0.6367633
## 197 v4 v36 -0.6360559
## 202 v20 v36 0.6326354
## 62 v4 v12 -0.6305739
## 136 v3 v24 0.6275150
## 61 v3 v12 0.6251031
## 211 v3 v40 0.6192273
## 169 v7 v30 -0.6147119
## 159 v23 v27 -0.6067034
## 199 v7 v36 0.6034343
## 170 v12 v30 -0.6023743
## 219 v23 v40 -0.6012588
## 151 v3 v27 0.6008704
as.data.frame(vif(logit))
## vif(logit)
## v3 33.48283
## v4 1047.67976
## v6 534.38246
## v7 31.11210
## v12 61.77942
## v19 309.30714
## v20 108.44294
## v22 598.92020
## v23 116.69826
## v24 99.38380
## v27 379.05176
## v30 110.80431
## v34 141.95973
## v36 280.69093
## v40 317.73377
sqrt(vif(logit)) > 2 # Exclude v4
## v3 v4 v6 v7 v12 v19 v20 v22 v23 v24 v27 v30 v34 v36 v40
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Updated model (without v4)
logit_1 = glm(class ~ v3 + v6 + v7 + v12 + v19 + v20 + v22 + v23 + v24 +
v27 + v30 + v34 + v36 + v40, data=df_train, family = binomial(link='logit'))
summary(logit_1)
##
## Call:
## glm(formula = class ~ v3 + v6 + v7 + v12 + v19 + v20 + v22 +
## v23 + v24 + v27 + v30 + v34 + v36 + v40, family = binomial(link = "logit"),
## data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.82690 -0.23057 -0.02470 0.03454 2.85365
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.55277 26.45472 0.172 0.863362
## v3 0.22899 0.91213 0.251 0.801779
## v6 -1.82863 0.72035 -2.539 0.011132 *
## v7 -0.13874 0.45577 -0.304 0.760812
## v12 0.03635 0.33748 0.108 0.914224
## v19 2.29510 2.00935 1.142 0.253365
## v20 -1.84401 0.78685 -2.344 0.019102 *
## v22 -5.54700 1.63027 -3.403 0.000668 ***
## v23 1.15180 0.40071 2.874 0.004048 **
## v24 1.30132 0.60597 2.148 0.031753 *
## v27 -2.41564 0.88055 -2.743 0.006082 **
## v30 0.45185 0.81917 0.552 0.581222
## v34 -1.04703 0.49369 -2.121 0.033937 *
## v36 0.47742 0.36408 1.311 0.189762
## v40 1.00203 1.09295 0.917 0.359245
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 194.028 on 141 degrees of freedom
## Residual deviance: 54.778 on 127 degrees of freedom
## AIC: 84.778
##
## Number of Fisher Scoring iterations: 8
as.data.frame(vif(logit_1))
## vif(logit_1)
## v3 2.655741
## v6 2.189958
## v7 1.344187
## v12 2.076312
## v19 1.760441
## v20 1.596779
## v22 2.467322
## v23 1.763671
## v24 2.510195
## v27 2.153361
## v30 1.837583
## v34 1.241599
## v36 2.871578
## v40 2.037418
sqrt(vif(logit_1)) > 2
## v3 v6 v7 v12 v19 v20 v22 v23 v24 v27 v30 v34
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## v36 v40
## FALSE FALSE
# Reduced model keeping only the predictors that were significant in logit_1
logit_2 = glm(class ~ v6 + v20 + v22 + v23 + v24 +
v27 + v34, data=df_train, family = binomial(link='logit'))
summary(logit_2)
##
## Call:
## glm(formula = class ~ v6 + v20 + v22 + v23 + v24 + v27 + v34,
## family = binomial(link = "logit"), data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.85125 -0.24869 -0.02757 0.03701 2.99127
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 29.9413 7.5720 3.954 7.68e-05 ***
## v6 -1.5231 0.5059 -3.010 0.002609 **
## v20 -1.2236 0.6156 -1.988 0.046856 *
## v22 -4.5785 1.2437 -3.681 0.000232 ***
## v23 1.2175 0.3592 3.390 0.000700 ***
## v24 1.2115 0.4884 2.481 0.013110 *
## v27 -1.9812 0.7030 -2.818 0.004830 **
## v34 -0.9210 0.4545 -2.027 0.042711 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 194.028 on 141 degrees of freedom
## Residual deviance: 59.667 on 134 degrees of freedom
## AIC: 75.667
##
## Number of Fisher Scoring iterations: 8
# Checking whether the insignificant variables can be dropped from the model
anova(logit_1, logit_2, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: class ~ v3 + v6 + v7 + v12 + v19 + v20 + v22 + v23 + v24 + v27 +
## v30 + v34 + v36 + v40
## Model 2: class ~ v6 + v20 + v22 + v23 + v24 + v27 + v34
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 127 54.778
## 2 134 59.667 -7 -4.8892 0.6735
Based on the VIF criterion we eliminate v4, which has by far the largest VIF. After removing it the multicollinearity issue disappears: all remaining VIFs are small. However, several coefficients are still insignificant, so to check whether they are needed we compare the full and reduced models with an analysis of deviance (likelihood ratio) test. The non-significant p-value above (0.67) shows that the reduced model fits about as well as the full one, so the insignificant variables can be dropped.
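As a simple cross-check (a sketch, not in the original code), the two models can also be compared directly on AIC; the reduced model should have the lower value:
# Direct AIC comparison of the full and reduced logistic models
AIC(logit_1, logit_2)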
log_pred_df = predict(logit_2, # model chosen by the selection procedure
df_test, # predicting on the testing data
type="response")
optim = function(precision, # number of thresholds to evaluate
pred, # predicted probabilities
dataset) { # data set containing the true classes
accuracy = matrix(0, nrow = precision, ncol = 4)
accur = 0
treshold = 0
tp = 0
fp = 1
k = 1/precision
for (i in 1:precision) {
f = factor(pred > i*k,
levels = c(FALSE, TRUE),
labels = c("other", "Bo|T"))
t = table(dataset$class, f, dnn = c("Actual", "Predicted"))
accuracy[i, 1] = i*k
accuracy[i, 2] = sum(diag(t)/sum(t))
accuracy[i, 3] = t[1, 2]/sum(t[1,])
accuracy[i, 4] = t[2, 2]/sum(t[2,])
if (accuracy[i, 2] > accur) {
accur = accuracy[i, 2]
treshold = accuracy[i, 1]
co = t
fp = accuracy[i, 3]
tp = accuracy[i, 4]
}
}
output = list(highest_accuracy = accur, # Highest accuracy
optimal_treshold = treshold, # Optimal threshold
matrix = accuracy, # All possible models
False.positive = fp,
True.positive = tp,
conf = co)
return(output)
}
opt_d1 = optim(1000, log_pred_df, df_test)
The plot below shows how the accuracy changes as the threshold varies.
# Ploting Accuracy vs Threshold
plot(x = opt_d1$matrix[, 1],
y = opt_d1$matrix[, 2],
col = ifelse(opt_d1$matrix[, 1] == opt_d1$optimal_treshold, "red", "black"),
ylab = "Accuracy",
xlab = "Treshold",
type = 'l',
main = "Best treshold for log_d2")
abline(v = opt_d1$optimal_treshold, lty = 2)Based on the above plot, we can see that Accuracy maximizes when threshold is 0.604.
# The best combination of threshold and accuracy
data.frame("Optimal Threshold" = opt_d1$optimal_treshold,
"Highest Accuracy" = opt_d1$highest_accuracy,
"False.positive" = opt_d1$False.positive,
"True.positive" = opt_d1$True.positive)
## Optimal.Threshold Highest.Accuracy False.positive True.positive
## 1 0.604 0.8333333 0.03448276 0.6315789
summ = rbind(summ, log_d2 = ac(opt_d1$conf))
Based on the results above, logistic regression achieves a very low false positive rate, but its accuracy is lower than that of the entropy-based decision tree.
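An alternative to scanning thresholds by hand is to summarize the threshold trade-off with an ROC curve. The sketch below assumes the pROC package is installed; it is not used anywhere in the original analysis:
# Sketch, assuming pROC is available (not used above)
library(pROC)
roc_obj = roc(response = df_test$class,
predictor = log_pred_df,
levels = c("other", "Bo|T"), # controls, cases
direction = "<") # higher fitted probability = more likely Bo|T
plot(roc_obj) # ROC curve
auc(roc_obj) # area under the curve
coords(roc_obj, "best", ret = c("threshold", "accuracy")) # Youden-optimal cutoff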
To find the best SVM model, we use a tuning procedure with a linear kernel.
set.seed(11)
# Tuning svm fom d1
tune = tune(svm, #ten-fold cross-validation on a set of models of interest.
class ~ .,
data = df_train,
kernel = "linear",
ranges = list(cost = seq(0.01, 5, 0.1)))
# Assigning the best model based on cost component
bestmod = tune$best.model
summary(bestmod)
##
## Call:
## best.tune(method = svm, train.x = class ~ ., data = df_train,
## ranges = list(cost = seq(0.01, 5, 0.1)), kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.11
## gamma: 0.025
##
## Number of Support Vectors: 51
##
## ( 26 25 )
##
##
## Number of Classes: 2
##
## Levels:
## other Bo|T
The best model has a cost value of 0.11 and uses 51 support vectors.
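To see how sensitive this result is to the cost parameter, the cross-validation errors stored in the tuning object can be plotted (a quick aside, not in the original output):
# Cross-validation error as a function of cost
plot(tune)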
# Predicting on testing set
pred_svm = predict(bestmod, df_test)
svm.conf = table(df_test$class,
pred_svm,
dnn = c("Actual", "Predicted"))
svm.conf
## Predicted
## Actual other Bo|T
## other 28 1
## Bo|T 3 16
summ = rbind(summ,
svm_d1 = ac(svm.conf))
summ
## accuracy False.positive True.positive
## dtree_g 0.75 0.24 0.74
## dtree_g_prune 0.75 0.24 0.74
## dtree_e 0.9 0.14 0.95
## dtree_e_pruned 0.9 0.03 0.79
## log_d2 0.83 0.03 0.63
## svm_d1 0.92 0.03 0.84
Based on the results above, the SVM performs very well; it even beats the entropy-based decision tree in terms of accuracy.
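For comparison, the same tuning procedure can be run with a radial kernel. The parameter grid below is an illustrative choice, not taken from the original analysis:
# Sketch: tuning an SVM with a radial kernel over an illustrative grid
set.seed(11)
tune_rbf = e1071::tune(svm, # explicit namespace, since "tune" is reused as an object above
class ~ .,
data = df_train,
kernel = "radial",
ranges = list(cost = c(0.1, 1, 10),
gamma = c(0.01, 0.05, 0.1)))
pred_rbf = predict(tune_rbf$best.model, df_test)
ac(table(df_test$class, pred_rbf, dnn = c("Actual", "Predicted")))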
Finally, for k-nearest neighbors we search for the value of k that gives the highest accuracy on the test set.
knn_matrix = matrix(0, nrow = 15, ncol = 2, byrow = TRUE)
set.seed(5)
# Find the best k in terms of highest accuracy
for (i in 1:15) {
knn.pred = knn(df_train[,-which(colnames(df_train)=="class")],
df_test[,-which(colnames(df_test)=="class")],
df_train$class,
k = i)
knn_matrix[i, 1] = i
knn_matrix[i, 2] = mean(knn.pred == df_test$class)
}
# find the k that gives the highest accuracy
plot(knn_matrix,
type = "l",
col = "red",
xlab = "k",
ylab = "Accuracy")
abline(v = which.max(knn_matrix[,2]), lty = 2)
From the plot above, the best value is k = 4, which gives approximately 94% accuracy.
bestknn = knn(df_train[,-which(colnames(df_train)=="class")],
df_test[,-which(colnames(df_test)=="class")],
df_train$class,
k = which.max(knn_matrix[,2]))
conf = table(df_test$class, bestknn, dnn = c("Actual", "Predicted")) # rows = actual, as ac() expects
conf
## Predicted
## Actual other Bo|T
## other 28 1
## Bo|T 2 17
summ = rbind(summ, knn = ac(conf))
summ
## accuracy False.positive True.positive
## dtree_g 0.75 0.24 0.74
## dtree_g_prune 0.75 0.24 0.74
## dtree_e 0.9 0.14 0.95
## dtree_e_pruned 0.9 0.03 0.79
## log_d2 0.83 0.03 0.63
## svm_d1 0.92 0.03 0.84
## knn 0.94 0.03 0.89
We can see that kNN gives the best model overall: about 94% accuracy, a very low false positive rate and a high true positive rate. We therefore choose the kNN-based model.
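One caveat: kNN is distance based, so its results can depend on the scale of the expression values. Below is a minimal sketch of refitting the chosen model on standardized predictors; it is not part of the analysis above.
# Standardize predictors using training-set means and standard deviations
x_train = scale(df_train[, -which(colnames(df_train) == "class")])
x_test = scale(df_test[, -which(colnames(df_test) == "class")],
center = attr(x_train, "scaled:center"),
scale = attr(x_train, "scaled:scale"))
knn_scaled = knn(x_train, x_test, df_train$class, k = which.max(knn_matrix[, 2]))
mean(knn_scaled == df_test$class) # accuracy on the standardized data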