Exercise 1

In this exercise you will reproduce the textbook's comparison of support vector machines with different kernels on the heart data.

  i. You will first tune an SVM with a radial basis function (RBF) kernel with respect to its two parameters \(\,C\) and \(\,\gamma\) (the RBF parameter \(\,\gamma\) is called sigma in the kernlab ksvm implementation used below).
library(mlr, quietly = TRUE)
## 'mlr' is in maintenance mode since July 2019. Future development
## efforts will go into its successor 'mlr3' (<https://mlr3.mlr-org.com>).
library(irace, quietly = TRUE)

setwd("D:/gdrive/_PM591 Machine learning/Assignment")
heart = read.csv('Heart.csv')
heart = heart[complete.cases(heart), ]
heart.tsk = makeClassifTask(id = "heart disease", 
                            data = heart, target = "AHD")

# split data into train/test
split_desc = makeResampleDesc(method='Holdout', stratify = TRUE, split=0.7)
set.seed(2020)
split = makeResampleInstance(split_desc, task = heart.tsk)
trainid = split$train.inds[[1]]; testid = split$test.inds[[1]]
train = heart[trainid, ]; test = heart[-trainid, ]

# tune an SVM with a radial basis function (RBF) kernel
rbfsvm.lrn = makeLearner("classif.ksvm", par.vals = list(kernel = "rbfdot"), 
                         predict.type = "prob")

cv5.stratified = makeResampleDesc("CV", iters=5, stratify = TRUE) 

ctrl = makeTuneControlIrace(maxExperiments = 200L)

ps = makeParamSet(
  makeDiscreteParam("C", values = seq(1e-6, 5, length=10)),
  makeDiscreteParam("sigma", values = c(1e-3, 1e-2, 1e-1))
)

svm.tune = tuneParams(rbfsvm.lrn, subsetTask(heart.tsk, trainid), 
                            cv5.stratified, measures=auc, 
                            par.set = ps, control = ctrl, show.info = FALSE)
svm.tune
## Tune result:
## Op. pars: C=2.2222227777...; sigma=0.01
## auc.test.mean=0.9186191
  ii. Retrain on the training set using a) the optimal set of parameters (best CV AUC) and b) a set of parameters with the worst CV AUC. Predict on the test set using both sets of parameters, plot the corresponding test ROC curves, and compute the test AUCs. (Hint: remember the mlr function plotROCCurves.) Based on these results, does parameter tuning make a big difference in performance?
optimal.param = setHyperPars(rbfsvm.lrn, par.vals = svm.tune$x)
svm.tr.best = train(optimal.param, heart.tsk, subset=trainid)
svm.pred.best = predict(svm.tr.best, newdata=test, type='prob')
# find the worst parameter (which returns the minimum AUC.test.mean)
svm.opt.path = as.data.frame(svm.tune$opt.path)[1:3]
svm.opt.path[which.min(svm.opt.path$auc.test.mean),]
# C: 3.88888911111111   , sigma = 0.1

worst.param = setHyperPars(rbfsvm.lrn, par.vals = list(C = 3.88888911111111 , sigma = 0.1 ))
svm.tr.worst = train(worst.param, heart.tsk, subset=trainid)
svm.pred.worst = predict(svm.tr.worst, newdata=test,type='prob')
df = generateThreshVsPerfData(list(best = svm.pred.best, worst = svm.pred.worst), measures = list(fpr, tpr))
plotROCCurves(df)

rbind(best = performance(svm.pred.best, auc), worst = performance(svm.pred.worst, auc))
##             auc
## best  0.8794643
## worst 0.8422619
  iii. Retrain on the full dataset (using the optimal parameter set only).
svm.final = train(optimal.param, heart.tsk)
  iv. The code below extends the comparison above to include an RBF kernel, a polynomial kernel, and the linear kernel (i.e. the support vector classifier). The RBF, linear, and polynomial kernels share the cost parameter \(\,C\) but also have additional non-shared tuning parameters (the polynomial degree \(\,d\) is a tuning parameter only for the polynomial kernel and \(\,\gamma\) is a tuning parameter only for the RBF kernel). The code below constructs a parameter set that enables tuning both shared and non-shared parameters all at once!
ps.extended = makeParamSet(
  makeDiscreteParam("kernel", values = c("vanilladot", "polydot", "rbfdot")),
  makeDiscreteParam("C", values = seq(1e-6, 5, length=10) ),
  makeDiscreteParam("sigma", values = c(1e-3, 1e-2, 1e-1), requires = quote(kernel == "rbfdot") ), 
  makeIntegerParam("degree", lower = 1L, upper = 5L, requires = quote(kernel == "polydot") )
)

Tune the SVM learner using this extended parameter set ps.extended.

svm.lrn = makeLearner("classif.ksvm", predict.type = "prob")

svm.tune.extd = tuneParams(svm.lrn, subsetTask(heart.tsk, trainid), cv5.stratified, 
                                measures=auc, par.set = ps.extended, control = ctrl, show.info = FALSE)
  v. Explore a wider range of values for \(\,C\), \(\,\gamma\), and \(\,d\) and repeat ii. and iii. Which kernel performed best? What may this say about the nature of the true decision boundary? (A sketch of one possible wider search space is shown below.)
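One way to widen the search (a sketch; these ranges and the log2 transformation are illustrative choices rather than values specified by the assignment) is to replace the discrete grids with numeric parameters:

# a wider search space (illustrative): C and sigma on a log2 scale, higher polynomial degrees
ps.wide = makeParamSet(
  makeDiscreteParam("kernel", values = c("vanilladot", "polydot", "rbfdot")),
  makeNumericParam("C", lower = -10, upper = 10, trafo = function(x) 2^x),
  makeNumericParam("sigma", lower = -10, upper = 2, trafo = function(x) 2^x,
                   requires = quote(kernel == "rbfdot")),
  makeIntegerParam("degree", lower = 1L, upper = 10L,
                   requires = quote(kernel == "polydot"))
)
# ps.wide can be passed to tuneParams() in place of ps.extended in the call above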
svm.opt.path = as.data.frame(svm.tune.extd$opt.path)

# RBF: pick the parameter combination with the highest CV AUC
svm.opt.path.rbf = subset(svm.opt.path, kernel == "rbfdot")
bestC = as.numeric(as.character(svm.opt.path.rbf$C[which.max(svm.opt.path.rbf$auc.test.mean)]))
bestsigma = as.numeric(as.character(svm.opt.path.rbf$sigma[which.max(svm.opt.path.rbf$auc.test.mean)]))

optimal.param.rbf = setHyperPars(svm.lrn, par.vals = list(kernel = "rbfdot", C = bestC, sigma = bestsigma))
svm.tr.rbf = train(optimal.param.rbf, heart.tsk, subset = trainid)
svm.pred.rbf = predict(svm.tr.rbf, newdata = test, type = 'prob')

# polynomial: pick the parameter combination with the highest CV AUC
svm.opt.path.poly = subset(svm.opt.path, kernel == "polydot")
bestC = as.numeric(as.character(svm.opt.path.poly$C[which.max(svm.opt.path.poly$auc.test.mean)]))
bestd = as.numeric(as.character(svm.opt.path.poly$degree[which.max(svm.opt.path.poly$auc.test.mean)]))

optimal.param.poly = setHyperPars(svm.lrn, par.vals = list(kernel = "polydot", C = bestC, degree = bestd))
svm.tr.poly = train(optimal.param.poly, heart.tsk, subset = trainid)
svm.pred.poly = predict(svm.tr.poly, newdata = test, type = 'prob')

# linear: pick the cost value with the highest CV AUC
svm.opt.path.ln = subset(svm.opt.path, kernel == "vanilladot")
bestC = as.numeric(as.character(svm.opt.path.ln$C[which.max(svm.opt.path.ln$auc.test.mean)]))

optimal.param.ln = setHyperPars(svm.lrn, par.vals = list(kernel = "vanilladot", C = bestC))
svm.tr.ln = train(optimal.param.ln, heart.tsk, subset = trainid)
svm.pred.ln = predict(svm.tr.ln, newdata = test, type = 'prob')

# Compare test AUC of the three svm models
df = generateThreshVsPerfData(list(rbf = svm.pred.rbf, poly = svm.pred.poly, linear = svm.pred.ln), 
                              measures = list(fpr, tpr))
plotROCCurves(df)

data.frame(model = c("RBF", "Polynomial", "Linear"),
           testAUC = c(performance(svm.pred.rbf, auc),
                       performance(svm.pred.poly, auc),
                       performance(svm.pred.ln, auc)))
# Train the final model on the full dataset using the selected (polynomial-kernel) parameters
svm.final = train(optimal.param.poly, heart.tsk)

Exercise 2

Train SVMs on the metabric data in a similar way to part iv of Exercise 1 and report your results. Compare the performance to that of lasso/ridge logistic regression (recall the similarity between an SVM with a linear kernel and logistic ridge regression).
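To make the connection explicit (recalled here for reference, with the outcome coded as \(\,y_i \in \{-1, +1\}\)): the linear-kernel SVM and ridge logistic regression both minimize a loss plus an L2 penalty and differ only in the loss function,

\[
\text{SVM (hinge loss):}\qquad \min_{\beta_0,\,\beta}\; \sum_{i=1}^{n} \max\!\bigl[0,\; 1 - y_i\,(\beta_0 + x_i^{\top}\beta)\bigr] \;+\; \lambda \lVert \beta \rVert_2^2,
\]

\[
\text{Ridge logistic regression:}\qquad \min_{\beta_0,\,\beta}\; \sum_{i=1}^{n} \log\!\bigl[1 + \exp\!\bigl(-y_i\,(\beta_0 + x_i^{\top}\beta)\bigr)\bigr] \;+\; \lambda \lVert \beta \rVert_2^2.
\]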

load('metabric.Rdata') 

# specify the task and split data into train/test 
metabric.tsk = makeClassifTask(id = "One-year breast-cancer mortality", data = metabric, 
                               target = "y")
split.desc = makeResampleDesc(method='Holdout', stratify = TRUE, split=0.7)
set.seed(2020)
split = makeResampleInstance(split.desc, task = metabric.tsk)
trainid = split$train.inds[[1]]; testid = split$test.inds[[1]]
train = metabric[trainid,]; test = metabric[-trainid,]

# tune parameters for svm models
ps.extended = makeParamSet(
  makeDiscreteParam("kernel", values = c("vanilladot", "polydot", "rbfdot")),
  makeDiscreteParam("C", values = seq(1e-6, 5, length=10) ),
  makeDiscreteParam("sigma", values = c(1e-3, 1e-2, 1e-1), 
                    requires = quote(kernel == "rbfdot") ), 
  makeIntegerParam("degree", lower = 1L, upper = 5L, 
                   requires = quote(kernel == "polydot") )
)

cv5.stratified = makeResampleDesc("CV", iters=5, stratify = TRUE) 
ctrl = makeTuneControlIrace(maxExperiments = 200L)

svm.lrn = makeLearner("classif.ksvm", predict.type = "prob")
svm.tune.extd = tuneParams(svm.lrn, subsetTask(metabric.tsk, trainid), cv5.stratified, 
                                measures=auc, par.set = ps.extended, control = ctrl, show.info = FALSE)

# find the optimal parameters for the selected svm model and predict on the test set
optimal.param = setHyperPars(svm.lrn, par.vals = svm.tune.extd$x)
svm.tr = train(optimal.param, metabric.tsk, subset=trainid)
svm.pred = predict(svm.tr, newdata=test, type='prob')

# ridge regression with lambda.1se (ridge performed better than lasso in the previous exercise: Assignment 4)
ridge.lrn = makeLearner("classif.cvglmnet", fix.factors.prediction=TRUE, alpha=0, 
                          type.measure = 'auc')
ridge.tr = train(ridge.lrn, task = metabric.tsk, subset = trainid)

lambda.1se = ridge.tr$learner.model$lambda.1se
ridge.1se = makeLearner("classif.glmnet", lambda = lambda.1se, fix.factors.prediction = TRUE, 
                        predict.type = "prob", alpha = 0)

ridge.1se.tr = train(learner = ridge.1se, task = metabric.tsk, subset = trainid)
ridge.pred = predict(ridge.1se.tr, type = "prob", task = metabric.tsk, subset = testid)

# plot the test AUC of svm and ridge
df = generateThreshVsPerfData(list(svm = svm.pred, ridge = ridge.pred), measures = list(fpr, tpr))
plotROCCurves(df)

# compare the results
rbind(svm = performance(svm.pred, auc), ridge = performance(ridge.pred, auc))
##                    auc
## svm   0.75493572084481
## ridge 0.79602846648301
  • For the metabric data, ridge regression performs better than the SVM in terms of test AUC. Since ridge (and lasso) logistic regression and the linear-kernel SVM minimize similar penalized loss functions, ridge may be preferred here not only because of its better test AUC but also because of its lower running time.
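The running-time claim can also be checked directly. A minimal sketch (timings are machine-dependent, and this times only the final model fits; the irace tuning step above dominates the SVM's overall cost):

# rough timing comparison of the final fits on the training split (machine-dependent)
time.svm = system.time(train(optimal.param, metabric.tsk, subset = trainid))
time.ridge = system.time(train(ridge.1se, metabric.tsk, subset = trainid))
c(svm = unname(time.svm["elapsed"]), ridge = unname(time.ridge["elapsed"]))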

Exercise 3

Compute PCs on the metabric data (features only). Plot the cumulative variance explained by the PCs. Plot the first PC against the second PC, distinguishing the two classes by color. Repeat for PC1 vs. PC3 and comment on whether the classes are well separated by the first 3 PCs.

library(MASS, quietly = TRUE)
library(dplyr, quietly = TRUE)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2, quietly = TRUE)

# compute PCs
load('metabric.Rdata') 
metabric.ft = metabric[, -1]

# detailed method: eigendecomposition of the covariance matrix
# (eigenvectors = PC loadings, eigenvalues = PC variances)
the_cov <- cov(metabric.ft)
the_eigen <- eigen(the_cov)

# simplified method: prcomp() performs the same decomposition (centering by default)
pc.metabric = prcomp(metabric.ft)
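As a quick sanity check (a sketch; eigenvector signs are arbitrary, so absolute values are compared), the loadings from the two approaches should agree:

# loadings from eigen(cov(X)) and prcomp() should match up to sign
max(abs(abs(the_eigen$vectors[, 1:3]) - abs(pc.metabric$rotation[, 1:3])))
# expected to be numerically close to 0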

# plot the cumulative variance explained by the PCs
plot(cumsum(100*pc.metabric$sdev^2/sum(pc.metabric$sdev^2)), col = 'red4', 
     xlab = "PC index", ylab = "Cumulative variance explained (%)")

# plot function: project the features onto two PC directions and color points by class
# (features are not centered here, which shifts the scores but not the shape of the scatter)
pcplot = function(pc1, pc2) {
  map <- 
    as.matrix(metabric.ft) %*% the_eigen$vectors[, c(pc1, pc2)] %>% 
    as.data.frame()

  plotting_df <- bind_cols(map, data.frame(group = metabric$y))

  ggplot(plotting_df, aes(x = V1, y = V2, color = group)) +
    geom_point() +
    xlab(paste("PC", pc1)) + ylab(paste("PC", pc2))
}

# plot PC1 and PC2
pcplot(1, 2)

# plot PC1 and PC3
pcplot(1, 3)

  • No, the classes are not well separated by the first 3 PCs.

Exercise 4

Compare hierarchical clustering on the TCGA pan-cancer data (features only) using complete, single, and average linkage.

library(purrr, quietly = TRUE)
library(cluster, quietly = TRUE)
library(fpc, quietly = TRUE)

# Import data & pre-analysis
pancancer = read.csv("TCGA_pancancer.csv")
pancancer.labs = read.csv("TCGA_pancancer_labels.csv")
pancancer.labs = factor(pancancer.labs$Class)
table(pancancer.labs)
## pancancer.labs
## BRCA COAD KIRC LUAD PRAD 
##  300   78  146  141  136
dim(pancancer)
## [1]  801 1000
head(pancancer)
# vector of linkage methods to compare
m <- c("average", "single", "complete")
names(m) <- m
 
# function to compute the agglomerative coefficient (AC) for a given linkage method
ac <- function(x) {
  agnes(pancancer, method = x)$ac
}

# compare hierarchical clustering using complete, single, and average linkage
map_dbl(m, ac)
##          average           single         complete 
## 0.54086384044131 0.45156281687015 0.63439079114398
  • The agglomerative coefficient (AC) summarizes the strength of the clustering structure found: for each observation it compares the dissimilarity at which that observation is first merged to the dissimilarity of the final merge, and averages one minus these ratios, so values closer to 1 indicate stronger structure. Complete linkage is chosen since it has the highest AC (a small illustration of this definition is sketched below).
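To make this definition concrete, a minimal sketch (assuming the equivalent hclust representation of the complete-linkage hierarchy) that recomputes the AC by hand as the average of one minus each observation's first-merge height divided by the final merge height:

# recompute the agglomerative coefficient by hand from an hclust fit (illustration)
hc.complete = hclust(dist(pancancer), method = "complete")
n = nrow(pancancer)
first.height = sapply(seq_len(n), function(i)
  hc.complete$height[which(hc.complete$merge == -i, arr.ind = TRUE)[1, "row"]])
mean(1 - first.height / max(hc.complete$height))
# should essentially reproduce agnes(pancancer, method = "complete")$ac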
# plot the dendrogram
hc <- agnes(pancancer, method = "complete")
pltree(hc, cex = 0.6, hang = -1, main = "Dendrogram")

# choose the best K based on the average silhouette width (computed with PAM)
clust <- numeric(20)
for (k in 2:20) {
  clust[k] <- pam(pancancer, k)$silinfo$avg.width
}
k.best <- which.max(clust)
cat("The best K with the maximum average silhouette width is", k.best, "\n")
## The best K with the maximum average silhouette width is 4
hc.bestk <- hclust(dist(pancancer),method="complete")
plot(hc.bestk, cex = 0.6, hang = -1)
rect.hclust(hc.bestk,k=4,border="red")
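Since K was selected above with PAM rather than with the hierarchical solution itself, a quick cross-check (a sketch, not required by the exercise) is the average silhouette width of the complete-linkage cuts:

# average silhouette width for cuts of the complete-linkage dendrogram (cross-check)
d.pan = dist(pancancer)
sil.width = sapply(2:20, function(k)
  mean(silhouette(cutree(hc.bestk, k), d.pan)[, "sil_width"]))
(2:20)[which.max(sil.width)]   # best K according to the hierarchical cuts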