In this exercise you will reproduce the textbook comparison of support vector machines with different kernels on the heart data.
library(mlr, quietly = TRUE)
## 'mlr' is in maintenance mode since July 2019. Future development
## efforts will go into its successor 'mlr3' (<https://mlr3.mlr-org.com>).
library(irace, quietly = TRUE)
setwd("D:/gdrive/_PM591 Machine learning/Assignment")
heart = read.csv('Heart.csv')
heart = heart[complete.cases(heart), ]
heart.tsk = makeClassifTask(id = "heart disease",
data = heart, target = "AHD")
# split data into train/test
split_desc = makeResampleDesc(method='Holdout', stratify = TRUE, split=0.7)
set.seed(2020)
split = makeResampleInstance(split_desc, task = heart.tsk)
trainid = split$train.inds[[1]]; testid = split$test.inds[[1]]
train = heart[trainid, ]; test = heart[-trainid, ]
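As a quick sanity check (not shown in the original output), the stratified holdout should preserve the class balance in both splits:
prop.table(table(train$AHD))  # class proportions in the training set
prop.table(table(test$AHD))   # should closely match the training proportions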
# tune an svm with a radial basis function (RBF) kernel
rbfsvm.lrn = makeLearner("classif.ksvm", par.vals = list(kernel = "rbfdot"),
predict.type = "prob")
cv5.stratified = makeResampleDesc("CV", iters=5, stratify = TRUE)
ctrl = makeTuneControlIrace(maxExperiments = 200L)
ps = makeParamSet(
makeDiscreteParam("C", values = seq(1e-6, 5, length=10)),
makeDiscreteParam("sigma", values = c(1e-3, 1e-2, 1e-1))
)
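For reference, mlr can also tune by exhaustive grid search over the same discrete parameter set; a minimal sketch (it evaluates all 30 C/sigma combinations instead of irace's adaptive racing, so it is slower but deterministic):
ctrl.grid = makeTuneControlGrid()
svm.tune.grid = tuneParams(rbfsvm.lrn, subsetTask(heart.tsk, trainid), cv5.stratified,
                           measures = auc, par.set = ps, control = ctrl.grid, show.info = FALSE)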
svm.tune = tuneParams(rbfsvm.lrn, subsetTask(heart.tsk, trainid),
cv5.stratified, measures=auc,
par.set = ps, control = ctrl, show.info = FALSE)
svm.tune
## Tune result:
## Op. pars: C=2.2222227777...; sigma=0.01
## auc.test.mean=0.9186191
Train the svm with the optimal parameters, predict on the test set, and compare the resulting ROC curve to that of the worst parameter combination using the mlr function plotROCCurves. Based on these results, does parameter tuning make a big difference in performance?
optimal.param = setHyperPars(rbfsvm.lrn, par.vals = svm.tune$x)
svm.tr.best = train(optimal.param, heart.tsk, subset=trainid)
svm.pred.best = predict(svm.tr.best, newdata=test, type='prob')
# find the worst parameter combination (the one with the minimum auc.test.mean)
svm.opt.path = as.data.frame(svm.tune$opt.path)[1:3]
svm.opt.path[which.min(svm.opt.path$auc.test.mean),]
# worst: C = 3.88888911111111, sigma = 0.1
worst.param = setHyperPars(rbfsvm.lrn, par.vals = list(C = 3.88888911111111 , sigma = 0.1 ))
svm.tr.worst = train(worst.param, heart.tsk, subset=trainid)
svm.pred.worst = predict(svm.tr.worst, newdata=test,type='prob')
df = generateThreshVsPerfData(list(best = svm.pred.best, worst = svm.pred.worst), measures = list(fpr, tpr))
plotROCCurves(df)
rbind(best = performance(svm.pred.best, auc), worst = performance(svm.pred.worst, auc))
##             auc
## best  0.8794643
## worst 0.8422619
Tuning makes a noticeable difference here: the best CV-selected parameters improve the test AUC from 0.842 to 0.879 relative to the worst configuration.
# refit the tuned model on the full heart data
svm.final = train(optimal.param, heart.tsk)
ps.extended = makeParamSet(
makeDiscreteParam("kernel", values = c("vanilladot", "polydot", "rbfdot")),
makeDiscreteParam("C", values = seq(1e-6, 5, length=10) ),
makeDiscreteParam("sigma", values = c(1e-3, 1e-2, 1e-1), requires = quote(kernel == "rbfdot") ),
makeIntegerParam("degree", lower = 1L, upper = 5L, requires = quote(kernel == "polydot") )
)
Tune the svm learner using this extended set of parameters ps.extended (the requires arguments make sigma and degree conditional, so each is only sampled when the matching kernel is active).
svm.lrn = makeLearner("classif.ksvm", predict.type = "prob")
svm.tune.extd = tuneParams(svm.lrn, subsetTask(heart.tsk, trainid), cv5.stratified,
measures=auc, par.set = ps.extended, control = ctrl, show.info = FALSE)
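Before digging into the per-kernel optimization path, the overall winner can be read directly off the tune result:
svm.tune.extd$x  # the selected kernel and its hyperparameters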
svm.opt.path = as.data.frame(svm.tune.extd$opt.path)
# RBF
svm.opt.path.rbf = subset(svm.opt.path, kernel=="rbfdot")
# discrete parameter values are stored as factors in the opt.path, hence as.numeric(as.character(...))
bestC = as.numeric(as.character(svm.opt.path.rbf$C[which.max(svm.opt.path.rbf$auc.test.mean)]))
bestsigma = as.numeric(as.character(svm.opt.path.rbf$sigma[which.max(svm.opt.path.rbf$auc.test.mean)]))
optimal.param.rbf = setHyperPars(svm.lrn, par.vals = list(kernel = "rbfdot", C = bestC, sigma = bestsigma))
svm.tr.rbf = train(optimal.param.rbf, heart.tsk, subset = trainid)
svm.pred.rbf = predict(svm.tr.rbf, newdata = test, type = 'prob')
# polynomial
svm.opt.path.poly = subset(svm.opt.path, kernel=="polydot")
bestC = as.numeric(as.character(svm.opt.path.poly$C[which.max(svm.opt.path.poly$auc.test.mean)]))
bestd = as.numeric(as.character(svm.opt.path.poly$degree[which.max(svm.opt.path.poly$auc.test.mean)]))
optimal.param.poly = setHyperPars(svm.lrn, par.vals = list(kernel = "polydot", C = bestC, degree = bestd))
svm.tr.poly = train(optimal.param.poly, heart.tsk, subset = trainid)
svm.pred.poly = predict(svm.tr.poly, newdata = test, type = 'prob')
# linear
svm.opt.path.ln = subset(svm.opt.path, kernel=="vanilladot")
bestC = as.numeric(as.character(svm.opt.path.ln$C[which.max(svm.opt.path.ln$auc.test.mean)]))
# the linear kernel must be set explicitly: ksvm defaults to rbfdot
optimal.param.ln = setHyperPars(svm.lrn, par.vals = list(kernel = "vanilladot", C = bestC))
svm.tr.ln = train(optimal.param.ln, heart.tsk, subset = trainid)
svm.pred.ln = predict(svm.tr.ln, newdata = test, type = 'prob')
# Compare test AUC of the three svm models
df = generateThreshVsPerfData(list(rbf = svm.pred.rbf, poly = svm.pred.poly, linear = svm.pred.ln),
measures = list(fpr, tpr))
plotROCCurves(df)
data.frame(model = c("RBF", "Polynomial", "Linear"),
testAUC = c(performance(svm.pred.rbf, auc),
performance(svm.pred.poly, auc), performance(svm.pred.ln, auc)))
# train the final model on the full data using the selected (polynomial-kernel) parameters
svm.final = train(optimal.param.poly, heart.tsk)
Train svms on the metabric data in a similar way to part iv of Exercise 1 and report your results. Compare the performance to that of lasso/ridge logistic regression (recall the similarity between an SVM with a linear kernel and ridge logistic regression: both fit an L2-penalized linear decision boundary and differ only in the loss function, hinge vs. logistic).
load('metabric.Rdata')
# specify the task and split data into train/test
metabric.tsk = makeClassifTask(id = "One-year breast-cancer mortality", data = metabric,
target = "y")
split.desc = makeResampleDesc(method='Holdout', stratify = TRUE, split=0.7)
set.seed(2020)
split = makeResampleInstance(split.desc, task = metabric.tsk)
trainid = split$train.inds[[1]]; testid = split$test.inds[[1]]
train = metabric[trainid,]; test = metabric[-trainid,]
# tune parameters for svm models
ps.extended = makeParamSet(
makeDiscreteParam("kernel", values = c("vanilladot", "polydot", "rbfdot")),
makeDiscreteParam("C", values = seq(1e-6, 5, length=10) ),
makeDiscreteParam("sigma", values = c(1e-3, 1e-2, 1e-1),
requires = quote(kernel == "rbfdot") ),
makeIntegerParam("degree", lower = 1L, upper = 5L,
requires = quote(kernel == "polydot") )
)
cv5.stratified = makeResampleDesc("CV", iters=5, stratify = TRUE)
ctrl = makeTuneControlIrace(maxExperiments = 200L)
svm.lrn = makeLearner("classif.ksvm", predict.type = "prob")
svm.tune.extd = tuneParams(svm.lrn, subsetTask(metabric.tsk, trainid), cv5.stratified,
measures=auc, par.set = ps.extended, control = ctrl, show.info = FALSE)
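As before, the selected kernel and hyperparameters can be inspected before refitting:
svm.tune.extd$x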
# find the optimal parameters for the selected svm model and predict on the test set
optimal.param = setHyperPars(svm.lrn, par.vals = svm.tune.extd$x)
svm.tr = train(optimal.param, metabric.tsk, subset=trainid)
svm.pred = predict(svm.tr, newdata=test, type='prob')
# ridge regression with lambda.1se (ridge outperformed the lasso in the previous exercise, Assignment 4)
ridge.lrn = makeLearner("classif.cvglmnet", fix.factors.prediction=TRUE, alpha=0,
type.measure = 'auc')
ridge.tr = train(ridge.lrn, task = metabric.tsk, subset = trainid)
lambda.1se = ridge.tr$learner.model$lambda.1se
ridge.1se = makeLearner("classif.glmnet", lambda = lambda.1se, fix.factors.prediction = TRUE,
predict.type = "prob", alpha = 0)
ridge.1se.tr = train(learner = ridge.1se, task = metabric.tsk, subset = trainid)
ridge.pred = predict(ridge.1se.tr, type = "prob", task = metabric.tsk, subset = testid)
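For completeness, the corresponding lasso differs only in the penalty mixing parameter (alpha = 1); a minimal sketch, kept separate because ridge won in Assignment 4:
lasso.lrn = makeLearner("classif.cvglmnet", fix.factors.prediction = TRUE, alpha = 1,
                        type.measure = 'auc', predict.type = "prob")
lasso.tr = train(lasso.lrn, task = metabric.tsk, subset = trainid)
lasso.pred = predict(lasso.tr, task = metabric.tsk, subset = testid)
performance(lasso.pred, auc)  # test AUC for the lasso, for comparison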
# plot the test AUC of svm and ridge
df = generateThreshVsPerfData(list(svm = svm.pred, ridge = ridge.pred), measures = list(fpr, tpr))
plotROCCurves(df)
# compare the results
rbind(svm = performance(svm.pred, auc), ridge = performance(ridge.pred, auc))
## auc
## svm 0.75493572084481
## ridge 0.79602846648301
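The AUC gap between the two models is fairly small; it can be tested formally with DeLong's method, sketched here assuming the pROC package is installed:
library(pROC)
roc.svm = roc(svm.pred$data$truth, getPredictionProbabilities(svm.pred))
roc.ridge = roc(ridge.pred$data$truth, getPredictionProbabilities(ridge.pred))
roc.test(roc.svm, roc.ridge)  # DeLong's test comparing the two AUCs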
Compute PCs on the metabric data (features only). Plot the cumulative variance explained by the PCs. Plot the first PC against the second PC, distinguishing the two classes by color. Repeat for PC1 vs. PC3 and comment on whether the classes are well separated by the first 3 PCs.
library(MASS, quietly = TRUE)
library(dplyr, quietly = TRUE)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2, quietly = TRUE)
# compute PCs
load('metabric.Rdata')
metabric.ft = metabric[, -1]
# detailed method: eigendecomposition of the covariance matrix
the_cov <- cov(metabric.ft)
the_eigen <- eigen(the_cov)
# simplified method: prcomp centers the data and performs the equivalent decomposition
pc.metabric = prcomp(metabric.ft)
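As a quick consistency check, the eigenvectors from the detailed method should match prcomp's rotation up to sign (the sign of an eigenvector is arbitrary):
all.equal(abs(the_eigen$vectors[, 1]), abs(unname(pc.metabric$rotation[, 1])))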
# plot the cumulative variance explained by the PCs
plot(cumsum(100 * pc.metabric$sdev^2 / sum(pc.metabric$sdev^2)), col = 'red4',
     xlab = "PC index", ylab = "Cumulative variance explained (%)")
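A compact summary to report alongside the plot (assuming a 90% threshold is of interest):
cum.var = cumsum(pc.metabric$sdev^2) / sum(pc.metabric$sdev^2)
which(cum.var >= 0.9)[1]  # number of PCs needed to explain 90% of the variance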
# plot function: project the features onto a pair of eigenvectors and color the points by class
# (projecting the uncentered data only shifts the scores by a constant, so the scatter is unchanged)
pcplot = function(pc1, pc2) {
  map <-
    as.matrix(metabric.ft) %*% the_eigen$vectors[, c(pc1, pc2)] %>%
    as.data.frame()
  plotting_df <- bind_cols(map, data.frame(group = metabric$y))
  ggplot(plotting_df, aes(x = V1, y = V2, color = group)) +
    geom_point() +
    xlab(paste("PC", pc1)) + ylab(paste("PC", pc2))
}
# plot PC1 and PC2
pcplot(1, 2)
# plot PC1 and PC3
pcplot(1, 3)
Compare hierarchical clustering on the TCGA pan-cancer data (features only) using complete, single, and average linkage.
library(purrr, quietly = TRUE)
library(cluster, quietly = TRUE)
library(fpc, quietly = TRUE)
# Import data & pre-analysis
pancancer = read.csv("TCGA_pancancer.csv")
pancancer.labs = read.csv("TCGA_pancancer_labels.csv")
pancancer.labs = factor(pancancer.labs$Class)
table(pancancer.labs)
## pancancer.labs
## BRCA COAD KIRC LUAD PRAD
## 300 78 146 141 136
dim(pancancer)
## [1] 801 1000
head(pancancer)
# vector of methods to compare
m <- c( "average", "single", "complete")
names(m) <- c( "average", "single", "complete")
# function to compute the agglomerative coefficient (values nearer 1 indicate stronger clustering structure)
ac <- function(x) {
  agnes(pancancer, method = x)$ac
}
# compare hierarchical clustering using complete, single, and average linkage
map_dbl(m, ac)
## average single complete
## 0.54086384044131 0.45156281687015 0.63439079114398
# plot the dendrogram for complete linkage, which has the highest agglomerative coefficient
hc <- agnes(pancancer, method = "complete")
pltree(hc, cex = 0.6, hang = -1, main = "Dendrogram")
# choose the best K based on the average silhouette width (computed with pam for each K)
clust <- numeric(20)
for (k in 2:20) {
  clust[k] <- pam(pancancer, k)$silinfo$avg.width
}
k.best <- which.max(clust)
cat("The best K with the maximum average silhouette width is", k.best, "\n")
## The best K with the maximum average silhouette width is 4
# cut the complete-linkage tree into K = 4 clusters (one fewer than the 5 true tumor types)
hc.bestk <- hclust(dist(pancancer), method = "complete")
plot(hc.bestk, cex = 0.6, hang = -1)
rect.hclust(hc.bestk, k = 4, border = "red")
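Since the true tumor types are available, a cross-tabulation shows how well the K = 4 cut recovers them; a minimal sketch:
clusters <- cutree(hc.bestk, k = 4)
table(clusters, pancancer.labs)  # rows: clusters; columns: true tumor types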