ISLR: Chapter 9 Lab

Sameer Mathur

Support Vector Machine

Creating the Dataset

# Seed the RNG so the simulated predictors are reproducible.
set.seed(1)
# 200 x 2 matrix of standard-normal draws with a per-row shift applied to both
# columns: rows 1-100 move by +2, rows 101-150 by -2, rows 151-200 by 0.
# The length-200 shift vector recycles down each of the two columns.
x <- matrix(rnorm(400), ncol = 2) + rep(c(2, -2, 0), times = c(100, 50, 50))
# Class labels: the first 150 observations are "1", the remaining 50 are "2".
y <- rep(c(1, 2), times = c(150, 50))
# Data frame holding the predictors (columns x.1, x.2) and y as a factor.
dat <- data.frame(x = x, y = factor(y))
# Dimensions of the data frame.
dim(dat)

[1] 200   3

# first 8 rows of the data frame
head(dat, n = 8)

       x.1        x.2 y
1 1.373546  2.4094018 1
2 2.183643  3.6888733 1
3 1.164371  3.5865884 1
4 3.595281  1.6690922 1
5 2.329508 -0.2852355 1
6 1.179532  4.4976616 1
7 2.487429  2.6670662 1
8 2.738325  2.5413273 1

# last 8 rows of the data frame
tail(dat, n = 8)

           x.1        x.2 y
193 -0.7317482  1.0916690 2
194  0.8303732  0.3066049 2
195 -1.2080828 -0.1101588 2
196 -1.0479844 -0.9243128 2
197  1.4411577  1.5929138 2
198 -1.0158475  0.0450106 2
199  0.4119747 -0.7151284 2
200 -0.3810761  0.8652231 2

# scatter plot of the two predictor columns, coloured by the class label y
plot(x, col=y)

plot of chunk unnamed-chunk-3

Creating the Training Set and Fitting the Model

# Draw 100 of the 200 row indices at random to form the training set; the
# remaining rows are selected later with dat[-train, ].
train <- sample(200, 100)
library(e1071)
# Fit a radial-kernel SVM (gamma = 1, cost = 1) on the training rows only.
svmfit <- svm(y ~ ., data = dat[train, ], kernel = "radial",
              gamma = 1, cost = 1)

# Plot the decision regions with the observations the model was fitted on.
# plot() for an svm object marks support vectors by their row position in the
# fitting data, so it must receive dat[train, ] rather than all of dat;
# otherwise the wrong points are drawn as support vectors.
plot(svmfit, dat[train, ])

plot of chunk unnamed-chunk-5

# summary of the fitted model: call, parameters, and support-vector counts
summary(svmfit)


Call:
svm(formula = y ~ ., data = dat[train, ], kernel = "radial", 
    gamma = 1, cost = 1)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  radial 
       cost:  1 
      gamma:  1 

Number of Support Vectors:  37

 ( 17 20 )


Number of Classes:  2 

Levels: 
 1 2

# Refit the radial-kernel SVM on the training rows with a much larger
# cost (1e5); gamma stays at 1.
svmfit <- svm(
  y ~ .,
  data = dat[train, ],
  kernel = "radial",
  gamma = 1,
  cost = 1e5
)
# Decision regions together with the training observations.
plot(svmfit, dat[train, ])

plot of chunk unnamed-chunk-7

# Reseed before tuning so the resampling inside tune() is reproducible.
set.seed(1)
# Grid search over 5 cost values x 5 gamma values for the radial kernel,
# using the training rows only.
tune.out <- tune(
  svm, y ~ .,
  data = dat[train, ],
  kernel = "radial",
  ranges = list(
    cost = c(0.1, 1, 10, 100, 1000),
    gamma = c(0.5, 1, 2, 3, 4)
  )
)
# Best parameter combination and the error for every grid point.
summary(tune.out)


Parameter tuning of 'svm':

- sampling method: 10-fold cross validation 

- best parameters:
 cost gamma
    1     2

- best performance: 0.12 

- Detailed performance results:
    cost gamma error dispersion
1  1e-01   0.5  0.27 0.11595018
2  1e+00   0.5  0.13 0.08232726
3  1e+01   0.5  0.15 0.07071068
4  1e+02   0.5  0.17 0.08232726
5  1e+03   0.5  0.21 0.09944289
6  1e-01   1.0  0.25 0.13540064
7  1e+00   1.0  0.13 0.08232726
8  1e+01   1.0  0.16 0.06992059
9  1e+02   1.0  0.20 0.09428090
10 1e+03   1.0  0.20 0.08164966
11 1e-01   2.0  0.25 0.12692955
12 1e+00   2.0  0.12 0.09189366
13 1e+01   2.0  0.17 0.09486833
14 1e+02   2.0  0.19 0.09944289
15 1e+03   2.0  0.20 0.09428090
16 1e-01   3.0  0.27 0.11595018
17 1e+00   3.0  0.13 0.09486833
18 1e+01   3.0  0.18 0.10327956
19 1e+02   3.0  0.21 0.08755950
20 1e+03   3.0  0.22 0.10327956
21 1e-01   4.0  0.27 0.11595018
22 1e+00   4.0  0.15 0.10801234
23 1e+01   4.0  0.18 0.11352924
24 1e+02   4.0  0.21 0.08755950
25 1e+03   4.0  0.24 0.10749677

# True labels of the held-out rows (those not drawn into `train`).
true <- dat$y[-train]
# Predictions from the best model found by the grid search.
pred <- predict(tune.out$best.model, newdata = dat[-train, ])
# Cross-tabulate true against predicted classes.
table(true, pred)

ROC Curves

library(ROCR)
# Draw an ROC curve (true positive rate against false positive rate).
#   pred  - scores handed to ROCR's prediction()
#   truth - class labels handed to ROCR's prediction()
#   ...   - further arguments forwarded to plot()
rocplot <- function(pred, truth, ...) {
  roc_curve <- performance(prediction(pred, truth), "tpr", "fpr")
  plot(roc_curve, ...)
}

# Refit on the training rows with gamma = 2 and cost = 1, the combination
# reported as best by the grid search.
svmfit.opt <- svm(y ~ ., data = dat[train, ], kernel = "radial",
                  gamma = 2, cost = 1, decision.values = TRUE)
# With decision.values = TRUE, predict() attaches the numeric decision values
# as an attribute of its result; extract them to use as ROC scores.
# NOTE(review): the sign of these values depends on how e1071 ordered the two
# classes (see the column name of the matrix) -- confirm that larger scores
# correspond to the class ROCR treats as positive, and negate if not.
fitted <- attributes(predict(svmfit.opt, dat[train, ],
                             decision.values = TRUE))$decision.values
# Two plotting panels side by side.
par(mfrow = c(1, 2))
# ROC curve on the training data.
rocplot(fitted, dat[train, "y"], main = "Training Data")

plot of chunk unnamed-chunk-11

# More flexible fit: same cost (1) but gamma raised to 50.
svmfit.flex <- svm(y ~ ., data = dat[train, ], kernel = "radial",
                   gamma = 50, cost = 1, decision.values = TRUE)
# Decision values of the flexible model on the training rows.
fitted <- attributes(predict(svmfit.flex, dat[train, ],
                             decision.values = TRUE))$decision.values
# ROC curve of the flexible model on the training data.
rocplot(fitted, dat[train, "y"])

plot of chunk unnamed-chunk-12

# Decision values of the gamma = 2 model on the held-out rows.
fitted <- attributes(predict(svmfit.opt, dat[-train, ],
                             decision.values = TRUE))$decision.values
# ROC curve on the test data.
rocplot(fitted, dat[-train, "y"], main = "Test Data")

plot of chunk unnamed-chunk-13

# Decision values of the gamma = 50 model on the held-out rows.
fitted <- attributes(predict(svmfit.flex, dat[-train, ],
                             decision.values = TRUE))$decision.values
# ROC curve of the flexible model on the test data.
rocplot(fitted, dat[-train, "y"])

plot of chunk unnamed-chunk-14

SVM with Multiple Classes

Creating the Dataset

# Reseed so the additional draws are reproducible.
set.seed(1)
# Append 50 new standard-normal observations (50 x 2) to x. Only the second
# column of the new rows is shifted by +2: the shift vector's first 50 values
# (0) fall on column 1 and its last 50 values (2) on column 2.
x <- rbind(x, matrix(rnorm(100), ncol = 2) + rep(c(0, 2), each = 50))
# The new observations form a third class labelled 0; existing labels
# (1 and 2) are kept in front.
y <- c(y, rep(0, 50))
# Rebuild the data frame with the extended predictors and factor label.
dat <- data.frame(x = x, y = factor(y))
# First rows of the extended data frame.
head(dat)

       x.1        x.2 y
1 1.373546  2.4094018 1
2 2.183643  3.6888733 1
3 1.164371  3.5865884 1
4 3.595281  1.6690922 1
5 2.329508 -0.2852355 1
6 1.179532  4.4976616 1

# Back to a single plotting panel (the ROC section used a 1 x 2 layout).
par(mfrow = c(1, 1))
# Scatter plot of the predictors; labels 0/1/2 map to colours 1/2/3.
plot(x, col = y + 1)

plot of chunk unnamed-chunk-16

# Radial-kernel SVM on the full data frame, whose label y now has three
# levels (0, 1, 2); cost = 10, gamma = 1.
svmfit <- svm(y ~ ., data = dat, kernel = "radial", cost = 10, gamma = 1)
# Decision regions with the observations used for fitting.
plot(svmfit, dat)

plot of chunk unnamed-chunk-17

Application to Gene Expression Data

# loading the package
library(ISLR)
# names of the components of the Khan data set
names(Khan)

[1] "xtrain" "xtest"  "ytrain" "ytest"

# dimensions of the training predictor matrix
dim(Khan$xtrain)

[1]   63 2308

# dimensions of the test predictor matrix
dim(Khan$xtest)

[1]   20 2308

# number of class labels in the training set
length(Khan$ytrain)

[1] 63

# number of class labels in the test set
length(Khan$ytest)

[1] 20

# number of training observations in each class
table(Khan$ytrain)


 1  2  3  4 
 8 23 12 20

# number of test observations in each class
table(Khan$ytest)


1 2 3 4 
3 6 6 5

# Training data frame: the expression matrix as predictors and the class
# label as a factor.
dat <- data.frame(x = Khan$xtrain, y = as.factor(Khan$ytrain))

# Linear-kernel SVM with cost = 10 on the training data.
out <- svm(y ~ ., data = dat, kernel = "linear", cost = 10)
# Call, parameters, and support-vector counts of the fitted model.
summary(out)


Call:
svm(formula = y ~ ., data = dat, kernel = "linear", cost = 10)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  linear 
       cost:  10 
      gamma:  0.0004332756 

Number of Support Vectors:  58

 ( 20 20 11 7 )


Number of Classes:  4 

Levels: 
 1 2 3 4

# confusion table on the training data: rows are the fitted classes,
# columns are the true classes
table(out$fitted, dat$y)


     1  2  3  4
  1  8  0  0  0
  2  0 23  0  0
  3  0  0 12  0
  4  0  0  0 20

# Test data frame built the same way as the training one, so the predictor
# column names match what the model was fitted on.
dat.te <- data.frame(x = Khan$xtest, y = as.factor(Khan$ytest))
# Predicted classes for the test observations.
pred.te <- predict(out, newdata = dat.te)
# Confusion table: rows are predicted classes, columns are true classes.
table(pred.te, dat.te$y)


pred.te 1 2 3 4
      1 3 0 0 0
      2 0 6 2 0
      3 0 0 4 0
      4 0 0 0 5