Generate a toy dataset.
library('e1071')
set.seed(102)
x = matrix(rnorm(20),10,2)
y = rep(c(-1,1),c(5,5))
x[y==1,] = x[y==1,]+2
plot(x,col=y+3)
Define a re-usable linear svm function and try on the toy data set.
makegrid <- function(x,n=75){
grange = apply(x,2,range)
x1_g = seq(from=grange[1,1],to=grange[2,1],length.out = n)
x2_g = seq(from=grange[1,2],to=grange[2,2],length.out = n)
expand.grid(x1=x1_g,x2=x2_g)
}
train_linear_svm <- function(x,y,...){
#x: data.frame with columns x1,x2...
#y: outcome labels in (-1,1)
svmfit = svm(x,y,type='C',kernel='linear',...)
#print(svmfit)
#plot(svmfit,training)
xgrid = makegrid(x)
ygrid = predict(svmfit,xgrid)
beta = t(svmfit$coefs)%*%as.matrix(x[svmfit$index,])
beta0 = svmfit$rho #negative intercept
plot(xgrid,col=c('red','blue')[as.numeric(ygrid)],pch=20,cex=.2)
points(x,col=y+3,pch=20)
points(svmfit$SV,pch=5,cex=2)
abline(beta0/beta[2],-beta[1]/beta[2])
abline((beta0-1)/beta[2],-beta[1]/beta[2])
abline((beta0+1)/beta[2],-beta[1]/beta[2])
return(svmfit)
}
invisible(train_linear_svm(x,y,scale=F,cost = .2))
#== data from ESL ==#
setwd('/Users/Nan/Dropbox/R/practice/svm/')
load('ESL.mixture.rda')
str(ESL.mixture)
## List of 8
## $ x : num [1:200, 1:2] 2.5261 0.367 0.7682 0.6934 -0.0198 ...
## $ y : num [1:200] 0 0 0 0 0 0 0 0 0 0 ...
## $ xnew : matrix [1:6831, 1:2] -2.6 -2.5 -2.4 -2.3 -2.2 -2.1 -2 -1.9 -1.8 -1.7 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:6831] "1" "2" "3" "4" ...
## .. ..$ : chr [1:2] "x1" "x2"
## $ prob : atomic [1:6831] 3.55e-05 3.05e-05 2.63e-05 2.27e-05 1.96e-05 ...
## ..- attr(*, ".Names")= chr [1:6831] "1" "2" "3" "4" ...
## $ marginal: atomic [1:6831] 6.65e-15 2.31e-14 7.62e-14 2.39e-13 7.15e-13 ...
## ..- attr(*, ".Names")= chr [1:6831] "1" "2" "3" "4" ...
## $ px1 : num [1:69] -2.6 -2.5 -2.4 -2.3 -2.2 -2.1 -2 -1.9 -1.8 -1.7 ...
## $ px2 : num [1:99] -2 -1.95 -1.9 -1.85 -1.8 -1.75 -1.7 -1.65 -1.6 -1.55 ...
## $ means : num [1:20, 1:2] -0.2534 0.2667 2.0965 -0.0613 2.7035 ...
training = as.data.frame(ESL.mixture$x)
names(training) = c('x1','x2')
y_train = ESL.mixture$y
y_train[y_train==0] = -1
When Cost=10000:
svmfit = train_linear_svm(training,y_train,scale=F,cost = 10000)
#bayes decision boundary
contour(ESL.mixture$px1,ESL.mixture$px2,matrix(ESL.mixture$prob,ncol=length(ESL.mixture$px2)),
levels=0.5,drawlabels = F,add=T,lty=2,lwd=2,col='purple')
names(svmfit)
## [1] "call" "type" "kernel"
## [4] "cost" "degree" "gamma"
## [7] "coef0" "nu" "epsilon"
## [10] "sparse" "scaled" "x.scale"
## [13] "y.scale" "nclasses" "levels"
## [16] "tot.nSV" "nSV" "labels"
## [19] "SV" "index" "rho"
## [22] "compprob" "probA" "probB"
## [25] "sigma" "coefs" "na.action"
## [28] "fitted" "decision.values"
ct = table(y_train,svmfit$fitted)
(ct[2]+ct[3])/sum(ct)
## [1] 0.27
prob = ESL.mixture$prob
(bayes.error<-sum(ESL.mixture$marginal*(prob*I(prob<0.5)+(1-prob)*I(prob>=.5))) )
## [1] 0.2101192
pred_y = predict(svmfit,ESL.mixture$xnew)
pred_y = as.integer(levels(pred_y)[pred_y])
(test.error<-sum(ESL.mixture$marginal*(prob*I(pred_y<0)+(1-prob)*I(pred_y>=0))) )
## [1] 0.2877039
So Bayes error is 0.2101192, training error 0.27 and test error 0.2877039. When Cost=.01:
svmfit = train_linear_svm(training,y_train,scale = F,cost = .01)
#bayes decision boundary
contour(ESL.mixture$px1,ESL.mixture$px2,matrix(ESL.mixture$prob,ncol=length(ESL.mixture$px2)),
levels=0.5,drawlabels = F,add=T,lty=2,lwd=2,col='purple')
ct = table(y_train,svmfit$fitted)
(ct[2]+ct[3])/sum(ct)
## [1] 0.26
pred_y = predict(svmfit,ESL.mixture$xnew)
pred_y = as.integer(levels(pred_y)[pred_y])
(test.error<-sum(ESL.mixture$marginal*(prob*I(pred_y<0)+(1-prob)*I(pred_y>=0))) )
## [1] 0.2999182
So training error is 0.26 and test error 0.2999182.
We need to find the parameters for optimal decision boundary using cross-validation. For the RBF kernel:
set.seed(123)
tuneout = tune(svm,training,y_train,kernel='radial',gamma=1,scale=F,
ranges = list(cost=c(0.1,.2,.5,1,2,5,10,20,50,100)))
tuneout
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 2
##
## - best performance: 0.5935475
tuneout = tune(svm,training,y_train,kernel='radial',gamma=1,scale=F,
ranges = list(cost=seq(1,4,length.out = 20)))
tuneout
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1.631579
##
## - best performance: 0.6176497
tuneout = tune(svm,training,y_train,kernel='radial',gamma=1,scale=F,
ranges = list(cost=seq(1,2.5,length.out = 20)))
tuneout
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 2.421053
##
## - best performance: 0.6201025
tuneout = tune(svm,training,y_train,kernel='radial',gamma=1,scale=F,
ranges = list(cost=seq(1.6,2.4,length.out = 20)))
tuneout
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 2.189474
##
## - best performance: 0.6220007
Then we use a cost = 2.19.
svmfit = svm(training,y_train,type='C',kernel='radial',scale = F,cost = 2.19,gamma = 1)
ct = table(y_train,svmfit$fitted)
(ct[2]+ct[3])/sum(ct)
## [1] 0.16
xgrid = makegrid(training)
ygrid = predict(svmfit,xgrid)
plot(xgrid,col=c('red','blue')[as.numeric(ygrid)],pch=20,cex=.2)
points(training,col=y_train+3,pch=20)
#decision boundary grid
xcont = makegrid(training,n = 200)
xcont = xcont[order(xcont$x1,xcont$x2),]
ycont = predict(svmfit,xcont,decision.values = T)
#decision boundary
contour(unique(xcont$x1),unique(xcont$x2),
t(matrix(attr(ycont,'decision.values'),ncol=length(unique(xcont$x2)))),
levels=0,drawlabels = F,add=T,lty=1,lwd=2,col='black')
#hyperplane for class -1
contour(unique(xcont$x1),unique(xcont$x2),
t(matrix(attr(ycont,'decision.values'),ncol=length(unique(xcont$x2)))),
levels=-1,drawlabels = F,add=T,lty=1,lwd=1,col='blue')
#hyperplane for class 1
contour(unique(xcont$x1),unique(xcont$x2),
t(matrix(attr(ycont,'decision.values'),ncol=length(unique(xcont$x2)))),
levels=1,drawlabels = F,add=T,lty=1,lwd=1,col='red')
pred_y = predict(svmfit,ESL.mixture$xnew)
pred_y = as.integer(levels(pred_y)[pred_y])
(test.error<-sum(ESL.mixture$marginal*(prob*I(pred_y<0)+(1-prob)*I(pred_y>=0))) )
## [1] 0.2185895
So training error is 0.16 and test error 0.2185895.
Now manually predict the fitted decision values for the training data and compare the result.
rbf <- function(sv,newdata,coef,gamma=1){
sv = t(sv)
rbf1 <- function(x){ #for a single vector x
diff = apply(sv-x,2,function(x) sum(x^2))
sum(exp(-gamma*diff)*coef)
}
apply(newdata,1,rbf1)
}
beta0 = svmfit$rho #negative intercept
my_pred = rbf(svmfit$SV,training,svmfit$coefs)-beta0
plot(my_pred,svmfit$decision.values)
abline(0,1,col='red',lwd=2)