Different types of support vector machines dealing with a non-linear boundary

We’ll compare the performance of a support vector classifier and of support vector machines with polynomial and radial kernels on data with a non-linear class boundary.

First, let’s generate some random data and assign the points that fall inside an ellipse to class 2, while all the others are assigned to class 1.

set.seed(1987)
#100 points with standard normal coordinates
xtrain=matrix(rnorm(200),ncol=2)
#class 2 inside the ellipse x1^2/3+x2^2<1, class 1 outside
y=rep(1,100)
for (i in 1:nrow(xtrain)){
  if(xtrain[i,1]^2/3+xtrain[i,2]^2<1){
    y[i]=2
  }
}
plot(xtrain,col=y+2)

Now let’s see how a linear support vector classifier deals with these data.

library(e1071)
dat=data.frame(x=xtrain,y=as.factor(y))
svc=svm(y~.,data=dat,kernel="linear",cost=10,scale=FALSE)
#build an n x n grid of points covering the range of the training data
make.grid=function(x,n=75){
  grange=apply(x,2,range)
  x1=seq(from=grange[1,1],to=grange[2,1],length=n)
  x2=seq(from=grange[1,2],to=grange[2,2],length=n)
  expand.grid(x.1=x1,x.2=x2)
}
xgrid=make.grid(xtrain)
ygrid=predict(svc,xgrid)
plot(xgrid,col=as.numeric(ygrid)+2,pch=20,cex=.4)
points(xtrain,col=y+2,pch=19)
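
As a side note, e1071 also provides a plot method for svm objects which, with two predictors, shades the two decision regions and marks the support vectors; it can be used instead of the manual grid plot:

plot(svc,dat)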

Quite poor. Still, let’s check the training error rate.

ertable=table(true=y,pred=predict(svc,dat))
ertable
##     pred
## true  1  2
##    1 25 24
##    2  8 43
(ertable[2,1]+ertable[1,2])/sum(ertable)
## [1] 0.32
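
As an aside, instead of fixing cost=10 we could choose the cost parameter by cross-validation using e1071’s tune() function; a minimal sketch (the grid of candidate costs is arbitrary):

set.seed(1)
cv=tune(svm,y~.,data=dat,kernel="linear",
        ranges=list(cost=c(0.01,0.1,1,10,100)))
summary(cv)

No value of cost will make a linear boundary fit an elliptical class region well, but tuning is the standard way of setting this parameter.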

Now, a polynomial kernel of degree 3.

poly3=svm(y~.,data=dat,kernel="polynomial",degree=3, cost=10)
ygrid=predict(poly3,xgrid)
plot(xgrid,col=as.numeric(ygrid)+2,pch=20,cex=.4)
points(xtrain,col=y+2,pch=19)

ertable=table(true=y,pred=predict(poly3,dat))
ertable
##     pred
## true  1  2
##    1 10 39
##    2  0 51
(ertable[2,1]+ertable[1,2])/sum(ertable)
## [1] 0.39

A polynomial kernel of degree 2 does much better, which is not surprising: the true boundary, an ellipse, is itself a quadratic curve.

poly2=svm(y~.,data=dat,kernel="polynomial",degree=2, cost=10)
ygrid=predict(poly2,xgrid)
plot(xgrid,col=as.numeric(ygrid)+2,pch=20,cex=.4)
points(xtrain,col=y+2,pch=19)

ertable=table(true=y,pred=predict(poly2,dat))
ertable
##     pred
## true  1  2
##    1 47  2
##    2  4 47
(ertable[2,1]+ertable[1,2])/sum(ertable)
## [1] 0.06

Radial kernel:

radial=svm(y~.,data=dat,kernel="radial", cost=10)
ygrid=predict(radial,xgrid)
plot(xgrid,col=as.numeric(ygrid)+2,pch=20,cex=.4)
points(xtrain,col=y+2,pch=19)

ertable=table(true=y,pred=predict(radial,dat))
ertable
##     pred
## true  1  2
##    1 48  1
##    2  2 49
(ertable[2,1]+ertable[1,2])/sum(ertable)
## [1] 0.03
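
The radial kernel also has a second tuning parameter, gamma; both gamma and cost can be chosen by cross-validation in the same way as above (again only a sketch, with an arbitrary grid):

set.seed(1)
cvrad=tune(svm,y~.,data=dat,kernel="radial",
           ranges=list(cost=c(0.1,1,10,100),gamma=c(0.5,1,2,4)))
cvrad$best.parameters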

Now let’s generate some test data in the same way:

set.seed(2048)
xtest=matrix(rnorm(2000),ncol=2)
ytest=rep(1,1000)
for (i in 1:nrow(xtest)){
  if(xtest[i,1]^2/3+xtest[i,2]^2<1){
    ytest[i]=2
  }
}
plot(xtest,col=ytest+2)

Let’s compute each model’s error rate on this test set:

dattest=data.frame(x=xtest,y=as.factor(ytest))
ertable=table(true=ytest,pred=predict(svc,dattest))
ertable
(ertable[2,1]+ertable[1,2])/sum(ertable)
ertable=table(true=ytest,pred=predict(poly3,dattest))
ertable
(ertable[2,1]+ertable[1,2])/sum(ertable)
ertable=table(true=ytest,pred=predict(poly2,dattest))
ertable
(ertable[2,1]+ertable[1,2])/sum(ertable)
ertable=table(true=ytest,pred=predict(radial,dattest))
ertable
(ertable[2,1]+ertable[1,2])/sum(ertable)
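
Since the same misclassification computation is repeated for every model, a small helper function keeps this compact (errRate is just a name made up here, not something provided by e1071):

errRate=function(fit,newdata,truth){
  tab=table(true=truth,pred=predict(fit,newdata))
  1-sum(diag(tab))/sum(tab)
}
sapply(list(linear=svc,poly3=poly3,poly2=poly2,radial=radial),
       errRate,newdata=dattest,truth=ytest)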

Comparison of logistic regression and SVMs dealing with a non-linear boundary

Here we will explore how a non-linear decision boundary can be obtained by performing logistic regression on non-linear transformations of the features.

First, let’s generate some data:

#500 points with coordinates drawn from the uniform distribution on [-0.5,0.5]
x1=runif(500)-.5
x2=runif(500)-.5
#classification rule: a point goes to class 1 if |x1|>|x2| (i.e. x1^2-x2^2>0), to class 0 otherwise
y=1*(x1^2-x2^2>0)

Let’s look at the points:

plot(x1,x2,col=y+2)

What if we try to fit a simple logistic regression to these data?

simpleLog=glm(y~x1+x2,family="binomial")
pred=round(predict(simpleLog,newdata=as.data.frame(cbind(x1,x2)),type = "response"))
plot(x1,x2,col=pred+2)

table(y,pred)
##    pred
## y     0   1
##   0  62 180
##   1 114 144
#Error rate
ertable=table(y,pred)
(ertable[2,1]+ertable[1,2])/sum(ertable)
## [1] 0.588

The boundary is linear and the error rate is very high (here even worse than random guessing). Let’s try some non-linear transformations of the predictors:

compLog=glm(y~x1+x2+I(x1^2)+I(x2^2)+log(x1+1)+sin(x2),family="binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
pred=round(predict(compLog,newdata=as.data.frame(cbind(x1,x2)),type = "response"))
plot(x1,x2,col=pred+2)

table(y,pred)
##    pred
## y     0   1
##   0 242   0
##   1   0 258
#Error rate
ertable=table(y,pred)
(ertable[2,1]+ertable[1,2])/sum(ertable)
## [1] 0
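
To visualize the non-linear boundary that this logistic fit implies, we can evaluate the fitted model on a grid of points, in the spirit of make.grid from the first section (a sketch, using the compLog fit above):

grid=expand.grid(x1=seq(-.5,.5,length=75),x2=seq(-.5,.5,length=75))
gridprob=predict(compLog,newdata=grid,type="response")
plot(grid,col=round(gridprob)+2,pch=20,cex=.4)
points(x1,x2,col=y+2,pch=19)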

The error rate is zero; the warnings from glm reflect the fact that these predictors separate the two classes perfectly (a combination of I(x1^2) and I(x2^2) can reproduce the true rule exactly), so the fitted probabilities are pushed to 0 and 1.

Now let’s check how a support vector classifier deals with such a problem. Note that y is numeric here, so svm() fits a regression rather than a classifier, and we round its predictions to obtain class labels.

svc=svm(y~x1+x2,kernel="linear",cost=10)
pred=round(predict(svc,as.data.frame(cbind(x1,x2))))
plot(x1,x2,col=pred+2)

table(y,pred)
##    pred
## y     0   1
##   0 136 106
##   1 124 134
#Error rate
ertable=table(y,pred)
(ertable[2,1]+ertable[1,2])/sum(ertable)
## [1] 0.46

The boundary is linear and the error rate (0.46) is barely better than random guessing.

How about an SVM with a radial kernel?

radial=svm(y~x1+x2,kernel="radial",cost=10)
pred=round(predict(radial,as.data.frame(cbind(x1,x2))))
plot(x1,x2,col=pred+2)

table(y,pred)
##    pred
## y     0   1
##   0 231  11
##   1   4 254
#Error rate
ertable=table(y,pred)
(ertable[2,1]+ertable[1,2])/sum(ertable)
## [1] 0.03

A 3% error rate, with mistakes only in a small region near the center. Quite good.
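
To see the shape of the decision region the radial SVM has learned (rather than just its predictions at the observed points), we can evaluate it on a grid, just as for the logistic fit above (again only a sketch):

grid=expand.grid(x1=seq(-.5,.5,length=75),x2=seq(-.5,.5,length=75))
gridpred=round(predict(radial,grid))
plot(grid,col=gridpred+2,pch=20,cex=.4)
points(x1,x2,col=y+2,pch=19)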