We’ll compare the performance of a linear support vector classifier and support vector machines with polynomial and radial kernels on data whose classes are not linearly separable.
First, let’s generate some random data and assign the points inside an ellipse to class 2, leaving all other points in class 1.
set.seed(1987)
#100 training points with standard normal coordinates
xtrain=matrix(rnorm(200),ncol=2)
y=rep(1,100)
#points inside the ellipse x1^2/3 + x2^2 = 1 go to class 2
for (i in 1:nrow(xtrain)){
  if(xtrain[i,1]^2/3+xtrain[i,2]^2<1){
    y[i]=2
  }
}
plot(xtrain,col=y+2)
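Since we know the generating rule, we can overlay the true class boundary, the ellipse x1^2/3 + x2^2 = 1, on this plot (a small extra step, not part of the original analysis):
#parametrise the ellipse x1^2/3 + x2^2 = 1 and draw it over the training points
theta=seq(0,2*pi,length.out=200)
lines(sqrt(3)*cos(theta),sin(theta),lty=2)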
Now let’s see how a linear support vector classifier deals with these data.
library(e1071)
dat=data.frame(x=xtrain,y=as.factor(y))
svc=svm(y~.,data=dat,kernel="linear",cost=10,scale=FALSE)
#build a grid of points spanning the range of the training data,
#used to visualise the predicted decision regions
make.grid=function(x,n=75){
  grange=apply(x,2,range)
  x1=seq(from=grange[1,1],to=grange[2,1],length=n)
  x2=seq(from=grange[1,2],to=grange[2,2],length=n)
  expand.grid(x.1=x1,x.2=x2)
}
xgrid=make.grid(xtrain)
ygrid=predict(svc,xgrid)
plot(xgrid,col=as.numeric(ygrid)+2,pch=20,cex=.4)
points(xtrain,col=y+2,pch=19)
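As a side note, the fitted svm object stores the row indices of the support vectors in its index component, so we could highlight them on the same plot:
#mark the support vectors of the linear classifier with open diamonds
points(xtrain[svc$index,],pch=5,cex=1.5)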
Quite poor, but let’s check the error rate anyway.
ertable=table(true=y,pred=predict(svc,dat))
ertable
## pred
## true 1 2
## 1 25 24
## 2 8 43
(ertable[2,1]+ertable[1,2])/sum(ertable)
## [1] 0.32
Now, a polynomial kernel of degree 3.
poly3=svm(y~.,data=dat,kernel="polynomial",degree=3, cost=10)
ygrid=predict(poly3,xgrid)
plot(xgrid,col=as.numeric(ygrid)+2,pch=20,cex=.4)
points(xtrain,col=y+2,pch=19)
ertable=table(true=y,pred=predict(poly3,dat))
ertable
## pred
## true 1 2
## 1 10 39
## 2 0 51
(ertable[2,1]+ertable[1,2])/sum(ertable)
## [1] 0.39
The degree-3 polynomial actually does worse than the linear classifier on the training data. A polynomial of degree 2 does much better, which makes sense since the true boundary is an ellipse, i.e. a quadratic curve:
poly2=svm(y~.,data=dat,kernel="polynomial",degree=2, cost=10)
ygrid=predict(poly2,xgrid)
plot(xgrid,col=as.numeric(ygrid)+2,pch=20,cex=.4)
points(xtrain,col=y+2,pch=19)
ertable=table(true=y,pred=predict(poly2,dat))
ertable
## pred
## true 1 2
## 1 47 2
## 2 4 47
(ertable[2,1]+ertable[1,2])/sum(ertable)
## [1] 0.06
Radial kernel:
radial=svm(y~.,data=dat,kernel="radial", cost=10)
ygrid=predict(radial,xgrid)
plot(xgrid,col=as.numeric(ygrid)+2,pch=20,cex=.4)
points(xtrain,col=y+2,pch=19)
ertable=table(true=y,pred=predict(radial,dat))
ertable
## pred
## true 1 2
## 1 48 1
## 2 2 49
(ertable[2,1]+ertable[1,2])/sum(ertable)
## [1] 0.03
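e1071 also provides a plot method for fitted svm objects that shades the predicted regions and marks the support vectors; with only two predictors it can be called directly on the model and the training data frame:
#built-in decision-region plot of the radial-kernel fit on the training data
plot(radial,dat)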
Now let’s generate some test data in the same way and estimate each model’s test error rate.
set.seed(2048)
xtest=matrix(rnorm(2000),ncol=2)
ytest=rep(1,1000)
for (i in 1:nrow(xtest)){
  if(xtest[i,1]^2/3+xtest[i,2]^2<1){
    ytest[i]=2
  }
}
plot(xtest,col=ytest+2)
To estimate test error we predict on the test set (wrapped in a data frame with the same column names the models were trained on) and compare against the true test labels:
datTest=data.frame(x=xtest,y=as.factor(ytest))
#linear support vector classifier
ertable=table(true=ytest,pred=predict(svc,datTest))
ertable
(ertable[2,1]+ertable[1,2])/sum(ertable)
#polynomial kernel, degree 3
ertable=table(true=ytest,pred=predict(poly3,datTest))
ertable
(ertable[2,1]+ertable[1,2])/sum(ertable)
#polynomial kernel, degree 2
ertable=table(true=ytest,pred=predict(poly2,datTest))
ertable
(ertable[2,1]+ertable[1,2])/sum(ertable)
#radial kernel
ertable=table(true=ytest,pred=predict(radial,datTest))
ertable
(ertable[2,1]+ertable[1,2])/sum(ertable)
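Rather than fixing cost=10, we could also let cross-validation choose the tuning parameters. This step is not part of the original comparison, but e1071’s tune() function makes it straightforward; a sketch for the radial kernel:
#10-fold cross-validation (the default) over a small grid of cost and gamma values
set.seed(1)
tuned=tune(svm,y~.,data=dat,kernel="radial",
  ranges=list(cost=c(0.1,1,10,100),gamma=c(0.5,1,2,4)))
summary(tuned)
#the best model can then be evaluated on the test set as above
table(true=ytest,pred=predict(tuned$best.model,datTest))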
Here we will explore how to obtain a non-linear decision boundary by fitting a logistic regression on non-linear transformations of the features.
First, let’s generate some data:
#500 points with coordinates drawn from uniform distribution in [-0.5,0.5]
x1=runif(500)-.5
x2=runif(500)-.5
#classification rule: point goes to class 1 if |x1|>|x2|
y=1*(x1^2-x2^2>0)
Let’s look on the points:
plot(x1,x2,col=y+2)
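The true boundary here is |x1| = |x2|, i.e. the two diagonals, which we can overlay on the plot (an extra step, not in the original):
#the boundary x1^2 = x2^2 consists of the lines x2 = x1 and x2 = -x1
abline(0,1,lty=2)
abline(0,-1,lty=2)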
What if we try to fit a simple logistic regression to these data?
simpleLog=glm(y~x1+x2,family="binomial")
pred=round(predict(simpleLog,newdata=as.data.frame(cbind(x1,x2)),type = "response"))
plot(x1,x2,col=pred+2)
table(y,pred)
## pred
## y 0 1
## 0 62 180
## 1 114 144
#Error rate
ertable=table(y,pred)
(ertable[2,1]+ertable[1,2])/sum(ertable)
## [1] 0.588
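To see what this model is doing, we can draw its decision boundary directly from the fitted coefficients; the boundary is the line where the linear predictor equals zero (a quick sketch, not in the original):
#boundary: b0 + b1*x1 + b2*x2 = 0, i.e. x2 = -(b0 + b1*x1)/b2
cf=coef(simpleLog)
plot(x1,x2,col=y+2)
abline(a=-cf[1]/cf[3],b=-cf[2]/cf[3],lwd=2)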
The boundary is linear and the error rate is quite high. Let’s add some non-linear transformations of the predictors:
compLog=glm(y~x1+x2+I(x1^2)+I(x2^2)+log(x1+1)+sin(x2),family="binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
pred=round(predict(compLog,newdata=as.data.frame(cbind(x1,x2)),type = "response"))
plot(x1,x2,col=pred+2)
table(y,pred)
## pred
## y 0 1
## 0 242 0
## 1 0 258
#Error rate
ertable=table(y,pred)
(ertable[2,1]+ertable[1,2])/sum(ertable)
## [1] 0
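The transformed model separates the two classes perfectly on the training data, which is exactly what the glm warnings about fitted probabilities of 0 or 1 indicate. The transformation doing the work is the quadratic one: the generating rule only involves x1^2 and x2^2, so a model with just those two terms should already separate the classes (an aside, not in the original):
#quadratic terms alone mirror the rule y = 1*(x1^2 - x2^2 > 0);
#expect the same separation warnings from glm
quadLog=glm(y~I(x1^2)+I(x2^2),family="binomial")
table(y,round(predict(quadLog,type="response")))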
Now let’s check how a support vector classifier deals with this problem.
svc=svm(y~x1+x2,kernel="linear",cost=10)
pred=round(predict(svc,as.data.frame(cbind(x1,x2))))
plot(x1,x2,col=pred+2)
table(y,pred)
## pred
## y 0 1
## 0 136 106
## 1 124 134
#Error rate
ertable=table(y,pred)
(ertable[2,1]+ertable[1,2])/sum(ertable)
## [1] 0.46
Again the boundary is linear, and with an error rate of 0.46 the classifier does barely better than random guessing.
How about an SVM with a radial kernel?
radial=svm(y~x1+x2,kernel="radial",cost=10)
pred=round(predict(radial,as.data.frame(cbind(x1,x2))))
plot(x1,x2,col=pred+2)
table(y,pred)
## pred
## y 0 1
## 0 231 11
## 1 4 254
#Error rate
ertable=table(y,pred)
(ertable[2,1]+ertable[1,2])/sum(ertable)
## [1] 0.03
A 3% error rate, with misclassifications confined to a small region near the center. Quite good.
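The write-up above stops at the training error for this second example. As a quick extra sanity check (not in the source), we could draw a fresh sample from the same design and compute an out-of-sample error rate for the radial-kernel fit:
#fresh data from the same uniform design with the same labelling rule
x1new=runif(500)-.5
x2new=runif(500)-.5
ynew=1*(x1new^2-x2new^2>0)
prednew=round(predict(radial,newdata=data.frame(x1=x1new,x2=x2new)))
mean(prednew!=ynew)   #out-of-sample error rate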