Be careful: I only show how to do SVM on a linearly separable dataset, and the dataset only has two labels (y = 1 or y = -1).
Our goal is to find a hyperplane that separates the two labels (y).
In this simple case we only have two features, x1 and x2, and two labels,
y = 1 or y = -1 (for example, y = 1 could represent boy and y = -1 could represent girl).
set.seed(2)
n = 5
a1 = rnorm(n)
a2 = 1 - a1 + 2* runif(n)
b1 = rnorm(n)
b2 = -1 - b1 - 2*runif(n)
x = rbind(matrix(cbind(a1,a2),,2),matrix(cbind(b1,b2),,2)) # stack the two classes into one 10 x 2 feature matrix
y <- matrix(c(rep(1,n),rep(-1,n))) # labels: the first n rows are +1, the last n are -1
plot(x,col=ifelse(y>0,4,2),pch=".",cex=7,xlab = "x1",ylab = "x2")
# blue is +1
# red is -1
df<- cbind(x,y)
colnames(df) <- c("x1","x2","y(label)")
df
## x1 x2 y(label)
## [1,] -0.89691455 3.0022627 1
## [2,] 0.18484918 1.2929403 1
## [3,] 1.58784533 0.9331813 1
## [4,] -1.13037567 2.4920159 1
## [5,] -0.08025176 1.8908161 1
## [6,] 1.05177437 -3.0293208 -1
## [7,] -0.75266554 -0.5458282 -1
## [8,] -1.43967680 -0.2744484 -1
## [9,] -0.28571150 -2.6395766 -1
## [10,] -1.03428514 -0.2304589 -1
\(f(x)=w_0+w_1x_1+w_2x_2\)
Main algorithm
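Before the code, here is the update rule the walkthrough below builds up to and then implements in the double for loop (my reading of the code; it is subgradient descent on the unregularized hinge loss):

\[
L(w)=\sum_{i=1}^{n}\max\bigl(0,\,1-y_i f(x_i)\bigr),
\qquad
w_j \leftarrow w_j+\eta\sum_{i=1}^{n}\mathbf{1}\bigl[y_i f(x_i)<1\bigr]\,y_i\,x_{ij}
\]

where \(\mathbf{1}[\cdot]\) is the indicator function, \(x_{i0}=1\) is the bias column, and \(\eta\) is the learning rate. Note there is no \(\lambda\lVert w\rVert^2\) penalty here, so this is the plain hinge-loss update rather than the full soft-margin SVM objective.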
set.seed(2)
X <- cbind(1,x)   # design matrix: prepend a column of 1s for the bias term
n <- nrow(X)      # number of samples
p <- ncol(X)      # number of features + 1 (bias)
w_intial <- rnorm(p,-2,1) # random initial guess for the weights
w <- w_intial
eta <- 0.1 # learning rate
R <- 7     # number of iterations
W <- matrix(w_intial ,nrow = R+1,ncol = p,byrow = T) # matrix that stores the initial guess and every gradient descent update
W
## [,1] [,2] [,3]
## [1,] -2.896915 -1.815151 -0.4121547
## [2,] -2.896915 -1.815151 -0.4121547
## [3,] -2.896915 -1.815151 -0.4121547
## [4,] -2.896915 -1.815151 -0.4121547
## [5,] -2.896915 -1.815151 -0.4121547
## [6,] -2.896915 -1.815151 -0.4121547
## [7,] -2.896915 -1.815151 -0.4121547
## [8,] -2.896915 -1.815151 -0.4121547
Look at the W matrix carefully. We have 3 parameters to optimize, so W has 3 columns. The first row records the initial guess for the weights. W has 8 rows because we record the weights after each of the 7 iterations, in addition to the initial guess.
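In code terms (a small illustration I added), the two rows of W you usually care about are the first and the last:
W[1,]       # the initial guess
W[nrow(W),] # will hold the weights after the 7th iteration once the loop below has run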
X%*%w_intial
## [,1]
## [1,] -2.5062760
## [2,] -3.7653351
## [3,] -6.1637083
## [4,] -1.8722082
## [5,] -3.5305542
## [6,] -3.5574949
## [7,] -1.3057474
## [8,] -0.1705688
## [9,] -1.2903913
## [10,] -0.9245463
You can see that with the initial guess, every value of \(f(x)\) is negative, so every sample would be predicted as girl (y = -1). So now you know we need an algorithm to update the weights.
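To make that concrete, here is a quick check I added (not in the original): \(\mathrm{sign}(f(x))\) gives the predicted label, and every prediction comes out as -1 even though half of the true labels are +1.
table(predicted = as.vector(sign(X %*% w_intial)), actual = as.vector(y))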
indicator<-function(condition) ifelse(condition,1,0)
a <- 3
indicator( a<1 )
## [1] 0
(a<4)*1
## [1] 1
X%*%w
## [,1]
## [1,] -2.5062760
## [2,] -3.7653351
## [3,] -6.1637083
## [4,] -1.8722082
## [5,] -3.5305542
## [6,] -3.5574949
## [7,] -1.3057474
## [8,] -0.1705688
## [9,] -1.2903913
## [10,] -0.9245463
dim(X)
## [1] 10 3
sum( ((y*(X%*%w_intial))<1)*1 * y * X[,1] )
## [1] 3
sum( ((y*(X%*%w_intial))<1)*1 * y * X[,2] )
## [1] 2.139114
sum( ((y*(X%*%w_intial))<1)*1 * y * X[,3] )
## [1] 10.11612
w_intial[1]+eta*sum( ((y*(X%*%w))<1)*1 * y * X[,1] )
## [1] -2.596915
w_intial[2]+eta*sum( ((y*(X%*%w))<1)*1 * y * X[,2] )
## [1] -1.601239
w_intial[3]+eta*sum( ((y*(X%*%w))<1)*1 * y * X[,3] )
## [1] 0.5994577
w_intial[1]+eta*sum( ((y*(X%*%W[1,]))<1)*1 * y * X[,1] )
## [1] -2.596915
# W[1,] is the weight vector used in the 1st iteration (here still the initial guess)
# X[,1] is the bias column (all 1s)
w_intial[2]+eta*sum( ((y*(X%*%W[1,]))<1)*1 * y * X[,2] )
## [1] -1.601239
# X[,2] is the x1 feature
w_intial[3]+eta*sum( ((y*(X%*%W[1,]))<1)*1 * y * X[,3] )
## [1] 0.5994577
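The three scalar updates above can also be written as one matrix operation. This is a small vectorized sketch I added for reference; it should reproduce the same three updated weights:
violated <- ((y*(X%*%w_intial))<1)*1 # indicator: which samples violate the margin
grad <- t(X) %*% (violated * y)      # entry j is sum_i violated_i * y_i * X[i,j]
w_intial + eta * as.vector(grad)     # same three numbers as the three updates above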
for(i in 1:R){
  for(j in 1:p){
    W[i+1,j] <- W[i,j] + eta*sum(((y*(X%*%W[i,]))<1)*1 * y * X[,j])
  }
}
W
## [,1] [,2] [,3]
## [1,] -2.896915 -1.8151508 -0.4121547
## [2,] -2.596915 -1.6012394 0.5994577
## [3,] -2.196915 -1.4907564 1.5880242
## [4,] -1.996915 -1.1775445 2.0271628
## [5,] -1.896915 -0.8563073 2.2772198
## [6,] -1.696915 -0.6790379 2.4998319
## [7,] -1.596915 -0.5202534 2.5931501
## [8,] -1.496915 -0.3614688 2.6864682
print( " the intial guess result ")
## [1] " the intial guess result "
X%*%W[1,]
## [,1]
## [1,] -2.5062760
## [2,] -3.7653351
## [3,] -6.1637083
## [4,] -1.8722082
## [5,] -3.5305542
## [6,] -3.5574949
## [7,] -1.3057474
## [8,] -0.1705688
## [9,] -1.2903913
## [10,] -0.9245463
print( " the result after 7 iteration ")
## [1] " the result after 7 iteration "
X%*%W[nrow(W),]
## [,1]
## [1,] 6.8927753
## [2,] 1.9097113
## [3,] 0.4360907
## [4,] 5.6064024
## [5,] 3.6117113
## [6,] -10.0152722
## [7,] -2.6911995
## [8,] -1.7138131
## [9,] -8.4847773
## [10,] -1.7421731
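As a sanity check (a sketch I added, not part of the original walkthrough), you can track the total hinge loss \(\sum_i \max(0,\,1-y_i f(x_i))\) for every stored row of W; it should drop sharply between the initial guess and iteration 7:
apply(W, 1, function(w) sum(pmax(0, 1 - y * (X %*% w)))) # one loss value per row of W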
svm_gradient <- function(x, eta = 0.001, R = 10000){
  # note: this function uses the label vector y from the global environment
  X <- cbind(1,x)   # design matrix: prepend a column of 1s for the bias term
  n <- nrow(X)      # number of samples
  p <- ncol(X)      # number of features + 1 (bias)
  w_intial <- rep(0,p)
  W <- matrix(w_intial, nrow = R+1, ncol = p, byrow = T) # matrix that stores the initial guess and every gradient descent update
  for(i in 1:R){
    for(j in 1:p){
      W[i+1,j] <- W[i,j] + eta*sum(((y*(X%*%W[i,]))<1)*1 * y * X[,j])
    }
  }
  return(W)
}
getsvm <- function(x){
  W <- svm_gradient(x)    # run gradient descent once
  w_answer <- W[nrow(W),] # keep the weights from the final iteration
  return(w_answer)
}
set.seed(2)
n = 5
a1 = rnorm(n)
a2 = 1 - a1 + 2* runif(n)
b1 = rnorm(n)
b2 = -1 - b1 - 2*runif(n)
x = rbind(matrix(cbind(a1,a2),,2),matrix(cbind(b1,b2),,2))
y <- matrix(c(rep(1,n),rep(-1,n)))
plot(x,col=ifelse(y>0,4,2),pch=".",cex=7,xlab = "x1",ylab = "x2")
w_answer<- getsvm(x)
abline(-w_answer[1]/w_answer[3],-w_answer[2]/w_answer[3])
abline((1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)
abline((-1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)
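The arguments to abline come from rewriting the hyperplane in slope-intercept form. With w_answer = \((w_0, w_1, w_2)\), setting \(f(x)=0\) (the solid decision boundary) and \(f(x)=\pm 1\) (the dashed margin lines) and solving for \(x_2\) gives

\[
x_2=-\frac{w_0}{w_2}-\frac{w_1}{w_2}x_1
\qquad\text{and}\qquad
x_2=\frac{\pm 1-w_0}{w_2}-\frac{w_1}{w_2}x_1,
\]

which is exactly the intercept and slope passed to abline above.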
set.seed(8)
n = 5
a1 = rnorm(n)
a2 = 1 - a1 + 2* runif(n)
b1 = rnorm(n)
b2 = -1 - b1 - 2*runif(n)
x = rbind(matrix(cbind(a1,a2),,2),matrix(cbind(b1,b2),,2))
y <- matrix(c(rep(1,n),rep(-1,n)))
plot(x,col=ifelse(y>0,4,2),pch=".",cex=7,xlab = "x1",ylab = "x2")
w_answer<- getsvm(x)
abline(-w_answer[1]/w_answer[3],-w_answer[2]/w_answer[3])
abline((1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)
abline((-1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)
n = 100
a1 = rnorm(n)
a2 = 1 - a1 + 2* runif(n)
b1 = rnorm(n)
b2 = -1 - b1 - 2*runif(n)
x = rbind(matrix(cbind(a1,a2),,2),matrix(cbind(b1,b2),,2))
y <- matrix(c(rep(1,n),rep(-1,n)))
plot(x,col=ifelse(y>0,4,2),pch=".",cex=3,xlab = "x1",ylab = "x2") # keep the convention: blue is +1, red is -1
w_answer<- getsvm(x)
abline(-w_answer[1]/w_answer[3],-w_answer[2]/w_answer[3])
abline((1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)
abline((-1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)
n = 100
a1 = rnorm(n)
a2 = 1 + a1 + 2* runif(n)
b1 = rnorm(n)
b2 = -1 + b1 - 2*runif(n)
x = rbind(matrix(cbind(a1,a2),,2),matrix(cbind(b1,b2),,2))
y <- matrix(c(rep(1,n),rep(-1,n)))
plot(x,col=ifelse(y>0,4,2),pch=".",cex=7,xlab = "x1",ylab = "x2")
w_answer<- getsvm(x)
abline(-w_answer[1]/w_answer[3],-w_answer[2]/w_answer[3])
abline((1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)
abline((-1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)
n = 100
a1 = rnorm(n)
a2 = 1 - a1 + 2* runif(n)
b1 = rnorm(n)
b2 = -1 + b1 - 2*runif(n)
x = rbind(matrix(cbind(a1,a2),,2),matrix(cbind(b1,b2),,2))
y <- matrix(c(rep(1,n),rep(-1,n)))
plot(x,col=ifelse(y>0,4,2),pch=".",cex=7,xlab = "x1",ylab = "x2")
w_answer<- getsvm(x)
abline(-w_answer[1]/w_answer[3],-w_answer[2]/w_answer[3])
abline((1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)
abline((-1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)