1 Make linear seperable sample

Becareful I only display how to do svm in linear seperable dataset and the dataset only have two label (y=1 or y=-1)

1.1 show how to make it

Our goal is to make a hyperplane that can separate two label(y)

In this simple case we only have two feature x1 and x2 and two label

y=1 or y=-1(maybe y=1 represent boy,y=-1 represent girl)

set.seed(2)
n = 5
a1 = rnorm(n)
a2 = 1 - a1 + 2* runif(n)
b1 = rnorm(n)
b2 = -1 - b1 - 2*runif(n)
x = rbind(matrix(cbind(a1,a2),,2),matrix(cbind(b1,b2),,2))
y <- matrix(c(rep(1,n),rep(-1,n)))
plot(x,col=ifelse(y>0,4,2),pch=".",cex=7,xlab = "x1",ylab = "x2")

#blue is+1
#red  is-1

1.2 show the data we called df

df<- cbind(x,y)
colnames(df) <- c("x1","x2","y(label)")
df
##                x1         x2 y(label)
##  [1,] -0.89691455  3.0022627        1
##  [2,]  0.18484918  1.2929403        1
##  [3,]  1.58784533  0.9331813        1
##  [4,] -1.13037567  2.4920159        1
##  [5,] -0.08025176  1.8908161        1
##  [6,]  1.05177437 -3.0293208       -1
##  [7,] -0.75266554 -0.5458282       -1
##  [8,] -1.43967680 -0.2744484       -1
##  [9,] -0.28571150 -2.6395766       -1
## [10,] -1.03428514 -0.2304589       -1

2 Introduce svm algorithm

2.1 step1

\(f(x)=w_0+w_1x_1+w_2x_2\)

2.2 step2

2.3 step3 gradient decesnt

main algorithm

main algorithm

3 svm from scratch in R

3.1 prepare step

set.seed(2)
X<- cbind(1,x) #make design matrix
n <- nrow(X)  #number of sample
p <- ncol(X) #number of feature+1 (bias)
w_intial <- rnorm(p,-2,1)
w <- w_intial
eta <- 0.1
R <- 7 #number of iteration
W <- matrix(w_intial ,nrow = R+1,ncol = p,byrow = T) #matrix put intial guess and the procedure to do gradient descent 
W
##           [,1]      [,2]       [,3]
## [1,] -2.896915 -1.815151 -0.4121547
## [2,] -2.896915 -1.815151 -0.4121547
## [3,] -2.896915 -1.815151 -0.4121547
## [4,] -2.896915 -1.815151 -0.4121547
## [5,] -2.896915 -1.815151 -0.4121547
## [6,] -2.896915 -1.815151 -0.4121547
## [7,] -2.896915 -1.815151 -0.4121547
## [8,] -2.896915 -1.815151 -0.4121547

See the W matrix carefully.we have 3 parameter to optimize,so we have 3 column.the first line record the intial guess value of weight.we have 8 row because we want to record every time we do iteration.

X%*%w_intial
##             [,1]
##  [1,] -2.5062760
##  [2,] -3.7653351
##  [3,] -6.1637083
##  [4,] -1.8722082
##  [5,] -3.5305542
##  [6,] -3.5574949
##  [7,] -1.3057474
##  [8,] -0.1705688
##  [9,] -1.2903913
## [10,] -0.9245463

you can see that if you use intial guess to do it. you will predict all sample as girl.So,now you know you need algorithm to update it

3.2 indicator function

indicator<-function(condition) ifelse(condition,1,0)
a <- 3
indicator( a<1 )
## [1] 0
(a<4)*1
## [1] 1

3.3 part of gradient descent

3.3.1 step1

X%*%w
##             [,1]
##  [1,] -2.5062760
##  [2,] -3.7653351
##  [3,] -6.1637083
##  [4,] -1.8722082
##  [5,] -3.5305542
##  [6,] -3.5574949
##  [7,] -1.3057474
##  [8,] -0.1705688
##  [9,] -1.2903913
## [10,] -0.9245463
dim(X)
## [1] 10  3
sum(   ((y*(X%*%w_intial))<1)*1 * y * X[,1] )
## [1] 3
sum(   ((y*(X%*%w_intial))<1)*1 * y * X[,2] )
## [1] 2.139114
sum(   ((y*(X%*%w_intial))<1)*1 * y * X[,3] )
## [1] 10.11612

3.3.2 step2

w_intial[1]+eta*sum(   ((y*(X%*%w))<1)*1 * y * X[,1] )
## [1] -2.596915
w_intial[2]+eta*sum(   ((y*(X%*%w))<1)*1 * y * X[,2] )
## [1] -1.601239
w_intial[3]+eta*sum(   ((y*(X%*%w))<1)*1 * y * X[,3] )
## [1] 0.5994577

3.3.3 step3

w_intial[1]+eta*sum(   ((y*(X%*%W[1,]))<1)*1 * y * X[,1] )
## [1] -2.596915
#W[1,] imply the 1th iteration
#X[,1] imply the 
w_intial[2]+eta*sum(   ((y*(X%*%W[1,]))<1)*1 * y * X[,2] )
## [1] -1.601239
#X[,2] imply the x1 feature
w_intial[3]+eta*sum(   ((y*(X%*%W[1,]))<1)*1 * y * X[,3] )
## [1] 0.5994577

3.3.4 step4

for(i in 1:R){
for(j in 1:p){
W[i+1,j]<- W[i,j]+eta*sum(((y*(X%*%W[i,]))<1)*1 * y * X[,j] )  
}
}
W
##           [,1]       [,2]       [,3]
## [1,] -2.896915 -1.8151508 -0.4121547
## [2,] -2.596915 -1.6012394  0.5994577
## [3,] -2.196915 -1.4907564  1.5880242
## [4,] -1.996915 -1.1775445  2.0271628
## [5,] -1.896915 -0.8563073  2.2772198
## [6,] -1.696915 -0.6790379  2.4998319
## [7,] -1.596915 -0.5202534  2.5931501
## [8,] -1.496915 -0.3614688  2.6864682

3.3.5 step5 the result

print(   "  the intial guess result  ")
## [1] "  the intial guess result  "
X%*%W[1,]
##             [,1]
##  [1,] -2.5062760
##  [2,] -3.7653351
##  [3,] -6.1637083
##  [4,] -1.8722082
##  [5,] -3.5305542
##  [6,] -3.5574949
##  [7,] -1.3057474
##  [8,] -0.1705688
##  [9,] -1.2903913
## [10,] -0.9245463
print(   "  the result after 7 iteration  ")
## [1] "  the result after 7 iteration  "
X%*%W[nrow(W),]
##              [,1]
##  [1,]   6.8927753
##  [2,]   1.9097113
##  [3,]   0.4360907
##  [4,]   5.6064024
##  [5,]   3.6117113
##  [6,] -10.0152722
##  [7,]  -2.6911995
##  [8,]  -1.7138131
##  [9,]  -8.4847773
## [10,]  -1.7421731
  • you can see after 7 iteration our svm sucessfully predict perfectly

3.3.6 step6 the completely function

svm_gradient<- function(x,eta=0.001,R=10000){
X<- cbind(1,x)#make design matrix
n <- nrow(X)  #number of sample
p <- ncol(X) #number of feature+1 (bias)
w_intial <- rep(0,p)
W <- matrix(w_intial ,nrow = R+1,ncol = p,byrow = T) #matrix put intial guess and the procedure to do gradient descent
for(i in 1:R){
  for(j in 1:p)
  {
    W[i+1,j]<- W[i,j]+eta*sum(((y*(X%*%W[i,]))<1)*1 * y * X[,j] )  
  }
  }
return(W)  
}
getsvm <- function(x){
w_answer<- svm_gradient(x)[nrow(svm_gradient(x)),]
return(w_answer )
}

3.4 simple case

set.seed(2)
n = 5
a1 = rnorm(n)
a2 = 1 - a1 + 2* runif(n)
b1 = rnorm(n)
b2 = -1 - b1 - 2*runif(n)
x = rbind(matrix(cbind(a1,a2),,2),matrix(cbind(b1,b2),,2))
y <- matrix(c(rep(1,n),rep(-1,n)))
plot(x,col=ifelse(y>0,4,2),pch=".",cex=7,xlab = "x1",ylab = "x2")
w_answer<- getsvm(x)
abline(-w_answer[1]/w_answer[3],-w_answer[2]/w_answer[3])
abline((1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)
abline((-1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)

set.seed(8)
n = 5
a1 = rnorm(n)
a2 = 1 - a1 + 2* runif(n)
b1 = rnorm(n)
b2 = -1 - b1 - 2*runif(n)
x = rbind(matrix(cbind(a1,a2),,2),matrix(cbind(b1,b2),,2))
y <- matrix(c(rep(1,n),rep(-1,n)))
plot(x,col=ifelse(y>0,4,2),pch=".",cex=7,xlab = "x1",ylab = "x2")
w_answer<- getsvm(x)
abline(-w_answer[1]/w_answer[3],-w_answer[2]/w_answer[3])
abline((1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)
abline((-1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)

4 Result

4.1 Case1:

n = 100
a1 = rnorm(n)
a2 = 1 - a1 + 2* runif(n)
b1 = rnorm(n)
b2 = -1 - b1 - 2*runif(n)
x = rbind(matrix(cbind(a1,a2),,2),matrix(cbind(b1,b2),,2))
y <- matrix(c(rep(1,n),rep(-1,n)))
plot(x,col=ifelse(y>0,2,4),pch=".",cex=3,xlab = "x1",ylab = "x2")
w_answer<- getsvm(x)
abline(-w_answer[1]/w_answer[3],-w_answer[2]/w_answer[3])
abline((1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)
abline((-1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)

4.2 Case2:

n = 100
a1 = rnorm(n)
a2 = 1 + a1 + 2* runif(n)
b1 = rnorm(n)
b2 = -1 + b1 - 2*runif(n)
x = rbind(matrix(cbind(a1,a2),,2),matrix(cbind(b1,b2),,2))
y <- matrix(c(rep(1,n),rep(-1,n)))
plot(x,col=ifelse(y>0,2,4),pch=".",cex=7,xlab = "x1",ylab = "x2")
w_answer<- getsvm(x)
abline(-w_answer[1]/w_answer[3],-w_answer[2]/w_answer[3])
abline(1-w_answer[1]/w_answer[3],-w_answer[2]/w_answer[3],lty=2)
abline(-1-w_answer[1]/w_answer[3],-w_answer[2]/w_answer[3],lty=2)

4.3 Case3: not linear seperable set

n = 100
a1 = rnorm(n)
a2 = 1 - a1 + 2* runif(n)
b1 = rnorm(n)
b2 = -1 + b1 - 2*runif(n)
x = rbind(matrix(cbind(a1,a2),,2),matrix(cbind(b1,b2),,2))
y <- matrix(c(rep(1,n),rep(-1,n)))
plot(x,col=ifelse(y>0,2,4),pch=".",cex=7,xlab = "x1",ylab = "x2")
w_answer<- getsvm(x)
abline(-w_answer[1]/w_answer[3],-w_answer[2]/w_answer[3])
abline((1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)
abline((-1-w_answer[1])/w_answer[3],-w_answer[2]/w_answer[3],lty=2)