Using the quadprog package in R, I solved the following minimization problem: \[\begin{align}
\text{minimize} & \quad \frac{1}{2} b^T \mathbf{D} b - d^T b, \nonumber \\
\text{subject to} & \quad \mathbf{A}^T b \geq b_0, \nonumber
\end{align}\] where \(b\) is the unknown parameter vector. For more details, see the package documentation on CRAN.
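As a quick illustration of solve.QP()'s argument convention (a minimal sketch with made-up numbers, not part of the data used below), the following solves a tiny two-variable QP; the data for the actual problem are generated right after.
library(quadprog)
# minimize 1/2 b' D b - d' b  subject to  t(Amat) %*% b >= bvec
# here: D = I, d = (1, 2), and a single constraint b1 + b2 >= 1
Dmat <- diag(2)
dvec <- c(1, 2)
Amat <- matrix(c(1, 1), nrow = 2)  # each column of Amat is one constraint
bvec <- 1
solve.QP(Dmat, dvec, Amat, bvec)$solution  # the unconstrained optimum (1, 2) already satisfies the constraint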
# generate n = 30 positive and n = 30 negative observations in p = 2 dimensions
set.seed(4); n <- 30; p <- 2
xpos <- matrix(rnorm(n*p, mean=0, sd=1), n, p)
xneg <- matrix(rnorm(n*p, mean=3, sd=1), n, p)
x <- rbind(xpos, xneg)
y <- matrix(c(rep(1, n), rep(-1, n)))
plot(x,col=ifelse(y>0,"darkorange", "deepskyblue"), pch = 19, xlab = "x1", ylab = "x2")
legend("topleft", c("Positive","Negative"),
col=c("darkorange", "deepskyblue"), pch=c(19, 19), text.col=c("darkorange", "deepskyblue"))
The solve.QP() function solves QP problems of the form: \[ \min_\beta \; -d^T \beta +\frac{1}{2}\beta^T D \beta \hspace{0.3in} s.t.\hspace{0.3in} A^T \beta \geqslant a. \]
The primal problem is: \[\underset{\beta,\beta_0} {min} \frac{1}{2}||\beta||^2 \hspace{0.3in} s.t.\hspace{0.3in} y_i(x_i^T\beta+\beta_0)\geq1,\hspace{0.1in}i=1,2,...,n\]
To map the primal onto this form, I defined a \((p+1)\times(p+1)\) matrix \(D\), an \(n\times(p+1)\) matrix \(A^T\), and a length-\(n\) vector of ones as \(a\) (here \(n = 60\) observations). The parameter vector stacks \(\beta\) and \(\beta_0\), so \(\beta_0\) enters the quadratic term only through a small ridge: its diagonal entry in \(D\) is set to \(10^{-5}\) so that \(D\) stays positive definite. The \(i^{th}\) row of \(A^T\) is \(y_i(x_i^T, 1)\).
library(quadprog)
# D matrix: identity for beta, small ridge for beta0 so D is positive definite
D = diag(3)
D[3,3] = 1e-5
# d vector
d = rep(0, 3)
# A transpose matrix: row i is y_i * (x_i, 1)
A = sweep(cbind(x, 1), 1, y, FUN = "*")
# a vector: one constraint per observation
a = rep(1, 2*n)
# solve the quadratic program; the solution stacks (beta, beta0)
qp = solve.QP(Dmat = D, dvec = d, Amat = t(A), bvec = a)$solution
# get beta and beta0
b = qp[1:2]
b0 = qp[3]
# plot the decision line and the support vectors
plot(x,col=ifelse(y>0,"darkorange", "deepskyblue"), pch = 19, xlab = "x1",
ylab = "x2")
legend("topright", c("Positive","Negative"),
col=c("darkorange", "deepskyblue"), pch=c(19, 19), text.col=c("darkorange", "deepskyblue"))
points( x[sweep(x %*% b + b0, 1, y, FUN = "*") <= 1 + 1e-5, ] , pch = 1,
cex = 2)
abline(-b0/b[2], -b[1]/b[2])
abline(-(b0 - 1)/b[2], -b[1]/b[2], lty=3)
abline(-(b0 + 1)/b[2], -b[1]/b[2], lty=3)
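As a quick check (a minimal sketch using only the objects computed above), the margin constraints \(y_i(x_i^T\beta+\beta_0)\geq 1\) should all hold, and the training error should be zero on this separable data:
# all margin constraints satisfied (up to numerical tolerance)?
all(y * (x %*% b + b0) >= 1 - 1e-6)
# training error of the primal solution
mean(y * (x %*% b + b0) < 0)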
The dual problem is: \[\underset{\alpha_i \geq 0} {max}\sum_{i}\alpha_i-\frac{1}{2}\sum_{jk}\alpha_j\alpha_ky_jy_k(x_j^Tx_k)\hspace{0.3in} s.t.\hspace{0.3in} 0\leq\alpha_i\leq C \hspace{0.1in}i=1,2,...,n \hspace{0.1in}and \hspace{0.1in} \sum_{i}\alpha_iy_i=0\]
The dual form of the classifier is: \[f(x)=\sum_{i=1}^{n}\alpha_iy_i(x_i^Tx)+b\]
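The link between the dual solution and the separating line recovered below follows from the standard KKT conditions (restated here for reference): \[\beta = \sum_{i=1}^{n}\alpha_i y_i x_i, \hspace{0.3in} \alpha_i > 0 \;\Rightarrow\; y_i(x_i^T\beta + b) = 1 \;\Rightarrow\; b = y_i - x_i^T\beta,\] so \(b\) can be estimated by averaging over the support vectors (the points with \(\alpha_i > 0\)), which is the approach taken in findLine() below.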
library(e1071)  # provides svm() for the comparison below
eps <- 1e-6     # small ridge added to D so that solve.QP sees a positive definite matrix
# build the system matrices; n is redefined as the total number of observations
n <- length(y)  # 60 points in total
Q <- sapply(1:n, function(i) y[i]*t(x)[,i])
D <- t(Q) %*% Q
d <- matrix(1, nrow=n)
# bvec: 0 for the equality constraint sum(alpha_i * y_i) = 0, then 0's for alpha_i >= 0
b0 <- rbind( matrix(0, nrow=1, ncol=1) , matrix(0, nrow=n, ncol=1) )
# Amat: first column encodes the equality constraint, the identity encodes alpha_i >= 0
A <- t(rbind(matrix(y, nrow=1, ncol=n), diag(nrow=n)))
# call the QP solver:
sol <- solve.QP(D +eps*diag(n), d, A, b0, meq=1, factorized=FALSE)
qpsol <- matrix(sol$solution, nrow=n)
# extract and plot the decision boundary.
# build the support vectors, slopes, and intercepts
findLine <- function(aa, y, x){
  nonzero <- abs(aa) > 1e-6
  W <- rowSums(sapply(which(nonzero), function(i) aa[i]*y[i]*x[i,]))
  b <- mean(sapply(which(nonzero), function(i) x[i,] %*% W - y[i]))
  slope <- -W[1]/W[2]
  intercept <- b/W[2]
  return(c(intercept, slope))
}
qpline <- findLine(qpsol, y, x)
train <- data.frame(x,y)
#Determine svm line
C <- 1e5 # Huge value forces hard margin problem
sv <- svm(y~X1+X2, data=train, kernel="linear", scale=FALSE, type="C-classification", cost=C)
# get the slope and intercept terms from e1071 svm:
W <- rowSums(sapply(1:length(sv$coefs), function(i) sv$coefs[i]*sv$SV[i,]))
svmline = c(sv$rho/W[2], -W[1]/W[2])
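As a sanity check (a short sketch using qpline and svmline computed above), the two intercept/slope pairs should agree closely, since the huge cost makes svm() solve essentially the same hard-margin problem:
# the quadprog dual line and the e1071 svm line should be (nearly) identical
rbind(quadprog = qpline, e1071 = svmline)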
# Plot
library(ggplot2)
plt <- ggplot(train, aes(x=X1, y=X2)) +
  ggtitle("Solving the SVM QP") +
  geom_point(aes(fill=factor(y)), size=3, pch=21) +
  # the margin lines sit 1/W[2] above and below the decision line in intercept
  geom_abline(intercept=svmline[1] - 1/W[2], slope=svmline[2], size=1, linetype="dashed") +
  geom_abline(intercept=svmline[1], slope=svmline[2], size=1) +
  geom_abline(intercept=svmline[1] + 1/W[2], slope=svmline[2], size=1, linetype="dashed") +
  scale_fill_manual(values=c("deepskyblue","darkorange"), guide='none')
print(plt)
One can also perform linear and nonlinear classification using the penalized loss framework. Consider the following logistic loss function: \[L(y, f(x)) = \log(1 + e^{- y f(x)}).\]

## a) Linear SVM
When \(f(x)\) is a linear function, SVM can be solved by optimizing the penalized loss: \[ \underset{\beta_0, \boldsymbol\beta}{\arg\min} \sum_{i=1}^n L(y_i, \beta_0 + x_i^T \boldsymbol\beta) + \lambda \lVert \boldsymbol\beta \rVert^2.\] I combined \(\beta_0\) and \(\boldsymbol\beta\) into a single vector \(\alpha\), so that \(f(x)=\tilde{x}^T\alpha\) with \(\tilde{x}=(1, x^T)^T\). By the chain rule, the gradient of the loss with respect to \(\alpha\) is: \[\frac{\partial L}{\partial \alpha}=\frac{\partial L}{\partial f}\frac{\partial f}{\partial \alpha}=\frac{-ye^{-yf(x)}}{1+e^{-yf(x)}}\,\tilde{x},\] where: \[\frac{\partial L}{\partial f}=\frac{-ye^{-yf(x)}}{1+e^{-yf(x)}}\hspace{0.1in}and\hspace{0.1in}\frac{\partial f}{\partial \alpha}=\tilde{x}. \]
Thus, the gradient of the penalized objective is: \[\sum_{i=1}^{n}\frac{\partial L_i}{\partial \alpha}+2\lambda(0,\boldsymbol\beta^T)^T,\] where the intercept is not penalized. I chose \(\lambda=1\).
# penalized loss: logistic loss plus ridge penalty on beta (intercept b[1] not penalized)
svmfn <- function(b, x, y, lambda){
  sum(log(1 + exp(- (x %*% b) * y))) + lambda * sum(b[-1]^2)
}
# gradient of the penalized loss
svmgn <- function(b, x, y, lambda){
  e1 = exp(-y*(x %*% b))
  e2 = y*e1 / (1+e1)
  return( - t(x) %*% e2 + c(0, 2*lambda*b[-1]) )
}
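Before running the optimization, a quick finite-difference check (a small sketch added here; b_test is an arbitrary point, not part of the original analysis) can confirm that svmgn matches the numerical gradient of svmfn:
# compare the analytic gradient with a central-difference approximation
b_test <- c(0.5, -0.3, 0.2)
num_grad <- sapply(1:3, function(j){
  h <- rep(0, 3); h[j] <- 1e-6
  (svmfn(b_test + h, cbind(1, x), y, 1) - svmfn(b_test - h, cbind(1, x), y, 1)) / (2e-6)
})
max(abs(num_grad - svmgn(b_test, cbind(1, x), y, 1)))  # should be close to 0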
mysvmfit <- optim(par = rep(0, 3), fn = svmfn, gr = svmgn,
                  x = cbind(1, x), y = ifelse(y == "1", 1, -1), lambda = 1,
                  method = "BFGS")
b0 = mysvmfit$par[1]
b = mysvmfit$par[2:3]
print(c(b0, b))
## [1] 4.218373 -1.324632 -1.370595
plot(x,col=ifelse(y>0,"darkorange", "deepskyblue"), pch = 19, xlab = "x1",
ylab = "x2")
legend("topright", c("Positive","Negative"), col=c("darkorange", "deepskyblue"),
pch=c(19, 19), text.col=c("darkorange", "deepskyblue"))
abline(a = -b0/b[2], b = -b[1]/b[2], col="black", lty=1, lwd = 2)
#Training error
error=1 - mean( y * (x %*% b + b0) > 0)
error
## [1] 0
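To use the fitted linear classifier on new data, one takes the sign of the fitted function (a minimal sketch; xnew is a made-up point, not from the generated data):
# classify a hypothetical new observation with the fitted (b0, b)
xnew <- c(1.5, 1.5)
ifelse(sum(xnew * b) + b0 > 0, "Positive", "Negative")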
## b) Nonlinear SVM

For the nonlinear case, I generated data using the code provided below:
set.seed(2)
n = 300
p = 2 # dimension
# Generate the positive and negative examples
x <- matrix(runif(n*p), n, p)
side <- (x[, 2] > 0.5 + 0.3*sin(3*pi*x[, 1]))
y <- sample(c(1, -1), n, TRUE, c(0.9, 0.1))*(side == 1) + sample(c(1, -1), n, TRUE, c(0.1, 0.9))*(side == 0)
plot(x,col=ifelse(y>0,"darkorange", "deepskyblue"), pch = 19, xlab = "x1", ylab = "x2")
legend("top", c("Positive","Negative"),
col=c("darkorange", "deepskyblue"), pch=c(19, 19), text.col=c("darkorange", "deepskyblue"))
For the kernel version, the model is \(f(x)=\sum_{j=1}^{n}\beta_j K(x_j, x)\), so on the training data \(f(x_i)=K_i^T\beta\), where \(K_i\) is the \(i^{th}\) row of the kernel matrix \(K\). The gradient of the penalty \(\lambda\beta^TK\beta\) is \(\lambda(K+K^T)\beta=2\lambda K \beta\) since \(K\) is symmetric. The gradient of the penalized objective is therefore: \[\sum_{i=1}^{n}\frac{\partial L_i}{\partial \beta}+2\lambda K \beta \hspace{0.3in} where \hspace{0.3in} \frac{\partial L_i}{\partial \beta}=\frac{-y_ie^{-y_if(x_i)}}{1+e^{-y_if(x_i)}}K_i \] We used the following Gaussian kernel model: \[K_\lambda(x_i,x_j)=e^{-\frac{1}{2}\sum_{k=1}^{p}\left(\frac{x_{ik}-x_{jk}}{\lambda_k}\right)^2} \hspace{0.2in} and \hspace{0.2in} \lambda_k= \left(\frac{4}{p+2}\right)^{\frac{1}{p+4}}n^{-\frac{1}{p+4}}\widehat\sigma_k \]
# calculate the kernel matrix
# x is standardized with scale(), so the per-dimension sd in the bandwidth is handled there
scale.factor = (4 / (p+2))^(1/(p+4)) * n^(-1/(p+4))
K = exp(- as.matrix(dist( scale(x) / scale.factor))^2 / 2)
# penalty parameter
lambda = 1
# define the objective function
svmfn_k <- function(b, k, y, lambda){
  sum(log(1 + exp(- (k %*% b) * y))) + lambda * t(b) %*% k %*% b
}
# define the gradient function
svmgn_k <- function(b, k, y, lambda){
  a1 = exp(- (k %*% b) * y)
  a2 = a1 / (1 + a1) * (-y)
  return(colSums(sweep(k, 1, a2, FUN = "*")) + k %*% b * lambda * 2)
}
# perform optimization
mysvmfit <- optim(par = rep(0, n), fn = svmfn_k, gr = svmgn_k,
                  k = K, y = ifelse(y == "1", 1, -1), lambda = lambda,
                  method = "BFGS")
# the fitted data
plot(x, col = ifelse(K %*% mysvmfit$par > 0,"darkorange", "deepskyblue"),
pch = 19, xlab = "x1", ylab = "x2")
legend("top", c("Positive","Negative"),
col=c("darkorange", "deepskyblue"), pch=c(19, 19), text.col=c("darkorange", "deepskyblue"))
# mis-classification
table(K %*% mysvmfit$par > 0, y)
## y
## -1 1
## FALSE 148 30
## TRUE 16 106
# training err
1 - mean(( K %*% mysvmfit$par * y ) > 0)
## [1] 0.1533333
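To classify a new point with the fitted kernel expansion, the kernel between the new point and each training point has to be computed with the same centering and scaling used for the training kernel. A minimal sketch (predict_kernel_svm is a helper introduced here, not part of the original code, and the two test points are made up):
# kernel prediction for new points, reusing the training standardization
xs <- scale(x)
ctr <- attr(xs, "scaled:center"); scl <- attr(xs, "scaled:scale")
z_train <- xs / scale.factor
predict_kernel_svm <- function(x0){
  z0 <- (x0 - ctr) / scl / scale.factor
  k0 <- exp(- colSums((t(z_train) - z0)^2) / 2)  # kernel between x0 and each training point
  sign(sum(k0 * mysvmfit$par))
}
predict_kernel_svm(c(0.5, 0.9))  # a point well above the sine boundary; likely classified +1
predict_kernel_svm(c(0.5, 0.1))  # a point below the boundary; likely classified -1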