#n: number of observations
#p: dimensionality
gen_data <- function(n, p, sparsity, level){
set.seed(44347)
X = matrix(rnorm(n*p), n, p)
w = c(rep(level, sparsity), rep(0,p - sparsity))
Y = X%*%w + rnorm(n)
training = list(X, Y, w)
return(training)
}
lasso_loss <- function(w, lambda){
loss <- sum((y - X%*%w)^2) + lambda * sum(abs(w))
return(loss)
}
training <- gen_data(50, 100, 5, 5)
X = training[[1]]
y = training[[2]]
w_true = training[[3]]
opt1 <- optim(par = rep(0,100), lasso_loss, method= 'BFGS', lambda = 1)
opt1$par
## [1] 4.606824e+00 3.908415e+00 4.723776e+00 4.700850e+00 5.362772e+00
## [6] -7.273283e-04 1.398976e-03 -1.525739e-01 4.157714e-01 1.530271e-03
## [11] 9.161512e-04 1.395263e-01 9.360521e-04 -1.700743e-02 -6.031554e-01
## [16] -2.451210e-01 -5.288032e-04 2.828499e-01 -1.690672e-03 -6.393211e-02
## [21] -2.719162e-01 -6.453265e-04 -4.047939e-05 -2.373779e-03 3.140585e-01
## [26] 8.465724e-02 2.192199e-03 -7.617371e-03 -5.531177e-04 -7.083452e-04
## [31] -2.074211e-01 1.788335e-02 -1.907016e-03 1.041423e-02 -1.293527e-01
## [36] -6.858803e-02 3.369819e-03 3.061043e-01 5.654715e-02 1.920272e-03
## [41] 3.202323e-01 2.847158e-01 1.835515e-01 4.737732e-04 -3.707745e-02
## [46] -6.827712e-02 -6.412329e-02 -1.452116e-02 -2.037922e-03 6.441002e-02
## [51] -4.745715e-03 1.216664e-02 -1.124578e-02 3.288435e-03 -8.313261e-04
## [56] -5.198048e-02 -2.496557e-01 -5.826357e-04 -1.278808e-01 6.199230e-02
## [61] 2.825630e-02 4.265550e-03 -2.756101e-01 -3.046280e-03 -1.599315e-03
## [66] -6.444854e-02 -1.051239e-01 -1.996625e-02 -8.623252e-02 1.056873e-01
## [71] -2.484298e-03 1.884428e-01 -6.716328e-04 -3.554912e-03 8.890913e-05
## [76] 2.423336e-03 -6.299370e-04 -9.630452e-04 -6.370846e-04 -5.592838e-03
## [81] 1.818511e-03 -1.145535e-01 -5.178572e-02 -1.759647e-01 -4.243718e-01
## [86] -7.930885e-03 -1.710921e-03 -2.120791e-01 -2.985285e-03 2.356901e-03
## [91] -7.168790e-04 -1.789444e-03 2.782760e-02 2.479881e-03 4.914437e-03
## [96] 7.396344e-02 -9.902068e-04 -1.782865e-02 1.498581e-01 6.382138e-02
plot(w_true, ylim = c(-5,5), main = "Solved w vs true w")
points(opt1$par, col = 'grey')
f_wrapper <- function(lambda_w){
return(lasso_loss(lambda_w[-1], abs(lambda_w[1])))
}
opt2 <- optim(par = c(1, rep(0,100)), f_wrapper, method= 'BFGS')
opt2$par
## [1] 4.371956e-11 3.253162e+00 3.149580e+00 3.331079e+00 3.996138e+00
## [6] 5.001000e+00 1.002772e-01 8.312386e-01 -3.282502e-01 7.207548e-01
## [11] -7.490584e-02 5.314196e-01 5.208195e-01 5.831044e-01 2.827395e-03
## [16] -5.478232e-01 2.559483e-01 3.345115e-01 1.017809e+00 -3.147572e-01
## [21] -1.632058e-01 6.213101e-01 9.224735e-01 -6.070451e-03 -3.781202e-01
## [26] 7.087044e-01 2.093848e-01 8.298121e-01 -6.718597e-02 -7.514664e-02
## [31] 1.711098e-01 1.284418e-01 1.706392e-01 -6.504792e-01 8.162341e-01
## [36] -7.501053e-01 -3.832642e-01 -2.900541e-01 7.178881e-01 -1.552082e-01
## [41] -2.021774e-01 5.301502e-01 1.768693e-02 2.919209e-01 -5.680135e-01
## [46] -1.458962e-01 2.881403e-02 1.083057e-01 2.057268e-01 -3.416004e-02
## [51] 1.457053e+00 2.511416e-02 6.718943e-01 -3.562613e-01 -9.585788e-02
## [56] -4.992048e-01 -1.029903e-01 -5.579465e-01 3.047045e-01 -7.668500e-01
## [61] 5.196560e-01 1.085961e-01 -3.004939e-02 -7.351503e-01 -1.099648e-01
## [66] -4.005508e-01 7.509414e-01 -1.509712e-01 -5.072848e-01 1.009408e-01
## [71] 4.117163e-01 5.014644e-03 -4.564629e-01 1.997661e-01 -2.416757e-01
## [76] -3.082098e-01 2.302661e-01 5.765322e-02 2.383085e-01 2.361254e-01
## [81] 3.951391e-01 -5.874315e-01 1.853459e-01 -6.809456e-03 -1.352024e+00
## [86] -8.099469e-01 -2.918493e-01 3.454813e-01 8.077418e-01 -2.681701e-01
## [91] 1.549052e-01 -2.311945e-01 1.029321e+00 -3.954511e-01 8.308367e-03
## [96] -9.356233e-02 -6.516291e-01 -6.046855e-01 -1.331489e-01 1.370477e+00
## [101] 1.712282e-01
As lambda is nonnegative, I set it to be absolute value in the loss function. When optimizing at the same time, lambda is solved to be approximately 0. This is a trivial result, because when minimizing error on lambda, the penalty will also be set to 0. That is why we should not optimize lambda together with w, but use cross validation to find the optimal value of lambda.
plot(w_true, ylim = c(-5,5), main="Solved w vs true w")
points(opt2$par[-1], col = 'grey')
soft_threshold <- function(w, th){
if(abs(w) < th){
return(0)
}else{
return(sign(w)*(abs(w)-th))
}
}
w = -5:5
plot(sapply(w, soft_threshold, th=1), type='l', col='red')
lasso1d <- function(x1, y1, lambda){
w = t(y1)%*%x1 / t(x1)%*%x1
return( soft_threshold(w, lambda/t(x1)%*%x1) )
}
lasso1d(X[,1], y, 1)
## [,1]
## [1,] 4.412147
get_residual <- function(w, dim){
w[dim] <- 0
Y_pred <- X %*% w
return(y - Y_pred)
}
coor_descent <- function(w0, epsilon, lambda){
w = w0
w_prev = w0 + epsilon + 1
while(sum(abs(w-w_prev) > epsilon)) {
w_prev = w
for(d in 1:length(w0)){
rd = get_residual(w, d)
xd = X[,d]
w_OLS = t(xd) %*% rd / t(xd) %*% xd
w[d] = soft_threshold(w_OLS, lambda/t(xd)%*%xd)
}
}
return(w)
}
w0 = rep(1, 100)
w_coor = coor_descent(w0, epsilon = 0.001, lambda = 1)
w_coor
## [1] 4.702946817 4.032027148 4.720111872 4.761301649 5.234768896
## [6] 0.000000000 0.000000000 -0.158086823 0.361122748 0.009207935
## [11] 0.000000000 0.082564517 0.000000000 0.000000000 -0.575217791
## [16] -0.243746428 0.000000000 0.261166421 0.000000000 -0.062083195
## [21] -0.249836216 0.000000000 0.000000000 0.000000000 0.267052324
## [26] 0.097939557 0.000000000 0.000000000 0.000000000 0.000000000
## [31] -0.102652544 0.002884931 0.000000000 0.000000000 -0.123126183
## [36] -0.141584954 0.000000000 0.286834249 0.000000000 0.000000000
## [41] 0.251910578 0.314751506 0.189547833 0.000000000 -0.046829812
## [46] 0.000000000 0.000000000 0.000000000 0.000000000 0.030106425
## [51] 0.000000000 0.000000000 0.000000000 0.012224755 0.000000000
## [56] -0.136867791 -0.189998739 0.000000000 -0.114320695 0.006695207
## [61] 0.043382151 0.000000000 -0.278524166 0.000000000 0.000000000
## [66] -0.029742288 -0.114762620 0.000000000 -0.102809184 0.105296598
## [71] -0.029216047 0.197163659 0.000000000 0.000000000 0.000000000
## [76] 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## [81] 0.000000000 -0.078705126 -0.037057968 -0.115805143 -0.366884192
## [86] -0.009452972 0.000000000 -0.206448566 0.000000000 0.000000000
## [91] 0.000000000 0.000000000 0.000000000 0.056485955 0.005867281
## [96] 0.003056681 -0.028667315 0.000000000 0.117932307 0.014486723
Result looks much better than the result of optim.
L2 = numeric(51)
L2[1] = sum((w_coor - w_true)^2)
for(i in 50:3){
X = X[-i,]
y = y[-i]
w = coor_descent(w0, epsilon = 0.001, lambda = 1)
L2[i + 1] = sum((w - w_true)^2)
}
# With only one sample point
X = matrix(X[-2,], 1, 100)
y = y[-2]
w = coor_descent(w0, epsilon = 0.001, lambda = 1)
L2[2] = sum((w - w_true)^2)
# 0 sample point
L2[1] = sum((w0 - w_true)^2)