Problem 1: LASSO

(a) Function to generate training dataset.

#n: number of observations
#p: dimensionality
gen_data <- function(n, p, sparsity, level){
    set.seed(44347)
    X = matrix(rnorm(n*p), n, p)
    w = c(rep(level, sparsity), rep(0,p - sparsity))
    Y = X%*%w + rnorm(n)
    training = list(X, Y, w)
    return(training)
}

(b) Define loss function of LASSO.

lasso_loss <- function(w, lambda){
    loss <- sum((y - X%*%w)^2) + lambda * sum(abs(w))    
    return(loss)
}

(c) Generate dataset.

training <- gen_data(50, 100, 5, 5)
X = training[[1]]
y = training[[2]]
w_true = training[[3]] 

(d) Use optim to find w with lambda=1.

opt1 <- optim(par = rep(0,100), lasso_loss, method= 'BFGS', lambda = 1)
opt1$par
##   [1]  4.606824e+00  3.908415e+00  4.723776e+00  4.700850e+00  5.362772e+00
##   [6] -7.273283e-04  1.398976e-03 -1.525739e-01  4.157714e-01  1.530271e-03
##  [11]  9.161512e-04  1.395263e-01  9.360521e-04 -1.700743e-02 -6.031554e-01
##  [16] -2.451210e-01 -5.288032e-04  2.828499e-01 -1.690672e-03 -6.393211e-02
##  [21] -2.719162e-01 -6.453265e-04 -4.047939e-05 -2.373779e-03  3.140585e-01
##  [26]  8.465724e-02  2.192199e-03 -7.617371e-03 -5.531177e-04 -7.083452e-04
##  [31] -2.074211e-01  1.788335e-02 -1.907016e-03  1.041423e-02 -1.293527e-01
##  [36] -6.858803e-02  3.369819e-03  3.061043e-01  5.654715e-02  1.920272e-03
##  [41]  3.202323e-01  2.847158e-01  1.835515e-01  4.737732e-04 -3.707745e-02
##  [46] -6.827712e-02 -6.412329e-02 -1.452116e-02 -2.037922e-03  6.441002e-02
##  [51] -4.745715e-03  1.216664e-02 -1.124578e-02  3.288435e-03 -8.313261e-04
##  [56] -5.198048e-02 -2.496557e-01 -5.826357e-04 -1.278808e-01  6.199230e-02
##  [61]  2.825630e-02  4.265550e-03 -2.756101e-01 -3.046280e-03 -1.599315e-03
##  [66] -6.444854e-02 -1.051239e-01 -1.996625e-02 -8.623252e-02  1.056873e-01
##  [71] -2.484298e-03  1.884428e-01 -6.716328e-04 -3.554912e-03  8.890913e-05
##  [76]  2.423336e-03 -6.299370e-04 -9.630452e-04 -6.370846e-04 -5.592838e-03
##  [81]  1.818511e-03 -1.145535e-01 -5.178572e-02 -1.759647e-01 -4.243718e-01
##  [86] -7.930885e-03 -1.710921e-03 -2.120791e-01 -2.985285e-03  2.356901e-03
##  [91] -7.168790e-04 -1.789444e-03  2.782760e-02  2.479881e-03  4.914437e-03
##  [96]  7.396344e-02 -9.902068e-04 -1.782865e-02  1.498581e-01  6.382138e-02
plot(w_true, ylim = c(-5,5), main = "Solved w vs true w")
points(opt1$par, col = 'grey')

(e) Use optim to find best w and lambda.

f_wrapper <- function(lambda_w){
    return(lasso_loss(lambda_w[-1], abs(lambda_w[1])))    
}
    
opt2 <- optim(par = c(1, rep(0,100)), f_wrapper, method= 'BFGS')
opt2$par
##   [1]  4.371956e-11  3.253162e+00  3.149580e+00  3.331079e+00  3.996138e+00
##   [6]  5.001000e+00  1.002772e-01  8.312386e-01 -3.282502e-01  7.207548e-01
##  [11] -7.490584e-02  5.314196e-01  5.208195e-01  5.831044e-01  2.827395e-03
##  [16] -5.478232e-01  2.559483e-01  3.345115e-01  1.017809e+00 -3.147572e-01
##  [21] -1.632058e-01  6.213101e-01  9.224735e-01 -6.070451e-03 -3.781202e-01
##  [26]  7.087044e-01  2.093848e-01  8.298121e-01 -6.718597e-02 -7.514664e-02
##  [31]  1.711098e-01  1.284418e-01  1.706392e-01 -6.504792e-01  8.162341e-01
##  [36] -7.501053e-01 -3.832642e-01 -2.900541e-01  7.178881e-01 -1.552082e-01
##  [41] -2.021774e-01  5.301502e-01  1.768693e-02  2.919209e-01 -5.680135e-01
##  [46] -1.458962e-01  2.881403e-02  1.083057e-01  2.057268e-01 -3.416004e-02
##  [51]  1.457053e+00  2.511416e-02  6.718943e-01 -3.562613e-01 -9.585788e-02
##  [56] -4.992048e-01 -1.029903e-01 -5.579465e-01  3.047045e-01 -7.668500e-01
##  [61]  5.196560e-01  1.085961e-01 -3.004939e-02 -7.351503e-01 -1.099648e-01
##  [66] -4.005508e-01  7.509414e-01 -1.509712e-01 -5.072848e-01  1.009408e-01
##  [71]  4.117163e-01  5.014644e-03 -4.564629e-01  1.997661e-01 -2.416757e-01
##  [76] -3.082098e-01  2.302661e-01  5.765322e-02  2.383085e-01  2.361254e-01
##  [81]  3.951391e-01 -5.874315e-01  1.853459e-01 -6.809456e-03 -1.352024e+00
##  [86] -8.099469e-01 -2.918493e-01  3.454813e-01  8.077418e-01 -2.681701e-01
##  [91]  1.549052e-01 -2.311945e-01  1.029321e+00 -3.954511e-01  8.308367e-03
##  [96] -9.356233e-02 -6.516291e-01 -6.046855e-01 -1.331489e-01  1.370477e+00
## [101]  1.712282e-01

As lambda is nonnegative, I set it to be absolute value in the loss function. When optimizing at the same time, lambda is solved to be approximately 0. This is a trivial result, because when minimizing error on lambda, the penalty will also be set to 0. That is why we should not optimize lambda together with w, but use cross validation to find the optimal value of lambda.

plot(w_true, ylim = c(-5,5), main="Solved w vs true w")
points(opt2$par[-1], col = 'grey')

Problem 2: Coordinate Descent

1. Calculate soft threshold.

soft_threshold <- function(w, th){
    if(abs(w) < th){
        return(0)
    }else{
        return(sign(w)*(abs(w)-th))
    }
}

w = -5:5
plot(sapply(w, soft_threshold, th=1), type='l', col='red')

2. Solve 1-d case.

lasso1d <- function(x1, y1, lambda){
    w = t(y1)%*%x1 / t(x1)%*%x1 
    return( soft_threshold(w, lambda/t(x1)%*%x1) )
}
lasso1d(X[,1], y, 1)
##          [,1]
## [1,] 4.412147

3. Get residual for some dimension.

get_residual <- function(w, dim){
    w[dim] <- 0
    Y_pred <- X %*% w
    return(y - Y_pred)
}

4.Solve w by coordinate descent.

coor_descent <-  function(w0, epsilon, lambda){
    w = w0
    w_prev = w0 + epsilon + 1
    while(sum(abs(w-w_prev) > epsilon)) {
        w_prev = w
        for(d in 1:length(w0)){
            rd = get_residual(w, d)
            xd = X[,d]
            w_OLS = t(xd) %*% rd / t(xd) %*% xd
            w[d] = soft_threshold(w_OLS, lambda/t(xd)%*%xd)
        } 
    }
    return(w)
}

5. Run coordinate descent on earlier dataset, with lambda = 1.

w0 = rep(1, 100)
w_coor = coor_descent(w0, epsilon = 0.001, lambda = 1)
w_coor
##   [1]  4.702946817  4.032027148  4.720111872  4.761301649  5.234768896
##   [6]  0.000000000  0.000000000 -0.158086823  0.361122748  0.009207935
##  [11]  0.000000000  0.082564517  0.000000000  0.000000000 -0.575217791
##  [16] -0.243746428  0.000000000  0.261166421  0.000000000 -0.062083195
##  [21] -0.249836216  0.000000000  0.000000000  0.000000000  0.267052324
##  [26]  0.097939557  0.000000000  0.000000000  0.000000000  0.000000000
##  [31] -0.102652544  0.002884931  0.000000000  0.000000000 -0.123126183
##  [36] -0.141584954  0.000000000  0.286834249  0.000000000  0.000000000
##  [41]  0.251910578  0.314751506  0.189547833  0.000000000 -0.046829812
##  [46]  0.000000000  0.000000000  0.000000000  0.000000000  0.030106425
##  [51]  0.000000000  0.000000000  0.000000000  0.012224755  0.000000000
##  [56] -0.136867791 -0.189998739  0.000000000 -0.114320695  0.006695207
##  [61]  0.043382151  0.000000000 -0.278524166  0.000000000  0.000000000
##  [66] -0.029742288 -0.114762620  0.000000000 -0.102809184  0.105296598
##  [71] -0.029216047  0.197163659  0.000000000  0.000000000  0.000000000
##  [76]  0.000000000  0.000000000  0.000000000  0.000000000  0.000000000
##  [81]  0.000000000 -0.078705126 -0.037057968 -0.115805143 -0.366884192
##  [86] -0.009452972  0.000000000 -0.206448566  0.000000000  0.000000000
##  [91]  0.000000000  0.000000000  0.000000000  0.056485955  0.005867281
##  [96]  0.003056681 -0.028667315  0.000000000  0.117932307  0.014486723

Result looks much better than the result of optim.

6.

L2 = numeric(51)
L2[1] = sum((w_coor - w_true)^2)
for(i in 50:3){
    X = X[-i,]
    y = y[-i]
    w = coor_descent(w0, epsilon = 0.001, lambda = 1)
    L2[i + 1] = sum((w - w_true)^2)
}
# With only one sample point
X = matrix(X[-2,], 1, 100)
y = y[-2]
w = coor_descent(w0, epsilon = 0.001, lambda = 1)
L2[2] = sum((w - w_true)^2)
# 0 sample point
L2[1] = sum((w0 - w_true)^2)