# Big Data Exercise 07: Alternating Direction Method of Multipliers
# Yanxin Li
# Oct 29 2017
rm(list=ls())
setwd("D:/2017 UT Austin/Statistical Models for Big Data/R")
# Negative log-likelihood (the residual sum of squares, up to constants)
nll <- function(X, Y, beta) {
  A <- Y - X %*% beta
  loglike <- crossprod(A)
  return(loglike)
}
# Proximal operator of the l1 norm (soft thresholding)
prox.l1 <- function(u, lambda) {
  uhat <- abs(u) - lambda
  prox <- sign(u) * pmax(rep(0, length(u)), uhat)
  return(prox)
}
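# Optional sanity check (an illustration, not part of the original exercise):
# soft thresholding should agree with numerically minimizing
# 0.5*(x - u)^2 + lambda*|x| for a scalar u. The helper check.prox below is
# purely illustrative.
check.prox <- function(u, lambda) {
  f <- function(x) 0.5 * (x - u)^2 + lambda * abs(x)
  optimize(f, interval = c(-abs(u) - 1, abs(u) + 1))$minimum
}
prox.l1(1.2, 0.5)    # exactly 0.7
check.prox(1.2, 0.5) # approximately 0.7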
# Alternating Direction Method of Multipliers
ADMM <- function(X, Y, rho=5, lambda=.1, iter=50000, e.abs=1E-3, e.rel=1E-6){
  n <- nrow(X); p <- ncol(X)
  XtX <- crossprod(X)
  XtY <- crossprod(X, Y) # Precompute X'Y
  # Rescale lambda to match glmnet's parameterization
  lambda <- lambda * n
  # Euclidean (l2) norm of a vector
  l2norm <- function(x) sqrt(sum(x^2))
  # Initialize coefficient matrix (one row per iteration)
  beta <- matrix(0, nrow=iter, ncol=p)
  beta[1,] <- rep(0, p)
  # Initialize objective function
  obj <- rep(0, iter)
  obj[1] <- nll(X, Y, beta[1,]) + lambda * sum(abs(beta[1,]))
  # Initialize gamma (auxiliary copy of beta) and v (scaled dual variable)
  gamma <- matrix(0, nrow=iter, ncol=p)
  v <- rep(0, p)
  # Convergence message in case convergence is not reached
  message <- "Convergence not reached..."
  # Precompute the matrix inverse used in the beta update
  invmat <- solve(XtX + diag(rho, p))
  # Initialize residual vectors
  s <- 0 # dual residual
  r <- 0 # primal residual
  # Initialize iteration counter
  t <- 0
  # ADMM updates
  for (t in 2:iter){
    # Update beta, gamma, v, and the objective
    beta[t,] <- invmat %*% (XtY + rho * (gamma[t-1,] - v))
    gamma[t,] <- prox.l1(beta[t,] + v, lambda/rho)
    v <- v + beta[t,] - gamma[t,]
    obj[t] <- nll(X, Y, beta[t,]) + lambda * sum(abs(beta[t,]))
    # Residuals at iteration t
    r <- beta[t,] - gamma[t,]              # primal residual
    s <- -rho * (gamma[t,] - gamma[t-1,])  # dual residual
    r.norm <- l2norm(r)
    s.norm <- l2norm(s)
    # Feasibility tolerances for the stopping test
    e.primal <- sqrt(p) * e.abs + e.rel * max(l2norm(beta[t,]), l2norm(gamma[t,]))
    e.dual <- sqrt(n) * e.abs + e.rel * l2norm(v)
    # Check convergence
    if (r.norm <= e.primal && s.norm <= e.dual){
      # Drop the unused rows of beta and entries of obj
      beta <- beta[1:t, , drop=FALSE]
      obj <- obj[1:t]
      # Update convergence message
      message <- sprintf("Convergence reached after %i iterations", t)
      break
    }
  }
  result <- list("beta.hat"=beta[nrow(beta),], "beta"=beta,
                 "objective"=obj, "conv"=message, "iter"=t)
  return(result)
}
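# A minimal usage sketch on simulated data (an illustration with assumed
# settings, not part of the diabetes analysis below): with a sparse true beta,
# the ADMM fit should roughly recover the correct support at a moderate lambda.
set.seed(42)
Xsim <- matrix(rnorm(200 * 10), 200, 10)
bsim <- c(2, -1.5, rep(0, 8))
Ysim <- Xsim %*% bsim + rnorm(200)
simfit <- ADMM(scale(Xsim), scale(Ysim), rho = 1, lambda = 0.1)
round(simfit$beta.hat, 3) # first two entries should dominate; the rest near zero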
library(MASS)
library(glmnet)
## Loaded glmnet 2.0-13
# Read in data
X <- as.matrix(read.csv("diabetesX.csv", header = TRUE))
Y <- as.numeric(unlist(read.csv("diabetesY.csv", header = FALSE)))
# Center and scale the data (so no intercept term is needed)
X <- scale(X)
Y <- scale(Y)
# Run ADMM and calculate time to reach convergence
lassoADMM <- ADMM(X,Y,rho=5,lambda=0.012)
system.time(ADMM(X,Y,rho=5,lambda=0.012))
##    user  system elapsed
##    0.15    0.00    0.16
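# For a rough reference point (a sketch; timings depend on the machine and
# are not reported here), glmnet can be timed on the same single lambda value:
system.time(glmnet(X, Y, alpha=1, lambda=0.012))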
# Check the number of iterations needed for convergence
print(lassoADMM$conv)
## [1] "Convergence reached after 121 iterations"
# Number of nonzero coefficients from ADMM
sum(lassoADMM$beta.hat != 0)
## [1] 64
# Plot
plot(lassoADMM$objective, type="l", xlab="iteration", ylab="objective",
log="x", cex.main=1, main= "Lasso objective function")
points(lassoADMM$iter, lassoADMM$objective[lassoADMM$iter],
pch=19, col="red")

# Compare ADMM with glmnet
# Set up the range of lambda values
lambda <- exp(seq(0,-7,length.out=100)) # [0.00091, 1]
# Calculate coefficients using ADMM
betamat <- matrix(rep(NA, length(lambda)*ncol(X)), ncol=length(lambda))
for (i in 1:length(lambda)) {
  mylasso <- ADMM(X, Y, lambda=lambda[i])
  betamat[, i] <- mylasso$beta.hat
}
# Fit Lasso model across a range of lambda values
myLasso <- glmnet(X,Y,alpha=1,lambda=lambda)
# Plot all coefficient paths vs. lambda from both ADMM and glmnet
par(mar=c(4,5,4,2))
par(mfrow=c(1,2))
plot(0,0,type='n', ylim=c(min(betamat), max(betamat)), cex.main=0.8,
ylab=expression(hat(beta)[lambda]), xlim=log(range(lambda)),
xlab=expression(paste('log(',lambda,')')), main='Coefficients of ADMM')
for (i in 1:nrow(betamat)) { lines(log(lambda), betamat[i, ], col=i) }
# plot(myLasso) # glmnet's default plot, which puts the L1 norm of the coefficients on the x-axis
plot(0,0,type='n', ylim=range(myLasso$beta), cex.main=0.8,
ylab=expression(hat(beta)[lambda]), xlim=log(range(myLasso$lambda)),
xlab=expression(paste('log(',lambda,')')), main='Coefficients of glmnet Package')
for(i in 1:nrow(myLasso$beta)){ lines(log(lambda),myLasso$beta[i,],col=i) }

# 10-fold Cross-validation to choose the best tuning parameter lambda for glmnet
set.seed(1)
ratio <- 0.5 # Fraction of the data used for cross-validation
index <- sample(1:nrow(X), floor(ratio*nrow(X))) # Randomly select half of the rows
# 10-fold cross-validation
set.seed(1)
cv.out <- cv.glmnet(X[index, ], Y[index], alpha=1)
par(mfrow=c(1,1))
plot(cv.out)

bestlam <- cv.out$lambda.min
# The value of lambda that results in the smallest CV error is 0.01212987
print(bestlam)
## [1] 0.01212987
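# Optional cross-check (a sketch, not part of the original write-up): compare
# the ADMM fit at lambda = 0.012 with glmnet's coefficients at lambda.min.
# The two should be close but need not match exactly, since the lambda values
# differ slightly and glmnet also fits an (essentially zero) intercept here.
cmp.fit <- glmnet(X, Y, alpha=1, lambda=bestlam)
max(abs(lassoADMM$beta.hat - as.numeric(as.matrix(cmp.fit$beta))))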
# Compare different values of rho at (approximately) the best lambda
rho <- c(0.1, 0.3, 1, 5, 10, 20)
color <- c(1:length(rho))
niter <- rep(0,length(rho))
# Plot ADMM objective by varying penalty parameter
# pdf("ADMM.pdf")
plot(0,type="l", xlim=c(1,15000), ylim=c(180,420),
xlab="iteration", ylab="objective",
log="x", cex.main=1, main= "Lasso objective function")
for (k in 1:length(rho)) {
  lassoADMM <- ADMM(X, Y, lambda=0.012, rho=rho[k])
  lines(lassoADMM$objective, col=color[k])
  niter[k] <- lassoADMM$iter
  obj.end <- lassoADMM$objective[niter[k]]
  # Mark the final objective value with the same color as its curve
  points(niter[k], obj.end, pch=19, col=color[k])
}
legend('topright', legend=rho, title='Penalty Parameter', col=color, pch=19)

# dev.off()
print(cbind(rho,niter))
## rho niter
## [1,] 0.1 5958
## [2,] 0.3 1987
## [3,] 1.0 598
## [4,] 5.0 121
## [5,] 10.0 62
## [6,] 20.0 32
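# For this dataset and lambda, larger values of the penalty parameter rho
# reach the stopping criterion in far fewer iterations.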