# Big Data Exercise07 Alternating Direction Method of Multipliers
# Yanxin Li
# Oct 29 2017

rm(list=ls())
setwd("D:/2017 UT Austin/Statistical Models for Big Data/R")
# Smooth part of the objective: residual sum of squares ||Y - X %*% beta||^2
# (the Gaussian negative log-likelihood up to constants), returned as a 1x1 matrix
nll <- function(X, Y, beta) {
  A <- Y - X %*% beta
  loglike <- crossprod(A)
  return(loglike)
}
# Proximal operator of the l1 norm (elementwise soft thresholding)
prox.l1 <- function(u, lambda) {
  uhat <- abs(u) - lambda
  prox <- sign(u) * pmax(rep(0, length(u)), uhat)
  return(prox)
}
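# A quick illustrative check of the soft-thresholding operator (the numbers are
# made up for illustration, not part of the exercise): entries with |u| <= lambda
# are set exactly to zero, and the rest are shrunk toward zero by lambda.
prox.l1(c(-3, 0.05, 2), 0.5)   # -2.5  0.0  1.5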
# Alternating Direction Method of Multipliers
ADMM <- function(X,Y,rho=5,lambda=.1,iter=50000,e.abs=1E-3,e.rel=1E-6){
  
  n <- nrow(X); p <- ncol(X)
  XtX <- crossprod(X)      # precompute X'X; reused in every iteration
  XtY <- crossprod(X, Y)   # precompute X'Y; reused in every iteration
  
  # Rescale lambda to match glmnet: glmnet minimizes (1/(2n))*RSS + lambda*||beta||_1,
  # while the updates below solve (1/2)*RSS + (lambda*n)*||beta||_1, so multiplying
  # lambda by n puts the two problems on the same scale
  lambda <- lambda * n
  
  # Define Euclidean (l2) norm of a vector
  l2norm <- function(x) sqrt(sum(x^2))
  
  # Initialize coefficients matrix 
  beta <- matrix(0, nrow=iter, ncol=p) 
  beta[1,] <- rep(0, p)
  
  # Initialize objective function
  obj <- rep(0, iter)
  obj[1] <- nll(X, Y, beta[1,]) + lambda * sum(abs(beta[1,]))
  
  # Initialize gamma and v
  gamma <- matrix(0, nrow=iter, ncol=p)
  v <- rep(0, p)    # scaled dual variable (Lagrange multiplier divided by rho)
  
  # Initialize convergence message in case convergence not reached
  message <- "Convergence not reached..."
 
  # Factor (X'X + rho*I) once outside the loop; its inverse is reused in every
  # beta update, which is the main computational saving of this ADMM formulation
  invmat <- solve(XtX + diag(rho, p))
  
  # Initialize residual vectors.
  s <- 0    # dual residual
  r <- 0    # primal residual
  
  # Initialize iterator
  t <- 0
  
  # ADMM updates
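  # The loop implements the standard scaled-dual ADMM updates for the split
  # problem  minimize (1/2)*||Y - X*beta||^2 + lambda*||gamma||_1  subject to
  # beta = gamma (lambda already rescaled by n above):
  #   beta  <- (X'X + rho*I)^{-1} (X'Y + rho*(gamma - v))
  #   gamma <- prox.l1(beta + v, lambda/rho)     # soft thresholding
  #   v     <- v + beta - gamma                  # scaled dual update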
  for (t in 2:iter){
    
    # Update beta, gamma, v, and obj
    beta[t,] <- invmat %*% (XtY + rho * (gamma[t-1,]-v) )
    gamma[t,] <- prox.l1(beta[t,] + v, lambda/rho)
    v <- v + beta[t,] - gamma[t,]
    obj[t] <- nll(X, Y, beta[t,]) + lambda * sum(abs(beta[t,]))
    
    # Calculate residuals for iteration t
    r <- beta[t,] - gamma[t,]
    s <- -rho * (gamma[t,] - gamma[t-1,])
    
    r.norm <- l2norm(r)
    s.norm <- l2norm(s)
    
    e.primal <- sqrt(p) * e.abs + e.rel * max(l2norm(beta[t,]), l2norm(gamma[t,])) 
    e.dual <-  sqrt(n) * e.abs + e.rel * l2norm(v)
    
    # Check convergence
    if (r.norm <= e.primal && s.norm <= e.dual){
      # Drop the unused rows of beta and entries of obj
      beta <- beta[-((t+1):nrow(beta)),]
      obj <- obj[-((t+1):length(obj))]
      
      # Update convergence message
      message <- sprintf("Convergence reached after %i iterations", (t))
      break
    }
  }
  result <- list("beta.hat"=beta[nrow(beta),],"beta"=beta,
                 "objective"=obj,"conv"=message, "iter"=t)
  return(result)
}
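# A small simulated-data smoke test of the solver (a sketch only: the sizes, seed,
# and coefficients below are arbitrary illustrative choices, not part of the exercise)
set.seed(42)
Xsim <- matrix(rnorm(200 * 10), nrow=200, ncol=10)
Ysim <- Xsim[, 1:3] %*% c(2, -1, 0.5) + rnorm(200)
fit.sim <- ADMM(scale(Xsim), scale(Ysim), rho=5, lambda=0.05)
fit.sim$conv                  # should report convergence
round(fit.sim$beta.hat, 3)    # the first three coefficients should dominate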
library(MASS)
library(glmnet)
# Read in data
X <- as.matrix(read.csv("diabetesX.csv", header = TRUE))
Y <- as.numeric(unlist(read.csv("diabetesY.csv", header = FALSE)))

# Scale data
X <- scale(X)
Y <- scale(Y)

# Run ADMM and time it (fit once, storing the result while timing)
system.time(lassoADMM <- ADMM(X, Y, rho=5, lambda=0.012))
##    user  system elapsed 
##    0.15    0.00    0.16
# Number of iterations needed to reach convergence
print(lassoADMM$conv)
## [1] "Convergence reached after 121 iterations"
# Number of nonzero coefficients from the ADMM fit
sum(lassoADMM$beta.hat != 0) 
## [1] 64
# Plot the objective value against iteration (log-scale x axis), marking the final iteration
plot(lassoADMM$objective, type="l", xlab="iteration", ylab="objective", 
     log="x", cex.main=1, main= "Lasso objective function")
points(lassoADMM$iter, lassoADMM$objective[lassoADMM$iter],
       pch=19, col="red")

# Compare ADMM with glmnet
# Set up the grid of lambda values
lambda <- exp(seq(0,-7,length.out=100)) # [0.00091, 1]

# Calculate coefficients using ADMM
betamat <- matrix(NA, nrow=ncol(X), ncol=length(lambda))
for (i in 1:length(lambda)) {
  mylasso <- ADMM(X, Y, lambda=lambda[i])
  betamat[, i]  <- mylasso$beta.hat
}

# Fit Lasso model across a range of lambda values
myLasso <- glmnet(X,Y,alpha=1,lambda=lambda)

# Plot all coefficient paths vs. lambda, from both ADMM and glmnet
par(mar=c(4,5,4,2))
par(mfrow=c(1,2))
plot(0,0,type='n', ylim=c(min(betamat), max(betamat)), cex.main=0.8,
     ylab=expression(hat(beta)[lambda]), xlim=log(range(lambda)),
     xlab=expression(paste('log(',lambda,')')), main='Coefficients of ADMM')

for (i in 1:nrow(betamat)) { lines(log(lambda), betamat[i, ], col=i) }

# plot(myLasso) # R default, the L1 norm is the regularization term for Lasso
plot(0,0,type='n', ylim=range(myLasso$beta), cex.main=0.8,
     ylab=expression(hat(beta)[lambda]), xlim=log(range(myLasso$lambda)),
     xlab=expression(paste('log(',lambda,')')), main='Coefficients of glmnet Package')

for(i in 1:nrow(myLasso$beta)){ lines(log(lambda),myLasso$beta[i,],col=i) }

# 10-fold Cross-validation to choose the best tuning parameter lambda for glmnet
set.seed(1)
ratio <- 0.5 # Fraction of observations used for cross-validation
index <- sample(1:nrow(X), floor(ratio*nrow(X))) # Random half of the rows

# 10-fold cross-validation
set.seed(1)
cv.out <- cv.glmnet(X[index, ],Y[index],alpha =1)
par(mfrow=c(1,1))
plot(cv.out)

bestlam <- cv.out$lambda.min
# The value of lambda that results in the smallest CV error is 0.01212987
print(bestlam)
## [1] 0.01212987
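# As a rough cross-check (assuming the lambda*n rescaling inside ADMM puts the two
# solvers on the same per-observation objective), compare the ADMM fit at the
# CV-selected lambda with the glmnet path interpolated at the same value; small
# discrepancies from interpolation and stopping tolerances are expected.
admm.best <- ADMM(X, Y, rho=5, lambda=bestlam)
glmnet.best <- as.numeric(coef(myLasso, s=bestlam))[-1]  # drop the intercept (~0 after scaling)
max(abs(admm.best$beta.hat - glmnet.best))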
# Compare convergence for different values of rho at (approximately) the best lambda
rho <- c(0.1, 0.3, 1, 5, 10, 20)
color <- c(1:length(rho))
niter <- rep(0,length(rho))

# Plot ADMM objective by varying penalty parameter
# pdf("ADMM.pdf")
plot(0,type="l", xlim=c(1,15000), ylim=c(180,420),
     xlab="iteration", ylab="objective", 
     log="x", cex.main=1, main= "Lasso objective function")

for (k in 1:length(rho)) {
  lassoADMM <- ADMM(X, Y, lambda=0.012, rho=rho[k])
  lines(lassoADMM$objective, col=color[k])
  niter[k] <- lassoADMM$iter
  obj <- lassoADMM$objective[niter[k]]
  points(niter[k], obj, pch=19, col=color[k])
}
legend('topright', legend=rho, title='Penalty Parameter', col=color, pch=19)

# dev.off()

print(cbind(rho,niter))
##       rho niter
## [1,]  0.1  5958
## [2,]  0.3  1987
## [3,]  1.0   598
## [4,]  5.0   121
## [5,] 10.0    62
## [6,] 20.0    32