# Big Data Exercise06 The Proximal Gradient Method
# Yanxin Li
# Oct 20 2017

rm(list=ls())
setwd("D:/2017 UT Austin/Statistical Models for Big Data/R")
# Negative log-likelihood (scaled squared-error loss, (1/(2n)) * ||Y - X beta||^2)
nll <- function(X, Y, beta) {
  A <- Y - X %*% beta
  loglike <- (0.5/nrow(X)) * crossprod(A)
  return(loglike)
}
# Gradient of negative log likelihood function
gradient <- function(X, Y, beta) {
  A <- Y - X %*% beta
  grad <- -(1/nrow(X)) * crossprod(X, A)
  return(grad)
}
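# A small finite-difference check of the gradient on toy data (illustrative values,
# not part of the assignment): the centered-difference ratio in the first coordinate
# should match the first entry returned by gradient().
set.seed(1)
Xt <- matrix(rnorm(20), 5, 4); Yt <- rnorm(5); bt <- rnorm(4)
eps <- 1e-6; e1 <- c(1, 0, 0, 0)
c((nll(Xt, Yt, bt + eps * e1) - nll(Xt, Yt, bt - eps * e1)) / (2 * eps),
  gradient(Xt, Yt, bt)[1]) # the two numbers should agree to roughly 1e-6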
# Proximal operator
prox.l1 <- function(u, lambda) {
  # Elementwise soft-thresholding: sign(u) * max(|u| - lambda, 0)
  prox <- sign(u) * pmax(abs(u) - lambda, 0)
  return(prox)
}
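# Quick sanity check of the soft-thresholding operator (illustrative values): with
# threshold 1, the vector (-3, -0.5, 0, 0.5, 3) should shrink to (-2, 0, 0, 0, 2).
stopifnot(all.equal(prox.l1(c(-3, -0.5, 0, 0.5, 3), 1), c(-2, 0, 0, 0, 2)))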
# Proximal gradient descent
proxGD <- function(X,Y,lambda=0.01,gamma=0.01,beta0=NA,iter=50000,conv=1e-10){
  # Default beta0 to a vector of zeros (anyNA avoids an invalid length-one condition
  # when a starting vector is supplied)
  if (anyNA(beta0)) { beta0 <- rep(0, ncol(X)) }
  
  # Initialize coefficients matrix 
  beta <- matrix(rep(NA, ncol(X)*(iter+1)), nrow=ncol(X))
  beta[, 1] <- beta0
  
  # Initialize objective function
  obj <- rep(NA, iter+1)
  obj[1] <- nll(X, Y, beta0) + lambda * sum(abs(beta0))
  
  # Initialize convergence message in case convergence not reached
  message <- "Convergence not reached..."
  
  for (t in 1:iter) {
    # Update u, beta and obj
    u <- beta[,t] - gamma * gradient(X, Y, beta[,t])
    beta[,t+1] <- prox.l1(u, gamma * lambda)
    obj[t+1] <- nll(X, Y, beta[,t+1]) + lambda * sum(abs(beta[,t+1]))
    
    # Check convergence
    delta <- abs(obj[t+1]-obj[t]) /(abs(obj[t])+conv)
    if (delta < conv) {
      # Keep only the betas and objective values actually computed
      beta <- beta[, 1:(t+1), drop = FALSE]
      obj <- obj[1:(t+1)]
      
      # Update convergence message
      message <- sprintf("Convergence reached after %i iterations", (t+1))
      break
    }
  }
  
  result <- list("beta.hat"=beta[,ncol(beta)],"beta"=beta,"objective"=obj,"conv"=message)
  return(result)
}
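# Minimal usage sketch of proxGD on simulated data (hypothetical dimensions and true
# coefficients, not the diabetes data), just to confirm the interface before the real fits.
set.seed(42)
Xs <- scale(matrix(rnorm(100 * 10), 100, 10))
Ys <- drop(Xs %*% c(2, -1, rep(0, 8)) + rnorm(100))
fit.demo <- proxGD(Xs, Ys, lambda = 0.1, gamma = 0.1, iter = 5000)
fit.demo$conv               # convergence message
round(fit.demo$beta.hat, 3) # nonzero entries should sit near the first two coordinates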
# Accelerated proximal gradient method
proxACCE <- function(X,Y,lambda=0.01,gamma=0.01,beta0=NA,iter=50000,conv=1e-10){
  # Default beta0 to a vector of zeros (anyNA avoids an invalid length-one condition
  # when a starting vector is supplied)
  if (anyNA(beta0)) { beta0 <- rep(0, ncol(X)) }
  
  # Create s vector
  s <- rep(NA, iter+1)
  s[1] <- 1
  
  for (j in 2:length(s)) {
    s[j] <- (1+sqrt(1+4*s[j-1]^2))/2
  }
  
  # Initialize momentum sequence z; its first column starts at beta0
  z <- matrix(0, nrow=ncol(X), ncol=iter+1)
  z[, 1] <- beta0
  
  # Initialize coefficients matrix 
  beta <- matrix(rep(NA, ncol(X)*(iter+1)), nrow=ncol(X))
  beta[, 1] <- beta0
  
  # Initialize objective function
  obj <- rep(NA, iter+1)
  obj[1] <- nll(X, Y, beta0) + lambda * sum(abs(beta0))
  
  # Initialize convergence message in case convergence not reached
  message <- "Convergence not reached..."
  
  for (t in 1:iter) {
    # Update u, beta, z and obj
    u <- z[,t] - gamma * gradient(X, Y, z[,t])
    beta[,t+1] <- prox.l1(u, gamma * lambda)
    z[,t+1] <- beta[,t+1] + (s[t] - 1)/s[t+1] * (beta[,t+1] - beta[,t])
    obj[t+1] <- nll(X, Y, beta[,t+1]) + lambda * sum(abs(beta[,t+1]))
    
    # Check convergence 
    delta <- abs(obj[t+1]-obj[t]) /(abs(obj[t])+conv)
    if (delta < conv) {
      # Keep only the betas and objective values actually computed
      beta <- beta[, 1:(t+1), drop = FALSE]
      obj <- obj[1:(t+1)]
      
      # Update convergence message
      message <- sprintf("Convergence reached after %i iterations", (t+1))
      break
    }
  }
  
  result <- list("beta.hat"=beta[,ncol(beta)],"beta"=beta,"objective"=obj,"conv"=message)
  return(result)
}
library(MASS)
library(glmnet)
library(ggplot2)

# Read in data
X <- as.matrix(read.csv("diabetesX.csv", header = TRUE))
Y <- as.numeric(unlist(read.csv("diabetesY.csv", header = FALSE)))

# Scale data
X <- scale(X)
Y <- scale(Y)
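# Hedged aside on the step size gamma used below: for this squared-error loss the
# gradient is Lipschitz with constant L equal to the largest eigenvalue of
# crossprod(X)/nrow(X), and any gamma <= 1/L guarantees descent. Computing the bound
# here (a supplementary check, not required by the assignment):
L.lip <- max(eigen(crossprod(X) / nrow(X), symmetric = TRUE, only.values = TRUE)$values)
1 / L.lip # gamma = 0.01 in the fits below should sit at or below this bound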

# Set up the grid of lambda values
lambda <- exp(seq(0,-7,length.out=100)) # [0.00091, 1]
# Compare proximal gradient algorithm with glmnet
# Calculate coefficients using proximal gradient algorithm
betamat <- matrix(rep(NA, length(lambda)*ncol(X)), ncol=length(lambda))
for (i in 1:length(lambda)) {
  mylasso <- proxGD(X, Y, lambda=lambda[i])
  betamat[, i]  <- mylasso$beta.hat
}
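# Hedged warm-start sketch (an assumption, not part of the assignment): because the
# lambda grid decreases smoothly, initializing each fit at the previous solution via
# beta0 should cut the iteration count without changing the path. Left commented out
# since it simply re-runs the same fits.
# betamat.ws <- matrix(NA_real_, nrow = ncol(X), ncol = length(lambda))
# bprev <- rep(0, ncol(X))
# for (i in seq_along(lambda)) {
#   fit <- proxGD(X, Y, lambda = lambda[i], beta0 = bprev)
#   betamat.ws[, i] <- fit$beta.hat
#   bprev <- fit$beta.hat
# }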

# Fit Lasso model across a range of lambda values
myLasso <- glmnet(X,Y,alpha=1,lambda=lambda)

# Plot coefficient paths vs. lambda from the proximal gradient method and from glmnet
par(mar=c(4,5,4,2))
par(mfrow=c(1,2))
plot(0,0,type='n', ylim=c(min(betamat), max(betamat)), cex.main=0.8,
     ylab=expression(hat(beta)[lambda]), xlim=log(range(lambda)),
     xlab=expression(paste('log(',lambda,')')), main='Coefficients of Proximal Gradient')

for (i in 1:nrow(betamat)) { lines(log(lambda), betamat[i, ], col=i) }

# plot(myLasso) # R default, the L1 norm is the regularization term for Lasso
plot(0,0,type='n', ylim=range(myLasso$beta), cex.main=0.8,
     ylab=expression(hat(beta)[lambda]), xlim=log(range(myLasso$lambda)),
     xlab=expression(paste('log(',lambda,')')), main='Coefficients of glmnet Package')

for(i in 1:nrow(myLasso$beta)){ lines(log(lambda),myLasso$beta[i,],col=i) }

# Choose the tuning parameter lambda for glmnet via 10-fold cross-validation on a random half of the data
set.seed(1)
ratio <- 0.5 # Split data
index <- sample(1:nrow(X), floor(ratio*nrow(X))) # Make equal partition

# 10-fold cross-validation
set.seed(1)
cv.out <- cv.glmnet(X[index, ],Y[index],alpha =1)
par(mfrow=c(1,1))
plot(cv.out)

bestlam <- cv.out$lambda.min
# The value of lambda that results in the smallest CV error is 0.01212987
print(bestlam)
## [1] 0.01212987
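# The coefficients at the CV-chosen lambda can be pulled straight from the cv.glmnet
# object with its standard accessor (shown for reference; output omitted here).
coef(cv.out, s = "lambda.min")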
# Compare proximal gradient algorithm with accelerated proximal gradient algorithm
lassoGD <- proxGD(X, Y, lambda=0.01, gamma=0.01, conv=1e-10)
lassoACCE <- proxACCE(X, Y, lambda=0.01, gamma=0.01, conv=1e-10)

# Number of objective values stored (iterations + 1) when convergence was met
n1 <- length(lassoGD$objective)
n2 <- length(lassoACCE$objective)
print(n1); print(n2)
## [1] 4807
## [1] 652
# Plot the objective values and compare how many iterations each method needed to converge
par(mfrow=c(1,1))
plot(lassoGD$objective, type="l", col="black", log="x",
     xlab="iteration", ylab="Objective Function", cex.main=1,
     main=sprintf("Objective for Prox and Accelerated Prox Methods, conv = 1e%i", -10))
lines(lassoACCE$objective, col="red")
points(n1, lassoGD$objective[n1], pch=19, col="red")
points(n2, lassoACCE$objective[n2], pch=19, col="blue")
legend("topright", legend=c("Proximal Method", "Accelerated Proximal Method"),
       col=c("black", "red"), lty=c(1,1), cex=1)

# Compare beta.hat's of the two algorithms
plot(lassoGD$beta.hat, lassoACCE$beta.hat,
     main="Comparison of Coefficients from Two Methods",
     xlab="Coefficients from Prox Method",
     ylab="Coefficients from Accel Prox Method",pch=16,col="purple")
abline(0, 1)

# Inspect the fitted coefficients from the two methods side by side
cbind(lassoGD$beta.hat,lassoACCE$beta.hat)
##                [,1]          [,2]
##  [1,]  0.0085564429  0.0085915940
##  [2,] -0.1272850118 -0.1273894244
##  [3,]  0.3054399755  0.3052213663
##  [4,]  0.1927202076  0.1927523430
##  [5,] -0.0389473856 -0.0390691752
##  [6,]  0.0000000000  0.0000000000
##  [7,] -0.1631759396 -0.1632479365
##  [8,]  0.0000000000  0.0000000000
##  [9,]  0.3233600948  0.3235603113
## [10,]  0.0337271163  0.0337561048
## [11,]  0.0350896498  0.0351730733
## [12,]  0.0238524080  0.0239467991
## [13,]  0.0000000000  0.0000000000
## [14,]  0.0000000000  0.0000000000
## [15,]  0.0000000000  0.0000000000
## [16,]  0.0000000000  0.0000000000
## [17,]  0.0000000000  0.0000000000
## [18,]  0.0000000000  0.0000000000
## [19,]  0.0637721035  0.0635910679
## [20,]  0.0961571643  0.0961659263
## [21,]  0.0000000000  0.0000000000
## [22,]  0.0145922703  0.0145892034
## [23,]  0.0000000000  0.0000000000
## [24,] -0.0389685266 -0.0390514872
## [25,]  0.0256917089  0.0256249736
## [26,]  0.0000000000  0.0000000000
## [27,]  0.0407828633  0.0407070250
## [28,]  0.0183655356  0.0183254028
## [29,]  0.0269937207  0.0268586053
## [30,]  0.0387531390  0.0387718137
## [31,]  0.0000000000  0.0000000000
## [32,] -0.0181018978 -0.0180600307
## [33,]  0.0428830143  0.0429078150
## [34,]  0.0000000000  0.0000000000
## [35,]  0.0000000000  0.0000000000
## [36,]  0.0000000000  0.0000000000
## [37,]  0.0837098733  0.0836933459
## [38,] -0.0004062054 -0.0003973438
## [39,]  0.0000000000  0.0000000000
## [40,]  0.0000000000  0.0000000000
## [41,]  0.0000000000  0.0000000000
## [42,]  0.0000000000  0.0000000000
## [43,]  0.0000000000  0.0000000000
## [44,]  0.0083483261  0.0082331604
## [45,]  0.0000000000  0.0000000000
## [46,]  0.0235917687  0.0235682975
## [47,]  0.0000000000  0.0000000000
## [48,]  0.0000000000  0.0000000000
## [49,] -0.0361962960 -0.0359988507
## [50,]  0.0000000000  0.0000000000
## [51,]  0.0074551531  0.0073961617
## [52,] -0.0476875712 -0.0477806271
## [53,] -0.0104831124 -0.0103190057
## [54,]  0.0000000000  0.0000000000
## [55,]  0.0000000000  0.0000000000
## [56,]  0.0000000000  0.0000000000
## [57,]  0.0745775553  0.0747505605
## [58,]  0.0069703990  0.0068581242
## [59,] -0.0519113331 -0.0520842898
## [60,]  0.0000000000  0.0000000000
## [61,]  0.0000000000  0.0000000000
## [62,] -0.0600073437 -0.0603700670
## [63,]  0.0211350485  0.0213435212
## [64,]  0.0000000000  0.0000000000
# Number of nonzero coefficients from the proximal gradient method
sum(lassoGD$beta.hat != 0) 
## [1] 34
# Number of nonzero coefficients from the accelerated proximal gradient method
sum(lassoACCE$beta.hat != 0) 
## [1] 34
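# Supplementary numerical check of agreement between the two solvers: the largest
# elementwise gap between the coefficient vectors (expected to be small, on the order
# of the tolerance-driven differences visible in the table above).
max(abs(lassoGD$beta.hat - lassoACCE$beta.hat))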