# Big Data Exercise06 The Proximal Gradient Method
# Yanxin Li
# Oct 20 2017
rm(list=ls())
setwd("D:/2017 UT Austin/Statistical Models for Big Data/R")
# Negative log-likelihood (Gaussian model, up to an additive constant):
# nll(beta) = (1/(2n)) * ||Y - X %*% beta||^2
nll <- function(X, Y, beta) {
  A <- Y - X %*% beta
  loglike <- (0.5/nrow(X)) * crossprod(A)
  return(loglike)
}
# Gradient of the negative log-likelihood: -(1/n) * t(X) %*% (Y - X %*% beta)
gradient <- function(X, Y, beta) {
  A <- Y - X %*% beta
  grad <- -(1/nrow(X)) * crossprod(X, A)
  return(grad)
}
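# Quick sanity check of the gradient (illustrative only; Xs, Ys, b, fd are
# throwaway names, not part of the exercise): compare the analytic gradient
# with a central finite-difference approximation on a tiny simulated problem.
set.seed(42)
Xs <- matrix(rnorm(5*4), nrow=5); Ys <- rnorm(5); b <- rnorm(4); eps <- 1e-6
fd <- sapply(1:4, function(j) {
  e <- replace(rep(0, 4), j, eps)
  (nll(Xs, Ys, b + e) - nll(Xs, Ys, b - e)) / (2*eps)
})
max(abs(fd - gradient(Xs, Ys, b)))   # should be numerically tiny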
# Proximal operator of the L1 penalty (soft-thresholding):
# prox(u) = sign(u) * max(|u| - lambda, 0), applied elementwise
prox.l1 <- function(u, lambda) {
  prox <- sign(u) * pmax(abs(u) - lambda, 0)
  return(prox)
}
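# Illustrative check of the soft-thresholding operator (throwaway call, not part
# of the exercise): entries smaller than lambda in absolute value are zeroed out,
# larger ones are shrunk toward zero by lambda.
prox.l1(c(-2, 0.005, 3), 0.01)   # expected: -1.99  0.00  2.99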
# Proximal gradient descent for the lasso objective:
# (1/(2n)) * ||Y - X %*% beta||^2 + lambda * ||beta||_1
proxGD <- function(X, Y, lambda=0.01, gamma=0.01, beta0=NULL, iter=50000, conv=1e-10){
  # Default starting point: a vector of zeros
  if (is.null(beta0)) { beta0 <- rep(0, ncol(X)) }
  # Initialize coefficients matrix (one column per iterate)
  beta <- matrix(NA_real_, nrow=ncol(X), ncol=iter+1)
  beta[, 1] <- beta0
  # Initialize objective function
  obj <- rep(NA, iter+1)
  obj[1] <- nll(X, Y, beta0) + lambda * sum(abs(beta0))
  # Initialize convergence message in case convergence is not reached
  message <- "Convergence not reached..."
  for (t in 1:iter) {
    # Gradient step on the smooth part, then proximal (soft-thresholding) step
    u <- beta[, t] - gamma * gradient(X, Y, beta[, t])
    beta[, t+1] <- prox.l1(u, gamma * lambda)
    obj[t+1] <- nll(X, Y, beta[, t+1]) + lambda * sum(abs(beta[, t+1]))
    # Check the relative change in the objective for convergence
    delta <- abs(obj[t+1] - obj[t]) / (abs(obj[t]) + conv)
    if (delta < conv) {
      # Keep only the iterates actually computed
      beta <- beta[, 1:(t+1), drop=FALSE]
      obj <- obj[1:(t+1)]
      # Update convergence message
      message <- sprintf("Convergence reached after %i iterations", t+1)
      break
    }
  }
  result <- list("beta.hat"=beta[, ncol(beta)], "beta"=beta, "objective"=obj, "conv"=message)
  return(result)
}
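# A minimal usage sketch on simulated sparse data (illustrative; Xsim, Ysim,
# beta.true and fit.sim are throwaway names, not part of the exercise data):
# the two truly nonzero coefficients should dominate the fit while most of the
# remaining ones are driven to exactly zero by the L1 penalty.
set.seed(1)
Xsim <- matrix(rnorm(100*10), nrow=100)
beta.true <- c(2, -1, rep(0, 8))
Ysim <- Xsim %*% beta.true + rnorm(100)
fit.sim <- proxGD(Xsim, Ysim, lambda=0.05, gamma=0.01)
round(fit.sim$beta.hat, 3)
fit.sim$conv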
# Accelerated proximal gradient method (FISTA-style Nesterov momentum)
proxACCE <- function(X, Y, lambda=0.01, gamma=0.01, beta0=NULL, iter=50000, conv=1e-10){
  # Default starting point: a vector of zeros
  if (is.null(beta0)) { beta0 <- rep(0, ncol(X)) }
  # Momentum sequence: s[1] = 1, s[j] = (1 + sqrt(1 + 4*s[j-1]^2)) / 2
  s <- rep(NA, iter+1)
  s[1] <- 1
  for (j in 2:length(s)) {
    s[j] <- (1 + sqrt(1 + 4*s[j-1]^2)) / 2
  }
  # Initialize coefficients matrix (one column per iterate)
  beta <- matrix(NA_real_, nrow=ncol(X), ncol=iter+1)
  beta[, 1] <- beta0
  # Initialize the extrapolated (momentum) sequence z at the starting point
  z <- matrix(0, nrow=ncol(X), ncol=iter+1)
  z[, 1] <- beta0
  # Initialize objective function
  obj <- rep(NA, iter+1)
  obj[1] <- nll(X, Y, beta0) + lambda * sum(abs(beta0))
  # Initialize convergence message in case convergence is not reached
  message <- "Convergence not reached..."
  for (t in 1:iter) {
    # Gradient + proximal step taken from the extrapolated point z
    u <- z[, t] - gamma * gradient(X, Y, z[, t])
    beta[, t+1] <- prox.l1(u, gamma * lambda)
    # Momentum extrapolation
    z[, t+1] <- beta[, t+1] + (s[t] - 1)/s[t+1] * (beta[, t+1] - beta[, t])
    obj[t+1] <- nll(X, Y, beta[, t+1]) + lambda * sum(abs(beta[, t+1]))
    # Check the relative change in the objective for convergence
    delta <- abs(obj[t+1] - obj[t]) / (abs(obj[t]) + conv)
    if (delta < conv) {
      # Keep only the iterates actually computed
      beta <- beta[, 1:(t+1), drop=FALSE]
      obj <- obj[1:(t+1)]
      # Update convergence message
      message <- sprintf("Convergence reached after %i iterations", t+1)
      break
    }
  }
  result <- list("beta.hat"=beta[, ncol(beta)], "beta"=beta, "objective"=obj, "conv"=message)
  return(result)
}
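# Illustrative follow-up on the same simulated data as the sketch above (fit.acc
# is a hypothetical name): the accelerated method should typically reach
# essentially the same minimizer in noticeably fewer iterations.
fit.acc <- proxACCE(Xsim, Ysim, lambda=0.05, gamma=0.01)
c(length(fit.sim$objective), length(fit.acc$objective))   # iteration counts
max(abs(fit.sim$beta.hat - fit.acc$beta.hat))             # should be near zero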
library(MASS)
library(glmnet)
library(ggplot2)
# Read in data
X <- as.matrix(read.csv("diabetesX.csv", header = TRUE))
Y <- as.numeric(unlist(read.csv("diabetesY.csv", header = FALSE)))
# Scale data
X <- scale(X)
Y <- scale(Y)
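# The gradient of the squared-error term is Lipschitz with constant L equal to
# the largest eigenvalue of crossprod(X)/nrow(X), and the step size gamma should
# satisfy gamma <= 1/L. A quick illustrative check (L.max is a throwaway name)
# that the default gamma = 0.01 used below is within this bound:
L.max <- max(eigen(crossprod(X)/nrow(X), symmetric=TRUE, only.values=TRUE)$values)
1/L.max   # upper bound on a safe step size; gamma = 0.01 should fall below it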
# Set up the grid of lambda values: 100 points decreasing from 1 down to exp(-7) ~ 0.00091
lambda <- exp(seq(0, -7, length.out=100))
# Compare the proximal gradient algorithm with glmnet
# Calculate coefficients with the proximal gradient algorithm at each lambda
betamat <- matrix(NA_real_, nrow=ncol(X), ncol=length(lambda))
for (i in 1:length(lambda)) {
  mylasso <- proxGD(X, Y, lambda=lambda[i])
  betamat[, i] <- mylasso$beta.hat
}
# Fit the lasso across the same range of lambda values with glmnet
# (for Gaussian models glmnet minimizes (1/(2n))*RSS + lambda*||beta||_1,
# essentially the same objective as above once the data are standardized)
myLasso <- glmnet(X, Y, alpha=1, lambda=lambda)
# Plot all coefficient paths vs. lambda, from both the proximal gradient method and glmnet
par(mar=c(4,5,4,2))
par(mfrow=c(1,2))
plot(0,0,type='n', ylim=c(min(betamat), max(betamat)), cex.main=0.8,
ylab=expression(hat(beta)[lambda]), xlim=log(range(lambda)),
xlab=expression(paste('log(',lambda,')')), main='Coefficients of Proximal Gradient')
for (i in 1:nrow(betamat)) { lines(log(lambda), betamat[i, ], col=i) }
# plot(myLasso) # R default, the L1 norm is the regularization term for Lasso
plot(0,0,type='n', ylim=range(myLasso$beta), cex.main=0.8,
ylab=expression(hat(beta)[lambda]), xlim=log(range(myLasso$lambda)),
xlab=expression(paste('log(',lambda,')')), main='Coefficients of glmnet Package')
for(i in 1:nrow(myLasso$beta)){ lines(log(lambda),myLasso$beta[i,],col=i) }

# 10-fold cross-validation (on a random half of the data) to choose the tuning parameter lambda for glmnet
set.seed(1)
ratio <- 0.5 # fraction of observations used for cross-validation
index <- sample(1:nrow(X), floor(ratio*nrow(X))) # random half of the rows
set.seed(1)
cv.out <- cv.glmnet(X[index, ], Y[index], alpha=1)
par(mfrow=c(1,1))
plot(cv.out)

bestlam <- cv.out$lambda.min
# The value of lambda that results in the smallest CV error is 0.01212987
print(bestlam)
## [1] 0.01212987
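# Illustrative follow-up (not part of the original output; coef.min is a
# throwaway name): count the coefficients glmnet keeps nonzero at lambda.min,
# for comparison with the counts from the proximal gradient fits below.
coef.min <- as.matrix(coef(cv.out, s = "lambda.min"))
sum(coef.min[-1, ] != 0)   # drop the intercept before counting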
# Compare proximal gradient algorithm with accelerated proximal gradient algorithm
lassoGD <- proxGD(X, Y, lambda=0.01, gamma=0.01, conv=1e-10)
lassoACCE <- proxACCE(X, Y, lambda=0.01, gamma=0.01, conv=1e-10)
# Number of iterations (stored objective values) at convergence
n1 <- length(lassoGD$objective)
n2 <- length(lassoACCE$objective)
print(n1); print(n2)
## [1] 4807
## [1] 652
# Plot the objective values and compare the number of iterations needed to reach convergence
par(mfrow=c(1,1))
plot(lassoGD$objective, type="l", col="black", log="x",
     xlab="iteration", ylab="Objective Function", cex.main=1,
     main="Objective for Prox and Accelerated Prox Methods, conv = 1e-10")
lines(lassoACCE$objective, col="red")
# Mark the final (converged) objective value of each method
points(n1, lassoGD$objective[n1], pch=19, col="black")
points(n2, lassoACCE$objective[n2], pch=19, col="red")
legend("topright", legend=c("Proximal Method", "Accelerated Proximal Method"),
       col=c("black", "red"), lty=c(1, 1), cex=1)

# Compare beta.hat's of the two algorithms
plot(lassoGD$beta.hat, lassoACCE$beta.hat,
main="Comparison of Coefficients from Two Methods",
xlab="Coefficients from Prox Method",
ylab="Coefficients from Accel Prox Method",pch=16,col="purple")
abline(0, 1)

# Inspect the two sets of fitted coefficients side by side
cbind(lassoGD$beta.hat,lassoACCE$beta.hat)
## [,1] [,2]
## [1,] 0.0085564429 0.0085915940
## [2,] -0.1272850118 -0.1273894244
## [3,] 0.3054399755 0.3052213663
## [4,] 0.1927202076 0.1927523430
## [5,] -0.0389473856 -0.0390691752
## [6,] 0.0000000000 0.0000000000
## [7,] -0.1631759396 -0.1632479365
## [8,] 0.0000000000 0.0000000000
## [9,] 0.3233600948 0.3235603113
## [10,] 0.0337271163 0.0337561048
## [11,] 0.0350896498 0.0351730733
## [12,] 0.0238524080 0.0239467991
## [13,] 0.0000000000 0.0000000000
## [14,] 0.0000000000 0.0000000000
## [15,] 0.0000000000 0.0000000000
## [16,] 0.0000000000 0.0000000000
## [17,] 0.0000000000 0.0000000000
## [18,] 0.0000000000 0.0000000000
## [19,] 0.0637721035 0.0635910679
## [20,] 0.0961571643 0.0961659263
## [21,] 0.0000000000 0.0000000000
## [22,] 0.0145922703 0.0145892034
## [23,] 0.0000000000 0.0000000000
## [24,] -0.0389685266 -0.0390514872
## [25,] 0.0256917089 0.0256249736
## [26,] 0.0000000000 0.0000000000
## [27,] 0.0407828633 0.0407070250
## [28,] 0.0183655356 0.0183254028
## [29,] 0.0269937207 0.0268586053
## [30,] 0.0387531390 0.0387718137
## [31,] 0.0000000000 0.0000000000
## [32,] -0.0181018978 -0.0180600307
## [33,] 0.0428830143 0.0429078150
## [34,] 0.0000000000 0.0000000000
## [35,] 0.0000000000 0.0000000000
## [36,] 0.0000000000 0.0000000000
## [37,] 0.0837098733 0.0836933459
## [38,] -0.0004062054 -0.0003973438
## [39,] 0.0000000000 0.0000000000
## [40,] 0.0000000000 0.0000000000
## [41,] 0.0000000000 0.0000000000
## [42,] 0.0000000000 0.0000000000
## [43,] 0.0000000000 0.0000000000
## [44,] 0.0083483261 0.0082331604
## [45,] 0.0000000000 0.0000000000
## [46,] 0.0235917687 0.0235682975
## [47,] 0.0000000000 0.0000000000
## [48,] 0.0000000000 0.0000000000
## [49,] -0.0361962960 -0.0359988507
## [50,] 0.0000000000 0.0000000000
## [51,] 0.0074551531 0.0073961617
## [52,] -0.0476875712 -0.0477806271
## [53,] -0.0104831124 -0.0103190057
## [54,] 0.0000000000 0.0000000000
## [55,] 0.0000000000 0.0000000000
## [56,] 0.0000000000 0.0000000000
## [57,] 0.0745775553 0.0747505605
## [58,] 0.0069703990 0.0068581242
## [59,] -0.0519113331 -0.0520842898
## [60,] 0.0000000000 0.0000000000
## [61,] 0.0000000000 0.0000000000
## [62,] -0.0600073437 -0.0603700670
## [63,] 0.0211350485 0.0213435212
## [64,] 0.0000000000 0.0000000000
# Number of nonzero coefficients by using proximal gradient method
sum(lassoGD$beta.hat != 0)
## [1] 34
# Number of nonzero coefficients by using accelerated proximal gradient
sum(lassoACCE$beta.hat != 0)
## [1] 34