# Big Data Exercise 5 Sparsity
# Yanxin Li
# Oct 12 2017
rm(list=ls())
setwd("D:/2017 UT Austin/Statistical Models for Big Data/R")
# Part (A)
# Penalized least-squares objective S_lambda(theta) (without the argmin) for a single y;
# soft thresholding minimizes 0.5*(y - theta)^2 + lambda*|theta|
obj = function(y, theta, lambda){
0.5 * (y-theta)^2 + lambda * abs(theta)
}
# Toy example with given lambda and y
lambda = 1
y = 5
# Plot the objective as a function of theta for y = 5
curve(obj(y,x,lambda),-10,10,xlab=expression(theta),ylab=expression(S[lambda](theta)))
# Mark the analytic minimizer: the soft-thresholding value sign(y)*(|y| - lambda)_+
a = abs(y) - lambda
abline(v=ifelse(y<0,-1,1)*ifelse(a>0,a,0),col=2)
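# Sanity check (sketch, not part of the exercise): the numerical minimizer of the
# penalized objective should coincide with the soft-thresholding value
# sign(y)*(|y| - lambda)_+, which is sign(5)*(5 - 1) = 4 here.
num.min = optimize(function(t) obj(y, t, lambda), interval = c(-10, 10))$minimum
print(num.min) # should be close to 4
print(ifelse(y < 0, -1, 1) * max(abs(y) - lambda, 0)) # closed-form value: 4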

# Hard thresholding function: keep y if |y| >= lambda, otherwise set it to 0
hard.thresh = function(y,lambda){
ifelse(abs(y)>=lambda,y,0)
}
# Soft thresholding function: shrink y toward 0 by lambda, i.e. sign(y)*(|y| - lambda)_+
soft.thresh = function(y,lambda){
a = abs(y) - lambda
results = ifelse(y<0,-1,1)*ifelse(a>0,a,0)
return(results)
}
# Plot the two thresholding rules
curve(soft.thresh(x,1),-3,3,n=10000,xlab='y',ylab=expression(hat(theta)))
curve(hard.thresh(x,1),-3,3,n=10000,xlab='y',ylab=expression(hat(theta)),
col=2,lty=3,add=T,lwd=3)
legend('topleft',legend = c('Soft','Hard'), col=1:2, lty=c(1,3), lwd=c(1,3))
title(expression(paste('Thresholding, ',lambda,'=1')))
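# Quick numerical illustration (sketch) of the two rules on a few arbitrary values:
# hard thresholding keeps surviving entries unchanged, while soft thresholding also
# shrinks them toward zero by lambda.
y.demo = c(-2.5, -0.5, 0, 0.7, 3)
rbind(y = y.demo, hard = hard.thresh(y.demo, 1), soft = soft.thresh(y.demo, 1))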

# Part (B)
# Toy example to illustrate how soft thresholding can be used to enforce sparsity
# Number of datapoints
n = 100
sparsity = 0.2 # fraction of nonzero entries in theta
# Step 1: sparse theta vector
nonzero = round(n*sparsity)
theta = numeric(n)
theta[sample(1:n,nonzero)] <- 1:nonzero
sigma = numeric(n) + 1 # all sigma_i = 1
# Step 2: generate data, Gaussian sequence model
z = rnorm(n = n, mean = theta, sd = sigma)
# Step 3: Soft thresholding for multiple lambda values
lambda = 0:5
plot(0,0,type='n',xlim = range(theta)*1.1, ylim = range(theta)*1.2,
xlab=expression(theta),ylab=expression(paste(hat(theta),'(y)',sep='')),
main='Soft Thresholding')
for(i in 1:length(lambda)){
theta.hat = soft.thresh(y = z, lambda = lambda[i]*sigma)
points(theta, theta.hat, col = i, pch=19, cex = 0.5)
}
abline(0,1,lty=3)
text(22,23,expression(paste(hat(theta),'=',theta)))
legend('topleft',legend=lambda,title=expression(paste(lambda,' value')),col=1:6,pch=19)

# Step 4: Try out varying sparsity
n = 1000 # Number of datapoints
lambda = seq(0,3,length.out = 100) # A discrete grid of different lambda
sparsity = c(0.01,0.05,0.10,0.25,0.50) # varying fraction of nonzero entries
plot(0,0,type='n',xlim = range(lambda)*1.1, ylim = c(0,6),
xlab=expression(lambda),ylab='Scaled MSE')
# Loop over varying sparsity
for(i in 1:length(sparsity)){
# Initialize vector to hold MSE for each lambda
MSE = rep(0,length(lambda))
# Sparse theta vector
nonzero = round(n*sparsity[i])
theta = numeric(n)
theta[sample(1:n,nonzero)] = rpois(nonzero,5)
# Vector of sigma's
sigma = numeric(n) + 1
# Generate one observation per component from the Gaussian sequence model
z = rnorm(n = n, mean = theta, sd = sigma)
# Loop over lambda's
for(j in 1:length(lambda)){
# Soft thresholding
theta.hat = soft.thresh(y = z, lambda = lambda[j]*sigma)
# Calculate MSE
MSE[j] = mean((theta.hat-theta)^2)
}
# Find the minimum
minimum = which.min(MSE)
lines(lambda,MSE/MSE[minimum],col = i) # Scale by minimum
# abline(v=lambda[minimum],col=i)
points(lambda[minimum],1,col=i,pch=19,cex=1.5) # Dot the minimum
}
# Add legend
legend('bottomright',legend=1-sparsity,title='Sparsity',
col=1:length(sparsity),pch=19)
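# Aside (sketch, not required by the exercise): in the Gaussian sequence model with
# sigma = 1, a classical default choice is the "universal" threshold sqrt(2*log(n)),
# which can be compared with the MSE-minimizing lambdas dotted in the plot above.
sqrt(2 * log(n)) # approximately 3.72 for n = 1000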

# The Lasso
# Part (A)
rm(list=ls())
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.4.2
## Loading required package: Matrix
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.2
## Loaded glmnet 2.0-13
# Load X into matrix
X = read.csv('diabetesX.csv')
X = as.matrix(X)
# Load Y into vector
Y = read.csv('diabetesY.csv',header=F)
Y = Y[,1]
# Choice of lambda's
m = 100 # Number of lambda's
lambda = exp(seq(4,-4,length.out=m))
# Fit the lasso over the whole grid of lambda values (glmnet fits the full path in one call)
myLasso = glmnet(X,Y,alpha=1,lambda=lambda)
# Plot both the coefficient paths and the in-sample MSE
# Coefficient paths - glmnet provides a default plot method
plot(myLasso) # by default the x-axis is the L1 norm of the fitted coefficients
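# plot.glmnet can also draw the same coefficient paths against log(lambda) via its
# xvar argument (one-line sketch; a manual version of this plot is built below)
plot(myLasso, xvar = "lambda")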

par(mfrow=c(1,2))
plot(0,0,type='n',
ylim=range(myLasso$beta), ylab=expression(hat(beta)[lambda]),
xlim=log(range(myLasso$lambda)), xlab = expression(paste('log(',lambda,')')),
main = 'Shrinkage of Coefficients'
)
for(i in 1:nrow(myLasso$beta)){
lines(log(lambda),myLasso$beta[i,],col=i)
}
# Plot in-sample MSE by lambda
MSE = myLasso$lambda * 0 # Initialize
for(i in 1:ncol(myLasso$beta)){
MSE[i] = mean((Y - myLasso$a0[i] - X%*%myLasso$beta[,i])^2)
}
plot(log(myLasso$lambda), MSE, type = 'l', col='blue', main = 'In-sample MSE',
ylab= "MSE", xlab = expression(paste('log(',lambda,')')))
abline(v=log(myLasso$lambda[which.min(MSE)]),col='red')

# Part (B): Cross validation
# 10-fold Cross-validation to choose the best tuning parameter lambda
set.seed(1)
ratio = 0.5 # fraction of observations used for training
index = sample(1:nrow(X), floor(ratio*nrow(X))) # indices of the training half
# Training data set and test data set
x.train = X[index, ]
x.test = X[-index,]
y.train = Y[index]
y.test = Y[-index]
# 10-fold cross-validation
set.seed(1)
cv.out = cv.glmnet(x.train,y.train,alpha =1)
par(mfrow=c(1,1))
plot(cv.out)

bestlam = cv.out$lambda.min
print(bestlam) # the value of lambda that gives the smallest cross-validation error
## [1] 0.9351282
# Compute out-of-sample MSE on the held-out test set at the best lambda
cv.pred.lasso = predict(myLasso,s=bestlam,newx=x.test)
MOOSE.best = mean((cv.pred.lasso -y.test)^2)
print(MOOSE.best) # 2808.36
## [1] 2808.36
MOOSE = cv.out$cvm # mean cross-validated error for each lambda tried by cv.glmnet
length(cv.out$lambda) # number of the values of lambda used in the fit
## [1] 90
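# cv.glmnet also reports lambda.1se, the largest lambda whose CV error is within one
# standard error of the minimum -- a more conservative (sparser) choice. Minimal sketch:
print(cv.out$lambda.1se)
sum(coef(cv.out, s = "lambda.1se") != 0) # nonzero coefficients (incl. intercept) at lambda.1se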
# How many parameters at best lambda?
Lasso.mod = glmnet(X,Y,alpha=1,lambda=lambda)
lasso.coef <- predict(Lasso.mod,type ="coefficients",s=bestlam)[1:ncol(X),]
print(lasso.coef)
## (Intercept) age sex bmi map
## 152.1334842 9.1281343 -198.3086164 495.2550390 306.4372768
## tc ldl hdl tch ltg
## -54.4898986 0.0000000 -255.6297685 0.0000000 515.1454903
## glu age.2 bmi.2 map.2 tc.2
## 52.6385248 53.3611471 39.6850467 0.0000000 0.0000000
## ldl.2 hdl.2 tch.2 ltg.2 glu.2
## 0.0000000 0.0000000 0.0000000 -3.6573852 98.4531235
## age.sex age.bmi age.map age.tc age.ldl
## 150.3737668 0.0000000 24.8940141 0.0000000 -54.7136829
## age.hdl age.tch age.ltg age.glu sex.bmi
## 33.3015581 0.0000000 58.1631102 24.8240173 36.9413176
## sex.map sex.tc sex.ldl sex.hdl sex.tch
## 56.9937978 0.0000000 -24.0094851 59.5136464 0.0000000
## sex.ltg sex.glu bmi.map bmi.tc bmi.ldl
## 0.0000000 0.0000000 129.5304893 0.0000000 0.0000000
## bmi.hdl bmi.tch bmi.ltg bmi.glu map.tc
## 0.0000000 0.0000000 0.0000000 0.0000000 5.4462378
## map.ldl map.hdl map.tch map.ltg map.glu
## 0.0000000 36.9121383 0.0000000 0.0000000 -42.9236407
## tc.ldl tc.hdl tc.tch tc.ltg tc.glu
## 0.0000000 8.9794871 -68.9548880 -0.1253885 0.0000000
## ldl.hdl ldl.tch ldl.ltg ldl.glu hdl.tch
## 0.0000000 0.0000000 92.8388346 10.4573718 -71.7012208
## hdl.ltg hdl.glu tch.ltg tch.glu
## 0.0000000 0.0000000 -82.5353633 27.9789264
sum(lasso.coef != 0) # number of nonzero coefficients (including the intercept) at the lambda minimizing CV error
## [1] 35
###################
k = 10
set.seed(1) # Reproducibility
folds = sample(1:k, nrow(x.train), replace=TRUE)
cv.errors <- matrix(NA,k,m,dimnames=list(NULL, paste(1:m)))
for(j in 1:k){
model = glmnet(x.train[folds!=j, ], y.train[folds!=j],alpha=1, lambda=lambda)
for(i in 1:m){
pred <- predict(model, s=lambda[i], newx=x.train[folds==j,])
cv.errors[j,i] = mean((y.train[folds==j] - pred)^2)
}
}
# Mean out-of-sample squared error (MOOSE) for each lambda, averaged over folds
MOOSE = apply(cv.errors,2,mean)
best.lam = lambda[which.min(MOOSE)]
print(best.lam)
## [1] 0.9604013
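# To mirror the error bars drawn by plot(cv.out), a fold-wise standard error of the CV
# error can be computed for each lambda from the cv.errors matrix (sketch; folds here
# have slightly unequal sizes, so this is only approximate):
MOOSE.se = apply(cv.errors, 2, sd) / sqrt(k)
# Largest lambda whose MOOSE is within one SE of the minimum (analogue of lambda.1se)
max(lambda[MOOSE <= min(MOOSE) + MOOSE.se[which.min(MOOSE)]])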
# How many parameters at best lambda?
lasso.cv = glmnet(X,Y,alpha=1,lambda=lambda)
coef.cv <- predict(lasso.cv,type ="coefficients",s=best.lam)[1:ncol(X),]
print(coef.cv)
## (Intercept) age sex bmi map tc
## 152.133484 8.473917 -197.054357 495.195252 305.509029 -53.086810
## ldl hdl tch ltg glu age.2
## 0.000000 -254.302127 0.000000 514.319670 52.268556 52.811263
## bmi.2 map.2 tc.2 ldl.2 hdl.2 tch.2
## 39.919620 0.000000 0.000000 0.000000 0.000000 0.000000
## ltg.2 glu.2 age.sex age.bmi age.map age.tc
## -4.110733 97.717257 149.644623 0.000000 25.038607 0.000000
## age.ldl age.hdl age.tch age.ltg age.glu sex.bmi
## -53.470959 32.081328 0.000000 57.269570 24.000999 35.821855
## sex.map sex.tc sex.ldl sex.hdl sex.tch sex.ltg
## 56.002127 0.000000 -23.113078 57.973380 0.000000 0.000000
## sex.glu bmi.map bmi.tc bmi.ldl bmi.hdl bmi.tch
## 0.000000 128.601063 0.000000 0.000000 0.000000 0.000000
## bmi.ltg bmi.glu map.tc map.ldl map.hdl map.tch
## 0.000000 0.000000 4.353484 0.000000 36.872183 0.000000
## map.ltg map.glu tc.ldl tc.hdl tc.tch tc.ltg
## 0.000000 -40.373069 0.000000 8.817506 -67.336445 0.000000
## tc.glu ldl.hdl ldl.tch ldl.ltg ldl.glu hdl.tch
## 0.000000 0.000000 0.000000 90.131529 10.130020 -69.305190
## hdl.ltg hdl.glu tch.ltg tch.glu
## 0.000000 0.000000 -79.572650 27.130941
sum(coef.cv != 0) # number of nonzero coefficients (including the intercept) at the MOOSE-minimizing lambda
## [1] 34
# Part (C): Mallows' Cp statistic
n = length(Y)
p = ncol(X)
# Set up the sequence of lambdas
lambda = exp(seq(4,-4,length.out=m))
# Initialize Cp vector
Cp = lambda * 0
# Run model with several lambda's
model = glmnet(X,Y,alpha=1,lambda=lambda)
# Calculate Mallows' Cp for each lambda
for(i in 1:ncol(model$beta)){
errors = Y - model$a0[i] - X%*%model$beta[,i]
# Cp = in-sample MSE + 2*df*sigma^2/n, using var(errors) as a plug-in estimate of sigma^2
Cp[i] = mean((errors)^2) + 2*model$df[i]*var(errors)/n
}
lambda[which.min(Cp)]
## [1] 2.532718
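# Alternative (sketch): the textbook Cp formula fixes sigma^2 at a single estimate rather
# than re-estimating it at every lambda. Here the residual variance of the least-penalized
# fit (smallest lambda in the grid) is used as that fixed estimate.
sigma2.hat = as.numeric(var(Y - model$a0[m] - X%*%model$beta[,m]))
Cp2 = sapply(1:m, function(i){
mean((Y - model$a0[i] - X%*%model$beta[,i])^2) + 2*model$df[i]*sigma2.hat/n
})
lambda[which.min(Cp2)] # may differ slightly from the choice above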
lasso.cp = glmnet(X,Y,alpha=1,lambda=lambda)
coef.cp <- predict(lasso.cp,type ="coefficients",s=lambda[which.min(Cp)])[1:ncol(X),]
print(coef.cp)
## (Intercept) age sex bmi map tc
## 152.133484 0.000000 -131.447431 500.702976 262.790076 0.000000
## ldl hdl tch ltg glu age.2
## 0.000000 -200.207989 0.000000 470.043039 25.280831 17.397960
## bmi.2 map.2 tc.2 ldl.2 hdl.2 tch.2
## 42.917308 0.000000 0.000000 0.000000 0.000000 0.000000
## ltg.2 glu.2 age.sex age.bmi age.map age.tc
## 0.000000 76.382357 115.363727 0.000000 30.526933 0.000000
## age.ldl age.hdl age.tch age.ltg age.glu sex.bmi
## 0.000000 0.000000 0.000000 12.606485 9.464140 0.000000
## sex.map sex.tc sex.ldl sex.hdl sex.tch sex.ltg
## 7.635682 0.000000 0.000000 0.000000 0.000000 0.000000
## sex.glu bmi.map bmi.tc bmi.ldl bmi.hdl bmi.tch
## 0.000000 90.472508 0.000000 0.000000 0.000000 0.000000
## bmi.ltg bmi.glu map.tc map.ldl map.hdl map.tch
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## map.ltg map.glu tc.ldl tc.hdl tc.tch tc.ltg
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## tc.glu ldl.hdl ldl.tch ldl.ltg ldl.glu hdl.tch
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## hdl.ltg hdl.glu tch.ltg tch.glu
## 0.000000 0.000000 0.000000 0.000000
sum(coef.cp != 0) # number of nonzero coefficients (including the intercept) at the Cp-minimizing lambda
## [1] 16
# Plot log(lambda) vs. MSE, MOOSE and Cp
plot(log(lambda),MSE,type='l',xlab=expression(paste("log(",lambda,")")), ylab='Error')
lines(log(lambda),MOOSE,col=2)
lines(log(lambda),Cp,col=3)
abline(v=log(lambda[which.min(MSE)]))
abline(v=log(lambda[which.min(MOOSE)]),col="red")
abline(v=log(lambda[which.min(Cp)]),col="green")
legend('topleft',legend=c('IS MSE','MOOSE','Cp'),col=1:3,lty=1)

# How many parameters at best lambda?
sum(model$beta[,which.min(Cp)] != 0)
## [1] 15
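# Summary sketch: lambda selected by each criterion and the number of nonzero
# coefficients (excluding the intercept) at that lambda, reusing objects defined above
idx = c(which.min(MSE), which.min(MOOSE), which.min(Cp))
data.frame(criterion = c("In-sample MSE", "MOOSE", "Cp"),
lambda = lambda[idx],
nonzero = as.numeric(colSums(model$beta[,idx] != 0)))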