Part (A)
Penalized least-squares objective (the function being minimized, before taking the argmin)
obj = function(y, theta, lambda){
  0.5 * (y-theta)^2 + lambda * abs(theta) # L1 penalty; its minimizer is the soft-threshold estimate
}
Toy example with a given lambda and y
lambda = 1
y = 5
Plot the objective as a function of theta for y = 5; the red vertical line marks its minimizer, the soft-threshold value
curve(obj(y,x,lambda),-10,10,xlab=expression(theta),ylab=expression(S[lambda](5)))
a = abs(y) - lambda
abline(v=ifelse(y<0,-1,1)*ifelse(a>0,a,0),col=2) # sign(y)*max(|y|-lambda, 0)

Hard thresholding function
hard.thresh = function(y,lambda){
ifelse(abs(y)>=lambda,y,0)
}
Soft thresholding function
soft.thresh = function(y,lambda){
a = abs(y) - lambda
results = ifelse(y<0,-1,1)*ifelse(a>0,a,0)
return(results)
}
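As a quick sanity check (a small sketch, not part of the original exercise), we can verify numerically that soft.thresh(y, lambda) matches the minimizer of the penalized objective obj defined above:
check.soft = function(y, lambda){
  # numerically minimize the penalized objective and compare with the closed form
  opt = optimize(function(theta) obj(y, theta, lambda), interval = c(-abs(y)-10, abs(y)+10))
  c(numerical = opt$minimum, closed.form = soft.thresh(y, lambda))
}
check.soft(5, 1)    # both entries should be approximately 4
check.soft(-0.5, 1) # both entries should be approximately 0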
Plot the two thresholding functions
curve(soft.thresh(x,1),-3,3,n=10000,xlab='y',ylab=expression(hat(theta)))
curve(hard.thresh(x,1),-3,3,n=10000,xlab='y',ylab=expression(hat(theta)),
col=2,lty=3,add=TRUE,lwd=3)
legend('topleft',legend = c('Soft','Hard'), col=1:2, lty=c(1,3), lwd=c(1,3)) # soft is drawn in black, hard in red
title(expression(paste('Thresholding, ',lambda,'=1')))

Part (B)
Toy example to illustrate how soft-thresholding can be used to enforce sparsity
Number of datapoints
n = 100
sparsity = 0.2
Step 1: sparse theta vector
nonzero = round(n*sparsity)
theta = numeric(n)
theta[sample(1:n,nonzero)] <- 1:nonzero
sigma = numeric(n) + 1
Step 2: generate data from the Gaussian sequence model, z_i ~ N(theta_i, sigma_i^2)
z = rnorm(n = n, mean = theta, sd = sigma)
Step 3: Soft thresholding for multiple lambda values
lambda = 0:5
plot(0,0,type='n',xlim = range(theta)*1.1, ylim = range(theta)*1.2,
xlab=expression(theta),ylab=expression(paste(hat(theta),'(y)',sep='')),
main='Soft Thresholding')
for(i in 1:length(lambda)){
  theta.hat = soft.thresh(y = z, lambda = lambda[i]*sigma) # renamed to avoid shadowing points()
  points(theta, theta.hat, col = i, pch=19, cex = 0.5)
}
abline(0,1,lty=3)
text(22,23,expression(paste(hat(theta),'=',theta)))
legend('topleft',legend=lambda,title=expression(paste(lambda,' value')),col=1:6,pch=19)
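Counting how many estimates are set exactly to zero at each lambda shows the sparsity-enforcing effect directly (a one-line check using the z, sigma, and lambda objects above; not part of the original):
sapply(lambda, function(l) sum(soft.thresh(y = z, lambda = l*sigma) == 0)) # number of exact zeros per lambda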

Step 4: Try out varying sparsity
Number of data points
n = 1000
A discrete grid of lambda values
lambda = seq(0,3,length.out = 100)
Varying sparsity: proportion of nonzero means (the legend below reports the share of zeros)
sparsity = c(0.01,0.05,0.10,0.25,0.50)
Loop over varying sparsity
scale.MSE = matrix(0, nrow=length(lambda), ncol=length(sparsity))
for(i in 1:length(sparsity)){
# Initialize vector to hold MSE for each lambda
MSE = rep(0,length(lambda))
# Sparse theta vector
nonzero = round(n*sparsity[i])
theta = numeric(n)
theta[sample(1:n,nonzero)] = rpois(nonzero,5)
# Vector of sigma's
sigma = numeric(n) + 1
# Simulate one observation per mean (Gaussian sequence model)
z = rnorm(n = n, mean = theta, sd = sigma)
# Loop over lambda's
for(j in 1:length(lambda)){
# Soft thresholding
theta.hat = soft.thresh(y = z, lambda = lambda[j]*sigma)
# Calculate MSE
MSE[j] = mean((theta.hat-theta)^2)
}
# Calculate scaled MSE
scale.MSE[,i] = MSE/MSE[which.min(MSE)] # Scale by minimum
}
Plot scaled MSE vs. lambda
plot(0,0,type='n',xlim = range(lambda)*1.1, ylim = c(0,6),
xlab=expression(lambda),ylab='Scaled MSE')
for(k in 1: length(sparsity)){
lines(lambda,scale.MSE[,k],col = k) # Scale by minimum
points(lambda[which.min(scale.MSE[,k])],1,col=k,pch=19,cex=1.5) # Dot the minimum
}
legend('bottomright',legend=1-sparsity,title='Sparsity',
col=1:length(sparsity),pch=19)
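To read off where each curve attains its minimum (a small helper, assuming the lambda, sparsity, and scale.MSE objects above are still in scope):
# lambda minimizing the (scaled) MSE at each sparsity level
best.lambda = lambda[apply(scale.MSE, 2, which.min)]
data.frame(nonzero.fraction = sparsity, best.lambda = round(best.lambda, 2))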

The Lasso
Part (A)
rm(list=ls())
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.4.2
## Loading required package: Matrix
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.2
## Loaded glmnet 2.0-13
# Load X into matrix
X = read.csv('diabetesX.csv')
X = as.matrix(X)
# Load Y into vector
Y = read.csv('diabetesY.csv',header=F)
Y = Y[,1]
Choice of lambdas
m = 100
lambda = exp(seq(4,-4,length.out=m))
Fit Lasso model across a range of lambda values (which glmnet does automatically)
myLasso = glmnet(X,Y,alpha=1,lambda=lambda)
Plot both the coefficient paths and the in-sample MSE as lambda varies
Plot the betas against lambda; glmnet also supplies a default plot method
plot(myLasso) # default plot: coefficient paths against the L1 norm of the coefficient vector

plot(0,0,type='n',
ylim=range(myLasso$beta), ylab=expression(hat(beta)[lambda]),
xlim=log(range(myLasso$lambda)), xlab = expression(paste('log(',lambda,')')),
main = 'Shrinkage of Coefficients'
)
for(i in 1:nrow(myLasso$beta)){
lines(log(lambda),myLasso$beta[i,],col=i)
}

Plot in-sample MSE by lambda
MSE = myLasso$lambda * 0 # Initialize
for(i in 1:ncol(myLasso$beta)){
MSE[i] = mean((Y - myLasso$a0[i] - X%*%myLasso$beta[,i])^2)
}
plot(log(myLasso$lambda), MSE, type = 'l', col='blue', main = 'In-sample MSE',
ylab= "MSE", xlab = expression(paste('log(',lambda,')')))
abline(v=log(myLasso$lambda[which.min(MSE)]),col='red')

Part (B): Cross-validation
10-fold Cross-validation to choose the best tuning parameter lambda
set.seed(1)
ratio = 0.5 # Split data
index = sample(1:nrow(X), floor(ratio*nrow(X))) # Make equal partition
# Training data set and test data set
x.train = X[index, ]
x.test = X[-index,]
y.train = Y[index]
y.test = Y[-index]
10-fold cross-validation
set.seed(1)
cv.out = cv.glmnet(x.train,y.train,alpha =1)
par(mfrow=c(1,1))
plot(cv.out)
lines(log(cv.out$lambda), cv.out$cvup,col="green", lwd=2)
lines(log(cv.out$lambda), cv.out$cvlo,col="blue", lwd=2)

bestlam = cv.out$lambda.min
print(bestlam)
## [1] 0.9351282
The value of lambda that results in the smallest cross-validation error is 0.9351282
Check what the error bars and the dotted lines in the cv.glmnet plot mean
dotted line 1: the value of lambda that gives the minimum cvm (lambda.min)
dotted line 2: the largest value of lambda whose error is within one standard error of the minimum (lambda.1se)
error bars: cvm + cvsd (upper, traced by the green line) and cvm - cvsd (lower, traced by the blue line)
which(cv.out$lambda==cv.out$lambda.min)
## [1] 43
which(cv.out$lambda==cv.out$lambda.1se)
## [1] 19
cv.out$cvm[19] # MSE of lambda.1se
## [1] 3049.041
cv.out$cvm[43]+cv.out$cvsd[43] # MSE of lambda.min + sd(lambda.min)
## [1] 3070.633
cv.out$cvm[18] # MSE of lambda[18]
## [1] 3090.661
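These numbers confirm the one-standard-error rule: the CV error at lambda.1se (index 19) stays below cvm[43] + cvsd[43], while the next larger lambda (index 18) already exceeds that threshold. The same check can be done programmatically (a small sketch using the cv.out object above):
idx.min = which.min(cv.out$cvm)                     # index of lambda.min
thresh = cv.out$cvm[idx.min] + cv.out$cvsd[idx.min] # one-standard-error threshold
min(which(cv.out$cvm <= thresh))                    # smallest index = largest lambda within one SE
which(cv.out$lambda == cv.out$lambda.1se)           # should agree with the line above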
Refit the Lasso on the full data and extract the coefficients at the best lambda
Lasso.mod = glmnet(X,Y,alpha=1,lambda=lambda)
lasso.coef <- predict(Lasso.mod,type ="coefficients",s=bestlam)[1:ncol(X),] # note: 1:ncol(X) keeps the intercept but drops the last predictor; use 1:(ncol(X)+1) to keep all
print(lasso.coef)
## (Intercept) age sex bmi map
## 152.1334842 9.1281343 -198.3086164 495.2550390 306.4372768
## tc ldl hdl tch ltg
## -54.4898986 0.0000000 -255.6297685 0.0000000 515.1454903
## glu age.2 bmi.2 map.2 tc.2
## 52.6385248 53.3611471 39.6850467 0.0000000 0.0000000
## ldl.2 hdl.2 tch.2 ltg.2 glu.2
## 0.0000000 0.0000000 0.0000000 -3.6573852 98.4531235
## age.sex age.bmi age.map age.tc age.ldl
## 150.3737668 0.0000000 24.8940141 0.0000000 -54.7136829
## age.hdl age.tch age.ltg age.glu sex.bmi
## 33.3015581 0.0000000 58.1631102 24.8240173 36.9413176
## sex.map sex.tc sex.ldl sex.hdl sex.tch
## 56.9937978 0.0000000 -24.0094851 59.5136464 0.0000000
## sex.ltg sex.glu bmi.map bmi.tc bmi.ldl
## 0.0000000 0.0000000 129.5304893 0.0000000 0.0000000
## bmi.hdl bmi.tch bmi.ltg bmi.glu map.tc
## 0.0000000 0.0000000 0.0000000 0.0000000 5.4462378
## map.ldl map.hdl map.tch map.ltg map.glu
## 0.0000000 36.9121383 0.0000000 0.0000000 -42.9236407
## tc.ldl tc.hdl tc.tch tc.ltg tc.glu
## 0.0000000 8.9794871 -68.9548880 -0.1253885 0.0000000
## ldl.hdl ldl.tch ldl.ltg ldl.glu hdl.tch
## 0.0000000 0.0000000 92.8388346 10.4573718 -71.7012208
## hdl.ltg hdl.glu tch.ltg tch.glu
## 0.0000000 0.0000000 -82.5353633 27.9789264
sum(lasso.coef != 0) # number of nonzero coefficients (intercept included) at lambda.min
## [1] 35
Compute the 10-fold CV mean out-of-sample squared error (MOOSE) by hand instead of using cv.glmnet
k = 10
set.seed(1) # Reproducibility
folds = sample(1:k, nrow(x.train), replace=TRUE)
cv.errors <- matrix(NA,k,m,dimnames=list(NULL, paste(1:m)))
for(j in 1:k){
model = glmnet(x.train[folds!=j, ], y.train[folds!=j],alpha=1, lambda=lambda)
for(i in 1:m){
pred <- predict(model, s=lambda[i], newx=x.train[folds==j,])
cv.errors[j,i] = mean((y.train[folds==j] - pred)^2)
}
}
Average over folds to obtain the mean out-of-sample squared error (MOOSE)
MOOSE = apply(cv.errors,2,mean)
best.lam = lambda[which.min(MOOSE)]
print(best.lam)
## [1] 0.9604013
The hand-rolled 10-fold CV selects lambda = 0.960, close to cv.glmnet's 0.935 (the fold assignments differ, so the two need not match exactly). How many parameters are selected at this best lambda?
lasso.cv = glmnet(X,Y,alpha=1,lambda=lambda)
coef.cv <- predict(lasso.cv,type ="coefficients",s=best.lam)[1:ncol(X),]
print(coef.cv)
## (Intercept) age sex bmi map tc
## 152.133484 8.473917 -197.054357 495.195252 305.509029 -53.086810
## ldl hdl tch ltg glu age.2
## 0.000000 -254.302127 0.000000 514.319670 52.268556 52.811263
## bmi.2 map.2 tc.2 ldl.2 hdl.2 tch.2
## 39.919620 0.000000 0.000000 0.000000 0.000000 0.000000
## ltg.2 glu.2 age.sex age.bmi age.map age.tc
## -4.110733 97.717257 149.644623 0.000000 25.038607 0.000000
## age.ldl age.hdl age.tch age.ltg age.glu sex.bmi
## -53.470959 32.081328 0.000000 57.269570 24.000999 35.821855
## sex.map sex.tc sex.ldl sex.hdl sex.tch sex.ltg
## 56.002127 0.000000 -23.113078 57.973380 0.000000 0.000000
## sex.glu bmi.map bmi.tc bmi.ldl bmi.hdl bmi.tch
## 0.000000 128.601063 0.000000 0.000000 0.000000 0.000000
## bmi.ltg bmi.glu map.tc map.ldl map.hdl map.tch
## 0.000000 0.000000 4.353484 0.000000 36.872183 0.000000
## map.ltg map.glu tc.ldl tc.hdl tc.tch tc.ltg
## 0.000000 -40.373069 0.000000 8.817506 -67.336445 0.000000
## tc.glu ldl.hdl ldl.tch ldl.ltg ldl.glu hdl.tch
## 0.000000 0.000000 0.000000 90.131529 10.130020 -69.305190
## hdl.ltg hdl.glu tch.ltg tch.glu
## 0.000000 0.000000 -79.572650 27.130941
sum(coef.cv != 0) # number of nonzero betas for minimum MOOSE
## [1] 34
Compute Leave-one-out CV MOOSE
error.LOOCV = matrix(NA,nrow(X),m,dimnames=list(NULL, paste(1:m)))
for(i in 1:nrow(X)){
model = glmnet(X[-i,],Y[-i],alpha=1,lambda=lambda)
error.LOOCV[i,] = (Y[i] - predict(model,X[i,,drop=F]))^2
}
Get Mean OOS square error (MOOSE) and plot
MOOSE.LOOCV = apply(error.LOOCV,2,mean)
best.lam.LOOCV = lambda[which.min(MOOSE.LOOCV)]
print(best.lam.LOOCV) # 2.532718
## [1] 2.532718
plot(log(lambda),MOOSE.LOOCV,type='l',xlab=expression(paste("log(",lambda,")")))

How many parameters at best lambda?
lasso.LOOCV = glmnet(X,Y,alpha=1,lambda=lambda)
coef.LOOCV <- predict(lasso.LOOCV,type ="coefficients",s=best.lam.LOOCV)[1:ncol(X),]
print(coef.LOOCV)
## (Intercept) age sex bmi map tc
## 152.133484 0.000000 -131.447431 500.702976 262.790076 0.000000
## ldl hdl tch ltg glu age.2
## 0.000000 -200.207989 0.000000 470.043039 25.280831 17.397960
## bmi.2 map.2 tc.2 ldl.2 hdl.2 tch.2
## 42.917308 0.000000 0.000000 0.000000 0.000000 0.000000
## ltg.2 glu.2 age.sex age.bmi age.map age.tc
## 0.000000 76.382357 115.363727 0.000000 30.526933 0.000000
## age.ldl age.hdl age.tch age.ltg age.glu sex.bmi
## 0.000000 0.000000 0.000000 12.606485 9.464140 0.000000
## sex.map sex.tc sex.ldl sex.hdl sex.tch sex.ltg
## 7.635682 0.000000 0.000000 0.000000 0.000000 0.000000
## sex.glu bmi.map bmi.tc bmi.ldl bmi.hdl bmi.tch
## 0.000000 90.472508 0.000000 0.000000 0.000000 0.000000
## bmi.ltg bmi.glu map.tc map.ldl map.hdl map.tch
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## map.ltg map.glu tc.ldl tc.hdl tc.tch tc.ltg
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## tc.glu ldl.hdl ldl.tch ldl.ltg ldl.glu hdl.tch
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## hdl.ltg hdl.glu tch.ltg tch.glu
## 0.000000 0.000000 0.000000 0.000000
sum(coef.LOOCV != 0) # number of nonzero betas for minimum MOOSE
## [1] 16
Part (C): Mallows' Cp statistic
n = length(Y)
p = ncol(X)
# Setup sequence of lambdas
lambda = exp(seq(4,-4,length.out=m))
# Initialize Cp vector
Cp = lambda * 0
Fit the model across the grid of lambdas
model = glmnet(X,Y,alpha=1,lambda=lambda)
Calculate Mallows' Cp for each lambda: Cp(lambda) = MSE(lambda) + 2*df(lambda)*sigma_hat^2/n, with sigma_hat^2 estimated here by the residual variance at that lambda
for(i in 1:ncol(model$beta)){
errors = Y - model$a0[i] - X%*%model$beta[,i]
Cp[i] = mean((errors)^2) + 2*model$df[i]*var(errors)/n
}
lambda[which.min(Cp)]
## [1] 2.532718
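A common variant (a sketch of an alternative, not part of the original) is to plug in a single noise-variance estimate, here taken from the residuals of the least-penalized fit, rather than re-estimating sigma^2 at every lambda:
# Cp with a fixed noise-variance estimate from the smallest lambda in the grid
sigma2.hat = var(as.numeric(Y - model$a0[m] - X %*% model$beta[,m]))
Cp.fixed = numeric(m)
for(i in 1:m){
  errors = Y - model$a0[i] - X %*% model$beta[,i]
  Cp.fixed[i] = mean(errors^2) + 2*model$df[i]*sigma2.hat/n
}
lambda[which.min(Cp.fixed)] # may or may not agree with the choice above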
lasso.cp = glmnet(X,Y,alpha=1,lambda=lambda)
coef.cp <- predict(lasso.cp,type ="coefficients",s=lambda[which.min(Cp)])[1:ncol(X),]
print(coef.cp)
## (Intercept) age sex bmi map tc
## 152.133484 0.000000 -131.447431 500.702976 262.790076 0.000000
## ldl hdl tch ltg glu age.2
## 0.000000 -200.207989 0.000000 470.043039 25.280831 17.397960
## bmi.2 map.2 tc.2 ldl.2 hdl.2 tch.2
## 42.917308 0.000000 0.000000 0.000000 0.000000 0.000000
## ltg.2 glu.2 age.sex age.bmi age.map age.tc
## 0.000000 76.382357 115.363727 0.000000 30.526933 0.000000
## age.ldl age.hdl age.tch age.ltg age.glu sex.bmi
## 0.000000 0.000000 0.000000 12.606485 9.464140 0.000000
## sex.map sex.tc sex.ldl sex.hdl sex.tch sex.ltg
## 7.635682 0.000000 0.000000 0.000000 0.000000 0.000000
## sex.glu bmi.map bmi.tc bmi.ldl bmi.hdl bmi.tch
## 0.000000 90.472508 0.000000 0.000000 0.000000 0.000000
## bmi.ltg bmi.glu map.tc map.ldl map.hdl map.tch
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## map.ltg map.glu tc.ldl tc.hdl tc.tch tc.ltg
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## tc.glu ldl.hdl ldl.tch ldl.ltg ldl.glu hdl.tch
## 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## hdl.ltg hdl.glu tch.ltg tch.glu
## 0.000000 0.000000 0.000000 0.000000
sum(coef.cp != 0) # number of nonzero betas for minimum Cp
## [1] 16
Plot log(lambda) vs. in-sample MSE, 10-fold CV MOOSE, Cp, and LOOCV MOOSE
plot(log(lambda),MSE,type='l',xlab=expression(paste("log(",lambda,")")), ylab='Error')
abline(v=log(lambda[which.min(MSE)]))
col = c(2,3,4)
error = cbind(MOOSE,Cp,MOOSE.LOOCV)
for(i in 1:3){
lines(log(lambda),error[,i],col=col[i])
abline(v=log(lambda[which.min(error[,i])]),col=col[i])
}
legend('topleft',legend=c('IS MSE','10-fold CV MOOSE','Cp','LOOCV MOOSE'),col=1:4,lty=1)

How many parameters at the best lambda (not counting the intercept)?
sum(model$beta[,which.min(Cp)] != 0)
## [1] 15
Compare the LOOCV MOOSE curve with Cp: both are minimized at the same lambda (2.532718), but the two criteria never take exactly the same value
which(Cp == MOOSE.LOOCV)
## named integer(0)