library(latex2exp)
Linear regression and Gaussian data generation process. The problem assumes the error is Gaussian. The purpose of this experiment is to show convergence rate gauss-markov solution \(\hat{\theta}\) to \(\theta_0\) as n increases.
theta_0 <- c(0,1)
lambda <- 0
p<-2
set.seed(123)
sizes <- c(1e3,5e3,1e4,2e4,3e4,5e4,1e5,2e5,5e5,1e6)
m <- length(sizes)
MSE <-rep(m,0)
results_0 <- rep(m,0)
results_1 <- rep(m,0)
for (i in 1:m)
{
print(i)
n <- sizes[i]
x <- rep(0.1,n)
eps <- rnorm(n)
y<-theta_0[1] + theta_0[2]*x + eps
X<-cbind(rep(1,n),x)
A<-solve((t(X)%*%X + diag(rep(n*lambda,p))))
b <- t(X)%*%y
hattheta <- A%*%b
print(hattheta)
results_0[i]<-hattheta[1]
results_1[i]<-hattheta[2]
MSE[i]<- mean(eps^2)
}
[1] 1
Error in solve.default((t(X) %*% X + diag(rep(n * lambda, p)))) :
system is computationally singular: reciprocal condition number = 9.39561e-17
\(n\) trial per experiment \(m\) experiments
m <- 10000
MSE <- rep(0,m)
n <- 10000
x <- rnorm(n)
# loop over experiments
for (i in 1:m){
#print(i)
eps <- rnorm(n)
y<-theta_0 + theta_1*x + eps
X<-cbind(rep(1,n),x)
A<-solve((t(X)%*%X + diag(rep(n*lambda,p))))
b <- t(X)%*%y
hattheta <- A%*%b
MSE[i]<-norm(hattheta-theta_0, type='2')^2
}
# plotting of the expected MSE
check_points <- seq(1,10)*1000
EMSE <- rep(0, length(check_points))
for (i in 1:10)
{
EMSE[i]<-mean(MSE[1:check_points[i]])
}
plot(check_points, EMSE)
f <- function(lambda){
A <- solve((t(X)%*%X + diag(rep(n*lambda,p))))
b <- diag(rep(1,n))-X%*%A%*%t(X)
lambda_prime<- n*norm(b%*%y,type='2')^2/tr(b)^2
return(lambda_prime)
}
jac <- function(param){
# empty
return(0)
}
get.lambda<-function(param0){
res<- nloptr::nloptr( x0=param0,
eval_f=f,
#eval_grad_f=jac,
lb = c(epsilon),
ub = c(999),
opts = list("algorithm"='NLOPT_LN_COBYLA', "maxeval" = 30, "xtol_rel" = 1e-16, "ftol_rel"= 1e-66, "ftol_abs"=1e-6, "print_level"=1,"check_derivatives" = TRUE,"check_derivatives_print" = "all"))
lambda_0<-res$solution
return(lambda_0)
}
Compute \(\lambda_0>0\) using the GCV estimate
epsilon <- 1e-3
p<-2 # dimensionality of the parameters
# Ensure that lambda>0
param0 <- 1.0
n <- 1000
x <- rnorm(n)
eps <- rnorm(n)
y<-theta_0[1] + theta_0[2]*x + eps
X<-cbind(rep(1,n),x)
lambda_0<-get.lambda(param0)
Skipping derivative checker because algorithm 'NLOPT_LN_COBYLA' does not use gradients.
iteration: 1
f(x) = 1.187594
iteration: 2
f(x) = 1.325575
iteration: 3
f(x) = 0.995559
iteration: 4
f(x) = 0.957927
iteration: 5
f(x) = 0.959041
iteration: 6
f(x) = 0.957926
iteration: 7
f(x) = 0.958049
iteration: 8
f(x) = 0.957927
iteration: 9
f(x) = 0.957933
iteration: 10
f(x) = 0.957927
iteration: 11
f(x) = 0.957926
iteration: 12
f(x) = 0.957927
iteration: 13
f(x) = 0.957926
iteration: 14
f(x) = 0.957926
iteration: 15
f(x) = 0.957926
iteration: 16
f(x) = 0.957926
iteration: 17
f(x) = 0.957926
iteration: 18
f(x) = 0.957926
iteration: 19
f(x) = 0.957926
iteration: 20
f(x) = 0.957926
iteration: 21
f(x) = 0.957926
iteration: 22
f(x) = 0.957926
iteration: 23
f(x) = 0.957926
iteration: 24
f(x) = 0.957926
iteration: 25
f(x) = 0.957926
iteration: 26
f(x) = 0.957926
iteration: 27
f(x) = 0.957926
iteration: 28
f(x) = 0.957926
iteration: 29
f(x) = 0.957926
iteration: 30
f(x) = 0.957926
iteration: 31
f(x) = 0.957926
iteration: 32
f(x) = 0.957926
iteration: 33
f(x) = 0.957926
iteration: 34
f(x) = 0.957926
iteration: 35
f(x) = 0.957926
iteration: 36
f(x) = 0.957926
iteration: 37
f(x) = 0.957926
iteration: 38
f(x) = 0.957926
iteration: 39
f(x) = 0.957926
iteration: 40
f(x) = 0.957926
iteration: 41
f(x) = 0.957926
iteration: 42
f(x) = 0.957926
iteration: 43
f(x) = 0.957926
iteration: 44
f(x) = 0.957926
iteration: 45
f(x) = 0.957926
iteration: 46
f(x) = 0.957926
iteration: 47
f(x) = 0.957926
iteration: 48
f(x) = 0.957926
iteration: 49
f(x) = 0.957926
iteration: 50
f(x) = 0.957926
iteration: 51
f(x) = 0.957926
iteration: 52
f(x) = 0.957926
iteration: 53
f(x) = 0.957926
iteration: 54
f(x) = 0.957926
iteration: 55
f(x) = 0.957926
iteration: 56
f(x) = 0.957926
iteration: 57
f(x) = 0.957926
iteration: 58
f(x) = 0.957926
print(lambda_0)
[1] 0.00201856
Now that we have \(\lambda_0\), let’s check that \(\hat{\theta}_{\lambda}\) reduces the EMSE compared to the Gauss-Markov estimate (LS). We’ll need to minimize the neg. log likelihood with a ridge penalty from Eq 1.3 in Wabha
Let’s review some closed form results to tighten up the variance analysis
\(\hat{\beta}_1 = C_{xy}/(\hat\sigma^2 + \lambda)\) \(\hat{\beta}_0 = (\bar{y}- \hat\beta_1\bar{x})/(1+\lambda)\)
get.theta.lambda <- function(lambda){
theta.lambda[2] <-(x%*%y)/(x%*%x + n*lambda) #
theta.lambda[1] <- (mean(y)-theta.lambda[2]*mean(x))/(1+lambda) #
return(theta.lambda)
}
Check with numerical results
objfun <- function(theta){
f <- (1/n)*norm(y-(theta[1] + theta[2]*x),type='2')^2 + lambda_0*norm(theta,type='2')^2
return(f)
}
get.theta.lambda.num <-function(theta_0){
res<- nloptr::nloptr( x0=theta_0,
eval_f=objfun,
#eval_grad_f=jac,
lb = c(-10,-10),
ub = c(10,10),
opts = list("algorithm"='NLOPT_LN_COBYLA', "maxeval" = 500, "xtol_rel" = 1e-16, "ftol_rel"= 1e-16, "ftol_abs"=1e-16, "print_level"=0,"check_derivatives" = FALSE,"check_derivatives_print" = "all"))
theta.lambda.num<-res$solution
return(theta.lambda.num)
}
print('numerical optimization')
[1] "numerical optimization"
print(get.theta.lambda.num(theta_0))
[1] 0.03840644 1.02183998
print('analytic gradients')
[1] "analytic gradients"
print(get.theta.lambda(lambda_0))
[1] 0.0382921 1.0197277
Let’s see that this estimate of \(\hat{\theta}_{\lambda}\) really does reduce the EMSE. I’m not going to reestimate \(\lambda_0\) each time due to numerical complexity.
# n trial per experiment
# m experiments
m <- 1000
MSE <- rep(0,m)
MSE.reg <-rep(0,m)
n <- 10000
x <- rnorm(n)
X<-cbind(rep(1,n),x)
# loop over experiments
for (i in 1:m){
#print(i)
eps <- rnorm(n)
y<-theta_0[1] + theta_0[2]*x + eps
A<-solve((t(X)%*%X + diag(rep(n*lambda,p))))
b <- t(X)%*%y
hattheta <- A%*%b
MSE[i]<-norm(hattheta-theta_0, type='2')^2
#lambda_0<-get.lambda(param0)
theta.lambda<-get.theta.lambda(lambda_0)
#print(lambda_0)
#print(theta.lambda)
MSE.reg[i]<-norm(theta.lambda-theta_0, type='2')^2
}
Good practice to always separating the plotting from the estimation, so that I can touch up figures without having to re-run experiments.
# plotting of the expected MSE
check_points <- seq(1,10)*m/10
EMSE <- rep(0, length(check_points))
EMSE.reg <- rep(0, length(check_points))
for (i in 1:10)
{
EMSE[i]<-mean(MSE[1:check_points[i]])
EMSE.reg[i]<-mean(MSE.reg[1:check_points[i]])
}
plot(check_points, EMSE, xlab='n', ylab='EMSE')
points(check_points, EMSE.reg, col='red')