From ordinary least squares to maximum entropy estimation

Hands-on

Author
Affiliations
Jorge Cabral
Published

November 8, 2024

1 Setting up

1.1 R software

“R is a free software environment for statistical computing and graphics. It compiles and runs on a wide variety of UNIX platforms, Windows and MacOS.” — R software (RCoreTeam 2024)

Follow https://cran.r-project.org/ to install it.



1.2 R Studio

“RStudio is an integrated development environment (IDE) designed to support multiple languages, including both R and Python.” — RStudio

Follow https://posit.co/downloads/ to install it.


1.3 Packages

During the workshop, several R packages will be necessary. Open R Studio and create a new Quarto document.


Insert the following code inside the code chunk.

Code
# install.packages("devtools")
library(devtools)
# install.packages("ggcorrplot")
library(ggcorrplot)
# install.packages("glmnet")
library(glmnet)
# devtools::install_github("jorgevazcabral/GCE",
#                          build_vignettes = TRUE,
#                          build_manual = TRUE)
library(GCE)

https://github.com/jorgevazcabral

1.4 Functions

Code
RMSE <- function (y_pred, y_true) {
  RMSE <- sqrt(mean((y_true - y_pred) ^ 2))
  return(RMSE)
}


2 Simulated Data

2.1 Low collinearity

2.1.1 Generate Data

Code
n = 100 # Number of individuals
intercept.beta = 0 # Intercept
y.gen.cont.beta = c(3, 6, 9) # Coefficients used for generating y
cont.k = 0 # Number of noisy continuous variables 
condnumber = 1 # Condition number of the X matrix 

simulated_data_coef <- 
  c(intercept.beta,
    rep(0, cont.k),
    y.gen.cont.beta)

simulated_data <-
  gdata_GCE(
    n = n,
    cont.k = cont.k, 
    y.gen.cont.k = length(y.gen.cont.beta),
    y.gen.cont.beta = y.gen.cont.beta,
    intercept.beta = intercept.beta,
    Xgenerator.method = "svd",
    condnumber = condnumber,
    Xgenerator.seed = 28102024,
    error.dist = "normal",
    error.dist.mean = 0,
    error.dist.sd = 1,
    dataframe = TRUE)

prop_train = 0.7

set.seed(28102024)
ind <- sample(1:n,
              floor(prop_train * n))

simulated_data_train <- simulated_data[ind,]
simulated_data_test <- simulated_data[-ind,]
2.1.1.1 Summary
Code
summary(simulated_data_train)
      X001                 X002                X003                y          
 Min.   :-0.3451441   Min.   :-0.224542   Min.   :-0.26927   Min.   :-4.4779  
 1st Qu.:-0.0657684   1st Qu.:-0.074109   1st Qu.:-0.08039   1st Qu.:-1.0391  
 Median : 0.0023662   Median :-0.011115   Median :-0.02908   Median :-0.1823  
 Mean   :-0.0008795   Mean   :-0.008867   Mean   :-0.02325   Mean   :-0.2825  
 3rd Qu.: 0.0704543   3rd Qu.: 0.053042   3rd Qu.: 0.02526   3rd Qu.: 0.9982  
 Max.   : 0.2429219   Max.   : 0.207346   Max.   : 0.24159   Max.   : 4.2282  
2.1.1.2 Correlation matrix


2.1.2 Estimation

2.1.2.1 OLS
Code
sd_model_OLS <- 
  lm(data = simulated_data_train,
     y ~ . # or y ~ X001 + X002 + X003
     )
Code
# summary of the linear model
summary(sd_model_OLS)

Call:
lm(formula = y ~ ., data = simulated_data_train)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.95982 -0.88171  0.01528  0.80312  2.49201 

Coefficients:
            Estimate Std. Error t value      Pr(>|t|)    
(Intercept) -0.01196    0.13407  -0.089       0.92916    
X001         3.56848    1.25199   2.850       0.00582 ** 
X002         6.61132    1.26903   5.210 0.00000202846 ***
X003         8.98084    1.34498   6.677 0.00000000605 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.086 on 66 degrees of freedom
Multiple R-squared:  0.5474,    Adjusted R-squared:  0.5269 
F-statistic: 26.61 on 3 and 66 DF,  p-value: 0.00000000002132
Code
# coefficients of the linear model
sd_model_OLS_coef <- coef(sd_model_OLS)
Code
# prediction error
(sd_model_OLS_RMSE_train <-
   RMSE(fitted(sd_model_OLS), simulated_data_train$y))
[1] 1.054651
Code
(sd_model_OLS_RMSE_test <-
   RMSE(predict(sd_model_OLS,
                simulated_data_test),
        simulated_data_test$y))
[1] 0.9763157
Code
# precision error
(sd_model_OLS_beta_RMSE <-
    RMSE(sd_model_OLS_coef, simulated_data_coef))
[1] 0.4175516
Code
par(mfrow = c(2,2))
plot(sd_model_OLS)


2.1.2.2 Ridge
Code
sd_model_ridge <- 
  glmnet::cv.glmnet(
    x = as.matrix(simulated_data_train[,-ncol(simulated_data_train)]),
    y = simulated_data_train[,ncol(simulated_data_train)] ,
    alpha = 0,
    lambda = 10^seq(-3,3,by=0.1))
Code
plot(sd_model_ridge)

Code
plot(sd_model_ridge$glmnet.fit,
     xvar = "lambda")
abline(v = c(log(sd_model_ridge$lambda.min),
             log(sd_model_ridge$lambda.1se)),
       lty = 3)
abline(h = simulated_data_coef,
       lty = 2)

Code
(sd_model_ridge_coef <-
  t(as.matrix(coef(sd_model_ridge,
                   s = "lambda.min")))) # s = "lambda.1se"
   (Intercept)     X001     X002    X003
s1  -0.0147058 3.536252 6.544174 8.88975
Code
(sd_model_ridge_RMSE_train <-
  RMSE(predict(sd_model_ridge,
               as.matrix(simulated_data_train[,-ncol(simulated_data_train)]),
               s = "lambda.min"),
       simulated_data_train$y))
[1] 1.054715
Code
(sd_model_ridge_RMSE_test <-
  RMSE(predict(sd_model_ridge,
               as.matrix(simulated_data_test[,-ncol(simulated_data_test)]),
               s = "lambda.min"),
       simulated_data_test$y))
[1] 0.9753098
Code
(sd_model_ridge_beta_RMSE <-
  RMSE(sd_model_ridge_coef,
       simulated_data_coef))
[1] 0.3860254


2.1.2.3 GCE
Code
sd_model_GCE <- 
  GCE::gce(data = simulated_data_train,
           y ~ .,
           int.one.prior = "ridge",
           int.one.method = "adaptive",
           int.one.alpha.n = 10,
           model = TRUE,
           ci.B = 10,
           seed = 28102024
      )
Code
summary(sd_model_GCE)

Call:
GCE::gce(formula = y ~ ., data = simulated_data_train, model = TRUE, 
    int.one.prior = "ridge", int.one.method = "adaptive", int.one.alpha.n = 10, 
    ci.B = 10, seed = 28102024)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.03275 -0.83422  0.06768  0.79727  2.57732 

Coefficients:
               Estimate  LB 95% CI UB 95% CI
(Intercept) -0.02405397 -0.1200613 0.1294133
X001         3.43021989  2.3710957 4.8320563
X002         6.34404098  5.0259452 7.7960249
X003         8.56795518  5.8771018 9.6037053

Residual standard error: 1.087 on 66 degrees of freedom
Multiple R-squared:  0.5248,    Adjusted R-squared:  0.5032
Choosen support:  2.597
CV-RMSE:  1.129
Code
sd_model_GCE_coef <- coef(sd_model_GCE) 
Code
(sd_model_GCE_RMSE_train <-
  RMSE(fitted(sd_model_GCE),
       simulated_data_train$y))
[1] 1.055852
Code
(sd_model_GCE_RMSE_test <-
    RMSE(as.matrix(cbind(1,
                      simulated_data_test[,-ncol(simulated_data_test)]
                      )) %*% sd_model_GCE_coef,
         simulated_data_test$y))
[1] 0.9741523
Code
(sd_model_GCE_beta_RMSE <-
  RMSE(sd_model_GCE_coef,
                  simulated_data_coef))
[1] 0.350248
Code
par(mfrow = c(2,2))
plot(sd_model_GCE)


2.1.2.4 All
Code
data.frame(rbind(
  cbind(
    "Generating" = simulated_data_coef, 
    OLS = sd_model_OLS_coef,
    ridge = as.numeric(sd_model_ridge_coef),
    GCE = sd_model_GCE_coef
  ),
  beta_RMSE = c(
    0,
    sd_model_OLS_beta_RMSE,
    sd_model_ridge_beta_RMSE,
    sd_model_GCE_beta_RMSE
  ),
  RMSE_train = c(
    0,
    sd_model_OLS_RMSE_train,
    sd_model_ridge_RMSE_train,
    sd_model_GCE_RMSE_train),
  RMSE_test = c(
    0,
    sd_model_OLS_RMSE_test,
    sd_model_ridge_RMSE_test,
    sd_model_GCE_RMSE_test)
))

3 Acknowledgments

This work is supported by the Center for Research and Development in Mathematics and Applications (CIDMA) through the Portuguese Foundation for Science and Technology (FCT–Fundação para a Ciência e a Tecnologia), with references UIDB/04106/2020 https://doi.org/10.54499/UIDB/04106/2020 and UIDP/04106/2020 https://doi.org/10.54499/UIDP/04106/2020.

References

RCoreTeam. 2024. “R: A Language and Environment for Statistical Computing.” https://www.R-project.org/.