From ordinary least squares to maximum entropy estimation

1 Setting up

1.1 R software

“R is a free software environment for statistical computing and graphics. It compiles and runs on a wide variety of UNIX platforms, Windows and MacOS.” — R software (RCoreTeam 2024)

Follow https://cran.r-project.org/ to install it.

1.2 R Studio

“RStudio is an integrated development environment (IDE) designed to support multiple languages, including both R and Python.” — RStudio

Follow https://posit.co/downloads/ to install it.

1.3 Packages

During the workshop, several R packages will be necessary. Open R Studio and create a new Quarto document.

Insert the following code inside the code chunk.

Code

# install.packages("devtools")
library(devtools)
# install.packages("ggcorrplot")
library(ggcorrplot)
# install.packages("glmnet")
library(glmnet)
# devtools::install_github("jorgevazcabral/GCE",
#                          build_vignettes = TRUE,
#                          build_manual = TRUE)
library(GCE)

https://github.com/jorgevazcabral

1.4 Functions

Code

RMSE <- function (y_pred, y_true) {
  RMSE <- sqrt(mean((y_true - y_pred) ^ 2))
  return(RMSE)
}

2 Simulated Data

2.1 Low collinearity

2.1.1 Generate Data

Code

n = 100 # Number of individuals
intercept.beta = 0 # Intercept
y.gen.cont.beta = c(3, 6, 9) # Coefficients used for generating y
cont.k = 0 # Number of noisy continuous variables 
condnumber = 1 # Condition number of the X matrix 

simulated_data_coef <- 
  c(intercept.beta,
    rep(0, cont.k),
    y.gen.cont.beta)

simulated_data <-
  gdata_GCE(
    n = n,
    cont.k = cont.k, 
    y.gen.cont.k = length(y.gen.cont.beta),
    y.gen.cont.beta = y.gen.cont.beta,
    intercept.beta = intercept.beta,
    Xgenerator.method = "svd",
    condnumber = condnumber,
    Xgenerator.seed = 28102024,
    error.dist = "normal",
    error.dist.mean = 0,
    error.dist.sd = 1,
    dataframe = TRUE)

prop_train = 0.7

set.seed(28102024)
ind <- sample(1:n,
              floor(prop_train * n))

simulated_data_train <- simulated_data[ind,]
simulated_data_test <- simulated_data[-ind,]

2.1.1.1 Summary

Code

summary(simulated_data_train)

      X001                 X002                X003                y          
 Min.   :-0.3451441   Min.   :-0.224542   Min.   :-0.26927   Min.   :-4.4779  
 1st Qu.:-0.0657684   1st Qu.:-0.074109   1st Qu.:-0.08039   1st Qu.:-1.0391  
 Median : 0.0023662   Median :-0.011115   Median :-0.02908   Median :-0.1823  
 Mean   :-0.0008795   Mean   :-0.008867   Mean   :-0.02325   Mean   :-0.2825  
 3rd Qu.: 0.0704543   3rd Qu.: 0.053042   3rd Qu.: 0.02526   3rd Qu.: 0.9982  
 Max.   : 0.2429219   Max.   : 0.207346   Max.   : 0.24159   Max.   : 4.2282

2.1.1.2 Correlation matrix

Code

ggcorrplot::ggcorrplot(
  cor(simulated_data),
  lab = TRUE,
  title = paste("Cond =",
                round(base::kappa(
                  simulated_data[,-ncol(simulated_data)],
                  exact = TRUE),
                  0)))

Code

ggcorrplot::ggcorrplot(
  cor(simulated_data_train),
  lab = TRUE,
  title = paste("Cond =",
                round(base::kappa(
                  simulated_data_train[,-ncol(simulated_data_train)],
                  exact = TRUE),
                  0)))

Code

ggcorrplot::ggcorrplot(
  cor(simulated_data_test),
  lab = TRUE,
  title = paste("Cond =",
                round(base::kappa(
                  simulated_data_test[,-ncol(simulated_data_test)],
                  exact = TRUE),
                  0)))

2.1.2 Estimation

2.1.2.1 OLS

Code

sd_model_OLS <- 
  lm(data = simulated_data_train,
     y ~ . # or y ~ X001 + X002 + X003
     )

Code

# summary of the linear model
summary(sd_model_OLS)


Call:
lm(formula = y ~ ., data = simulated_data_train)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.95982 -0.88171  0.01528  0.80312  2.49201 

Coefficients:
            Estimate Std. Error t value      Pr(>|t|)    
(Intercept) -0.01196    0.13407  -0.089       0.92916    
X001         3.56848    1.25199   2.850       0.00582 ** 
X002         6.61132    1.26903   5.210 0.00000202846 ***
X003         8.98084    1.34498   6.677 0.00000000605 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.086 on 66 degrees of freedom
Multiple R-squared:  0.5474,    Adjusted R-squared:  0.5269 
F-statistic: 26.61 on 3 and 66 DF,  p-value: 0.00000000002132

Code

# coefficients of the linear model
sd_model_OLS_coef <- coef(sd_model_OLS)

Code

# prediction error
(sd_model_OLS_RMSE_train <-
   RMSE(fitted(sd_model_OLS), simulated_data_train$y))

[1] 1.054651

Code

(sd_model_OLS_RMSE_test <-
   RMSE(predict(sd_model_OLS,
                simulated_data_test),
        simulated_data_test$y))

[1] 0.9763157

Code

# precision error
(sd_model_OLS_beta_RMSE <-
    RMSE(sd_model_OLS_coef, simulated_data_coef))

[1] 0.4175516

Code

par(mfrow = c(2,2))
plot(sd_model_OLS)

2.1.2.2 Ridge

Code

sd_model_ridge <- 
  glmnet::cv.glmnet(
    x = as.matrix(simulated_data_train[,-ncol(simulated_data_train)]),
    y = simulated_data_train[,ncol(simulated_data_train)] ,
    alpha = 0,
    lambda = 10^seq(-3,3,by=0.1))

Code

plot(sd_model_ridge)

Code

plot(sd_model_ridge$glmnet.fit,
     xvar = "lambda")
abline(v = c(log(sd_model_ridge$lambda.min),
             log(sd_model_ridge$lambda.1se)),
       lty = 3)
abline(h = simulated_data_coef,
       lty = 2)

Code

(sd_model_ridge_coef <-
  t(as.matrix(coef(sd_model_ridge,
                   s = "lambda.min")))) # s = "lambda.1se"

   (Intercept)     X001     X002    X003
s1  -0.0147058 3.536252 6.544174 8.88975

Code

(sd_model_ridge_RMSE_train <-
  RMSE(predict(sd_model_ridge,
               as.matrix(simulated_data_train[,-ncol(simulated_data_train)]),
               s = "lambda.min"),
       simulated_data_train$y))

[1] 1.054715

Code

(sd_model_ridge_RMSE_test <-
  RMSE(predict(sd_model_ridge,
               as.matrix(simulated_data_test[,-ncol(simulated_data_test)]),
               s = "lambda.min"),
       simulated_data_test$y))

[1] 0.9753098

Code

(sd_model_ridge_beta_RMSE <-
  RMSE(sd_model_ridge_coef,
       simulated_data_coef))

[1] 0.3860254

2.1.2.3 GCE

Code

sd_model_GCE <- 
  GCE::gce(data = simulated_data_train,
           y ~ .,
           int.one.prior = "ridge",
           int.one.method = "adaptive",
           int.one.alpha.n = 10,
           model = TRUE,
           ci.B = 10,
           seed = 28102024
      )

Code

summary(sd_model_GCE)


Call:
GCE::gce(formula = y ~ ., data = simulated_data_train, model = TRUE, 
    int.one.prior = "ridge", int.one.method = "adaptive", int.one.alpha.n = 10, 
    ci.B = 10, seed = 28102024)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.03275 -0.83422  0.06768  0.79727  2.57732 

Coefficients:
               Estimate  LB 95% CI UB 95% CI
(Intercept) -0.02405397 -0.1200613 0.1294133
X001         3.43021989  2.3710957 4.8320563
X002         6.34404098  5.0259452 7.7960249
X003         8.56795518  5.8771018 9.6037053

Residual standard error: 1.087 on 66 degrees of freedom
Multiple R-squared:  0.5248,    Adjusted R-squared:  0.5032
Choosen support:  2.597
CV-RMSE:  1.129

Code

sd_model_GCE_coef <- coef(sd_model_GCE)

Code

(sd_model_GCE_RMSE_train <-
  RMSE(fitted(sd_model_GCE),
       simulated_data_train$y))

[1] 1.055852

Code

(sd_model_GCE_RMSE_test <-
    RMSE(as.matrix(cbind(1,
                      simulated_data_test[,-ncol(simulated_data_test)]
                      )) %*% sd_model_GCE_coef,
         simulated_data_test$y))

[1] 0.9741523

Code

(sd_model_GCE_beta_RMSE <-
  RMSE(sd_model_GCE_coef,
                  simulated_data_coef))

[1] 0.350248

Code

par(mfrow = c(2,2))
plot(sd_model_GCE)

2.1.2.4 All

Code

data.frame(rbind(
  cbind(
    "Generating" = simulated_data_coef, 
    OLS = sd_model_OLS_coef,
    ridge = as.numeric(sd_model_ridge_coef),
    GCE = sd_model_GCE_coef
  ),
  beta_RMSE = c(
    0,
    sd_model_OLS_beta_RMSE,
    sd_model_ridge_beta_RMSE,
    sd_model_GCE_beta_RMSE
  ),
  RMSE_train = c(
    0,
    sd_model_OLS_RMSE_train,
    sd_model_ridge_RMSE_train,
    sd_model_GCE_RMSE_train),
  RMSE_test = c(
    0,
    sd_model_OLS_RMSE_test,
    sd_model_ridge_RMSE_test,
    sd_model_GCE_RMSE_test)
))

3 Acknowledgments

This work is supported by the Center for Research and Development in Mathematics and Applications (CIDMA) through the Portuguese Foundation for Science and Technology (FCT–Fundação para a Ciência e a Tecnologia), with references UIDB/04106/2020 https://doi.org/10.54499/UIDB/04106/2020 and UIDP/04106/2020 https://doi.org/10.54499/UIDP/04106/2020.

References

RCoreTeam. 2024. “R: A Language and Environment for Statistical Computing.” https://www.R-project.org/.

--- title: "From ordinary least squares to maximum entropy estimation" subtitle: "Hands-on" title-block-banner: "#70AD47" title-block-banner-color: "white" date: 2024-11-08 author: - name: Jorge Cabral id: jc email: jorgecabral@ua.pt affiliations: - name: Department of Mathematics, University of Aveiro, Aveiro, Portugal id: 1 department: Department of Mathematics city: Aveiro country: Portugal url: https://www.ua.pt/pt/dmat - name: CIDMA, University of Aveiro, Aveiro, Portugal id: 2 department: University of Aveiro city: Aveiro country: Portugal url: https://cidma.ua.pt/ attributes: corresponding: true format: html: toc: true toc_float: yes toc-location: left toc-depth: 6 theme: Sandstone highlight-style: Ayu-dark code-link: true code-fold: show code-tools: true df-print: paged number-sections: true bibliography: references.bib editor: source --- ```{r} #| echo: false options(scipen=999) ``` ![](UniSalento.png) ## Setting up ### R software > "R is a free software environment for statistical computing and graphics. It compiles and runs on a wide variety of UNIX platforms, Windows and MacOS." --- [R software](https://www.r-project.org/) [@RCore2024] Follow [https://cran.r-project.org/](https://cran.r-project.org/) to install it. ![](CRAN.png) ### R Studio > "RStudio is an integrated development environment (IDE) designed to support multiple languages, including both R and Python." --- [RStudio](https://docs.posit.co/ide/user/) Follow [https://posit.co/downloads/](https://posit.co/downloads/) to install it. ![](RStudio.png) ### Packages During the workshop, several R packages will be necessary. Open R Studio and create a new Quarto document. ![](NewFile.png) ![](NewFile2.png) Insert the following code inside the code chunk. ```{r, echo=TRUE,warning=FALSE,message=FALSE} # install.packages("devtools") library(devtools) # install.packages("ggcorrplot") library(ggcorrplot) # install.packages("glmnet") library(glmnet) # devtools::install_github("jorgevazcabral/GCE", # build_vignettes = TRUE, # build_manual = TRUE) library(GCE) ``` [https://github.com/jorgevazcabral](https://github.com/jorgevazcabral) ![](Github.png) ### Functions ```{r} RMSE <- function (y_pred, y_true) { RMSE <- sqrt(mean((y_true - y_pred) ^ 2)) return(RMSE) } ``` ## Simulated Data ### Low collinearity #### Generate Data ```{r,echo=TRUE} n = 100 # Number of individuals intercept.beta = 0 # Intercept y.gen.cont.beta = c(3, 6, 9) # Coefficients used for generating y cont.k = 0 # Number of noisy continuous variables condnumber = 1 # Condition number of the X matrix simulated_data_coef <- c(intercept.beta, rep(0, cont.k), y.gen.cont.beta) simulated_data <- gdata_GCE( n = n, cont.k = cont.k, y.gen.cont.k = length(y.gen.cont.beta), y.gen.cont.beta = y.gen.cont.beta, intercept.beta = intercept.beta, Xgenerator.method = "svd", condnumber = condnumber, Xgenerator.seed = 28102024, error.dist = "normal", error.dist.mean = 0, error.dist.sd = 1, dataframe = TRUE) prop_train = 0.7 set.seed(28102024) ind <- sample(1:n, floor(prop_train * n)) simulated_data_train <- simulated_data[ind,] simulated_data_test <- simulated_data[-ind,] ``` ##### Summary ```{r,echo=TRUE} summary(simulated_data_train) ``` ##### Correlation matrix ::: {.panel-tabset .nav-pills} ###### All ```{r,echo=TRUE} ggcorrplot::ggcorrplot( cor(simulated_data), lab = TRUE, title = paste("Cond =", round(base::kappa( simulated_data[,-ncol(simulated_data)], exact = TRUE), 0))) ``` ###### Train ```{r,echo=TRUE} ggcorrplot::ggcorrplot( cor(simulated_data_train), lab = TRUE, title = paste("Cond =", round(base::kappa( simulated_data_train[,-ncol(simulated_data_train)], exact = TRUE), 0))) ``` ###### Test ```{r,echo=TRUE} ggcorrplot::ggcorrplot( cor(simulated_data_test), lab = TRUE, title = paste("Cond =", round(base::kappa( simulated_data_test[,-ncol(simulated_data_test)], exact = TRUE), 0))) ``` ::: #### Estimation ##### OLS ```{r,echo=TRUE} sd_model_OLS <- lm(data = simulated_data_train, y ~ . # or y ~ X001 + X002 + X003 ) ``` ```{r,echo=TRUE} # summary of the linear model summary(sd_model_OLS) # coefficients of the linear model sd_model_OLS_coef <- coef(sd_model_OLS) ``` ```{r,echo=TRUE} # prediction error (sd_model_OLS_RMSE_train <- RMSE(fitted(sd_model_OLS), simulated_data_train$y)) (sd_model_OLS_RMSE_test <- RMSE(predict(sd_model_OLS, simulated_data_test), simulated_data_test$y)) # precision error (sd_model_OLS_beta_RMSE <- RMSE(sd_model_OLS_coef, simulated_data_coef)) ``` ```{r,echo=TRUE} #| fig.width: 9 #| fig.height: 9 par(mfrow = c(2,2)) plot(sd_model_OLS) ``` ##### Ridge ```{r,echo=TRUE} sd_model_ridge <- glmnet::cv.glmnet( x = as.matrix(simulated_data_train[,-ncol(simulated_data_train)]), y = simulated_data_train[,ncol(simulated_data_train)] , alpha = 0, lambda = 10^seq(-3,3,by=0.1)) ``` ```{r,echo=TRUE} plot(sd_model_ridge) ``` ```{r,echo=TRUE} plot(sd_model_ridge$glmnet.fit, xvar = "lambda") abline(v = c(log(sd_model_ridge$lambda.min), log(sd_model_ridge$lambda.1se)), lty = 3) abline(h = simulated_data_coef, lty = 2) ``` ```{r,echo=TRUE} (sd_model_ridge_coef <- t(as.matrix(coef(sd_model_ridge, s = "lambda.min")))) # s = "lambda.1se" ``` ```{r,echo=TRUE} (sd_model_ridge_RMSE_train <- RMSE(predict(sd_model_ridge, as.matrix(simulated_data_train[,-ncol(simulated_data_train)]), s = "lambda.min"), simulated_data_train$y)) (sd_model_ridge_RMSE_test <- RMSE(predict(sd_model_ridge, as.matrix(simulated_data_test[,-ncol(simulated_data_test)]), s = "lambda.min"), simulated_data_test$y)) (sd_model_ridge_beta_RMSE <- RMSE(sd_model_ridge_coef, simulated_data_coef)) ``` ##### GCE ```{r,echo=TRUE} sd_model_GCE <- GCE::gce(data = simulated_data_train, y ~ ., int.one.prior = "ridge", int.one.method = "adaptive", int.one.alpha.n = 10, model = TRUE, ci.B = 10, seed = 28102024 ) ``` ```{r,echo=TRUE} summary(sd_model_GCE) sd_model_GCE_coef <- coef(sd_model_GCE) ``` ```{r,echo=TRUE} (sd_model_GCE_RMSE_train <- RMSE(fitted(sd_model_GCE), simulated_data_train$y)) (sd_model_GCE_RMSE_test <- RMSE(as.matrix(cbind(1, simulated_data_test[,-ncol(simulated_data_test)] )) %*% sd_model_GCE_coef, simulated_data_test$y)) (sd_model_GCE_beta_RMSE <- RMSE(sd_model_GCE_coef, simulated_data_coef)) ``` ```{r,echo=TRUE} #| fig.width: 9 #| fig.height: 9 par(mfrow = c(2,2)) plot(sd_model_GCE) ``` ##### All ```{r,echo=TRUE} data.frame(rbind( cbind( "Generating" = simulated_data_coef, OLS = sd_model_OLS_coef, ridge = as.numeric(sd_model_ridge_coef), GCE = sd_model_GCE_coef ), beta_RMSE = c( 0, sd_model_OLS_beta_RMSE, sd_model_ridge_beta_RMSE, sd_model_GCE_beta_RMSE ), RMSE_train = c( 0, sd_model_OLS_RMSE_train, sd_model_ridge_RMSE_train, sd_model_GCE_RMSE_train), RMSE_test = c( 0, sd_model_OLS_RMSE_test, sd_model_ridge_RMSE_test, sd_model_GCE_RMSE_test) )) ``` ## Acknowledgments This work is supported by the Center for Research and Development in Mathematics and Applications (CIDMA) through the Portuguese Foundation for Science and Technology (FCT–Fundação para a Ciência e a Tecnologia), with references UIDB/04106/2020 [https://doi.org/10.54499/UIDB/04106/2020](https://doi.org/10.54499/UIDB/04106/2020) and UIDP/04106/2020 [https://doi.org/10.54499/UIDP/04106/2020](https://doi.org/10.54499/UIDP/04106/2020). ![](Acknow.png)