1 Setting up
1.1 R software
“R is a free software environment for statistical computing and graphics. It compiles and runs on a wide variety of UNIX platforms, Windows and MacOS.” — R software (RCoreTeam 2024)
Follow https://cran.r-project.org/ to install it.
1.2 R Studio
“RStudio is an integrated development environment (IDE) designed to support multiple languages, including both R and Python.” — RStudio
Follow https://posit.co/downloads/ to install it.
1.3 Packages
During the workshop, several R packages will be necessary. Open R Studio and create a new Quarto document.
Insert the following code inside the code chunk.
1.4 Functions
2 Simulated Data
2.1 Low collinearity
2.1.1 Generate Data
Code
n = 100 # Number of individuals
intercept.beta = 0 # Intercept
y.gen.cont.beta = c(3, 6, 9) # Coefficients used for generating y
cont.k = 0 # Number of noisy continuous variables
condnumber = 1 # Condition number of the X matrix
simulated_data_coef <-
c(intercept.beta,
rep(0, cont.k),
y.gen.cont.beta)
simulated_data <-
gdata_GCE(
n = n,
cont.k = cont.k,
y.gen.cont.k = length(y.gen.cont.beta),
y.gen.cont.beta = y.gen.cont.beta,
intercept.beta = intercept.beta,
Xgenerator.method = "svd",
condnumber = condnumber,
Xgenerator.seed = 28102024,
error.dist = "normal",
error.dist.mean = 0,
error.dist.sd = 1,
dataframe = TRUE)
prop_train = 0.7
set.seed(28102024)
ind <- sample(1:n,
floor(prop_train * n))
simulated_data_train <- simulated_data[ind,]
simulated_data_test <- simulated_data[-ind,]2.1.1.1 Summary
Code
summary(simulated_data_train) X001 X002 X003 y
Min. :-0.3451441 Min. :-0.224542 Min. :-0.26927 Min. :-4.4779
1st Qu.:-0.0657684 1st Qu.:-0.074109 1st Qu.:-0.08039 1st Qu.:-1.0391
Median : 0.0023662 Median :-0.011115 Median :-0.02908 Median :-0.1823
Mean :-0.0008795 Mean :-0.008867 Mean :-0.02325 Mean :-0.2825
3rd Qu.: 0.0704543 3rd Qu.: 0.053042 3rd Qu.: 0.02526 3rd Qu.: 0.9982
Max. : 0.2429219 Max. : 0.207346 Max. : 0.24159 Max. : 4.2282
2.1.1.2 Correlation matrix
2.1.2 Estimation
2.1.2.1 OLS
Code
sd_model_OLS <-
lm(data = simulated_data_train,
y ~ . # or y ~ X001 + X002 + X003
)Code
# summary of the linear model
summary(sd_model_OLS)
Call:
lm(formula = y ~ ., data = simulated_data_train)
Residuals:
Min 1Q Median 3Q Max
-1.95982 -0.88171 0.01528 0.80312 2.49201
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.01196 0.13407 -0.089 0.92916
X001 3.56848 1.25199 2.850 0.00582 **
X002 6.61132 1.26903 5.210 0.00000202846 ***
X003 8.98084 1.34498 6.677 0.00000000605 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.086 on 66 degrees of freedom
Multiple R-squared: 0.5474, Adjusted R-squared: 0.5269
F-statistic: 26.61 on 3 and 66 DF, p-value: 0.00000000002132
Code
# coefficients of the linear model
sd_model_OLS_coef <- coef(sd_model_OLS)Code
# prediction error
(sd_model_OLS_RMSE_train <-
RMSE(fitted(sd_model_OLS), simulated_data_train$y))[1] 1.054651
Code
(sd_model_OLS_RMSE_test <-
RMSE(predict(sd_model_OLS,
simulated_data_test),
simulated_data_test$y))[1] 0.9763157
Code
# precision error
(sd_model_OLS_beta_RMSE <-
RMSE(sd_model_OLS_coef, simulated_data_coef))[1] 0.4175516
2.1.2.2 Ridge
Code
plot(sd_model_ridge)Code
Code
(Intercept) X001 X002 X003
s1 -0.0147058 3.536252 6.544174 8.88975
Code
[1] 1.054715
Code
[1] 0.9753098
Code
(sd_model_ridge_beta_RMSE <-
RMSE(sd_model_ridge_coef,
simulated_data_coef))[1] 0.3860254
2.1.2.3 GCE
Code
sd_model_GCE <-
GCE::gce(data = simulated_data_train,
y ~ .,
int.one.prior = "ridge",
int.one.method = "adaptive",
int.one.alpha.n = 10,
model = TRUE,
ci.B = 10,
seed = 28102024
)Code
summary(sd_model_GCE)
Call:
GCE::gce(formula = y ~ ., data = simulated_data_train, model = TRUE,
int.one.prior = "ridge", int.one.method = "adaptive", int.one.alpha.n = 10,
ci.B = 10, seed = 28102024)
Residuals:
Min 1Q Median 3Q Max
-2.03275 -0.83422 0.06768 0.79727 2.57732
Coefficients:
Estimate LB 95% CI UB 95% CI
(Intercept) -0.02405397 -0.1200613 0.1294133
X001 3.43021989 2.3710957 4.8320563
X002 6.34404098 5.0259452 7.7960249
X003 8.56795518 5.8771018 9.6037053
Residual standard error: 1.087 on 66 degrees of freedom
Multiple R-squared: 0.5248, Adjusted R-squared: 0.5032
Choosen support: 2.597
CV-RMSE: 1.129
Code
sd_model_GCE_coef <- coef(sd_model_GCE) Code
(sd_model_GCE_RMSE_train <-
RMSE(fitted(sd_model_GCE),
simulated_data_train$y))[1] 1.055852
Code
[1] 0.9741523
Code
(sd_model_GCE_beta_RMSE <-
RMSE(sd_model_GCE_coef,
simulated_data_coef))[1] 0.350248
2.1.2.4 All
Code
data.frame(rbind(
cbind(
"Generating" = simulated_data_coef,
OLS = sd_model_OLS_coef,
ridge = as.numeric(sd_model_ridge_coef),
GCE = sd_model_GCE_coef
),
beta_RMSE = c(
0,
sd_model_OLS_beta_RMSE,
sd_model_ridge_beta_RMSE,
sd_model_GCE_beta_RMSE
),
RMSE_train = c(
0,
sd_model_OLS_RMSE_train,
sd_model_ridge_RMSE_train,
sd_model_GCE_RMSE_train),
RMSE_test = c(
0,
sd_model_OLS_RMSE_test,
sd_model_ridge_RMSE_test,
sd_model_GCE_RMSE_test)
))3 Acknowledgments
This work is supported by the Center for Research and Development in Mathematics and Applications (CIDMA) through the Portuguese Foundation for Science and Technology (FCT–Fundação para a Ciência e a Tecnologia), with references UIDB/04106/2020 https://doi.org/10.54499/UIDB/04106/2020 and UIDP/04106/2020 https://doi.org/10.54499/UIDP/04106/2020.