Adrian Noberto Marino
Claudio Collado
We fix the random seed so that the experiments can be reproduced, and set the number of CPU cores to use:
set.seed(42)
options(mc.cores = 24)

We import the libraries used throughout the notebook:
# ------------------------------------------------------------------
# Instalar la herramienta pacman
# ------------------------------------------------------------------
# install.packages('pacman') # <----
# ------------------------------------------------------------------
#
#
#
# ------------------------------------------------------------------
# Requerido si se corre rstudio-server con docker
# ------------------------------------------------------------------
# install.packages(
# "https://cran.r-project.org/src/contrib/rstan_2.21.2.tar.gz",
# repos = NULL,
# type="source"
# )
#
# sudo apt-get install libglpk-dev
# ------------------------------------------------------------------

library(pacman)
p_load(tidyverse, tidymodels, rsample, rstan, shinystan, rstanarm, devtools, MASS) # MASS provides rlm()
source('../src/dataset.R')
source('../src/plot.R')
source('../src/model.R')
rstan_options(javascript = FALSE)

Palmer Penguins
dataset <- load_dataset() %>% mutate_if(is.character, as.factor)
dataset %>% glimpse()

## Rows: 344
## Columns: 8
## $ species <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel~
## $ island <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse~
## $ bill_length_mm <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, ~
## $ bill_depth_mm <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, ~
## $ flipper_length_mm <dbl> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186~
## $ body_mass_g <dbl> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, ~
## $ sex <fct> male, female, female, NA, female, male, female, male~
## $ year <dbl> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007~
Each variable in the dataset is listed and briefly described below:

Numeric variables: bill_length_mm (bill length, mm), bill_depth_mm (bill depth, mm), flipper_length_mm (flipper length, mm), body_mass_g (body mass, g) and year (year of observation).

Categorical variables: species (Adelie, Chinstrap, Gentoo), island (Biscoe, Dream, Torgersen) and sex (female, male).
missings_summary(dataset)

hist_plots(dataset)

Observations
library(patchwork)
p1<-ggplot(dataset, aes(y=bill_depth_mm)) +
geom_boxplot(fill="coral") + ggtitle("bill_depth") + theme(plot.title = element_text(hjust = 0.5))
p2<-ggplot(dataset, aes(y=bill_length_mm)) +
geom_boxplot(fill="dodgerblue") + ggtitle("bill_length") + theme(plot.title = element_text(hjust = 0.5))
p3<-ggplot(dataset, aes(y=flipper_length_mm)) +
geom_boxplot(fill="seagreen3")+ ggtitle("flipper_length") + theme(plot.title = element_text(hjust = 0.5))
p4<-ggplot(dataset, aes(y=body_mass_g)) +
geom_boxplot(fill = "pink1") + ggtitle("body_mass") + theme(plot.title = element_text(hjust = 0.5))
p1 + p2 + p3 + p4

Observations
No values more extreme than the minimum and maximum are recorded for any variable; in other words, we find no outliers.
outliers(dataset, column='bill_length_mm')

## $inf
## [1] 32.1
##
## $sup
## [1] 59.6
outliers(dataset, column='bill_depth_mm')

## $inf
## [1] 13.1
##
## $sup
## [1] 21.5
outliers(dataset, column='flipper_length_mm')

## $inf
## [1] 172
##
## $sup
## [1] 231
outliers(dataset, column='body_mass_g')

## $inf
## [1] 2700
##
## $sup
## [1] 6300
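The outliers() helper comes from the sourced scripts; assuming it reports the boxplot whisker limits (the most extreme observations within 1.5 × IQR of the quartiles), a minimal sketch of the computation:

# Sketch (assumption): Tukey-style whisker limits; values beyond them
# would be outliers. Here the whiskers reach the observed min/max.
whisker_limits <- function(x) {
  x <- x[!is.na(x)]
  q <- quantile(x, c(0.25, 0.75))
  iqr <- q[2] - q[1]
  list(
    inf = min(x[x >= q[1] - 1.5 * iqr]),
    sup = max(x[x <= q[2] + 1.5 * iqr])
  )
}

whisker_limits(dataset$body_mass_g) # ~ $inf 2700, $sup 6300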
bar_plots(dataset)

Observations
dataset <- dataset %>% drop_na()
missings_summary(dataset)

## Warning: attributes are not identical across measure variables;
## they will be dropped
corr_plot(dataset %>% dplyr::select(-year))

segmented_pairs_plot(dataset, segment_column='species')

train_test <- train_test_split(dataset, train_size = 0.7, shuffle = TRUE)

## [1] "Train set size: 233"
## [1] "Test set size: 100"
train_set <- train_test[[1]]
test_set <- train_test[[2]]

lineal_model_1 <- lm(
body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm,
data = train_set
)
summary(lineal_model_1)

##
## Call:
## lm(formula = body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm,
## data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -868.52 -260.19 -17.83 239.28 1269.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6074.063 654.637 -9.279 <2e-16 ***
## bill_length_mm 3.530 6.297 0.561 0.576
## bill_depth_mm 12.534 16.232 0.772 0.441
## flipper_length_mm 49.318 2.874 17.157 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 382.4 on 229 degrees of freedom
## Multiple R-squared: 0.7755, Adjusted R-squared: 0.7725
## F-statistic: 263.6 on 3 and 229 DF, p-value: < 2.2e-16
We drop the coefficients that are not significant for explaining body_mass_g:
lineal_model_1 <- lm(body_mass_g ~ flipper_length_mm, data = train_set)
summary(lineal_model_1)

##
## Call:
## lm(formula = body_mass_g ~ flipper_length_mm, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -857.93 -257.17 -17.05 232.95 1278.39
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5688.500 353.398 -16.10 <2e-16 ***
## flipper_length_mm 49.240 1.749 28.16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 381.7 on 231 degrees of freedom
## Multiple R-squared: 0.7743, Adjusted R-squared: 0.7734
## F-statistic: 792.7 on 1 and 231 DF, p-value: < 2.2e-16
We now fit the same simple regression as a Bayesian model with rstan, placing weakly informative priors on the coefficients:

bayesion_model_1 <- stan(
model_code = "
data {
int<lower=1> obs_count;
vector<lower=1>[obs_count] x;
vector[obs_count] y;
}
parameters {
real beta0;
real beta1;
real<lower=0> sigma;
}
model {
// Prior distributions (chosen by the researcher)
beta0 ~ normal(0, 8000); // Intercept
beta1 ~ normal(0, 100); // flipper_length_mm
sigma ~ exponential(0.1); // Standard deviation
// Likelihood
y ~ normal(beta0 + beta1 * x, sigma);
}
",
data = list(
obs_count = nrow(train_set),
x = colvalues(train_set, 'flipper_length_mm'),
y = colvalues(train_set, 'body_mass_g')
),
chains = 3,
iter = 300,
warmup = 180,
thin = 1
)
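With only 300 iterations (120 post-warmup) per chain, it is worth checking the usual convergence diagnostics; rstan's print() reports Rhat and n_eff alongside the posterior quantiles:

# Rhat close to 1 and a healthy n_eff suggest the chains mixed well
print(bayesion_model_1, pars = c('beta0', 'beta1', 'sigma'),
      probs = c(0.025, 0.5, 0.975))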
params_1 <- c('beta0', 'beta1', 'sigma')
traceplot(bayesion_model_1, pars = params_1, inc_warmup = TRUE)

lm_vs_br_coeficients(lineal_model_1, bayesion_model_1, params_1)

vars_1 <- c('flipper_length_mm')
lm_vs_br_models_validation(lineal_model_1, bayesion_model_1, params_1, vars_1, test_set)

bayesion_predictor_1 <- BayesianRegressionPredictor.from(bayesion_model_1, params_1, vars_1)
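BayesianRegressionPredictor.from is defined in ../src/model.R; assuming it predicts with the posterior means of the coefficients, the idea is roughly:

# Sketch (assumption): predictions from posterior-mean coefficients
post_means <- get_posterior_mean(bayesion_model_1, pars = c('beta0', 'beta1'))
b <- post_means[, ncol(post_means)] # the "all chains" column
pred <- b[1] + b[2] * test_set$flipper_length_mm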
plot_compare_fit(
lineal_model_1,
bayesion_predictor_1,
train_set,
label_1 = 'Linear Regression',
label_2 = 'Bayesian Regression'
)

lineal_model_2 <- lm(
body_mass_g
~ bill_length_mm
+ bill_depth_mm
+ flipper_length_mm
+ sex,
data = train_set
)
summary(lineal_model_2)

##
## Call:
## lm(formula = body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm +
## sex, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -755.02 -255.89 -6.03 221.97 989.38
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1922.6152 729.5654 -2.635 0.00898 **
## bill_length_mm -0.5331 5.4439 -0.098 0.92208
## bill_depth_mm -92.9217 18.2692 -5.086 7.62e-07 ***
## flipper_length_mm 37.2016 2.8209 13.188 < 2e-16 ***
## sexmale 534.1583 59.5409 8.971 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 329.5 on 228 degrees of freedom
## Multiple R-squared: 0.834, Adjusted R-squared: 0.8311
## F-statistic: 286.5 on 4 and 228 DF, p-value: < 2.2e-16
Again we drop the coefficients that are not significant for explaining body_mass_g:
lineal_model_2 <- lm(
body_mass_g
~ bill_depth_mm
+ flipper_length_mm
+ sex,
data = train_set
)
summary(lineal_model_2)

##
## Call:
## lm(formula = body_mass_g ~ bill_depth_mm + flipper_length_mm +
## sex, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -758.30 -258.55 -5.04 223.02 989.59
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1912.697 720.936 -2.653 0.00853 **
## bill_depth_mm -93.106 18.133 -5.135 6.04e-07 ***
## flipper_length_mm 37.053 2.373 15.617 < 2e-16 ***
## sexmale 533.673 59.206 9.014 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 328.7 on 229 degrees of freedom
## Multiple R-squared: 0.834, Adjusted R-squared: 0.8319
## F-statistic: 383.6 on 3 and 229 DF, p-value: < 2.2e-16
We build a matrix with all the predictor variables (X) plus the intercept:
to_model_input <- function(df) {
model.matrix(
body_mass_g
~ bill_depth_mm
+ flipper_length_mm
+ sex,
data = df
)
}
train_X <- train_set %>% to_model_input()
test_X <- test_set %>% to_model_input()
head(train_X)

## (Intercept) bill_depth_mm flipper_length_mm sexmale
## 1 1 18.8 197 1
## 2 1 14.6 211 0
## 3 1 19.1 195 1
## 4 1 15.9 224 1
## 5 1 18.5 201 1
## 6 1 18.3 195 1
We define and run the Bayesian model:
bayesion_model_2 <- stan(
model_code = "
data {
int<lower=1> obs_count;
int<lower=1> coef_count;
matrix[obs_count,coef_count] X;
vector[obs_count] y;
}
parameters {
vector[coef_count] beta;
real<lower=0> sigma;
}
model {
// Prior distributions
beta[1] ~ normal(0, 3000); // Intercept (absorbs the sex = female baseline)
beta[2] ~ normal(0, 200); // bill_depth_mm
beta[3] ~ normal(0, 200); // flipper_length_mm
beta[4] ~ normal(0, 600); // sexmale
sigma ~ exponential(0.1); // Standard deviation
// Likelihood
y ~ normal(X * beta, sigma);
}
",
data = list(
obs_count = dim(train_X)[1],
coef_count = dim(train_X)[2],
y = colvalues(train_set, 'body_mass_g'),
X = train_X
),
chains = 3,
iter = 4000,
warmup = 500,
thin = 1
)
params_2 <- c('beta[1]', 'beta[2]', 'beta[3]', 'beta[4]', 'sigma')
traceplot(bayesion_model_2, inc_warmup = TRUE, pars = params_2)

lineal_model_2$coefficients

## (Intercept) bill_depth_mm flipper_length_mm sexmale
## -1912.69656 -93.10594 37.05300 533.67323
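The Bayesian counterparts are summarized with br_coeficients below; they can also be pulled straight from the posterior draws with rstan::extract, a quick sketch:

# Posterior draws as an iterations x coefficients matrix
draws <- rstan::extract(bayesion_model_2, pars = 'beta')$beta
colMeans(draws) # should land close to lineal_model_2$coefficients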
br_coeficients(bayesion_model_2, params_2)

vars_2 <- c('bill_depth_mm', 'flipper_length_mm', 'sexmale')
lm_vs_br_models_validation(
lineal_model_2,
bayesion_model_2,
params_2,
vars_2,
test_set,
as.data.frame(test_X)
)

bayesion_predictor_2 <- BayesianRegressionPredictor.from(bayesion_model_2, params_2, vars_2)
plot_compare_fit(
lineal_model_2,
bayesion_predictor_2,
test_set,
as.data.frame(test_X),
label_1 = 'Linear Regression',
label_2 = 'Bayesian Regression'
)

Next we inject outliers into flipper_length_mm, the variable that measures the individual's flipper length in millimeters.
To visualize the new outliers, below we plot flipper_length_mm vs. body_mass_g before and after adding them:
plot_data(train_set)

# Inflate flipper_length_mm for individuals with body mass between 5400 g
# and 5700 g; note runif(1, ...) draws a single multiplier (10% to 20%)
# that is recycled across all selected rows
train_set_with_outliers <- train_set %>%
mutate(flipper_length_mm = ifelse(
body_mass_g > 5400 & body_mass_g < 5700,
flipper_length_mm + (flipper_length_mm * runif(1, 0.1, 0.2)),
flipper_length_mm
))
plot_data(train_set_with_outliers)

lineal_model_3 <- lm(
body_mass_g ~ flipper_length_mm,
data = train_set_with_outliers
)
summary(lineal_model_3)

##
## Call:
## lm(formula = body_mass_g ~ flipper_length_mm, data = train_set_with_outliers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -799.37 -298.36 -5.72 282.48 1161.29
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3656.483 304.802 -12.00 <2e-16 ***
## flipper_length_mm 38.833 1.494 25.99 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 405.5 on 231 degrees of freedom
## Multiple R-squared: 0.7452, Adjusted R-squared: 0.7441
## F-statistic: 675.7 on 1 and 231 DF, p-value: < 2.2e-16
Let us compare the fit of the linear model trained on the train set with and without outliers:
plot_compare_fit(
lineal_model_1,
lineal_model_3,
train_set_with_outliers,
label_1='Linear Regression WITHOUT outliers',
label_2='Linear Regression WITH outliers'
)

The model trained on train_set_with_outliers is clearly leveraged by the atypical observations.
We fit a robust linear regression to try to reduce the effect of the new outliers:
robust_lineal_model_3 <- rlm(
body_mass_g ~ flipper_length_mm,
data = train_set_with_outliers
)

plot_compare_fit(
lineal_model_1,
robust_lineal_model_3,
train_set_with_outliers,
label_1='Linear Regression WITHOUT outliers',
label_2='Robust Linear Regression WITH outliers'
)

Graphically the difference is hard to see, but the robust model ends up fitting better than the classic linear model. This can be observed when comparing RMSE/MAE on train and test.
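As a cross-check, both metrics can be computed directly from the predictions; a minimal sketch (rmse and mae are ad-hoc helpers, not part of the sourced scripts):

# Ad-hoc helpers for a quick sanity check of the reported metrics
rmse <- function(actual, predicted) sqrt(mean((actual - predicted)^2))
mae <- function(actual, predicted) mean(abs(actual - predicted))

for (model in list(classic = lineal_model_3, robust = robust_lineal_model_3)) {
  pred <- predict(model, newdata = test_set)
  print(c(rmse = rmse(test_set$body_mass_g, pred),
          mae = mae(test_set$body_mass_g, pred)))
}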
lm_vs_lm_models_validation(lineal_model_3, robust_lineal_model_3, train_set_with_outliers)

## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length
## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length
## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length
## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length
lm_vs_lm_models_validation(lineal_model_3, robust_lineal_model_3, test_set)

bayesion_model_3 <- stan(
model_code = "
data {
int<lower=1> obs_count;
vector<lower=1>[obs_count] x;
vector[obs_count] y;
}
parameters {
real beta0;
real beta1;
real<lower=0> sigma;
}
model {
// Prior distributions (chosen by the researcher)
beta0 ~ normal(0, 1000); // Intercept
beta1 ~ normal(0, 100); // flipper_length_mm
sigma ~ exponential(0.1); // Standard deviation
// Likelihood
y ~ normal(beta0 + beta1 * x, sigma);
}
",
data = list(
obs_count = nrow(train_set_with_outliers),
x = colvalues(train_set_with_outliers, 'flipper_length_mm'),
y = colvalues(train_set_with_outliers, 'body_mass_g')
),
chains = 3,
iter = 1000,
warmup = 300,
thin = 1
)
params_3 <- c('beta0', 'beta1', 'sigma')
traceplot(bayesion_model_3, pars = params_3, inc_warmup = TRUE)

lm_vs_br_coeficients(robust_lineal_model_3, bayesion_model_3, params_3)

vars_3 <- c('flipper_length_mm')
lm_vs_br_models_validation(robust_lineal_model_3, bayesion_model_3, params_3, vars_3, test_set)

bayesion_predictor_3 <- BayesianRegressionPredictor.from(bayesion_model_3, params_3, vars_3)
plot_compare_fit(
robust_lineal_model_3,
bayesion_predictor_3,
train_set,
label_1='Robust Linear Regression WITH outliers',
label_2='Bayesian Regression WITH outliers'
)

We now fit a Bayesian variant that pairs very informative priors with a Student-t likelihood, whose heavy tails make the fit less sensitive to the injected outliers:

bayesion_model_3.1 <- stan(
model_code = "
data {
int<lower=1> obs_count;
vector<lower=1>[obs_count] x;
vector[obs_count] y;
}
parameters {
real beta0;
real beta1;
real<lower=0> sigma;
}
model {
// Prior distributions (chosen by the researcher)
beta0 ~ normal(-5700, 50); // Intercept (very informative!)
beta1 ~ normal(49, 20); // flipper_length_mm (very informative!)
sigma ~ exponential(0.1); // Standard deviation
// Likelihood
y ~ student_t(2, beta0 + beta1 * x, sigma);
}
",
data = list(
obs_count = nrow(train_set_with_outliers),
x = colvalues(train_set_with_outliers, 'flipper_length_mm'),
y = colvalues(train_set_with_outliers, 'body_mass_g')
),
chains = 3,
iter = 1000,
warmup = 300,
thin = 1
)
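The robustness comes from the Student-t's heavy tails: a point several standard deviations from the mean is far less surprising under a t with 2 degrees of freedom than under a normal, so it pulls the fit less. A quick numeric check:

# Density of a point 5 standard deviations from the mean
dnorm(5) # ~1.5e-06: the normal treats it as nearly impossible
dt(5, df = 2) # ~0.007: the t with 2 df still assigns it real mass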
params_3 <- c('beta0', 'beta1', 'sigma')
traceplot(bayesion_model_3.1, pars = params_3, inc_warmup = TRUE)

lm_vs_br_coeficients(lineal_model_1, bayesion_model_3.1, params_3)

vars_3 <- c('flipper_length_mm')
lm_vs_br_models_validation(lineal_model_1, bayesion_model_3.1, params_3, vars_3, test_set)

bayesion_predictor_3.1 <- BayesianRegressionPredictor.from(bayesion_model_3.1, params_3, vars_3)
plot_compare_fit(
lineal_model_1,
bayesion_predictor_3.1,
train_set_with_outliers,
label_1='Linear Regression WITHOUT outliers',
label_2='Bayesian Regression WITH outliers'
)

In this case we train with only 5% of the data.

train_test <- train_test_split(dataset, train_size = 0.05, shuffle = TRUE)

## [1] "Train set size: 16"
## [1] "Test set size: 317"
train_set_4 <- train_test[[1]]
test_set_4 <- train_test[[2]]

plot_data(train_set_4)

lineal_model_4 <- lm(
body_mass_g ~ flipper_length_mm,
data = train_set_4
)

bayesion_model_4 <- stan(
model_code = "
data {
int<lower=1> obs_count;
vector<lower=1>[obs_count] x;
vector[obs_count] y;
}
parameters {
real beta0;
real beta1;
real<lower=0> sigma;
}
model {
// Prior distributions
beta0 ~ normal(0, 8000); // Intercept
beta1 ~ normal(0, 100); // flipper_length_mm
sigma ~ exponential(0.1); // Standard deviation
// Likelihood
y ~ normal(beta0 + beta1 * x, sigma);
}
",
data = list(
obs_count = nrow(train_set_4),
x = colvalues(train_set_4, 'flipper_length_mm'),
y = colvalues(train_set_4, 'body_mass_g')
),
chains = 3,
iter = 1000,
warmup = 180,
thin = 1
)
params_4 <- c('beta0', 'beta1', 'sigma')
traceplot(bayesion_model_4, pars = params_4, inc_warmup = TRUE)

Coefficients of the linear regression:
lineal_model_4$coefficients

## (Intercept) flipper_length_mm
## -4053.56855 41.16935
Coefficients recovered by the Bayesian regression:
for(param in params_4) print(get_posterior_mean(bayesion_model_4, par=param)[4])

## [1] -3924.577
## [1] 40.50081
## [1] 262.5933
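Beyond posterior means, credible intervals quantify the extra uncertainty that comes from training on only 16 observations; rstan's summary() exposes the posterior quantiles:

# 2.5% and 97.5% posterior quantiles give 95% credible intervals
summary(bayesion_model_4, pars = params_4, probs = c(0.025, 0.975))$summary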
vars_4 <- c('flipper_length_mm')
lm_vs_br_models_validation(lineal_model_4, bayesion_model_4, params_4, vars_4, test_set_4)

bayesion_predictor_4 <- BayesianRegressionPredictor.from(bayesion_model_4, params_4, vars_4)
plot_compare_fit(
lineal_model_4,
bayesion_predictor_4,
train_set_4,
label_1='Linear Regression',
label_2='Bayesian Regression'
)

Same as experiment 1, but proposing two new Bayesian regressions whose priors on the parameters are:

A less informative (uniform) distribution.
A very informative (normal) distribution.

We then compare against the results of the Bayesian model from experiment 1.

We define a uniform distribution for the intercept and for the beta associated with flipper_length_mm:
bayesion_model_5a <- stan(
model_code = "
data {
int<lower=1> obs_count;
vector<lower=1>[obs_count] x;
vector[obs_count] y;
}
parameters {
real beta0;
real beta1;
real<lower=0> sigma;
}
model {
// Prior distributions
// Note: ideally the uniform bounds would also appear as <lower, upper>
// constraints on the parameter declarations; without them the sampler
// can propose values outside the support and reject them.
beta0 ~ uniform(-6000, 2000); // Intercept
beta1 ~ uniform(-50, 50); // flipper_length_mm
sigma ~ exponential(0.5); // Standard deviation
// Likelihood
y ~ normal(beta0 + beta1 * x, sigma);
}
",
data = list(
obs_count = nrow(train_set),
x = colvalues(train_set, 'flipper_length_mm'),
y = colvalues(train_set, 'body_mass_g')
),
chains = 3,
iter = 2000,
warmup = 300,
thin = 1
)
params_5a <- c('beta0', 'beta1', 'sigma')
traceplot(bayesion_model_5a, pars = params_5a, inc_warmup = TRUE)

br_vs_br_coeficients(bayesion_model_1, bayesion_model_5a, params_5a)

vars_5a <- c('flipper_length_mm')
lm_vs_br_models_validation(lineal_model_1, bayesion_model_5a, params_5a, vars_5a, test_set)

bayesion_predictor_5a <- BayesianRegressionPredictor.from(bayesion_model_5a, params_5a, vars_5a)
plot_compare_fit(
bayesion_predictor_1,
bayesion_predictor_5a,
train_set,
label_1='Bayesian Regression with an informative prior',
label_2='Bayesian Regression with a less informative (uniform) prior'
)

For the prior distributions of the parameters we now specify informative normal distributions, taking the mean and standard deviation from the values found in the previous experiments:
bayesion_model_5b <- stan(
model_code = "
data {
int<lower=1> obs_count;
vector<lower=1>[obs_count] x;
vector[obs_count] y;
}
parameters {
real beta0;
real beta1;
real<lower=0> sigma;
}
model {
// Prior distributions
beta0 ~ normal(-5000, 100); // Intercept
beta1 ~ normal(50, 5); // flipper_length_mm
sigma ~ exponential(0.5); // Standard deviation
// Likelihood
y ~ normal(beta0 + beta1 * x, sigma);
}
",
data = list(
obs_count = nrow(train_set),
x = colvalues(train_set, 'flipper_length_mm'),
y = colvalues(train_set, 'body_mass_g')
),
chains = 3,
iter = 2000,
warmup = 300,
thin = 1
)
params_5b <- c('beta0', 'beta1', 'sigma')
traceplot(bayesion_model_5b, pars = params_5b, inc_warmup = TRUE)

br_vs_br_coeficients(bayesion_model_5a, bayesion_model_5b, params_5a)

vars_5b <- c('flipper_length_mm')
lm_vs_br_models_validation(lineal_model_1, bayesion_model_5b, params_5b, vars_5b, test_set)

bayesion_predictor_5a <- BayesianRegressionPredictor.from(bayesion_model_5a, params_5a, vars_5a)
bayesion_predictor_5b <- BayesianRegressionPredictor.from(bayesion_model_5b, params_5b, vars_5b)
plot_compare_fit(
bayesion_predictor_5a,
bayesion_predictor_5b,
train_set,
label_1='Bayesian Regression with a less informative (uniform) prior',
label_2='Bayesian Regression with a very informative prior'
)