library(rstan)
#options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)
library(shinystan)
Prepare data for a multiple linear regression with two independent, significant predictors.
# generate data
set.seed(03182021)
Ntotal <- 500
x <- cbind(rnorm(Ntotal, mean = 20, sd = 4),
rnorm(Ntotal, mean=10, sd = 6))
Nx <- ncol(x)
y <- 4 + 1.1*x[,1] + 3*x[,2] + rnorm(Ntotal, mean = 0, sd = 1)
dataListRegression<-list(Ntotal=Ntotal, y=y, x=as.matrix(x), Nx=Nx)
summary(lm(y~x))
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.83480 -0.63696 0.00131 0.69896 2.48803
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.565419 0.230456 19.81 <2e-16 ***
## x1 1.074206 0.010945 98.14 <2e-16 ***
## x2 2.995385 0.007954 376.57 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9759 on 497 degrees of freedom
## Multiple R-squared: 0.9969, Adjusted R-squared: 0.9968
## F-statistic: 7.895e+04 on 2 and 497 DF, p-value: < 2.2e-16
The diagram of the model shows a hierarchical structure with normal priors for the intercept β0 and the slopes βi, i = 1, 2.
Description of the model corresponding to the diagram:
\[
\begin{aligned}
y_i &= \beta_0 + \beta_1 x_{i,1} + \beta_2 x_{i,2} + \epsilon_i \\
y_i &\sim t(\mu_i, \sigma, \nu) \\
\mu_i &= \beta_0 + \beta_1 x_{i,1} + \beta_2 x_{i,2} \\
\beta_0 &\sim \mathcal{N}(M_0, S_0), \quad \beta_j \sim \mathcal{N}(M_j, S_j) \\
\sigma &\sim \mathrm{Unif}(L, H) \\
\nu &= \nu' + 1, \quad \nu' \sim \exp(\lambda)
\end{aligned}
\]
modelString<-"
data {
int<lower=1> Ntotal;
int<lower=1> Nx;
vector[Ntotal] y;
matrix[Ntotal, Nx] x;
}
transformed data {
real meanY;
real sdY;
vector[Ntotal] zy; // normalized
vector[Nx] meanX;
vector[Nx] sdX;
matrix[Ntotal, Nx] zx; // normalized
meanY = mean(y);
sdY = sd(y);
zy = (y - meanY) / sdY;
for ( j in 1:Nx ) {
meanX[j] = mean(x[,j]);
sdX[j] = sd(x[,j]);
for ( i in 1:Ntotal ) {
zx[i,j] = ( x[i,j] - meanX[j] ) / sdX[j];
}
}
}
parameters {
real zbeta0;
vector[Nx] zbeta;
real<lower=0> nu;
real<lower=0> zsigma;
}
transformed parameters{
vector[Ntotal] zy_hat;
zy_hat = zbeta0 + zx * zbeta;
}
model {
zbeta0 ~ normal(0, 2);
zbeta ~ normal(0, 2);
nu ~ exponential(1/30.0);
zsigma ~ uniform(1.0E-5 , 1.0E+1);
zy ~ student_t(1+nu, zy_hat, zsigma);
}
generated quantities {
// Transform to original scale:
real beta0;
vector[Nx] beta;
real sigma;
// .* and ./ are element-wise product and divide
beta0 = zbeta0*sdY + meanY - sdY * sum( zbeta .* meanX ./ sdX );
beta = sdY * ( zbeta ./ sdX );
sigma = zsigma * sdY;
} "
RobustMultipleRegressionDso <- stan_model(model_code=modelString)
If a previously saved DSO exists, load it instead of recompiling, then run the chains.
# optional chunk: save the compiled DSO so it can be reloaded later without recompiling
saveRDS(RobustMultipleRegressionDso, file="DSORobustMultRegr.Rds")
readRDS("DSORobustMultRegr.Rds")
## S4 class stanmodel '88098712ac251c9c72509b460fa2b903' coded as follows:
##
## data {
## int<lower=1> Ntotal;
## int<lower=1> Nx;
## vector[Ntotal] y;
## matrix[Ntotal, Nx] x;
## }
## transformed data {
## real meanY;
## real sdY;
## vector[Ntotal] zy; // normalized
## vector[Nx] meanX;
## vector[Nx] sdX;
## matrix[Ntotal, Nx] zx; // normalized
##
## meanY = mean(y);
## sdY = sd(y);
## zy = (y - meanY) / sdY;
## for ( j in 1:Nx ) {
## meanX[j] = mean(x[,j]);
## sdX[j] = sd(x[,j]);
## for ( i in 1:Ntotal ) {
## zx[i,j] = ( x[i,j] - meanX[j] ) / sdX[j];
## }
## }
## }
## parameters {
## real zbeta0;
## vector[Nx] zbeta;
## real<lower=0> nu;
## real<lower=0> zsigma;
## }
## transformed parameters{
## vector[Ntotal] zy_hat;
## zy_hat = zbeta0 + zx * zbeta;
## }
## model {
## zbeta0 ~ normal(0, 2);
## zbeta ~ normal(0, 2);
## nu ~ exponential(1/30.0);
## zsigma ~ uniform(1.0E-5 , 1.0E+1);
## zy ~ student_t(1+nu, zy_hat, zsigma);
## }
## generated quantities {
## // Transform to original scale:
## real beta0;
## vector[Nx] beta;
## real sigma;
## // .* and ./ are element-wise product and divide
## beta0 = zbeta0*sdY + meanY - sdY * sum( zbeta .* meanX ./ sdX );
## beta = sdY * ( zbeta ./ sdX );
## sigma = zsigma * sdY;
## }
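A common reuse pattern (a sketch, using the file name DSORobustMultRegr.Rds from the chunk above) is to compile only when no saved DSO is found, and otherwise load it, assigning the result so it can be passed to sampling():
# reuse the saved DSO when available; recompile and save it otherwise
if (file.exists("DSORobustMultRegr.Rds")) {
  RobustMultipleRegressionDso <- readRDS("DSORobustMultRegr.Rds")
} else {
  RobustMultipleRegressionDso <- stan_model(model_code = modelString)
  saveRDS(RobustMultipleRegressionDso, file = "DSORobustMultRegr.Rds")
}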
Fit the model.
fit1<-sampling(RobustMultipleRegressionDso,
data=dataListRegression,
pars=c('beta0', 'beta', 'nu', 'sigma'),
iter = 5000, chains = 2, cores = 2)
stan_ac(fit1)
stan_trace(fit1)
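The autocorrelation and trace plots are visual checks; a quick numeric check on the same fit1 object (a small sketch) is to verify that split-Rhat is close to 1 and the effective sample sizes are large for all parameters:
fitSummary1 <- summary(fit1)$summary
max(fitSummary1[, "Rhat"])    # should be close to 1
min(fitSummary1[, "n_eff"])   # should be a reasonably large fraction of the total draws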
Look at the results.
summary(fit1)$summary[,c(1,3,4,5,8,9)] # mean, sd, 2.5%, 25%, 97.5%, n_eff
## mean sd 2.5% 25% 97.5% n_eff
## beta0 4.5780310 0.231357219 4.1304172 4.422841 5.025614 7033.830
## beta[1] 1.0738282 0.011006048 1.0521979 1.066428 1.095054 6561.254
## beta[2] 2.9951527 0.008047017 2.9797451 2.989759 3.011046 6871.816
## nu 61.7987150 36.009712586 17.0095629 36.082778 151.390448 4557.868
## sigma 0.9611543 0.032896641 0.8978996 0.938909 1.027262 4328.823
## lp__ 1014.2033841 1.585209650 1010.1168812 1013.426674 1016.293483 2600.401
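The same intervals can also be computed directly from the posterior draws; a minimal sketch using rstan::extract() on fit1:
post1 <- rstan::extract(fit1)                # list with draws of beta0, beta, nu, sigma
quantile(post1$beta[, 1], c(0.025, 0.975))   # 95% interval for the slope of x1
quantile(post1$beta[, 2], c(0.025, 0.975))   # 95% interval for the slope of x2
mean(post1$nu > 30)                          # share of draws with nu > 30, i.e. near-normal tails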
pairs(fit1,pars=c("beta0","beta[1]","beta[2]"))
plot(fit1,pars="nu")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)
plot(fit1,pars="sigma")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)
plot(fit1,pars="beta0")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)
plot(fit1,pars="beta[1]")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)
plot(fit1,pars="beta[2]")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)
Analyze the fitted model using shinystan.
library(shinystan)
launch_shinystan(fit1)
##
## Launching ShinyStan interface... for large models this may take some time.
##
## Listening on http://127.0.0.1:4427
Conclusions: the posterior means of β0, β1, β2 are close to the true values (4, 1.1, 3) used to simulate the data and agree with the lm() estimates; the large posterior mean of ν indicates no noticeable heavy tails in the residuals.

Now fit the same model to a dataset in which one of the predictors is not significant. Read the data.
Regression.Data <- as.matrix(read.csv("DataForRegressionANOVA.csv", header=TRUE, sep=","))
tail(Regression.Data)
## Output Input1 Input2
## [495,] 2.4054442 0.9276934 0.07278244
## [496,] 1.8663026 -0.3678520 1.51715986
## [497,] 1.3590146 0.5369795 0.96209003
## [498,] 3.1836007 1.0171332 -0.56660564
## [499,] 2.3615061 1.1637966 0.07815352
## [500,] 0.8483407 1.1775607 1.59720356
Prepare the data for Stan.
Ntotal <- nrow(Regression.Data)
x <- Regression.Data[,2:3]
tail(x)
## Input1 Input2
## [495,] 0.9276934 0.07278244
## [496,] -0.3678520 1.51715986
## [497,] 0.5369795 0.96209003
## [498,] 1.0171332 -0.56660564
## [499,] 1.1637966 0.07815352
## [500,] 1.1775607 1.59720356
Nx <- ncol(x)
y <- Regression.Data[,1]
dataListInsig <- list(Ntotal=Ntotal, y=y, x=as.matrix(x), Nx=Nx)
Run MCMC using the same DSO.
fit2 <- sampling(RobustMultipleRegressionDso, data=dataListInsig,
pars=c('beta0', 'beta', 'nu', 'sigma'),
iter=5000, chains = 2, cores = 2)
launch_shinystan(fit2)
##
## Launching ShinyStan interface... for large models this may take some time.
##
## Listening on http://127.0.0.1:4427
Analyze the results.
summary(fit2)$summary[,c(1,3,4,8,9)]
## mean sd 2.5% 97.5% n_eff
## beta0 1.218266e+00 0.05299934 1.11711302 1.32336805 5365.650
## beta[1] 8.003810e-01 0.02878846 0.74298668 0.85707012 5857.186
## beta[2] 8.795752e-03 0.02842077 -0.04621791 0.06421956 5832.891
## nu 5.298357e+01 33.85144567 13.14276069 141.24677427 4953.188
## sigma 5.979821e-01 0.02134564 0.55753809 0.63995232 5078.542
## lp__ -1.818615e+02 1.61483483 -185.83994952 -179.70767244 2374.855
pairs(fit2,pars=c("beta0","beta[1]","beta[2]"))
plot(fit2,pars="nu")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)
plot(fit2,pars="sigma")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)
plot(fit2,pars="beta0")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)
plot(fit2,pars="beta[1]")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)
plot(fit2,pars="beta[2]")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)
We see that parameter β2 is not significant.
However, there is no strong correlation or redundancy between the predictors.
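Both observations can be checked numerically; a sketch using the posterior draws from fit2 and the raw predictor columns:
post2 <- rstan::extract(fit2)
quantile(post2$beta[, 2], c(0.025, 0.975))                      # 95% posterior interval for beta[2]; it covers zero
cor(Regression.Data[, "Input1"], Regression.Data[, "Input2"])   # correlation between the predictors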
Compare with the output of the linear model:
pairs(Regression.Data)
summary(lm(Output~.,data=as.data.frame(Regression.Data)))
##
## Call:
## lm(formula = Output ~ ., data = as.data.frame(Regression.Data))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.76631 -0.39358 -0.01411 0.40432 1.91861
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.21480 0.05179 23.458 <2e-16 ***
## Input1 0.80116 0.02819 28.423 <2e-16 ***
## Input2 0.00970 0.02787 0.348 0.728
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6107 on 497 degrees of freedom
## Multiple R-squared: 0.6204, Adjusted R-squared: 0.6188
## F-statistic: 406.1 on 2 and 497 DF, p-value: < 2.2e-16
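As a final comparison (a sketch relying only on objects created above), place the posterior means from the robust Stan fit next to the least-squares estimates:
# posterior means vs. lm() coefficients for the second dataset
cbind(stan = summary(fit2)$summary[c("beta0", "beta[1]", "beta[2]"), "mean"],
      lm   = coef(lm(Output ~ ., data = as.data.frame(Regression.Data))))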