
1 Multiple Regression

library(rstan)
#options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)
library(shinystan)

1.1 Example 1: Two significant predictors

Prepare data for multiple linear regression with 2 independent significant predictors.

# generate data
set.seed(03182021)
Ntotal <- 500
x <- cbind(rnorm(Ntotal, mean = 20, sd = 4), 
           rnorm(Ntotal, mean=10, sd = 6))
Nx <- ncol(x)
y <- 4 + 1.1*x[,1] + 3*x[,2] + rnorm(Ntotal, mean = 0, sd = 1)
dataListRegression<-list(Ntotal=Ntotal, y=y, x=as.matrix(x), Nx=Nx)
summary(lm(y~x))
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.83480 -0.63696  0.00131  0.69896  2.48803 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.565419   0.230456   19.81   <2e-16 ***
## x1          1.074206   0.010945   98.14   <2e-16 ***
## x2          2.995385   0.007954  376.57   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9759 on 497 degrees of freedom
## Multiple R-squared:  0.9969, Adjusted R-squared:  0.9968 
## F-statistic: 7.895e+04 on 2 and 497 DF,  p-value: < 2.2e-16

The diagram of the model shows a hierarchical structure with normal priors for the intercept β0 and the slopes βi, i=1,2.

Description of the model corresponding to the diagram:

\[
y_i = \beta_0 + \beta_1 x_{i,1} + \beta_2 x_{i,2} + \epsilon_i,
\qquad
y_i \sim t(\nu,\ \mu_i,\ \sigma),
\]
\[
\mu_i = \beta_0 + \beta_1 x_{i,1} + \beta_2 x_{i,2},
\]
\[
\beta_0 \sim \mathrm{normal}(M_0, S_0), \qquad \beta_j \sim \mathrm{normal}(M_j, S_j),\ j = 1, 2,
\]
\[
\sigma \sim \mathrm{Unif}(L, H), \qquad \nu = \nu' + 1, \quad \nu' \sim \exp(\lambda).
\]

modelString<-"
data {
    int<lower=1> Ntotal;
    int<lower=1> Nx;
    vector[Ntotal] y;
    matrix[Ntotal, Nx] x;
}
transformed data {
    real meanY;
    real sdY;
    vector[Ntotal] zy; // normalized
    vector[Nx] meanX;
    vector[Nx] sdX;
    matrix[Ntotal, Nx] zx; // normalized
    
    meanY = mean(y);
    sdY = sd(y);
    zy = (y - meanY) / sdY;
    for ( j in 1:Nx ) {
        meanX[j] = mean(x[,j]);
        sdX[j] = sd(x[,j]);
        for ( i in 1:Ntotal ) {
            zx[i,j] = ( x[i,j] - meanX[j] ) / sdX[j];
        }
    }
}
parameters {
    real zbeta0;
    vector[Nx] zbeta;
    real<lower=0> nu;
    real<lower=0> zsigma;
}
transformed parameters{
    vector[Ntotal] zy_hat;
    zy_hat = zbeta0 + zx * zbeta;
}
model {
    zbeta0 ~ normal(0, 2);
    zbeta  ~ normal(0, 2);
    nu ~ exponential(1/30.0);
    zsigma ~ uniform(1.0E-5 , 1.0E+1);
    zy ~ student_t(1+nu, zy_hat, zsigma);
}
generated quantities { 
    // Transform to original scale:
    real beta0; 
    vector[Nx] beta;
    real sigma;
    // .* and ./ are element-wise product and divide
    beta0 = zbeta0*sdY  + meanY - sdY * sum( zbeta .* meanX ./ sdX );
    beta = sdY * ( zbeta ./ sdX );
    sigma = zsigma * sdY;
} "
RobustMultipleRegressionDso <- stan_model(model_code=modelString)
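
The generated quantities block undoes the standardization performed in transformed data. Substituting zy = (y - meanY)/sdY and zx[,j] = (x[,j] - meanX[j])/sdX[j] into the regression on the standardized scale gives the back-transformation used in the code above:

\[
\frac{\hat y_i - \bar y}{s_y} = z\beta_0 + \sum_{j} z\beta_j \,\frac{x_{i,j} - \bar x_j}{s_{x_j}}
\quad\Longrightarrow\quad
\hat y_i = \Big(z\beta_0\, s_y + \bar y - s_y \sum_j z\beta_j \frac{\bar x_j}{s_{x_j}}\Big) + \sum_j \frac{s_y\, z\beta_j}{s_{x_j}}\, x_{i,j},
\]

so \(\beta_0 = z\beta_0\, s_y + \bar y - s_y \sum_j z\beta_j \bar x_j / s_{x_j}\), \(\beta_j = s_y\, z\beta_j / s_{x_j}\), and similarly \(\sigma = z\sigma\, s_y\).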

Save the DSO so it can be reused; if a saved DSO already exists, load it instead of recompiling, then run the chains.

# optional: save the compiled DSO and reload it to avoid recompiling the model
saveRDS(RobustMultipleRegressionDso, file="DSORobustMultRegr.Rds")
readRDS("DSORobustMultRegr.Rds")
## S4 class stanmodel '88098712ac251c9c72509b460fa2b903' coded as follows:
## 
## data {
##     int<lower=1> Ntotal;
##     int<lower=1> Nx;
##     vector[Ntotal] y;
##     matrix[Ntotal, Nx] x;
## }
## transformed data {
##     real meanY;
##     real sdY;
##     vector[Ntotal] zy; // normalized
##     vector[Nx] meanX;
##     vector[Nx] sdX;
##     matrix[Ntotal, Nx] zx; // normalized
##     
##     meanY = mean(y);
##     sdY = sd(y);
##     zy = (y - meanY) / sdY;
##     for ( j in 1:Nx ) {
##         meanX[j] = mean(x[,j]);
##         sdX[j] = sd(x[,j]);
##         for ( i in 1:Ntotal ) {
##             zx[i,j] = ( x[i,j] - meanX[j] ) / sdX[j];
##         }
##     }
## }
## parameters {
##     real zbeta0;
##     vector[Nx] zbeta;
##     real<lower=0> nu;
##     real<lower=0> zsigma;
## }
## transformed parameters{
##     vector[Ntotal] zy_hat;
##     zy_hat = zbeta0 + zx * zbeta;
## }
## model {
##     zbeta0 ~ normal(0, 2);
##     zbeta  ~ normal(0, 2);
##     nu ~ exponential(1/30.0);
##     zsigma ~ uniform(1.0E-5 , 1.0E+1);
##     zy ~ student_t(1+nu, zy_hat, zsigma);
## }
## generated quantities { 
##     // Transform to original scale:
##     real beta0; 
##     vector[Nx] beta;
##     real sigma;
##     // .* and ./ are element-wise product and divide
##     beta0 = zbeta0*sdY  + meanY - sdY * sum( zbeta .* meanX ./ sdX );
##     beta = sdY * ( zbeta ./ sdX );
##     sigma = zsigma * sdY;
## }

Fit the model.

fit1<-sampling(RobustMultipleRegressionDso,
               data=dataListRegression,
               pars=c('beta0', 'beta', 'nu', 'sigma'),
               iter = 5000, chains = 2, cores = 2)
stan_ac(fit1)

stan_trace(fit1)

Look at the results.

summary(fit1)$summary[,c(1,3,4,5,8,9)] # mean, sd, 2.5%, 25%, 97.5%, n_eff
##                 mean           sd         2.5%         25%       97.5%    n_eff
## beta0      4.5780310  0.231357219    4.1304172    4.422841    5.025614 7033.830
## beta[1]    1.0738282  0.011006048    1.0521979    1.066428    1.095054 6561.254
## beta[2]    2.9951527  0.008047017    2.9797451    2.989759    3.011046 6871.816
## nu        61.7987150 36.009712586   17.0095629   36.082778  151.390448 4557.868
## sigma      0.9611543  0.032896641    0.8978996    0.938909    1.027262 4328.823
## lp__    1014.2033841  1.585209650 1010.1168812 1013.426674 1016.293483 2600.401
pairs(fit1,pars=c("beta0","beta[1]","beta[2]"))

plot(fit1,pars="nu")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

plot(fit1,pars="sigma")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

plot(fit1,pars="beta0")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

plot(fit1,pars="beta[1]")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

plot(fit1,pars="beta[2]")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

Analyze the fitted model using shinystan.

library(shinystan)
launch_shinystan(fit1)
## 
## Launching ShinyStan interface... for large models this  may take some time.
## 
## Listening on http://127.0.0.1:4427

Conclusions:

  1. The normality parameter ν is large enough (≫2) to consider the distribution normal: the 2.5% posterior quantile is about 17.0 and the posterior mean is about 61.8. Not surprising: we simulated a normal model.
  2. Parameters β0 and β1 are significantly negatively correlated, as expected (a numeric check follows this list).
  3. Parameters β0 and β2 are also negatively correlated, but the correlation is not as strong.
  4. All parameter estimates are close to what we simulated.
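
A quick numeric check of these conclusions (a minimal sketch; it uses rstan's as.matrix() method to extract posterior draws from fit1, and draws1 is just a local name for the draws matrix):

# posterior correlations between the intercept and slopes (confirms the pairs() impression)
draws1 <- as.matrix(fit1, pars = c("beta0", "beta[1]", "beta[2]"))
round(cor(draws1), 2)
# posterior means vs. the values used in the simulation
round(cbind(actual = c(4, 1.1, 3), posteriorMean = colMeans(draws1)), 3)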

1.2 Example 2: Insignificant predictor

Regression.Data <- as.matrix(read.csv("DataForRegressionANOVA.csv", header=TRUE, sep=","))
tail(Regression.Data)
##           Output     Input1      Input2
## [495,] 2.4054442  0.9276934  0.07278244
## [496,] 1.8663026 -0.3678520  1.51715986
## [497,] 1.3590146  0.5369795  0.96209003
## [498,] 3.1836007  1.0171332 -0.56660564
## [499,] 2.3615061  1.1637966  0.07815352
## [500,] 0.8483407  1.1775607  1.59720356

Prepare the data for Stan.

Ntotal <- nrow(Regression.Data)
x <- Regression.Data[,2:3]
tail(x)
##            Input1      Input2
## [495,]  0.9276934  0.07278244
## [496,] -0.3678520  1.51715986
## [497,]  0.5369795  0.96209003
## [498,]  1.0171332 -0.56660564
## [499,]  1.1637966  0.07815352
## [500,]  1.1775607  1.59720356
Nx <- ncol(x)
y <- Regression.Data[,1]
dataListInsig <- list(Ntotal=Ntotal, y=y, x=as.matrix(x), Nx=Nx)

Run MCMC using the same DSO.

fit2 <- sampling(RobustMultipleRegressionDso, data=dataListInsig,
                 pars=c('beta0', 'beta', 'nu', 'sigma'),
                 iter=5000, chains = 2, cores = 2)
launch_shinystan(fit2)
## 
## Launching ShinyStan interface... for large models this  may take some time.
## 
## Listening on http://127.0.0.1:4427

Analyze the results.

summary(fit2)$summary[,c(1,3,4,8,9)]
##                  mean          sd          2.5%         97.5%    n_eff
## beta0    1.218266e+00  0.05299934    1.11711302    1.32336805 5365.650
## beta[1]  8.003810e-01  0.02878846    0.74298668    0.85707012 5857.186
## beta[2]  8.795752e-03  0.02842077   -0.04621791    0.06421956 5832.891
## nu       5.298357e+01 33.85144567   13.14276069  141.24677427 4953.188
## sigma    5.979821e-01  0.02134564    0.55753809    0.63995232 5078.542
## lp__    -1.818615e+02  1.61483483 -185.83994952 -179.70767244 2374.855
pairs(fit2,pars=c("beta0","beta[1]","beta[2]"))

plot(fit2,pars="nu")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

plot(fit2,pars="sigma")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

plot(fit2,pars="beta0")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

plot(fit2,pars="beta[1]")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

plot(fit2,pars="beta[2]")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

We see that parameter β2 is not significant.
However, there is no strong correlation or redundancy between the predictors.
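
A minimal way to quantify this from the posterior draws (a sketch; b2 is just a local name for the extracted draws):

b2 <- as.matrix(fit2, pars = "beta[2]")
quantile(b2, c(0.025, 0.975))  # the 95% posterior interval covers 0
mean(b2 > 0)                   # no strong preference for either sign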

Compare with the output of the linear model.

pairs(Regression.Data)

summary(lm(Output~.,data=as.data.frame(Regression.Data)))
## 
## Call:
## lm(formula = Output ~ ., data = as.data.frame(Regression.Data))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.76631 -0.39358 -0.01411  0.40432  1.91861 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.21480    0.05179  23.458   <2e-16 ***
## Input1       0.80116    0.02819  28.423   <2e-16 ***
## Input2       0.00970    0.02787   0.348    0.728    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6107 on 497 degrees of freedom
## Multiple R-squared:  0.6204, Adjusted R-squared:  0.6188 
## F-statistic: 406.1 on 2 and 497 DF,  p-value: < 2.2e-16

1.3 Example 3: Correlated predictors

1.3.1 Strong correlation

set.seed(03192021)
Ntotal <- 500
x1 <- rnorm(Ntotal, mean = 20, sd = 4)
x2 <- 1 - 1.5*x1 + rnorm(Ntotal, mean=0, sd = .1)
x <- cbind(x1,x2)      

plot(x)

Nx <- ncol(x)
y <- 4 + .2*x[,1] + 3*x[,2]+rnorm(Ntotal, mean = 0, sd = 1)
plot(x[,1],y)

plot(x[,2],y)

fitlm<-lm(y~x[,1]+x[,2])
summary(fitlm)
## 
## Call:
## lm(formula = y ~ x[, 1] + x[, 2])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7765 -0.7585  0.0172  0.6577  3.1721 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.5619     0.4878   9.353  < 2e-16 ***
## x[, 1]       -0.5468     0.6707  -0.815    0.415    
## x[, 2]        2.5034     0.4479   5.589 3.76e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9988 on 497 degrees of freedom
## Multiple R-squared:  0.9962, Adjusted R-squared:  0.9962 
## F-statistic: 6.53e+04 on 2 and 497 DF,  p-value: < 2.2e-16
drop1(fitlm)
##        Df  Sum of Sq      RSS        AIC
## <none>                495.8511  1.8337997
## x[, 1]  1   0.6631643 496.5143  0.5020661
## x[, 2]  1  31.1702453 527.0214 30.3164847
dataListShrink2 <- list(Ntotal=Ntotal, y=y, x=as.matrix(x), Nx=Nx)

Note that the actual coefficient of x[,1] is +0.2, yet the slope in the plot of y against x[,1] is negative; a quick check of this marginal slope follows below.
Also note that the estimated coefficients differ from the actual ones because of the correlation between the predictors:

cbind(actual=c(4,.2,3),estimated=fitlm$coefficients)
##             actual  estimated
## (Intercept)    4.0  4.5619222
## x[, 1]         0.2 -0.5468278
## x[, 2]         3.0  2.5034438
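
The sign flip is a marginal, not a partial, effect: since x2 ≈ 1 - 1.5*x1, substituting into y = 4 + 0.2*x1 + 3*x2 + ε gives y ≈ 7 - 4.3*x1 + ε, so a regression on x[,1] alone should show a slope near -4.3:

coef(lm(y ~ x[,1]))  # marginal slope is strongly negative despite the true partial slope of +0.2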

Run the chains and analyze the results.

tStart<-proc.time()
fit3<-sampling(RobustMultipleRegressionDso,
               data=dataListShrink2,
               pars=c('beta0', 'beta', 'nu', 'sigma'),
               iter=5000, chains = 2, cores = 2)
tEnd<-proc.time()
tEnd-tStart
##    user  system elapsed 
##    0.54    0.56   80.56

How long did it take to run the MCMC? Why so long?
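
One way to see where the time went: rstan reports per-chain warmup and sampling times.

get_elapsed_time(fit3)  # per-chain warmup and sampling time, in seconds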

Check convergence in shinystan.

launch_shinystan(fit3)
## 
## Launching ShinyStan interface... for large models this  may take some time.
## 
## Listening on http://127.0.0.1:4427
stan_dens(fit3)

stan_ac(fit3, separate_chains = T)

summary(fit3)$summary[,c(1,3,4,8,9)]
##                mean          sd        2.5%       97.5%    n_eff
## beta0     4.5857595  0.48813790   3.6314232   5.5544965 2114.939
## beta[1]  -0.5967437  0.67710628  -1.9314954   0.7388229 1713.891
## beta[2]   2.4699613  0.45229741   1.5765319   3.3619226 1712.967
## nu       56.0426640 33.61895666  15.8382163 143.5659182 2718.137
## sigma     0.9816801  0.03355264   0.9166907   1.0483195 2731.866
## lp__    967.3463697  1.56455325 963.5132940 969.3727437 1911.020
pairs(fit3,pars=c("beta0","beta[1]","beta[2]"))

plot(fit3,pars="nu")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

plot(fit3,pars="sigma")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

plot(fit3,pars="beta0")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

plot(fit3,pars="beta[1]")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

plot(fit3,pars="beta[2]")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

General signs of strongly correlated (nearly collinear) predictors, checked numerically after the list:
- High correlation between the slopes (with compensating signs)
- Wide posterior distributions for the slopes
- Increased autocorrelation of the slope chains
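
A minimal numeric check of these signs, comparing the slope posteriors of fit1 (independent predictors) and fit3 (correlated predictors):

# posterior sd of the slopes inflates sharply when the predictors are correlated
cbind(independent = summary(fit1)$summary[c("beta[1]","beta[2]"), "sd"],
      correlated  = summary(fit3)$summary[c("beta[1]","beta[2]"), "sd"])
cor(x1, x2)  # predictor correlation driving the inflation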

pairs(cbind(y,x1,x2))

cbind(actual=c(4,.2,3),estimatedLm=fitlm$coefficients,estimatedBayes=summary(fit3)$summary[1:3,1])
##             actual estimatedLm estimatedBayes
## (Intercept)    4.0   4.5619222      4.5857595
## x[, 1]         0.2  -0.5468278     -0.5967437
## x[, 2]         3.0   2.5034438      2.4699613

The linear model conveys essentially the same information as the Bayesian fit.

1.3.2 Collinearity

When predictors are strongly collinear, the linear model may stop working.
Simulate the same model as in the previous section, but make the predictors (almost exactly) collinear.

set.seed(03192021)
Ntotal <- 500
x1 <- rnorm(Ntotal, mean = 20, sd = 4)
x2 <- 1 - 1.5*x1 + rnorm(Ntotal, mean=0, sd = .000001) # sd close to 0: x2 is almost an exact linear function of x1
x<-cbind(x1,x2)           
plot(x)

Nx <- ncol(x)
y <- 4 + .2*x[,1] + 3*x[,2]+rnorm(Ntotal, mean = 0, sd = 1)
plot(x[,1],y)

plot(x[,2],y)

dataListShrink2c <- list(Ntotal=Ntotal, y=y, x=as.matrix(x), Nx=Nx)
(lmFit <- lm(y~x1+x2))
## 
## Call:
## lm(formula = y ~ x1 + x2)
## 
## Coefficients:
## (Intercept)           x1           x2  
##       7.094       -4.303           NA
summary(lmFit)
## 
## Call:
## lm(formula = y ~ x1 + x2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7626 -0.7636  0.0244  0.6410  3.1747 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.09394    0.24447   29.02   <2e-16 ***
## x1          -4.30334    0.01189 -361.96   <2e-16 ***
## x2                NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9991 on 498 degrees of freedom
## Multiple R-squared:  0.9962, Adjusted R-squared:  0.9962 
## F-statistic: 1.31e+05 on 1 and 498 DF,  p-value: < 2.2e-16
drop1(lmFit)
##        Df     Sum of Sq      RSS      AIC
## <none>                  497.0773 1.068698
## x1      0  0.0001416981 497.0774 1.068841
## x2      0  0.0000000000 497.0773 1.068698

The linear model stops working: the coefficient of x2 cannot be estimated and is reported as NA because of the singularity.
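
A quick check of how close the predictors are to exact collinearity:

cor(x1, x2)                 # essentially -1
summary(lm(x2 ~ x1))$sigma  # residual sd around 1e-6: x2 is an almost exact linear function of x1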

Before simulating the Markov chains, recall the actual coefficients and the estimates from the strongly correlated case:

cbind(actual=c(4,.2,3),estimated=fitlm$coefficients)
##             actual  estimated
## (Intercept)    4.0  4.5619222
## x[, 1]         0.2 -0.5468278
## x[, 2]         3.0  2.5034438

Run the chains and analyze the results.

tStart <- proc.time()
fit3c <- sampling(RobustMultipleRegressionDso,
                data=dataListShrink2c,
                pars=c('beta0', 'beta', 'nu', 'sigma'),
                iter=5000, chains = 1, cores = 2)
## 
## SAMPLING FOR MODEL '88098712ac251c9c72509b460fa2b903' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 5000 [  0%]  (Warmup)
## Chain 1: Iteration:  500 / 5000 [ 10%]  (Warmup)
## Chain 1: Iteration: 1000 / 5000 [ 20%]  (Warmup)
## Chain 1: Iteration: 1500 / 5000 [ 30%]  (Warmup)
## Chain 1: Iteration: 2000 / 5000 [ 40%]  (Warmup)
## Chain 1: Iteration: 2500 / 5000 [ 50%]  (Warmup)
## Chain 1: Iteration: 2501 / 5000 [ 50%]  (Sampling)
## Chain 1: Iteration: 3000 / 5000 [ 60%]  (Sampling)
## Chain 1: Iteration: 3500 / 5000 [ 70%]  (Sampling)
## Chain 1: Iteration: 4000 / 5000 [ 80%]  (Sampling)
## Chain 1: Iteration: 4500 / 5000 [ 90%]  (Sampling)
## Chain 1: Iteration: 5000 / 5000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 113.607 seconds (Warm-up)
## Chain 1:                132.096 seconds (Sampling)
## Chain 1:                245.703 seconds (Total)
## Chain 1:
## Warning: There were 2292 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## http://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
tEnd <- proc.time()
tEnd-tStart
##    user  system elapsed 
##  243.12    0.18  245.75

With collinear predictors the model takes much longer to simulate.

stan_dens(fit3c)

stan_ac(fit3c, separate_chains = T)

summary(fit3c)$summary[,c(1,3,4,8,9)]
##                mean          sd        2.5%      97.5%    n_eff
## beta0     5.2869784  4.12513863  -2.5410297  13.834655 258.6426
## beta[1]  -1.5985501  6.17402618 -14.3329219  10.098421 255.8424
## beta[2]   1.8031011  4.11591117  -6.6807200   9.601809 255.7672
## nu       57.5235574 34.84527395  15.7981673 148.866690 605.1602
## sigma     0.9834476  0.03440985   0.9179121   1.052713 331.5993
## lp__    967.5976423  1.48319659 964.0943396 969.579397 845.7838
pairs(fit3c,pars=c("beta0","beta[1]","beta[2]"))

plot(fit3c,pars="nu")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

plot(fit3c,pars="sigma")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

plot(fit3c,pars="beta0")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

plot(fit3c,pars="beta[1]")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

plot(fit3c,pars="beta[2]")
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

The Markov chains may exceed the maximum tree depth (yellow dots on the pairs plot).
But the Bayesian method still works: it shows that one or both slopes are not significantly different from zero, with very wide posterior distributions.
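
Following the warning, one possible adjustment (a sketch, not run here; fit3c2 is just an illustrative name) is to raise the tree depth limit and use a larger adapt_delta via the control argument of sampling():

# not run: re-fit with a higher treedepth limit and a larger adapt_delta
# fit3c2 <- sampling(RobustMultipleRegressionDso, data = dataListShrink2c,
#                    pars = c('beta0', 'beta', 'nu', 'sigma'),
#                    iter = 5000, chains = 1, cores = 2,
#                    control = list(max_treedepth = 15, adapt_delta = 0.95))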