ANLY505 - Intro to Stan

knitr::opts_chunk$set(echo = TRUE)
suppressPackageStartupMessages(library(rstan))

## Warning: package 'rstan' was built under R version 3.6.1

## Warning: package 'StanHeaders' was built under R version 3.6.1

## Warning: package 'ggplot2' was built under R version 3.6.1

suppressPackageStartupMessages(library(gdata))
suppressPackageStartupMessages(library(bayesplot))

## Warning: package 'bayesplot' was built under R version 3.6.1

suppressPackageStartupMessages(library(parallel))

Intro to STAN Homework

After our Intro to Stan lecture I think it would be valuable to have you go through a similar exercise. Let’s test a second research question.

Research question: Is sea ice extent declining in the Southern Hemisphere over time? Is the same pattern happening in the Antarctic as in the Arctic? Fit a Stan model to find out!

Make sure you follow the steps we used in class.

1. Load and Inspect Data

#place the code here
seaice11<- read.table("seaice.csv", sep=",",header=T, fileEncoding="UTF-8-BOM")
head(seaice11)

##   year extent_north extent_south
## 1 1979       12.328       11.700
## 2 1980       12.337       11.230
## 3 1981       12.127       11.435
## 4 1982       12.447       11.640
## 5 1983       12.332       11.389
## 6 1984       11.910       11.454

2. Plot the data

#plot data
ggplot(aes(x=year, y=extent_south), data=seaice11) + geom_point() + theme_bw()

3. Run a general linear model using lm()

#write the code
lm_result <- lm(extent_south ~ year, data = seaice11)
summary(lm_result)

## 
## Call:
## lm(formula = extent_south ~ year, data = seaice11)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.23372 -0.18142  0.01587  0.18465  0.88814 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -14.199551  10.925576  -1.300   0.2018  
## year          0.012953   0.005468   2.369   0.0232 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3843 on 37 degrees of freedom
## Multiple R-squared:  0.1317, Adjusted R-squared:  0.1082 
## F-statistic: 5.611 on 1 and 37 DF,  p-value: 0.02318

4. Index the data, re-run the lm(), extract summary statistics and turn the indexed data into a dataframe to pass into Stan

#write the code here
x<- I(seaice11$year - 1979) 
y<- seaice11$extent_south
N<- length(seaice11$year)

lm_result <- lm(y ~ x)
summary(lm_result)

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.23372 -0.18142  0.01587  0.18465  0.88814 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.434508   0.120755  94.692   <2e-16 ***
## x            0.012953   0.005468   2.369   0.0232 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3843 on 37 degrees of freedom
## Multiple R-squared:  0.1317, Adjusted R-squared:  0.1082 
## F-statistic: 5.611 on 1 and 37 DF,  p-value: 0.02318

lm_alpha <- summary(lm_result)$coeff[1] #Intersect a
lm_beta <- summary(lm_result)$coeff[2] #Slope b
lm_sigma <- sigma(lm_result) #Residual error S

stan_data1 <- list(N = N, x = x, y = y)

5. Write the Stan model

#write the code

 write("// Stan model for simple linear regression

data {
 int < lower = 1 > N; // Sample size
 vector[N] x; // Predictor
 vector[N] y; // Outcome
}

parameters {
 real alpha; // Intercept
 real beta; // Slope (regression coefficients)
 real < lower = 0 > sigma; // Error SD
}

model {
 y ~ normal(alpha + x * beta , sigma);
}

generated quantities {
} // The posterior predictive distribution",

"stan_model1.stan")


stan_model1 <- "stan_model1.stan"

6. Check to see how many cores your computer has and enable parallel computing

detectCores(all.tests = FALSE, logical = TRUE)

options(mc.cores = parallel::detectCores())

7. Run the Stan model and inspect the results

#code here
result_stan1 <- stan(file = stan_model1, data = stan_data1, warmup = 500, iter = 1000, chains = 3, cores = 4, thin = 2)

result_stan1

## Inference for Stan model: stan_model1.
## 3 chains, each with iter=1000; warmup=500; thin=2; 
## post-warmup draws per chain=250, total post-warmup draws=750.
## 
##        mean se_mean   sd  2.5%   25%   50%   75% 97.5% n_eff Rhat
## alpha 11.43    0.01 0.13 11.17 11.36 11.43 11.52 11.68   444 1.01
## beta   0.01    0.00 0.01  0.00  0.01  0.01  0.02  0.02   408 1.01
## sigma  0.40    0.00 0.05  0.32  0.36  0.39  0.42  0.51   558 1.00
## lp__  16.31    0.07 1.37 12.97 15.83 16.65 17.27 17.71   386 1.01
## 
## Samples were drawn using NUTS(diag_e) at Thu Oct 31 22:07:06 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

8. Extract and inspect the posterior estimates into a list so we can plot them

#code here
posterior_esti <- extract(result_stan1)
str(posterior_esti)

## List of 4
##  $ alpha: num [1:750(1d)] 11.4 11.6 11.5 11.4 11.4 ...
##   ..- attr(*, "dimnames")=List of 1
##   .. ..$ iterations: NULL
##  $ beta : num [1:750(1d)] 0.01716 0.00592 0.0103 0.01213 0.01162 ...
##   ..- attr(*, "dimnames")=List of 1
##   .. ..$ iterations: NULL
##  $ sigma: num [1:750(1d)] 0.364 0.365 0.394 0.388 0.463 ...
##   ..- attr(*, "dimnames")=List of 1
##   .. ..$ iterations: NULL
##  $ lp__ : num [1:750(1d)] 16.2 16.7 17.5 17.8 16.2 ...
##   ..- attr(*, "dimnames")=List of 1
##   .. ..$ iterations: NULL

9. Compare your results to our results to “lm”

#code here
plot(y ~ x, pch = 20)

abline(lm_result, col = 3, lty = 2, lw = 3)
abline( mean(posterior_esti$alpha), mean(posterior_esti$beta), col = 3, lw = 2)

10. Plot multiple estimates from the posterior

#code here
plot(y ~ x, pch = 20)

for (i in 1:500) {
 abline(posterior_esti$alpha[i], posterior_esti$beta[i], col = "brown", lty = 1)
}

abline(mean(posterior_esti$alpha), mean(posterior_esti$beta), col = 4, lw = 2)

11. Change the priors and see how that affects your results

#code here
write("// Stan model for simple linear regression

data {
 int < lower = 1 > N; // Sample size
 vector[N] x; // Predictor
 vector[N] y; // Outcome
}

parameters {
 real alpha; // Intercept
 real beta; // Slope (regression coefficients)
 real < lower = 0 > sigma; // Error SD
}

model {
 alpha ~ normal(10, 0.1);
 beta ~ normal(1, 0.1);
 y ~ normal(alpha + x * beta , sigma);
}

generated quantities {
} // The posterior predictive distribution",

"stan_model2.stan")

stan_model2 <- "stan_model2.stan"


result_stan2 <- stan(stan_model2, data = stan_data1, warmup = 500, iter = 2000, chains = 4, cores = 2, thin = 2)

posterior2 <- extract(result_stan2)

plot(y ~ x, pch = 20)
abline(lm_alpha, lm_beta, col = 3, lty = 2, lw = 2)
abline(mean(posterior2$alpha), mean(posterior2$beta), col = 3, lw = 2)
abline(mean(posterior_esti$alpha), mean(posterior_esti$beta), col = 4, lw = 3)

12. What happened when you changed the priors? Does the model fit better or not?

The model is not fit when we changed the priors. Therefore, selecting the correct priors is necessary.

13. Convergence diagnostics - create traceplots that show all 4 chains

#code here
plot(posterior_esti$alpha, type = "l" , col = "red")

plot(posterior_esti$beta, type = "l", col = "blue")

plot(posterior_esti$sigma, type = "l", col = "green")

14. Plot parameter summaries for:

\(\alpha\), \(\beta\), \(\sigma\)

par(mfrow = c(1,3))

plot(density(posterior_esti$alpha), main = "Alpha", col = "red")
abline(v = lm_alpha, col = 3, lty = 2)

plot(density(posterior_esti$beta), main = "Beta", col = "blue")
abline(v = lm_beta, col = 3, lty = 2)

plot(density(posterior_esti$sigma), main = "Sigma", col = "green")
abline(v = lm_sigma, col = 3, lty = 2)

traceplot(result_stan1)

stan_dens(result_stan1)

stan_hist(result_stan1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plot(result_stan1, show_density = FALSE, ci_level = 0.5, outer_level = 0.95, fill_color = "yellow")

## ci_level: 0.5 (50% intervals)

## outer_level: 0.95 (95% intervals)

15. What do your Stan model results indicate?

#Posterior Accuracy Check

write("// Stan model for simple linear regression

data {
 int < lower = 1 > N; // Sample size
 vector[N] x; // Predictor
 vector[N] y; // Outcome
}

parameters {
 real alpha; // Intercept
 real beta; // Slope (regression coefficients)
 real < lower = 0 > sigma; // Error SD
}

model {
 y ~ normal(x * beta + alpha, sigma);
}

generated quantities {
 real y_rep[N];

 for (n in 1:N) {
 y_rep[n] = normal_rng(x[n] * beta + alpha, sigma);
 }

}",

"stan_model_pc.stan")

stan_model_new <- "stan_model_pc.stan"

result_fit <- stan(stan_model_new, data = stan_data1, iter = 2000, chains = 4, cores = 4, thin = 2)

y_rep <- as.matrix(result_fit, pars = "y_rep")

dim(y_rep)

## [1] 2000   39

ppc_dens_overlay(y, y_rep[1:300, ])

Stan model results indicate the model is a correct predictor with the result from posterior prediction.