ANLY505 - Intro to Stan

#knitr::opts_chunk$set(echo = TRUE)
#install.packages('rstan')
#install.packages('gdata')
#install.packages('bayesplot')
#install.packages('parallel')
suppressPackageStartupMessages(library(rstan))

## Warning: package 'rstan' was built under R version 3.5.3

## Warning: package 'StanHeaders' was built under R version 3.5.3

## Warning: package 'ggplot2' was built under R version 3.5.3

suppressPackageStartupMessages(library(gdata))

## Warning: package 'gdata' was built under R version 3.5.3

suppressPackageStartupMessages(library(bayesplot))

## Warning: package 'bayesplot' was built under R version 3.5.3

suppressPackageStartupMessages(library(parallel))

Intro to STAN Homework

After our Intro to Stan lecture I think it would be valuable to have you go through a similar exercise. Let’s test a second research question.

Research question: Is sea ice extent declining in the Southern Hemisphere over time? Is the same pattern happening in the Antarctic as in the Arctic? Fit a Stan model to find out!

Make sure you follow the steps we used in class.

1. Load and Inspect Data

#place the code here
library(dplyr)

## Warning: package 'dplyr' was built under R version 3.5.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:gdata':
## 
##     combine, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#place the code here
library(pastecs)

## Warning: package 'pastecs' was built under R version 3.5.3

## 
## Attaching package: 'pastecs'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## The following objects are masked from 'package:gdata':
## 
##     first, last

## The following object is masked from 'package:rstan':
## 
##     extract

#place the code here
library(kableExtra)

## Warning: package 'kableExtra' was built under R version 3.5.3

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

#place the code here
seaice <- read.csv(file = "C:/Users/linto/Desktop/HU/ANLY 505/seaice.csv")
colnames(seaice) <- c("year", "extent_north", "extent_south")
head(seaice)

##   year extent_north extent_south
## 1 1979       12.328       11.700
## 2 1980       12.337       11.230
## 3 1981       12.127       11.435
## 4 1982       12.447       11.640
## 5 1983       12.332       11.389
## 6 1984       11.910       11.454

2. Plot the data

#plot data
plot(extent_north ~ year, pch = 20, data = seaice)

3. Run a general linear model using lm()

#write the code

lm1 <- lm (extent_north ~ year, data = seaice)
lm1

## 
## Call:
## lm(formula = extent_north ~ year, data = seaice)
## 
## Coefficients:
## (Intercept)         year  
##   120.50304     -0.05457

4. Index the data, re-run the lm(), extract summary statistics and turn the indexed data into a dataframe to pass into Stan

#write the code here
summary(lm1)

## 
## Call:
## lm(formula = extent_north ~ year, data = seaice)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49925 -0.17713  0.04898  0.16923  0.32829 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 120.503036   6.267203   19.23   <2e-16 ***
## year         -0.054574   0.003137  -17.40   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2205 on 37 degrees of freedom
## Multiple R-squared:  0.8911, Adjusted R-squared:  0.8881 
## F-statistic: 302.7 on 1 and 37 DF,  p-value: < 2.2e-16

#write the code here
plot(extent_north ~ year, pch = 20, data = seaice)
abline(lm1, col = 2, lty = 2, lw = 3)

#write the code here
x <- I(seaice$year - 1978)
y <- seaice$extent_north
N <- length(seaice$year)


lm1 <- lm(y ~ x)
summary(lm1)

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49925 -0.17713  0.04898  0.16923  0.32829 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.555888   0.071985   174.4   <2e-16 ***
## x           -0.054574   0.003137   -17.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2205 on 37 degrees of freedom
## Multiple R-squared:  0.8911, Adjusted R-squared:  0.8881 
## F-statistic: 302.7 on 1 and 37 DF,  p-value: < 2.2e-16

#write the code here
lm_alpha <- summary(lm1)$coeff[1]#intersect
lm_beta <- summary(lm1)$coeff[2]#slope
lm_sigma <- sigma(lm1)#residual error
stan_data <- list(N = N, x = x, y = y)

5. Write the Stan model

#write the code

 write("// Stan model for simple linear regression

data {
 int < lower = 1 > N; // Sample size
 vector[N] x; // Predictor
 vector[N] y; // Outcome
}

parameters {
 real alpha; // Intercept
 real beta; // Slope (regression coefficients)
 real < lower = 0 > sigma; // Error SD
}

model {
 y ~ normal(alpha + x * beta , sigma);
}

generated quantities {
} // The posterior predictive distribution",

"stan_model1.stan")


stan_model1 <- "stan_model1.stan"

6. Check to see how many cores your computer has and enable parallel computing

detectCores(all.tests = FALSE, logical = TRUE)

options(mc.cores = parallel::detectCores())

7. Run the Stan model and inspect the results

#code here

fit <- stan(file = stan_model1, data = stan_data, warmup = 500, iter = 1000, chains = 4, cores = 4, thin = 1)

fit

## Inference for Stan model: stan_model1.
## 4 chains, each with iter=1000; warmup=500; thin=1; 
## post-warmup draws per chain=500, total post-warmup draws=2000.
## 
##        mean se_mean   sd  2.5%   25%   50%   75% 97.5% n_eff Rhat
## alpha 12.56    0.00 0.08 12.40 12.51 12.56 12.61 12.71   940    1
## beta  -0.05    0.00 0.00 -0.06 -0.06 -0.05 -0.05 -0.05  1057    1
## sigma  0.23    0.00 0.03  0.18  0.21  0.23  0.24  0.29   854    1
## lp__  37.38    0.05 1.32 33.73 36.84 37.75 38.32 38.83   717    1
## 
## Samples were drawn using NUTS(diag_e) at Tue Nov 05 21:51:35 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

8. Extract and inspect the posterior estimates into a list so we can plot them

#code here
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.5.3

## -- Attaching packages ----------------------------------------------- tidyverse 1.2.1 --

## â tibble  2.1.3     â purrr   0.3.2
## â tidyr   1.0.0     â stringr 1.3.1
## â readr   1.3.1     â forcats 0.3.0

## Warning: package 'tibble' was built under R version 3.5.3

## Warning: package 'tidyr' was built under R version 3.5.3

## Warning: package 'purrr' was built under R version 3.5.3

## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::combine()         masks gdata::combine()
## x tidyr::extract()         masks pastecs::extract(), rstan::extract()
## x dplyr::filter()          masks stats::filter()
## x pastecs::first()         masks dplyr::first(), gdata::first()
## x kableExtra::group_rows() masks dplyr::group_rows()
## x purrr::keep()            masks gdata::keep()
## x dplyr::lag()             masks stats::lag()
## x pastecs::last()          masks dplyr::last(), gdata::last()

posterior <- rstan::extract(fit)
str(posterior)

## List of 4
##  $ alpha: num [1:2000(1d)] 12.5 12.6 12.6 12.7 12.5 ...
##   ..- attr(*, "dimnames")=List of 1
##   .. ..$ iterations: NULL
##  $ beta : num [1:2000(1d)] -0.0532 -0.0539 -0.054 -0.0643 -0.0494 ...
##   ..- attr(*, "dimnames")=List of 1
##   .. ..$ iterations: NULL
##  $ sigma: num [1:2000(1d)] 0.242 0.232 0.23 0.256 0.238 ...
##   ..- attr(*, "dimnames")=List of 1
##   .. ..$ iterations: NULL
##  $ lp__ : num [1:2000(1d)] 38.1 38.2 37.7 34.5 37.5 ...
##   ..- attr(*, "dimnames")=List of 1
##   .. ..$ iterations: NULL

9. Compare your results to our results to “lm”

#code here
plot(y ~ x, pch = 20)

abline(lm1, col = 2, lty = 2, lw = 3)
abline( mean(posterior$alpha), mean(posterior$beta), col = 5, lw = 2)

10. Plot multiple estimates from the posterior

#code here
plot(y ~ x, pch = 20)

for (i in 1:1000) {
 abline(posterior$alpha[i], posterior$beta[i], col = "gray", lty = 1)
}

abline(mean(posterior$alpha), mean(posterior$beta), col = 5, lw = 2)

11. Change the priors and see how that affects your results

#code here
write("// Stan model for simple linear regression

data {
 int < lower = 1 > N; // Sample size
 vector[N] x; // Predictor
 vector[N] y; // Outcome
}

parameters {
 real alpha; // Intercept
 real beta; // Slope (regression coefficients)
 real < lower = 0 > sigma; // Error SD
}

model {
 alpha ~ normal(10, 0.1);
 beta ~ normal(1, 0.1);
 y ~ normal(alpha + x * beta , sigma);
}

generated quantities {
} // The posterior predictive distribution",

"stan_model2.stan")

stan_model2 <- "stan_model2.stan"


fit2 <- stan(stan_model2, data = stan_data, warmup = 500, iter = 1000, chains = 4, cores = 2, thin = 1)

posterior2 <- rstan::extract(fit2)

plot(y ~ x, pch = 20)
abline(lm_alpha, lm_beta, col = 4, lty = 2, lw = 2)
abline(mean(posterior2$alpha), mean(posterior2$beta), col = 3, lw = 2)
abline(mean(posterior$alpha), mean(posterior$beta), col = 6, lw = 3)

12. What happened when you changed the priors? Does the model fit better or not?

After changing and “limiting” the priors, the model seems to not fit.

13. Convergence diagnostics - create traceplots that show all 4 chains

#code here
plot(posterior$alpha, type = "l")

#code here
plot(posterior$beta, type = "l")

#code here
plot(posterior$sigma, type = "l")

14. Plot parameter summaries for:

\(\alpha\), \(\beta\), \(\sigma\)

par(mfrow = c(1,3))

plot(density(posterior$alpha), main = "Alpha")
abline(v = lm_alpha, col = 4, lty = 2)

plot(density(posterior$beta), main = "Beta")
abline(v = lm_beta, col = 4, lty = 2)

plot(density(posterior$sigma), main = "Sigma")
abline(v = lm_sigma, col = 4, lty = 2)

#code here
traceplot(fit)

#code here
stan_dens(fit) # density

#code here
stan_hist(fit) # histogram

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

15. What do your Stan model results indicate?

#code here
write("// Stan model for simple linear regression

data {
 int < lower = 1 > N; // Sample size
 vector[N] x; // Predictor
 vector[N] y; // Outcome
}

parameters {
 real alpha; // Intercept
 real beta; // Slope (regression coefficients)
 real < lower = 0 > sigma; // Error SD
}

model {
 y ~ normal(x * beta + alpha, sigma);
}

generated quantities {
 real y_rep[N];

 for (n in 1:N) {
 y_rep[n] = normal_rng(x[n] * beta + alpha, sigma);
 }

}",

"stan_model_pc.stan")

stan_model_pc <- "stan_model_pc.stan"

fit_pc <- stan(stan_model_pc, data = stan_data, iter = 1000, chains = 4, cores = 4, thin = 1)

y_rep <- as.matrix(fit_pc, pars = "y_rep")

dim(y_rep)

## [1] 2000   39

#code here
ppc_dens_overlay(y, y_rep[1:200, ])