knitr::opts_chunk$set(echo = TRUE)
suppressPackageStartupMessages(library(rstan))
## Warning: package 'rstan' was built under R version 3.6.1
## Warning: package 'StanHeaders' was built under R version 3.6.1
suppressPackageStartupMessages(library(gdata))
suppressPackageStartupMessages(library(bayesplot))
## Warning: package 'bayesplot' was built under R version 3.6.1
suppressPackageStartupMessages(library(parallel))
After our Intro to Stan lecture I think it would be valuable to have you go through a similar exercise. Let’s test a second research question.
Research question: Is sea ice extent declining in the Southern Hemisphere over time? Is the same pattern happening in the Antarctic as in the Arctic? Fit a Stan model to find out!
Make sure you follow the steps we used in class.
#place the code here
library(rstan)
library(gdata)
library(bayesplot)
library(parallel)
library(readr)
seaice <- read_csv("seaice.csv")
## Parsed with column specification:
## cols(
## year = col_double(),
## extent_north = col_double(),
## extent_south = col_double()
## )
View(seaice)
head(seaice)
## # A tibble: 6 x 3
## year extent_north extent_south
## <dbl> <dbl> <dbl>
## 1 1979 12.3 11.7
## 2 1980 12.3 11.2
## 3 1981 12.1 11.4
## 4 1982 12.4 11.6
## 5 1983 12.3 11.4
## 6 1984 11.9 11.5
#plot data
plot (extent_north ~ year, data = seaice)
#write the code
lm1 = lm(extent_south ~ year, data = seaice)
summary(lm1)
##
## Call:
## lm(formula = extent_south ~ year, data = seaice)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.23372 -0.18142 0.01587 0.18465 0.88814
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -14.199551 10.925576 -1.300 0.2018
## year 0.012953 0.005468 2.369 0.0232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3843 on 37 degrees of freedom
## Multiple R-squared: 0.1317, Adjusted R-squared: 0.1082
## F-statistic: 5.611 on 1 and 37 DF, p-value: 0.02318
#write the code here
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:gdata':
##
## combine, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
seaice = seaice %>%
mutate("index" = I(year)-1978) # index 1978 -> 0
lm2 = lm(extent_south ~ index, data = seaice)
summary(lm2)
##
## Call:
## lm(formula = extent_south ~ index, data = seaice)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.23372 -0.18142 0.01587 0.18465 0.88814
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.421555 0.125490 91.015 <2e-16 ***
## index 0.012953 0.005468 2.369 0.0232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3843 on 37 degrees of freedom
## Multiple R-squared: 0.1317, Adjusted R-squared: 0.1082
## F-statistic: 5.611 on 1 and 37 DF, p-value: 0.02318
#write the code
write("// Stan model for simple linear regression
data {
int < lower = 1 > N; // Sample size
vector[N] x; // Predictor
vector[N] y; // Outcome
}
parameters {
real alpha; // Intercept
real beta; // Slope (regression coefficients)
real < lower = 0 > sigma; // Error SD
}
model {
y ~ normal(alpha + x * beta , sigma);
}
generated quantities {
} // The posterior predictive distribution",
"stan_model1.stan")
stan_model1 = "stan_model1.stan"
detectCores(all.tests = FALSE, logical = TRUE)
options(mc.cores = parallel::detectCores())
#code here
stan_data = list(N=nrow(seaice), y=seaice$extent_south, x=seaice$index)
fit = stan(file = stan_model1, data = stan_data, warmup = 400, iter = 1000, chains = 4, cores = 2, thin = 1)
fit
## Inference for Stan model: stan_model1.
## 4 chains, each with iter=1000; warmup=400; thin=1;
## post-warmup draws per chain=600, total post-warmup draws=2400.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## alpha 11.42 0.00 0.13 11.17 11.33 11.42 11.50 11.67 1003 1.00
## beta 0.01 0.00 0.01 0.00 0.01 0.01 0.02 0.02 1039 1.00
## sigma 0.40 0.00 0.05 0.32 0.36 0.39 0.43 0.51 1031 1.01
## lp__ 16.29 0.05 1.25 12.99 15.71 16.62 17.23 17.74 762 1.01
##
## Samples were drawn using NUTS(diag_e) at Tue Nov 05 10:41:54 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
#code here
posterior <- extract(fit)
str(posterior)
## List of 4
## $ alpha: num [1:2400(1d)] 11.2 11.5 11.4 11.3 11.5 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
## $ beta : num [1:2400(1d)] 0.01884 0.00821 0.00846 0.02148 0.01063 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
## $ sigma: num [1:2400(1d)] 0.372 0.368 0.372 0.418 0.385 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
## $ lp__ : num [1:2400(1d)] 16.5 17.4 16.6 16.5 17.3 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
#code here
plot(y ~ x, pch = 20, data = stan_data)
abline(lm2, col = 2, lty = 2, lw = 3)
abline(mean(posterior$alpha), mean(posterior$beta), col = 4, lw = 2)
#code here
plot(y ~ x, pch = 20, data = stan_data)
for (i in 1:400) {
abline(posterior$alpha[i], posterior$beta[i], col = "green", lty = 1)
}
abline(mean(posterior$alpha), mean(posterior$beta), col = 4, lw = 2)
#code here
write("// Stan model for simple linear regression
data {
int < lower = 1 > N; // Sample size
vector[N] x; // Predictor
vector[N] y; // Outcome
}
parameters {
real alpha; // Intercept
real beta; // Slope (regression coefficients)
real < lower = 0 > sigma; // Error SD
}
model {
alpha ~ normal(10, 0.1);
beta ~ normal(1, 0.1);
y ~ normal(alpha + x * beta , sigma);
}
generated quantities {}",
"stan_model2.stan")
stan_model2 = "stan_model2.stan"
fit2 = stan(stan_model2, data = stan_data, warmup = 400, iter = 1000, chains = 4, cores = 2, thin = 1)
posterior2 = extract(fit2)
plot(y ~ x, pch = 20, data = stan_data)
When change the the period, the model becomes worse fit.
#code here
plot(posterior$alpha, type = "l")
plot(posterior$beta, type = "l")
plot(posterior$sigma, type = "l")
\(\alpha\), \(\beta\), \(\sigma\)
par(mfrow = c(1,3))
plot(density(posterior$alpha), main = "Alpha")
plot(density(posterior$beta), main = "Beta")
plot(density(posterior$sigma), main = "Sigma")
traceplot(fit)
stan_hist(fit)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot(fit, show_density = FALSE, ci_level = 0.5, outer_level = 0.95, fill_color = "red")
## ci_level: 0.5 (50% intervals)
## outer_level: 0.95 (95% intervals)
The Stan model indicates that it is a good model.