#install.packages('rstan')
#install.packages('gdata')
#install.packages('bayesplot')
#install.packages('parallel')
knitr::opts_chunk$set(echo = TRUE)
suppressPackageStartupMessages(library(rstan))
suppressPackageStartupMessages(library(gdata))
suppressPackageStartupMessages(library(bayesplot))
suppressPackageStartupMessages(library(parallel))
After our Intro to Stan lecture I think it would be valuable to have you go through a similar exercise. Let’s test a second research question.
Research question: Is sea ice extent declining in the Southern Hemisphere over time? Is the same pattern happening in the Antarctic as in the Arctic? Fit a Stan model to find out!
Make sure you follow the steps we used in class.
#place the code here
seaice<- read.table("seaice.csv", sep=",",header=T, fileEncoding="UTF-8-BOM")
head(seaice)
## year extent_north extent_south
## 1 1979 12.328 11.700
## 2 1980 12.337 11.230
## 3 1981 12.127 11.435
## 4 1982 12.447 11.640
## 5 1983 12.332 11.389
## 6 1984 11.910 11.454
#plot data
ggplot(aes(x=year, y=extent_south), data=seaice) + geom_point() + theme_bw()
#write the code
lm1 <- lm(extent_south ~ year, data = seaice)
summary(lm1)
##
## Call:
## lm(formula = extent_south ~ year, data = seaice)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.23372 -0.18142 0.01587 0.18465 0.88814
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -14.199551 10.925576 -1.300 0.2018
## year 0.012953 0.005468 2.369 0.0232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3843 on 37 degrees of freedom
## Multiple R-squared: 0.1317, Adjusted R-squared: 0.1082
## F-statistic: 5.611 on 1 and 37 DF, p-value: 0.02318
#write the code here
x<- I(seaice$year - 1978) #we are only interested in the sea ice change during the period specified in the dataset
y<- seaice$extent_south
N<- length(seaice$year)
lm1 <- lm(y ~ x)
summary(lm1)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.23372 -0.18142 0.01587 0.18465 0.88814
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.421555 0.125490 91.015 <2e-16 ***
## x 0.012953 0.005468 2.369 0.0232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3843 on 37 degrees of freedom
## Multiple R-squared: 0.1317, Adjusted R-squared: 0.1082
## F-statistic: 5.611 on 1 and 37 DF, p-value: 0.02318
lm_alpha <- summary(lm1)$coeff[1] #intersect
lm_beta <- summary(lm1)$coeff[2] #slope
lm_sigma <- sigma(lm1) #residual error
stan_data <- list(N = N, x = x, y = y)
#write the code
write("// Stan model for simple linear regression
data {
int < lower = 1 > N; // Sample size
vector[N] x; // Predictor
vector[N] y; // Outcome
}
parameters {
real alpha; // Intercept
real beta; // Slope (regression coefficients)
real < lower = 0 > sigma; // Error SD
}
model {
y ~ normal(alpha + x * beta , sigma);
}
generated quantities {
} // The posterior predictive distribution",
"stan_model1.stan")
stan_model1 <- "stan_model1.stan"
detectCores(all.tests = FALSE, logical = TRUE)
options(mc.cores = parallel::detectCores())
#code here
fit <- stan(file = stan_model1, data = stan_data, warmup = 500, iter = 1000, chains = 4, cores = 4, thin = 1)
fit
## Inference for Stan model: stan_model1.
## 4 chains, each with iter=1000; warmup=500; thin=1;
## post-warmup draws per chain=500, total post-warmup draws=2000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## alpha 11.43 0.00 0.13 11.18 11.34 11.43 11.52 11.69 754 1
## beta 0.01 0.00 0.01 0.00 0.01 0.01 0.02 0.02 791 1
## sigma 0.40 0.00 0.05 0.32 0.37 0.39 0.42 0.50 995 1
## lp__ 16.35 0.05 1.21 13.35 15.79 16.65 17.25 17.73 669 1
##
## Samples were drawn using NUTS(diag_e) at Thu Oct 31 11:55:41 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
#code here
posterior <- extract(fit)
str(posterior)
## List of 4
## $ alpha: num [1:2000(1d)] 11.7 11.4 11.4 11.3 11.4 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
## $ beta : num [1:2000(1d)] 0.00322 0.01229 0.01401 0.01334 0.01587 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
## $ sigma: num [1:2000(1d)] 0.419 0.389 0.337 0.328 0.369 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
## $ lp__ : num [1:2000(1d)] 14.5 17.4 17.1 16 17.4 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
#code here
plot(y ~ x, pch = 20)
abline(lm1, col = 2, lty = 2, lw = 3)
abline( mean(posterior$alpha), mean(posterior$beta), col = 5, lw = 2)
#code here
plot(y ~ x, pch = 20)
for (i in 1:1000) {
abline(posterior$alpha[i], posterior$beta[i], col = "gray", lty = 1)
}
abline(mean(posterior$alpha), mean(posterior$beta), col = 5, lw = 2)
#code here
write("// Stan model for simple linear regression
data {
int < lower = 1 > N; // Sample size
vector[N] x; // Predictor
vector[N] y; // Outcome
}
parameters {
real alpha; // Intercept
real beta; // Slope (regression coefficients)
real < lower = 0 > sigma; // Error SD
}
model {
alpha ~ normal(10, 0.1);
beta ~ normal(1, 0.1);
y ~ normal(alpha + x * beta , sigma);
}
generated quantities {
} // The posterior predictive distribution",
"stan_model2.stan")
stan_model2 <- "stan_model2.stan"
fit2 <- stan(stan_model2, data = stan_data, warmup = 500, iter = 1000, chains = 4, cores = 2, thin = 1)
posterior2 <- extract(fit2)
plot(y ~ x, pch = 20)
abline(lm_alpha, lm_beta, col = 4, lty = 2, lw = 2)
abline(mean(posterior2$alpha), mean(posterior2$beta), col = 3, lw = 2)
abline(mean(posterior$alpha), mean(posterior$beta), col = 6, lw = 3)
Above plot shows that after changing and “limiting” the priors, the model does not fit as it should be. So choosing the right priors is very important.
#code here
plot(posterior$alpha, type = "l")
plot(posterior$beta, type = "l")
plot(posterior$sigma, type = "l")
\(\alpha\), \(\beta\), \(\sigma\)
par(mfrow = c(1,3))
plot(density(posterior$alpha), main = "Alpha")
abline(v = lm_alpha, col = 4, lty = 2)
plot(density(posterior$beta), main = "Beta")
abline(v = lm_beta, col = 4, lty = 2)
plot(density(posterior$sigma), main = "Sigma")
abline(v = lm_sigma, col = 4, lty = 2)
traceplot(fit)
stan_dens(fit) # density
stan_hist(fit) # histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# parameter estimation for the Stan Model
plot(fit, show_density = FALSE, ci_level = 0.5, outer_level = 0.95, fill_color = "salmon")
## ci_level: 0.5 (50% intervals)
## outer_level: 0.95 (95% intervals)
## ci_level: 0.5 (50% intervals)
## outer_level: 0.95 (95% intervals)
#Posterior Accuracy Check
write("// Stan model for simple linear regression
data {
int < lower = 1 > N; // Sample size
vector[N] x; // Predictor
vector[N] y; // Outcome
}
parameters {
real alpha; // Intercept
real beta; // Slope (regression coefficients)
real < lower = 0 > sigma; // Error SD
}
model {
y ~ normal(x * beta + alpha, sigma);
}
generated quantities {
real y_rep[N];
for (n in 1:N) {
y_rep[n] = normal_rng(x[n] * beta + alpha, sigma);
}
}",
"stan_model_pc.stan")
stan_model_pc <- "stan_model_pc.stan"
fit_pc <- stan(stan_model_pc, data = stan_data, iter = 1000, chains = 4, cores = 4, thin = 1)
y_rep <- as.matrix(fit_pc, pars = "y_rep")
dim(y_rep)
## [1] 2000 39
ppc_dens_overlay(y, y_rep[1:200, ])
Here we see data, the bold line fit well with our posterior predictions. Which means model is a good predictor.