knitr::opts_chunk$set(echo = TRUE)
suppressPackageStartupMessages(library(rstan))
## Warning: package 'rstan' was built under R version 3.6.1
## Warning: package 'StanHeaders' was built under R version 3.6.1
## Warning: package 'ggplot2' was built under R version 3.6.1
suppressPackageStartupMessages(library(gdata))
suppressPackageStartupMessages(library(bayesplot))
## Warning: package 'bayesplot' was built under R version 3.6.1
suppressPackageStartupMessages(library(parallel))
After our Intro to Stan lecture I think it would be valuable to have you go through a similar exercise. Let’s test a second research question.
Research question: Is sea ice extent declining in the Southern Hemisphere over time? Is the same pattern happening in the Antarctic as in the Arctic? Fit a Stan model to find out!
Make sure you follow the steps we used in class.
#place the code here
seaice11<- read.table("seaice.csv", sep=",",header=T, fileEncoding="UTF-8-BOM")
head(seaice11)
## year extent_north extent_south
## 1 1979 12.328 11.700
## 2 1980 12.337 11.230
## 3 1981 12.127 11.435
## 4 1982 12.447 11.640
## 5 1983 12.332 11.389
## 6 1984 11.910 11.454
#plot data
ggplot(aes(x=year, y=extent_south), data=seaice11) + geom_point() + theme_bw()
#write the code
lm_result <- lm(extent_south ~ year, data = seaice11)
summary(lm_result)
##
## Call:
## lm(formula = extent_south ~ year, data = seaice11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.23372 -0.18142 0.01587 0.18465 0.88814
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -14.199551 10.925576 -1.300 0.2018
## year 0.012953 0.005468 2.369 0.0232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3843 on 37 degrees of freedom
## Multiple R-squared: 0.1317, Adjusted R-squared: 0.1082
## F-statistic: 5.611 on 1 and 37 DF, p-value: 0.02318
#write the code here
x<- I(seaice11$year - 1979)
y<- seaice11$extent_south
N<- length(seaice11$year)
lm_result <- lm(y ~ x)
summary(lm_result)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.23372 -0.18142 0.01587 0.18465 0.88814
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.434508 0.120755 94.692 <2e-16 ***
## x 0.012953 0.005468 2.369 0.0232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3843 on 37 degrees of freedom
## Multiple R-squared: 0.1317, Adjusted R-squared: 0.1082
## F-statistic: 5.611 on 1 and 37 DF, p-value: 0.02318
lm_alpha <- summary(lm_result)$coeff[1] #Intersect a
lm_beta <- summary(lm_result)$coeff[2] #Slope b
lm_sigma <- sigma(lm_result) #Residual error S
stan_data1 <- list(N = N, x = x, y = y)
#write the code
write("// Stan model for simple linear regression
data {
int < lower = 1 > N; // Sample size
vector[N] x; // Predictor
vector[N] y; // Outcome
}
parameters {
real alpha; // Intercept
real beta; // Slope (regression coefficients)
real < lower = 0 > sigma; // Error SD
}
model {
y ~ normal(alpha + x * beta , sigma);
}
generated quantities {
} // The posterior predictive distribution",
"stan_model1.stan")
stan_model1 <- "stan_model1.stan"
detectCores(all.tests = FALSE, logical = TRUE)
options(mc.cores = parallel::detectCores())
#code here
result_stan1 <- stan(file = stan_model1, data = stan_data1, warmup = 500, iter = 1000, chains = 3, cores = 4, thin = 2)
result_stan1
## Inference for Stan model: stan_model1.
## 3 chains, each with iter=1000; warmup=500; thin=2;
## post-warmup draws per chain=250, total post-warmup draws=750.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## alpha 11.43 0.01 0.13 11.17 11.36 11.43 11.52 11.68 444 1.01
## beta 0.01 0.00 0.01 0.00 0.01 0.01 0.02 0.02 408 1.01
## sigma 0.40 0.00 0.05 0.32 0.36 0.39 0.42 0.51 558 1.00
## lp__ 16.31 0.07 1.37 12.97 15.83 16.65 17.27 17.71 386 1.01
##
## Samples were drawn using NUTS(diag_e) at Thu Oct 31 22:07:06 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
#code here
posterior_esti <- extract(result_stan1)
str(posterior_esti)
## List of 4
## $ alpha: num [1:750(1d)] 11.4 11.6 11.5 11.4 11.4 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
## $ beta : num [1:750(1d)] 0.01716 0.00592 0.0103 0.01213 0.01162 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
## $ sigma: num [1:750(1d)] 0.364 0.365 0.394 0.388 0.463 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
## $ lp__ : num [1:750(1d)] 16.2 16.7 17.5 17.8 16.2 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
#code here
plot(y ~ x, pch = 20)
abline(lm_result, col = 3, lty = 2, lw = 3)
abline( mean(posterior_esti$alpha), mean(posterior_esti$beta), col = 3, lw = 2)
#code here
plot(y ~ x, pch = 20)
for (i in 1:500) {
abline(posterior_esti$alpha[i], posterior_esti$beta[i], col = "brown", lty = 1)
}
abline(mean(posterior_esti$alpha), mean(posterior_esti$beta), col = 4, lw = 2)
#code here
write("// Stan model for simple linear regression
data {
int < lower = 1 > N; // Sample size
vector[N] x; // Predictor
vector[N] y; // Outcome
}
parameters {
real alpha; // Intercept
real beta; // Slope (regression coefficients)
real < lower = 0 > sigma; // Error SD
}
model {
alpha ~ normal(10, 0.1);
beta ~ normal(1, 0.1);
y ~ normal(alpha + x * beta , sigma);
}
generated quantities {
} // The posterior predictive distribution",
"stan_model2.stan")
stan_model2 <- "stan_model2.stan"
result_stan2 <- stan(stan_model2, data = stan_data1, warmup = 500, iter = 2000, chains = 4, cores = 2, thin = 2)
posterior2 <- extract(result_stan2)
plot(y ~ x, pch = 20)
abline(lm_alpha, lm_beta, col = 3, lty = 2, lw = 2)
abline(mean(posterior2$alpha), mean(posterior2$beta), col = 3, lw = 2)
abline(mean(posterior_esti$alpha), mean(posterior_esti$beta), col = 4, lw = 3)
The model is not fit when we changed the priors. Therefore, selecting the correct priors is necessary.
#code here
plot(posterior_esti$alpha, type = "l" , col = "red")
plot(posterior_esti$beta, type = "l", col = "blue")
plot(posterior_esti$sigma, type = "l", col = "green")
\(\alpha\), \(\beta\), \(\sigma\)
par(mfrow = c(1,3))
plot(density(posterior_esti$alpha), main = "Alpha", col = "red")
abline(v = lm_alpha, col = 3, lty = 2)
plot(density(posterior_esti$beta), main = "Beta", col = "blue")
abline(v = lm_beta, col = 3, lty = 2)
plot(density(posterior_esti$sigma), main = "Sigma", col = "green")
abline(v = lm_sigma, col = 3, lty = 2)
traceplot(result_stan1)
stan_dens(result_stan1)
stan_hist(result_stan1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot(result_stan1, show_density = FALSE, ci_level = 0.5, outer_level = 0.95, fill_color = "yellow")
## ci_level: 0.5 (50% intervals)
## outer_level: 0.95 (95% intervals)
#Posterior Accuracy Check
write("// Stan model for simple linear regression
data {
int < lower = 1 > N; // Sample size
vector[N] x; // Predictor
vector[N] y; // Outcome
}
parameters {
real alpha; // Intercept
real beta; // Slope (regression coefficients)
real < lower = 0 > sigma; // Error SD
}
model {
y ~ normal(x * beta + alpha, sigma);
}
generated quantities {
real y_rep[N];
for (n in 1:N) {
y_rep[n] = normal_rng(x[n] * beta + alpha, sigma);
}
}",
"stan_model_pc.stan")
stan_model_new <- "stan_model_pc.stan"
result_fit <- stan(stan_model_new, data = stan_data1, iter = 2000, chains = 4, cores = 4, thin = 2)
y_rep <- as.matrix(result_fit, pars = "y_rep")
dim(y_rep)
## [1] 2000 39
ppc_dens_overlay(y, y_rep[1:300, ])
Stan model results indicate the model is a correct predictor with the result from posterior prediction.