10/24/2019

knitr::opts_chunk$set(echo = TRUE)
suppressPackageStartupMessages(library(rstan))
suppressPackageStartupMessages(library(gdata))
suppressPackageStartupMessages(library(bayesplot))
suppressPackageStartupMessages(library(parallel))
After our Intro to Stan lecture, I think it would be valuable to have you go through a similar exercise. Let's test a second research question.
Research question: Is sea ice extent declining in the Southern Hemisphere over time? Is the same pattern happening in the Antarctic as in the Arctic? Fit a Stan model to find out!
Make sure you follow the steps we used in class.
seaice <- read.table("seaice.csv", sep = ",", header = TRUE)
head(seaice)
## year extent_north extent_south
## 1 1979 12.328 11.700
## 2 1980 12.337 11.230
## 3 1981 12.127 11.435
## 4 1982 12.447 11.640
## 5 1983 12.332 11.389
## 6 1984 11.910 11.454
plot(extent_south ~ year, pch = 20, data = seaice)
# Fit a frequentist linear model of southern sea ice extent on year
lm1 <- lm(extent_south ~ year, data = seaice)
lm1
##
## Call:
## lm(formula = extent_south ~ year, data = seaice)
##
## Coefficients:
## (Intercept) year
## -14.19955 0.01295
summary(lm1)
##
## Call:
## lm(formula = extent_south ~ year, data = seaice)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.23372 -0.18142 0.01587 0.18465 0.88814
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -14.199551 10.925576 -1.300 0.2018
## year 0.012953 0.005468 2.369 0.0232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3843 on 37 degrees of freedom
## Multiple R-squared: 0.1317, Adjusted R-squared: 0.1082
## F-statistic: 5.611 on 1 and 37 DF, p-value: 0.02318
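To make the slope easier to interpret, it can be rescaled to a change per decade. A quick sketch, assuming the extent columns are in millions of km² (the NSIDC convention for this dataset; an assumption here, not stated in the file):

# Estimated change per decade, assuming extent is in millions of km^2 (assumption)
coef(lm1)["year"] * 10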
# Re-index the predictor so the intercept is interpretable
x <- I(seaice$year - 1978)  # x = 1 corresponds to 1979, the first year in the dataset
y <- seaice$extent_south
N <- length(seaice$year)
lm1 <- lm(y ~ x)
summary(lm1)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.23372 -0.18142 0.01587 0.18465 0.88814
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.421555 0.125490 91.015 <2e-16 ***
## x 0.012953 0.005468 2.369 0.0232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3843 on 37 degrees of freedom
## Multiple R-squared: 0.1317, Adjusted R-squared: 0.1082
## F-statistic: 5.611 on 1 and 37 DF, p-value: 0.02318
lm_alpha <- summary(lm1)$coeff[1]  # intercept
lm_beta <- summary(lm1)$coeff[2]   # slope
lm_sigma <- sigma(lm1)             # residual standard error
stan_data <- list(N = N, x = x, y = y)
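A quick sanity check that the list matches the Stan model's data block (the names and types must agree):

str(stan_data)  # expect N (a scalar) plus x and y (numeric vectors of length N)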
# Write the Stan model to a file
write("// Stan model for simple linear regression
data {
  int<lower=1> N;       // Sample size
  vector[N] x;          // Predictor
  vector[N] y;          // Outcome
}
parameters {
  real alpha;           // Intercept
  real beta;            // Slope (regression coefficient)
  real<lower=0> sigma;  // Error SD
}
model {
  y ~ normal(alpha + x * beta, sigma);
}
generated quantities {
} // Left empty for now; the posterior predictive distribution is added later",
"stan_model1.stan")
stan_model1 <- "stan_model1.stan"
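Before sampling, the file can optionally be syntax-checked; a small sketch using rstan's parser:

# Optional: parse the Stan file to catch syntax errors before compilation
rstan::stanc(file = stan_model1)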
detectCores(all.tests = FALSE, logical = TRUE)
## [1] 4
options(mc.cores = parallel::detectCores())
# Sample from the posterior with 4 chains
fit <- stan(file = stan_model1, data = stan_data, warmup = 500, iter = 1000, chains = 4, cores = 4, thin = 1)
fit
## Inference for Stan model: stan_model1.
## 4 chains, each with iter=1000; warmup=500; thin=1;
## post-warmup draws per chain=500, total post-warmup draws=2000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## alpha 11.42 0.00 0.13 11.17 11.33 11.43 11.51 11.67 1143 1
## beta 0.01 0.00 0.01 0.00 0.01 0.01 0.02 0.02 1172 1
## sigma 0.40 0.00 0.05 0.32 0.36 0.39 0.42 0.51 1145 1
## lp__ 16.29 0.05 1.24 13.08 15.69 16.61 17.19 17.72 638 1
##
## Samples were drawn using NUTS(diag_e) at Fri Oct 25 11:58:40 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
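The printed table already shows Rhat = 1 for every parameter, but the diagnostics can also be pulled out programmatically; a minimal sketch using rstan's summary method:

fit_summary <- summary(fit)$summary
round(fit_summary[, c("n_eff", "Rhat")], 2)  # Rhat near 1 indicates convergence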
# Extract the posterior draws as a named list
posterior <- extract(fit)
str(posterior)
## List of 4
## $ alpha: num [1:2000(1d)] 11.5 11.6 11.6 11.2 11.5 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
## $ beta : num [1:2000(1d)] 0.00642 0.00762 0.00882 0.01875 0.01244 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
## $ sigma: num [1:2000(1d)] 0.395 0.386 0.37 0.475 0.381 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
## $ lp__ : num [1:2000(1d)] 16.8 15.8 17.1 15.2 17.1 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
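With the draws in hand, the research question can be answered directly; a quick sketch using base R only:

quantile(posterior$beta, probs = c(0.025, 0.5, 0.975))  # 95% credible interval for the slope
mean(posterior$beta > 0)  # posterior probability of a positive (increasing) trend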
# Compare the Bayesian fit with the lm fit
plot(y ~ x, pch = 20)
abline(lm1, col = 2, lty = 2, lwd = 3)
abline(mean(posterior$alpha), mean(posterior$beta), col = 6, lwd = 2)
As the chart shows, the posterior-mean regression line is nearly identical to the lm fit. We can also plot many draws from the posterior to visualize the uncertainty in the estimated regression line.
# Overlay 500 posterior draws of the regression line
plot(y ~ x, pch = 20)
for (i in 1:500) {
  abline(posterior$alpha[i], posterior$beta[i], col = "gray", lty = 1)
}
abline(mean(posterior$alpha), mean(posterior$beta), col = 6, lwd = 2)
# Refit with tight, informative priors on the intercept and slope
write("// Stan model for simple linear regression with informative priors
data {
  int<lower=1> N;       // Sample size
  vector[N] x;          // Predictor
  vector[N] y;          // Outcome
}
parameters {
  real alpha;           // Intercept
  real beta;            // Slope (regression coefficient)
  real<lower=0> sigma;  // Error SD
}
model {
  alpha ~ normal(10, 0.1); // Tight prior on the intercept
  beta ~ normal(1, 0.1);   // Tight prior on the slope
  y ~ normal(alpha + x * beta, sigma);
}
generated quantities {
} // Still empty; replicated data are added in the next model",
"stan_model2.stan")
stan_model2 <- "stan_model2.stan"
fit2 <- stan(stan_model2, data = stan_data, warmup = 500, iter = 1000, chains = 4, cores = 2, thin = 1)
posterior2 <- extract(fit2)
plot(y ~ x, pch = 20)
abline(lm_alpha, lm_beta, col = 4, lty = 2, lwd = 2)
abline(mean(posterior2$alpha), mean(posterior2$beta), col = 3, lwd = 2)
abline(mean(posterior$alpha), mean(posterior$beta), col = 6, lwd = 3)
As the chart shows, tightening the priors (narrow normals centered away from the lm estimates) does not make the model fit any better; instead, the informative priors pull the estimates away from the values supported by the data.
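How strongly the tight priors pull the estimates is easy to quantify; a quick comparison of posterior means from the two fits:

# Posterior means under the original (flat) vs. tight priors
c(alpha_flat = mean(posterior$alpha), alpha_tight = mean(posterior2$alpha))
c(beta_flat  = mean(posterior$beta),  beta_tight  = mean(posterior2$beta))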
# Trace plots of the merged post-warmup draws
plot(posterior$alpha, type = "l")
plot(posterior$beta, type = "l")
plot(posterior$sigma, type = "l")
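bayesplot (loaded above) provides an equivalent view with the chains kept separate, which makes mixing problems easier to spot:

# Chain-separated trace plots via bayesplot
mcmc_trace(as.array(fit), pars = c("alpha", "beta", "sigma"))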
Next, compare the posterior densities of \(\alpha\), \(\beta\), and \(\sigma\) with the corresponding lm estimates (dashed vertical lines):
par(mfrow = c(1,3))
plot(density(posterior$alpha), main = "Alpha")
abline(v = lm_alpha, col = 4, lty = 2)
plot(density(posterior$beta), main = "Beta")
abline(v = lm_beta, col = 4, lty = 2)
plot(density(posterior$sigma), main = "Sigma")
abline(v = lm_sigma, col = 4, lty = 2)
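A compact alternative is bayesplot's area plot, which overlays each posterior density with a credible interval:

# Posterior densities with 95% intervals via bayesplot
mcmc_areas(as.matrix(fit), pars = c("alpha", "beta", "sigma"), prob = 0.95)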
# Posterior predictive check: simulate replicated data in generated quantities
write("// Stan model for simple linear regression with posterior predictive draws
data {
  int<lower=1> N;       // Sample size
  vector[N] x;          // Predictor
  vector[N] y;          // Outcome
}
parameters {
  real alpha;           // Intercept
  real beta;            // Slope (regression coefficient)
  real<lower=0> sigma;  // Error SD
}
model {
  y ~ normal(x * beta + alpha, sigma);
}
generated quantities {
  real y_rep[N]; // One replicated dataset per posterior draw
  for (n in 1:N) {
    y_rep[n] = normal_rng(x[n] * beta + alpha, sigma);
  }
}",
"stan_model_AC.stan")
stan_model_AC <- "stan_model_AC.stan"
fit3 <- stan(stan_model_AC, data = stan_data, iter = 1000, chains = 4, cores = 2, thin = 1)
y_rep <- as.matrix(fit3, pars = "y_rep")
dim(y_rep)
## [1] 2000 39
ppc_dens_overlay(y, y_rep[1:200, ])
Based on the chart above, the observed data (dark line) sits well within the spread of the posterior predictive replications (light lines), suggesting the model reproduces the distribution of the outcome reasonably well.
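A complementary check compares a single test statistic of the observed data against its posterior predictive distribution; for example, the mean:

# Where does the observed mean fall among the replicated means?
ppc_stat(y, y_rep, stat = "mean")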