#knitr::opts_chunk$set(echo = TRUE)
#install.packages('rstan')
#install.packages('gdata')
#install.packages('bayesplot')
#install.packages('parallel')
suppressPackageStartupMessages(library(rstan))
## Warning: package 'rstan' was built under R version 3.5.3
## Warning: package 'StanHeaders' was built under R version 3.5.3
## Warning: package 'ggplot2' was built under R version 3.5.3
suppressPackageStartupMessages(library(gdata))
## Warning: package 'gdata' was built under R version 3.5.3
suppressPackageStartupMessages(library(bayesplot))
## Warning: package 'bayesplot' was built under R version 3.5.3
suppressPackageStartupMessages(library(parallel))
After our Intro to Stan lecture I think it would be valuable to have you go through a similar exercise. Let’s test a second research question.
Research question: Is sea ice extent declining in the Southern Hemisphere over time? Is the same pattern happening in the Antarctic as in the Arctic? Fit a Stan model to find out!
Make sure you follow the steps we used in class.
#place the code here
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:gdata':
##
## combine, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#place the code here
library(pastecs)
## Warning: package 'pastecs' was built under R version 3.5.3
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
## The following objects are masked from 'package:gdata':
##
## first, last
## The following object is masked from 'package:rstan':
##
## extract
#place the code here
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.5.3
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
#place the code here
seaice <- read.csv(file = "C:/Users/linto/Desktop/HU/ANLY 505/seaice.csv")
colnames(seaice) <- c("year", "extent_north", "extent_south")
head(seaice)
## year extent_north extent_south
## 1 1979 12.328 11.700
## 2 1980 12.337 11.230
## 3 1981 12.127 11.435
## 4 1982 12.447 11.640
## 5 1983 12.332 11.389
## 6 1984 11.910 11.454
#plot data
plot(extent_north ~ year, pch = 20, data = seaice)
#write the code
lm1 <- lm (extent_north ~ year, data = seaice)
lm1
##
## Call:
## lm(formula = extent_north ~ year, data = seaice)
##
## Coefficients:
## (Intercept) year
## 120.50304 -0.05457
#write the code here
summary(lm1)
##
## Call:
## lm(formula = extent_north ~ year, data = seaice)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.49925 -0.17713 0.04898 0.16923 0.32829
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 120.503036 6.267203 19.23 <2e-16 ***
## year -0.054574 0.003137 -17.40 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2205 on 37 degrees of freedom
## Multiple R-squared: 0.8911, Adjusted R-squared: 0.8881
## F-statistic: 302.7 on 1 and 37 DF, p-value: < 2.2e-16
#write the code here
plot(extent_north ~ year, pch = 20, data = seaice)
abline(lm1, col = 2, lty = 2, lw = 3)
#write the code here
x <- I(seaice$year - 1978)
y <- seaice$extent_north
N <- length(seaice$year)
lm1 <- lm(y ~ x)
summary(lm1)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.49925 -0.17713 0.04898 0.16923 0.32829
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.555888 0.071985 174.4 <2e-16 ***
## x -0.054574 0.003137 -17.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2205 on 37 degrees of freedom
## Multiple R-squared: 0.8911, Adjusted R-squared: 0.8881
## F-statistic: 302.7 on 1 and 37 DF, p-value: < 2.2e-16
#write the code here
lm_alpha <- summary(lm1)$coeff[1]#intersect
lm_beta <- summary(lm1)$coeff[2]#slope
lm_sigma <- sigma(lm1)#residual error
stan_data <- list(N = N, x = x, y = y)
#write the code
write("// Stan model for simple linear regression
data {
int < lower = 1 > N; // Sample size
vector[N] x; // Predictor
vector[N] y; // Outcome
}
parameters {
real alpha; // Intercept
real beta; // Slope (regression coefficients)
real < lower = 0 > sigma; // Error SD
}
model {
y ~ normal(alpha + x * beta , sigma);
}
generated quantities {
} // The posterior predictive distribution",
"stan_model1.stan")
stan_model1 <- "stan_model1.stan"
detectCores(all.tests = FALSE, logical = TRUE)
options(mc.cores = parallel::detectCores())
#code here
fit <- stan(file = stan_model1, data = stan_data, warmup = 500, iter = 1000, chains = 4, cores = 4, thin = 1)
fit
## Inference for Stan model: stan_model1.
## 4 chains, each with iter=1000; warmup=500; thin=1;
## post-warmup draws per chain=500, total post-warmup draws=2000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## alpha 12.56 0.00 0.08 12.40 12.51 12.56 12.61 12.71 940 1
## beta -0.05 0.00 0.00 -0.06 -0.06 -0.05 -0.05 -0.05 1057 1
## sigma 0.23 0.00 0.03 0.18 0.21 0.23 0.24 0.29 854 1
## lp__ 37.38 0.05 1.32 33.73 36.84 37.75 38.32 38.83 717 1
##
## Samples were drawn using NUTS(diag_e) at Tue Nov 05 21:51:35 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
#code here
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages ----------------------------------------------- tidyverse 1.2.1 --
## â tibble 2.1.3 â purrr 0.3.2
## â tidyr 1.0.0 â stringr 1.3.1
## â readr 1.3.1 â forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::combine() masks gdata::combine()
## x tidyr::extract() masks pastecs::extract(), rstan::extract()
## x dplyr::filter() masks stats::filter()
## x pastecs::first() masks dplyr::first(), gdata::first()
## x kableExtra::group_rows() masks dplyr::group_rows()
## x purrr::keep() masks gdata::keep()
## x dplyr::lag() masks stats::lag()
## x pastecs::last() masks dplyr::last(), gdata::last()
posterior <- rstan::extract(fit)
str(posterior)
## List of 4
## $ alpha: num [1:2000(1d)] 12.5 12.6 12.6 12.7 12.5 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
## $ beta : num [1:2000(1d)] -0.0532 -0.0539 -0.054 -0.0643 -0.0494 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
## $ sigma: num [1:2000(1d)] 0.242 0.232 0.23 0.256 0.238 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
## $ lp__ : num [1:2000(1d)] 38.1 38.2 37.7 34.5 37.5 ...
## ..- attr(*, "dimnames")=List of 1
## .. ..$ iterations: NULL
#code here
plot(y ~ x, pch = 20)
abline(lm1, col = 2, lty = 2, lw = 3)
abline( mean(posterior$alpha), mean(posterior$beta), col = 5, lw = 2)
#code here
plot(y ~ x, pch = 20)
for (i in 1:1000) {
abline(posterior$alpha[i], posterior$beta[i], col = "gray", lty = 1)
}
abline(mean(posterior$alpha), mean(posterior$beta), col = 5, lw = 2)
#code here
write("// Stan model for simple linear regression
data {
int < lower = 1 > N; // Sample size
vector[N] x; // Predictor
vector[N] y; // Outcome
}
parameters {
real alpha; // Intercept
real beta; // Slope (regression coefficients)
real < lower = 0 > sigma; // Error SD
}
model {
alpha ~ normal(10, 0.1);
beta ~ normal(1, 0.1);
y ~ normal(alpha + x * beta , sigma);
}
generated quantities {
} // The posterior predictive distribution",
"stan_model2.stan")
stan_model2 <- "stan_model2.stan"
fit2 <- stan(stan_model2, data = stan_data, warmup = 500, iter = 1000, chains = 4, cores = 2, thin = 1)
posterior2 <- rstan::extract(fit2)
plot(y ~ x, pch = 20)
abline(lm_alpha, lm_beta, col = 4, lty = 2, lw = 2)
abline(mean(posterior2$alpha), mean(posterior2$beta), col = 3, lw = 2)
abline(mean(posterior$alpha), mean(posterior$beta), col = 6, lw = 3)
After changing and “limiting” the priors, the model seems to not fit.
#code here
plot(posterior$alpha, type = "l")
#code here
plot(posterior$beta, type = "l")
#code here
plot(posterior$sigma, type = "l")
\(\alpha\), \(\beta\), \(\sigma\)
par(mfrow = c(1,3))
plot(density(posterior$alpha), main = "Alpha")
abline(v = lm_alpha, col = 4, lty = 2)
plot(density(posterior$beta), main = "Beta")
abline(v = lm_beta, col = 4, lty = 2)
plot(density(posterior$sigma), main = "Sigma")
abline(v = lm_sigma, col = 4, lty = 2)
#code here
traceplot(fit)
#code here
stan_dens(fit) # density
#code here
stan_hist(fit) # histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#code here
write("// Stan model for simple linear regression
data {
int < lower = 1 > N; // Sample size
vector[N] x; // Predictor
vector[N] y; // Outcome
}
parameters {
real alpha; // Intercept
real beta; // Slope (regression coefficients)
real < lower = 0 > sigma; // Error SD
}
model {
y ~ normal(x * beta + alpha, sigma);
}
generated quantities {
real y_rep[N];
for (n in 1:N) {
y_rep[n] = normal_rng(x[n] * beta + alpha, sigma);
}
}",
"stan_model_pc.stan")
stan_model_pc <- "stan_model_pc.stan"
fit_pc <- stan(stan_model_pc, data = stan_data, iter = 1000, chains = 4, cores = 4, thin = 1)
y_rep <- as.matrix(fit_pc, pars = "y_rep")
dim(y_rep)
## [1] 2000 39
#code here
ppc_dens_overlay(y, y_rep[1:200, ])