Age-period-cohort models

In this example, we use data from the Integrated Health Interview Series to fit an Age-Period-Cohort, or APC model. Discussion of these models can be found in the recent text on the subject.

Below we fit the model using INLA. APC models are becoming more and more popular in demographic research, becuase it allows researchers to measure three aspects of health and behavioral outcomes that are affected by age, cohort differences, and period effects.

The problem with the APC model is the lack of identifiability. This means, that if you know someone’s age, and the year they were surveyed, they you automatically know their cohort (birth year). This generates a “rank deficiency” problem in models, because of linear dependence among variables.

Data and recodes

Here, we load the IHIS data and do some recodes.

library(haven)
library(car)
## Loading required package: carData
library(INLA)
## Loading required package: Matrix
## Loading required package: sp
## This is INLA_18.07.12 built 2018-07-12 11:05:18 UTC.
## See www.r-inla.org/contact-us for how to get help.
library(ipumsr)
library(ggplot2)
ddi<-read_ipums_ddi("~/Google Drive/classes/dem7473/data/nhis_00011.xml")
dat<- read_ipums_micro(ddi)
## Use of data from NHIS is subject to conditions including that users should cite
## the data appropriately. Use command `ipums_conditions()` for more details.
dat<-zap_labels(dat)
names(dat)<-tolower(names(dat))

dat<-dat[dat$age>=18,]

Recodes

dat$goodhealth<-Recode(dat$health, recodes="1:3=0; 4:5=1; else=NA")

dat$bmi_clean<-ifelse(dat$bmicalc%in%c(0,996),NA, dat$bmicalc)

dat$obese<-ifelse(dat$bmi_clean>=30,1,0)

dat$hisp<-ifelse(dat$hispeth==10, 0, ifelse(dat$hispeth %in%c(90:93), NA, 1))

dat$race<-Recode(dat$racea, recodes="100=1; 200=2; 300:340=3; 400:434=4; 500:600=5; else=NA")

dat$race_eth<-factor(ifelse(dat$hisp==1, "Hispanic",
                     ifelse(dat$hisp==0&dat$race==1, "NHWhite", 
                            ifelse(dat$hisp==0&dat$race==2, "NHBlack",
                                   ifelse(dat$hisp==0&dat$race==3, "NHNAHN",
                                          ifelse(dat$hisp==0&dat$race==4, "NHAsian", 
                                                 ifelse(dat$hisp==0&dat$race==5, "NHAsian", NA)))))))

dat$educ<-Recode(dat$educrec2, recodes="10:41='lths'; 42='hs';50:53='somecoll'; 54:60='colledu'; else=NA", as.factor=T, levels = c("lths", "hs", "somecoll", "colledu"))

dat$male<-Recode(dat$sex, recodes="1=1;2=0")

dat$agec<-cut(dat$age, breaks =seq(15, 85, 5), include.lowest = T)

dat$currsmoke<-Recode(dat$smokestatus2, recodes="0=NA;90=NA; 10:13=1; else=0", as.factor = F)

dat$currdrink<-Recode(dat$alcstat1, recodes="1:2='former'; 3='current'; else=NA", as.factor = T)

dat$birth_year<-ifelse(dat$birthyr>=9997, NA, dat$birthyr)

dat$age2<-dat$age^2

dat<-subset(dat, complete.cases(goodhealth,bmi_clean, educ, currsmoke, currdrink, birth_year))

dat$bmiz<- scale(dat$bmi_clean)

dat$cohort<- cut(dat$birth_year, breaks=seq(1915, 2000, 5))

set.seed(1115)
#take a sample of subjects, because the NHIS is huge.
#Subset to only 100000 cases

samps<-sample(1:nrow(dat), size = 100000, replace = F) 
dat<- dat[samps,]

Simple descriptives

Here are some descriptive plots that illustrate the variation by age, period and cohort

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
dat%>%
  filter(age>18)%>%
  group_by(cohort)%>%
  summarise(meanbmi=mean(bmi_clean, na.rm=T))%>%
  ggplot(aes(y=meanbmi, x=cohort))+geom_line(aes(group=1))+ggtitle(label = "Average BMI by cohort", subtitle = "IHIS by IPUMS")+theme(axis.text.x = element_text(angle = 90, hjust = 1))

dat%>%
  filter(age>18)%>%
  group_by(year)%>%
  summarise(meanbmi=mean(bmi_clean, na.rm=T))%>%
  ggplot(aes(y=meanbmi, x=year))+geom_line(aes(group=1))+ggtitle(label = "Average BMI by year", subtitle = "IHIS by IPUMS")

dat%>%
  filter(age>18)%>%
  group_by(agec)%>%
  summarise(meanbmi=mean(bmi_clean, na.rm=T))%>%
  ggplot(aes(y=meanbmi, x=agec))+geom_line(aes(group=1))+ggtitle(label = "Average BMI by age", subtitle = "IHIS by IPUMS")

dat%>%
  filter(age>18)%>%
  group_by(year, cohort)%>%
  summarise(meanbmi=mean(bmi_clean, na.rm=T))%>%
  ggplot(aes(y=meanbmi, x=year))+geom_line(aes(group=cohort, color=cohort))+ggtitle(label = "Average BMI by year and cohort", subtitle = "IHIS by IPUMS")

Complexities of the APC model

To get around this, you cannot specify all three effects in the same way. It is customary to have period and cohort random effects and age as a continous predictor.

The APC model basically has two random effects, one for period (year of survey) and a second random effect for cohort (birth year). If we have a continuous outcome, our model structure would be :

\[y_{ijk} = \mu_{jk} +\sum_c \beta_c x_{ik} + e_{ij}\] with \[\mu_{jkl} = \mu + u_j + \tau_k \]

\[u_j \sim N(0, \sigma^2_u)\]

\[\tau_k \sim N(0, \sigma^2_v)\] Where the \(u\) and \(\tau\) effects are for the respective period and cohort random effects.

Hierarchical models with time as a grouping variable

The APC model is a demographic example of a model that is hierarchical in time. Meaning, you have repreated observations either within period (survey year) or cohort(birth year).

In other fields, temporal data, or time-series data, are often smoothed with some form of moving average model (ARIMA, ARMA, MA, etc.). In the Bayesian modeling world, these models exist as well, and sometimes we want to get a smoothed picture of what our annual variation in an outcome looks like. This is useful because it can be easier to identify time trends within data if some of the year to year noise is removed.

Random walk models

Often time changes are actually slower than an annual rate might show. In such situations, a time lagged, or time averaged model can be useful. One type of model that does this is a temporal random walk model. We would normally specify a random effect as:

\[z_t \sim Normal(0, \sigma_z)\]

where each year’s average can fluctuate randomly around the overall mean. This moves too fast some times though, and by lagging the time effect you can see slower time trends. Instead, a first order random walk effect is written:

\[z_t |z_{t-1} \sim Normal(z_{t-1}, \sigma_z)\]

Where the mean for a given year is now provided by the mean of the preceeding year. Higher order random walks are also used, but generally a RW-2 model is a high as most people go. In this model, the mean would be a combination of the previous two years worth of data.

These models are both called Nonparametric regressions, becuase they’re not depending on any parameters that tell the model how smooth to be. A parametric approach would be a first or second order autoregressive model, where the random effect would be specified as:

\[z_t \sim Normal(\rho z_{t-1}, \sigma_z)\] where the \(\rho\) parameter measures the strength of the association with the previous time’s value.

Using INLA to fit the APC model

On reason INLA is so nice, is that it fits models with correlations very easily. Below, we will first fit the APC model without any temporal smoothing, then we will use a random walk prior for the period effects.

Model trickery

In the example below, we first fit a lmer model to get starting points for the Bayesian model parameters. This is called initializing the model, meaning you give it a reasonable starting value. Complicated models sometimes have a hard time finding their way, but by starting the model in the right neighborhood, this can go faster.

library(lme4)
#user lmer to find good starting values for INLA
fit_in1.mer0<- lmer(bmiz~ race_eth+educ+male+currsmoke+currdrink+agec+(1|year)+(1|cohort), data=dat)


#INLA parameterizes variances on the log-precision scale, so if we take log(1/variance), we can get that number and feed it to INLA
vc0<-as.data.frame(VarCorr(fit_in1.mer0))
vc0$init<-log(1/vc0$vcov); vc0
#Feed the init values to the hyperparameter inits in each f()
#IID random effects for period and cohort
fit_in0<- inla(bmiz~ race_eth+educ+male+currsmoke+currdrink+agec+f(year, model="iid",hyper = list(prec=list(initial=vc0$init[2])) )+f(cohort, model="iid",hyper = list(prec=list(initial=vc0$init[1]))), family = "gaussian",
               data=dat,
               num.threads = 2,
               verbose = F,
                control.family = list(hyper = list(prec=list(initial=vc0$init[3]))),
               control.compute = list(waic=T))


# cohort as smooth RW1 random effect
fit_in2<- inla(bmiz~ race_eth+educ+male+currsmoke+currdrink+agec+f(year, model="iid",hyper = list(prec=list(initial=vc0$init[2])) )+f(cohort, model="rw1",hyper = list(prec=list(initial=vc0$init[1]))),
               family = "gaussian",
               data=dat,
               num.threads = 2, 
               verbose = F, 
               control.family = list(hyper = list(prec=list(initial=vc0$init[3]))),
               control.compute = list(waic=T))
gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  5106578 272.8    8139580  434.8   8139580  434.8
## Vcells 57572489 439.3  134804081 1028.5 140337586 1070.7
summary(fit_in0)
## 
## Call:
## c("inla(formula = bmiz ~ race_eth + educ + male + currsmoke + currdrink + ",  "    agec + f(year, model = \"iid\", hyper = list(prec = list(initial = vc0$init[2]))) + ",  "    f(cohort, model = \"iid\", hyper = list(prec = list(initial = vc0$init[1]))), ",  "    family = \"gaussian\", data = dat, verbose = F, control.compute = list(waic = T), ",  "    control.family = list(hyper = list(prec = list(initial = vc0$init[3]))), ",  "    num.threads = 2)")
## 
## Time used:
##  Pre-processing    Running inla Post-processing           Total 
##          6.6412        197.2181          4.8470        208.7064 
## 
## Fixed effects:
##                    mean     sd 0.025quant 0.5quant 0.975quant    mode kld
## (Intercept)     -0.4101 0.0307    -0.4694  -0.4105    -0.3486 -0.4112   0
## race_ethNHAsian -0.4933 0.0164    -0.5256  -0.4933    -0.4611 -0.4933   0
## race_ethNHBlack  0.1740 0.0111     0.1521   0.1740     0.1958  0.1740   0
## race_ethNHNAHN   0.1900 0.0386     0.1143   0.1900     0.2656  0.1900   0
## race_ethNHWhite -0.0933 0.0090    -0.1110  -0.0933    -0.0756 -0.0933   0
## educhs          -0.0480 0.0099    -0.0675  -0.0480    -0.0285 -0.0480   0
## educsomecoll    -0.0665 0.0102    -0.0866  -0.0665    -0.0464 -0.0665   0
## educcolledu     -0.2950 0.0109    -0.3165  -0.2950    -0.2735 -0.2950   0
## male             0.1296 0.0062     0.1175   0.1296     0.1417  0.1296   0
## currsmoke       -0.1708 0.0078    -0.1862  -0.1708    -0.1554 -0.1708   0
## currdrinkformer  0.0606 0.0067     0.0473   0.0606     0.0738  0.0606   0
## agec(20,25]      0.2480 0.0203     0.2080   0.2481     0.2878  0.2481   0
## agec(25,30]      0.4141 0.0225     0.3696   0.4142     0.4578  0.4145   0
## agec(30,35]      0.5125 0.0243     0.4642   0.5128     0.5593  0.5135   0
## agec(35,40]      0.5889 0.0250     0.5386   0.5893     0.6366  0.5904   0
## agec(40,45]      0.6224 0.0251     0.5716   0.6230     0.6702  0.6242   0
## agec(45,50]      0.6559 0.0255     0.6040   0.6565     0.7042  0.6579   0
## agec(50,55]      0.6907 0.0263     0.6369   0.6915     0.7403  0.6930   0
## agec(55,60]      0.6912 0.0271     0.6356   0.6921     0.7421  0.6939   0
## agec(60,65]      0.6935 0.0278     0.6362   0.6944     0.7457  0.6962   0
## agec(65,70]      0.6343 0.0281     0.5766   0.6352     0.6873  0.6368   0
## agec(70,75]      0.5347 0.0283     0.4773   0.5353     0.5887  0.5365   0
## agec(75,80]      0.4281 0.0291     0.3694   0.4285     0.4843  0.4293   0
## agec(80,85]      0.1526 0.0290     0.0944   0.1529     0.2091  0.1535   0
## 
## Random effects:
## Name   Model
##  year   IID model 
## cohort   IID model 
## 
## Model hyperparameters:
##                                             mean        sd 0.025quant
## Precision for the Gaussian observations    1.092    0.0049      1.082
## Precision for year                       191.336   71.4821     85.634
## Precision for cohort                    1874.274 1348.3356    464.830
##                                         0.5quant 0.975quant     mode
## Precision for the Gaussian observations    1.092      1.101    1.092
## Precision for year                       180.190    362.028  159.135
## Precision for cohort                    1511.340   5419.306 1022.548
## 
## Expected number of effective parameters(std dev): 48.47(1.585)
## Number of equivalent replicates : 2063.16 
## 
## Watanabe-Akaike information criterion (WAIC) ...: 275079.55
## Effective number of parameters .................: 50.61
## 
## Marginal log-Likelihood:  -137748.20 
## Posterior marginals for linear predictor and fitted values computed
summary(fit_in2) 
## 
## Call:
## c("inla(formula = bmiz ~ race_eth + educ + male + currsmoke + currdrink + ",  "    agec + f(year, model = \"iid\", hyper = list(prec = list(initial = vc0$init[2]))) + ",  "    f(cohort, model = \"rw1\", hyper = list(prec = list(initial = vc0$init[1]))), ",  "    family = \"gaussian\", data = dat, verbose = F, control.compute = list(waic = T), ",  "    control.family = list(hyper = list(prec = list(initial = vc0$init[3]))), ",  "    num.threads = 2)")
## 
## Time used:
##  Pre-processing    Running inla Post-processing           Total 
##          6.3700        352.4757          5.1453        363.9909 
## 
## Fixed effects:
##                    mean     sd 0.025quant 0.5quant 0.975quant    mode kld
## (Intercept)     -0.3871 0.0421    -0.4668  -0.3882    -0.3003 -0.3901   0
## race_ethNHAsian -0.4929 0.0162    -0.5248  -0.4929    -0.4610 -0.4929   0
## race_ethNHBlack  0.1741 0.0110     0.1525   0.1741     0.1957  0.1741   0
## race_ethNHNAHN   0.1903 0.0381     0.1154   0.1903     0.2651  0.1903   0
## race_ethNHWhite -0.0930 0.0089    -0.1105  -0.0930    -0.0756 -0.0930   0
## educhs          -0.0483 0.0098    -0.0675  -0.0483    -0.0290 -0.0483   0
## educsomecoll    -0.0668 0.0101    -0.0867  -0.0668    -0.0470 -0.0668   0
## educcolledu     -0.2955 0.0108    -0.3167  -0.2955    -0.2742 -0.2955   0
## male             0.1295 0.0061     0.1176   0.1295     0.1415  0.1295   0
## currsmoke       -0.1711 0.0078    -0.1863  -0.1711    -0.1559 -0.1711   0
## currdrinkformer  0.0607 0.0067     0.0476   0.0607     0.0738  0.0607   0
## agec(20,25]      0.2398 0.0202     0.2000   0.2399     0.2792  0.2401   0
## agec(25,30]      0.3961 0.0233     0.3492   0.3964     0.4411  0.3970   0
## agec(30,35]      0.4865 0.0273     0.4312   0.4870     0.5385  0.4880   0
## agec(35,40]      0.5575 0.0312     0.4938   0.5582     0.6168  0.5595   0
## agec(40,45]      0.5866 0.0351     0.5148   0.5875     0.6532  0.5890   0
## agec(45,50]      0.6170 0.0392     0.5364   0.6181     0.6911  0.6199   0
## agec(50,55]      0.6504 0.0437     0.5597   0.6517     0.7326  0.6540   0
## agec(55,60]      0.6487 0.0483     0.5484   0.6503     0.7392  0.6530   0
## agec(60,65]      0.6508 0.0530     0.5403   0.6527     0.7498  0.6557   0
## agec(65,70]      0.5927 0.0573     0.4732   0.5947     0.6999  0.5979   0
## agec(70,75]      0.4955 0.0615     0.3676   0.4975     0.6109  0.5007   0
## agec(75,80]      0.3932 0.0661     0.2559   0.3953     0.5172  0.3987   0
## agec(80,85]      0.1189 0.0700    -0.0265   0.1212     0.2504  0.1247   0
## 
## Random effects:
## Name   Model
##  year   IID model 
## cohort   RW1 model 
## 
## Model hyperparameters:
##                                             mean       sd 0.025quant
## Precision for the Gaussian observations    1.117   0.0004      1.116
## Precision for year                       136.419  35.6202     64.852
## Precision for cohort                    1523.214 649.5906    621.267
##                                         0.5quant 0.975quant     mode
## Precision for the Gaussian observations    1.117      1.118    1.117
## Precision for year                       138.810    195.390  148.807
## Precision for cohort                    1402.635   3136.808 1189.358
## 
## Expected number of effective parameters(std dev): 47.98(1.206)
## Number of equivalent replicates : 2084.18 
## 
## Watanabe-Akaike information criterion (WAIC) ...: 275094.80
## Effective number of parameters .................: 49.21
## 
## Marginal log-Likelihood:  -137758.60 
## Posterior marginals for linear predictor and fitted values computed

Plot some of the effects:

fit_in2$summary.random$year%>%
  ggplot(aes(x=ID, y=mean))+geom_line(aes(group=1))+geom_line(aes(x=ID, y=`0.025quant`), lty=2)+geom_line(aes(x=ID, y=`0.975quant`), lty=2)+ggtitle("Period random effect- IID specification")+ylab("Period Random Effect (IID)")+xlab("Year")

fit_in2$summary.random$cohort%>%
  ggplot(aes(x=ID, y=mean))+geom_line(aes(group=1))+geom_line(aes(x=ID, y=`0.025quant`, group=1), lty=2)+geom_line(aes(x=ID, y=`0.975quant`, group=1), lty=2)+ggtitle("Cohort random effect- RW1 specification")+ylab("Cohort Random Effect (IID)")+xlab("Cohort")

Plot the marginals for the variance terms

library(brinla)
m2<- inla.tmarginal(
        function(x) 1/x,
        fit_in2$marginals.hyperpar$`Precision for year`)

m3<- inla.tmarginal(
        function(x) 1/x,
        fit_in2$marginals.hyperpar$`Precision for cohort`)

plot(m2, type="l", main="Posterior distibution for year variance")

plot(m3, type="l", main="Posterior distibution for cohort variance")

bri.hyperpar.plot(fit_in2, together = F)