Setup

library(data.table)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()     masks data.table::between()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ dplyr::first()       masks data.table::first()
## ✖ lubridate::hour()    masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ dplyr::last()        masks data.table::last()
## ✖ lubridate::mday()    masks data.table::mday()
## ✖ lubridate::minute()  masks data.table::minute()
## ✖ lubridate::month()   masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second()  masks data.table::second()
## ✖ purrr::transpose()   masks data.table::transpose()
## ✖ lubridate::wday()    masks data.table::wday()
## ✖ lubridate::week()    masks data.table::week()
## ✖ lubridate::yday()    masks data.table::yday()
## ✖ lubridate::year()    masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(stringr)
library(tidyr)
library(magrittr)
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(readr)
library(tinytex)

rm(list = ls())
gc()

# root dir
data_dir <<- "C:/Users/cfogal/Dropbox/predoc/noto/great_recession/cex/data/"
cpi_dir <<- paste(data_dir, "cpi/", sep='')

# functions
get_yr_avg <- function(year, yr_dir){
  # This function calculates the average yearly expenditure for a given year, and outputs
  # a valid population estimated mean
  
  year <- as.numeric(year)
  
  # Read in and stack the MTBI files
  mtbi <- lapply(dir(yr_dir, pattern = "^mtbi.*[.]csv$", full.names = TRUE),
                 fread,
                 select = c('NEWID', 'COST', 'REF_YR'),
                 na.strings = c("", ".", "NA")) %>% 
          bind_rows() %>%
          rename_with(tolower) %>% # Change the column names to lower case
          filter(ref_yr %in% year) %>% # Filter for expenditures made in the given year and UCC's used for publication
          mutate(newid = as.character(newid)) %>% # Change "newid" to a character variable
          select(-ref_yr) %>% # Remove unnecessary columns
          group_by(newid) %>% # Group the data by newid
          summarise(avg_exp = sum(cost)) # Get the total expenditure for each person for this year
  
  # Read in and stack the fmli files
  fmli <- lapply(dir(yr_dir, pattern = "^fmli.*[.]csv$", full.names = TRUE),
                 fread,
                 select = c('NEWID', 'FINLWT21', 'QINTRVMO', 'QINTRVYR', 'AGE_REF'),
                 na.strings = c("", ".", "NA")) %>% 
          bind_rows() %>%
          rename_with(tolower) %>%
          mutate(newid = as.character(newid), # clean the newid and qintrvmo vars, then generate a calendar-year population weight variable
                 qintrvmo = as.numeric(qintrvmo),
                 popwt = ifelse(qintrvmo %in% 1:3 & qintrvyr %in% year, (qintrvmo - 1) / 3 * finlwt21 / 4, # calculate the popwt according to how the cex docs say
                                ifelse(qintrvyr %in% (year + 1), (4 - qintrvmo) / 3 *finlwt21 / 4, finlwt21 / 4)), 
                 working_age = ifelse(age_ref >= 16 & age_ref <= 64, 1, 0), # create indicator if in working age
                 prime_age = ifelse(age_ref >= 25 & age_ref <= 54, 1, 0), # create indicator if in prime age
                 elderly = ifelse(age_ref >= 64, 1, 0)) %>% # create indicator if elderly
          select(-c(qintrvyr, qintrvmo, age_ref)) # drop unneeded vars
  
  working_age <- get_group_avg(fmli %>% filter(working_age==1), mtbi) # get an average 'average personal expenditure' for working age group in this year
  prime_age <- get_group_avg(fmli %>% filter(prime_age==1), mtbi) # get an average 'average personal expenditure' for prime age group in this year
  elderly <- get_group_avg(fmli %>% filter(elderly==1), mtbi) # get an average 'average personal expenditure' for elderly group in this year
  
  out <- bind_rows(list(working_age, prime_age, elderly), .id="group") %>% # combine all three groups back into same df
          mutate(year=year) # note that group 1 is working age, group 2 is prime age
                            # and group 3 is elderly
  
  return(out)
}

get_group_avg <- function(df, mtbi){
  mean_exp <- left_join(df, mtbi, by = "newid") %>% # merge back in mbti to weigh avgs in a sec
      mutate(avg_exp = replace(avg_exp, is.na(avg_exp), 0)) %>% # replace any missing with 0 (as cex docs say)
      summarise(pop_exp = sum(avg_exp * finlwt21) / sum(popwt)) %>% # calculate group avg with weights
      unlist # keep just the avg value
  
  return(mean_exp)
}

get_real_values <- function(df, cpi_file="CPIAUCSL.csv"){
  # load in the cpiaucsl doc, and prepare the cpi values to merge into the main df
  cpiu <- read.csv(paste(cpi_dir, cpi_file, sep='')) %>%
            rename(cpi = CPIAUCSL_NBD20060101,
                   date = DATE) %>%
            mutate(year = substr(date, 1, 4)) %>%
            select(-date)
  
  # merge in the cpi values for each year, and calculate real exp values
  df %<>% merge(cpiu, by='year') %>%
            mutate(real_exp = (100*pop_exp)/cpi) %>%
            select(-cpi)
  return(df) # return the new df with real values
}

get_normalized_values <- function(df, yr=2006, value="real_exp"){
  val <- df %>%
          filter(year==yr) %>%
          select(as.name(value), group) %>%
          rename(temp = as.name(value))
  
  df %<>% merge(val, by='group') %>%
            mutate(norm = (eval(as.name(value))-temp)/temp) %>%
            select(-temp)
  return(df)
}

Calculating the year averages, and preparing to plot

for (yr in 2003:2016){
  yr_dir <- paste(data_dir, as.character(yr), "/intrvw", substr(as.character(yr), 3, 4), sep="") # create year dir string
  assign(paste("exp_", as.character(yr), sep=""), get_yr_avg(yr, yr_dir)) # calc and create a df for the yr avgs of each group
}
dat <- bind_rows(exp_2003, exp_2004, exp_2005, exp_2006, exp_2007, exp_2008, exp_2009,
                 exp_2010, exp_2011, exp_2012, exp_2013, exp_2014, exp_2015, exp_2016)

dat %<>% get_real_values() %>%
          get_normalized_values()

working_age <- dat %>% filter(group==1)
prime_age <- dat %>% filter(group==2)
elderly <- dat %>% filter(group==3)

Plots

Here, we first calculate the total expenditure for each person (or consumer unit, CU) within each year. Next, we take a weighted average across all CU’s in a given year (following the CEX guidelines) in order to get an average personal expenditure for each year. Averages are calculated at the age-group level. We make no restrictions on the type or category of expenditure here. All reported dollars are real relative to 2006 values (using the CPI-U with base value 2006). For each age group, we present three plots - (1). Average expenditure over time with unique scale (2). Average expenditure over time with same scale (for prime age, this scale does not change from (1)) (3). A plot showing percent change in average expenditure over time (relative to 2006).

# working age
ggplot(aes(x=year, y=real_exp), data=working_age) + theme_classic() + ylab("Average Expenditure") + xlab("Year") + geom_hline(yintercept=seq(180000, 280000, by = 20000), color='grey') + geom_line(color="navy") + ggtitle("Working Age Population - 16-64") + scale_y_continuous(limits = c(180000, 280000), breaks = seq(180000, 280000, by = 20000)) + geom_point(size=1, color="navy")

ggplot(aes(x=year, y=real_exp), data=working_age) + theme_classic() + ylab("Average Expenditure") + xlab("Year") + geom_hline(yintercept=seq(180000, 300000, by = 20000), color='grey') + ggtitle("Working Age Population - 16-64") + geom_line(color="navy") + scale_y_continuous(limits = c(180000, 300000), breaks = seq(180000, 300000, by = 20000)) + geom_point(size=1, color="navy")

ggplot(aes(x=year, y=norm), data=working_age) + theme_classic() + ylab("Percent Change in Average Expenditure") + xlab("Year") + ggtitle("Working Age Population - 16-64") + geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') + scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) + geom_line(color="navy") + geom_point(size=1, color="navy")

# prime age
ggplot(aes(x=year, y=real_exp), data=prime_age) + theme_classic() + ylab("Average Expenditure") + xlab("Year") + geom_hline(yintercept=seq(180000, 300000, by = 20000), color='grey') + ggtitle("Prime Age Population - 25-54") + geom_line(color="navy") + scale_y_continuous(limits = c(180000, 300000), breaks = seq(180000, 300000, by = 20000)) + geom_point(size=1, color="navy")

ggplot(aes(x=year, y=real_exp), data=prime_age) + theme_classic() + ylab("Average Expenditure") + xlab("Year") + geom_hline(yintercept=seq(180000, 300000, by = 20000), color='grey') + ggtitle("Prime Age Population - 25-54") + scale_y_continuous(limits = c(180000, 300000), breaks = seq(180000, 300000, by = 20000)) + geom_line(color="navy") + geom_point(size=1, color="navy")

ggplot(aes(x=year, y=norm), data=prime_age) + theme_classic() + ylab("Percent Change in Average Expenditure") + xlab("Year") + ggtitle("Prime Age Population - 25-54") + geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') + scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) + geom_line(color="navy") + geom_point(size=1, color="navy")

# elderly
ggplot(aes(x=year, y=real_exp), data=elderly) + theme_classic() + ylab("Average Expenditure") + xlab("Year") + ggtitle("Elderly Population - 65+") + geom_hline(yintercept=seq(180000, 260000, by = 20000), color='grey') + scale_y_continuous(limits = c(180000, 260000), breaks = seq(180000, 260000, by = 20000)) + geom_line(color="navy") + geom_point(size=1, color="navy")

ggplot(aes(x=year, y=real_exp), data=elderly) + theme_classic() + ylab("Average Expenditure") + xlab("Year") + ggtitle("Elderly Population - 65+") + geom_hline(yintercept=seq(180000, 300000, by = 20000), color="grey") + scale_y_continuous(limits = c(180000, 300000), breaks = seq(180000, 300000, by = 20000)) + geom_line(color="navy") + geom_point(size=1, color="navy")

ggplot(aes(x=year, y=norm), data=elderly) + theme_classic() + ylab("Percent Change in Average Expenditure") + xlab("Year") + ggtitle("Elderly Population - 65+") + geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') + scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) + geom_line(color="navy") + geom_point(size=1, color="navy")