Setup

library(data.table)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()     masks data.table::between()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ dplyr::first()       masks data.table::first()
## ✖ lubridate::hour()    masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ dplyr::last()        masks data.table::last()
## ✖ lubridate::mday()    masks data.table::mday()
## ✖ lubridate::minute()  masks data.table::minute()
## ✖ lubridate::month()   masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second()  masks data.table::second()
## ✖ purrr::transpose()   masks data.table::transpose()
## ✖ lubridate::wday()    masks data.table::wday()
## ✖ lubridate::week()    masks data.table::week()
## ✖ lubridate::yday()    masks data.table::yday()
## ✖ lubridate::year()    masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(stringr)
library(tidyr)
library(magrittr)
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(readr)
library(tinytex)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
rm(list = ls())
gc()

# root dir
#data_dir <<- "C:/Users/Connor Fogal/Dropbox/predoc/noto/great_recession/cex/data/"
data_dir <<- "C:/Users/cfogal/Dropbox/predoc/noto/great_recession/cex/data/"
cpi_dir <<- paste(data_dir, "cpi/", sep='')

# functions
get_yr_avg <- function(year, yr_dir){
  # This function calculates the average yearly expenditure for a given year, and outputs
  # a valid population estimated mean
  
  year <- as.numeric(year)
  
  # Read in and stack the MTBI files
  mtbi <- lapply(dir(yr_dir, pattern = "^mtbi.*[.]csv$", full.names = TRUE),
                 fread,
                 select = c('NEWID', 'COST', 'REF_YR', "UCC", "REF_MO", "PUBFLAG"),
                 na.strings = c("", ".", "NA")) %>% 
          bind_rows() %>%
          rename_with(tolower) %>% # Change the column names to lower case
          distinct(newid, cost, ucc, ref_mo, .keep_all = TRUE) %>% # drop any duplicated expense entries
          filter(ref_yr %in% year) %>% # Filter for expenditures made in the given year and UCC's used for publication
          filter(pubflag==2) %>%
          mutate(newid = as.character(newid)) %>% # Change "newid" to a character variable
          select(-c(ref_yr, ucc, ref_mo, pubflag)) %>% # Remove unnecessary columns
          group_by(newid) %>% # Group the data by newid
          summarise(avg_exp = sum(cost)) # Get the total expenditure for each person for this year
  
  # Read in and stack the fmli files
  fmli <- lapply(dir(yr_dir, pattern = "^fmli.*[.]csv$", full.names = TRUE),
                 fread,
                 select = c('NEWID', 'FINLWT21', 'QINTRVMO', 'QINTRVYR', 'AGE_REF', 'EDUC_REF'),
                 na.strings = c("", ".", "NA")) %>% 
          bind_rows() %>%
          rename_with(tolower) %>%
          mutate(newid = as.character(newid), # clean the newid and qintrvmo vars, then generate a calendar-year population weight variable
                 qintrvmo = as.numeric(qintrvmo),
                 working_age = ifelse(age_ref >= 16 & age_ref <= 64, 1, 0), # create indicator if in working age
                 prime_age = ifelse(age_ref >= 25 & age_ref <= 54, 1, 0), # create indicator if in prime age
                 elderly = ifelse(age_ref > 64, 1, 0),
                 hs_less = ifelse(educ_ref <=12, 1, 0),
                 more_hs = ifelse(educ_ref > 12, 1, 0)) %>% # create indicator if elderly
          select(-c(qintrvyr, qintrvmo, age_ref)) # drop unneeded vars
  
  working_age <- get_group_avg(fmli %>% filter(working_age==1), mtbi) # get an average personal expenditure for working age group in this year
  prime_age <- get_group_avg(fmli %>% filter(prime_age==1), mtbi) # get an average personal expenditure for prime age group in this year
  elderly <- get_group_avg(fmli %>% filter(elderly==1), mtbi) # get an average personal expenditure for elderly group in this year
  hs <- get_group_avg(fmli %>% filter(hs_less==1), mtbi) # get an average personal expenditure for group with less than hs education
  college <- get_group_avg(fmli %>% filter(more_hs==1), mtbi) # get an average personal expenditure for group with at least some college education
  
  out <- bind_rows(list(working_age, prime_age, elderly, hs, college), .id="group") %>% # combine all three groups back into same df
          mutate(year=year) # note that group 1 is working age, group 2 is prime age, group 3 is elderly, 
                            # group 4 is hs educ, and group 5 is college educ
  
  return(out)
}

get_group_avg <- function(df, mtbi){
  mean_exp <- left_join(df, mtbi, by = "newid") %>% # merge back in mbti to weigh avgs in a sec
      mutate(avg_exp = replace(avg_exp, is.na(avg_exp), 0),
             total_wt = sum(finlwt21)) %>% # replace any missing with 0 (as cex docs say)
      mutate(new_wt = finlwt21/total_wt) %>%
      summarize(pop_exp = sum(avg_exp*new_wt)/sum(new_wt))
      unlist # keep just the avg value
  
  return(mean_exp)
}

get_real_values <- function(df, cpi_file="CPIAUCSL.csv"){
  # load in the cpiaucsl doc, and prepare the cpi values to merge into the main df
  cpiu <- read.csv(paste(cpi_dir, cpi_file, sep='')) %>%
            rename(cpi = CPIAUCSL_NBD20060101,
                   date = DATE) %>%
            mutate(year = substr(date, 1, 4)) %>%
            select(-date)
  
  # merge in the cpi values for each year, and calculate real exp values
  df %<>% merge(cpiu, by='year') %>%
            mutate(real_exp = (100*pop_exp)/cpi) %>%
            select(-cpi)
  return(df) # return the new df with real values
}

get_normalized_values <- function(df, yr=2006, value="real_exp"){
  val <- df %>%
          filter(year==yr) %>%
          select(as.name(value), group) %>%
          rename(temp = as.name(value))
  
  df %<>% merge(val, by='group') %>%
            mutate(norm = (eval(as.name(value))-temp)/temp) %>%
            select(-temp)
  return(df)
}

Calculating the year averages, and preparing to plot

for (yr in 2003:2016){
  yr_dir <- paste(data_dir, as.character(yr), "/intrvw", substr(as.character(yr), 3, 4), sep="") # create year dir string
  assign(paste("exp_", as.character(yr), sep=""), get_yr_avg(yr, yr_dir)) # calc and create a df for the yr avgs of each group
}
dat <- bind_rows(exp_2003, exp_2004, exp_2005, exp_2006, exp_2007, exp_2008, exp_2009,
                 exp_2010, exp_2011, exp_2012, exp_2013, exp_2014, exp_2015, exp_2016)

dat %<>% get_real_values() %>%
          get_normalized_values()

working_age <- dat %>% filter(group==1)
prime_age <- dat %>% filter(group==2)
elderly <- dat %>% filter(group==3)
hs <- dat %>% filter(group==4)
college <- dat %>% filter(group==5)

combined_age <- dat %>% 
              filter(group==1|group==3) %>%
              mutate(age_group = NA) %>%
              mutate(age_group = case_when(group==1 ~ "Working Age",
                                           group==3 ~ "Elderly"),
                     temp1 = case_when(group==1 ~ real_exp,
                                      group==3 ~ NA),
                     temp2 = case_when(group==1 ~ NA,
                                      group==3 ~ real_exp)) %>%
              select(-group)

combined_educ <- dat %>% 
              filter(group==4|group==5) %>%
              mutate(ed_group = NA) %>%
              mutate(ed_group = case_when(group==4 ~ "HS or less",
                                          group==5 ~ "College or more"),
                     temp1 = case_when(group==4 ~ real_exp,
                                      group==5 ~ NA),
                     temp2 = case_when(group==4 ~ NA,
                                      group==5 ~ real_exp)) %>%
              select(-group)

Plots

Here, we first calculate the total expenditure for each person (or consumer unit, CU) within each year. Next, we take a weighted average across all CU’s in a given year (following the CEX guidelines) in order to get an average personal expenditure for each year. Averages are calculated at the age-group level. We make no restrictions on the type or category of expenditure here. All reported dollars are real relative to 2006 values (using the CPI-U with base value 2006). For each age group, we present three plots - (1). Average expenditure over time with unique scale (2). Average expenditure over time with same scale (for prime age, this scale does not change from (1)) (3). A plot showing percent change in average expenditure over time (relative to 2006).

options(scipen=100)
# working age
ggplot(aes(x=year, y=real_exp), data=working_age) + 
  theme_classic() + ylab("Average Expenditure") + 
  xlab("Year") + 
  geom_hline(yintercept=seq(20000, 60000, by = 10000), color='grey') + 
  geom_line(color="navy") + 
  ggtitle("Working Age Population - 16-64") + 
  scale_y_continuous(limits = c(20000, 60000), breaks = seq(20000, 60000, by = 10000)) + 
  geom_point(size=1, color="navy")

ggplot(aes(x=year, y=norm), data=working_age) + 
  theme_classic() + 
  ylab("Percent Change in Average Expenditure") + 
  xlab("Year") + ggtitle("Working Age Population - 16-64") + 
  geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') + 
  scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) + 
  geom_line(color="navy") + 
  geom_point(size=1, color="navy")

# prime age
ggplot(aes(x=year, y=real_exp), data=prime_age) + 
  theme_classic() + 
  ylab("Average Expenditure") + 
  xlab("Year") + 
  geom_hline(yintercept=seq(20000, 80000, by = 15000), color='grey') + 
  ggtitle("Prime Age Population - 25-54") + 
  geom_line(color="navy") + 
  scale_y_continuous(limits = c(20000, 80000), breaks = seq(20000, 80000, by = 15000)) + 
  geom_point(size=1, color="navy")

ggplot(aes(x=year, y=norm), data=prime_age) + 
  theme_classic() + 
  ylab("Percent Change in Average Expenditure") + 
  xlab("Year") + ggtitle("Prime Age Population - 25-54") + 
  geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') + 
  scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) + 
  geom_line(color="navy") + 
  geom_point(size=1, color="navy")

# elderly
ggplot(aes(x=year, y=real_exp), data=elderly) + 
  theme_classic() + 
  ylab("Average Expenditure") + 
  xlab("Year") + 
  ggtitle("Elderly Population - 65+") + 
  geom_hline(yintercept=seq(20000, 60000, by = 10000), color='grey') + 
  scale_y_continuous(limits = c(20000, 60000), breaks = seq(20000, 60000, by = 10000)) + 
  geom_line(color="navy") + 
  geom_point(size=1, color="navy")

ggplot(aes(x=year, y=norm), data=elderly) + 
  theme_classic() + 
  ylab("Percent Change in Average Expenditure") + 
  xlab("Year") + 
  ggtitle("Elderly Population - 65+") + 
  geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') + 
  scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) + 
  geom_line(color="navy") + 
  geom_point(size=1, color="navy")

# education
ggplot(aes(x=year, y=real_exp), data=hs) + 
  theme_classic() + 
  ylab("Average Expenditure") + 
  xlab("Year") + 
  ggtitle("High school graduate or less") + 
  geom_hline(yintercept=seq(20000, 35000, by = 5000), color='grey') + 
  scale_y_continuous(limits = c(20000, 35000), breaks = seq(20000, 35000, by = 5000)) + 
  geom_line(color="navy") + 
  geom_point(size=1, color="navy")

ggplot(aes(x=year, y=norm), data=hs) + 
  theme_classic() + 
  ylab("Percent Change in Average Expenditure") + 
  xlab("Year") + 
  ggtitle("High school graduate or less - percentage change") +
  geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') +
  scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) + 
  geom_line(color="navy") + 
  geom_point(size=1, color="navy")

ggplot(aes(x=year, y=real_exp), data=college) + 
  theme_classic() + 
  ylab("Average Expenditure") + 
  xlab("Year") + 
  ggtitle("Some college or more") + 
  geom_hline(yintercept=seq(40000, 65000, by = 5000), color='grey') + 
  scale_y_continuous(limits = c(40000, 65000), breaks = seq(40000, 65000, by = 5000)) + 
  geom_line(color="navy") + 
  geom_point(size=1, color="navy")

ggplot(aes(x=year, y=norm), data=college) + 
  theme_classic() + 
  ylab("Percent Change in Average Expenditure") + 
  xlab("Year") + 
  ggtitle("Some college or more - percentage change") + 
  geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') + 
  scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) + 
  geom_line(color="navy") + 
  geom_point(size=1, color="navy")

# combined age plots
color_palette = c("navy", 'orange')
ggplot(aes(x=year, y=real_exp, color=factor(age_group), group=factor(age_group)), data=combined_age) + 
  theme_classic() + 
  ylab("Average Expenditure") + 
  xlab("Year") + 
  geom_hline(yintercept=seq(15000, 60000, by = 15000), color='grey') + 
  geom_line() + 
  ggtitle("Working Age Population & Elderly - same scale") + 
  scale_y_continuous(limits = c(15000, 60000), breaks = seq(15000, 60000, by = 15000)) + 
  geom_point(size=1.5) + 
  scale_color_manual(values=color_palette) + 
  labs(color="Age Group") + 
  scale_x_continuous(limits=c(2003, 2016))

ggplot(aes(x=year, y=norm, color=factor(age_group), group=factor(age_group)), data=combined_age) +
  theme_classic() + 
  ylab("Percent Change in Average Expenditure") + 
  xlab("Year") + 
  ggtitle("Working Age & Elderly - percent change") + 
  geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') + 
  scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) + 
  geom_line() + 
  geom_point(size=1.5) + 
  scale_color_manual(values=color_palette) + 
  labs(color="Age Group") + 
  scale_x_continuous(limits=c(2003, 2016))

ggplot(aes(x=year, color=factor(age_group), group=factor(age_group)), data=combined_age) + 
  theme_classic() + 
  ylab("Average Expenditure") + 
  xlab("Year") + 
  geom_hline(yintercept=seq(30000, 60000, by = 10000), color='grey') + 
  geom_line(aes(y=temp2+5000), color='navy') + 
  geom_line(aes(y=temp1), color='orange') + 
  ggtitle("Working Age & Elderly - different scales") + 
  scale_y_continuous("Working age expediture", limits = c(30000, 60000), breaks = seq(30000, 60000, by = 10000), 
                     sec.axis = sec_axis(~ . + 5000, 
                                         name = "Elderly Expenditure", 
                                         breaks = seq(35000, 75000, by = 10000))) + 
  geom_point(aes(y=temp2+5000), size=1.5, color='navy') + 
  geom_point(aes(y=temp1), size=1.5, color='orange')+ 
  scale_x_continuous(limits=c(2003, 2016))+ 
  scale_color_manual(values=color_palette)
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).

# combined education plots
ggplot(aes(x=year, y=real_exp, color=factor(ed_group), group=factor(ed_group)), data=combined_educ) + 
  theme_classic() + 
  ylab("Average Expenditure") + 
  xlab("Year") + 
  geom_hline(yintercept=seq(20000, 65000, by = 10000), color='grey') + 
  geom_line() + 
  ggtitle("Education") + 
  scale_y_continuous(limits = c(20000, 65000), breaks = seq(20000, 65000, by = 10000)) + 
  geom_point(size=1.5) + 
  scale_color_manual(values=color_palette) + 
  labs(color="Education Level") + 
  scale_x_continuous(limits=c(2003, 2016))

ggplot(aes(x=year, y=norm, color=factor(ed_group), group=factor(ed_group)), data=combined_educ) + 
  theme_classic() + 
  ylab("Percent Change in Average Expenditure") + 
  xlab("Year") + 
  ggtitle("Education Level - percent change") + 
  geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') + 
  scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) + 
  geom_line() + 
  geom_point(size=1.5) + 
  scale_color_manual(values=color_palette) + 
  labs(color="Age Group") + 
  scale_x_continuous(limits=c(2003, 2016))

ggplot(aes(x=year, color=factor(ed_group), group=factor(ed_group)), data=combined_educ) + 
  theme_classic() + 
  ylab("Average Expenditure") + 
  xlab("Year") + 
  geom_hline(yintercept=seq(15000, 40000, by = 5000), color='grey') + 
  geom_line(aes(y=temp2-25000), color='navy') + 
  geom_line(aes(y=temp1), color='orange') + 
  ggtitle("Education - different scales") + 
  scale_y_continuous("High school or less avg expenditure", limits = c(15000, 40000), breaks = seq(15000, 40000, by = 5000), 
                     sec.axis = sec_axis(~ . + 25000, 
                                         name = "Some college or more avg expenditure", 
                                         breaks = seq(40000, 65000, by = 5000))) + 
  geom_point(aes(y=temp2-25000), size=1.5, color='navy') + 
  geom_point(aes(y=temp1), size=1.5, color='orange')+ 
  scale_x_continuous(limits=c(2003, 2016))+ 
  scale_color_manual(values=color_palette)
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).