library(data.table)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(stringr)
library(tidyr)
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(readr)
library(tinytex)
rm(list = ls())
gc()
# root dir
data_dir <<- "C:/Users/cfogal/Dropbox/predoc/noto/great_recession/cex/data/"
cpi_dir <<- paste(data_dir, "cpi/", sep='')
# functions
get_yr_avg <- function(year, yr_dir){
# This function calculates the average yearly expenditure for a given year, and outputs
# a valid population estimated mean
year <- as.numeric(year)
# Read in and stack the MTBI files
mtbi <- lapply(dir(yr_dir, pattern = "^mtbi.*[.]csv$", full.names = TRUE),
fread,
select = c('NEWID', 'COST', 'REF_YR'),
na.strings = c("", ".", "NA")) %>%
bind_rows() %>%
rename_with(tolower) %>% # Change the column names to lower case
filter(ref_yr %in% year) %>% # Filter for expenditures made in the given year and UCC's used for publication
mutate(newid = as.character(newid)) %>% # Change "newid" to a character variable
select(-ref_yr) %>% # Remove unnecessary columns
group_by(newid) %>% # Group the data by newid
summarise(avg_exp = sum(cost)) # Get the total expenditure for each person for this year
# Read in and stack the fmli files
fmli <- lapply(dir(yr_dir, pattern = "^fmli.*[.]csv$", full.names = TRUE),
fread,
select = c('NEWID', 'FINLWT21', 'QINTRVMO', 'QINTRVYR', 'AGE_REF'),
na.strings = c("", ".", "NA")) %>%
bind_rows() %>%
rename_with(tolower) %>%
mutate(newid = as.character(newid), # clean the newid and qintrvmo vars, then generate a calendar-year population weight variable
qintrvmo = as.numeric(qintrvmo),
popwt = ifelse(qintrvmo %in% 1:3 & qintrvyr %in% year, (qintrvmo - 1) / 3 * finlwt21 / 4, # calculate the popwt according to how the cex docs say
ifelse(qintrvyr %in% (year + 1), (4 - qintrvmo) / 3 *finlwt21 / 4, finlwt21 / 4)),
working_age = ifelse(age_ref >= 16 & age_ref <= 64, 1, 0), # create indicator if in working age
prime_age = ifelse(age_ref >= 25 & age_ref <= 54, 1, 0), # create indicator if in prime age
elderly = ifelse(age_ref >= 64, 1, 0)) %>% # create indicator if elderly
select(-c(qintrvyr, qintrvmo, age_ref)) # drop unneeded vars
working_age <- get_group_avg(fmli %>% filter(working_age==1), mtbi) # get an average 'average personal expenditure' for working age group in this year
prime_age <- get_group_avg(fmli %>% filter(prime_age==1), mtbi) # get an average 'average personal expenditure' for prime age group in this year
elderly <- get_group_avg(fmli %>% filter(elderly==1), mtbi) # get an average 'average personal expenditure' for elderly group in this year
out <- bind_rows(list(working_age, prime_age, elderly), .id="group") %>% # combine all three groups back into same df
mutate(year=year) # note that group 1 is working age, group 2 is prime age
# and group 3 is elderly
return(out)
}
get_group_avg <- function(df, mtbi){
mean_exp <- left_join(df, mtbi, by = "newid") %>% # merge back in mbti to weigh avgs in a sec
mutate(avg_exp = replace(avg_exp, is.na(avg_exp), 0)) %>% # replace any missing with 0 (as cex docs say)
summarise(pop_exp = sum(avg_exp * finlwt21) / sum(popwt)) %>% # calculate group avg with weights
unlist # keep just the avg value
return(mean_exp)
}
get_real_values <- function(df, cpi_file="CPIAUCSL.csv"){
# load in the cpiaucsl doc, and prepare the cpi values to merge into the main df
cpiu <- read.csv(paste(cpi_dir, cpi_file, sep='')) %>%
rename(cpi = CPIAUCSL_NBD20060101,
date = DATE) %>%
mutate(year = substr(date, 1, 4)) %>%
select(-date)
# merge in the cpi values for each year, and calculate real exp values
df %<>% merge(cpiu, by='year') %>%
mutate(real_exp = (100*pop_exp)/cpi) %>%
select(-cpi)
return(df) # return the new df with real values
}
get_normalized_values <- function(df, yr=2006, value="real_exp"){
val <- df %>%
filter(year==yr) %>%
select(as.name(value), group) %>%
rename(temp = as.name(value))
df %<>% merge(val, by='group') %>%
mutate(norm = (eval(as.name(value))-temp)/temp) %>%
select(-temp)
return(df)
}
for (yr in 2003:2016){
yr_dir <- paste(data_dir, as.character(yr), "/intrvw", substr(as.character(yr), 3, 4), sep="") # create year dir string
assign(paste("exp_", as.character(yr), sep=""), get_yr_avg(yr, yr_dir)) # calc and create a df for the yr avgs of each group
}
dat <- bind_rows(exp_2003, exp_2004, exp_2005, exp_2006, exp_2007, exp_2008, exp_2009,
exp_2010, exp_2011, exp_2012, exp_2013, exp_2014, exp_2015, exp_2016)
dat %<>% get_real_values() %>%
get_normalized_values()
working_age <- dat %>% filter(group==1)
prime_age <- dat %>% filter(group==2)
elderly <- dat %>% filter(group==3)
Here, we first calculate the total expenditure for each person (or consumer unit, CU) within each year. Next, we take a weighted average across all CU’s in a given year (following the CEX guidelines) in order to get an average personal expenditure for each year. Averages are calculated at the age-group level. We make no restrictions on the type or category of expenditure here. All reported dollars are real relative to 2006 values (using the CPI-U with base value 2006). For each age group, we present three plots - (1). Average expenditure over time with unique scale (2). Average expenditure over time with same scale (for prime age, this scale does not change from (1)) (3). A plot showing percent change in average expenditure over time (relative to 2006).
# working age
ggplot(aes(x=year, y=real_exp), data=working_age) + theme_classic() + ylab("Average Expenditure") + xlab("Year") + geom_hline(yintercept=seq(180000, 280000, by = 20000), color='grey') + geom_line(color="navy") + ggtitle("Working Age Population - 16-64") + scale_y_continuous(limits = c(180000, 280000), breaks = seq(180000, 280000, by = 20000)) + geom_point(size=1, color="navy")
ggplot(aes(x=year, y=real_exp), data=working_age) + theme_classic() + ylab("Average Expenditure") + xlab("Year") + geom_hline(yintercept=seq(180000, 300000, by = 20000), color='grey') + ggtitle("Working Age Population - 16-64") + geom_line(color="navy") + scale_y_continuous(limits = c(180000, 300000), breaks = seq(180000, 300000, by = 20000)) + geom_point(size=1, color="navy")
ggplot(aes(x=year, y=norm), data=working_age) + theme_classic() + ylab("Percent Change in Average Expenditure") + xlab("Year") + ggtitle("Working Age Population - 16-64") + geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') + scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) + geom_line(color="navy") + geom_point(size=1, color="navy")
# prime age
ggplot(aes(x=year, y=real_exp), data=prime_age) + theme_classic() + ylab("Average Expenditure") + xlab("Year") + geom_hline(yintercept=seq(180000, 300000, by = 20000), color='grey') + ggtitle("Prime Age Population - 25-54") + geom_line(color="navy") + scale_y_continuous(limits = c(180000, 300000), breaks = seq(180000, 300000, by = 20000)) + geom_point(size=1, color="navy")
ggplot(aes(x=year, y=real_exp), data=prime_age) + theme_classic() + ylab("Average Expenditure") + xlab("Year") + geom_hline(yintercept=seq(180000, 300000, by = 20000), color='grey') + ggtitle("Prime Age Population - 25-54") + scale_y_continuous(limits = c(180000, 300000), breaks = seq(180000, 300000, by = 20000)) + geom_line(color="navy") + geom_point(size=1, color="navy")
ggplot(aes(x=year, y=norm), data=prime_age) + theme_classic() + ylab("Percent Change in Average Expenditure") + xlab("Year") + ggtitle("Prime Age Population - 25-54") + geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') + scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) + geom_line(color="navy") + geom_point(size=1, color="navy")
# elderly
ggplot(aes(x=year, y=real_exp), data=elderly) + theme_classic() + ylab("Average Expenditure") + xlab("Year") + ggtitle("Elderly Population - 65+") + geom_hline(yintercept=seq(180000, 260000, by = 20000), color='grey') + scale_y_continuous(limits = c(180000, 260000), breaks = seq(180000, 260000, by = 20000)) + geom_line(color="navy") + geom_point(size=1, color="navy")
ggplot(aes(x=year, y=real_exp), data=elderly) + theme_classic() + ylab("Average Expenditure") + xlab("Year") + ggtitle("Elderly Population - 65+") + geom_hline(yintercept=seq(180000, 300000, by = 20000), color="grey") + scale_y_continuous(limits = c(180000, 300000), breaks = seq(180000, 300000, by = 20000)) + geom_line(color="navy") + geom_point(size=1, color="navy")
ggplot(aes(x=year, y=norm), data=elderly) + theme_classic() + ylab("Percent Change in Average Expenditure") + xlab("Year") + ggtitle("Elderly Population - 65+") + geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') + scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) + geom_line(color="navy") + geom_point(size=1, color="navy")