library(data.table)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(stringr)
library(tidyr)
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(readr)
library(tinytex)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
rm(list = ls())
gc()
# root dir
#data_dir <<- "C:/Users/Connor Fogal/Dropbox/predoc/noto/great_recession/cex/data/"
data_dir <<- "C:/Users/cfogal/Dropbox/predoc/noto/great_recession/cex/data/"
cpi_dir <<- paste(data_dir, "cpi/", sep='')
# functions
get_yr_avg <- function(year, yr_dir){
# This function calculates the average yearly expenditure for a given year, and outputs
# a valid population estimated mean
year <- as.numeric(year)
# Read in and stack the MTBI files
mtbi <- lapply(dir(yr_dir, pattern = "^mtbi.*[.]csv$", full.names = TRUE),
fread,
select = c('NEWID', 'COST', 'REF_YR', "UCC", "REF_MO", "PUBFLAG"),
na.strings = c("", ".", "NA")) %>%
bind_rows() %>%
rename_with(tolower) %>% # Change the column names to lower case
distinct(newid, cost, ucc, ref_mo, .keep_all = TRUE) %>% # drop any duplicated expense entries
filter(ref_yr %in% year) %>% # Filter for expenditures made in the given year and UCC's used for publication
filter(pubflag==2) %>%
mutate(newid = as.character(newid)) %>% # Change "newid" to a character variable
select(-c(ref_yr, ucc, ref_mo, pubflag)) %>% # Remove unnecessary columns
group_by(newid) %>% # Group the data by newid
summarise(avg_exp = sum(cost)) # Get the total expenditure for each person for this year
# Read in and stack the fmli files
fmli <- lapply(dir(yr_dir, pattern = "^fmli.*[.]csv$", full.names = TRUE),
fread,
select = c('NEWID', 'FINLWT21', 'QINTRVMO', 'QINTRVYR', 'AGE_REF', 'EDUC_REF'),
na.strings = c("", ".", "NA")) %>%
bind_rows() %>%
rename_with(tolower) %>%
mutate(newid = as.character(newid), # clean the newid and qintrvmo vars, then generate a calendar-year population weight variable
qintrvmo = as.numeric(qintrvmo),
working_age = ifelse(age_ref >= 16 & age_ref <= 64, 1, 0), # create indicator if in working age
prime_age = ifelse(age_ref >= 25 & age_ref <= 54, 1, 0), # create indicator if in prime age
elderly = ifelse(age_ref > 64, 1, 0),
hs_less = ifelse(educ_ref <=12, 1, 0),
more_hs = ifelse(educ_ref > 12, 1, 0)) %>% # create indicator if elderly
select(-c(qintrvyr, qintrvmo, age_ref)) # drop unneeded vars
working_age <- get_group_avg(fmli %>% filter(working_age==1), mtbi) # get an average personal expenditure for working age group in this year
prime_age <- get_group_avg(fmli %>% filter(prime_age==1), mtbi) # get an average personal expenditure for prime age group in this year
elderly <- get_group_avg(fmli %>% filter(elderly==1), mtbi) # get an average personal expenditure for elderly group in this year
hs <- get_group_avg(fmli %>% filter(hs_less==1), mtbi) # get an average personal expenditure for group with less than hs education
college <- get_group_avg(fmli %>% filter(more_hs==1), mtbi) # get an average personal expenditure for group with at least some college education
out <- bind_rows(list(working_age, prime_age, elderly, hs, college), .id="group") %>% # combine all three groups back into same df
mutate(year=year) # note that group 1 is working age, group 2 is prime age, group 3 is elderly,
# group 4 is hs educ, and group 5 is college educ
return(out)
}
get_group_avg <- function(df, mtbi){
mean_exp <- left_join(df, mtbi, by = "newid") %>% # merge back in mbti to weigh avgs in a sec
mutate(avg_exp = replace(avg_exp, is.na(avg_exp), 0),
total_wt = sum(finlwt21)) %>% # replace any missing with 0 (as cex docs say)
mutate(new_wt = finlwt21/total_wt) %>%
summarize(pop_exp = sum(avg_exp*new_wt)/sum(new_wt))
unlist # keep just the avg value
return(mean_exp)
}
get_real_values <- function(df, cpi_file="CPIAUCSL.csv"){
# load in the cpiaucsl doc, and prepare the cpi values to merge into the main df
cpiu <- read.csv(paste(cpi_dir, cpi_file, sep='')) %>%
rename(cpi = CPIAUCSL_NBD20060101,
date = DATE) %>%
mutate(year = substr(date, 1, 4)) %>%
select(-date)
# merge in the cpi values for each year, and calculate real exp values
df %<>% merge(cpiu, by='year') %>%
mutate(real_exp = (100*pop_exp)/cpi) %>%
select(-cpi)
return(df) # return the new df with real values
}
get_normalized_values <- function(df, yr=2006, value="real_exp"){
val <- df %>%
filter(year==yr) %>%
select(as.name(value), group) %>%
rename(temp = as.name(value))
df %<>% merge(val, by='group') %>%
mutate(norm = (eval(as.name(value))-temp)/temp) %>%
select(-temp)
return(df)
}
for (yr in 2003:2016){
yr_dir <- paste(data_dir, as.character(yr), "/intrvw", substr(as.character(yr), 3, 4), sep="") # create year dir string
assign(paste("exp_", as.character(yr), sep=""), get_yr_avg(yr, yr_dir)) # calc and create a df for the yr avgs of each group
}
dat <- bind_rows(exp_2003, exp_2004, exp_2005, exp_2006, exp_2007, exp_2008, exp_2009,
exp_2010, exp_2011, exp_2012, exp_2013, exp_2014, exp_2015, exp_2016)
dat %<>% get_real_values() %>%
get_normalized_values()
working_age <- dat %>% filter(group==1)
prime_age <- dat %>% filter(group==2)
elderly <- dat %>% filter(group==3)
hs <- dat %>% filter(group==4)
college <- dat %>% filter(group==5)
combined_age <- dat %>%
filter(group==1|group==3) %>%
mutate(age_group = NA) %>%
mutate(age_group = case_when(group==1 ~ "Working Age",
group==3 ~ "Elderly"),
temp1 = case_when(group==1 ~ real_exp,
group==3 ~ NA),
temp2 = case_when(group==1 ~ NA,
group==3 ~ real_exp)) %>%
select(-group)
combined_educ <- dat %>%
filter(group==4|group==5) %>%
mutate(ed_group = NA) %>%
mutate(ed_group = case_when(group==4 ~ "HS or less",
group==5 ~ "College or more"),
temp1 = case_when(group==4 ~ real_exp,
group==5 ~ NA),
temp2 = case_when(group==4 ~ NA,
group==5 ~ real_exp)) %>%
select(-group)
Here, we first calculate the total expenditure for each person (or consumer unit, CU) within each year. Next, we take a weighted average across all CU’s in a given year (following the CEX guidelines) in order to get an average personal expenditure for each year. Averages are calculated at the age-group level. We make no restrictions on the type or category of expenditure here. All reported dollars are real relative to 2006 values (using the CPI-U with base value 2006). For each age group, we present three plots - (1). Average expenditure over time with unique scale (2). Average expenditure over time with same scale (for prime age, this scale does not change from (1)) (3). A plot showing percent change in average expenditure over time (relative to 2006).
options(scipen=100)
# working age
ggplot(aes(x=year, y=real_exp), data=working_age) +
theme_classic() + ylab("Average Expenditure") +
xlab("Year") +
geom_hline(yintercept=seq(20000, 60000, by = 10000), color='grey') +
geom_line(color="navy") +
ggtitle("Working Age Population - 16-64") +
scale_y_continuous(limits = c(20000, 60000), breaks = seq(20000, 60000, by = 10000)) +
geom_point(size=1, color="navy")
ggplot(aes(x=year, y=norm), data=working_age) +
theme_classic() +
ylab("Percent Change in Average Expenditure") +
xlab("Year") + ggtitle("Working Age Population - 16-64") +
geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') +
scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) +
geom_line(color="navy") +
geom_point(size=1, color="navy")
# prime age
ggplot(aes(x=year, y=real_exp), data=prime_age) +
theme_classic() +
ylab("Average Expenditure") +
xlab("Year") +
geom_hline(yintercept=seq(20000, 80000, by = 15000), color='grey') +
ggtitle("Prime Age Population - 25-54") +
geom_line(color="navy") +
scale_y_continuous(limits = c(20000, 80000), breaks = seq(20000, 80000, by = 15000)) +
geom_point(size=1, color="navy")
ggplot(aes(x=year, y=norm), data=prime_age) +
theme_classic() +
ylab("Percent Change in Average Expenditure") +
xlab("Year") + ggtitle("Prime Age Population - 25-54") +
geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') +
scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) +
geom_line(color="navy") +
geom_point(size=1, color="navy")
# elderly
ggplot(aes(x=year, y=real_exp), data=elderly) +
theme_classic() +
ylab("Average Expenditure") +
xlab("Year") +
ggtitle("Elderly Population - 65+") +
geom_hline(yintercept=seq(20000, 60000, by = 10000), color='grey') +
scale_y_continuous(limits = c(20000, 60000), breaks = seq(20000, 60000, by = 10000)) +
geom_line(color="navy") +
geom_point(size=1, color="navy")
ggplot(aes(x=year, y=norm), data=elderly) +
theme_classic() +
ylab("Percent Change in Average Expenditure") +
xlab("Year") +
ggtitle("Elderly Population - 65+") +
geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') +
scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) +
geom_line(color="navy") +
geom_point(size=1, color="navy")
# education
ggplot(aes(x=year, y=real_exp), data=hs) +
theme_classic() +
ylab("Average Expenditure") +
xlab("Year") +
ggtitle("High school graduate or less") +
geom_hline(yintercept=seq(20000, 35000, by = 5000), color='grey') +
scale_y_continuous(limits = c(20000, 35000), breaks = seq(20000, 35000, by = 5000)) +
geom_line(color="navy") +
geom_point(size=1, color="navy")
ggplot(aes(x=year, y=norm), data=hs) +
theme_classic() +
ylab("Percent Change in Average Expenditure") +
xlab("Year") +
ggtitle("High school graduate or less - percentage change") +
geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') +
scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) +
geom_line(color="navy") +
geom_point(size=1, color="navy")
ggplot(aes(x=year, y=real_exp), data=college) +
theme_classic() +
ylab("Average Expenditure") +
xlab("Year") +
ggtitle("Some college or more") +
geom_hline(yintercept=seq(40000, 65000, by = 5000), color='grey') +
scale_y_continuous(limits = c(40000, 65000), breaks = seq(40000, 65000, by = 5000)) +
geom_line(color="navy") +
geom_point(size=1, color="navy")
ggplot(aes(x=year, y=norm), data=college) +
theme_classic() +
ylab("Percent Change in Average Expenditure") +
xlab("Year") +
ggtitle("Some college or more - percentage change") +
geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') +
scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) +
geom_line(color="navy") +
geom_point(size=1, color="navy")
# combined age plots
color_palette = c("navy", 'orange')
ggplot(aes(x=year, y=real_exp, color=factor(age_group), group=factor(age_group)), data=combined_age) +
theme_classic() +
ylab("Average Expenditure") +
xlab("Year") +
geom_hline(yintercept=seq(15000, 60000, by = 15000), color='grey') +
geom_line() +
ggtitle("Working Age Population & Elderly - same scale") +
scale_y_continuous(limits = c(15000, 60000), breaks = seq(15000, 60000, by = 15000)) +
geom_point(size=1.5) +
scale_color_manual(values=color_palette) +
labs(color="Age Group") +
scale_x_continuous(limits=c(2003, 2016))
ggplot(aes(x=year, y=norm, color=factor(age_group), group=factor(age_group)), data=combined_age) +
theme_classic() +
ylab("Percent Change in Average Expenditure") +
xlab("Year") +
ggtitle("Working Age & Elderly - percent change") +
geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') +
scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) +
geom_line() +
geom_point(size=1.5) +
scale_color_manual(values=color_palette) +
labs(color="Age Group") +
scale_x_continuous(limits=c(2003, 2016))
ggplot(aes(x=year, color=factor(age_group), group=factor(age_group)), data=combined_age) +
theme_classic() +
ylab("Average Expenditure") +
xlab("Year") +
geom_hline(yintercept=seq(30000, 60000, by = 10000), color='grey') +
geom_line(aes(y=temp2+5000), color='navy') +
geom_line(aes(y=temp1), color='orange') +
ggtitle("Working Age & Elderly - different scales") +
scale_y_continuous("Working age expediture", limits = c(30000, 60000), breaks = seq(30000, 60000, by = 10000),
sec.axis = sec_axis(~ . + 5000,
name = "Elderly Expenditure",
breaks = seq(35000, 75000, by = 10000))) +
geom_point(aes(y=temp2+5000), size=1.5, color='navy') +
geom_point(aes(y=temp1), size=1.5, color='orange')+
scale_x_continuous(limits=c(2003, 2016))+
scale_color_manual(values=color_palette)
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).
# combined education plots
ggplot(aes(x=year, y=real_exp, color=factor(ed_group), group=factor(ed_group)), data=combined_educ) +
theme_classic() +
ylab("Average Expenditure") +
xlab("Year") +
geom_hline(yintercept=seq(20000, 65000, by = 10000), color='grey') +
geom_line() +
ggtitle("Education") +
scale_y_continuous(limits = c(20000, 65000), breaks = seq(20000, 65000, by = 10000)) +
geom_point(size=1.5) +
scale_color_manual(values=color_palette) +
labs(color="Education Level") +
scale_x_continuous(limits=c(2003, 2016))
ggplot(aes(x=year, y=norm, color=factor(ed_group), group=factor(ed_group)), data=combined_educ) +
theme_classic() +
ylab("Percent Change in Average Expenditure") +
xlab("Year") +
ggtitle("Education Level - percent change") +
geom_hline(yintercept=seq(-0.4, 0.1, by = 0.1), color='grey') +
scale_y_continuous(limits = c(-0.4, 0.1), breaks = seq(-0.4, 0.1, by = 0.1)) +
geom_line() +
geom_point(size=1.5) +
scale_color_manual(values=color_palette) +
labs(color="Age Group") +
scale_x_continuous(limits=c(2003, 2016))
ggplot(aes(x=year, color=factor(ed_group), group=factor(ed_group)), data=combined_educ) +
theme_classic() +
ylab("Average Expenditure") +
xlab("Year") +
geom_hline(yintercept=seq(15000, 40000, by = 5000), color='grey') +
geom_line(aes(y=temp2-25000), color='navy') +
geom_line(aes(y=temp1), color='orange') +
ggtitle("Education - different scales") +
scale_y_continuous("High school or less avg expenditure", limits = c(15000, 40000), breaks = seq(15000, 40000, by = 5000),
sec.axis = sec_axis(~ . + 25000,
name = "Some college or more avg expenditure",
breaks = seq(40000, 65000, by = 5000))) +
geom_point(aes(y=temp2-25000), size=1.5, color='navy') +
geom_point(aes(y=temp1), size=1.5, color='orange')+
scale_x_continuous(limits=c(2003, 2016))+
scale_color_manual(values=color_palette)
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).