Overview

This series of posts is intended to get the reader up to speed on how to import, format, and use the economic data of Thomas Piketty, Gabriel Zucman, and Emmanuel Saez. Piketty is best known in the US for his seminal 2014 work Capital in the Twenty-First Century, and Saez and Zucman recently released The Triumph of Injustice: How the Rich Dodge Taxes and How to Make Them Pay.

Summary

This chapter wraps all of the transformations & reconciliations into one function.

After loading the function, execute it with dina <- format_dina(). By the way, if you haven’t figured it out by this point, dina is shorthand for DIstributional National Accounts.

This function does the following:
1. Converts the Stata files to a single data frame and adds the year to a new column
2. Renames all variables to be easy to understand and select
3. Recodes all the levels of the categorical variables for clarity
4. Reconciles all of the calculations within the data with each other and with the official US statistics
5. Creates proportions for income & wealth, for instance how much of total wealth comes from each of its component parts
6. Creates income & wealth distributions; this puts every person on a scale of 0 to 1 with regard to the total income or wealth they have compared to everyone else in the population.

After running this function you have all the data you need to begin asking it questions.

You only need to supply the path to where your year files are and it will take care of the rest.

Code

For this example I have six files in my dina_subset folder.

library(tidyverse)
library(fs)
library(haven)

# Folder that holds the DINA Stata year files
my_path <- "Data/Dina_subset/"
# List the folder's contents; `dir_ls()` comes from the fs package
files <- dir_ls(my_path)
files
## Data/Dina_subset/usdina1968.dta Data/Dina_subset/usdina1978.dta 
## Data/Dina_subset/usdina1988.dta Data/Dina_subset/usdina1998.dta 
## Data/Dina_subset/usdina2008.dta Data/Dina_subset/usdina2018.dta

The function

# Import, label, reconcile, and rank the DINA (Distributional National
# Accounts) Stata year files found in a folder.
#
# Args:
#   path: Folder containing the `.dta` year files. The year is taken from the
#     first run of four digits in each file path.
#
# Returns:
#   One data frame with every year stacked, containing
#     * `year` plus descriptive grouping factors (gender, age groups, ...),
#     * the original measures under descriptive names,
#     * `summ_*` recomputed aggregates and `recon_*` differences against the
#       published `ttl_*` totals,
#     * `prop_*` component shares (0 when the total is 0),
#     * `dist_*` / `cumm_*` population ranks and `*_class` ordered factors.
#
# Side effects:
#   Attaches tidyverse, fs and haven, and prints a year x income_class count
#   table so the imported years can be checked by eye.
#
# Errors:
#   Stops when `path` holds no `.dta` file or the data contain no year.
format_dina <- function(path = "Data/Dina_subset/") {
  library(tidyverse)
  library(fs)
  library(haven)

  # Only pick up Stata files so a stray README or .DS_Store in the folder is
  # not handed to `read_dta()`. `ignore.case` is forwarded to `grep()`.
  paths <- dir_ls(path, regexp = "\\.dta$", ignore.case = TRUE)
  if (length(paths) == 0) {
    stop("No .dta files found in ", path, call. = FALSE)
  }

  # Import all of the Stata dta files into a single dataframe
  # Also, put the year from the filename into a new column
  dina_df <- map_dfr(paths, ~ read_dta(.x), .id = "filename") %>%
    extract(filename, "year", "(\\d{4})")

  # Names of the original grouping variables, taken by position.
  # Presumably columns 1-4 are year, id, dweght and dweghttaxu so that 5:15
  # are the eleven socio-demographic codes -- TODO confirm against the files.
  group_names <- names(dina_df[5:15])

  # Change factor levels to descriptive labels
  # Assign to new vars, then drop the originals and put `year` at the front
  dina_df2 <- dina_df %>%
    mutate_at(group_names, as.character) %>%
    mutate(
      gender = if_else(female == "1", "Female", "Male", "Unknown"),
      agegroup_primary = recode(ageprim, "0" = "20-64", "20" = "20-44",
                                "45" = "45-64", "65" = "65 Plus"),
      agegroup_secondary = recode(agesec, "0" = "20-64", "20" = "20-44",
                                  "45" = "45-64", "65" = "65 Plus"),
      agegroup_imputed = recode(age, "0" = "20-64", "20" = "20-44",
                                "45" = "45-64", "65" = "65 Plus"),
      labor_status_primary =
        if_else(oldexm == "1", "Retired", "Working", "Unknown"),
      labor_status_secondary =
        if_else(oldexf == "1", "Retired", "Working", "Unknown"),
      # NOTE(review): this reads `oldexf`, the same source column as
      # labor_status_secondary, so the two columns are always identical.
      # It looks like a copy-paste slip; confirm which source column holds
      # the imputed status before changing it.
      labor_status_imputed =
        if_else(oldexf == "1", "Retired", "Working", "Unknown"),
      filing_status = if_else(married == "1", "Married", "Single", "Unknown"),
      earner_status = if_else(second == "0", "Primary", "Secondary", "Unknown"),
      num_kids = xkidspop,
      filer_status = if_else(filer == "1", "Filer", "Not filer", "Unknown")
    ) %>%
    select(-all_of(group_names)) %>%
    select(year, everything())

  # The new grouping variables, listed by name (in creation order) rather than
  # by column position so the code does not depend on the column count.
  grouping_vars <- c(
    "gender", "agegroup_primary", "agegroup_secondary", "agegroup_imputed",
    "labor_status_primary", "labor_status_secondary", "labor_status_imputed",
    "filing_status", "earner_status", "num_kids", "filer_status"
  )

  # Give columns more descriptive names
  # I try to use consistent names so that you will be able to efficiently select
  #  column names with `contains()`.
  # NOTE(review): `summ_income_fiscal_incl_capgains` below is summed WITHOUT
  #  `income_fiscal_capital_gains` while `summ_income_fiscal_excl_capgains`
  #  is summed WITH it, so the incl/excl labels look swapped. Left as-is
  #  because the column names are part of the output; confirm before renaming.
  dina_df3 <- dina_df2 %>%
    mutate_at(grouping_vars, as.factor) %>%
    rename(
      # Socio-demographic
      tax_unit_id = id,
      population_weight = dweght,
      population_weight_ptu = dweghttaxu,

      # Core Income & Wealth
      ttl_income_fiscal_excl_capgains = fiinc,
      income_fiscal_wages_pensions = fiwag,
      income_fiscal_business = fibus,
      income_fiscal_rents = firen,
      income_fiscal_interest = fiint,
      income_fiscal_dividends = fidiv,
      income_fiscal_nonfiler_default = fnps,

      ttl_income_fiscal_incl_capgains = fninc,
      income_fiscal_capital_gains = fikgi,

      ttl_income_factor = fainc,
      ttl_income_factor_labor = flinc,
      income_factor_labor_wages = flemp,
      income_factor_labor_mixed = flmil,
      income_factor_labor_sales_taxes = flprl,
      ttl_income_factor_capital = fkinc,
      income_factor_capital_housing = fkhou,
      income_factor_capital_equity = fkequ,
      income_factor_capital_interest = fkfix,
      income_factor_capital_business = fkbus,
      income_factor_capital_pension_benefits = fkpen,

      ttl_income_pretax_labor = plinc,
      ttl_contributions_social_insurance = plcon,
      contributions_social_insurance_pensions = plpco,
      income_social_share_labor = plbel,
      ttl_income_pretax_capital = pkinc,
      income_investment_payable_pensions = pkpen,
      income_social_share_capital = pkbek,

      ttl_income_disposable_extended = diinc,
      income_cash_disposable = dicsh,
      income_social_inkind_transfers = inkindinc,
      ttl_income_social_collective = colexp,
      income_social_collective_property_paid_by_govt = govin,
      income_social_collective_non_profit = npinc,
      income_social_collective_education = educ,

      ttl_income_national_factor = princ,

      ttl_income_national_pretax = peinc,
      ttl_income_pretax = ptinc,
      surplus_primary_public_pension_system = prisupen,
      income_investment_pensions_payable = invpen,

      ttl_income_national_posttax = poinc,
      surplus_primary_private_pension_system = prisupenprivate,
      surplus_primary_government = prisupgov,

      ttl_wealth_net = hweal,
      assets_equity = hwequ,
      assets_currency = hwfix,
      assets_housing = hwhou,
      assets_business = hwbus,
      assets_pension_lifeins = hwpen,
      liabilities_household = hwdeb,

      # Detailed income & wealth
      ttl_income_national_posttax_2 = poinc2,
      ttl_income_social_benefits = ben,
      ttl_income_social_othercash = dicao,
      ttl_income_social_inkind = otherkin,
      ttl_income_social_share = plben,
      ttl_income_social_collective_2 = colexp2,
      ttl_taxes_payments_contributions = tax,

      ttl_income_pretax_pension = ptnin,
      income_pretax_pension_labor = plnin,
      income_pretax_pension_capital = pknin,

      ttl_contributions_social_insurance_govt = govcontrib,
      contributions_social_insurance_govt_pensions_ui_di = ssuicontrib,
      contributions_social_insurance_govt_other = othercontrib,

      income_pension_taxable = peninc,
      income_schedule_net = schcinc,
      income_s_corp_net = scorinc,
      income_partnership_net = partinc,
      income_rental_net = rentinc,
      income_estate_trust_net = estinc,
      income_royalty_net = rylinc,
      income_other_in_agi = othinc,
      income_capital_main_house_asset = fkhoumain,
      income_capital_rental_house = fkhourent,
      income_social_insurance_retirement = ssinc_oa,
      contributions_social_pension = wagpen,
      income_social_insurance_disability = ssinc_di,
      contributions_social_insurance_di_ui = ploco,
      income_social_insurance_unemployment = uiinc,
      income_social_cash_supplemental = disup,
      income_social_cash = dicab,
      income_social_cash_veterans = divet,
      income_social_taxcredit = dicred,
      income_social_othercash_tanf = tanfinc,
      income_social_othercash_cashlocalstate = othben,
      income_social_health_medicare = medicare,
      income_social_health_medicaid = medicaid,
      contributions_social_health_wages = waghealth,
      income_social_inkind_pell = pell,
      income_social_inkind_vethealth = vethealth,

      wages_all_filers_taxable = flwag,
      wages_all_filers_taxable_supplements = flsup,

      benefits_pension = plpbe,
      benefits_pension_capital_share = pkpbk,
      benefits_pension_labor_share = plpbl,
      benefits_di_ui = plobe,

      taxes_capital_sales_excise = fkprk,
      taxes_property_housing = proprestax,
      taxes_property_business = propbustax,
      taxes_income_wealth_current = ditax,
      taxes_federal_income = ditaf,
      taxes_state_income = ditas,
      taxes_sales_excise = salestax,
      taxes_corporate = corptax,
      taxes_estate = estatetax,

      payments_interest = fkdeb,
      payments_interest_mortgage = fkmor,
      payments_interest_nonmortgage = fknmo,

      ttl_wealth_personal_net = hwealnokg,
      wealth_rental_housing_gross = rentalhome,
      wealth_rental_housing_mortgages = rentalmort,
      wealth_rental_housing_net = rental,
      wealth_main_housing_gross = ownerhome,
      wealth_main_housing_mortgages = ownermort,
      wealth_main_housing_net = housing,
      wealth_partnership = partw,
      wealth_sole_proprietor = soleprop,
      wealth_s_corp = scorw,
      wealth_equity = equity,
      wealth_taxable_bond = taxbond,
      wealth_muni_bond = muni,
      wealth_currency = currency,
      wealth_non_mortgage_debt = nonmort,
      wealth_household_financial_assets = hwfin,
      wealth_household_nonfinancial_assets = hwnfa
    )

  #==========================================================
  # Reconciliation and stuff
  # * For some reason the population is multiplied by 100,000. I divide by
  #     100,000 so that the sum of the population weights in any given year
  #     will equal the actual adult population size in the year.
  dina_df4 <- dina_df3 %>%
    select(tax_unit_id, all_of(grouping_vars), everything()) %>%
    mutate(
      year = factor(year),
      population_weight = population_weight / 100000,
      population_weight_ptu = population_weight_ptu / 100000
    )

  # Each `summ_*` column rebuilds an aggregate from its components and the
  # matching `recon_*` column is the published total minus that rebuilt sum.
  dina_df5 <- dina_df4 %>%
    mutate(
      # Core wealth & income aggregates
      summ_income_factor_labor =
        income_factor_labor_wages +
        income_factor_labor_mixed +
        income_factor_labor_sales_taxes, # flemp + flmil + flprl
      recon_income_factor_labor =
        ttl_income_factor_labor - summ_income_factor_labor,

      summ_income_factor_capital =
        income_factor_capital_housing +
        income_factor_capital_equity +
        income_factor_capital_interest +
        income_factor_capital_business +
        income_factor_capital_pension_benefits +
        payments_interest, # fkhou + fkequ + fkfix + fkbus + fkpen + fkdeb
      recon_income_factor_capital =
        ttl_income_factor_capital - summ_income_factor_capital,

      summ_income_factor =
        summ_income_factor_labor +
        summ_income_factor_capital, # flinc + fkinc
      recon_income_factor =
        ttl_income_factor - summ_income_factor,

      summ_contributions_social_insurance =
        contributions_social_insurance_pensions +
        contributions_social_insurance_di_ui,
      recon_contributions_social_insurance =
        ttl_contributions_social_insurance -
        summ_contributions_social_insurance,

      summ_income_pretax_labor =
        summ_income_factor_labor +
        summ_contributions_social_insurance +
        income_social_share_labor, # flinc + plcon + plbel
      recon_income_pretax_labor =
        ttl_income_pretax_labor - summ_income_pretax_labor,

      summ_income_pretax_capital =
        summ_income_factor_capital +
        income_investment_payable_pensions +
        income_social_share_capital, # fkinc + pkpen + pkbek
      recon_income_pretax_capital =
        ttl_income_pretax_capital - summ_income_pretax_capital,

      summ_ttl_income_pretax =
        summ_income_pretax_labor +
        summ_income_pretax_capital, # plinc + pkinc
      recon_ttl_income_pretax =
        ttl_income_pretax - summ_ttl_income_pretax,

      summ_income_fiscal_incl_capgains =
        income_fiscal_wages_pensions +
        income_fiscal_business +
        income_fiscal_rents +
        income_fiscal_interest +
        income_fiscal_dividends, # fiwag + fibus + firen + fiint + fidiv
      recon_income_fiscal_incl_capgains =
        ttl_income_fiscal_incl_capgains - summ_income_fiscal_incl_capgains,

      summ_income_fiscal_excl_capgains =
        income_fiscal_wages_pensions +
        income_fiscal_business +
        income_fiscal_rents +
        income_fiscal_interest +
        income_fiscal_dividends +
        income_fiscal_capital_gains, # fiwag + fibus + firen + fiint + fidiv + fikgi
      recon_income_fiscal_excl_capgains =
        ttl_income_fiscal_excl_capgains - summ_income_fiscal_excl_capgains,

      summ_income_disposable_extended =
        income_cash_disposable +
        income_social_inkind_transfers +
        ttl_income_social_collective, # dicsh + inkindinc + colexp
      recon_income_disposable_extended =
        ttl_income_disposable_extended - summ_income_disposable_extended,

      summ_income_national_factor =
        summ_income_factor +
        income_social_collective_property_paid_by_govt +
        income_social_collective_non_profit, # fainc + govin + npinc
      recon_income_national_factor =
        ttl_income_national_factor - summ_income_national_factor,

      summ_income_national_pretax =
        summ_ttl_income_pretax +
        income_social_collective_property_paid_by_govt +
        income_social_collective_non_profit +
        surplus_primary_public_pension_system +
        income_investment_pensions_payable, # ptinc + govin + npinc + prisupen + invpen
      recon_income_national_pretax =
        ttl_income_national_pretax - summ_income_national_pretax,

      summ_income_national_posttax =
        summ_income_disposable_extended +
        income_social_collective_property_paid_by_govt +
        income_social_collective_non_profit +
        surplus_primary_private_pension_system +
        income_investment_pensions_payable +
        surplus_primary_government, # diinc + govin + npinc + prisupenprivate + invpen + prisupgov
      recon_income_national_posttax =
        ttl_income_national_posttax - summ_income_national_posttax,

      summ_wealth_net =
        assets_equity +
        assets_currency +
        assets_housing +
        assets_business +
        assets_pension_lifeins +
        liabilities_household, # hwequ + hwfix + hwhou + hwbus + hwpen + hwdeb
      recon_wealth_net =
        ttl_wealth_net - summ_wealth_net,

      # Detailed wealth & income aggregates
      summ_income_social_inkind =
        income_social_inkind_pell +
        income_social_inkind_vethealth,
      recon_income_social_inkind =
        ttl_income_social_inkind - summ_income_social_inkind,

      summ_income_social_othercash =
        income_social_othercash_tanf +
        income_social_othercash_cashlocalstate,
      recon_income_social_othercash =
        ttl_income_social_othercash - summ_income_social_othercash,

      summ_income_social_share =
        income_social_share_labor +
        income_social_share_capital,
      recon_income_social_share =
        ttl_income_social_share - summ_income_social_share,

      summ_income_social_collective =
        income_social_collective_property_paid_by_govt +
        income_social_collective_non_profit +
        income_social_collective_education,
      recon_income_social_collective =
        ttl_income_social_collective - summ_income_social_collective,
      recon_income_social_collective_2 =
        ttl_income_social_collective_2 - summ_income_social_collective,
      # The adjustment is whatever the listed components fail to explain.
      adj_income_social_collective =
        ttl_income_social_collective -
        summ_income_social_collective,
      # Total minus (components + adjustment). The adjustment has to be
      # subtracted with the components; adding it instead doubles the gap.
      recon_adj_income_social_collective =
        ttl_income_social_collective -
        (summ_income_social_collective + adj_income_social_collective),

      summ_contributions_social_insurance_govt =
        contributions_social_insurance_govt_pensions_ui_di +
        contributions_social_insurance_govt_other,
      recon_contributions_social_insurance_govt =
        ttl_contributions_social_insurance_govt -
        summ_contributions_social_insurance_govt,

      summ_taxes_paid =
        taxes_capital_sales_excise +
        taxes_property_housing +
        taxes_property_business +
        taxes_income_wealth_current +
        taxes_federal_income +
        taxes_state_income +
        taxes_sales_excise +
        taxes_corporate +
        taxes_estate,

      summ_taxes_effective_tax_rate = if_else(
        ttl_income_national_pretax != 0,
        summ_taxes_paid / ttl_income_national_pretax, 0),

      summ_taxes_cohort_income_prepost_tax_rate = if_else(
        summ_income_national_pretax != 0,
        1 - (summ_income_national_posttax / summ_income_national_pretax),
        0),

      summ_cohort_national_income_pretax =
        ttl_income_national_pretax * population_weight,
      summ_cohort_national_income_posttax =
        ttl_income_national_posttax * population_weight,
      summ_cohort_wealth_net = ttl_wealth_net * population_weight
    )

  #====================================
  # Create proportions

  # Share of `part` in `total`; 0 where the total is 0 to avoid dividing by it.
  prop_of <- function(part, total) {
    if_else(total != 0, part / total, 0)
  }

  # Each mutate() handles one total: a `prop_*` share per component, then a
  # `recon_prop_*` column holding the sum of those shares.
  dina_df6 <- dina_df5 %>%
    mutate( # ttl_income_factor_labor
      prop_income_factor_labor_wages =
        prop_of(income_factor_labor_wages, ttl_income_factor_labor),
      prop_income_factor_labor_mixed =
        prop_of(income_factor_labor_mixed, ttl_income_factor_labor),
      prop_income_factor_labor_sales_taxes =
        prop_of(income_factor_labor_sales_taxes, ttl_income_factor_labor),
      recon_prop_ttl_income_factor_labor =
        prop_income_factor_labor_wages +
        prop_income_factor_labor_mixed +
        prop_income_factor_labor_sales_taxes
    ) %>%
    mutate( # ttl_income_factor_capital
      prop_income_factor_capital_housing =
        prop_of(income_factor_capital_housing, ttl_income_factor_capital),
      prop_income_factor_capital_equity =
        prop_of(income_factor_capital_equity, ttl_income_factor_capital),
      prop_income_factor_capital_interest =
        prop_of(income_factor_capital_interest, ttl_income_factor_capital),
      prop_income_factor_capital_business =
        prop_of(income_factor_capital_business, ttl_income_factor_capital),
      prop_income_factor_capital_pension_benefits =
        prop_of(income_factor_capital_pension_benefits,
                ttl_income_factor_capital),
      prop_payments_interest =
        prop_of(payments_interest, ttl_income_factor_capital),
      recon_prop_ttl_income_factor_capital =
        prop_income_factor_capital_housing +
        prop_income_factor_capital_equity +
        prop_income_factor_capital_interest +
        prop_income_factor_capital_business +
        prop_income_factor_capital_pension_benefits +
        prop_payments_interest
    ) %>%
    mutate( # ttl_income_factor
      prop_ttl_income_factor_labor_if =
        prop_of(ttl_income_factor_labor, ttl_income_factor),
      prop_ttl_income_factor_capital_if =
        prop_of(ttl_income_factor_capital, ttl_income_factor),
      recon_prop_ttl_income_factor =
        prop_ttl_income_factor_labor_if +
        prop_ttl_income_factor_capital_if
    ) %>%
    mutate( # ttl_contributions_social_insurance
      prop_contributions_social_insurance_pensions =
        prop_of(contributions_social_insurance_pensions,
                ttl_contributions_social_insurance),
      prop_contributions_social_insurance_di_ui =
        prop_of(contributions_social_insurance_di_ui,
                ttl_contributions_social_insurance),
      recon_prop_ttl_contributions_social_insurance =
        prop_contributions_social_insurance_pensions +
        prop_contributions_social_insurance_di_ui
    ) %>%
    mutate( # ttl_income_pretax_labor
      prop_ttl_income_factor_labor_ilpr =
        prop_of(ttl_income_factor_labor, ttl_income_pretax_labor),
      prop_ttl_contributions_social_insurance =
        prop_of(ttl_contributions_social_insurance, ttl_income_pretax_labor),
      prop_income_social_share_labor =
        prop_of(income_social_share_labor, ttl_income_pretax_labor),
      recon_prop_ttl_income_pretax_labor =
        prop_ttl_income_factor_labor_ilpr +
        prop_ttl_contributions_social_insurance +
        prop_income_social_share_labor
    ) %>%
    mutate( # ttl_income_pretax_capital
      prop_ttl_income_factor_capital_icpr =
        prop_of(ttl_income_factor_capital, ttl_income_pretax_capital),
      prop_income_investment_payable_pensions =
        prop_of(income_investment_payable_pensions, ttl_income_pretax_capital),
      prop_income_social_share_capital =
        prop_of(income_social_share_capital, ttl_income_pretax_capital),
      recon_prop_ttl_income_pretax_capital =
        prop_ttl_income_factor_capital_icpr +
        prop_income_investment_payable_pensions +
        prop_income_social_share_capital
    ) %>%
    mutate( # ttl_income_pretax
      prop_ttl_income_pretax_labor =
        prop_of(ttl_income_pretax_labor, ttl_income_pretax),
      prop_ttl_income_pretax_capital =
        prop_of(ttl_income_pretax_capital, ttl_income_pretax),
      recon_prop_ttl_income_pretax =
        prop_ttl_income_pretax_labor +
        prop_ttl_income_pretax_capital
    ) %>%
    mutate( # ttl_income_national_factor
      # These shares carry an `_inf` suffix so the national-pretax step below,
      # which writes the unsuffixed names, no longer overwrites them.
      prop_ttl_income_factor_inf =
        prop_of(ttl_income_factor, ttl_income_national_factor),
      prop_income_social_collective_property_paid_by_govt_inf =
        prop_of(income_social_collective_property_paid_by_govt,
                ttl_income_national_factor),
      prop_income_social_collective_non_profit_inf =
        prop_of(income_social_collective_non_profit,
                ttl_income_national_factor),
      recon_prop_ttl_income_national_factor =
        prop_ttl_income_factor_inf +
        prop_income_social_collective_property_paid_by_govt_inf +
        prop_income_social_collective_non_profit_inf
    ) %>%
    mutate( # ttl_income_national_pretax
      prop_ttl_income_pretax =
        prop_of(ttl_income_pretax, ttl_income_national_pretax),
      prop_income_social_collective_property_paid_by_govt =
        prop_of(income_social_collective_property_paid_by_govt,
                ttl_income_national_pretax),
      prop_income_social_collective_non_profit =
        prop_of(income_social_collective_non_profit,
                ttl_income_national_pretax),
      prop_surplus_primary_public_pension_system =
        prop_of(surplus_primary_public_pension_system,
                ttl_income_national_pretax),
      prop_income_investment_pensions_payable =
        prop_of(income_investment_pensions_payable,
                ttl_income_national_pretax),
      recon_prop_ttl_income_national_pretax =
        prop_ttl_income_pretax +
        prop_income_social_collective_property_paid_by_govt +
        prop_income_social_collective_non_profit +
        prop_surplus_primary_public_pension_system +
        prop_income_investment_pensions_payable
    ) %>%
    mutate( # ttl_income_national_posttax
      prop_ttl_income_disposable_extended_inpo =
        prop_of(ttl_income_disposable_extended, ttl_income_national_posttax),
      prop_income_social_collective_property_paid_by_govt_inpo =
        prop_of(income_social_collective_property_paid_by_govt,
                ttl_income_national_posttax),
      prop_income_social_collective_non_profit_inpo =
        prop_of(income_social_collective_non_profit,
                ttl_income_national_posttax),
      # NOTE(review): `summ_income_national_posttax` above is built from
      # surplus_primary_private_pension_system, but this share uses the
      # public one. Confirm which is intended.
      prop_surplus_primary_public_pension_system_inpo =
        prop_of(surplus_primary_public_pension_system,
                ttl_income_national_posttax),
      prop_income_investment_pensions_payable_inpo =
        prop_of(income_investment_pensions_payable,
                ttl_income_national_posttax),
      prop_surplus_primary_government_inpo =
        prop_of(surplus_primary_government, ttl_income_national_posttax),
      recon_prop_ttl_income_national_posttax =
        prop_ttl_income_disposable_extended_inpo +
        prop_income_social_collective_property_paid_by_govt_inpo +
        prop_income_social_collective_non_profit_inpo +
        prop_surplus_primary_public_pension_system_inpo +
        prop_income_investment_pensions_payable_inpo +
        prop_surplus_primary_government_inpo
    ) %>%
    mutate( # ttl_wealth_net
      prop_assets_equity = prop_of(assets_equity, ttl_wealth_net),
      prop_assets_currency = prop_of(assets_currency, ttl_wealth_net),
      prop_assets_housing = prop_of(assets_housing, ttl_wealth_net),
      prop_assets_business = prop_of(assets_business, ttl_wealth_net),
      prop_assets_pension_lifeins =
        prop_of(assets_pension_lifeins, ttl_wealth_net),
      prop_liabilities_household =
        prop_of(liabilities_household, ttl_wealth_net),
      recon_prop_ttl_wealth_net =
        prop_assets_equity +
        prop_assets_currency +
        prop_assets_housing +
        prop_assets_business +
        prop_assets_pension_lifeins +
        prop_liabilities_household
    )

  #====================================
  # Create distributions

  # Ordered three-way class for a 0-1 rank: [0, .5), [.5, .9), [.9, ...).
  classify_broad <- function(rank) {
    labels <- c("bottom_50", "middle_class", "top_ten")
    idx <- findInterval(rank, c(.50, .90)) + 1
    factor(labels[idx], ordered = TRUE, levels = labels)
  }

  # Ordered class that splits the top decile further:
  # [0, .9), [.9, .99), [.99, .999), [.999, .9999), [.9999, ...).
  classify_top <- function(rank) {
    labels <- c("bottom_90", "top_10", "top_1", "top_01", "gt_top_01")
    idx <- findInterval(rank, c(.90, .99, .999, .9999)) + 1
    factor(labels[idx], ordered = TRUE, levels = labels)
  }

  # Rank every record of ONE year by pre-tax national income and by net
  # wealth. `dist_*` is the cumulative population weight divided by that
  # year's total weight; the `*_class` columns bucket those ranks.
  create_distributions_base <- function(df, years = NULL) {
    year_df <- df %>%
      filter(year == years)
    adult_population <- sum(year_df$population_weight)

    year_df %>%
      arrange(ttl_income_national_pretax) %>%
      mutate(
        cumm_population_income = cumsum(population_weight),
        dist_income_national_pretax = cumm_population_income / adult_population,
        cumm_income_national_pretax = cumsum(ttl_income_national_pretax)
      ) %>%
      arrange(ttl_wealth_net) %>%
      mutate(
        cumm_population_wealth = cumsum(population_weight),
        dist_wealth = cumm_population_wealth / adult_population,
        cumm_wealth = cumsum(ttl_wealth_net)
      ) %>%
      mutate(
        income_class = classify_broad(dist_income_national_pretax),
        income_class_t10 = classify_top(dist_income_national_pretax),
        wealth_class = classify_broad(dist_wealth),
        wealth_class_t10 = classify_top(dist_wealth)
      )
  }

  # Build the distributions year by year (ranks are computed within a year)
  # and stack them. With `years = NULL` every year present in `df` is used.
  # Prints a year x income_class table as a quick import check.
  create_distributions <- function(df, years = NULL) {
    if (length(years) == 0) {
      years <- df$year
    }
    years <- unique(years)
    if (length(years) == 0) {
      # Fail loudly instead of returning the message text as if it were data.
      stop("Could not process your request. ",
           "Check that your data has at least one year", call. = FALSE)
    }

    dist_df <- map_dfr(years, ~ create_distributions_base(df, .x))
    print(table(dist_df$year, dist_df$income_class))
    dist_df
  }

  create_distributions(dina_df6)
}

# Run the whole pipeline on the default folder and keep the result as `dina`
dina <- format_dina()
##       
##        bottom_50 middle_class top_ten
##   1968     11334         5813   15647
##   1978     26440        17388   23160
##   1988     19000         8270   16800
##   1998     17204         9378   23643
##   2008     22023        12166   24504
##   2018     27530        15716   25514

End Notes

As a byproduct, the function prints how many records there are in the three major income categories for each year, so you can validate that you have imported and formatted the years you intended.
There will be warnings from the function, which are caused during the Stata file conversion. They don’t seem to be an issue and can be ignored.

Next up: Machine learning

END