Life Expectancy Crosswalk from 2010 to 2020 Census Tracts (Austin LTD/FULL Universe)

Author

Kaitlan Wong

Overview

This analysis converts 2015 life expectancy estimates originally reported for 2010 census tracts to the 2020 census tract geography used in the City of Austin Equity Index. Because tract boundaries changed between 2010 and 2020, a tract relationship file was used to allocate 2010-tract life expectancy to 2020 tracts using areal-weighted interpolation. For each 2020 tract, life expectancy was calculated as the weighted mean of overlapping 2010 tracts based on land-area proportions. The resulting estimates were then restricted to the Austin limited and full-purpose tract universe (including tracts intersecting city boundaries by at least 10%). This process produces normalized 2020-tract life expectancy values aligned to the study area geography, without imputation for tracts lacking source data coverage.

Sources

Dataset	Description	Source
USALEEP Life Expectancy (2010 tracts)	Census-tract life expectancy estimates (2010 geography)	https://www.cdc.gov/nchs/nvss/usaleep/usaleep.html
Census Tract Relationship File (2010→2020)	Land-area crosswalk between 2010 and 2020 census tracts	https://www.census.gov/geographies/reference-files/time-series/geo/relationship-files.2020.html
Austin Full & Limited Purpose Tract List	Study-area tract universe (Austin LTD + Full purpose)	Derived from City of Austin jurisdiction overlay: https://data.austintexas.gov/City-Government/BOUNDARIES_jurisdictions/3pzb-6mbr

Setup

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.1     ✔ stringr   1.5.2
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# load data
le_data <- read_csv("TX_A.csv")

Rows: 4709 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): CNTY2KX, TRACT2KX
dbl (5): Tract ID, STATE2KX, e(0), se(e(0)), Abridged life table flag

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

relation_tracts_2 <- read_csv("texas_2010_2020_tracts_kw.csv")

Rows: 9135 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (6): NAMELSAD_TRACT_20, MTFCC_TRACT_20, FUNCSTAT_TRACT_20, NAMELSAD_TRA...
dbl (10): OID_TRACT_20, GEOID_TRACT_20, AREALAND_TRACT_20, AREAWATER_TRACT_2...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# used backticks for Tract ID because of the space in the column name
merged_data <- relation_tracts_2 %>%
  left_join(le_data, by = c("GEOID_TRACT_10" = "Tract ID"))

head(merged_data)

# A tibble: 6 × 22
  OID_TRACT_20 GEOID_TRACT_20 NAMELSAD_TRACT_20    AREALAND_TRACT_20
         <dbl>          <dbl> <chr>                            <dbl>
1      2.08e13    48001950100 Census Tract 9501            483306613
2      2.08e14    48001950401 Census Tract 9504.01          16509268
3      2.08e14    48001950401 Census Tract 9504.01          16509268
4      2.08e14    48001950402 Census Tract 9504.02          71134275
5      2.08e13    48001950500 Census Tract 9505             23132052
6      2.08e13    48001950600 Census Tract 9506             20653882
# ℹ 18 more variables: AREAWATER_TRACT_20 <dbl>, MTFCC_TRACT_20 <chr>,
#   FUNCSTAT_TRACT_20 <chr>, OID_TRACT_10 <dbl>, GEOID_TRACT_10 <dbl>,
#   NAMELSAD_TRACT_10 <chr>, AREALAND_TRACT_10 <dbl>, AREAWATER_TRACT_10 <dbl>,
#   MTFCC_TRACT_10 <chr>, FUNCSTAT_TRACT_10 <chr>, AREALAND_PART <dbl>,
#   AREAWATER_PART <dbl>, STATE2KX <dbl>, CNTY2KX <chr>, TRACT2KX <chr>,
#   `e(0)` <dbl>, `se(e(0))` <dbl>, `Abridged life table flag` <dbl>

options(scipen = 999)

Raw Weighted Sum

# raw (non-normalized) weighted sum (not used for imputation)
final_le_2020 <- merged_data %>%
  mutate(area_weight = AREALAND_PART / AREALAND_TRACT_10) %>%
  mutate(weighted_le = `e(0)` * area_weight) %>%
  group_by(GEOID_TRACT_20) %>%
  summarize(
    life_expectancy_2020 =
      if (all(is.na(weighted_le))) NA else sum(weighted_le, na.rm = TRUE),
    total_weight_check = sum(area_weight, na.rm = TRUE),
    .groups = "drop"
  )

head(final_le_2020, 20)

# A tibble: 20 × 3
   GEOID_TRACT_20 life_expectancy_2020 total_weight_check
            <dbl>                <dbl>              <dbl>
 1    48001950100            82.4                   1    
 2    48001950401             0.000184              1.00 
 3    48001950402            NA                     0.982
 4    48001950500            76.9                   1    
 5    48001950600            71.1                   1    
 6    48001950700            71.3                   1    
 7    48001950800            74.7                   0.981
 8    48001950901            77.1                   1.02 
 9    48001950902            86.0                   1.00 
10    48001951001            47.9                   0.581
11    48001951002            34.6                   0.419
12    48001951100            75.1                   1.02 
13    48003950100            78.9                   1    
14    48003950200            76.6                   1.00 
15    48003950300            74.0                   0.997
16    48003950400            74.4                   1.00 
17    48005000102            78.2                   1    
18    48005000103            36.8                   0.491
19    48005000104            38.2                   0.509
20    48005000201            23.6                   0.327

Define Tract Universe and Normalization

# tract universe
target_tracts <- read_csv("Austin_LTD_FULL_Tract_List_10pct.csv")

Rows: 250 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): NAMELSAD
dbl (3): GEOID_clean, NAME, pct_in_austin

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

names(target_tracts)

[1] "GEOID_clean"   "NAME"          "NAMELSAD"      "pct_in_austin"

head(target_tracts)

# A tibble: 6 × 4
  GEOID_clean   NAME NAMELSAD            pct_in_austin
        <dbl>  <dbl> <chr>                       <dbl>
1 48209010912 109.   Census Tract 109.12          20.3
2 48453000101   1.01 Census Tract 1.01           100.0
3 48453000102   1.02 Census Tract 1.02           100.0
4 48453000203   2.03 Census Tract 2.03           100.0
5 48453000204   2.04 Census Tract 2.04           100.0
6 48453000205   2.05 Census Tract 2.05           100.

# normalize and filter to feed imputation
final_study_area_le <- merged_data %>%
  mutate(area_weight = AREALAND_PART / AREALAND_TRACT_10) %>%
  mutate(weighted_le = `e(0)` * area_weight) %>%
  group_by(GEOID_TRACT_20) %>%
  summarize(
    le_2020_normalized =
      sum(weighted_le, na.rm = TRUE) /
      sum(area_weight * !is.na(`e(0)`)),
    data_coverage =
      sum(area_weight * !is.na(`e(0)`)),
    .groups = "drop"
  ) %>%
  filter(as.character(GEOID_TRACT_20) %in%
         as.character(target_tracts$GEOID_clean))

str(final_study_area_le)

tibble [250 × 3] (S3: tbl_df/tbl/data.frame)
 $ GEOID_TRACT_20    : num [1:250] 48209010912 48453000101 48453000102 48453000203 48453000204 ...
 $ le_2020_normalized: num [1:250] 80.3 83 84.9 NaN 84.2 ...
 $ data_coverage     : num [1:250] 0.888 0.976 1.025 0 1 ...

sum(is.na(final_study_area_le$le_2020_normalized))

[1] 36

# Pre-imputation 2020 tract LE (includes NA values)
le_2020_pre_imputation <- final_study_area_le %>%
  select(GEOID_TRACT_20, le_2020_normalized, data_coverage)

write_csv(
  le_2020_pre_imputation,
  "LE_2020_pre_imputation.csv"
)