Estimating life expectancy for 2020 census tracts that did not exist in 2010 by averaging the values of the older 2010 tracts they overlap.
library(readr)
library(readxl)
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(stringr)
# Input files, given as bare file names (resolved against the working
# directory): an Excel workbook read below with read_excel(), and a CSV
# tract relationship file read below with read_csv().
life_path <- "TX_A.XLSX"
rel_path <- "texas_2010_2020_tracts_kw.csv"
# Normalise tract identifiers to 11-character, zero-padded strings.
#
# x: vector of tract IDs (character, numeric, or factor).
# Returns a character vector the same length as `x`; values shorter than
# 11 characters are left-padded with "0", longer values are returned as-is,
# and missing values stay NA.
#
# Padding is done explicitly because the "0" flag of sprintf() is only
# defined for numbers: with "%s" it zero-pads on some platforms and is
# ignored (space-padding) on others.
to_geoid11 <- function(x) {
  # "%.0f" keeps large whole numbers out of scientific notation.
  ids <- if (is.numeric(x)) sprintf("%.0f", x) else as.character(x)
  ids[is.na(x)] <- NA_character_

  n_missing <- 11L - nchar(ids)
  short <- !is.na(ids) & n_missing > 0L
  ids[short] <- paste0(strrep("0", n_missing[short]), ids[short])
  ids
}
# Life expectancy table keyed by 2010 tract.
# Reads the workbook at `life_path` and keeps exactly two columns:
#   GEOID_2010 - `Tract ID` normalised through to_geoid11()
#   le_2015    - `e(0)` coerced to numeric (non-numeric cells become NA)
life <- read_excel(life_path) |>
  transmute(
    GEOID_2010 = to_geoid11(`Tract ID`),
    le_2015 = as.numeric(`e(0)`)
  )
# Tract relationship file: one row per (2010 tract, 2020 tract) overlap.
# Keeps only rows whose 2020 GEOID starts with "48" and returns three
# columns: GEOID_2010, GEOID_2020 and the overlap weight `w`.
#
# `w` is the land area of the overlap itself (AREALAND_PART). Life
# expectancy is an average, not a count, so when 2010 values are combined
# into a 2020 tract each piece must count in proportion to how much of the
# 2020 tract it covers. Dividing by the 2010 tract's own area
# (AREALAND_TRACT_10) instead gives the share of the *source* tract, which
# over-weights small 2010 tracts and yields 0/0 = NaN when a 2010 tract has
# no land area. Weights are relative: downstream code normalises them
# within each 2020 tract.
rel <- read_csv(rel_path, show_col_types = FALSE) |>
  mutate(
    GEOID_2010 = to_geoid11(GEOID_TRACT_10),
    GEOID_2020 = to_geoid11(GEOID_TRACT_20),
    w = as.numeric(AREALAND_PART)
  ) |>
  filter(substr(GEOID_2020, 1, 2) == "48") |>
  select(GEOID_2010, GEOID_2020, w)
# Weighted average of 2010 life expectancy for each 2020 tract.
# Steps: attach le_2015 to every overlap row, discard rows with no life
# expectancy, then within each GEOID_2020 rescale the remaining weights to
# sum to 1 and take the weighted sum. Rescaling happens after the NA rows
# are removed, so missing 2010 tracts do not drag the average down.
life_2020 <- left_join(rel, life, by = "GEOID_2010") |>
  filter(!is.na(le_2015)) |>
  group_by(GEOID_2020) |>
  summarise(
    life_expectancy_2020 = sum(le_2015 * (w / sum(w))),
    .groups = "drop"
  )
# Save the per-2020-tract estimates as CSV in the working directory.
life_2020 |>
  write_csv("life_expectancy_2010_interpolated_to_2020_tracts.csv")

# Sanity check: smallest and largest estimate, ignoring missing values.
range(life_2020[["life_expectancy_2020"]], na.rm = TRUE)