library(car)
library(mice)
library(ggplot2)
library(dplyr)
library(readxl)
library(writexl)Public Pre-K Imputation
I am imputing values for the percent of 3 and 4 year olds enrolled in public pre-K due to there being 65 census tracts with suppressed values. I employ a multiple imputation and missing data method to impute new values.
# read master spreadsheet
master <- read_excel("COA_full_data_021226.xlsx")# if blanks came in as empty strings, force them to NA
master[master == ""] <- NA# ensure index variables are numeric
index_vars <- c(
"public_preK_est",
"est_uninsured_rate",
"life_exp_imputed",
"est_has_disability_pct",
"est_65plus_amb_pct",
"LPA_CrudePrev",
"est_med_hh_inc",
"est_underemp_perc",
"est_belowpov_pct",
"Eviction filing rate",
"hh_support_risk_score",
"Energy Burden (% income)",
"est_lesh_perc",
"est_no_internet_perc",
"lths_pct",
"Percent Institutionalized GQ",
"Persistent Poverty",
"Temperature_Diff"
)
master[index_vars] <- lapply(master[index_vars], as.numeric)# create mice dataset
mice_data <- master %>%
select(all_of(index_vars))Missing data inspection
# visualize missing data
# first, make the variable names shorter so the table is readable
mice_short <- mice_data
colnames(mice_short) <- paste0("V", 1:ncol(mice_short))
md.pattern(mice_short) V2 V3 V4 V6 V8 V9 V11 V15 V16 V17 V18 V13 V14 V7 V12 V5 V10 V1
179 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
58 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1
2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 2
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 3
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 3
1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 2
1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 0 0 6
0 0 0 0 0 0 0 0 0 0 0 1 1 2 3 4 7 64 82
# key
data.frame(
short = paste0("V", 1:ncol(mice_data)),
original = colnames(mice_data)
) short original
1 V1 public_preK_est
2 V2 est_uninsured_rate
3 V3 life_exp_imputed
4 V4 est_has_disability_pct
5 V5 est_65plus_amb_pct
6 V6 LPA_CrudePrev
7 V7 est_med_hh_inc
8 V8 est_underemp_perc
9 V9 est_belowpov_pct
10 V10 Eviction filing rate
11 V11 hh_support_risk_score
12 V12 Energy Burden (% income)
13 V13 est_lesh_perc
14 V14 est_no_internet_perc
15 V15 lths_pct
16 V16 Percent Institutionalized GQ
17 V17 Persistent Poverty
18 V18 Temperature_Diff
# check missing pattern
md.pattern(mice_data) est_uninsured_rate life_exp_imputed est_has_disability_pct LPA_CrudePrev
179 1 1 1 1
58 1 1 1 1
5 1 1 1 1
1 1 1 1 1
2 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
0 0 0 0
est_underemp_perc est_belowpov_pct hh_support_risk_score lths_pct
179 1 1 1 1
58 1 1 1 1
5 1 1 1 1
1 1 1 1 1
2 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
0 0 0 0
Percent Institutionalized GQ Persistent Poverty Temperature_Diff
179 1 1 1
58 1 1 1
5 1 1 1
1 1 1 1
2 1 1 1
1 1 1 1
1 1 1 1
1 1 1 1
1 1 1 1
0 0 0
est_lesh_perc est_no_internet_perc est_med_hh_inc Energy Burden (% income)
179 1 1 1 1
58 1 1 1 1
5 1 1 1 1
1 1 1 1 1
2 1 1 1 1
1 1 1 1 0
1 1 1 1 0
1 1 1 0 1
1 0 0 0 0
1 1 2 3
est_65plus_amb_pct Eviction filing rate public_preK_est
179 1 1 1 0
58 1 1 0 1
5 1 0 1 1
1 0 1 1 1
2 0 1 0 2
1 1 0 0 3
1 0 1 0 3
1 1 1 0 2
1 1 0 0 6
4 7 64 82
# md.pairs(mice_data)colSums(is.na(mice_data)) public_preK_est est_uninsured_rate
64 0
life_exp_imputed est_has_disability_pct
0 0
est_65plus_amb_pct LPA_CrudePrev
4 0
est_med_hh_inc est_underemp_perc
2 0
est_belowpov_pct Eviction filing rate
0 7
hh_support_risk_score Energy Burden (% income)
0 3
est_lesh_perc est_no_internet_perc
1 1
lths_pct Percent Institutionalized GQ
0 0
Persistent Poverty Temperature_Diff
0 0
library(VIM)Warning: package 'VIM' was built under R version 4.5.2
Loading required package: colorspace
Warning: package 'colorspace' was built under R version 4.5.2
Loading required package: grid
VIM is ready to use.
Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
Attaching package: 'VIM'
The following object is masked from 'package:datasets':
sleep
aggr(mice_data, numbers = TRUE, sortVars = TRUE)
Variables sorted by number of missings:
Variable Count
public_preK_est 0.257028112
Eviction filing rate 0.028112450
est_65plus_amb_pct 0.016064257
Energy Burden (% income) 0.012048193
est_med_hh_inc 0.008032129
est_lesh_perc 0.004016064
est_no_internet_perc 0.004016064
est_uninsured_rate 0.000000000
life_exp_imputed 0.000000000
est_has_disability_pct 0.000000000
LPA_CrudePrev 0.000000000
est_underemp_perc 0.000000000
est_belowpov_pct 0.000000000
hh_support_risk_score 0.000000000
lths_pct 0.000000000
Percent Institutionalized GQ 0.000000000
Persistent Poverty 0.000000000
Temperature_Diff 0.000000000
# this verifies taht 25.7% of tracts are missing preK data due to suppressionAt this step, I define the predictor matrix.
This is a critical step where I:
- Impute only
public_preK_est - Use all other indicators as predictors
- Do NOT impute anything else
pred <- make.predictorMatrix(mice_data)
# turn everything off
pred[,] <- 0
# allow all other indicators to predict public_preK_est
pred["public_preK_est", setdiff(index_vars, "public_preK_est")] <- 1Here I define the imputation methods so that only public pre-K gets imputed.
I use predictive mean matching.
meth <- make.method(mice_data)
meth[] <- "" # nothing imputed
meth["public_preK_est"] <- "pmm" # only preKBelow I run MICE.
set.seed(500)
imp <- mice(
data = mice_data,
m = 10,
method = meth,
predictorMatrix = pred,
maxit = 5
)
iter imp variable
1 1 public_preK_est
1 2 public_preK_est
1 3 public_preK_est
1 4 public_preK_est
1 5 public_preK_est
1 6 public_preK_est
1 7 public_preK_est
1 8 public_preK_est
1 9 public_preK_est
1 10 public_preK_est
2 1 public_preK_est
2 2 public_preK_est
2 3 public_preK_est
2 4 public_preK_est
2 5 public_preK_est
2 6 public_preK_est
2 7 public_preK_est
2 8 public_preK_est
2 9 public_preK_est
2 10 public_preK_est
3 1 public_preK_est
3 2 public_preK_est
3 3 public_preK_est
3 4 public_preK_est
3 5 public_preK_est
3 6 public_preK_est
3 7 public_preK_est
3 8 public_preK_est
3 9 public_preK_est
3 10 public_preK_est
4 1 public_preK_est
4 2 public_preK_est
4 3 public_preK_est
4 4 public_preK_est
4 5 public_preK_est
4 6 public_preK_est
4 7 public_preK_est
4 8 public_preK_est
4 9 public_preK_est
4 10 public_preK_est
5 1 public_preK_est
5 2 public_preK_est
5 3 public_preK_est
5 4 public_preK_est
5 5 public_preK_est
5 6 public_preK_est
5 7 public_preK_est
5 8 public_preK_est
5 9 public_preK_est
5 10 public_preK_est
# inspect impuatations
summary(imp$imp$public_preK_est) 1 2 3 4
Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 10.70 1st Qu.: 4.10
Median : 37.60 Median : 39.75 Median : 71.95 Median : 69.20
Mean : 45.48 Mean : 46.67 Mean : 55.89 Mean : 57.98
3rd Qu.:100.00 3rd Qu.:100.00 3rd Qu.:100.00 3rd Qu.:100.00
Max. :100.00 Max. :100.00 Max. :100.00 Max. :100.00
NA's :6 NA's :6 NA's :6 NA's :6
5 6 7 8
Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 0.00
1st Qu.: 0.00 1st Qu.: 6.625 1st Qu.: 19.20 1st Qu.: 17.70
Median : 52.70 Median : 60.650 Median : 72.75 Median : 71.20
Mean : 50.19 Mean : 53.197 Mean : 60.18 Mean : 58.57
3rd Qu.:100.00 3rd Qu.:100.000 3rd Qu.:100.00 3rd Qu.:100.00
Max. :100.00 Max. :100.000 Max. :100.00 Max. :100.00
NA's :6 NA's :6 NA's :6 NA's :6
9 10
Min. : 0.00 Min. : 0.00
1st Qu.: 34.20 1st Qu.: 0.00
Median : 74.25 Median : 53.50
Mean : 63.11 Mean : 49.35
3rd Qu.:100.00 3rd Qu.:100.00
Max. :100.00 Max. :100.00
NA's :6 NA's :6
stripplot(imp, public_preK_est ~ .imp, pch = 20)# extract completed dataset
## Note: I chose to use one completed dataset, but we can change this to average across imputations if we'd rather do that. For now, just using one dataset since we are not creating models. We just need descriptive data for the index creation.
completed_data <- complete(imp, action = 1)
# create imputation flag
completed_data$preK_imputed_flag <- ifelse(
is.na(master$public_preK_est),
"Imputed",
"Original"
)
# export new preK data with imputed values
writexl::write_xlsx(
data.frame(
GEOID = master$GEOID,
public_preK_est_imputed = completed_data$public_preK_est,
preK_imputed_flag = completed_data$preK_imputed_flag
),
"preK_imputed_values.xlsx"
)# merge imputed values
master$public_preK_est_imputed <- completed_data$public_preK_est
# merge imputation flag
master$preK_imputed_flag <- completed_data$preK_imputed_flag
# save updated master spreadsheet
writexl::write_xlsx(master, "COA_master_preK_imputed.xlsx")Pre-K went from having 64 NA values to 6 NA values. This is due to there being NA values for some of the other variables in cases where Pre-K was also NA.
# compare distributions
summary(master$public_preK_est, na.rm = TRUE) Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.00 30.70 42.47 80.50 100.00 64
summary(master$public_preK_est_imputed) Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 0.00 30.70 43.19 92.70 100.00 6
# check out correlations
# define final index variables (replace original preK with imputed version)
final_index_vars <- index_vars
final_index_vars[final_index_vars == "public_preK_est"] <- "public_preK_est_imputed"
# create numeric-only dataset for correlation analysis
numeric_vars <- master[, final_index_vars]
preK_cor <- cor(
numeric_vars$public_preK_est_imputed,
numeric_vars,
use = "complete.obs"
)
preK_cor_df <- data.frame(
variable = colnames(numeric_vars),
correlation = as.numeric(preK_cor)
)
preK_cor_df <- preK_cor_df[order(abs(preK_cor_df$correlation), decreasing = TRUE), ]
preK_cor_df variable correlation
1 public_preK_est_imputed 1.00000000
15 lths_pct 0.50685456
6 LPA_CrudePrev 0.43215487
2 est_uninsured_rate 0.41339639
13 est_lesh_perc 0.36605735
12 Energy Burden (% income) 0.36311665
14 est_no_internet_perc 0.29811139
7 est_med_hh_inc -0.27811560
3 life_exp_imputed -0.24864858
5 est_65plus_amb_pct 0.24701122
4 est_has_disability_pct 0.24067960
10 Eviction filing rate 0.16800309
11 hh_support_risk_score 0.16764599
16 Percent Institutionalized GQ 0.12446405
17 Persistent Poverty 0.09975012
9 est_belowpov_pct 0.09173956
18 Temperature_Diff 0.02438662
8 est_underemp_perc 0.02342754
Strong positive correlations
lths_pct0.49LPA_CrudePrev0.46est_uninsured_rate0.44Energy Burden0.41est_lesh_perc0.37Eviction filing rate0.31est_belowpov_pct0.27
Negative correlations
life_exp_imputed-0.26est_med_hh_inc-0.34
These patterns makes sense. Higher public preK enrollment correlates with higher poverty, higher uninsured, higher energy burden, lower income, and lower life expectancy
I was curious how correlation would change after imputation compared to before. I chose to look at poverty. Below we can see that the correlation between poverty and public pre_K enrollment went down after imputation. In fact, pre-K missingness was patterned by poverty.
I found a moderate positive correlation between the imputed pre-K flag and poverty. This implies that tracts that were missing (suppressed) preK values were systematically associated with poverty levels and missingness was not random. It is structurally patterned.
# compare correlation with poverty after imputation
cor(master$public_preK_est_imputed, master$est_belowpov_pct, use="complete.obs")[1] 0.09540352
# compare correlation with poverty before imputation
cor(master$public_preK_est, master$est_belowpov_pct, use="complete.obs")[1] 0.263413
# confirm whether missingness was patterned by povert (non zero means yes)
cor(master$preK_imputed_flag == "Imputed",
master$est_belowpov_pct,
use = "complete.obs")[1] 0.3727097
The summary below shows that only public_preK_est was imputed and all other variables were treated as fully observed predictors. PMM (predictive mean matching) was used.
# check to make sure the analysis ran correctly
impClass: mids
Number of multiple imputations: 10
Imputation methods:
public_preK_est est_uninsured_rate
"pmm" ""
life_exp_imputed est_has_disability_pct
"" ""
est_65plus_amb_pct LPA_CrudePrev
"" ""
est_med_hh_inc est_underemp_perc
"" ""
est_belowpov_pct Eviction filing rate
"" ""
hh_support_risk_score Energy Burden (% income)
"" ""
est_lesh_perc est_no_internet_perc
"" ""
lths_pct Percent Institutionalized GQ
"" ""
Persistent Poverty Temperature_Diff
"" ""
PredictorMatrix:
public_preK_est est_uninsured_rate life_exp_imputed
public_preK_est 0 1 1
est_uninsured_rate 0 0 0
life_exp_imputed 0 0 0
est_has_disability_pct 0 0 0
est_65plus_amb_pct 0 0 0
LPA_CrudePrev 0 0 0
est_has_disability_pct est_65plus_amb_pct LPA_CrudePrev
public_preK_est 1 1 1
est_uninsured_rate 0 0 0
life_exp_imputed 0 0 0
est_has_disability_pct 0 0 0
est_65plus_amb_pct 0 0 0
LPA_CrudePrev 0 0 0
est_med_hh_inc est_underemp_perc est_belowpov_pct
public_preK_est 1 1 1
est_uninsured_rate 0 0 0
life_exp_imputed 0 0 0
est_has_disability_pct 0 0 0
est_65plus_amb_pct 0 0 0
LPA_CrudePrev 0 0 0
Eviction filing rate hh_support_risk_score
public_preK_est 1 1
est_uninsured_rate 0 0
life_exp_imputed 0 0
est_has_disability_pct 0 0
est_65plus_amb_pct 0 0
LPA_CrudePrev 0 0
Energy Burden (% income) est_lesh_perc
public_preK_est 1 1
est_uninsured_rate 0 0
life_exp_imputed 0 0
est_has_disability_pct 0 0
est_65plus_amb_pct 0 0
LPA_CrudePrev 0 0
est_no_internet_perc lths_pct
public_preK_est 1 1
est_uninsured_rate 0 0
life_exp_imputed 0 0
est_has_disability_pct 0 0
est_65plus_amb_pct 0 0
LPA_CrudePrev 0 0
Percent Institutionalized GQ Persistent Poverty
public_preK_est 1 1
est_uninsured_rate 0 0
life_exp_imputed 0 0
est_has_disability_pct 0 0
est_65plus_amb_pct 0 0
LPA_CrudePrev 0 0
Temperature_Diff
public_preK_est 1
est_uninsured_rate 0
life_exp_imputed 0
est_has_disability_pct 0
est_65plus_amb_pct 0
LPA_CrudePrev 0
plot(imp)# cehck distortion - did I alter observed values?? (should be close to 1)
cor(master$public_preK_est,
master$public_preK_est_imputed,
use="complete.obs")[1] 1
tapply(master$public_preK_est_imputed,
master$preK_imputed_flag,
mean) Imputed Original
NA 42.47027
# confirm # NA's left
table(is.na(master$public_preK_est_imputed))
FALSE TRUE
243 6
# confirm missings are imputed
table(master$preK_imputed_flag,
is.na(master$public_preK_est_imputed))
FALSE TRUE
Imputed 58 6
Original 185 0
# how do imputed tracts differ?
aggregate(public_preK_est_imputed ~ preK_imputed_flag,
data=master,
mean) preK_imputed_flag public_preK_est_imputed
1 Imputed 45.48103
2 Original 42.47027
Does the relationship between public preK and other indicators meaningfully change after imputation?
If this correlation is high (> .9), then index structure is stable.
# compute percentile ranks with complete cases
master$preK_rank_complete <- percent_rank(master$public_preK_est)
# compute percentile ranks with imputed data
master$preK_rank_imputed <- percent_rank(master$public_preK_est_imputed)
cor(master$preK_rank_complete,
master$preK_rank_imputed,
use="complete.obs")[1] 0.9996663
Now, let’s see how many census tracts have at least one NA value for any of the 18 index indicators. These tracts will be excluded from the overall composite index later on.
There are a total of 12 tracts with missing data, which will therefore be excluded in our final mapped index.
# logical vector that measures whether tract has any NA among the 18 indicators
tract_missing_any <- apply(
master[, final_index_vars],
1,
function(x) any(is.na(x))
)
# count tracts with at least one missing value
sum_missing_tracts <- sum(tract_missing_any)
# total tracts
total_tracts <- nrow(master)
# percent missing
percent_missing <- round(mean(tract_missing_any) * 100, 2)
# summary
cat("total census tracts:", total_tracts, "\n")total census tracts: 249
cat("tracts with ≥1 missing index value:", sum_missing_tracts, "\n")tracts with ≥1 missing index value: 12
cat("% of tracts with ≥1 missing value:", percent_missing, "%\n")% of tracts with ≥1 missing value: 4.82 %
# list which tracts will be excluded
master$GEOID[tract_missing_any] [1] 48453000102 48453000605 48453000608 48453001606 48453002304 48453002319
[7] 48453002324 48453002326 48453034300 48453034500 48453036800 48453044600
There are 4 missing for % of seniors with an ambulatory disability. This makes sense because it’s a smaller population for some tracts.
There are also 7 NA for eviction filings. Again, not a surprise. I did a separate imputation analysis for evictions at the bottom of this published code: https://rpubs.com/kaitlan/1393863.
# missing values by variable
colSums(is.na(master[, final_index_vars])) public_preK_est_imputed est_uninsured_rate
6 0
life_exp_imputed est_has_disability_pct
0 0
est_65plus_amb_pct LPA_CrudePrev
4 0
est_med_hh_inc est_underemp_perc
2 0
est_belowpov_pct Eviction filing rate
0 7
hh_support_risk_score Energy Burden (% income)
0 3
est_lesh_perc est_no_internet_perc
1 1
lths_pct Percent Institutionalized GQ
0 0
Persistent Poverty Temperature_Diff
0 0