Public Pre-K Imputation

Author

Kaitlan Wong

I am imputing values for the percent of 3 and 4 year olds enrolled in public pre-K due to there being 65 census tracts with suppressed values. I employ a multiple imputation and missing data method to impute new values.

library(car)
library(mice)
library(ggplot2)
library(dplyr)
library(readxl)
library(writexl)
# read master spreadsheet
master <- read_excel("COA_full_data_021226.xlsx")
# if blanks came in as empty strings, force them to NA
master[master == ""] <- NA
# ensure index variables are numeric

index_vars <- c(
  "public_preK_est",
  "est_uninsured_rate",
  "life_exp_imputed",
  "est_has_disability_pct",
  "est_65plus_amb_pct",
  "LPA_CrudePrev",
  "est_med_hh_inc",
  "est_underemp_perc",
  "est_belowpov_pct",
  "Eviction filing rate",
  "hh_support_risk_score",
  "Energy Burden (% income)",
  "est_lesh_perc",
  "est_no_internet_perc",
  "lths_pct",
  "Percent Institutionalized GQ",
  "Persistent Poverty",
  "Temperature_Diff"
)

master[index_vars] <- lapply(master[index_vars], as.numeric)
# create mice dataset
mice_data <- master %>%
  select(all_of(index_vars))

Missing data inspection

# visualize missing data

# first, make the variable names shorter so the table is readable
mice_short <- mice_data
colnames(mice_short) <- paste0("V", 1:ncol(mice_short))

md.pattern(mice_short)

    V2 V3 V4 V6 V8 V9 V11 V15 V16 V17 V18 V13 V14 V7 V12 V5 V10 V1   
179  1  1  1  1  1  1   1   1   1   1   1   1   1  1   1  1   1  1  0
58   1  1  1  1  1  1   1   1   1   1   1   1   1  1   1  1   1  0  1
5    1  1  1  1  1  1   1   1   1   1   1   1   1  1   1  1   0  1  1
1    1  1  1  1  1  1   1   1   1   1   1   1   1  1   1  0   1  1  1
2    1  1  1  1  1  1   1   1   1   1   1   1   1  1   1  0   1  0  2
1    1  1  1  1  1  1   1   1   1   1   1   1   1  1   0  1   0  0  3
1    1  1  1  1  1  1   1   1   1   1   1   1   1  1   0  0   1  0  3
1    1  1  1  1  1  1   1   1   1   1   1   1   1  0   1  1   1  0  2
1    1  1  1  1  1  1   1   1   1   1   1   0   0  0   0  1   0  0  6
     0  0  0  0  0  0   0   0   0   0   0   1   1  2   3  4   7 64 82
# key
data.frame(
  short = paste0("V", 1:ncol(mice_data)),
  original = colnames(mice_data)
)
   short                     original
1     V1              public_preK_est
2     V2           est_uninsured_rate
3     V3             life_exp_imputed
4     V4       est_has_disability_pct
5     V5           est_65plus_amb_pct
6     V6                LPA_CrudePrev
7     V7               est_med_hh_inc
8     V8            est_underemp_perc
9     V9             est_belowpov_pct
10   V10         Eviction filing rate
11   V11        hh_support_risk_score
12   V12     Energy Burden (% income)
13   V13                est_lesh_perc
14   V14         est_no_internet_perc
15   V15                     lths_pct
16   V16 Percent Institutionalized GQ
17   V17           Persistent Poverty
18   V18             Temperature_Diff
# check missing pattern
md.pattern(mice_data)

    est_uninsured_rate life_exp_imputed est_has_disability_pct LPA_CrudePrev
179                  1                1                      1             1
58                   1                1                      1             1
5                    1                1                      1             1
1                    1                1                      1             1
2                    1                1                      1             1
1                    1                1                      1             1
1                    1                1                      1             1
1                    1                1                      1             1
1                    1                1                      1             1
                     0                0                      0             0
    est_underemp_perc est_belowpov_pct hh_support_risk_score lths_pct
179                 1                1                     1        1
58                  1                1                     1        1
5                   1                1                     1        1
1                   1                1                     1        1
2                   1                1                     1        1
1                   1                1                     1        1
1                   1                1                     1        1
1                   1                1                     1        1
1                   1                1                     1        1
                    0                0                     0        0
    Percent Institutionalized GQ Persistent Poverty Temperature_Diff
179                            1                  1                1
58                             1                  1                1
5                              1                  1                1
1                              1                  1                1
2                              1                  1                1
1                              1                  1                1
1                              1                  1                1
1                              1                  1                1
1                              1                  1                1
                               0                  0                0
    est_lesh_perc est_no_internet_perc est_med_hh_inc Energy Burden (% income)
179             1                    1              1                        1
58              1                    1              1                        1
5               1                    1              1                        1
1               1                    1              1                        1
2               1                    1              1                        1
1               1                    1              1                        0
1               1                    1              1                        0
1               1                    1              0                        1
1               0                    0              0                        0
                1                    1              2                        3
    est_65plus_amb_pct Eviction filing rate public_preK_est   
179                  1                    1               1  0
58                   1                    1               0  1
5                    1                    0               1  1
1                    0                    1               1  1
2                    0                    1               0  2
1                    1                    0               0  3
1                    0                    1               0  3
1                    1                    1               0  2
1                    1                    0               0  6
                     4                    7              64 82
# md.pairs(mice_data)
colSums(is.na(mice_data))
             public_preK_est           est_uninsured_rate 
                          64                            0 
            life_exp_imputed       est_has_disability_pct 
                           0                            0 
          est_65plus_amb_pct                LPA_CrudePrev 
                           4                            0 
              est_med_hh_inc            est_underemp_perc 
                           2                            0 
            est_belowpov_pct         Eviction filing rate 
                           0                            7 
       hh_support_risk_score     Energy Burden (% income) 
                           0                            3 
               est_lesh_perc         est_no_internet_perc 
                           1                            1 
                    lths_pct Percent Institutionalized GQ 
                           0                            0 
          Persistent Poverty             Temperature_Diff 
                           0                            0 
library(VIM)
Warning: package 'VIM' was built under R version 4.5.2
Loading required package: colorspace
Warning: package 'colorspace' was built under R version 4.5.2
Loading required package: grid
VIM is ready to use.
Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

Attaching package: 'VIM'
The following object is masked from 'package:datasets':

    sleep
aggr(mice_data, numbers = TRUE, sortVars = TRUE)


 Variables sorted by number of missings: 
                     Variable       Count
              public_preK_est 0.257028112
         Eviction filing rate 0.028112450
           est_65plus_amb_pct 0.016064257
     Energy Burden (% income) 0.012048193
               est_med_hh_inc 0.008032129
                est_lesh_perc 0.004016064
         est_no_internet_perc 0.004016064
           est_uninsured_rate 0.000000000
             life_exp_imputed 0.000000000
       est_has_disability_pct 0.000000000
                LPA_CrudePrev 0.000000000
            est_underemp_perc 0.000000000
             est_belowpov_pct 0.000000000
        hh_support_risk_score 0.000000000
                     lths_pct 0.000000000
 Percent Institutionalized GQ 0.000000000
           Persistent Poverty 0.000000000
             Temperature_Diff 0.000000000
# this verifies taht 25.7% of tracts are missing preK data due to suppression

At this step, I define the predictor matrix.

This is a critical step where I:

  1. Impute only public_preK_est
  2. Use all other indicators as predictors
  3. Do NOT impute anything else
pred <- make.predictorMatrix(mice_data)

# turn everything off
pred[,] <- 0

# allow all other indicators to predict public_preK_est
pred["public_preK_est", setdiff(index_vars, "public_preK_est")] <- 1

Here I define the imputation methods so that only public pre-K gets imputed.

I use predictive mean matching.

meth <- make.method(mice_data)

meth[] <- ""                      # nothing imputed
meth["public_preK_est"] <- "pmm"  # only preK

Below I run MICE.

set.seed(500)

imp <- mice(
  data = mice_data,
  m = 10,
  method = meth,
  predictorMatrix = pred,
  maxit = 5
)

 iter imp variable
  1   1  public_preK_est
  1   2  public_preK_est
  1   3  public_preK_est
  1   4  public_preK_est
  1   5  public_preK_est
  1   6  public_preK_est
  1   7  public_preK_est
  1   8  public_preK_est
  1   9  public_preK_est
  1   10  public_preK_est
  2   1  public_preK_est
  2   2  public_preK_est
  2   3  public_preK_est
  2   4  public_preK_est
  2   5  public_preK_est
  2   6  public_preK_est
  2   7  public_preK_est
  2   8  public_preK_est
  2   9  public_preK_est
  2   10  public_preK_est
  3   1  public_preK_est
  3   2  public_preK_est
  3   3  public_preK_est
  3   4  public_preK_est
  3   5  public_preK_est
  3   6  public_preK_est
  3   7  public_preK_est
  3   8  public_preK_est
  3   9  public_preK_est
  3   10  public_preK_est
  4   1  public_preK_est
  4   2  public_preK_est
  4   3  public_preK_est
  4   4  public_preK_est
  4   5  public_preK_est
  4   6  public_preK_est
  4   7  public_preK_est
  4   8  public_preK_est
  4   9  public_preK_est
  4   10  public_preK_est
  5   1  public_preK_est
  5   2  public_preK_est
  5   3  public_preK_est
  5   4  public_preK_est
  5   5  public_preK_est
  5   6  public_preK_est
  5   7  public_preK_est
  5   8  public_preK_est
  5   9  public_preK_est
  5   10  public_preK_est
# inspect impuatations
summary(imp$imp$public_preK_est)
       1                2                3                4         
 Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
 1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.: 10.70   1st Qu.:  4.10  
 Median : 37.60   Median : 39.75   Median : 71.95   Median : 69.20  
 Mean   : 45.48   Mean   : 46.67   Mean   : 55.89   Mean   : 57.98  
 3rd Qu.:100.00   3rd Qu.:100.00   3rd Qu.:100.00   3rd Qu.:100.00  
 Max.   :100.00   Max.   :100.00   Max.   :100.00   Max.   :100.00  
 NA's   :6        NA's   :6        NA's   :6        NA's   :6       
       5                6                 7                8         
 Min.   :  0.00   Min.   :  0.000   Min.   :  0.00   Min.   :  0.00  
 1st Qu.:  0.00   1st Qu.:  6.625   1st Qu.: 19.20   1st Qu.: 17.70  
 Median : 52.70   Median : 60.650   Median : 72.75   Median : 71.20  
 Mean   : 50.19   Mean   : 53.197   Mean   : 60.18   Mean   : 58.57  
 3rd Qu.:100.00   3rd Qu.:100.000   3rd Qu.:100.00   3rd Qu.:100.00  
 Max.   :100.00   Max.   :100.000   Max.   :100.00   Max.   :100.00  
 NA's   :6        NA's   :6         NA's   :6        NA's   :6       
       9                10        
 Min.   :  0.00   Min.   :  0.00  
 1st Qu.: 34.20   1st Qu.:  0.00  
 Median : 74.25   Median : 53.50  
 Mean   : 63.11   Mean   : 49.35  
 3rd Qu.:100.00   3rd Qu.:100.00  
 Max.   :100.00   Max.   :100.00  
 NA's   :6        NA's   :6       
stripplot(imp, public_preK_est ~ .imp, pch = 20)

# extract completed dataset
## Note: I chose to use one completed dataset, but we can change this to average across imputations if we'd rather do that. For now, just using one dataset since we are not creating models. We just need descriptive data for the index creation.
completed_data <- complete(imp, action = 1)

# create imputation flag
completed_data$preK_imputed_flag <- ifelse(
  is.na(master$public_preK_est),
  "Imputed",
  "Original"
)

# export new preK data with imputed values
writexl::write_xlsx(
  data.frame(
    GEOID = master$GEOID,
    public_preK_est_imputed = completed_data$public_preK_est,
    preK_imputed_flag = completed_data$preK_imputed_flag
  ),
  "preK_imputed_values.xlsx"
)
# merge imputed values
master$public_preK_est_imputed <- completed_data$public_preK_est

# merge imputation flag
master$preK_imputed_flag <- completed_data$preK_imputed_flag

# save updated master spreadsheet
writexl::write_xlsx(master, "COA_master_preK_imputed.xlsx")

Pre-K went from having 64 NA values to 6 NA values. This is due to there being NA values for some of the other variables in cases where Pre-K was also NA.

# compare distributions
summary(master$public_preK_est, na.rm = TRUE)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.00   30.70   42.47   80.50  100.00      64 
summary(master$public_preK_est_imputed)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.00   30.70   43.19   92.70  100.00       6 
# check out correlations

# define final index variables (replace original preK with imputed version)
final_index_vars <- index_vars
final_index_vars[final_index_vars == "public_preK_est"] <- "public_preK_est_imputed"

# create numeric-only dataset for correlation analysis
numeric_vars <- master[, final_index_vars]

preK_cor <- cor(
  numeric_vars$public_preK_est_imputed,
  numeric_vars,
  use = "complete.obs"
)

preK_cor_df <- data.frame(
  variable = colnames(numeric_vars),
  correlation = as.numeric(preK_cor)
)

preK_cor_df <- preK_cor_df[order(abs(preK_cor_df$correlation), decreasing = TRUE), ]

preK_cor_df
                       variable correlation
1       public_preK_est_imputed  1.00000000
15                     lths_pct  0.50685456
6                 LPA_CrudePrev  0.43215487
2            est_uninsured_rate  0.41339639
13                est_lesh_perc  0.36605735
12     Energy Burden (% income)  0.36311665
14         est_no_internet_perc  0.29811139
7                est_med_hh_inc -0.27811560
3              life_exp_imputed -0.24864858
5            est_65plus_amb_pct  0.24701122
4        est_has_disability_pct  0.24067960
10         Eviction filing rate  0.16800309
11        hh_support_risk_score  0.16764599
16 Percent Institutionalized GQ  0.12446405
17           Persistent Poverty  0.09975012
9              est_belowpov_pct  0.09173956
18             Temperature_Diff  0.02438662
8             est_underemp_perc  0.02342754

Strong positive correlations

  • lths_pct 0.49

  • LPA_CrudePrev 0.46

  • est_uninsured_rate 0.44

  • Energy Burden 0.41

  • est_lesh_perc 0.37

  • Eviction filing rate 0.31

  • est_belowpov_pct 0.27

Negative correlations

  • life_exp_imputed -0.26

  • est_med_hh_inc -0.34

These patterns makes sense. Higher public preK enrollment correlates with higher poverty, higher uninsured, higher energy burden, lower income, and lower life expectancy

I was curious how correlation would change after imputation compared to before. I chose to look at poverty. Below we can see that the correlation between poverty and public pre_K enrollment went down after imputation. In fact, pre-K missingness was patterned by poverty.

I found a moderate positive correlation between the imputed pre-K flag and poverty. This implies that tracts that were missing (suppressed) preK values were systematically associated with poverty levels and missingness was not random. It is structurally patterned.

# compare correlation with poverty after imputation
cor(master$public_preK_est_imputed, master$est_belowpov_pct, use="complete.obs")
[1] 0.09540352
# compare correlation with poverty before imputation
cor(master$public_preK_est, master$est_belowpov_pct, use="complete.obs")
[1] 0.263413
# confirm whether missingness was patterned by povert (non zero means yes)
cor(master$preK_imputed_flag == "Imputed",
    master$est_belowpov_pct,
    use = "complete.obs")
[1] 0.3727097

The summary below shows that only public_preK_est was imputed and all other variables were treated as fully observed predictors. PMM (predictive mean matching) was used.

# check to make sure the analysis ran correctly
imp
Class: mids
Number of multiple imputations:  10 
Imputation methods:
             public_preK_est           est_uninsured_rate 
                       "pmm"                           "" 
            life_exp_imputed       est_has_disability_pct 
                          ""                           "" 
          est_65plus_amb_pct                LPA_CrudePrev 
                          ""                           "" 
              est_med_hh_inc            est_underemp_perc 
                          ""                           "" 
            est_belowpov_pct         Eviction filing rate 
                          ""                           "" 
       hh_support_risk_score     Energy Burden (% income) 
                          ""                           "" 
               est_lesh_perc         est_no_internet_perc 
                          ""                           "" 
                    lths_pct Percent Institutionalized GQ 
                          ""                           "" 
          Persistent Poverty             Temperature_Diff 
                          ""                           "" 
PredictorMatrix:
                       public_preK_est est_uninsured_rate life_exp_imputed
public_preK_est                      0                  1                1
est_uninsured_rate                   0                  0                0
life_exp_imputed                     0                  0                0
est_has_disability_pct               0                  0                0
est_65plus_amb_pct                   0                  0                0
LPA_CrudePrev                        0                  0                0
                       est_has_disability_pct est_65plus_amb_pct LPA_CrudePrev
public_preK_est                             1                  1             1
est_uninsured_rate                          0                  0             0
life_exp_imputed                            0                  0             0
est_has_disability_pct                      0                  0             0
est_65plus_amb_pct                          0                  0             0
LPA_CrudePrev                               0                  0             0
                       est_med_hh_inc est_underemp_perc est_belowpov_pct
public_preK_est                     1                 1                1
est_uninsured_rate                  0                 0                0
life_exp_imputed                    0                 0                0
est_has_disability_pct              0                 0                0
est_65plus_amb_pct                  0                 0                0
LPA_CrudePrev                       0                 0                0
                       Eviction filing rate hh_support_risk_score
public_preK_est                           1                     1
est_uninsured_rate                        0                     0
life_exp_imputed                          0                     0
est_has_disability_pct                    0                     0
est_65plus_amb_pct                        0                     0
LPA_CrudePrev                             0                     0
                       Energy Burden (% income) est_lesh_perc
public_preK_est                               1             1
est_uninsured_rate                            0             0
life_exp_imputed                              0             0
est_has_disability_pct                        0             0
est_65plus_amb_pct                            0             0
LPA_CrudePrev                                 0             0
                       est_no_internet_perc lths_pct
public_preK_est                           1        1
est_uninsured_rate                        0        0
life_exp_imputed                          0        0
est_has_disability_pct                    0        0
est_65plus_amb_pct                        0        0
LPA_CrudePrev                             0        0
                       Percent Institutionalized GQ Persistent Poverty
public_preK_est                                   1                  1
est_uninsured_rate                                0                  0
life_exp_imputed                                  0                  0
est_has_disability_pct                            0                  0
est_65plus_amb_pct                                0                  0
LPA_CrudePrev                                     0                  0
                       Temperature_Diff
public_preK_est                       1
est_uninsured_rate                    0
life_exp_imputed                      0
est_has_disability_pct                0
est_65plus_amb_pct                    0
LPA_CrudePrev                         0
plot(imp)

# cehck distortion - did I alter observed values?? (should be close to 1)
cor(master$public_preK_est,
    master$public_preK_est_imputed,
    use="complete.obs")
[1] 1
tapply(master$public_preK_est_imputed,
       master$preK_imputed_flag,
       mean)
 Imputed Original 
      NA 42.47027 
# confirm # NA's left
table(is.na(master$public_preK_est_imputed))

FALSE  TRUE 
  243     6 
# confirm missings are imputed
table(master$preK_imputed_flag,
      is.na(master$public_preK_est_imputed))
          
           FALSE TRUE
  Imputed     58    6
  Original   185    0
# how do imputed tracts differ?
aggregate(public_preK_est_imputed ~ preK_imputed_flag,
          data=master,
          mean)
  preK_imputed_flag public_preK_est_imputed
1           Imputed                45.48103
2          Original                42.47027

Does the relationship between public preK and other indicators meaningfully change after imputation?

If this correlation is high (> .9), then index structure is stable.

# compute percentile ranks with complete cases
master$preK_rank_complete <- percent_rank(master$public_preK_est)

# compute percentile ranks with imputed data
master$preK_rank_imputed <- percent_rank(master$public_preK_est_imputed)

cor(master$preK_rank_complete,
    master$preK_rank_imputed,
    use="complete.obs")
[1] 0.9996663

Now, let’s see how many census tracts have at least one NA value for any of the 18 index indicators. These tracts will be excluded from the overall composite index later on.

There are a total of 12 tracts with missing data, which will therefore be excluded in our final mapped index.

# logical vector that measures whether tract has any NA among the 18 indicators
tract_missing_any <- apply(
  master[, final_index_vars],
  1,
  function(x) any(is.na(x))
)

# count tracts with at least one missing value
sum_missing_tracts <- sum(tract_missing_any)

# total tracts
total_tracts <- nrow(master)

# percent missing
percent_missing <- round(mean(tract_missing_any) * 100, 2)

# summary
cat("total census tracts:", total_tracts, "\n")
total census tracts: 249 
cat("tracts with ≥1 missing index value:", sum_missing_tracts, "\n")
tracts with ≥1 missing index value: 12 
cat("% of tracts with ≥1 missing value:", percent_missing, "%\n")
% of tracts with ≥1 missing value: 4.82 %
# list which tracts will be excluded
master$GEOID[tract_missing_any]
 [1] 48453000102 48453000605 48453000608 48453001606 48453002304 48453002319
 [7] 48453002324 48453002326 48453034300 48453034500 48453036800 48453044600

There are 4 missing for % of seniors with an ambulatory disability. This makes sense because it’s a smaller population for some tracts.

There are also 7 NA for eviction filings. Again, not a surprise. I did a separate imputation analysis for evictions at the bottom of this published code: https://rpubs.com/kaitlan/1393863.

# missing values by variable
colSums(is.na(master[, final_index_vars]))
     public_preK_est_imputed           est_uninsured_rate 
                           6                            0 
            life_exp_imputed       est_has_disability_pct 
                           0                            0 
          est_65plus_amb_pct                LPA_CrudePrev 
                           4                            0 
              est_med_hh_inc            est_underemp_perc 
                           2                            0 
            est_belowpov_pct         Eviction filing rate 
                           0                            7 
       hh_support_risk_score     Energy Burden (% income) 
                           0                            3 
               est_lesh_perc         est_no_internet_perc 
                           1                            1 
                    lths_pct Percent Institutionalized GQ 
                           0                            0 
          Persistent Poverty             Temperature_Diff 
                           0                            0