PAA 2025 data cleaning

Author

Brian Surratt

Published

April 11, 2025

#Loading libraries

Code
# Loading libraries

librarian::shelf(knitr, car, foreign, dplyr, janitor, ggplot2, ggpubr, usmapdata, usmap, gtsummary, gt, lme4, emmeans, summarytools, data.table, cobalt, WeightIt, margins, marginaleffects, readxl, nlme, multilevel, MASS, survey)

  The 'cran_repo' argument in shelf() was not set, so it will use
  cran_repo = 'https://cran.r-project.org' by default.

  To avoid this message, set the 'cran_repo' argument to a CRAN
  mirror URL (see https://cran.r-project.org/mirrors.html) or set
  'quiet = TRUE'.
Code
opts_chunk$set(echo = TRUE)

setwd("/Users/briansurratt/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/PAA 2025/R files")

options(scipen = 999) # This turns off scientific notation.

I’m going to add libraries as needed.

library(stargazer) library(webshot2)

Reading in the data

Importing the data for Household Pulse Survey phase 3.2 to 3.4.

Code
alldata <- read.csv("./data/alldata.csv")

nrow(alldata)
[1] 803905

Selecting variables from alldata for a working dataframe titled ‘allhh’. Remember that rent amount was not in the initial survey and isn’t in every wave. Started asking phase 3.5, week 46 (I think).

Code
allhh <- alldata %>% 
  dplyr::select(week, income, tenure, evict, rentcur, anxious, worry, interest, down, tbirth_year, genid_describe, rhispanic, rrace, ms, thhld_numadlt, thhld_numper, thhld_numkid, eeduc, hweight, pweight, est_st, state, abbv, score, est_msa, wrklossrv, anywork)

nrow(allhh)
[1] 803905

Cleaning and exploring the data

Check distribution of tenure and clear NAs.

Check the distribution of tenure.

TENURE 1) Owned by you or someone in this household free and clear? 2) Owned by your or someone in this household with a mortgage or loan (including home equity loans)? 3) Rented? 4) Occupied without payment of rent? -99) Question seen but category not selected -88) Missing / Did not report

Code
tabyl(allhh$tenure)
 allhh$tenure      n     percent
          -99   6498 0.008083045
          -88 117839 0.146583241
            1 194842 0.242369434
            2 316637 0.393873654
            3 159649 0.198591873
            4   8440 0.010498753
Code
# Remove NAs

allhh <- allhh %>%
  filter(tenure %in% c(1:4))

tabyl(allhh$tenure)
 allhh$tenure      n    percent
            1 194842 0.28671450
            2 316637 0.46593866
            3 159649 0.23492719
            4   8440 0.01241965
Code
nrow(allhh)
[1] 679568

15% of the respondents did not report on housing tenure!

Create a dichotomous variable ‘renting’

Code
allhh <- allhh %>%
  mutate(renting = case_when(tenure == 3 ~ 1,
                             TRUE ~ 0))

tabyl(allhh$renting)
 allhh$renting      n   percent
             0 519919 0.7650728
             1 159649 0.2349272
Code
nrow(allhh)
[1] 679568

Create a dichotomous variable ‘owner’

Code
allhh <- allhh %>%
  mutate(owner = case_when(tenure %in% c(1:2) ~ 1,
                             TRUE ~ 0))

tabyl(allhh$owner)
 allhh$owner      n   percent
           0 168089 0.2473468
           1 511479 0.7526532
Code
nrow(allhh)
[1] 679568

Clean and recode ‘evict’

Checking distribution of evict.

EVICT 1) Very likely 2) Somewhat likely 3) Not very likely 4) Not likely at all -99) Question seen but category not selected -88) Missing / Did not report

Only asked if rentcur = 2 which is “no”.

Code
tabyl(allhh$evict)
 allhh$evict      n      percent
         -99    143 0.0002104278
         -88 661780 0.9738245474
           1   2509 0.0036920514
           2   4986 0.0073370141
           3   5275 0.0077622843
           4   4875 0.0071736750

This needs to be recoded so that 0 is current on rent (not at risk of eviction due to late on rent), 1 is not likely at all, 2 is not very likely, 3 is somewhat likely, and 4 is very likely.

Code ‘evictrisk’ = 0 when rentcur = 1. Change other codes as well.

Code
allhh <- allhh %>%
  mutate(evictrisk = case_when(tenure != 3 ~ 0, # Homeowners are not at risk of eviction.
                               rentcur == 1 ~ 0, # Current renters not at risk.
                               evict == 4 ~ 1, # Late renters not likely at all to be evicted.
                               evict == 3 ~ 2, # Late renters not very likely to be evicted.
                               evict == 2 ~ 3, # Late renters somewhat likely to be evicted.
                               evict == 1 ~ 4 # Late renters very likely to be evicted.
                               )
         )

tabyl(allhh$evictrisk)
 allhh$evictrisk      n     percent valid_percent
               0 660858 0.972467803   0.973994220
               1   4875 0.007173675   0.007184935
               2   5275 0.007762284   0.007774468
               3   4986 0.007337014   0.007348531
               4   2509 0.003692051   0.003697847
              NA   1065 0.001567172            NA
Code
nrow(allhh)
[1] 679568

Filer out NAs.

Code
allhh <- allhh %>%
  filter(evictrisk %in% c(0:4))

tabyl(allhh$evictrisk)
 allhh$evictrisk      n     percent
               0 660858 0.973994220
               1   4875 0.007184935
               2   5275 0.007774468
               3   4986 0.007348531
               4   2509 0.003697847
Code
nrow(allhh)
[1] 678503

Create a dichotomous variable called “highrisk” for renters who are very or somewhat likely to be evicted in the next 2 months.

Code
allhh <- allhh %>% 
  mutate(highrisk = case_when(evictrisk %in% c(3:4) ~ 1,
                              TRUE ~ 0))

tabyl(allhh$highrisk)
 allhh$highrisk      n    percent
              0 671008 0.98895362
              1   7495 0.01104638
Code
nrow(allhh)
[1] 678503

Creating dummy variables for eviction risk

Code
allhh$evict0 <- car::recode(allhh$evictrisk, "0=1; else=0")
tabyl(allhh$evict0)
 allhh$evict0      n    percent
            0  17645 0.02600578
            1 660858 0.97399422
Code
allhh$evict1 <- car::recode(allhh$evictrisk, "1=1; else=0")
tabyl(allhh$evict1)
 allhh$evict1      n     percent
            0 673628 0.992815065
            1   4875 0.007184935
Code
allhh$evict2 <- car::recode(allhh$evictrisk, "2=1; else=0")
tabyl(allhh$evict2)
 allhh$evict2      n     percent
            0 673228 0.992225532
            1   5275 0.007774468
Code
allhh$evict3 <- car::recode(allhh$evictrisk, "3=1; else=0")
tabyl(allhh$evict3)
 allhh$evict3      n     percent
            0 673517 0.992651469
            1   4986 0.007348531
Code
allhh$evict4 <- car::recode(allhh$evictrisk, "4=1; else=0")
tabyl(allhh$evict4)
 allhh$evict4      n     percent
            0 675994 0.996302153
            1   2509 0.003697847

Check distribution of state policy variable

Code
tabyl(allhh, score)
 score     n     percent
  0.00 69106 0.101850692
  0.23 17733 0.026135478
  0.30  5200 0.007663931
  0.38 19289 0.028428762
  0.53 11170 0.016462713
  0.60  9545 0.014067734
  0.63  6668 0.009827517
  0.75  5471 0.008063339
  0.78 17846 0.026302021
  0.83 26963 0.039738955
  1.03 12318 0.018154673
  1.15 12241 0.018041188
  1.23 11491 0.016935813
  1.28 50548 0.074499302
  1.33 52349 0.077153675
  1.43 12465 0.018371326
  1.58 31366 0.046228241
  1.84 12119 0.017861380
  1.98  6508 0.009591704
  2.08 12776 0.018829688
  2.29  7469 0.011008057
  2.30  9824 0.014478934
  2.48  7584 0.011177548
  2.65  9019 0.013292498
  2.73 18098 0.026673427
  2.78 10430 0.015372076
  3.00 11432 0.016848857
  3.08 14155 0.020862104
  3.13 17260 0.025438355
  3.15  7190 0.010596858
  3.25 12727 0.018757470
  3.33 18639 0.027470770
  3.38 17830 0.026278439
  3.73 11128 0.016400812
  3.75 27522 0.040562827
  3.78 12054 0.017765581
  3.80 14936 0.022013167
  3.88  7970 0.011746448
  4.03 19823 0.029215788
  4.30 11395 0.016794325
  4.63  8846 0.013037525

Clean up covariates

Clean income

INCOME: Total household income (before taxes). 1) Less than $25,000
2) $25,000 - $34,999
3) $35,000 - $49,999
4) $50,000 - $74,999
5) $75,000 - $99,999
6) $100,000 - $149,999
7) $150,000 - $199,999 8) $200,000 and above -99) Question seen but category not selected -88) Missing / Did not report

Check distribution of income.

Code
tabyl(allhh$income)
 allhh$income      n    percent
          -99  17392 0.02563290
          -88  19977 0.02944276
            1  70996 0.10463624
            2  56929 0.08390383
            3  69383 0.10225894
            4 109065 0.16074358
            5  91249 0.13448577
            6 116218 0.17128590
            7  56936 0.08391415
            8  70358 0.10369593

Remove NAs

Code
allhh <- allhh %>% 
  filter(.$income %in% c(1:8))

tabyl(allhh$income)
 allhh$income      n    percent
            1  70996 0.11073504
            2  56929 0.08879423
            3  69383 0.10821919
            4 109065 0.17011264
            5  91249 0.14232438
            6 116218 0.18126944
            7  56936 0.08880515
            8  70358 0.10973993
Code
nrow(allhh)
[1] 641134

Histogram of income levels.

Code
hist(allhh$income)

Create a dichotomous variable of less than $75,000 income, ‘less75k’.

Code
tabyl(allhh$income)
 allhh$income      n    percent
            1  70996 0.11073504
            2  56929 0.08879423
            3  69383 0.10821919
            4 109065 0.17011264
            5  91249 0.14232438
            6 116218 0.18126944
            7  56936 0.08880515
            8  70358 0.10973993
Code
allhh <- allhh %>% 
  mutate(less75k = case_when(income %in% c(1,2,3,4) ~ 1,
                             TRUE ~ 0
                             )
         )

tabyl(allhh$less75k)
 allhh$less75k      n   percent
             0 334761 0.5221389
             1 306373 0.4778611
Code
nrow(allhh)
[1] 641134

Recode income with 3 levels

Code
allhh <- allhh %>%
  mutate(inclvl = case_when(income == 1 ~ "lessthan25",
                            income %in% c(2,3) ~ "25to49",
                            TRUE ~ "50andover"
                            )
         )

allhh$inclvl <- factor(allhh$inclvl, levels=c("25to49",  "lessthan25",  "50andover"))

tabyl(allhh$inclvl)
 allhh$inclvl      n   percent
       25to49 126312 0.1970134
   lessthan25  70996 0.1107350
    50andover 443826 0.6922515

Clean age

Creating a new variable ‘age’ by subtracting birth year from the year the data were collected. Data were collected in 2021 and 2022, but I am using 2021.

Code
allhh$age <- 2021-allhh$tbirth_year

tabyl(allhh$age)
 allhh$age     n      percent
        17   152 0.0002370799
        18   806 0.0012571475
        19  1099 0.0017141502
        20  1485 0.0023162085
        21  1976 0.0030820390
        22  2599 0.0040537548
        23  3052 0.0047603153
        24  4024 0.0062763790
        25  4880 0.0076115133
        26  5340 0.0083289921
        27  6003 0.0093630973
        28  6825 0.0106452005
        29  7363 0.0114843387
        30  8082 0.0126057891
        31  8672 0.0135260336
        32  8968 0.0139877155
        33  9390 0.0146459243
        34  9649 0.0150498960
        35 10195 0.0159015120
        36 10928 0.0170447987
        37 11205 0.0174768457
        38 11601 0.0180945013
        39 12249 0.0191052105
        40 12368 0.0192908191
        41 12697 0.0198039723
        42 12061 0.0188119800
        43 11733 0.0183003865
        44 11817 0.0184314044
        45 11400 0.0177809943
        46 11226 0.0175096002
        47 11213 0.0174893236
        48 10925 0.0170401195
        49 11286 0.0176031844
        50 11988 0.0186981193
        51 12886 0.0200987625
        52 12238 0.0190880534
        53 11532 0.0179868795
        54 11332 0.0176749322
        55 11871 0.0185156301
        56 12094 0.0188634513
        57 12964 0.0202204219
        58 13232 0.0206384313
        59 13488 0.0210377238
        60 13769 0.0214760097
        61 14241 0.0222122052
        62 14064 0.0219361319
        63 14259 0.0222402805
        64 14732 0.0229780358
        65 14965 0.0233414544
        66 15296 0.0238577271
        67 15165 0.0236534016
        68 14367 0.0224087320
        69 14071 0.0219470501
        70 12942 0.0201861077
        71 12052 0.0187979424
        72 11427 0.0178231072
        73 10667 0.0166377076
        74 10800 0.0168451525
        75  8846 0.0137974277
        76  6737 0.0105079437
        77  5892 0.0091899665
        78  5705 0.0088982958
        79  4884 0.0076177523
        80  3826 0.0059675512
        81  3108 0.0048476606
        82  2584 0.0040303587
        83  2161 0.0033705902
        84  1630 0.0025423702
        85  1378 0.0021493167
        86  1079 0.0016829555
        87  1864 0.0029073485
        88  1729 0.0026967841
Code
hist(allhh$age)

Code
nrow(allhh)
[1] 641134

Clean genid_describe

genid_describe is defined as current gender identity.

  1. Male
  2. Female
  3. Transgender
  4. None of these -99) Question seen but category not selected -88) Missing / Did not report

Checking the distribution of gender identity.

Code
tabyl(allhh$genid_describe)
 allhh$genid_describe      n     percent
                  -99   3868 0.006033060
                    1 258539 0.403252674
                    2 370652 0.578119395
                    3   1979 0.003086718
                    4   6096 0.009508153

Creating dummy variables for gender.

Code
allhh$male <- car::recode(allhh$genid_describe, "1=1; else=0") #gen1 = male
tabyl(allhh$male)
 allhh$male      n   percent
          0 382595 0.5967473
          1 258539 0.4032527
Code
allhh$female <- car::recode(allhh$genid_describe, "2=1; else=0") #gen2 = female
tabyl(allhh$female)
 allhh$female      n   percent
            0 270482 0.4218806
            1 370652 0.5781194
Code
allhh$transgender <- car::recode(allhh$genid_describe, "3=1; else=0") #gen3 = transgender
tabyl(allhh$transgender)
 allhh$transgender      n     percent
                 0 639155 0.996913282
                 1   1979 0.003086718
Code
allhh$gen_none <- car::recode(allhh$genid_describe, "4=1; else=0") #gen4 = None of these
tabyl(allhh$gen_none)
 allhh$gen_none      n     percent
              0 635038 0.990491847
              1   6096 0.009508153
Code
allhh$gen_notsel <- car::recode(allhh$genid_describe, "-99=1; else=0") #gen5 = not selected
tabyl(allhh$gen_notsel)
 allhh$gen_notsel      n    percent
                0 637266 0.99396694
                1   3868 0.00603306
Code
nrow(allhh)
[1] 641134

Clean race_eth

race_eth: Categorical variable derived from ‘rhispanic’ and ‘rrace’.

rhispanic categories

  1. No, not of Hispanic, Latino, or Spanish origin
  2. Yes, of Hispanic, Latino, or Spanish origin

rrace categories

  1. White, Alone
  2. Black, Alone
  3. Asian, Alone
  4. Any other race alone, or race in combination

Combining race/ethnicity into a race_eth variable.

Code
tabyl(allhh$rrace)
 allhh$rrace      n    percent
           1 534307 0.83337805
           2  46232 0.07210973
           3  31961 0.04985073
           4  28634 0.04466149
Code
tabyl(allhh$rhispanic)
 allhh$rhispanic      n    percent
               1 586791 0.91523925
               2  54343 0.08476075
Code
allhh <- allhh %>%
  mutate(race_eth = case_when(.$rhispanic == 1 & .$rrace == 1 ~"nh_white",
                              .$rhispanic == 1 & .$rrace == 2 ~"nh_black",
                              .$rhispanic == 1 & .$rrace == 3 ~"nh_asian",
                              .$rhispanic == 1 & .$rrace == 4 ~"other",
                              .$rhispanic == 2 & .$rrace %in% c(1:4) ~ "hispanic"
                              )
        )

tabyl(allhh$race_eth)
 allhh$race_eth      n    percent
       hispanic  54343 0.08476075
       nh_asian  30413 0.04743626
       nh_black  43677 0.06812460
       nh_white 490036 0.76432696
          other  22665 0.03535142

Creating dummy variables for race_eth.

Code
allhh$hispanic <- car::recode(allhh$race_eth, "'hispanic'=1; else=0")
tabyl(allhh$hispanic)
 allhh$hispanic      n    percent
              0 586791 0.91523925
              1  54343 0.08476075
Code
allhh$nh_white <- car::recode(allhh$race_eth, "'nh_white'=1; else=0")
tabyl(allhh$nh_white)
 allhh$nh_white      n  percent
              0 151098 0.235673
              1 490036 0.764327
Code
allhh$nh_black <- car::recode(allhh$race_eth, "'nh_black'=1; else=0")
tabyl(allhh$nh_black)
 allhh$nh_black      n   percent
              0 597457 0.9318754
              1  43677 0.0681246
Code
allhh$nh_asian <- car::recode(allhh$race_eth, "'nh_asian'=1; else=0")
tabyl(allhh$nh_asian)
 allhh$nh_asian      n    percent
              0 610721 0.95256374
              1  30413 0.04743626
Code
allhh$other <- car::recode(allhh$race_eth, "'other'=1; else=0")
tabyl(allhh$other)
 allhh$other      n    percent
           0 618469 0.96464858
           1  22665 0.03535142
Code
nrow(allhh)
[1] 641134

Relevel race_eth

Code
allhh$race_eth <- factor(allhh$race_eth, levels = c("nh_white", "nh_black", "hispanic", "nh_asian", "other"))

tabyl(allhh$race_eth)
 allhh$race_eth      n    percent
       nh_white 490036 0.76432696
       nh_black  43677 0.06812460
       hispanic  54343 0.08476075
       nh_asian  30413 0.04743626
          other  22665 0.03535142

Clean single_adult

thhld_numadlt: Recode for the number of Adults in the household 1-40) number of people (whole number)

Creating dummy variable for single vs. multiple adults in household. Single adult is coded as “1”, multiple adults is coded as “0”.

Code
tabyl(allhh$thhld_numadlt)
 allhh$thhld_numadlt      n      percent
                   1 154302 0.2406704371
                   2 357826 0.5581142164
                   3  83572 0.1303502856
                   4  32097 0.0500628574
                   5   9346 0.0145772959
                   6   2332 0.0036373051
                   7    687 0.0010715389
                   8    286 0.0004460846
                   9    389 0.0006067374
                  10    297 0.0004632417
Code
allhh$single_adult <- recode(allhh$thhld_numadlt, "1=1; else=0")
Warning: Unreplaced values treated as NA as `.x` is not compatible.
Please specify replacements exhaustively or supply `.default`.
Code
tabyl(allhh$single_adult)
 allhh$single_adult      n   percent valid_percent
        1=1; else=0 154302 0.2406704             1
               <NA> 486832 0.7593296            NA
Code
nrow(allhh)
[1] 641134

Clean thhld_numper

thhld_numper: Total number of people in household

Distribution of number of people in household. I’m leaving this as an integer.

Code
tabyl(allhh$thhld_numper)
 allhh$thhld_numper      n     percent
                  1 125064 0.195066866
                  2 257737 0.402001766
                  3 105072 0.163884617
                  4  88549 0.138113093
                  5  38548 0.060124717
                  6  15586 0.024310051
                  7   5380 0.008391382
                  8   2316 0.003612349
                  9    922 0.001438077
                 10   1960 0.003057083
Code
nrow(allhh)
[1] 641134

Clean child

thhld_numkid: Total number of people under 18-years-old in household.

Check distribution of ‘thhld_numkid’.

Code
tabyl(allhh$thhld_numkid)
 allhh$thhld_numkid      n     percent
                  0 444838 0.693829995
                  1  89460 0.139534013
                  2  69547 0.108474983
                  3  24980 0.038962214
                  4   8095 0.012626066
                  5   4214 0.006572729

Create dummy variables for number of children.

Code
allhh$child0 <- car::recode(allhh$thhld_numkid, "0=1; else=0")
tabyl(allhh$child0)
 allhh$child0      n percent
            0 196296 0.30617
            1 444838 0.69383
Code
allhh$child1 <- car::recode(allhh$thhld_numkid, "1=1; else=0")
tabyl(allhh$child1)
 allhh$child1      n  percent
            0 551674 0.860466
            1  89460 0.139534
Code
allhh$child2 <- car::recode(allhh$thhld_numkid, "2=1; else=0")
tabyl(allhh$child2)
 allhh$child2      n  percent
            0 571587 0.891525
            1  69547 0.108475
Code
allhh$child3 <- car::recode(allhh$thhld_numkid, "3=1; else=0")
tabyl(allhh$child3)
 allhh$child3      n    percent
            0 616154 0.96103779
            1  24980 0.03896221
Code
allhh$child4 <- car::recode(allhh$thhld_numkid, "4=1; else=0")
tabyl(allhh$child4)
 allhh$child4      n    percent
            0 633039 0.98737393
            1   8095 0.01262607
Code
allhh$child5 <- car::recode(allhh$thhld_numkid, "5=1; else=0")
tabyl(allhh$child5)
 allhh$child5      n     percent
            0 636920 0.993427271
            1   4214 0.006572729
Code
nrow(allhh)
[1] 641134

Dummy variable for children vs. no children in household.

Code
allhh$children <- car::recode(allhh$thhld_numkid, "0=0; else=1")

tabyl(allhh$children)
 allhh$children      n percent
              0 444838 0.69383
              1 196296 0.30617

Clean eeduc

eeduc 1) Less than high school 2) Some high school 3) High school graduate or equivalent (for example GED) 4) Some college, but degree not received or is in progress 5) Associate’s degree (for example AA, AS) 6) Bachelor’s degree (for example BA, BS, AB) 7) Graduate degree (for example master’s, professional, doctorate)

For educational attainment, create ‘education’ variable with values “less than high school”, “high school”, “some college”, “bachelor’s or higher”.

Code
tabyl(allhh$eeduc)
 allhh$eeduc      n     percent
           1   3311 0.005164287
           2   7164 0.011173951
           3  67551 0.105361750
           4 130047 0.202839032
           5  66213 0.103274822
           6 190114 0.296527715
           7 176734 0.275658443
Code
nrow(allhh)
[1] 641134

Make a new categorical variable for education.

Code
allhh <- allhh %>% 
  mutate(education = case_when(eeduc %in% c(1,2) ~ '1lessthanhs',
                               eeduc == 3 ~ '2highschool',
                               eeduc %in% c(4,5) ~ '3somecollege',
                               eeduc %in% c(6,7) ~ '4BAorhigher'
                               )
         )

tabyl(allhh$education)
 allhh$education      n    percent
     1lessthanhs  10475 0.01633824
     2highschool  67551 0.10536175
    3somecollege 196260 0.30611385
     4BAorhigher 366848 0.57218616
Code
nrow(allhh)
[1] 641134

Creating dummy variables for educational attainment.

Code
allhh$hsorless <- car::recode(allhh$eeduc, "1=1; 2=1; 3=1; else=0")
tabyl(allhh$hsorless)
 allhh$hsorless      n percent
              0 563108  0.8783
              1  78026  0.1217
Code
allhh$somecollege <- car::recode(allhh$eeduc, "4=1; 5=1; else=0")
tabyl(allhh$somecollege)
 allhh$somecollege      n   percent
                 0 444874 0.6938861
                 1 196260 0.3061139
Code
allhh$baorhigher <- car::recode(allhh$eeduc, "6=1; 7=1; else=0")
tabyl(allhh$baorhigher)
 allhh$baorhigher      n   percent
                0 274286 0.4278138
                1 366848 0.5721862
Code
nrow(allhh)
[1] 641134

Recode education with 3 levels

Code
tabyl(allhh$eeduc)
 allhh$eeduc      n     percent
           1   3311 0.005164287
           2   7164 0.011173951
           3  67551 0.105361750
           4 130047 0.202839032
           5  66213 0.103274822
           6 190114 0.296527715
           7 176734 0.275658443
Code
allhh <- allhh %>%
  mutate(educ = case_when(eeduc %in% c(1:3) ~ "HSorless",
                            eeduc == 4 ~ "somecollege",
                            TRUE ~ "BAorhigher"
                            )
         )

allhh$educ <- factor(allhh$educ, levels=c("somecollege",  "HSorless",  "BAorhigher"))

tabyl(allhh$educ)
  allhh$educ      n  percent
 somecollege 130047 0.202839
    HSorless  78026 0.121700
  BAorhigher 433061 0.675461

Clean wrklossrv

wrklossrv: Recent household job loss. 1) Yes 2) No -99) Question seen but category not selected -88) Missing / Did not report

Code
tabyl(allhh$wrklossrv)
 allhh$wrklossrv      n     percent
             -99    804 0.001254028
               1  69013 0.107642084
               2 571317 0.891103888

Remove 99s

Code
allhh <- allhh %>% 
  filter(wrklossrv %in% c(1:2))

tabyl(allhh$wrklossrv)
 allhh$wrklossrv      n   percent
               1  69013 0.1077772
               2 571317 0.8922228
Code
nrow(allhh)
[1] 640330

Create dummy variable ‘workloss’ so 1 is Yes and 0 is No.

Code
allhh <- allhh %>% 
  mutate(workloss = (case_when(wrklossrv == 1 ~ 1,
                               TRUE ~ 0)
                     )
         )

tabyl(allhh$workloss)
 allhh$workloss      n   percent
              0 571317 0.8922228
              1  69013 0.1077772
Code
nrow(allhh)
[1] 640330

Clean marital status

ms: Marital status 1) Now married 2) Widowed 3) Divorced 4) Separated 5) Never married -99) Question seen but category not selected -88) Missing / Did not report

Code
tabyl(allhh$ms)
 allhh$ms      n     percent
      -99   2455 0.003833961
        1 367908 0.574559993
        2  36916 0.057651523
        3 101558 0.158602596
        4  10846 0.016938141
        5 120647 0.188413787

Remove 99s

Code
allhh <- allhh %>% 
  filter(ms %in% c(1:5))

tabyl(allhh$ms)
 allhh$ms      n    percent
        1 367908 0.57677131
        2  36916 0.05787341
        3 101558 0.15921301
        4  10846 0.01700333
        5 120647 0.18913894
Code
nrow(allhh)
[1] 637875

How should marital status be coded? Dummy variable of married/unmarried? or Dichotomized classes?

Recode married

Create a dummy marital status variable ‘married’.

Code
allhh <- allhh %>% 
  mutate(married = (case_when(ms == 1 ~ 1,
                              TRUE ~ 0)
                     )
         )

tabyl(allhh$married)
 allhh$married      n   percent
             0 269967 0.4232287
             1 367908 0.5767713
Code
nrow(allhh)
[1] 637875

Create a dummy marital status variable ‘unmarried’.

Code
allhh <- allhh %>% 
  mutate(unmarried = (case_when(ms != 1 ~ 1,
                              TRUE ~ 0)
                     )
         )

tabyl(allhh$unmarried)
 allhh$unmarried      n   percent
               0 367908 0.5767713
               1 269967 0.4232287
Code
nrow(allhh)
[1] 637875

Recode rentcur

rentcur: Caught up on rent? 1) Yes 2) No -99) Question seen but category not selected -88) Missing / Did not report

Code
tabyl(allhh$rentcur)
 allhh$rentcur      n    percent
           -88 487485 0.76423280
             1 133730 0.20964923
             2  16660 0.02611797

Creating new dummy variable ‘laterent’, 1 is yes, 0 is no.

Code
allhh <- allhh %>% 
  mutate(laterent = (case_when(rentcur == 2 ~ 1,
                               rentcur == 1 ~ 0,
                               TRUE ~ NA)
                     )
         )

tabyl(allhh$laterent)
 allhh$laterent      n    percent valid_percent
              0 133730 0.20964923     0.8892214
              1  16660 0.02611797     0.1107786
             NA 487485 0.76423280            NA
Code
nrow(allhh)
[1] 637875

Filter and write csv files for allhh, ownerhh, renterhh, currenthh, latehh

Write out csv for allhh

Code
# Export to csv files

fwrite(allhh, "./data/allhh.csv")

Filter for only owners

Code
ownerhh <- allhh %>% 
  filter(owner == 1)

nrow(ownerhh)
[1] 479624

Write out csv for ownerhh

Code
# Export to csv files

fwrite(ownerhh, "./data/ownerhh.csv")

Filter for only renters

Code
renterhh <- allhh %>% 
  filter(renting == 1)

nrow(renterhh)
[1] 150390

Write out csv for renterhh

Code
# Export to csv files

fwrite(renterhh, "./data/renterhh.csv")

Filter for only current renters

Code
tabyl(renterhh$laterent)
 renterhh$laterent      n   percent
                 0 133730 0.8892214
                 1  16660 0.1107786
Code
currenthh <- renterhh %>% 
  filter(laterent == 0)

nrow(currenthh)
[1] 133730

Write out csv for currenthh

Code
# Export to csv files

fwrite(currenthh, "./data/currenthh.csv")

Filter for only late renters

Code
latehh <- renterhh %>% 
  filter(laterent == 1)

nrow(latehh)
[1] 16660

Write out csv for latehh.

Code
# Export to csv files

fwrite(latehh, "./data/latehh.csv")

Descriptive tables, no weights

PAA: change these variables to match models

Questions: Weights or no weights? Household or person? HPS or matching?

Descriptive statistics for all households

Code
# Summary statistics of all households

table_allhh <- allhh %>%
  tbl_summary(include = c(age, female, nh_black, hispanic, unmarried, hsorless, less75k, children),
              statistic = list(all_categorical() ~ "{p}%"),
              label = list(
                age ~ "Age",
                female ~ "Female",
                nh_black ~ "Non-Hispanic Black",
                hispanic ~ "Hispanic",
                unmarried ~ "Unmarried",
                hsorless ~ "Highschool or less",
                less75k ~ "Income < $75,000",
                children ~ "Households with Children"
                )
              ) %>% 
  bold_labels() %>% 
  modify_caption("**Descriptive Statistics**") # %>% 
  # as_gt() # %>% # convert to gt table
  # gtsave(filename = "all_households.png") # save table as image

table_allhh
Descriptive Statistics
Characteristic N = 637,8751
Age 54 (41, 66)
Female 58%
Non-Hispanic Black 6.8%
Hispanic 8.5%
Unmarried 42%
Highschool or less 12%
Income < $75,000 48%
Households with Children 31%
1 Median (Q1, Q3); %

Descriptive statistics for owner households

Code
# Summary statistics of owner households

table_ownerhh <- ownerhh %>%
  tbl_summary(include = c(age, female, nh_black, hispanic, unmarried, hsorless, less75k, children),
              statistic = list(all_categorical() ~ "{p}%"),
              label = list(
                age ~ "Age",
                female ~ "Female",
                nh_black ~ "Non-Hispanic Black",
                hispanic ~ "Hispanic",
                unmarried ~ "Unmarried",
                hsorless ~ "Highschool or less",
                less75k ~ "Income < $75,000",
                children ~ "Households with Children"
                )
              ) %>% 
  bold_labels() %>% 
  modify_caption("**Descriptive Statistics**") # %>% 
  # as_gt() # %>% # convert to gt table
  # gtsave(filename = "owner_households.png") # save table as image

table_ownerhh
Descriptive Statistics
Characteristic N = 479,6241
Age 57 (44, 67)
Female 57%
Non-Hispanic Black 5.1%
Hispanic 7.1%
Unmarried 34%
Highschool or less 10%
Income < $75,000 39%
Households with Children 31%
1 Median (Q1, Q3); %

Descriptive statistics for all renters

Code
# Summary statistics of all renters

table_renters <- renterhh %>%
  tbl_summary(include = c(age, female, nh_black, hispanic, unmarried, hsorless, less75k, children),
              statistic = list(all_categorical() ~ "{p}%"),
              label = list(
                age ~ "Age",
                female ~ "Female",
                nh_black ~ "Non-Hispanic Black",
                hispanic ~ "Hispanic",
                unmarried ~ "Unmarried",
                hsorless ~ "Highschool or less",
                less75k ~ "Income < $75,000",
                children ~ "Households with Children"
                )
              )%>% 
  bold_labels() #%>% 
  #modify_caption("**Renters**") # %>% 
  # as_gt() # %>% # convert to gt table
  # gt::gtsave(filename = "renters.png") # save table as image

table_renters
Characteristic N = 150,3901
Age 44 (33, 58)
Female 61%
Non-Hispanic Black 12%
Hispanic 13%
Unmarried 69%
Highschool or less 18%
Income < $75,000 73%
Households with Children 29%
1 Median (Q1, Q3); %

Descriptive statistics for current renters

Code
# Summary statistics of all current renters

table_currenthh <- currenthh %>%
  tbl_summary(include = c(age, female, nh_black, hispanic, unmarried, hsorless, less75k, children),
              statistic = list(all_categorical() ~ "{p}%"),
              label = list(
                age ~ "Age",
                female ~ "Female",
                nh_black ~ "Non-Hispanic Black",
                hispanic ~ "Hispanic",
                unmarried ~ "Unmarried",
                hsorless ~ "Highschool or less",
                less75k ~ "Income < $75,000",
                children ~ "Households with Children"
                )
              )%>% 
  bold_labels() # %>% 
  # modify_caption("**Current Renters**") # %>% 
  # as_gt() #%>% # convert to gt table
  #gt::gtsave(filename = "currentrenters.png") # save table as image
  
table_currenthh
Characteristic N = 133,7301
Age 44 (32, 59)
Female 60%
Non-Hispanic Black 10%
Hispanic 12%
Unmarried 69%
Highschool or less 16%
Income < $75,000 71%
Households with Children 27%
1 Median (Q1, Q3); %

Descriptive statistics for late renters

Code
# Summary statistics of all late renters

table_latehh <- latehh %>%
  tbl_summary(include = c(age, female, nh_black, hispanic, unmarried, hsorless, less75k, children),
              statistic = list(all_categorical() ~ "{p}%"),
              label = list(
                age ~ "Age",
                female ~ "Female",
                nh_black ~ "Non-Hispanic Black",
                hispanic ~ "Hispanic",
                unmarried ~ "Unmarried",
                hsorless ~ "Highschool or less",
                less75k ~ "Income < $75,000",
                children ~ "Households with Children"
                )
              )%>% 
  bold_labels() # %>% 
  # modify_caption("**Late Renters**") # %>% 
  # as_gt() #%>% # convert to gt table
  #gt::gtsave(filename = "laterenters.png") # save table as image
  
table_latehh
Characteristic N = 16,6601
Age 44 (35, 54)
Female 66%
Non-Hispanic Black 23%
Hispanic 18%
Unmarried 70%
Highschool or less 28%
Income < $75,000 90%
Households with Children 47%
1 Median (Q1, Q3); %
Code
class(table_ownerhh)
[1] "tbl_summary" "gtsummary"  
Code
table_compare <- tbl_merge(tbls = list(table_ownerhh, table_currenthh, table_latehh),
                           tab_spanner = c("**Owner HH**", "**Current HH**", "**Late HH**")
                           ) %>% 
  as_gt()

table_compare
Descriptive Statistics
Characteristic
Owner HH
Current HH
Late HH
N = 479,6241 N = 133,7301 N = 16,6601
Age 57 (44, 67) 44 (32, 59) 44 (35, 54)
Female 57% 60% 66%
Non-Hispanic Black 5.1% 10% 23%
Hispanic 7.1% 12% 18%
Unmarried 34% 69% 70%
Highschool or less 10% 16% 28%
Income < $75,000 39% 71% 90%
Households with Children 31% 27% 47%
1 Median (Q1, Q3); %
Code
gtsave(table_compare, filename = "./figures/table_compare.png")
file:////var/folders/n0/x0mfrjpd1p75y8wj5zn84ztc0000gn/T//RtmpPUVwLF/file30dea90c554.html screenshot completed

Test for difference between Current and Late Renters

Code
renterhh <- renterhh %>% 
  mutate(late_label = case_when(laterent == 1 ~ "Late",
                             TRUE ~ "Current"))

tabyl(renterhh$late_label)
 renterhh$late_label      n   percent
             Current 133730 0.8892214
                Late  16660 0.1107786
Code
#PAA: Change nh_white to nh_black.  Change Bachelors to HS or less.  Tinker with other things for PAA.

# Summary statistics of all renters by rentcur

renterhh %>%
  tbl_summary(include = c(age, female, nh_black, hispanic, unmarried, hsorless, less75k, children),
              by = late_label,
              #by = laterent,
              label = list(
                age ~ "Age",
                female ~ "Female",
                nh_black ~ "Non-Hispanic Black",
                hispanic ~ "Hispanic",
                unmarried ~ "Unmarried",
                hsorless ~ "Highschool or less",
                less75k ~ "Income < $75,000",
                children ~ "Children in Household"
                )
              )%>%
  
  bold_labels() %>%
  modify_caption("**Current vs. Late Renters**") %>% 
  add_p() %>% # test for a difference between groups
  as_gt() # %>% # convert to gt table
Current vs. Late Renters
Characteristic Current
N = 133,730
1
Late
N = 16,660
1
p-value2
Age 44 (32, 59) 44 (35, 54) 0.012
Female 80,590 (60%) 11,008 (66%) <0.001
Non-Hispanic Black 14,005 (10%) 3,854 (23%) <0.001
Hispanic 16,285 (12%) 2,960 (18%) <0.001
Unmarried 92,522 (69%) 11,686 (70%) 0.011
Highschool or less 21,811 (16%) 4,659 (28%) <0.001
Income < $75,000 94,362 (71%) 15,051 (90%) <0.001
Children in Household 36,136 (27%) 7,865 (47%) <0.001
1 Median (Q1, Q3); n (%)
2 Wilcoxon rank sum test; Pearson’s Chi-squared test
Code
  #gt::gtsave(filename = "./figures/renters_by_laterent.png") # save table as image

Descriptive tables with weights

Descriptive statistics for owner households

Code
#allhh$pweight12 <- (allhh$pweight/12) # Dividing weights by number of weeks of data

svy_ownerhhw <- survey::svydesign(
  id = ~0,
  data = allhh,
  weights = ~hweight,
)

table_ownerhhw <- svy_ownerhhw %>%
  tbl_svysummary(include = c(score, age, female, nh_white, hispanic, nh_black, nh_asian, unmarried, hsorless, somecollege, baorhigher, children),
              statistic = list(all_categorical() ~ "{p}%"),
              label = list(
                score ~ "State Score",
                age ~ "Age",
                female ~ "Female",
                nh_white ~ "Non-Hispanic White",
                hispanic ~ "Hispanic",
                nh_black ~ "Non-Hispanic Black",
                nh_asian ~ "Non-Hispanic Asian",
                unmarried ~ "Unmarried",
                hsorless ~ "Highschool or Less",
                somecollege ~ "Some College",
                baorhigher ~ "Bachelor's or Higher",
                children ~ "Households with Children"
                )
              ) %>% 
  bold_labels() %>% 
  modify_caption("**Descriptive Statistics**") # %>% 
  # as_gt() # %>% # convert to gt table
  # gtsave(filename = "all_households.png") # save table as image

table_ownerhhw
Descriptive Statistics
Characteristic N = 1,090,516,5621
State Score 1.58 (0.83, 3.13)
Age 50 (36, 65)
Female 51%
Non-Hispanic White 68%
Hispanic 13%
Non-Hispanic Black 11%
Non-Hispanic Asian 4.6%
Unmarried 48%
Highschool or Less 35%
Some College 30%
Bachelor's or Higher 35%
Households with Children 33%
1 Median (Q1, Q3); %

Descriptive statistics for current renters

Code
#currenthh$pweight12 <- (renterhh$pweight/12)

svy_currenthh <- survey::svydesign(
  id = ~0,
  weights = ~hweight,
  data = currenthh
)

table_currenthhw <- svy_currenthh %>%
  tbl_svysummary(include = c(score, age, female, nh_white, hispanic, nh_black, nh_asian, unmarried, hsorless, somecollege, baorhigher, children),
              statistic = list(all_categorical() ~ "{p}%"),
              label = list(
                score ~ "State Score",
                age ~ "Age",
                female ~ "Female",
                nh_white ~ "Non-Hispanic White",
                hispanic ~ "Hispanic",
                nh_black ~ "Non-Hispanic Black",
                nh_asian ~ "Non-Hispanic Asian",
                unmarried ~ "Unmarried",
                hsorless ~ "Highschool or Less",
                somecollege ~ "Some College",
                baorhigher ~ "Bachelor's or Higher",
                children ~ "Households with Children"
                )
              ) %>% 
  bold_labels() #%>% 
  #modify_caption("**Renters**") # %>% 
  # as_gt() # %>% # convert to gt table
  # gt::gtsave(filename = "renters.png") # save table as image

table_currenthhw
Characteristic N = 286,666,6531
State Score 1.43 (1.15, 3.13)
Age 39 (30, 56)
Female 53%
Non-Hispanic White 58%
Hispanic 18%
Non-Hispanic Black 15%
Non-Hispanic Asian 4.3%
Unmarried 70%
Highschool or Less 39%
Some College 31%
Bachelor's or Higher 30%
Households with Children 30%
1 Median (Q1, Q3); %

Descriptive statistics for late renters

Code
#latehh$hweight12 <- (latehh$hweight/12)

svy_latehhw <- survey::svydesign(
  id = ~0,
  weights = ~hweight,
  data = latehh
)

table_latehhw <- svy_latehhw %>%
  tbl_svysummary(include = c(score, age, female, nh_white, hispanic, nh_black, nh_asian, unmarried, hsorless, somecollege, baorhigher, children),
              statistic = list(all_categorical() ~ "{p}%"),
              label = list(
                score ~ "State Score",
                age ~ "Age",
                female ~ "Female",
                nh_white ~ "Non-Hispanic White",
                hispanic ~ "Hispanic",
                nh_black ~ "Non-Hispanic Black",
                nh_asian ~ "Non-Hispanic Asian",
                unmarried ~ "Unmarried",
                hsorless ~ "Highschool or Less",
                somecollege ~ "Some College",
                baorhigher ~ "Bachelor's or Higher",
                children ~ "Households with Children"
                )
              ) %>% 
  bold_labels() # %>% 
  # modify_caption("**Late Renters**") # %>% 
  # as_gt() #%>% # convert to gt table
  #gt::gtsave(filename = "laterenters.png") # save table as image
  
table_latehhw
Characteristic N = 47,259,2731
State Score 1.58 (1.28, 3.13)
Age 41 (32, 52)
Female 57%
Non-Hispanic White 36%
Hispanic 25%
Non-Hispanic Black 29%
Non-Hispanic Asian 4.8%
Unmarried 72%
Highschool or Less 55%
Some College 32%
Bachelor's or Higher 13%
Households with Children 50%
1 Median (Q1, Q3); %

Merging descriptive statistics

Code
# class(table_allhh)

table_comparew <- tbl_merge(tbls = list(table_ownerhhw, table_currenthhw, table_latehhw),
                           tab_spanner = c("**Owner HH**", "**Current HH**", "**Late HH**")
                           ) %>% 
  as_gt()

table_comparew
Descriptive Statistics
Characteristic
Owner HH
Current HH
Late HH
N = 1,090,516,5621 N = 286,666,6531 N = 47,259,2731
State Score 1.58 (0.83, 3.13) 1.43 (1.15, 3.13) 1.58 (1.28, 3.13)
Age 50 (36, 65) 39 (30, 56) 41 (32, 52)
Female 51% 53% 57%
Non-Hispanic White 68% 58% 36%
Hispanic 13% 18% 25%
Non-Hispanic Black 11% 15% 29%
Non-Hispanic Asian 4.6% 4.3% 4.8%
Unmarried 48% 70% 72%
Highschool or Less 35% 39% 55%
Some College 30% 31% 32%
Bachelor's or Higher 35% 30% 13%
Households with Children 33% 30% 50%
1 Median (Q1, Q3); %
Code
gtsave(table_comparew, filename = "table_comparew.png")
file:////var/folders/n0/x0mfrjpd1p75y8wj5zn84ztc0000gn/T//RtmpPUVwLF/file30de347c40f7.html screenshot completed

Test for difference between Current and Late renters with weights

Code
#PAA: Change nh_white to nh_black.  Change Bachelors to HS or less.  Tinker with other things for PAA.

# Summary statistics of all renters by rentcur

svy_renterhhw <- survey::svydesign(
  id = ~0,
  weights = ~hweight,
  data = renterhh
)

tbl_renterhhw <- svy_renterhhw %>%
  tbl_svysummary(include = c(age, female, nh_black, hispanic, unmarried, hsorless, less75k, children),
              by = late_label,
              #by = laterent,
              label = list(
                age ~ "Age",
                female ~ "Female",
                nh_black ~ "Non-Hispanic Black",
                hispanic ~ "Hispanic",
                unmarried ~ "Unmarried",
                hsorless ~ "Highschool or less",
                less75k ~ "Income < $75,000",
                children ~ "Children in Household"
                )
              )%>%
  
  bold_labels() %>%
  modify_caption("**Current vs. Late Renters**") %>% 
  add_p() %>% # test for a difference between groups
  as_gt() # %>% # convert to gt table
  #gt::gtsave(filename = "./figures/renters_by_laterent.png") # save table as image

tbl_renterhhw
Current vs. Late Renters
Characteristic Current
N = 286,666,653
1
Late
N = 47,259,273
1
p-value2
Age 39 (30, 56) 41 (32, 52) 0.090
Female 150,638,894 (53%) 27,032,320 (57%) <0.001
Non-Hispanic Black 43,767,431 (15%) 13,873,755 (29%) <0.001
Hispanic 52,990,154 (18%) 11,641,697 (25%) <0.001
Unmarried 200,445,010 (70%) 33,824,257 (72%) 0.017
Highschool or less 111,272,101 (39%) 25,821,549 (55%) <0.001
Income < $75,000 221,290,632 (77%) 44,135,243 (93%) <0.001
Children in Household 86,618,470 (30%) 23,766,491 (50%) <0.001
1 Median (Q1, Q3); n (%)
2 Design-based KruskalWallis test; Pearson’s X^2: Rao & Scott adjustment

U.S. map of state scores

Code
# Get centroids
centroid_labels <- usmapdata::centroid_labels("states")

# Create the dataframe with the state scores

statedat <- read.csv("./data/state_policy.csv")

statescore <- statedat %>% 
  dplyr::select(abbv, score) %>% 
  rename(abbr = abbv)

# Join data to centroids
data_labels <- merge(centroid_labels, statescore, by = "abbr")

map1dat <- data_labels %>%
  dplyr::select(fips, score)
Code
scoremap <- plot_usmap(data = map1dat, values = "score") +
  labs(title="Housing Policy Score By State",
       caption = "Source: COVID 19 Housing Policy Score") +
  theme(panel.background = element_rect(colour = "black")) +
  scale_fill_continuous(name = "State Housing Policy Score",
                        low = "white",
                        high ="darkgreen"
                        ) + 
  theme(legend.position = "right") +
  geom_sf_text(data = data_labels, ggplot2::aes(label = scales::number(score, scale = 1, accuracy = .1)), color = "black") # I'm having problems with this geom_text function.  I am not sure geom_text is a good function? Fix for PAA or use old map.

scoremap

Code
ggsave("./figures/scoremap.png") # saves last plot made
Saving 7 x 5 in image