ACS Data Pull and Cleaning

Select “code” button to the right to see the code for this section.

#bring in county hospital data and dichotomize ICU bed per capita at media
aha2017_2019 <- readRDS(paste0(data_dir,"/prepared_data/aha2017_2019.county.Rds"))

#Bring in config file with new variable names and labels
# web address: https://api.census.gov/data/2019/acs/acs5/profile/variables.html
#This will be used to create a labels list now and rename variables later.
names <- read.csv(paste0(data_dir,"/acs/acs_config_2019.csv"), header = T)

# turn labels into list object.
# pre-allocate a list and fill it with human-readable var names a loop
acs.labels.list <- vector("list", nrow(names))

for (i in 1:nrow(names)) {
  acs.labels.list[[i]] <- names[i,2]
}
#name elements using var name
acs.labels.list <- setNames(acs.labels.list, names[,1])

# all population characteristics
acs0 <- get_acs(geography = "county",
                variables = c(
                  "DP05_0001E", #Total pop
                  "DP05_0021E", # total pop 18+
                  "DP05_0021PE", # %pop 18+
                  "DP05_0024PE", # %pop 65%
                  "DP05_0019PE", # %pop < 18
                  "DP05_0003P", # %female
                  "DP05_0037PE", # %White
                  "DP05_0038PE", # %Black
                  "DP05_0039PE", # %AI/AN
                  "DP05_0044PE", # %Asian
                  "DP05_0052PE", # %NH/PI
                  "DP05_0057PE", # %Other
                  "DP05_0058PE", # 2+ races
                  "DP05_0071PE", # %Hispanic
                  "DP02_0060PE", # %25yr+ and <9th grade
                  "DP02_0067PE", # %25yr+ and at least HS graduate
                  "DP03_0027PE", # %16yr+ white collar jobs
                  "DP03_0099PE", # %wo health insurance
                  "DP03_0088E", # $per capita income
                  "DP03_0086E", # $median family income
                  "DP03_0062E", # $mean household income
                  "DP03_0052PE", # %household income <10K
                  "DP03_0053PE", # %household income 10-14K
                  "DP03_0054PE", # %household income 15-24K
                  "DP03_0055PE", # %household income 25-34K
                  "DP03_0056PE", # %household income 35-49K
                  "DP03_0057PE", # %household income 50-74K
                  "DP03_0058PE", # %household income 75-99K
                  "DP03_0059PE", # %household income 100-149K
                  "DP03_0060PE", # %household income 150-199K
                  "DP03_0061PE", # %household income 200K+
                  "DP04_0089E", # $Median home value
                  "DP04_0109E", # $Median rent
                  "DP04_0101E", # $Median mortgage
                  "DP04_0046PE", # %owner occupied
                  "DP03_0005PE", # %civilian labor force 16yr+ unemployed
                  "DP03_0119PE", # % families below poverty
                  "DP03_0128PE", # % individuals below poverty
                  "DP02_0007PE", # % male single parent family
                  "DP02_0011PE", # % female single parent family
                  "DP04_0058PE", # % households no car
                  "DP04_0075PE", # % households no phone service (inc cell)
                  "DP04_0073PE", # % households no plumbing
                  "DP04_0077PE", # % households 1 or less people per room
                  "DP05_0024PE", # % 65yr+
                  "DP05_0019PE", # % < 18yr
                  "DP02_0072PE", # % with a disability
                  "DP02_0114PE", # % 5yr+ speak English < very well
                  "DP04_0009PE", # % houses 2 units
                  "DP04_0010PE", # % houses 3-4 units
                  "DP04_0011PE", # % houses 5-9 units
                  "DP04_0012PE", # % houses 10-19 units
                  "DP04_0013PE", # % houses 20+ units
                  "DP04_0014PE", # % mobile homes
                  "DP02_0152PE" # % houses with computer
                ),
                year = 2019,
                survey = "acs5",
                geometry = FALSE
) 

acs1 <- left_join(acs0, names, by = "variable")
acs2 <- pivot_wider(acs1, id_cols = c(GEOID, NAME), names_from = name, values_from = estimate)

## Population characteristics that need some curation
# % Minority - the complement of %White

acs3 <- acs2 %>% 
  group_by(NAME) %>%
  mutate(
    over50k = sum(perc.house.50_74k,
                  perc.house.75_99k,
                  perc.house.100_149k,
                  perc.house.150_199k,
                  perc.house.ge.200k), #Income disparity, create %under 50K
    perc.singleparent = sum(perc.fam.singlemale,
                            perc.fam.singlefemale),# single parent households - add up male and female single parent homes
    perc.multiunithouse = sum(perc.house.2unit,
                              perc.house.3_4unit,
                              perc.house.5_9unit,
                              perc.house.10_19unit,
                              perc.house.ge.20unit)#Multi unit housing, sum % of all relevant categories
  )%>%
  ungroup()%>%
  mutate(
    perc.minority = (100-perc.white), # % Minority - the complement of %White
    perc.l.highschool = (100-perc.ge.highschool),# less than HS diploma - complement of %25yr+ and at least HS graduate
    perc.house.g.1perroom = (100-perc.house.le.1perroom), # Houses >1 person per room - complement of 1 or less person per room
    income.disparity = log(100*perc.house.l.10k/over50k),# Income disparity=log of 100 x ratio <10K/>50K
    total.thousands = total/1000,#simple division for presentation
    mean.indi.income.thousands = mean.indi.income/1000,
    median.fam.income.thousands = median.fam.income/1000,
    median.house.income.thousands = median.house.income/1000,
    median.house.value.thousands = median.house.value/1000,
    median.mortgage.thousands = median.mortgage/1000)

#Join with hospital data
final2017_2019 <- left_join(acs3, aha2017_2019, by = "GEOID")

#Create per capita variables and dichotomize at median

final2017_2019 <- final2017_2019 %>%
  mutate(adult_icubeds_per1k = 1000*county.adult.icu.beds/total.ge.18,
         adult_intensivists_per1k = 1000*county.adult.icu.beds/total.ge.18)

Describing Counties with Missing Hospital Information

Counties with no hospital data at all

For 2019, the ACS dataset has 3220 counties. However, our AHA dataset has only 2476. This means that there are only 2476 counties with General Medical-Surgical Hospitals in the AHA data (we excluded all other hospital types before making county tallies). When merging these two datasets by county, we get a number of counties that have no hospital information, presumably because they truly do not have General Medical-Surgical Hospitals

Below I display:

a summary of the total population numbers for these counties (in thousands)
a summery of total number of hospital beds for these counties

test1 <- final2017_2019%>%
  filter(is.na(percent.missing)==T)%>%
  select(NAME, total.thousands,county.hospital.beds.total)%>%
  arrange(desc(total.thousands))
summary(test1$total.thousands)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.066   5.720  11.917  17.384  21.567 215.044

summary(test1$county.hospital.beds.total)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##      NA      NA      NA     NaN      NA      NA     751

print(test1)

## # A tibble: 751 x 3
##    NAME                            total.thousands county.hospital.beds.total
##    <chr>                                     <dbl>                      <dbl>
##  1 Berkeley County, South Carolina           215.                          NA
##  2 Columbia County, Georgia                  151.                          NA
##  3 Livingston Parish, Louisiana              139.                          NA
##  4 Randall County, Texas                     134.                          NA
##  5 Bossier Parish, Louisiana                 126.                          NA
##  6 Kendall County, Illinois                  126.                          NA
##  7 Sumter County, Florida                    125.                          NA
##  8 Sutter County, California                  96.1                         NA
##  9 Sherburne County, Minnesota                94.5                         NA
## 10 Roanoke County, Virginia                   93.8                         NA
## # ... with 741 more rows

Interpretation:

There a 751 counties with no AHA data.
Confirmed that they do not have hospitals beds.
The largest county here is Berkeley County, SC at 215,000. At least through Google Maps, there does not seem to be any hospitals in this county.

Counties with some missing hospital data

The AHA is a voluntary annual survey, with a reported approximate annual response of 75%. One way we increase our availability of data is to use combine data from multiple survey years. Specifically, we use the most recent, available 3 years of data (although this could be expanded). For this report we use 2017-2019. First, using the AHA Change Report that identifies which hospitals have closed or merged from one year to the next we remove these hospitals from the multi-year data. Second, we average data over the included years. While this will create some errors when bed numbers increase or decrease it also allows us to include data for hospitals that may have missed a year or two of survey response but are still open. A last value carried forward would be a better way to do this but involves timestamping data and was not done at this time. After we have a dataset set that is a summary average of 3 years of data we then summarize at the county level.

We wanted a way to indicate which counties may have a lot of hospitals with missing data and therefore may be problematic or inaccurate. We do this in 2 ways:

Percent missing - This indicates what percent of hospitals within each county have missing data for all of the included years of the study period (i.e. here that means they did not respond to survey 2017-2019).
Percent of hospital beds unaccounted for - This variable is possible because the AHA keeps an ongoing census of basic information on every hospital, regardless if whether it responded to the survey. Specifically, they include hospital beds (not ICU beds, a primary variable of interest for us) in their data for every hospital in their universe. With this we can sum up the total number of hospital beds in the county even from non-responding hospitals and calculate what percent of hospital beds in the county are from non-responding hospitals. This may reflect are more pertinent measure of whether county’s ICU bed data are reliable

Method 1 - Counties by Percent of Hospitals Not reporting

Percent missing was calculated by county as the proportion of hospitals with no survey response for any of the years of included AHA study period (2017-2019 here). First, a general summary of Percent Missing Among the whole dataset.

ggplot(data = final2017_2019, 
       aes(x = percent.missing))+
  geom_histogram() +
  theme_bw()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary(final2017_2019$percent.missing)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.00    0.00   15.23    0.00  100.00     751

Looks like a number of counties with the percent of hospitals with missing information information is either 0 or 100. Specifically:

N counties with 0% missing = 1921
N counties with 100% missing = 265

Now I remove these and re-examine missingness among this subset.

ggplot(data = subset(final2017_2019, 
                     percent.missing != 0 & 
                       percent.missing != 100),
       aes(x = percent.missing))+
  geom_histogram() +
  theme_bw()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

test2<-subset(final2017_2019, percent.missing != 0 & percent.missing != 100)
summary(test2$percent.missing)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   25.00   50.00   39.23   50.00   79.00

Method 2 - Counties by Percent of Hospitals Beds from non-reporting hospitals

A general summary of the percent of hospital beds from non-responding hospitals.

ggplot(data = final2017_2019, 
       aes(x = percent.hospital.beds.unaccounted))+
  geom_histogram() +
  theme_bw()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary(final2017_2019$percent.hospital.beds.unaccounted)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.00    0.00   14.35    0.00  100.00     751

As expected, lots of 0s and 100s. Actually, the numbers of 0s and 100s should be nearly equivalent between the two methods since if all hospitals not reporting then will have all hospital beds unaccounted for and vice versa:

N counties with 0% missing = 1922
N counties with 100% missing = 265

Now among the subset of counties with Percent Hospital Beds from non-responding hospitals not equal to 0 or 100, I present the summary of the percent of hospital beds missing.

test2 <-subset(final2017_2019, percent.hospital.beds.unaccounted != 0 & 
                       percent.hospital.beds.unaccounted != 100)
ggplot(data = test2,
       aes(x = percent.hospital.beds.unaccounted))+
  geom_histogram() +
  theme_bw()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary(test2$percent.hospital.beds.unaccounted)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   13.00   27.00   31.63   49.00   97.00

Interpretation:

The number of counties with missing hospital info are same with both methods
The number of counties with the percent of hospitals or hospital beds for the county missing of 0 and 100 are same between both methods.
It is the approximately 1000 counties with missingness between 0 and 100 where the difference will be seen. At least comparing histograms and summary stats of these subsets for each method, method 2 has lower percent missingness.

Direct Comparison of Methods 1 & 2

Here I compare the two measures of county-level missingness on each access. Data is with counties with 0 and 100% missing removed since these are the essentialy the same for each method.

test2 <- test2%>%
  mutate(difference = percent.hospital.beds.unaccounted - percent.missing)

ggplot(data = test2,
       aes(x = percent.missing,
           y = percent.hospital.beds.unaccounted))+
  geom_point()+
  geom_abline(slope = 1)

Here is a different way to look at the same data. For each county I subtract the Hospital Bed measure from the Hospital measure of non-response and present summary and graph of this difference. Here, a negative number indicates that there is a lower degree of missingness regarding hospital resources using the hospital bed measure.

ggplot(data = test2,
       aes(x=difference))+
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary(test2$difference)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -50.000 -19.000  -9.000  -7.706   0.750  47.000

INterpretation:

First graph shows a preponderance of points below the line of unity. This indicates that there is variation but the Method 2 generally has lower % missingness.
Second graph of the differences between two methods demonstrates again a preponderance to the negative nubmers, supported by summary statistics.
These support that Method 2 is better accounting of missingess in giving lower numbers as well as in public health significance since it is really the resources we are trying to account for.

Examination of Missigness with Other Variables

Here I do a graphical examination exploring any associations of county-level hospital bed missingess with other variables.

AHA variables

ggplot(data = final2017_2019, 
       aes(x = percent.hospital.beds.unaccounted,
           y = total.hospitals))+
  geom_point() +
  geom_smooth()+
  theme_bw()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = final2017_2019, 
       aes(x = percent.hospital.beds.unaccounted,
           y = county.hospital.beds.total))+
  geom_point() +
    geom_smooth()+
  theme_bw()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = subset(final2017_2019, county.hospital.beds.total<7500),
       aes(x = percent.hospital.beds.unaccounted,
           y = county.hospital.beds.total))+
  geom_point() +
    geom_smooth()+
  theme_bw()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = final2017_2019, 
       aes(x = percent.hospital.beds.unaccounted,
           y = county.adult.icu.beds))+
  geom_point() +
    geom_smooth()+
  theme_bw()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = subset(final2017_2019, county.hospital.beds.total<7500),
       aes(x = percent.hospital.beds.unaccounted,
           y = adult_icubeds_per1k))+
  geom_point() +
  geom_smooth()+
  theme_bw()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = final2017_2019, 
       aes(x = percent.hospital.beds.unaccounted,
           y = county.adult.fte))+
  geom_point() +
    geom_smooth()+
  theme_bw()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ACS variables

ggplot(data = final2017_2019, 
       aes(x = percent.hospital.beds.unaccounted,
           y = total.thousands))+
  geom_point() +
    geom_smooth()+
  theme_bw()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = final2017_2019, 
       aes(x = percent.hospital.beds.unaccounted,
           y = perc.minority))+
  geom_point() +
  geom_smooth()+
  theme_bw()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = final2017_2019, 
       aes(x = percent.hospital.beds.unaccounted,
           y = median.fam.income))+
  geom_point() +
  geom_smooth()+
  theme_bw()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = final2017_2019, 
       aes(x = percent.hospital.beds.unaccounted,
           y = median.fam.income))+
  geom_point() +
  geom_smooth()+
  theme_bw()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = final2017_2019, 
       aes(x = percent.hospital.beds.unaccounted,
           y = perc.fam.poverty))+
  geom_point() +
  geom_smooth()+
  theme_bw()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = final2017_2019, 
       aes(x = percent.hospital.beds.unaccounted,
           y = income.disparity))+
  geom_point() +
  geom_smooth()+
  theme_bw()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = final2017_2019, 
       aes(x = percent.hospital.beds.unaccounted,
           y = perc.noinsurance))+
  geom_point() +
  geom_smooth()+
  theme_bw()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

County Characteristics Population Summary At Different Levels of Missingness

Here I examine a basic summary but with recategorizing counties based on their percent missingness. These counties data are labeled “unreliable”

Any Missingingness

Here if Percent Hospital Beds Missing > 0, then categorized as “unreliable”.

	High (N=1051)	Low (N=871)	No Hospital (N=751)	Unreliable (N=547)	Total (N=3220)
Total population, thousands
Mean (SD)	128 (215)	40 (100)	17 (20)	265 (694)	102 (327)
Range	1 - 2050	1 - 2287	0 - 215	1 - 10082	0 - 10082

10% Missingingness

Here if Percent Hospital Beds Missing > 10, then categorized as “unreliable”.

	High (N=1107)	Low (N=877)	No Hospital (N=751)	Unreliable (N=485)	Total (N=3220)
Total population, thousands
Mean (SD)	156 (309)	41 (102)	17 (20)	218 (661)	102 (327)
Range	1 - 4647	1 - 2287	0 - 215	1 - 10082	0 - 10082

25% Missingingness

Here if Percent Hospital Beds Missing > 25, then categorized as “unreliable”.

	High (N=1171)	Low (N=883)	No Hospital (N=751)	Unreliable (N=415)	Total (N=3220)
Total population, thousands
Mean (SD)	205 (506)	41 (102)	17 (20)	93 (188)	102 (327)
Range	1 - 10082	1 - 2287	0 - 215	1 - 2182	0 - 10082

50% Missingingness

Here if Percent Hospital Beds Missing > 50, then categorized as “unreliable”.

	High (N=1218)	Low (N=927)	No Hospital (N=751)	Unreliable (N=324)	Total (N=3220)
Total population, thousands
Mean (SD)	207 (499)	47 (116)	17 (20)	58 (147)	102 (327)
Range	1 - 10082	1 - 2287	0 - 215	1 - 2182	0 - 10082

Acquiring ACS Data via the Census API

Erin R Stearns & Jordan A Kempker

2021-07-01