Introduction

The Medical Expenditure Panel Survey is a rich repository for patient level information that contains information on healthcare use, payments, and disease burdens for the non institutionalized civilian population of the US.

All of this data is nationally representative, provided you use the survey weights given in the data files.

The primary purpose of this document is to serve as a personal log to myself: a reminder of how to use survey weights to produce population level estimates. Additionally, I hope that this serves as a work sample of my ability to use R to analyze large datasets and create meaningful visualizations from raw data.

Code

These are the libraries you need for the analysis.

library(foreign)
library(survey)
library(dplyr)
library(Hmisc)
library(srvyr)
library(ggplot2)
library(ggthemes)
options(scipen = 999)

Programatically access MEPS data from 2016.

download.file("https://meps.ahrq.gov/data_files/pufs/h192ssp.zip", 
              temp <- tempfile())

unzipped_file = unzip(temp) #this unzips the zip file that was downloaded and saved as a temp file

meps2016 = read.xport(unzipped_file) #this returns a list of dataframes

unlink(temp)  # Unlink to delete temporary file

Save the file using this command.

save(meps2016, file="meps2016.Rdata") #should be saved in your working directory

Create the breaks for the age groups.

library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
agebreaks <- c(15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90)
agelabels <- c("15 to 19", "20 to 24", "25 to 29", "30 to 34", "35 to 39",
               "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", 
               "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85 plus")

setDT(meps2016)[ , AgeGroup := cut(AGELAST, 
                                breaks = agebreaks, 
                                right = FALSE, 
                                labels = agelabels)]

Factorize and label the genders.

meps2016$Gender <- as.factor(ifelse(meps2016$SEX == 1, "Male", "Female"))

Create the MEPS survey design.

mepsdesign <- meps2016 %>% as_survey_design(ids = VARPSU, strata = VARSTR,
                                     weight = PERWT16F, nest = TRUE)

To evaluate the number of people suffering from coronary artery disease, set CHDDX == 1 and then aggregate by gender and age group.

pyramid <- mepsdesign %>% filter(CHDDX == 1) %>% 
  select(AgeGroup, Gender, CHDDX) %>%
  group_by(AgeGroup, Gender) %>%
summarize(Number = survey_total()) 

Change the female values to negative to make sure that they show up on the left of the central line in the chart. This is what distinguishes our pyramid chart from a regular stacked bargraph.

pyramid2 <- pyramid
pyramid2$Number <- ifelse(pyramid2$Gender == "Female", -pyramid2$Number, pyramid2$Number )

Plot the graph.

library(ggplot2)
library(ggthemes)
options(scipen = 999)

# X Axis Breaks and Labels 
brks <- seq(-2000000, 2000000, 250000)
lbls = paste0(as.character(c(seq(2, 0, -0.25), seq(0.25, 2, 0.25))), "m")


# Plot
ggplot(pyramid2, aes(x = AgeGroup, y = Number, fill = Gender)) +   # Fill column
                              geom_bar(stat = "identity", width = .6) +   # draw the bars
                              scale_y_continuous(breaks = brks,   # Breaks
                                                 labels = lbls) + # Labels
                              coord_flip() +  # Flip axes
                              labs(title="Pyramid Chart of Coronary Heart Disease") +
                              theme_tufte() +  # Tufte theme from ggfortify
                              theme(plot.title = element_text(hjust = .5), 
                                    axis.ticks = element_blank()) +   # Centre plot title
                              scale_fill_brewer(palette = "Set1")  # Color palette

What if you wanted to do it for another condition? Say, ANGIDX?

pyramid <- mepsdesign %>% filter(ANGIDX == 1) %>% 
  select(AgeGroup, Gender, ANGIDX) %>%
  group_by(AgeGroup, Gender) %>%
summarize(Number = survey_total()) 


pyramid2 <- pyramid
pyramid2$Number <- ifelse(pyramid2$Gender == "Female", -pyramid2$Number, pyramid2$Number )


# X Axis Breaks and Labels 
#brks <- seq(-2000000, 2000000, 500000)
#lbls = paste0(as.character(c(seq(2, 0, -0.5), seq(0.5, 2, 0.5))), "m")
brks <- seq(-2000000, 2000000, 250000)
lbls = paste0(as.character(c(seq(2, 0, -0.25), seq(0.25, 2, 0.25))), "m")


# Plot
ggplot(pyramid2, aes(x = AgeGroup, y = Number, fill = Gender)) +   # Fill column
                              geom_bar(stat = "identity", width = .6) +   # draw the bars
                              scale_y_continuous(breaks = brks,   # Breaks
                                                 labels = lbls) + # Labels
                              coord_flip() +  # Flip axes
                              labs(title="Distribution of Angina in 2016",
                                   x = "Age Group", y = "Estimated Number of Patients") +
                              theme_tufte() +  # Tufte theme from ggfortify
                              theme(plot.title = element_text(hjust = .5), 
                                    axis.ticks = element_blank()) +   # Centre plot title
                              scale_fill_brewer(palette = "Dark2")  # Color palette

Visualizing the distribution of these diseases is interesting. However, can we find out what these diseases cost a person on average as far as out-of-pocket (OOP) expenses are concerned?

I want to see how many people had some kind of heart disease and what percent of their total income (OOP) they spent on healthcare in the year 2016. Data on premiums is not available in the household consolidated file. Only the household consolidated file is uploaded for open access.

#processing the data

try1 <- meps2016 %>% 
filter( ( CHDDX == 1 & ANGIDX == 1 & MIDX == 1 & OHRTDX == 1) &
        ( TTLP16X > 0 ) &
        ( TOTSLF16 ) > 0) 
#adding a column to hold the percentage of the values

try1 <- try1 %>% 
  mutate(pctinxhlth = round(TOTSLF16/TTLP16X*100,2)) %>% 
  arrange(desc(pctinxhlth)) %>% select(CHDDX, ANGIDX, MIDX, OHRTDX,
                                       TTLP16X, TOTSLF16, pctinxhlth)

What is the mean when you just calculate it without applying survey weights?

try1 %>% summarize( medpct = round(median(pctinxhlth),2), 
                    meanpct = round(mean(pctinxhlth), 2)) 
##   medpct meanpct
## 1   2.87    6.62
#heavily skewed distribution. The mean is much larger than the median.

Let us try to weight this using PERWT16F as the weight variable.

mepsdesign2 <- meps2016 %>% as_survey_design(ids = VARPSU, strata = VARSTR,
                                     weight = PERWT16F, nest = TRUE)
try2 <- mepsdesign2 %>% 
filter( ( CHDDX == 1 & ANGIDX == 1 & MIDX == 1 & OHRTDX == 1) &
        ( TTLP16X > 0 ) &
        ( TOTSLF16 ) > 0) %>% 
select(CHDDX, ANGIDX, MIDX, OHRTDX, TTLP16X, TOTSLF16, VARPSU, VARSTR, PERWT16F) %>%
  mutate(pctinxhlth = round(TOTSLF16/TTLP16X*100,2)) 
  try2 %>% summarize( medpct = round(survey_median(pctinxhlth, vartype = "se"),2), 
                    meanpct = round(survey_mean(pctinxhlth), 2)) 
## # A tibble: 1 x 4
##   medpct_q50 medpct_q50_se meanpct meanpct_se
##        <dbl>         <dbl>   <dbl>      <dbl>
## 1       2.63           0.6    6.72       1.83
df <- data.frame(
  group = c("Total", "Mean OOP Spending"),
  value = c(100, 6.72)
  )


pie <- ggplot(df, aes(x="", y=value, fill=group))+
geom_bar(width = 1, stat = "identity") + coord_polar("y", start=0)
pie

This shows that, on average, people who had severe heart disease (defined as having been diagnosed with coronary artery disease, and angina, having suffered a heart attack and having other heart conditions ) spent about 6.7 percent of their annual income in 2016 on out-of-pocket healthcare costs. This does not include premiums. About half of all Americans with severe heart disease spent more than 2.6 percent of their annual incomes on OOP costs.