Packages.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
Load data.
df <- read.csv("insurance.csv")
colnames(df)[1] <- "Year"
Remove commas.
df_cleaned <- as.data.frame(lapply(df, function(x) gsub(",+","",x)))
df_cleaned <- as.data.frame(lapply(df_cleaned, function(x) as.numeric(as.character(x))))
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
str(df_cleaned)
## 'data.frame': 56 obs. of 38 variables:
## $ Year : num 1960 1961 1962 1963 1964 ...
## $ Total.National.Health.Expenditures : num 27214 29138 31842 34595 38394 ...
## $ Out.of.pocket : num 12949 13357 14255 15311 16928 ...
## $ Health.Insurance : num 7497 8236 8999 9892 10971 ...
## $ Private.Health.Insurance : num 5812 6468 7178 7952 9052 ...
## $ Medicare : num NA NA NA NA NA ...
## $ Medicaid..Title.XIX. : num NA NA NA NA NA ...
## $ Medicaid..Title.XIX..Federal : num NA NA NA NA NA ...
## $ Medicaid..Title.XIX..State.and.Local : num NA NA NA NA NA ...
## $ CHIP..Title.XIX.and.Title.XXI. : num NA NA NA NA NA NA NA NA NA NA ...
## $ CHIP..Title.XIX.and.Title.XXI..Federal : num NA NA NA NA NA NA NA NA NA NA ...
## $ CHIP..Title.XIX.and.Title.XXI..State.and.Local : num NA NA NA NA NA NA NA NA NA NA ...
## $ Department.of.Defense : num 788 836 860 925 861 ...
## $ Department.of.Veterans.Affairs : num 897 933 961 1015 1058 ...
## $ Other.Third.Party.Payers.and.Programs : num 3889 4329 4703 5206 5659 ...
## $ Worksite.Health.Care : num 62 66 71 78 87 97 110 128 147 171 ...
## $ Other.Private.Revenues : num 295 330 365 421 486 ...
## $ Indian.Health.Services : num NA NA NA NA NA NA NA NA NA NA ...
## $ Workers..Compensation : num 615 644 705 742 807 ...
## $ General.Assistance : num 102 110 102 103 112 122 182 240 299 359 ...
## $ Maternal.Child.Health : num 116 172 180 201 220 244 291 324 377 435 ...
## $ Maternal.Child.Health.Federal : num 27 44 48 50 60 80 114 142 167 190 ...
## $ Maternal.Child.Health.State.and.Local : num 89 127 133 150 159 164 176 182 210 244 ...
## $ Vocational.Rehabilitation : num 20 22 25 29 33 41 58 88 115 131 ...
## $ Vocational.RehabilitationFederal : num 12 14 15 18 20 27 42 66 86 100 ...
## $ Vocational.Rehabilitation.State.and.Local : num 8 8 10 11 13 15 16 22 29 31 ...
## $ Other.Federal.Programs. : num 337 447 598 715 750 831 653 463 489 621 ...
## $ SAMHSA : num NA NA NA NA NA NA NA NA NA NA ...
## $ Other.State.and.Local.Programs.. : num 2221 2399 2510 2773 3013 ...
## $ School.Health : num 122 140 147 143 151 168 187 215 242 263 ...
## $ Public.Health.Activity : num 371 409 456 509 572 ...
## $ Public.Health.Activity.Federal : num 102 108 140 208 226 214 294 401 470 564 ...
## $ Public.Health.Activity.State.and.Local : num 269 302 316 301 347 407 439 460 499 610 ...
## $ Investment : num 2508 2806 3427 3677 4264 ...
## $ Research : num 694 886 1068 1219 1366 ...
## $ Structures...Equipment : num 1814 1921 2360 2458 2898 ...
## $ Total.CMS.Programs..Medicaid..CHIP.and.Medicare.: num NA NA NA NA NA ...
## $ POPULATION : num 186 189 192 195 197 200 202 204 206 208 ...
Total health expenditures over time.
plot1 <- ggplot(df_cleaned, aes(Year, Total.National.Health.Expenditures)) + geom_area(fill="#228B22")
last_plot() + ggtitle("Total National Healthcare Expenditures") + labs(x="Year",y="Amount")
Does this at all correlate with population growth?
df_shaved <- select(df_cleaned, Year, Total.National.Health.Expenditures, POPULATION)
colnames(df_shaved) <- c("Year", "National_Health_Expenditures", "Population")
Scale expenditures to billions.
df_shaved$National_Health_Expenditures <- df_shaved$National_Health_Expenditures/1000
Reshape.
df_tall <- melt(df_shaved, id="Year")
colnames(df_tall) <- c("Year", "Statistic", "Value")
Plot.
plot2 <- ggplot(df_tall, aes(Year, Value, color=Statistic)) + geom_line() + scale_color_manual(labels=c("Healthcare Expenditures ($B)", "Population (M)"), values=c("#228B22", "#4169E1"))
last_plot() + ggtitle(bquote(atop(.("National Health Expenditures and Population")))) + labs(x="Year",y="Amount")
Expenditures are in billions, population in millions. Hm, not so correlated.
To be more accurate, we can look at growth rates.
growth <- function(x) x/lag(x)-1
df_growth <- df_shaved %>% mutate(National_Health_Expenditures_Change = growth(National_Health_Expenditures))
df_growth <- df_growth %>% mutate(Population_Change = growth(Population))
df_growth_shaved <- select(df_growth, Year, National_Health_Expenditures_Change, Population_Change)
df_growth_tall <- melt(df_growth_shaved, id="Year")
colnames(df_growth_tall) <- c("Year", "Statistic", "Value")
plot3 <- ggplot(na.omit(df_growth_tall), aes(Year, Value, color=Statistic)) + geom_line() + scale_color_manual(labels=c("% Change in Expenditures", "% Change in Population"), values=c("#228B22", "#4169E1"))
last_plot() + ggtitle("Growth in Healthcare Expenditures and Population") + labs(x="Year",y="Amount")
So our hunch was totally correct. Interestingly though, the growth rate of expenditures was by far the highest around 1980, and then decreased for a while. We’ll come back to what happened there later.
We can make overlayed graphs for each line item, first in absolute terms and then in percentage change.
df_all_tall <- melt(df_cleaned, id="Year")
colnames(df_all_tall) <- c("Year", "Statistic", "Value")
plot4 <- ggplot(df_all_tall, aes(Year, Value, color=Statistic)) + geom_line()
last_plot() + ggtitle("Hi")
## Warning: Removed 174 rows containing missing values (geom_path).
To decide which ones to use, we’ll make a threshold / cutoff - anything below 2 standard deviations from the mean.
means <- data.frame(Variable = colnames(df_cleaned), Mean = colMeans(na.omit(df_cleaned)))
quantiles <- quantile(means$Mean)
Here’s what lies in the 75th percentile.
top75 <- filter(means, Mean >= quantiles[4])
top75names <- as.vector(as.character(top75$Variable))
top75names
## [1] "Total.National.Health.Expenditures"
## [2] "Out.of.pocket"
## [3] "Health.Insurance"
## [4] "Private.Health.Insurance"
## [5] "Medicare"
## [6] "Medicaid..Title.XIX."
## [7] "Medicaid..Title.XIX..Federal"
## [8] "Medicaid..Title.XIX..State.and.Local"
## [9] "Other.Third.Party.Payers.and.Programs"
## [10] "Total.CMS.Programs..Medicaid..CHIP.and.Medicare."
Remove 1 and 10, as they’re total figures.
df_shaved <- select(df_cleaned, Year, one_of(top75names[2:9]))
df_tall <- melt(df_shaved, id="Year")
colnames(df_tall) <- c("Year", "Statistic", "Value")
plot5 <- ggplot(na.omit(df_tall), aes(Year, Value, color=Statistic)) + geom_line()
last_plot() + ggtitle("Top Line Items") + labs(x="Year",y="Amount")
More legible.
We can also look at average growth rate over the years. We’ll find the growth rate for all first to see if anything is out of the norm, then separate into groups.
df_all_growth <- as.data.frame(lapply(df_cleaned, function(x) growth(x)))
Now, the annualized growth rate over the period should be the average of each column.
growth_means <- colMeans(df_all_growth, 1)
#Select the top 10 to graph
df_growth_graph1 <- data.frame(Statistic = colnames(df_cleaned), Growth = growth_means)
df_growth_top10 <- arrange(df_growth_graph1, desc(Growth))[1:10,]
plot6 <- ggplot(df_growth_top10, aes(reorder(Statistic, -Growth), Growth, fill=Statistic)) + geom_bar(stat="identity") + theme(axis.text.x=element_text(angle=45, hjust=1)) + guides(fill=FALSE)
last_plot() + labs(title="Top 10 Annualized Growth Rates", x="", y="Growth")
What are these top 3 items that are growing much faster than everything else? CHIP stands for Children’s Health Insurance Program. It gives low cost coverage to kids who’s families make too much to quality for medicaid.
top_3_growth <- select(df_cleaned, Year, CHIP..Title.XIX.and.Title.XXI..Federal, CHIP..Title.XIX.and.Title.XXI., CHIP..Title.XIX.and.Title.XXI..State.and.Local)
The first year this thing came into being is:
top_3_growth[!is.na(top_3_growth$CHIP..Title.XIX.and.Title.XXI.),][1,"Year"]
## [1] 1998
And apparently it has been growing rather quickly. But, at least as of 2015, the total for the program is tiny compared to other line items here.
mean(na.omit(top_3_growth$CHIP..Title.XIX.and.Title.XXI.))
## [1] 8449.167
But it’s definitely something to keep an eye on.
Otherwise, the growth rates of all of our items don’t actually vary that much.
df_normal_growth <- df_cleaned %>% select(-CHIP..Title.XIX.and.Title.XXI., -CHIP..Title.XIX.and.Title.XXI..Federal, -CHIP..Title.XIX.and.Title.XXI..State.and.Local)
df_normal_growth <- as.data.frame(lapply(na.omit(df_normal_growth), function(x) growth(x)))
normal_growth_means <- colMeans(df_normal_growth,1)
#Standard deviation of growth rates
sd(normal_growth_means)
## [1] 0.02792628
It makes sense to look at things in terms of groups: e.x. (private + medicare + medicaid), (federal + state and local).
str(df_cleaned)
## 'data.frame': 56 obs. of 38 variables:
## $ Year : num 1960 1961 1962 1963 1964 ...
## $ Total.National.Health.Expenditures : num 27214 29138 31842 34595 38394 ...
## $ Out.of.pocket : num 12949 13357 14255 15311 16928 ...
## $ Health.Insurance : num 7497 8236 8999 9892 10971 ...
## $ Private.Health.Insurance : num 5812 6468 7178 7952 9052 ...
## $ Medicare : num NA NA NA NA NA ...
## $ Medicaid..Title.XIX. : num NA NA NA NA NA ...
## $ Medicaid..Title.XIX..Federal : num NA NA NA NA NA ...
## $ Medicaid..Title.XIX..State.and.Local : num NA NA NA NA NA ...
## $ CHIP..Title.XIX.and.Title.XXI. : num NA NA NA NA NA NA NA NA NA NA ...
## $ CHIP..Title.XIX.and.Title.XXI..Federal : num NA NA NA NA NA NA NA NA NA NA ...
## $ CHIP..Title.XIX.and.Title.XXI..State.and.Local : num NA NA NA NA NA NA NA NA NA NA ...
## $ Department.of.Defense : num 788 836 860 925 861 ...
## $ Department.of.Veterans.Affairs : num 897 933 961 1015 1058 ...
## $ Other.Third.Party.Payers.and.Programs : num 3889 4329 4703 5206 5659 ...
## $ Worksite.Health.Care : num 62 66 71 78 87 97 110 128 147 171 ...
## $ Other.Private.Revenues : num 295 330 365 421 486 ...
## $ Indian.Health.Services : num NA NA NA NA NA NA NA NA NA NA ...
## $ Workers..Compensation : num 615 644 705 742 807 ...
## $ General.Assistance : num 102 110 102 103 112 122 182 240 299 359 ...
## $ Maternal.Child.Health : num 116 172 180 201 220 244 291 324 377 435 ...
## $ Maternal.Child.Health.Federal : num 27 44 48 50 60 80 114 142 167 190 ...
## $ Maternal.Child.Health.State.and.Local : num 89 127 133 150 159 164 176 182 210 244 ...
## $ Vocational.Rehabilitation : num 20 22 25 29 33 41 58 88 115 131 ...
## $ Vocational.RehabilitationFederal : num 12 14 15 18 20 27 42 66 86 100 ...
## $ Vocational.Rehabilitation.State.and.Local : num 8 8 10 11 13 15 16 22 29 31 ...
## $ Other.Federal.Programs. : num 337 447 598 715 750 831 653 463 489 621 ...
## $ SAMHSA : num NA NA NA NA NA NA NA NA NA NA ...
## $ Other.State.and.Local.Programs.. : num 2221 2399 2510 2773 3013 ...
## $ School.Health : num 122 140 147 143 151 168 187 215 242 263 ...
## $ Public.Health.Activity : num 371 409 456 509 572 ...
## $ Public.Health.Activity.Federal : num 102 108 140 208 226 214 294 401 470 564 ...
## $ Public.Health.Activity.State.and.Local : num 269 302 316 301 347 407 439 460 499 610 ...
## $ Investment : num 2508 2806 3427 3677 4264 ...
## $ Research : num 694 886 1068 1219 1366 ...
## $ Structures...Equipment : num 1814 1921 2360 2458 2898 ...
## $ Total.CMS.Programs..Medicaid..CHIP.and.Medicare.: num NA NA NA NA NA ...
## $ POPULATION : num 186 189 192 195 197 200 202 204 206 208 ...
Groups to check out: • Private / Medicare / Medicaid
df_group1 <- df_cleaned %>% select(Year, Private.Health.Insurance, Medicare, Medicaid..Title.XIX.) %>% mutate(Total = Private.Health.Insurance + Medicare + Medicaid..Title.XIX.) %>% mutate(Private.Health.Insurance.Share = Private.Health.Insurance / Total, Medicare.Share = Medicare / Total, Medicaid..Title.XIX.Share = Medicaid..Title.XIX. / Total) %>% select(Year, Private.Health.Insurance.Share, Medicare.Share, Medicaid..Title.XIX.Share)
First we can plot how these shares shifted over time, and the a pie chart of the average over the time period.
df_group1_tall <- melt(df_group1, id="Year")
colnames(df_group1_tall) <- c("Year", "Item", "Share")
plot7 <- ggplot(na.omit(df_group1_tall), aes(Year, Share, color=Item)) + geom_line()
last_plot() + scale_color_manual(name="Insurance Type", labels=c("Private", "Medicare", "Medicaid"),values=c("#228B22", "#4169E1","#ff0000")) + ggtitle("Share of Total Expenditures by Insurance Type")
Average over the period:
shares <- data.frame(Type = colnames(df_group1)[2:4], Share = colMeans(df_group1, 1)[2:4])
#Plot
plot8 <- ggplot(shares, aes("", Share, fill=Type)) + geom_bar(width = 1, stat="identity")
plot8 + coord_polar(theta = "y") + scale_fill_manual(name="Insurance Type", labels=c("Medicaid", "Medicare", "Private"),values=c("#ff0000", "#4169E1", "#228B22")) + ggtitle("Share of Total Expenditures by Insurance Type") + theme(axis.text = element_blank(), axis.ticks = element_blank(), panel.grid = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank())
Summary: • Things growing very fast • Expenditures growing much faster than population • Expenditures growth not even, around 15% in the 80’s, has been increasing lately • Aside from 3 small programs, everything growing at a similar rate • Medicare / Medicaid have been increasing their share of total insurance expenditures