1. df Overview
1.1 Missing values
The dataset has no missing values
which(is.na(df))## integer(0)
1.2 Summary Statistics
summary(df)## state stateid year t cpi
## Length:663 Min. : 1 Min. :2006 Min. : 6 Min. :198.3
## Class :character 1st Qu.:13 1st Qu.:2009 1st Qu.: 9 1st Qu.:211.1
## Mode :character Median :26 Median :2012 Median :12 Median :226.7
## Mean :26 Mean :2012 Mean :12 Mean :224.0
## 3rd Qu.:39 3rd Qu.:2015 3rd Qu.:15 3rd Qu.:233.9
## Max. :51 Max. :2018 Max. :18 Max. :247.9
## mcare_millions medicaid_spend_actual overdoses population
## Min. : 418 Min. :4.090e+08 Min. : 10.0 Min. : 522667
## 1st Qu.: 2490 1st Qu.:2.075e+09 1st Qu.: 122.5 1st Qu.: 1666720
## Median : 6808 Median :5.120e+09 Median : 370.0 Median : 4357847
## Mean :10587 Mean :8.584e+09 Mean : 552.9 Mean : 6145898
## 3rd Qu.:12426 3rd Qu.:9.636e+09 3rd Qu.: 699.5 3rd Qu.: 6889846
## Max. :77603 Max. :8.553e+10 Max. :4293.0 Max. :39461588
## hhincome state_gdp unemployment_pct labor_participation_pct
## Min. :37714 Min. : 26802 Min. : 2.400 Min. :49.30
## 1st Qu.:53071 1st Qu.: 78045 1st Qu.: 4.300 1st Qu.:59.10
## Median :58793 Median : 186849 Median : 5.400 Median :62.40
## Mean :59593 Mean : 321957 Mean : 5.939 Mean :62.35
## 3rd Qu.:65624 3rd Qu.: 417780 3rd Qu.: 7.300 3rd Qu.:65.70
## Max. :86345 Max. :2721651 Max. :13.700 Max. :73.20
## insured_pct grad_hs_pct is_manufacturing_state post_recession
## Min. :75.26 Min. :78.70 Min. :0.0000 Min. :0.0000
## 1st Qu.:84.75 1st Qu.:84.90 1st Qu.:0.0000 1st Qu.:1.0000
## Median :88.10 Median :88.10 Median :0.0000 Median :1.0000
## Mean :87.91 Mean :87.42 Mean :0.2157 Mean :0.8462
## 3rd Qu.:91.20 3rd Qu.:90.20 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :96.80 Max. :93.00 Max. :1.0000 Max. :1.0000
## per_capita_state_tax_revenue pct_change_str medicaidml
## Min. : 905.9 Min. :-0.77900 Min. : 408.9
## 1st Qu.: 3546.5 1st Qu.: 0.00750 1st Qu.: 2075.2
## Median : 4067.2 Median : 0.02700 Median : 5120.2
## Mean : 4531.7 Mean : 0.02559 Mean : 8584.1
## 3rd Qu.: 4976.3 3rd Qu.: 0.04700 3rd Qu.: 9636.1
## Max. :14609.0 Max. : 1.01000 Max. :85531.0
## tmcaremcaidml tmcaremcaidmlreal Prescription.Rate ovd_cost
## Min. : 941.4 Min. : 1177 Min. : 25.00 Min. : 6353
## 1st Qu.: 4743.9 1st Qu.: 5112 1st Qu.: 60.15 1st Qu.: 89125
## Median : 12436.2 Median : 14190 Median : 75.20 Median : 324819
## Mean : 19375.8 Mean : 21247 Mean : 77.48 Mean : 421166
## 3rd Qu.: 22596.7 3rd Qu.: 24453 3rd Qu.: 90.25 3rd Qu.: 596855
## Max. :167578.6 Max. :167579 Max. :146.90 Max. :2921932
1.3 Normalizing overdose deaths
The prescription rates data is dispensing rate per 100 people
# Normalizing opioid deaths per 100,000 people in the U.S
df$ovd_rate = df$overdoses / df$population * 1002 EDA
2.1 overdoses by year
# Calculating sum of overdoses by state
ovd_sum = tapply(df$overdoses, df$year, FUN=sum)
fig <- plot_ly(x = year, y = ovd_sum, type = 'bar') %>%
layout(title = 'Sum of Opioid Overdose Deaths, 2006-2018',
plot_bgcolor='#e5ecf6')
fig2.2 prescription rates by year
PR_sum = tapply(df$Prescription.Rate, df$year, FUN=sum)
fig2 <- plot_ly(x = year, y = PR_sum, type = 'bar') %>%
layout(title = 'Sum of Prescription Rates, 2006-2018')
fig22.3 Overdoses by State
Ovd_by_state = tapply(df$overdoses, df$state, FUN=sum)
Ovd_by_state = data.frame(Ovd_by_state)2.4:Boxplots to find df outliers – What do do with this information?
library(ggplot2)
ggplot(data = df,
mapping = aes(x = reorder(state, overdoses), y = overdoses, fill = state)) +
geom_boxplot() +
labs(x=NULL, y="overdoses") +
coord_flip() +
geom_hline(yintercept = 493, linetype="dashed", color = "red", size = 1) +
theme_bw()2.5: Scatter plots to analyze predictors.
library(gridExtra)
library(cowplot)
sc1 = ggplot(data = df, aes(x =ovd_rate, y = medicaidml)) + geom_point() + geom_smooth(method = lm, fill="blue", color="blue", se = FALSE) + xlab("overdose") + ylab("medicade in millions") + ggtitle("overdoses ~ medicade($ million)")
sc2 = ggplot(data = df, aes(x = overdoses, y = unemployment_pct)) + geom_point() + geom_smooth(method = lm, fill="blue", color="blue", se = FALSE) + xlab("overdose") + ylab("unemployed %") + ggtitle("overdoses ~ unemployment %")
sc3 = ggplot(data = df, aes(x =overdoses, y = Prescription.Rate)) + geom_point() + geom_smooth(method = lm, fill="blue", color="red", se = FALSE) + xlab("overdose") + ylab("Prescription Rate") + ggtitle("overdoses ~ Prescription rate")
sc4 = ggplot(data = df, aes(x =overdoses, y = labor_participation_pct)) + geom_point() + geom_smooth(method = lm, fill="blue", color="blue", se = FALSE) + xlab("overdose") + ylab("HH income") + ggtitle(" overdoses ~ LFPR")
plot_grid(sc1, sc2,sc3,sc4)## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
3. Modeling
We will work with a p df frame – meaning panel df. With the indivisual states and time periods
We compare a single state by itself and are not concerned with variation between states.
# We will convert the df frame to a panel df frame.
library(plm)
df.p = pdata.frame(df, index = c("state", "year"))3.1 Fixed effects model | Within model as it only focuses on the variation within the indivisual observations.
library(stargazer)
fixeff = plm(ovd_rate~Prescription.Rate + hhincome + unemployment_pct +
labor_participation_pct + grad_hs_pct, data = df.p, model = "within")
stargazer(fixeff, type='text')##
## ===================================================
## Dependent variable:
## ---------------------------
## ovd_rate
## ---------------------------------------------------
## Prescription.Rate -0.0001***
## (0.00002)
##
## hhincome 0.00000
## (0.00000)
##
## unemployment_pct -0.00000
## (0.0001)
##
## labor_participation_pct -0.0002*
## (0.0001)
##
## grad_hs_pct 0.002***
## (0.0002)
##
## ---------------------------------------------------
## Observations 663
## R2 0.435
## Adjusted R2 0.384
## F Statistic 93.525*** (df = 5; 607)
## ===================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
3.2 Random effects Model
raneff = plm(ovd_rate~Prescription.Rate + hhincome + unemployment_pct + labor_participation_pct + grad_hs_pct , data = df.p, model = "random")
stargazer(raneff, type='text')##
## ===================================================
## Dependent variable:
## ---------------------------
## ovd_rate
## ---------------------------------------------------
## Prescription.Rate -0.0001***
## (0.00002)
##
## hhincome -0.00000
## (0.00000)
##
## unemployment_pct -0.0005***
## (0.0001)
##
## labor_participation_pct -0.001***
## (0.0001)
##
## grad_hs_pct 0.001***
## (0.0001)
##
## Constant -0.017
## (0.015)
##
## ---------------------------------------------------
## Observations 663
## R2 0.361
## Adjusted R2 0.356
## F Statistic 371.444***
## ===================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
3.3 First diffrence model
firstdiff = plm(ovd_rate~Prescription.Rate + hhincome + unemployment_pct +
labor_participation_pct + grad_hs_pct, data = df.p, model = "fd")
stargazer(firstdiff, type='text')##
## ===================================================
## Dependent variable:
## ---------------------------
## ovd_rate
## ---------------------------------------------------
## Prescription.Rate -0.00005***
## (0.00002)
##
## hhincome -0.00000
## (0.00000)
##
## unemployment_pct -0.0001
## (0.0001)
##
## labor_participation_pct 0.00003
## (0.0001)
##
## grad_hs_pct 0.0003
## (0.0002)
##
## Constant 0.001***
## (0.0001)
##
## ---------------------------------------------------
## Observations 612
## R2 0.034
## Adjusted R2 0.026
## F Statistic 4.224*** (df = 5; 606)
## ===================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
stargazer(firstdiff, raneff, type='text', title="Results", align=TRUE)##
## Results
## =========================================================
## Dependent variable:
## ---------------------------------
## ovd_rate
## (1) (2)
## ---------------------------------------------------------
## Prescription.Rate -0.00005*** -0.0001***
## (0.00002) (0.00002)
##
## hhincome -0.00000 -0.00000
## (0.00000) (0.00000)
##
## unemployment_pct -0.0001 -0.0005***
## (0.0001) (0.0001)
##
## labor_participation_pct 0.00003 -0.001***
## (0.0001) (0.0001)
##
## grad_hs_pct 0.0003 0.001***
## (0.0002) (0.0001)
##
## Constant 0.001*** -0.017
## (0.0001) (0.015)
##
## ---------------------------------------------------------
## Observations 612 663
## R2 0.034 0.361
## Adjusted R2 0.026 0.356
## F Statistic 4.224*** (df = 5; 606) 371.444***
## =========================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
3.4 Hausman test comparing random and fixed effects
phtest(fixeff, raneff)##
## Hausman Test
##
## data: ovd_rate ~ Prescription.Rate + hhincome + unemployment_pct + ...
## chisq = 47.672, df = 5, p-value = 4.145e-09
## alternative hypothesis: one model is inconsistent
3.5 fixed effects with a lag
fixeff.lag = plm(overdoses~lag(overdoses)+Prescription.Rate, data = df.p, model = "within")
stargazer(fixeff.lag, type='text')##
## =============================================
## Dependent variable:
## ---------------------------
## overdoses
## ---------------------------------------------
## lag(overdoses) 1.035***
## (0.020)
##
## Prescription.Rate -0.114
## (0.526)
##
## ---------------------------------------------
## Observations 612
## R2 0.861
## Adjusted R2 0.848
## F Statistic 1,728.083*** (df = 2; 559)
## =============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
3.6 Cluster standard errors
library(lmtest)
coeftest(fixeff, vcovHC(fixeff, type="HC0", cluster ="group"))##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## Prescription.Rate -1.2770e-04 3.3429e-05 -3.8202 0.0001471 ***
## hhincome 2.0629e-08 7.5518e-08 0.2732 0.7848211
## unemployment_pct -2.0906e-06 2.3433e-04 -0.0089 0.9928847
## labor_participation_pct -1.9475e-04 2.5440e-04 -0.7655 0.4442470
## grad_hs_pct 1.8095e-03 4.9099e-04 3.6854 0.0002488 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1